1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 #include <chrono>
4 #include "boost/algorithm/string.hpp"
5 #include "bluestore_common.h"
6 #include "BlueFS.h"
7
8 #include "common/debug.h"
9 #include "common/errno.h"
10 #include "common/perf_counters.h"
11 #include "Allocator.h"
12 #include "include/ceph_assert.h"
13 #include "common/admin_socket.h"
14
15 #define dout_context cct
16 #define dout_subsys ceph_subsys_bluefs
17 #undef dout_prefix
18 #define dout_prefix *_dout << "bluefs "
19 using TOPNSPC::common::cmd_getval;
20
21 using std::byte;
22 using std::list;
23 using std::make_pair;
24 using std::map;
25 using std::ostream;
26 using std::pair;
27 using std::set;
28 using std::string;
29 using std::to_string;
30 using std::vector;
31 using std::chrono::duration;
32 using std::chrono::seconds;
33
34 using ceph::bufferlist;
35 using ceph::decode;
36 using ceph::encode;
37 using ceph::Formatter;
38
39
40 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::File, bluefs_file, bluefs);
41 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::Dir, bluefs_dir, bluefs);
42 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileWriter, bluefs_file_writer, bluefs_file_writer);
43 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileReaderBuffer,
44 bluefs_file_reader_buffer, bluefs_file_reader);
45 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileReader, bluefs_file_reader, bluefs_file_reader);
46 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileLock, bluefs_file_lock, bluefs);
47
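// Discard completion callbacks, one per device slot.  The block device
// layer invokes these once a discard (TRIM) completes so BlueFS can return
// the trimmed extents to the matching allocator via handle_discard().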
48 static void wal_discard_cb(void *priv, void* priv2) {
49 BlueFS *bluefs = static_cast<BlueFS*>(priv);
50 interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2);
51 bluefs->handle_discard(BlueFS::BDEV_WAL, *tmp);
52 }
53
54 static void db_discard_cb(void *priv, void* priv2) {
55 BlueFS *bluefs = static_cast<BlueFS*>(priv);
56 interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2);
57 bluefs->handle_discard(BlueFS::BDEV_DB, *tmp);
58 }
59
60 static void slow_discard_cb(void *priv, void* priv2) {
61 BlueFS *bluefs = static_cast<BlueFS*>(priv);
62 interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2);
63 bluefs->handle_discard(BlueFS::BDEV_SLOW, *tmp);
64 }
65
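// Admin socket hook exposing BlueFS introspection commands: device space
// reports, internal stats, per-file extent listings, and a debug-only
// read-zeros fault injector.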
66 class BlueFS::SocketHook : public AdminSocketHook {
67 BlueFS* bluefs;
68 public:
69 static BlueFS::SocketHook* create(BlueFS* bluefs)
70 {
71 BlueFS::SocketHook* hook = nullptr;
72 AdminSocket* admin_socket = bluefs->cct->get_admin_socket();
73 if (admin_socket) {
74 hook = new BlueFS::SocketHook(bluefs);
75 int r = admin_socket->register_command("bluestore bluefs device info "
76 "name=alloc_size,type=CephInt,req=false",
77 hook,
78 "Shows space report for bluefs devices. "
79 "This also includes an estimation for space "
80 "available to bluefs at main device. "
81 "alloc_size, if set, specifies the custom bluefs "
82 "allocation unit size for the estimation above.");
83 if (r != 0) {
84 ldout(bluefs->cct, 1) << __func__ << " cannot register SocketHook" << dendl;
85 delete hook;
86 hook = nullptr;
87 } else {
88 r = admin_socket->register_command("bluefs stats",
89 hook,
90 "Dump internal statistics for bluefs."
91 "");
92 ceph_assert(r == 0);
93 r = admin_socket->register_command("bluefs files list", hook,
94 "print files in bluefs");
95 ceph_assert(r == 0);
96 r = admin_socket->register_command("bluefs debug_inject_read_zeros", hook,
97 "Injects 8K zeros into next BlueFS read. Debug only.");
98 ceph_assert(r == 0);
99 }
100 }
101 return hook;
102 }
103
104 ~SocketHook() {
105 AdminSocket* admin_socket = bluefs->cct->get_admin_socket();
106 admin_socket->unregister_commands(this);
107 }
108 private:
109 SocketHook(BlueFS* bluefs) :
110 bluefs(bluefs) {}
111 int call(std::string_view command, const cmdmap_t& cmdmap,
112 const bufferlist&,
113 Formatter *f,
114 std::ostream& errss,
115 bufferlist& out) override {
116 if (command == "bluestore bluefs device info") {
117 int64_t alloc_size = 0;
118 cmd_getval(cmdmap, "alloc_size", alloc_size);
119 if ((alloc_size & (alloc_size - 1)) != 0) {
120           errss << "Invalid allocation size: '" << alloc_size << "'" << std::endl;
121 return -EINVAL;
122 }
123 if (alloc_size == 0)
124 alloc_size = bluefs->cct->_conf->bluefs_shared_alloc_size;
125 f->open_object_section("bluefs_device_info");
126 for (unsigned dev = BDEV_WAL; dev <= BDEV_SLOW; dev++) {
127 if (bluefs->bdev[dev]) {
128 f->open_object_section("dev");
129 f->dump_string("device", bluefs->get_device_name(dev));
130 ceph_assert(bluefs->alloc[dev]);
131 auto total = bluefs->get_total(dev);
132 auto free = bluefs->get_free(dev);
133 auto used = bluefs->get_used(dev);
134
135 f->dump_int("total", total);
136 f->dump_int("free", free);
137 f->dump_int("bluefs_used", used);
138 if (bluefs->is_shared_alloc(dev)) {
139 size_t avail = bluefs->probe_alloc_avail(dev, alloc_size);
140 f->dump_int("bluefs max available", avail);
141 }
142 f->close_section();
143 }
144 }
145
146 f->close_section();
147 } else if (command == "bluefs stats") {
148 std::stringstream ss;
149 bluefs->dump_block_extents(ss);
150 bluefs->dump_volume_selector(ss);
151 out.append(ss);
152 } else if (command == "bluefs files list") {
153 const char* devnames[3] = {"wal","db","slow"};
154 std::lock_guard l(bluefs->nodes.lock);
155 f->open_array_section("files");
156 for (auto &d : bluefs->nodes.dir_map) {
157 std::string dir = d.first;
158 for (auto &r : d.second->file_map) {
159 f->open_object_section("file");
160 f->dump_string("name", (dir + "/" + r.first).c_str());
161 std::vector<size_t> sizes;
162 sizes.resize(bluefs->bdev.size());
163 for(auto& i : r.second->fnode.extents) {
164 sizes[i.bdev] += i.length;
165 }
166 for (size_t i = 0; i < sizes.size(); i++) {
167 if (sizes[i]>0) {
168 if (i < sizeof(devnames) / sizeof(*devnames))
169 f->dump_int(devnames[i], sizes[i]);
170 else
171 f->dump_int(("dev-"+to_string(i)).c_str(), sizes[i]);
172 }
173 }
174 f->close_section();
175 }
176 }
177 f->close_section();
178 f->flush(out);
179 } else if (command == "bluefs debug_inject_read_zeros") {
180 bluefs->inject_read_zeros++;
181 } else {
182 errss << "Invalid command" << std::endl;
183 return -ENOSYS;
184 }
185 return 0;
186 }
187 };
188
189 BlueFS::BlueFS(CephContext* cct)
190 : cct(cct),
191 bdev(MAX_BDEV),
192 ioc(MAX_BDEV),
193 block_reserved(MAX_BDEV),
194 alloc(MAX_BDEV),
195 alloc_size(MAX_BDEV, 0)
196 {
197 dirty.pending_release.resize(MAX_BDEV);
198 discard_cb[BDEV_WAL] = wal_discard_cb;
199 discard_cb[BDEV_DB] = db_discard_cb;
200 discard_cb[BDEV_SLOW] = slow_discard_cb;
201 asok_hook = SocketHook::create(this);
202 }
203
204 BlueFS::~BlueFS()
205 {
206 delete asok_hook;
207 for (auto p : ioc) {
208 if (p)
209 p->aio_wait();
210 }
211 for (auto p : bdev) {
212 if (p) {
213 p->close();
214 delete p;
215 }
216 }
217 for (auto p : ioc) {
218 delete p;
219 }
220 }
221
222 void BlueFS::_init_logger()
223 {
224 PerfCountersBuilder b(cct, "bluefs",
225 l_bluefs_first, l_bluefs_last);
226 b.add_u64(l_bluefs_db_total_bytes, "db_total_bytes",
227 "Total bytes (main db device)",
228 "b", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
229 b.add_u64(l_bluefs_db_used_bytes, "db_used_bytes",
230 "Used bytes (main db device)",
231 "u", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
232 b.add_u64(l_bluefs_wal_total_bytes, "wal_total_bytes",
233 "Total bytes (wal device)",
234 "walb", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
235 b.add_u64(l_bluefs_wal_used_bytes, "wal_used_bytes",
236 "Used bytes (wal device)",
237 "walu", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
238 b.add_u64(l_bluefs_slow_total_bytes, "slow_total_bytes",
239 "Total bytes (slow device)",
240 "slob", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
241 b.add_u64(l_bluefs_slow_used_bytes, "slow_used_bytes",
242 "Used bytes (slow device)",
243 "slou", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
244 b.add_u64(l_bluefs_num_files, "num_files", "File count",
245 "f", PerfCountersBuilder::PRIO_USEFUL);
246 b.add_u64(l_bluefs_log_bytes, "log_bytes", "Size of the metadata log",
247 "jlen", PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));
248 b.add_u64_counter(l_bluefs_log_compactions, "log_compactions",
249 "Compactions of the metadata log");
250 b.add_u64_counter(l_bluefs_log_write_count, "log_write_count",
251 "Write op count to the metadata log");
252 b.add_u64_counter(l_bluefs_logged_bytes, "logged_bytes",
253 "Bytes written to the metadata log",
254 "j",
255 PerfCountersBuilder::PRIO_CRITICAL, unit_t(UNIT_BYTES));
256 b.add_u64_counter(l_bluefs_files_written_wal, "files_written_wal",
257 "Files written to WAL");
258 b.add_u64_counter(l_bluefs_files_written_sst, "files_written_sst",
259 "Files written to SSTs");
260 b.add_u64_counter(l_bluefs_write_count_wal, "write_count_wal",
261 "Write op count to WAL");
262 b.add_u64_counter(l_bluefs_write_count_sst, "write_count_sst",
263 "Write op count to SSTs");
264 b.add_u64_counter(l_bluefs_bytes_written_wal, "bytes_written_wal",
265 "Bytes written to WAL",
266 "walb",
267                     PerfCountersBuilder::PRIO_CRITICAL, unit_t(UNIT_BYTES));
268 b.add_u64_counter(l_bluefs_bytes_written_sst, "bytes_written_sst",
269 "Bytes written to SSTs",
270 "sstb",
271 PerfCountersBuilder::PRIO_CRITICAL, unit_t(UNIT_BYTES));
272 b.add_u64_counter(l_bluefs_bytes_written_slow, "bytes_written_slow",
273 "Bytes written to WAL/SSTs at slow device",
274 "slwb",
275 PerfCountersBuilder::PRIO_CRITICAL, unit_t(UNIT_BYTES));
276 b.add_u64_counter(l_bluefs_max_bytes_wal, "max_bytes_wal",
277 "Maximum bytes allocated from WAL",
278 "mxwb",
279 PerfCountersBuilder::PRIO_INTERESTING,
280 unit_t(UNIT_BYTES));
281 b.add_u64_counter(l_bluefs_max_bytes_db, "max_bytes_db",
282 "Maximum bytes allocated from DB",
283 "mxdb",
284 PerfCountersBuilder::PRIO_INTERESTING,
285 unit_t(UNIT_BYTES));
286 b.add_u64_counter(l_bluefs_max_bytes_slow, "max_bytes_slow",
287 "Maximum bytes allocated from SLOW",
288 "mxwb",
289 PerfCountersBuilder::PRIO_INTERESTING,
290 unit_t(UNIT_BYTES));
291 b.add_u64_counter(l_bluefs_main_alloc_unit, "alloc_unit_main",
292 "Allocation unit size (in bytes) for primary/shared device",
293 "aumb",
294 PerfCountersBuilder::PRIO_CRITICAL,
295 unit_t(UNIT_BYTES));
296 b.add_u64_counter(l_bluefs_db_alloc_unit, "alloc_unit_db",
297 "Allocation unit size (in bytes) for standalone DB device",
298 "audb",
299 PerfCountersBuilder::PRIO_CRITICAL,
300 unit_t(UNIT_BYTES));
301 b.add_u64_counter(l_bluefs_wal_alloc_unit, "alloc_unit_wal",
302 "Allocation unit size (in bytes) for standalone WAL device",
303 "auwb",
304 PerfCountersBuilder::PRIO_CRITICAL,
305 unit_t(UNIT_BYTES));
306 b.add_u64_counter(l_bluefs_read_random_count, "read_random_count",
307 "random read requests processed",
308 NULL,
309 PerfCountersBuilder::PRIO_USEFUL);
310 b.add_u64_counter(l_bluefs_read_random_bytes, "read_random_bytes",
311 "Bytes requested in random read mode",
312 NULL,
313 PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
314 b.add_u64_counter(l_bluefs_read_random_disk_count, "read_random_disk_count",
315 "random reads requests going to disk",
316 NULL,
317 PerfCountersBuilder::PRIO_USEFUL);
318 b.add_u64_counter(l_bluefs_read_random_disk_bytes, "read_random_disk_bytes",
319 "Bytes read from disk in random read mode",
320 "rrb",
321 PerfCountersBuilder::PRIO_INTERESTING,
322 unit_t(UNIT_BYTES));
323 b.add_u64_counter(l_bluefs_read_random_disk_bytes_wal, "read_random_disk_bytes_wal",
324 "random reads requests going to WAL disk",
325 NULL,
326 PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
327 b.add_u64_counter(l_bluefs_read_random_disk_bytes_db, "read_random_disk_bytes_db",
328 "random reads requests going to DB disk",
329 NULL,
330 PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
331 b.add_u64_counter(l_bluefs_read_random_disk_bytes_slow, "read_random_disk_bytes_slow",
332 "random reads requests going to main disk",
333 "rrsb",
334 PerfCountersBuilder::PRIO_INTERESTING,
335 unit_t(UNIT_BYTES));
336 b.add_u64_counter(l_bluefs_read_random_buffer_count, "read_random_buffer_count",
337 "random read requests processed using prefetch buffer",
338 NULL,
339 PerfCountersBuilder::PRIO_USEFUL);
340 b.add_u64_counter(l_bluefs_read_random_buffer_bytes, "read_random_buffer_bytes",
341 "Bytes read from prefetch buffer in random read mode",
342 NULL,
343 PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
344 b.add_u64_counter(l_bluefs_read_count, "read_count",
345 "buffered read requests processed",
346 NULL,
347 PerfCountersBuilder::PRIO_USEFUL);
348 b.add_u64_counter(l_bluefs_read_bytes, "read_bytes",
349 "Bytes requested in buffered read mode",
350 NULL,
351 PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
352 b.add_u64_counter(l_bluefs_read_disk_count, "read_disk_count",
353 "buffered reads requests going to disk",
354 NULL,
355 PerfCountersBuilder::PRIO_USEFUL);
356 b.add_u64_counter(l_bluefs_read_disk_bytes, "read_disk_bytes",
357 "Bytes read in buffered mode from disk",
358 "rb",
359 PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));
360 b.add_u64_counter(l_bluefs_read_disk_bytes_wal, "read_disk_bytes_wal",
361 "reads requests going to WAL disk",
362 NULL,
363 PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
364 b.add_u64_counter(l_bluefs_read_disk_bytes_db, "read_disk_bytes_db",
365 "reads requests going to DB disk",
366 NULL,
367 PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
368 b.add_u64_counter(l_bluefs_read_disk_bytes_slow, "read_disk_bytes_slow",
369 "reads requests going to main disk",
370 "rsb",
371 PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));
372 b.add_u64_counter(l_bluefs_read_prefetch_count, "read_prefetch_count",
373 "prefetch read requests processed",
374 NULL,
375 PerfCountersBuilder::PRIO_USEFUL);
376 b.add_u64_counter(l_bluefs_read_prefetch_bytes, "read_prefetch_bytes",
377 "Bytes requested in prefetch read mode",
378 NULL,
379 PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
380 b.add_u64_counter(l_bluefs_write_count, "write_count",
381 "Write requests processed");
382 b.add_u64_counter(l_bluefs_write_disk_count, "write_disk_count",
383 "Write requests sent to disk");
384 b.add_u64_counter(l_bluefs_write_bytes, "write_bytes",
385 "Bytes written", NULL,
386 PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
387 b.add_time_avg (l_bluefs_compaction_lat, "compact_lat",
388 "Average bluefs log compaction latency",
389 "c__t",
390 PerfCountersBuilder::PRIO_INTERESTING);
391 b.add_time_avg (l_bluefs_compaction_lock_lat, "compact_lock_lat",
392 "Average lock duration while compacting bluefs log",
393 "c_lt",
394 PerfCountersBuilder::PRIO_INTERESTING);
395 b.add_u64_counter(l_bluefs_alloc_shared_dev_fallbacks, "alloc_slow_fallback",
396 "Amount of allocations that required fallback to "
397 " slow/shared device",
398 "asdf",
399 PerfCountersBuilder::PRIO_USEFUL);
400 b.add_u64_counter(l_bluefs_alloc_shared_size_fallbacks, "alloc_slow_size_fallback",
401 "Amount of allocations that required fallback to shared device's "
402 "regular unit size",
403 "assf",
404 PerfCountersBuilder::PRIO_USEFUL);
405 b.add_u64(l_bluefs_read_zeros_candidate, "read_zeros_candidate",
406 "How many times bluefs read found page with all 0s");
407 b.add_u64(l_bluefs_read_zeros_errors, "read_zeros_errors",
408 "How many times bluefs read found transient page with all 0s");
409
410 logger = b.create_perf_counters();
411 cct->get_perfcounters_collection()->add(logger);
412 }
413
414 void BlueFS::_shutdown_logger()
415 {
416 cct->get_perfcounters_collection()->remove(logger);
417 delete logger;
418 }
419
420 void BlueFS::_update_logger_stats()
421 {
422 if (alloc[BDEV_WAL]) {
423 logger->set(l_bluefs_wal_total_bytes, _get_total(BDEV_WAL));
424 logger->set(l_bluefs_wal_used_bytes, _get_used(BDEV_WAL));
425 }
426 if (alloc[BDEV_DB]) {
427 logger->set(l_bluefs_db_total_bytes, _get_total(BDEV_DB));
428 logger->set(l_bluefs_db_used_bytes, _get_used(BDEV_DB));
429 }
430 if (alloc[BDEV_SLOW]) {
431 logger->set(l_bluefs_slow_total_bytes, _get_total(BDEV_SLOW));
432 logger->set(l_bluefs_slow_used_bytes, _get_used(BDEV_SLOW));
433 }
434 }
435
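// Attach a block device to slot 'id'.  'reserved' bytes at the start of the
// device are kept out of the allocator; when '_shared_alloc' is supplied the
// device shares its allocator with BlueStore instead of owning a private one.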
436 int BlueFS::add_block_device(unsigned id, const string& path, bool trim,
437 uint64_t reserved,
438 bluefs_shared_alloc_context_t* _shared_alloc)
439 {
440   dout(10) << __func__ << " bdev " << id << " path " << path
441            << " reserved " << reserved << dendl;
442 ceph_assert(id < bdev.size());
443 ceph_assert(bdev[id] == NULL);
444 BlockDevice *b = BlockDevice::create(cct, path, NULL, NULL,
445 discard_cb[id], static_cast<void*>(this));
446 block_reserved[id] = reserved;
447 if (_shared_alloc) {
448 b->set_no_exclusive_lock();
449 }
450 int r = b->open(path);
451 if (r < 0) {
452 delete b;
453 return r;
454 }
455 if (trim) {
456 interval_set<uint64_t> whole_device;
457 whole_device.insert(0, b->get_size());
458 b->try_discard(whole_device, false);
459 }
460
461 dout(1) << __func__ << " bdev " << id << " path " << path
462 << " size " << byte_u_t(b->get_size()) << dendl;
463 bdev[id] = b;
464 ioc[id] = new IOContext(cct, NULL);
465 if (_shared_alloc) {
466 ceph_assert(!shared_alloc);
467 shared_alloc = _shared_alloc;
468 alloc[id] = shared_alloc->a;
469 shared_alloc_id = id;
470 }
471 return 0;
472 }
473
474 bool BlueFS::bdev_support_label(unsigned id)
475 {
476 ceph_assert(id < bdev.size());
477 ceph_assert(bdev[id]);
478 return bdev[id]->supported_bdev_label();
479 }
480
481 uint64_t BlueFS::get_block_device_size(unsigned id) const
482 {
483 if (id < bdev.size() && bdev[id])
484 return bdev[id]->get_size();
485 return 0;
486 }
487
488 void BlueFS::handle_discard(unsigned id, interval_set<uint64_t>& to_release)
489 {
490 dout(10) << __func__ << " bdev " << id << dendl;
491 ceph_assert(alloc[id]);
492 alloc[id]->release(to_release);
493 if (is_shared_alloc(id)) {
494 shared_alloc->bluefs_used -= to_release.size();
495 }
496 }
497
498 uint64_t BlueFS::get_used()
499 {
500 uint64_t used = 0;
501 for (unsigned id = 0; id < MAX_BDEV; ++id) {
502 used += _get_used(id);
503 }
504 return used;
505 }
506
507 uint64_t BlueFS::_get_used(unsigned id) const
508 {
509 uint64_t used = 0;
510 if (!alloc[id])
511 return 0;
512
513 if (is_shared_alloc(id)) {
514 used = shared_alloc->bluefs_used;
515 } else {
516 used = _get_total(id) - alloc[id]->get_free();
517 }
518 return used;
519 }
520
521 uint64_t BlueFS::get_used(unsigned id)
522 {
523 ceph_assert(id < alloc.size());
524 ceph_assert(alloc[id]);
525 return _get_used(id);
526 }
527
528 uint64_t BlueFS::_get_total(unsigned id) const
529 {
530 ceph_assert(id < bdev.size());
531 ceph_assert(id < block_reserved.size());
532 return get_block_device_size(id) - block_reserved[id];
533 }
534
535 uint64_t BlueFS::get_total(unsigned id)
536 {
537 return _get_total(id);
538 }
539
540 uint64_t BlueFS::get_free(unsigned id)
541 {
542 ceph_assert(id < alloc.size());
543 return alloc[id]->get_free();
544 }
545
546 void BlueFS::dump_perf_counters(Formatter *f)
547 {
548 f->open_object_section("bluefs_perf_counters");
549 logger->dump_formatted(f, false, false);
550 f->close_section();
551 }
552
553 void BlueFS::dump_block_extents(ostream& out)
554 {
555 for (unsigned i = 0; i < MAX_BDEV; ++i) {
556 if (!bdev[i]) {
557 continue;
558 }
559 auto total = get_total(i);
560 auto free = get_free(i);
561
562 out << i << " : device size 0x" << std::hex << total
563 << " : using 0x" << total - free
564 << std::dec << "(" << byte_u_t(total - free) << ")";
565 out << "\n";
566 }
567 }
568
569 void BlueFS::foreach_block_extents(
570 unsigned id,
571 std::function<void(uint64_t, uint32_t)> fn)
572 {
573 std::lock_guard nl(nodes.lock);
574 dout(10) << __func__ << " bdev " << id << dendl;
575 ceph_assert(id < alloc.size());
576 for (auto& p : nodes.file_map) {
577 for (auto& q : p.second->fnode.extents) {
578 if (q.bdev == id) {
579 fn(q.offset, q.length);
580 }
581 }
582 }
583 }
584
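// Format a fresh BlueFS instance: write the superblock to BDEV_DB and create
// the metadata log (ino 1) seeded with a single op_init transaction.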
585 int BlueFS::mkfs(uuid_d osd_uuid, const bluefs_layout_t& layout)
586 {
587 dout(1) << __func__
588 << " osd_uuid " << osd_uuid
589 << dendl;
590
591 // set volume selector if not provided before/outside
592 if (vselector == nullptr) {
593 vselector.reset(
594 new OriginalVolumeSelector(
595 get_block_device_size(BlueFS::BDEV_WAL) * 95 / 100,
596 get_block_device_size(BlueFS::BDEV_DB) * 95 / 100,
597 get_block_device_size(BlueFS::BDEV_SLOW) * 95 / 100));
598 }
599
600 _init_logger();
601 _init_alloc();
602
603 super.version = 0;
604 super.block_size = bdev[BDEV_DB]->get_block_size();
605 super.osd_uuid = osd_uuid;
606 super.uuid.generate_random();
607 dout(1) << __func__ << " uuid " << super.uuid << dendl;
608
609 // init log
610 FileRef log_file = ceph::make_ref<File>();
611 log_file->fnode.ino = 1;
612 log_file->vselector_hint = vselector->get_hint_for_log();
613 int r = _allocate(
614 vselector->select_prefer_bdev(log_file->vselector_hint),
615 cct->_conf->bluefs_max_log_runway,
616 0,
617 &log_file->fnode);
618 vselector->add_usage(log_file->vselector_hint, log_file->fnode);
619 ceph_assert(r == 0);
620 log.writer = _create_writer(log_file);
621
622 // initial txn
623 ceph_assert(log.seq_live == 1);
624 log.t.seq = 1;
625 log.t.op_init();
626 _flush_and_sync_log_LD();
627
628 // write supers
629 super.log_fnode = log_file->fnode;
630 super.memorized_layout = layout;
631 _write_super(BDEV_DB);
632 _flush_bdev();
633
634 // clean up
635 super = bluefs_super_t();
636 _close_writer(log.writer);
637 log.writer = NULL;
638 vselector.reset(nullptr);
639 _stop_alloc();
640 _shutdown_logger();
641 if (shared_alloc) {
642 ceph_assert(shared_alloc->need_init);
643 shared_alloc->need_init = false;
644 }
645
646 dout(10) << __func__ << " success" << dendl;
647 return 0;
648 }
649
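// Pick per-device allocation units and create allocators.  The WAL device
// uses bluefs_alloc_size; a shared device uses the larger of
// bluefs_shared_alloc_size and the shared allocator's block size.  With a
// separate SLOW device, DB gets bluefs_alloc_size and SLOW the shared size;
// otherwise DB itself is the shared device.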
650 void BlueFS::_init_alloc()
651 {
652 dout(20) << __func__ << dendl;
653
654 size_t wal_alloc_size = 0;
655 if (bdev[BDEV_WAL]) {
656 wal_alloc_size = cct->_conf->bluefs_alloc_size;
657 alloc_size[BDEV_WAL] = wal_alloc_size;
658 }
659 logger->set(l_bluefs_wal_alloc_unit, wal_alloc_size);
660
661
662 uint64_t shared_alloc_size = cct->_conf->bluefs_shared_alloc_size;
663 if (shared_alloc && shared_alloc->a) {
664 uint64_t unit = shared_alloc->a->get_block_size();
665 shared_alloc_size = std::max(
666 unit,
667 shared_alloc_size);
668 ceph_assert(0 == p2phase(shared_alloc_size, unit));
669 }
670 if (bdev[BDEV_SLOW]) {
671 alloc_size[BDEV_DB] = cct->_conf->bluefs_alloc_size;
672 alloc_size[BDEV_SLOW] = shared_alloc_size;
673 } else {
674 alloc_size[BDEV_DB] = shared_alloc_size;
675 alloc_size[BDEV_SLOW] = 0;
676 }
677 logger->set(l_bluefs_db_alloc_unit, alloc_size[BDEV_DB]);
678 logger->set(l_bluefs_main_alloc_unit, alloc_size[BDEV_SLOW]);
679 // new wal and db devices are never shared
680 if (bdev[BDEV_NEWWAL]) {
681 alloc_size[BDEV_NEWWAL] = cct->_conf->bluefs_alloc_size;
682 }
683 if (bdev[BDEV_NEWDB]) {
684 alloc_size[BDEV_NEWDB] = cct->_conf->bluefs_alloc_size;
685 }
686
687 for (unsigned id = 0; id < bdev.size(); ++id) {
688 if (!bdev[id]) {
689 continue;
690 }
691 ceph_assert(bdev[id]->get_size());
692 if (is_shared_alloc(id)) {
693 dout(1) << __func__ << " shared, id " << id << std::hex
694 << ", capacity 0x" << bdev[id]->get_size()
695 << ", block size 0x" << alloc_size[id]
696 << std::dec << dendl;
697 } else {
698 ceph_assert(alloc_size[id]);
699 std::string name = "bluefs-";
700 const char* devnames[] = { "wal","db","slow" };
701 if (id <= BDEV_SLOW)
702 name += devnames[id];
703 else
704 name += to_string(uintptr_t(this));
705 dout(1) << __func__ << " new, id " << id << std::hex
706 << ", allocator name " << name
707 << ", allocator type " << cct->_conf->bluefs_allocator
708 << ", capacity 0x" << bdev[id]->get_size()
709 << ", block size 0x" << alloc_size[id]
710 << std::dec << dendl;
711 alloc[id] = Allocator::create(cct, cct->_conf->bluefs_allocator,
712 bdev[id]->get_size(),
713 alloc_size[id],
714 0, 0,
715 name);
716 alloc[id]->init_add_free(
717 block_reserved[id],
718 _get_total(id));
719 }
720 }
721 }
722
723 void BlueFS::_stop_alloc()
724 {
725 dout(20) << __func__ << dendl;
726 for (auto p : bdev) {
727 if (p)
728 p->discard_drain();
729 }
730
731 for (size_t i = 0; i < alloc.size(); ++i) {
732 if (alloc[i] && !is_shared_alloc(i)) {
733 alloc[i]->shutdown();
734 delete alloc[i];
735 alloc[i] = nullptr;
736 }
737 }
738 }
739
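// Read 'len' bytes at 'off' and guard against transient all-zero reads: if
// any block-aligned, block-sized chunk of the result is entirely zero,
// re-read the range and compare, bumping read_zeros_candidate and (on a
// mismatch) read_zeros_errors.  The re-read data is what gets returned.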
740 int BlueFS::_read_and_check(uint8_t ndev, uint64_t off, uint64_t len,
741 ceph::buffer::list *pbl, IOContext *ioc, bool buffered)
742 {
743 dout(10) << __func__ << " dev " << int(ndev)
744 << ": 0x" << std::hex << off << "~" << len << std::dec
745 << (buffered ? " buffered" : "")
746 << dendl;
747 int r;
748 bufferlist bl;
749 r = _bdev_read(ndev, off, len, &bl, ioc, buffered);
750 if (r != 0) {
751 return r;
752 }
753 uint64_t block_size = bdev[ndev]->get_block_size();
754 if (inject_read_zeros) {
755 if (len >= block_size * 2) {
756 derr << __func__ << " injecting error, zeros at "
757 << int(ndev) << ": 0x" << std::hex << (off + len / 2)
758 << "~" << (block_size * 2) << std::dec << dendl;
759       // keep the head, replace 8K (two blocks) in the middle with zeros, keep the tail
760 bufferlist temp;
761 bl.splice(0, len / 2 - block_size, &temp);
762 temp.append(buffer::create(block_size * 2, 0));
763 bl.splice(block_size * 2, len / 2 - block_size, &temp);
764 bl = temp;
765 inject_read_zeros--;
766 }
767 }
768   // check whether any block-sized, block-aligned chunk is all zeros
769 uint64_t to_check_len = len;
770 uint64_t skip = p2nphase(off, block_size);
771 if (skip >= to_check_len) {
772 return r;
773 }
774 auto it = bl.begin(skip);
775 to_check_len -= skip;
776 bool all_zeros = false;
777 while (all_zeros == false && to_check_len >= block_size) {
778 // checking 0s step
779 unsigned block_left = block_size;
780 unsigned avail;
781 const char* data;
782 all_zeros = true;
783 while (all_zeros && block_left > 0) {
784 avail = it.get_ptr_and_advance(block_left, &data);
785 block_left -= avail;
786 all_zeros = mem_is_zero(data, avail);
787 }
788 // skipping step
789 while (block_left > 0) {
790 avail = it.get_ptr_and_advance(block_left, &data);
791 block_left -= avail;
792 }
793 to_check_len -= block_size;
794 }
795 if (all_zeros) {
796 logger->inc(l_bluefs_read_zeros_candidate, 1);
797 bufferlist bl_reread;
798 r = _bdev_read(ndev, off, len, &bl_reread, ioc, buffered);
799 if (r != 0) {
800 return r;
801 }
802     // check whether both reads returned the same data
803 if (!bl.contents_equal(bl_reread)) {
804 // report problems to log, but continue, maybe it will be good now...
805 derr << __func__ << " initial read of " << int(ndev)
806 << ": 0x" << std::hex << off << "~" << len
807 << std::dec << ": different then re-read " << dendl;
808 logger->inc(l_bluefs_read_zeros_errors, 1);
809 }
810     // prefer the second read if the two differ
811 pbl->append(bl_reread);
812 } else {
813 pbl->append(bl);
814 }
815 return r;
816 }
817
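// Same all-zero detection as _read_and_check(), but for the random-read path
// that fills a caller-provided buffer.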
818 int BlueFS::_read_random_and_check(
819 uint8_t ndev, uint64_t off, uint64_t len, char *buf, bool buffered)
820 {
821 dout(10) << __func__ << " dev " << int(ndev)
822 << ": 0x" << std::hex << off << "~" << len << std::dec
823 << (buffered ? " buffered" : "")
824 << dendl;
825 int r;
826 r = _bdev_read_random(ndev, off, len, buf, buffered);
827 if (r != 0) {
828 return r;
829 }
830 uint64_t block_size = bdev[ndev]->get_block_size();
831 if (inject_read_zeros) {
832 if (len >= block_size * 2) {
833 derr << __func__ << " injecting error, zeros at "
834 << int(ndev) << ": 0x" << std::hex << (off + len / 2)
835 << "~" << (block_size * 2) << std::dec << dendl;
836 //zero middle 8K
837 memset(buf + len / 2 - block_size, 0, block_size * 2);
838 inject_read_zeros--;
839 }
840 }
841   // check whether any block-sized, block-aligned chunk is all zeros
842 uint64_t to_check_len = len;
843 const char* data = buf;
844 uint64_t skip = p2nphase(off, block_size);
845 if (skip >= to_check_len) {
846 return r;
847 }
848 to_check_len -= skip;
849 data += skip;
850
851 bool all_zeros = false;
852 while (all_zeros == false && to_check_len >= block_size) {
853 if (mem_is_zero(data, block_size)) {
854 // at least one block is all zeros
855 all_zeros = true;
856 break;
857 }
858 data += block_size;
859 to_check_len -= block_size;
860 }
861 if (all_zeros) {
862 logger->inc(l_bluefs_read_zeros_candidate, 1);
863 std::unique_ptr<char[]> data_reread(new char[len]);
864 r = _bdev_read_random(ndev, off, len, &data_reread[0], buffered);
865 if (r != 0) {
866 return r;
867 }
868     // check whether both reads returned the same data
869 if (memcmp(buf, &data_reread[0], len) != 0) {
870 derr << __func__ << " initial read of " << int(ndev)
871 << ": 0x" << std::hex << off << "~" << len
872 << std::dec << ": different then re-read " << dendl;
873 logger->inc(l_bluefs_read_zeros_errors, 1);
874 // second read is probably better
875 memcpy(buf, &data_reread[0], len);
876 }
877 }
878 return r;
879 }
880
881 int BlueFS::_bdev_read(uint8_t ndev, uint64_t off, uint64_t len,
882 ceph::buffer::list* pbl, IOContext* ioc, bool buffered)
883 {
884 int cnt = 0;
885 switch (ndev) {
886 case BDEV_WAL: cnt = l_bluefs_read_disk_bytes_wal; break;
887 case BDEV_DB: cnt = l_bluefs_read_disk_bytes_db; break;
888 case BDEV_SLOW: cnt = l_bluefs_read_disk_bytes_slow; break;
889
890 }
891 if (cnt) {
892 logger->inc(cnt, len);
893 }
894 return bdev[ndev]->read(off, len, pbl, ioc, buffered);
895 }
896
897 int BlueFS::_bdev_read_random(uint8_t ndev, uint64_t off, uint64_t len,
898 char* buf, bool buffered)
899 {
900 int cnt = 0;
901 switch (ndev) {
902 case BDEV_WAL: cnt = l_bluefs_read_random_disk_bytes_wal; break;
903 case BDEV_DB: cnt = l_bluefs_read_random_disk_bytes_db; break;
904 case BDEV_SLOW: cnt = l_bluefs_read_random_disk_bytes_slow; break;
905 }
906 if (cnt) {
907 logger->inc(cnt, len);
908 }
909 return bdev[ndev]->read_random(off, len, buf, buffered);
910 }
911
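// Mount sequence: open and verify the superblock, install the default volume
// selector if none was provided, create allocators, replay the metadata log,
// mark every replayed extent as used, then reopen the log writer positioned
// at the end of the log file.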
912 int BlueFS::mount()
913 {
914 dout(1) << __func__ << dendl;
915
916 _init_logger();
917 int r = _open_super();
918 if (r < 0) {
919 derr << __func__ << " failed to open super: " << cpp_strerror(r) << dendl;
920 goto out;
921 }
922
923 // set volume selector if not provided before/outside
924 if (vselector == nullptr) {
925 vselector.reset(
926 new OriginalVolumeSelector(
927 get_block_device_size(BlueFS::BDEV_WAL) * 95 / 100,
928 get_block_device_size(BlueFS::BDEV_DB) * 95 / 100,
929 get_block_device_size(BlueFS::BDEV_SLOW) * 95 / 100));
930 }
931
932 _init_alloc();
933
934 r = _replay(false, false);
935 if (r < 0) {
936 derr << __func__ << " failed to replay log: " << cpp_strerror(r) << dendl;
937 _stop_alloc();
938 goto out;
939 }
940
941 // init freelist
942 for (auto& p : nodes.file_map) {
943 dout(30) << __func__ << " noting alloc for " << p.second->fnode << dendl;
944 for (auto& q : p.second->fnode.extents) {
945 bool is_shared = is_shared_alloc(q.bdev);
946 ceph_assert(!is_shared || (is_shared && shared_alloc));
947 if (is_shared && shared_alloc->need_init && shared_alloc->a) {
948 shared_alloc->bluefs_used += q.length;
949 alloc[q.bdev]->init_rm_free(q.offset, q.length);
950 } else if (!is_shared) {
951 alloc[q.bdev]->init_rm_free(q.offset, q.length);
952 }
953 }
954 }
955 if (shared_alloc) {
956 shared_alloc->need_init = false;
957 dout(1) << __func__ << " shared_bdev_used = "
958 << shared_alloc->bluefs_used << dendl;
959 } else {
960 dout(1) << __func__ << " shared bdev not used"
961 << dendl;
962 }
963
964 // set up the log for future writes
965 log.writer = _create_writer(_get_file(1));
966 ceph_assert(log.writer->file->fnode.ino == 1);
967 log.writer->pos = log.writer->file->fnode.size;
968 log.writer->file->fnode.reset_delta();
969 dout(10) << __func__ << " log write pos set to 0x"
970 << std::hex << log.writer->pos << std::dec
971 << dendl;
972 // update log size
973 logger->set(l_bluefs_log_bytes, log.writer->file->fnode.size);
974 return 0;
975
976 out:
977 super = bluefs_super_t();
978 return r;
979 }
980
981 int BlueFS::maybe_verify_layout(const bluefs_layout_t& layout) const
982 {
983 if (super.memorized_layout) {
984 if (layout == *super.memorized_layout) {
985 dout(10) << __func__ << " bluefs layout verified positively" << dendl;
986 } else {
987 derr << __func__ << " memorized layout doesn't fit current one" << dendl;
988 return -EIO;
989 }
990 } else {
991 dout(10) << __func__ << " no memorized_layout in bluefs superblock"
992 << dendl;
993 }
994
995 return 0;
996 }
997
998 void BlueFS::umount(bool avoid_compact)
999 {
1000 dout(1) << __func__ << dendl;
1001
1002 sync_metadata(avoid_compact);
1003 if (cct->_conf->bluefs_check_volume_selector_on_umount) {
1004 _check_vselector_LNF();
1005 }
1006 _close_writer(log.writer);
1007 log.writer = NULL;
1008 log.t.clear();
1009
1010 vselector.reset(nullptr);
1011 _stop_alloc();
1012 nodes.file_map.clear();
1013 nodes.dir_map.clear();
1014 super = bluefs_super_t();
1015 _shutdown_logger();
1016 }
1017
1018 int BlueFS::prepare_new_device(int id, const bluefs_layout_t& layout)
1019 {
1020 dout(1) << __func__ << dendl;
1021
1022 if(id == BDEV_NEWDB) {
1023 int new_log_dev_cur = BDEV_WAL;
1024 int new_log_dev_next = BDEV_WAL;
1025 if (!bdev[BDEV_WAL]) {
1026 new_log_dev_cur = BDEV_NEWDB;
1027 new_log_dev_next = BDEV_DB;
1028 }
1029 _rewrite_log_and_layout_sync_LNF_LD(false,
1030 BDEV_NEWDB,
1031 new_log_dev_cur,
1032 new_log_dev_next,
1033 RENAME_DB2SLOW,
1034 layout);
1035 } else if(id == BDEV_NEWWAL) {
1036 _rewrite_log_and_layout_sync_LNF_LD(false,
1037 BDEV_DB,
1038 BDEV_NEWWAL,
1039 BDEV_WAL,
1040 REMOVE_WAL,
1041 layout);
1042 } else {
1043 assert(false);
1044 }
1045 return 0;
1046 }
1047
1048 void BlueFS::collect_metadata(map<string,string> *pm, unsigned skip_bdev_id)
1049 {
1050 if (skip_bdev_id != BDEV_DB && bdev[BDEV_DB])
1051 bdev[BDEV_DB]->collect_metadata("bluefs_db_", pm);
1052 if (bdev[BDEV_WAL])
1053 bdev[BDEV_WAL]->collect_metadata("bluefs_wal_", pm);
1054 }
1055
1056 void BlueFS::get_devices(set<string> *ls)
1057 {
1058 for (unsigned i = 0; i < MAX_BDEV; ++i) {
1059 if (bdev[i]) {
1060 bdev[i]->get_devices(ls);
1061 }
1062 }
1063 }
1064
1065 int BlueFS::fsck()
1066 {
1067 dout(1) << __func__ << dendl;
1068 // hrm, i think we check everything on mount...
1069 return 0;
1070 }
1071
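// Serialize the superblock with a trailing crc32c and pad it to the fixed
// super length before writing it at the fixed super offset.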
1072 int BlueFS::_write_super(int dev)
1073 {
1074 ++super.version;
1075 // build superblock
1076 bufferlist bl;
1077 encode(super, bl);
1078 uint32_t crc = bl.crc32c(-1);
1079 encode(crc, bl);
1080 dout(10) << __func__ << " super block length(encoded): " << bl.length() << dendl;
1081 dout(10) << __func__ << " superblock " << super.version << dendl;
1082 dout(10) << __func__ << " log_fnode " << super.log_fnode << dendl;
1083 ceph_assert_always(bl.length() <= get_super_length());
1084 bl.append_zero(get_super_length() - bl.length());
1085
1086 bdev[dev]->write(get_super_offset(), bl, false, WRITE_LIFE_SHORT);
1087 dout(20) << __func__ << " v " << super.version
1088 << " crc 0x" << std::hex << crc
1089 << " offset 0x" << get_super_offset() << std::dec
1090 << dendl;
1091 return 0;
1092 }
1093
1094 int BlueFS::_open_super()
1095 {
1096 dout(10) << __func__ << dendl;
1097
1098 bufferlist bl;
1099 uint32_t expected_crc, crc;
1100 int r;
1101
1102 // always the second block
1103 r = _bdev_read(BDEV_DB, get_super_offset(), get_super_length(),
1104 &bl, ioc[BDEV_DB], false);
1105 if (r < 0)
1106 return r;
1107
1108 auto p = bl.cbegin();
1109 decode(super, p);
1110 {
1111 bufferlist t;
1112 t.substr_of(bl, 0, p.get_off());
1113 crc = t.crc32c(-1);
1114 }
1115 decode(expected_crc, p);
1116 if (crc != expected_crc) {
1117 derr << __func__ << " bad crc on superblock, expected 0x"
1118 << std::hex << expected_crc << " != actual 0x" << crc << std::dec
1119 << dendl;
1120 return -EIO;
1121 }
1122 dout(10) << __func__ << " superblock " << super.version << dendl;
1123 dout(10) << __func__ << " log_fnode " << super.log_fnode << dendl;
1124 return 0;
1125 }
1126
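// Replay-time consistency check: verify each extent of 'fnode' is aligned to
// the device allocation unit and that its blocks flip consistently in
// 'used_blocks', catching duplicate references (alloc) and double frees.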
1127 int BlueFS::_check_allocations(const bluefs_fnode_t& fnode,
1128 boost::dynamic_bitset<uint64_t>* used_blocks,
1129 bool is_alloc, //true when allocating, false when deallocating
1130 const char* op_name)
1131 {
1132 auto& fnode_extents = fnode.extents;
1133 for (auto e : fnode_extents) {
1134 auto id = e.bdev;
1135 bool fail = false;
1136 ceph_assert(id < MAX_BDEV);
1137 ceph_assert(bdev[id]);
1138 // let's use minimal allocation unit we can have
1139 auto alloc_unit = bdev[id]->get_block_size();
1140
1141 if (int r = _verify_alloc_granularity(id, e.offset, e.length,
1142 alloc_unit,
1143 op_name); r < 0) {
1144 return r;
1145 }
1146
1147 apply_for_bitset_range(e.offset, e.length, alloc_unit, used_blocks[id],
1148 [&](uint64_t pos, boost::dynamic_bitset<uint64_t> &bs) {
1149 if (is_alloc == bs.test(pos)) {
1150 fail = true;
1151 } else {
1152 bs.flip(pos);
1153 }
1154 }
1155 );
1156 if (fail) {
1157 derr << __func__ << " " << op_name << " invalid extent " << int(e.bdev)
1158 << ": 0x" << std::hex << e.offset << "~" << e.length << std::dec
1159 << (is_alloc == true ?
1160 ": duplicate reference, ino " : ": double free, ino ")
1161 << fnode.ino << dendl;
1162 return -EFAULT;
1163 }
1164 }
1165 return 0;
1166 }
1167
1168 int BlueFS::_verify_alloc_granularity(
1169 __u8 id, uint64_t offset, uint64_t length, uint64_t alloc_unit, const char *op)
1170 {
1171 if ((offset & (alloc_unit - 1)) ||
1172 (length & (alloc_unit - 1))) {
1173 derr << __func__ << " " << op << " of " << (int)id
1174 << ":0x" << std::hex << offset << "~" << length << std::dec
1175 << " does not align to alloc_size 0x"
1176 << std::hex << alloc_unit << std::dec << dendl;
1177 return -EFAULT;
1178 }
1179 return 0;
1180 }
1181
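// Replay the metadata log.  Each iteration reads one block, decodes the
// record header (uuid, seq, length), pulls in any additional blocks of a
// multi-block transaction, and stops cleanly on a uuid/seq mismatch or a
// decode error past the first record.  Decoded ops are then applied to the
// in-memory dir/file maps (unless 'noop', as used by log_dump()).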
1182 int BlueFS::_replay(bool noop, bool to_stdout)
1183 {
1184 dout(10) << __func__ << (noop ? " NO-OP" : "") << dendl;
1185 ino_last = 1; // by the log
1186 uint64_t log_seq = 0;
1187
1188 FileRef log_file;
1189 log_file = _get_file(1);
1190
1191 log_file->fnode = super.log_fnode;
1192 if (!noop) {
1193 log_file->vselector_hint =
1194 vselector->get_hint_for_log();
1195 }
1196 dout(10) << __func__ << " log_fnode " << super.log_fnode << dendl;
1197 if (unlikely(to_stdout)) {
1198 std::cout << " log_fnode " << super.log_fnode << std::endl;
1199 }
1200
1201 FileReader *log_reader = new FileReader(
1202 log_file, cct->_conf->bluefs_max_prefetch,
1203 false, // !random
1204 true); // ignore eof
1205
1206 bool seen_recs = false;
1207
1208 boost::dynamic_bitset<uint64_t> used_blocks[MAX_BDEV];
1209
1210 if (!noop) {
1211 if (cct->_conf->bluefs_log_replay_check_allocations) {
1212 for (size_t i = 0; i < MAX_BDEV; ++i) {
1213 if (bdev[i] != nullptr) {
1214 // let's use minimal allocation unit we can have
1215 auto au = bdev[i]->get_block_size();
1216 //hmm... on 32TB/4K drive this would take 1GB RAM!!!
1217 used_blocks[i].resize(round_up_to(bdev[i]->get_size(), au) / au);
1218 }
1219 }
1220 // check initial log layout
1221 int r = _check_allocations(log_file->fnode,
1222 used_blocks, true, "Log from super");
1223 if (r < 0) {
1224 return r;
1225 }
1226 }
1227 }
1228
1229 while (true) {
1230 ceph_assert((log_reader->buf.pos & ~super.block_mask()) == 0);
1231 uint64_t pos = log_reader->buf.pos;
1232 uint64_t read_pos = pos;
1233 bufferlist bl;
1234 {
1235 int r = _read(log_reader, read_pos, super.block_size,
1236 &bl, NULL);
1237 if (r != (int)super.block_size && cct->_conf->bluefs_replay_recovery) {
1238 r += _do_replay_recovery_read(log_reader, pos, read_pos + r, super.block_size - r, &bl);
1239 }
1240 assert(r == (int)super.block_size);
1241 read_pos += r;
1242 }
1243 uint64_t more = 0;
1244 uint64_t seq;
1245 uuid_d uuid;
1246 {
1247 auto p = bl.cbegin();
1248 __u8 a, b;
1249 uint32_t len;
1250 decode(a, p);
1251 decode(b, p);
1252 decode(len, p);
1253 decode(uuid, p);
1254 decode(seq, p);
1255 if (len + 6 > bl.length()) {
1256 more = round_up_to(len + 6 - bl.length(), super.block_size);
1257 }
1258 }
1259 if (uuid != super.uuid) {
1260 if (seen_recs) {
1261 dout(10) << __func__ << " 0x" << std::hex << pos << std::dec
1262 << ": stop: uuid " << uuid << " != super.uuid " << super.uuid
1263 << dendl;
1264 } else {
1265 derr << __func__ << " 0x" << std::hex << pos << std::dec
1266 << ": stop: uuid " << uuid << " != super.uuid " << super.uuid
1267 << ", block dump: \n";
1268 bufferlist t;
1269 t.substr_of(bl, 0, super.block_size);
1270 t.hexdump(*_dout);
1271 *_dout << dendl;
1272 }
1273 break;
1274 }
1275 if (seq != log_seq + 1) {
1276 if (seen_recs) {
1277 dout(10) << __func__ << " 0x" << std::hex << pos << std::dec
1278 << ": stop: seq " << seq << " != expected " << log_seq + 1
1279 << dendl;;
1280 } else {
1281 derr << __func__ << " 0x" << std::hex << pos << std::dec
1282 << ": stop: seq " << seq << " != expected " << log_seq + 1
1283 << dendl;;
1284 }
1285 break;
1286 }
1287 if (more) {
1288 dout(20) << __func__ << " need 0x" << std::hex << more << std::dec
1289 << " more bytes" << dendl;
1290 bufferlist t;
1291 int r = _read(log_reader, read_pos, more, &t, NULL);
1292 if (r < (int)more) {
1293 dout(10) << __func__ << " 0x" << std::hex << pos
1294 << ": stop: len is 0x" << bl.length() + more << std::dec
1295 << ", which is past eof" << dendl;
1296 if (cct->_conf->bluefs_replay_recovery) {
1297 //try to search for more data
1298 r += _do_replay_recovery_read(log_reader, pos, read_pos + r, more - r, &t);
1299 if (r < (int)more) {
1300 //in normal mode we must read r==more, for recovery it is too strict
1301 break;
1302 }
1303 }
1304 }
1305 ceph_assert(r == (int)more);
1306 bl.claim_append(t);
1307 read_pos += r;
1308 }
1309 bluefs_transaction_t t;
1310 try {
1311 auto p = bl.cbegin();
1312 decode(t, p);
1313 seen_recs = true;
1314 }
1315 catch (ceph::buffer::error& e) {
1316 // Multi-block transactions might be incomplete due to unexpected
1317 // power off. Hence let's treat that as a regular stop condition.
1318 if (seen_recs && more) {
1319 dout(10) << __func__ << " 0x" << std::hex << pos << std::dec
1320 << ": stop: failed to decode: " << e.what()
1321 << dendl;
1322 } else {
1323 derr << __func__ << " 0x" << std::hex << pos << std::dec
1324 << ": stop: failed to decode: " << e.what()
1325 << dendl;
1326 delete log_reader;
1327 return -EIO;
1328 }
1329 break;
1330 }
1331 ceph_assert(seq == t.seq);
1332 dout(10) << __func__ << " 0x" << std::hex << pos << std::dec
1333 << ": " << t << dendl;
1334 if (unlikely(to_stdout)) {
1335 std::cout << " 0x" << std::hex << pos << std::dec
1336 << ": " << t << std::endl;
1337 }
1338
1339 auto p = t.op_bl.cbegin();
1340 auto pos0 = pos;
1341 while (!p.end()) {
1342 pos = pos0 + p.get_off();
1343 __u8 op;
1344 decode(op, p);
1345 switch (op) {
1346
1347 case bluefs_transaction_t::OP_INIT:
1348 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1349 << ": op_init" << dendl;
1350 if (unlikely(to_stdout)) {
1351 std::cout << " 0x" << std::hex << pos << std::dec
1352 << ": op_init" << std::endl;
1353 }
1354
1355 ceph_assert(t.seq == 1);
1356 break;
1357
1358 case bluefs_transaction_t::OP_JUMP:
1359 {
1360 uint64_t next_seq;
1361 uint64_t offset;
1362 decode(next_seq, p);
1363 decode(offset, p);
1364 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1365 << ": op_jump seq " << next_seq
1366 << " offset 0x" << std::hex << offset << std::dec << dendl;
1367 if (unlikely(to_stdout)) {
1368 std::cout << " 0x" << std::hex << pos << std::dec
1369 << ": op_jump seq " << next_seq
1370 << " offset 0x" << std::hex << offset << std::dec
1371 << std::endl;
1372 }
1373
1374 ceph_assert(next_seq > log_seq);
1375 log_seq = next_seq - 1; // we will increment it below
1376 uint64_t skip = offset - read_pos;
1377 if (skip) {
1378 bufferlist junk;
1379 int r = _read(log_reader, read_pos, skip, &junk,
1380 NULL);
1381 if (r != (int)skip) {
1382 dout(10) << __func__ << " 0x" << std::hex << read_pos
1383 << ": stop: failed to skip to " << offset
1384 << std::dec << dendl;
1385 ceph_abort_msg("problem with op_jump");
1386 }
1387 }
1388 }
1389 break;
1390
1391 case bluefs_transaction_t::OP_JUMP_SEQ:
1392 {
1393 uint64_t next_seq;
1394 decode(next_seq, p);
1395 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1396 << ": op_jump_seq " << next_seq << dendl;
1397 if (unlikely(to_stdout)) {
1398 std::cout << " 0x" << std::hex << pos << std::dec
1399 << ": op_jump_seq " << next_seq << std::endl;
1400 }
1401
1402 ceph_assert(next_seq > log_seq);
1403 log_seq = next_seq - 1; // we will increment it below
1404 }
1405 break;
1406
1407 case bluefs_transaction_t::OP_ALLOC_ADD:
1408 // LEGACY, do nothing but read params
1409 {
1410 __u8 id;
1411 uint64_t offset, length;
1412 decode(id, p);
1413 decode(offset, p);
1414 decode(length, p);
1415 }
1416 break;
1417
1418 case bluefs_transaction_t::OP_ALLOC_RM:
1419 // LEGACY, do nothing but read params
1420 {
1421 __u8 id;
1422 uint64_t offset, length;
1423 decode(id, p);
1424 decode(offset, p);
1425 decode(length, p);
1426 }
1427 break;
1428
1429 case bluefs_transaction_t::OP_DIR_LINK:
1430 {
1431 string dirname, filename;
1432 uint64_t ino;
1433 decode(dirname, p);
1434 decode(filename, p);
1435 decode(ino, p);
1436 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1437 << ": op_dir_link " << " " << dirname << "/" << filename
1438 << " to " << ino
1439 << dendl;
1440 if (unlikely(to_stdout)) {
1441 std::cout << " 0x" << std::hex << pos << std::dec
1442 << ": op_dir_link " << " " << dirname << "/" << filename
1443 << " to " << ino
1444 << std::endl;
1445 }
1446
1447 if (!noop) {
1448 FileRef file = _get_file(ino);
1449 ceph_assert(file->fnode.ino);
1450 map<string,DirRef>::iterator q = nodes.dir_map.find(dirname);
1451 ceph_assert(q != nodes.dir_map.end());
1452 map<string,FileRef>::iterator r = q->second->file_map.find(filename);
1453 ceph_assert(r == q->second->file_map.end());
1454
1455 vselector->sub_usage(file->vselector_hint, file->fnode);
1456 file->vselector_hint =
1457 vselector->get_hint_by_dir(dirname);
1458 vselector->add_usage(file->vselector_hint, file->fnode);
1459
1460 q->second->file_map[filename] = file;
1461 ++file->refs;
1462 }
1463 }
1464 break;
1465
1466 case bluefs_transaction_t::OP_DIR_UNLINK:
1467 {
1468 string dirname, filename;
1469 decode(dirname, p);
1470 decode(filename, p);
1471 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1472 << ": op_dir_unlink " << " " << dirname << "/" << filename
1473 << dendl;
1474 if (unlikely(to_stdout)) {
1475 std::cout << " 0x" << std::hex << pos << std::dec
1476 << ": op_dir_unlink " << " " << dirname << "/" << filename
1477 << std::endl;
1478 }
1479
1480 if (!noop) {
1481 map<string,DirRef>::iterator q = nodes.dir_map.find(dirname);
1482 ceph_assert(q != nodes.dir_map.end());
1483 map<string,FileRef>::iterator r = q->second->file_map.find(filename);
1484 ceph_assert(r != q->second->file_map.end());
1485 ceph_assert(r->second->refs > 0);
1486 --r->second->refs;
1487 q->second->file_map.erase(r);
1488 }
1489 }
1490 break;
1491
1492 case bluefs_transaction_t::OP_DIR_CREATE:
1493 {
1494 string dirname;
1495 decode(dirname, p);
1496 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1497 << ": op_dir_create " << dirname << dendl;
1498 if (unlikely(to_stdout)) {
1499 std::cout << " 0x" << std::hex << pos << std::dec
1500 << ": op_dir_create " << dirname << std::endl;
1501 }
1502
1503 if (!noop) {
1504 map<string,DirRef>::iterator q = nodes.dir_map.find(dirname);
1505 ceph_assert(q == nodes.dir_map.end());
1506 nodes.dir_map[dirname] = ceph::make_ref<Dir>();
1507 }
1508 }
1509 break;
1510
1511 case bluefs_transaction_t::OP_DIR_REMOVE:
1512 {
1513 string dirname;
1514 decode(dirname, p);
1515 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1516 << ": op_dir_remove " << dirname << dendl;
1517 if (unlikely(to_stdout)) {
1518 std::cout << " 0x" << std::hex << pos << std::dec
1519 << ": op_dir_remove " << dirname << std::endl;
1520 }
1521
1522 if (!noop) {
1523 map<string,DirRef>::iterator q = nodes.dir_map.find(dirname);
1524 ceph_assert(q != nodes.dir_map.end());
1525 ceph_assert(q->second->file_map.empty());
1526 nodes.dir_map.erase(q);
1527 }
1528 }
1529 break;
1530
1531 case bluefs_transaction_t::OP_FILE_UPDATE:
1532 {
1533 bluefs_fnode_t fnode;
1534 decode(fnode, p);
1535 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1536 << ": op_file_update " << " " << fnode << " " << dendl;
1537 if (unlikely(to_stdout)) {
1538 std::cout << " 0x" << std::hex << pos << std::dec
1539 << ": op_file_update " << " " << fnode << std::endl;
1540 }
1541 if (!noop) {
1542 FileRef f = _get_file(fnode.ino);
1543 if (cct->_conf->bluefs_log_replay_check_allocations) {
1544 int r = _check_allocations(f->fnode,
1545 used_blocks, false, "OP_FILE_UPDATE");
1546 if (r < 0) {
1547 return r;
1548 }
1549 }
1550 if (fnode.ino != 1) {
1551 vselector->sub_usage(f->vselector_hint, f->fnode);
1552 }
1553 f->fnode = fnode;
1554 if (fnode.ino != 1) {
1555 vselector->add_usage(f->vselector_hint, f->fnode);
1556 }
1557
1558 if (fnode.ino > ino_last) {
1559 ino_last = fnode.ino;
1560 }
1561 if (cct->_conf->bluefs_log_replay_check_allocations) {
1562 int r = _check_allocations(f->fnode,
1563 used_blocks, true, "OP_FILE_UPDATE");
1564 if (r < 0) {
1565 return r;
1566 }
1567 }
1568 } else if (noop && fnode.ino == 1) {
1569 FileRef f = _get_file(fnode.ino);
1570 f->fnode = fnode;
1571 }
1572 }
1573 break;
1574 case bluefs_transaction_t::OP_FILE_UPDATE_INC:
1575 {
1576 bluefs_fnode_delta_t delta;
1577 decode(delta, p);
1578 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1579 << ": op_file_update_inc " << " " << delta << " " << dendl;
1580 if (unlikely(to_stdout)) {
1581 std::cout << " 0x" << std::hex << pos << std::dec
1582 << ": op_file_update_inc " << " " << delta << std::endl;
1583 }
1584 if (!noop) {
1585 FileRef f = _get_file(delta.ino);
1586 bluefs_fnode_t& fnode = f->fnode;
1587 if (delta.offset != fnode.allocated) {
1588 derr << __func__ << " invalid op_file_update_inc, new extents miss end of file"
1589 << " fnode=" << fnode
1590 << " delta=" << delta
1591 << dendl;
1592 ceph_assert(delta.offset == fnode.allocated);
1593 }
1594 if (cct->_conf->bluefs_log_replay_check_allocations) {
1595 int r = _check_allocations(fnode,
1596 used_blocks, false, "OP_FILE_UPDATE_INC");
1597 if (r < 0) {
1598 return r;
1599 }
1600 }
1601
1602 fnode.ino = delta.ino;
1603 fnode.mtime = delta.mtime;
1604 if (fnode.ino != 1) {
1605 vselector->sub_usage(f->vselector_hint, fnode);
1606 }
1607 fnode.size = delta.size;
1608 fnode.claim_extents(delta.extents);
1609 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1610 << ": op_file_update_inc produced " << " " << fnode << " " << dendl;
1611
1612 if (fnode.ino != 1) {
1613 vselector->add_usage(f->vselector_hint, fnode);
1614 }
1615
1616 if (fnode.ino > ino_last) {
1617 ino_last = fnode.ino;
1618 }
1619 if (cct->_conf->bluefs_log_replay_check_allocations) {
1620 int r = _check_allocations(f->fnode,
1621 used_blocks, true, "OP_FILE_UPDATE_INC");
1622 if (r < 0) {
1623 return r;
1624 }
1625 }
1626 } else if (noop && delta.ino == 1) {
1627 // we need to track bluefs log, even in noop mode
1628 FileRef f = _get_file(1);
1629 bluefs_fnode_t& fnode = f->fnode;
1630 fnode.ino = delta.ino;
1631 fnode.mtime = delta.mtime;
1632 fnode.size = delta.size;
1633 fnode.claim_extents(delta.extents);
1634 }
1635 }
1636 break;
1637
1638 case bluefs_transaction_t::OP_FILE_REMOVE:
1639 {
1640 uint64_t ino;
1641 decode(ino, p);
1642 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1643 << ": op_file_remove " << ino << dendl;
1644 if (unlikely(to_stdout)) {
1645 std::cout << " 0x" << std::hex << pos << std::dec
1646 << ": op_file_remove " << ino << std::endl;
1647 }
1648
1649 if (!noop) {
1650 auto p = nodes.file_map.find(ino);
1651 ceph_assert(p != nodes.file_map.end());
1652 vselector->sub_usage(p->second->vselector_hint, p->second->fnode);
1653 if (cct->_conf->bluefs_log_replay_check_allocations) {
1654 int r = _check_allocations(p->second->fnode,
1655 used_blocks, false, "OP_FILE_REMOVE");
1656 if (r < 0) {
1657 return r;
1658 }
1659 }
1660 nodes.file_map.erase(p);
1661 }
1662 }
1663 break;
1664
1665 default:
1666 derr << __func__ << " 0x" << std::hex << pos << std::dec
1667 << ": stop: unrecognized op " << (int)op << dendl;
1668 delete log_reader;
1669 return -EIO;
1670 }
1671 }
1672 ceph_assert(p.end());
1673
1674 // we successfully replayed the transaction; bump the seq and log size
1675 ++log_seq;
1676 log_file->fnode.size = log_reader->buf.pos;
1677 }
1678 if (!noop) {
1679 vselector->add_usage(log_file->vselector_hint, log_file->fnode);
1680 log.seq_live = log_seq + 1;
1681 dirty.seq_live = log_seq + 1;
1682 log.t.seq = log.seq_live;
1683 dirty.seq_stable = log_seq;
1684 }
1685
1686 dout(10) << __func__ << " log file size was 0x"
1687 << std::hex << log_file->fnode.size << std::dec << dendl;
1688 if (unlikely(to_stdout)) {
1689 std::cout << " log file size was 0x"
1690 << std::hex << log_file->fnode.size << std::dec << std::endl;
1691 }
1692
1693 delete log_reader;
1694
1695 if (!noop) {
1696 // verify file link counts are all >0
1697 for (auto& p : nodes.file_map) {
1698 if (p.second->refs == 0 &&
1699 p.second->fnode.ino > 1) {
1700 derr << __func__ << " file with link count 0: " << p.second->fnode
1701 << dendl;
1702 return -EIO;
1703 }
1704 }
1705 }
1706 // reflect file count in logger
1707 logger->set(l_bluefs_num_files, nodes.file_map.size());
1708
1709 dout(10) << __func__ << " done" << dendl;
1710 return 0;
1711 }
1712
1713 int BlueFS::log_dump()
1714 {
1715 // only dump log file's content
1716 ceph_assert(log.writer == nullptr && "cannot log_dump on mounted BlueFS");
1717 _init_logger();
1718 int r = _open_super();
1719 if (r < 0) {
1720 derr << __func__ << " failed to open super: " << cpp_strerror(r) << dendl;
1721 return r;
1722 }
1723 r = _replay(true, true);
1724 if (r < 0) {
1725 derr << __func__ << " failed to replay log: " << cpp_strerror(r) << dendl;
1726 }
1727 _shutdown_logger();
1728 super = bluefs_super_t();
1729 return r;
1730 }
1731
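// Move all file data off 'devs_source' onto the existing device 'dev_target':
// files with extents on a source device are fully read and rewritten to the
// target, others only have their extents relabeled, and the log/layout is
// rewritten to the new scheme at the end.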
1732 int BlueFS::device_migrate_to_existing(
1733 CephContext *cct,
1734 const set<int>& devs_source,
1735 int dev_target,
1736 const bluefs_layout_t& layout)
1737 {
1738 vector<byte> buf;
1739 bool buffered = cct->_conf->bluefs_buffered_io;
1740
1741 dout(10) << __func__ << " devs_source " << devs_source
1742 << " dev_target " << dev_target << dendl;
1743 assert(dev_target < (int)MAX_BDEV);
1744
1745 int flags = 0;
1746 flags |= devs_source.count(BDEV_DB) ?
1747 (REMOVE_DB | RENAME_SLOW2DB) : 0;
1748 flags |= devs_source.count(BDEV_WAL) ? REMOVE_WAL : 0;
1749 int dev_target_new = dev_target;
1750
1751   // A slow device without a separate DB device is addressed via BDEV_DB.
1752   // Hence the renaming.
1753 if ((flags & REMOVE_DB) && dev_target == BDEV_SLOW) {
1754 dev_target_new = BDEV_DB;
1755 dout(0) << __func__ << " super to be written to " << dev_target << dendl;
1756 }
1757
1758 for (auto& [ino, file_ref] : nodes.file_map) {
1759 //do not copy log
1760 if (ino == 1) {
1761 continue;
1762 }
1763 dout(10) << __func__ << " " << ino << " " << file_ref->fnode << dendl;
1764
1765 vselector->sub_usage(file_ref->vselector_hint, file_ref->fnode);
1766
1767 bool rewrite = std::any_of(
1768 file_ref->fnode.extents.begin(),
1769 file_ref->fnode.extents.end(),
1770 [=](auto& ext) {
1771 return ext.bdev != dev_target && devs_source.count(ext.bdev);
1772 });
1773 if (rewrite) {
1774 dout(10) << __func__ << " migrating" << dendl;
1775 bluefs_fnode_t old_fnode;
1776 old_fnode.swap_extents(file_ref->fnode);
1777 auto& old_fnode_extents = old_fnode.extents;
1778 // read entire file
1779 bufferlist bl;
1780 for (const auto &old_ext : old_fnode_extents) {
1781 buf.resize(old_ext.length);
1782 int r = _bdev_read_random(old_ext.bdev,
1783 old_ext.offset,
1784 old_ext.length,
1785 (char*)&buf.at(0),
1786 buffered);
1787 if (r != 0) {
1788 derr << __func__ << " failed to read 0x" << std::hex
1789 << old_ext.offset << "~" << old_ext.length << std::dec
1790 << " from " << (int)dev_target << dendl;
1791 return -EIO;
1792 }
1793 bl.append((char*)&buf[0], old_ext.length);
1794 }
1795
1796 // write entire file
1797 auto l = _allocate(dev_target, bl.length(), 0,
1798 &file_ref->fnode, 0, false);
1799 if (l < 0) {
1800 derr << __func__ << " unable to allocate len 0x" << std::hex
1801 << bl.length() << std::dec << " from " << (int)dev_target
1802 << ": " << cpp_strerror(l) << dendl;
1803 return -ENOSPC;
1804 }
1805
1806 uint64_t off = 0;
1807 for (auto& i : file_ref->fnode.extents) {
1808 bufferlist cur;
1809 uint64_t cur_len = std::min<uint64_t>(i.length, bl.length() - off);
1810 ceph_assert(cur_len > 0);
1811 cur.substr_of(bl, off, cur_len);
1812 int r = bdev[dev_target]->write(i.offset, cur, buffered);
1813 ceph_assert(r == 0);
1814 off += cur_len;
1815 }
1816
1817 // release old extents
1818 for (const auto &old_ext : old_fnode_extents) {
1819 PExtentVector to_release;
1820 to_release.emplace_back(old_ext.offset, old_ext.length);
1821 alloc[old_ext.bdev]->release(to_release);
1822 if (is_shared_alloc(old_ext.bdev)) {
1823 shared_alloc->bluefs_used -= old_ext.length; // bytes released, not extent count
1824 }
1825 }
1826
1827 // update fnode
1828 for (auto& i : file_ref->fnode.extents) {
1829 i.bdev = dev_target_new;
1830 }
1831 } else {
1832 for (auto& ext : file_ref->fnode.extents) {
1833 if (dev_target != dev_target_new && ext.bdev == dev_target) {
1834 dout(20) << __func__ << " " << " ... adjusting extent 0x"
1835 << std::hex << ext.offset << std::dec
1836 << " bdev " << dev_target << " -> " << dev_target_new
1837 << dendl;
1838 ext.bdev = dev_target_new;
1839 }
1840 }
1841 }
1842 vselector->add_usage(file_ref->vselector_hint, file_ref->fnode);
1843 }
1844 // new logging device in the current naming scheme
1845 int new_log_dev_cur = bdev[BDEV_WAL] ?
1846 BDEV_WAL :
1847 bdev[BDEV_DB] ? BDEV_DB : BDEV_SLOW;
1848
1849 // new logging device in new naming scheme
1850 int new_log_dev_next = new_log_dev_cur;
1851
1852 if (devs_source.count(new_log_dev_cur)) {
1853 // SLOW device is addressed via BDEV_DB too hence either WAL or DB
1854 new_log_dev_next = (flags & REMOVE_WAL) || !bdev[BDEV_WAL] ?
1855 BDEV_DB :
1856 BDEV_WAL;
1857
1858 dout(0) << __func__ << " log moved from " << new_log_dev_cur
1859 << " to " << new_log_dev_next << dendl;
1860
1861 new_log_dev_cur =
1862 (flags & REMOVE_DB) && new_log_dev_next == BDEV_DB ?
1863 BDEV_SLOW :
1864 new_log_dev_next;
1865 }
1866
1867 _rewrite_log_and_layout_sync_LNF_LD(
1868 false,
1869 (flags & REMOVE_DB) ? BDEV_SLOW : BDEV_DB,
1870 new_log_dev_cur,
1871 new_log_dev_next,
1872 flags,
1873 layout);
1874 return 0;
1875 }
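
// A file needs rewriting iff at least one of its extents lives on a source
// device and is not already on the target. A minimal standalone sketch of
// that predicate (hypothetical ext_t type, not the BlueFS API):
//
//   #include <algorithm>
//   #include <set>
//   #include <vector>
//
//   struct ext_t { int bdev; };
//
//   bool needs_rewrite(const std::vector<ext_t>& extents,
//                      const std::set<int>& devs_source,
//                      int dev_target)
//   {
//     return std::any_of(extents.begin(), extents.end(),
//                        [&](const ext_t& e) {
//                          return e.bdev != dev_target &&
//                                 devs_source.count(e.bdev);
//                        });
//   }
//
// Extents already on dev_target stay in place; at most their bdev id is
// renamed when the naming scheme changes (e.g. BDEV_SLOW -> BDEV_DB).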
1876
1877 int BlueFS::device_migrate_to_new(
1878 CephContext *cct,
1879 const set<int>& devs_source,
1880 int dev_target,
1881 const bluefs_layout_t& layout)
1882 {
1883 vector<byte> buf;
1884 bool buffered = cct->_conf->bluefs_buffered_io;
1885
1886 dout(10) << __func__ << " devs_source " << devs_source
1887 << " dev_target " << dev_target << dendl;
1888 ceph_assert(dev_target == (int)BDEV_NEWDB || dev_target == (int)BDEV_NEWWAL);
1889
1890 int flags = 0;
1891
1892 flags |= devs_source.count(BDEV_DB) ?
1893 (!bdev[BDEV_SLOW] ? RENAME_DB2SLOW: REMOVE_DB) :
1894 0;
1895 flags |= devs_source.count(BDEV_WAL) ? REMOVE_WAL : 0;
1896 int dev_target_new = dev_target; //FIXME: remove, makes no sense
1897
1898 for (auto& [ino, file_ref] : nodes.file_map) {
1899 //do not copy log
1900 if (ino == 1) {
1901 continue;
1902 }
1903 dout(10) << __func__ << " " << ino << " " << file_ref->fnode << dendl;
1904
1905 vselector->sub_usage(file_ref->vselector_hint, file_ref->fnode);
1906
1907 bool rewrite = std::any_of(
1908 file_ref->fnode.extents.begin(),
1909 file_ref->fnode.extents.end(),
1910 [=](auto& ext) {
1911 return ext.bdev != dev_target && devs_source.count(ext.bdev);
1912 });
1913 if (rewrite) {
1914 dout(10) << __func__ << " migrating" << dendl;
1915 bluefs_fnode_t old_fnode;
1916 old_fnode.swap_extents(file_ref->fnode);
1917 auto& old_fnode_extents = old_fnode.extents;
1918 // read entire file
1919 bufferlist bl;
1920 for (const auto &old_ext : old_fnode_extents) {
1921 buf.resize(old_ext.length);
1922 int r = _bdev_read_random(old_ext.bdev,
1923 old_ext.offset,
1924 old_ext.length,
1925 (char*)&buf.at(0),
1926 buffered);
1927 if (r != 0) {
1928 derr << __func__ << " failed to read 0x" << std::hex
1929 << old_ext.offset << "~" << old_ext.length << std::dec
1930 << " from " << (int)dev_target << dendl;
1931 return -EIO;
1932 }
1933 bl.append((char*)&buf[0], old_ext.length);
1934 }
1935
1936 // write entire file
1937 auto l = _allocate(dev_target, bl.length(), 0,
1938 &file_ref->fnode, 0, false);
1939 if (l < 0) {
1940 derr << __func__ << " unable to allocate len 0x" << std::hex
1941 << bl.length() << std::dec << " from " << (int)dev_target
1942 << ": " << cpp_strerror(l) << dendl;
1943 return -ENOSPC;
1944 }
1945
1946 uint64_t off = 0;
1947 for (auto& i : file_ref->fnode.extents) {
1948 bufferlist cur;
1949 uint64_t cur_len = std::min<uint64_t>(i.length, bl.length() - off);
1950 ceph_assert(cur_len > 0);
1951 cur.substr_of(bl, off, cur_len);
1952 int r = bdev[dev_target]->write(i.offset, cur, buffered);
1953 ceph_assert(r == 0);
1954 off += cur_len;
1955 }
1956
1957 // release old extents
1958 for (const auto &old_ext : old_fnode_extents) {
1959 PExtentVector to_release;
1960 to_release.emplace_back(old_ext.offset, old_ext.length);
1961 alloc[old_ext.bdev]->release(to_release);
1962 if (is_shared_alloc(old_ext.bdev)) {
1963 shared_alloc->bluefs_used -= old_ext.length; // bytes released, not extent count
1964 }
1965 }
1966
1967 // update fnode
1968 for (auto& i : file_ref->fnode.extents) {
1969 i.bdev = dev_target_new;
1970 }
1971 }
1972 }
1973 // new logging device in the current naming scheme
1974 int new_log_dev_cur =
1975 bdev[BDEV_NEWWAL] ?
1976 BDEV_NEWWAL :
1977 bdev[BDEV_WAL] && !(flags & REMOVE_WAL) ?
1978 BDEV_WAL :
1979 bdev[BDEV_NEWDB] ?
1980 BDEV_NEWDB :
1981 bdev[BDEV_DB] && !(flags & REMOVE_DB)?
1982 BDEV_DB :
1983 BDEV_SLOW;
1984
1985 // new logging device in new naming scheme
1986 int new_log_dev_next =
1987 new_log_dev_cur == BDEV_NEWWAL ?
1988 BDEV_WAL :
1989 new_log_dev_cur == BDEV_NEWDB ?
1990 BDEV_DB :
1991 new_log_dev_cur;
1992
1993 int super_dev =
1994 dev_target == BDEV_NEWDB ?
1995 BDEV_NEWDB :
1996 bdev[BDEV_DB] ?
1997 BDEV_DB :
1998 BDEV_SLOW;
1999
2000 _rewrite_log_and_layout_sync_LNF_LD(
2001 false,
2002 super_dev,
2003 new_log_dev_cur,
2004 new_log_dev_next,
2005 flags,
2006 layout);
2007 return 0;
2008 }
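
// The cascaded ternaries above, flattened: the log lands on the first
// available device in this order, skipping devices being removed:
//
//   BDEV_NEWWAL -> BDEV_WAL (unless REMOVE_WAL) -> BDEV_NEWDB
//               -> BDEV_DB (unless REMOVE_DB)   -> BDEV_SLOW
//
// and the "new naming scheme" then maps NEWWAL -> WAL and NEWDB -> DB,
// leaving any other choice unchanged.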
2009
2010 BlueFS::FileRef BlueFS::_get_file(uint64_t ino)
2011 {
2012 auto p = nodes.file_map.find(ino);
2013 if (p == nodes.file_map.end()) {
2014 FileRef f = ceph::make_ref<File>();
2015 nodes.file_map[ino] = f;
2016 // track files count in logger
2017 logger->set(l_bluefs_num_files, nodes.file_map.size());
2018 dout(30) << __func__ << " ino " << ino << " = " << f
2019 << " (new)" << dendl;
2020 return f;
2021 } else {
2022 dout(30) << __func__ << " ino " << ino << " = " << p->second << dendl;
2023 return p->second;
2024 }
2025 }
2026
2027
2028 /**
2029 To modify an fnode, both FileWriter::lock and File::lock must be held.
2030 The special cases are modifying the bluefs log (ino 1) and
2031 compacting the log (ino 0).
2032
2033 In any case it is enough to hold File::lock to be sure the fnode will not be modified.
2034 */
2035 struct lock_fnode_print {
2036 BlueFS::FileRef file;
2037 lock_fnode_print(BlueFS::FileRef file) : file(file) {}
2038 };
2039 std::ostream& operator<<(std::ostream& out, const lock_fnode_print& to_lock) {
2040 std::lock_guard l(to_lock.file->lock);
2041 out << to_lock.file->fnode;
2042 return out;
2043 }
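
// Usage sketch: wrapping a FileRef in lock_fnode_print makes operator<<
// take File::lock for the duration of the dump, so the fnode is printed
// consistently even while other threads modify it, e.g.
//
//   dout(20) << " file " << lock_fnode_print(file) << dendl;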
2044
2045 void BlueFS::_drop_link_D(FileRef file)
2046 {
2047 dout(20) << __func__ << " had refs " << file->refs
2048 << " on " << lock_fnode_print(file) << dendl;
2049 ceph_assert(file->refs > 0);
2050 ceph_assert(ceph_mutex_is_locked(log.lock));
2051 ceph_assert(ceph_mutex_is_locked(nodes.lock));
2052
2053 --file->refs;
2054 if (file->refs == 0) {
2055 dout(20) << __func__ << " destroying " << file->fnode << dendl;
2056 ceph_assert(file->num_reading.load() == 0);
2057 vselector->sub_usage(file->vselector_hint, file->fnode);
2058 log.t.op_file_remove(file->fnode.ino);
2059 nodes.file_map.erase(file->fnode.ino);
2060 logger->set(l_bluefs_num_files, nodes.file_map.size());
2061 file->deleted = true;
2062
2063 std::lock_guard dl(dirty.lock);
2064 for (auto& r : file->fnode.extents) {
2065 dirty.pending_release[r.bdev].insert(r.offset, r.length);
2066 }
2067 if (file->dirty_seq > dirty.seq_stable) {
2068 // retract request to serialize changes
2069 ceph_assert(dirty.files.count(file->dirty_seq));
2070 auto it = dirty.files[file->dirty_seq].iterator_to(*file);
2071 dirty.files[file->dirty_seq].erase(it);
2072 file->dirty_seq = dirty.seq_stable;
2073 }
2074 }
2075 }
2076
2077 int64_t BlueFS::_read_random(
2078 FileReader *h, ///< [in] read from here
2079 uint64_t off, ///< [in] offset
2080 uint64_t len, ///< [in] this many bytes
2081 char *out) ///< [out] copy it here
2082 {
2083 auto* buf = &h->buf;
2084
2085 int64_t ret = 0;
2086 dout(10) << __func__ << " h " << h
2087 << " 0x" << std::hex << off << "~" << len << std::dec
2088 << " from " << lock_fnode_print(h->file) << dendl;
2089
2090 ++h->file->num_reading;
2091
2092 if (!h->ignore_eof &&
2093 off + len > h->file->fnode.size) {
2094 if (off > h->file->fnode.size)
2095 len = 0;
2096 else
2097 len = h->file->fnode.size - off;
2098 dout(20) << __func__ << " reaching (or past) eof, len clipped to 0x"
2099 << std::hex << len << std::dec << dendl;
2100 }
2101 logger->inc(l_bluefs_read_random_count, 1);
2102 logger->inc(l_bluefs_read_random_bytes, len);
2103
2104 std::shared_lock s_lock(h->lock);
2105 buf->bl.reassign_to_mempool(mempool::mempool_bluefs_file_reader);
2106 while (len > 0) {
2107 if (off < buf->bl_off || off >= buf->get_buf_end()) {
2108 s_lock.unlock();
2109 uint64_t x_off = 0;
2110 auto p = h->file->fnode.seek(off, &x_off);
2111 ceph_assert(p != h->file->fnode.extents.end());
2112 uint64_t l = std::min(p->length - x_off, len);
2113 //hard cap to 1GB
2114 l = std::min(l, uint64_t(1) << 30);
2115 dout(20) << __func__ << " read random 0x"
2116 << std::hex << x_off << "~" << l << std::dec
2117 << " of " << *p << dendl;
2118 int r;
2119 if (!cct->_conf->bluefs_check_for_zeros) {
2120 r = _bdev_read_random(p->bdev, p->offset + x_off, l, out,
2121 cct->_conf->bluefs_buffered_io);
2122 } else {
2123 r = _read_random_and_check(p->bdev, p->offset + x_off, l, out,
2124 cct->_conf->bluefs_buffered_io);
2125 }
2126 ceph_assert(r == 0);
2127 off += l;
2128 len -= l;
2129 ret += l;
2130 out += l;
2131
2132 logger->inc(l_bluefs_read_random_disk_count, 1);
2133 logger->inc(l_bluefs_read_random_disk_bytes, l);
2134 if (len > 0) {
2135 s_lock.lock();
2136 }
2137 } else {
2138 auto left = buf->get_buf_remaining(off);
2139 int64_t r = std::min(len, left);
2140 logger->inc(l_bluefs_read_random_buffer_count, 1);
2141 logger->inc(l_bluefs_read_random_buffer_bytes, r);
2142 dout(20) << __func__ << " left 0x" << std::hex << left
2143 << " 0x" << off << "~" << len << std::dec
2144 << dendl;
2145
2146 auto p = buf->bl.begin();
2147 p.seek(off - buf->bl_off);
2148 p.copy(r, out);
2149 out += r;
2150
2151 dout(30) << __func__ << " result chunk (0x"
2152 << std::hex << r << std::dec << " bytes):\n";
2153 bufferlist t;
2154 t.substr_of(buf->bl, off - buf->bl_off, r);
2155 t.hexdump(*_dout);
2156 *_dout << dendl;
2157
2158 off += r;
2159 len -= r;
2160 ret += r;
2161 buf->pos += r;
2162 }
2163 }
2164 dout(20) << __func__ << std::hex
2165 << " got 0x" << ret
2166 << std::dec << dendl;
2167 --h->file->num_reading;
2168 return ret;
2169 }
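
// The EOF clipping at the top of _read_random/_read, in isolation
// (a sketch over plain integers):
//
//   uint64_t clip_len(uint64_t off, uint64_t len, uint64_t file_size)
//   {
//     if (off + len <= file_size)
//       return len;                        // fully inside the file
//     return off > file_size
//       ? 0                                // entirely past eof
//       : file_size - off;                 // clipped to eof
//   }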
2170
2171 int64_t BlueFS::_read(
2172 FileReader *h, ///< [in] read from here
2173 uint64_t off, ///< [in] offset
2174 size_t len, ///< [in] this many bytes
2175 bufferlist *outbl, ///< [out] optional: reference the result here
2176 char *out) ///< [out] optional: or copy it here
2177 {
2178 FileReaderBuffer *buf = &(h->buf);
2179
2180 bool prefetch = !outbl && !out;
2181 dout(10) << __func__ << " h " << h
2182 << " 0x" << std::hex << off << "~" << len << std::dec
2183 << " from " << lock_fnode_print(h->file)
2184 << (prefetch ? " prefetch" : "")
2185 << dendl;
2186
2187 ++h->file->num_reading;
2188
2189 if (!h->ignore_eof &&
2190 off + len > h->file->fnode.size) {
2191 if (off > h->file->fnode.size)
2192 len = 0;
2193 else
2194 len = h->file->fnode.size - off;
2195 dout(20) << __func__ << " reaching (or past) eof, len clipped to 0x"
2196 << std::hex << len << std::dec << dendl;
2197 }
2198 logger->inc(l_bluefs_read_count, 1);
2199 logger->inc(l_bluefs_read_bytes, len);
2200 if (prefetch) {
2201 logger->inc(l_bluefs_read_prefetch_count, 1);
2202 logger->inc(l_bluefs_read_prefetch_bytes, len);
2203 }
2204
2205 if (outbl)
2206 outbl->clear();
2207
2208 int64_t ret = 0;
2209 std::shared_lock s_lock(h->lock);
2210 while (len > 0) {
2211 size_t left;
2212 if (off < buf->bl_off || off >= buf->get_buf_end()) {
2213 s_lock.unlock();
2214 std::unique_lock u_lock(h->lock);
2215 buf->bl.reassign_to_mempool(mempool::mempool_bluefs_file_reader);
2216 if (off < buf->bl_off || off >= buf->get_buf_end()) {
2217 // if precondition hasn't changed during locking upgrade.
2218 buf->bl.clear();
2219 buf->bl_off = off & super.block_mask();
2220 uint64_t x_off = 0;
2221 auto p = h->file->fnode.seek(buf->bl_off, &x_off);
2222 if (p == h->file->fnode.extents.end()) {
2223 dout(5) << __func__ << " reading less than required "
2224 << ret << "<" << ret + len << dendl;
2225 break;
2226 }
2227
2228 uint64_t want = round_up_to(len + (off & ~super.block_mask()),
2229 super.block_size);
2230 want = std::max(want, buf->max_prefetch);
2231 uint64_t l = std::min(p->length - x_off, want);
2232 //hard cap to 1GB
2233 l = std::min(l, uint64_t(1) << 30);
2234 uint64_t eof_offset = round_up_to(h->file->fnode.size, super.block_size);
2235 if (!h->ignore_eof &&
2236 buf->bl_off + l > eof_offset) {
2237 l = eof_offset - buf->bl_off;
2238 }
2239 dout(20) << __func__ << " fetching 0x"
2240 << std::hex << x_off << "~" << l << std::dec
2241 << " of " << *p << dendl;
2242 int r;
2243 // when reading BlueFS log (only happens on startup) use non-buffered io
2244 // it makes it in sync with logic in _flush_range()
2245 bool use_buffered_io = h->file->fnode.ino == 1 ? false : cct->_conf->bluefs_buffered_io;
2246 if (!cct->_conf->bluefs_check_for_zeros) {
2247 r = _bdev_read(p->bdev, p->offset + x_off, l, &buf->bl, ioc[p->bdev],
2248 use_buffered_io);
2249 } else {
2250 r = _read_and_check(
2251 p->bdev, p->offset + x_off, l, &buf->bl, ioc[p->bdev],
2252 use_buffered_io);
2253 }
2254 logger->inc(l_bluefs_read_disk_count, 1);
2255 logger->inc(l_bluefs_read_disk_bytes, l);
2256
2257 ceph_assert(r == 0);
2258 }
2259 u_lock.unlock();
2260 s_lock.lock();
2261 // we should recheck if buffer is valid after lock downgrade
2262 continue;
2263 }
2264 left = buf->get_buf_remaining(off);
2265 dout(20) << __func__ << " left 0x" << std::hex << left
2266 << " len 0x" << len << std::dec << dendl;
2267
2268 int64_t r = std::min(len, left);
2269 if (outbl) {
2270 bufferlist t;
2271 t.substr_of(buf->bl, off - buf->bl_off, r);
2272 outbl->claim_append(t);
2273 }
2274 if (out) {
2275 auto p = buf->bl.begin();
2276 p.seek(off - buf->bl_off);
2277 p.copy(r, out);
2278 out += r;
2279 }
2280
2281 dout(30) << __func__ << " result chunk (0x"
2282 << std::hex << r << std::dec << " bytes):\n";
2283 bufferlist t;
2284 t.substr_of(buf->bl, off - buf->bl_off, r);
2285 t.hexdump(*_dout);
2286 *_dout << dendl;
2287
2288 off += r;
2289 len -= r;
2290 ret += r;
2291 buf->pos += r;
2292 }
2293
2294 dout(20) << __func__ << std::hex
2295 << " got 0x" << ret
2296 << std::dec << dendl;
2297 ceph_assert(!outbl || (int)outbl->length() == ret);
2298 --h->file->num_reading;
2299 return ret;
2300 }
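
// The buffer-refill path above is a shared-to-exclusive lock upgrade with a
// precondition recheck: another reader may have refilled the buffer while
// the shared lock was dropped. A minimal sketch with std::shared_mutex
// (buffer_covers/refill_buffer/copy_from_buffer are hypothetical stand-ins):
//
//   #include <shared_mutex>
//
//   std::shared_mutex m;
//
//   void read_at(uint64_t off)
//   {
//     std::shared_lock s(m);
//     while (!buffer_covers(off)) {
//       s.unlock();
//       {
//         std::unique_lock u(m);
//         if (!buffer_covers(off))  // recheck after the upgrade
//           refill_buffer(off);
//       }
//       s.lock();                   // back to shared access, re-test
//     }
//     copy_from_buffer(off);
//   }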
2301
2302 void BlueFS::invalidate_cache(FileRef f, uint64_t offset, uint64_t length)
2303 {
2304 std::lock_guard l(f->lock);
2305 dout(10) << __func__ << " file " << f->fnode
2306 << " 0x" << std::hex << offset << "~" << length << std::dec
2307 << dendl;
2308 if (offset & ~super.block_mask()) {
2309 offset &= super.block_mask();
2310 length = round_up_to(length, super.block_size);
2311 }
2312 uint64_t x_off = 0;
2313 auto p = f->fnode.seek(offset, &x_off);
2314 while (length > 0 && p != f->fnode.extents.end()) {
2315 uint64_t x_len = std::min(p->length - x_off, length);
2316 bdev[p->bdev]->invalidate_cache(p->offset + x_off, x_len);
2317 dout(20) << __func__ << " 0x" << std::hex << x_off << "~" << x_len
2318 << std::dec << " of " << *p << dendl;
2319 offset += x_len;
2320 length -= x_len; x_off = 0; ++p; // advance to the next extent
2321 }
2322 }
2323
2324
2325 uint64_t BlueFS::_estimate_transaction_size(bluefs_transaction_t* t)
2326 {
2327 uint64_t max_alloc_size = std::max(alloc_size[BDEV_WAL],
2328 std::max(alloc_size[BDEV_DB],
2329 alloc_size[BDEV_SLOW]));
2330
2331 // conservative estimate for final encoded size
2332 return round_up_to(t->op_bl.length() + super.block_size * 2, max_alloc_size);
2333 }
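
// round_up_to is plain alignment arithmetic; a sketch of the estimate:
//
//   uint64_t round_up_to(uint64_t n, uint64_t align)
//   {
//     return (n + align - 1) / align * align;  // assumes align > 0
//   }
//
// e.g. with op_bl at 10 KiB, block_size 4 KiB and max_alloc_size 64 KiB the
// conservative estimate is round_up_to(10 KiB + 8 KiB, 64 KiB) = 64 KiB.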
2334
2335 uint64_t BlueFS::_make_initial_transaction(uint64_t start_seq,
2336 bluefs_fnode_t& fnode,
2337 uint64_t expected_final_size,
2338 bufferlist* out)
2339 {
2340 bluefs_transaction_t t0;
2341 t0.seq = start_seq;
2342 t0.uuid = super.uuid;
2343 t0.op_init();
2344 t0.op_file_update_inc(fnode);
2345 t0.op_jump(start_seq, expected_final_size); // this is a fixed size op,
2346 // hence it's valid with fake
2347 // params for overall txc size
2348 // estimation
2349 if (!out) {
2350 return _estimate_transaction_size(&t0);
2351 }
2352
2353 ceph_assert(expected_final_size > 0);
2354 out->reserve(expected_final_size);
2355 encode(t0, *out);
2356 // make sure we're not wrong about the size
2357 ceph_assert(out->length() <= expected_final_size);
2358 _pad_bl(*out, expected_final_size);
2359 return expected_final_size;
2360 }
2361
2362 uint64_t BlueFS::_estimate_log_size_N()
2363 {
2364 std::lock_guard nl(nodes.lock);
2365 int avg_dir_size = 40; // fixme
2366 int avg_file_size = 12;
2367 uint64_t size = 4096 * 2;
2368 size += nodes.file_map.size() * (1 + sizeof(bluefs_fnode_t));
2369 size += nodes.dir_map.size() * (1 + avg_dir_size);
2370 size += nodes.file_map.size() * (1 + avg_dir_size + avg_file_size);
2371 return round_up_to(size, super.block_size);
2372 }
2373
2374 void BlueFS::compact_log()/*_LNF_LD_NF_D*/
2375 {
2376 if (!cct->_conf->bluefs_replay_recovery_disable_compact) {
2377 if (cct->_conf->bluefs_compact_log_sync) {
2378 _compact_log_sync_LNF_LD();
2379 } else {
2380 _compact_log_async_LD_LNF_D();
2381 }
2382 }
2383 }
2384
2385 bool BlueFS::_should_start_compact_log_L_N()
2386 {
2387 if (log_is_compacting.load() == true) {
2388 // compaction is already running
2389 return false;
2390 }
2391 uint64_t current;
2392 {
2393 std::lock_guard ll(log.lock);
2394 current = log.writer->file->fnode.size;
2395 }
2396 uint64_t expected = _estimate_log_size_N();
2397 float ratio = (float)current / (float)expected;
2398 dout(10) << __func__ << " current 0x" << std::hex << current
2399 << " expected " << expected << std::dec
2400 << " ratio " << ratio
2401 << dendl;
2402 if (current < cct->_conf->bluefs_log_compact_min_size ||
2403 ratio < cct->_conf->bluefs_log_compact_min_ratio) {
2404 return false;
2405 }
2406 return true;
2407 }
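
// Compaction starts only when the live log is large both in absolute terms
// and relative to the size a freshly compacted log would have. The decision
// above, as a standalone sketch:
//
//   bool should_compact(uint64_t current, uint64_t expected,
//                       uint64_t min_size, double min_ratio)
//   {
//     double ratio = double(current) / double(expected);
//     return current >= min_size && ratio >= min_ratio;
//   }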
2408
2409 void BlueFS::_compact_log_dump_metadata_NF(uint64_t start_seq,
2410 bluefs_transaction_t *t,
2411 int bdev_update_flags,
2412 uint64_t capture_before_seq)
2413 {
2414 dout(20) << __func__ << dendl;
2415 t->seq = start_seq;
2416 t->uuid = super.uuid;
2417
2418 std::lock_guard nl(nodes.lock);
2419
2420 for (auto& [ino, file_ref] : nodes.file_map) {
2421 if (ino == 1)
2422 continue;
2423 ceph_assert(ino > 1);
2424 std::lock_guard fl(file_ref->lock);
2425 if (bdev_update_flags) {
2426 for(auto& e : file_ref->fnode.extents) {
2427 auto bdev = e.bdev;
2428 auto bdev_new = bdev;
2429 ceph_assert(!((bdev_update_flags & REMOVE_WAL) && bdev == BDEV_WAL));
2430 if ((bdev_update_flags & RENAME_SLOW2DB) && bdev == BDEV_SLOW) {
2431 bdev_new = BDEV_DB;
2432 }
2433 if ((bdev_update_flags & RENAME_DB2SLOW) && bdev == BDEV_DB) {
2434 bdev_new = BDEV_SLOW;
2435 }
2436 if (bdev == BDEV_NEWDB) {
2437 // REMOVE_DB xor RENAME_DB2SLOW
2438 ceph_assert(!(bdev_update_flags & REMOVE_DB) != !(bdev_update_flags & RENAME_DB2SLOW));
2439 ceph_assert(!(bdev_update_flags & RENAME_SLOW2DB));
2440 bdev_new = BDEV_DB;
2441 }
2442 if (bdev == BDEV_NEWWAL) {
2443 ceph_assert(bdev_update_flags & REMOVE_WAL);
2444 bdev_new = BDEV_WAL;
2445 }
2446 e.bdev = bdev_new;
2447 }
2448 }
2449 if (capture_before_seq == 0 || file_ref->dirty_seq < capture_before_seq) {
2450 dout(20) << __func__ << " op_file_update " << file_ref->fnode << dendl;
2451 } else {
2452 dout(20) << __func__ << " op_file_update just modified, dirty_seq="
2453 << file_ref->dirty_seq << " " << file_ref->fnode << dendl;
2454 }
2455 t->op_file_update(file_ref->fnode);
2456 }
2457 for (auto& [path, dir_ref] : nodes.dir_map) {
2458 dout(20) << __func__ << " op_dir_create " << path << dendl;
2459 t->op_dir_create(path);
2460 for (auto& [fname, file_ref] : dir_ref->file_map) {
2461 dout(20) << __func__ << " op_dir_link " << path << "/" << fname
2462 << " to " << file_ref->fnode.ino << dendl;
2463 t->op_dir_link(path, fname, file_ref->fnode.ino);
2464 }
2465 }
2466 }
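
// The bdev renaming applied above, summarized:
//
//   RENAME_SLOW2DB : BDEV_SLOW   -> BDEV_DB
//   RENAME_DB2SLOW : BDEV_DB     -> BDEV_SLOW
//   (always)       : BDEV_NEWDB  -> BDEV_DB   (REMOVE_DB xor RENAME_DB2SLOW)
//   (always)       : BDEV_NEWWAL -> BDEV_WAL  (REMOVE_WAL must be set)
//   REMOVE_WAL     : no extent may still live on BDEV_WAL (asserted)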
2467
2468 void BlueFS::_compact_log_sync_LNF_LD()
2469 {
2470 dout(10) << __func__ << dendl;
2471 uint8_t prefer_bdev;
2472 {
2473 std::lock_guard ll(log.lock);
2474 prefer_bdev =
2475 vselector->select_prefer_bdev(log.writer->file->vselector_hint);
2476 }
2477 _rewrite_log_and_layout_sync_LNF_LD(true,
2478 BDEV_DB,
2479 prefer_bdev,
2480 prefer_bdev,
2481 0,
2482 super.memorized_layout);
2483 logger->inc(l_bluefs_log_compactions);
2484 }
2485
2486 /*
2487 * SYNC LOG COMPACTION
2488 *
2489 * 0. Lock the log completely through the whole procedure
2490 *
2491 * 1. Build new log. It will include log's starter and compacted metadata
2492 * body. Jump op appended to the starter will link the pieces together.
2493 *
2494 * 2. Write out new log's content
2495 *
2496 * 3. Write out new superblock. This includes relevant device layout update.
2497 *
2498 * 4. Finalization. Old space release.
2499 */
2500
2501 void BlueFS::_rewrite_log_and_layout_sync_LNF_LD(bool permit_dev_fallback,
2502 int super_dev,
2503 int log_dev,
2504 int log_dev_new,
2505 int flags,
2506 std::optional<bluefs_layout_t> layout)
2507 {
2508 // we substitute log_dev with log_dev_new for new allocations below
2509 // and permitting fallback allocations prevents such a substitution
2510 ceph_assert((permit_dev_fallback && log_dev == log_dev_new) ||
2511 !permit_dev_fallback);
2512
2513 dout(10) << __func__ << " super_dev:" << super_dev
2514 << " log_dev:" << log_dev
2515 << " log_dev_new:" << log_dev_new
2516 << " flags:" << flags
2517 << " seq:" << log.seq_live
2518 << dendl;
2519 utime_t mtime = ceph_clock_now();
2520 uint64_t starter_seq = 1;
2521
2522 // Part 0.
2523 // Lock the log totally till the end of the procedure
2524 std::lock_guard ll(log.lock);
2525 auto t0 = mono_clock::now();
2526
2527 File *log_file = log.writer->file.get();
2528 bluefs_fnode_t fnode_tail;
2529 // log.t.seq is always set to current live seq
2530 ceph_assert(log.t.seq == log.seq_live);
2531 // Capturing entire state. Discard anything already staged in log.t.
2532 log.t.clear();
2533 log.t.seq = log.seq_live;
2534 // From now on, no changes to log.t are permitted until we finish rewriting log.
2535 // Can allow dirty to remain dirty - log.seq_live will not change.
2536
2537 //
2538 // Part 1.
2539 // Build new log starter and compacted metadata body
2540 // 1.1. Build full compacted meta transaction.
2541 // Encode a bluefs transaction that dumps all of the in-memory fnodes
2542 // and names.
2543 // This might be pretty large and its allocation map can exceed
2544 // superblock size. Hence instead we'll need log starter part which
2545 // goes to superblock and refers that new meta through op_update_inc.
2546 // 1.2. Allocate space for the above transaction
2547 // using its size estimation.
2548 // 1.3. Allocate the space required for the starter part of the new log.
2549 // It should be small enough to fit into superblock.
2550 // 1.4 Building the new log's persistent fnode representation which will
2551 // finally land on disk.
2552 // Depending on input parameters we might need to perform device ids
2553 // rename - runtime and persistent replicas should be different when we
2554 // are in the device migration process.
2555 // 1.5 Store starter fnode to run-time superblock, to be written out later.
2556 // It doesn't contain the compacted meta, so that the relevant allocation
2557 // map fits into the superblock.
2558 // 1.6 Proceed building new log persistent fnode representation.
2559 // Will add log tail with compacted meta extents from 1.1.
2560 // Device rename applied as well
2561 //
2562 // 1.7. Encode new log fnode starter,
2563 // It will include op_init, new log's op_update_inc
2564 // and jump to the compacted meta transaction beginning.
2565 // Superblock will reference this starter part
2566 //
2567 // 1.8. Encode compacted meta transaction,
2568 // extend the transaction with a jump to proper sequence no
2569 //
2570
2571
2572 // 1.1 Build full compacted meta transaction
2573 bluefs_transaction_t compacted_meta_t;
2574 _compact_log_dump_metadata_NF(starter_seq + 1, &compacted_meta_t, flags, 0);
2575
2576 // 1.2 Allocate the space required for the compacted meta transaction
2577 uint64_t compacted_meta_need =
2578 _estimate_transaction_size(&compacted_meta_t) +
2579 cct->_conf->bluefs_max_log_runway;
2580
2581 dout(20) << __func__ << " compacted_meta_need " << compacted_meta_need << dendl;
2582
2583 int r = _allocate(log_dev, compacted_meta_need, 0, &fnode_tail, 0,
2584 permit_dev_fallback);
2585 ceph_assert(r == 0);
2586
2587
2588 // 1.3 Allocate the space required for the starter part of the new log.
2589 // estimate new log fnode size to be referenced from superblock
2590 // hence use dummy fnode and jump parameters
2591 uint64_t starter_need = _make_initial_transaction(starter_seq, fnode_tail, 0, nullptr);
2592
2593 bluefs_fnode_t fnode_starter(log_file->fnode.ino, 0, mtime);
2594 r = _allocate(log_dev, starter_need, 0, &fnode_starter, 0,
2595 permit_dev_fallback);
2596 ceph_assert(r == 0);
2597
2598 // 1.4 Building starter fnode
2599 bluefs_fnode_t fnode_persistent(fnode_starter.ino, 0, mtime);
2600 for (auto p : fnode_starter.extents) {
2601 // rename device if needed - this is only possible when fallback
2602 // allocations are prohibited, which means every extent targets the same
2603 // device and we can unconditionally update them.
2604 if (log_dev != log_dev_new) {
2605 dout(10) << __func__ << " renaming log extents to "
2606 << log_dev_new << dendl;
2607 p.bdev = log_dev_new;
2608 }
2609 fnode_persistent.append_extent(p);
2610 }
2611
2612 // 1.5 Store starter fnode to run-time superblock, to be written out later
2613 super.log_fnode = fnode_persistent;
2614
2615 // 1.6 Proceed building new log persistent fnode representation
2616 // we'll build incremental update starting from this point
2617 fnode_persistent.reset_delta();
2618 for (auto p : fnode_tail.extents) {
2619 // rename device if needed - this is only possible when fallback
2620 // allocations are prohibited, which means every extent targets the same
2621 // device and we can unconditionally update them.
2622 if (log_dev != log_dev_new) {
2623 dout(10) << __func__ << " renaming log extents to "
2624 << log_dev_new << dendl;
2625 p.bdev = log_dev_new;
2626 }
2627 fnode_persistent.append_extent(p);
2628 }
2629
2630 // 1.7 Encode new log fnode
2631 // This will flush incremental part of fnode_persistent only.
2632 bufferlist starter_bl;
2633 _make_initial_transaction(starter_seq, fnode_persistent, starter_need, &starter_bl);
2634
2635 // 1.8 Encode compacted meta transaction
2636 dout(20) << __func__ << " op_jump_seq " << log.seq_live << dendl;
2637 // hopefully the "compacted_meta_need" estimation provides enough extra space
2638 // for this op, assert below if not
2639 compacted_meta_t.op_jump_seq(log.seq_live);
2640
2641 bufferlist compacted_meta_bl;
2642 encode(compacted_meta_t, compacted_meta_bl);
2643 _pad_bl(compacted_meta_bl);
2644 ceph_assert(compacted_meta_bl.length() <= compacted_meta_need);
2645
2646 //
2647 // Part 2
2648 // Write out new log's content
2649 // 2.1. Build the full runtime new log's fnode
2650 //
2651 // 2.2. Write out new log's
2652 //
2653 // 2.3. Do flush and wait for completion through flush_bdev()
2654 //
2655 // 2.4. Finalize log update
2656 // Update all sequence numbers
2657 //
2658
2659 // 2.1 Build the full runtime new log's fnode
2660 bluefs_fnode_t old_log_fnode;
2661 old_log_fnode.swap(fnode_starter); // old_log_fnode temporarily holds the starter extents
2662 old_log_fnode.clone_extents(fnode_tail); // ... plus the compacted-meta tail
2663 old_log_fnode.reset_delta();
2664 log_file->fnode.swap(old_log_fnode); // now log_file has the new fnode, old_log_fnode the old one
2665
2666 // 2.2 Write out new log's content
2667 // Get rid of the old writer
2668 _close_writer(log.writer);
2669 // Make new log writer and stage new log's content writing
2670 log.writer = _create_writer(log_file);
2671 log.writer->append(starter_bl);
2672 log.writer->append(compacted_meta_bl);
2673
2674 // 2.3 Do flush and wait for completion through flush_bdev()
2675 _flush_special(log.writer);
2676 #ifdef HAVE_LIBAIO
2677 if (!cct->_conf->bluefs_sync_write) {
2678 list<aio_t> completed_ios;
2679 _claim_completed_aios(log.writer, &completed_ios);
2680 _wait_for_aio(log.writer);
2681 completed_ios.clear();
2682 }
2683 #endif
2684 _flush_bdev();
2685
2686 // 2.4 Finalize log update
2687 ++log.seq_live;
2688 dirty.seq_live = log.seq_live;
2689 log.t.seq = log.seq_live;
2690 vselector->sub_usage(log_file->vselector_hint, old_log_fnode);
2691 vselector->add_usage(log_file->vselector_hint, log_file->fnode);
2692
2693 // Part 3.
2694 // Write out new superblock to reflect all the changes.
2695 //
2696
2697 super.memorized_layout = layout;
2698 _write_super(super_dev);
2699 _flush_bdev();
2700
2701 // we're mostly done
2702 dout(10) << __func__ << " log extents " << log_file->fnode.extents << dendl;
2703 logger->inc(l_bluefs_log_compactions);
2704
2705 // Part 4
2706 // Finalization. Release old space.
2707 //
2708 {
2709 dout(10) << __func__
2710 << " release old log extents " << old_log_fnode.extents
2711 << dendl;
2712 std::lock_guard dl(dirty.lock);
2713 for (auto& r : old_log_fnode.extents) {
2714 dirty.pending_release[r.bdev].insert(r.offset, r.length);
2715 }
2716 }
2717 logger->tinc(l_bluefs_compaction_lock_lat, mono_clock::now() - t0);
2718 }
2719
2720 /*
2721 * ASYNC LOG COMPACTION
2722 *
2723 * 0. Lock the log and forbid its extension. The former covers just
2724 * a part of the below procedure while the latter spans over it
2725 * completely.
2726 * 1. Allocate a new extent to continue the log, and then log an event
2727 * that jumps the log write position to the new extent. At this point, the
2728 * old extent(s) will no longer be written to and reflect everything to compact.
2729 * New events will be written to the new region that we'll keep.
2730 * The latter will finally become new log tail on compaction completion.
2731 *
2732 * 2. Build new log. It will include log's starter, compacted metadata
2733 * body and the above tail. Jump ops appended to the starter and meta body
2734 * will link the pieces together. The log's lock is released in the middle
2735 * of the process to permit parallel access to it.
2736 *
2737 * 3. Write out new log's content.
2738 *
2739 * 4. Write out new superblock to reflect all the changes.
2740 *
2741 * 5. Apply new log fnode, log is locked for a while.
2742 *
2743 * 6. Finalization. Clean up, old space release and total unlocking.
2744 */
2745
2746 void BlueFS::_compact_log_async_LD_LNF_D() //also locks FW for new_writer
2747 {
2748 dout(10) << __func__ << dendl;
2749 utime_t mtime = ceph_clock_now();
2750 uint64_t starter_seq = 1;
2751 uint64_t old_log_jump_to = 0;
2752
2753 // Part 0.
2754 // Lock the log and forbid its expansion and other compactions
2755
2756 // only one compaction allowed at a time
2757 bool old_is_comp = std::atomic_exchange(&log_is_compacting, true);
2758 if (old_is_comp) {
2759 dout(10) << __func__ << " ongoing" << dendl;
2760 return;
2761 }
2762 // lock log's run-time structures for a while
2763 log.lock.lock();
2764 auto t0 = mono_clock::now();
2765
2774 // signal _maybe_extend_log that expansion of the log is temporarily unacceptable
2775 bool old_forbidden = atomic_exchange(&log_forbidden_to_expand, true);
2776 ceph_assert(old_forbidden == false);
2777
2778 //
2779 // Part 1.
2780 // Prepare current log for jumping into it.
2781 // 1.1. Allocate extent
2782 // 1.2. Save log's fnode extents and add new extents
2783 // 1.3. Update op to log
2784 // 1.4. Jump op to log
2785 // During that, no one else can write to log, otherwise we risk jumping backwards.
2786 // We need to sync log, because we are injecting discontinuity, and writer is not prepared for that.
2787
2788 // 1.1 allocate new log extents and store them at fnode_tail
2789 File *log_file = log.writer->file.get();
2790 old_log_jump_to = log_file->fnode.get_allocated();
2791 bluefs_fnode_t fnode_tail;
2792 uint64_t runway = log_file->fnode.get_allocated() - log.writer->get_effective_write_pos();
2793 dout(10) << __func__ << " old_log_jump_to 0x" << std::hex << old_log_jump_to
2794 << " need 0x" << cct->_conf->bluefs_max_log_runway << std::dec << dendl;
2795 int r = _allocate(vselector->select_prefer_bdev(log_file->vselector_hint),
2796 cct->_conf->bluefs_max_log_runway,
2797 0,
2798 &fnode_tail);
2799 ceph_assert(r == 0);
2800
2801 // 1.2 save log's fnode extents and add new extents
2802 bluefs_fnode_t old_log_fnode(log_file->fnode);
2803 log_file->fnode.clone_extents(fnode_tail);
2804 //adjust usage as flush below will need it
2805 vselector->sub_usage(log_file->vselector_hint, old_log_fnode);
2806 vselector->add_usage(log_file->vselector_hint, log_file->fnode);
2807 dout(10) << __func__ << " log extents " << log_file->fnode.extents << dendl;
2808
2809 // 1.3 update the log file change and log a jump to the offset where we want to
2810 // write the new entries
2811 log.t.op_file_update_inc(log_file->fnode);
2812
2813 // 1.4 jump to new position should mean next seq
2814 log.t.op_jump(log.seq_live + 1, old_log_jump_to);
2815 uint64_t seq_now = log.seq_live;
2816 // we need to flush all bdev because we will be streaming all dirty files to log
2817 // TODO - think - if _flush_and_sync_log_jump will not add dirty files nor release pending allocations
2818 // then flush_bdev() will not be necessary
2819 _flush_bdev();
2820 _flush_and_sync_log_jump_D(old_log_jump_to, runway);
2821
2822 //
2823 // Part 2.
2824 // Build new log starter and compacted metadata body
2825 // 2.1. Build full compacted meta transaction.
2826 // While still holding the lock, encode a bluefs transaction
2827 // that dumps all of the in-memory fnodes and names.
2828 // This might be pretty large and its allocation map can exceed
2829 // superblock size. Hence instead we'll need log starter part which
2830 // goes to superblock and refers that new meta through op_update_inc.
2831 // 2.2. After releasing the lock allocate space for the above transaction
2832 // using its size estimation.
2833 // Then build tailing list of extents which consists of these
2834 // newly allocated extents followed by ones from Part 1.
2835 // 2.3. Allocate the space required for the starter part of the new log.
2836 // It should be small enough to fit into superblock.
2837 // Effectively we start building new log fnode here.
2838 // 2.4. Store starter fnode to run-time superblock, to be written out later
2839 // 2.5. Finalize new log's fnode building
2840 // This will include log's starter and tailing extents built at 2.2
2841 // 2.6. Encode new log fnode starter,
2842 // It will include op_init, new log's op_update_inc
2843 // and jump to the compacted meta transaction beginning.
2844 // Superblock will reference this starter part
2845 // 2.7. Encode compacted meta transaction,
2846 // extend the transaction with a jump to the log tail from 1.1 before
2847 // encoding.
2848 //
2849
2850 // 2.1 Build full compacted meta transaction
2851 bluefs_transaction_t compacted_meta_t;
2852 _compact_log_dump_metadata_NF(starter_seq + 1, &compacted_meta_t, 0, seq_now);
2853
2854 // now the state is captured into compacted_meta_t,
2855 // the current log can be written to again,
2856 // and ops in the log will be a continuation of the captured state
2857 logger->tinc(l_bluefs_compaction_lock_lat, mono_clock::now() - t0);
2858 log.lock.unlock();
2859
2860 // 2.2 Allocate the space required for the compacted meta transaction
2861 uint64_t compacted_meta_need = _estimate_transaction_size(&compacted_meta_t);
2862 dout(20) << __func__ << " compacted_meta_need " << compacted_meta_need
2863 << dendl;
2864 {
2865 bluefs_fnode_t fnode_pre_tail;
2866 // do allocate
2867 r = _allocate(vselector->select_prefer_bdev(log_file->vselector_hint),
2868 compacted_meta_need,
2869 0,
2870 &fnode_pre_tail);
2871 ceph_assert(r == 0);
2872 // build trailing list of extents in fnode_tail,
2873 // this will include newly allocated extents for compacted meta
2874 // and aux extents allocated at step 1.1
2875 fnode_pre_tail.claim_extents(fnode_tail.extents);
2876 fnode_tail.swap_extents(fnode_pre_tail);
2877 }
2878
2879 // 2.3 Allocate the space required for the starter part of the new log.
2880 // Start building New log fnode
2881 FileRef new_log = nullptr;
2882 new_log = ceph::make_ref<File>();
2883 new_log->fnode.ino = log_file->fnode.ino;
2884 new_log->fnode.mtime = mtime;
2885 // Estimate the required space
2886 uint64_t starter_need =
2887 _make_initial_transaction(starter_seq, fnode_tail, 0, nullptr);
2888 // and now allocate and store at new_log_fnode
2889 r = _allocate(vselector->select_prefer_bdev(log_file->vselector_hint),
2890 starter_need,
2891 0,
2892 &new_log->fnode);
2893 ceph_assert(r == 0);
2894
2895 // 2.4 Store starter fnode to run-time superblock, to be written out later
2896 super.log_fnode = new_log->fnode;
2897
2898 // 2.5 Finalize new log's fnode building
2899 // start collecting new log fnode updates (to make op_update_inc later)
2900 // since this point. This will include compacted meta from 2.2 and aux
2901 // extents from 1.1.
2902 new_log->fnode.reset_delta();
2903 new_log->fnode.claim_extents(fnode_tail.extents);
2904
2905 // 2.6 Encode new log fnode
2906 bufferlist starter_bl;
2907 _make_initial_transaction(starter_seq, new_log->fnode, starter_need,
2908 &starter_bl);
2909
2910 // 2.7 Encode compacted meta transaction,
2911 dout(20) << __func__
2912 << " new_log jump seq " << seq_now
2913 << std::hex << " offset 0x" << starter_need + compacted_meta_need
2914 << std::dec << dendl;
2915 // Extend the compacted_meta transaction with a jump to the new log tail.
2916 // Hopefully the "compacted_meta_need" estimation provides enough extra
2917 // space for this new jump, assert below if not
2918 compacted_meta_t.op_jump(seq_now, starter_need + compacted_meta_need);
2919 // Now do the encoding and padding
2920 bufferlist compacted_meta_bl;
2921 compacted_meta_bl.reserve(compacted_meta_need);
2922 encode(compacted_meta_t, compacted_meta_bl);
2923 ceph_assert(compacted_meta_bl.length() <= compacted_meta_need);
2924 _pad_bl(compacted_meta_bl, compacted_meta_need);
2925
2926 //
2927 // Part 3.
2928 // Write out new log's content
2929 // 3.1 Stage new log's content writing
2930 // 3.2 Do flush and wait for completion through flush_bdev()
2931 //
2932
2933 // 3.1 Stage new log's content writing
2934 // Make a new log writer
2935 FileWriter *new_log_writer = _create_writer(new_log);
2936 // and append all new log's bufferlists to write out.
2937 new_log_writer->append(starter_bl);
2938 new_log_writer->append(compacted_meta_bl);
2939
2940 // 3.2. flush and wait
2941 _flush_special(new_log_writer);
2942 _flush_bdev(new_log_writer, false); // do not check log.lock is locked
2943
2944 // Part 4.
2945 // Write out new superblock to reflect all the changes.
2946 //
2947
2948 _write_super(BDEV_DB);
2949 _flush_bdev();
2950
2951 // Part 5.
2952 // Apply new log fnode
2953 //
2954
2955 // we need to acquire log's lock back at this point
2956 log.lock.lock();
2957 // Reconstruct actual log object from the new one.
2958 vselector->sub_usage(log_file->vselector_hint, log_file->fnode);
2959 log_file->fnode.size =
2960 log.writer->pos - old_log_jump_to + starter_need + compacted_meta_need;
2961 log_file->fnode.mtime = std::max(mtime, log_file->fnode.mtime);
2962 log_file->fnode.swap_extents(new_log->fnode);
2963 // update log's writer
2964 log.writer->pos = log.writer->file->fnode.size;
2965 vselector->add_usage(log_file->vselector_hint, log_file->fnode);
2966 // and unlock
2967 log.lock.unlock();
2968
2969 // we're mostly done
2970 dout(10) << __func__ << " log extents " << log_file->fnode.extents << dendl;
2971 logger->inc(l_bluefs_log_compactions);
2972
2973 //Part 6.
2974 // Finalization
2975 // 6.1 Permit log's extension, forbidden at step 0.
2976 //
2977 // 6.2 Release the new log writer
2978 //
2979 // 6.3 Release old space
2980 //
2981 // 6.4. Enable other compactions
2982 //
2983
2984 // 6.1 Permit log's extension, forbidden at step 0.
2985 old_forbidden = atomic_exchange(&log_forbidden_to_expand, false);
2986 ceph_assert(old_forbidden == true);
2987 //to wake up if someone was in need of expanding log
2988 log_cond.notify_all();
2989
2990 // 6.2 Release the new log writer
2991 _close_writer(new_log_writer);
2992 new_log_writer = nullptr;
2993 new_log = nullptr;
2994
2995 // 6.3 Release old space
2996 {
2997 dout(10) << __func__
2998 << " release old log extents " << old_log_fnode.extents
2999 << dendl;
3000 std::lock_guard dl(dirty.lock);
3001 for (auto& r : old_log_fnode.extents) {
3002 dirty.pending_release[r.bdev].insert(r.offset, r.length);
3003 }
3004 }
3005
3006 // 6.4. Enable other compactions
3007 old_is_comp = atomic_exchange(&log_is_compacting, false);
3008 ceph_assert(old_is_comp);
3009 }
3010
3011 void BlueFS::_pad_bl(bufferlist& bl, uint64_t pad_size)
3012 {
3013 pad_size = std::max(pad_size, uint64_t(super.block_size));
3014 uint64_t partial = bl.length() % pad_size;
3015 if (partial) {
3016 dout(10) << __func__ << " padding with 0x" << std::hex
3017 << pad_size - partial << " zeros" << std::dec << dendl;
3018 bl.append_zero(pad_size - partial);
3019 }
3020 }
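
// Padding in numbers: with pad_size 4096 and bl.length() == 6000,
// partial = 6000 % 4096 = 1904, so 4096 - 1904 = 2192 zero bytes are
// appended and the list ends block-aligned at 8192 bytes. The same logic
// over a plain byte string (a sketch, not the bufferlist API):
//
//   void pad(std::string& bl, uint64_t pad_size)
//   {
//     if (uint64_t partial = bl.size() % pad_size; partial)
//       bl.append(pad_size - partial, '\0');
//   }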
3021
3022
3023 // Returns log seq that was live before advance.
3024 uint64_t BlueFS::_log_advance_seq()
3025 {
3026 ceph_assert(ceph_mutex_is_locked(dirty.lock));
3027 ceph_assert(ceph_mutex_is_locked(log.lock));
3028 //acquire new seq
3029 // this will become seq_stable once we write
3030 ceph_assert(dirty.seq_stable < dirty.seq_live);
3031 ceph_assert(log.t.seq == log.seq_live);
3032 uint64_t seq = log.seq_live;
3033 log.t.uuid = super.uuid;
3034
3035 ++dirty.seq_live;
3036 ++log.seq_live;
3037 ceph_assert(dirty.seq_live == log.seq_live);
3038 return seq;
3039 }
3040
3041
3042 // Adds to log.t file modifications mentioned in `dirty.files`.
3043 // Note: some bluefs ops may have already been stored in log.t transaction.
3044 void BlueFS::_consume_dirty(uint64_t seq)
3045 {
3046 ceph_assert(ceph_mutex_is_locked(dirty.lock));
3047 ceph_assert(ceph_mutex_is_locked(log.lock));
3048
3049 // log dirty files
3050 // the log seq was just advanced; it is now illegal to add to dirty.files[seq]
3051 auto lsi = dirty.files.find(seq);
3052 if (lsi != dirty.files.end()) {
3053 dout(20) << __func__ << " " << lsi->second.size() << " dirty.files" << dendl;
3054 for (auto &f : lsi->second) {
3055 // fnode here is protected indirectly
3056 // the only path that adds to dirty.files goes from _fsync()
3057 // _fsync() is executed under writer lock,
3058 // and does not exit until syncing log is done
3059 dout(20) << __func__ << " op_file_update_inc " << f.fnode << dendl;
3060 log.t.op_file_update_inc(f.fnode);
3061 }
3062 }
3063 }
3064
3065 // Extends the log if its free space is smaller than bluefs_min_log_runway.
3066 // Returns the space available *BEFORE* adding new space. Signed to allow <0 error codes.
3067 int64_t BlueFS::_maybe_extend_log()
3068 {
3069 ceph_assert(ceph_mutex_is_locked(log.lock));
3070 // allocate some more space (before we run out)?
3071 // BTW: this triggers `flush()` in the `page_aligned_appender` of `log.writer`.
3072 int64_t runway = log.writer->file->fnode.get_allocated() -
3073 log.writer->get_effective_write_pos();
3074 if (runway < (int64_t)cct->_conf->bluefs_min_log_runway) {
3075 dout(10) << __func__ << " allocating more log runway (0x"
3076 << std::hex << runway << std::dec << " remaining)" << dendl;
3077 /*
3078 * Usually, when we are low on space in log, we just allocate new extent,
3079 * put update op(log) to log and we are fine.
3080 * Problem - it interferes with log compaction:
3081 * New log produced in compaction will include - as last op - jump into some offset (anchor) of current log.
3082 * It is assumed that log region (anchor - end) will contain all changes made by bluefs since
3083 * full state capture into new log.
3084 * Putting log update into (anchor - end) region is illegal, because any update there must be compatible with
3085 * both logs, but the old log is different than the new log.
3086 *
3087 * Possible solutions:
3088 * - stall extending log until we finish compacting and switch log (CURRENT)
3089 * - re-run compaction with more runway for old log
3090 * - add OP_FILE_ADDEXT that adds extent; will be compatible with both logs
3091 */
3092 if (log_forbidden_to_expand.load() == true) {
3093 return -EWOULDBLOCK;
3094 }
3095 vselector->sub_usage(log.writer->file->vselector_hint, log.writer->file->fnode);
3096 int r = _allocate(
3097 vselector->select_prefer_bdev(log.writer->file->vselector_hint),
3098 cct->_conf->bluefs_max_log_runway,
3099 0,
3100 &log.writer->file->fnode);
3101 ceph_assert(r == 0);
3102 vselector->add_usage(log.writer->file->vselector_hint, log.writer->file->fnode);
3103 log.t.op_file_update_inc(log.writer->file->fnode);
3104 }
3105 return runway;
3106 }
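
// "Runway" is the gap between what is allocated to the log file and the
// writer's effective position; the extension policy as a sketch:
//
//   int64_t runway = int64_t(allocated) - int64_t(write_pos);
//   if (runway < min_log_runway)        // bluefs_min_log_runway
//     allocate(max_log_runway);         // grow by bluefs_max_log_runway
//
// so the log is extended in large steps well before it actually runs out.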
3107
3108 void BlueFS::_flush_and_sync_log_core(int64_t runway)
3109 {
3110 ceph_assert(ceph_mutex_is_locked(log.lock));
3111 dout(10) << __func__ << " " << log.t << dendl;
3112
3113 bufferlist bl;
3114 bl.reserve(super.block_size);
3115 encode(log.t, bl);
3116 // pad to block boundary
3117 size_t realign = super.block_size - (bl.length() % super.block_size);
3118 if (realign && realign != super.block_size)
3119 bl.append_zero(realign);
3120
3121 logger->inc(l_bluefs_log_write_count, 1);
3122 logger->inc(l_bluefs_logged_bytes, bl.length());
3123
3124 // writing past the runway would mean unrecoverable data loss: the
3125 // transaction would not fit the extents allocated before growth -> data loss on _replay
3126 ceph_assert(bl.length() <= runway);
3128
3129 log.writer->append(bl);
3130
3131 // prepare log for new transactions
3132 log.t.clear();
3133 log.t.seq = log.seq_live;
3134
3135 uint64_t new_data = _flush_special(log.writer);
3136 vselector->add_usage(log.writer->file->vselector_hint, new_data);
3137 }
3138
3139 // Clears dirty.files up to (including) seq_stable.
3140 void BlueFS::_clear_dirty_set_stable_D(uint64_t seq)
3141 {
3142 std::lock_guard dl(dirty.lock);
3143
3144 // clean dirty files
3145 if (seq > dirty.seq_stable) {
3146 dirty.seq_stable = seq;
3147 dout(20) << __func__ << " seq_stable " << dirty.seq_stable << dendl;
3148
3149 // undirty all files that were already streamed to log
3150 auto p = dirty.files.begin();
3151 while (p != dirty.files.end()) {
3152 if (p->first > dirty.seq_stable) {
3153 dout(20) << __func__ << " done cleaning up dirty files" << dendl;
3154 break;
3155 }
3156
3157 auto l = p->second.begin();
3158 while (l != p->second.end()) {
3159 File *file = &*l;
3160 ceph_assert(file->dirty_seq <= dirty.seq_stable);
3161 dout(20) << __func__ << " cleaned file " << file->fnode.ino << dendl;
3162 file->dirty_seq = dirty.seq_stable;
3163 p->second.erase(l++);
3164 }
3165
3166 ceph_assert(p->second.empty());
3167 dirty.files.erase(p++);
3168 }
3169 } else {
3170 dout(20) << __func__ << " seq_stable " << dirty.seq_stable
3171 << " already >= out seq " << seq
3172 << ", we lost a race against another log flush, done" << dendl;
3173 }
3174 }
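
// dirty.files maps seq -> list of dirty files; making seq S stable drops
// every bucket with key <= S and pins each file's dirty_seq to S. The same
// shape over std::map (a simplified sketch; File* stands in for the
// intrusive list entries):
//
//   std::map<uint64_t, std::vector<File*>> files;
//
//   void make_stable(uint64_t s)
//   {
//     for (auto p = files.begin();
//          p != files.end() && p->first <= s; ) {
//       for (File* f : p->second)
//         f->dirty_seq = s;
//       p = files.erase(p);
//     }
//   }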
3175
3176 void BlueFS::_release_pending_allocations(vector<interval_set<uint64_t>>& to_release)
3177 {
3178 for (unsigned i = 0; i < to_release.size(); ++i) {
3179 if (to_release[i].empty()) {
3180 continue;
3181 }
3182 /* OK, now we have the guarantee alloc[i] won't be null. */
3183
3184 bool discard_queued = bdev[i]->try_discard(to_release[i]);
3185 if (!discard_queued) {
3186 alloc[i]->release(to_release[i]);
3187 if (is_shared_alloc(i)) {
3188 shared_alloc->bluefs_used -= to_release[i].size();
3189 }
3190 }
3191 }
3192 }
3193
3194 int BlueFS::_flush_and_sync_log_LD(uint64_t want_seq)
3195 {
3196 int64_t available_runway;
3197 do {
3198 log.lock.lock();
3199 dirty.lock.lock();
3200 if (want_seq && want_seq <= dirty.seq_stable) {
3201 dout(10) << __func__ << " want_seq " << want_seq << " <= seq_stable "
3202 << dirty.seq_stable << ", done" << dendl;
3203 dirty.lock.unlock();
3204 log.lock.unlock();
3205 return 0;
3206 }
3207
3208 available_runway = _maybe_extend_log();
3209 if (available_runway == -EWOULDBLOCK) {
3210 // we need more log runway, but a compaction log-switch is in progress; wait for it
3211 dirty.lock.unlock();
3212 // instead of log.lock.unlock(), move lock ownership into ll
3213 std::unique_lock<ceph::mutex> ll(log.lock, std::adopt_lock);
3214 while (log_forbidden_to_expand.load()) {
3215 log_cond.wait(ll);
3216 }
3217 } else {
3218 ceph_assert(available_runway >= 0);
3219 }
3220 } while (available_runway < 0);
3221
3222 ceph_assert(want_seq == 0 || want_seq <= dirty.seq_live); // illegal to request seq that was not created yet
3223 uint64_t seq =_log_advance_seq();
3224 _consume_dirty(seq);
3225 vector<interval_set<uint64_t>> to_release(dirty.pending_release.size());
3226 to_release.swap(dirty.pending_release);
3227 dirty.lock.unlock();
3228
3229 _flush_and_sync_log_core(available_runway);
3230 _flush_bdev(log.writer);
3231 logger->set(l_bluefs_log_bytes, log.writer->file->fnode.size);
3232 //now log.lock is no longer needed
3233 log.lock.unlock();
3234
3235 _clear_dirty_set_stable_D(seq);
3236 _release_pending_allocations(to_release);
3237
3238 _update_logger_stats();
3239 return 0;
3240 }
3241
3242 // Flushes log and immediately adjusts log_writer pos.
3243 int BlueFS::_flush_and_sync_log_jump_D(uint64_t jump_to,
3244 int64_t available_runway)
3245 {
3246 ceph_assert(ceph_mutex_is_locked(log.lock));
3247
3248 ceph_assert(jump_to);
3249 // writing to the log is synchronized by holding log.lock
3250
3251 dirty.lock.lock();
3252 uint64_t seq =_log_advance_seq();
3253 _consume_dirty(seq);
3254 vector<interval_set<uint64_t>> to_release(dirty.pending_release.size());
3255 to_release.swap(dirty.pending_release);
3256 dirty.lock.unlock();
3257 _flush_and_sync_log_core(available_runway);
3258
3259 dout(10) << __func__ << " jumping log offset from 0x" << std::hex
3260 << log.writer->pos << " -> 0x" << jump_to << std::dec << dendl;
3261 log.writer->pos = jump_to;
3262 vselector->sub_usage(log.writer->file->vselector_hint, log.writer->file->fnode.size);
3263 log.writer->file->fnode.size = jump_to;
3264 vselector->add_usage(log.writer->file->vselector_hint, log.writer->file->fnode.size);
3265
3266 _flush_bdev(log.writer);
3267
3268 _clear_dirty_set_stable_D(seq);
3269 _release_pending_allocations(to_release);
3270
3271 logger->set(l_bluefs_log_bytes, log.writer->file->fnode.size);
3272 _update_logger_stats();
3273 return 0;
3274 }
3275
3276 ceph::bufferlist BlueFS::FileWriter::flush_buffer(
3277 CephContext* const cct,
3278 const bool partial,
3279 const unsigned length,
3280 const bluefs_super_t& super)
3281 {
3282 ceph_assert(ceph_mutex_is_locked(this->lock) || file->fnode.ino <= 1);
3283 ceph::bufferlist bl;
3284 if (partial) {
3285 tail_block.splice(0, tail_block.length(), &bl);
3286 }
3287 const auto remaining_len = length - bl.length();
3288 buffer.splice(0, remaining_len, &bl);
3289 if (buffer.length()) {
3290 dout(20) << " leaving 0x" << std::hex << buffer.length() << std::dec
3291 << " unflushed" << dendl;
3292 }
3293 if (const unsigned tail = bl.length() & ~super.block_mask(); tail) {
3294 const auto padding_len = super.block_size - tail;
3295 dout(20) << __func__ << " caching tail of 0x"
3296 << std::hex << tail
3297 << " and padding block with 0x" << padding_len
3298 << " buffer.length() " << buffer.length()
3299 << std::dec << dendl;
3300 // We need to go through the `buffer_appender` to get a chance to
3301 // preserve in-memory contiguity and not mess with the alignment.
3302 // Otherwise a costly rebuild could happen in e.g. `KernelDevice`.
3303 buffer_appender.append_zero(padding_len);
3304 buffer.splice(buffer.length() - padding_len, padding_len, &bl);
3305 // Deep copy the tail here. This allows to avoid costlier copy on
3306 // bufferlist rebuild in e.g. `KernelDevice` and minimizes number
3307 // of memory allocations.
3308 // The alternative approach would be to place the entire tail and
3309 // padding on a dedicated, 4 KB long memory chunk. This shouldn't
3310 // trigger the rebuild while still being less expensive.
3311 buffer_appender.substr_of(bl, bl.length() - padding_len - tail, tail);
3312 buffer.splice(buffer.length() - tail, tail, &tail_block);
3313 } else {
3314 tail_block.clear();
3315 }
3316 return bl;
3317 }
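
// Tail handling in numbers: with block_size 4096 and length 10000,
// tail = 10000 & 4095 = 1808 and padding_len = 2288, so the returned list
// is padded out to 12288 bytes while the last 1808 real bytes are kept in
// tail_block; the next partial flush splices them back in and rewrites
// that final block in place.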
3318
3319 int BlueFS::_signal_dirty_to_log_D(FileWriter *h)
3320 {
3321 ceph_assert(ceph_mutex_is_locked(h->lock));
3322 std::lock_guard dl(dirty.lock);
3323 if (h->file->deleted) {
3324 dout(10) << __func__ << " deleted, no-op" << dendl;
3325 return 0;
3326 }
3327
3328 h->file->fnode.mtime = ceph_clock_now();
3329 ceph_assert(h->file->fnode.ino >= 1);
3330 if (h->file->dirty_seq <= dirty.seq_stable) {
3331 h->file->dirty_seq = dirty.seq_live;
3332 dirty.files[h->file->dirty_seq].push_back(*h->file);
3333 dout(20) << __func__ << " dirty_seq = " << dirty.seq_live
3334 << " (was clean)" << dendl;
3335 } else {
3336 if (h->file->dirty_seq != dirty.seq_live) {
3337 // need re-dirty, erase from list first
3338 ceph_assert(dirty.files.count(h->file->dirty_seq));
3339 auto it = dirty.files[h->file->dirty_seq].iterator_to(*h->file);
3340 dirty.files[h->file->dirty_seq].erase(it);
3341 auto old_dirty_seq = h->file->dirty_seq; h->file->dirty_seq = dirty.seq_live; // keep old seq for the log line below
3342 dirty.files[h->file->dirty_seq].push_back(*h->file);
3343 dout(20) << __func__ << " dirty_seq = " << dirty.seq_live
3344 << " (was " << h->file->dirty_seq << ")" << dendl;
3345 } else {
3346 dout(20) << __func__ << " dirty_seq = " << dirty.seq_live
3347 << " (unchanged, do nothing) " << dendl;
3348 }
3349 }
3350 return 0;
3351 }
3352
3353 void BlueFS::flush_range(FileWriter *h, uint64_t offset, uint64_t length)/*_WF*/
3354 {
3355 _maybe_check_vselector_LNF();
3356 std::unique_lock hl(h->lock);
3357 _flush_range_F(h, offset, length);
3358 }
3359
3360 int BlueFS::_flush_range_F(FileWriter *h, uint64_t offset, uint64_t length)
3361 {
3362 ceph_assert(ceph_mutex_is_locked(h->lock));
3363 ceph_assert(h->file->num_readers.load() == 0);
3364 ceph_assert(h->file->fnode.ino > 1);
3365
3366 dout(10) << __func__ << " " << h << " pos 0x" << std::hex << h->pos
3367 << " 0x" << offset << "~" << length << std::dec
3368 << " to " << h->file->fnode << dendl;
3369 if (h->file->deleted) {
3370 dout(10) << __func__ << " deleted, no-op" << dendl;
3371 return 0;
3372 }
3373
3374 bool buffered = cct->_conf->bluefs_buffered_io;
3375
3376 if (offset + length <= h->pos)
3377 return 0;
3378 if (offset < h->pos) {
3379 length -= h->pos - offset;
3380 offset = h->pos;
3381 dout(10) << " still need 0x"
3382 << std::hex << offset << "~" << length << std::dec
3383 << dendl;
3384 }
3385 std::lock_guard file_lock(h->file->lock);
3386 ceph_assert(offset <= h->file->fnode.size);
3387
3388 uint64_t allocated = h->file->fnode.get_allocated();
3389 vselector->sub_usage(h->file->vselector_hint, h->file->fnode);
3390 // do not bother to dirty the file if we are overwriting
3391 // previously allocated extents.
3392 if (allocated < offset + length) {
3393 // we should never run out of log space here; see the min runway check
3394 // in _flush_and_sync_log.
3395 int r = _allocate(vselector->select_prefer_bdev(h->file->vselector_hint),
3396 offset + length - allocated,
3397 0,
3398 &h->file->fnode);
3399 if (r < 0) {
3400 derr << __func__ << " allocated: 0x" << std::hex << allocated
3401 << " offset: 0x" << offset << " length: 0x" << length << std::dec
3402 << dendl;
3403 vselector->add_usage(h->file->vselector_hint, h->file->fnode); // undo
3404 ceph_abort_msg("bluefs enospc");
3405 return r;
3406 }
3407 h->file->is_dirty = true;
3408 }
3409 if (h->file->fnode.size < offset + length) {
3410 h->file->fnode.size = offset + length;
3411 h->file->is_dirty = true;
3412 }
3413
3414 dout(20) << __func__ << " file now, unflushed " << h->file->fnode << dendl;
3415 int res = _flush_data(h, offset, length, buffered);
3416 vselector->add_usage(h->file->vselector_hint, h->file->fnode);
3417 return res;
3418 }
3419
3420 int BlueFS::_flush_data(FileWriter *h, uint64_t offset, uint64_t length, bool buffered)
3421 {
3422 if (h->file->fnode.ino > 1) {
3423 ceph_assert(ceph_mutex_is_locked(h->lock));
3424 ceph_assert(ceph_mutex_is_locked(h->file->lock));
3425 }
3426 uint64_t x_off = 0;
3427 auto p = h->file->fnode.seek(offset, &x_off);
3428 ceph_assert(p != h->file->fnode.extents.end());
3429 dout(20) << __func__ << " in " << *p << " x_off 0x"
3430 << std::hex << x_off << std::dec << dendl;
3431
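// device writes must be block-aligned; if x_off lands mid-block, back
// up to the block boundary and rewrite the partial tail together with
// the new data, after waiting out any aio still covering that block.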
3432 unsigned partial = x_off & ~super.block_mask();
3433 if (partial) {
3434 dout(20) << __func__ << " using partial tail 0x"
3435 << std::hex << partial << std::dec << dendl;
3436 x_off -= partial;
3437 offset -= partial;
3438 length += partial;
3439 dout(20) << __func__ << " waiting for previous aio to complete" << dendl;
3440 for (auto p : h->iocv) {
3441 if (p) {
3442 p->aio_wait();
3443 }
3444 }
3445 }
3446
3447 auto bl = h->flush_buffer(cct, partial, length, super);
3448 ceph_assert(bl.length() >= length);
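// flush_buffer may hand back more than the logical length (padding,
// e.g. out to a block boundary), so advance h->pos by the logical
// length only and let the loop below write all bl.length() bytes.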
3449 h->pos = offset + length;
3450 length = bl.length();
3451
3452 logger->inc(l_bluefs_write_count, 1);
3453 logger->inc(l_bluefs_write_bytes, length);
3454
3455 switch (h->writer_type) {
3456 case WRITER_WAL:
3457 logger->inc(l_bluefs_write_count_wal, 1);
3458 logger->inc(l_bluefs_bytes_written_wal, length);
3459 break;
3460 case WRITER_SST:
3461 logger->inc(l_bluefs_write_count_sst, 1);
3462 logger->inc(l_bluefs_bytes_written_sst, length);
3463 break;
3464 }
3465
3466 dout(30) << "dump:\n";
3467 bl.hexdump(*_dout);
3468 *_dout << dendl;
3469
3470 uint64_t bloff = 0;
3471 uint64_t bytes_written_slow = 0;
3472 while (length > 0) {
3473 logger->inc(l_bluefs_write_disk_count, 1);
3474
3475 uint64_t x_len = std::min(p->length - x_off, length);
3476 bufferlist t;
3477 t.substr_of(bl, bloff, x_len);
3478 if (cct->_conf->bluefs_sync_write) {
3479 bdev[p->bdev]->write(p->offset + x_off, t, buffered, h->write_hint);
3480 } else {
3481 bdev[p->bdev]->aio_write(p->offset + x_off, t, h->iocv[p->bdev], buffered, h->write_hint);
3482 }
3483 h->dirty_devs[p->bdev] = true;
3484 if (p->bdev == BDEV_SLOW) {
3485 bytes_written_slow += t.length();
3486 }
3487
3488 bloff += x_len;
3489 length -= x_len;
3490 ++p;
3491 x_off = 0;
3492 }
3493 if (bytes_written_slow) {
3494 logger->inc(l_bluefs_bytes_written_slow, bytes_written_slow);
3495 }
3496 for (unsigned i = 0; i < MAX_BDEV; ++i) {
3497 if (bdev[i]) {
3498 if (h->iocv[i] && h->iocv[i]->has_pending_aios()) {
3499 bdev[i]->aio_submit(h->iocv[i]);
3500 }
3501 }
3502 }
3503 dout(20) << __func__ << " h " << h << " pos now 0x"
3504 << std::hex << h->pos << std::dec << dendl;
3505 return 0;
3506 }
3507
3508 #ifdef HAVE_LIBAIO
3509 // we need to retire old completed aios so they don't stick around in
3510 // memory indefinitely (along with their bufferlist refs).
3511 void BlueFS::_claim_completed_aios(FileWriter *h, list<aio_t> *ls)
3512 {
3513 for (auto p : h->iocv) {
3514 if (p) {
3515 ls->splice(ls->end(), p->running_aios);
3516 }
3517 }
3518 dout(10) << __func__ << " got " << ls->size() << " aios" << dendl;
3519 }
3520
3521 void BlueFS::_wait_for_aio(FileWriter *h)
3522 {
3523 // NOTE: this is safe to call without a lock, as long as our reference is
3524 // stable.
3525 utime_t start;
3526 lgeneric_subdout(cct, bluefs, 10) << __func__;
3527 start = ceph_clock_now();
3528 *_dout << " " << h << dendl;
3529 for (auto p : h->iocv) {
3530 if (p) {
3531 p->aio_wait();
3532 }
3533 }
3534 dout(10) << __func__ << " " << h << " done in " << (ceph_clock_now() - start) << dendl;
3535 }
3536 #endif
3537
3538 void BlueFS::append_try_flush(FileWriter *h, const char* buf, size_t len)/*_WF_LNF_NF_LD_D*/
3539 {
3540 bool flushed_sum = false;
3541 {
3542 std::unique_lock hl(h->lock);
3543 size_t max_size = 1ull << 30; // cap to 1GB
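// append in chunks of at most max_size, flushing whenever the buffered
// data reaches bluefs_min_flush_size (or the buffer filled up), so the
// loop below always makes progress.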
3544 while (len > 0) {
3545 bool need_flush = true;
3546 auto l0 = h->get_buffer_length();
3547 if (l0 < max_size) {
3548 size_t l = std::min(len, max_size - l0);
3549 h->append(buf, l);
3550 buf += l;
3551 len -= l;
3552 need_flush = h->get_buffer_length() >= cct->_conf->bluefs_min_flush_size;
3553 }
3554 if (need_flush) {
3555 bool flushed = false;
3556 int r = _flush_F(h, true, &flushed);
3557 ceph_assert(r == 0);
3558 flushed_sum |= flushed;
3559 // make sure we've made some progress with the flush so the
3560 // loop can't iterate forever
3561 ceph_assert(h->get_buffer_length() < max_size);
3562 }
3563 }
3564 }
3565 if (flushed_sum) {
3566 _maybe_compact_log_LNF_NF_LD_D();
3567 }
3568 }
3569
3570 void BlueFS::flush(FileWriter *h, bool force)/*_WF_LNF_NF_LD_D*/
3571 {
3572 bool flushed = false;
3573 int r;
3574 {
3575 std::unique_lock hl(h->lock);
3576 r = _flush_F(h, force, &flushed);
3577 ceph_assert(r == 0);
3578 }
3579 if (r == 0 && flushed) {
3580 _maybe_compact_log_LNF_NF_LD_D();
3581 }
3582 }
3583
3584 int BlueFS::_flush_F(FileWriter *h, bool force, bool *flushed)
3585 {
3586 ceph_assert(ceph_mutex_is_locked(h->lock));
3587 uint64_t length = h->get_buffer_length();
3588 uint64_t offset = h->pos;
3589 if (flushed) {
3590 *flushed = false;
3591 }
3592 if (!force &&
3593 length < cct->_conf->bluefs_min_flush_size) {
3594 dout(10) << __func__ << " " << h << " ignoring, length " << length
3595 << " < min_flush_size " << cct->_conf->bluefs_min_flush_size
3596 << dendl;
3597 return 0;
3598 }
3599 if (length == 0) {
3600 dout(10) << __func__ << " " << h << " no dirty data on "
3601 << h->file->fnode << dendl;
3602 return 0;
3603 }
3604 dout(10) << __func__ << " " << h << " 0x"
3605 << std::hex << offset << "~" << length << std::dec
3606 << " to " << h->file->fnode << dendl;
3607 ceph_assert(h->pos <= h->file->fnode.size);
3608 int r = _flush_range_F(h, offset, length);
3609 if (flushed) {
3610 *flushed = true;
3611 }
3612 return r;
3613 }
3614
3615 // Flush for bluefs special files.
3616 // Does not add extents to h.
3617 // Does not mark h as dirty.
3618 // We do not need to dirty the log file (or its compacting
3619 // replacement) when the file size changes, because replay is
3620 // smart enough to discover it on its own.
3621 uint64_t BlueFS::_flush_special(FileWriter *h)
3622 {
3623 ceph_assert(h->file->fnode.ino <= 1);
3624 uint64_t length = h->get_buffer_length();
3625 uint64_t offset = h->pos;
3626 uint64_t new_data = 0;
3627 ceph_assert(length + offset <= h->file->fnode.get_allocated());
3628 if (h->file->fnode.size < offset + length) {
3629 new_data = offset + length - h->file->fnode.size;
3630 h->file->fnode.size = offset + length;
3631 }
3632 _flush_data(h, offset, length, false);
3633 return new_data;
3634 }
3635
3636 int BlueFS::truncate(FileWriter *h, uint64_t offset)/*_WF_L*/
3637 {
3638 std::lock_guard hl(h->lock);
3639 dout(10) << __func__ << " 0x" << std::hex << offset << std::dec
3640 << " file " << h->file->fnode << dendl;
3641 if (h->file->deleted) {
3642 dout(10) << __func__ << " deleted, no-op" << dendl;
3643 return 0;
3644 }
3645
3646 // we never truncate internal log files
3647 ceph_assert(h->file->fnode.ino > 1);
3648
3649 // truncate off unflushed data?
3650 if (h->pos < offset &&
3651 h->pos + h->get_buffer_length() > offset) {
3652 dout(20) << __func__ << " tossing out last " << offset - h->pos
3653 << " unflushed bytes" << dendl;
3654 ceph_abort_msg("actually this shouldn't happen");
3655 }
3656 if (h->get_buffer_length()) {
3657 int r = _flush_F(h, true);
3658 if (r < 0)
3659 return r;
3660 }
3661 if (offset == h->file->fnode.size) {
3662 return 0; // no-op!
3663 }
3664 if (offset > h->file->fnode.size) {
3665 ceph_abort_msg("truncate up not supported");
3666 }
3667 ceph_assert(h->file->fnode.size >= offset);
3668 _flush_bdev(h);
3669
3670 std::lock_guard ll(log.lock);
3671 vselector->sub_usage(h->file->vselector_hint, h->file->fnode.size);
3672 h->file->fnode.size = offset;
3673 h->file->is_dirty = true;
3674 vselector->add_usage(h->file->vselector_hint, h->file->fnode.size);
3675 log.t.op_file_update_inc(h->file->fnode);
3676 return 0;
3677 }
3678
3679 int BlueFS::fsync(FileWriter *h)/*_WF_WD_WLD_WLNF_WNF*/
3680 {
3681 _maybe_check_vselector_LNF();
3682 std::unique_lock hl(h->lock);
3683 uint64_t old_dirty_seq = 0;
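// if this file's metadata went dirty after the last stable log
// sequence, remember that sequence so the log gets flushed and synced
// once the data itself is on disk.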
3684 {
3685 dout(10) << __func__ << " " << h << " " << h->file->fnode
3686 << " dirty " << h->file->is_dirty << dendl;
3687 int r = _flush_F(h, true);
3688 if (r < 0)
3689 return r;
3690 _flush_bdev(h);
3691 if (h->file->is_dirty) {
3692 _signal_dirty_to_log_D(h);
3693 h->file->is_dirty = false;
3694 }
3695 {
3696 std::lock_guard dl(dirty.lock);
3697 if (dirty.seq_stable < h->file->dirty_seq) {
3698 old_dirty_seq = h->file->dirty_seq;
3699 dout(20) << __func__ << " file metadata was dirty (" << old_dirty_seq
3700 << ") on " << h->file->fnode << ", flushing log" << dendl;
3701 }
3702 }
3703 }
3704 if (old_dirty_seq) {
3705 _flush_and_sync_log_LD(old_dirty_seq);
3706 }
3707 _maybe_compact_log_LNF_NF_LD_D();
3708
3709 return 0;
3710 }
3711
3712 // be careful - either h->file->lock or log.lock must be taken
3713 void BlueFS::_flush_bdev(FileWriter *h, bool check_mutex_locked)
3714 {
3715 if (check_mutex_locked) {
3716 if (h->file->fnode.ino > 1) {
3717 ceph_assert(ceph_mutex_is_locked(h->lock));
3718 } else if (h->file->fnode.ino == 1) {
3719 ceph_assert(ceph_mutex_is_locked(log.lock));
3720 }
3721 }
3722 std::array<bool, MAX_BDEV> flush_devs = h->dirty_devs;
3723 h->dirty_devs.fill(false);
3724 #ifdef HAVE_LIBAIO
3725 if (!cct->_conf->bluefs_sync_write) {
3726 list<aio_t> completed_ios;
3727 _claim_completed_aios(h, &completed_ios);
3728 _wait_for_aio(h);
3729 completed_ios.clear();
3730 }
3731 #endif
3732 _flush_bdev(flush_devs);
3733 }
3734
3735 void BlueFS::_flush_bdev(std::array<bool, MAX_BDEV>& dirty_bdevs)
3736 {
3737 // NOTE: this is safe to call without a lock.
3738 dout(20) << __func__ << dendl;
3739 for (unsigned i = 0; i < MAX_BDEV; i++) {
3740 if (dirty_bdevs[i])
3741 bdev[i]->flush();
3742 }
3743 }
3744
3745 void BlueFS::_flush_bdev()
3746 {
3747 // NOTE: this is safe to call without a lock.
3748 dout(20) << __func__ << dendl;
3749 for (unsigned i = 0; i < MAX_BDEV; i++) {
3750 // allocating space from BDEV_SLOW is unexpected, so in most cases we
3751 // don't allocate from it; skip flushing a device we never used.
3752 if (bdev[i] && (i != BDEV_SLOW || _get_used(i))) {
3753 bdev[i]->flush();
3754 }
3755 }
3756 }
3757
3758 const char* BlueFS::get_device_name(unsigned id)
3759 {
3760 if (id >= MAX_BDEV) return "BDEV_INV";
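// order must match the BDEV_* enum values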
3761 const char* names[] = {"BDEV_WAL", "BDEV_DB", "BDEV_SLOW", "BDEV_NEWWAL", "BDEV_NEWDB"};
3762 return names[id];
3763 }
3764
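// Allocate len bytes from device id into node. On failure this retries
// with the shared (slow device) allocation unit where applicable and,
// if permit_dev_fallback is set, falls through to the next device;
// -ENOSPC is returned only once every fallback is exhausted.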
3765 int BlueFS::_allocate(uint8_t id, uint64_t len,
3766 uint64_t alloc_unit,
3767 bluefs_fnode_t* node,
3768 size_t alloc_attempts,
3769 bool permit_dev_fallback)
3770 {
3771 dout(10) << __func__ << " len 0x" << std::hex << len
3772 << " au 0x" << alloc_unit
3773 << std::dec << " from " << (int)id
3774 << " cooldown " << cooldown_deadline
3775 << dendl;
3776 ceph_assert(id < alloc.size());
3777 int64_t alloc_len = 0;
3778 PExtentVector extents;
3779 uint64_t hint = 0;
3780 int64_t need = len;
3781 bool shared = is_shared_alloc(id);
3782 auto shared_unit = shared_alloc ? shared_alloc->alloc_unit : 0;
3783 bool was_cooldown = false;
3784 if (alloc[id]) {
3785 if (!alloc_unit) {
3786 alloc_unit = alloc_size[id];
3787 }
3788 // do not attempt the shared allocator with the bluefs alloc unit
3789 // while cooling down; fall back to the slow device alloc unit.
3790 if (shared && alloc_unit != shared_unit) {
3791 if (duration_cast<seconds>(real_clock::now().time_since_epoch()).count() <
3792 cooldown_deadline) {
3793 logger->inc(l_bluefs_alloc_shared_size_fallbacks);
3794 alloc_unit = shared_unit;
3795 was_cooldown = true;
3796 } else if (cooldown_deadline.fetch_and(0)) {
3797 // we might get a spurious cooldown_deadline reset at this point,
3798 // but that's mostly harmless.
3799 dout(1) << __func__ << " shared allocation cooldown period elapsed"
3800 << dendl;
3801 }
3802 }
3803 need = round_up_to(len, alloc_unit);
3804 if (!node->extents.empty() && node->extents.back().bdev == id) {
3805 hint = node->extents.back().end();
3806 }
3807 ++alloc_attempts;
3808 extents.reserve(4); // 4 should be (more than) enough for most allocations
3809 alloc_len = alloc[id]->allocate(need, alloc_unit, hint, &extents);
3810 }
3811 if (alloc_len < 0 || alloc_len < need) {
3812 if (alloc[id]) {
3813 if (alloc_len > 0) {
3814 alloc[id]->release(extents);
3815 }
3816 if (!was_cooldown && shared) {
3817 auto delay_s = cct->_conf->bluefs_failed_shared_alloc_cooldown;
3818 cooldown_deadline = delay_s +
3819 duration_cast<seconds>(real_clock::now().time_since_epoch()).count();
3820 dout(1) << __func__ << " shared allocation cooldown set for "
3821 << delay_s << "s"
3822 << dendl;
3823 }
3824 dout(1) << __func__ << " unable to allocate 0x" << std::hex << need
3825 << " on bdev " << (int)id
3826 << ", allocator name " << alloc[id]->get_name()
3827 << ", allocator type " << alloc[id]->get_type()
3828 << ", capacity 0x" << alloc[id]->get_capacity()
3829 << ", block size 0x" << alloc[id]->get_block_size()
3830 << ", alloc unit 0x" << alloc_unit
3831 << ", free 0x" << alloc[id]->get_free()
3832 << ", fragmentation " << alloc[id]->get_fragmentation()
3833 << ", allocated 0x" << (alloc_len > 0 ? alloc_len : 0)
3834 << std::dec << dendl;
3835 } else {
3836 dout(20) << __func__ << " alloc-id not set on index=" << (int)id
3837 << " unable to allocate 0x" << std::hex << need
3838 << " on bdev " << (int)id << std::dec << dendl;
3839 }
3840 if (alloc[id] && shared && alloc_unit != shared_unit) {
3841 alloc_unit = shared_unit;
3842 dout(20) << __func__ << " fallback to bdev "
3843 << (int)id
3844 << " with alloc unit 0x" << std::hex << alloc_unit
3845 << std::dec << dendl;
3846 logger->inc(l_bluefs_alloc_shared_size_fallbacks);
3847 return _allocate(id,
3848 len,
3849 alloc_unit,
3850 node,
3851 alloc_attempts,
3852 permit_dev_fallback);
3853 } else if (permit_dev_fallback && id != BDEV_SLOW && alloc[id + 1]) {
3854 dout(20) << __func__ << " fallback to bdev "
3855 << (int)id + 1
3856 << dendl;
3857 if (alloc_attempts > 0 && is_shared_alloc(id + 1)) {
3858 logger->inc(l_bluefs_alloc_shared_dev_fallbacks);
3859 }
3860 return _allocate(id + 1,
3861 len,
3862 0, // back to default alloc unit
3863 node,
3864 alloc_attempts,
3865 permit_dev_fallback);
3866 } else {
3867 derr << __func__ << " allocation failed, needed 0x" << std::hex << need
3868 << dendl;
3869 }
3870 return -ENOSPC;
3871 } else {
3872 uint64_t used = _get_used(id);
3873 if (max_bytes[id] < used) {
3874 logger->set(max_bytes_pcounters[id], used);
3875 max_bytes[id] = used;
3876 }
3877 if (shared) {
3878 shared_alloc->bluefs_used += alloc_len;
3879 }
3880 }
3881
3882 for (auto& p : extents) {
3883 node->append_extent(bluefs_extent_t(id, p.offset, p.length));
3884 }
3885
3886 return 0;
3887 }
3888
3889 int BlueFS::preallocate(FileRef f, uint64_t off, uint64_t len)/*_LF*/
3890 {
3891 std::lock_guard ll(log.lock);
3892 std::lock_guard fl(f->lock);
3893 dout(10) << __func__ << " file " << f->fnode << " 0x"
3894 << std::hex << off << "~" << len << std::dec << dendl;
3895 if (f->deleted) {
3896 dout(10) << __func__ << " deleted, no-op" << dendl;
3897 return 0;
3898 }
3899 ceph_assert(f->fnode.ino > 1);
3900 uint64_t allocated = f->fnode.get_allocated();
3901 if (off + len > allocated) {
3902 uint64_t want = off + len - allocated;
3903
3904 vselector->sub_usage(f->vselector_hint, f->fnode);
3905 int r = _allocate(vselector->select_prefer_bdev(f->vselector_hint),
3906 want,
3907 0,
3908 &f->fnode);
3909 vselector->add_usage(f->vselector_hint, f->fnode);
3910 if (r < 0)
3911 return r;
3912
3913 log.t.op_file_update_inc(f->fnode);
3914 }
3915 return 0;
3916 }
3917
3918 void BlueFS::sync_metadata(bool avoid_compact)/*_LNF_NF_LD_D*/
3919 {
3920 bool can_skip_flush;
3921 {
3922 std::lock_guard ll(log.lock);
3923 std::lock_guard dl(dirty.lock);
3924 can_skip_flush = log.t.empty() && dirty.files.empty();
3925 }
3926 if (can_skip_flush) {
3927 dout(10) << __func__ << " - no pending log events" << dendl;
3928 } else {
3929 utime_t start;
3930 lgeneric_subdout(cct, bluefs, 10) << __func__;
3931 start = ceph_clock_now();
3932 *_dout << dendl;
3933 _flush_bdev(); // FIXME?
3934 _flush_and_sync_log_LD();
3935 dout(10) << __func__ << " done in " << (ceph_clock_now() - start) << dendl;
3936 }
3937
3938 if (!avoid_compact) {
3939 _maybe_compact_log_LNF_NF_LD_D();
3940 }
3941 }
3942
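// bluefs_compact_log_sync selects between the synchronous log rewrite
// and the asynchronous compaction path.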
3943 void BlueFS::_maybe_compact_log_LNF_NF_LD_D()
3944 {
3945 if (!cct->_conf->bluefs_replay_recovery_disable_compact &&
3946 _should_start_compact_log_L_N()) {
3947 auto t0 = mono_clock::now();
3948 if (cct->_conf->bluefs_compact_log_sync) {
3949 _compact_log_sync_LNF_LD();
3950 } else {
3951 _compact_log_async_LD_LNF_D();
3952 }
3953 logger->tinc(l_bluefs_compaction_lat, mono_clock::now() - t0);
3954 }
3955 }
3956
3957 int BlueFS::open_for_write(
3958 std::string_view dirname,
3959 std::string_view filename,
3960 FileWriter **h,
3961 bool overwrite)/*_LND*/
3962 {
3963 _maybe_check_vselector_LNF();
3964 FileRef file;
3965 bool create = false;
3966 bool truncate = false;
3967 mempool::bluefs::vector<bluefs_extent_t> pending_release_extents;
3968 {
3969 std::lock_guard ll(log.lock);
3970 std::lock_guard nl(nodes.lock);
3971 dout(10) << __func__ << " " << dirname << "/" << filename << dendl;
3972 map<string,DirRef>::iterator p = nodes.dir_map.find(dirname);
3973 DirRef dir;
3974 if (p == nodes.dir_map.end()) {
3975 // the dir must already exist; we do not implicitly create it
3976 dout(20) << __func__ << " dir " << dirname
3977 << " does not exist" << dendl;
3978 return -ENOENT;
3979 } else {
3980 dir = p->second;
3981 }
3982
3983 map<string,FileRef>::iterator q = dir->file_map.find(filename);
3984 if (q == dir->file_map.end()) {
3985 if (overwrite) {
3986 dout(20) << __func__ << " dir " << dirname << " (" << dir
3987 << ") file " << filename
3988 << " does not exist" << dendl;
3989 return -ENOENT;
3990 }
3991 file = ceph::make_ref<File>();
3992 file->fnode.ino = ++ino_last;
3993 nodes.file_map[ino_last] = file;
3994 dir->file_map[string{filename}] = file;
3995 ++file->refs;
3996 create = true;
3997 logger->set(l_bluefs_num_files, nodes.file_map.size());
3998 } else {
3999 // overwrite existing file?
4000 file = q->second;
4001 if (overwrite) {
4002 dout(20) << __func__ << " dir " << dirname << " (" << dir
4003 << ") file " << filename
4004 << " already exists, overwrite in place" << dendl;
4005 } else {
4006 dout(20) << __func__ << " dir " << dirname << " (" << dir
4007 << ") file " << filename
4008 << " already exists, truncate + overwrite" << dendl;
4009 vselector->sub_usage(file->vselector_hint, file->fnode);
4010 file->fnode.size = 0;
4011 pending_release_extents.swap(file->fnode.extents);
4012 truncate = true;
4013
4014 file->fnode.clear_extents();
4015 }
4016 }
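// ino 1 is reserved for the bluefs log (cf. _flush_special's
// ino <= 1 check); regular files always get larger inos.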
4017 ceph_assert(file->fnode.ino > 1);
4018
4019 file->fnode.mtime = ceph_clock_now();
4020 file->vselector_hint = vselector->get_hint_by_dir(dirname);
4021 if (create || truncate) {
4022 vselector->add_usage(file->vselector_hint, file->fnode); // update file count
4023 }
4024
4025 dout(20) << __func__ << " mapping " << dirname << "/" << filename
4026 << " vsel_hint " << file->vselector_hint
4027 << dendl;
4028
4029 log.t.op_file_update(file->fnode);
4030 if (create)
4031 log.t.op_dir_link(dirname, filename, file->fnode.ino);
4032
4033 std::lock_guard dl(dirty.lock);
4034 for (auto& p : pending_release_extents) {
4035 dirty.pending_release[p.bdev].insert(p.offset, p.length);
4036 }
4037 }
4038 *h = _create_writer(file);
4039
4040 if (boost::algorithm::ends_with(filename, ".log")) {
4041 (*h)->writer_type = BlueFS::WRITER_WAL;
4042 if (logger && !overwrite) {
4043 logger->inc(l_bluefs_files_written_wal);
4044 }
4045 } else if (boost::algorithm::ends_with(filename, ".sst")) {
4046 (*h)->writer_type = BlueFS::WRITER_SST;
4047 if (logger) {
4048 logger->inc(l_bluefs_files_written_sst);
4049 }
4050 }
4051
4052 dout(10) << __func__ << " h " << *h << " on " << file->fnode << dendl;
4053 return 0;
4054 }
4055
4056 BlueFS::FileWriter *BlueFS::_create_writer(FileRef f)
4057 {
4058 FileWriter *w = new FileWriter(f);
4059 for (unsigned i = 0; i < MAX_BDEV; ++i) {
4060 if (bdev[i]) {
4061 w->iocv[i] = new IOContext(cct, NULL);
4062 }
4063 }
4064 return w;
4065 }
4066
4067 void BlueFS::_drain_writer(FileWriter *h)
4068 {
4069 dout(10) << __func__ << " " << h << " type " << h->writer_type << dendl;
4070 //h->buffer.reassign_to_mempool(mempool::mempool_bluefs_file_writer);
4071 for (unsigned i=0; i<MAX_BDEV; ++i) {
4072 if (bdev[i]) {
4073 if (h->iocv[i]) {
4074 h->iocv[i]->aio_wait();
4075 delete h->iocv[i];
4076 }
4077 }
4078 }
4079 // sanity
4080 if (h->file->fnode.size >= (1ull << 30)) {
4081 dout(10) << __func__ << " file is unexpectedly large: " << h->file->fnode << dendl;
4082 }
4083 }
4084
4085 void BlueFS::_close_writer(FileWriter *h)
4086 {
4087 _drain_writer(h);
4088 delete h;
4089 }

4090 void BlueFS::close_writer(FileWriter *h)
4091 {
4092 {
4093 std::lock_guard l(h->lock);
4094 _drain_writer(h);
4095 }
4096 delete h;
4097 }
4098
4099 uint64_t BlueFS::debug_get_dirty_seq(FileWriter *h)
4100 {
4101 std::lock_guard l(h->lock);
4102 return h->file->dirty_seq;
4103 }
4104
4105 bool BlueFS::debug_get_is_dev_dirty(FileWriter *h, uint8_t dev)
4106 {
4107 std::lock_guard l(h->lock);
4108 return h->dirty_devs[dev];
4109 }
4110
4111 int BlueFS::open_for_read(
4112 std::string_view dirname,
4113 std::string_view filename,
4114 FileReader **h,
4115 bool random)/*_N*/
4116 {
4117 _maybe_check_vselector_LNF();
4118 std::lock_guard nl(nodes.lock);
4119 dout(10) << __func__ << " " << dirname << "/" << filename
4120 << (random ? " (random)":" (sequential)") << dendl;
4121 map<string,DirRef>::iterator p = nodes.dir_map.find(dirname);
4122 if (p == nodes.dir_map.end()) {
4123 dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
4124 return -ENOENT;
4125 }
4126 DirRef dir = p->second;
4127
4128 map<string,FileRef>::iterator q = dir->file_map.find(filename);
4129 if (q == dir->file_map.end()) {
4130 dout(20) << __func__ << " dir " << dirname << " (" << dir
4131 << ") file " << filename
4132 << " not found" << dendl;
4133 return -ENOENT;
4134 }
4135 File *file = q->second.get();
4136
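// random readers get a small fixed 4 KiB buffer; sequential readers
// prefetch up to bluefs_max_prefetch bytes.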
4137 *h = new FileReader(file, random ? 4096 : cct->_conf->bluefs_max_prefetch,
4138 random, false);
4139 dout(10) << __func__ << " h " << *h << " on " << file->fnode << dendl;
4140 return 0;
4141 }
4142
4143 int BlueFS::rename(
4144 std::string_view old_dirname, std::string_view old_filename,
4145 std::string_view new_dirname, std::string_view new_filename)/*_LND*/
4146 {
4147 std::lock_guard ll(log.lock);
4148 std::lock_guard nl(nodes.lock);
4149 dout(10) << __func__ << " " << old_dirname << "/" << old_filename
4150 << " -> " << new_dirname << "/" << new_filename << dendl;
4151 map<string,DirRef>::iterator p = nodes.dir_map.find(old_dirname);
4152 if (p == nodes.dir_map.end()) {
4153 dout(20) << __func__ << " dir " << old_dirname << " not found" << dendl;
4154 return -ENOENT;
4155 }
4156 DirRef old_dir = p->second;
4157 map<string,FileRef>::iterator q = old_dir->file_map.find(old_filename);
4158 if (q == old_dir->file_map.end()) {
4159 dout(20) << __func__ << " dir " << old_dirname << " (" << old_dir
4160 << ") file " << old_filename
4161 << " not found" << dendl;
4162 return -ENOENT;
4163 }
4164 FileRef file = q->second;
4165
4166 p = nodes.dir_map.find(new_dirname);
4167 if (p == nodes.dir_map.end()) {
4168 dout(20) << __func__ << " dir " << new_dirname << " not found" << dendl;
4169 return -ENOENT;
4170 }
4171 DirRef new_dir = p->second;
4172 q = new_dir->file_map.find(new_filename);
4173 if (q != new_dir->file_map.end()) {
4174 dout(20) << __func__ << " dir " << new_dirname << " (" << new_dir
4175 << ") file " << new_filename
4176 << " already exists, unlinking" << dendl;
4177 ceph_assert(q->second != file);
4178 log.t.op_dir_unlink(new_dirname, new_filename);
4179 _drop_link_D(q->second);
4180 }
4181
4182 dout(10) << __func__ << " " << new_dirname << "/" << new_filename
4183 << " " << file->fnode << dendl;
4184
4185 new_dir->file_map[string{new_filename}] = file;
4186 old_dir->file_map.erase(string{old_filename});
4187
4188 log.t.op_dir_link(new_dirname, new_filename, file->fnode.ino);
4189 log.t.op_dir_unlink(old_dirname, old_filename);
4190 return 0;
4191 }
4192
4193 int BlueFS::mkdir(std::string_view dirname)/*_LN*/
4194 {
4195 std::lock_guard ll(log.lock);
4196 std::lock_guard nl(nodes.lock);
4197 dout(10) << __func__ << " " << dirname << dendl;
4198 map<string,DirRef>::iterator p = nodes.dir_map.find(dirname);
4199 if (p != nodes.dir_map.end()) {
4200 dout(20) << __func__ << " dir " << dirname << " exists" << dendl;
4201 return -EEXIST;
4202 }
4203 nodes.dir_map[string{dirname}] = ceph::make_ref<Dir>();
4204 log.t.op_dir_create(dirname);
4205 return 0;
4206 }
4207
4208 int BlueFS::rmdir(std::string_view dirname)/*_LN*/
4209 {
4210 std::lock_guard ll(log.lock);
4211 std::lock_guard nl(nodes.lock);
4212 dout(10) << __func__ << " " << dirname << dendl;
4213 auto p = nodes.dir_map.find(dirname);
4214 if (p == nodes.dir_map.end()) {
4215 dout(20) << __func__ << " dir " << dirname << " does not exist" << dendl;
4216 return -ENOENT;
4217 }
4218 DirRef dir = p->second;
4219 if (!dir->file_map.empty()) {
4220 dout(20) << __func__ << " dir " << dirname << " not empty" << dendl;
4221 return -ENOTEMPTY;
4222 }
4223 nodes.dir_map.erase(string{dirname});
4224 log.t.op_dir_remove(dirname);
4225 return 0;
4226 }
4227
4228 bool BlueFS::dir_exists(std::string_view dirname)/*_N*/
4229 {
4230 std::lock_guard nl(nodes.lock);
4231 map<string,DirRef>::iterator p = nodes.dir_map.find(dirname);
4232 bool exists = p != nodes.dir_map.end();
4233 dout(10) << __func__ << " " << dirname << " = " << (int)exists << dendl;
4234 return exists;
4235 }
4236
4237 int BlueFS::stat(std::string_view dirname, std::string_view filename,
4238 uint64_t *size, utime_t *mtime)/*_N*/
4239 {
4240 std::lock_guard nl(nodes.lock);
4241 dout(10) << __func__ << " " << dirname << "/" << filename << dendl;
4242 map<string,DirRef>::iterator p = nodes.dir_map.find(dirname);
4243 if (p == nodes.dir_map.end()) {
4244 dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
4245 return -ENOENT;
4246 }
4247 DirRef dir = p->second;
4248 map<string,FileRef>::iterator q = dir->file_map.find(filename);
4249 if (q == dir->file_map.end()) {
4250 dout(20) << __func__ << " dir " << dirname << " (" << dir
4251 << ") file " << filename
4252 << " not found" << dendl;
4253 return -ENOENT;
4254 }
4255 File *file = q->second.get();
4256 dout(10) << __func__ << " " << dirname << "/" << filename
4257 << " " << file->fnode << dendl;
4258 if (size)
4259 *size = file->fnode.size;
4260 if (mtime)
4261 *mtime = file->fnode.mtime;
4262 return 0;
4263 }
4264
4265 int BlueFS::lock_file(std::string_view dirname, std::string_view filename,
4266 FileLock **plock)/*_LN*/
4267 {
4268 std::lock_guard ll(log.lock);
4269 std::lock_guard nl(nodes.lock);
4270 dout(10) << __func__ << " " << dirname << "/" << filename << dendl;
4271 map<string,DirRef>::iterator p = nodes.dir_map.find(dirname);
4272 if (p == nodes.dir_map.end()) {
4273 dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
4274 return -ENOENT;
4275 }
4276 DirRef dir = p->second;
4277 auto q = dir->file_map.find(filename);
4278 FileRef file;
4279 if (q == dir->file_map.end()) {
4280 dout(20) << __func__ << " dir " << dirname << " (" << dir
4281 << ") file " << filename
4282 << " not found, creating" << dendl;
4283 file = ceph::make_ref<File>();
4284 file->fnode.ino = ++ino_last;
4285 file->fnode.mtime = ceph_clock_now();
4286 nodes.file_map[ino_last] = file;
4287 dir->file_map[string{filename}] = file;
4288 logger->set(l_bluefs_num_files, nodes.file_map.size());
4289 ++file->refs;
4290 log.t.op_file_update(file->fnode);
4291 log.t.op_dir_link(dirname, filename, file->fnode.ino);
4292 } else {
4293 file = q->second;
4294 if (file->locked) {
4295 dout(10) << __func__ << " already locked" << dendl;
4296 return -ENOLCK;
4297 }
4298 }
4299 file->locked = true;
4300 *plock = new FileLock(file);
4301 dout(10) << __func__ << " locked " << file->fnode
4302 << " with " << *plock << dendl;
4303 return 0;
4304 }
4305
4306 int BlueFS::unlock_file(FileLock *fl)/*_N*/
4307 {
4308 std::lock_guard nl(nodes.lock);
4309 dout(10) << __func__ << " " << fl << " on " << fl->file->fnode << dendl;
4310 ceph_assert(fl->file->locked);
4311 fl->file->locked = false;
4312 delete fl;
4313 return 0;
4314 }
4315
4316 int BlueFS::readdir(std::string_view dirname, vector<string> *ls)/*_N*/
4317 {
4318 // dirname may contain a trailing /
4319 if (!dirname.empty() && dirname.back() == '/') {
4320 dirname.remove_suffix(1);
4321 }
4322 std::lock_guard nl(nodes.lock);
4323 dout(10) << __func__ << " " << dirname << dendl;
4324 if (dirname.empty()) {
4325 // list dirs
4326 ls->reserve(nodes.dir_map.size() + 2);
4327 for (auto& q : nodes.dir_map) {
4328 ls->push_back(q.first);
4329 }
4330 } else {
4331 // list files in dir
4332 map<string,DirRef>::iterator p = nodes.dir_map.find(dirname);
4333 if (p == nodes.dir_map.end()) {
4334 dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
4335 return -ENOENT;
4336 }
4337 DirRef dir = p->second;
4338 ls->reserve(dir->file_map.size() + 2);
4339 for (auto& q : dir->file_map) {
4340 ls->push_back(q.first);
4341 }
4342 }
4343 ls->push_back(".");
4344 ls->push_back("..");
4345 return 0;
4346 }
4347
4348 int BlueFS::unlink(std::string_view dirname, std::string_view filename)/*_LND*/
4349 {
4350 std::lock_guard ll(log.lock);
4351 std::lock_guard nl(nodes.lock);
4352 dout(10) << __func__ << " " << dirname << "/" << filename << dendl;
4353 map<string,DirRef>::iterator p = nodes.dir_map.find(dirname);
4354 if (p == nodes.dir_map.end()) {
4355 dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
4356 return -ENOENT;
4357 }
4358 DirRef dir = p->second;
4359 map<string,FileRef>::iterator q = dir->file_map.find(filename);
4360 if (q == dir->file_map.end()) {
4361 dout(20) << __func__ << " file " << dirname << "/" << filename
4362 << " not found" << dendl;
4363 return -ENOENT;
4364 }
4365 FileRef file = q->second;
4366 if (file->locked) {
4367 dout(20) << __func__ << " file " << dirname << "/" << filename
4368 << " is locked" << dendl;
4369 return -EBUSY;
4370 }
4371 dir->file_map.erase(string{filename});
4372 log.t.op_dir_unlink(dirname, filename);
4373 _drop_link_D(file);
4374 return 0;
4375 }
4376
4377 bool BlueFS::wal_is_rotational()
4378 {
4379 if (bdev[BDEV_WAL]) {
4380 return bdev[BDEV_WAL]->is_rotational();
4381 } else if (bdev[BDEV_DB]) {
4382 return bdev[BDEV_DB]->is_rotational();
4383 }
4384 return bdev[BDEV_SLOW]->is_rotational();
4385 }
4386
4387 bool BlueFS::db_is_rotational()
4388 {
4389 if (bdev[BDEV_DB]) {
4390 return bdev[BDEV_DB]->is_rotational();
4391 }
4392 return bdev[BDEV_SLOW]->is_rotational();
4393 }
4394
4395 /*
4396 Algorithm.
4397 do_replay_recovery_read is used when the bluefs log ends abruptly, but it seems that more data should be there.
4398 The idea is to search the disk for the definition of the extent that should accompany the bluefs log next,
4399 and to test whether using it produces a healthy bluefs transaction.
4400 We encode the already-known bluefs log extents and search the disk for these bytes.
4401 When we find them, we decode the following bytes as an extent.
4402 We read that whole extent and then check whether merging it with the existing log part gives a proper bluefs transaction.
4403 */
4404 int BlueFS::_do_replay_recovery_read(FileReader *log_reader,
4405 size_t replay_pos,
4406 size_t read_offset,
4407 size_t read_len,
4408 bufferlist* bl) {
4409 dout(1) << __func__ << " replay_pos=0x" << std::hex << replay_pos <<
4410 " needs 0x" << read_offset << "~" << read_len << std::dec << dendl;
4411
4412 bluefs_fnode_t& log_fnode = log_reader->file->fnode;
4413 bufferlist bin_extents;
4414 ::encode(log_fnode.extents, bin_extents);
4415 dout(2) << __func__ << " log file encoded extents length = " << bin_extents.length() << dendl;
4416
4417 // cannot process if too small to effectively search
4418 ceph_assert(bin_extents.length() >= 32);
4419 bufferlist last_32;
4420 last_32.substr_of(bin_extents, bin_extents.length() - 32, 32);
4421
4422 //read fixed part from replay_pos to end of bluefs_log extents
4423 bufferlist fixed;
4424 uint64_t e_off = 0;
4425 auto e = log_fnode.seek(replay_pos, &e_off);
4426 ceph_assert(e != log_fnode.extents.end());
4427 int r = _bdev_read(e->bdev, e->offset + e_off, e->length - e_off, &fixed, ioc[e->bdev],
4428 cct->_conf->bluefs_buffered_io);
4429 ceph_assert(r == 0);
4430 //capture dev of last good extent
4431 uint8_t last_e_dev = e->bdev;
4432 uint64_t last_e_off = e->offset;
4433 ++e;
4434 while (e != log_fnode.extents.end()) {
4435 r = _bdev_read(e->bdev, e->offset, e->length, &fixed, ioc[e->bdev],
4436 cct->_conf->bluefs_buffered_io);
4437 ceph_assert(r == 0);
4438 last_e_dev = e->bdev;
4439 ++e;
4440 }
4441 ceph_assert(replay_pos + fixed.length() == read_offset);
4442
4443 dout(2) << __func__ << " valid data in log = " << fixed.length() << dendl;
4444
4445 struct compare {
4446 bool operator()(const bluefs_extent_t& a, const bluefs_extent_t& b) const {
4447 if (a.bdev != b.bdev) return a.bdev < b.bdev;
4448 if (a.offset != b.offset) return a.offset < b.offset;
4449 return a.length < b.length;
4450 }
4451 };
4452 std::set<bluefs_extent_t, compare> extents_rejected;
4453 for (int dcnt = 0; dcnt < 3; dcnt++) {
4454 uint8_t dev = (last_e_dev + dcnt) % MAX_BDEV;
4455 if (bdev[dev] == nullptr) continue;
4456 dout(2) << __func__ << " processing " << get_device_name(dev) << dendl;
4457 interval_set<uint64_t> disk_regions;
4458 disk_regions.insert(0, bdev[dev]->get_size());
4459 for (auto& f : nodes.file_map) {
4460 auto& e = f.second->fnode.extents;
4461 for (auto& p : e) {
4462 if (p.bdev == dev) {
4463 disk_regions.erase(p.offset, p.length);
4464 }
4465 }
4466 }
4467 size_t disk_regions_count = disk_regions.num_intervals();
4468 dout(5) << __func__ << " " << disk_regions_count << " regions to scan on " << get_device_name(dev) << dendl;
4469
4470 auto reg = disk_regions.lower_bound(last_e_off);
4471 //for all except first, start from beginning
4472 last_e_off = 0;
4473 if (reg == disk_regions.end()) {
4474 reg = disk_regions.begin();
4475 }
4476 const uint64_t chunk_size = 4 * 1024 * 1024;
4477 const uint64_t page_size = 4096;
4478 const uint64_t max_extent_size = 16;
4479 uint64_t overlay_size = last_32.length() + max_extent_size;
4480 for (size_t i = 0; i < disk_regions_count; reg++, i++) {
4481 if (reg == disk_regions.end()) {
4482 reg = disk_regions.begin();
4483 }
4484 uint64_t pos = reg.get_start();
4485 uint64_t len = reg.get_len();
4486
4487 std::unique_ptr<char[]> raw_data_p{new char[page_size + chunk_size]};
4488 char* raw_data = raw_data_p.get();
4489 memset(raw_data, 0, page_size);
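// raw_data layout: [page_size bytes of overlay | chunk_size bytes of
// fresh data]. the overlay keeps the tail of the previous chunk so a
// signature straddling a chunk boundary can still be matched (see the
// memcpy at the bottom of this loop).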
4490
4491 while (len > last_32.length()) {
4492 uint64_t chunk_len = len > chunk_size ? chunk_size : len;
4493 dout(5) << __func__ << " read "
4494 << get_device_name(dev) << ":0x" << std::hex << pos << "+" << chunk_len
4495 << std::dec << dendl;
4496 r = _bdev_read_random(dev, pos, chunk_len,
4497 raw_data + page_size, cct->_conf->bluefs_buffered_io);
4498 ceph_assert(r == 0);
4499
4500 //search for last_32
4501 char* chunk_b = raw_data + page_size;
4502 char* chunk_e = chunk_b + chunk_len;
4503
4504 char* search_b = chunk_b - overlay_size;
4505 char* search_e = chunk_e;
4506
4507 for (char* sp = search_b; ; sp += last_32.length()) {
4508 sp = (char*)memmem(sp, search_e - sp, last_32.c_str(), last_32.length());
4509 if (sp == nullptr) {
4510 break;
4511 }
4512
4513 char* n = sp + last_32.length();
4514 dout(5) << __func__ << " checking location 0x" << std::hex << pos + (n - chunk_b) << std::dec << dendl;
4515 bufferlist test;
4516 test.append(n, std::min<size_t>(max_extent_size, chunk_e - n));
4517 bluefs_extent_t ne;
4518 try {
4519 bufferlist::const_iterator p = test.begin();
4520 ::decode(ne, p);
4521 } catch (buffer::error& e) {
4522 continue;
4523 }
4524 if (extents_rejected.count(ne) != 0) {
4525 dout(5) << __func__ << " extent " << ne << " already rejected" << dendl;
4526 continue;
4527 }
4528 //insert as rejected up front; if we succeed, it makes no difference.
4529 extents_rejected.insert(ne);
4530
4531 if (ne.bdev >= MAX_BDEV ||
4532 bdev[ne.bdev] == nullptr ||
4533 ne.length > 16 * 1024 * 1024 ||
4534 (ne.length & 4095) != 0 ||
4535 ne.offset + ne.length > bdev[ne.bdev]->get_size() ||
4536 (ne.offset & 4095) != 0) {
4537 dout(5) << __func__ << " refusing extent " << ne << dendl;
4538 continue;
4539 }
4540 dout(5) << __func__ << " checking extent " << ne << dendl;
4541
4542 //read candidate extent - whole
4543 bufferlist candidate;
4544 candidate.append(fixed);
4545 r = _bdev_read(ne.bdev, ne.offset, ne.length, &candidate, ioc[ne.bdev],
4546 cct->_conf->bluefs_buffered_io);
4547 ceph_assert(r == 0);
4548
4549 //check if transaction & crc is ok
4550 bluefs_transaction_t t;
4551 try {
4552 bufferlist::const_iterator p = candidate.begin();
4553 ::decode(t, p);
4554 }
4555 catch (buffer::error& e) {
4556 dout(5) << __func__ << " failed match" << dendl;
4557 continue;
4558 }
4559
4560 //success, it seems a probable candidate
4561 uint64_t l = std::min<uint64_t>(ne.length, read_len);
4562 //trim to required size
4563 bufferlist requested_read;
4564 requested_read.substr_of(candidate, fixed.length(), l);
4565 bl->append(requested_read);
4566 dout(5) << __func__ << " successful extension of log " << l << "/" << read_len << dendl;
4567 log_fnode.append_extent(ne);
4568 log_fnode.recalc_allocated();
4569 log_reader->buf.pos += l;
4570 return l;
4571 }
4572 //save overlay for next search
4573 memcpy(search_b, chunk_e - overlay_size, overlay_size);
4574 pos += chunk_len;
4575 len -= chunk_len;
4576 }
4577 }
4578 }
4579 return 0;
4580 }
4581
4582 void BlueFS::_check_vselector_LNF() {
4583 BlueFSVolumeSelector* vs = vselector->clone_empty();
4584 if (!vs) {
4585 return;
4586 }
4587 std::lock_guard ll(log.lock);
4588 std::lock_guard nl(nodes.lock);
4589 // Checking vselector is under log, nodes and file(s) locks,
4590 // so any modification of vselector must be under at least one of those locks.
4591 for (auto& f : nodes.file_map) {
4592 f.second->lock.lock();
4593 vs->add_usage(f.second->vselector_hint, f.second->fnode);
4594 }
4595 bool res = vselector->compare(vs);
4596 if (!res) {
4597 dout(0) << "Current:";
4598 vselector->dump(*_dout);
4599 *_dout << dendl;
4600 dout(0) << "Expected:";
4601 vs->dump(*_dout);
4602 *_dout << dendl;
4603 }
4604 ceph_assert(res);
4605 for (auto& f : nodes.file_map) {
4606 f.second->lock.unlock();
4607 }
4608 delete vs;
4609 }
4610
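// Count free space on dev that is usable at alloc_size granularity:
// only whole, alloc_size-aligned units inside each free extent count.
// For illustration, with alloc_size 0x10000 a free extent
// 0x12345~0x30000 contributes 0x20000: the span is trimmed up to the
// first aligned boundary (0x20000) and then rounded down.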
4611 size_t BlueFS::probe_alloc_avail(int dev, uint64_t alloc_size)
4612 {
4613 size_t total = 0;
4614 auto iterated_allocation = [&](size_t off, size_t len) {
4615 //only count space that is alloc_size aligned
4616 size_t dist_to_alignment;
4617 size_t offset_in_block = off & (alloc_size - 1);
4618 if (offset_in_block == 0)
4619 dist_to_alignment = 0;
4620 else
4621 dist_to_alignment = alloc_size - offset_in_block;
4622 if (dist_to_alignment >= len)
4623 return;
4624 len -= dist_to_alignment;
4625 total += p2align(len, alloc_size);
4626 };
4627 if (alloc[dev]) {
4628 alloc[dev]->foreach(iterated_allocation);
4629 }
4630 return total;
4631 }
4632 // ===============================================
4633 // OriginalVolumeSelector
4634
4635 void* OriginalVolumeSelector::get_hint_for_log() const {
4636 return reinterpret_cast<void*>(BlueFS::BDEV_WAL);
4637 }
4638 void* OriginalVolumeSelector::get_hint_by_dir(std::string_view dirname) const {
4639 uint8_t res = BlueFS::BDEV_DB;
4640 if (dirname.length() > 5) {
4641 // the "db.slow" and "db.wal" directory names are hard-coded to
4642 // match up with bluestore. the slow device is always the second
4643 // one (when a dedicated block.db device is present and used at
4644 // bdev 0). the wal device is always last.
4645 if (boost::algorithm::ends_with(dirname, ".slow") && slow_total) {
4646 res = BlueFS::BDEV_SLOW;
4647 } else if (boost::algorithm::ends_with(dirname, ".wal") && wal_total) {
4648 res = BlueFS::BDEV_WAL;
4649 }
4650 }
4651 return reinterpret_cast<void*>(res);
4652 }
4653
4654 uint8_t OriginalVolumeSelector::select_prefer_bdev(void* hint)
4655 {
4656 return (uint8_t)(reinterpret_cast<uint64_t>(hint));
4657 }
4658
4659 void OriginalVolumeSelector::get_paths(const std::string& base, paths& res) const
4660 {
4661 res.emplace_back(base, db_total);
4662 res.emplace_back(base + ".slow",
4663 slow_total ? slow_total : db_total); // use a fake non-zero value if
4664 // needed to avoid RocksDB complaints
4665 }
4666
4667 #undef dout_prefix
4668 #define dout_prefix *_dout << "OriginalVolumeSelector: "
4669
4670 void OriginalVolumeSelector::dump(ostream& sout) {
4671 sout << "wal_total:" << wal_total
4672 << ", db_total:" << db_total
4673 << ", slow_total:" << slow_total
4674 << std::endl;
4675 }
4676
4677 // ===============================================
4678 // FitToFastVolumeSelector
4679
4680 void FitToFastVolumeSelector::get_paths(const std::string& base, paths& res) const {
4681 res.emplace_back(base, 1); // size of the last db_path has no effect
4682 }