1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 #include "boost/algorithm/string.hpp"
5 #include "bluestore_common.h"
8 #include "common/debug.h"
9 #include "common/errno.h"
10 #include "common/perf_counters.h"
11 #include "Allocator.h"
12 #include "include/ceph_assert.h"
13 #include "common/admin_socket.h"
15 #define dout_context cct
16 #define dout_subsys ceph_subsys_bluefs
18 #define dout_prefix *_dout << "bluefs "
19 using TOPNSPC::common::cmd_getval
;
32 using ceph::bufferlist
;
35 using ceph::Formatter
;
38 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::File
, bluefs_file
, bluefs
);
39 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::Dir
, bluefs_dir
, bluefs
);
40 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileWriter
, bluefs_file_writer
, bluefs_file_writer
);
41 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileReaderBuffer
,
42 bluefs_file_reader_buffer
, bluefs_file_reader
);
43 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileReader
, bluefs_file_reader
, bluefs_file_reader
);
44 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileLock
, bluefs_file_lock
, bluefs
);
46 static void wal_discard_cb(void *priv
, void* priv2
) {
47 BlueFS
*bluefs
= static_cast<BlueFS
*>(priv
);
48 interval_set
<uint64_t> *tmp
= static_cast<interval_set
<uint64_t>*>(priv2
);
49 bluefs
->handle_discard(BlueFS::BDEV_WAL
, *tmp
);
52 static void db_discard_cb(void *priv
, void* priv2
) {
53 BlueFS
*bluefs
= static_cast<BlueFS
*>(priv
);
54 interval_set
<uint64_t> *tmp
= static_cast<interval_set
<uint64_t>*>(priv2
);
55 bluefs
->handle_discard(BlueFS::BDEV_DB
, *tmp
);
58 static void slow_discard_cb(void *priv
, void* priv2
) {
59 BlueFS
*bluefs
= static_cast<BlueFS
*>(priv
);
60 interval_set
<uint64_t> *tmp
= static_cast<interval_set
<uint64_t>*>(priv2
);
61 bluefs
->handle_discard(BlueFS::BDEV_SLOW
, *tmp
);
64 class BlueFS::SocketHook
: public AdminSocketHook
{
67 static BlueFS::SocketHook
* create(BlueFS
* bluefs
)
69 BlueFS::SocketHook
* hook
= nullptr;
70 AdminSocket
* admin_socket
= bluefs
->cct
->get_admin_socket();
72 hook
= new BlueFS::SocketHook(bluefs
);
73 int r
= admin_socket
->register_command("bluestore bluefs device info "
74 "name=alloc_size,type=CephInt,req=false",
76 "Shows space report for bluefs devices. "
77 "This also includes an estimation for space "
78 "available to bluefs at main device. "
79 "alloc_size, if set, specifies the custom bluefs "
80 "allocation unit size for the estimation above.");
82 ldout(bluefs
->cct
, 1) << __func__
<< " cannot register SocketHook" << dendl
;
86 r
= admin_socket
->register_command("bluefs stats",
88 "Dump internal statistics for bluefs."
91 r
= admin_socket
->register_command("bluefs files list", hook
,
92 "print files in bluefs");
94 r
= admin_socket
->register_command("bluefs debug_inject_read_zeros", hook
,
95 "Injects 8K zeros into next BlueFS read. Debug only.");
103 AdminSocket
* admin_socket
= bluefs
->cct
->get_admin_socket();
104 admin_socket
->unregister_commands(this);
107 SocketHook(BlueFS
* bluefs
) :
109 int call(std::string_view command
, const cmdmap_t
& cmdmap
,
112 bufferlist
& out
) override
{
113 if (command
== "bluestore bluefs device info") {
114 int64_t alloc_size
= 0;
115 cmd_getval(cmdmap
, "alloc_size", alloc_size
);
116 if ((alloc_size
& (alloc_size
- 1)) != 0) {
117 errss
<< "Invalid allocation size:'" << alloc_size
<< std::endl
;
121 alloc_size
= bluefs
->cct
->_conf
->bluefs_shared_alloc_size
;
122 f
->open_object_section("bluefs_device_info");
123 for (unsigned dev
= BDEV_WAL
; dev
<= BDEV_SLOW
; dev
++) {
124 if (bluefs
->bdev
[dev
]) {
125 f
->open_object_section("dev");
126 f
->dump_string("device", bluefs
->get_device_name(dev
));
127 ceph_assert(bluefs
->alloc
[dev
]);
128 auto total
= bluefs
->get_total(dev
);
129 auto free
= bluefs
->get_free(dev
);
130 auto used
= bluefs
->get_used(dev
);
132 f
->dump_int("total", total
);
133 f
->dump_int("free", free
);
134 f
->dump_int("bluefs_used", used
);
135 if (bluefs
->is_shared_alloc(dev
)) {
136 size_t avail
= bluefs
->probe_alloc_avail(dev
, alloc_size
);
137 f
->dump_int("bluefs max available", avail
);
144 } else if (command
== "bluefs stats") {
145 std::stringstream ss
;
146 bluefs
->dump_block_extents(ss
);
147 bluefs
->dump_volume_selector(ss
);
149 } else if (command
== "bluefs files list") {
150 const char* devnames
[3] = {"wal","db","slow"};
151 std::lock_guard
l(bluefs
->nodes
.lock
);
152 f
->open_array_section("files");
153 for (auto &d
: bluefs
->nodes
.dir_map
) {
154 std::string dir
= d
.first
;
155 for (auto &r
: d
.second
->file_map
) {
156 f
->open_object_section("file");
157 f
->dump_string("name", (dir
+ "/" + r
.first
).c_str());
158 std::vector
<size_t> sizes
;
159 sizes
.resize(bluefs
->bdev
.size());
160 for(auto& i
: r
.second
->fnode
.extents
) {
161 sizes
[i
.bdev
] += i
.length
;
163 for (size_t i
= 0; i
< sizes
.size(); i
++) {
165 if (i
< sizeof(devnames
) / sizeof(*devnames
))
166 f
->dump_int(devnames
[i
], sizes
[i
]);
168 f
->dump_int(("dev-"+to_string(i
)).c_str(), sizes
[i
]);
176 } else if (command
== "bluefs debug_inject_read_zeros") {
177 bluefs
->inject_read_zeros
++;
179 errss
<< "Invalid command" << std::endl
;
186 BlueFS::BlueFS(CephContext
* cct
)
190 block_reserved(MAX_BDEV
),
192 alloc_size(MAX_BDEV
, 0)
194 dirty
.pending_release
.resize(MAX_BDEV
);
195 discard_cb
[BDEV_WAL
] = wal_discard_cb
;
196 discard_cb
[BDEV_DB
] = db_discard_cb
;
197 discard_cb
[BDEV_SLOW
] = slow_discard_cb
;
198 asok_hook
= SocketHook::create(this);
208 for (auto p
: bdev
) {
219 void BlueFS::_init_logger()
221 PerfCountersBuilder
b(cct
, "bluefs",
222 l_bluefs_first
, l_bluefs_last
);
223 b
.add_u64(l_bluefs_db_total_bytes
, "db_total_bytes",
224 "Total bytes (main db device)",
225 "b", PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
226 b
.add_u64(l_bluefs_db_used_bytes
, "db_used_bytes",
227 "Used bytes (main db device)",
228 "u", PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
229 b
.add_u64(l_bluefs_wal_total_bytes
, "wal_total_bytes",
230 "Total bytes (wal device)",
231 "walb", PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
232 b
.add_u64(l_bluefs_wal_used_bytes
, "wal_used_bytes",
233 "Used bytes (wal device)",
234 "walu", PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
235 b
.add_u64(l_bluefs_slow_total_bytes
, "slow_total_bytes",
236 "Total bytes (slow device)",
237 "slob", PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
238 b
.add_u64(l_bluefs_slow_used_bytes
, "slow_used_bytes",
239 "Used bytes (slow device)",
240 "slou", PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
241 b
.add_u64(l_bluefs_num_files
, "num_files", "File count",
242 "f", PerfCountersBuilder::PRIO_USEFUL
);
243 b
.add_u64(l_bluefs_log_bytes
, "log_bytes", "Size of the metadata log",
244 "jlen", PerfCountersBuilder::PRIO_INTERESTING
, unit_t(UNIT_BYTES
));
245 b
.add_u64_counter(l_bluefs_log_compactions
, "log_compactions",
246 "Compactions of the metadata log");
247 b
.add_u64_counter(l_bluefs_logged_bytes
, "logged_bytes",
248 "Bytes written to the metadata log",
250 PerfCountersBuilder::PRIO_CRITICAL
, unit_t(UNIT_BYTES
));
251 b
.add_u64_counter(l_bluefs_files_written_wal
, "files_written_wal",
252 "Files written to WAL");
253 b
.add_u64_counter(l_bluefs_files_written_sst
, "files_written_sst",
254 "Files written to SSTs");
255 b
.add_u64_counter(l_bluefs_bytes_written_wal
, "bytes_written_wal",
256 "Bytes written to WAL",
258 PerfCountersBuilder::PRIO_CRITICAL
);
259 b
.add_u64_counter(l_bluefs_bytes_written_sst
, "bytes_written_sst",
260 "Bytes written to SSTs",
262 PerfCountersBuilder::PRIO_CRITICAL
, unit_t(UNIT_BYTES
));
263 b
.add_u64_counter(l_bluefs_bytes_written_slow
, "bytes_written_slow",
264 "Bytes written to WAL/SSTs at slow device",
266 PerfCountersBuilder::PRIO_CRITICAL
, unit_t(UNIT_BYTES
));
267 b
.add_u64_counter(l_bluefs_max_bytes_wal
, "max_bytes_wal",
268 "Maximum bytes allocated from WAL",
270 PerfCountersBuilder::PRIO_INTERESTING
,
272 b
.add_u64_counter(l_bluefs_max_bytes_db
, "max_bytes_db",
273 "Maximum bytes allocated from DB",
275 PerfCountersBuilder::PRIO_INTERESTING
,
277 b
.add_u64_counter(l_bluefs_max_bytes_slow
, "max_bytes_slow",
278 "Maximum bytes allocated from SLOW",
280 PerfCountersBuilder::PRIO_INTERESTING
,
282 b
.add_u64_counter(l_bluefs_main_alloc_unit
, "alloc_unit_main",
283 "Allocation unit size (in bytes) for primary/shared device",
285 PerfCountersBuilder::PRIO_CRITICAL
,
287 b
.add_u64_counter(l_bluefs_db_alloc_unit
, "alloc_unit_db",
288 "Allocation unit size (in bytes) for standalone DB device",
290 PerfCountersBuilder::PRIO_CRITICAL
,
292 b
.add_u64_counter(l_bluefs_wal_alloc_unit
, "alloc_unit_wal",
293 "Allocation unit size (in bytes) for standalone WAL device",
295 PerfCountersBuilder::PRIO_CRITICAL
,
297 b
.add_u64_counter(l_bluefs_read_random_count
, "read_random_count",
298 "random read requests processed",
300 PerfCountersBuilder::PRIO_USEFUL
);
301 b
.add_u64_counter(l_bluefs_read_random_bytes
, "read_random_bytes",
302 "Bytes requested in random read mode",
304 PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
305 b
.add_u64_counter(l_bluefs_read_random_disk_count
, "read_random_disk_count",
306 "random reads requests going to disk",
308 PerfCountersBuilder::PRIO_USEFUL
);
309 b
.add_u64_counter(l_bluefs_read_random_disk_bytes
, "read_random_disk_bytes",
310 "Bytes read from disk in random read mode",
312 PerfCountersBuilder::PRIO_INTERESTING
,
314 b
.add_u64_counter(l_bluefs_read_random_disk_bytes_wal
, "read_random_disk_bytes_wal",
315 "random reads requests going to WAL disk",
317 PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
318 b
.add_u64_counter(l_bluefs_read_random_disk_bytes_db
, "read_random_disk_bytes_db",
319 "random reads requests going to DB disk",
321 PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
322 b
.add_u64_counter(l_bluefs_read_random_disk_bytes_slow
, "read_random_disk_bytes_slow",
323 "random reads requests going to main disk",
325 PerfCountersBuilder::PRIO_INTERESTING
,
327 b
.add_u64_counter(l_bluefs_read_random_buffer_count
, "read_random_buffer_count",
328 "random read requests processed using prefetch buffer",
330 PerfCountersBuilder::PRIO_USEFUL
);
331 b
.add_u64_counter(l_bluefs_read_random_buffer_bytes
, "read_random_buffer_bytes",
332 "Bytes read from prefetch buffer in random read mode",
334 PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
335 b
.add_u64_counter(l_bluefs_read_count
, "read_count",
336 "buffered read requests processed",
338 PerfCountersBuilder::PRIO_USEFUL
);
339 b
.add_u64_counter(l_bluefs_read_bytes
, "read_bytes",
340 "Bytes requested in buffered read mode",
342 PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
343 b
.add_u64_counter(l_bluefs_read_disk_count
, "read_disk_count",
344 "buffered reads requests going to disk",
346 PerfCountersBuilder::PRIO_USEFUL
);
347 b
.add_u64_counter(l_bluefs_read_disk_bytes
, "read_disk_bytes",
348 "Bytes read in buffered mode from disk",
350 PerfCountersBuilder::PRIO_INTERESTING
, unit_t(UNIT_BYTES
));
351 b
.add_u64_counter(l_bluefs_read_disk_bytes_wal
, "read_disk_bytes_wal",
352 "reads requests going to WAL disk",
354 PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
355 b
.add_u64_counter(l_bluefs_read_disk_bytes_db
, "read_disk_bytes_db",
356 "reads requests going to DB disk",
358 PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
359 b
.add_u64_counter(l_bluefs_read_disk_bytes_slow
, "read_disk_bytes_slow",
360 "reads requests going to main disk",
362 PerfCountersBuilder::PRIO_INTERESTING
, unit_t(UNIT_BYTES
));
363 b
.add_u64_counter(l_bluefs_read_prefetch_count
, "read_prefetch_count",
364 "prefetch read requests processed",
366 PerfCountersBuilder::PRIO_USEFUL
);
367 b
.add_u64_counter(l_bluefs_read_prefetch_bytes
, "read_prefetch_bytes",
368 "Bytes requested in prefetch read mode",
370 PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
371 b
.add_u64(l_bluefs_read_zeros_candidate
, "read_zeros_candidate",
372 "How many times bluefs read found page with all 0s");
373 b
.add_u64(l_bluefs_read_zeros_errors
, "read_zeros_errors",
374 "How many times bluefs read found transient page with all 0s");
376 logger
= b
.create_perf_counters();
377 cct
->get_perfcounters_collection()->add(logger
);
380 void BlueFS::_shutdown_logger()
382 cct
->get_perfcounters_collection()->remove(logger
);
386 void BlueFS::_update_logger_stats()
388 if (alloc
[BDEV_WAL
]) {
389 logger
->set(l_bluefs_wal_total_bytes
, _get_total(BDEV_WAL
));
390 logger
->set(l_bluefs_wal_used_bytes
, _get_used(BDEV_WAL
));
392 if (alloc
[BDEV_DB
]) {
393 logger
->set(l_bluefs_db_total_bytes
, _get_total(BDEV_DB
));
394 logger
->set(l_bluefs_db_used_bytes
, _get_used(BDEV_DB
));
396 if (alloc
[BDEV_SLOW
]) {
397 logger
->set(l_bluefs_slow_total_bytes
, _get_total(BDEV_SLOW
));
398 logger
->set(l_bluefs_slow_used_bytes
, _get_used(BDEV_SLOW
));
402 int BlueFS::add_block_device(unsigned id
, const string
& path
, bool trim
,
404 bluefs_shared_alloc_context_t
* _shared_alloc
)
406 dout(10) << __func__
<< " bdev " << id
<< " path " << path
<< " "
407 << reserved
<< dendl
;
408 ceph_assert(id
< bdev
.size());
409 ceph_assert(bdev
[id
] == NULL
);
410 BlockDevice
*b
= BlockDevice::create(cct
, path
, NULL
, NULL
,
411 discard_cb
[id
], static_cast<void*>(this));
412 block_reserved
[id
] = reserved
;
414 b
->set_no_exclusive_lock();
416 int r
= b
->open(path
);
422 b
->discard(0, b
->get_size());
425 dout(1) << __func__
<< " bdev " << id
<< " path " << path
426 << " size " << byte_u_t(b
->get_size()) << dendl
;
428 ioc
[id
] = new IOContext(cct
, NULL
);
430 ceph_assert(!shared_alloc
);
431 shared_alloc
= _shared_alloc
;
432 alloc
[id
] = shared_alloc
->a
;
433 shared_alloc_id
= id
;
438 bool BlueFS::bdev_support_label(unsigned id
)
440 ceph_assert(id
< bdev
.size());
441 ceph_assert(bdev
[id
]);
442 return bdev
[id
]->supported_bdev_label();
445 uint64_t BlueFS::get_block_device_size(unsigned id
) const
447 if (id
< bdev
.size() && bdev
[id
])
448 return bdev
[id
]->get_size();
452 void BlueFS::handle_discard(unsigned id
, interval_set
<uint64_t>& to_release
)
454 dout(10) << __func__
<< " bdev " << id
<< dendl
;
455 ceph_assert(alloc
[id
]);
456 alloc
[id
]->release(to_release
);
457 if (is_shared_alloc(id
)) {
458 shared_alloc
->bluefs_used
-= to_release
.size();
462 uint64_t BlueFS::get_used()
465 for (unsigned id
= 0; id
< MAX_BDEV
; ++id
) {
466 used
+= _get_used(id
);
471 uint64_t BlueFS::_get_used(unsigned id
) const
477 if (is_shared_alloc(id
)) {
478 used
= shared_alloc
->bluefs_used
;
480 used
= _get_total(id
) - alloc
[id
]->get_free();
485 uint64_t BlueFS::get_used(unsigned id
)
487 ceph_assert(id
< alloc
.size());
488 ceph_assert(alloc
[id
]);
489 return _get_used(id
);
492 uint64_t BlueFS::_get_total(unsigned id
) const
494 ceph_assert(id
< bdev
.size());
495 ceph_assert(id
< block_reserved
.size());
496 return get_block_device_size(id
) - block_reserved
[id
];
499 uint64_t BlueFS::get_total(unsigned id
)
501 return _get_total(id
);
504 uint64_t BlueFS::get_free(unsigned id
)
506 ceph_assert(id
< alloc
.size());
507 return alloc
[id
]->get_free();
510 void BlueFS::dump_perf_counters(Formatter
*f
)
512 f
->open_object_section("bluefs_perf_counters");
513 logger
->dump_formatted(f
,0);
517 void BlueFS::dump_block_extents(ostream
& out
)
519 for (unsigned i
= 0; i
< MAX_BDEV
; ++i
) {
523 auto total
= get_total(i
);
524 auto free
= get_free(i
);
526 out
<< i
<< " : device size 0x" << std::hex
<< total
527 << " : using 0x" << total
- free
528 << std::dec
<< "(" << byte_u_t(total
- free
) << ")";
533 int BlueFS::get_block_extents(unsigned id
, interval_set
<uint64_t> *extents
)
535 std::lock_guard
nl(nodes
.lock
);
536 dout(10) << __func__
<< " bdev " << id
<< dendl
;
537 ceph_assert(id
< alloc
.size());
538 for (auto& p
: nodes
.file_map
) {
539 for (auto& q
: p
.second
->fnode
.extents
) {
541 extents
->insert(q
.offset
, q
.length
);
548 int BlueFS::mkfs(uuid_d osd_uuid
, const bluefs_layout_t
& layout
)
551 << " osd_uuid " << osd_uuid
554 // set volume selector if not provided before/outside
555 if (vselector
== nullptr) {
557 new OriginalVolumeSelector(
558 get_block_device_size(BlueFS::BDEV_WAL
) * 95 / 100,
559 get_block_device_size(BlueFS::BDEV_DB
) * 95 / 100,
560 get_block_device_size(BlueFS::BDEV_SLOW
) * 95 / 100));
567 super
.block_size
= bdev
[BDEV_DB
]->get_block_size();
568 super
.osd_uuid
= osd_uuid
;
569 super
.uuid
.generate_random();
570 dout(1) << __func__
<< " uuid " << super
.uuid
<< dendl
;
573 FileRef log_file
= ceph::make_ref
<File
>();
574 log_file
->fnode
.ino
= 1;
575 log_file
->vselector_hint
= vselector
->get_hint_for_log();
577 vselector
->select_prefer_bdev(log_file
->vselector_hint
),
578 cct
->_conf
->bluefs_max_log_runway
,
580 vselector
->add_usage(log_file
->vselector_hint
, log_file
->fnode
);
582 log
.writer
= _create_writer(log_file
);
585 ceph_assert(log
.seq_live
== 1);
588 _flush_and_sync_log_LD();
591 super
.log_fnode
= log_file
->fnode
;
592 super
.memorized_layout
= layout
;
593 _write_super(BDEV_DB
);
597 super
= bluefs_super_t();
598 _close_writer(log
.writer
);
600 vselector
.reset(nullptr);
604 ceph_assert(shared_alloc
->need_init
);
605 shared_alloc
->need_init
= false;
608 dout(10) << __func__
<< " success" << dendl
;
612 void BlueFS::_init_alloc()
614 dout(20) << __func__
<< dendl
;
616 size_t wal_alloc_size
= 0;
617 if (bdev
[BDEV_WAL
]) {
618 wal_alloc_size
= cct
->_conf
->bluefs_alloc_size
;
619 alloc_size
[BDEV_WAL
] = wal_alloc_size
;
621 logger
->set(l_bluefs_wal_alloc_unit
, wal_alloc_size
);
623 if (bdev
[BDEV_SLOW
]) {
624 alloc_size
[BDEV_DB
] = cct
->_conf
->bluefs_alloc_size
;
625 alloc_size
[BDEV_SLOW
] = cct
->_conf
->bluefs_shared_alloc_size
;
626 logger
->set(l_bluefs_db_alloc_unit
, cct
->_conf
->bluefs_alloc_size
);
627 logger
->set(l_bluefs_main_alloc_unit
, cct
->_conf
->bluefs_shared_alloc_size
);
629 alloc_size
[BDEV_DB
] = cct
->_conf
->bluefs_shared_alloc_size
;
630 logger
->set(l_bluefs_main_alloc_unit
, 0);
631 logger
->set(l_bluefs_db_alloc_unit
, cct
->_conf
->bluefs_shared_alloc_size
);
633 // new wal and db devices are never shared
634 if (bdev
[BDEV_NEWWAL
]) {
635 alloc_size
[BDEV_NEWWAL
] = cct
->_conf
->bluefs_alloc_size
;
637 if (bdev
[BDEV_NEWDB
]) {
638 alloc_size
[BDEV_NEWDB
] = cct
->_conf
->bluefs_alloc_size
;
641 for (unsigned id
= 0; id
< bdev
.size(); ++id
) {
645 ceph_assert(bdev
[id
]->get_size());
646 ceph_assert(alloc_size
[id
]);
647 if (is_shared_alloc(id
)) {
648 dout(1) << __func__
<< " shared, id " << id
<< std::hex
649 << ", capacity 0x" << bdev
[id
]->get_size()
650 << ", block size 0x" << alloc_size
[id
]
651 << std::dec
<< dendl
;
653 std::string name
= "bluefs-";
654 const char* devnames
[] = { "wal","db","slow" };
656 name
+= devnames
[id
];
658 name
+= to_string(uintptr_t(this));
659 dout(1) << __func__
<< " new, id " << id
<< std::hex
660 << ", allocator name " << name
661 << ", allocator type " << cct
->_conf
->bluefs_allocator
662 << ", capacity 0x" << bdev
[id
]->get_size()
663 << ", block size 0x" << alloc_size
[id
]
664 << std::dec
<< dendl
;
665 alloc
[id
] = Allocator::create(cct
, cct
->_conf
->bluefs_allocator
,
666 bdev
[id
]->get_size(),
670 alloc
[id
]->init_add_free(
677 void BlueFS::_stop_alloc()
679 dout(20) << __func__
<< dendl
;
680 for (auto p
: bdev
) {
685 for (size_t i
= 0; i
< alloc
.size(); ++i
) {
686 if (alloc
[i
] && !is_shared_alloc(i
)) {
687 alloc
[i
]->shutdown();
694 int BlueFS::_read_and_check(uint8_t ndev
, uint64_t off
, uint64_t len
,
695 ceph::buffer::list
*pbl
, IOContext
*ioc
, bool buffered
)
697 dout(10) << __func__
<< " dev " << int(ndev
)
698 << ": 0x" << std::hex
<< off
<< "~" << len
<< std::dec
699 << (buffered
? " buffered" : "")
703 r
= _bdev_read(ndev
, off
, len
, &bl
, ioc
, buffered
);
707 uint64_t block_size
= bdev
[ndev
]->get_block_size();
708 if (inject_read_zeros
) {
709 if (len
>= block_size
* 2) {
710 derr
<< __func__
<< " injecting error, zeros at "
711 << int(ndev
) << ": 0x" << std::hex
<< (off
+ len
/ 2)
712 << "~" << (block_size
* 2) << std::dec
<< dendl
;
713 //use beginning, replace 8K in the middle with zeros, use tail
715 bl
.splice(0, len
/ 2 - block_size
, &temp
);
716 temp
.append(buffer::create(block_size
* 2, 0));
717 bl
.splice(block_size
* 2, len
/ 2 - block_size
, &temp
);
722 //make a check if there is a block with all 0
723 uint64_t to_check_len
= len
;
724 uint64_t skip
= p2nphase(off
, block_size
);
725 if (skip
>= to_check_len
) {
728 auto it
= bl
.begin(skip
);
729 to_check_len
-= skip
;
730 bool all_zeros
= false;
731 while (all_zeros
== false && to_check_len
>= block_size
) {
733 unsigned block_left
= block_size
;
737 while (all_zeros
&& block_left
> 0) {
738 avail
= it
.get_ptr_and_advance(block_left
, &data
);
740 all_zeros
= mem_is_zero(data
, avail
);
743 while (block_left
> 0) {
744 avail
= it
.get_ptr_and_advance(block_left
, &data
);
747 to_check_len
-= block_size
;
750 logger
->inc(l_bluefs_read_zeros_candidate
, 1);
751 bufferlist bl_reread
;
752 r
= _bdev_read(ndev
, off
, len
, &bl_reread
, ioc
, buffered
);
756 // check if both read gave the same
757 if (!bl
.contents_equal(bl_reread
)) {
758 // report problems to log, but continue, maybe it will be good now...
759 derr
<< __func__
<< " initial read of " << int(ndev
)
760 << ": 0x" << std::hex
<< off
<< "~" << len
761 << std::dec
<< ": different then re-read " << dendl
;
762 logger
->inc(l_bluefs_read_zeros_errors
, 1);
764 // use second read will be better if is different
765 pbl
->append(bl_reread
);
772 int BlueFS::_read_random_and_check(
773 uint8_t ndev
, uint64_t off
, uint64_t len
, char *buf
, bool buffered
)
775 dout(10) << __func__
<< " dev " << int(ndev
)
776 << ": 0x" << std::hex
<< off
<< "~" << len
<< std::dec
777 << (buffered
? " buffered" : "")
780 r
= _bdev_read_random(ndev
, off
, len
, buf
, buffered
);
784 uint64_t block_size
= bdev
[ndev
]->get_block_size();
785 if (inject_read_zeros
) {
786 if (len
>= block_size
* 2) {
787 derr
<< __func__
<< " injecting error, zeros at "
788 << int(ndev
) << ": 0x" << std::hex
<< (off
+ len
/ 2)
789 << "~" << (block_size
* 2) << std::dec
<< dendl
;
791 memset(buf
+ len
/ 2 - block_size
, 0, block_size
* 2);
795 //make a check if there is a block with all 0
796 uint64_t to_check_len
= len
;
797 const char* data
= buf
;
798 uint64_t skip
= p2nphase(off
, block_size
);
799 if (skip
>= to_check_len
) {
802 to_check_len
-= skip
;
805 bool all_zeros
= false;
806 while (all_zeros
== false && to_check_len
>= block_size
) {
807 if (mem_is_zero(data
, block_size
)) {
808 // at least one block is all zeros
813 to_check_len
-= block_size
;
816 logger
->inc(l_bluefs_read_zeros_candidate
, 1);
817 std::unique_ptr
<char[]> data_reread(new char[len
]);
818 r
= _bdev_read_random(ndev
, off
, len
, &data_reread
[0], buffered
);
822 // check if both read gave the same
823 if (memcmp(buf
, &data_reread
[0], len
) != 0) {
824 derr
<< __func__
<< " initial read of " << int(ndev
)
825 << ": 0x" << std::hex
<< off
<< "~" << len
826 << std::dec
<< ": different then re-read " << dendl
;
827 logger
->inc(l_bluefs_read_zeros_errors
, 1);
828 // second read is probably better
829 memcpy(buf
, &data_reread
[0], len
);
835 int BlueFS::_bdev_read(uint8_t ndev
, uint64_t off
, uint64_t len
,
836 ceph::buffer::list
* pbl
, IOContext
* ioc
, bool buffered
)
840 case BDEV_WAL
: cnt
= l_bluefs_read_disk_bytes_wal
; break;
841 case BDEV_DB
: cnt
= l_bluefs_read_disk_bytes_db
; break;
842 case BDEV_SLOW
: cnt
= l_bluefs_read_disk_bytes_slow
; break;
846 logger
->inc(cnt
, len
);
848 return bdev
[ndev
]->read(off
, len
, pbl
, ioc
, buffered
);
851 int BlueFS::_bdev_read_random(uint8_t ndev
, uint64_t off
, uint64_t len
,
852 char* buf
, bool buffered
)
856 case BDEV_WAL
: cnt
= l_bluefs_read_random_disk_bytes_wal
; break;
857 case BDEV_DB
: cnt
= l_bluefs_read_random_disk_bytes_db
; break;
858 case BDEV_SLOW
: cnt
= l_bluefs_read_random_disk_bytes_slow
; break;
861 logger
->inc(cnt
, len
);
863 return bdev
[ndev
]->read_random(off
, len
, buf
, buffered
);
868 dout(1) << __func__
<< dendl
;
871 int r
= _open_super();
873 derr
<< __func__
<< " failed to open super: " << cpp_strerror(r
) << dendl
;
877 // set volume selector if not provided before/outside
878 if (vselector
== nullptr) {
880 new OriginalVolumeSelector(
881 get_block_device_size(BlueFS::BDEV_WAL
) * 95 / 100,
882 get_block_device_size(BlueFS::BDEV_DB
) * 95 / 100,
883 get_block_device_size(BlueFS::BDEV_SLOW
) * 95 / 100));
888 r
= _replay(false, false);
890 derr
<< __func__
<< " failed to replay log: " << cpp_strerror(r
) << dendl
;
896 for (auto& p
: nodes
.file_map
) {
897 dout(30) << __func__
<< " noting alloc for " << p
.second
->fnode
<< dendl
;
898 for (auto& q
: p
.second
->fnode
.extents
) {
899 bool is_shared
= is_shared_alloc(q
.bdev
);
900 ceph_assert(!is_shared
|| (is_shared
&& shared_alloc
));
901 if (is_shared
&& shared_alloc
->need_init
&& shared_alloc
->a
) {
902 shared_alloc
->bluefs_used
+= q
.length
;
903 alloc
[q
.bdev
]->init_rm_free(q
.offset
, q
.length
);
904 } else if (!is_shared
) {
905 alloc
[q
.bdev
]->init_rm_free(q
.offset
, q
.length
);
910 shared_alloc
->need_init
= false;
911 dout(1) << __func__
<< " shared_bdev_used = "
912 << shared_alloc
->bluefs_used
<< dendl
;
914 dout(1) << __func__
<< " shared bdev not used"
918 // set up the log for future writes
919 log
.writer
= _create_writer(_get_file(1));
920 ceph_assert(log
.writer
->file
->fnode
.ino
== 1);
921 log
.writer
->pos
= log
.writer
->file
->fnode
.size
;
922 log
.writer
->file
->fnode
.reset_delta();
923 dout(10) << __func__
<< " log write pos set to 0x"
924 << std::hex
<< log
.writer
->pos
<< std::dec
927 logger
->set(l_bluefs_log_bytes
, log
.writer
->file
->fnode
.size
);
931 super
= bluefs_super_t();
935 int BlueFS::maybe_verify_layout(const bluefs_layout_t
& layout
) const
937 if (super
.memorized_layout
) {
938 if (layout
== *super
.memorized_layout
) {
939 dout(10) << __func__
<< " bluefs layout verified positively" << dendl
;
941 derr
<< __func__
<< " memorized layout doesn't fit current one" << dendl
;
945 dout(10) << __func__
<< " no memorized_layout in bluefs superblock"
952 void BlueFS::umount(bool avoid_compact
)
954 dout(1) << __func__
<< dendl
;
956 sync_metadata(avoid_compact
);
957 if (cct
->_conf
->bluefs_check_volume_selector_on_umount
) {
958 _check_vselector_LNF();
960 _close_writer(log
.writer
);
964 vselector
.reset(nullptr);
966 nodes
.file_map
.clear();
967 nodes
.dir_map
.clear();
968 super
= bluefs_super_t();
972 int BlueFS::prepare_new_device(int id
, const bluefs_layout_t
& layout
)
974 dout(1) << __func__
<< dendl
;
976 if(id
== BDEV_NEWDB
) {
977 int new_log_dev_cur
= BDEV_WAL
;
978 int new_log_dev_next
= BDEV_WAL
;
979 if (!bdev
[BDEV_WAL
]) {
980 new_log_dev_cur
= BDEV_NEWDB
;
981 new_log_dev_next
= BDEV_DB
;
983 _rewrite_log_and_layout_sync_LNF_LD(false,
990 } else if(id
== BDEV_NEWWAL
) {
991 _rewrite_log_and_layout_sync_LNF_LD(false,
1003 void BlueFS::collect_metadata(map
<string
,string
> *pm
, unsigned skip_bdev_id
)
1005 if (skip_bdev_id
!= BDEV_DB
&& bdev
[BDEV_DB
])
1006 bdev
[BDEV_DB
]->collect_metadata("bluefs_db_", pm
);
1008 bdev
[BDEV_WAL
]->collect_metadata("bluefs_wal_", pm
);
1011 void BlueFS::get_devices(set
<string
> *ls
)
1013 for (unsigned i
= 0; i
< MAX_BDEV
; ++i
) {
1015 bdev
[i
]->get_devices(ls
);
1022 dout(1) << __func__
<< dendl
;
1023 // hrm, i think we check everything on mount...
1027 int BlueFS::_write_super(int dev
)
1032 uint32_t crc
= bl
.crc32c(-1);
1034 dout(10) << __func__
<< " super block length(encoded): " << bl
.length() << dendl
;
1035 dout(10) << __func__
<< " superblock " << super
.version
<< dendl
;
1036 dout(10) << __func__
<< " log_fnode " << super
.log_fnode
<< dendl
;
1037 ceph_assert_always(bl
.length() <= get_super_length());
1038 bl
.append_zero(get_super_length() - bl
.length());
1040 bdev
[dev
]->write(get_super_offset(), bl
, false, WRITE_LIFE_SHORT
);
1041 dout(20) << __func__
<< " v " << super
.version
1042 << " crc 0x" << std::hex
<< crc
1043 << " offset 0x" << get_super_offset() << std::dec
1048 int BlueFS::_open_super()
1050 dout(10) << __func__
<< dendl
;
1053 uint32_t expected_crc
, crc
;
1056 // always the second block
1057 r
= _bdev_read(BDEV_DB
, get_super_offset(), get_super_length(),
1058 &bl
, ioc
[BDEV_DB
], false);
1062 auto p
= bl
.cbegin();
1066 t
.substr_of(bl
, 0, p
.get_off());
1069 decode(expected_crc
, p
);
1070 if (crc
!= expected_crc
) {
1071 derr
<< __func__
<< " bad crc on superblock, expected 0x"
1072 << std::hex
<< expected_crc
<< " != actual 0x" << crc
<< std::dec
1076 dout(10) << __func__
<< " superblock " << super
.version
<< dendl
;
1077 dout(10) << __func__
<< " log_fnode " << super
.log_fnode
<< dendl
;
1081 int BlueFS::_check_allocations(const bluefs_fnode_t
& fnode
,
1082 boost::dynamic_bitset
<uint64_t>* used_blocks
,
1083 bool is_alloc
, //true when allocating, false when deallocating
1084 const char* op_name
)
1086 auto& fnode_extents
= fnode
.extents
;
1087 for (auto e
: fnode_extents
) {
1090 ceph_assert(id
< MAX_BDEV
);
1091 if (int r
= _verify_alloc_granularity(id
, e
.offset
, e
.length
,
1096 apply_for_bitset_range(e
.offset
, e
.length
, alloc_size
[id
], used_blocks
[id
],
1097 [&](uint64_t pos
, boost::dynamic_bitset
<uint64_t> &bs
) {
1098 if (is_alloc
== bs
.test(pos
)) {
1106 derr
<< __func__
<< " " << op_name
<< " invalid extent " << int(e
.bdev
)
1107 << ": 0x" << std::hex
<< e
.offset
<< "~" << e
.length
<< std::dec
1108 << (is_alloc
== true ?
1109 ": duplicate reference, ino " : ": double free, ino ")
1110 << fnode
.ino
<< dendl
;
1117 int BlueFS::_verify_alloc_granularity(
1118 __u8 id
, uint64_t offset
, uint64_t length
, const char *op
)
1120 if ((offset
& (alloc_size
[id
] - 1)) ||
1121 (length
& (alloc_size
[id
] - 1))) {
1122 derr
<< __func__
<< " " << op
<< " of " << (int)id
1123 << ":0x" << std::hex
<< offset
<< "~" << length
<< std::dec
1124 << " does not align to alloc_size 0x"
1125 << std::hex
<< alloc_size
[id
] << std::dec
<< dendl
;
1127 auto need
= alloc_size
[id
];
1128 while (need
&& ((offset
& (need
- 1)) ||
1129 (length
& (need
- 1)))) {
1134 if (id
== BDEV_SLOW
||
1135 (id
== BDEV_DB
&& !bdev
[BDEV_SLOW
])) {
1136 which
= "bluefs_shared_alloc_size";
1138 which
= "bluefs_alloc_size";
1140 derr
<< "work-around by setting " << which
<< " = " << need
1141 << " for this OSD" << dendl
;
1148 int BlueFS::_replay(bool noop
, bool to_stdout
)
1150 dout(10) << __func__
<< (noop
? " NO-OP" : "") << dendl
;
1151 ino_last
= 1; // by the log
1152 uint64_t log_seq
= 0;
1155 log_file
= _get_file(1);
1157 log_file
->fnode
= super
.log_fnode
;
1159 log_file
->vselector_hint
=
1160 vselector
->get_hint_for_log();
1162 // do not use fnode from superblock in 'noop' mode - log_file's one should
1163 // be fine and up-to-date
1164 ceph_assert(log_file
->fnode
.ino
== 1);
1165 ceph_assert(log_file
->fnode
.extents
.size() != 0);
1167 dout(10) << __func__
<< " log_fnode " << super
.log_fnode
<< dendl
;
1168 if (unlikely(to_stdout
)) {
1169 std::cout
<< " log_fnode " << super
.log_fnode
<< std::endl
;
1172 FileReader
*log_reader
= new FileReader(
1173 log_file
, cct
->_conf
->bluefs_max_prefetch
,
1175 true); // ignore eof
1177 bool seen_recs
= false;
1179 boost::dynamic_bitset
<uint64_t> used_blocks
[MAX_BDEV
];
1182 if (cct
->_conf
->bluefs_log_replay_check_allocations
) {
1183 for (size_t i
= 0; i
< MAX_BDEV
; ++i
) {
1184 if (alloc_size
[i
] != 0 && bdev
[i
] != nullptr) {
1185 used_blocks
[i
].resize(round_up_to(bdev
[i
]->get_size(), alloc_size
[i
]) / alloc_size
[i
]);
1188 // check initial log layout
1189 int r
= _check_allocations(log_file
->fnode
,
1190 used_blocks
, true, "Log from super");
1198 ceph_assert((log_reader
->buf
.pos
& ~super
.block_mask()) == 0);
1199 uint64_t pos
= log_reader
->buf
.pos
;
1200 uint64_t read_pos
= pos
;
1203 int r
= _read(log_reader
, read_pos
, super
.block_size
,
1205 if (r
!= (int)super
.block_size
&& cct
->_conf
->bluefs_replay_recovery
) {
1206 r
+= _do_replay_recovery_read(log_reader
, pos
, read_pos
+ r
, super
.block_size
- r
, &bl
);
1208 assert(r
== (int)super
.block_size
);
1215 auto p
= bl
.cbegin();
1223 if (len
+ 6 > bl
.length()) {
1224 more
= round_up_to(len
+ 6 - bl
.length(), super
.block_size
);
1227 if (uuid
!= super
.uuid
) {
1229 dout(10) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
1230 << ": stop: uuid " << uuid
<< " != super.uuid " << super
.uuid
1233 derr
<< __func__
<< " 0x" << std::hex
<< pos
<< std::dec
1234 << ": stop: uuid " << uuid
<< " != super.uuid " << super
.uuid
1235 << ", block dump: \n";
1237 t
.substr_of(bl
, 0, super
.block_size
);
1243 if (seq
!= log_seq
+ 1) {
1245 dout(10) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
1246 << ": stop: seq " << seq
<< " != expected " << log_seq
+ 1
1249 derr
<< __func__
<< " 0x" << std::hex
<< pos
<< std::dec
1250 << ": stop: seq " << seq
<< " != expected " << log_seq
+ 1
1256 dout(20) << __func__
<< " need 0x" << std::hex
<< more
<< std::dec
1257 << " more bytes" << dendl
;
1259 int r
= _read(log_reader
, read_pos
, more
, &t
, NULL
);
1260 if (r
< (int)more
) {
1261 dout(10) << __func__
<< " 0x" << std::hex
<< pos
1262 << ": stop: len is 0x" << bl
.length() + more
<< std::dec
1263 << ", which is past eof" << dendl
;
1264 if (cct
->_conf
->bluefs_replay_recovery
) {
1265 //try to search for more data
1266 r
+= _do_replay_recovery_read(log_reader
, pos
, read_pos
+ r
, more
- r
, &t
);
1267 if (r
< (int)more
) {
1268 //in normal mode we must read r==more, for recovery it is too strict
1273 ceph_assert(r
== (int)more
);
1277 bluefs_transaction_t t
;
1279 auto p
= bl
.cbegin();
1283 catch (ceph::buffer::error
& e
) {
1284 // Multi-block transactions might be incomplete due to unexpected
1285 // power off. Hence let's treat that as a regular stop condition.
1286 if (seen_recs
&& more
) {
1287 dout(10) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
1288 << ": stop: failed to decode: " << e
.what()
1291 derr
<< __func__
<< " 0x" << std::hex
<< pos
<< std::dec
1292 << ": stop: failed to decode: " << e
.what()
1299 ceph_assert(seq
== t
.seq
);
1300 dout(10) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
1301 << ": " << t
<< dendl
;
1302 if (unlikely(to_stdout
)) {
1303 std::cout
<< " 0x" << std::hex
<< pos
<< std::dec
1304 << ": " << t
<< std::endl
;
1307 auto p
= t
.op_bl
.cbegin();
1313 case bluefs_transaction_t::OP_INIT
:
1314 dout(20) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
1315 << ": op_init" << dendl
;
1316 if (unlikely(to_stdout
)) {
1317 std::cout
<< " 0x" << std::hex
<< pos
<< std::dec
1318 << ": op_init" << std::endl
;
1321 ceph_assert(t
.seq
== 1);
1324 case bluefs_transaction_t::OP_JUMP
:
1328 decode(next_seq
, p
);
1330 dout(20) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
1331 << ": op_jump seq " << next_seq
1332 << " offset 0x" << std::hex
<< offset
<< std::dec
<< dendl
;
1333 if (unlikely(to_stdout
)) {
1334 std::cout
<< " 0x" << std::hex
<< pos
<< std::dec
1335 << ": op_jump seq " << next_seq
1336 << " offset 0x" << std::hex
<< offset
<< std::dec
1340 ceph_assert(next_seq
> log_seq
);
1341 log_seq
= next_seq
- 1; // we will increment it below
1342 uint64_t skip
= offset
- read_pos
;
1345 int r
= _read(log_reader
, read_pos
, skip
, &junk
,
1347 if (r
!= (int)skip
) {
1348 dout(10) << __func__
<< " 0x" << std::hex
<< read_pos
1349 << ": stop: failed to skip to " << offset
1350 << std::dec
<< dendl
;
1351 ceph_abort_msg("problem with op_jump");
1357 case bluefs_transaction_t::OP_JUMP_SEQ
:
1360 decode(next_seq
, p
);
1361 dout(20) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
1362 << ": op_jump_seq " << next_seq
<< dendl
;
1363 if (unlikely(to_stdout
)) {
1364 std::cout
<< " 0x" << std::hex
<< pos
<< std::dec
1365 << ": op_jump_seq " << next_seq
<< std::endl
;
1368 ceph_assert(next_seq
> log_seq
);
1369 log_seq
= next_seq
- 1; // we will increment it below
1373 case bluefs_transaction_t::OP_ALLOC_ADD
:
1374 // LEGACY, do nothing but read params
1377 uint64_t offset
, length
;
1384 case bluefs_transaction_t::OP_ALLOC_RM
:
1385 // LEGACY, do nothing but read params
1388 uint64_t offset
, length
;
1395 case bluefs_transaction_t::OP_DIR_LINK
:
1397 string dirname
, filename
;
1400 decode(filename
, p
);
1402 dout(20) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
1403 << ": op_dir_link " << " " << dirname
<< "/" << filename
1406 if (unlikely(to_stdout
)) {
1407 std::cout
<< " 0x" << std::hex
<< pos
<< std::dec
1408 << ": op_dir_link " << " " << dirname
<< "/" << filename
1414 FileRef file
= _get_file(ino
);
1415 ceph_assert(file
->fnode
.ino
);
1416 map
<string
,DirRef
>::iterator q
= nodes
.dir_map
.find(dirname
);
1417 ceph_assert(q
!= nodes
.dir_map
.end());
1418 map
<string
,FileRef
>::iterator r
= q
->second
->file_map
.find(filename
);
1419 ceph_assert(r
== q
->second
->file_map
.end());
1421 vselector
->sub_usage(file
->vselector_hint
, file
->fnode
);
1422 file
->vselector_hint
=
1423 vselector
->get_hint_by_dir(dirname
);
1424 vselector
->add_usage(file
->vselector_hint
, file
->fnode
);
1426 q
->second
->file_map
[filename
] = file
;
1432 case bluefs_transaction_t::OP_DIR_UNLINK
:
1434 string dirname
, filename
;
1436 decode(filename
, p
);
1437 dout(20) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
1438 << ": op_dir_unlink " << " " << dirname
<< "/" << filename
1440 if (unlikely(to_stdout
)) {
1441 std::cout
<< " 0x" << std::hex
<< pos
<< std::dec
1442 << ": op_dir_unlink " << " " << dirname
<< "/" << filename
1447 map
<string
,DirRef
>::iterator q
= nodes
.dir_map
.find(dirname
);
1448 ceph_assert(q
!= nodes
.dir_map
.end());
1449 map
<string
,FileRef
>::iterator r
= q
->second
->file_map
.find(filename
);
1450 ceph_assert(r
!= q
->second
->file_map
.end());
1451 ceph_assert(r
->second
->refs
> 0);
1453 q
->second
->file_map
.erase(r
);
1458 case bluefs_transaction_t::OP_DIR_CREATE
:
1462 dout(20) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
1463 << ": op_dir_create " << dirname
<< dendl
;
1464 if (unlikely(to_stdout
)) {
1465 std::cout
<< " 0x" << std::hex
<< pos
<< std::dec
1466 << ": op_dir_create " << dirname
<< std::endl
;
1470 map
<string
,DirRef
>::iterator q
= nodes
.dir_map
.find(dirname
);
1471 ceph_assert(q
== nodes
.dir_map
.end());
1472 nodes
.dir_map
[dirname
] = ceph::make_ref
<Dir
>();
1477 case bluefs_transaction_t::OP_DIR_REMOVE
:
1481 dout(20) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
1482 << ": op_dir_remove " << dirname
<< dendl
;
1483 if (unlikely(to_stdout
)) {
1484 std::cout
<< " 0x" << std::hex
<< pos
<< std::dec
1485 << ": op_dir_remove " << dirname
<< std::endl
;
1489 map
<string
,DirRef
>::iterator q
= nodes
.dir_map
.find(dirname
);
1490 ceph_assert(q
!= nodes
.dir_map
.end());
1491 ceph_assert(q
->second
->file_map
.empty());
1492 nodes
.dir_map
.erase(q
);
1497 case bluefs_transaction_t::OP_FILE_UPDATE
:
1499 bluefs_fnode_t fnode
;
1501 dout(20) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
1502 << ": op_file_update " << " " << fnode
<< " " << dendl
;
1503 if (unlikely(to_stdout
)) {
1504 std::cout
<< " 0x" << std::hex
<< pos
<< std::dec
1505 << ": op_file_update " << " " << fnode
<< std::endl
;
1508 FileRef f
= _get_file(fnode
.ino
);
1509 if (cct
->_conf
->bluefs_log_replay_check_allocations
) {
1510 int r
= _check_allocations(f
->fnode
,
1511 used_blocks
, false, "OP_FILE_UPDATE");
1516 if (fnode
.ino
!= 1) {
1517 vselector
->sub_usage(f
->vselector_hint
, f
->fnode
);
1520 if (fnode
.ino
!= 1) {
1521 vselector
->add_usage(f
->vselector_hint
, f
->fnode
);
1524 if (fnode
.ino
> ino_last
) {
1525 ino_last
= fnode
.ino
;
1527 if (cct
->_conf
->bluefs_log_replay_check_allocations
) {
1528 int r
= _check_allocations(f
->fnode
,
1529 used_blocks
, true, "OP_FILE_UPDATE");
1534 } else if (noop
&& fnode
.ino
== 1) {
1535 FileRef f
= _get_file(fnode
.ino
);
1540 case bluefs_transaction_t::OP_FILE_UPDATE_INC
:
1542 bluefs_fnode_delta_t delta
;
1544 dout(20) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
1545 << ": op_file_update_inc " << " " << delta
<< " " << dendl
;
1546 if (unlikely(to_stdout
)) {
1547 std::cout
<< " 0x" << std::hex
<< pos
<< std::dec
1548 << ": op_file_update_inc " << " " << delta
<< std::endl
;
1551 FileRef f
= _get_file(delta
.ino
);
1552 bluefs_fnode_t
& fnode
= f
->fnode
;
1553 if (delta
.offset
!= fnode
.allocated
) {
1554 derr
<< __func__
<< " invalid op_file_update_inc, new extents miss end of file"
1555 << " fnode=" << fnode
1556 << " delta=" << delta
1558 ceph_assert(delta
.offset
== fnode
.allocated
);
1560 if (cct
->_conf
->bluefs_log_replay_check_allocations
) {
1561 int r
= _check_allocations(fnode
,
1562 used_blocks
, false, "OP_FILE_UPDATE_INC");
1568 fnode
.ino
= delta
.ino
;
1569 fnode
.mtime
= delta
.mtime
;
1570 if (fnode
.ino
!= 1) {
1571 vselector
->sub_usage(f
->vselector_hint
, fnode
);
1573 fnode
.size
= delta
.size
;
1574 fnode
.claim_extents(delta
.extents
);
1575 dout(20) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
1576 << ": op_file_update_inc produced " << " " << fnode
<< " " << dendl
;
1578 if (fnode
.ino
!= 1) {
1579 vselector
->add_usage(f
->vselector_hint
, fnode
);
1582 if (fnode
.ino
> ino_last
) {
1583 ino_last
= fnode
.ino
;
1585 if (cct
->_conf
->bluefs_log_replay_check_allocations
) {
1586 int r
= _check_allocations(f
->fnode
,
1587 used_blocks
, true, "OP_FILE_UPDATE_INC");
1592 } else if (noop
&& delta
.ino
== 1) {
1593 // we need to track bluefs log, even in noop mode
1594 FileRef f
= _get_file(1);
1595 bluefs_fnode_t
& fnode
= f
->fnode
;
1596 fnode
.ino
= delta
.ino
;
1597 fnode
.mtime
= delta
.mtime
;
1598 fnode
.size
= delta
.size
;
1599 fnode
.claim_extents(delta
.extents
);
1604 case bluefs_transaction_t::OP_FILE_REMOVE
:
1608 dout(20) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
1609 << ": op_file_remove " << ino
<< dendl
;
1610 if (unlikely(to_stdout
)) {
1611 std::cout
<< " 0x" << std::hex
<< pos
<< std::dec
1612 << ": op_file_remove " << ino
<< std::endl
;
1616 auto p
= nodes
.file_map
.find(ino
);
1617 ceph_assert(p
!= nodes
.file_map
.end());
1618 vselector
->sub_usage(p
->second
->vselector_hint
, p
->second
->fnode
);
1619 if (cct
->_conf
->bluefs_log_replay_check_allocations
) {
1620 int r
= _check_allocations(p
->second
->fnode
,
1621 used_blocks
, false, "OP_FILE_REMOVE");
1626 nodes
.file_map
.erase(p
);
1632 derr
<< __func__
<< " 0x" << std::hex
<< pos
<< std::dec
1633 << ": stop: unrecognized op " << (int)op
<< dendl
;
1638 ceph_assert(p
.end());
1640 // we successfully replayed the transaction; bump the seq and log size
1642 log_file
->fnode
.size
= log_reader
->buf
.pos
;
1645 vselector
->add_usage(log_file
->vselector_hint
, log_file
->fnode
);
1646 log
.seq_live
= log_seq
+ 1;
1647 dirty
.seq_live
= log_seq
+ 1;
1648 log
.t
.seq
= log
.seq_live
;
1649 dirty
.seq_stable
= log_seq
;
1652 dout(10) << __func__
<< " log file size was 0x"
1653 << std::hex
<< log_file
->fnode
.size
<< std::dec
<< dendl
;
1654 if (unlikely(to_stdout
)) {
1655 std::cout
<< " log file size was 0x"
1656 << std::hex
<< log_file
->fnode
.size
<< std::dec
<< std::endl
;
1662 // verify file link counts are all >0
1663 for (auto& p
: nodes
.file_map
) {
1664 if (p
.second
->refs
== 0 &&
1665 p
.second
->fnode
.ino
> 1) {
1666 derr
<< __func__
<< " file with link count 0: " << p
.second
->fnode
1672 // reflect file count in logger
1673 logger
->set(l_bluefs_num_files
, nodes
.file_map
.size());
1675 dout(10) << __func__
<< " done" << dendl
;
1679 int BlueFS::log_dump()
1681 // only dump log file's content
1682 ceph_assert(log
.writer
== nullptr && "cannot log_dump on mounted BlueFS");
1684 int r
= _open_super();
1686 derr
<< __func__
<< " failed to open super: " << cpp_strerror(r
) << dendl
;
1689 r
= _replay(true, true);
1691 derr
<< __func__
<< " failed to replay log: " << cpp_strerror(r
) << dendl
;
1694 super
= bluefs_super_t();
1698 int BlueFS::device_migrate_to_existing(
1700 const set
<int>& devs_source
,
1702 const bluefs_layout_t
& layout
)
1705 bool buffered
= cct
->_conf
->bluefs_buffered_io
;
1707 dout(10) << __func__
<< " devs_source " << devs_source
1708 << " dev_target " << dev_target
<< dendl
;
1709 assert(dev_target
< (int)MAX_BDEV
);
1712 flags
|= devs_source
.count(BDEV_DB
) ?
1713 (REMOVE_DB
| RENAME_SLOW2DB
) : 0;
1714 flags
|= devs_source
.count(BDEV_WAL
) ? REMOVE_WAL
: 0;
1715 int dev_target_new
= dev_target
;
1717 // Slow device without separate DB one is addressed via BDEV_DB
1718 // Hence need renaming.
1719 if ((flags
& REMOVE_DB
) && dev_target
== BDEV_SLOW
) {
1720 dev_target_new
= BDEV_DB
;
1721 dout(0) << __func__
<< " super to be written to " << dev_target
<< dendl
;
1724 for (auto& [ino
, file_ref
] : nodes
.file_map
) {
1726 if (file_ref
->fnode
.ino
== 1) {
1729 dout(10) << __func__
<< " " << ino
<< " " << file_ref
->fnode
<< dendl
;
1731 auto& fnode_extents
= file_ref
->fnode
.extents
;
1732 vselector
->sub_usage(file_ref
->vselector_hint
, file_ref
->fnode
);
1734 bool rewrite
= std::any_of(
1735 fnode_extents
.begin(),
1736 fnode_extents
.end(),
1738 return ext
.bdev
!= dev_target
&& devs_source
.count(ext
.bdev
);
1741 dout(10) << __func__
<< " migrating" << dendl
;
1745 for (auto old_ext
: fnode_extents
) {
1746 buf
.resize(old_ext
.length
);
1747 int r
= _bdev_read_random(old_ext
.bdev
,
1753 derr
<< __func__
<< " failed to read 0x" << std::hex
1754 << old_ext
.offset
<< "~" << old_ext
.length
<< std::dec
1755 << " from " << (int)dev_target
<< dendl
;
1758 bl
.append((char*)&buf
[0], old_ext
.length
);
1761 // write entire file
1762 PExtentVector extents
;
1763 auto l
= _allocate_without_fallback(dev_target
, bl
.length(), &extents
);
1765 derr
<< __func__
<< " unable to allocate len 0x" << std::hex
1766 << bl
.length() << std::dec
<< " from " << (int)dev_target
1767 << ": " << cpp_strerror(l
) << dendl
;
1772 for (auto& i
: extents
) {
1774 uint64_t cur_len
= std::min
<uint64_t>(i
.length
, bl
.length() - off
);
1775 ceph_assert(cur_len
> 0);
1776 cur
.substr_of(bl
, off
, cur_len
);
1777 int r
= bdev
[dev_target
]->write(i
.offset
, cur
, buffered
);
1778 ceph_assert(r
== 0);
1782 // release old extents
1783 for (auto old_ext
: fnode_extents
) {
1784 PExtentVector to_release
;
1785 to_release
.emplace_back(old_ext
.offset
, old_ext
.length
);
1786 alloc
[old_ext
.bdev
]->release(to_release
);
1787 if (is_shared_alloc(old_ext
.bdev
)) {
1788 shared_alloc
->bluefs_used
-= to_release
.size();
1793 fnode_extents
.clear();
1794 for (auto& i
: extents
) {
1795 fnode_extents
.emplace_back(dev_target_new
, i
.offset
, i
.length
);
1798 for (auto& ext
: fnode_extents
) {
1799 if (dev_target
!= dev_target_new
&& ext
.bdev
== dev_target
) {
1800 dout(20) << __func__
<< " " << " ... adjusting extent 0x"
1801 << std::hex
<< ext
.offset
<< std::dec
1802 << " bdev " << dev_target
<< " -> " << dev_target_new
1804 ext
.bdev
= dev_target_new
;
1808 vselector
->add_usage(file_ref
->vselector_hint
, file_ref
->fnode
);
1810 // new logging device in the current naming scheme
1811 int new_log_dev_cur
= bdev
[BDEV_WAL
] ?
1813 bdev
[BDEV_DB
] ? BDEV_DB
: BDEV_SLOW
;
1815 // new logging device in new naming scheme
1816 int new_log_dev_next
= new_log_dev_cur
;
1818 if (devs_source
.count(new_log_dev_cur
)) {
1819 // SLOW device is addressed via BDEV_DB too hence either WAL or DB
1820 new_log_dev_next
= (flags
& REMOVE_WAL
) || !bdev
[BDEV_WAL
] ?
1824 dout(0) << __func__
<< " log moved from " << new_log_dev_cur
1825 << " to " << new_log_dev_next
<< dendl
;
1828 (flags
& REMOVE_DB
) && new_log_dev_next
== BDEV_DB
?
1833 _rewrite_log_and_layout_sync_LNF_LD(
1835 (flags
& REMOVE_DB
) ? BDEV_SLOW
: BDEV_DB
,
1843 int BlueFS::device_migrate_to_new(
1845 const set
<int>& devs_source
,
1847 const bluefs_layout_t
& layout
)
1850 bool buffered
= cct
->_conf
->bluefs_buffered_io
;
1852 dout(10) << __func__
<< " devs_source " << devs_source
1853 << " dev_target " << dev_target
<< dendl
;
1854 assert(dev_target
== (int)BDEV_NEWDB
|| dev_target
== (int)BDEV_NEWWAL
);
1858 flags
|= devs_source
.count(BDEV_DB
) ?
1859 (!bdev
[BDEV_SLOW
] ? RENAME_DB2SLOW
: REMOVE_DB
) :
1861 flags
|= devs_source
.count(BDEV_WAL
) ? REMOVE_WAL
: 0;
1862 int dev_target_new
= dev_target
; //FIXME: remove, makes no sense
1864 for (auto& p
: nodes
.file_map
) {
1866 if (p
.second
->fnode
.ino
== 1) {
1869 dout(10) << __func__
<< " " << p
.first
<< " " << p
.second
->fnode
<< dendl
;
1871 auto& fnode_extents
= p
.second
->fnode
.extents
;
1873 bool rewrite
= false;
1874 for (auto ext_it
= fnode_extents
.begin();
1875 ext_it
!= p
.second
->fnode
.extents
.end();
1877 if (ext_it
->bdev
!= dev_target
&& devs_source
.count(ext_it
->bdev
)) {
1883 dout(10) << __func__
<< " migrating" << dendl
;
1887 for (auto old_ext
: fnode_extents
) {
1888 buf
.resize(old_ext
.length
);
1889 int r
= _bdev_read_random(old_ext
.bdev
,
1895 derr
<< __func__
<< " failed to read 0x" << std::hex
1896 << old_ext
.offset
<< "~" << old_ext
.length
<< std::dec
1897 << " from " << (int)dev_target
<< dendl
;
1900 bl
.append((char*)&buf
[0], old_ext
.length
);
1903 // write entire file
1904 PExtentVector extents
;
1905 auto l
= _allocate_without_fallback(dev_target
, bl
.length(), &extents
);
1907 derr
<< __func__
<< " unable to allocate len 0x" << std::hex
1908 << bl
.length() << std::dec
<< " from " << (int)dev_target
1909 << ": " << cpp_strerror(l
) << dendl
;
1914 for (auto& i
: extents
) {
1916 uint64_t cur_len
= std::min
<uint64_t>(i
.length
, bl
.length() - off
);
1917 ceph_assert(cur_len
> 0);
1918 cur
.substr_of(bl
, off
, cur_len
);
1919 int r
= bdev
[dev_target
]->write(i
.offset
, cur
, buffered
);
1920 ceph_assert(r
== 0);
1924 // release old extents
1925 for (auto old_ext
: fnode_extents
) {
1926 PExtentVector to_release
;
1927 to_release
.emplace_back(old_ext
.offset
, old_ext
.length
);
1928 alloc
[old_ext
.bdev
]->release(to_release
);
1929 if (is_shared_alloc(old_ext
.bdev
)) {
1930 shared_alloc
->bluefs_used
-= to_release
.size();
1935 fnode_extents
.clear();
1936 for (auto& i
: extents
) {
1937 fnode_extents
.emplace_back(dev_target_new
, i
.offset
, i
.length
);
1941 // new logging device in the current naming scheme
1942 int new_log_dev_cur
=
1945 bdev
[BDEV_WAL
] && !(flags
& REMOVE_WAL
) ?
1949 bdev
[BDEV_DB
] && !(flags
& REMOVE_DB
)?
1953 // new logging device in new naming scheme
1954 int new_log_dev_next
=
1955 new_log_dev_cur
== BDEV_NEWWAL
?
1957 new_log_dev_cur
== BDEV_NEWDB
?
1962 dev_target
== BDEV_NEWDB
?
1968 _rewrite_log_and_layout_sync_LNF_LD(
1978 BlueFS::FileRef
BlueFS::_get_file(uint64_t ino
)
1980 auto p
= nodes
.file_map
.find(ino
);
1981 if (p
== nodes
.file_map
.end()) {
1982 FileRef f
= ceph::make_ref
<File
>();
1983 nodes
.file_map
[ino
] = f
;
1984 // track files count in logger
1985 logger
->set(l_bluefs_num_files
, nodes
.file_map
.size());
1986 dout(30) << __func__
<< " ino " << ino
<< " = " << f
1987 << " (new)" << dendl
;
1990 dout(30) << __func__
<< " ino " << ino
<< " = " << p
->second
<< dendl
;
1997 To modify fnode both FileWriter::lock and File::lock must be obtained.
1998 The special case is when we modify bluefs log (ino 1) or
1999 we are compacting log (ino 0).
2001 In any case it is enough to hold File::lock to be sure fnode will not be modified.
2003 struct lock_fnode_print
{
2004 BlueFS::FileRef file
;
2005 lock_fnode_print(BlueFS::FileRef file
) : file(file
) {};
2007 std::ostream
& operator<<(std::ostream
& out
, const lock_fnode_print
& to_lock
) {
2008 std::lock_guard
l(to_lock
.file
->lock
);
2009 out
<< to_lock
.file
->fnode
;
2013 void BlueFS::_drop_link_D(FileRef file
)
2015 dout(20) << __func__
<< " had refs " << file
->refs
2016 << " on " << lock_fnode_print(file
) << dendl
;
2017 ceph_assert(file
->refs
> 0);
2018 ceph_assert(ceph_mutex_is_locked(log
.lock
));
2019 ceph_assert(ceph_mutex_is_locked(nodes
.lock
));
2022 if (file
->refs
== 0) {
2023 dout(20) << __func__
<< " destroying " << file
->fnode
<< dendl
;
2024 ceph_assert(file
->num_reading
.load() == 0);
2025 vselector
->sub_usage(file
->vselector_hint
, file
->fnode
);
2026 log
.t
.op_file_remove(file
->fnode
.ino
);
2027 nodes
.file_map
.erase(file
->fnode
.ino
);
2028 logger
->set(l_bluefs_num_files
, nodes
.file_map
.size());
2029 file
->deleted
= true;
2031 std::lock_guard
dl(dirty
.lock
);
2032 for (auto& r
: file
->fnode
.extents
) {
2033 dirty
.pending_release
[r
.bdev
].insert(r
.offset
, r
.length
);
2035 if (file
->dirty_seq
> dirty
.seq_stable
) {
2036 // retract request to serialize changes
2037 ceph_assert(dirty
.files
.count(file
->dirty_seq
));
2038 auto it
= dirty
.files
[file
->dirty_seq
].iterator_to(*file
);
2039 dirty
.files
[file
->dirty_seq
].erase(it
);
2040 file
->dirty_seq
= dirty
.seq_stable
;
2045 int64_t BlueFS::_read_random(
2046 FileReader
*h
, ///< [in] read from here
2047 uint64_t off
, ///< [in] offset
2048 uint64_t len
, ///< [in] this many bytes
2049 char *out
) ///< [out] copy it here
2051 auto* buf
= &h
->buf
;
2054 dout(10) << __func__
<< " h " << h
2055 << " 0x" << std::hex
<< off
<< "~" << len
<< std::dec
2056 << " from " << lock_fnode_print(h
->file
) << dendl
;
2058 ++h
->file
->num_reading
;
2060 if (!h
->ignore_eof
&&
2061 off
+ len
> h
->file
->fnode
.size
) {
2062 if (off
> h
->file
->fnode
.size
)
2065 len
= h
->file
->fnode
.size
- off
;
2066 dout(20) << __func__
<< " reaching (or past) eof, len clipped to 0x"
2067 << std::hex
<< len
<< std::dec
<< dendl
;
2069 logger
->inc(l_bluefs_read_random_count
, 1);
2070 logger
->inc(l_bluefs_read_random_bytes
, len
);
2072 std::shared_lock
s_lock(h
->lock
);
2073 buf
->bl
.reassign_to_mempool(mempool::mempool_bluefs_file_reader
);
2075 if (off
< buf
->bl_off
|| off
>= buf
->get_buf_end()) {
2078 auto p
= h
->file
->fnode
.seek(off
, &x_off
);
2079 ceph_assert(p
!= h
->file
->fnode
.extents
.end());
2080 uint64_t l
= std::min(p
->length
- x_off
, len
);
2082 l
= std::min(l
, uint64_t(1) << 30);
2083 dout(20) << __func__
<< " read random 0x"
2084 << std::hex
<< x_off
<< "~" << l
<< std::dec
2085 << " of " << *p
<< dendl
;
2087 if (!cct
->_conf
->bluefs_check_for_zeros
) {
2088 r
= _bdev_read_random(p
->bdev
, p
->offset
+ x_off
, l
, out
,
2089 cct
->_conf
->bluefs_buffered_io
);
2091 r
= _read_random_and_check(p
->bdev
, p
->offset
+ x_off
, l
, out
,
2092 cct
->_conf
->bluefs_buffered_io
);
2094 ceph_assert(r
== 0);
2100 logger
->inc(l_bluefs_read_random_disk_count
, 1);
2101 logger
->inc(l_bluefs_read_random_disk_bytes
, l
);
2106 auto left
= buf
->get_buf_remaining(off
);
2107 int64_t r
= std::min(len
, left
);
2108 logger
->inc(l_bluefs_read_random_buffer_count
, 1);
2109 logger
->inc(l_bluefs_read_random_buffer_bytes
, r
);
2110 dout(20) << __func__
<< " left 0x" << std::hex
<< left
2111 << " 0x" << off
<< "~" << len
<< std::dec
2114 auto p
= buf
->bl
.begin();
2115 p
.seek(off
- buf
->bl_off
);
2119 dout(30) << __func__
<< " result chunk (0x"
2120 << std::hex
<< r
<< std::dec
<< " bytes):\n";
2122 t
.substr_of(buf
->bl
, off
- buf
->bl_off
, r
);
2132 dout(20) << __func__
<< " got " << ret
<< dendl
;
2133 --h
->file
->num_reading
;
2137 int64_t BlueFS::_read(
2138 FileReader
*h
, ///< [in] read from here
2139 uint64_t off
, ///< [in] offset
2140 size_t len
, ///< [in] this many bytes
2141 bufferlist
*outbl
, ///< [out] optional: reference the result here
2142 char *out
) ///< [out] optional: or copy it here
2144 FileReaderBuffer
*buf
= &(h
->buf
);
2146 bool prefetch
= !outbl
&& !out
;
2147 dout(10) << __func__
<< " h " << h
2148 << " 0x" << std::hex
<< off
<< "~" << len
<< std::dec
2149 << " from " << lock_fnode_print(h
->file
)
2150 << (prefetch
? " prefetch" : "")
2153 ++h
->file
->num_reading
;
2155 if (!h
->ignore_eof
&&
2156 off
+ len
> h
->file
->fnode
.size
) {
2157 if (off
> h
->file
->fnode
.size
)
2160 len
= h
->file
->fnode
.size
- off
;
2161 dout(20) << __func__
<< " reaching (or past) eof, len clipped to 0x"
2162 << std::hex
<< len
<< std::dec
<< dendl
;
2164 logger
->inc(l_bluefs_read_count
, 1);
2165 logger
->inc(l_bluefs_read_bytes
, len
);
2167 logger
->inc(l_bluefs_read_prefetch_count
, 1);
2168 logger
->inc(l_bluefs_read_prefetch_bytes
, len
);
2175 std::shared_lock
s_lock(h
->lock
);
2178 if (off
< buf
->bl_off
|| off
>= buf
->get_buf_end()) {
2180 std::unique_lock
u_lock(h
->lock
);
2181 buf
->bl
.reassign_to_mempool(mempool::mempool_bluefs_file_reader
);
2182 if (off
< buf
->bl_off
|| off
>= buf
->get_buf_end()) {
2183 // if precondition hasn't changed during locking upgrade.
2185 buf
->bl_off
= off
& super
.block_mask();
2187 auto p
= h
->file
->fnode
.seek(buf
->bl_off
, &x_off
);
2188 if (p
== h
->file
->fnode
.extents
.end()) {
2189 dout(5) << __func__
<< " reading less then required "
2190 << ret
<< "<" << ret
+ len
<< dendl
;
2194 uint64_t want
= round_up_to(len
+ (off
& ~super
.block_mask()),
2196 want
= std::max(want
, buf
->max_prefetch
);
2197 uint64_t l
= std::min(p
->length
- x_off
, want
);
2199 l
= std::min(l
, uint64_t(1) << 30);
2200 uint64_t eof_offset
= round_up_to(h
->file
->fnode
.size
, super
.block_size
);
2201 if (!h
->ignore_eof
&&
2202 buf
->bl_off
+ l
> eof_offset
) {
2203 l
= eof_offset
- buf
->bl_off
;
2205 dout(20) << __func__
<< " fetching 0x"
2206 << std::hex
<< x_off
<< "~" << l
<< std::dec
2207 << " of " << *p
<< dendl
;
2209 if (!cct
->_conf
->bluefs_check_for_zeros
) {
2210 r
= _bdev_read(p
->bdev
, p
->offset
+ x_off
, l
, &buf
->bl
, ioc
[p
->bdev
],
2211 cct
->_conf
->bluefs_buffered_io
);
2213 r
= _read_and_check(
2214 p
->bdev
, p
->offset
+ x_off
, l
, &buf
->bl
, ioc
[p
->bdev
],
2215 cct
->_conf
->bluefs_buffered_io
);
2217 logger
->inc(l_bluefs_read_disk_count
, 1);
2218 logger
->inc(l_bluefs_read_disk_bytes
, l
);
2220 ceph_assert(r
== 0);
2224 // we should recheck if buffer is valid after lock downgrade
2227 left
= buf
->get_buf_remaining(off
);
2228 dout(20) << __func__
<< " left 0x" << std::hex
<< left
2229 << " len 0x" << len
<< std::dec
<< dendl
;
2231 int64_t r
= std::min(len
, left
);
2234 t
.substr_of(buf
->bl
, off
- buf
->bl_off
, r
);
2235 outbl
->claim_append(t
);
2238 auto p
= buf
->bl
.begin();
2239 p
.seek(off
- buf
->bl_off
);
2244 dout(30) << __func__
<< " result chunk (0x"
2245 << std::hex
<< r
<< std::dec
<< " bytes):\n";
2247 t
.substr_of(buf
->bl
, off
- buf
->bl_off
, r
);
2257 dout(20) << __func__
<< " got " << ret
<< dendl
;
2258 ceph_assert(!outbl
|| (int)outbl
->length() == ret
);
2259 --h
->file
->num_reading
;
2263 void BlueFS::invalidate_cache(FileRef f
, uint64_t offset
, uint64_t length
)
2265 std::lock_guard
l(f
->lock
);
2266 dout(10) << __func__
<< " file " << f
->fnode
2267 << " 0x" << std::hex
<< offset
<< "~" << length
<< std::dec
2269 if (offset
& ~super
.block_mask()) {
2270 offset
&= super
.block_mask();
2271 length
= round_up_to(length
, super
.block_size
);
2274 auto p
= f
->fnode
.seek(offset
, &x_off
);
2275 while (length
> 0 && p
!= f
->fnode
.extents
.end()) {
2276 uint64_t x_len
= std::min(p
->length
- x_off
, length
);
2277 bdev
[p
->bdev
]->invalidate_cache(p
->offset
+ x_off
, x_len
);
2278 dout(20) << __func__
<< " 0x" << std::hex
<< x_off
<< "~" << x_len
2279 << std:: dec
<< " of " << *p
<< dendl
;
2285 uint64_t BlueFS::_estimate_log_size_N()
2287 std::lock_guard
nl(nodes
.lock
);
2288 int avg_dir_size
= 40; // fixme
2289 int avg_file_size
= 12;
2290 uint64_t size
= 4096 * 2;
2291 size
+= nodes
.file_map
.size() * (1 + sizeof(bluefs_fnode_t
));
2292 size
+= nodes
.dir_map
.size() + (1 + avg_dir_size
);
2293 size
+= nodes
.file_map
.size() * (1 + avg_dir_size
+ avg_file_size
);
2294 return round_up_to(size
, super
.block_size
);
2297 void BlueFS::compact_log()/*_LNF_LD_NF_D*/
2299 if (!cct
->_conf
->bluefs_replay_recovery_disable_compact
) {
2300 if (cct
->_conf
->bluefs_compact_log_sync
) {
2301 _compact_log_sync_LNF_LD();
2303 _compact_log_async_LD_LNF_D();
2308 bool BlueFS::_should_start_compact_log_L_N()
2310 if (log_is_compacting
.load() == true) {
2311 // compaction is already running
2316 std::lock_guard
ll(log
.lock
);
2317 current
= log
.writer
->file
->fnode
.size
;
2319 uint64_t expected
= _estimate_log_size_N();
2320 float ratio
= (float)current
/ (float)expected
;
2321 dout(10) << __func__
<< " current 0x" << std::hex
<< current
2322 << " expected " << expected
<< std::dec
2323 << " ratio " << ratio
2325 if (current
< cct
->_conf
->bluefs_log_compact_min_size
||
2326 ratio
< cct
->_conf
->bluefs_log_compact_min_ratio
) {
2332 void BlueFS::_compact_log_dump_metadata_NF(bluefs_transaction_t
*t
,
2335 std::lock_guard
nl(nodes
.lock
);
2338 t
->uuid
= super
.uuid
;
2339 dout(20) << __func__
<< " op_init" << dendl
;
2342 for (auto& [ino
, file_ref
] : nodes
.file_map
) {
2345 ceph_assert(ino
> 1);
2346 std::lock_guard
fl(file_ref
->lock
);
2347 for(auto& e
: file_ref
->fnode
.extents
) {
2349 auto bdev_new
= bdev
;
2350 ceph_assert(!((flags
& REMOVE_WAL
) && bdev
== BDEV_WAL
));
2351 if ((flags
& RENAME_SLOW2DB
) && bdev
== BDEV_SLOW
) {
2354 if ((flags
& RENAME_DB2SLOW
) && bdev
== BDEV_DB
) {
2355 bdev_new
= BDEV_SLOW
;
2357 if (bdev
== BDEV_NEWDB
) {
2358 // REMOVE_DB xor RENAME_DB
2359 ceph_assert(!(flags
& REMOVE_DB
) != !(flags
& RENAME_DB2SLOW
));
2360 ceph_assert(!(flags
& RENAME_SLOW2DB
));
2363 if (bdev
== BDEV_NEWWAL
) {
2364 ceph_assert(flags
& REMOVE_WAL
);
2365 bdev_new
= BDEV_WAL
;
2369 dout(20) << __func__
<< " op_file_update " << file_ref
->fnode
<< dendl
;
2370 t
->op_file_update(file_ref
->fnode
);
2372 for (auto& [path
, dir_ref
] : nodes
.dir_map
) {
2373 dout(20) << __func__
<< " op_dir_create " << path
<< dendl
;
2374 t
->op_dir_create(path
);
2375 for (auto& [fname
, file_ref
] : dir_ref
->file_map
) {
2376 dout(20) << __func__
<< " op_dir_link " << path
<< "/" << fname
2377 << " to " << file_ref
->fnode
.ino
<< dendl
;
2378 t
->op_dir_link(path
, fname
, file_ref
->fnode
.ino
);
2382 /* Streams to t files modified before *capture_before_seq* and all dirs */
2383 void BlueFS::_compact_log_async_dump_metadata_NF(bluefs_transaction_t
*t
,
2384 uint64_t capture_before_seq
)
2386 std::lock_guard
nl(nodes
.lock
);
2389 t
->uuid
= super
.uuid
;
2390 dout(20) << __func__
<< " op_init" << dendl
;
2393 for (auto& [ino
, file_ref
] : nodes
.file_map
) {
2396 ceph_assert(ino
> 1);
2397 std::lock_guard
fl(file_ref
->lock
);
2398 if (file_ref
->dirty_seq
< capture_before_seq
) {
2399 dout(20) << __func__
<< " op_file_update " << file_ref
->fnode
<< dendl
;
2401 dout(20) << __func__
<< " op_file_update just modified, dirty_seq="
2402 << file_ref
->dirty_seq
<< " " << file_ref
->fnode
<< dendl
;
2404 t
->op_file_update(file_ref
->fnode
);
2406 for (auto& [path
, dir_ref
] : nodes
.dir_map
) {
2407 dout(20) << __func__
<< " op_dir_create " << path
<< dendl
;
2408 t
->op_dir_create(path
);
2409 for (auto& [fname
, file_ref
] : dir_ref
->file_map
) {
2410 dout(20) << __func__
<< " op_dir_link " << path
<< "/" << fname
2411 << " to " << file_ref
->fnode
.ino
<< dendl
;
2412 t
->op_dir_link(path
, fname
, file_ref
->fnode
.ino
);
2417 void BlueFS::_compact_log_sync_LNF_LD()
2419 dout(10) << __func__
<< dendl
;
2420 uint8_t prefer_bdev
;
2422 std::lock_guard
ll(log
.lock
);
2424 vselector
->select_prefer_bdev(log
.writer
->file
->vselector_hint
);
2426 _rewrite_log_and_layout_sync_LNF_LD(true,
2431 super
.memorized_layout
);
2432 logger
->inc(l_bluefs_log_compactions
);
// Rewrites the entire BlueFS log from scratch (synchronous compaction path):
// captures all metadata into a transaction, allocates fresh extents for the
// new log (optionally falling back across devices), writes the new log and
// the superblock, and finally queues the old log extents for release.
// Also handles "renaming" log extents to a new device id when migrating.
// allocate_with_fallback: use _allocate (may fall back) vs
//                         _allocate_without_fallback.
// layout: new layout to memorize in the superblock.
// [several parameters elided in this excerpt — uses below reference
//  super_dev, log_dev, log_dev_new and flags; confirm against upstream]
void BlueFS::_rewrite_log_and_layout_sync_LNF_LD(bool allocate_with_fallback,
                                                 std::optional<bluefs_layout_t> layout)
{
  std::lock_guard ll(log.lock);

  File *log_file = log.writer->file.get();
  // log.t.seq is always set to current live seq
  ceph_assert(log.t.seq == log.seq_live);
  // Capturing entire state. Dump anything that has been stored there.
  // [elided in excerpt: clearing of log.t before re-seeding seq — confirm]
  log.t.seq = log.seq_live;
  // From now on, no changes to log.t are permitted until we finish rewriting log.
  // Can allow dirty to remain dirty - log.seq_live will not change.

  dout(20) << __func__ << " super_dev:" << super_dev
           << " log_dev:" << log_dev
           << " log_dev_new:" << log_dev_new
           << " flags:" << flags
           // [stream tail elided in excerpt]

  bluefs_transaction_t t;
  _compact_log_dump_metadata_NF(&t, flags);

  dout(20) << __func__ << " op_jump_seq " << log.seq_live << dendl;
  t.op_jump_seq(log.seq_live);

  // [elided in excerpt: bufferlist bl; encode(t, bl); padding to block size]
  uint64_t need = bl.length() + cct->_conf->bluefs_max_log_runway;
  dout(20) << __func__ << " need " << need << dendl;

  bluefs_fnode_t old_fnode;
  // [elided: int r; declaration]
  vselector->sub_usage(log_file->vselector_hint, log_file->fnode);
  // keep the old extents aside so they can be released at the end
  log_file->fnode.swap_extents(old_fnode);
  if (allocate_with_fallback) {
    r = _allocate(log_dev, need, &log_file->fnode);
    ceph_assert(r == 0);
  } else {
    // [else tokens elided; structure inferred from the two allocation paths]
    PExtentVector extents;
    r = _allocate_without_fallback(log_dev,
                                   // [args elided: need, &extents]
    ceph_assert(r == 0);
    for (auto& p : extents) {
      log_file->fnode.append_extent(
        bluefs_extent_t(log_dev, p.offset, p.length));
    }
  }

  _close_writer(log.writer);

  // we will write it to super
  log_file->fnode.reset_delta();
  log_file->fnode.size = bl.length();

  log.writer = _create_writer(log_file);
  log.writer->append(bl);
  _flush_special(log.writer);
  vselector->add_usage(log_file->vselector_hint, log_file->fnode);

  if (!cct->_conf->bluefs_sync_write) {
    list<aio_t> completed_ios;
    _claim_completed_aios(log.writer, &completed_ios);
    _wait_for_aio(log.writer);
    completed_ios.clear();
  }
  // [elided in excerpt: flushing the block devices — confirm upstream]

  dirty.seq_live = log.seq_live;
  log.t.seq = log.seq_live;

  super.memorized_layout = layout;
  super.log_fnode = log_file->fnode;
  // rename device if needed
  if (log_dev != log_dev_new) {
    dout(10) << __func__ << " renaming log extents to " << log_dev_new << dendl;
    for (auto& p : super.log_fnode.extents) {
      p.bdev = log_dev_new;
    }
  }
  dout(10) << __func__ << " writing super, log fnode: " << super.log_fnode << dendl;

  // [elided in excerpt: superblock version bump — confirm upstream]
  _write_super(super_dev);
  // [elided in excerpt: post-super flush — confirm upstream]

  dout(10) << __func__ << " release old log extents " << old_fnode.extents << dendl;
  std::lock_guard dl(dirty.lock);
  for (auto& r : old_fnode.extents) {
    // defer actual release to the next log flush
    dirty.pending_release[r.bdev].insert(r.offset, r.length);
  }
}
2537 * 1. Allocate a new extent to continue the log, and then log an event
2538 * that jumps the log write position to the new extent. At this point, the
2539 * old extent(s) won't be written to, and reflect everything to compact.
2540 * New events will be written to the new region that we'll keep.
2542 * 2. While still holding the lock, encode a bufferlist that dumps all of the
2543 * in-memory fnodes and names. This will become the new beginning of the
2544 * log. The last event will jump to the log continuation extent from #1.
2546  * 3. Queue a write to a new extent for the new beginning of the log.
2548 * 4. Drop lock and wait
2550 * 5. Retake the lock.
2552 * 6. Update the log_fnode to splice in the new beginning.
2554 * 7. Write the new superblock.
2556 * 8. Release the old log space. Clean up.
// Asynchronous log compaction (steps 1-8 described in the comment above):
// jumps the live log into freshly allocated runway, captures all metadata
// into a new log head, writes it, splices the kept tail extents onto the new
// head, rewrites the superblock, and releases the retired extents.
// Guards against concurrent compactions via log_is_compacting, and blocks
// log expansion for the duration via log_forbidden_to_expand.
void BlueFS::_compact_log_async_LD_LNF_D() //also locks FW for new_writer
{
  dout(10) << __func__ << dendl;
  // only one compaction allowed at one time
  bool old_is_comp = std::atomic_exchange(&log_is_compacting, true);
  if (old_is_comp) { // [guard tokens elided in excerpt; inferred — confirm]
    dout(10) << __func__ << " ongoing" <<dendl;
    return;          // [elided]
  }

  File *log_file = log.writer->file.get();
  FileWriter *new_log_writer = nullptr;
  FileRef new_log = nullptr;
  uint64_t new_log_jump_to = 0;
  uint64_t old_log_jump_to = 0;

  new_log = ceph::make_ref<File>();
  new_log->fnode.ino = 0; // we use _flush_special to avoid log of the fnode

  // Prepare current log for jumping into it.
  // 1. Allocate extent
  // 2. Update op to log
  // 3. Jump op to log
  // During that, no one else can write to log, otherwise we risk jumping backwards.
  // We need to sync log, because we are injecting discontinuity, and writer is not prepared for that.

  //signal _maybe_extend_log that expansion of log is temporarily unacceptable
  bool old_forbidden = atomic_exchange(&log_forbidden_to_expand, true);
  ceph_assert(old_forbidden == false);

  vselector->sub_usage(log_file->vselector_hint, log_file->fnode);

  // 1.1 allocate new log space and jump to it.
  old_log_jump_to = log_file->fnode.get_allocated();
  uint64_t runway = log_file->fnode.get_allocated() - log.writer->get_effective_write_pos();
  dout(10) << __func__ << " old_log_jump_to 0x" << std::hex << old_log_jump_to
           << " need 0x" << (old_log_jump_to + cct->_conf->bluefs_max_log_runway) << std::dec << dendl;
  int r = _allocate(vselector->select_prefer_bdev(log_file->vselector_hint),
                    cct->_conf->bluefs_max_log_runway,
                    // [arg elided in excerpt: &log_file->fnode — confirm]
  ceph_assert(r == 0);
  //adjust usage as flush below will need it
  vselector->add_usage(log_file->vselector_hint, log_file->fnode);
  dout(10) << __func__ << " log extents " << log_file->fnode.extents << dendl;

  // update the log file change and log a jump to the offset where we want to
  // write the new entries
  log.t.op_file_update(log_file->fnode);
  // jump to new position should mean next seq
  log.t.op_jump(log.seq_live + 1, old_log_jump_to);
  uint64_t seq_now = log.seq_live;
  // we need to flush all bdev because we will be streaming all dirty files to log
  // TODO - think - if _flush_and_sync_log_jump will not add dirty files nor release pending allocations
  // then flush_bdev() will not be necessary
  // [elided in excerpt: _flush_bdev(); — confirm upstream]
  _flush_and_sync_log_jump_D(old_log_jump_to, runway);

  // out of jump section

  // 2. prepare compacted log
  bluefs_transaction_t t;
  _compact_log_async_dump_metadata_NF(&t, seq_now);

  // now state is captured to bufferlist
  // log can be used to write to, ops in log will be continuation of captured state

  uint64_t max_alloc_size = std::max(alloc_size[BDEV_WAL],
                                     std::max(alloc_size[BDEV_DB],
                                              alloc_size[BDEV_SLOW]));

  // conservative estimate for final encoded size
  new_log_jump_to = round_up_to(t.op_bl.length() + super.block_size * 2,
                                // [arg elided in excerpt: max_alloc_size); — confirm]
  //newly constructed log head will jump to what we had before
  t.op_jump(seq_now, new_log_jump_to);

  //FIXME: check if we want DB here?
  r = _allocate(BlueFS::BDEV_DB, new_log_jump_to,
                // [arg elided in excerpt: &new_log->fnode — confirm]
  ceph_assert(r == 0);

  // [elided in excerpt: encode t into a bufferlist `bl` and pad it out to
  //  new_log_jump_to — confirm upstream]

  dout(10) << __func__ << " new_log_jump_to 0x" << std::hex << new_log_jump_to
           << std::dec << dendl;

  new_log_writer = _create_writer(new_log);
  // 3. write out new log's head
  new_log_writer->append(bl);
  _flush_special(new_log_writer);
  // 4. wait for the new head to land on disk
  _flush_bdev(new_log_writer);

  // 5. update our log fnode
  // we need to append to new_log the extents that were allocated in step 1.1
  // we do it by inverse logic - we drop 'old_log_jump_to' bytes and keep rest
  // todo - maybe improve _allocate so we will give clear set of new allocations
  uint64_t processed = 0;
  mempool::bluefs::vector<bluefs_extent_t> old_extents;
  for (auto& e : log_file->fnode.extents) {
    if (processed + e.length <= old_log_jump_to) {
      // drop whole extent
      dout(10) << __func__ << " remove old log extent " << e << dendl;
      old_extents.push_back(e);
    } else if (processed < old_log_jump_to) {
      // [else-if tokens elided in excerpt; structure inferred — confirm]
      // keep, but how much?
      ceph_assert(processed + e.length > old_log_jump_to);
      ceph_assert(old_log_jump_to - processed <= std::numeric_limits<uint32_t>::max());
      uint32_t cut_at = uint32_t(old_log_jump_to - processed);
      // need to cut, first half gets dropped
      bluefs_extent_t retire(e.bdev, e.offset, cut_at);
      old_extents.push_back(retire);
      // second half goes to new log
      bluefs_extent_t keep(e.bdev, e.offset + cut_at, e.length - cut_at);
      new_log->fnode.append_extent(keep);
      dout(10) << __func__ << " kept " << keep << " removed " << retire << dendl;
    } else {
      // take entire extent
      ceph_assert(processed >= old_log_jump_to);
      new_log->fnode.append_extent(e);
      dout(10) << __func__ << " kept " << e << dendl;
    }
    processed += e.length;
  }
  // we will write it to super
  new_log->fnode.reset_delta();

  // 6. write the super block to reflect the changes
  dout(10) << __func__ << " writing super" << dendl;
  new_log->fnode.ino = log_file->fnode.ino;
  new_log->fnode.size = 0;
  new_log->fnode.mtime = ceph_clock_now();
  super.log_fnode = new_log->fnode;
  // [elided in excerpt: superblock version bump — confirm upstream]
  _write_super(BDEV_DB);
  // [elided in excerpt: post-super flush and lock re-acquisition — confirm]

  // swapping log_file and new_log
  vselector->sub_usage(log_file->vselector_hint, log_file->fnode);

  // clear the extents from old log file, they are added to new log
  log_file->fnode.clear_extents();
  // swap the log files. New log file is the log file now.
  new_log->fnode.swap_extents(log_file->fnode);

  // writer position shifts by the difference between the two jump anchors
  log.writer->pos = log.writer->file->fnode.size =
    log.writer->pos - old_log_jump_to + new_log_jump_to;

  vselector->add_usage(log_file->vselector_hint, log_file->fnode);

  // allow log expansion again
  old_forbidden = atomic_exchange(&log_forbidden_to_expand, false);
  ceph_assert(old_forbidden == true);
  //to wake up if someone was in need of expanding log
  log_cond.notify_all();

  // 7. release old space
  dout(10) << __func__ << " release old log extents " << old_extents << dendl;
  {
    std::lock_guard dl(dirty.lock);
    for (auto& r : old_extents) {
      dirty.pending_release[r.bdev].insert(r.offset, r.length);
    }
  }

  // delete the new log, remove from the dirty files list
  _close_writer(new_log_writer);
  new_log_writer = nullptr;

  log_cond.notify_all();
  dout(10) << __func__ << " log extents " << log_file->fnode.extents << dendl;
  logger->inc(l_bluefs_log_compactions);

  old_is_comp = atomic_exchange(&log_is_compacting, false);
  ceph_assert(old_is_comp);
}
// Zero-pads bl up to the next superblock block_size boundary so it can be
// written with aligned I/O. No-op when bl is already aligned.
void BlueFS::_pad_bl(bufferlist& bl)
{
  uint64_t partial = bl.length() % super.block_size;
  if (partial) { // [guard tokens elided in excerpt; inferred from padding math]
    dout(10) << __func__ << " padding with 0x" << std::hex
             << super.block_size - partial << " zeros" << std::dec << dendl;
    bl.append_zero(super.block_size - partial);
  }
}
// Returns log seq that was live before advance.
// Advances both log and dirty live sequence numbers in lockstep; callers
// must hold BOTH dirty.lock and log.lock.
uint64_t BlueFS::_log_advance_seq()
{
  ceph_assert(ceph_mutex_is_locked(dirty.lock));
  ceph_assert(ceph_mutex_is_locked(log.lock));
  //acquire new seq
  // this will become seq_stable once we write
  ceph_assert(dirty.seq_stable < dirty.seq_live);
  ceph_assert(log.t.seq == log.seq_live);
  uint64_t seq = log.seq_live;
  log.t.uuid = super.uuid;

  // [elided in excerpt: the actual increments of dirty.seq_live and
  //  log.seq_live — inferred from the assertion below; confirm upstream]
  ceph_assert(dirty.seq_live == log.seq_live);
  return seq; // [elided in excerpt]
}
// Adds to log.t file modifications mentioned in `dirty.files`.
// Note: some bluefs ops may have already been stored in log.t transaction.
// Callers must hold both dirty.lock and log.lock; `seq` is the sequence
// slot (as returned by _log_advance_seq) whose dirty files are consumed.
void BlueFS::_consume_dirty(uint64_t seq)
{
  ceph_assert(ceph_mutex_is_locked(dirty.lock));
  ceph_assert(ceph_mutex_is_locked(log.lock));

  // we just incremented log_seq. It is now illegal to add to dirty.files[log_seq]
  auto lsi = dirty.files.find(seq);
  if (lsi != dirty.files.end()) {
    dout(20) << __func__ << " " << lsi->second.size() << " dirty.files" << dendl;
    for (auto &f : lsi->second) {
      // fnode here is protected indirectly
      // the only path that adds to dirty.files goes from _fsync()
      // _fsync() is executed under writer lock,
      // and does not exit until syncing log is done
      dout(20) << __func__ << " op_file_update_inc " << f.fnode << dendl;
      log.t.op_file_update_inc(f.fnode);
    }
  }
}
// Extends log if its free space is smaller than bluefs_min_log_runway.
// Returns space available *BEFORE* adding new space. Signed for additional <0 detection.
// Returns -EWOULDBLOCK when compaction currently forbids log expansion.
int64_t BlueFS::_maybe_extend_log()
{
  ceph_assert(ceph_mutex_is_locked(log.lock));
  // allocate some more space (before we run out)?
  // BTW: this triggers `flush()` in the `page_aligned_appender` of `log.writer`.
  int64_t runway = log.writer->file->fnode.get_allocated() -
    log.writer->get_effective_write_pos();
  if (runway < (int64_t)cct->_conf->bluefs_min_log_runway) {
    dout(10) << __func__ << " allocating more log runway (0x"
             << std::hex << runway << std::dec << " remaining)" << dendl;
    /*
     * Usually, when we are low on space in log, we just allocate new extent,
     * put update op(log) to log and we are fine.
     * Problem - it interferes with log compaction:
     * New log produced in compaction will include - as last op - jump into some offset (anchor) of current log.
     * It is assumed that log region (anchor - end) will contain all changes made by bluefs since
     * full state capture into new log.
     * Putting log update into (anchor - end) region is illegal, because any update there must be compatible with
     * both logs, but the old log differs from the new log.
     *
     * Possible solutions:
     * - stall extending log until we finish compacting and switch log (CURRENT)
     * - re-run compaction with more runway for old log
     * - add OP_FILE_ADDEXT that adds extent; will be compatible with both logs
     */
    if (log_forbidden_to_expand.load() == true) {
      return -EWOULDBLOCK;
    }
    vselector->sub_usage(log.writer->file->vselector_hint, log.writer->file->fnode);
    int r = _allocate( // [call head elided in excerpt; inferred from the
                       //  visible argument list — confirm upstream]
      vselector->select_prefer_bdev(log.writer->file->vselector_hint),
      cct->_conf->bluefs_max_log_runway,
      &log.writer->file->fnode);
    ceph_assert(r == 0);
    vselector->add_usage(log.writer->file->vselector_hint, log.writer->file->fnode);
    log.t.op_file_update_inc(log.writer->file->fnode);
  }
  return runway; // [elided in excerpt]
}
// Encodes log.t, pads it to a block boundary, appends it to the log writer,
// and flushes via _flush_special. Caller must hold log.lock and guarantee
// at least `runway` bytes of pre-allocated log space.
void BlueFS::_flush_and_sync_log_core(int64_t runway)
{
  ceph_assert(ceph_mutex_is_locked(log.lock));
  dout(10) << __func__ << " " << log.t << dendl;

  // [elided in excerpt: bufferlist bl declaration and encode(log.t, bl) —
  //  `bl` is used below; confirm upstream]
  bl.reserve(super.block_size);

  // pad to block boundary
  size_t realign = super.block_size - (bl.length() % super.block_size);
  if (realign && realign != super.block_size)
    bl.append_zero(realign);

  logger->inc(l_bluefs_logged_bytes, bl.length());

  ceph_assert(bl.length() <= runway); // if we write this, we will have an unrecoverable data loss
                                      // transaction will not fit extents before growth -> data loss on _replay

  log.writer->append(bl);

  // prepare log for new transactions
  // [elided in excerpt: clearing log.t before reseeding its seq — confirm]
  log.t.seq = log.seq_live;

  uint64_t new_data = _flush_special(log.writer);
  vselector->add_usage(log.writer->file->vselector_hint, new_data);
}
// Clears dirty.files up to (including) seq_stable.
// Marks `seq` as the new stable sequence and un-dirties every file whose
// changes have now been durably streamed to the log. Takes dirty.lock (_D).
void BlueFS::_clear_dirty_set_stable_D(uint64_t seq)
{
  std::lock_guard dl(dirty.lock);

  // clean dirty files
  if (seq > dirty.seq_stable) {
    dirty.seq_stable = seq;
    dout(20) << __func__ << " seq_stable " << dirty.seq_stable << dendl;

    // undirty all files that were already streamed to log
    auto p = dirty.files.begin();
    while (p != dirty.files.end()) {
      if (p->first > dirty.seq_stable) {
        dout(20) << __func__ << " done cleaning up dirty files" << dendl;
        break; // [elided in excerpt; inferred — confirm upstream]
      }

      auto l = p->second.begin();
      while (l != p->second.end()) {
        // [elided in excerpt: obtaining `file` from the list iterator `l` —
        //  confirm upstream]
        ceph_assert(file->dirty_seq <= dirty.seq_stable);
        dout(20) << __func__ << " cleaned file " << file->fnode.ino << dendl;
        file->dirty_seq = dirty.seq_stable;
        p->second.erase(l++);
      }

      ceph_assert(p->second.empty());
      dirty.files.erase(p++);
    }
  } else {
    // [else tokens elided; inferred from the "lost a race" message below]
    dout(20) << __func__ << " seq_stable " << dirty.seq_stable
             << " already >= out seq " << seq
             << ", we lost a race against another log flush, done" << dendl;
  }
}
// Releases the per-device interval sets collected in dirty.pending_release:
// optionally discards them (async via queue_discard or synchronously), then
// returns the space to each allocator and fixes shared-alloc accounting.
void BlueFS::_release_pending_allocations(vector<interval_set<uint64_t>>& to_release)
{
  for (unsigned i = 0; i < to_release.size(); ++i) {
    if (!to_release[i].empty()) {
      /* OK, now we have the guarantee alloc[i] won't be null. */
      // [elided in excerpt: int r; declaration]
      if (cct->_conf->bdev_enable_discard && cct->_conf->bdev_async_discard) {
        r = bdev[i]->queue_discard(to_release[i]);
        // [elided in excerpt: handling of the queue_discard result (skip the
        //  synchronous release path on success) — confirm upstream]
      } else if (cct->_conf->bdev_enable_discard) {
        for (auto p = to_release[i].begin(); p != to_release[i].end(); ++p) {
          bdev[i]->discard(p.get_start(), p.get_len());
        }
      }
      alloc[i]->release(to_release[i]);
      if (is_shared_alloc(i)) {
        shared_alloc->bluefs_used -= to_release[i].size();
      }
    }
  }
}
// Flushes the pending log transaction to disk and waits for it to become
// stable. Returns early if want_seq is already stable; retries log extension
// while compaction temporarily forbids it (-EWOULDBLOCK). Takes log.lock and
// dirty.lock (_LD).
int BlueFS::_flush_and_sync_log_LD(uint64_t want_seq)
{
  int64_t available_runway;
  do {
    // [elided in excerpt: acquisition of log.lock and dirty.lock at the top
    //  of each loop iteration — confirm upstream]
    if (want_seq && want_seq <= dirty.seq_stable) {
      dout(10) << __func__ << " want_seq " << want_seq << " <= seq_stable "
               << dirty.seq_stable << ", done" << dendl;
      dirty.lock.unlock();
      // [elided in excerpt: log.lock release and early return — confirm]
      return 0;
    }
    available_runway = _maybe_extend_log();
    if (available_runway == -EWOULDBLOCK) {
      // we are in need of adding runway, but we are during log-switch from compaction
      dirty.lock.unlock();
      //instead log.lock.unlock() do move ownership
      std::unique_lock<ceph::mutex> ll(log.lock, std::adopt_lock);
      while (log_forbidden_to_expand.load()) {
        // [elided in excerpt: waiting on log_cond — confirm upstream]
      }
    } else {
      // [else tokens elided; inferred from assertion placement]
      ceph_assert(available_runway >= 0);
    }
  } while (available_runway < 0);

  ceph_assert(want_seq == 0 || want_seq <= dirty.seq_live); // illegal to request seq that was not created yet
  uint64_t seq =_log_advance_seq();
  _consume_dirty(seq);
  vector<interval_set<uint64_t>> to_release(dirty.pending_release.size());
  to_release.swap(dirty.pending_release);
  dirty.lock.unlock();

  _flush_and_sync_log_core(available_runway);
  _flush_bdev(log.writer);
  logger->set(l_bluefs_log_bytes, log.writer->file->fnode.size);
  //now log.lock is no longer needed
  // [elided in excerpt: log.lock release — confirm upstream]

  _clear_dirty_set_stable_D(seq);
  _release_pending_allocations(to_release);

  _update_logger_stats();
  return 0; // [elided in excerpt]
}
// Flushes log and immediately adjusts log_writer pos.
// Used by async compaction to inject a jump: after flushing the captured
// transaction the writer position and log file size are forced to jump_to.
// Caller must hold log.lock; dirty.lock is taken (and released) internally.
int BlueFS::_flush_and_sync_log_jump_D(uint64_t jump_to,
                                       int64_t available_runway)
{
  ceph_assert(ceph_mutex_is_locked(log.lock));

  ceph_assert(jump_to);
  // we synchronize writing to log, by lock to log.lock

  // [elided in excerpt: dirty.lock acquisition before _log_advance_seq —
  //  confirm upstream]
  uint64_t seq =_log_advance_seq();
  _consume_dirty(seq);
  vector<interval_set<uint64_t>> to_release(dirty.pending_release.size());
  to_release.swap(dirty.pending_release);
  dirty.lock.unlock();
  _flush_and_sync_log_core(available_runway);

  dout(10) << __func__ << " jumping log offset from 0x" << std::hex
           << log.writer->pos << " -> 0x" << jump_to << std::dec << dendl;
  log.writer->pos = jump_to;
  vselector->sub_usage(log.writer->file->vselector_hint, log.writer->file->fnode.size);
  log.writer->file->fnode.size = jump_to;
  vselector->add_usage(log.writer->file->vselector_hint, log.writer->file->fnode.size);

  _flush_bdev(log.writer);

  _clear_dirty_set_stable_D(seq);
  _release_pending_allocations(to_release);

  logger->set(l_bluefs_log_bytes, log.writer->file->fnode.size);
  _update_logger_stats();
  return 0; // [elided in excerpt]
}
// Builds the bufferlist to be written for this flush: any cached tail block
// first, then `length` bytes spliced out of the in-memory buffer. If the
// result is not block-aligned, the partial tail is zero-padded for the write
// and a deep copy of the tail bytes is cached in tail_block so the next
// flush can rewrite that block in full.
ceph::bufferlist BlueFS::FileWriter::flush_buffer(
  CephContext* const cct,
  // [a parameter is elided in this excerpt between cct and length — confirm
  //  against upstream]
  const unsigned length,
  const bluefs_super_t& super)
{
  ceph_assert(ceph_mutex_is_locked(this->lock) || file->fnode.ino <= 1);
  ceph::bufferlist bl;
  // [elided in excerpt: conditional guarding the tail_block splice below —
  //  confirm upstream]
  tail_block.splice(0, tail_block.length(), &bl);
  const auto remaining_len = length - bl.length();
  buffer.splice(0, remaining_len, &bl);
  if (buffer.length()) {
    dout(20) << " leaving 0x" << std::hex << buffer.length() << std::dec
             << " unflushed" << dendl;
  }
  if (const unsigned tail = bl.length() & ~super.block_mask(); tail) {
    const auto padding_len = super.block_size - tail;
    dout(20) << __func__ << " caching tail of 0x"
             // [stream piece elided in excerpt, presumably << tail]
             << " and padding block with 0x" << padding_len
             << " buffer.length() " << buffer.length()
             << std::dec << dendl;
    // We need to go through the `buffer_appender` to get a chance to
    // preserve in-memory contiguity and not mess with the alignment.
    // Otherwise a costly rebuild could happen in e.g. `KernelDevice`.
    buffer_appender.append_zero(padding_len);
    buffer.splice(buffer.length() - padding_len, padding_len, &bl);
    // Deep copy the tail here. This allows to avoid costlier copy on
    // bufferlist rebuild in e.g. `KernelDevice` and minimizes number
    // of memory allocations.
    // The alternative approach would be to place the entire tail and
    // padding on a dedicated, 4 KB long memory chunk. This shouldn't
    // trigger the rebuild while still being less expensive.
    buffer_appender.substr_of(bl, bl.length() - padding_len - tail, tail);
    buffer.splice(buffer.length() - tail, tail, &tail_block);
  }
  return bl; // [elided in excerpt]
}
// Marks h->file dirty at the current live sequence so its fnode update is
// picked up by the next log flush: freshly-dirty files are appended to
// dirty.files[seq_live]; files already dirty at an older seq are re-linked.
// Caller holds h->lock; dirty.lock is taken here (_D).
int BlueFS::_signal_dirty_to_log_D(FileWriter *h)
{
  ceph_assert(ceph_mutex_is_locked(h->lock));
  std::lock_guard dl(dirty.lock);
  h->file->fnode.mtime = ceph_clock_now();
  ceph_assert(h->file->fnode.ino >= 1);
  if (h->file->dirty_seq <= dirty.seq_stable) {
    // file was clean (its last dirty seq is already stable)
    h->file->dirty_seq = dirty.seq_live;
    dirty.files[h->file->dirty_seq].push_back(*h->file);
    dout(20) << __func__ << " dirty_seq = " << dirty.seq_live
             << " (was clean)" << dendl;
  } else {
    // [else tokens elided in excerpt; structure inferred — confirm upstream]
    if (h->file->dirty_seq != dirty.seq_live) {
      // need re-dirty, erase from list first
      ceph_assert(dirty.files.count(h->file->dirty_seq));
      auto it = dirty.files[h->file->dirty_seq].iterator_to(*h->file);
      dirty.files[h->file->dirty_seq].erase(it);
      h->file->dirty_seq = dirty.seq_live;
      dirty.files[h->file->dirty_seq].push_back(*h->file);
      dout(20) << __func__ << " dirty_seq = " << dirty.seq_live
               << " (was " << h->file->dirty_seq << ")" << dendl;
    } else {
      dout(20) << __func__ << " dirty_seq = " << dirty.seq_live
               << " (unchanged, do nothing) " << dendl;
    }
  }
  return 0; // [elided in excerpt]
}
// Public wrapper: flushes [offset, offset+length) of h's buffered data under
// the writer lock (_WF), after an optional vselector sanity check.
void BlueFS::flush_range(FileWriter *h, uint64_t offset, uint64_t length)/*_WF*/
{
  _maybe_check_vselector_LNF();
  std::unique_lock hl(h->lock);
  _flush_range_F(h, offset, length);
}
// Flushes the byte range [offset, offset+length) of h's file to disk:
// trims the range against what has already been flushed (h->pos), grows the
// file allocation if needed (aborting on ENOSPC), updates the fnode size and
// dirty flag, then hands the actual data write to _flush_data.
// Caller holds h->lock (_F); not valid for internal log files (ino <= 1).
int BlueFS::_flush_range_F(FileWriter *h, uint64_t offset, uint64_t length)
{
  ceph_assert(ceph_mutex_is_locked(h->lock));
  ceph_assert(h->file->num_readers.load() == 0);
  ceph_assert(h->file->fnode.ino > 1);

  dout(10) << __func__ << " " << h << " pos 0x" << std::hex << h->pos
           << " 0x" << offset << "~" << length << std::dec
           << " to " << h->file->fnode << dendl;
  if (h->file->deleted) {
    dout(10) << __func__ << " deleted, no-op" << dendl;
    return 0; // [elided in excerpt]
  }

  bool buffered = cct->_conf->bluefs_buffered_io;

  // entirely already flushed?
  if (offset + length <= h->pos)
    return 0; // [elided in excerpt]
  if (offset < h->pos) {
    // drop the already-flushed prefix
    length -= h->pos - offset;
    // [elided in excerpt: advancing `offset` to h->pos — confirm upstream]
    dout(10) << " still need 0x"
             << std::hex << offset << "~" << length << std::dec
             << dendl; // [stream tail elided in excerpt]
  }
  std::lock_guard file_lock(h->file->lock);
  ceph_assert(offset <= h->file->fnode.size);

  uint64_t allocated = h->file->fnode.get_allocated();
  vselector->sub_usage(h->file->vselector_hint, h->file->fnode);
  // do not bother to dirty the file if we are overwriting
  // previously allocated extents.
  if (allocated < offset + length) {
    // we should never run out of log space here; see the min runway check
    // in _flush_and_sync_log.
    int r = _allocate(vselector->select_prefer_bdev(h->file->vselector_hint),
                      offset + length - allocated,
                      // [elided in excerpt: &h->file->fnode) and the error
                      //  check wrapping the derr below — confirm upstream]
      derr << __func__ << " allocated: 0x" << std::hex << allocated
           << " offset: 0x" << offset << " length: 0x" << length << std::dec
           << dendl; // [stream tail elided in excerpt]
      vselector->add_usage(h->file->vselector_hint, h->file->fnode); // undo
      ceph_abort_msg("bluefs enospc");
    // [elided in excerpt: closing of the error branch]
    h->file->is_dirty = true;
  }
  if (h->file->fnode.size < offset + length) {
    h->file->fnode.size = offset + length;
    h->file->is_dirty = true;
  }

  dout(20) << __func__ << " file now, unflushed " << h->file->fnode << dendl;
  int res = _flush_data(h, offset, length, buffered);
  vselector->add_usage(h->file->vselector_hint, h->file->fnode);
  return res; // [elided in excerpt]
}
// Writes h's buffered data for [offset, offset+length) to the underlying
// block devices: seeks to the right extent, handles a partial leading block
// by rewriting it (after waiting out prior aio), obtains the aligned payload
// from flush_buffer, then issues (a)sync writes extent by extent, tracking
// per-device dirtiness and slow-device byte counters, and submits pending
// aio batches at the end.
int BlueFS::_flush_data(FileWriter *h, uint64_t offset, uint64_t length, bool buffered)
{
  if (h->file->fnode.ino > 1) {
    ceph_assert(ceph_mutex_is_locked(h->lock));
    ceph_assert(ceph_mutex_is_locked(h->file->lock));
  }
  // [elided in excerpt: uint64_t x_off = 0; declaration]
  auto p = h->file->fnode.seek(offset, &x_off);
  ceph_assert(p != h->file->fnode.extents.end());
  dout(20) << __func__ << " in " << *p << " x_off 0x"
           << std::hex << x_off << std::dec << dendl;

  unsigned partial = x_off & ~super.block_mask();
  if (partial) { // [guard tokens elided in excerpt; inferred — confirm]
    dout(20) << __func__ << " using partial tail 0x"
             << std::hex << partial << std::dec << dendl;
    // [elided in excerpt: rewinding x_off/offset and growing length by
    //  `partial` so the whole block is rewritten — confirm upstream]
    dout(20) << __func__ << " waiting for previous aio to complete" << dendl;
    for (auto p : h->iocv) {
      // [elided in excerpt: per-ioc aio_wait — confirm upstream]
    }
  }

  auto bl = h->flush_buffer(cct, partial, length, super);
  ceph_assert(bl.length() >= length);
  h->pos = offset + length;
  length = bl.length();

  switch (h->writer_type) {
  // [case labels elided in this excerpt: WAL-type writers account to
  //  l_bluefs_bytes_written_wal, DB/SST-type to l_bluefs_bytes_written_sst]
    logger->inc(l_bluefs_bytes_written_wal, length);
    logger->inc(l_bluefs_bytes_written_sst, length);
  }

  dout(30) << "dump:\n";
  // [elided in excerpt: hexdump of bl into the debug stream]

  uint64_t bytes_written_slow = 0;
  while (length > 0) {
    uint64_t x_len = std::min(p->length - x_off, length);
    // [elided in excerpt: bufferlist t; and the running `bloff` offset]
    t.substr_of(bl, bloff, x_len);
    if (cct->_conf->bluefs_sync_write) {
      bdev[p->bdev]->write(p->offset + x_off, t, buffered, h->write_hint);
    } else {
      // [else tokens elided; the aio path is the alternative branch]
      bdev[p->bdev]->aio_write(p->offset + x_off, t, h->iocv[p->bdev], buffered, h->write_hint);
    }
    h->dirty_devs[p->bdev] = true;
    if (p->bdev == BDEV_SLOW) {
      bytes_written_slow += t.length();
    }
    // [elided in excerpt: advancing bloff/length, resetting x_off, ++p]
  }
  if (bytes_written_slow) {
    logger->inc(l_bluefs_bytes_written_slow, bytes_written_slow);
  }
  for (unsigned i = 0; i < MAX_BDEV; ++i) {
    if (h->iocv[i] && h->iocv[i]->has_pending_aios()) {
      bdev[i]->aio_submit(h->iocv[i]);
    }
  }
  dout(20) << __func__ << " h " << h << " pos now 0x"
           << std::hex << h->pos << std::dec << dendl;
  return 0; // [elided in excerpt]
}
// we need to retire old completed aios so they don't stick around in
// memory indefinitely (along with their bufferlist refs).
// Moves every completed aio from each of h's io contexts onto *ls.
void BlueFS::_claim_completed_aios(FileWriter *h, list<aio_t> *ls)
{
  for (auto p : h->iocv) {
    if (p) { // [null-guard tokens elided in excerpt; inferred — confirm]
      ls->splice(ls->end(), p->running_aios);
    }
  }
  dout(10) << __func__ << " got " << ls->size() << " aios" << dendl;
}
// Blocks until all in-flight aio on h's io contexts has completed, logging
// how long the wait took.
void BlueFS::_wait_for_aio(FileWriter *h)
{
  // NOTE: this is safe to call without a lock, as long as our reference is
  // stable. [remainder of original comment elided in excerpt]
  // [elided in excerpt: utime_t start; declaration — inferred from use below]
  lgeneric_subdout(cct, bluefs, 10) << __func__;
  start = ceph_clock_now();
  *_dout << " " << h << dendl;
  for (auto p : h->iocv) {
    // [elided in excerpt: per-ioc aio_wait — confirm upstream]
  }
  dout(10) << __func__ << " " << h << " done in " << (ceph_clock_now() - start) << dendl;
}
// Appends len bytes from buf to writer h, flushing incrementally: the
// in-memory buffer is capped at 1 GB, and once it reaches
// bluefs_min_flush_size a flush is issued. May trigger log compaction at the
// end if anything was actually flushed.
void BlueFS::append_try_flush(FileWriter *h, const char* buf, size_t len)/*_WF_LNF_NF_LD_D*/
{
  bool flushed_sum = false;
  {
    std::unique_lock hl(h->lock);
    size_t max_size = 1ull << 30; // cap to 1GB
    while (len > 0) { // [loop header elided in excerpt; inferred from the
                      //  chunked append/flush structure — confirm upstream]
      bool need_flush = true;
      auto l0 = h->get_buffer_length();
      if (l0 < max_size) {
        size_t l = std::min(len, max_size - l0);
        // [elided in excerpt: appending l bytes and advancing buf/len]
        need_flush = h->get_buffer_length() >= cct->_conf->bluefs_min_flush_size;
      }
      if (need_flush) { // [guard tokens elided in excerpt; inferred]
        bool flushed = false;
        int r = _flush_F(h, true, &flushed);
        ceph_assert(r == 0);
        flushed_sum |= flushed;
        // make sure we've made any progress with flush hence the
        // loop doesn't iterate forever
        ceph_assert(h->get_buffer_length() < max_size);
      }
    }
  }
  if (flushed_sum) { // [guard tokens elided in excerpt; inferred]
    _maybe_compact_log_LNF_NF_LD_D();
  }
}
// Public flush: flushes h's buffered data under the writer lock; if data was
// actually written, may trigger log compaction afterwards.
void BlueFS::flush(FileWriter *h, bool force)/*_WF_LNF_NF_LD_D*/
{
  bool flushed = false;
  // [elided in excerpt: int r; declaration — inferred from use below]
  {
    std::unique_lock hl(h->lock);
    r = _flush_F(h, force, &flushed);
    ceph_assert(r == 0);
  }
  if (r == 0 && flushed) {
    _maybe_compact_log_LNF_NF_LD_D();
  }
}
// Flushes h's currently buffered bytes starting at h->pos. Skips the flush
// (returning success) when not forced and the buffer is below
// bluefs_min_flush_size, or when there is nothing buffered at all.
// *flushed (when provided) reports whether a real flush happened.
// Caller holds h->lock (_F).
int BlueFS::_flush_F(FileWriter *h, bool force, bool *flushed)
{
  ceph_assert(ceph_mutex_is_locked(h->lock));
  uint64_t length = h->get_buffer_length();
  uint64_t offset = h->pos;
  // [elided in excerpt: initializing *flushed to false when non-null]
  if (!force && // [guard head elided in excerpt; inferred from the message]
      length < cct->_conf->bluefs_min_flush_size) {
    dout(10) << __func__ << " " << h << " ignoring, length " << length
             << " < min_flush_size " << cct->_conf->bluefs_min_flush_size
             << dendl; // [stream tail and return elided in excerpt]
    return 0;
  }
  if (length == 0) { // [guard tokens elided in excerpt; inferred]
    dout(10) << __func__ << " " << h << " no dirty data on "
             << h->file->fnode << dendl;
    return 0; // [elided in excerpt]
  }
  dout(10) << __func__ << " " << h << " 0x"
           << std::hex << offset << "~" << length << std::dec
           << " to " << h->file->fnode << dendl;
  ceph_assert(h->pos <= h->file->fnode.size);
  int r = _flush_range_F(h, offset, length);
  // [elided in excerpt: setting *flushed = true on success and returning r]
  return r;
}
// Flush for bluefs special files.
// Does not add extents to h.
// Does not mark h as dirty.
// we do not need to dirty the log file (or it's compacting
// replacement) when the file size changes because replay is
// smart enough to discover it on its own.
// Returns the number of bytes by which the file grew.
uint64_t BlueFS::_flush_special(FileWriter *h)
{
  ceph_assert(h->file->fnode.ino <= 1);
  uint64_t length = h->get_buffer_length();
  uint64_t offset = h->pos;
  uint64_t new_data = 0;
  // special files must already be fully pre-allocated
  ceph_assert(length + offset <= h->file->fnode.get_allocated());
  if (h->file->fnode.size < offset + length) {
    new_data = offset + length - h->file->fnode.size;
    h->file->fnode.size = offset + length;
  }
  _flush_data(h, offset, length, false);
  return new_data; // [elided in excerpt]
}
// Truncates h's file down to `offset` (truncating up is not supported), after
// flushing any buffered data. Updates vselector usage accounting and records
// the fnode change incrementally in the log under log.lock.
int BlueFS::truncate(FileWriter *h, uint64_t offset)/*_WF_L*/
{
  std::lock_guard hl(h->lock);
  dout(10) << __func__ << " 0x" << std::hex << offset << std::dec
           << " file " << h->file->fnode << dendl;
  if (h->file->deleted) {
    dout(10) << __func__ << " deleted, no-op" << dendl;
    return 0; // [elided in excerpt]
  }

  // we never truncate internal log files
  ceph_assert(h->file->fnode.ino > 1);

  // truncate off unflushed data?
  if (h->pos < offset &&
      h->pos + h->get_buffer_length() > offset) {
    dout(20) << __func__ << " tossing out last " << offset - h->pos
             << " unflushed bytes" << dendl;
    ceph_abort_msg("actually this shouldn't happen");
  }
  if (h->get_buffer_length()) {
    int r = _flush_F(h, true);
    // [elided in excerpt: error propagation on r < 0 — confirm upstream]
  }
  if (offset == h->file->fnode.size) {
    return 0; // [elided in excerpt] — already the right size
  }
  if (offset > h->file->fnode.size) {
    ceph_abort_msg("truncate up not supported");
  }
  ceph_assert(h->file->fnode.size >= offset);
  // [elided in excerpt: bdev flush before the metadata update — confirm]

  std::lock_guard ll(log.lock);
  vselector->sub_usage(h->file->vselector_hint, h->file->fnode.size);
  h->file->fnode.size = offset;
  vselector->add_usage(h->file->vselector_hint, h->file->fnode.size);
  log.t.op_file_update_inc(h->file->fnode);
  return 0; // [elided in excerpt]
}
// fsync: flushes h's buffered data, signals the file's metadata dirtiness to
// the log, and — if the file's metadata is not yet stable — flushes and syncs
// the log up to the file's dirty seq. May trigger log compaction afterwards.
int BlueFS::fsync(FileWriter *h)/*_WF_WD_WLD_WLNF_WNF*/
{
  _maybe_check_vselector_LNF();
  std::unique_lock hl(h->lock);
  uint64_t old_dirty_seq = 0;
  // [elided in excerpt: inner scope / data-flush bracketing — confirm]
  dout(10) << __func__ << " " << h << " " << h->file->fnode << dendl;
  int r = _flush_F(h, true);
  // [elided in excerpt: error propagation and device flush between the data
  //  flush and the dirty signalling — confirm upstream]
  if (h->file->is_dirty) {
    _signal_dirty_to_log_D(h);
    h->file->is_dirty = false;
  }
  {
    std::lock_guard dl(dirty.lock);
    if (dirty.seq_stable < h->file->dirty_seq) {
      old_dirty_seq = h->file->dirty_seq;
      dout(20) << __func__ << " file metadata was dirty (" << old_dirty_seq
               << ") on " << h->file->fnode << ", flushing log" << dendl;
    }
  }
  if (old_dirty_seq) {
    _flush_and_sync_log_LD(old_dirty_seq);
  }
  _maybe_compact_log_LNF_NF_LD_D();
  return 0; // [elided in excerpt]
}
// be careful - either h->file->lock or log.lock must be taken
// Flushes exactly the devices this writer dirtied (and resets the per-device
// dirty flags); in aio mode first reaps and waits out outstanding aio.
void BlueFS::_flush_bdev(FileWriter *h)
{
  if (h->file->fnode.ino > 1) {
    ceph_assert(ceph_mutex_is_locked(h->lock));
  } else if (h->file->fnode.ino == 1) {
    ceph_assert(ceph_mutex_is_locked(log.lock));
  }
  std::array<bool, MAX_BDEV> flush_devs = h->dirty_devs;
  h->dirty_devs.fill(false);
  if (!cct->_conf->bluefs_sync_write) {
    list<aio_t> completed_ios;
    _claim_completed_aios(h, &completed_ios);
    // [elided in excerpt: waiting for outstanding aio before dropping the
    //  completed ios — confirm upstream]
    completed_ios.clear();
  }
  _flush_bdev(flush_devs);
}
// Flushes only the devices whose entry in dirty_bdevs is set.
void BlueFS::_flush_bdev(std::array<bool, MAX_BDEV>& dirty_bdevs)
{
  // NOTE: this is safe to call without a lock.
  dout(20) << __func__ << dendl;
  for (unsigned i = 0; i < MAX_BDEV; i++) {
    // [loop body elided in excerpt — presumably flushes bdev[i] when
    //  dirty_bdevs[i] is set; confirm upstream]
  }
}
3467 void BlueFS::_flush_bdev()
3469 // NOTE: this is safe to call without a lock.
3470 dout(20) << __func__
<< dendl
;
3471 for (unsigned i
= 0; i
< MAX_BDEV
; i
++) {
3472 // alloc space from BDEV_SLOW is unexpected.
3473 // So most cases we don't alloc from BDEV_SLOW and so avoiding flush not-used device.
3474 if (bdev
[i
] && (i
!= BDEV_SLOW
|| _get_used(i
))) {
3480 const char* BlueFS::get_device_name(unsigned id
)
3482 if (id
>= MAX_BDEV
) return "BDEV_INV";
3483 const char* names
[] = {"BDEV_WAL", "BDEV_DB", "BDEV_SLOW", "BDEV_NEWWAL", "BDEV_NEWDB"};
3487 int BlueFS::_allocate_without_fallback(uint8_t id
, uint64_t len
,
3488 PExtentVector
* extents
)
3490 dout(10) << __func__
<< " len 0x" << std::hex
<< len
<< std::dec
3491 << " from " << (int)id
<< dendl
;
3492 assert(id
< alloc
.size());
3496 extents
->reserve(4); // 4 should be (more than) enough for most allocations
3497 int64_t need
= round_up_to(len
, alloc_size
[id
]);
3498 int64_t alloc_len
= alloc
[id
]->allocate(need
, alloc_size
[id
], 0, extents
);
3499 if (alloc_len
< 0 || alloc_len
< need
) {
3500 if (alloc_len
> 0) {
3501 alloc
[id
]->release(*extents
);
3503 derr
<< __func__
<< " unable to allocate 0x" << std::hex
<< need
3504 << " on bdev " << (int)id
3505 << ", allocator name " << alloc
[id
]->get_name()
3506 << ", allocator type " << alloc
[id
]->get_type()
3507 << ", capacity 0x" << alloc
[id
]->get_capacity()
3508 << ", block size 0x" << alloc
[id
]->get_block_size()
3509 << ", alloc size 0x" << alloc_size
[id
]
3510 << ", free 0x" << alloc
[id
]->get_free()
3511 << ", fragmentation " << alloc
[id
]->get_fragmentation()
3512 << ", allocated 0x" << (alloc_len
> 0 ? alloc_len
: 0)
3513 << std::dec
<< dendl
;
3517 if (is_shared_alloc(id
)) {
3518 shared_alloc
->bluefs_used
+= alloc_len
;
3524 int BlueFS::_allocate(uint8_t id
, uint64_t len
,
3525 bluefs_fnode_t
* node
)
3527 dout(10) << __func__
<< " len 0x" << std::hex
<< len
<< std::dec
3528 << " from " << (int)id
<< dendl
;
3529 ceph_assert(id
< alloc
.size());
3530 int64_t alloc_len
= 0;
3531 PExtentVector extents
;
3535 need
= round_up_to(len
, alloc_size
[id
]);
3536 if (!node
->extents
.empty() && node
->extents
.back().bdev
== id
) {
3537 hint
= node
->extents
.back().end();
3539 extents
.reserve(4); // 4 should be (more than) enough for most allocations
3540 alloc_len
= alloc
[id
]->allocate(need
, alloc_size
[id
], hint
, &extents
);
3542 if (alloc_len
< 0 || alloc_len
< need
) {
3544 if (alloc_len
> 0) {
3545 alloc
[id
]->release(extents
);
3547 dout(1) << __func__
<< " unable to allocate 0x" << std::hex
<< need
3548 << " on bdev " << (int)id
3549 << ", allocator name " << alloc
[id
]->get_name()
3550 << ", allocator type " << alloc
[id
]->get_type()
3551 << ", capacity 0x" << alloc
[id
]->get_capacity()
3552 << ", block size 0x" << alloc
[id
]->get_block_size()
3553 << ", alloc size 0x" << alloc_size
[id
]
3554 << ", free 0x" << alloc
[id
]->get_free()
3555 << ", fragmentation " << alloc
[id
]->get_fragmentation()
3556 << ", allocated 0x" << (alloc_len
> 0 ? alloc_len
: 0)
3557 << std::dec
<< dendl
;
3559 dout(20) << __func__
<< " alloc-id not set on index="<< (int)id
<< " unable to allocate 0x" << std::hex
<< need
3560 << " on bdev " << (int)id
<< std::dec
<< dendl
;
3562 if (id
!= BDEV_SLOW
) {
3563 dout(20) << __func__
<< " fallback to bdev "
3566 return _allocate(id
+ 1, len
, node
);
3568 derr
<< __func__
<< " allocation failed, needed 0x" << std::hex
<< need
3573 uint64_t used
= _get_used(id
);
3574 if (max_bytes
[id
] < used
) {
3575 logger
->set(max_bytes_pcounters
[id
], used
);
3576 max_bytes
[id
] = used
;
3578 if (is_shared_alloc(id
)) {
3579 shared_alloc
->bluefs_used
+= alloc_len
;
3583 for (auto& p
: extents
) {
3584 node
->append_extent(bluefs_extent_t(id
, p
.offset
, p
.length
));
3590 int BlueFS::preallocate(FileRef f
, uint64_t off
, uint64_t len
)/*_LF*/
3592 std::lock_guard
ll(log
.lock
);
3593 std::lock_guard
fl(f
->lock
);
3594 dout(10) << __func__
<< " file " << f
->fnode
<< " 0x"
3595 << std::hex
<< off
<< "~" << len
<< std::dec
<< dendl
;
3597 dout(10) << __func__
<< " deleted, no-op" << dendl
;
3600 ceph_assert(f
->fnode
.ino
> 1);
3601 uint64_t allocated
= f
->fnode
.get_allocated();
3602 if (off
+ len
> allocated
) {
3603 uint64_t want
= off
+ len
- allocated
;
3605 vselector
->sub_usage(f
->vselector_hint
, f
->fnode
);
3606 int r
= _allocate(vselector
->select_prefer_bdev(f
->vselector_hint
),
3609 vselector
->add_usage(f
->vselector_hint
, f
->fnode
);
3613 log
.t
.op_file_update_inc(f
->fnode
);
3618 void BlueFS::sync_metadata(bool avoid_compact
)/*_LNF_NF_LD_D*/
3620 bool can_skip_flush
;
3622 std::lock_guard
ll(log
.lock
);
3623 std::lock_guard
dl(dirty
.lock
);
3624 can_skip_flush
= log
.t
.empty() && dirty
.files
.empty();
3626 if (can_skip_flush
) {
3627 dout(10) << __func__
<< " - no pending log events" << dendl
;
3630 lgeneric_subdout(cct
, bluefs
, 10) << __func__
;
3631 start
= ceph_clock_now();
3633 _flush_bdev(); // FIXME?
3634 _flush_and_sync_log_LD();
3635 dout(10) << __func__
<< " done in " << (ceph_clock_now() - start
) << dendl
;
3638 if (!avoid_compact
) {
3639 _maybe_compact_log_LNF_NF_LD_D();
3643 void BlueFS::_maybe_compact_log_LNF_NF_LD_D()
3645 if (!cct
->_conf
->bluefs_replay_recovery_disable_compact
&&
3646 _should_start_compact_log_L_N()) {
3647 if (cct
->_conf
->bluefs_compact_log_sync
) {
3648 _compact_log_sync_LNF_LD();
3650 _compact_log_async_LD_LNF_D();
3655 int BlueFS::open_for_write(
3656 std::string_view dirname
,
3657 std::string_view filename
,
3659 bool overwrite
)/*_N_LD*/
3661 _maybe_check_vselector_LNF();
3663 bool create
= false;
3664 bool truncate
= false;
3665 mempool::bluefs::vector
<bluefs_extent_t
> pending_release_extents
;
3667 std::unique_lock
nl(nodes
.lock
);
3668 dout(10) << __func__
<< " " << dirname
<< "/" << filename
<< dendl
;
3669 map
<string
,DirRef
>::iterator p
= nodes
.dir_map
.find(dirname
);
3671 if (p
== nodes
.dir_map
.end()) {
3672 // implicitly create the dir
3673 dout(20) << __func__
<< " dir " << dirname
3674 << " does not exist" << dendl
;
3680 map
<string
,FileRef
>::iterator q
= dir
->file_map
.find(filename
);
3681 if (q
== dir
->file_map
.end()) {
3683 dout(20) << __func__
<< " dir " << dirname
<< " (" << dir
3684 << ") file " << filename
3685 << " does not exist" << dendl
;
3688 file
= ceph::make_ref
<File
>();
3689 file
->fnode
.ino
= ++ino_last
;
3690 nodes
.file_map
[ino_last
] = file
;
3691 dir
->file_map
[string
{filename
}] = file
;
3694 logger
->set(l_bluefs_num_files
, nodes
.file_map
.size());
3696 // overwrite existing file?
3699 dout(20) << __func__
<< " dir " << dirname
<< " (" << dir
3700 << ") file " << filename
3701 << " already exists, overwrite in place" << dendl
;
3703 dout(20) << __func__
<< " dir " << dirname
<< " (" << dir
3704 << ") file " << filename
3705 << " already exists, truncate + overwrite" << dendl
;
3706 vselector
->sub_usage(file
->vselector_hint
, file
->fnode
);
3707 file
->fnode
.size
= 0;
3708 pending_release_extents
.swap(file
->fnode
.extents
);
3711 file
->fnode
.clear_extents();
3714 ceph_assert(file
->fnode
.ino
> 1);
3716 file
->fnode
.mtime
= ceph_clock_now();
3717 file
->vselector_hint
= vselector
->get_hint_by_dir(dirname
);
3718 if (create
|| truncate
) {
3719 vselector
->add_usage(file
->vselector_hint
, file
->fnode
); // update file count
3722 dout(20) << __func__
<< " mapping " << dirname
<< "/" << filename
3723 << " vsel_hint " << file
->vselector_hint
3727 std::lock_guard
ll(log
.lock
);
3728 log
.t
.op_file_update(file
->fnode
);
3730 log
.t
.op_dir_link(dirname
, filename
, file
->fnode
.ino
);
3732 std::lock_guard
dl(dirty
.lock
);
3733 for (auto& p
: pending_release_extents
) {
3734 dirty
.pending_release
[p
.bdev
].insert(p
.offset
, p
.length
);
3737 *h
= _create_writer(file
);
3739 if (boost::algorithm::ends_with(filename
, ".log")) {
3740 (*h
)->writer_type
= BlueFS::WRITER_WAL
;
3741 if (logger
&& !overwrite
) {
3742 logger
->inc(l_bluefs_files_written_wal
);
3744 } else if (boost::algorithm::ends_with(filename
, ".sst")) {
3745 (*h
)->writer_type
= BlueFS::WRITER_SST
;
3747 logger
->inc(l_bluefs_files_written_sst
);
3751 dout(10) << __func__
<< " h " << *h
<< " on " << file
->fnode
<< dendl
;
3755 BlueFS::FileWriter
*BlueFS::_create_writer(FileRef f
)
3757 FileWriter
*w
= new FileWriter(f
);
3758 for (unsigned i
= 0; i
< MAX_BDEV
; ++i
) {
3760 w
->iocv
[i
] = new IOContext(cct
, NULL
);
3766 void BlueFS::_drain_writer(FileWriter
*h
)
3768 dout(10) << __func__
<< " " << h
<< " type " << h
->writer_type
<< dendl
;
3769 //h->buffer.reassign_to_mempool(mempool::mempool_bluefs_file_writer);
3770 for (unsigned i
=0; i
<MAX_BDEV
; ++i
) {
3773 h
->iocv
[i
]->aio_wait();
3779 if (h
->file
->fnode
.size
>= (1ull << 30)) {
3780 dout(10) << __func__
<< " file is unexpectedly large:" << h
->file
->fnode
<< dendl
;
3784 void BlueFS::_close_writer(FileWriter
*h
)
3789 void BlueFS::close_writer(FileWriter
*h
)
3792 std::lock_guard
l(h
->lock
);
3798 uint64_t BlueFS::debug_get_dirty_seq(FileWriter
*h
)
3800 std::lock_guard
l(h
->lock
);
3801 return h
->file
->dirty_seq
;
3804 bool BlueFS::debug_get_is_dev_dirty(FileWriter
*h
, uint8_t dev
)
3806 std::lock_guard
l(h
->lock
);
3807 return h
->dirty_devs
[dev
];
3810 int BlueFS::open_for_read(
3811 std::string_view dirname
,
3812 std::string_view filename
,
3816 _maybe_check_vselector_LNF();
3817 std::lock_guard
nl(nodes
.lock
);
3818 dout(10) << __func__
<< " " << dirname
<< "/" << filename
3819 << (random
? " (random)":" (sequential)") << dendl
;
3820 map
<string
,DirRef
>::iterator p
= nodes
.dir_map
.find(dirname
);
3821 if (p
== nodes
.dir_map
.end()) {
3822 dout(20) << __func__
<< " dir " << dirname
<< " not found" << dendl
;
3825 DirRef dir
= p
->second
;
3827 map
<string
,FileRef
>::iterator q
= dir
->file_map
.find(filename
);
3828 if (q
== dir
->file_map
.end()) {
3829 dout(20) << __func__
<< " dir " << dirname
<< " (" << dir
3830 << ") file " << filename
3831 << " not found" << dendl
;
3834 File
*file
= q
->second
.get();
3836 *h
= new FileReader(file
, random
? 4096 : cct
->_conf
->bluefs_max_prefetch
,
3838 dout(10) << __func__
<< " h " << *h
<< " on " << file
->fnode
<< dendl
;
3843 std::string_view old_dirname
, std::string_view old_filename
,
3844 std::string_view new_dirname
, std::string_view new_filename
)/*_LND*/
3846 std::lock_guard
ll(log
.lock
);
3847 std::lock_guard
nl(nodes
.lock
);
3848 dout(10) << __func__
<< " " << old_dirname
<< "/" << old_filename
3849 << " -> " << new_dirname
<< "/" << new_filename
<< dendl
;
3850 map
<string
,DirRef
>::iterator p
= nodes
.dir_map
.find(old_dirname
);
3851 if (p
== nodes
.dir_map
.end()) {
3852 dout(20) << __func__
<< " dir " << old_dirname
<< " not found" << dendl
;
3855 DirRef old_dir
= p
->second
;
3856 map
<string
,FileRef
>::iterator q
= old_dir
->file_map
.find(old_filename
);
3857 if (q
== old_dir
->file_map
.end()) {
3858 dout(20) << __func__
<< " dir " << old_dirname
<< " (" << old_dir
3859 << ") file " << old_filename
3860 << " not found" << dendl
;
3863 FileRef file
= q
->second
;
3865 p
= nodes
.dir_map
.find(new_dirname
);
3866 if (p
== nodes
.dir_map
.end()) {
3867 dout(20) << __func__
<< " dir " << new_dirname
<< " not found" << dendl
;
3870 DirRef new_dir
= p
->second
;
3871 q
= new_dir
->file_map
.find(new_filename
);
3872 if (q
!= new_dir
->file_map
.end()) {
3873 dout(20) << __func__
<< " dir " << new_dirname
<< " (" << old_dir
3874 << ") file " << new_filename
3875 << " already exists, unlinking" << dendl
;
3876 ceph_assert(q
->second
!= file
);
3877 log
.t
.op_dir_unlink(new_dirname
, new_filename
);
3878 _drop_link_D(q
->second
);
3881 dout(10) << __func__
<< " " << new_dirname
<< "/" << new_filename
<< " "
3882 << " " << file
->fnode
<< dendl
;
3884 new_dir
->file_map
[string
{new_filename
}] = file
;
3885 old_dir
->file_map
.erase(string
{old_filename
});
3887 log
.t
.op_dir_link(new_dirname
, new_filename
, file
->fnode
.ino
);
3888 log
.t
.op_dir_unlink(old_dirname
, old_filename
);
3892 int BlueFS::mkdir(std::string_view dirname
)/*_LN*/
3894 std::lock_guard
ll(log
.lock
);
3895 std::lock_guard
nl(nodes
.lock
);
3896 dout(10) << __func__
<< " " << dirname
<< dendl
;
3897 map
<string
,DirRef
>::iterator p
= nodes
.dir_map
.find(dirname
);
3898 if (p
!= nodes
.dir_map
.end()) {
3899 dout(20) << __func__
<< " dir " << dirname
<< " exists" << dendl
;
3902 nodes
.dir_map
[string
{dirname
}] = ceph::make_ref
<Dir
>();
3903 log
.t
.op_dir_create(dirname
);
3907 int BlueFS::rmdir(std::string_view dirname
)/*_LN*/
3909 std::lock_guard
ll(log
.lock
);
3910 std::lock_guard
nl(nodes
.lock
);
3911 dout(10) << __func__
<< " " << dirname
<< dendl
;
3912 auto p
= nodes
.dir_map
.find(dirname
);
3913 if (p
== nodes
.dir_map
.end()) {
3914 dout(20) << __func__
<< " dir " << dirname
<< " does not exist" << dendl
;
3917 DirRef dir
= p
->second
;
3918 if (!dir
->file_map
.empty()) {
3919 dout(20) << __func__
<< " dir " << dirname
<< " not empty" << dendl
;
3922 nodes
.dir_map
.erase(string
{dirname
});
3923 log
.t
.op_dir_remove(dirname
);
3927 bool BlueFS::dir_exists(std::string_view dirname
)/*_N*/
3929 std::lock_guard
nl(nodes
.lock
);
3930 map
<string
,DirRef
>::iterator p
= nodes
.dir_map
.find(dirname
);
3931 bool exists
= p
!= nodes
.dir_map
.end();
3932 dout(10) << __func__
<< " " << dirname
<< " = " << (int)exists
<< dendl
;
3936 int BlueFS::stat(std::string_view dirname
, std::string_view filename
,
3937 uint64_t *size
, utime_t
*mtime
)/*_N*/
3939 std::lock_guard
nl(nodes
.lock
);
3940 dout(10) << __func__
<< " " << dirname
<< "/" << filename
<< dendl
;
3941 map
<string
,DirRef
>::iterator p
= nodes
.dir_map
.find(dirname
);
3942 if (p
== nodes
.dir_map
.end()) {
3943 dout(20) << __func__
<< " dir " << dirname
<< " not found" << dendl
;
3946 DirRef dir
= p
->second
;
3947 map
<string
,FileRef
>::iterator q
= dir
->file_map
.find(filename
);
3948 if (q
== dir
->file_map
.end()) {
3949 dout(20) << __func__
<< " dir " << dirname
<< " (" << dir
3950 << ") file " << filename
3951 << " not found" << dendl
;
3954 File
*file
= q
->second
.get();
3955 dout(10) << __func__
<< " " << dirname
<< "/" << filename
3956 << " " << file
->fnode
<< dendl
;
3958 *size
= file
->fnode
.size
;
3960 *mtime
= file
->fnode
.mtime
;
3964 int BlueFS::lock_file(std::string_view dirname
, std::string_view filename
,
3965 FileLock
**plock
)/*_LN*/
3967 std::lock_guard
ll(log
.lock
);
3968 std::lock_guard
nl(nodes
.lock
);
3969 dout(10) << __func__
<< " " << dirname
<< "/" << filename
<< dendl
;
3970 map
<string
,DirRef
>::iterator p
= nodes
.dir_map
.find(dirname
);
3971 if (p
== nodes
.dir_map
.end()) {
3972 dout(20) << __func__
<< " dir " << dirname
<< " not found" << dendl
;
3975 DirRef dir
= p
->second
;
3976 auto q
= dir
->file_map
.find(filename
);
3978 if (q
== dir
->file_map
.end()) {
3979 dout(20) << __func__
<< " dir " << dirname
<< " (" << dir
3980 << ") file " << filename
3981 << " not found, creating" << dendl
;
3982 file
= ceph::make_ref
<File
>();
3983 file
->fnode
.ino
= ++ino_last
;
3984 file
->fnode
.mtime
= ceph_clock_now();
3985 nodes
.file_map
[ino_last
] = file
;
3986 dir
->file_map
[string
{filename
}] = file
;
3987 logger
->set(l_bluefs_num_files
, nodes
.file_map
.size());
3989 log
.t
.op_file_update(file
->fnode
);
3990 log
.t
.op_dir_link(dirname
, filename
, file
->fnode
.ino
);
3994 dout(10) << __func__
<< " already locked" << dendl
;
3998 file
->locked
= true;
3999 *plock
= new FileLock(file
);
4000 dout(10) << __func__
<< " locked " << file
->fnode
4001 << " with " << *plock
<< dendl
;
4005 int BlueFS::unlock_file(FileLock
*fl
)/*_N*/
4007 std::lock_guard
nl(nodes
.lock
);
4008 dout(10) << __func__
<< " " << fl
<< " on " << fl
->file
->fnode
<< dendl
;
4009 ceph_assert(fl
->file
->locked
);
4010 fl
->file
->locked
= false;
4015 int BlueFS::readdir(std::string_view dirname
, vector
<string
> *ls
)/*_N*/
4017 // dirname may contain a trailing /
4018 if (!dirname
.empty() && dirname
.back() == '/') {
4019 dirname
.remove_suffix(1);
4021 std::lock_guard
nl(nodes
.lock
);
4022 dout(10) << __func__
<< " " << dirname
<< dendl
;
4023 if (dirname
.empty()) {
4025 ls
->reserve(nodes
.dir_map
.size() + 2);
4026 for (auto& q
: nodes
.dir_map
) {
4027 ls
->push_back(q
.first
);
4030 // list files in dir
4031 map
<string
,DirRef
>::iterator p
= nodes
.dir_map
.find(dirname
);
4032 if (p
== nodes
.dir_map
.end()) {
4033 dout(20) << __func__
<< " dir " << dirname
<< " not found" << dendl
;
4036 DirRef dir
= p
->second
;
4037 ls
->reserve(dir
->file_map
.size() + 2);
4038 for (auto& q
: dir
->file_map
) {
4039 ls
->push_back(q
.first
);
4043 ls
->push_back("..");
4047 int BlueFS::unlink(std::string_view dirname
, std::string_view filename
)/*_LND*/
4049 std::lock_guard
ll(log
.lock
);
4050 std::lock_guard
nl(nodes
.lock
);
4051 dout(10) << __func__
<< " " << dirname
<< "/" << filename
<< dendl
;
4052 map
<string
,DirRef
>::iterator p
= nodes
.dir_map
.find(dirname
);
4053 if (p
== nodes
.dir_map
.end()) {
4054 dout(20) << __func__
<< " dir " << dirname
<< " not found" << dendl
;
4057 DirRef dir
= p
->second
;
4058 map
<string
,FileRef
>::iterator q
= dir
->file_map
.find(filename
);
4059 if (q
== dir
->file_map
.end()) {
4060 dout(20) << __func__
<< " file " << dirname
<< "/" << filename
4061 << " not found" << dendl
;
4064 FileRef file
= q
->second
;
4066 dout(20) << __func__
<< " file " << dirname
<< "/" << filename
4067 << " is locked" << dendl
;
4070 dir
->file_map
.erase(string
{filename
});
4071 log
.t
.op_dir_unlink(dirname
, filename
);
4076 bool BlueFS::wal_is_rotational()
4078 if (bdev
[BDEV_WAL
]) {
4079 return bdev
[BDEV_WAL
]->is_rotational();
4080 } else if (bdev
[BDEV_DB
]) {
4081 return bdev
[BDEV_DB
]->is_rotational();
4083 return bdev
[BDEV_SLOW
]->is_rotational();
4086 bool BlueFS::db_is_rotational()
4088 if (bdev
[BDEV_DB
]) {
4089 return bdev
[BDEV_DB
]->is_rotational();
4091 return bdev
[BDEV_SLOW
]->is_rotational();
4096 do_replay_recovery_read is used when bluefs log abruptly ends, but it seems that more data should be there.
4097 Idea is to search disk for definiton of extents that will be accompanied with bluefs log in future,
4098 and try if using it will produce healthy bluefs transaction.
4099 We encode already known bluefs log extents and search disk for these bytes.
4100 When we find it, we decode following bytes as extent.
4101 We read that whole extent and then check if merged with existing log part gives a proper bluefs transaction.
4103 int BlueFS::_do_replay_recovery_read(FileReader
*log_reader
,
4108 dout(1) << __func__
<< " replay_pos=0x" << std::hex
<< replay_pos
<<
4109 " needs 0x" << read_offset
<< "~" << read_len
<< std::dec
<< dendl
;
4111 bluefs_fnode_t
& log_fnode
= log_reader
->file
->fnode
;
4112 bufferlist bin_extents
;
4113 ::encode(log_fnode
.extents
, bin_extents
);
4114 dout(2) << __func__
<< " log file encoded extents length = " << bin_extents
.length() << dendl
;
4116 // cannot process if too small to effectively search
4117 ceph_assert(bin_extents
.length() >= 32);
4119 last_32
.substr_of(bin_extents
, bin_extents
.length() - 32, 32);
4121 //read fixed part from replay_pos to end of bluefs_log extents
4124 auto e
= log_fnode
.seek(replay_pos
, &e_off
);
4125 ceph_assert(e
!= log_fnode
.extents
.end());
4126 int r
= _bdev_read(e
->bdev
, e
->offset
+ e_off
, e
->length
- e_off
, &fixed
, ioc
[e
->bdev
],
4127 cct
->_conf
->bluefs_buffered_io
);
4128 ceph_assert(r
== 0);
4129 //capture dev of last good extent
4130 uint8_t last_e_dev
= e
->bdev
;
4131 uint64_t last_e_off
= e
->offset
;
4133 while (e
!= log_fnode
.extents
.end()) {
4134 r
= _bdev_read(e
->bdev
, e
->offset
, e
->length
, &fixed
, ioc
[e
->bdev
],
4135 cct
->_conf
->bluefs_buffered_io
);
4136 ceph_assert(r
== 0);
4137 last_e_dev
= e
->bdev
;
4140 ceph_assert(replay_pos
+ fixed
.length() == read_offset
);
4142 dout(2) << __func__
<< " valid data in log = " << fixed
.length() << dendl
;
4145 bool operator()(const bluefs_extent_t
& a
, const bluefs_extent_t
& b
) const {
4146 if (a
.bdev
< b
.bdev
) return true;
4147 if (a
.offset
< b
.offset
) return true;
4148 return a
.length
< b
.length
;
4151 std::set
<bluefs_extent_t
, compare
> extents_rejected
;
4152 for (int dcnt
= 0; dcnt
< 3; dcnt
++) {
4153 uint8_t dev
= (last_e_dev
+ dcnt
) % MAX_BDEV
;
4154 if (bdev
[dev
] == nullptr) continue;
4155 dout(2) << __func__
<< " processing " << get_device_name(dev
) << dendl
;
4156 interval_set
<uint64_t> disk_regions
;
4157 disk_regions
.insert(0, bdev
[dev
]->get_size());
4158 for (auto f
: nodes
.file_map
) {
4159 auto& e
= f
.second
->fnode
.extents
;
4161 if (p
.bdev
== dev
) {
4162 disk_regions
.erase(p
.offset
, p
.length
);
4166 size_t disk_regions_count
= disk_regions
.num_intervals();
4167 dout(5) << __func__
<< " " << disk_regions_count
<< " regions to scan on " << get_device_name(dev
) << dendl
;
4169 auto reg
= disk_regions
.lower_bound(last_e_off
);
4170 //for all except first, start from beginning
4172 if (reg
== disk_regions
.end()) {
4173 reg
= disk_regions
.begin();
4175 const uint64_t chunk_size
= 4 * 1024 * 1024;
4176 const uint64_t page_size
= 4096;
4177 const uint64_t max_extent_size
= 16;
4178 uint64_t overlay_size
= last_32
.length() + max_extent_size
;
4179 for (size_t i
= 0; i
< disk_regions_count
; reg
++, i
++) {
4180 if (reg
== disk_regions
.end()) {
4181 reg
= disk_regions
.begin();
4183 uint64_t pos
= reg
.get_start();
4184 uint64_t len
= reg
.get_len();
4186 std::unique_ptr
<char[]> raw_data_p
{new char[page_size
+ chunk_size
]};
4187 char* raw_data
= raw_data_p
.get();
4188 memset(raw_data
, 0, page_size
);
4190 while (len
> last_32
.length()) {
4191 uint64_t chunk_len
= len
> chunk_size
? chunk_size
: len
;
4192 dout(5) << __func__
<< " read "
4193 << get_device_name(dev
) << ":0x" << std::hex
<< pos
<< "+" << chunk_len
4194 << std::dec
<< dendl
;
4195 r
= _bdev_read_random(dev
, pos
, chunk_len
,
4196 raw_data
+ page_size
, cct
->_conf
->bluefs_buffered_io
);
4197 ceph_assert(r
== 0);
4199 //search for fixed_last_32
4200 char* chunk_b
= raw_data
+ page_size
;
4201 char* chunk_e
= chunk_b
+ chunk_len
;
4203 char* search_b
= chunk_b
- overlay_size
;
4204 char* search_e
= chunk_e
;
4206 for (char* sp
= search_b
; ; sp
+= last_32
.length()) {
4207 sp
= (char*)memmem(sp
, search_e
- sp
, last_32
.c_str(), last_32
.length());
4208 if (sp
== nullptr) {
4212 char* n
= sp
+ last_32
.length();
4213 dout(5) << __func__
<< " checking location 0x" << std::hex
<< pos
+ (n
- chunk_b
) << std::dec
<< dendl
;
4215 test
.append(n
, std::min
<size_t>(max_extent_size
, chunk_e
- n
));
4218 bufferlist::const_iterator p
= test
.begin();
4220 } catch (buffer::error
& e
) {
4223 if (extents_rejected
.count(ne
) != 0) {
4224 dout(5) << __func__
<< " extent " << ne
<< " already refected" <<dendl
;
4227 //insert as rejected already. if we succeed, it wouldn't make difference.
4228 extents_rejected
.insert(ne
);
4230 if (ne
.bdev
>= MAX_BDEV
||
4231 bdev
[ne
.bdev
] == nullptr ||
4232 ne
.length
> 16 * 1024 * 1024 ||
4233 (ne
.length
& 4095) != 0 ||
4234 ne
.offset
+ ne
.length
> bdev
[ne
.bdev
]->get_size() ||
4235 (ne
.offset
& 4095) != 0) {
4236 dout(5) << __func__
<< " refusing extent " << ne
<< dendl
;
4239 dout(5) << __func__
<< " checking extent " << ne
<< dendl
;
4241 //read candidate extent - whole
4242 bufferlist candidate
;
4243 candidate
.append(fixed
);
4244 r
= _bdev_read(ne
.bdev
, ne
.offset
, ne
.length
, &candidate
, ioc
[ne
.bdev
],
4245 cct
->_conf
->bluefs_buffered_io
);
4246 ceph_assert(r
== 0);
4248 //check if transaction & crc is ok
4249 bluefs_transaction_t t
;
4251 bufferlist::const_iterator p
= candidate
.begin();
4254 catch (buffer::error
& e
) {
4255 dout(5) << __func__
<< " failed match" << dendl
;
4259 //success, it seems a probable candidate
4260 uint64_t l
= std::min
<uint64_t>(ne
.length
, read_len
);
4261 //trim to required size
4262 bufferlist requested_read
;
4263 requested_read
.substr_of(candidate
, fixed
.length(), l
);
4264 bl
->append(requested_read
);
4265 dout(5) << __func__
<< " successful extension of log " << l
<< "/" << read_len
<< dendl
;
4266 log_fnode
.append_extent(ne
);
4267 log_fnode
.recalc_allocated();
4268 log_reader
->buf
.pos
+= l
;
4271 //save overlay for next search
4272 memcpy(search_b
, chunk_e
- overlay_size
, overlay_size
);
4281 void BlueFS::_check_vselector_LNF() {
4282 BlueFSVolumeSelector
* vs
= vselector
->clone_empty();
4286 std::lock_guard
ll(log
.lock
);
4287 std::lock_guard
nl(nodes
.lock
);
4288 // Checking vselector is under log, nodes and file(s) locks,
4289 // so any modification of vselector must be under at least one of those locks.
4290 for (auto& f
: nodes
.file_map
) {
4291 f
.second
->lock
.lock();
4292 vs
->add_usage(f
.second
->vselector_hint
, f
.second
->fnode
);
4294 bool res
= vselector
->compare(vs
);
4296 dout(0) << "Current:";
4297 vselector
->dump(*_dout
);
4299 dout(0) << "Expected:";
4304 for (auto& f
: nodes
.file_map
) {
4305 f
.second
->lock
.unlock();
4310 size_t BlueFS::probe_alloc_avail(int dev
, uint64_t alloc_size
)
4313 auto iterated_allocation
= [&](size_t off
, size_t len
) {
4314 //only count in size that is alloc_size aligned
4315 size_t dist_to_alignment
;
4316 size_t offset_in_block
= off
& (alloc_size
- 1);
4317 if (offset_in_block
== 0)
4318 dist_to_alignment
= 0;
4320 dist_to_alignment
= alloc_size
- offset_in_block
;
4321 if (dist_to_alignment
>= len
)
4323 len
-= dist_to_alignment
;
4324 total
+= p2align(len
, alloc_size
);
4327 alloc
[dev
]->dump(iterated_allocation
);
4331 // ===============================================
4332 // OriginalVolumeSelector
4334 void* OriginalVolumeSelector::get_hint_for_log() const {
4335 return reinterpret_cast<void*>(BlueFS::BDEV_WAL
);
4337 void* OriginalVolumeSelector::get_hint_by_dir(std::string_view dirname
) const {
4338 uint8_t res
= BlueFS::BDEV_DB
;
4339 if (dirname
.length() > 5) {
4340 // the "db.slow" and "db.wal" directory names are hard-coded at
4341 // match up with bluestore. the slow device is always the second
4342 // one (when a dedicated block.db device is present and used at
4343 // bdev 0). the wal device is always last.
4344 if (boost::algorithm::ends_with(dirname
, ".slow") && slow_total
) {
4345 res
= BlueFS::BDEV_SLOW
;
4346 } else if (boost::algorithm::ends_with(dirname
, ".wal") && wal_total
) {
4347 res
= BlueFS::BDEV_WAL
;
4350 return reinterpret_cast<void*>(res
);
4353 uint8_t OriginalVolumeSelector::select_prefer_bdev(void* hint
)
4355 return (uint8_t)(reinterpret_cast<uint64_t>(hint
));
4358 void OriginalVolumeSelector::get_paths(const std::string
& base
, paths
& res
) const
4360 res
.emplace_back(base
, db_total
);
4361 res
.emplace_back(base
+ ".slow",
4362 slow_total
? slow_total
: db_total
); // use fake non-zero value if needed to
4363 // avoid RocksDB complains
4367 #define dout_prefix *_dout << "OriginalVolumeSelector: "
4369 void OriginalVolumeSelector::dump(ostream
& sout
) {
4370 sout
<< "wal_total:" << wal_total
4371 << ", db_total:" << db_total
4372 << ", slow_total:" << slow_total
4376 // ===============================================
4377 // FitToFastVolumeSelector
4379 void FitToFastVolumeSelector::get_paths(const std::string
& base
, paths
& res
) const {
4380 res
.emplace_back(base
, 1); // size of the last db_path has no effect