1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 #include "boost/algorithm/string.hpp"
5 #include "bluestore_common.h"
8 #include "common/debug.h"
9 #include "common/errno.h"
10 #include "common/perf_counters.h"
11 #include "Allocator.h"
12 #include "include/ceph_assert.h"
13 #include "common/admin_socket.h"
15 #define dout_context cct
16 #define dout_subsys ceph_subsys_bluefs
18 #define dout_prefix *_dout << "bluefs "
19 using TOPNSPC::common::cmd_getval
;
31 using std::chrono::duration
;
32 using std::chrono::seconds
;
34 using ceph::bufferlist
;
37 using ceph::Formatter
;
40 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::File
, bluefs_file
, bluefs
);
41 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::Dir
, bluefs_dir
, bluefs
);
42 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileWriter
, bluefs_file_writer
, bluefs_file_writer
);
43 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileReaderBuffer
,
44 bluefs_file_reader_buffer
, bluefs_file_reader
);
45 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileReader
, bluefs_file_reader
, bluefs_file_reader
);
46 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileLock
, bluefs_file_lock
, bluefs
);
48 static void wal_discard_cb(void *priv
, void* priv2
) {
49 BlueFS
*bluefs
= static_cast<BlueFS
*>(priv
);
50 interval_set
<uint64_t> *tmp
= static_cast<interval_set
<uint64_t>*>(priv2
);
51 bluefs
->handle_discard(BlueFS::BDEV_WAL
, *tmp
);
54 static void db_discard_cb(void *priv
, void* priv2
) {
55 BlueFS
*bluefs
= static_cast<BlueFS
*>(priv
);
56 interval_set
<uint64_t> *tmp
= static_cast<interval_set
<uint64_t>*>(priv2
);
57 bluefs
->handle_discard(BlueFS::BDEV_DB
, *tmp
);
60 static void slow_discard_cb(void *priv
, void* priv2
) {
61 BlueFS
*bluefs
= static_cast<BlueFS
*>(priv
);
62 interval_set
<uint64_t> *tmp
= static_cast<interval_set
<uint64_t>*>(priv2
);
63 bluefs
->handle_discard(BlueFS::BDEV_SLOW
, *tmp
);
66 class BlueFS::SocketHook
: public AdminSocketHook
{
69 static BlueFS::SocketHook
* create(BlueFS
* bluefs
)
71 BlueFS::SocketHook
* hook
= nullptr;
72 AdminSocket
* admin_socket
= bluefs
->cct
->get_admin_socket();
74 hook
= new BlueFS::SocketHook(bluefs
);
75 int r
= admin_socket
->register_command("bluestore bluefs device info "
76 "name=alloc_size,type=CephInt,req=false",
78 "Shows space report for bluefs devices. "
79 "This also includes an estimation for space "
80 "available to bluefs at main device. "
81 "alloc_size, if set, specifies the custom bluefs "
82 "allocation unit size for the estimation above.");
84 ldout(bluefs
->cct
, 1) << __func__
<< " cannot register SocketHook" << dendl
;
88 r
= admin_socket
->register_command("bluefs stats",
90 "Dump internal statistics for bluefs."
93 r
= admin_socket
->register_command("bluefs files list", hook
,
94 "print files in bluefs");
96 r
= admin_socket
->register_command("bluefs debug_inject_read_zeros", hook
,
97 "Injects 8K zeros into next BlueFS read. Debug only.");
105 AdminSocket
* admin_socket
= bluefs
->cct
->get_admin_socket();
106 admin_socket
->unregister_commands(this);
109 SocketHook(BlueFS
* bluefs
) :
111 int call(std::string_view command
, const cmdmap_t
& cmdmap
,
115 bufferlist
& out
) override
{
116 if (command
== "bluestore bluefs device info") {
117 int64_t alloc_size
= 0;
118 cmd_getval(cmdmap
, "alloc_size", alloc_size
);
119 if ((alloc_size
& (alloc_size
- 1)) != 0) {
120 errss
<< "Invalid allocation size:'" << alloc_size
<< std::endl
;
124 alloc_size
= bluefs
->cct
->_conf
->bluefs_shared_alloc_size
;
125 f
->open_object_section("bluefs_device_info");
126 for (unsigned dev
= BDEV_WAL
; dev
<= BDEV_SLOW
; dev
++) {
127 if (bluefs
->bdev
[dev
]) {
128 f
->open_object_section("dev");
129 f
->dump_string("device", bluefs
->get_device_name(dev
));
130 ceph_assert(bluefs
->alloc
[dev
]);
131 auto total
= bluefs
->get_total(dev
);
132 auto free
= bluefs
->get_free(dev
);
133 auto used
= bluefs
->get_used(dev
);
135 f
->dump_int("total", total
);
136 f
->dump_int("free", free
);
137 f
->dump_int("bluefs_used", used
);
138 if (bluefs
->is_shared_alloc(dev
)) {
139 size_t avail
= bluefs
->probe_alloc_avail(dev
, alloc_size
);
140 f
->dump_int("bluefs max available", avail
);
147 } else if (command
== "bluefs stats") {
148 std::stringstream ss
;
149 bluefs
->dump_block_extents(ss
);
150 bluefs
->dump_volume_selector(ss
);
152 } else if (command
== "bluefs files list") {
153 const char* devnames
[3] = {"wal","db","slow"};
154 std::lock_guard
l(bluefs
->nodes
.lock
);
155 f
->open_array_section("files");
156 for (auto &d
: bluefs
->nodes
.dir_map
) {
157 std::string dir
= d
.first
;
158 for (auto &r
: d
.second
->file_map
) {
159 f
->open_object_section("file");
160 f
->dump_string("name", (dir
+ "/" + r
.first
).c_str());
161 std::vector
<size_t> sizes
;
162 sizes
.resize(bluefs
->bdev
.size());
163 for(auto& i
: r
.second
->fnode
.extents
) {
164 sizes
[i
.bdev
] += i
.length
;
166 for (size_t i
= 0; i
< sizes
.size(); i
++) {
168 if (i
< sizeof(devnames
) / sizeof(*devnames
))
169 f
->dump_int(devnames
[i
], sizes
[i
]);
171 f
->dump_int(("dev-"+to_string(i
)).c_str(), sizes
[i
]);
179 } else if (command
== "bluefs debug_inject_read_zeros") {
180 bluefs
->inject_read_zeros
++;
182 errss
<< "Invalid command" << std::endl
;
189 BlueFS::BlueFS(CephContext
* cct
)
193 block_reserved(MAX_BDEV
),
195 alloc_size(MAX_BDEV
, 0)
197 dirty
.pending_release
.resize(MAX_BDEV
);
198 discard_cb
[BDEV_WAL
] = wal_discard_cb
;
199 discard_cb
[BDEV_DB
] = db_discard_cb
;
200 discard_cb
[BDEV_SLOW
] = slow_discard_cb
;
201 asok_hook
= SocketHook::create(this);
211 for (auto p
: bdev
) {
222 void BlueFS::_init_logger()
224 PerfCountersBuilder
b(cct
, "bluefs",
225 l_bluefs_first
, l_bluefs_last
);
226 b
.add_u64(l_bluefs_db_total_bytes
, "db_total_bytes",
227 "Total bytes (main db device)",
228 "b", PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
229 b
.add_u64(l_bluefs_db_used_bytes
, "db_used_bytes",
230 "Used bytes (main db device)",
231 "u", PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
232 b
.add_u64(l_bluefs_wal_total_bytes
, "wal_total_bytes",
233 "Total bytes (wal device)",
234 "walb", PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
235 b
.add_u64(l_bluefs_wal_used_bytes
, "wal_used_bytes",
236 "Used bytes (wal device)",
237 "walu", PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
238 b
.add_u64(l_bluefs_slow_total_bytes
, "slow_total_bytes",
239 "Total bytes (slow device)",
240 "slob", PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
241 b
.add_u64(l_bluefs_slow_used_bytes
, "slow_used_bytes",
242 "Used bytes (slow device)",
243 "slou", PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
244 b
.add_u64(l_bluefs_num_files
, "num_files", "File count",
245 "f", PerfCountersBuilder::PRIO_USEFUL
);
246 b
.add_u64(l_bluefs_log_bytes
, "log_bytes", "Size of the metadata log",
247 "jlen", PerfCountersBuilder::PRIO_INTERESTING
, unit_t(UNIT_BYTES
));
248 b
.add_u64_counter(l_bluefs_log_compactions
, "log_compactions",
249 "Compactions of the metadata log");
250 b
.add_u64_counter(l_bluefs_log_write_count
, "log_write_count",
251 "Write op count to the metadata log");
252 b
.add_u64_counter(l_bluefs_logged_bytes
, "logged_bytes",
253 "Bytes written to the metadata log",
255 PerfCountersBuilder::PRIO_CRITICAL
, unit_t(UNIT_BYTES
));
256 b
.add_u64_counter(l_bluefs_files_written_wal
, "files_written_wal",
257 "Files written to WAL");
258 b
.add_u64_counter(l_bluefs_files_written_sst
, "files_written_sst",
259 "Files written to SSTs");
260 b
.add_u64_counter(l_bluefs_write_count_wal
, "write_count_wal",
261 "Write op count to WAL");
262 b
.add_u64_counter(l_bluefs_write_count_sst
, "write_count_sst",
263 "Write op count to SSTs");
264 b
.add_u64_counter(l_bluefs_bytes_written_wal
, "bytes_written_wal",
265 "Bytes written to WAL",
267 PerfCountersBuilder::PRIO_CRITICAL
);
268 b
.add_u64_counter(l_bluefs_bytes_written_sst
, "bytes_written_sst",
269 "Bytes written to SSTs",
271 PerfCountersBuilder::PRIO_CRITICAL
, unit_t(UNIT_BYTES
));
272 b
.add_u64_counter(l_bluefs_bytes_written_slow
, "bytes_written_slow",
273 "Bytes written to WAL/SSTs at slow device",
275 PerfCountersBuilder::PRIO_CRITICAL
, unit_t(UNIT_BYTES
));
276 b
.add_u64_counter(l_bluefs_max_bytes_wal
, "max_bytes_wal",
277 "Maximum bytes allocated from WAL",
279 PerfCountersBuilder::PRIO_INTERESTING
,
281 b
.add_u64_counter(l_bluefs_max_bytes_db
, "max_bytes_db",
282 "Maximum bytes allocated from DB",
284 PerfCountersBuilder::PRIO_INTERESTING
,
286 b
.add_u64_counter(l_bluefs_max_bytes_slow
, "max_bytes_slow",
287 "Maximum bytes allocated from SLOW",
289 PerfCountersBuilder::PRIO_INTERESTING
,
291 b
.add_u64_counter(l_bluefs_main_alloc_unit
, "alloc_unit_main",
292 "Allocation unit size (in bytes) for primary/shared device",
294 PerfCountersBuilder::PRIO_CRITICAL
,
296 b
.add_u64_counter(l_bluefs_db_alloc_unit
, "alloc_unit_db",
297 "Allocation unit size (in bytes) for standalone DB device",
299 PerfCountersBuilder::PRIO_CRITICAL
,
301 b
.add_u64_counter(l_bluefs_wal_alloc_unit
, "alloc_unit_wal",
302 "Allocation unit size (in bytes) for standalone WAL device",
304 PerfCountersBuilder::PRIO_CRITICAL
,
306 b
.add_u64_counter(l_bluefs_read_random_count
, "read_random_count",
307 "random read requests processed",
309 PerfCountersBuilder::PRIO_USEFUL
);
310 b
.add_u64_counter(l_bluefs_read_random_bytes
, "read_random_bytes",
311 "Bytes requested in random read mode",
313 PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
314 b
.add_u64_counter(l_bluefs_read_random_disk_count
, "read_random_disk_count",
315 "random reads requests going to disk",
317 PerfCountersBuilder::PRIO_USEFUL
);
318 b
.add_u64_counter(l_bluefs_read_random_disk_bytes
, "read_random_disk_bytes",
319 "Bytes read from disk in random read mode",
321 PerfCountersBuilder::PRIO_INTERESTING
,
323 b
.add_u64_counter(l_bluefs_read_random_disk_bytes_wal
, "read_random_disk_bytes_wal",
324 "random reads requests going to WAL disk",
326 PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
327 b
.add_u64_counter(l_bluefs_read_random_disk_bytes_db
, "read_random_disk_bytes_db",
328 "random reads requests going to DB disk",
330 PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
331 b
.add_u64_counter(l_bluefs_read_random_disk_bytes_slow
, "read_random_disk_bytes_slow",
332 "random reads requests going to main disk",
334 PerfCountersBuilder::PRIO_INTERESTING
,
336 b
.add_u64_counter(l_bluefs_read_random_buffer_count
, "read_random_buffer_count",
337 "random read requests processed using prefetch buffer",
339 PerfCountersBuilder::PRIO_USEFUL
);
340 b
.add_u64_counter(l_bluefs_read_random_buffer_bytes
, "read_random_buffer_bytes",
341 "Bytes read from prefetch buffer in random read mode",
343 PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
344 b
.add_u64_counter(l_bluefs_read_count
, "read_count",
345 "buffered read requests processed",
347 PerfCountersBuilder::PRIO_USEFUL
);
348 b
.add_u64_counter(l_bluefs_read_bytes
, "read_bytes",
349 "Bytes requested in buffered read mode",
351 PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
352 b
.add_u64_counter(l_bluefs_read_disk_count
, "read_disk_count",
353 "buffered reads requests going to disk",
355 PerfCountersBuilder::PRIO_USEFUL
);
356 b
.add_u64_counter(l_bluefs_read_disk_bytes
, "read_disk_bytes",
357 "Bytes read in buffered mode from disk",
359 PerfCountersBuilder::PRIO_INTERESTING
, unit_t(UNIT_BYTES
));
360 b
.add_u64_counter(l_bluefs_read_disk_bytes_wal
, "read_disk_bytes_wal",
361 "reads requests going to WAL disk",
363 PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
364 b
.add_u64_counter(l_bluefs_read_disk_bytes_db
, "read_disk_bytes_db",
365 "reads requests going to DB disk",
367 PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
368 b
.add_u64_counter(l_bluefs_read_disk_bytes_slow
, "read_disk_bytes_slow",
369 "reads requests going to main disk",
371 PerfCountersBuilder::PRIO_INTERESTING
, unit_t(UNIT_BYTES
));
372 b
.add_u64_counter(l_bluefs_read_prefetch_count
, "read_prefetch_count",
373 "prefetch read requests processed",
375 PerfCountersBuilder::PRIO_USEFUL
);
376 b
.add_u64_counter(l_bluefs_read_prefetch_bytes
, "read_prefetch_bytes",
377 "Bytes requested in prefetch read mode",
379 PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
380 b
.add_u64_counter(l_bluefs_write_count
, "write_count",
381 "Write requests processed");
382 b
.add_u64_counter(l_bluefs_write_disk_count
, "write_disk_count",
383 "Write requests sent to disk");
384 b
.add_u64_counter(l_bluefs_write_bytes
, "write_bytes",
385 "Bytes written", NULL
,
386 PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
387 b
.add_time_avg (l_bluefs_compaction_lat
, "compact_lat",
388 "Average bluefs log compaction latency",
390 PerfCountersBuilder::PRIO_INTERESTING
);
391 b
.add_time_avg (l_bluefs_compaction_lock_lat
, "compact_lock_lat",
392 "Average lock duration while compacting bluefs log",
394 PerfCountersBuilder::PRIO_INTERESTING
);
395 b
.add_u64_counter(l_bluefs_alloc_shared_dev_fallbacks
, "alloc_slow_fallback",
396 "Amount of allocations that required fallback to "
397 " slow/shared device",
399 PerfCountersBuilder::PRIO_USEFUL
);
400 b
.add_u64_counter(l_bluefs_alloc_shared_size_fallbacks
, "alloc_slow_size_fallback",
401 "Amount of allocations that required fallback to shared device's "
404 PerfCountersBuilder::PRIO_USEFUL
);
405 b
.add_u64(l_bluefs_read_zeros_candidate
, "read_zeros_candidate",
406 "How many times bluefs read found page with all 0s");
407 b
.add_u64(l_bluefs_read_zeros_errors
, "read_zeros_errors",
408 "How many times bluefs read found transient page with all 0s");
410 logger
= b
.create_perf_counters();
411 cct
->get_perfcounters_collection()->add(logger
);
// Detach the bluefs PerfCounters instance (`logger`) from the global
// perf-counters collection during shutdown.
// NOTE(review): the release/delete of `logger` itself is not visible in
// this chunk (original lines 417-419 are omitted) — confirm it is freed
// in the omitted tail.
414 void BlueFS::_shutdown_logger()
416 cct
->get_perfcounters_collection()->remove(logger
);
// Refresh the per-device total/used byte gauges on the perf counters.
// Each device slot is updated only when it has an allocator attached.
420 void BlueFS::_update_logger_stats()
// WAL device gauges.
422 if (alloc
[BDEV_WAL
]) {
423 logger
->set(l_bluefs_wal_total_bytes
, _get_total(BDEV_WAL
));
424 logger
->set(l_bluefs_wal_used_bytes
, _get_used(BDEV_WAL
));
// DB device gauges.
426 if (alloc
[BDEV_DB
]) {
427 logger
->set(l_bluefs_db_total_bytes
, _get_total(BDEV_DB
));
428 logger
->set(l_bluefs_db_used_bytes
, _get_used(BDEV_DB
));
// Slow (shared/main) device gauges.
430 if (alloc
[BDEV_SLOW
]) {
431 logger
->set(l_bluefs_slow_total_bytes
, _get_total(BDEV_SLOW
));
432 logger
->set(l_bluefs_slow_used_bytes
, _get_used(BDEV_SLOW
));
436 int BlueFS::add_block_device(unsigned id
, const string
& path
, bool trim
,
438 bluefs_shared_alloc_context_t
* _shared_alloc
)
440 dout(10) << __func__
<< " bdev " << id
<< " path " << path
<< " "
441 << reserved
<< dendl
;
442 ceph_assert(id
< bdev
.size());
443 ceph_assert(bdev
[id
] == NULL
);
444 BlockDevice
*b
= BlockDevice::create(cct
, path
, NULL
, NULL
,
445 discard_cb
[id
], static_cast<void*>(this));
446 block_reserved
[id
] = reserved
;
448 b
->set_no_exclusive_lock();
450 int r
= b
->open(path
);
456 interval_set
<uint64_t> whole_device
;
457 whole_device
.insert(0, b
->get_size());
458 b
->try_discard(whole_device
, false);
461 dout(1) << __func__
<< " bdev " << id
<< " path " << path
462 << " size " << byte_u_t(b
->get_size()) << dendl
;
464 ioc
[id
] = new IOContext(cct
, NULL
);
466 ceph_assert(!shared_alloc
);
467 shared_alloc
= _shared_alloc
;
468 alloc
[id
] = shared_alloc
->a
;
469 shared_alloc_id
= id
;
474 bool BlueFS::bdev_support_label(unsigned id
)
476 ceph_assert(id
< bdev
.size());
477 ceph_assert(bdev
[id
]);
478 return bdev
[id
]->supported_bdev_label();
// Size in bytes of the block device in slot `id`; valid-slot and
// device-present checks are done inline rather than asserted.
// NOTE(review): the fall-through return for an invalid/absent slot is in
// omitted lines (original 485-486) — presumably returns 0; confirm.
481 uint64_t BlueFS::get_block_device_size(unsigned id
) const
483 if (id
< bdev
.size() && bdev
[id
])
484 return bdev
[id
]->get_size();
488 void BlueFS::handle_discard(unsigned id
, interval_set
<uint64_t>& to_release
)
490 dout(10) << __func__
<< " bdev " << id
<< dendl
;
491 ceph_assert(alloc
[id
]);
492 alloc
[id
]->release(to_release
);
493 if (is_shared_alloc(id
)) {
494 shared_alloc
->bluefs_used
-= to_release
.size();
// Total bytes BlueFS currently uses, summed over every device slot.
// NOTE(review): the accumulator declaration and the final return are in
// omitted lines (original 499-500 and 503-506).
498 uint64_t BlueFS::get_used()
501 for (unsigned id
= 0; id
< MAX_BDEV
; ++id
) {
502 used
+= _get_used(id
);
// Bytes BlueFS uses on device `id`.  For a shared device the figure is
// the externally maintained shared_alloc->bluefs_used counter; for a
// dedicated device it is total minus the allocator's free bytes.
// NOTE(review): the declaration of `used`, any early-out guard, and the
// return are in omitted lines (original 508-512 and 517-519).
507 uint64_t BlueFS::_get_used(unsigned id
) const
513 if (is_shared_alloc(id
)) {
514 used
= shared_alloc
->bluefs_used
;
516 used
= _get_total(id
) - alloc
[id
]->get_free();
521 uint64_t BlueFS::get_used(unsigned id
)
523 ceph_assert(id
< alloc
.size());
524 ceph_assert(alloc
[id
]);
525 return _get_used(id
);
528 uint64_t BlueFS::_get_total(unsigned id
) const
530 ceph_assert(id
< bdev
.size());
531 ceph_assert(id
< block_reserved
.size());
532 return get_block_device_size(id
) - block_reserved
[id
];
535 uint64_t BlueFS::get_total(unsigned id
)
537 return _get_total(id
);
540 uint64_t BlueFS::get_free(unsigned id
)
542 ceph_assert(id
< alloc
.size());
543 return alloc
[id
]->get_free();
// Dump the bluefs perf counters into Formatter `f` under a
// "bluefs_perf_counters" object section.
// NOTE(review): the matching close_section() call is in omitted lines
// (original 550-551) — confirm the section is closed there.
546 void BlueFS::dump_perf_counters(Formatter
*f
)
548 f
->open_object_section("bluefs_perf_counters");
549 logger
->dump_formatted(f
, false, false);
553 void BlueFS::dump_block_extents(ostream
& out
)
555 for (unsigned i
= 0; i
< MAX_BDEV
; ++i
) {
559 auto total
= get_total(i
);
560 auto free
= get_free(i
);
562 out
<< i
<< " : device size 0x" << std::hex
<< total
563 << " : using 0x" << total
- free
564 << std::dec
<< "(" << byte_u_t(total
- free
) << ")";
569 void BlueFS::foreach_block_extents(
571 std::function
<void(uint64_t, uint32_t)> fn
)
573 std::lock_guard
nl(nodes
.lock
);
574 dout(10) << __func__
<< " bdev " << id
<< dendl
;
575 ceph_assert(id
< alloc
.size());
576 for (auto& p
: nodes
.file_map
) {
577 for (auto& q
: p
.second
->fnode
.extents
) {
579 fn(q
.offset
, q
.length
);
585 int BlueFS::mkfs(uuid_d osd_uuid
, const bluefs_layout_t
& layout
)
588 << " osd_uuid " << osd_uuid
591 // set volume selector if not provided before/outside
592 if (vselector
== nullptr) {
594 new OriginalVolumeSelector(
595 get_block_device_size(BlueFS::BDEV_WAL
) * 95 / 100,
596 get_block_device_size(BlueFS::BDEV_DB
) * 95 / 100,
597 get_block_device_size(BlueFS::BDEV_SLOW
) * 95 / 100));
604 super
.block_size
= bdev
[BDEV_DB
]->get_block_size();
605 super
.osd_uuid
= osd_uuid
;
606 super
.uuid
.generate_random();
607 dout(1) << __func__
<< " uuid " << super
.uuid
<< dendl
;
610 FileRef log_file
= ceph::make_ref
<File
>();
611 log_file
->fnode
.ino
= 1;
612 log_file
->vselector_hint
= vselector
->get_hint_for_log();
614 vselector
->select_prefer_bdev(log_file
->vselector_hint
),
615 cct
->_conf
->bluefs_max_log_runway
,
618 vselector
->add_usage(log_file
->vselector_hint
, log_file
->fnode
);
620 log
.writer
= _create_writer(log_file
);
623 ceph_assert(log
.seq_live
== 1);
626 _flush_and_sync_log_LD();
629 super
.log_fnode
= log_file
->fnode
;
630 super
.memorized_layout
= layout
;
631 _write_super(BDEV_DB
);
635 super
= bluefs_super_t();
636 _close_writer(log
.writer
);
638 vselector
.reset(nullptr);
642 ceph_assert(shared_alloc
->need_init
);
643 shared_alloc
->need_init
= false;
646 dout(10) << __func__
<< " success" << dendl
;
650 void BlueFS::_init_alloc()
652 dout(20) << __func__
<< dendl
;
654 size_t wal_alloc_size
= 0;
655 if (bdev
[BDEV_WAL
]) {
656 wal_alloc_size
= cct
->_conf
->bluefs_alloc_size
;
657 alloc_size
[BDEV_WAL
] = wal_alloc_size
;
659 logger
->set(l_bluefs_wal_alloc_unit
, wal_alloc_size
);
662 uint64_t shared_alloc_size
= cct
->_conf
->bluefs_shared_alloc_size
;
663 if (shared_alloc
&& shared_alloc
->a
) {
664 uint64_t unit
= shared_alloc
->a
->get_block_size();
665 shared_alloc_size
= std::max(
668 ceph_assert(0 == p2phase(shared_alloc_size
, unit
));
670 if (bdev
[BDEV_SLOW
]) {
671 alloc_size
[BDEV_DB
] = cct
->_conf
->bluefs_alloc_size
;
672 alloc_size
[BDEV_SLOW
] = shared_alloc_size
;
674 alloc_size
[BDEV_DB
] = shared_alloc_size
;
675 alloc_size
[BDEV_SLOW
] = 0;
677 logger
->set(l_bluefs_db_alloc_unit
, alloc_size
[BDEV_DB
]);
678 logger
->set(l_bluefs_main_alloc_unit
, alloc_size
[BDEV_SLOW
]);
679 // new wal and db devices are never shared
680 if (bdev
[BDEV_NEWWAL
]) {
681 alloc_size
[BDEV_NEWWAL
] = cct
->_conf
->bluefs_alloc_size
;
683 if (bdev
[BDEV_NEWDB
]) {
684 alloc_size
[BDEV_NEWDB
] = cct
->_conf
->bluefs_alloc_size
;
687 for (unsigned id
= 0; id
< bdev
.size(); ++id
) {
691 ceph_assert(bdev
[id
]->get_size());
692 if (is_shared_alloc(id
)) {
693 dout(1) << __func__
<< " shared, id " << id
<< std::hex
694 << ", capacity 0x" << bdev
[id
]->get_size()
695 << ", block size 0x" << alloc_size
[id
]
696 << std::dec
<< dendl
;
698 ceph_assert(alloc_size
[id
]);
699 std::string name
= "bluefs-";
700 const char* devnames
[] = { "wal","db","slow" };
702 name
+= devnames
[id
];
704 name
+= to_string(uintptr_t(this));
705 dout(1) << __func__
<< " new, id " << id
<< std::hex
706 << ", allocator name " << name
707 << ", allocator type " << cct
->_conf
->bluefs_allocator
708 << ", capacity 0x" << bdev
[id
]->get_size()
709 << ", block size 0x" << alloc_size
[id
]
710 << std::dec
<< dendl
;
711 alloc
[id
] = Allocator::create(cct
, cct
->_conf
->bluefs_allocator
,
712 bdev
[id
]->get_size(),
716 alloc
[id
]->init_add_free(
723 void BlueFS::_stop_alloc()
725 dout(20) << __func__
<< dendl
;
726 for (auto p
: bdev
) {
731 for (size_t i
= 0; i
< alloc
.size(); ++i
) {
732 if (alloc
[i
] && !is_shared_alloc(i
)) {
733 alloc
[i
]->shutdown();
740 int BlueFS::_read_and_check(uint8_t ndev
, uint64_t off
, uint64_t len
,
741 ceph::buffer::list
*pbl
, IOContext
*ioc
, bool buffered
)
743 dout(10) << __func__
<< " dev " << int(ndev
)
744 << ": 0x" << std::hex
<< off
<< "~" << len
<< std::dec
745 << (buffered
? " buffered" : "")
749 r
= _bdev_read(ndev
, off
, len
, &bl
, ioc
, buffered
);
753 uint64_t block_size
= bdev
[ndev
]->get_block_size();
754 if (inject_read_zeros
) {
755 if (len
>= block_size
* 2) {
756 derr
<< __func__
<< " injecting error, zeros at "
757 << int(ndev
) << ": 0x" << std::hex
<< (off
+ len
/ 2)
758 << "~" << (block_size
* 2) << std::dec
<< dendl
;
759 //use beginning, replace 8K in the middle with zeros, use tail
761 bl
.splice(0, len
/ 2 - block_size
, &temp
);
762 temp
.append(buffer::create(block_size
* 2, 0));
763 bl
.splice(block_size
* 2, len
/ 2 - block_size
, &temp
);
768 //make a check if there is a block with all 0
769 uint64_t to_check_len
= len
;
770 uint64_t skip
= p2nphase(off
, block_size
);
771 if (skip
>= to_check_len
) {
774 auto it
= bl
.begin(skip
);
775 to_check_len
-= skip
;
776 bool all_zeros
= false;
777 while (all_zeros
== false && to_check_len
>= block_size
) {
779 unsigned block_left
= block_size
;
783 while (all_zeros
&& block_left
> 0) {
784 avail
= it
.get_ptr_and_advance(block_left
, &data
);
786 all_zeros
= mem_is_zero(data
, avail
);
789 while (block_left
> 0) {
790 avail
= it
.get_ptr_and_advance(block_left
, &data
);
793 to_check_len
-= block_size
;
796 logger
->inc(l_bluefs_read_zeros_candidate
, 1);
797 bufferlist bl_reread
;
798 r
= _bdev_read(ndev
, off
, len
, &bl_reread
, ioc
, buffered
);
802 // check if both read gave the same
803 if (!bl
.contents_equal(bl_reread
)) {
804 // report problems to log, but continue, maybe it will be good now...
805 derr
<< __func__
<< " initial read of " << int(ndev
)
806 << ": 0x" << std::hex
<< off
<< "~" << len
807 << std::dec
<< ": different then re-read " << dendl
;
808 logger
->inc(l_bluefs_read_zeros_errors
, 1);
810 // use second read will be better if is different
811 pbl
->append(bl_reread
);
818 int BlueFS::_read_random_and_check(
819 uint8_t ndev
, uint64_t off
, uint64_t len
, char *buf
, bool buffered
)
821 dout(10) << __func__
<< " dev " << int(ndev
)
822 << ": 0x" << std::hex
<< off
<< "~" << len
<< std::dec
823 << (buffered
? " buffered" : "")
826 r
= _bdev_read_random(ndev
, off
, len
, buf
, buffered
);
830 uint64_t block_size
= bdev
[ndev
]->get_block_size();
831 if (inject_read_zeros
) {
832 if (len
>= block_size
* 2) {
833 derr
<< __func__
<< " injecting error, zeros at "
834 << int(ndev
) << ": 0x" << std::hex
<< (off
+ len
/ 2)
835 << "~" << (block_size
* 2) << std::dec
<< dendl
;
837 memset(buf
+ len
/ 2 - block_size
, 0, block_size
* 2);
841 //make a check if there is a block with all 0
842 uint64_t to_check_len
= len
;
843 const char* data
= buf
;
844 uint64_t skip
= p2nphase(off
, block_size
);
845 if (skip
>= to_check_len
) {
848 to_check_len
-= skip
;
851 bool all_zeros
= false;
852 while (all_zeros
== false && to_check_len
>= block_size
) {
853 if (mem_is_zero(data
, block_size
)) {
854 // at least one block is all zeros
859 to_check_len
-= block_size
;
862 logger
->inc(l_bluefs_read_zeros_candidate
, 1);
863 std::unique_ptr
<char[]> data_reread(new char[len
]);
864 r
= _bdev_read_random(ndev
, off
, len
, &data_reread
[0], buffered
);
868 // check if both read gave the same
869 if (memcmp(buf
, &data_reread
[0], len
) != 0) {
870 derr
<< __func__
<< " initial read of " << int(ndev
)
871 << ": 0x" << std::hex
<< off
<< "~" << len
872 << std::dec
<< ": different then re-read " << dendl
;
873 logger
->inc(l_bluefs_read_zeros_errors
, 1);
874 // second read is probably better
875 memcpy(buf
, &data_reread
[0], len
);
881 int BlueFS::_bdev_read(uint8_t ndev
, uint64_t off
, uint64_t len
,
882 ceph::buffer::list
* pbl
, IOContext
* ioc
, bool buffered
)
886 case BDEV_WAL
: cnt
= l_bluefs_read_disk_bytes_wal
; break;
887 case BDEV_DB
: cnt
= l_bluefs_read_disk_bytes_db
; break;
888 case BDEV_SLOW
: cnt
= l_bluefs_read_disk_bytes_slow
; break;
892 logger
->inc(cnt
, len
);
894 return bdev
[ndev
]->read(off
, len
, pbl
, ioc
, buffered
);
897 int BlueFS::_bdev_read_random(uint8_t ndev
, uint64_t off
, uint64_t len
,
898 char* buf
, bool buffered
)
902 case BDEV_WAL
: cnt
= l_bluefs_read_random_disk_bytes_wal
; break;
903 case BDEV_DB
: cnt
= l_bluefs_read_random_disk_bytes_db
; break;
904 case BDEV_SLOW
: cnt
= l_bluefs_read_random_disk_bytes_slow
; break;
907 logger
->inc(cnt
, len
);
909 return bdev
[ndev
]->read_random(off
, len
, buf
, buffered
);
914 dout(1) << __func__
<< dendl
;
917 int r
= _open_super();
919 derr
<< __func__
<< " failed to open super: " << cpp_strerror(r
) << dendl
;
923 // set volume selector if not provided before/outside
924 if (vselector
== nullptr) {
926 new OriginalVolumeSelector(
927 get_block_device_size(BlueFS::BDEV_WAL
) * 95 / 100,
928 get_block_device_size(BlueFS::BDEV_DB
) * 95 / 100,
929 get_block_device_size(BlueFS::BDEV_SLOW
) * 95 / 100));
934 r
= _replay(false, false);
936 derr
<< __func__
<< " failed to replay log: " << cpp_strerror(r
) << dendl
;
942 for (auto& p
: nodes
.file_map
) {
943 dout(30) << __func__
<< " noting alloc for " << p
.second
->fnode
<< dendl
;
944 for (auto& q
: p
.second
->fnode
.extents
) {
945 bool is_shared
= is_shared_alloc(q
.bdev
);
946 ceph_assert(!is_shared
|| (is_shared
&& shared_alloc
));
947 if (is_shared
&& shared_alloc
->need_init
&& shared_alloc
->a
) {
948 shared_alloc
->bluefs_used
+= q
.length
;
949 alloc
[q
.bdev
]->init_rm_free(q
.offset
, q
.length
);
950 } else if (!is_shared
) {
951 alloc
[q
.bdev
]->init_rm_free(q
.offset
, q
.length
);
956 shared_alloc
->need_init
= false;
957 dout(1) << __func__
<< " shared_bdev_used = "
958 << shared_alloc
->bluefs_used
<< dendl
;
960 dout(1) << __func__
<< " shared bdev not used"
964 // set up the log for future writes
965 log
.writer
= _create_writer(_get_file(1));
966 ceph_assert(log
.writer
->file
->fnode
.ino
== 1);
967 log
.writer
->pos
= log
.writer
->file
->fnode
.size
;
968 log
.writer
->file
->fnode
.reset_delta();
969 dout(10) << __func__
<< " log write pos set to 0x"
970 << std::hex
<< log
.writer
->pos
<< std::dec
973 logger
->set(l_bluefs_log_bytes
, log
.writer
->file
->fnode
.size
);
977 super
= bluefs_super_t();
981 int BlueFS::maybe_verify_layout(const bluefs_layout_t
& layout
) const
983 if (super
.memorized_layout
) {
984 if (layout
== *super
.memorized_layout
) {
985 dout(10) << __func__
<< " bluefs layout verified positively" << dendl
;
987 derr
<< __func__
<< " memorized layout doesn't fit current one" << dendl
;
991 dout(10) << __func__
<< " no memorized_layout in bluefs superblock"
998 void BlueFS::umount(bool avoid_compact
)
1000 dout(1) << __func__
<< dendl
;
1002 sync_metadata(avoid_compact
);
1003 if (cct
->_conf
->bluefs_check_volume_selector_on_umount
) {
1004 _check_vselector_LNF();
1006 _close_writer(log
.writer
);
1010 vselector
.reset(nullptr);
1012 nodes
.file_map
.clear();
1013 nodes
.dir_map
.clear();
1014 super
= bluefs_super_t();
1018 int BlueFS::prepare_new_device(int id
, const bluefs_layout_t
& layout
)
1020 dout(1) << __func__
<< dendl
;
1022 if(id
== BDEV_NEWDB
) {
1023 int new_log_dev_cur
= BDEV_WAL
;
1024 int new_log_dev_next
= BDEV_WAL
;
1025 if (!bdev
[BDEV_WAL
]) {
1026 new_log_dev_cur
= BDEV_NEWDB
;
1027 new_log_dev_next
= BDEV_DB
;
1029 _rewrite_log_and_layout_sync_LNF_LD(false,
1035 } else if(id
== BDEV_NEWWAL
) {
1036 _rewrite_log_and_layout_sync_LNF_LD(false,
1048 void BlueFS::collect_metadata(map
<string
,string
> *pm
, unsigned skip_bdev_id
)
1050 if (skip_bdev_id
!= BDEV_DB
&& bdev
[BDEV_DB
])
1051 bdev
[BDEV_DB
]->collect_metadata("bluefs_db_", pm
);
1053 bdev
[BDEV_WAL
]->collect_metadata("bluefs_wal_", pm
);
1056 void BlueFS::get_devices(set
<string
> *ls
)
1058 for (unsigned i
= 0; i
< MAX_BDEV
; ++i
) {
1060 bdev
[i
]->get_devices(ls
);
1067 dout(1) << __func__
<< dendl
;
1068 // hrm, i think we check everything on mount...
1072 int BlueFS::_write_super(int dev
)
1078 uint32_t crc
= bl
.crc32c(-1);
1080 dout(10) << __func__
<< " super block length(encoded): " << bl
.length() << dendl
;
1081 dout(10) << __func__
<< " superblock " << super
.version
<< dendl
;
1082 dout(10) << __func__
<< " log_fnode " << super
.log_fnode
<< dendl
;
1083 ceph_assert_always(bl
.length() <= get_super_length());
1084 bl
.append_zero(get_super_length() - bl
.length());
1086 bdev
[dev
]->write(get_super_offset(), bl
, false, WRITE_LIFE_SHORT
);
1087 dout(20) << __func__
<< " v " << super
.version
1088 << " crc 0x" << std::hex
<< crc
1089 << " offset 0x" << get_super_offset() << std::dec
1094 int BlueFS::_open_super()
1096 dout(10) << __func__
<< dendl
;
1099 uint32_t expected_crc
, crc
;
1102 // always the second block
1103 r
= _bdev_read(BDEV_DB
, get_super_offset(), get_super_length(),
1104 &bl
, ioc
[BDEV_DB
], false);
1108 auto p
= bl
.cbegin();
1112 t
.substr_of(bl
, 0, p
.get_off());
1115 decode(expected_crc
, p
);
1116 if (crc
!= expected_crc
) {
1117 derr
<< __func__
<< " bad crc on superblock, expected 0x"
1118 << std::hex
<< expected_crc
<< " != actual 0x" << crc
<< std::dec
1122 dout(10) << __func__
<< " superblock " << super
.version
<< dendl
;
1123 dout(10) << __func__
<< " log_fnode " << super
.log_fnode
<< dendl
;
1127 int BlueFS::_check_allocations(const bluefs_fnode_t
& fnode
,
1128 boost::dynamic_bitset
<uint64_t>* used_blocks
,
1129 bool is_alloc
, //true when allocating, false when deallocating
1130 const char* op_name
)
1132 auto& fnode_extents
= fnode
.extents
;
1133 for (auto e
: fnode_extents
) {
1136 ceph_assert(id
< MAX_BDEV
);
1137 ceph_assert(bdev
[id
]);
1138 // let's use minimal allocation unit we can have
1139 auto alloc_unit
= bdev
[id
]->get_block_size();
1141 if (int r
= _verify_alloc_granularity(id
, e
.offset
, e
.length
,
1147 apply_for_bitset_range(e
.offset
, e
.length
, alloc_unit
, used_blocks
[id
],
1148 [&](uint64_t pos
, boost::dynamic_bitset
<uint64_t> &bs
) {
1149 if (is_alloc
== bs
.test(pos
)) {
1157 derr
<< __func__
<< " " << op_name
<< " invalid extent " << int(e
.bdev
)
1158 << ": 0x" << std::hex
<< e
.offset
<< "~" << e
.length
<< std::dec
1159 << (is_alloc
== true ?
1160 ": duplicate reference, ino " : ": double free, ino ")
1161 << fnode
.ino
<< dendl
;
1168 int BlueFS::_verify_alloc_granularity(
1169 __u8 id
, uint64_t offset
, uint64_t length
, uint64_t alloc_unit
, const char *op
)
1171 if ((offset
& (alloc_unit
- 1)) ||
1172 (length
& (alloc_unit
- 1))) {
1173 derr
<< __func__
<< " " << op
<< " of " << (int)id
1174 << ":0x" << std::hex
<< offset
<< "~" << length
<< std::dec
1175 << " does not align to alloc_size 0x"
1176 << std::hex
<< alloc_unit
<< std::dec
<< dendl
;
1182 int BlueFS::_replay(bool noop
, bool to_stdout
)
1184 dout(10) << __func__
<< (noop
? " NO-OP" : "") << dendl
;
1185 ino_last
= 1; // by the log
1186 uint64_t log_seq
= 0;
1189 log_file
= _get_file(1);
1191 log_file
->fnode
= super
.log_fnode
;
1193 log_file
->vselector_hint
=
1194 vselector
->get_hint_for_log();
1196 dout(10) << __func__
<< " log_fnode " << super
.log_fnode
<< dendl
;
1197 if (unlikely(to_stdout
)) {
1198 std::cout
<< " log_fnode " << super
.log_fnode
<< std::endl
;
1201 FileReader
*log_reader
= new FileReader(
1202 log_file
, cct
->_conf
->bluefs_max_prefetch
,
1204 true); // ignore eof
1206 bool seen_recs
= false;
1208 boost::dynamic_bitset
<uint64_t> used_blocks
[MAX_BDEV
];
1211 if (cct
->_conf
->bluefs_log_replay_check_allocations
) {
1212 for (size_t i
= 0; i
< MAX_BDEV
; ++i
) {
1213 if (bdev
[i
] != nullptr) {
1214 // let's use minimal allocation unit we can have
1215 auto au
= bdev
[i
]->get_block_size();
1216 //hmm... on 32TB/4K drive this would take 1GB RAM!!!
1217 used_blocks
[i
].resize(round_up_to(bdev
[i
]->get_size(), au
) / au
);
1220 // check initial log layout
1221 int r
= _check_allocations(log_file
->fnode
,
1222 used_blocks
, true, "Log from super");
1230 ceph_assert((log_reader
->buf
.pos
& ~super
.block_mask()) == 0);
1231 uint64_t pos
= log_reader
->buf
.pos
;
1232 uint64_t read_pos
= pos
;
1235 int r
= _read(log_reader
, read_pos
, super
.block_size
,
1237 if (r
!= (int)super
.block_size
&& cct
->_conf
->bluefs_replay_recovery
) {
1238 r
+= _do_replay_recovery_read(log_reader
, pos
, read_pos
+ r
, super
.block_size
- r
, &bl
);
1240 assert(r
== (int)super
.block_size
);
1247 auto p
= bl
.cbegin();
1255 if (len
+ 6 > bl
.length()) {
1256 more
= round_up_to(len
+ 6 - bl
.length(), super
.block_size
);
1259 if (uuid
!= super
.uuid
) {
1261 dout(10) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
1262 << ": stop: uuid " << uuid
<< " != super.uuid " << super
.uuid
1265 derr
<< __func__
<< " 0x" << std::hex
<< pos
<< std::dec
1266 << ": stop: uuid " << uuid
<< " != super.uuid " << super
.uuid
1267 << ", block dump: \n";
1269 t
.substr_of(bl
, 0, super
.block_size
);
1275 if (seq
!= log_seq
+ 1) {
1277 dout(10) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
1278 << ": stop: seq " << seq
<< " != expected " << log_seq
+ 1
1281 derr
<< __func__
<< " 0x" << std::hex
<< pos
<< std::dec
1282 << ": stop: seq " << seq
<< " != expected " << log_seq
+ 1
1288 dout(20) << __func__
<< " need 0x" << std::hex
<< more
<< std::dec
1289 << " more bytes" << dendl
;
1291 int r
= _read(log_reader
, read_pos
, more
, &t
, NULL
);
1292 if (r
< (int)more
) {
1293 dout(10) << __func__
<< " 0x" << std::hex
<< pos
1294 << ": stop: len is 0x" << bl
.length() + more
<< std::dec
1295 << ", which is past eof" << dendl
;
1296 if (cct
->_conf
->bluefs_replay_recovery
) {
1297 //try to search for more data
1298 r
+= _do_replay_recovery_read(log_reader
, pos
, read_pos
+ r
, more
- r
, &t
);
1299 if (r
< (int)more
) {
1300 //in normal mode we must read r==more, for recovery it is too strict
1305 ceph_assert(r
== (int)more
);
1309 bluefs_transaction_t t
;
1311 auto p
= bl
.cbegin();
1315 catch (ceph::buffer::error
& e
) {
1316 // Multi-block transactions might be incomplete due to unexpected
1317 // power off. Hence let's treat that as a regular stop condition.
1318 if (seen_recs
&& more
) {
1319 dout(10) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
1320 << ": stop: failed to decode: " << e
.what()
1323 derr
<< __func__
<< " 0x" << std::hex
<< pos
<< std::dec
1324 << ": stop: failed to decode: " << e
.what()
1331 ceph_assert(seq
== t
.seq
);
1332 dout(10) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
1333 << ": " << t
<< dendl
;
1334 if (unlikely(to_stdout
)) {
1335 std::cout
<< " 0x" << std::hex
<< pos
<< std::dec
1336 << ": " << t
<< std::endl
;
1339 auto p
= t
.op_bl
.cbegin();
1342 pos
= pos0
+ p
.get_off();
1347 case bluefs_transaction_t::OP_INIT
:
1348 dout(20) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
1349 << ": op_init" << dendl
;
1350 if (unlikely(to_stdout
)) {
1351 std::cout
<< " 0x" << std::hex
<< pos
<< std::dec
1352 << ": op_init" << std::endl
;
1355 ceph_assert(t
.seq
== 1);
1358 case bluefs_transaction_t::OP_JUMP
:
1362 decode(next_seq
, p
);
1364 dout(20) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
1365 << ": op_jump seq " << next_seq
1366 << " offset 0x" << std::hex
<< offset
<< std::dec
<< dendl
;
1367 if (unlikely(to_stdout
)) {
1368 std::cout
<< " 0x" << std::hex
<< pos
<< std::dec
1369 << ": op_jump seq " << next_seq
1370 << " offset 0x" << std::hex
<< offset
<< std::dec
1374 ceph_assert(next_seq
> log_seq
);
1375 log_seq
= next_seq
- 1; // we will increment it below
1376 uint64_t skip
= offset
- read_pos
;
1379 int r
= _read(log_reader
, read_pos
, skip
, &junk
,
1381 if (r
!= (int)skip
) {
1382 dout(10) << __func__
<< " 0x" << std::hex
<< read_pos
1383 << ": stop: failed to skip to " << offset
1384 << std::dec
<< dendl
;
1385 ceph_abort_msg("problem with op_jump");
1391 case bluefs_transaction_t::OP_JUMP_SEQ
:
1394 decode(next_seq
, p
);
1395 dout(20) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
1396 << ": op_jump_seq " << next_seq
<< dendl
;
1397 if (unlikely(to_stdout
)) {
1398 std::cout
<< " 0x" << std::hex
<< pos
<< std::dec
1399 << ": op_jump_seq " << next_seq
<< std::endl
;
1402 ceph_assert(next_seq
> log_seq
);
1403 log_seq
= next_seq
- 1; // we will increment it below
1407 case bluefs_transaction_t::OP_ALLOC_ADD
:
1408 // LEGACY, do nothing but read params
1411 uint64_t offset
, length
;
1418 case bluefs_transaction_t::OP_ALLOC_RM
:
1419 // LEGACY, do nothing but read params
1422 uint64_t offset
, length
;
1429 case bluefs_transaction_t::OP_DIR_LINK
:
1431 string dirname
, filename
;
1434 decode(filename
, p
);
1436 dout(20) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
1437 << ": op_dir_link " << " " << dirname
<< "/" << filename
1440 if (unlikely(to_stdout
)) {
1441 std::cout
<< " 0x" << std::hex
<< pos
<< std::dec
1442 << ": op_dir_link " << " " << dirname
<< "/" << filename
1448 FileRef file
= _get_file(ino
);
1449 ceph_assert(file
->fnode
.ino
);
1450 map
<string
,DirRef
>::iterator q
= nodes
.dir_map
.find(dirname
);
1451 ceph_assert(q
!= nodes
.dir_map
.end());
1452 map
<string
,FileRef
>::iterator r
= q
->second
->file_map
.find(filename
);
1453 ceph_assert(r
== q
->second
->file_map
.end());
1455 vselector
->sub_usage(file
->vselector_hint
, file
->fnode
);
1456 file
->vselector_hint
=
1457 vselector
->get_hint_by_dir(dirname
);
1458 vselector
->add_usage(file
->vselector_hint
, file
->fnode
);
1460 q
->second
->file_map
[filename
] = file
;
1466 case bluefs_transaction_t::OP_DIR_UNLINK
:
1468 string dirname
, filename
;
1470 decode(filename
, p
);
1471 dout(20) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
1472 << ": op_dir_unlink " << " " << dirname
<< "/" << filename
1474 if (unlikely(to_stdout
)) {
1475 std::cout
<< " 0x" << std::hex
<< pos
<< std::dec
1476 << ": op_dir_unlink " << " " << dirname
<< "/" << filename
1481 map
<string
,DirRef
>::iterator q
= nodes
.dir_map
.find(dirname
);
1482 ceph_assert(q
!= nodes
.dir_map
.end());
1483 map
<string
,FileRef
>::iterator r
= q
->second
->file_map
.find(filename
);
1484 ceph_assert(r
!= q
->second
->file_map
.end());
1485 ceph_assert(r
->second
->refs
> 0);
1487 q
->second
->file_map
.erase(r
);
1492 case bluefs_transaction_t::OP_DIR_CREATE
:
1496 dout(20) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
1497 << ": op_dir_create " << dirname
<< dendl
;
1498 if (unlikely(to_stdout
)) {
1499 std::cout
<< " 0x" << std::hex
<< pos
<< std::dec
1500 << ": op_dir_create " << dirname
<< std::endl
;
1504 map
<string
,DirRef
>::iterator q
= nodes
.dir_map
.find(dirname
);
1505 ceph_assert(q
== nodes
.dir_map
.end());
1506 nodes
.dir_map
[dirname
] = ceph::make_ref
<Dir
>();
1511 case bluefs_transaction_t::OP_DIR_REMOVE
:
1515 dout(20) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
1516 << ": op_dir_remove " << dirname
<< dendl
;
1517 if (unlikely(to_stdout
)) {
1518 std::cout
<< " 0x" << std::hex
<< pos
<< std::dec
1519 << ": op_dir_remove " << dirname
<< std::endl
;
1523 map
<string
,DirRef
>::iterator q
= nodes
.dir_map
.find(dirname
);
1524 ceph_assert(q
!= nodes
.dir_map
.end());
1525 ceph_assert(q
->second
->file_map
.empty());
1526 nodes
.dir_map
.erase(q
);
1531 case bluefs_transaction_t::OP_FILE_UPDATE
:
1533 bluefs_fnode_t fnode
;
1535 dout(20) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
1536 << ": op_file_update " << " " << fnode
<< " " << dendl
;
1537 if (unlikely(to_stdout
)) {
1538 std::cout
<< " 0x" << std::hex
<< pos
<< std::dec
1539 << ": op_file_update " << " " << fnode
<< std::endl
;
1542 FileRef f
= _get_file(fnode
.ino
);
1543 if (cct
->_conf
->bluefs_log_replay_check_allocations
) {
1544 int r
= _check_allocations(f
->fnode
,
1545 used_blocks
, false, "OP_FILE_UPDATE");
1550 if (fnode
.ino
!= 1) {
1551 vselector
->sub_usage(f
->vselector_hint
, f
->fnode
);
1554 if (fnode
.ino
!= 1) {
1555 vselector
->add_usage(f
->vselector_hint
, f
->fnode
);
1558 if (fnode
.ino
> ino_last
) {
1559 ino_last
= fnode
.ino
;
1561 if (cct
->_conf
->bluefs_log_replay_check_allocations
) {
1562 int r
= _check_allocations(f
->fnode
,
1563 used_blocks
, true, "OP_FILE_UPDATE");
1568 } else if (noop
&& fnode
.ino
== 1) {
1569 FileRef f
= _get_file(fnode
.ino
);
1574 case bluefs_transaction_t::OP_FILE_UPDATE_INC
:
1576 bluefs_fnode_delta_t delta
;
1578 dout(20) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
1579 << ": op_file_update_inc " << " " << delta
<< " " << dendl
;
1580 if (unlikely(to_stdout
)) {
1581 std::cout
<< " 0x" << std::hex
<< pos
<< std::dec
1582 << ": op_file_update_inc " << " " << delta
<< std::endl
;
1585 FileRef f
= _get_file(delta
.ino
);
1586 bluefs_fnode_t
& fnode
= f
->fnode
;
1587 if (delta
.offset
!= fnode
.allocated
) {
1588 derr
<< __func__
<< " invalid op_file_update_inc, new extents miss end of file"
1589 << " fnode=" << fnode
1590 << " delta=" << delta
1592 ceph_assert(delta
.offset
== fnode
.allocated
);
1594 if (cct
->_conf
->bluefs_log_replay_check_allocations
) {
1595 int r
= _check_allocations(fnode
,
1596 used_blocks
, false, "OP_FILE_UPDATE_INC");
1602 fnode
.ino
= delta
.ino
;
1603 fnode
.mtime
= delta
.mtime
;
1604 if (fnode
.ino
!= 1) {
1605 vselector
->sub_usage(f
->vselector_hint
, fnode
);
1607 fnode
.size
= delta
.size
;
1608 fnode
.claim_extents(delta
.extents
);
1609 dout(20) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
1610 << ": op_file_update_inc produced " << " " << fnode
<< " " << dendl
;
1612 if (fnode
.ino
!= 1) {
1613 vselector
->add_usage(f
->vselector_hint
, fnode
);
1616 if (fnode
.ino
> ino_last
) {
1617 ino_last
= fnode
.ino
;
1619 if (cct
->_conf
->bluefs_log_replay_check_allocations
) {
1620 int r
= _check_allocations(f
->fnode
,
1621 used_blocks
, true, "OP_FILE_UPDATE_INC");
1626 } else if (noop
&& delta
.ino
== 1) {
1627 // we need to track bluefs log, even in noop mode
1628 FileRef f
= _get_file(1);
1629 bluefs_fnode_t
& fnode
= f
->fnode
;
1630 fnode
.ino
= delta
.ino
;
1631 fnode
.mtime
= delta
.mtime
;
1632 fnode
.size
= delta
.size
;
1633 fnode
.claim_extents(delta
.extents
);
1638 case bluefs_transaction_t::OP_FILE_REMOVE
:
1642 dout(20) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
1643 << ": op_file_remove " << ino
<< dendl
;
1644 if (unlikely(to_stdout
)) {
1645 std::cout
<< " 0x" << std::hex
<< pos
<< std::dec
1646 << ": op_file_remove " << ino
<< std::endl
;
1650 auto p
= nodes
.file_map
.find(ino
);
1651 ceph_assert(p
!= nodes
.file_map
.end());
1652 vselector
->sub_usage(p
->second
->vselector_hint
, p
->second
->fnode
);
1653 if (cct
->_conf
->bluefs_log_replay_check_allocations
) {
1654 int r
= _check_allocations(p
->second
->fnode
,
1655 used_blocks
, false, "OP_FILE_REMOVE");
1660 nodes
.file_map
.erase(p
);
1666 derr
<< __func__
<< " 0x" << std::hex
<< pos
<< std::dec
1667 << ": stop: unrecognized op " << (int)op
<< dendl
;
1672 ceph_assert(p
.end());
1674 // we successfully replayed the transaction; bump the seq and log size
1676 log_file
->fnode
.size
= log_reader
->buf
.pos
;
1679 vselector
->add_usage(log_file
->vselector_hint
, log_file
->fnode
);
1680 log
.seq_live
= log_seq
+ 1;
1681 dirty
.seq_live
= log_seq
+ 1;
1682 log
.t
.seq
= log
.seq_live
;
1683 dirty
.seq_stable
= log_seq
;
1686 dout(10) << __func__
<< " log file size was 0x"
1687 << std::hex
<< log_file
->fnode
.size
<< std::dec
<< dendl
;
1688 if (unlikely(to_stdout
)) {
1689 std::cout
<< " log file size was 0x"
1690 << std::hex
<< log_file
->fnode
.size
<< std::dec
<< std::endl
;
1696 // verify file link counts are all >0
1697 for (auto& p
: nodes
.file_map
) {
1698 if (p
.second
->refs
== 0 &&
1699 p
.second
->fnode
.ino
> 1) {
1700 derr
<< __func__
<< " file with link count 0: " << p
.second
->fnode
1706 // reflect file count in logger
1707 logger
->set(l_bluefs_num_files
, nodes
.file_map
.size());
1709 dout(10) << __func__
<< " done" << dendl
;
1713 int BlueFS::log_dump()
1715 // only dump log file's content
1716 ceph_assert(log
.writer
== nullptr && "cannot log_dump on mounted BlueFS");
1718 int r
= _open_super();
1720 derr
<< __func__
<< " failed to open super: " << cpp_strerror(r
) << dendl
;
1723 r
= _replay(true, true);
1725 derr
<< __func__
<< " failed to replay log: " << cpp_strerror(r
) << dendl
;
1728 super
= bluefs_super_t();
1732 int BlueFS::device_migrate_to_existing(
1734 const set
<int>& devs_source
,
1736 const bluefs_layout_t
& layout
)
1739 bool buffered
= cct
->_conf
->bluefs_buffered_io
;
1741 dout(10) << __func__
<< " devs_source " << devs_source
1742 << " dev_target " << dev_target
<< dendl
;
1743 assert(dev_target
< (int)MAX_BDEV
);
1746 flags
|= devs_source
.count(BDEV_DB
) ?
1747 (REMOVE_DB
| RENAME_SLOW2DB
) : 0;
1748 flags
|= devs_source
.count(BDEV_WAL
) ? REMOVE_WAL
: 0;
1749 int dev_target_new
= dev_target
;
1751 // Slow device without separate DB one is addressed via BDEV_DB
1752 // Hence need renaming.
1753 if ((flags
& REMOVE_DB
) && dev_target
== BDEV_SLOW
) {
1754 dev_target_new
= BDEV_DB
;
1755 dout(0) << __func__
<< " super to be written to " << dev_target
<< dendl
;
1758 for (auto& [ino
, file_ref
] : nodes
.file_map
) {
1763 dout(10) << __func__
<< " " << ino
<< " " << file_ref
->fnode
<< dendl
;
1765 vselector
->sub_usage(file_ref
->vselector_hint
, file_ref
->fnode
);
1767 bool rewrite
= std::any_of(
1768 file_ref
->fnode
.extents
.begin(),
1769 file_ref
->fnode
.extents
.end(),
1771 return ext
.bdev
!= dev_target
&& devs_source
.count(ext
.bdev
);
1774 dout(10) << __func__
<< " migrating" << dendl
;
1775 bluefs_fnode_t old_fnode
;
1776 old_fnode
.swap_extents(file_ref
->fnode
);
1777 auto& old_fnode_extents
= old_fnode
.extents
;
1780 for (const auto &old_ext
: old_fnode_extents
) {
1781 buf
.resize(old_ext
.length
);
1782 int r
= _bdev_read_random(old_ext
.bdev
,
1788 derr
<< __func__
<< " failed to read 0x" << std::hex
1789 << old_ext
.offset
<< "~" << old_ext
.length
<< std::dec
1790 << " from " << (int)dev_target
<< dendl
;
1793 bl
.append((char*)&buf
[0], old_ext
.length
);
1796 // write entire file
1797 auto l
= _allocate(dev_target
, bl
.length(), 0,
1798 &file_ref
->fnode
, 0, false);
1800 derr
<< __func__
<< " unable to allocate len 0x" << std::hex
1801 << bl
.length() << std::dec
<< " from " << (int)dev_target
1802 << ": " << cpp_strerror(l
) << dendl
;
1807 for (auto& i
: file_ref
->fnode
.extents
) {
1809 uint64_t cur_len
= std::min
<uint64_t>(i
.length
, bl
.length() - off
);
1810 ceph_assert(cur_len
> 0);
1811 cur
.substr_of(bl
, off
, cur_len
);
1812 int r
= bdev
[dev_target
]->write(i
.offset
, cur
, buffered
);
1813 ceph_assert(r
== 0);
1817 // release old extents
1818 for (const auto &old_ext
: old_fnode_extents
) {
1819 PExtentVector to_release
;
1820 to_release
.emplace_back(old_ext
.offset
, old_ext
.length
);
1821 alloc
[old_ext
.bdev
]->release(to_release
);
1822 if (is_shared_alloc(old_ext
.bdev
)) {
1823 shared_alloc
->bluefs_used
-= to_release
.size();
1828 for (auto& i
: file_ref
->fnode
.extents
) {
1829 i
.bdev
= dev_target_new
;
1832 for (auto& ext
: file_ref
->fnode
.extents
) {
1833 if (dev_target
!= dev_target_new
&& ext
.bdev
== dev_target
) {
1834 dout(20) << __func__
<< " " << " ... adjusting extent 0x"
1835 << std::hex
<< ext
.offset
<< std::dec
1836 << " bdev " << dev_target
<< " -> " << dev_target_new
1838 ext
.bdev
= dev_target_new
;
1842 vselector
->add_usage(file_ref
->vselector_hint
, file_ref
->fnode
);
1844 // new logging device in the current naming scheme
1845 int new_log_dev_cur
= bdev
[BDEV_WAL
] ?
1847 bdev
[BDEV_DB
] ? BDEV_DB
: BDEV_SLOW
;
1849 // new logging device in new naming scheme
1850 int new_log_dev_next
= new_log_dev_cur
;
1852 if (devs_source
.count(new_log_dev_cur
)) {
1853 // SLOW device is addressed via BDEV_DB too hence either WAL or DB
1854 new_log_dev_next
= (flags
& REMOVE_WAL
) || !bdev
[BDEV_WAL
] ?
1858 dout(0) << __func__
<< " log moved from " << new_log_dev_cur
1859 << " to " << new_log_dev_next
<< dendl
;
1862 (flags
& REMOVE_DB
) && new_log_dev_next
== BDEV_DB
?
1867 _rewrite_log_and_layout_sync_LNF_LD(
1869 (flags
& REMOVE_DB
) ? BDEV_SLOW
: BDEV_DB
,
1877 int BlueFS::device_migrate_to_new(
1879 const set
<int>& devs_source
,
1881 const bluefs_layout_t
& layout
)
1884 bool buffered
= cct
->_conf
->bluefs_buffered_io
;
1886 dout(10) << __func__
<< " devs_source " << devs_source
1887 << " dev_target " << dev_target
<< dendl
;
1888 assert(dev_target
== (int)BDEV_NEWDB
|| dev_target
== (int)BDEV_NEWWAL
);
1892 flags
|= devs_source
.count(BDEV_DB
) ?
1893 (!bdev
[BDEV_SLOW
] ? RENAME_DB2SLOW
: REMOVE_DB
) :
1895 flags
|= devs_source
.count(BDEV_WAL
) ? REMOVE_WAL
: 0;
1896 int dev_target_new
= dev_target
; //FIXME: remove, makes no sense
1898 for (auto& [ino
, file_ref
] : nodes
.file_map
) {
1903 dout(10) << __func__
<< " " << ino
<< " " << file_ref
->fnode
<< dendl
;
1905 vselector
->sub_usage(file_ref
->vselector_hint
, file_ref
->fnode
);
1907 bool rewrite
= std::any_of(
1908 file_ref
->fnode
.extents
.begin(),
1909 file_ref
->fnode
.extents
.end(),
1911 return ext
.bdev
!= dev_target
&& devs_source
.count(ext
.bdev
);
1914 dout(10) << __func__
<< " migrating" << dendl
;
1915 bluefs_fnode_t old_fnode
;
1916 old_fnode
.swap_extents(file_ref
->fnode
);
1917 auto& old_fnode_extents
= old_fnode
.extents
;
1920 for (const auto &old_ext
: old_fnode_extents
) {
1921 buf
.resize(old_ext
.length
);
1922 int r
= _bdev_read_random(old_ext
.bdev
,
1928 derr
<< __func__
<< " failed to read 0x" << std::hex
1929 << old_ext
.offset
<< "~" << old_ext
.length
<< std::dec
1930 << " from " << (int)dev_target
<< dendl
;
1933 bl
.append((char*)&buf
[0], old_ext
.length
);
1936 // write entire file
1937 auto l
= _allocate(dev_target
, bl
.length(), 0,
1938 &file_ref
->fnode
, 0, false);
1940 derr
<< __func__
<< " unable to allocate len 0x" << std::hex
1941 << bl
.length() << std::dec
<< " from " << (int)dev_target
1942 << ": " << cpp_strerror(l
) << dendl
;
1947 for (auto& i
: file_ref
->fnode
.extents
) {
1949 uint64_t cur_len
= std::min
<uint64_t>(i
.length
, bl
.length() - off
);
1950 ceph_assert(cur_len
> 0);
1951 cur
.substr_of(bl
, off
, cur_len
);
1952 int r
= bdev
[dev_target
]->write(i
.offset
, cur
, buffered
);
1953 ceph_assert(r
== 0);
1957 // release old extents
1958 for (const auto &old_ext
: old_fnode_extents
) {
1959 PExtentVector to_release
;
1960 to_release
.emplace_back(old_ext
.offset
, old_ext
.length
);
1961 alloc
[old_ext
.bdev
]->release(to_release
);
1962 if (is_shared_alloc(old_ext
.bdev
)) {
1963 shared_alloc
->bluefs_used
-= to_release
.size();
1968 for (auto& i
: file_ref
->fnode
.extents
) {
1969 i
.bdev
= dev_target_new
;
1973 // new logging device in the current naming scheme
1974 int new_log_dev_cur
=
1977 bdev
[BDEV_WAL
] && !(flags
& REMOVE_WAL
) ?
1981 bdev
[BDEV_DB
] && !(flags
& REMOVE_DB
)?
1985 // new logging device in new naming scheme
1986 int new_log_dev_next
=
1987 new_log_dev_cur
== BDEV_NEWWAL
?
1989 new_log_dev_cur
== BDEV_NEWDB
?
1994 dev_target
== BDEV_NEWDB
?
2000 _rewrite_log_and_layout_sync_LNF_LD(
2010 BlueFS::FileRef
BlueFS::_get_file(uint64_t ino
)
2012 auto p
= nodes
.file_map
.find(ino
);
2013 if (p
== nodes
.file_map
.end()) {
2014 FileRef f
= ceph::make_ref
<File
>();
2015 nodes
.file_map
[ino
] = f
;
2016 // track files count in logger
2017 logger
->set(l_bluefs_num_files
, nodes
.file_map
.size());
2018 dout(30) << __func__
<< " ino " << ino
<< " = " << f
2019 << " (new)" << dendl
;
2022 dout(30) << __func__
<< " ino " << ino
<< " = " << p
->second
<< dendl
;
2029 To modify fnode both FileWriter::lock and File::lock must be obtained.
2030 The special case is when we modify bluefs log (ino 1) or
2031 we are compacting log (ino 0).
2033 In any case it is enough to hold File::lock to be sure fnode will not be modified.
2035 struct lock_fnode_print
{
2036 BlueFS::FileRef file
;
2037 lock_fnode_print(BlueFS::FileRef file
) : file(file
) {};
2039 std::ostream
& operator<<(std::ostream
& out
, const lock_fnode_print
& to_lock
) {
2040 std::lock_guard
l(to_lock
.file
->lock
);
2041 out
<< to_lock
.file
->fnode
;
2045 void BlueFS::_drop_link_D(FileRef file
)
2047 dout(20) << __func__
<< " had refs " << file
->refs
2048 << " on " << lock_fnode_print(file
) << dendl
;
2049 ceph_assert(file
->refs
> 0);
2050 ceph_assert(ceph_mutex_is_locked(log
.lock
));
2051 ceph_assert(ceph_mutex_is_locked(nodes
.lock
));
2054 if (file
->refs
== 0) {
2055 dout(20) << __func__
<< " destroying " << file
->fnode
<< dendl
;
2056 ceph_assert(file
->num_reading
.load() == 0);
2057 vselector
->sub_usage(file
->vselector_hint
, file
->fnode
);
2058 log
.t
.op_file_remove(file
->fnode
.ino
);
2059 nodes
.file_map
.erase(file
->fnode
.ino
);
2060 logger
->set(l_bluefs_num_files
, nodes
.file_map
.size());
2061 file
->deleted
= true;
2063 std::lock_guard
dl(dirty
.lock
);
2064 for (auto& r
: file
->fnode
.extents
) {
2065 dirty
.pending_release
[r
.bdev
].insert(r
.offset
, r
.length
);
2067 if (file
->dirty_seq
> dirty
.seq_stable
) {
2068 // retract request to serialize changes
2069 ceph_assert(dirty
.files
.count(file
->dirty_seq
));
2070 auto it
= dirty
.files
[file
->dirty_seq
].iterator_to(*file
);
2071 dirty
.files
[file
->dirty_seq
].erase(it
);
2072 file
->dirty_seq
= dirty
.seq_stable
;
2077 int64_t BlueFS::_read_random(
2078 FileReader
*h
, ///< [in] read from here
2079 uint64_t off
, ///< [in] offset
2080 uint64_t len
, ///< [in] this many bytes
2081 char *out
) ///< [out] copy it here
2083 auto* buf
= &h
->buf
;
2086 dout(10) << __func__
<< " h " << h
2087 << " 0x" << std::hex
<< off
<< "~" << len
<< std::dec
2088 << " from " << lock_fnode_print(h
->file
) << dendl
;
2090 ++h
->file
->num_reading
;
2092 if (!h
->ignore_eof
&&
2093 off
+ len
> h
->file
->fnode
.size
) {
2094 if (off
> h
->file
->fnode
.size
)
2097 len
= h
->file
->fnode
.size
- off
;
2098 dout(20) << __func__
<< " reaching (or past) eof, len clipped to 0x"
2099 << std::hex
<< len
<< std::dec
<< dendl
;
2101 logger
->inc(l_bluefs_read_random_count
, 1);
2102 logger
->inc(l_bluefs_read_random_bytes
, len
);
2104 std::shared_lock
s_lock(h
->lock
);
2105 buf
->bl
.reassign_to_mempool(mempool::mempool_bluefs_file_reader
);
2107 if (off
< buf
->bl_off
|| off
>= buf
->get_buf_end()) {
2110 auto p
= h
->file
->fnode
.seek(off
, &x_off
);
2111 ceph_assert(p
!= h
->file
->fnode
.extents
.end());
2112 uint64_t l
= std::min(p
->length
- x_off
, len
);
2114 l
= std::min(l
, uint64_t(1) << 30);
2115 dout(20) << __func__
<< " read random 0x"
2116 << std::hex
<< x_off
<< "~" << l
<< std::dec
2117 << " of " << *p
<< dendl
;
2119 if (!cct
->_conf
->bluefs_check_for_zeros
) {
2120 r
= _bdev_read_random(p
->bdev
, p
->offset
+ x_off
, l
, out
,
2121 cct
->_conf
->bluefs_buffered_io
);
2123 r
= _read_random_and_check(p
->bdev
, p
->offset
+ x_off
, l
, out
,
2124 cct
->_conf
->bluefs_buffered_io
);
2126 ceph_assert(r
== 0);
2132 logger
->inc(l_bluefs_read_random_disk_count
, 1);
2133 logger
->inc(l_bluefs_read_random_disk_bytes
, l
);
2138 auto left
= buf
->get_buf_remaining(off
);
2139 int64_t r
= std::min(len
, left
);
2140 logger
->inc(l_bluefs_read_random_buffer_count
, 1);
2141 logger
->inc(l_bluefs_read_random_buffer_bytes
, r
);
2142 dout(20) << __func__
<< " left 0x" << std::hex
<< left
2143 << " 0x" << off
<< "~" << len
<< std::dec
2146 auto p
= buf
->bl
.begin();
2147 p
.seek(off
- buf
->bl_off
);
2151 dout(30) << __func__
<< " result chunk (0x"
2152 << std::hex
<< r
<< std::dec
<< " bytes):\n";
2154 t
.substr_of(buf
->bl
, off
- buf
->bl_off
, r
);
2164 dout(20) << __func__
<< std::hex
2166 << std::dec
<< dendl
;
2167 --h
->file
->num_reading
;
2171 int64_t BlueFS::_read(
2172 FileReader
*h
, ///< [in] read from here
2173 uint64_t off
, ///< [in] offset
2174 size_t len
, ///< [in] this many bytes
2175 bufferlist
*outbl
, ///< [out] optional: reference the result here
2176 char *out
) ///< [out] optional: or copy it here
2178 FileReaderBuffer
*buf
= &(h
->buf
);
2180 bool prefetch
= !outbl
&& !out
;
2181 dout(10) << __func__
<< " h " << h
2182 << " 0x" << std::hex
<< off
<< "~" << len
<< std::dec
2183 << " from " << lock_fnode_print(h
->file
)
2184 << (prefetch
? " prefetch" : "")
2187 ++h
->file
->num_reading
;
2189 if (!h
->ignore_eof
&&
2190 off
+ len
> h
->file
->fnode
.size
) {
2191 if (off
> h
->file
->fnode
.size
)
2194 len
= h
->file
->fnode
.size
- off
;
2195 dout(20) << __func__
<< " reaching (or past) eof, len clipped to 0x"
2196 << std::hex
<< len
<< std::dec
<< dendl
;
2198 logger
->inc(l_bluefs_read_count
, 1);
2199 logger
->inc(l_bluefs_read_bytes
, len
);
2201 logger
->inc(l_bluefs_read_prefetch_count
, 1);
2202 logger
->inc(l_bluefs_read_prefetch_bytes
, len
);
2209 std::shared_lock
s_lock(h
->lock
);
2212 if (off
< buf
->bl_off
|| off
>= buf
->get_buf_end()) {
2214 std::unique_lock
u_lock(h
->lock
);
2215 buf
->bl
.reassign_to_mempool(mempool::mempool_bluefs_file_reader
);
2216 if (off
< buf
->bl_off
|| off
>= buf
->get_buf_end()) {
2217 // if precondition hasn't changed during locking upgrade.
2219 buf
->bl_off
= off
& super
.block_mask();
2221 auto p
= h
->file
->fnode
.seek(buf
->bl_off
, &x_off
);
2222 if (p
== h
->file
->fnode
.extents
.end()) {
2223 dout(5) << __func__
<< " reading less then required "
2224 << ret
<< "<" << ret
+ len
<< dendl
;
2228 uint64_t want
= round_up_to(len
+ (off
& ~super
.block_mask()),
2230 want
= std::max(want
, buf
->max_prefetch
);
2231 uint64_t l
= std::min(p
->length
- x_off
, want
);
2233 l
= std::min(l
, uint64_t(1) << 30);
2234 uint64_t eof_offset
= round_up_to(h
->file
->fnode
.size
, super
.block_size
);
2235 if (!h
->ignore_eof
&&
2236 buf
->bl_off
+ l
> eof_offset
) {
2237 l
= eof_offset
- buf
->bl_off
;
2239 dout(20) << __func__
<< " fetching 0x"
2240 << std::hex
<< x_off
<< "~" << l
<< std::dec
2241 << " of " << *p
<< dendl
;
2243 // when reading BlueFS log (only happens on startup) use non-buffered io
2244 // it makes it in sync with logic in _flush_range()
2245 bool use_buffered_io
= h
->file
->fnode
.ino
== 1 ? false : cct
->_conf
->bluefs_buffered_io
;
2246 if (!cct
->_conf
->bluefs_check_for_zeros
) {
2247 r
= _bdev_read(p
->bdev
, p
->offset
+ x_off
, l
, &buf
->bl
, ioc
[p
->bdev
],
2250 r
= _read_and_check(
2251 p
->bdev
, p
->offset
+ x_off
, l
, &buf
->bl
, ioc
[p
->bdev
],
2254 logger
->inc(l_bluefs_read_disk_count
, 1);
2255 logger
->inc(l_bluefs_read_disk_bytes
, l
);
2257 ceph_assert(r
== 0);
2261 // we should recheck if buffer is valid after lock downgrade
2264 left
= buf
->get_buf_remaining(off
);
2265 dout(20) << __func__
<< " left 0x" << std::hex
<< left
2266 << " len 0x" << len
<< std::dec
<< dendl
;
2268 int64_t r
= std::min(len
, left
);
2271 t
.substr_of(buf
->bl
, off
- buf
->bl_off
, r
);
2272 outbl
->claim_append(t
);
2275 auto p
= buf
->bl
.begin();
2276 p
.seek(off
- buf
->bl_off
);
2281 dout(30) << __func__
<< " result chunk (0x"
2282 << std::hex
<< r
<< std::dec
<< " bytes):\n";
2284 t
.substr_of(buf
->bl
, off
- buf
->bl_off
, r
);
2294 dout(20) << __func__
<< std::hex
2296 << std::dec
<< dendl
;
2297 ceph_assert(!outbl
|| (int)outbl
->length() == ret
);
2298 --h
->file
->num_reading
;
2302 void BlueFS::invalidate_cache(FileRef f
, uint64_t offset
, uint64_t length
)
2304 std::lock_guard
l(f
->lock
);
2305 dout(10) << __func__
<< " file " << f
->fnode
2306 << " 0x" << std::hex
<< offset
<< "~" << length
<< std::dec
2308 if (offset
& ~super
.block_mask()) {
2309 offset
&= super
.block_mask();
2310 length
= round_up_to(length
, super
.block_size
);
2313 auto p
= f
->fnode
.seek(offset
, &x_off
);
2314 while (length
> 0 && p
!= f
->fnode
.extents
.end()) {
2315 uint64_t x_len
= std::min(p
->length
- x_off
, length
);
2316 bdev
[p
->bdev
]->invalidate_cache(p
->offset
+ x_off
, x_len
);
2317 dout(20) << __func__
<< " 0x" << std::hex
<< x_off
<< "~" << x_len
2318 << std:: dec
<< " of " << *p
<< dendl
;
2325 uint64_t BlueFS::_estimate_transaction_size(bluefs_transaction_t
* t
)
2327 uint64_t max_alloc_size
= std::max(alloc_size
[BDEV_WAL
],
2328 std::max(alloc_size
[BDEV_DB
],
2329 alloc_size
[BDEV_SLOW
]));
2331 // conservative estimate for final encoded size
2332 return round_up_to(t
->op_bl
.length() + super
.block_size
* 2, max_alloc_size
);
2335 uint64_t BlueFS::_make_initial_transaction(uint64_t start_seq
,
2336 bluefs_fnode_t
& fnode
,
2337 uint64_t expected_final_size
,
2340 bluefs_transaction_t t0
;
2342 t0
.uuid
= super
.uuid
;
2344 t0
.op_file_update_inc(fnode
);
2345 t0
.op_jump(start_seq
, expected_final_size
); // this is a fixed size op,
2346 // hence it's valid with fake
2347 // params for overall txc size
2350 return _estimate_transaction_size(&t0
);
2353 ceph_assert(expected_final_size
> 0);
2354 out
->reserve(expected_final_size
);
2356 // make sure we're not wrong aboth the size
2357 ceph_assert(out
->length() <= expected_final_size
);
2358 _pad_bl(*out
, expected_final_size
);
2359 return expected_final_size
;
2362 uint64_t BlueFS::_estimate_log_size_N()
2364 std::lock_guard
nl(nodes
.lock
);
2365 int avg_dir_size
= 40; // fixme
2366 int avg_file_size
= 12;
2367 uint64_t size
= 4096 * 2;
2368 size
+= nodes
.file_map
.size() * (1 + sizeof(bluefs_fnode_t
));
2369 size
+= nodes
.dir_map
.size() + (1 + avg_dir_size
);
2370 size
+= nodes
.file_map
.size() * (1 + avg_dir_size
+ avg_file_size
);
2371 return round_up_to(size
, super
.block_size
);
2374 void BlueFS::compact_log()/*_LNF_LD_NF_D*/
2376 if (!cct
->_conf
->bluefs_replay_recovery_disable_compact
) {
2377 if (cct
->_conf
->bluefs_compact_log_sync
) {
2378 _compact_log_sync_LNF_LD();
2380 _compact_log_async_LD_LNF_D();
2385 bool BlueFS::_should_start_compact_log_L_N()
2387 if (log_is_compacting
.load() == true) {
2388 // compaction is already running
2393 std::lock_guard
ll(log
.lock
);
2394 current
= log
.writer
->file
->fnode
.size
;
2396 uint64_t expected
= _estimate_log_size_N();
2397 float ratio
= (float)current
/ (float)expected
;
2398 dout(10) << __func__
<< " current 0x" << std::hex
<< current
2399 << " expected " << expected
<< std::dec
2400 << " ratio " << ratio
2402 if (current
< cct
->_conf
->bluefs_log_compact_min_size
||
2403 ratio
< cct
->_conf
->bluefs_log_compact_min_ratio
) {
2409 void BlueFS::_compact_log_dump_metadata_NF(uint64_t start_seq
,
2410 bluefs_transaction_t
*t
,
2411 int bdev_update_flags
,
2412 uint64_t capture_before_seq
)
2414 dout(20) << __func__
<< dendl
;
2416 t
->uuid
= super
.uuid
;
2418 std::lock_guard
nl(nodes
.lock
);
2420 for (auto& [ino
, file_ref
] : nodes
.file_map
) {
2423 ceph_assert(ino
> 1);
2424 std::lock_guard
fl(file_ref
->lock
);
2425 if (bdev_update_flags
) {
2426 for(auto& e
: file_ref
->fnode
.extents
) {
2428 auto bdev_new
= bdev
;
2429 ceph_assert(!((bdev_update_flags
& REMOVE_WAL
) && bdev
== BDEV_WAL
));
2430 if ((bdev_update_flags
& RENAME_SLOW2DB
) && bdev
== BDEV_SLOW
) {
2433 if ((bdev_update_flags
& RENAME_DB2SLOW
) && bdev
== BDEV_DB
) {
2434 bdev_new
= BDEV_SLOW
;
2436 if (bdev
== BDEV_NEWDB
) {
2437 // REMOVE_DB xor RENAME_DB
2438 ceph_assert(!(bdev_update_flags
& REMOVE_DB
) != !(bdev_update_flags
& RENAME_DB2SLOW
));
2439 ceph_assert(!(bdev_update_flags
& RENAME_SLOW2DB
));
2442 if (bdev
== BDEV_NEWWAL
) {
2443 ceph_assert(bdev_update_flags
& REMOVE_WAL
);
2444 bdev_new
= BDEV_WAL
;
2449 if (capture_before_seq
== 0 || file_ref
->dirty_seq
< capture_before_seq
) {
2450 dout(20) << __func__
<< " op_file_update " << file_ref
->fnode
<< dendl
;
2452 dout(20) << __func__
<< " op_file_update just modified, dirty_seq="
2453 << file_ref
->dirty_seq
<< " " << file_ref
->fnode
<< dendl
;
2455 t
->op_file_update(file_ref
->fnode
);
2457 for (auto& [path
, dir_ref
] : nodes
.dir_map
) {
2458 dout(20) << __func__
<< " op_dir_create " << path
<< dendl
;
2459 t
->op_dir_create(path
);
2460 for (auto& [fname
, file_ref
] : dir_ref
->file_map
) {
2461 dout(20) << __func__
<< " op_dir_link " << path
<< "/" << fname
2462 << " to " << file_ref
->fnode
.ino
<< dendl
;
2463 t
->op_dir_link(path
, fname
, file_ref
->fnode
.ino
);
2468 void BlueFS::_compact_log_sync_LNF_LD()
2470 dout(10) << __func__
<< dendl
;
2471 uint8_t prefer_bdev
;
2473 std::lock_guard
ll(log
.lock
);
2475 vselector
->select_prefer_bdev(log
.writer
->file
->vselector_hint
);
2477 _rewrite_log_and_layout_sync_LNF_LD(true,
2482 super
.memorized_layout
);
2483 logger
->inc(l_bluefs_log_compactions
);
2487 * SYNC LOG COMPACTION
2489 * 0. Lock the log completely through the whole procedure
2491 * 1. Build new log. It will include log's starter and compacted metadata
2492 * body. Jump op appended to the starter will link the pieces together.
2494 * 2. Write out new log's content
2496 * 3. Write out new superblock. This includes relevant device layout update.
2498 * 4. Finalization. Old space release.
2501 void BlueFS::_rewrite_log_and_layout_sync_LNF_LD(bool permit_dev_fallback
,
2506 std::optional
<bluefs_layout_t
> layout
)
2508 // we substitute log_dev with log_dev_new for new allocations below
2509 // and permitting fallback allocations prevents such a substitution
2510 ceph_assert((permit_dev_fallback
&& log_dev
== log_dev_new
) ||
2511 !permit_dev_fallback
);
2513 dout(10) << __func__
<< " super_dev:" << super_dev
2514 << " log_dev:" << log_dev
2515 << " log_dev_new:" << log_dev_new
2516 << " flags:" << flags
2517 << " seq:" << log
.seq_live
2519 utime_t mtime
= ceph_clock_now();
2520 uint64_t starter_seq
= 1;
2523 // Lock the log totally till the end of the procedure
2524 std::lock_guard
ll(log
.lock
);
2525 auto t0
= mono_clock::now();
2527 File
*log_file
= log
.writer
->file
.get();
2528 bluefs_fnode_t fnode_tail
;
2529 // log.t.seq is always set to current live seq
2530 ceph_assert(log
.t
.seq
== log
.seq_live
);
2531 // Capturing entire state. Dump anything that has been stored there.
2533 log
.t
.seq
= log
.seq_live
;
2534 // From now on, no changes to log.t are permitted until we finish rewriting log.
2535 // Can allow dirty to remain dirty - log.seq_live will not change.
2539 // Build new log starter and compacted metadata body
2540 // 1.1. Build full compacted meta transaction.
2541 // Encode a bluefs transaction that dumps all of the in-memory fnodes
2543 // This might be pretty large and its allocation map can exceed
2544 // superblock size. Hence instead we'll need log starter part which
2545 // goes to superblock and refers that new meta through op_update_inc.
2546 // 1.2. Allocate space for the above transaction
2547 // using its size estimation.
2548 // 1.3. Allocate the space required for the starter part of the new log.
2549 // It should be small enough to fit into superblock.
2550 // 1.4 Building new log persistent fnode representation which will
2551 // finally land to disk.
2552 // Depending on input parameters we might need to perform device ids
2553 // rename - runtime and persistent replicas should be different when we
2554 // are in the device migration process.
2555 // 1.5 Store starter fnode to run-time superblock, to be written out later.
2556 // It doesn't contain compacted meta so that the relevant allocation map fits into
2558 // 1.6 Proceed building new log persistent fnode representation.
2559 // Will add log tail with compacted meta extents from 1.1.
2560 // Device rename applied as well
2562 // 1.7. Encode new log fnode starter,
2563 // It will include op_init, new log's op_update_inc
2564 // and jump to the compacted meta transaction beginning.
2565 // Superblock will reference this starter part
2567 // 1.8. Encode compacted meta transaction,
2568 // extend the transaction with a jump to proper sequence no
2572 // 1.1 Build full compacted meta transaction
2573 bluefs_transaction_t compacted_meta_t
;
2574 _compact_log_dump_metadata_NF(starter_seq
+ 1, &compacted_meta_t
, flags
, 0);
2576 // 1.2 Allocate the space required for the compacted meta transaction
2577 uint64_t compacted_meta_need
=
2578 _estimate_transaction_size(&compacted_meta_t
) +
2579 cct
->_conf
->bluefs_max_log_runway
;
2581 dout(20) << __func__
<< " compacted_meta_need " << compacted_meta_need
<< dendl
;
2583 int r
= _allocate(log_dev
, compacted_meta_need
, 0, &fnode_tail
, 0,
2584 permit_dev_fallback
);
2585 ceph_assert(r
== 0);
2588 // 1.3 Allocate the space required for the starter part of the new log.
2589 // estimate new log fnode size to be referenced from superblock
2590 // hence use dummy fnode and jump parameters
2591 uint64_t starter_need
= _make_initial_transaction(starter_seq
, fnode_tail
, 0, nullptr);
2593 bluefs_fnode_t
fnode_starter(log_file
->fnode
.ino
, 0, mtime
);
2594 r
= _allocate(log_dev
, starter_need
, 0, &fnode_starter
, 0,
2595 permit_dev_fallback
);
2596 ceph_assert(r
== 0);
2598 // 1.4 Building starter fnode
2599 bluefs_fnode_t
fnode_persistent(fnode_starter
.ino
, 0, mtime
);
2600 for (auto p
: fnode_starter
.extents
) {
2601 // rename device if needed - this is possible when fallback allocations
2602 // are prohibited only. Which means every extent is targeted to the same
2603 // device and we can unconditionally update them.
2604 if (log_dev
!= log_dev_new
) {
2605 dout(10) << __func__
<< " renaming log extents to "
2606 << log_dev_new
<< dendl
;
2607 p
.bdev
= log_dev_new
;
2609 fnode_persistent
.append_extent(p
);
2612 // 1.5 Store starter fnode to run-time superblock, to be written out later
2613 super
.log_fnode
= fnode_persistent
;
2615 // 1.6 Proceed building new log persistent fnode representation
2616 // we'll build incremental update starting from this point
2617 fnode_persistent
.reset_delta();
2618 for (auto p
: fnode_tail
.extents
) {
2619 // rename device if needed - this is possible when fallback allocations
2620 // are prohibited only. Which means every extent is targeted to the same
2621 // device and we can unconditionally update them.
2622 if (log_dev
!= log_dev_new
) {
2623 dout(10) << __func__
<< " renaming log extents to "
2624 << log_dev_new
<< dendl
;
2625 p
.bdev
= log_dev_new
;
2627 fnode_persistent
.append_extent(p
);
2630 // 1.7 Encode new log fnode
2631 // This will flush incremental part of fnode_persistent only.
2632 bufferlist starter_bl
;
2633 _make_initial_transaction(starter_seq
, fnode_persistent
, starter_need
, &starter_bl
);
2635 // 1.8 Encode compacted meta transaction
2636 dout(20) << __func__
<< " op_jump_seq " << log
.seq_live
<< dendl
;
2637 // hopefully "compact_meta_need" estimation provides enough extra space
2638 // for this op, assert below if not
2639 compacted_meta_t
.op_jump_seq(log
.seq_live
);
2641 bufferlist compacted_meta_bl
;
2642 encode(compacted_meta_t
, compacted_meta_bl
);
2643 _pad_bl(compacted_meta_bl
);
2644 ceph_assert(compacted_meta_bl
.length() <= compacted_meta_need
);
2648 // Write out new log's content
2649 // 2.1. Build the full runtime new log's fnode
2651 // 2.2. Write out new log's
2653 // 2.3. Do flush and wait for completion through flush_bdev()
2655 // 2.4. Finalize log update
2656 // Update all sequence numbers
2659 // 2.1 Build the full runtime new log's fnode
2660 bluefs_fnode_t old_log_fnode
;
2661 old_log_fnode
.swap(fnode_starter
);
2662 old_log_fnode
.clone_extents(fnode_tail
);
2663 old_log_fnode
.reset_delta();
2664 log_file
->fnode
.swap(old_log_fnode
);
2666 // 2.2 Write out new log's content
2667 // Get rid off old writer
2668 _close_writer(log
.writer
);
2669 // Make new log writer and stage new log's content writing
2670 log
.writer
= _create_writer(log_file
);
2671 log
.writer
->append(starter_bl
);
2672 log
.writer
->append(compacted_meta_bl
);
2674 // 2.3 Do flush and wait for completion through flush_bdev()
2675 _flush_special(log
.writer
);
2677 if (!cct
->_conf
->bluefs_sync_write
) {
2678 list
<aio_t
> completed_ios
;
2679 _claim_completed_aios(log
.writer
, &completed_ios
);
2680 _wait_for_aio(log
.writer
);
2681 completed_ios
.clear();
2686 // 2.4 Finalize log update
2688 dirty
.seq_live
= log
.seq_live
;
2689 log
.t
.seq
= log
.seq_live
;
2690 vselector
->sub_usage(log_file
->vselector_hint
, old_log_fnode
);
2691 vselector
->add_usage(log_file
->vselector_hint
, log_file
->fnode
);
2694 // Write out new superblock to reflect all the changes.
2697 super
.memorized_layout
= layout
;
2698 _write_super(super_dev
);
2701 // we're mostly done
2702 dout(10) << __func__
<< " log extents " << log_file
->fnode
.extents
<< dendl
;
2703 logger
->inc(l_bluefs_log_compactions
);
2706 // Finalization. Release old space.
2709 dout(10) << __func__
2710 << " release old log extents " << old_log_fnode
.extents
2712 std::lock_guard
dl(dirty
.lock
);
2713 for (auto& r
: old_log_fnode
.extents
) {
2714 dirty
.pending_release
[r
.bdev
].insert(r
.offset
, r
.length
);
2717 logger
->tinc(l_bluefs_compaction_lock_lat
, mono_clock::now() - t0
);
2721 * ASYNC LOG COMPACTION
2723 * 0. Lock the log and forbid its extension. The former covers just
2724 * a part of the below procedure while the latter spans over it
2726 * 1. Allocate a new extent to continue the log, and then log an event
2727 * that jumps the log write position to the new extent. At this point, the
2728 * old extent(s) won't be written to, and reflect everything to compact.
2729 * New events will be written to the new region that we'll keep.
2730 * The latter will finally become new log tail on compaction completion.
2732 * 2. Build new log. It will include log's starter, compacted metadata
2733 * body and the above tail. Jump ops appended to the starter and meta body
2734 * will link the pieces together. The log's lock is released in the middle of the
2735 * process to permit parallel access to it.
2737 * 3. Write out new log's content.
2739 * 4. Write out new superblock to reflect all the changes.
2741 * 5. Apply new log fnode, log is locked for a while.
2743 * 6. Finalization. Clean up, old space release and total unlocking.
2746 void BlueFS::_compact_log_async_LD_LNF_D() //also locks FW for new_writer
2748 dout(10) << __func__
<< dendl
;
2749 utime_t mtime
= ceph_clock_now();
2750 uint64_t starter_seq
= 1;
2751 uint64_t old_log_jump_to
= 0;
2754 // Lock the log and forbid its expansion and other compactions
2756 // only one compaction allowed at one time
2757 bool old_is_comp
= std::atomic_exchange(&log_is_compacting
, true);
2759 dout(10) << __func__
<< " ongoing" <<dendl
;
2762 // lock log's run-time structures for a while
2764 auto t0
= mono_clock::now();
2767 // Prepare current log for jumping into it.
2768 // 1. Allocate extent
2769 // 2. Update op to log
2770 // 3. Jump op to log
2771 // During that, no one else can write to log, otherwise we risk jumping backwards.
2772 // We need to sync log, because we are injecting discontinuity, and writer is not prepared for that.
2774 //signal _maybe_extend_log that expansion of log is temporarily unacceptable
2775 bool old_forbidden
= atomic_exchange(&log_forbidden_to_expand
, true);
2776 ceph_assert(old_forbidden
== false);
2780 // Prepare current log for jumping into it.
2781 // 1.1. Allocate extent
2782 // 1.2. Save log's fnode extents and add new extents
2783 // 1.3. Update op to log
2784 // 1.4. Jump op to log
2785 // During that, no one else can write to log, otherwise we risk jumping backwards.
2786 // We need to sync log, because we are injecting discontinuity, and writer is not prepared for that.
2788 // 1.1 allocate new log extents and store them at fnode_tail
2789 File
*log_file
= log
.writer
->file
.get();
2790 old_log_jump_to
= log_file
->fnode
.get_allocated();
2791 bluefs_fnode_t fnode_tail
;
2792 uint64_t runway
= log_file
->fnode
.get_allocated() - log
.writer
->get_effective_write_pos();
2793 dout(10) << __func__
<< " old_log_jump_to 0x" << std::hex
<< old_log_jump_to
2794 << " need 0x" << cct
->_conf
->bluefs_max_log_runway
<< std::dec
<< dendl
;
2795 int r
= _allocate(vselector
->select_prefer_bdev(log_file
->vselector_hint
),
2796 cct
->_conf
->bluefs_max_log_runway
,
2799 ceph_assert(r
== 0);
2801 // 1.2 save log's fnode extents and add new extents
2802 bluefs_fnode_t
old_log_fnode(log_file
->fnode
);
2803 log_file
->fnode
.clone_extents(fnode_tail
);
2804 //adjust usage as flush below will need it
2805 vselector
->sub_usage(log_file
->vselector_hint
, old_log_fnode
);
2806 vselector
->add_usage(log_file
->vselector_hint
, log_file
->fnode
);
2807 dout(10) << __func__
<< " log extents " << log_file
->fnode
.extents
<< dendl
;
2809 // 1.3 update the log file change and log a jump to the offset where we want to
2810 // write the new entries
2811 log
.t
.op_file_update_inc(log_file
->fnode
);
2813 // 1.4 jump to new position should mean next seq
2814 log
.t
.op_jump(log
.seq_live
+ 1, old_log_jump_to
);
2815 uint64_t seq_now
= log
.seq_live
;
2816 // we need to flush all bdev because we will be streaming all dirty files to log
2817 // TODO - think - if _flush_and_sync_log_jump will not add dirty files nor release pending allocations
2818 // then flush_bdev() will not be necessary
2820 _flush_and_sync_log_jump_D(old_log_jump_to
, runway
);
2824 // Build new log starter and compacted metadata body
2825 // 2.1. Build full compacted meta transaction.
2826 // While still holding the lock, encode a bluefs transaction
2827 // that dumps all of the in-memory fnodes and names.
2828 // This might be pretty large and its allocation map can exceed
2829 // superblock size. Hence instead we'll need log starter part which
2830 // goes to superblock and refers that new meta through op_update_inc.
2831 // 2.2. After releasing the lock allocate space for the above transaction
2832 // using its size estimation.
2833 // Then build tailing list of extents which consists of these
2834 // newly allocated extents followed by ones from Part 1.
2835 // 2.3. Allocate the space required for the starter part of the new log.
2836 // It should be small enough to fit into superblock.
2837 // Effectively we start building new log fnode here.
2838 // 2.4. Store starter fnode to run-time superblock, to be written out later
2839 // 2.5. Finalize new log's fnode building
2840 // This will include log's starter and tailing extents built at 2.2
2841 // 2.6. Encode new log fnode starter,
2842 // It will include op_init, new log's op_update_inc
2843 // and jump to the compacted meta transaction beginning.
2844 // Superblock will reference this starter part
2845 // 2.7. Encode compacted meta transaction,
2846 // extend the transaction with a jump to the log tail from 1.1 before
2850 // 2.1 Build full compacted meta transaction
2851 bluefs_transaction_t compacted_meta_t
;
2852 _compact_log_dump_metadata_NF(starter_seq
+ 1, &compacted_meta_t
, 0, seq_now
);
2854 // now state is captured to compacted_meta_t,
2855 // current log can be used to write to,
2856 //ops in log will be continuation of captured state
2857 logger
->tinc(l_bluefs_compaction_lock_lat
, mono_clock::now() - t0
);
2860 // 2.2 Allocate the space required for the compacted meta transaction
2861 uint64_t compacted_meta_need
= _estimate_transaction_size(&compacted_meta_t
);
2862 dout(20) << __func__
<< " compacted_meta_need " << compacted_meta_need
2865 bluefs_fnode_t fnode_pre_tail
;
2867 r
= _allocate(vselector
->select_prefer_bdev(log_file
->vselector_hint
),
2868 compacted_meta_need
,
2871 ceph_assert(r
== 0);
2872 // build trailing list of extents in fnode_tail,
2873 // this will include newly allocated extents for compacted meta
2874 // and aux extents allocated at step 1.1
2875 fnode_pre_tail
.claim_extents(fnode_tail
.extents
);
2876 fnode_tail
.swap_extents(fnode_pre_tail
);
2879 // 2.3 Allocate the space required for the starter part of the new log.
2880 // Start building New log fnode
2881 FileRef new_log
= nullptr;
2882 new_log
= ceph::make_ref
<File
>();
2883 new_log
->fnode
.ino
= log_file
->fnode
.ino
;
2884 new_log
->fnode
.mtime
= mtime
;
2885 // Estimate the required space
2886 uint64_t starter_need
=
2887 _make_initial_transaction(starter_seq
, fnode_tail
, 0, nullptr);
2888 // and now allocate and store at new_log_fnode
2889 r
= _allocate(vselector
->select_prefer_bdev(log_file
->vselector_hint
),
2893 ceph_assert(r
== 0);
2895 // 2.4 Store starter fnode to run-time superblock, to be written out later
2896 super
.log_fnode
= new_log
->fnode
;
2898 // 2.5 Finalize new log's fnode building
2899 // start collecting new log fnode updates (to make op_update_inc later)
2900 // since this point. This will include compacted meta from 2.2 and aux
2901 // extents from 1.1.
2902 new_log
->fnode
.reset_delta();
2903 new_log
->fnode
.claim_extents(fnode_tail
.extents
);
2905 // 2.6 Encode new log fnode
2906 bufferlist starter_bl
;
2907 _make_initial_transaction(starter_seq
, new_log
->fnode
, starter_need
,
2910 // 2.7 Encode compacted meta transaction,
2911 dout(20) << __func__
2912 << " new_log jump seq " << seq_now
2913 << std::hex
<< " offset 0x" << starter_need
+ compacted_meta_need
2914 << std::dec
<< dendl
;
2915 // Extend compacted_meta transaction with a jump to the new log tail.
2916 // Hopefully "compact_meta_need" estimation provides enough extra space
2917 // for this new jump, assert below if not
2918 compacted_meta_t
.op_jump(seq_now
, starter_need
+ compacted_meta_need
);
2919 // Now do encoding and padding
2920 bufferlist compacted_meta_bl
;
2921 compacted_meta_bl
.reserve(compacted_meta_need
);
2922 encode(compacted_meta_t
, compacted_meta_bl
);
2923 ceph_assert(compacted_meta_bl
.length() <= compacted_meta_need
);
2924 _pad_bl(compacted_meta_bl
, compacted_meta_need
);
2928 // Write out new log's content
2929 // 3.1 Stage new log's content writing
2930 // 3.2 Do flush and wait for completion through flush_bdev()
2933 // 3.1 Stage new log's content writing
2934 // Make new log writer and append bufferlists to write out.
2935 FileWriter
*new_log_writer
= _create_writer(new_log
);
2936 // And append all new log's bufferlists to write out.
2937 new_log_writer
->append(starter_bl
);
2938 new_log_writer
->append(compacted_meta_bl
);
2940 // 3.2. flush and wait
2941 _flush_special(new_log_writer
);
2942 _flush_bdev(new_log_writer
, false); // do not check log.lock is locked
2945 // Write out new superblock to reflect all the changes.
2948 _write_super(BDEV_DB
);
2952 // Apply new log fnode
2955 // we need to acquire log's lock back at this point
2957 // Reconstruct actual log object from the new one.
2958 vselector
->sub_usage(log_file
->vselector_hint
, log_file
->fnode
);
2959 log_file
->fnode
.size
=
2960 log
.writer
->pos
- old_log_jump_to
+ starter_need
+ compacted_meta_need
;
2961 log_file
->fnode
.mtime
= std::max(mtime
, log_file
->fnode
.mtime
);
2962 log_file
->fnode
.swap_extents(new_log
->fnode
);
2963 // update log's writer
2964 log
.writer
->pos
= log
.writer
->file
->fnode
.size
;
2965 vselector
->add_usage(log_file
->vselector_hint
, log_file
->fnode
);
2969 // we're mostly done
2970 dout(10) << __func__
<< " log extents " << log_file
->fnode
.extents
<< dendl
;
2971 logger
->inc(l_bluefs_log_compactions
);
2975 // 6.1 Permit log's extension, forbidden at step 0.
2977 // 6.2 Release the new log writer
2979 // 6.3 Release old space
2981 // 6.4. Enable other compactions
2984 // 6.1 Permit log's extension, forbidden at step 0.
2985 old_forbidden
= atomic_exchange(&log_forbidden_to_expand
, false);
2986 ceph_assert(old_forbidden
== true);
2987 //to wake up if someone was in need of expanding log
2988 log_cond
.notify_all();
2990 // 6.2 Release the new log writer
2991 _close_writer(new_log_writer
);
2992 new_log_writer
= nullptr;
2995 // 6.3 Release old space
2997 dout(10) << __func__
2998 << " release old log extents " << old_log_fnode
.extents
3000 std::lock_guard
dl(dirty
.lock
);
3001 for (auto& r
: old_log_fnode
.extents
) {
3002 dirty
.pending_release
[r
.bdev
].insert(r
.offset
, r
.length
);
3006 // 6.4. Enable other compactions
3007 old_is_comp
= atomic_exchange(&log_is_compacting
, false);
3008 ceph_assert(old_is_comp
);
3011 void BlueFS::_pad_bl(bufferlist
& bl
, uint64_t pad_size
)
3013 pad_size
= std::max(pad_size
, uint64_t(super
.block_size
));
3014 uint64_t partial
= bl
.length() % pad_size
;
3016 dout(10) << __func__
<< " padding with 0x" << std::hex
3017 << pad_size
- partial
<< " zeros" << std::dec
<< dendl
;
3018 bl
.append_zero(pad_size
- partial
);
3023 // Returns log seq that was live before advance.
3024 uint64_t BlueFS::_log_advance_seq()
3026 ceph_assert(ceph_mutex_is_locked(dirty
.lock
));
3027 ceph_assert(ceph_mutex_is_locked(log
.lock
));
3029 // this will became seq_stable once we write
3030 ceph_assert(dirty
.seq_stable
< dirty
.seq_live
);
3031 ceph_assert(log
.t
.seq
== log
.seq_live
);
3032 uint64_t seq
= log
.seq_live
;
3033 log
.t
.uuid
= super
.uuid
;
3037 ceph_assert(dirty
.seq_live
== log
.seq_live
);
3042 // Adds to log.t file modifications mentioned in `dirty.files`.
3043 // Note: some bluefs ops may have already been stored in log.t transaction.
3044 void BlueFS::_consume_dirty(uint64_t seq
)
3046 ceph_assert(ceph_mutex_is_locked(dirty
.lock
));
3047 ceph_assert(ceph_mutex_is_locked(log
.lock
));
3050 // we just incremented log_seq. It is now illegal to add to dirty.files[log_seq]
3051 auto lsi
= dirty
.files
.find(seq
);
3052 if (lsi
!= dirty
.files
.end()) {
3053 dout(20) << __func__
<< " " << lsi
->second
.size() << " dirty.files" << dendl
;
3054 for (auto &f
: lsi
->second
) {
3055 // fnode here is protected indirectly
3056 // the only path that adds to dirty.files goes from _fsync()
3057 // _fsync() is executed under writer lock,
3058 // and does not exit until syncing log is done
3059 dout(20) << __func__
<< " op_file_update_inc " << f
.fnode
<< dendl
;
3060 log
.t
.op_file_update_inc(f
.fnode
);
3065 // Extends log if its free space is smaller then bluefs_min_log_runway.
3066 // Returns space available *BEFORE* adding new space. Signed for additional <0 detection.
3067 int64_t BlueFS::_maybe_extend_log()
3069 ceph_assert(ceph_mutex_is_locked(log
.lock
));
3070 // allocate some more space (before we run out)?
3071 // BTW: this triggers `flush()` in the `page_aligned_appender` of `log.writer`.
3072 int64_t runway
= log
.writer
->file
->fnode
.get_allocated() -
3073 log
.writer
->get_effective_write_pos();
3074 if (runway
< (int64_t)cct
->_conf
->bluefs_min_log_runway
) {
3075 dout(10) << __func__
<< " allocating more log runway (0x"
3076 << std::hex
<< runway
<< std::dec
<< " remaining)" << dendl
;
3078 * Usually, when we are low on space in log, we just allocate new extent,
3079 * put update op(log) to log and we are fine.
3080 * Problem - it interferes with log compaction:
3081 * New log produced in compaction will include - as last op - jump into some offset (anchor) of current log.
3082 * It is assumed that log region (anchor - end) will contain all changes made by bluefs since
3083 * full state capture into new log.
3084 * Putting log update into (anchor - end) region is illegal, because any update there must be compatible with
3085 * both logs, but old log is different then new log.
3087 * Possible solutions:
3088 * - stall extending log until we finish compacting and switch log (CURRENT)
3089 * - re-run compaction with more runway for old log
3090 * - add OP_FILE_ADDEXT that adds extent; will be compatible with both logs
3092 if (log_forbidden_to_expand
.load() == true) {
3093 return -EWOULDBLOCK
;
3095 vselector
->sub_usage(log
.writer
->file
->vselector_hint
, log
.writer
->file
->fnode
);
3097 vselector
->select_prefer_bdev(log
.writer
->file
->vselector_hint
),
3098 cct
->_conf
->bluefs_max_log_runway
,
3100 &log
.writer
->file
->fnode
);
3101 ceph_assert(r
== 0);
3102 vselector
->add_usage(log
.writer
->file
->vselector_hint
, log
.writer
->file
->fnode
);
3103 log
.t
.op_file_update_inc(log
.writer
->file
->fnode
);
3108 void BlueFS::_flush_and_sync_log_core(int64_t runway
)
3110 ceph_assert(ceph_mutex_is_locked(log
.lock
));
3111 dout(10) << __func__
<< " " << log
.t
<< dendl
;
3114 bl
.reserve(super
.block_size
);
3116 // pad to block boundary
3117 size_t realign
= super
.block_size
- (bl
.length() % super
.block_size
);
3118 if (realign
&& realign
!= super
.block_size
)
3119 bl
.append_zero(realign
);
3121 logger
->inc(l_bluefs_log_write_count
, 1);
3122 logger
->inc(l_bluefs_logged_bytes
, bl
.length());
3125 ceph_assert(bl
.length() <= runway
); // if we write this, we will have an unrecoverable data loss
3126 // transaction will not fit extents before growth -> data loss on _replay
3129 log
.writer
->append(bl
);
3131 // prepare log for new transactions
3133 log
.t
.seq
= log
.seq_live
;
3135 uint64_t new_data
= _flush_special(log
.writer
);
3136 vselector
->add_usage(log
.writer
->file
->vselector_hint
, new_data
);
3139 // Clears dirty.files up to (including) seq_stable.
3140 void BlueFS::_clear_dirty_set_stable_D(uint64_t seq
)
3142 std::lock_guard
dl(dirty
.lock
);
3144 // clean dirty files
3145 if (seq
> dirty
.seq_stable
) {
3146 dirty
.seq_stable
= seq
;
3147 dout(20) << __func__
<< " seq_stable " << dirty
.seq_stable
<< dendl
;
3149 // undirty all files that were already streamed to log
3150 auto p
= dirty
.files
.begin();
3151 while (p
!= dirty
.files
.end()) {
3152 if (p
->first
> dirty
.seq_stable
) {
3153 dout(20) << __func__
<< " done cleaning up dirty files" << dendl
;
3157 auto l
= p
->second
.begin();
3158 while (l
!= p
->second
.end()) {
3160 ceph_assert(file
->dirty_seq
<= dirty
.seq_stable
);
3161 dout(20) << __func__
<< " cleaned file " << file
->fnode
.ino
<< dendl
;
3162 file
->dirty_seq
= dirty
.seq_stable
;
3163 p
->second
.erase(l
++);
3166 ceph_assert(p
->second
.empty());
3167 dirty
.files
.erase(p
++);
3170 dout(20) << __func__
<< " seq_stable " << dirty
.seq_stable
3171 << " already >= out seq " << seq
3172 << ", we lost a race against another log flush, done" << dendl
;
3176 void BlueFS::_release_pending_allocations(vector
<interval_set
<uint64_t>>& to_release
)
3178 for (unsigned i
= 0; i
< to_release
.size(); ++i
) {
3179 if (to_release
[i
].empty()) {
3182 /* OK, now we have the guarantee alloc[i] won't be null. */
3184 bool discard_queued
= bdev
[i
]->try_discard(to_release
[i
]);
3185 if (!discard_queued
) {
3186 alloc
[i
]->release(to_release
[i
]);
3187 if (is_shared_alloc(i
)) {
3188 shared_alloc
->bluefs_used
-= to_release
[i
].size();
3194 int BlueFS::_flush_and_sync_log_LD(uint64_t want_seq
)
3196 int64_t available_runway
;
3200 if (want_seq
&& want_seq
<= dirty
.seq_stable
) {
3201 dout(10) << __func__
<< " want_seq " << want_seq
<< " <= seq_stable "
3202 << dirty
.seq_stable
<< ", done" << dendl
;
3203 dirty
.lock
.unlock();
3208 available_runway
= _maybe_extend_log();
3209 if (available_runway
== -EWOULDBLOCK
) {
3210 // we are in need of adding runway, but we are during log-switch from compaction
3211 dirty
.lock
.unlock();
3212 //instead log.lock.unlock() do move ownership
3213 std::unique_lock
<ceph::mutex
> ll(log
.lock
, std::adopt_lock
);
3214 while (log_forbidden_to_expand
.load()) {
3218 ceph_assert(available_runway
>= 0);
3220 } while (available_runway
< 0);
3222 ceph_assert(want_seq
== 0 || want_seq
<= dirty
.seq_live
); // illegal to request seq that was not created yet
3223 uint64_t seq
=_log_advance_seq();
3224 _consume_dirty(seq
);
3225 vector
<interval_set
<uint64_t>> to_release(dirty
.pending_release
.size());
3226 to_release
.swap(dirty
.pending_release
);
3227 dirty
.lock
.unlock();
3229 _flush_and_sync_log_core(available_runway
);
3230 _flush_bdev(log
.writer
);
3231 logger
->set(l_bluefs_log_bytes
, log
.writer
->file
->fnode
.size
);
3232 //now log.lock is no longer needed
3235 _clear_dirty_set_stable_D(seq
);
3236 _release_pending_allocations(to_release
);
3238 _update_logger_stats();
3242 // Flushes log and immediately adjusts log_writer pos.
3243 int BlueFS::_flush_and_sync_log_jump_D(uint64_t jump_to
,
3244 int64_t available_runway
)
3246 ceph_assert(ceph_mutex_is_locked(log
.lock
));
3248 ceph_assert(jump_to
);
3249 // we synchronize writing to log, by lock to log.lock
3252 uint64_t seq
=_log_advance_seq();
3253 _consume_dirty(seq
);
3254 vector
<interval_set
<uint64_t>> to_release(dirty
.pending_release
.size());
3255 to_release
.swap(dirty
.pending_release
);
3256 dirty
.lock
.unlock();
3257 _flush_and_sync_log_core(available_runway
);
3259 dout(10) << __func__
<< " jumping log offset from 0x" << std::hex
3260 << log
.writer
->pos
<< " -> 0x" << jump_to
<< std::dec
<< dendl
;
3261 log
.writer
->pos
= jump_to
;
3262 vselector
->sub_usage(log
.writer
->file
->vselector_hint
, log
.writer
->file
->fnode
.size
);
3263 log
.writer
->file
->fnode
.size
= jump_to
;
3264 vselector
->add_usage(log
.writer
->file
->vselector_hint
, log
.writer
->file
->fnode
.size
);
3266 _flush_bdev(log
.writer
);
3268 _clear_dirty_set_stable_D(seq
);
3269 _release_pending_allocations(to_release
);
3271 logger
->set(l_bluefs_log_bytes
, log
.writer
->file
->fnode
.size
);
3272 _update_logger_stats();
3276 ceph::bufferlist
BlueFS::FileWriter::flush_buffer(
3277 CephContext
* const cct
,
3279 const unsigned length
,
3280 const bluefs_super_t
& super
)
3282 ceph_assert(ceph_mutex_is_locked(this->lock
) || file
->fnode
.ino
<= 1);
3283 ceph::bufferlist bl
;
3285 tail_block
.splice(0, tail_block
.length(), &bl
);
3287 const auto remaining_len
= length
- bl
.length();
3288 buffer
.splice(0, remaining_len
, &bl
);
3289 if (buffer
.length()) {
3290 dout(20) << " leaving 0x" << std::hex
<< buffer
.length() << std::dec
3291 << " unflushed" << dendl
;
3293 if (const unsigned tail
= bl
.length() & ~super
.block_mask(); tail
) {
3294 const auto padding_len
= super
.block_size
- tail
;
3295 dout(20) << __func__
<< " caching tail of 0x"
3297 << " and padding block with 0x" << padding_len
3298 << " buffer.length() " << buffer
.length()
3299 << std::dec
<< dendl
;
3300 // We need to go through the `buffer_appender` to get a chance to
3301 // preserve in-memory contiguity and not mess with the alignment.
3302 // Otherwise a costly rebuild could happen in e.g. `KernelDevice`.
3303 buffer_appender
.append_zero(padding_len
);
3304 buffer
.splice(buffer
.length() - padding_len
, padding_len
, &bl
);
3305 // Deep copy the tail here. This allows to avoid costlier copy on
3306 // bufferlist rebuild in e.g. `KernelDevice` and minimizes number
3307 // of memory allocations.
3308 // The alternative approach would be to place the entire tail and
3309 // padding on a dedicated, 4 KB long memory chunk. This shouldn't
3310 // trigger the rebuild while still being less expensive.
3311 buffer_appender
.substr_of(bl
, bl
.length() - padding_len
- tail
, tail
);
3312 buffer
.splice(buffer
.length() - tail
, tail
, &tail_block
);
3319 int BlueFS::_signal_dirty_to_log_D(FileWriter
*h
)
3321 ceph_assert(ceph_mutex_is_locked(h
->lock
));
3322 std::lock_guard
dl(dirty
.lock
);
3323 if (h
->file
->deleted
) {
3324 dout(10) << __func__
<< " deleted, no-op" << dendl
;
3328 h
->file
->fnode
.mtime
= ceph_clock_now();
3329 ceph_assert(h
->file
->fnode
.ino
>= 1);
3330 if (h
->file
->dirty_seq
<= dirty
.seq_stable
) {
3331 h
->file
->dirty_seq
= dirty
.seq_live
;
3332 dirty
.files
[h
->file
->dirty_seq
].push_back(*h
->file
);
3333 dout(20) << __func__
<< " dirty_seq = " << dirty
.seq_live
3334 << " (was clean)" << dendl
;
3336 if (h
->file
->dirty_seq
!= dirty
.seq_live
) {
3337 // need re-dirty, erase from list first
3338 ceph_assert(dirty
.files
.count(h
->file
->dirty_seq
));
3339 auto it
= dirty
.files
[h
->file
->dirty_seq
].iterator_to(*h
->file
);
3340 dirty
.files
[h
->file
->dirty_seq
].erase(it
);
3341 h
->file
->dirty_seq
= dirty
.seq_live
;
3342 dirty
.files
[h
->file
->dirty_seq
].push_back(*h
->file
);
3343 dout(20) << __func__
<< " dirty_seq = " << dirty
.seq_live
3344 << " (was " << h
->file
->dirty_seq
<< ")" << dendl
;
3346 dout(20) << __func__
<< " dirty_seq = " << dirty
.seq_live
3347 << " (unchanged, do nothing) " << dendl
;
3353 void BlueFS::flush_range(FileWriter
*h
, uint64_t offset
, uint64_t length
)/*_WF*/
3355 _maybe_check_vselector_LNF();
3356 std::unique_lock
hl(h
->lock
);
3357 _flush_range_F(h
, offset
, length
);
3360 int BlueFS::_flush_range_F(FileWriter
*h
, uint64_t offset
, uint64_t length
)
3362 ceph_assert(ceph_mutex_is_locked(h
->lock
));
3363 ceph_assert(h
->file
->num_readers
.load() == 0);
3364 ceph_assert(h
->file
->fnode
.ino
> 1);
3366 dout(10) << __func__
<< " " << h
<< " pos 0x" << std::hex
<< h
->pos
3367 << " 0x" << offset
<< "~" << length
<< std::dec
3368 << " to " << h
->file
->fnode
<< dendl
;
3369 if (h
->file
->deleted
) {
3370 dout(10) << __func__
<< " deleted, no-op" << dendl
;
3374 bool buffered
= cct
->_conf
->bluefs_buffered_io
;
3376 if (offset
+ length
<= h
->pos
)
3378 if (offset
< h
->pos
) {
3379 length
-= h
->pos
- offset
;
3381 dout(10) << " still need 0x"
3382 << std::hex
<< offset
<< "~" << length
<< std::dec
3385 std::lock_guard
file_lock(h
->file
->lock
);
3386 ceph_assert(offset
<= h
->file
->fnode
.size
);
3388 uint64_t allocated
= h
->file
->fnode
.get_allocated();
3389 vselector
->sub_usage(h
->file
->vselector_hint
, h
->file
->fnode
);
3390 // do not bother to dirty the file if we are overwriting
3391 // previously allocated extents.
3392 if (allocated
< offset
+ length
) {
3393 // we should never run out of log space here; see the min runway check
3394 // in _flush_and_sync_log.
3395 int r
= _allocate(vselector
->select_prefer_bdev(h
->file
->vselector_hint
),
3396 offset
+ length
- allocated
,
3400 derr
<< __func__
<< " allocated: 0x" << std::hex
<< allocated
3401 << " offset: 0x" << offset
<< " length: 0x" << length
<< std::dec
3403 vselector
->add_usage(h
->file
->vselector_hint
, h
->file
->fnode
); // undo
3404 ceph_abort_msg("bluefs enospc");
3407 h
->file
->is_dirty
= true;
3409 if (h
->file
->fnode
.size
< offset
+ length
) {
3410 h
->file
->fnode
.size
= offset
+ length
;
3411 h
->file
->is_dirty
= true;
3414 dout(20) << __func__
<< " file now, unflushed " << h
->file
->fnode
<< dendl
;
3415 int res
= _flush_data(h
, offset
, length
, buffered
);
3416 vselector
->add_usage(h
->file
->vselector_hint
, h
->file
->fnode
);
3420 int BlueFS::_flush_data(FileWriter
*h
, uint64_t offset
, uint64_t length
, bool buffered
)
3422 if (h
->file
->fnode
.ino
> 1) {
3423 ceph_assert(ceph_mutex_is_locked(h
->lock
));
3424 ceph_assert(ceph_mutex_is_locked(h
->file
->lock
));
3427 auto p
= h
->file
->fnode
.seek(offset
, &x_off
);
3428 ceph_assert(p
!= h
->file
->fnode
.extents
.end());
3429 dout(20) << __func__
<< " in " << *p
<< " x_off 0x"
3430 << std::hex
<< x_off
<< std::dec
<< dendl
;
3432 unsigned partial
= x_off
& ~super
.block_mask();
3434 dout(20) << __func__
<< " using partial tail 0x"
3435 << std::hex
<< partial
<< std::dec
<< dendl
;
3439 dout(20) << __func__
<< " waiting for previous aio to complete" << dendl
;
3440 for (auto p
: h
->iocv
) {
3447 auto bl
= h
->flush_buffer(cct
, partial
, length
, super
);
3448 ceph_assert(bl
.length() >= length
);
3449 h
->pos
= offset
+ length
;
3450 length
= bl
.length();
3452 logger
->inc(l_bluefs_write_count
, 1);
3453 logger
->inc(l_bluefs_write_bytes
, length
);
3455 switch (h
->writer_type
) {
3457 logger
->inc(l_bluefs_write_count_wal
, 1);
3458 logger
->inc(l_bluefs_bytes_written_wal
, length
);
3461 logger
->inc(l_bluefs_write_count_sst
, 1);
3462 logger
->inc(l_bluefs_bytes_written_sst
, length
);
3466 dout(30) << "dump:\n";
3471 uint64_t bytes_written_slow
= 0;
3472 while (length
> 0) {
3473 logger
->inc(l_bluefs_write_disk_count
, 1);
3475 uint64_t x_len
= std::min(p
->length
- x_off
, length
);
3477 t
.substr_of(bl
, bloff
, x_len
);
3478 if (cct
->_conf
->bluefs_sync_write
) {
3479 bdev
[p
->bdev
]->write(p
->offset
+ x_off
, t
, buffered
, h
->write_hint
);
3481 bdev
[p
->bdev
]->aio_write(p
->offset
+ x_off
, t
, h
->iocv
[p
->bdev
], buffered
, h
->write_hint
);
3483 h
->dirty_devs
[p
->bdev
] = true;
3484 if (p
->bdev
== BDEV_SLOW
) {
3485 bytes_written_slow
+= t
.length();
3493 if (bytes_written_slow
) {
3494 logger
->inc(l_bluefs_bytes_written_slow
, bytes_written_slow
);
3496 for (unsigned i
= 0; i
< MAX_BDEV
; ++i
) {
3498 if (h
->iocv
[i
] && h
->iocv
[i
]->has_pending_aios()) {
3499 bdev
[i
]->aio_submit(h
->iocv
[i
]);
3503 dout(20) << __func__
<< " h " << h
<< " pos now 0x"
3504 << std::hex
<< h
->pos
<< std::dec
<< dendl
;
3509 // we need to retire old completed aios so they don't stick around in
3510 // memory indefinitely (along with their bufferlist refs).
3511 void BlueFS::_claim_completed_aios(FileWriter
*h
, list
<aio_t
> *ls
)
3513 for (auto p
: h
->iocv
) {
3515 ls
->splice(ls
->end(), p
->running_aios
);
3518 dout(10) << __func__
<< " got " << ls
->size() << " aios" << dendl
;
3521 void BlueFS::_wait_for_aio(FileWriter
*h
)
3523 // NOTE: this is safe to call without a lock, as long as our reference is
3526 lgeneric_subdout(cct
, bluefs
, 10) << __func__
;
3527 start
= ceph_clock_now();
3528 *_dout
<< " " << h
<< dendl
;
3529 for (auto p
: h
->iocv
) {
3534 dout(10) << __func__
<< " " << h
<< " done in " << (ceph_clock_now() - start
) << dendl
;
3538 void BlueFS::append_try_flush(FileWriter
*h
, const char* buf
, size_t len
)/*_WF_LNF_NF_LD_D*/
3540 bool flushed_sum
= false;
3542 std::unique_lock
hl(h
->lock
);
3543 size_t max_size
= 1ull << 30; // cap to 1GB
3545 bool need_flush
= true;
3546 auto l0
= h
->get_buffer_length();
3547 if (l0
< max_size
) {
3548 size_t l
= std::min(len
, max_size
- l0
);
3552 need_flush
= h
->get_buffer_length() >= cct
->_conf
->bluefs_min_flush_size
;
3555 bool flushed
= false;
3556 int r
= _flush_F(h
, true, &flushed
);
3557 ceph_assert(r
== 0);
3558 flushed_sum
|= flushed
;
3559 // make sure we've made any progress with flush hence the
3560 // loop doesn't iterate forever
3561 ceph_assert(h
->get_buffer_length() < max_size
);
3566 _maybe_compact_log_LNF_NF_LD_D();
3570 void BlueFS::flush(FileWriter
*h
, bool force
)/*_WF_LNF_NF_LD_D*/
3572 bool flushed
= false;
3575 std::unique_lock
hl(h
->lock
);
3576 r
= _flush_F(h
, force
, &flushed
);
3577 ceph_assert(r
== 0);
3579 if (r
== 0 && flushed
) {
3580 _maybe_compact_log_LNF_NF_LD_D();
3584 int BlueFS::_flush_F(FileWriter
*h
, bool force
, bool *flushed
)
3586 ceph_assert(ceph_mutex_is_locked(h
->lock
));
3587 uint64_t length
= h
->get_buffer_length();
3588 uint64_t offset
= h
->pos
;
3593 length
< cct
->_conf
->bluefs_min_flush_size
) {
3594 dout(10) << __func__
<< " " << h
<< " ignoring, length " << length
3595 << " < min_flush_size " << cct
->_conf
->bluefs_min_flush_size
3600 dout(10) << __func__
<< " " << h
<< " no dirty data on "
3601 << h
->file
->fnode
<< dendl
;
3604 dout(10) << __func__
<< " " << h
<< " 0x"
3605 << std::hex
<< offset
<< "~" << length
<< std::dec
3606 << " to " << h
->file
->fnode
<< dendl
;
3607 ceph_assert(h
->pos
<= h
->file
->fnode
.size
);
3608 int r
= _flush_range_F(h
, offset
, length
);
3615 // Flush for bluefs special files.
3616 // Does not add extents to h.
3617 // Does not mark h as dirty.
3618 // we do not need to dirty the log file (or it's compacting
3619 // replacement) when the file size changes because replay is
3620 // smart enough to discover it on its own.
3621 uint64_t BlueFS::_flush_special(FileWriter
*h
)
3623 ceph_assert(h
->file
->fnode
.ino
<= 1);
3624 uint64_t length
= h
->get_buffer_length();
3625 uint64_t offset
= h
->pos
;
3626 uint64_t new_data
= 0;
3627 ceph_assert(length
+ offset
<= h
->file
->fnode
.get_allocated());
3628 if (h
->file
->fnode
.size
< offset
+ length
) {
3629 new_data
= offset
+ length
- h
->file
->fnode
.size
;
3630 h
->file
->fnode
.size
= offset
+ length
;
3632 _flush_data(h
, offset
, length
, false);
3636 int BlueFS::truncate(FileWriter
*h
, uint64_t offset
)/*_WF_L*/
3638 std::lock_guard
hl(h
->lock
);
3639 dout(10) << __func__
<< " 0x" << std::hex
<< offset
<< std::dec
3640 << " file " << h
->file
->fnode
<< dendl
;
3641 if (h
->file
->deleted
) {
3642 dout(10) << __func__
<< " deleted, no-op" << dendl
;
3646 // we never truncate internal log files
3647 ceph_assert(h
->file
->fnode
.ino
> 1);
3649 // truncate off unflushed data?
3650 if (h
->pos
< offset
&&
3651 h
->pos
+ h
->get_buffer_length() > offset
) {
3652 dout(20) << __func__
<< " tossing out last " << offset
- h
->pos
3653 << " unflushed bytes" << dendl
;
3654 ceph_abort_msg("actually this shouldn't happen");
3656 if (h
->get_buffer_length()) {
3657 int r
= _flush_F(h
, true);
3661 if (offset
== h
->file
->fnode
.size
) {
3664 if (offset
> h
->file
->fnode
.size
) {
3665 ceph_abort_msg("truncate up not supported");
3667 ceph_assert(h
->file
->fnode
.size
>= offset
);
3670 std::lock_guard
ll(log
.lock
);
3671 vselector
->sub_usage(h
->file
->vselector_hint
, h
->file
->fnode
.size
);
3672 h
->file
->fnode
.size
= offset
;
3673 h
->file
->is_dirty
= true;
3674 vselector
->add_usage(h
->file
->vselector_hint
, h
->file
->fnode
.size
);
3675 log
.t
.op_file_update_inc(h
->file
->fnode
);
3679 int BlueFS::fsync(FileWriter
*h
)/*_WF_WD_WLD_WLNF_WNF*/
3681 _maybe_check_vselector_LNF();
3682 std::unique_lock
hl(h
->lock
);
3683 uint64_t old_dirty_seq
= 0;
3685 dout(10) << __func__
<< " " << h
<< " " << h
->file
->fnode
3686 << " dirty " << h
->file
->is_dirty
<< dendl
;
3687 int r
= _flush_F(h
, true);
3691 if (h
->file
->is_dirty
) {
3692 _signal_dirty_to_log_D(h
);
3693 h
->file
->is_dirty
= false;
3696 std::lock_guard
dl(dirty
.lock
);
3697 if (dirty
.seq_stable
< h
->file
->dirty_seq
) {
3698 old_dirty_seq
= h
->file
->dirty_seq
;
3699 dout(20) << __func__
<< " file metadata was dirty (" << old_dirty_seq
3700 << ") on " << h
->file
->fnode
<< ", flushing log" << dendl
;
3704 if (old_dirty_seq
) {
3705 _flush_and_sync_log_LD(old_dirty_seq
);
3707 _maybe_compact_log_LNF_NF_LD_D();
3712 // be careful - either h->file->lock or log.lock must be taken
3713 void BlueFS::_flush_bdev(FileWriter
*h
, bool check_mutext_locked
)
3715 if (check_mutext_locked
) {
3716 if (h
->file
->fnode
.ino
> 1) {
3717 ceph_assert(ceph_mutex_is_locked(h
->lock
));
3718 } else if (h
->file
->fnode
.ino
== 1) {
3719 ceph_assert(ceph_mutex_is_locked(log
.lock
));
3722 std::array
<bool, MAX_BDEV
> flush_devs
= h
->dirty_devs
;
3723 h
->dirty_devs
.fill(false);
3725 if (!cct
->_conf
->bluefs_sync_write
) {
3726 list
<aio_t
> completed_ios
;
3727 _claim_completed_aios(h
, &completed_ios
);
3729 completed_ios
.clear();
3732 _flush_bdev(flush_devs
);
3735 void BlueFS::_flush_bdev(std::array
<bool, MAX_BDEV
>& dirty_bdevs
)
3737 // NOTE: this is safe to call without a lock.
3738 dout(20) << __func__
<< dendl
;
3739 for (unsigned i
= 0; i
< MAX_BDEV
; i
++) {
3745 void BlueFS::_flush_bdev()
3747 // NOTE: this is safe to call without a lock.
3748 dout(20) << __func__
<< dendl
;
3749 for (unsigned i
= 0; i
< MAX_BDEV
; i
++) {
3750 // alloc space from BDEV_SLOW is unexpected.
3751 // So most cases we don't alloc from BDEV_SLOW and so avoiding flush not-used device.
3752 if (bdev
[i
] && (i
!= BDEV_SLOW
|| _get_used(i
))) {
3758 const char* BlueFS::get_device_name(unsigned id
)
3760 if (id
>= MAX_BDEV
) return "BDEV_INV";
3761 const char* names
[] = {"BDEV_WAL", "BDEV_DB", "BDEV_SLOW", "BDEV_NEWWAL", "BDEV_NEWDB"};
3765 int BlueFS::_allocate(uint8_t id
, uint64_t len
,
3766 uint64_t alloc_unit
,
3767 bluefs_fnode_t
* node
,
3768 size_t alloc_attempts
,
3769 bool permit_dev_fallback
)
3771 dout(10) << __func__
<< " len 0x" << std::hex
<< len
3772 << " au 0x" << alloc_unit
3773 << std::dec
<< " from " << (int)id
3774 << " cooldown " << cooldown_deadline
3776 ceph_assert(id
< alloc
.size());
3777 int64_t alloc_len
= 0;
3778 PExtentVector extents
;
3781 bool shared
= is_shared_alloc(id
);
3782 auto shared_unit
= shared_alloc
? shared_alloc
->alloc_unit
: 0;
3783 bool was_cooldown
= false;
3786 alloc_unit
= alloc_size
[id
];
3788 // do not attempt shared_allocator with bluefs alloc unit
3789 // when cooling down, fallback to slow dev alloc unit.
3790 if (shared
&& alloc_unit
!= shared_unit
) {
3791 if (duration_cast
<seconds
>(real_clock::now().time_since_epoch()).count() <
3792 cooldown_deadline
) {
3793 logger
->inc(l_bluefs_alloc_shared_size_fallbacks
);
3794 alloc_unit
= shared_unit
;
3795 was_cooldown
= true;
3796 } else if (cooldown_deadline
.fetch_and(0)) {
3797 // we might get false cooldown_deadline reset at this point
3798 // but that's mostly harmless.
3799 dout(1) << __func__
<< " shared allocation cooldown period elapsed"
3803 need
= round_up_to(len
, alloc_unit
);
3804 if (!node
->extents
.empty() && node
->extents
.back().bdev
== id
) {
3805 hint
= node
->extents
.back().end();
3808 extents
.reserve(4); // 4 should be (more than) enough for most allocations
3809 alloc_len
= alloc
[id
]->allocate(need
, alloc_unit
, hint
, &extents
);
3811 if (alloc_len
< 0 || alloc_len
< need
) {
3813 if (alloc_len
> 0) {
3814 alloc
[id
]->release(extents
);
3816 if (!was_cooldown
&& shared
) {
3817 auto delay_s
= cct
->_conf
->bluefs_failed_shared_alloc_cooldown
;
3818 cooldown_deadline
= delay_s
+
3819 duration_cast
<seconds
>(real_clock::now().time_since_epoch()).count();
3820 dout(1) << __func__
<< " shared allocation cooldown set for "
3824 dout(1) << __func__
<< " unable to allocate 0x" << std::hex
<< need
3825 << " on bdev " << (int)id
3826 << ", allocator name " << alloc
[id
]->get_name()
3827 << ", allocator type " << alloc
[id
]->get_type()
3828 << ", capacity 0x" << alloc
[id
]->get_capacity()
3829 << ", block size 0x" << alloc
[id
]->get_block_size()
3830 << ", alloc unit 0x" << alloc_unit
3831 << ", free 0x" << alloc
[id
]->get_free()
3832 << ", fragmentation " << alloc
[id
]->get_fragmentation()
3833 << ", allocated 0x" << (alloc_len
> 0 ? alloc_len
: 0)
3834 << std::dec
<< dendl
;
3836 dout(20) << __func__
<< " alloc-id not set on index="<< (int)id
3837 << " unable to allocate 0x" << std::hex
<< need
3838 << " on bdev " << (int)id
<< std::dec
<< dendl
;
3840 if (alloc
[id
] && shared
&& alloc_unit
!= shared_unit
) {
3841 alloc_unit
= shared_unit
;
3842 dout(20) << __func__
<< " fallback to bdev "
3844 << " with alloc unit 0x" << std::hex
<< alloc_unit
3845 << std::dec
<< dendl
;
3846 logger
->inc(l_bluefs_alloc_shared_size_fallbacks
);
3847 return _allocate(id
,
3852 permit_dev_fallback
);
3853 } else if (permit_dev_fallback
&& id
!= BDEV_SLOW
&& alloc
[id
+ 1]) {
3854 dout(20) << __func__
<< " fallback to bdev "
3857 if (alloc_attempts
> 0 && is_shared_alloc(id
+ 1)) {
3858 logger
->inc(l_bluefs_alloc_shared_dev_fallbacks
);
3860 return _allocate(id
+ 1,
3862 0, // back to default alloc unit
3865 permit_dev_fallback
);
3867 derr
<< __func__
<< " allocation failed, needed 0x" << std::hex
<< need
3872 uint64_t used
= _get_used(id
);
3873 if (max_bytes
[id
] < used
) {
3874 logger
->set(max_bytes_pcounters
[id
], used
);
3875 max_bytes
[id
] = used
;
3878 shared_alloc
->bluefs_used
+= alloc_len
;
3882 for (auto& p
: extents
) {
3883 node
->append_extent(bluefs_extent_t(id
, p
.offset
, p
.length
));
3889 int BlueFS::preallocate(FileRef f
, uint64_t off
, uint64_t len
)/*_LF*/
3891 std::lock_guard
ll(log
.lock
);
3892 std::lock_guard
fl(f
->lock
);
3893 dout(10) << __func__
<< " file " << f
->fnode
<< " 0x"
3894 << std::hex
<< off
<< "~" << len
<< std::dec
<< dendl
;
3896 dout(10) << __func__
<< " deleted, no-op" << dendl
;
3899 ceph_assert(f
->fnode
.ino
> 1);
3900 uint64_t allocated
= f
->fnode
.get_allocated();
3901 if (off
+ len
> allocated
) {
3902 uint64_t want
= off
+ len
- allocated
;
3904 vselector
->sub_usage(f
->vselector_hint
, f
->fnode
);
3905 int r
= _allocate(vselector
->select_prefer_bdev(f
->vselector_hint
),
3909 vselector
->add_usage(f
->vselector_hint
, f
->fnode
);
3913 log
.t
.op_file_update_inc(f
->fnode
);
3918 void BlueFS::sync_metadata(bool avoid_compact
)/*_LNF_NF_LD_D*/
3920 bool can_skip_flush
;
3922 std::lock_guard
ll(log
.lock
);
3923 std::lock_guard
dl(dirty
.lock
);
3924 can_skip_flush
= log
.t
.empty() && dirty
.files
.empty();
3926 if (can_skip_flush
) {
3927 dout(10) << __func__
<< " - no pending log events" << dendl
;
3930 lgeneric_subdout(cct
, bluefs
, 10) << __func__
;
3931 start
= ceph_clock_now();
3933 _flush_bdev(); // FIXME?
3934 _flush_and_sync_log_LD();
3935 dout(10) << __func__
<< " done in " << (ceph_clock_now() - start
) << dendl
;
3938 if (!avoid_compact
) {
3939 _maybe_compact_log_LNF_NF_LD_D();
3943 void BlueFS::_maybe_compact_log_LNF_NF_LD_D()
3945 if (!cct
->_conf
->bluefs_replay_recovery_disable_compact
&&
3946 _should_start_compact_log_L_N()) {
3947 auto t0
= mono_clock::now();
3948 if (cct
->_conf
->bluefs_compact_log_sync
) {
3949 _compact_log_sync_LNF_LD();
3951 _compact_log_async_LD_LNF_D();
3953 logger
->tinc(l_bluefs_compaction_lat
, mono_clock::now() - t0
);
3957 int BlueFS::open_for_write(
3958 std::string_view dirname
,
3959 std::string_view filename
,
3961 bool overwrite
)/*_LND*/
3963 _maybe_check_vselector_LNF();
3965 bool create
= false;
3966 bool truncate
= false;
3967 mempool::bluefs::vector
<bluefs_extent_t
> pending_release_extents
;
3969 std::lock_guard
ll(log
.lock
);
3970 std::lock_guard
nl(nodes
.lock
);
3971 dout(10) << __func__
<< " " << dirname
<< "/" << filename
<< dendl
;
3972 map
<string
,DirRef
>::iterator p
= nodes
.dir_map
.find(dirname
);
3974 if (p
== nodes
.dir_map
.end()) {
3975 // implicitly create the dir
3976 dout(20) << __func__
<< " dir " << dirname
3977 << " does not exist" << dendl
;
3983 map
<string
,FileRef
>::iterator q
= dir
->file_map
.find(filename
);
3984 if (q
== dir
->file_map
.end()) {
3986 dout(20) << __func__
<< " dir " << dirname
<< " (" << dir
3987 << ") file " << filename
3988 << " does not exist" << dendl
;
3991 file
= ceph::make_ref
<File
>();
3992 file
->fnode
.ino
= ++ino_last
;
3993 nodes
.file_map
[ino_last
] = file
;
3994 dir
->file_map
[string
{filename
}] = file
;
3997 logger
->set(l_bluefs_num_files
, nodes
.file_map
.size());
3999 // overwrite existing file?
4002 dout(20) << __func__
<< " dir " << dirname
<< " (" << dir
4003 << ") file " << filename
4004 << " already exists, overwrite in place" << dendl
;
4006 dout(20) << __func__
<< " dir " << dirname
<< " (" << dir
4007 << ") file " << filename
4008 << " already exists, truncate + overwrite" << dendl
;
4009 vselector
->sub_usage(file
->vselector_hint
, file
->fnode
);
4010 file
->fnode
.size
= 0;
4011 pending_release_extents
.swap(file
->fnode
.extents
);
4014 file
->fnode
.clear_extents();
4017 ceph_assert(file
->fnode
.ino
> 1);
4019 file
->fnode
.mtime
= ceph_clock_now();
4020 file
->vselector_hint
= vselector
->get_hint_by_dir(dirname
);
4021 if (create
|| truncate
) {
4022 vselector
->add_usage(file
->vselector_hint
, file
->fnode
); // update file count
4025 dout(20) << __func__
<< " mapping " << dirname
<< "/" << filename
4026 << " vsel_hint " << file
->vselector_hint
4029 log
.t
.op_file_update(file
->fnode
);
4031 log
.t
.op_dir_link(dirname
, filename
, file
->fnode
.ino
);
4033 std::lock_guard
dl(dirty
.lock
);
4034 for (auto& p
: pending_release_extents
) {
4035 dirty
.pending_release
[p
.bdev
].insert(p
.offset
, p
.length
);
4038 *h
= _create_writer(file
);
4040 if (boost::algorithm::ends_with(filename
, ".log")) {
4041 (*h
)->writer_type
= BlueFS::WRITER_WAL
;
4042 if (logger
&& !overwrite
) {
4043 logger
->inc(l_bluefs_files_written_wal
);
4045 } else if (boost::algorithm::ends_with(filename
, ".sst")) {
4046 (*h
)->writer_type
= BlueFS::WRITER_SST
;
4048 logger
->inc(l_bluefs_files_written_sst
);
4052 dout(10) << __func__
<< " h " << *h
<< " on " << file
->fnode
<< dendl
;
4056 BlueFS::FileWriter
*BlueFS::_create_writer(FileRef f
)
4058 FileWriter
*w
= new FileWriter(f
);
4059 for (unsigned i
= 0; i
< MAX_BDEV
; ++i
) {
4061 w
->iocv
[i
] = new IOContext(cct
, NULL
);
4067 void BlueFS::_drain_writer(FileWriter
*h
)
4069 dout(10) << __func__
<< " " << h
<< " type " << h
->writer_type
<< dendl
;
4070 //h->buffer.reassign_to_mempool(mempool::mempool_bluefs_file_writer);
4071 for (unsigned i
=0; i
<MAX_BDEV
; ++i
) {
4074 h
->iocv
[i
]->aio_wait();
4080 if (h
->file
->fnode
.size
>= (1ull << 30)) {
4081 dout(10) << __func__
<< " file is unexpectedly large:" << h
->file
->fnode
<< dendl
;
4085 void BlueFS::_close_writer(FileWriter
*h
)
4090 void BlueFS::close_writer(FileWriter
*h
)
4093 std::lock_guard
l(h
->lock
);
4099 uint64_t BlueFS::debug_get_dirty_seq(FileWriter
*h
)
4101 std::lock_guard
l(h
->lock
);
4102 return h
->file
->dirty_seq
;
4105 bool BlueFS::debug_get_is_dev_dirty(FileWriter
*h
, uint8_t dev
)
4107 std::lock_guard
l(h
->lock
);
4108 return h
->dirty_devs
[dev
];
4111 int BlueFS::open_for_read(
4112 std::string_view dirname
,
4113 std::string_view filename
,
4117 _maybe_check_vselector_LNF();
4118 std::lock_guard
nl(nodes
.lock
);
4119 dout(10) << __func__
<< " " << dirname
<< "/" << filename
4120 << (random
? " (random)":" (sequential)") << dendl
;
4121 map
<string
,DirRef
>::iterator p
= nodes
.dir_map
.find(dirname
);
4122 if (p
== nodes
.dir_map
.end()) {
4123 dout(20) << __func__
<< " dir " << dirname
<< " not found" << dendl
;
4126 DirRef dir
= p
->second
;
4128 map
<string
,FileRef
>::iterator q
= dir
->file_map
.find(filename
);
4129 if (q
== dir
->file_map
.end()) {
4130 dout(20) << __func__
<< " dir " << dirname
<< " (" << dir
4131 << ") file " << filename
4132 << " not found" << dendl
;
4135 File
*file
= q
->second
.get();
4137 *h
= new FileReader(file
, random
? 4096 : cct
->_conf
->bluefs_max_prefetch
,
4139 dout(10) << __func__
<< " h " << *h
<< " on " << file
->fnode
<< dendl
;
4144 std::string_view old_dirname
, std::string_view old_filename
,
4145 std::string_view new_dirname
, std::string_view new_filename
)/*_LND*/
4147 std::lock_guard
ll(log
.lock
);
4148 std::lock_guard
nl(nodes
.lock
);
4149 dout(10) << __func__
<< " " << old_dirname
<< "/" << old_filename
4150 << " -> " << new_dirname
<< "/" << new_filename
<< dendl
;
4151 map
<string
,DirRef
>::iterator p
= nodes
.dir_map
.find(old_dirname
);
4152 if (p
== nodes
.dir_map
.end()) {
4153 dout(20) << __func__
<< " dir " << old_dirname
<< " not found" << dendl
;
4156 DirRef old_dir
= p
->second
;
4157 map
<string
,FileRef
>::iterator q
= old_dir
->file_map
.find(old_filename
);
4158 if (q
== old_dir
->file_map
.end()) {
4159 dout(20) << __func__
<< " dir " << old_dirname
<< " (" << old_dir
4160 << ") file " << old_filename
4161 << " not found" << dendl
;
4164 FileRef file
= q
->second
;
4166 p
= nodes
.dir_map
.find(new_dirname
);
4167 if (p
== nodes
.dir_map
.end()) {
4168 dout(20) << __func__
<< " dir " << new_dirname
<< " not found" << dendl
;
4171 DirRef new_dir
= p
->second
;
4172 q
= new_dir
->file_map
.find(new_filename
);
4173 if (q
!= new_dir
->file_map
.end()) {
4174 dout(20) << __func__
<< " dir " << new_dirname
<< " (" << old_dir
4175 << ") file " << new_filename
4176 << " already exists, unlinking" << dendl
;
4177 ceph_assert(q
->second
!= file
);
4178 log
.t
.op_dir_unlink(new_dirname
, new_filename
);
4179 _drop_link_D(q
->second
);
4182 dout(10) << __func__
<< " " << new_dirname
<< "/" << new_filename
<< " "
4183 << " " << file
->fnode
<< dendl
;
4185 new_dir
->file_map
[string
{new_filename
}] = file
;
4186 old_dir
->file_map
.erase(string
{old_filename
});
4188 log
.t
.op_dir_link(new_dirname
, new_filename
, file
->fnode
.ino
);
4189 log
.t
.op_dir_unlink(old_dirname
, old_filename
);
4193 int BlueFS::mkdir(std::string_view dirname
)/*_LN*/
4195 std::lock_guard
ll(log
.lock
);
4196 std::lock_guard
nl(nodes
.lock
);
4197 dout(10) << __func__
<< " " << dirname
<< dendl
;
4198 map
<string
,DirRef
>::iterator p
= nodes
.dir_map
.find(dirname
);
4199 if (p
!= nodes
.dir_map
.end()) {
4200 dout(20) << __func__
<< " dir " << dirname
<< " exists" << dendl
;
4203 nodes
.dir_map
[string
{dirname
}] = ceph::make_ref
<Dir
>();
4204 log
.t
.op_dir_create(dirname
);
4208 int BlueFS::rmdir(std::string_view dirname
)/*_LN*/
4210 std::lock_guard
ll(log
.lock
);
4211 std::lock_guard
nl(nodes
.lock
);
4212 dout(10) << __func__
<< " " << dirname
<< dendl
;
4213 auto p
= nodes
.dir_map
.find(dirname
);
4214 if (p
== nodes
.dir_map
.end()) {
4215 dout(20) << __func__
<< " dir " << dirname
<< " does not exist" << dendl
;
4218 DirRef dir
= p
->second
;
4219 if (!dir
->file_map
.empty()) {
4220 dout(20) << __func__
<< " dir " << dirname
<< " not empty" << dendl
;
4223 nodes
.dir_map
.erase(string
{dirname
});
4224 log
.t
.op_dir_remove(dirname
);
4228 bool BlueFS::dir_exists(std::string_view dirname
)/*_N*/
4230 std::lock_guard
nl(nodes
.lock
);
4231 map
<string
,DirRef
>::iterator p
= nodes
.dir_map
.find(dirname
);
4232 bool exists
= p
!= nodes
.dir_map
.end();
4233 dout(10) << __func__
<< " " << dirname
<< " = " << (int)exists
<< dendl
;
4237 int BlueFS::stat(std::string_view dirname
, std::string_view filename
,
4238 uint64_t *size
, utime_t
*mtime
)/*_N*/
4240 std::lock_guard
nl(nodes
.lock
);
4241 dout(10) << __func__
<< " " << dirname
<< "/" << filename
<< dendl
;
4242 map
<string
,DirRef
>::iterator p
= nodes
.dir_map
.find(dirname
);
4243 if (p
== nodes
.dir_map
.end()) {
4244 dout(20) << __func__
<< " dir " << dirname
<< " not found" << dendl
;
4247 DirRef dir
= p
->second
;
4248 map
<string
,FileRef
>::iterator q
= dir
->file_map
.find(filename
);
4249 if (q
== dir
->file_map
.end()) {
4250 dout(20) << __func__
<< " dir " << dirname
<< " (" << dir
4251 << ") file " << filename
4252 << " not found" << dendl
;
4255 File
*file
= q
->second
.get();
4256 dout(10) << __func__
<< " " << dirname
<< "/" << filename
4257 << " " << file
->fnode
<< dendl
;
4259 *size
= file
->fnode
.size
;
4261 *mtime
= file
->fnode
.mtime
;
4265 int BlueFS::lock_file(std::string_view dirname
, std::string_view filename
,
4266 FileLock
**plock
)/*_LN*/
4268 std::lock_guard
ll(log
.lock
);
4269 std::lock_guard
nl(nodes
.lock
);
4270 dout(10) << __func__
<< " " << dirname
<< "/" << filename
<< dendl
;
4271 map
<string
,DirRef
>::iterator p
= nodes
.dir_map
.find(dirname
);
4272 if (p
== nodes
.dir_map
.end()) {
4273 dout(20) << __func__
<< " dir " << dirname
<< " not found" << dendl
;
4276 DirRef dir
= p
->second
;
4277 auto q
= dir
->file_map
.find(filename
);
4279 if (q
== dir
->file_map
.end()) {
4280 dout(20) << __func__
<< " dir " << dirname
<< " (" << dir
4281 << ") file " << filename
4282 << " not found, creating" << dendl
;
4283 file
= ceph::make_ref
<File
>();
4284 file
->fnode
.ino
= ++ino_last
;
4285 file
->fnode
.mtime
= ceph_clock_now();
4286 nodes
.file_map
[ino_last
] = file
;
4287 dir
->file_map
[string
{filename
}] = file
;
4288 logger
->set(l_bluefs_num_files
, nodes
.file_map
.size());
4290 log
.t
.op_file_update(file
->fnode
);
4291 log
.t
.op_dir_link(dirname
, filename
, file
->fnode
.ino
);
4295 dout(10) << __func__
<< " already locked" << dendl
;
4299 file
->locked
= true;
4300 *plock
= new FileLock(file
);
4301 dout(10) << __func__
<< " locked " << file
->fnode
4302 << " with " << *plock
<< dendl
;
4306 int BlueFS::unlock_file(FileLock
*fl
)/*_N*/
4308 std::lock_guard
nl(nodes
.lock
);
4309 dout(10) << __func__
<< " " << fl
<< " on " << fl
->file
->fnode
<< dendl
;
4310 ceph_assert(fl
->file
->locked
);
4311 fl
->file
->locked
= false;
4316 int BlueFS::readdir(std::string_view dirname
, vector
<string
> *ls
)/*_N*/
4318 // dirname may contain a trailing /
4319 if (!dirname
.empty() && dirname
.back() == '/') {
4320 dirname
.remove_suffix(1);
4322 std::lock_guard
nl(nodes
.lock
);
4323 dout(10) << __func__
<< " " << dirname
<< dendl
;
4324 if (dirname
.empty()) {
4326 ls
->reserve(nodes
.dir_map
.size() + 2);
4327 for (auto& q
: nodes
.dir_map
) {
4328 ls
->push_back(q
.first
);
4331 // list files in dir
4332 map
<string
,DirRef
>::iterator p
= nodes
.dir_map
.find(dirname
);
4333 if (p
== nodes
.dir_map
.end()) {
4334 dout(20) << __func__
<< " dir " << dirname
<< " not found" << dendl
;
4337 DirRef dir
= p
->second
;
4338 ls
->reserve(dir
->file_map
.size() + 2);
4339 for (auto& q
: dir
->file_map
) {
4340 ls
->push_back(q
.first
);
4344 ls
->push_back("..");
4348 int BlueFS::unlink(std::string_view dirname
, std::string_view filename
)/*_LND*/
4350 std::lock_guard
ll(log
.lock
);
4351 std::lock_guard
nl(nodes
.lock
);
4352 dout(10) << __func__
<< " " << dirname
<< "/" << filename
<< dendl
;
4353 map
<string
,DirRef
>::iterator p
= nodes
.dir_map
.find(dirname
);
4354 if (p
== nodes
.dir_map
.end()) {
4355 dout(20) << __func__
<< " dir " << dirname
<< " not found" << dendl
;
4358 DirRef dir
= p
->second
;
4359 map
<string
,FileRef
>::iterator q
= dir
->file_map
.find(filename
);
4360 if (q
== dir
->file_map
.end()) {
4361 dout(20) << __func__
<< " file " << dirname
<< "/" << filename
4362 << " not found" << dendl
;
4365 FileRef file
= q
->second
;
4367 dout(20) << __func__
<< " file " << dirname
<< "/" << filename
4368 << " is locked" << dendl
;
4371 dir
->file_map
.erase(string
{filename
});
4372 log
.t
.op_dir_unlink(dirname
, filename
);
4377 bool BlueFS::wal_is_rotational()
4379 if (bdev
[BDEV_WAL
]) {
4380 return bdev
[BDEV_WAL
]->is_rotational();
4381 } else if (bdev
[BDEV_DB
]) {
4382 return bdev
[BDEV_DB
]->is_rotational();
4384 return bdev
[BDEV_SLOW
]->is_rotational();
4387 bool BlueFS::db_is_rotational()
4389 if (bdev
[BDEV_DB
]) {
4390 return bdev
[BDEV_DB
]->is_rotational();
4392 return bdev
[BDEV_SLOW
]->is_rotational();
4397 do_replay_recovery_read is used when bluefs log abruptly ends, but it seems that more data should be there.
4398 Idea is to search disk for definiton of extents that will be accompanied with bluefs log in future,
4399 and try if using it will produce healthy bluefs transaction.
4400 We encode already known bluefs log extents and search disk for these bytes.
4401 When we find it, we decode following bytes as extent.
4402 We read that whole extent and then check if merged with existing log part gives a proper bluefs transaction.
4404 int BlueFS::_do_replay_recovery_read(FileReader
*log_reader
,
4409 dout(1) << __func__
<< " replay_pos=0x" << std::hex
<< replay_pos
<<
4410 " needs 0x" << read_offset
<< "~" << read_len
<< std::dec
<< dendl
;
4412 bluefs_fnode_t
& log_fnode
= log_reader
->file
->fnode
;
4413 bufferlist bin_extents
;
4414 ::encode(log_fnode
.extents
, bin_extents
);
4415 dout(2) << __func__
<< " log file encoded extents length = " << bin_extents
.length() << dendl
;
4417 // cannot process if too small to effectively search
4418 ceph_assert(bin_extents
.length() >= 32);
4420 last_32
.substr_of(bin_extents
, bin_extents
.length() - 32, 32);
4422 //read fixed part from replay_pos to end of bluefs_log extents
4425 auto e
= log_fnode
.seek(replay_pos
, &e_off
);
4426 ceph_assert(e
!= log_fnode
.extents
.end());
4427 int r
= _bdev_read(e
->bdev
, e
->offset
+ e_off
, e
->length
- e_off
, &fixed
, ioc
[e
->bdev
],
4428 cct
->_conf
->bluefs_buffered_io
);
4429 ceph_assert(r
== 0);
4430 //capture dev of last good extent
4431 uint8_t last_e_dev
= e
->bdev
;
4432 uint64_t last_e_off
= e
->offset
;
4434 while (e
!= log_fnode
.extents
.end()) {
4435 r
= _bdev_read(e
->bdev
, e
->offset
, e
->length
, &fixed
, ioc
[e
->bdev
],
4436 cct
->_conf
->bluefs_buffered_io
);
4437 ceph_assert(r
== 0);
4438 last_e_dev
= e
->bdev
;
4441 ceph_assert(replay_pos
+ fixed
.length() == read_offset
);
4443 dout(2) << __func__
<< " valid data in log = " << fixed
.length() << dendl
;
4446 bool operator()(const bluefs_extent_t
& a
, const bluefs_extent_t
& b
) const {
4447 if (a
.bdev
< b
.bdev
) return true;
4448 if (a
.offset
< b
.offset
) return true;
4449 return a
.length
< b
.length
;
4452 std::set
<bluefs_extent_t
, compare
> extents_rejected
;
4453 for (int dcnt
= 0; dcnt
< 3; dcnt
++) {
4454 uint8_t dev
= (last_e_dev
+ dcnt
) % MAX_BDEV
;
4455 if (bdev
[dev
] == nullptr) continue;
4456 dout(2) << __func__
<< " processing " << get_device_name(dev
) << dendl
;
4457 interval_set
<uint64_t> disk_regions
;
4458 disk_regions
.insert(0, bdev
[dev
]->get_size());
4459 for (auto f
: nodes
.file_map
) {
4460 auto& e
= f
.second
->fnode
.extents
;
4462 if (p
.bdev
== dev
) {
4463 disk_regions
.erase(p
.offset
, p
.length
);
4467 size_t disk_regions_count
= disk_regions
.num_intervals();
4468 dout(5) << __func__
<< " " << disk_regions_count
<< " regions to scan on " << get_device_name(dev
) << dendl
;
4470 auto reg
= disk_regions
.lower_bound(last_e_off
);
4471 //for all except first, start from beginning
4473 if (reg
== disk_regions
.end()) {
4474 reg
= disk_regions
.begin();
4476 const uint64_t chunk_size
= 4 * 1024 * 1024;
4477 const uint64_t page_size
= 4096;
4478 const uint64_t max_extent_size
= 16;
4479 uint64_t overlay_size
= last_32
.length() + max_extent_size
;
4480 for (size_t i
= 0; i
< disk_regions_count
; reg
++, i
++) {
4481 if (reg
== disk_regions
.end()) {
4482 reg
= disk_regions
.begin();
4484 uint64_t pos
= reg
.get_start();
4485 uint64_t len
= reg
.get_len();
4487 std::unique_ptr
<char[]> raw_data_p
{new char[page_size
+ chunk_size
]};
4488 char* raw_data
= raw_data_p
.get();
4489 memset(raw_data
, 0, page_size
);
4491 while (len
> last_32
.length()) {
4492 uint64_t chunk_len
= len
> chunk_size
? chunk_size
: len
;
4493 dout(5) << __func__
<< " read "
4494 << get_device_name(dev
) << ":0x" << std::hex
<< pos
<< "+" << chunk_len
4495 << std::dec
<< dendl
;
4496 r
= _bdev_read_random(dev
, pos
, chunk_len
,
4497 raw_data
+ page_size
, cct
->_conf
->bluefs_buffered_io
);
4498 ceph_assert(r
== 0);
4500 //search for fixed_last_32
4501 char* chunk_b
= raw_data
+ page_size
;
4502 char* chunk_e
= chunk_b
+ chunk_len
;
4504 char* search_b
= chunk_b
- overlay_size
;
4505 char* search_e
= chunk_e
;
4507 for (char* sp
= search_b
; ; sp
+= last_32
.length()) {
4508 sp
= (char*)memmem(sp
, search_e
- sp
, last_32
.c_str(), last_32
.length());
4509 if (sp
== nullptr) {
4513 char* n
= sp
+ last_32
.length();
4514 dout(5) << __func__
<< " checking location 0x" << std::hex
<< pos
+ (n
- chunk_b
) << std::dec
<< dendl
;
4516 test
.append(n
, std::min
<size_t>(max_extent_size
, chunk_e
- n
));
4519 bufferlist::const_iterator p
= test
.begin();
4521 } catch (buffer::error
& e
) {
4524 if (extents_rejected
.count(ne
) != 0) {
4525 dout(5) << __func__
<< " extent " << ne
<< " already refected" <<dendl
;
4528 //insert as rejected already. if we succeed, it wouldn't make difference.
4529 extents_rejected
.insert(ne
);
4531 if (ne
.bdev
>= MAX_BDEV
||
4532 bdev
[ne
.bdev
] == nullptr ||
4533 ne
.length
> 16 * 1024 * 1024 ||
4534 (ne
.length
& 4095) != 0 ||
4535 ne
.offset
+ ne
.length
> bdev
[ne
.bdev
]->get_size() ||
4536 (ne
.offset
& 4095) != 0) {
4537 dout(5) << __func__
<< " refusing extent " << ne
<< dendl
;
4540 dout(5) << __func__
<< " checking extent " << ne
<< dendl
;
4542 //read candidate extent - whole
4543 bufferlist candidate
;
4544 candidate
.append(fixed
);
4545 r
= _bdev_read(ne
.bdev
, ne
.offset
, ne
.length
, &candidate
, ioc
[ne
.bdev
],
4546 cct
->_conf
->bluefs_buffered_io
);
4547 ceph_assert(r
== 0);
4549 //check if transaction & crc is ok
4550 bluefs_transaction_t t
;
4552 bufferlist::const_iterator p
= candidate
.begin();
4555 catch (buffer::error
& e
) {
4556 dout(5) << __func__
<< " failed match" << dendl
;
4560 //success, it seems a probable candidate
4561 uint64_t l
= std::min
<uint64_t>(ne
.length
, read_len
);
4562 //trim to required size
4563 bufferlist requested_read
;
4564 requested_read
.substr_of(candidate
, fixed
.length(), l
);
4565 bl
->append(requested_read
);
4566 dout(5) << __func__
<< " successful extension of log " << l
<< "/" << read_len
<< dendl
;
4567 log_fnode
.append_extent(ne
);
4568 log_fnode
.recalc_allocated();
4569 log_reader
->buf
.pos
+= l
;
4572 //save overlay for next search
4573 memcpy(search_b
, chunk_e
- overlay_size
, overlay_size
);
4582 void BlueFS::_check_vselector_LNF() {
4583 BlueFSVolumeSelector
* vs
= vselector
->clone_empty();
4587 std::lock_guard
ll(log
.lock
);
4588 std::lock_guard
nl(nodes
.lock
);
4589 // Checking vselector is under log, nodes and file(s) locks,
4590 // so any modification of vselector must be under at least one of those locks.
4591 for (auto& f
: nodes
.file_map
) {
4592 f
.second
->lock
.lock();
4593 vs
->add_usage(f
.second
->vselector_hint
, f
.second
->fnode
);
4595 bool res
= vselector
->compare(vs
);
4597 dout(0) << "Current:";
4598 vselector
->dump(*_dout
);
4600 dout(0) << "Expected:";
4605 for (auto& f
: nodes
.file_map
) {
4606 f
.second
->lock
.unlock();
4611 size_t BlueFS::probe_alloc_avail(int dev
, uint64_t alloc_size
)
4614 auto iterated_allocation
= [&](size_t off
, size_t len
) {
4615 //only count in size that is alloc_size aligned
4616 size_t dist_to_alignment
;
4617 size_t offset_in_block
= off
& (alloc_size
- 1);
4618 if (offset_in_block
== 0)
4619 dist_to_alignment
= 0;
4621 dist_to_alignment
= alloc_size
- offset_in_block
;
4622 if (dist_to_alignment
>= len
)
4624 len
-= dist_to_alignment
;
4625 total
+= p2align(len
, alloc_size
);
4628 alloc
[dev
]->foreach(iterated_allocation
);
4632 // ===============================================
4633 // OriginalVolumeSelector
4635 void* OriginalVolumeSelector::get_hint_for_log() const {
4636 return reinterpret_cast<void*>(BlueFS::BDEV_WAL
);
4638 void* OriginalVolumeSelector::get_hint_by_dir(std::string_view dirname
) const {
4639 uint8_t res
= BlueFS::BDEV_DB
;
4640 if (dirname
.length() > 5) {
4641 // the "db.slow" and "db.wal" directory names are hard-coded at
4642 // match up with bluestore. the slow device is always the second
4643 // one (when a dedicated block.db device is present and used at
4644 // bdev 0). the wal device is always last.
4645 if (boost::algorithm::ends_with(dirname
, ".slow") && slow_total
) {
4646 res
= BlueFS::BDEV_SLOW
;
4647 } else if (boost::algorithm::ends_with(dirname
, ".wal") && wal_total
) {
4648 res
= BlueFS::BDEV_WAL
;
4651 return reinterpret_cast<void*>(res
);
4654 uint8_t OriginalVolumeSelector::select_prefer_bdev(void* hint
)
4656 return (uint8_t)(reinterpret_cast<uint64_t>(hint
));
4659 void OriginalVolumeSelector::get_paths(const std::string
& base
, paths
& res
) const
4661 res
.emplace_back(base
, db_total
);
4662 res
.emplace_back(base
+ ".slow",
4663 slow_total
? slow_total
: db_total
); // use fake non-zero value if needed to
4664 // avoid RocksDB complains
4668 #define dout_prefix *_dout << "OriginalVolumeSelector: "
4670 void OriginalVolumeSelector::dump(ostream
& sout
) {
4671 sout
<< "wal_total:" << wal_total
4672 << ", db_total:" << db_total
4673 << ", slow_total:" << slow_total
4677 // ===============================================
4678 // FitToFastVolumeSelector
4680 void FitToFastVolumeSelector::get_paths(const std::string
& base
, paths
& res
) const {
4681 res
.emplace_back(base
, 1); // size of the last db_path has no effect