1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 #include "boost/algorithm/string.hpp"
5 #include "bluestore_common.h"
8 #include "common/debug.h"
9 #include "common/errno.h"
10 #include "common/perf_counters.h"
11 #include "BlockDevice.h"
12 #include "Allocator.h"
13 #include "include/ceph_assert.h"
14 #include "common/admin_socket.h"
16 #define dout_context cct
17 #define dout_subsys ceph_subsys_bluefs
19 #define dout_prefix *_dout << "bluefs "
20 using TOPNSPC::common::cmd_getval
;
21 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::File
, bluefs_file
, bluefs
);
22 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::Dir
, bluefs_dir
, bluefs
);
23 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileWriter
, bluefs_file_writer
, bluefs_file_writer
);
24 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileReaderBuffer
,
25 bluefs_file_reader_buffer
, bluefs_file_reader
);
26 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileReader
, bluefs_file_reader
, bluefs_file_reader
);
27 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileLock
, bluefs_file_lock
, bluefs
);
29 static void wal_discard_cb(void *priv
, void* priv2
) {
30 BlueFS
*bluefs
= static_cast<BlueFS
*>(priv
);
31 interval_set
<uint64_t> *tmp
= static_cast<interval_set
<uint64_t>*>(priv2
);
32 bluefs
->handle_discard(BlueFS::BDEV_WAL
, *tmp
);
35 static void db_discard_cb(void *priv
, void* priv2
) {
36 BlueFS
*bluefs
= static_cast<BlueFS
*>(priv
);
37 interval_set
<uint64_t> *tmp
= static_cast<interval_set
<uint64_t>*>(priv2
);
38 bluefs
->handle_discard(BlueFS::BDEV_DB
, *tmp
);
41 static void slow_discard_cb(void *priv
, void* priv2
) {
42 BlueFS
*bluefs
= static_cast<BlueFS
*>(priv
);
43 interval_set
<uint64_t> *tmp
= static_cast<interval_set
<uint64_t>*>(priv2
);
44 bluefs
->handle_discard(BlueFS::BDEV_SLOW
, *tmp
);
47 class BlueFS::SocketHook
: public AdminSocketHook
{
50 static BlueFS::SocketHook
* create(BlueFS
* bluefs
)
52 BlueFS::SocketHook
* hook
= nullptr;
53 AdminSocket
* admin_socket
= bluefs
->cct
->get_admin_socket();
55 hook
= new BlueFS::SocketHook(bluefs
);
56 int r
= admin_socket
->register_command("bluestore bluefs available "
57 "name=alloc_size,type=CephInt,req=false",
59 "Report available space for bluefs. "
60 "If alloc_size set, make simulation.");
62 ldout(bluefs
->cct
, 1) << __func__
<< " cannot register SocketHook" << dendl
;
66 r
= admin_socket
->register_command("bluefs stats",
68 "Dump internal statistics for bluefs."
77 AdminSocket
* admin_socket
= bluefs
->cct
->get_admin_socket();
78 admin_socket
->unregister_commands(this);
81 SocketHook(BlueFS
* bluefs
) :
83 int call(std::string_view command
, const cmdmap_t
& cmdmap
,
86 bufferlist
& out
) override
{
87 if (command
== "bluestore bluefs available") {
88 int64_t alloc_size
= 0;
89 cmd_getval(cmdmap
, "alloc_size", alloc_size
);
90 if ((alloc_size
& (alloc_size
- 1)) != 0) {
91 errss
<< "Invalid allocation size:'" << alloc_size
<< std::endl
;
95 alloc_size
= bluefs
->cct
->_conf
->bluefs_alloc_size
;
96 f
->open_object_section("bluefs_available_space");
97 for (unsigned dev
= BDEV_WAL
; dev
<= BDEV_SLOW
; dev
++) {
98 if (bluefs
->bdev
[dev
]) {
99 f
->open_object_section("dev");
100 f
->dump_string("device", bluefs
->get_device_name(dev
));
101 ceph_assert(bluefs
->alloc
[dev
]);
102 f
->dump_int("free", bluefs
->alloc
[dev
]->get_free());
106 size_t extra_space
= 0;
107 if (bluefs
->slow_dev_expander
) {
108 extra_space
= bluefs
->slow_dev_expander
->available_freespace(alloc_size
);
110 f
->dump_int("available_from_bluestore", extra_space
);
112 } else if (command
== "bluefs stats") {
113 std::stringstream ss
;
114 bluefs
->dump_block_extents(ss
);
115 bluefs
->dump_volume_selector(ss
);
118 errss
<< "Invalid command" << std::endl
;
125 BlueFS::BlueFS(CephContext
* cct
)
131 discard_cb
[BDEV_WAL
] = wal_discard_cb
;
132 discard_cb
[BDEV_DB
] = db_discard_cb
;
133 discard_cb
[BDEV_SLOW
] = slow_discard_cb
;
134 asok_hook
= SocketHook::create(this);
144 for (auto p
: bdev
) {
155 void BlueFS::_init_logger()
157 PerfCountersBuilder
b(cct
, "bluefs",
158 l_bluefs_first
, l_bluefs_last
);
159 b
.add_u64_counter(l_bluefs_gift_bytes
, "gift_bytes",
160 "Bytes gifted from BlueStore", NULL
, 0, unit_t(UNIT_BYTES
));
161 b
.add_u64_counter(l_bluefs_reclaim_bytes
, "reclaim_bytes",
162 "Bytes reclaimed by BlueStore", NULL
, 0, unit_t(UNIT_BYTES
));
163 b
.add_u64(l_bluefs_db_total_bytes
, "db_total_bytes",
164 "Total bytes (main db device)",
165 "b", PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
166 b
.add_u64(l_bluefs_db_used_bytes
, "db_used_bytes",
167 "Used bytes (main db device)",
168 "u", PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
169 b
.add_u64(l_bluefs_wal_total_bytes
, "wal_total_bytes",
170 "Total bytes (wal device)",
171 "walb", PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
172 b
.add_u64(l_bluefs_wal_used_bytes
, "wal_used_bytes",
173 "Used bytes (wal device)",
174 "walu", PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
175 b
.add_u64(l_bluefs_slow_total_bytes
, "slow_total_bytes",
176 "Total bytes (slow device)",
177 "slob", PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
178 b
.add_u64(l_bluefs_slow_used_bytes
, "slow_used_bytes",
179 "Used bytes (slow device)",
180 "slou", PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
181 b
.add_u64(l_bluefs_num_files
, "num_files", "File count",
182 "f", PerfCountersBuilder::PRIO_USEFUL
);
183 b
.add_u64(l_bluefs_log_bytes
, "log_bytes", "Size of the metadata log",
184 "jlen", PerfCountersBuilder::PRIO_INTERESTING
, unit_t(UNIT_BYTES
));
185 b
.add_u64_counter(l_bluefs_log_compactions
, "log_compactions",
186 "Compactions of the metadata log");
187 b
.add_u64_counter(l_bluefs_logged_bytes
, "logged_bytes",
188 "Bytes written to the metadata log", "j",
189 PerfCountersBuilder::PRIO_CRITICAL
, unit_t(UNIT_BYTES
));
190 b
.add_u64_counter(l_bluefs_files_written_wal
, "files_written_wal",
191 "Files written to WAL");
192 b
.add_u64_counter(l_bluefs_files_written_sst
, "files_written_sst",
193 "Files written to SSTs");
194 b
.add_u64_counter(l_bluefs_bytes_written_wal
, "bytes_written_wal",
195 "Bytes written to WAL", "wal",
196 PerfCountersBuilder::PRIO_CRITICAL
);
197 b
.add_u64_counter(l_bluefs_bytes_written_sst
, "bytes_written_sst",
198 "Bytes written to SSTs", "sst",
199 PerfCountersBuilder::PRIO_CRITICAL
, unit_t(UNIT_BYTES
));
200 b
.add_u64_counter(l_bluefs_bytes_written_slow
, "bytes_written_slow",
201 "Bytes written to WAL/SSTs at slow device", NULL
,
202 PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
203 b
.add_u64_counter(l_bluefs_max_bytes_wal
, "max_bytes_wal",
204 "Maximum bytes allocated from WAL");
205 b
.add_u64_counter(l_bluefs_max_bytes_db
, "max_bytes_db",
206 "Maximum bytes allocated from DB");
207 b
.add_u64_counter(l_bluefs_max_bytes_slow
, "max_bytes_slow",
208 "Maximum bytes allocated from SLOW");
210 b
.add_u64_counter(l_bluefs_read_random_count
, "read_random_count",
211 "random read requests processed");
212 b
.add_u64_counter(l_bluefs_read_random_bytes
, "read_random_bytes",
213 "Bytes requested in random read mode", NULL
,
214 PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
215 b
.add_u64_counter(l_bluefs_read_random_disk_count
, "read_random_disk_count",
216 "random reads requests going to disk");
217 b
.add_u64_counter(l_bluefs_read_random_disk_bytes
, "read_random_disk_bytes",
218 "Bytes read from disk in random read mode", NULL
,
219 PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
220 b
.add_u64_counter(l_bluefs_read_random_buffer_count
, "read_random_buffer_count",
221 "random read requests processed using prefetch buffer");
222 b
.add_u64_counter(l_bluefs_read_random_buffer_bytes
, "read_random_buffer_bytes",
223 "Bytes read from prefetch buffer in random read mode", NULL
,
224 PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
226 b
.add_u64_counter(l_bluefs_read_count
, "read_count",
227 "buffered read requests processed");
228 b
.add_u64_counter(l_bluefs_read_bytes
, "read_bytes",
229 "Bytes requested in buffered read mode", NULL
,
230 PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
232 b
.add_u64_counter(l_bluefs_read_prefetch_count
, "read_prefetch_count",
233 "prefetch read requests processed");
234 b
.add_u64_counter(l_bluefs_read_prefetch_bytes
, "read_prefetch_bytes",
235 "Bytes requested in prefetch read mode", NULL
,
236 PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
238 logger
= b
.create_perf_counters();
239 cct
->get_perfcounters_collection()->add(logger
);
242 void BlueFS::_shutdown_logger()
244 cct
->get_perfcounters_collection()->remove(logger
);
248 void BlueFS::_update_logger_stats()
250 // we must be holding the lock
251 logger
->set(l_bluefs_num_files
, file_map
.size());
252 logger
->set(l_bluefs_log_bytes
, log_writer
->file
->fnode
.size
);
254 if (alloc
[BDEV_WAL
]) {
255 logger
->set(l_bluefs_wal_total_bytes
, block_all
[BDEV_WAL
].size());
256 logger
->set(l_bluefs_wal_used_bytes
,
257 block_all
[BDEV_WAL
].size() - alloc
[BDEV_WAL
]->get_free());
259 if (alloc
[BDEV_DB
]) {
260 logger
->set(l_bluefs_db_total_bytes
, block_all
[BDEV_DB
].size());
261 logger
->set(l_bluefs_db_used_bytes
,
262 block_all
[BDEV_DB
].size() - alloc
[BDEV_DB
]->get_free());
264 if (alloc
[BDEV_SLOW
]) {
265 logger
->set(l_bluefs_slow_total_bytes
, block_all
[BDEV_SLOW
].size());
266 logger
->set(l_bluefs_slow_used_bytes
,
267 block_all
[BDEV_SLOW
].size() - alloc
[BDEV_SLOW
]->get_free());
271 int BlueFS::add_block_device(unsigned id
, const string
& path
, bool trim
,
272 bool shared_with_bluestore
)
274 dout(10) << __func__
<< " bdev " << id
<< " path " << path
<< dendl
;
275 ceph_assert(id
< bdev
.size());
276 ceph_assert(bdev
[id
] == NULL
);
277 BlockDevice
*b
= BlockDevice::create(cct
, path
, NULL
, NULL
,
278 discard_cb
[id
], static_cast<void*>(this));
279 if (shared_with_bluestore
) {
280 b
->set_no_exclusive_lock();
282 int r
= b
->open(path
);
288 b
->discard(0, b
->get_size());
291 dout(1) << __func__
<< " bdev " << id
<< " path " << path
292 << " size " << byte_u_t(b
->get_size()) << dendl
;
294 ioc
[id
] = new IOContext(cct
, NULL
);
298 bool BlueFS::bdev_support_label(unsigned id
)
300 ceph_assert(id
< bdev
.size());
301 ceph_assert(bdev
[id
]);
302 return bdev
[id
]->supported_bdev_label();
305 uint64_t BlueFS::get_block_device_size(unsigned id
)
307 if (id
< bdev
.size() && bdev
[id
])
308 return bdev
[id
]->get_size();
312 void BlueFS::_add_block_extent(unsigned id
, uint64_t offset
, uint64_t length
,
315 dout(1) << __func__
<< " bdev " << id
316 << " 0x" << std::hex
<< offset
<< "~" << length
<< std::dec
320 ceph_assert(id
< bdev
.size());
321 ceph_assert(bdev
[id
]);
322 ceph_assert(bdev
[id
]->get_size() >= offset
+ length
);
323 block_all
[id
].insert(offset
, length
);
325 if (id
< alloc
.size() && alloc
[id
]) {
327 log_t
.op_alloc_add(id
, offset
, length
);
329 alloc
[id
]->init_add_free(offset
, length
);
333 logger
->inc(l_bluefs_gift_bytes
, length
);
334 dout(10) << __func__
<< " done" << dendl
;
337 int BlueFS::reclaim_blocks(unsigned id
, uint64_t want
,
338 PExtentVector
*extents
)
340 std::unique_lock
l(lock
);
341 dout(1) << __func__
<< " bdev " << id
342 << " want 0x" << std::hex
<< want
<< std::dec
<< dendl
;
343 ceph_assert(id
< alloc
.size());
344 ceph_assert(alloc
[id
]);
347 interval_set
<uint64_t> granular
;
348 while (want
> 0 && !block_unused_too_granular
[id
].empty()) {
349 auto p
= block_unused_too_granular
[id
].begin();
350 dout(20) << __func__
<< " unused " << (int)id
<< ":"
351 << std::hex
<< p
.get_start() << "~" << p
.get_len() << dendl
;
352 extents
->push_back({p
.get_start(), p
.get_len()});
353 granular
.insert(p
.get_start(), p
.get_len());
354 if (want
>= p
.get_len()) {
360 block_unused_too_granular
[id
].erase(p
);
364 got
+= alloc
[id
]->allocate(want
, alloc_size
[id
], 0, extents
);
365 ceph_assert(got
!= 0);
367 derr
<< __func__
<< " failed to allocate space to return to bluestore"
370 block_unused_too_granular
[id
].insert(granular
);
374 for (auto& p
: *extents
) {
375 block_all
[id
].erase(p
.offset
, p
.length
);
376 log_t
.op_alloc_rm(id
, p
.offset
, p
.length
);
380 int r
= _flush_and_sync_log(l
);
384 logger
->inc(l_bluefs_reclaim_bytes
, got
);
385 dout(1) << __func__
<< " bdev " << id
<< " want 0x" << std::hex
<< want
386 << " got " << *extents
<< dendl
;
390 void BlueFS::handle_discard(unsigned id
, interval_set
<uint64_t>& to_release
)
392 dout(10) << __func__
<< " bdev " << id
<< dendl
;
393 ceph_assert(alloc
[id
]);
394 alloc
[id
]->release(to_release
);
397 uint64_t BlueFS::get_used()
399 std::lock_guard
l(lock
);
401 for (unsigned id
= 0; id
< MAX_BDEV
; ++id
) {
403 used
+= block_all
[id
].size() - alloc
[id
]->get_free();
409 uint64_t BlueFS::get_total(unsigned id
)
411 std::lock_guard
l(lock
);
412 ceph_assert(id
< block_all
.size());
413 return block_all
[id
].size();
416 uint64_t BlueFS::get_free(unsigned id
)
418 std::lock_guard
l(lock
);
419 ceph_assert(id
< alloc
.size());
420 return alloc
[id
]->get_free();
423 void BlueFS::dump_perf_counters(Formatter
*f
)
425 f
->open_object_section("bluefs_perf_counters");
426 logger
->dump_formatted(f
,0);
430 void BlueFS::dump_block_extents(ostream
& out
)
432 for (unsigned i
= 0; i
< MAX_BDEV
; ++i
) {
436 auto owned
= get_total(i
);
437 auto free
= get_free(i
);
439 out
<< i
<< " : device size 0x" << std::hex
<< bdev
[i
]->get_size()
440 << " : own 0x" << block_all
[i
]
442 << " : using 0x" << owned
- free
443 << std::dec
<< "(" << byte_u_t(owned
- free
) << ")";
444 if (i
== _get_slow_device_id()) {
445 ceph_assert(slow_dev_expander
);
446 ceph_assert(alloc
[i
]);
447 free
= slow_dev_expander
->available_freespace(alloc_size
[i
]);
449 << " : bluestore has 0x" << free
450 << std::dec
<< "(" << byte_u_t(free
) << ") available";
456 void BlueFS::get_usage(vector
<pair
<uint64_t,uint64_t>> *usage
)
458 std::lock_guard
l(lock
);
459 usage
->resize(bdev
.size());
460 for (unsigned id
= 0; id
< bdev
.size(); ++id
) {
462 (*usage
)[id
] = make_pair(0, 0);
465 (*usage
)[id
].first
= alloc
[id
]->get_free();
466 (*usage
)[id
].second
= block_all
[id
].size();
468 (block_all
[id
].size() - (*usage
)[id
].first
) * 100 / block_all
[id
].size();
469 dout(10) << __func__
<< " bdev " << id
470 << " free " << (*usage
)[id
].first
471 << " (" << byte_u_t((*usage
)[id
].first
) << ")"
472 << " / " << (*usage
)[id
].second
473 << " (" << byte_u_t((*usage
)[id
].second
) << ")"
474 << ", used " << used
<< "%"
479 int BlueFS::get_block_extents(unsigned id
, interval_set
<uint64_t> *extents
)
481 std::lock_guard
l(lock
);
482 dout(10) << __func__
<< " bdev " << id
<< dendl
;
483 if (id
>= block_all
.size())
485 *extents
= block_all
[id
];
489 int BlueFS::mkfs(uuid_d osd_uuid
, const bluefs_layout_t
& layout
)
491 std::unique_lock
l(lock
);
493 << " osd_uuid " << osd_uuid
496 // set volume selector if not provided before/outside
497 if (vselector
== nullptr) {
499 new OriginalVolumeSelector(
500 get_block_device_size(BlueFS::BDEV_WAL
) * 95 / 100,
501 get_block_device_size(BlueFS::BDEV_DB
) * 95 / 100,
502 get_block_device_size(BlueFS::BDEV_SLOW
) * 95 / 100));
509 super
.block_size
= bdev
[BDEV_DB
]->get_block_size();
510 super
.osd_uuid
= osd_uuid
;
511 super
.uuid
.generate_random();
512 dout(1) << __func__
<< " uuid " << super
.uuid
<< dendl
;
515 FileRef log_file
= ceph::make_ref
<File
>();
516 log_file
->fnode
.ino
= 1;
517 log_file
->vselector_hint
= vselector
->get_hint_for_log();
519 vselector
->select_prefer_bdev(log_file
->vselector_hint
),
520 cct
->_conf
->bluefs_max_log_runway
,
522 vselector
->add_usage(log_file
->vselector_hint
, log_file
->fnode
);
524 log_writer
= _create_writer(log_file
);
528 for (unsigned bdev
= 0; bdev
< MAX_BDEV
; ++bdev
) {
529 interval_set
<uint64_t>& p
= block_all
[bdev
];
532 for (interval_set
<uint64_t>::iterator q
= p
.begin(); q
!= p
.end(); ++q
) {
533 dout(20) << __func__
<< " op_alloc_add " << bdev
<< " 0x"
534 << std::hex
<< q
.get_start() << "~" << q
.get_len() << std::dec
536 log_t
.op_alloc_add(bdev
, q
.get_start(), q
.get_len());
539 _flush_and_sync_log(l
);
542 super
.log_fnode
= log_file
->fnode
;
543 super
.memorized_layout
= layout
;
544 _write_super(BDEV_DB
);
548 super
= bluefs_super_t();
549 _close_writer(log_writer
);
552 vselector
.reset(nullptr);
556 dout(10) << __func__
<< " success" << dendl
;
560 void BlueFS::_init_alloc()
562 dout(20) << __func__
<< dendl
;
563 alloc
.resize(MAX_BDEV
);
564 alloc_size
.resize(MAX_BDEV
, 0);
565 pending_release
.resize(MAX_BDEV
);
566 block_unused_too_granular
.resize(MAX_BDEV
);
568 if (bdev
[BDEV_WAL
]) {
569 alloc_size
[BDEV_WAL
] = cct
->_conf
->bluefs_alloc_size
;
571 if (bdev
[BDEV_SLOW
]) {
572 alloc_size
[BDEV_DB
] = cct
->_conf
->bluefs_alloc_size
;
573 alloc_size
[BDEV_SLOW
] = cct
->_conf
->bluefs_shared_alloc_size
;
575 alloc_size
[BDEV_DB
] = cct
->_conf
->bluefs_shared_alloc_size
;
577 // new wal and db devices are never shared
578 if (bdev
[BDEV_NEWWAL
]) {
579 alloc_size
[BDEV_NEWWAL
] = cct
->_conf
->bluefs_alloc_size
;
581 if (bdev
[BDEV_NEWDB
]) {
582 alloc_size
[BDEV_NEWDB
] = cct
->_conf
->bluefs_alloc_size
;
585 for (unsigned id
= 0; id
< bdev
.size(); ++id
) {
589 ceph_assert(bdev
[id
]->get_size());
590 std::string name
= "bluefs-";
591 const char* devnames
[] = {"wal","db","slow"};
593 name
+= devnames
[id
];
595 name
+= to_string(uintptr_t(this));
596 ceph_assert(alloc_size
[id
]);
597 dout(1) << __func__
<< " id " << id
598 << " alloc_size 0x" << std::hex
<< alloc_size
[id
]
599 << " size 0x" << bdev
[id
]->get_size() << std::dec
<< dendl
;
600 alloc
[id
] = Allocator::create(cct
, cct
->_conf
->bluefs_allocator
,
601 bdev
[id
]->get_size(),
602 alloc_size
[id
], name
);
603 interval_set
<uint64_t>& p
= block_all
[id
];
604 for (interval_set
<uint64_t>::iterator q
= p
.begin(); q
!= p
.end(); ++q
) {
605 alloc
[id
]->init_add_free(q
.get_start(), q
.get_len());
610 void BlueFS::_stop_alloc()
612 dout(20) << __func__
<< dendl
;
613 for (auto p
: bdev
) {
618 for (auto p
: alloc
) {
625 block_unused_too_granular
.clear();
630 dout(1) << __func__
<< dendl
;
632 int r
= _open_super();
634 derr
<< __func__
<< " failed to open super: " << cpp_strerror(r
) << dendl
;
638 // set volume selector if not provided before/outside
639 if (vselector
== nullptr) {
641 new OriginalVolumeSelector(
642 get_block_device_size(BlueFS::BDEV_WAL
) * 95 / 100,
643 get_block_device_size(BlueFS::BDEV_DB
) * 95 / 100,
644 get_block_device_size(BlueFS::BDEV_SLOW
) * 95 / 100));
648 block_all
.resize(MAX_BDEV
);
652 r
= _replay(false, false);
654 derr
<< __func__
<< " failed to replay log: " << cpp_strerror(r
) << dendl
;
660 for (auto& p
: file_map
) {
661 dout(30) << __func__
<< " noting alloc for " << p
.second
->fnode
<< dendl
;
662 for (auto& q
: p
.second
->fnode
.extents
) {
663 alloc
[q
.bdev
]->init_rm_free(q
.offset
, q
.length
);
667 // set up the log for future writes
668 log_writer
= _create_writer(_get_file(1));
669 ceph_assert(log_writer
->file
->fnode
.ino
== 1);
670 log_writer
->pos
= log_writer
->file
->fnode
.size
;
671 dout(10) << __func__
<< " log write pos set to 0x"
672 << std::hex
<< log_writer
->pos
<< std::dec
678 super
= bluefs_super_t();
682 int BlueFS::maybe_verify_layout(const bluefs_layout_t
& layout
) const
684 if (super
.memorized_layout
) {
685 if (layout
== *super
.memorized_layout
) {
686 dout(10) << __func__
<< " bluefs layout verified positively" << dendl
;
688 derr
<< __func__
<< " memorized layout doesn't fit current one" << dendl
;
692 dout(10) << __func__
<< " no memorized_layout in bluefs superblock"
699 void BlueFS::umount(bool avoid_compact
)
701 dout(1) << __func__
<< dendl
;
703 sync_metadata(avoid_compact
);
705 _close_writer(log_writer
);
708 vselector
.reset(nullptr);
712 super
= bluefs_super_t();
717 int BlueFS::prepare_new_device(int id
, const bluefs_layout_t
& layout
)
719 dout(1) << __func__
<< dendl
;
721 if(id
== BDEV_NEWDB
) {
722 int new_log_dev_cur
= BDEV_WAL
;
723 int new_log_dev_next
= BDEV_WAL
;
724 if (!bdev
[BDEV_WAL
]) {
725 new_log_dev_cur
= BDEV_NEWDB
;
726 new_log_dev_next
= BDEV_DB
;
728 _rewrite_log_and_layout_sync(false,
735 } else if(id
== BDEV_NEWWAL
) {
736 _rewrite_log_and_layout_sync(false,
748 void BlueFS::collect_metadata(map
<string
,string
> *pm
, unsigned skip_bdev_id
)
750 if (skip_bdev_id
!= BDEV_DB
&& bdev
[BDEV_DB
])
751 bdev
[BDEV_DB
]->collect_metadata("bluefs_db_", pm
);
753 bdev
[BDEV_WAL
]->collect_metadata("bluefs_wal_", pm
);
756 void BlueFS::get_devices(set
<string
> *ls
)
758 for (unsigned i
= 0; i
< MAX_BDEV
; ++i
) {
760 bdev
[i
]->get_devices(ls
);
767 std::lock_guard
l(lock
);
768 dout(1) << __func__
<< dendl
;
769 // hrm, i think we check everything on mount...
773 int BlueFS::_write_super(int dev
)
778 uint32_t crc
= bl
.crc32c(-1);
780 dout(10) << __func__
<< " super block length(encoded): " << bl
.length() << dendl
;
781 dout(10) << __func__
<< " superblock " << super
.version
<< dendl
;
782 dout(10) << __func__
<< " log_fnode " << super
.log_fnode
<< dendl
;
783 ceph_assert_always(bl
.length() <= get_super_length());
784 bl
.append_zero(get_super_length() - bl
.length());
786 bdev
[dev
]->write(get_super_offset(), bl
, false, WRITE_LIFE_SHORT
);
787 dout(20) << __func__
<< " v " << super
.version
788 << " crc 0x" << std::hex
<< crc
789 << " offset 0x" << get_super_offset() << std::dec
794 int BlueFS::_open_super()
796 dout(10) << __func__
<< dendl
;
799 uint32_t expected_crc
, crc
;
802 // always the second block
803 r
= bdev
[BDEV_DB
]->read(get_super_offset(), get_super_length(),
804 &bl
, ioc
[BDEV_DB
], false);
808 auto p
= bl
.cbegin();
812 t
.substr_of(bl
, 0, p
.get_off());
815 decode(expected_crc
, p
);
816 if (crc
!= expected_crc
) {
817 derr
<< __func__
<< " bad crc on superblock, expected 0x"
818 << std::hex
<< expected_crc
<< " != actual 0x" << crc
<< std::dec
822 dout(10) << __func__
<< " superblock " << super
.version
<< dendl
;
823 dout(10) << __func__
<< " log_fnode " << super
.log_fnode
<< dendl
;
827 int BlueFS::_check_new_allocations(const bluefs_fnode_t
& fnode
,
829 boost::dynamic_bitset
<uint64_t>* owned_blocks
,
830 boost::dynamic_bitset
<uint64_t>* used_blocks
)
832 auto& fnode_extents
= fnode
.extents
;
833 for (auto e
: fnode_extents
) {
836 ceph_assert(id
< dev_count
);
837 apply_for_bitset_range(e
.offset
, e
.length
, alloc_size
[id
], owned_blocks
[id
],
838 [&](uint64_t pos
, boost::dynamic_bitset
<uint64_t> &bs
) {
845 derr
<< __func__
<< " invalid extent " << int(id
)
846 << ": 0x" << std::hex
<< e
.offset
<< "~" << e
.length
848 << ": wasn't given but allocated for ino " << fnode
.ino
853 apply_for_bitset_range(e
.offset
, e
.length
, alloc_size
[id
], used_blocks
[id
],
854 [&](uint64_t pos
, boost::dynamic_bitset
<uint64_t> &bs
) {
862 derr
<< __func__
<< " invalid extent " << int(e
.bdev
)
863 << ": 0x" << std::hex
<< e
.offset
<< "~" << e
.length
864 << std::dec
<< ": duplicate reference, ino " << fnode
.ino
872 int BlueFS::_adjust_granularity(
873 __u8 id
, uint64_t *offset
, uint64_t *length
, bool alloc
)
875 const char *op
= alloc
? "op_alloc_add" : "op_alloc_rm";
878 if (*offset
& (alloc_size
[id
] - 1)) {
879 *offset
&= ~(alloc_size
[id
] - 1);
880 *offset
+= alloc_size
[id
];
881 if (*length
> *offset
- oldo
) {
883 block_unused_too_granular
[id
].insert(oldo
, *offset
- oldo
);
885 block_unused_too_granular
[id
].erase(oldo
, *offset
- oldo
);
887 *length
-= (*offset
- oldo
);
890 block_unused_too_granular
[id
].insert(oldo
, *length
);
892 block_unused_too_granular
[id
].erase(oldo
, *length
);
897 if (*length
& (alloc_size
[id
] - 1)) {
898 *length
&= ~(alloc_size
[id
] - 1);
900 block_unused_too_granular
[id
].insert(
902 oldo
+ oldl
- *offset
- *length
);
904 block_unused_too_granular
[id
].erase(
906 oldo
+ oldl
- *offset
- *length
);
909 if (oldo
!= *offset
|| oldl
!= *length
) {
910 dout(10) << __func__
<< " " << op
<< " "
911 << (int)id
<< ":" << std::hex
<< oldo
<< "~" << oldl
912 << " -> " << (int)id
<< ":" << *offset
<< "~" << *length
<< dendl
;
917 int BlueFS::_verify_alloc_granularity(
918 __u8 id
, uint64_t offset
, uint64_t length
, const char *op
)
920 if ((offset
& (alloc_size
[id
] - 1)) ||
921 (length
& (alloc_size
[id
] - 1))) {
922 derr
<< __func__
<< " " << op
<< " of " << (int)id
923 << ":0x" << std::hex
<< offset
<< "~" << length
<< std::dec
924 << " does not align to alloc_size 0x"
925 << std::hex
<< alloc_size
[id
] << std::dec
<< dendl
;
927 auto need
= alloc_size
[id
];
928 while (need
&& ((offset
& (need
- 1)) ||
929 (length
& (need
- 1)))) {
934 if (id
== BDEV_SLOW
||
935 (id
== BDEV_DB
&& !bdev
[BDEV_SLOW
])) {
936 which
= "bluefs_shared_alloc_size";
938 which
= "bluefs_alloc_size";
940 derr
<< "work-around by setting " << which
<< " = " << need
941 << " for this OSD" << dendl
;
948 int BlueFS::_replay(bool noop
, bool to_stdout
)
950 dout(10) << __func__
<< (noop
? " NO-OP" : "") << dendl
;
951 ino_last
= 1; // by the log
955 log_file
= _get_file(1);
958 for (auto& a
: block_unused_too_granular
) {
959 ceph_assert(a
.empty());
963 log_file
->fnode
= super
.log_fnode
;
964 log_file
->vselector_hint
=
965 vselector
->get_hint_for_log();
967 // do not use fnode from superblock in 'noop' mode - log_file's one should
968 // be fine and up-to-date
969 ceph_assert(log_file
->fnode
.ino
== 1);
970 ceph_assert(log_file
->fnode
.extents
.size() != 0);
972 dout(10) << __func__
<< " log_fnode " << super
.log_fnode
<< dendl
;
973 if (unlikely(to_stdout
)) {
974 std::cout
<< " log_fnode " << super
.log_fnode
<< std::endl
;
977 FileReader
*log_reader
= new FileReader(
978 log_file
, cct
->_conf
->bluefs_max_prefetch
,
982 bool seen_recs
= false;
984 boost::dynamic_bitset
<uint64_t> used_blocks
[MAX_BDEV
];
985 boost::dynamic_bitset
<uint64_t> owned_blocks
[MAX_BDEV
];
987 if (cct
->_conf
->bluefs_log_replay_check_allocations
) {
988 for (size_t i
= 0; i
< MAX_BDEV
; ++i
) {
989 if (alloc_size
[i
] != 0 && bdev
[i
] != nullptr) {
990 used_blocks
[i
].resize(round_up_to(bdev
[i
]->get_size(), alloc_size
[i
]) / alloc_size
[i
]);
991 owned_blocks
[i
].resize(round_up_to(bdev
[i
]->get_size(), alloc_size
[i
]) / alloc_size
[i
]);
996 bool first_log_check
= true;
999 ceph_assert((log_reader
->buf
.pos
& ~super
.block_mask()) == 0);
1000 uint64_t pos
= log_reader
->buf
.pos
;
1001 uint64_t read_pos
= pos
;
1004 int r
= _read(log_reader
, &log_reader
->buf
, read_pos
, super
.block_size
,
1006 if (r
!= (int)super
.block_size
&& cct
->_conf
->bluefs_replay_recovery
) {
1007 r
+= do_replay_recovery_read(log_reader
, pos
, read_pos
+ r
, super
.block_size
- r
, &bl
);
1009 assert(r
== (int)super
.block_size
);
1016 auto p
= bl
.cbegin();
1024 if (len
+ 6 > bl
.length()) {
1025 more
= round_up_to(len
+ 6 - bl
.length(), super
.block_size
);
1028 if (uuid
!= super
.uuid
) {
1030 dout(10) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
1031 << ": stop: uuid " << uuid
<< " != super.uuid " << super
.uuid
1034 derr
<< __func__
<< " 0x" << std::hex
<< pos
<< std::dec
1035 << ": stop: uuid " << uuid
<< " != super.uuid " << super
.uuid
1036 << ", block dump: \n";
1038 t
.substr_of(bl
, 0, super
.block_size
);
1044 if (seq
!= log_seq
+ 1) {
1046 dout(10) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
1047 << ": stop: seq " << seq
<< " != expected " << log_seq
+ 1
1050 derr
<< __func__
<< " 0x" << std::hex
<< pos
<< std::dec
1051 << ": stop: seq " << seq
<< " != expected " << log_seq
+ 1
1057 dout(20) << __func__
<< " need 0x" << std::hex
<< more
<< std::dec
1058 << " more bytes" << dendl
;
1060 int r
= _read(log_reader
, &log_reader
->buf
, read_pos
, more
, &t
, NULL
);
1061 if (r
< (int)more
) {
1062 dout(10) << __func__
<< " 0x" << std::hex
<< pos
1063 << ": stop: len is 0x" << bl
.length() + more
<< std::dec
1064 << ", which is past eof" << dendl
;
1065 if (cct
->_conf
->bluefs_replay_recovery
) {
1066 //try to search for more data
1067 r
+= do_replay_recovery_read(log_reader
, pos
, read_pos
+ r
, more
- r
, &t
);
1068 if (r
< (int)more
) {
1069 //in normal mode we must read r==more, for recovery it is too strict
1074 ceph_assert(r
== (int)more
);
1079 bluefs_transaction_t t
;
1081 auto p
= bl
.cbegin();
1084 catch (buffer::error
& e
) {
1085 derr
<< __func__
<< " 0x" << std::hex
<< pos
<< std::dec
1086 << ": stop: failed to decode: " << e
.what()
1091 ceph_assert(seq
== t
.seq
);
1092 dout(10) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
1093 << ": " << t
<< dendl
;
1094 if (unlikely(to_stdout
)) {
1095 std::cout
<< " 0x" << std::hex
<< pos
<< std::dec
1096 << ": " << t
<< std::endl
;
1099 auto p
= t
.op_bl
.cbegin();
1105 case bluefs_transaction_t::OP_INIT
:
1106 dout(20) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
1107 << ": op_init" << dendl
;
1108 if (unlikely(to_stdout
)) {
1109 std::cout
<< " 0x" << std::hex
<< pos
<< std::dec
1110 << ": op_init" << std::endl
;
1113 ceph_assert(t
.seq
== 1);
1116 case bluefs_transaction_t::OP_JUMP
:
1120 decode(next_seq
, p
);
1122 dout(20) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
1123 << ": op_jump seq " << next_seq
1124 << " offset 0x" << std::hex
<< offset
<< std::dec
<< dendl
;
1125 if (unlikely(to_stdout
)) {
1126 std::cout
<< " 0x" << std::hex
<< pos
<< std::dec
1127 << ": op_jump seq " << next_seq
1128 << " offset 0x" << std::hex
<< offset
<< std::dec
1132 ceph_assert(next_seq
>= log_seq
);
1133 log_seq
= next_seq
- 1; // we will increment it below
1134 uint64_t skip
= offset
- read_pos
;
1137 int r
= _read(log_reader
, &log_reader
->buf
, read_pos
, skip
, &junk
,
1139 if (r
!= (int)skip
) {
1140 dout(10) << __func__
<< " 0x" << std::hex
<< read_pos
1141 << ": stop: failed to skip to " << offset
1142 << std::dec
<< dendl
;
1143 ceph_abort_msg("problem with op_jump");
1149 case bluefs_transaction_t::OP_JUMP_SEQ
:
1152 decode(next_seq
, p
);
1153 dout(20) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
1154 << ": op_jump_seq " << next_seq
<< dendl
;
1155 if (unlikely(to_stdout
)) {
1156 std::cout
<< " 0x" << std::hex
<< pos
<< std::dec
1157 << ": op_jump_seq " << next_seq
<< std::endl
;
1160 ceph_assert(next_seq
>= log_seq
);
1161 log_seq
= next_seq
- 1; // we will increment it below
1165 case bluefs_transaction_t::OP_ALLOC_ADD
:
1168 uint64_t offset
, length
;
1172 dout(20) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
1173 << ": op_alloc_add " << " " << (int)id
1174 << ":0x" << std::hex
<< offset
<< "~" << length
<< std::dec
1176 if (unlikely(to_stdout
)) {
1177 std::cout
<< " 0x" << std::hex
<< pos
<< std::dec
1178 << ": op_alloc_add " << " " << (int)id
1179 << ":0x" << std::hex
<< offset
<< "~" << length
<< std::dec
1183 block_all
[id
].insert(offset
, length
);
1184 _adjust_granularity(id
, &offset
, &length
, true);
1186 alloc
[id
]->init_add_free(offset
, length
);
1189 if (cct
->_conf
->bluefs_log_replay_check_allocations
) {
1191 apply_for_bitset_range(offset
, length
, alloc_size
[id
], owned_blocks
[id
],
1192 [&](uint64_t pos
, boost::dynamic_bitset
<uint64_t> &bs
) {
1201 derr
<< __func__
<< " invalid extent " << (int)id
1202 << ": 0x" << std::hex
<< offset
<< "~" << length
1203 << std::dec
<< ": already given" << dendl
;
1206 apply_for_bitset_range(offset
, length
, alloc_size
[id
], used_blocks
[id
],
1207 [&](uint64_t pos
, boost::dynamic_bitset
<uint64_t> &bs
) {
1214 derr
<< __func__
<< " invalid extent " << int(id
)
1215 << ": 0x" << std::hex
<< offset
<< "~" << length
1216 << std::dec
<< ": already in use" << dendl
;
1224 case bluefs_transaction_t::OP_ALLOC_RM
:
1227 uint64_t offset
, length
;
1231 dout(20) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
1232 << ": op_alloc_rm " << " " << (int)id
1233 << ":0x" << std::hex
<< offset
<< "~" << length
<< std::dec
1235 if (unlikely(to_stdout
)) {
1236 std::cout
<< " 0x" << std::hex
<< pos
<< std::dec
1237 << ": op_alloc_rm " << " " << (int)id
1238 << ":0x" << std::hex
<< offset
<< "~" << length
<< std::dec
1242 block_all
[id
].erase(offset
, length
);
1243 _adjust_granularity(id
, &offset
, &length
, false);
1245 alloc
[id
]->init_rm_free(offset
, length
);
1247 if (cct
->_conf
->bluefs_log_replay_check_allocations
) {
1249 apply_for_bitset_range(offset
, length
, alloc_size
[id
], owned_blocks
[id
],
1250 [&](uint64_t pos
, boost::dynamic_bitset
<uint64_t> &bs
) {
1251 if (!bs
.test(pos
)) {
1259 derr
<< __func__
<< " invalid extent " << int(id
)
1260 << ": 0x" << std::hex
<< offset
<< "~" << length
1261 << std::dec
<< ": wasn't given" << dendl
;
1265 apply_for_bitset_range(offset
, length
, alloc_size
[id
], used_blocks
[id
],
1266 [&](uint64_t pos
, boost::dynamic_bitset
<uint64_t> &bs
) {
1273 derr
<< __func__
<< " invalid extent " << (int)id
1274 << ": 0x" << std::hex
<< offset
<< "~" << length
1275 << std::dec
<< ": still in use" << dendl
;
1283 case bluefs_transaction_t::OP_DIR_LINK
:
1285 string dirname
, filename
;
1288 decode(filename
, p
);
1290 dout(20) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
1291 << ": op_dir_link " << " " << dirname
<< "/" << filename
1294 if (unlikely(to_stdout
)) {
1295 std::cout
<< " 0x" << std::hex
<< pos
<< std::dec
1296 << ": op_dir_link " << " " << dirname
<< "/" << filename
1302 FileRef file
= _get_file(ino
);
1303 ceph_assert(file
->fnode
.ino
);
1304 map
<string
,DirRef
>::iterator q
= dir_map
.find(dirname
);
1305 ceph_assert(q
!= dir_map
.end());
1306 map
<string
,FileRef
>::iterator r
= q
->second
->file_map
.find(filename
);
1307 ceph_assert(r
== q
->second
->file_map
.end());
1309 vselector
->sub_usage(file
->vselector_hint
, file
->fnode
);
1310 file
->vselector_hint
=
1311 vselector
->get_hint_by_dir(dirname
);
1312 vselector
->add_usage(file
->vselector_hint
, file
->fnode
);
1314 q
->second
->file_map
[filename
] = file
;
1320 case bluefs_transaction_t::OP_DIR_UNLINK
:
1322 string dirname
, filename
;
1324 decode(filename
, p
);
1325 dout(20) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
1326 << ": op_dir_unlink " << " " << dirname
<< "/" << filename
1328 if (unlikely(to_stdout
)) {
1329 std::cout
<< " 0x" << std::hex
<< pos
<< std::dec
1330 << ": op_dir_unlink " << " " << dirname
<< "/" << filename
1335 map
<string
,DirRef
>::iterator q
= dir_map
.find(dirname
);
1336 ceph_assert(q
!= dir_map
.end());
1337 map
<string
,FileRef
>::iterator r
= q
->second
->file_map
.find(filename
);
1338 ceph_assert(r
!= q
->second
->file_map
.end());
1339 ceph_assert(r
->second
->refs
> 0);
1341 q
->second
->file_map
.erase(r
);
1346 case bluefs_transaction_t::OP_DIR_CREATE
:
1350 dout(20) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
1351 << ": op_dir_create " << dirname
<< dendl
;
1352 if (unlikely(to_stdout
)) {
1353 std::cout
<< " 0x" << std::hex
<< pos
<< std::dec
1354 << ": op_dir_create " << dirname
<< std::endl
;
1358 map
<string
,DirRef
>::iterator q
= dir_map
.find(dirname
);
1359 ceph_assert(q
== dir_map
.end());
1360 dir_map
[dirname
] = ceph::make_ref
<Dir
>();
1365 case bluefs_transaction_t::OP_DIR_REMOVE
:
1369 dout(20) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
1370 << ": op_dir_remove " << dirname
<< dendl
;
1371 if (unlikely(to_stdout
)) {
1372 std::cout
<< " 0x" << std::hex
<< pos
<< std::dec
1373 << ": op_dir_remove " << dirname
<< std::endl
;
1377 map
<string
,DirRef
>::iterator q
= dir_map
.find(dirname
);
1378 ceph_assert(q
!= dir_map
.end());
1379 ceph_assert(q
->second
->file_map
.empty());
1385 case bluefs_transaction_t::OP_FILE_UPDATE
:
1387 bluefs_fnode_t fnode
;
1389 dout(20) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
1390 << ": op_file_update " << " " << fnode
<< " " << dendl
;
1391 if (unlikely(to_stdout
)) {
1392 std::cout
<< " 0x" << std::hex
<< pos
<< std::dec
1393 << ": op_file_update " << " " << fnode
<< std::endl
;
1396 FileRef f
= _get_file(fnode
.ino
);
1397 if (cct
->_conf
->bluefs_log_replay_check_allocations
) {
1398 // check initial log layout
1399 if (first_log_check
) {
1400 first_log_check
= false;
1401 int r
= _check_new_allocations(log_file
->fnode
,
1402 MAX_BDEV
, owned_blocks
, used_blocks
);
1408 auto& fnode_extents
= f
->fnode
.extents
;
1409 for (auto e
: fnode_extents
) {
1411 if (int r
= _verify_alloc_granularity(id
, e
.offset
, e
.length
,
1412 "OP_FILE_UPDATE"); r
< 0) {
1415 apply_for_bitset_range(e
.offset
, e
.length
, alloc_size
[id
],
1417 [&](uint64_t pos
, boost::dynamic_bitset
<uint64_t> &bs
) {
1418 ceph_assert(bs
.test(pos
));
1425 if (fnode
.ino
!= 1) {
1426 vselector
->sub_usage(f
->vselector_hint
, f
->fnode
);
1429 if (fnode
.ino
!= 1) {
1430 vselector
->add_usage(f
->vselector_hint
, f
->fnode
);
1433 if (fnode
.ino
> ino_last
) {
1434 ino_last
= fnode
.ino
;
1436 if (cct
->_conf
->bluefs_log_replay_check_allocations
) {
1437 int r
= _check_new_allocations(f
->fnode
,
1438 MAX_BDEV
, owned_blocks
, used_blocks
);
1447 case bluefs_transaction_t::OP_FILE_REMOVE
:
1451 dout(20) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
1452 << ": op_file_remove " << ino
<< dendl
;
1453 if (unlikely(to_stdout
)) {
1454 std::cout
<< " 0x" << std::hex
<< pos
<< std::dec
1455 << ": op_file_remove " << ino
<< std::endl
;
1459 auto p
= file_map
.find(ino
);
1460 ceph_assert(p
!= file_map
.end());
1461 vselector
->sub_usage(p
->second
->vselector_hint
, p
->second
->fnode
);
1462 if (cct
->_conf
->bluefs_log_replay_check_allocations
) {
1463 auto& fnode_extents
= p
->second
->fnode
.extents
;
1464 for (auto e
: fnode_extents
) {
1467 apply_for_bitset_range(e
.offset
, e
.length
, alloc_size
[id
], owned_blocks
[id
],
1468 [&](uint64_t pos
, boost::dynamic_bitset
<uint64_t> &bs
) {
1469 if (!bs
.test(pos
)) {
1475 derr
<< __func__
<< " invalid extent " << int(id
)
1476 << ": 0x" << std::hex
<< e
.offset
<< "~" << e
.length
1478 << ": wasn't given but is allocated for removed ino " << ino
1483 apply_for_bitset_range(e
.offset
, e
.length
, alloc_size
[id
], used_blocks
[id
],
1484 [&](uint64_t pos
, boost::dynamic_bitset
<uint64_t> &bs
) {
1485 if (!bs
.test(pos
)) {
1492 derr
<< __func__
<< " invalid extent " << int(id
)
1493 << ": 0x" << std::hex
<< e
.offset
<< "~" << e
.length
1495 << ": not in use but is allocated for removed ino " << ino
1507 derr
<< __func__
<< " 0x" << std::hex
<< pos
<< std::dec
1508 << ": stop: unrecognized op " << (int)op
<< dendl
;
1513 ceph_assert(p
.end());
1515 // we successfully replayed the transaction; bump the seq and log size
1517 log_file
->fnode
.size
= log_reader
->buf
.pos
;
1519 vselector
->add_usage(log_file
->vselector_hint
, log_file
->fnode
);
1521 if (!noop
&& first_log_check
&&
1522 cct
->_conf
->bluefs_log_replay_check_allocations
) {
1523 int r
= _check_new_allocations(log_file
->fnode
,
1524 MAX_BDEV
, owned_blocks
, used_blocks
);
1530 dout(10) << __func__
<< " log file size was 0x"
1531 << std::hex
<< log_file
->fnode
.size
<< std::dec
<< dendl
;
1532 if (unlikely(to_stdout
)) {
1533 std::cout
<< " log file size was 0x"
1534 << std::hex
<< log_file
->fnode
.size
<< std::dec
<< std::endl
;
1540 // verify file link counts are all >0
1541 for (auto& p
: file_map
) {
1542 if (p
.second
->refs
== 0 &&
1543 p
.second
->fnode
.ino
> 1) {
1544 derr
<< __func__
<< " file with link count 0: " << p
.second
->fnode
1551 for (unsigned id
= 0; id
< MAX_BDEV
; ++id
) {
1552 dout(10) << __func__
<< " block_unused_too_granular " << id
<< ": "
1553 << block_unused_too_granular
[id
] << dendl
;
1555 dout(10) << __func__
<< " done" << dendl
;
1559 int BlueFS::log_dump()
1561 // only dump log file's content
1562 int r
= _replay(true, true);
1564 derr
<< __func__
<< " failed to replay log: " << cpp_strerror(r
) << dendl
;
1571 int BlueFS::device_migrate_to_existing(
1573 const set
<int>& devs_source
,
1575 const bluefs_layout_t
& layout
)
1578 bool buffered
= cct
->_conf
->bluefs_buffered_io
;
1580 dout(10) << __func__
<< " devs_source " << devs_source
1581 << " dev_target " << dev_target
<< dendl
;
1582 assert(dev_target
< (int)MAX_BDEV
);
1585 flags
|= devs_source
.count(BDEV_DB
) ?
1586 (REMOVE_DB
| RENAME_SLOW2DB
) : 0;
1587 flags
|= devs_source
.count(BDEV_WAL
) ? REMOVE_WAL
: 0;
1588 int dev_target_new
= dev_target
;
1590 // Slow device without separate DB one is addressed via BDEV_DB
1591 // Hence need renaming.
1592 if ((flags
& REMOVE_DB
) && dev_target
== BDEV_SLOW
) {
1593 dev_target_new
= BDEV_DB
;
1594 dout(0) << __func__
<< " super to be written to " << dev_target
<< dendl
;
1597 for (auto& [ino
, file_ref
] : file_map
) {
1599 if (file_ref
->fnode
.ino
== 1) {
1602 dout(10) << __func__
<< " " << ino
<< " " << file_ref
->fnode
<< dendl
;
1604 auto& fnode_extents
= file_ref
->fnode
.extents
;
1606 bool rewrite
= std::any_of(
1607 fnode_extents
.begin(),
1608 fnode_extents
.end(),
1610 return ext
.bdev
!= dev_target
&& devs_source
.count(ext
.bdev
);
1613 dout(10) << __func__
<< " migrating" << dendl
;
1617 for (auto old_ext
: fnode_extents
) {
1618 buf
.resize(old_ext
.length
);
1619 int r
= bdev
[old_ext
.bdev
]->read_random(
1625 derr
<< __func__
<< " failed to read 0x" << std::hex
1626 << old_ext
.offset
<< "~" << old_ext
.length
<< std::dec
1627 << " from " << (int)dev_target
<< dendl
;
1630 bl
.append((char*)&buf
[0], old_ext
.length
);
1633 // write entire file
1634 PExtentVector extents
;
1635 auto l
= _allocate_without_fallback(dev_target
, bl
.length(), &extents
);
1637 derr
<< __func__
<< " unable to allocate len 0x" << std::hex
1638 << bl
.length() << std::dec
<< " from " << (int)dev_target
1639 << ": " << cpp_strerror(l
) << dendl
;
1644 for (auto& i
: extents
) {
1646 uint64_t cur_len
= std::min
<uint64_t>(i
.length
, bl
.length() - off
);
1647 ceph_assert(cur_len
> 0);
1648 cur
.substr_of(bl
, off
, cur_len
);
1649 int r
= bdev
[dev_target
]->write(i
.offset
, cur
, buffered
);
1650 ceph_assert(r
== 0);
1654 // release old extents
1655 for (auto old_ext
: fnode_extents
) {
1656 PExtentVector to_release
;
1657 to_release
.emplace_back(old_ext
.offset
, old_ext
.length
);
1658 alloc
[old_ext
.bdev
]->release(to_release
);
1662 fnode_extents
.clear();
1663 for (auto& i
: extents
) {
1664 fnode_extents
.emplace_back(dev_target_new
, i
.offset
, i
.length
);
1667 for (auto& ext
: fnode_extents
) {
1668 if (dev_target
!= dev_target_new
&& ext
.bdev
== dev_target
) {
1669 dout(20) << __func__
<< " " << " ... adjusting extent 0x"
1670 << std::hex
<< ext
.offset
<< std::dec
1671 << " bdev " << dev_target
<< " -> " << dev_target_new
1673 ext
.bdev
= dev_target_new
;
1678 // new logging device in the current naming scheme
1679 int new_log_dev_cur
= bdev
[BDEV_WAL
] ?
1681 bdev
[BDEV_DB
] ? BDEV_DB
: BDEV_SLOW
;
1683 // new logging device in new naming scheme
1684 int new_log_dev_next
= new_log_dev_cur
;
1686 if (devs_source
.count(new_log_dev_cur
)) {
1687 // SLOW device is addressed via BDEV_DB too hence either WAL or DB
1688 new_log_dev_next
= (flags
& REMOVE_WAL
) || !bdev
[BDEV_WAL
] ?
1692 dout(0) << __func__
<< " log moved from " << new_log_dev_cur
1693 << " to " << new_log_dev_next
<< dendl
;
1696 (flags
& REMOVE_DB
) && new_log_dev_next
== BDEV_DB
?
1701 _rewrite_log_and_layout_sync(
1703 (flags
& REMOVE_DB
) ? BDEV_SLOW
: BDEV_DB
,
1711 int BlueFS::device_migrate_to_new(
1713 const set
<int>& devs_source
,
1715 const bluefs_layout_t
& layout
)
1718 bool buffered
= cct
->_conf
->bluefs_buffered_io
;
1720 dout(10) << __func__
<< " devs_source " << devs_source
1721 << " dev_target " << dev_target
<< dendl
;
1722 assert(dev_target
== (int)BDEV_NEWDB
|| (int)BDEV_NEWWAL
);
1726 flags
|= devs_source
.count(BDEV_DB
) ?
1727 (!bdev
[BDEV_SLOW
] ? RENAME_DB2SLOW
: REMOVE_DB
) :
1729 flags
|= devs_source
.count(BDEV_WAL
) ? REMOVE_WAL
: 0;
1730 int dev_target_new
= dev_target
; //FIXME: remove, makes no sense
1732 for (auto& p
: file_map
) {
1734 if (p
.second
->fnode
.ino
== 1) {
1737 dout(10) << __func__
<< " " << p
.first
<< " " << p
.second
->fnode
<< dendl
;
1739 auto& fnode_extents
= p
.second
->fnode
.extents
;
1741 bool rewrite
= false;
1742 for (auto ext_it
= fnode_extents
.begin();
1743 ext_it
!= p
.second
->fnode
.extents
.end();
1745 if (ext_it
->bdev
!= dev_target
&& devs_source
.count(ext_it
->bdev
)) {
1751 dout(10) << __func__
<< " migrating" << dendl
;
1755 for (auto old_ext
: fnode_extents
) {
1756 buf
.resize(old_ext
.length
);
1757 int r
= bdev
[old_ext
.bdev
]->read_random(
1763 derr
<< __func__
<< " failed to read 0x" << std::hex
1764 << old_ext
.offset
<< "~" << old_ext
.length
<< std::dec
1765 << " from " << (int)dev_target
<< dendl
;
1768 bl
.append((char*)&buf
[0], old_ext
.length
);
1771 // write entire file
1772 PExtentVector extents
;
1773 auto l
= _allocate_without_fallback(dev_target
, bl
.length(), &extents
);
1775 derr
<< __func__
<< " unable to allocate len 0x" << std::hex
1776 << bl
.length() << std::dec
<< " from " << (int)dev_target
1777 << ": " << cpp_strerror(l
) << dendl
;
1782 for (auto& i
: extents
) {
1784 uint64_t cur_len
= std::min
<uint64_t>(i
.length
, bl
.length() - off
);
1785 ceph_assert(cur_len
> 0);
1786 cur
.substr_of(bl
, off
, cur_len
);
1787 int r
= bdev
[dev_target
]->write(i
.offset
, cur
, buffered
);
1788 ceph_assert(r
== 0);
1792 // release old extents
1793 for (auto old_ext
: fnode_extents
) {
1794 PExtentVector to_release
;
1795 to_release
.emplace_back(old_ext
.offset
, old_ext
.length
);
1796 alloc
[old_ext
.bdev
]->release(to_release
);
1800 fnode_extents
.clear();
1801 for (auto& i
: extents
) {
1802 fnode_extents
.emplace_back(dev_target_new
, i
.offset
, i
.length
);
1806 // new logging device in the current naming scheme
1807 int new_log_dev_cur
=
1810 bdev
[BDEV_WAL
] && !(flags
& REMOVE_WAL
) ?
1814 bdev
[BDEV_DB
] && !(flags
& REMOVE_DB
)?
1818 // new logging device in new naming scheme
1819 int new_log_dev_next
=
1820 new_log_dev_cur
== BDEV_NEWWAL
?
1822 new_log_dev_cur
== BDEV_NEWDB
?
1827 dev_target
== BDEV_NEWDB
?
1833 _rewrite_log_and_layout_sync(
1843 BlueFS::FileRef
BlueFS::_get_file(uint64_t ino
)
1845 auto p
= file_map
.find(ino
);
1846 if (p
== file_map
.end()) {
1847 FileRef f
= ceph::make_ref
<File
>();
1849 dout(30) << __func__
<< " ino " << ino
<< " = " << f
1850 << " (new)" << dendl
;
1853 dout(30) << __func__
<< " ino " << ino
<< " = " << p
->second
<< dendl
;
1858 void BlueFS::_drop_link(FileRef file
)
1860 dout(20) << __func__
<< " had refs " << file
->refs
1861 << " on " << file
->fnode
<< dendl
;
1862 ceph_assert(file
->refs
> 0);
1864 if (file
->refs
== 0) {
1865 dout(20) << __func__
<< " destroying " << file
->fnode
<< dendl
;
1866 ceph_assert(file
->num_reading
.load() == 0);
1867 vselector
->sub_usage(file
->vselector_hint
, file
->fnode
);
1868 log_t
.op_file_remove(file
->fnode
.ino
);
1869 for (auto& r
: file
->fnode
.extents
) {
1870 pending_release
[r
.bdev
].insert(r
.offset
, r
.length
);
1872 file_map
.erase(file
->fnode
.ino
);
1873 file
->deleted
= true;
1875 if (file
->dirty_seq
) {
1876 ceph_assert(file
->dirty_seq
> log_seq_stable
);
1877 ceph_assert(dirty_files
.count(file
->dirty_seq
));
1878 auto it
= dirty_files
[file
->dirty_seq
].iterator_to(*file
);
1879 dirty_files
[file
->dirty_seq
].erase(it
);
1880 file
->dirty_seq
= 0;
1885 int64_t BlueFS::_read_random(
1886 FileReader
*h
, ///< [in] read from here
1887 uint64_t off
, ///< [in] offset
1888 uint64_t len
, ///< [in] this many bytes
1889 char *out
) ///< [out] optional: or copy it here
1891 auto* buf
= &h
->buf
;
1894 dout(10) << __func__
<< " h " << h
1895 << " 0x" << std::hex
<< off
<< "~" << len
<< std::dec
1896 << " from " << h
->file
->fnode
<< dendl
;
1898 ++h
->file
->num_reading
;
1900 if (!h
->ignore_eof
&&
1901 off
+ len
> h
->file
->fnode
.size
) {
1902 if (off
> h
->file
->fnode
.size
)
1905 len
= h
->file
->fnode
.size
- off
;
1906 dout(20) << __func__
<< " reaching (or past) eof, len clipped to 0x"
1907 << std::hex
<< len
<< std::dec
<< dendl
;
1909 logger
->inc(l_bluefs_read_random_count
, 1);
1910 logger
->inc(l_bluefs_read_random_bytes
, len
);
1912 std::shared_lock
s_lock(h
->lock
);
1913 buf
->bl
.reassign_to_mempool(mempool::mempool_bluefs_file_reader
);
1915 if (off
< buf
->bl_off
|| off
>= buf
->get_buf_end()) {
1918 auto p
= h
->file
->fnode
.seek(off
, &x_off
);
1919 ceph_assert(p
!= h
->file
->fnode
.extents
.end());
1920 uint64_t l
= std::min(p
->length
- x_off
, len
);
1922 l
= std::min(l
, uint64_t(1) << 30);
1923 dout(20) << __func__
<< " read random 0x"
1924 << std::hex
<< x_off
<< "~" << l
<< std::dec
1925 << " of " << *p
<< dendl
;
1926 int r
= bdev
[p
->bdev
]->read_random(p
->offset
+ x_off
, l
, out
,
1927 cct
->_conf
->bluefs_buffered_io
);
1928 ceph_assert(r
== 0);
1934 logger
->inc(l_bluefs_read_random_disk_count
, 1);
1935 logger
->inc(l_bluefs_read_random_disk_bytes
, l
);
1940 auto left
= buf
->get_buf_remaining(off
);
1941 int64_t r
= std::min(len
, left
);
1942 logger
->inc(l_bluefs_read_random_buffer_count
, 1);
1943 logger
->inc(l_bluefs_read_random_buffer_bytes
, r
);
1944 dout(20) << __func__
<< " left 0x" << std::hex
<< left
1945 << " 0x" << off
<< "~" << len
<< std::dec
1949 // NOTE: h->bl is normally a contiguous buffer so c_str() is free.
1950 memcpy(out
, buf
->bl
.c_str() + off
- buf
->bl_off
, r
);
1954 dout(30) << __func__
<< " result chunk (0x"
1955 << std::hex
<< r
<< std::dec
<< " bytes):\n";
1957 t
.substr_of(buf
->bl
, off
- buf
->bl_off
, r
);
1967 dout(20) << __func__
<< " got " << ret
<< dendl
;
1968 --h
->file
->num_reading
;
1972 int64_t BlueFS::_read(
1973 FileReader
*h
, ///< [in] read from here
1974 FileReaderBuffer
*buf
, ///< [in] reader state
1975 uint64_t off
, ///< [in] offset
1976 size_t len
, ///< [in] this many bytes
1977 bufferlist
*outbl
, ///< [out] optional: reference the result here
1978 char *out
) ///< [out] optional: or copy it here
1980 bool prefetch
= !outbl
&& !out
;
1981 dout(10) << __func__
<< " h " << h
1982 << " 0x" << std::hex
<< off
<< "~" << len
<< std::dec
1983 << " from " << h
->file
->fnode
1984 << (prefetch
? " prefetch" : "")
1987 ++h
->file
->num_reading
;
1989 if (!h
->ignore_eof
&&
1990 off
+ len
> h
->file
->fnode
.size
) {
1991 if (off
> h
->file
->fnode
.size
)
1994 len
= h
->file
->fnode
.size
- off
;
1995 dout(20) << __func__
<< " reaching (or past) eof, len clipped to 0x"
1996 << std::hex
<< len
<< std::dec
<< dendl
;
1998 logger
->inc(l_bluefs_read_count
, 1);
1999 logger
->inc(l_bluefs_read_bytes
, len
);
2001 logger
->inc(l_bluefs_read_prefetch_count
, 1);
2002 logger
->inc(l_bluefs_read_prefetch_bytes
, len
);
2009 std::shared_lock
s_lock(h
->lock
);
2012 if (off
< buf
->bl_off
|| off
>= buf
->get_buf_end()) {
2014 std::unique_lock
u_lock(h
->lock
);
2015 buf
->bl
.reassign_to_mempool(mempool::mempool_bluefs_file_reader
);
2016 if (off
< buf
->bl_off
|| off
>= buf
->get_buf_end()) {
2017 // if precondition hasn't changed during locking upgrade.
2019 buf
->bl_off
= off
& super
.block_mask();
2021 auto p
= h
->file
->fnode
.seek(buf
->bl_off
, &x_off
);
2022 if (p
== h
->file
->fnode
.extents
.end()) {
2023 dout(5) << __func__
<< " reading less then required "
2024 << ret
<< "<" << ret
+ len
<< dendl
;
2028 uint64_t want
= round_up_to(len
+ (off
& ~super
.block_mask()),
2030 want
= std::max(want
, buf
->max_prefetch
);
2031 uint64_t l
= std::min(p
->length
- x_off
, want
);
2033 l
= std::min(l
, uint64_t(1) << 30);
2034 uint64_t eof_offset
= round_up_to(h
->file
->fnode
.size
, super
.block_size
);
2035 if (!h
->ignore_eof
&&
2036 buf
->bl_off
+ l
> eof_offset
) {
2037 l
= eof_offset
- buf
->bl_off
;
2039 dout(20) << __func__
<< " fetching 0x"
2040 << std::hex
<< x_off
<< "~" << l
<< std::dec
2041 << " of " << *p
<< dendl
;
2042 int r
= bdev
[p
->bdev
]->read(p
->offset
+ x_off
, l
, &buf
->bl
, ioc
[p
->bdev
],
2043 cct
->_conf
->bluefs_buffered_io
);
2044 ceph_assert(r
== 0);
2048 // we should recheck if buffer is valid after lock downgrade
2051 left
= buf
->get_buf_remaining(off
);
2052 dout(20) << __func__
<< " left 0x" << std::hex
<< left
2053 << " len 0x" << len
<< std::dec
<< dendl
;
2055 int64_t r
= std::min(len
, left
);
2058 t
.substr_of(buf
->bl
, off
- buf
->bl_off
, r
);
2059 outbl
->claim_append(t
);
2062 // NOTE: h->bl is normally a contiguous buffer so c_str() is free.
2063 memcpy(out
, buf
->bl
.c_str() + off
- buf
->bl_off
, r
);
2067 dout(30) << __func__
<< " result chunk (0x"
2068 << std::hex
<< r
<< std::dec
<< " bytes):\n";
2070 t
.substr_of(buf
->bl
, off
- buf
->bl_off
, r
);
2079 dout(20) << __func__
<< " got " << ret
<< dendl
;
2080 ceph_assert(!outbl
|| (int)outbl
->length() == ret
);
2081 --h
->file
->num_reading
;
2085 void BlueFS::_invalidate_cache(FileRef f
, uint64_t offset
, uint64_t length
)
2087 dout(10) << __func__
<< " file " << f
->fnode
2088 << " 0x" << std::hex
<< offset
<< "~" << length
<< std::dec
2090 if (offset
& ~super
.block_mask()) {
2091 offset
&= super
.block_mask();
2092 length
= round_up_to(length
, super
.block_size
);
2095 auto p
= f
->fnode
.seek(offset
, &x_off
);
2096 while (length
> 0 && p
!= f
->fnode
.extents
.end()) {
2097 uint64_t x_len
= std::min(p
->length
- x_off
, length
);
2098 bdev
[p
->bdev
]->invalidate_cache(p
->offset
+ x_off
, x_len
);
2099 dout(20) << __func__
<< " 0x" << std::hex
<< x_off
<< "~" << x_len
2100 << std:: dec
<< " of " << *p
<< dendl
;
2106 uint64_t BlueFS::_estimate_log_size()
2108 int avg_dir_size
= 40; // fixme
2109 int avg_file_size
= 12;
2110 uint64_t size
= 4096 * 2;
2111 size
+= file_map
.size() * (1 + sizeof(bluefs_fnode_t
));
2112 for (auto& p
: block_all
)
2113 size
+= p
.num_intervals() * (1 + 1 + sizeof(uint64_t) * 2);
2114 size
+= dir_map
.size() + (1 + avg_dir_size
);
2115 size
+= file_map
.size() * (1 + avg_dir_size
+ avg_file_size
);
2116 return round_up_to(size
, super
.block_size
);
2119 void BlueFS::compact_log()
2121 std::unique_lock
<ceph::mutex
> l(lock
);
2122 if (!cct
->_conf
->bluefs_replay_recovery_disable_compact
) {
2123 if (cct
->_conf
->bluefs_compact_log_sync
) {
2124 _compact_log_sync();
2126 _compact_log_async(l
);
2131 bool BlueFS::_should_compact_log()
2133 uint64_t current
= log_writer
->file
->fnode
.size
;
2134 uint64_t expected
= _estimate_log_size();
2135 float ratio
= (float)current
/ (float)expected
;
2136 dout(10) << __func__
<< " current 0x" << std::hex
<< current
2137 << " expected " << expected
<< std::dec
2138 << " ratio " << ratio
2139 << (new_log
? " (async compaction in progress)" : "")
2142 current
< cct
->_conf
->bluefs_log_compact_min_size
||
2143 ratio
< cct
->_conf
->bluefs_log_compact_min_ratio
) {
2149 void BlueFS::_compact_log_dump_metadata(bluefs_transaction_t
*t
,
2153 t
->uuid
= super
.uuid
;
2154 dout(20) << __func__
<< " op_init" << dendl
;
2157 for (unsigned bdev
= 0; bdev
< MAX_BDEV
; ++bdev
) {
2158 interval_set
<uint64_t>& p
= block_all
[bdev
];
2159 for (interval_set
<uint64_t>::iterator q
= p
.begin(); q
!= p
.end(); ++q
) {
2160 auto bdev_new
= bdev
;
2161 if ((flags
& REMOVE_WAL
) && bdev
== BDEV_WAL
) {
2164 if ((flags
& REMOVE_DB
) && bdev
== BDEV_DB
) {
2167 if ((flags
& RENAME_SLOW2DB
) && bdev
== BDEV_SLOW
) {
2170 if ((flags
& RENAME_DB2SLOW
) && bdev
== BDEV_DB
) {
2171 bdev_new
= BDEV_SLOW
;
2173 if (bdev
== BDEV_NEWDB
) {
2174 // REMOVE_DB xor RENAME_DB
2175 ceph_assert(!(flags
& REMOVE_DB
) != !(flags
& RENAME_DB2SLOW
));
2176 ceph_assert(!(flags
& RENAME_SLOW2DB
));
2179 if (bdev
== BDEV_NEWWAL
) {
2180 ceph_assert(flags
& REMOVE_WAL
);
2181 bdev_new
= BDEV_WAL
;
2183 dout(20) << __func__
<< " op_alloc_add " << bdev_new
<< " 0x"
2184 << std::hex
<< q
.get_start() << "~" << q
.get_len() << std::dec
2186 t
->op_alloc_add(bdev_new
, q
.get_start(), q
.get_len());
2189 for (auto& [ino
, file_ref
] : file_map
) {
2192 ceph_assert(ino
> 1);
2194 for(auto& e
: file_ref
->fnode
.extents
) {
2196 auto bdev_new
= bdev
;
2197 ceph_assert(!((flags
& REMOVE_WAL
) && bdev
== BDEV_WAL
));
2198 if ((flags
& RENAME_SLOW2DB
) && bdev
== BDEV_SLOW
) {
2201 if ((flags
& RENAME_DB2SLOW
) && bdev
== BDEV_DB
) {
2202 bdev_new
= BDEV_SLOW
;
2204 if (bdev
== BDEV_NEWDB
) {
2205 // REMOVE_DB xor RENAME_DB
2206 ceph_assert(!(flags
& REMOVE_DB
) != !(flags
& RENAME_DB2SLOW
));
2207 ceph_assert(!(flags
& RENAME_SLOW2DB
));
2210 if (bdev
== BDEV_NEWWAL
) {
2211 ceph_assert(flags
& REMOVE_WAL
);
2212 bdev_new
= BDEV_WAL
;
2216 dout(20) << __func__
<< " op_file_update " << file_ref
->fnode
<< dendl
;
2217 t
->op_file_update(file_ref
->fnode
);
2219 for (auto& [path
, dir_ref
] : dir_map
) {
2220 dout(20) << __func__
<< " op_dir_create " << path
<< dendl
;
2221 t
->op_dir_create(path
);
2222 for (auto& [fname
, file_ref
] : dir_ref
->file_map
) {
2223 dout(20) << __func__
<< " op_dir_link " << path
<< "/" << fname
2224 << " to " << file_ref
->fnode
.ino
<< dendl
;
2225 t
->op_dir_link(path
, fname
, file_ref
->fnode
.ino
);
2230 void BlueFS::_compact_log_sync()
2232 dout(10) << __func__
<< dendl
;
2234 vselector
->select_prefer_bdev(log_writer
->file
->vselector_hint
);
2235 _rewrite_log_and_layout_sync(true,
2240 super
.memorized_layout
);
2241 logger
->inc(l_bluefs_log_compactions
);
2244 void BlueFS::_rewrite_log_and_layout_sync(bool allocate_with_fallback
,
2249 std::optional
<bluefs_layout_t
> layout
)
2251 File
*log_file
= log_writer
->file
.get();
2253 // clear out log (be careful who calls us!!!)
2256 dout(20) << __func__
<< " super_dev:" << super_dev
2257 << " log_dev:" << log_dev
2258 << " log_dev_new:" << log_dev_new
2259 << " flags:" << flags
2261 bluefs_transaction_t t
;
2262 _compact_log_dump_metadata(&t
, flags
);
2264 dout(20) << __func__
<< " op_jump_seq " << log_seq
<< dendl
;
2265 t
.op_jump_seq(log_seq
);
2271 uint64_t need
= bl
.length() + cct
->_conf
->bluefs_max_log_runway
;
2272 dout(20) << __func__
<< " need " << need
<< dendl
;
2274 bluefs_fnode_t old_fnode
;
2276 log_file
->fnode
.swap_extents(old_fnode
);
2277 if (allocate_with_fallback
) {
2278 r
= _allocate(log_dev
, need
, &log_file
->fnode
);
2279 ceph_assert(r
== 0);
2281 PExtentVector extents
;
2282 r
= _allocate_without_fallback(log_dev
,
2285 ceph_assert(r
== 0);
2286 for (auto& p
: extents
) {
2287 log_file
->fnode
.append_extent(
2288 bluefs_extent_t(log_dev
, p
.offset
, p
.length
));
2292 _close_writer(log_writer
);
2294 log_file
->fnode
.size
= bl
.length();
2295 vselector
->sub_usage(log_file
->vselector_hint
, old_fnode
);
2296 vselector
->add_usage(log_file
->vselector_hint
, log_file
->fnode
);
2298 log_writer
= _create_writer(log_file
);
2299 log_writer
->append(bl
);
2300 r
= _flush(log_writer
, true);
2301 ceph_assert(r
== 0);
2303 if (!cct
->_conf
->bluefs_sync_write
) {
2304 list
<aio_t
> completed_ios
;
2305 _claim_completed_aios(log_writer
, &completed_ios
);
2306 wait_for_aio(log_writer
);
2307 completed_ios
.clear();
2312 super
.memorized_layout
= layout
;
2313 super
.log_fnode
= log_file
->fnode
;
2314 // rename device if needed
2315 if (log_dev
!= log_dev_new
) {
2316 dout(10) << __func__
<< " renaming log extents to " << log_dev_new
<< dendl
;
2317 for (auto& p
: super
.log_fnode
.extents
) {
2318 p
.bdev
= log_dev_new
;
2321 dout(10) << __func__
<< " writing super, log fnode: " << super
.log_fnode
<< dendl
;
2324 _write_super(super_dev
);
2327 dout(10) << __func__
<< " release old log extents " << old_fnode
.extents
<< dendl
;
2328 for (auto& r
: old_fnode
.extents
) {
2329 pending_release
[r
.bdev
].insert(r
.offset
, r
.length
);
2334 * 1. Allocate a new extent to continue the log, and then log an event
2335 * that jumps the log write position to the new extent. At this point, the
2336 * old extent(s) won't be written to, and reflect everything to compact.
2337 * New events will be written to the new region that we'll keep.
2339 * 2. While still holding the lock, encode a bufferlist that dumps all of the
2340 * in-memory fnodes and names. This will become the new beginning of the
2341 * log. The last event will jump to the log continuation extent from #1.
2343 * 3. Queue a write to a new extent for the new beginnging of the log.
2345 * 4. Drop lock and wait
2347 * 5. Retake the lock.
2349 * 6. Update the log_fnode to splice in the new beginning.
2351 * 7. Write the new superblock.
2353 * 8. Release the old log space. Clean up.
2355 void BlueFS::_compact_log_async(std::unique_lock
<ceph::mutex
>& l
)
2357 dout(10) << __func__
<< dendl
;
2358 File
*log_file
= log_writer
->file
.get();
2359 ceph_assert(!new_log
);
2360 ceph_assert(!new_log_writer
);
2362 // create a new log [writer] so that we know compaction is in progress
2363 // (see _should_compact_log)
2364 new_log
= ceph::make_ref
<File
>();
2365 new_log
->fnode
.ino
= 0; // so that _flush_range won't try to log the fnode
2367 // 0. wait for any racing flushes to complete. (We do not want to block
2368 // in _flush_sync_log with jump_to set or else a racing thread might flush
2369 // our entries and our jump_to update won't be correct.)
2370 while (log_flushing
) {
2371 dout(10) << __func__
<< " log is currently flushing, waiting" << dendl
;
2375 vselector
->sub_usage(log_file
->vselector_hint
, log_file
->fnode
);
2377 // 1. allocate new log space and jump to it.
2378 old_log_jump_to
= log_file
->fnode
.get_allocated();
2379 dout(10) << __func__
<< " old_log_jump_to 0x" << std::hex
<< old_log_jump_to
2380 << " need 0x" << (old_log_jump_to
+ cct
->_conf
->bluefs_max_log_runway
) << std::dec
<< dendl
;
2381 int r
= _allocate(vselector
->select_prefer_bdev(log_file
->vselector_hint
),
2382 cct
->_conf
->bluefs_max_log_runway
,
2384 ceph_assert(r
== 0);
2385 //adjust usage as flush below will need it
2386 vselector
->add_usage(log_file
->vselector_hint
, log_file
->fnode
);
2387 dout(10) << __func__
<< " log extents " << log_file
->fnode
.extents
<< dendl
;
2389 // update the log file change and log a jump to the offset where we want to
2390 // write the new entries
2391 log_t
.op_file_update(log_file
->fnode
);
2392 log_t
.op_jump(log_seq
, old_log_jump_to
);
2394 flush_bdev(); // FIXME?
2396 _flush_and_sync_log(l
, 0, old_log_jump_to
);
2398 // 2. prepare compacted log
2399 bluefs_transaction_t t
;
2400 //avoid record two times in log_t and _compact_log_dump_metadata.
2402 _compact_log_dump_metadata(&t
, 0);
2404 uint64_t max_alloc_size
= std::max(alloc_size
[BDEV_WAL
],
2405 std::max(alloc_size
[BDEV_DB
],
2406 alloc_size
[BDEV_SLOW
]));
2408 // conservative estimate for final encoded size
2409 new_log_jump_to
= round_up_to(t
.op_bl
.length() + super
.block_size
* 2,
2411 t
.op_jump(log_seq
, new_log_jump_to
);
2414 //FIXME: check if we want DB here?
2415 r
= _allocate(BlueFS::BDEV_DB
, new_log_jump_to
,
2417 ceph_assert(r
== 0);
2419 // we might have some more ops in log_t due to _allocate call
2426 dout(10) << __func__
<< " new_log_jump_to 0x" << std::hex
<< new_log_jump_to
2427 << std::dec
<< dendl
;
2429 new_log_writer
= _create_writer(new_log
);
2430 new_log_writer
->append(bl
);
2433 r
= _flush(new_log_writer
, true);
2434 ceph_assert(r
== 0);
2437 _flush_bdev_safely(new_log_writer
);
2439 // 5. update our log fnode
2440 // discard first old_log_jump_to extents
2442 dout(10) << __func__
<< " remove 0x" << std::hex
<< old_log_jump_to
<< std::dec
2443 << " of " << log_file
->fnode
.extents
<< dendl
;
2444 uint64_t discarded
= 0;
2445 mempool::bluefs::vector
<bluefs_extent_t
> old_extents
;
2446 while (discarded
< old_log_jump_to
) {
2447 ceph_assert(!log_file
->fnode
.extents
.empty());
2448 bluefs_extent_t
& e
= log_file
->fnode
.extents
.front();
2449 bluefs_extent_t temp
= e
;
2450 if (discarded
+ e
.length
<= old_log_jump_to
) {
2451 dout(10) << __func__
<< " remove old log extent " << e
<< dendl
;
2452 discarded
+= e
.length
;
2453 log_file
->fnode
.pop_front_extent();
2455 dout(10) << __func__
<< " remove front of old log extent " << e
<< dendl
;
2456 uint64_t drop
= old_log_jump_to
- discarded
;
2461 dout(10) << __func__
<< " kept " << e
<< " removed " << temp
<< dendl
;
2463 old_extents
.push_back(temp
);
2465 auto from
= log_file
->fnode
.extents
.begin();
2466 auto to
= log_file
->fnode
.extents
.end();
2467 while (from
!= to
) {
2468 new_log
->fnode
.append_extent(*from
);
2472 vselector
->sub_usage(log_file
->vselector_hint
, log_file
->fnode
);
2474 // clear the extents from old log file, they are added to new log
2475 log_file
->fnode
.clear_extents();
2476 // swap the log files. New log file is the log file now.
2477 new_log
->fnode
.swap_extents(log_file
->fnode
);
2479 log_writer
->pos
= log_writer
->file
->fnode
.size
=
2480 log_writer
->pos
- old_log_jump_to
+ new_log_jump_to
;
2482 vselector
->add_usage(log_file
->vselector_hint
, log_file
->fnode
);
2484 // 6. write the super block to reflect the changes
2485 dout(10) << __func__
<< " writing super" << dendl
;
2486 super
.log_fnode
= log_file
->fnode
;
2488 _write_super(BDEV_DB
);
2494 // 7. release old space
2495 dout(10) << __func__
<< " release old log extents " << old_extents
<< dendl
;
2496 for (auto& r
: old_extents
) {
2497 pending_release
[r
.bdev
].insert(r
.offset
, r
.length
);
2500 // delete the new log, remove from the dirty files list
2501 _close_writer(new_log_writer
);
2502 if (new_log
->dirty_seq
) {
2503 ceph_assert(dirty_files
.count(new_log
->dirty_seq
));
2504 auto it
= dirty_files
[new_log
->dirty_seq
].iterator_to(*new_log
);
2505 dirty_files
[new_log
->dirty_seq
].erase(it
);
2507 new_log_writer
= nullptr;
2509 log_cond
.notify_all();
2511 dout(10) << __func__
<< " log extents " << log_file
->fnode
.extents
<< dendl
;
2512 logger
->inc(l_bluefs_log_compactions
);
2515 void BlueFS::_pad_bl(bufferlist
& bl
)
2517 uint64_t partial
= bl
.length() % super
.block_size
;
2519 dout(10) << __func__
<< " padding with 0x" << std::hex
2520 << super
.block_size
- partial
<< " zeros" << std::dec
<< dendl
;
2521 bl
.append_zero(super
.block_size
- partial
);
2526 int BlueFS::_flush_and_sync_log(std::unique_lock
<ceph::mutex
>& l
,
2530 while (log_flushing
) {
2531 dout(10) << __func__
<< " want_seq " << want_seq
2532 << " log is currently flushing, waiting" << dendl
;
2533 ceph_assert(!jump_to
);
2536 if (want_seq
&& want_seq
<= log_seq_stable
) {
2537 dout(10) << __func__
<< " want_seq " << want_seq
<< " <= log_seq_stable "
2538 << log_seq_stable
<< ", done" << dendl
;
2539 ceph_assert(!jump_to
);
2542 if (log_t
.empty() && dirty_files
.empty()) {
2543 dout(10) << __func__
<< " want_seq " << want_seq
2544 << " " << log_t
<< " not dirty, dirty_files empty, no-op" << dendl
;
2545 ceph_assert(!jump_to
);
2549 vector
<interval_set
<uint64_t>> to_release(pending_release
.size());
2550 to_release
.swap(pending_release
);
2552 uint64_t seq
= log_t
.seq
= ++log_seq
;
2553 ceph_assert(want_seq
== 0 || want_seq
<= seq
);
2554 log_t
.uuid
= super
.uuid
;
2557 auto lsi
= dirty_files
.find(seq
);
2558 if (lsi
!= dirty_files
.end()) {
2559 dout(20) << __func__
<< " " << lsi
->second
.size() << " dirty_files" << dendl
;
2560 for (auto &f
: lsi
->second
) {
2561 dout(20) << __func__
<< " op_file_update " << f
.fnode
<< dendl
;
2562 log_t
.op_file_update(f
.fnode
);
2566 dout(10) << __func__
<< " " << log_t
<< dendl
;
2567 ceph_assert(!log_t
.empty());
2569 // allocate some more space (before we run out)?
2570 int64_t runway
= log_writer
->file
->fnode
.get_allocated() -
2571 log_writer
->get_effective_write_pos();
2572 bool just_expanded_log
= false;
2573 if (runway
< (int64_t)cct
->_conf
->bluefs_min_log_runway
) {
2574 dout(10) << __func__
<< " allocating more log runway (0x"
2575 << std::hex
<< runway
<< std::dec
<< " remaining)" << dendl
;
2576 while (new_log_writer
) {
2577 dout(10) << __func__
<< " waiting for async compaction" << dendl
;
2580 vselector
->sub_usage(log_writer
->file
->vselector_hint
, log_writer
->file
->fnode
);
2582 vselector
->select_prefer_bdev(log_writer
->file
->vselector_hint
),
2583 cct
->_conf
->bluefs_max_log_runway
,
2584 &log_writer
->file
->fnode
);
2585 ceph_assert(r
== 0);
2586 vselector
->add_usage(log_writer
->file
->vselector_hint
, log_writer
->file
->fnode
);
2587 log_t
.op_file_update(log_writer
->file
->fnode
);
2588 just_expanded_log
= true;
2592 bl
.reserve(super
.block_size
);
2594 // pad to block boundary
2595 size_t realign
= super
.block_size
- (bl
.length() % super
.block_size
);
2596 if (realign
&& realign
!= super
.block_size
)
2597 bl
.append_zero(realign
);
2599 logger
->inc(l_bluefs_logged_bytes
, bl
.length());
2601 if (just_expanded_log
) {
2602 ceph_assert(bl
.length() <= runway
); // if we write this, we will have an unrecoverable data loss
2605 log_writer
->append(bl
);
2608 log_t
.seq
= 0; // just so debug output is less confusing
2609 log_flushing
= true;
2611 int r
= _flush(log_writer
, true);
2612 ceph_assert(r
== 0);
2615 dout(10) << __func__
<< " jumping log offset from 0x" << std::hex
2616 << log_writer
->pos
<< " -> 0x" << jump_to
<< std::dec
<< dendl
;
2617 log_writer
->pos
= jump_to
;
2618 vselector
->sub_usage(log_writer
->file
->vselector_hint
, log_writer
->file
->fnode
.size
);
2619 log_writer
->file
->fnode
.size
= jump_to
;
2620 vselector
->add_usage(log_writer
->file
->vselector_hint
, log_writer
->file
->fnode
.size
);
2623 _flush_bdev_safely(log_writer
);
2625 log_flushing
= false;
2626 log_cond
.notify_all();
2628 // clean dirty files
2629 if (seq
> log_seq_stable
) {
2630 log_seq_stable
= seq
;
2631 dout(20) << __func__
<< " log_seq_stable " << log_seq_stable
<< dendl
;
2633 auto p
= dirty_files
.begin();
2634 while (p
!= dirty_files
.end()) {
2635 if (p
->first
> log_seq_stable
) {
2636 dout(20) << __func__
<< " done cleaning up dirty files" << dendl
;
2640 auto l
= p
->second
.begin();
2641 while (l
!= p
->second
.end()) {
2643 ceph_assert(file
->dirty_seq
> 0);
2644 ceph_assert(file
->dirty_seq
<= log_seq_stable
);
2645 dout(20) << __func__
<< " cleaned file " << file
->fnode
<< dendl
;
2646 file
->dirty_seq
= 0;
2647 p
->second
.erase(l
++);
2650 ceph_assert(p
->second
.empty());
2651 dirty_files
.erase(p
++);
2654 dout(20) << __func__
<< " log_seq_stable " << log_seq_stable
2655 << " already >= out seq " << seq
2656 << ", we lost a race against another log flush, done" << dendl
;
2659 for (unsigned i
= 0; i
< to_release
.size(); ++i
) {
2660 if (!to_release
[i
].empty()) {
2661 /* OK, now we have the guarantee alloc[i] won't be null. */
2663 if (cct
->_conf
->bdev_enable_discard
&& cct
->_conf
->bdev_async_discard
) {
2664 r
= bdev
[i
]->queue_discard(to_release
[i
]);
2667 } else if (cct
->_conf
->bdev_enable_discard
) {
2668 for (auto p
= to_release
[i
].begin(); p
!= to_release
[i
].end(); ++p
) {
2669 bdev
[i
]->discard(p
.get_start(), p
.get_len());
2672 alloc
[i
]->release(to_release
[i
]);
2676 _update_logger_stats();
2681 int BlueFS::_flush_range(FileWriter
*h
, uint64_t offset
, uint64_t length
)
2683 dout(10) << __func__
<< " " << h
<< " pos 0x" << std::hex
<< h
->pos
2684 << " 0x" << offset
<< "~" << length
<< std::dec
2685 << " to " << h
->file
->fnode
<< dendl
;
2686 ceph_assert(!h
->file
->deleted
);
2687 ceph_assert(h
->file
->num_readers
.load() == 0);
2689 h
->buffer_appender
.flush();
2692 if (h
->file
->fnode
.ino
== 1)
2695 buffered
= cct
->_conf
->bluefs_buffered_io
;
2697 if (offset
+ length
<= h
->pos
)
2699 if (offset
< h
->pos
) {
2700 length
-= h
->pos
- offset
;
2702 dout(10) << " still need 0x"
2703 << std::hex
<< offset
<< "~" << length
<< std::dec
2706 ceph_assert(offset
<= h
->file
->fnode
.size
);
2708 uint64_t allocated
= h
->file
->fnode
.get_allocated();
2709 vselector
->sub_usage(h
->file
->vselector_hint
, h
->file
->fnode
);
2710 // do not bother to dirty the file if we are overwriting
2711 // previously allocated extents.
2712 bool must_dirty
= false;
2713 if (allocated
< offset
+ length
) {
2714 // we should never run out of log space here; see the min runway check
2715 // in _flush_and_sync_log.
2716 ceph_assert(h
->file
->fnode
.ino
!= 1);
2717 int r
= _allocate(vselector
->select_prefer_bdev(h
->file
->vselector_hint
),
2718 offset
+ length
- allocated
,
2721 derr
<< __func__
<< " allocated: 0x" << std::hex
<< allocated
2722 << " offset: 0x" << offset
<< " length: 0x" << length
<< std::dec
2724 vselector
->add_usage(h
->file
->vselector_hint
, h
->file
->fnode
); // undo
2725 ceph_abort_msg("bluefs enospc");
2730 if (h
->file
->fnode
.size
< offset
+ length
) {
2731 h
->file
->fnode
.size
= offset
+ length
;
2732 if (h
->file
->fnode
.ino
> 1) {
2733 // we do not need to dirty the log file (or it's compacting
2734 // replacement) when the file size changes because replay is
2735 // smart enough to discover it on its own.
2740 h
->file
->fnode
.mtime
= ceph_clock_now();
2741 ceph_assert(h
->file
->fnode
.ino
>= 1);
2742 if (h
->file
->dirty_seq
== 0) {
2743 h
->file
->dirty_seq
= log_seq
+ 1;
2744 dirty_files
[h
->file
->dirty_seq
].push_back(*h
->file
);
2745 dout(20) << __func__
<< " dirty_seq = " << log_seq
+ 1
2746 << " (was clean)" << dendl
;
2748 if (h
->file
->dirty_seq
!= log_seq
+ 1) {
2749 // need re-dirty, erase from list first
2750 ceph_assert(dirty_files
.count(h
->file
->dirty_seq
));
2751 auto it
= dirty_files
[h
->file
->dirty_seq
].iterator_to(*h
->file
);
2752 dirty_files
[h
->file
->dirty_seq
].erase(it
);
2753 h
->file
->dirty_seq
= log_seq
+ 1;
2754 dirty_files
[h
->file
->dirty_seq
].push_back(*h
->file
);
2755 dout(20) << __func__
<< " dirty_seq = " << log_seq
+ 1
2756 << " (was " << h
->file
->dirty_seq
<< ")" << dendl
;
2758 dout(20) << __func__
<< " dirty_seq = " << log_seq
+ 1
2759 << " (unchanged, do nothing) " << dendl
;
2763 dout(20) << __func__
<< " file now " << h
->file
->fnode
<< dendl
;
2766 auto p
= h
->file
->fnode
.seek(offset
, &x_off
);
2767 ceph_assert(p
!= h
->file
->fnode
.extents
.end());
2768 dout(20) << __func__
<< " in " << *p
<< " x_off 0x"
2769 << std::hex
<< x_off
<< std::dec
<< dendl
;
2771 unsigned partial
= x_off
& ~super
.block_mask();
2774 dout(20) << __func__
<< " using partial tail 0x"
2775 << std::hex
<< partial
<< std::dec
<< dendl
;
2776 ceph_assert(h
->tail_block
.length() == partial
);
2777 bl
.claim_append_piecewise(h
->tail_block
);
2781 dout(20) << __func__
<< " waiting for previous aio to complete" << dendl
;
2782 for (auto p
: h
->iocv
) {
2788 if (length
== partial
+ h
->buffer
.length()) {
2789 /* in case of inital allocation and need to zero, limited flush is unacceptable */
2790 bl
.claim_append_piecewise(h
->buffer
);
2793 h
->buffer
.splice(0, length
, &t
);
2794 bl
.claim_append_piecewise(t
);
2795 t
.substr_of(h
->buffer
, length
, h
->buffer
.length() - length
);
2797 dout(20) << " leaving 0x" << std::hex
<< h
->buffer
.length() << std::dec
2798 << " unflushed" << dendl
;
2800 ceph_assert(bl
.length() == length
);
2802 h
->pos
= offset
+ length
;
2804 unsigned tail
= bl
.length() & ~super
.block_mask();
2806 dout(20) << __func__
<< " caching tail of 0x"
2808 << " and padding block with 0x" << (super
.block_size
- tail
)
2809 << std::dec
<< dendl
;
2810 h
->tail_block
.substr_of(bl
, bl
.length() - tail
, tail
);
2811 bl
.append_zero(super
.block_size
- tail
);
2812 length
+= super
.block_size
- tail
;
2814 h
->tail_block
.clear();
2816 ceph_assert(bl
.length() == length
);
2818 switch (h
->writer_type
) {
2820 logger
->inc(l_bluefs_bytes_written_wal
, length
);
2823 logger
->inc(l_bluefs_bytes_written_sst
, length
);
2827 dout(30) << "dump:\n";
2832 uint64_t bytes_written_slow
= 0;
2833 while (length
> 0) {
2834 uint64_t x_len
= std::min(p
->length
- x_off
, length
);
2836 t
.substr_of(bl
, bloff
, x_len
);
2837 if (cct
->_conf
->bluefs_sync_write
) {
2838 bdev
[p
->bdev
]->write(p
->offset
+ x_off
, t
, buffered
, h
->write_hint
);
2840 bdev
[p
->bdev
]->aio_write(p
->offset
+ x_off
, t
, h
->iocv
[p
->bdev
], buffered
, h
->write_hint
);
2842 h
->dirty_devs
[p
->bdev
] = true;
2843 if (p
->bdev
== BDEV_SLOW
) {
2844 bytes_written_slow
+= t
.length();
2852 logger
->inc(l_bluefs_bytes_written_slow
, bytes_written_slow
);
2853 for (unsigned i
= 0; i
< MAX_BDEV
; ++i
) {
2855 if (h
->iocv
[i
] && h
->iocv
[i
]->has_pending_aios()) {
2856 bdev
[i
]->aio_submit(h
->iocv
[i
]);
2860 vselector
->add_usage(h
->file
->vselector_hint
, h
->file
->fnode
);
2861 dout(20) << __func__
<< " h " << h
<< " pos now 0x"
2862 << std::hex
<< h
->pos
<< std::dec
<< dendl
;
2867 // we need to retire old completed aios so they don't stick around in
2868 // memory indefinitely (along with their bufferlist refs).
2869 void BlueFS::_claim_completed_aios(FileWriter
*h
, list
<aio_t
> *ls
)
2871 for (auto p
: h
->iocv
) {
2873 ls
->splice(ls
->end(), p
->running_aios
);
2876 dout(10) << __func__
<< " got " << ls
->size() << " aios" << dendl
;
2879 void BlueFS::wait_for_aio(FileWriter
*h
)
2881 // NOTE: this is safe to call without a lock, as long as our reference is
2883 dout(10) << __func__
<< " " << h
<< dendl
;
2884 utime_t start
= ceph_clock_now();
2885 for (auto p
: h
->iocv
) {
2890 dout(10) << __func__
<< " " << h
<< " done in " << (ceph_clock_now() - start
) << dendl
;
2894 int BlueFS::_flush(FileWriter
*h
, bool force
, std::unique_lock
<ceph::mutex
>& l
)
2896 bool flushed
= false;
2897 int r
= _flush(h
, force
, &flushed
);
2898 if (r
== 0 && flushed
) {
2899 _maybe_compact_log(l
);
2904 int BlueFS::_flush(FileWriter
*h
, bool force
, bool *flushed
)
2906 h
->buffer_appender
.flush();
2907 uint64_t length
= h
->buffer
.length();
2908 uint64_t offset
= h
->pos
;
2913 length
< cct
->_conf
->bluefs_min_flush_size
) {
2914 dout(10) << __func__
<< " " << h
<< " ignoring, length " << length
2915 << " < min_flush_size " << cct
->_conf
->bluefs_min_flush_size
2920 dout(10) << __func__
<< " " << h
<< " no dirty data on "
2921 << h
->file
->fnode
<< dendl
;
2924 dout(10) << __func__
<< " " << h
<< " 0x"
2925 << std::hex
<< offset
<< "~" << length
<< std::dec
2926 << " to " << h
->file
->fnode
<< dendl
;
2927 ceph_assert(h
->pos
<= h
->file
->fnode
.size
);
2928 int r
= _flush_range(h
, offset
, length
);
2935 int BlueFS::_truncate(FileWriter
*h
, uint64_t offset
)
2937 dout(10) << __func__
<< " 0x" << std::hex
<< offset
<< std::dec
2938 << " file " << h
->file
->fnode
<< dendl
;
2939 if (h
->file
->deleted
) {
2940 dout(10) << __func__
<< " deleted, no-op" << dendl
;
2944 // we never truncate internal log files
2945 ceph_assert(h
->file
->fnode
.ino
> 1);
2947 h
->buffer_appender
.flush();
2949 // truncate off unflushed data?
2950 if (h
->pos
< offset
&&
2951 h
->pos
+ h
->buffer
.length() > offset
) {
2953 dout(20) << __func__
<< " tossing out last " << offset
- h
->pos
2954 << " unflushed bytes" << dendl
;
2955 t
.substr_of(h
->buffer
, 0, offset
- h
->pos
);
2957 ceph_abort_msg("actually this shouldn't happen");
2959 if (h
->buffer
.length()) {
2960 int r
= _flush(h
, true);
2964 if (offset
== h
->file
->fnode
.size
) {
2967 if (offset
> h
->file
->fnode
.size
) {
2968 ceph_abort_msg("truncate up not supported");
2970 ceph_assert(h
->file
->fnode
.size
>= offset
);
2971 vselector
->sub_usage(h
->file
->vselector_hint
, h
->file
->fnode
.size
);
2972 h
->file
->fnode
.size
= offset
;
2973 vselector
->add_usage(h
->file
->vselector_hint
, h
->file
->fnode
.size
);
2974 log_t
.op_file_update(h
->file
->fnode
);
2978 int BlueFS::_fsync(FileWriter
*h
, std::unique_lock
<ceph::mutex
>& l
)
2980 dout(10) << __func__
<< " " << h
<< " " << h
->file
->fnode
<< dendl
;
2981 int r
= _flush(h
, true);
2984 uint64_t old_dirty_seq
= h
->file
->dirty_seq
;
2986 _flush_bdev_safely(h
);
2988 if (old_dirty_seq
) {
2989 uint64_t s
= log_seq
;
2990 dout(20) << __func__
<< " file metadata was dirty (" << old_dirty_seq
2991 << ") on " << h
->file
->fnode
<< ", flushing log" << dendl
;
2992 _flush_and_sync_log(l
, old_dirty_seq
);
2993 ceph_assert(h
->file
->dirty_seq
== 0 || // cleaned
2994 h
->file
->dirty_seq
> s
); // or redirtied by someone else
2999 void BlueFS::_flush_bdev_safely(FileWriter
*h
)
3001 std::array
<bool, MAX_BDEV
> flush_devs
= h
->dirty_devs
;
3002 h
->dirty_devs
.fill(false);
3004 if (!cct
->_conf
->bluefs_sync_write
) {
3005 list
<aio_t
> completed_ios
;
3006 _claim_completed_aios(h
, &completed_ios
);
3009 completed_ios
.clear();
3010 flush_bdev(flush_devs
);
3016 flush_bdev(flush_devs
);
3021 void BlueFS::flush_bdev(std::array
<bool, MAX_BDEV
>& dirty_bdevs
)
3023 // NOTE: this is safe to call without a lock.
3024 dout(20) << __func__
<< dendl
;
3025 for (unsigned i
= 0; i
< MAX_BDEV
; i
++) {
3031 void BlueFS::flush_bdev()
3033 // NOTE: this is safe to call without a lock.
3034 dout(20) << __func__
<< dendl
;
3035 for (auto p
: bdev
) {
3041 const char* BlueFS::get_device_name(unsigned id
)
3043 if (id
>= MAX_BDEV
) return "BDEV_INV";
3044 const char* names
[] = {"BDEV_WAL", "BDEV_DB", "BDEV_SLOW", "BDEV_NEWWAL", "BDEV_NEWDB"};
3048 int BlueFS::_expand_slow_device(uint64_t need
, PExtentVector
& extents
)
3051 if (slow_dev_expander
) {
3052 auto id
= _get_slow_device_id();
3053 auto min_alloc_size
= alloc_size
[id
];
3054 ceph_assert(id
<= alloc
.size() && alloc
[id
]);
3055 auto min_need
= round_up_to(need
, min_alloc_size
);
3056 need
= std::max(need
,
3057 slow_dev_expander
->get_recommended_expansion_delta(
3058 alloc
[id
]->get_free(), block_all
[id
].size()));
3060 need
= round_up_to(need
, min_alloc_size
);
3061 dout(10) << __func__
<< " expanding slow device by 0x"
3062 << std::hex
<< need
<< std::dec
3064 r
= slow_dev_expander
->allocate_freespace(min_need
, need
, extents
);
3069 int BlueFS::_allocate_without_fallback(uint8_t id
, uint64_t len
,
3070 PExtentVector
* extents
)
3072 dout(10) << __func__
<< " len 0x" << std::hex
<< len
<< std::dec
3073 << " from " << (int)id
<< dendl
;
3074 assert(id
< alloc
.size());
3078 extents
->reserve(4); // 4 should be (more than) enough for most allocations
3079 uint64_t min_alloc_size
= alloc_size
[id
];
3080 uint64_t left
= round_up_to(len
, min_alloc_size
);
3081 int64_t alloc_len
= alloc
[id
]->allocate(left
, min_alloc_size
, 0, extents
);
3082 if (alloc_len
< 0 || alloc_len
< (int64_t)left
) {
3083 if (alloc_len
> 0) {
3084 alloc
[id
]->release(*extents
);
3087 derr
<< __func__
<< " failed to allocate 0x" << std::hex
<< left
3088 << " on bdev " << (int)id
3089 << ", free 0x" << alloc
[id
]->get_free() << std::dec
<< dendl
;
3091 derr
<< __func__
<< " failed to allocate 0x" << std::hex
<< left
3092 << " on bdev " << (int)id
<< ", dne" << std::dec
<< dendl
;
3101 int BlueFS::_allocate(uint8_t id
, uint64_t len
,
3102 bluefs_fnode_t
* node
)
3104 dout(10) << __func__
<< " len 0x" << std::hex
<< len
<< std::dec
3105 << " from " << (int)id
<< dendl
;
3106 ceph_assert(id
< alloc
.size());
3107 int64_t alloc_len
= 0;
3108 PExtentVector extents
;
3111 if (!node
->extents
.empty() && node
->extents
.back().bdev
== id
) {
3112 hint
= node
->extents
.back().end();
3114 extents
.reserve(4); // 4 should be (more than) enough for most allocations
3115 alloc_len
= alloc
[id
]->allocate(round_up_to(len
, alloc_size
[id
]),
3116 alloc_size
[id
], hint
, &extents
);
3120 alloc_len
< (int64_t)round_up_to(len
, alloc_size
[id
])) {
3121 if (alloc_len
> 0) {
3122 alloc
[id
]->release(extents
);
3124 if (id
!= BDEV_SLOW
) {
3126 dout(1) << __func__
<< " failed to allocate 0x" << std::hex
<< len
3127 << " on bdev " << (int)id
3128 << ", free 0x" << alloc
[id
]->get_free()
3129 << "; fallback to bdev " << (int)id
+ 1
3130 << std::dec
<< dendl
;
3132 return _allocate(id
+ 1, len
, node
);
3134 dout(1) << __func__
<< " unable to allocate 0x" << std::hex
<< len
3135 << " on bdev " << (int)id
<< ", free 0x"
3136 << (alloc
[id
] ? alloc
[id
]->get_free() : (uint64_t)-1)
3137 << "; fallback to slow device expander "
3138 << std::dec
<< dendl
;
3140 if (_expand_slow_device(len
, extents
) == 0) {
3141 id
= _get_slow_device_id();
3142 for (auto& e
: extents
) {
3143 _add_block_extent(id
, e
.offset
, e
.length
);
3146 auto* last_alloc
= alloc
[id
];
3147 ceph_assert(last_alloc
);
3149 alloc_len
= last_alloc
->allocate(round_up_to(len
, alloc_size
[id
]),
3150 alloc_size
[id
], hint
, &extents
);
3151 if (alloc_len
< 0 || alloc_len
< (int64_t)len
) {
3152 if (alloc_len
> 0) {
3153 last_alloc
->release(extents
);
3155 derr
<< __func__
<< " failed to allocate 0x" << std::hex
<< len
3156 << " on bdev " << (int)id
3157 << ", free 0x" << last_alloc
->get_free() << std::dec
<< dendl
;
3161 derr
<< __func__
<< " failed to expand slow device to fit +0x"
3162 << std::hex
<< len
<< std::dec
3167 uint64_t total_allocated
=
3168 block_all
[id
].size() - alloc
[id
]->get_free();
3169 if (max_bytes
[id
] < total_allocated
) {
3170 logger
->set(max_bytes_pcounters
[id
], total_allocated
);
3171 max_bytes
[id
] = total_allocated
;
3175 for (auto& p
: extents
) {
3176 node
->append_extent(bluefs_extent_t(id
, p
.offset
, p
.length
));
3182 int BlueFS::_preallocate(FileRef f
, uint64_t off
, uint64_t len
)
3184 dout(10) << __func__
<< " file " << f
->fnode
<< " 0x"
3185 << std::hex
<< off
<< "~" << len
<< std::dec
<< dendl
;
3187 dout(10) << __func__
<< " deleted, no-op" << dendl
;
3190 ceph_assert(f
->fnode
.ino
> 1);
3191 uint64_t allocated
= f
->fnode
.get_allocated();
3192 if (off
+ len
> allocated
) {
3193 uint64_t want
= off
+ len
- allocated
;
3194 vselector
->sub_usage(f
->vselector_hint
, f
->fnode
);
3196 int r
= _allocate(vselector
->select_prefer_bdev(f
->vselector_hint
),
3199 vselector
->add_usage(f
->vselector_hint
, f
->fnode
);
3202 log_t
.op_file_update(f
->fnode
);
3207 void BlueFS::sync_metadata(bool avoid_compact
)
3209 std::unique_lock
<ceph::mutex
> l(lock
);
3210 if (log_t
.empty() && dirty_files
.empty()) {
3211 dout(10) << __func__
<< " - no pending log events" << dendl
;
3213 dout(10) << __func__
<< dendl
;
3214 utime_t start
= ceph_clock_now();
3215 flush_bdev(); // FIXME?
3216 _flush_and_sync_log(l
);
3217 dout(10) << __func__
<< " done in " << (ceph_clock_now() - start
) << dendl
;
3220 if (!avoid_compact
) {
3221 _maybe_compact_log(l
);
3225 void BlueFS::_maybe_compact_log(std::unique_lock
<ceph::mutex
>& l
)
3227 if (!cct
->_conf
->bluefs_replay_recovery_disable_compact
&&
3228 _should_compact_log()) {
3229 if (cct
->_conf
->bluefs_compact_log_sync
) {
3230 _compact_log_sync();
3232 _compact_log_async(l
);
3237 int BlueFS::open_for_write(
3238 const string
& dirname
,
3239 const string
& filename
,
3243 std::lock_guard
l(lock
);
3244 dout(10) << __func__
<< " " << dirname
<< "/" << filename
<< dendl
;
3245 map
<string
,DirRef
>::iterator p
= dir_map
.find(dirname
);
3247 if (p
== dir_map
.end()) {
3248 // implicitly create the dir
3249 dout(20) << __func__
<< " dir " << dirname
3250 << " does not exist" << dendl
;
3257 bool create
= false;
3258 bool truncate
= false;
3259 map
<string
,FileRef
>::iterator q
= dir
->file_map
.find(filename
);
3260 if (q
== dir
->file_map
.end()) {
3262 dout(20) << __func__
<< " dir " << dirname
<< " (" << dir
3263 << ") file " << filename
3264 << " does not exist" << dendl
;
3267 file
= ceph::make_ref
<File
>();
3268 file
->fnode
.ino
= ++ino_last
;
3269 file_map
[ino_last
] = file
;
3270 dir
->file_map
[filename
] = file
;
3274 // overwrite existing file?
3277 dout(20) << __func__
<< " dir " << dirname
<< " (" << dir
3278 << ") file " << filename
3279 << " already exists, overwrite in place" << dendl
;
3281 dout(20) << __func__
<< " dir " << dirname
<< " (" << dir
3282 << ") file " << filename
3283 << " already exists, truncate + overwrite" << dendl
;
3284 vselector
->sub_usage(file
->vselector_hint
, file
->fnode
);
3285 file
->fnode
.size
= 0;
3286 for (auto& p
: file
->fnode
.extents
) {
3287 pending_release
[p
.bdev
].insert(p
.offset
, p
.length
);
3291 file
->fnode
.clear_extents();
3294 ceph_assert(file
->fnode
.ino
> 1);
3296 file
->fnode
.mtime
= ceph_clock_now();
3297 file
->vselector_hint
= vselector
->get_hint_by_dir(dirname
);
3298 if (create
|| truncate
) {
3299 vselector
->add_usage(file
->vselector_hint
, file
->fnode
); // update file count
3302 dout(20) << __func__
<< " mapping " << dirname
<< "/" << filename
3303 << " vsel_hint " << file
->vselector_hint
3306 log_t
.op_file_update(file
->fnode
);
3308 log_t
.op_dir_link(dirname
, filename
, file
->fnode
.ino
);
3310 *h
= _create_writer(file
);
3312 if (boost::algorithm::ends_with(filename
, ".log")) {
3313 (*h
)->writer_type
= BlueFS::WRITER_WAL
;
3314 if (logger
&& !overwrite
) {
3315 logger
->inc(l_bluefs_files_written_wal
);
3317 } else if (boost::algorithm::ends_with(filename
, ".sst")) {
3318 (*h
)->writer_type
= BlueFS::WRITER_SST
;
3320 logger
->inc(l_bluefs_files_written_sst
);
3324 dout(10) << __func__
<< " h " << *h
<< " on " << file
->fnode
<< dendl
;
3328 BlueFS::FileWriter
*BlueFS::_create_writer(FileRef f
)
3330 FileWriter
*w
= new FileWriter(f
);
3331 for (unsigned i
= 0; i
< MAX_BDEV
; ++i
) {
3333 w
->iocv
[i
] = new IOContext(cct
, NULL
);
3339 void BlueFS::_close_writer(FileWriter
*h
)
3341 dout(10) << __func__
<< " " << h
<< " type " << h
->writer_type
<< dendl
;
3342 h
->buffer
.reassign_to_mempool(mempool::mempool_bluefs_file_writer
);
3343 for (unsigned i
=0; i
<MAX_BDEV
; ++i
) {
3346 h
->iocv
[i
]->aio_wait();
3347 bdev
[i
]->queue_reap_ioc(h
->iocv
[i
]);
3354 int BlueFS::open_for_read(
3355 const string
& dirname
,
3356 const string
& filename
,
3360 std::lock_guard
l(lock
);
3361 dout(10) << __func__
<< " " << dirname
<< "/" << filename
3362 << (random
? " (random)":" (sequential)") << dendl
;
3363 map
<string
,DirRef
>::iterator p
= dir_map
.find(dirname
);
3364 if (p
== dir_map
.end()) {
3365 dout(20) << __func__
<< " dir " << dirname
<< " not found" << dendl
;
3368 DirRef dir
= p
->second
;
3370 map
<string
,FileRef
>::iterator q
= dir
->file_map
.find(filename
);
3371 if (q
== dir
->file_map
.end()) {
3372 dout(20) << __func__
<< " dir " << dirname
<< " (" << dir
3373 << ") file " << filename
3374 << " not found" << dendl
;
3377 File
*file
= q
->second
.get();
3379 *h
= new FileReader(file
, random
? 4096 : cct
->_conf
->bluefs_max_prefetch
,
3381 dout(10) << __func__
<< " h " << *h
<< " on " << file
->fnode
<< dendl
;
3386 const string
& old_dirname
, const string
& old_filename
,
3387 const string
& new_dirname
, const string
& new_filename
)
3389 std::lock_guard
l(lock
);
3390 dout(10) << __func__
<< " " << old_dirname
<< "/" << old_filename
3391 << " -> " << new_dirname
<< "/" << new_filename
<< dendl
;
3392 map
<string
,DirRef
>::iterator p
= dir_map
.find(old_dirname
);
3393 if (p
== dir_map
.end()) {
3394 dout(20) << __func__
<< " dir " << old_dirname
<< " not found" << dendl
;
3397 DirRef old_dir
= p
->second
;
3398 map
<string
,FileRef
>::iterator q
= old_dir
->file_map
.find(old_filename
);
3399 if (q
== old_dir
->file_map
.end()) {
3400 dout(20) << __func__
<< " dir " << old_dirname
<< " (" << old_dir
3401 << ") file " << old_filename
3402 << " not found" << dendl
;
3405 FileRef file
= q
->second
;
3407 p
= dir_map
.find(new_dirname
);
3408 if (p
== dir_map
.end()) {
3409 dout(20) << __func__
<< " dir " << new_dirname
<< " not found" << dendl
;
3412 DirRef new_dir
= p
->second
;
3413 q
= new_dir
->file_map
.find(new_filename
);
3414 if (q
!= new_dir
->file_map
.end()) {
3415 dout(20) << __func__
<< " dir " << new_dirname
<< " (" << old_dir
3416 << ") file " << new_filename
3417 << " already exists, unlinking" << dendl
;
3418 ceph_assert(q
->second
!= file
);
3419 log_t
.op_dir_unlink(new_dirname
, new_filename
);
3420 _drop_link(q
->second
);
3423 dout(10) << __func__
<< " " << new_dirname
<< "/" << new_filename
<< " "
3424 << " " << file
->fnode
<< dendl
;
3426 new_dir
->file_map
[new_filename
] = file
;
3427 old_dir
->file_map
.erase(old_filename
);
3429 log_t
.op_dir_link(new_dirname
, new_filename
, file
->fnode
.ino
);
3430 log_t
.op_dir_unlink(old_dirname
, old_filename
);
3434 int BlueFS::mkdir(const string
& dirname
)
3436 std::lock_guard
l(lock
);
3437 dout(10) << __func__
<< " " << dirname
<< dendl
;
3438 map
<string
,DirRef
>::iterator p
= dir_map
.find(dirname
);
3439 if (p
!= dir_map
.end()) {
3440 dout(20) << __func__
<< " dir " << dirname
<< " exists" << dendl
;
3443 dir_map
[dirname
] = ceph::make_ref
<Dir
>();
3444 log_t
.op_dir_create(dirname
);
3448 int BlueFS::rmdir(const string
& dirname
)
3450 std::lock_guard
l(lock
);
3451 dout(10) << __func__
<< " " << dirname
<< dendl
;
3452 map
<string
,DirRef
>::iterator p
= dir_map
.find(dirname
);
3453 if (p
== dir_map
.end()) {
3454 dout(20) << __func__
<< " dir " << dirname
<< " does not exist" << dendl
;
3457 DirRef dir
= p
->second
;
3458 if (!dir
->file_map
.empty()) {
3459 dout(20) << __func__
<< " dir " << dirname
<< " not empty" << dendl
;
3462 dir_map
.erase(dirname
);
3463 log_t
.op_dir_remove(dirname
);
3467 bool BlueFS::dir_exists(const string
& dirname
)
3469 std::lock_guard
l(lock
);
3470 map
<string
,DirRef
>::iterator p
= dir_map
.find(dirname
);
3471 bool exists
= p
!= dir_map
.end();
3472 dout(10) << __func__
<< " " << dirname
<< " = " << (int)exists
<< dendl
;
3476 int BlueFS::stat(const string
& dirname
, const string
& filename
,
3477 uint64_t *size
, utime_t
*mtime
)
3479 std::lock_guard
l(lock
);
3480 dout(10) << __func__
<< " " << dirname
<< "/" << filename
<< dendl
;
3481 map
<string
,DirRef
>::iterator p
= dir_map
.find(dirname
);
3482 if (p
== dir_map
.end()) {
3483 dout(20) << __func__
<< " dir " << dirname
<< " not found" << dendl
;
3486 DirRef dir
= p
->second
;
3487 map
<string
,FileRef
>::iterator q
= dir
->file_map
.find(filename
);
3488 if (q
== dir
->file_map
.end()) {
3489 dout(20) << __func__
<< " dir " << dirname
<< " (" << dir
3490 << ") file " << filename
3491 << " not found" << dendl
;
3494 File
*file
= q
->second
.get();
3495 dout(10) << __func__
<< " " << dirname
<< "/" << filename
3496 << " " << file
->fnode
<< dendl
;
3498 *size
= file
->fnode
.size
;
3500 *mtime
= file
->fnode
.mtime
;
3504 int BlueFS::lock_file(const string
& dirname
, const string
& filename
,
3507 std::lock_guard
l(lock
);
3508 dout(10) << __func__
<< " " << dirname
<< "/" << filename
<< dendl
;
3509 map
<string
,DirRef
>::iterator p
= dir_map
.find(dirname
);
3510 if (p
== dir_map
.end()) {
3511 dout(20) << __func__
<< " dir " << dirname
<< " not found" << dendl
;
3514 DirRef dir
= p
->second
;
3515 map
<string
,FileRef
>::iterator q
= dir
->file_map
.find(filename
);
3517 if (q
== dir
->file_map
.end()) {
3518 dout(20) << __func__
<< " dir " << dirname
<< " (" << dir
3519 << ") file " << filename
3520 << " not found, creating" << dendl
;
3521 file
= ceph::make_ref
<File
>();
3522 file
->fnode
.ino
= ++ino_last
;
3523 file
->fnode
.mtime
= ceph_clock_now();
3524 file_map
[ino_last
] = file
;
3525 dir
->file_map
[filename
] = file
;
3527 log_t
.op_file_update(file
->fnode
);
3528 log_t
.op_dir_link(dirname
, filename
, file
->fnode
.ino
);
3532 dout(10) << __func__
<< " already locked" << dendl
;
3536 file
->locked
= true;
3537 *plock
= new FileLock(file
);
3538 dout(10) << __func__
<< " locked " << file
->fnode
3539 << " with " << *plock
<< dendl
;
3543 int BlueFS::unlock_file(FileLock
*fl
)
3545 std::lock_guard
l(lock
);
3546 dout(10) << __func__
<< " " << fl
<< " on " << fl
->file
->fnode
<< dendl
;
3547 ceph_assert(fl
->file
->locked
);
3548 fl
->file
->locked
= false;
3553 int BlueFS::readdir(const string
& dirname
, vector
<string
> *ls
)
3555 std::lock_guard
l(lock
);
3556 dout(10) << __func__
<< " " << dirname
<< dendl
;
3557 if (dirname
.empty()) {
3559 ls
->reserve(dir_map
.size() + 2);
3560 for (auto& q
: dir_map
) {
3561 ls
->push_back(q
.first
);
3564 // list files in dir
3565 map
<string
,DirRef
>::iterator p
= dir_map
.find(dirname
);
3566 if (p
== dir_map
.end()) {
3567 dout(20) << __func__
<< " dir " << dirname
<< " not found" << dendl
;
3570 DirRef dir
= p
->second
;
3571 ls
->reserve(dir
->file_map
.size() + 2);
3572 for (auto& q
: dir
->file_map
) {
3573 ls
->push_back(q
.first
);
3577 ls
->push_back("..");
// Remove the directory entry dirname/filename.  Looks up the directory
// and then the file, refuses (with a debug log) if the file is locked,
// erases the entry from the in-memory dir map, and records the unlink
// in the bluefs journal (log_t).  Runs under the global BlueFS mutex.
// NOTE(review): error-return statements and closing braces are missing
// from this extraction; code lines below are verbatim.
3581 int BlueFS::unlink(const string
& dirname
, const string
& filename
)
3583 std::lock_guard
l(lock
);
3584 dout(10) << __func__
<< " " << dirname
<< "/" << filename
<< dendl
;
3585 map
<string
,DirRef
>::iterator p
= dir_map
.find(dirname
);
// Directory must exist before we can resolve the file.
3586 if (p
== dir_map
.end()) {
3587 dout(20) << __func__
<< " dir " << dirname
<< " not found" << dendl
;
3590 DirRef dir
= p
->second
;
3591 map
<string
,FileRef
>::iterator q
= dir
->file_map
.find(filename
);
3592 if (q
== dir
->file_map
.end()) {
3593 dout(20) << __func__
<< " file " << dirname
<< "/" << filename
3594 << " not found" << dendl
;
3597 FileRef file
= q
->second
;
// A locked file cannot be unlinked (lock check itself is on a line not
// visible in this extraction).
3599 dout(20) << __func__
<< " file " << dirname
<< "/" << filename
3600 << " is locked" << dendl
;
// Drop the in-memory entry, then journal the unlink so replay agrees.
3603 dir
->file_map
.erase(filename
);
3604 log_t
.op_dir_unlink(dirname
, filename
);
3609 bool BlueFS::wal_is_rotational()
3611 if (bdev
[BDEV_WAL
]) {
3612 return bdev
[BDEV_WAL
]->is_rotational();
3613 } else if (bdev
[BDEV_DB
]) {
3614 return bdev
[BDEV_DB
]->is_rotational();
3616 return bdev
[BDEV_SLOW
]->is_rotational();
3621 do_replay_recovery_read is used when the bluefs log ends abruptly, but it seems that more data should be there.
3622 The idea is to search the disk for definitions of extents that will accompany the bluefs log in the future,
3623 and try if using it will produce healthy bluefs transaction.
3624 We encode already known bluefs log extents and search disk for these bytes.
3625 When we find it, we decode following bytes as extent.
3626 We read that whole extent and then check if merged with existing log part gives a proper bluefs transaction.
// Attempt to recover additional bluefs log data when replay hits an
// abrupt end of the log (see the explanatory comment block just above
// in the file).  Strategy: encode the currently known log extents,
// scan the raw devices for the last 32 bytes of that encoding, and try
// to decode what follows as the next log extent; a candidate is
// accepted only if the merged data decodes as a valid transaction.
// NOTE(review): this extraction is missing a number of original lines
// (the full parameter list, some braces/returns); all code lines below
// are preserved verbatim.
3628 int BlueFS::do_replay_recovery_read(FileReader
*log_reader
,
3633 dout(1) << __func__
<< " replay_pos=0x" << std::hex
<< replay_pos
<<
3634 " needs 0x" << read_offset
<< "~" << read_len
<< std::dec
<< dendl
;
3636 bluefs_fnode_t
& log_fnode
= log_reader
->file
->fnode
;
// Binary encoding of the known log extents: the needle we search for.
3637 bufferlist bin_extents
;
3638 ceph::encode(log_fnode
.extents
, bin_extents
);
3639 dout(2) << __func__
<< " log file encoded extents length = " << bin_extents
.length() << dendl
;
3641 // cannot process if too small to effectively search
3642 ceph_assert(bin_extents
.length() >= 32);
// Only the final 32 bytes of the encoding are used as the search key.
3644 last_32
.substr_of(bin_extents
, bin_extents
.length() - 32, 32);
3646 //read fixed part from replay_pos to end of bluefs_log extents
// Read the tail of the current log (from replay_pos) into 'fixed';
// first the partial extent at replay_pos, then the remaining extents.
3649 auto e
= log_fnode
.seek(replay_pos
, &e_off
);
3650 ceph_assert(e
!= log_fnode
.extents
.end());
3651 int r
= bdev
[e
->bdev
]->read(e
->offset
+ e_off
, e
->length
- e_off
, &fixed
, ioc
[e
->bdev
],
3652 cct
->_conf
->bluefs_buffered_io
);
3653 ceph_assert(r
== 0);
3654 //capture dev of last good extent
3655 uint8_t last_e_dev
= e
->bdev
;
3656 uint64_t last_e_off
= e
->offset
;
3658 while (e
!= log_fnode
.extents
.end()) {
3659 r
= bdev
[e
->bdev
]->read(e
->offset
, e
->length
, &fixed
, ioc
[e
->bdev
],
3660 cct
->_conf
->bluefs_buffered_io
);
3661 ceph_assert(r
== 0);
3662 last_e_dev
= e
->bdev
;
// Sanity: the data gathered so far must end exactly where the missing
// read was requested to begin.
3665 ceph_assert(replay_pos
+ fixed
.length() == read_offset
);
3667 dout(2) << __func__
<< " valid data in log = " << fixed
.length() << dendl
;
// Strict-weak-style ordering for bluefs_extent_t used by the rejected
// set below (local comparator struct; its header line is not visible
// in this extraction).
3670 bool operator()(const bluefs_extent_t
& a
, const bluefs_extent_t
& b
) const {
3671 if (a
.bdev
< b
.bdev
) return true;
3672 if (a
.offset
< b
.offset
) return true;
3673 return a
.length
< b
.length
;
// Extents already tried and rejected, so each candidate is tested once.
3676 std::set
<bluefs_extent_t
, compare
> extents_rejected
;
// Scan up to three devices, starting with the device that held the
// last good log extent.
3677 for (int dcnt
= 0; dcnt
< 3; dcnt
++) {
3678 uint8_t dev
= (last_e_dev
+ dcnt
) % MAX_BDEV
;
3679 if (bdev
[dev
] == nullptr) continue;
3680 dout(2) << __func__
<< " processing " << get_device_name(dev
) << dendl
;
// Start from the whole device, then punch out every region already
// owned by a known file — only unclaimed space can hide lost log data.
3681 interval_set
<uint64_t> disk_regions
;
3682 disk_regions
.insert(0, bdev
[dev
]->get_size());
3683 for (auto f
: file_map
) {
3684 auto& e
= f
.second
->fnode
.extents
;
3686 if (p
.bdev
== dev
) {
3687 disk_regions
.erase(p
.offset
, p
.length
);
3691 size_t disk_regions_count
= disk_regions
.num_intervals();
3692 dout(5) << __func__
<< " " << disk_regions_count
<< " regions to scan on " << get_device_name(dev
) << dendl
;
// Begin near the last good extent: lost data is most likely close by.
3694 auto reg
= disk_regions
.lower_bound(last_e_off
);
3695 //for all except first, start from beginning
3697 if (reg
== disk_regions
.end()) {
3698 reg
= disk_regions
.begin();
3700 const uint64_t chunk_size
= 4 * 1024 * 1024;
3701 const uint64_t page_size
= 4096;
3702 const uint64_t max_extent_size
= 16;
// Overlap carried between chunks so a match spanning a chunk boundary
// is not missed: search key length plus a worst-case encoded extent.
3703 uint64_t overlay_size
= last_32
.length() + max_extent_size
;
3704 for (size_t i
= 0; i
< disk_regions_count
; reg
++, i
++) {
3705 if (reg
== disk_regions
.end()) {
3706 reg
= disk_regions
.begin();
3708 uint64_t pos
= reg
.get_start();
3709 uint64_t len
= reg
.get_len();
// Buffer layout: [page_size bytes of overlay scratch][chunk_size data].
3711 std::unique_ptr
<char[]> raw_data_p
{new char[page_size
+ chunk_size
]};
3712 char* raw_data
= raw_data_p
.get();
3713 memset(raw_data
, 0, page_size
);
3715 while (len
> last_32
.length()) {
3716 uint64_t chunk_len
= len
> chunk_size
? chunk_size
: len
;
3717 dout(5) << __func__
<< " read "
3718 << get_device_name(dev
) << ":0x" << std::hex
<< pos
<< "+" << chunk_len
<< std::dec
<< dendl
;
3719 r
= bdev
[dev
]->read_random(pos
, chunk_len
, raw_data
+ page_size
, cct
->_conf
->bluefs_buffered_io
);
3720 ceph_assert(r
== 0);
3722 //search for fixed_last_32
3723 char* chunk_b
= raw_data
+ page_size
;
3724 char* chunk_e
= chunk_b
+ chunk_len
;
// Search window includes the overlay saved from the previous chunk.
3726 char* search_b
= chunk_b
- overlay_size
;
3727 char* search_e
= chunk_e
;
3729 for (char* sp
= search_b
; ; sp
+= last_32
.length()) {
3730 sp
= (char*)memmem(sp
, search_e
- sp
, last_32
.c_str(), last_32
.length());
3731 if (sp
== nullptr) {
// 'n' points just past the match: the bytes that should decode as the
// next extent definition.
3735 char* n
= sp
+ last_32
.length();
3736 dout(5) << __func__
<< " checking location 0x" << std::hex
<< pos
+ (n
- chunk_b
) << std::dec
<< dendl
;
3738 test
.append(n
, std::min
<size_t>(max_extent_size
, chunk_e
- n
));
3741 bufferlist::const_iterator p
= test
.begin();
3742 ceph::decode(ne
, p
);
3743 } catch (buffer::error
& e
) {
3746 if (extents_rejected
.count(ne
) != 0) {
// NOTE(review): "refected" below is a typo for "rejected" in a log
// message (runtime string — fix separately, not in this doc pass).
3747 dout(5) << __func__
<< " extent " << ne
<< " already refected" <<dendl
;
3750 //insert as rejected already. if we succeed, it wouldn't make difference.
3751 extents_rejected
.insert(ne
);
// Plausibility filter for the decoded extent: valid device, sane
// length (<=16MiB), 4KiB-aligned offset/length, fits on the device.
3753 if (ne
.bdev
>= MAX_BDEV
||
3754 bdev
[ne
.bdev
] == nullptr ||
3755 ne
.length
> 16 * 1024 * 1024 ||
3756 (ne
.length
& 4095) != 0 ||
3757 ne
.offset
+ ne
.length
> bdev
[ne
.bdev
]->get_size() ||
3758 (ne
.offset
& 4095) != 0) {
3759 dout(5) << __func__
<< " refusing extent " << ne
<< dendl
;
3762 dout(5) << __func__
<< " checking extent " << ne
<< dendl
;
3764 //read candidate extent - whole
// Candidate = known-good tail ('fixed') + the newly found extent data.
3765 bufferlist candidate
;
3766 candidate
.append(fixed
);
3767 r
= bdev
[ne
.bdev
]->read(ne
.offset
, ne
.length
, &candidate
, ioc
[ne
.bdev
],
3768 cct
->_conf
->bluefs_buffered_io
);
3769 ceph_assert(r
== 0);
3771 //check if transaction & crc is ok
3772 bluefs_transaction_t t
;
3774 bufferlist::const_iterator p
= candidate
.cbegin();
3777 catch (buffer::error
& e
) {
3778 dout(5) << __func__
<< " failed match" << dendl
;
3782 //success, it seems a probable candidate
3783 uint64_t l
= std::min
<uint64_t>(ne
.length
, read_len
);
3784 //trim to required size
3785 bufferlist requested_read
;
3786 requested_read
.substr_of(candidate
, fixed
.length(), l
);
3787 bl
->append(requested_read
);
3788 dout(5) << __func__
<< " successful extension of log " << l
<< "/" << read_len
<< dendl
;
// Graft the recovered extent onto the log fnode and advance the
// reader position past the recovered bytes.
3789 log_fnode
.append_extent(ne
);
3790 log_fnode
.recalc_allocated();
3791 log_reader
->buf
.pos
+= l
;
3794 //save overlay for next search
3795 memcpy(search_b
, chunk_e
- overlay_size
, overlay_size
);
// Test/debug hook: deliberately re-add the range [offset, offset+len)
// as free space to allocator 'id', injecting a duplicate "gift" to
// exercise corruption-detection paths.  Silently does nothing if 'id'
// is out of range or that allocator is absent.
// NOTE(review): the remaining parameter lines (presumably offset/len)
// are missing from this extraction; code lines below are verbatim.
3804 void BlueFS::debug_inject_duplicate_gift(unsigned id
,
3808 dout(0) << __func__
<< dendl
;
3809 if (id
< alloc
.size() && alloc
[id
]) {
3810 alloc
[id
]->init_add_free(offset
, len
);
3814 // ===============================================
3815 // OriginalVolumeSelector
3817 void* OriginalVolumeSelector::get_hint_for_log() const {
3818 return reinterpret_cast<void*>(BlueFS::BDEV_WAL
);
3820 void* OriginalVolumeSelector::get_hint_by_dir(const string
& dirname
) const {
3821 uint8_t res
= BlueFS::BDEV_DB
;
3822 if (dirname
.length() > 5) {
3823 // the "db.slow" and "db.wal" directory names are hard-coded at
3824 // match up with bluestore. the slow device is always the second
3825 // one (when a dedicated block.db device is present and used at
3826 // bdev 0). the wal device is always last.
3827 if (boost::algorithm::ends_with(dirname
, ".slow")) {
3828 res
= BlueFS::BDEV_SLOW
;
3830 else if (boost::algorithm::ends_with(dirname
, ".wal")) {
3831 res
= BlueFS::BDEV_WAL
;
3834 return reinterpret_cast<void*>(res
);
3837 uint8_t OriginalVolumeSelector::select_prefer_bdev(void* hint
)
3839 return (uint8_t)(reinterpret_cast<uint64_t>(hint
));
3842 void OriginalVolumeSelector::get_paths(const std::string
& base
, paths
& res
) const
3844 res
.emplace_back(base
, db_total
);
3845 res
.emplace_back(base
+ ".slow", slow_total
);
3849 #define dout_prefix *_dout << "OriginalVolumeSelector: "
3851 void OriginalVolumeSelector::dump(ostream
& sout
) {
3852 sout
<< "wal_total:" << wal_total
3853 << ", db_total:" << db_total
3854 << ", slow_total:" << slow_total