1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 #include "boost/algorithm/string.hpp"
5 #include "bluestore_common.h"
8 #include "common/debug.h"
9 #include "common/errno.h"
10 #include "common/perf_counters.h"
11 #include "BlockDevice.h"
12 #include "Allocator.h"
13 #include "include/ceph_assert.h"
14 #include "common/admin_socket.h"
16 #define dout_context cct
17 #define dout_subsys ceph_subsys_bluefs
19 #define dout_prefix *_dout << "bluefs "
20 using TOPNSPC::common::cmd_getval
;
21 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::File
, bluefs_file
, bluefs
);
22 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::Dir
, bluefs_dir
, bluefs
);
23 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileWriter
, bluefs_file_writer
, bluefs
);
24 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileReaderBuffer
,
25 bluefs_file_reader_buffer
, bluefs
);
26 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileReader
, bluefs_file_reader
, bluefs
);
27 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileLock
, bluefs_file_lock
, bluefs
);
29 static void wal_discard_cb(void *priv
, void* priv2
) {
30 BlueFS
*bluefs
= static_cast<BlueFS
*>(priv
);
31 interval_set
<uint64_t> *tmp
= static_cast<interval_set
<uint64_t>*>(priv2
);
32 bluefs
->handle_discard(BlueFS::BDEV_WAL
, *tmp
);
35 static void db_discard_cb(void *priv
, void* priv2
) {
36 BlueFS
*bluefs
= static_cast<BlueFS
*>(priv
);
37 interval_set
<uint64_t> *tmp
= static_cast<interval_set
<uint64_t>*>(priv2
);
38 bluefs
->handle_discard(BlueFS::BDEV_DB
, *tmp
);
41 static void slow_discard_cb(void *priv
, void* priv2
) {
42 BlueFS
*bluefs
= static_cast<BlueFS
*>(priv
);
43 interval_set
<uint64_t> *tmp
= static_cast<interval_set
<uint64_t>*>(priv2
);
44 bluefs
->handle_discard(BlueFS::BDEV_SLOW
, *tmp
);
47 class BlueFS::SocketHook
: public AdminSocketHook
{
50 static BlueFS::SocketHook
* create(BlueFS
* bluefs
)
52 BlueFS::SocketHook
* hook
= nullptr;
53 AdminSocket
* admin_socket
= bluefs
->cct
->get_admin_socket();
55 hook
= new BlueFS::SocketHook(bluefs
);
56 int r
= admin_socket
->register_command("bluestore bluefs available "
57 "name=alloc_size,type=CephInt,req=false",
59 "Report available space for bluefs. "
60 "If alloc_size set, make simulation.");
62 ldout(bluefs
->cct
, 1) << __func__
<< " cannot register SocketHook" << dendl
;
66 r
= admin_socket
->register_command("bluestore bluefs stats",
68 "Dump internal statistics for bluefs."
77 AdminSocket
* admin_socket
= bluefs
->cct
->get_admin_socket();
78 admin_socket
->unregister_commands(this);
81 SocketHook(BlueFS
* bluefs
) :
83 int call(std::string_view command
, const cmdmap_t
& cmdmap
,
86 bufferlist
& out
) override
{
87 if (command
== "bluestore bluefs available") {
88 int64_t alloc_size
= 0;
89 cmd_getval(cmdmap
, "alloc_size", alloc_size
);
90 if ((alloc_size
& (alloc_size
- 1)) != 0) {
91 errss
<< "Invalid allocation size:'" << alloc_size
<< std::endl
;
95 alloc_size
= bluefs
->cct
->_conf
->bluefs_alloc_size
;
96 f
->open_object_section("bluefs_available_space");
97 for (unsigned dev
= BDEV_WAL
; dev
<= BDEV_SLOW
; dev
++) {
98 if (bluefs
->bdev
[dev
]) {
99 f
->open_object_section("dev");
100 f
->dump_string("device", bluefs
->get_device_name(dev
));
101 ceph_assert(bluefs
->alloc
[dev
]);
102 f
->dump_int("free", bluefs
->alloc
[dev
]->get_free());
106 size_t extra_space
= 0;
107 if (bluefs
->slow_dev_expander
) {
108 extra_space
= bluefs
->slow_dev_expander
->available_freespace(alloc_size
);
110 f
->dump_int("available_from_bluestore", extra_space
);
112 } else if (command
== "bluefs stats") {
113 std::stringstream ss
;
114 bluefs
->dump_block_extents(ss
);
115 bluefs
->dump_volume_selector(ss
);
118 errss
<< "Invalid command" << std::endl
;
125 BlueFS::BlueFS(CephContext
* cct
)
131 discard_cb
[BDEV_WAL
] = wal_discard_cb
;
132 discard_cb
[BDEV_DB
] = db_discard_cb
;
133 discard_cb
[BDEV_SLOW
] = slow_discard_cb
;
134 asok_hook
= SocketHook::create(this);
144 for (auto p
: bdev
) {
155 void BlueFS::_init_logger()
157 PerfCountersBuilder
b(cct
, "bluefs",
158 l_bluefs_first
, l_bluefs_last
);
159 b
.add_u64_counter(l_bluefs_gift_bytes
, "gift_bytes",
160 "Bytes gifted from BlueStore", NULL
, 0, unit_t(UNIT_BYTES
));
161 b
.add_u64_counter(l_bluefs_reclaim_bytes
, "reclaim_bytes",
162 "Bytes reclaimed by BlueStore", NULL
, 0, unit_t(UNIT_BYTES
));
163 b
.add_u64(l_bluefs_db_total_bytes
, "db_total_bytes",
164 "Total bytes (main db device)",
165 "b", PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
166 b
.add_u64(l_bluefs_db_used_bytes
, "db_used_bytes",
167 "Used bytes (main db device)",
168 "u", PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
169 b
.add_u64(l_bluefs_wal_total_bytes
, "wal_total_bytes",
170 "Total bytes (wal device)",
171 "walb", PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
172 b
.add_u64(l_bluefs_wal_used_bytes
, "wal_used_bytes",
173 "Used bytes (wal device)",
174 "walu", PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
175 b
.add_u64(l_bluefs_slow_total_bytes
, "slow_total_bytes",
176 "Total bytes (slow device)",
177 "slob", PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
178 b
.add_u64(l_bluefs_slow_used_bytes
, "slow_used_bytes",
179 "Used bytes (slow device)",
180 "slou", PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
181 b
.add_u64(l_bluefs_num_files
, "num_files", "File count",
182 "f", PerfCountersBuilder::PRIO_USEFUL
);
183 b
.add_u64(l_bluefs_log_bytes
, "log_bytes", "Size of the metadata log",
184 "jlen", PerfCountersBuilder::PRIO_INTERESTING
, unit_t(UNIT_BYTES
));
185 b
.add_u64_counter(l_bluefs_log_compactions
, "log_compactions",
186 "Compactions of the metadata log");
187 b
.add_u64_counter(l_bluefs_logged_bytes
, "logged_bytes",
188 "Bytes written to the metadata log", "j",
189 PerfCountersBuilder::PRIO_CRITICAL
, unit_t(UNIT_BYTES
));
190 b
.add_u64_counter(l_bluefs_files_written_wal
, "files_written_wal",
191 "Files written to WAL");
192 b
.add_u64_counter(l_bluefs_files_written_sst
, "files_written_sst",
193 "Files written to SSTs");
194 b
.add_u64_counter(l_bluefs_bytes_written_wal
, "bytes_written_wal",
195 "Bytes written to WAL", "wal",
196 PerfCountersBuilder::PRIO_CRITICAL
);
197 b
.add_u64_counter(l_bluefs_bytes_written_sst
, "bytes_written_sst",
198 "Bytes written to SSTs", "sst",
199 PerfCountersBuilder::PRIO_CRITICAL
, unit_t(UNIT_BYTES
));
200 b
.add_u64_counter(l_bluefs_bytes_written_slow
, "bytes_written_slow",
201 "Bytes written to WAL/SSTs at slow device", NULL
,
202 PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
203 b
.add_u64_counter(l_bluefs_max_bytes_wal
, "max_bytes_wal",
204 "Maximum bytes allocated from WAL");
205 b
.add_u64_counter(l_bluefs_max_bytes_db
, "max_bytes_db",
206 "Maximum bytes allocated from DB");
207 b
.add_u64_counter(l_bluefs_max_bytes_slow
, "max_bytes_slow",
208 "Maximum bytes allocated from SLOW");
210 b
.add_u64_counter(l_bluefs_read_random_count
, "read_random_count",
211 "random read requests processed");
212 b
.add_u64_counter(l_bluefs_read_random_bytes
, "read_random_bytes",
213 "Bytes requested in random read mode", NULL
,
214 PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
215 b
.add_u64_counter(l_bluefs_read_random_disk_count
, "read_random_disk_count",
216 "random reads requests going to disk");
217 b
.add_u64_counter(l_bluefs_read_random_disk_bytes
, "read_random_disk_bytes",
218 "Bytes read from disk in random read mode", NULL
,
219 PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
220 b
.add_u64_counter(l_bluefs_read_random_buffer_count
, "read_random_buffer_count",
221 "random read requests processed using prefetch buffer");
222 b
.add_u64_counter(l_bluefs_read_random_buffer_bytes
, "read_random_buffer_bytes",
223 "Bytes read from prefetch buffer in random read mode", NULL
,
224 PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
226 b
.add_u64_counter(l_bluefs_read_count
, "read_count",
227 "buffered read requests processed");
228 b
.add_u64_counter(l_bluefs_read_bytes
, "read_bytes",
229 "Bytes requested in buffered read mode", NULL
,
230 PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
232 b
.add_u64_counter(l_bluefs_read_prefetch_count
, "read_prefetch_count",
233 "prefetch read requests processed");
234 b
.add_u64_counter(l_bluefs_read_prefetch_bytes
, "read_prefetch_bytes",
235 "Bytes requested in prefetch read mode", NULL
,
236 PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
238 logger
= b
.create_perf_counters();
239 cct
->get_perfcounters_collection()->add(logger
);
// Detach the bluefs perf counters from the CephContext's global
// perf-counter collection.
// NOTE(review): the matching teardown of `logger` itself (original
// line ~245, presumably `delete logger`) is not visible in this
// extract — confirm before treating this as the complete shutdown path.
242 void BlueFS::_shutdown_logger()
244 cct
->get_perfcounters_collection()->remove(logger
);
248 void BlueFS::_update_logger_stats()
250 // we must be holding the lock
251 logger
->set(l_bluefs_num_files
, file_map
.size());
252 logger
->set(l_bluefs_log_bytes
, log_writer
->file
->fnode
.size
);
254 if (alloc
[BDEV_WAL
]) {
255 logger
->set(l_bluefs_wal_total_bytes
, block_all
[BDEV_WAL
].size());
256 logger
->set(l_bluefs_wal_used_bytes
,
257 block_all
[BDEV_WAL
].size() - alloc
[BDEV_WAL
]->get_free());
259 if (alloc
[BDEV_DB
]) {
260 logger
->set(l_bluefs_db_total_bytes
, block_all
[BDEV_DB
].size());
261 logger
->set(l_bluefs_db_used_bytes
,
262 block_all
[BDEV_DB
].size() - alloc
[BDEV_DB
]->get_free());
264 if (alloc
[BDEV_SLOW
]) {
265 logger
->set(l_bluefs_slow_total_bytes
, block_all
[BDEV_SLOW
].size());
266 logger
->set(l_bluefs_slow_used_bytes
,
267 block_all
[BDEV_SLOW
].size() - alloc
[BDEV_SLOW
]->get_free());
271 int BlueFS::add_block_device(unsigned id
, const string
& path
, bool trim
,
272 bool shared_with_bluestore
)
274 dout(10) << __func__
<< " bdev " << id
<< " path " << path
<< dendl
;
275 ceph_assert(id
< bdev
.size());
276 ceph_assert(bdev
[id
] == NULL
);
277 BlockDevice
*b
= BlockDevice::create(cct
, path
, NULL
, NULL
,
278 discard_cb
[id
], static_cast<void*>(this));
279 if (shared_with_bluestore
) {
280 b
->set_no_exclusive_lock();
282 int r
= b
->open(path
);
288 b
->discard(0, b
->get_size());
291 dout(1) << __func__
<< " bdev " << id
<< " path " << path
292 << " size " << byte_u_t(b
->get_size()) << dendl
;
294 ioc
[id
] = new IOContext(cct
, NULL
);
298 bool BlueFS::bdev_support_label(unsigned id
)
300 ceph_assert(id
< bdev
.size());
301 ceph_assert(bdev
[id
]);
302 return bdev
[id
]->supported_bdev_label();
// Size in bytes of block device `id`; the guard keeps an unset or
// out-of-range slot from being dereferenced.
// NOTE(review): the fallback return for the no-device case (original
// lines 309-310) is elided from this extract — confirm it returns 0.
305 uint64_t BlueFS::get_block_device_size(unsigned id
)
307 if (id
< bdev
.size() && bdev
[id
])
308 return bdev
[id
]->get_size();
312 void BlueFS::_add_block_extent(unsigned id
, uint64_t offset
, uint64_t length
)
314 dout(1) << __func__
<< " bdev " << id
315 << " 0x" << std::hex
<< offset
<< "~" << length
<< std::dec
318 ceph_assert(id
< bdev
.size());
319 ceph_assert(bdev
[id
]);
320 ceph_assert(bdev
[id
]->get_size() >= offset
+ length
);
321 block_all
[id
].insert(offset
, length
);
323 if (id
< alloc
.size() && alloc
[id
]) {
324 log_t
.op_alloc_add(id
, offset
, length
);
325 alloc
[id
]->init_add_free(offset
, length
);
329 logger
->inc(l_bluefs_gift_bytes
, length
);
330 dout(10) << __func__
<< " done" << dendl
;
333 int BlueFS::reclaim_blocks(unsigned id
, uint64_t want
,
334 PExtentVector
*extents
)
336 std::unique_lock
l(lock
);
337 dout(1) << __func__
<< " bdev " << id
338 << " want 0x" << std::hex
<< want
<< std::dec
<< dendl
;
339 ceph_assert(id
< alloc
.size());
340 ceph_assert(alloc
[id
]);
343 interval_set
<uint64_t> granular
;
344 while (want
> 0 && !block_unused_too_granular
[id
].empty()) {
345 auto p
= block_unused_too_granular
[id
].begin();
346 dout(20) << __func__
<< " unused " << (int)id
<< ":"
347 << std::hex
<< p
.get_start() << "~" << p
.get_len() << dendl
;
348 extents
->push_back({p
.get_start(), p
.get_len()});
349 granular
.insert(p
.get_start(), p
.get_len());
350 if (want
>= p
.get_len()) {
356 block_unused_too_granular
[id
].erase(p
);
360 got
+= alloc
[id
]->allocate(want
, alloc_size
[id
], 0, extents
);
361 ceph_assert(got
!= 0);
363 derr
<< __func__
<< " failed to allocate space to return to bluestore"
366 block_unused_too_granular
[id
].insert(granular
);
370 for (auto& p
: *extents
) {
371 block_all
[id
].erase(p
.offset
, p
.length
);
372 log_t
.op_alloc_rm(id
, p
.offset
, p
.length
);
376 int r
= _flush_and_sync_log(l
);
380 logger
->inc(l_bluefs_reclaim_bytes
, got
);
381 dout(1) << __func__
<< " bdev " << id
<< " want 0x" << std::hex
<< want
382 << " got " << *extents
<< dendl
;
386 void BlueFS::handle_discard(unsigned id
, interval_set
<uint64_t>& to_release
)
388 dout(10) << __func__
<< " bdev " << id
<< dendl
;
389 ceph_assert(alloc
[id
]);
390 alloc
[id
]->release(to_release
);
// Bytes in use across devices: per slot, used = owned
// (block_all[id].size()) minus the allocator's free bytes.
// Taken under `lock`.
// NOTE(review): the declaration/initialization of `used`, any guard for
// absent devices, and the final return (original lines 394-403) are
// elided from this extract.
393 uint64_t BlueFS::get_used()
395 std::lock_guard
l(lock
);
397 for (unsigned id
= 0; id
< MAX_BDEV
; ++id
) {
399 used
+= block_all
[id
].size() - alloc
[id
]->get_free();
405 uint64_t BlueFS::get_total(unsigned id
)
407 std::lock_guard
l(lock
);
408 ceph_assert(id
< block_all
.size());
409 return block_all
[id
].size();
412 uint64_t BlueFS::get_free(unsigned id
)
414 std::lock_guard
l(lock
);
415 ceph_assert(id
< alloc
.size());
416 return alloc
[id
]->get_free();
// Dump the bluefs perf counters into `f` under a
// "bluefs_perf_counters" object section.
// NOTE(review): the matching f->close_section() (original line ~423)
// is elided from this extract.
419 void BlueFS::dump_perf_counters(Formatter
*f
)
421 f
->open_object_section("bluefs_perf_counters");
422 logger
->dump_formatted(f
,0);
426 void BlueFS::dump_block_extents(ostream
& out
)
428 for (unsigned i
= 0; i
< MAX_BDEV
; ++i
) {
432 auto owned
= get_total(i
);
433 auto free
= get_free(i
);
434 out
<< i
<< " : device size 0x" << std::hex
<< bdev
[i
]->get_size()
435 << " : own 0x" << block_all
[i
]
437 << " : using 0x" << owned
- free
438 << std::dec
<< "(" << byte_u_t(owned
- free
) << ")"
443 void BlueFS::get_usage(vector
<pair
<uint64_t,uint64_t>> *usage
)
445 std::lock_guard
l(lock
);
446 usage
->resize(bdev
.size());
447 for (unsigned id
= 0; id
< bdev
.size(); ++id
) {
449 (*usage
)[id
] = make_pair(0, 0);
452 (*usage
)[id
].first
= alloc
[id
]->get_free();
453 (*usage
)[id
].second
= block_all
[id
].size();
455 (block_all
[id
].size() - (*usage
)[id
].first
) * 100 / block_all
[id
].size();
456 dout(10) << __func__
<< " bdev " << id
457 << " free " << (*usage
)[id
].first
458 << " (" << byte_u_t((*usage
)[id
].first
) << ")"
459 << " / " << (*usage
)[id
].second
460 << " (" << byte_u_t((*usage
)[id
].second
) << ")"
461 << ", used " << used
<< "%"
466 int BlueFS::get_block_extents(unsigned id
, interval_set
<uint64_t> *extents
)
468 std::lock_guard
l(lock
);
469 dout(10) << __func__
<< " bdev " << id
<< dendl
;
470 if (id
>= block_all
.size())
472 *extents
= block_all
[id
];
476 int BlueFS::mkfs(uuid_d osd_uuid
, const bluefs_layout_t
& layout
)
478 std::unique_lock
l(lock
);
480 << " osd_uuid " << osd_uuid
483 // set volume selector if not provided before/outside
484 if (vselector
== nullptr) {
486 new OriginalVolumeSelector(
487 get_block_device_size(BlueFS::BDEV_WAL
) * 95 / 100,
488 get_block_device_size(BlueFS::BDEV_DB
) * 95 / 100,
489 get_block_device_size(BlueFS::BDEV_SLOW
) * 95 / 100));
496 super
.block_size
= bdev
[BDEV_DB
]->get_block_size();
497 super
.osd_uuid
= osd_uuid
;
498 super
.uuid
.generate_random();
499 dout(1) << __func__
<< " uuid " << super
.uuid
<< dendl
;
502 FileRef log_file
= ceph::make_ref
<File
>();
503 log_file
->fnode
.ino
= 1;
504 log_file
->vselector_hint
= vselector
->get_hint_by_device(BDEV_WAL
);
506 vselector
->select_prefer_bdev(log_file
->vselector_hint
),
507 cct
->_conf
->bluefs_max_log_runway
,
509 vselector
->add_usage(log_file
->vselector_hint
, log_file
->fnode
);
511 log_writer
= _create_writer(log_file
);
515 for (unsigned bdev
= 0; bdev
< MAX_BDEV
; ++bdev
) {
516 interval_set
<uint64_t>& p
= block_all
[bdev
];
519 for (interval_set
<uint64_t>::iterator q
= p
.begin(); q
!= p
.end(); ++q
) {
520 dout(20) << __func__
<< " op_alloc_add " << bdev
<< " 0x"
521 << std::hex
<< q
.get_start() << "~" << q
.get_len() << std::dec
523 log_t
.op_alloc_add(bdev
, q
.get_start(), q
.get_len());
526 _flush_and_sync_log(l
);
529 super
.log_fnode
= log_file
->fnode
;
530 super
.memorized_layout
= layout
;
531 _write_super(BDEV_DB
);
535 super
= bluefs_super_t();
536 _close_writer(log_writer
);
539 vselector
.reset(nullptr);
543 dout(10) << __func__
<< " success" << dendl
;
547 void BlueFS::_init_alloc()
549 dout(20) << __func__
<< dendl
;
550 alloc
.resize(MAX_BDEV
);
551 alloc_size
.resize(MAX_BDEV
, 0);
552 pending_release
.resize(MAX_BDEV
);
553 block_unused_too_granular
.resize(MAX_BDEV
);
555 if (bdev
[BDEV_WAL
]) {
556 alloc_size
[BDEV_WAL
] = cct
->_conf
->bluefs_alloc_size
;
558 if (bdev
[BDEV_SLOW
]) {
559 alloc_size
[BDEV_DB
] = cct
->_conf
->bluefs_alloc_size
;
560 alloc_size
[BDEV_SLOW
] = cct
->_conf
->bluefs_shared_alloc_size
;
562 alloc_size
[BDEV_DB
] = cct
->_conf
->bluefs_shared_alloc_size
;
564 // new wal and db devices are never shared
565 if (bdev
[BDEV_NEWWAL
]) {
566 alloc_size
[BDEV_NEWWAL
] = cct
->_conf
->bluefs_alloc_size
;
568 if (bdev
[BDEV_NEWDB
]) {
569 alloc_size
[BDEV_NEWDB
] = cct
->_conf
->bluefs_alloc_size
;
572 for (unsigned id
= 0; id
< bdev
.size(); ++id
) {
576 ceph_assert(bdev
[id
]->get_size());
577 std::string name
= "bluefs-";
578 const char* devnames
[] = {"wal","db","slow"};
580 name
+= devnames
[id
];
582 name
+= to_string(uintptr_t(this));
583 ceph_assert(alloc_size
[id
]);
584 dout(1) << __func__
<< " id " << id
585 << " alloc_size 0x" << std::hex
<< alloc_size
[id
]
586 << " size 0x" << bdev
[id
]->get_size() << std::dec
<< dendl
;
587 alloc
[id
] = Allocator::create(cct
, cct
->_conf
->bluefs_allocator
,
588 bdev
[id
]->get_size(),
589 alloc_size
[id
], name
);
590 interval_set
<uint64_t>& p
= block_all
[id
];
591 for (interval_set
<uint64_t>::iterator q
= p
.begin(); q
!= p
.end(); ++q
) {
592 alloc
[id
]->init_add_free(q
.get_start(), q
.get_len());
597 void BlueFS::_stop_alloc()
599 dout(20) << __func__
<< dendl
;
600 for (auto p
: bdev
) {
605 for (auto p
: alloc
) {
612 block_unused_too_granular
.clear();
617 dout(1) << __func__
<< dendl
;
619 int r
= _open_super();
621 derr
<< __func__
<< " failed to open super: " << cpp_strerror(r
) << dendl
;
625 // set volume selector if not provided before/outside
626 if (vselector
== nullptr) {
628 new OriginalVolumeSelector(
629 get_block_device_size(BlueFS::BDEV_WAL
) * 95 / 100,
630 get_block_device_size(BlueFS::BDEV_DB
) * 95 / 100,
631 get_block_device_size(BlueFS::BDEV_SLOW
) * 95 / 100));
635 block_all
.resize(MAX_BDEV
);
639 r
= _replay(false, false);
641 derr
<< __func__
<< " failed to replay log: " << cpp_strerror(r
) << dendl
;
647 for (auto& p
: file_map
) {
648 dout(30) << __func__
<< " noting alloc for " << p
.second
->fnode
<< dendl
;
649 for (auto& q
: p
.second
->fnode
.extents
) {
650 alloc
[q
.bdev
]->init_rm_free(q
.offset
, q
.length
);
654 // set up the log for future writes
655 log_writer
= _create_writer(_get_file(1));
656 ceph_assert(log_writer
->file
->fnode
.ino
== 1);
657 log_writer
->pos
= log_writer
->file
->fnode
.size
;
658 dout(10) << __func__
<< " log write pos set to 0x"
659 << std::hex
<< log_writer
->pos
<< std::dec
665 super
= bluefs_super_t();
669 int BlueFS::maybe_verify_layout(const bluefs_layout_t
& layout
) const
671 if (super
.memorized_layout
) {
672 if (layout
== *super
.memorized_layout
) {
673 dout(10) << __func__
<< " bluefs layout verified positively" << dendl
;
675 derr
<< __func__
<< " memorized layout doesn't fit current one" << dendl
;
679 dout(10) << __func__
<< " no memorized_layout in bluefs superblock"
686 void BlueFS::umount()
688 dout(1) << __func__
<< dendl
;
692 _close_writer(log_writer
);
695 vselector
.reset(nullptr);
699 super
= bluefs_super_t();
704 int BlueFS::prepare_new_device(int id
, const bluefs_layout_t
& layout
)
706 dout(1) << __func__
<< dendl
;
708 if(id
== BDEV_NEWDB
) {
709 int new_log_dev_cur
= BDEV_WAL
;
710 int new_log_dev_next
= BDEV_WAL
;
711 if (!bdev
[BDEV_WAL
]) {
712 new_log_dev_cur
= BDEV_NEWDB
;
713 new_log_dev_next
= BDEV_DB
;
715 _rewrite_log_and_layout_sync(false,
722 } else if(id
== BDEV_NEWWAL
) {
723 _rewrite_log_and_layout_sync(false,
735 void BlueFS::collect_metadata(map
<string
,string
> *pm
, unsigned skip_bdev_id
)
737 if (skip_bdev_id
!= BDEV_DB
&& bdev
[BDEV_DB
])
738 bdev
[BDEV_DB
]->collect_metadata("bluefs_db_", pm
);
740 bdev
[BDEV_WAL
]->collect_metadata("bluefs_wal_", pm
);
743 void BlueFS::get_devices(set
<string
> *ls
)
745 for (unsigned i
= 0; i
< MAX_BDEV
; ++i
) {
747 bdev
[i
]->get_devices(ls
);
754 std::lock_guard
l(lock
);
755 dout(1) << __func__
<< dendl
;
756 // hrm, i think we check everything on mount...
760 int BlueFS::_write_super(int dev
)
765 uint32_t crc
= bl
.crc32c(-1);
767 dout(10) << __func__
<< " super block length(encoded): " << bl
.length() << dendl
;
768 dout(10) << __func__
<< " superblock " << super
.version
<< dendl
;
769 dout(10) << __func__
<< " log_fnode " << super
.log_fnode
<< dendl
;
770 ceph_assert_always(bl
.length() <= get_super_length());
771 bl
.append_zero(get_super_length() - bl
.length());
773 bdev
[dev
]->write(get_super_offset(), bl
, false, WRITE_LIFE_SHORT
);
774 dout(20) << __func__
<< " v " << super
.version
775 << " crc 0x" << std::hex
<< crc
776 << " offset 0x" << get_super_offset() << std::dec
781 int BlueFS::_open_super()
783 dout(10) << __func__
<< dendl
;
786 uint32_t expected_crc
, crc
;
789 // always the second block
790 r
= bdev
[BDEV_DB
]->read(get_super_offset(), get_super_length(),
791 &bl
, ioc
[BDEV_DB
], false);
795 auto p
= bl
.cbegin();
799 t
.substr_of(bl
, 0, p
.get_off());
802 decode(expected_crc
, p
);
803 if (crc
!= expected_crc
) {
804 derr
<< __func__
<< " bad crc on superblock, expected 0x"
805 << std::hex
<< expected_crc
<< " != actual 0x" << crc
<< std::dec
809 dout(10) << __func__
<< " superblock " << super
.version
<< dendl
;
810 dout(10) << __func__
<< " log_fnode " << super
.log_fnode
<< dendl
;
814 int BlueFS::_check_new_allocations(const bluefs_fnode_t
& fnode
,
816 boost::dynamic_bitset
<uint64_t>* owned_blocks
,
817 boost::dynamic_bitset
<uint64_t>* used_blocks
)
819 auto& fnode_extents
= fnode
.extents
;
820 for (auto e
: fnode_extents
) {
823 ceph_assert(id
< dev_count
);
824 apply_for_bitset_range(e
.offset
, e
.length
, alloc_size
[id
], owned_blocks
[id
],
825 [&](uint64_t pos
, boost::dynamic_bitset
<uint64_t> &bs
) {
832 derr
<< __func__
<< " invalid extent " << int(id
)
833 << ": 0x" << std::hex
<< e
.offset
<< "~" << e
.length
835 << ": wasn't given but allocated for ino " << fnode
.ino
840 apply_for_bitset_range(e
.offset
, e
.length
, alloc_size
[id
], used_blocks
[id
],
841 [&](uint64_t pos
, boost::dynamic_bitset
<uint64_t> &bs
) {
849 derr
<< __func__
<< " invalid extent " << int(e
.bdev
)
850 << ": 0x" << std::hex
<< e
.offset
<< "~" << e
.length
851 << std::dec
<< ": duplicate reference, ino " << fnode
.ino
859 int BlueFS::_adjust_granularity(
860 __u8 id
, uint64_t *offset
, uint64_t *length
, bool alloc
)
862 const char *op
= alloc
? "op_alloc_add" : "op_alloc_rm";
865 if (*offset
& (alloc_size
[id
] - 1)) {
866 *offset
&= ~(alloc_size
[id
] - 1);
867 *offset
+= alloc_size
[id
];
868 if (*length
> *offset
- oldo
) {
870 block_unused_too_granular
[id
].insert(oldo
, *offset
- oldo
);
872 block_unused_too_granular
[id
].erase(oldo
, *offset
- oldo
);
874 *length
-= (*offset
- oldo
);
877 block_unused_too_granular
[id
].insert(oldo
, *length
);
879 block_unused_too_granular
[id
].erase(oldo
, *length
);
884 if (*length
& (alloc_size
[id
] - 1)) {
885 *length
&= ~(alloc_size
[id
] - 1);
887 block_unused_too_granular
[id
].insert(
889 oldo
+ oldl
- *offset
- *length
);
891 block_unused_too_granular
[id
].erase(
893 oldo
+ oldl
- *offset
- *length
);
896 if (oldo
!= *offset
|| oldl
!= *length
) {
897 dout(10) << __func__
<< " " << op
<< " "
898 << (int)id
<< ":" << std::hex
<< oldo
<< "~" << oldl
899 << " -> " << (int)id
<< ":" << *offset
<< "~" << *length
<< dendl
;
904 int BlueFS::_verify_alloc_granularity(
905 __u8 id
, uint64_t offset
, uint64_t length
, const char *op
)
907 if ((offset
& (alloc_size
[id
] - 1)) ||
908 (length
& (alloc_size
[id
] - 1))) {
909 derr
<< __func__
<< " " << op
<< " of " << (int)id
910 << ":0x" << std::hex
<< offset
<< "~" << length
<< std::dec
911 << " does not align to alloc_size 0x"
912 << std::hex
<< alloc_size
[id
] << std::dec
<< dendl
;
914 auto need
= alloc_size
[id
];
915 while (need
&& ((offset
& (need
- 1)) ||
916 (length
& (need
- 1)))) {
921 if (id
== BDEV_SLOW
||
922 (id
== BDEV_DB
&& !bdev
[BDEV_SLOW
])) {
923 which
= "bluefs_shared_alloc_size";
925 which
= "bluefs_alloc_size";
927 derr
<< "work-around by setting " << which
<< " = " << need
928 << " for this OSD" << dendl
;
935 int BlueFS::_replay(bool noop
, bool to_stdout
)
937 dout(10) << __func__
<< (noop
? " NO-OP" : "") << dendl
;
938 ino_last
= 1; // by the log
942 log_file
= _get_file(1);
945 for (auto& a
: block_unused_too_granular
) {
946 ceph_assert(a
.empty());
950 log_file
->fnode
= super
.log_fnode
;
951 log_file
->vselector_hint
=
952 vselector
->get_hint_by_device(BDEV_WAL
);
954 // do not use fnode from superblock in 'noop' mode - log_file's one should
955 // be fine and up-to-date
956 ceph_assert(log_file
->fnode
.ino
== 1);
957 ceph_assert(log_file
->fnode
.extents
.size() != 0);
959 dout(10) << __func__
<< " log_fnode " << super
.log_fnode
<< dendl
;
960 if (unlikely(to_stdout
)) {
961 std::cout
<< " log_fnode " << super
.log_fnode
<< std::endl
;
964 FileReader
*log_reader
= new FileReader(
965 log_file
, cct
->_conf
->bluefs_max_prefetch
,
969 bool seen_recs
= false;
971 boost::dynamic_bitset
<uint64_t> used_blocks
[MAX_BDEV
];
972 boost::dynamic_bitset
<uint64_t> owned_blocks
[MAX_BDEV
];
974 if (cct
->_conf
->bluefs_log_replay_check_allocations
) {
975 for (size_t i
= 0; i
< MAX_BDEV
; ++i
) {
976 if (alloc_size
[i
] != 0 && bdev
[i
] != nullptr) {
977 used_blocks
[i
].resize(round_up_to(bdev
[i
]->get_size(), alloc_size
[i
]) / alloc_size
[i
]);
978 owned_blocks
[i
].resize(round_up_to(bdev
[i
]->get_size(), alloc_size
[i
]) / alloc_size
[i
]);
983 bool first_log_check
= true;
986 ceph_assert((log_reader
->buf
.pos
& ~super
.block_mask()) == 0);
987 uint64_t pos
= log_reader
->buf
.pos
;
988 uint64_t read_pos
= pos
;
991 int r
= _read(log_reader
, &log_reader
->buf
, read_pos
, super
.block_size
,
993 ceph_assert(r
== (int)super
.block_size
);
1000 auto p
= bl
.cbegin();
1008 if (len
+ 6 > bl
.length()) {
1009 more
= round_up_to(len
+ 6 - bl
.length(), super
.block_size
);
1012 if (uuid
!= super
.uuid
) {
1014 dout(10) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
1015 << ": stop: uuid " << uuid
<< " != super.uuid " << super
.uuid
1018 derr
<< __func__
<< " 0x" << std::hex
<< pos
<< std::dec
1019 << ": stop: uuid " << uuid
<< " != super.uuid " << super
.uuid
1020 << ", block dump: \n";
1022 t
.substr_of(bl
, 0, super
.block_size
);
1028 if (seq
!= log_seq
+ 1) {
1030 dout(10) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
1031 << ": stop: seq " << seq
<< " != expected " << log_seq
+ 1
1034 derr
<< __func__
<< " 0x" << std::hex
<< pos
<< std::dec
1035 << ": stop: seq " << seq
<< " != expected " << log_seq
+ 1
1041 dout(20) << __func__
<< " need 0x" << std::hex
<< more
<< std::dec
1042 << " more bytes" << dendl
;
1044 int r
= _read(log_reader
, &log_reader
->buf
, read_pos
, more
, &t
, NULL
);
1045 if (r
< (int)more
) {
1046 derr
<< __func__
<< " 0x" << std::hex
<< pos
1047 << ": stop: len is 0x" << bl
.length() + more
<< std::dec
1048 << ", which is past eof" << dendl
;
1051 ceph_assert(r
== (int)more
);
1056 bluefs_transaction_t t
;
1058 auto p
= bl
.cbegin();
1061 catch (buffer::error
& e
) {
1062 derr
<< __func__
<< " 0x" << std::hex
<< pos
<< std::dec
1063 << ": stop: failed to decode: " << e
.what()
1068 ceph_assert(seq
== t
.seq
);
1069 dout(10) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
1070 << ": " << t
<< dendl
;
1071 if (unlikely(to_stdout
)) {
1072 std::cout
<< " 0x" << std::hex
<< pos
<< std::dec
1073 << ": " << t
<< std::endl
;
1076 auto p
= t
.op_bl
.cbegin();
1082 case bluefs_transaction_t::OP_INIT
:
1083 dout(20) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
1084 << ": op_init" << dendl
;
1085 if (unlikely(to_stdout
)) {
1086 std::cout
<< " 0x" << std::hex
<< pos
<< std::dec
1087 << ": op_init" << std::endl
;
1090 ceph_assert(t
.seq
== 1);
1093 case bluefs_transaction_t::OP_JUMP
:
1097 decode(next_seq
, p
);
1099 dout(20) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
1100 << ": op_jump seq " << next_seq
1101 << " offset 0x" << std::hex
<< offset
<< std::dec
<< dendl
;
1102 if (unlikely(to_stdout
)) {
1103 std::cout
<< " 0x" << std::hex
<< pos
<< std::dec
1104 << ": op_jump seq " << next_seq
1105 << " offset 0x" << std::hex
<< offset
<< std::dec
1109 ceph_assert(next_seq
>= log_seq
);
1110 log_seq
= next_seq
- 1; // we will increment it below
1111 uint64_t skip
= offset
- read_pos
;
1114 int r
= _read(log_reader
, &log_reader
->buf
, read_pos
, skip
, &junk
,
1116 if (r
!= (int)skip
) {
1117 dout(10) << __func__
<< " 0x" << std::hex
<< read_pos
1118 << ": stop: failed to skip to " << offset
1119 << std::dec
<< dendl
;
1120 ceph_abort_msg("problem with op_jump");
1126 case bluefs_transaction_t::OP_JUMP_SEQ
:
1129 decode(next_seq
, p
);
1130 dout(20) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
1131 << ": op_jump_seq " << next_seq
<< dendl
;
1132 if (unlikely(to_stdout
)) {
1133 std::cout
<< " 0x" << std::hex
<< pos
<< std::dec
1134 << ": op_jump_seq " << next_seq
<< std::endl
;
1137 ceph_assert(next_seq
>= log_seq
);
1138 log_seq
= next_seq
- 1; // we will increment it below
1142 case bluefs_transaction_t::OP_ALLOC_ADD
:
1145 uint64_t offset
, length
;
1149 dout(20) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
1150 << ": op_alloc_add " << " " << (int)id
1151 << ":0x" << std::hex
<< offset
<< "~" << length
<< std::dec
1153 if (unlikely(to_stdout
)) {
1154 std::cout
<< " 0x" << std::hex
<< pos
<< std::dec
1155 << ": op_alloc_add " << " " << (int)id
1156 << ":0x" << std::hex
<< offset
<< "~" << length
<< std::dec
1160 block_all
[id
].insert(offset
, length
);
1161 _adjust_granularity(id
, &offset
, &length
, true);
1163 alloc
[id
]->init_add_free(offset
, length
);
1166 if (cct
->_conf
->bluefs_log_replay_check_allocations
) {
1168 apply_for_bitset_range(offset
, length
, alloc_size
[id
], owned_blocks
[id
],
1169 [&](uint64_t pos
, boost::dynamic_bitset
<uint64_t> &bs
) {
1178 derr
<< __func__
<< " invalid extent " << (int)id
1179 << ": 0x" << std::hex
<< offset
<< "~" << length
1180 << std::dec
<< ": already given" << dendl
;
1183 apply_for_bitset_range(offset
, length
, alloc_size
[id
], used_blocks
[id
],
1184 [&](uint64_t pos
, boost::dynamic_bitset
<uint64_t> &bs
) {
1191 derr
<< __func__
<< " invalid extent " << int(id
)
1192 << ": 0x" << std::hex
<< offset
<< "~" << length
1193 << std::dec
<< ": already in use" << dendl
;
1201 case bluefs_transaction_t::OP_ALLOC_RM
:
1204 uint64_t offset
, length
;
1208 dout(20) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
1209 << ": op_alloc_rm " << " " << (int)id
1210 << ":0x" << std::hex
<< offset
<< "~" << length
<< std::dec
1212 if (unlikely(to_stdout
)) {
1213 std::cout
<< " 0x" << std::hex
<< pos
<< std::dec
1214 << ": op_alloc_rm " << " " << (int)id
1215 << ":0x" << std::hex
<< offset
<< "~" << length
<< std::dec
1219 block_all
[id
].erase(offset
, length
);
1220 _adjust_granularity(id
, &offset
, &length
, false);
1222 alloc
[id
]->init_rm_free(offset
, length
);
1224 if (cct
->_conf
->bluefs_log_replay_check_allocations
) {
1226 apply_for_bitset_range(offset
, length
, alloc_size
[id
], owned_blocks
[id
],
1227 [&](uint64_t pos
, boost::dynamic_bitset
<uint64_t> &bs
) {
1228 if (!bs
.test(pos
)) {
1236 derr
<< __func__
<< " invalid extent " << int(id
)
1237 << ": 0x" << std::hex
<< offset
<< "~" << length
1238 << std::dec
<< ": wasn't given" << dendl
;
1242 apply_for_bitset_range(offset
, length
, alloc_size
[id
], used_blocks
[id
],
1243 [&](uint64_t pos
, boost::dynamic_bitset
<uint64_t> &bs
) {
1250 derr
<< __func__
<< " invalid extent " << (int)id
1251 << ": 0x" << std::hex
<< offset
<< "~" << length
1252 << std::dec
<< ": still in use" << dendl
;
1260 case bluefs_transaction_t::OP_DIR_LINK
:
1262 string dirname
, filename
;
1265 decode(filename
, p
);
1267 dout(20) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
1268 << ": op_dir_link " << " " << dirname
<< "/" << filename
1271 if (unlikely(to_stdout
)) {
1272 std::cout
<< " 0x" << std::hex
<< pos
<< std::dec
1273 << ": op_dir_link " << " " << dirname
<< "/" << filename
1279 FileRef file
= _get_file(ino
);
1280 ceph_assert(file
->fnode
.ino
);
1281 map
<string
,DirRef
>::iterator q
= dir_map
.find(dirname
);
1282 ceph_assert(q
!= dir_map
.end());
1283 map
<string
,FileRef
>::iterator r
= q
->second
->file_map
.find(filename
);
1284 ceph_assert(r
== q
->second
->file_map
.end());
1286 vselector
->sub_usage(file
->vselector_hint
, file
->fnode
);
1287 file
->vselector_hint
=
1288 vselector
->get_hint_by_dir(dirname
);
1289 vselector
->add_usage(file
->vselector_hint
, file
->fnode
);
1291 q
->second
->file_map
[filename
] = file
;
1297 case bluefs_transaction_t::OP_DIR_UNLINK
:
1299 string dirname
, filename
;
1301 decode(filename
, p
);
1302 dout(20) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
1303 << ": op_dir_unlink " << " " << dirname
<< "/" << filename
1305 if (unlikely(to_stdout
)) {
1306 std::cout
<< " 0x" << std::hex
<< pos
<< std::dec
1307 << ": op_dir_unlink " << " " << dirname
<< "/" << filename
1312 map
<string
,DirRef
>::iterator q
= dir_map
.find(dirname
);
1313 ceph_assert(q
!= dir_map
.end());
1314 map
<string
,FileRef
>::iterator r
= q
->second
->file_map
.find(filename
);
1315 ceph_assert(r
!= q
->second
->file_map
.end());
1316 ceph_assert(r
->second
->refs
> 0);
1318 q
->second
->file_map
.erase(r
);
1323 case bluefs_transaction_t::OP_DIR_CREATE
:
1327 dout(20) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
1328 << ": op_dir_create " << dirname
<< dendl
;
1329 if (unlikely(to_stdout
)) {
1330 std::cout
<< " 0x" << std::hex
<< pos
<< std::dec
1331 << ": op_dir_create " << dirname
<< std::endl
;
1335 map
<string
,DirRef
>::iterator q
= dir_map
.find(dirname
);
1336 ceph_assert(q
== dir_map
.end());
1337 dir_map
[dirname
] = ceph::make_ref
<Dir
>();
1342 case bluefs_transaction_t::OP_DIR_REMOVE
:
1346 dout(20) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
1347 << ": op_dir_remove " << dirname
<< dendl
;
1348 if (unlikely(to_stdout
)) {
1349 std::cout
<< " 0x" << std::hex
<< pos
<< std::dec
1350 << ": op_dir_remove " << dirname
<< std::endl
;
1354 map
<string
,DirRef
>::iterator q
= dir_map
.find(dirname
);
1355 ceph_assert(q
!= dir_map
.end());
1356 ceph_assert(q
->second
->file_map
.empty());
1362 case bluefs_transaction_t::OP_FILE_UPDATE
:
1364 bluefs_fnode_t fnode
;
1366 dout(20) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
1367 << ": op_file_update " << " " << fnode
<< " " << dendl
;
1368 if (unlikely(to_stdout
)) {
1369 std::cout
<< " 0x" << std::hex
<< pos
<< std::dec
1370 << ": op_file_update " << " " << fnode
<< std::endl
;
1373 FileRef f
= _get_file(fnode
.ino
);
1374 if (cct
->_conf
->bluefs_log_replay_check_allocations
) {
1375 // check initial log layout
1376 if (first_log_check
) {
1377 first_log_check
= false;
1378 int r
= _check_new_allocations(log_file
->fnode
,
1379 MAX_BDEV
, owned_blocks
, used_blocks
);
1385 auto& fnode_extents
= f
->fnode
.extents
;
1386 for (auto e
: fnode_extents
) {
1388 if (int r
= _verify_alloc_granularity(id
, e
.offset
, e
.length
,
1389 "OP_FILE_UPDATE"); r
< 0) {
1392 apply_for_bitset_range(e
.offset
, e
.length
, alloc_size
[id
],
1394 [&](uint64_t pos
, boost::dynamic_bitset
<uint64_t> &bs
) {
1395 ceph_assert(bs
.test(pos
));
1402 if (fnode
.ino
!= 1) {
1403 vselector
->sub_usage(f
->vselector_hint
, f
->fnode
);
1406 if (fnode
.ino
!= 1) {
1407 vselector
->add_usage(f
->vselector_hint
, f
->fnode
);
1410 if (fnode
.ino
> ino_last
) {
1411 ino_last
= fnode
.ino
;
1413 if (cct
->_conf
->bluefs_log_replay_check_allocations
) {
1414 int r
= _check_new_allocations(f
->fnode
,
1415 MAX_BDEV
, owned_blocks
, used_blocks
);
1424 case bluefs_transaction_t::OP_FILE_REMOVE
:
1428 dout(20) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
1429 << ": op_file_remove " << ino
<< dendl
;
1430 if (unlikely(to_stdout
)) {
1431 std::cout
<< " 0x" << std::hex
<< pos
<< std::dec
1432 << ": op_file_remove " << ino
<< std::endl
;
1436 auto p
= file_map
.find(ino
);
1437 ceph_assert(p
!= file_map
.end());
1438 vselector
->sub_usage(p
->second
->vselector_hint
, p
->second
->fnode
);
1439 if (cct
->_conf
->bluefs_log_replay_check_allocations
) {
1440 auto& fnode_extents
= p
->second
->fnode
.extents
;
1441 for (auto e
: fnode_extents
) {
1444 apply_for_bitset_range(e
.offset
, e
.length
, alloc_size
[id
], owned_blocks
[id
],
1445 [&](uint64_t pos
, boost::dynamic_bitset
<uint64_t> &bs
) {
1446 if (!bs
.test(pos
)) {
1452 derr
<< __func__
<< " invalid extent " << int(id
)
1453 << ": 0x" << std::hex
<< e
.offset
<< "~" << e
.length
1455 << ": wasn't given but is allocated for removed ino " << ino
1460 apply_for_bitset_range(e
.offset
, e
.length
, alloc_size
[id
], used_blocks
[id
],
1461 [&](uint64_t pos
, boost::dynamic_bitset
<uint64_t> &bs
) {
1462 if (!bs
.test(pos
)) {
1469 derr
<< __func__
<< " invalid extent " << int(id
)
1470 << ": 0x" << std::hex
<< e
.offset
<< "~" << e
.length
1472 << ": not in use but is allocated for removed ino " << ino
1484 derr
<< __func__
<< " 0x" << std::hex
<< pos
<< std::dec
1485 << ": stop: unrecognized op " << (int)op
<< dendl
;
1490 ceph_assert(p
.end());
1492 // we successfully replayed the transaction; bump the seq and log size
1494 log_file
->fnode
.size
= log_reader
->buf
.pos
;
1496 vselector
->add_usage(log_file
->vselector_hint
, log_file
->fnode
);
1498 if (!noop
&& first_log_check
&&
1499 cct
->_conf
->bluefs_log_replay_check_allocations
) {
1500 int r
= _check_new_allocations(log_file
->fnode
,
1501 MAX_BDEV
, owned_blocks
, used_blocks
);
1507 dout(10) << __func__
<< " log file size was 0x"
1508 << std::hex
<< log_file
->fnode
.size
<< std::dec
<< dendl
;
1509 if (unlikely(to_stdout
)) {
1510 std::cout
<< " log file size was 0x"
1511 << std::hex
<< log_file
->fnode
.size
<< std::dec
<< std::endl
;
1517 // verify file link counts are all >0
1518 for (auto& p
: file_map
) {
1519 if (p
.second
->refs
== 0 &&
1520 p
.second
->fnode
.ino
> 1) {
1521 derr
<< __func__
<< " file with link count 0: " << p
.second
->fnode
1528 for (unsigned id
= 0; id
< MAX_BDEV
; ++id
) {
1529 dout(10) << __func__
<< " block_unused_too_granular " << id
<< ": "
1530 << block_unused_too_granular
[id
] << dendl
;
1532 dout(10) << __func__
<< " done" << dendl
;
1536 int BlueFS::log_dump()
1538 // only dump log file's content
1539 int r
= _replay(true, true);
1541 derr
<< __func__
<< " failed to replay log: " << cpp_strerror(r
) << dendl
;
1548 int BlueFS::device_migrate_to_existing(
1550 const set
<int>& devs_source
,
1552 const bluefs_layout_t
& layout
)
1555 bool buffered
= cct
->_conf
->bluefs_buffered_io
;
1557 dout(10) << __func__
<< " devs_source " << devs_source
1558 << " dev_target " << dev_target
<< dendl
;
1559 assert(dev_target
< (int)MAX_BDEV
);
1562 flags
|= devs_source
.count(BDEV_DB
) ?
1563 (REMOVE_DB
| RENAME_SLOW2DB
) : 0;
1564 flags
|= devs_source
.count(BDEV_WAL
) ? REMOVE_WAL
: 0;
1565 int dev_target_new
= dev_target
;
1567 // Slow device without separate DB one is addressed via BDEV_DB
1568 // Hence need renaming.
1569 if ((flags
& REMOVE_DB
) && dev_target
== BDEV_SLOW
) {
1570 dev_target_new
= BDEV_DB
;
1571 dout(0) << __func__
<< " super to be written to " << dev_target
<< dendl
;
1574 for (auto& [ino
, file_ref
] : file_map
) {
1576 if (file_ref
->fnode
.ino
== 1) {
1579 dout(10) << __func__
<< " " << ino
<< " " << file_ref
->fnode
<< dendl
;
1581 auto& fnode_extents
= file_ref
->fnode
.extents
;
1583 bool rewrite
= std::any_of(
1584 fnode_extents
.begin(),
1585 fnode_extents
.end(),
1587 return ext
.bdev
!= dev_target
&& devs_source
.count(ext
.bdev
);
1590 dout(10) << __func__
<< " migrating" << dendl
;
1594 for (auto old_ext
: fnode_extents
) {
1595 buf
.resize(old_ext
.length
);
1596 int r
= bdev
[old_ext
.bdev
]->read_random(
1602 derr
<< __func__
<< " failed to read 0x" << std::hex
1603 << old_ext
.offset
<< "~" << old_ext
.length
<< std::dec
1604 << " from " << (int)dev_target
<< dendl
;
1607 bl
.append((char*)&buf
[0], old_ext
.length
);
1610 // write entire file
1611 PExtentVector extents
;
1612 auto l
= _allocate_without_fallback(dev_target
, bl
.length(), &extents
);
1614 derr
<< __func__
<< " unable to allocate len 0x" << std::hex
1615 << bl
.length() << std::dec
<< " from " << (int)dev_target
1616 << ": " << cpp_strerror(l
) << dendl
;
1621 for (auto& i
: extents
) {
1623 uint64_t cur_len
= std::min
<uint64_t>(i
.length
, bl
.length() - off
);
1624 ceph_assert(cur_len
> 0);
1625 cur
.substr_of(bl
, off
, cur_len
);
1626 int r
= bdev
[dev_target
]->write(i
.offset
, cur
, buffered
);
1627 ceph_assert(r
== 0);
1631 // release old extents
1632 for (auto old_ext
: fnode_extents
) {
1633 PExtentVector to_release
;
1634 to_release
.emplace_back(old_ext
.offset
, old_ext
.length
);
1635 alloc
[old_ext
.bdev
]->release(to_release
);
1639 fnode_extents
.clear();
1640 for (auto& i
: extents
) {
1641 fnode_extents
.emplace_back(dev_target_new
, i
.offset
, i
.length
);
1644 for (auto& ext
: fnode_extents
) {
1645 if (dev_target
!= dev_target_new
&& ext
.bdev
== dev_target
) {
1646 dout(20) << __func__
<< " " << " ... adjusting extent 0x"
1647 << std::hex
<< ext
.offset
<< std::dec
1648 << " bdev " << dev_target
<< " -> " << dev_target_new
1650 ext
.bdev
= dev_target_new
;
1655 // new logging device in the current naming scheme
1656 int new_log_dev_cur
= bdev
[BDEV_WAL
] ?
1658 bdev
[BDEV_DB
] ? BDEV_DB
: BDEV_SLOW
;
1660 // new logging device in new naming scheme
1661 int new_log_dev_next
= new_log_dev_cur
;
1663 if (devs_source
.count(new_log_dev_cur
)) {
1664 // SLOW device is addressed via BDEV_DB too hence either WAL or DB
1665 new_log_dev_next
= (flags
& REMOVE_WAL
) || !bdev
[BDEV_WAL
] ?
1669 dout(0) << __func__
<< " log moved from " << new_log_dev_cur
1670 << " to " << new_log_dev_next
<< dendl
;
1673 (flags
& REMOVE_DB
) && new_log_dev_next
== BDEV_DB
?
1678 _rewrite_log_and_layout_sync(
1680 (flags
& REMOVE_DB
) ? BDEV_SLOW
: BDEV_DB
,
1688 int BlueFS::device_migrate_to_new(
1690 const set
<int>& devs_source
,
1692 const bluefs_layout_t
& layout
)
1695 bool buffered
= cct
->_conf
->bluefs_buffered_io
;
1697 dout(10) << __func__
<< " devs_source " << devs_source
1698 << " dev_target " << dev_target
<< dendl
;
1699 assert(dev_target
== (int)BDEV_NEWDB
|| (int)BDEV_NEWWAL
);
1703 flags
|= devs_source
.count(BDEV_DB
) ?
1704 (!bdev
[BDEV_SLOW
] ? RENAME_DB2SLOW
: REMOVE_DB
) :
1706 flags
|= devs_source
.count(BDEV_WAL
) ? REMOVE_WAL
: 0;
1707 int dev_target_new
= dev_target
; //FIXME: remove, makes no sense
1709 for (auto& p
: file_map
) {
1711 if (p
.second
->fnode
.ino
== 1) {
1714 dout(10) << __func__
<< " " << p
.first
<< " " << p
.second
->fnode
<< dendl
;
1716 auto& fnode_extents
= p
.second
->fnode
.extents
;
1718 bool rewrite
= false;
1719 for (auto ext_it
= fnode_extents
.begin();
1720 ext_it
!= p
.second
->fnode
.extents
.end();
1722 if (ext_it
->bdev
!= dev_target
&& devs_source
.count(ext_it
->bdev
)) {
1728 dout(10) << __func__
<< " migrating" << dendl
;
1732 for (auto old_ext
: fnode_extents
) {
1733 buf
.resize(old_ext
.length
);
1734 int r
= bdev
[old_ext
.bdev
]->read_random(
1740 derr
<< __func__
<< " failed to read 0x" << std::hex
1741 << old_ext
.offset
<< "~" << old_ext
.length
<< std::dec
1742 << " from " << (int)dev_target
<< dendl
;
1745 bl
.append((char*)&buf
[0], old_ext
.length
);
1748 // write entire file
1749 PExtentVector extents
;
1750 auto l
= _allocate_without_fallback(dev_target
, bl
.length(), &extents
);
1752 derr
<< __func__
<< " unable to allocate len 0x" << std::hex
1753 << bl
.length() << std::dec
<< " from " << (int)dev_target
1754 << ": " << cpp_strerror(l
) << dendl
;
1759 for (auto& i
: extents
) {
1761 uint64_t cur_len
= std::min
<uint64_t>(i
.length
, bl
.length() - off
);
1762 ceph_assert(cur_len
> 0);
1763 cur
.substr_of(bl
, off
, cur_len
);
1764 int r
= bdev
[dev_target
]->write(i
.offset
, cur
, buffered
);
1765 ceph_assert(r
== 0);
1769 // release old extents
1770 for (auto old_ext
: fnode_extents
) {
1771 PExtentVector to_release
;
1772 to_release
.emplace_back(old_ext
.offset
, old_ext
.length
);
1773 alloc
[old_ext
.bdev
]->release(to_release
);
1777 fnode_extents
.clear();
1778 for (auto& i
: extents
) {
1779 fnode_extents
.emplace_back(dev_target_new
, i
.offset
, i
.length
);
1783 // new logging device in the current naming scheme
1784 int new_log_dev_cur
=
1787 bdev
[BDEV_WAL
] && !(flags
& REMOVE_WAL
) ?
1791 bdev
[BDEV_DB
] && !(flags
& REMOVE_DB
)?
1795 // new logging device in new naming scheme
1796 int new_log_dev_next
=
1797 new_log_dev_cur
== BDEV_NEWWAL
?
1799 new_log_dev_cur
== BDEV_NEWDB
?
1804 dev_target
== BDEV_NEWDB
?
1810 _rewrite_log_and_layout_sync(
1820 BlueFS::FileRef
BlueFS::_get_file(uint64_t ino
)
1822 auto p
= file_map
.find(ino
);
1823 if (p
== file_map
.end()) {
1824 FileRef f
= ceph::make_ref
<File
>();
1826 dout(30) << __func__
<< " ino " << ino
<< " = " << f
1827 << " (new)" << dendl
;
1830 dout(30) << __func__
<< " ino " << ino
<< " = " << p
->second
<< dendl
;
1835 void BlueFS::_drop_link(FileRef file
)
1837 dout(20) << __func__
<< " had refs " << file
->refs
1838 << " on " << file
->fnode
<< dendl
;
1839 ceph_assert(file
->refs
> 0);
1841 if (file
->refs
== 0) {
1842 dout(20) << __func__
<< " destroying " << file
->fnode
<< dendl
;
1843 ceph_assert(file
->num_reading
.load() == 0);
1844 vselector
->sub_usage(file
->vselector_hint
, file
->fnode
);
1845 log_t
.op_file_remove(file
->fnode
.ino
);
1846 for (auto& r
: file
->fnode
.extents
) {
1847 pending_release
[r
.bdev
].insert(r
.offset
, r
.length
);
1849 file_map
.erase(file
->fnode
.ino
);
1850 file
->deleted
= true;
1852 if (file
->dirty_seq
) {
1853 ceph_assert(file
->dirty_seq
> log_seq_stable
);
1854 ceph_assert(dirty_files
.count(file
->dirty_seq
));
1855 auto it
= dirty_files
[file
->dirty_seq
].iterator_to(*file
);
1856 dirty_files
[file
->dirty_seq
].erase(it
);
1857 file
->dirty_seq
= 0;
1862 int BlueFS::_read_random(
1863 FileReader
*h
, ///< [in] read from here
1864 uint64_t off
, ///< [in] offset
1865 uint64_t len
, ///< [in] this many bytes
1866 char *out
) ///< [out] optional: or copy it here
1868 auto* buf
= &h
->buf
;
1871 dout(10) << __func__
<< " h " << h
1872 << " 0x" << std::hex
<< off
<< "~" << len
<< std::dec
1873 << " from " << h
->file
->fnode
<< dendl
;
1875 ++h
->file
->num_reading
;
1877 if (!h
->ignore_eof
&&
1878 off
+ len
> h
->file
->fnode
.size
) {
1879 if (off
> h
->file
->fnode
.size
)
1882 len
= h
->file
->fnode
.size
- off
;
1883 dout(20) << __func__
<< " reaching (or past) eof, len clipped to 0x"
1884 << std::hex
<< len
<< std::dec
<< dendl
;
1886 logger
->inc(l_bluefs_read_random_count
, 1);
1887 logger
->inc(l_bluefs_read_random_bytes
, len
);
1889 std::shared_lock
s_lock(h
->lock
);
1891 if (off
< buf
->bl_off
|| off
>= buf
->get_buf_end()) {
1894 auto p
= h
->file
->fnode
.seek(off
, &x_off
);
1895 uint64_t l
= std::min(p
->length
- x_off
, len
);
1896 dout(20) << __func__
<< " read random 0x"
1897 << std::hex
<< x_off
<< "~" << l
<< std::dec
1898 << " of " << *p
<< dendl
;
1899 int r
= bdev
[p
->bdev
]->read_random(p
->offset
+ x_off
, l
, out
,
1900 cct
->_conf
->bluefs_buffered_io
);
1901 ceph_assert(r
== 0);
1907 logger
->inc(l_bluefs_read_random_disk_count
, 1);
1908 logger
->inc(l_bluefs_read_random_disk_bytes
, l
);
1913 auto left
= buf
->get_buf_remaining(off
);
1914 int r
= std::min(len
, left
);
1915 logger
->inc(l_bluefs_read_random_buffer_count
, 1);
1916 logger
->inc(l_bluefs_read_random_buffer_bytes
, r
);
1917 dout(20) << __func__
<< " left 0x" << std::hex
<< left
1918 << " 0x" << off
<< "~" << len
<< std::dec
1922 // NOTE: h->bl is normally a contiguous buffer so c_str() is free.
1923 memcpy(out
, buf
->bl
.c_str() + off
- buf
->bl_off
, r
);
1927 dout(30) << __func__
<< " result chunk (0x"
1928 << std::hex
<< r
<< std::dec
<< " bytes):\n";
1930 t
.substr_of(buf
->bl
, off
- buf
->bl_off
, r
);
1940 dout(20) << __func__
<< " got " << ret
<< dendl
;
1941 --h
->file
->num_reading
;
1946 FileReader
*h
, ///< [in] read from here
1947 FileReaderBuffer
*buf
, ///< [in] reader state
1948 uint64_t off
, ///< [in] offset
1949 size_t len
, ///< [in] this many bytes
1950 bufferlist
*outbl
, ///< [out] optional: reference the result here
1951 char *out
) ///< [out] optional: or copy it here
1953 bool prefetch
= !outbl
&& !out
;
1954 dout(10) << __func__
<< " h " << h
1955 << " 0x" << std::hex
<< off
<< "~" << len
<< std::dec
1956 << " from " << h
->file
->fnode
1957 << (prefetch
? " prefetch" : "")
1960 ++h
->file
->num_reading
;
1962 if (!h
->ignore_eof
&&
1963 off
+ len
> h
->file
->fnode
.size
) {
1964 if (off
> h
->file
->fnode
.size
)
1967 len
= h
->file
->fnode
.size
- off
;
1968 dout(20) << __func__
<< " reaching (or past) eof, len clipped to 0x"
1969 << std::hex
<< len
<< std::dec
<< dendl
;
1971 logger
->inc(l_bluefs_read_count
, 1);
1972 logger
->inc(l_bluefs_read_bytes
, len
);
1974 logger
->inc(l_bluefs_read_prefetch_count
, 1);
1975 logger
->inc(l_bluefs_read_prefetch_bytes
, len
);
1982 std::shared_lock
s_lock(h
->lock
);
1985 if (off
< buf
->bl_off
|| off
>= buf
->get_buf_end()) {
1987 std::unique_lock
u_lock(h
->lock
);
1988 if (off
< buf
->bl_off
|| off
>= buf
->get_buf_end()) {
1989 // if precondition hasn't changed during locking upgrade.
1991 buf
->bl_off
= off
& super
.block_mask();
1993 auto p
= h
->file
->fnode
.seek(buf
->bl_off
, &x_off
);
1994 uint64_t want
= round_up_to(len
+ (off
& ~super
.block_mask()),
1996 want
= std::max(want
, buf
->max_prefetch
);
1997 uint64_t l
= std::min(p
->length
- x_off
, want
);
1998 uint64_t eof_offset
= round_up_to(h
->file
->fnode
.size
, super
.block_size
);
1999 if (!h
->ignore_eof
&&
2000 buf
->bl_off
+ l
> eof_offset
) {
2001 l
= eof_offset
- buf
->bl_off
;
2003 dout(20) << __func__
<< " fetching 0x"
2004 << std::hex
<< x_off
<< "~" << l
<< std::dec
2005 << " of " << *p
<< dendl
;
2006 int r
= bdev
[p
->bdev
]->read(p
->offset
+ x_off
, l
, &buf
->bl
, ioc
[p
->bdev
],
2007 cct
->_conf
->bluefs_buffered_io
);
2008 ceph_assert(r
== 0);
2012 // we should recheck if buffer is valid after lock downgrade
2015 left
= buf
->get_buf_remaining(off
);
2016 dout(20) << __func__
<< " left 0x" << std::hex
<< left
2017 << " len 0x" << len
<< std::dec
<< dendl
;
2019 int r
= std::min(len
, left
);
2022 t
.substr_of(buf
->bl
, off
- buf
->bl_off
, r
);
2023 outbl
->claim_append(t
);
2026 // NOTE: h->bl is normally a contiguous buffer so c_str() is free.
2027 memcpy(out
, buf
->bl
.c_str() + off
- buf
->bl_off
, r
);
2031 dout(30) << __func__
<< " result chunk (0x"
2032 << std::hex
<< r
<< std::dec
<< " bytes):\n";
2034 t
.substr_of(buf
->bl
, off
- buf
->bl_off
, r
);
2044 dout(20) << __func__
<< " got " << ret
<< dendl
;
2045 ceph_assert(!outbl
|| (int)outbl
->length() == ret
);
2046 --h
->file
->num_reading
;
2050 void BlueFS::_invalidate_cache(FileRef f
, uint64_t offset
, uint64_t length
)
2052 dout(10) << __func__
<< " file " << f
->fnode
2053 << " 0x" << std::hex
<< offset
<< "~" << length
<< std::dec
2055 if (offset
& ~super
.block_mask()) {
2056 offset
&= super
.block_mask();
2057 length
= round_up_to(length
, super
.block_size
);
2060 auto p
= f
->fnode
.seek(offset
, &x_off
);
2061 while (length
> 0 && p
!= f
->fnode
.extents
.end()) {
2062 uint64_t x_len
= std::min(p
->length
- x_off
, length
);
2063 bdev
[p
->bdev
]->invalidate_cache(p
->offset
+ x_off
, x_len
);
2064 dout(20) << __func__
<< " 0x" << std::hex
<< x_off
<< "~" << x_len
2065 << std:: dec
<< " of " << *p
<< dendl
;
2071 uint64_t BlueFS::_estimate_log_size()
2073 int avg_dir_size
= 40; // fixme
2074 int avg_file_size
= 12;
2075 uint64_t size
= 4096 * 2;
2076 size
+= file_map
.size() * (1 + sizeof(bluefs_fnode_t
));
2077 for (auto& p
: block_all
)
2078 size
+= p
.num_intervals() * (1 + 1 + sizeof(uint64_t) * 2);
2079 size
+= dir_map
.size() + (1 + avg_dir_size
);
2080 size
+= file_map
.size() * (1 + avg_dir_size
+ avg_file_size
);
2081 return round_up_to(size
, super
.block_size
);
2084 void BlueFS::compact_log()
2086 std::unique_lock
l(lock
);
2087 if (cct
->_conf
->bluefs_compact_log_sync
) {
2088 _compact_log_sync();
2090 _compact_log_async(l
);
2094 bool BlueFS::_should_compact_log()
2096 uint64_t current
= log_writer
->file
->fnode
.size
;
2097 uint64_t expected
= _estimate_log_size();
2098 float ratio
= (float)current
/ (float)expected
;
2099 dout(10) << __func__
<< " current 0x" << std::hex
<< current
2100 << " expected " << expected
<< std::dec
2101 << " ratio " << ratio
2102 << (new_log
? " (async compaction in progress)" : "")
2105 current
< cct
->_conf
->bluefs_log_compact_min_size
||
2106 ratio
< cct
->_conf
->bluefs_log_compact_min_ratio
) {
2112 void BlueFS::_compact_log_dump_metadata(bluefs_transaction_t
*t
,
2116 t
->uuid
= super
.uuid
;
2117 dout(20) << __func__
<< " op_init" << dendl
;
2120 for (unsigned bdev
= 0; bdev
< MAX_BDEV
; ++bdev
) {
2121 interval_set
<uint64_t>& p
= block_all
[bdev
];
2122 for (interval_set
<uint64_t>::iterator q
= p
.begin(); q
!= p
.end(); ++q
) {
2123 auto bdev_new
= bdev
;
2124 if ((flags
& REMOVE_WAL
) && bdev
== BDEV_WAL
) {
2127 if ((flags
& REMOVE_DB
) && bdev
== BDEV_DB
) {
2130 if ((flags
& RENAME_SLOW2DB
) && bdev
== BDEV_SLOW
) {
2133 if ((flags
& RENAME_DB2SLOW
) && bdev
== BDEV_DB
) {
2134 bdev_new
= BDEV_SLOW
;
2136 if (bdev
== BDEV_NEWDB
) {
2137 // REMOVE_DB xor RENAME_DB
2138 ceph_assert(!(flags
& REMOVE_DB
) != !(flags
& RENAME_DB2SLOW
));
2139 ceph_assert(!(flags
& RENAME_SLOW2DB
));
2142 if (bdev
== BDEV_NEWWAL
) {
2143 ceph_assert(flags
& REMOVE_WAL
);
2144 bdev_new
= BDEV_WAL
;
2146 dout(20) << __func__
<< " op_alloc_add " << bdev_new
<< " 0x"
2147 << std::hex
<< q
.get_start() << "~" << q
.get_len() << std::dec
2149 t
->op_alloc_add(bdev_new
, q
.get_start(), q
.get_len());
2152 for (auto& [ino
, file_ref
] : file_map
) {
2155 ceph_assert(ino
> 1);
2157 for(auto& e
: file_ref
->fnode
.extents
) {
2159 auto bdev_new
= bdev
;
2160 ceph_assert(!((flags
& REMOVE_WAL
) && bdev
== BDEV_WAL
));
2161 if ((flags
& RENAME_SLOW2DB
) && bdev
== BDEV_SLOW
) {
2164 if ((flags
& RENAME_DB2SLOW
) && bdev
== BDEV_DB
) {
2165 bdev_new
= BDEV_SLOW
;
2167 if (bdev
== BDEV_NEWDB
) {
2168 // REMOVE_DB xor RENAME_DB
2169 ceph_assert(!(flags
& REMOVE_DB
) != !(flags
& RENAME_DB2SLOW
));
2170 ceph_assert(!(flags
& RENAME_SLOW2DB
));
2173 if (bdev
== BDEV_NEWWAL
) {
2174 ceph_assert(flags
& REMOVE_WAL
);
2175 bdev_new
= BDEV_WAL
;
2179 dout(20) << __func__
<< " op_file_update " << file_ref
->fnode
<< dendl
;
2180 t
->op_file_update(file_ref
->fnode
);
2182 for (auto& [path
, dir_ref
] : dir_map
) {
2183 dout(20) << __func__
<< " op_dir_create " << path
<< dendl
;
2184 t
->op_dir_create(path
);
2185 for (auto& [fname
, file_ref
] : dir_ref
->file_map
) {
2186 dout(20) << __func__
<< " op_dir_link " << path
<< "/" << fname
2187 << " to " << file_ref
->fnode
.ino
<< dendl
;
2188 t
->op_dir_link(path
, fname
, file_ref
->fnode
.ino
);
2193 void BlueFS::_compact_log_sync()
2195 dout(10) << __func__
<< dendl
;
2197 vselector
->select_prefer_bdev(log_writer
->file
->vselector_hint
);
2198 _rewrite_log_and_layout_sync(true,
2203 super
.memorized_layout
);
2204 logger
->inc(l_bluefs_log_compactions
);
2207 void BlueFS::_rewrite_log_and_layout_sync(bool allocate_with_fallback
,
2212 std::optional
<bluefs_layout_t
> layout
)
2214 File
*log_file
= log_writer
->file
.get();
2216 // clear out log (be careful who calls us!!!)
2219 dout(20) << __func__
<< " super_dev:" << super_dev
2220 << " log_dev:" << log_dev
2221 << " log_dev_new:" << log_dev_new
2222 << " flags:" << flags
2224 bluefs_transaction_t t
;
2225 _compact_log_dump_metadata(&t
, flags
);
2227 dout(20) << __func__
<< " op_jump_seq " << log_seq
<< dendl
;
2228 t
.op_jump_seq(log_seq
);
2234 uint64_t need
= bl
.length() + cct
->_conf
->bluefs_max_log_runway
;
2235 dout(20) << __func__
<< " need " << need
<< dendl
;
2237 bluefs_fnode_t old_fnode
;
2239 log_file
->fnode
.swap_extents(old_fnode
);
2240 if (allocate_with_fallback
) {
2241 r
= _allocate(log_dev
, need
, &log_file
->fnode
);
2242 ceph_assert(r
== 0);
2244 PExtentVector extents
;
2245 r
= _allocate_without_fallback(log_dev
,
2248 ceph_assert(r
== 0);
2249 for (auto& p
: extents
) {
2250 log_file
->fnode
.append_extent(
2251 bluefs_extent_t(log_dev
, p
.offset
, p
.length
));
2255 _close_writer(log_writer
);
2257 log_file
->fnode
.size
= bl
.length();
2258 vselector
->sub_usage(log_file
->vselector_hint
, old_fnode
);
2259 vselector
->add_usage(log_file
->vselector_hint
, log_file
->fnode
);
2261 log_writer
= _create_writer(log_file
);
2262 log_writer
->append(bl
);
2263 r
= _flush(log_writer
, true);
2264 ceph_assert(r
== 0);
2266 if (!cct
->_conf
->bluefs_sync_write
) {
2267 list
<aio_t
> completed_ios
;
2268 _claim_completed_aios(log_writer
, &completed_ios
);
2269 wait_for_aio(log_writer
);
2270 completed_ios
.clear();
2275 super
.memorized_layout
= layout
;
2276 super
.log_fnode
= log_file
->fnode
;
2277 // rename device if needed
2278 if (log_dev
!= log_dev_new
) {
2279 dout(10) << __func__
<< " renaming log extents to " << log_dev_new
<< dendl
;
2280 for (auto& p
: super
.log_fnode
.extents
) {
2281 p
.bdev
= log_dev_new
;
2284 dout(10) << __func__
<< " writing super, log fnode: " << super
.log_fnode
<< dendl
;
2287 _write_super(super_dev
);
2290 dout(10) << __func__
<< " release old log extents " << old_fnode
.extents
<< dendl
;
2291 for (auto& r
: old_fnode
.extents
) {
2292 pending_release
[r
.bdev
].insert(r
.offset
, r
.length
);
2297 * 1. Allocate a new extent to continue the log, and then log an event
2298 * that jumps the log write position to the new extent. At this point, the
2299 * old extent(s) won't be written to, and reflect everything to compact.
2300 * New events will be written to the new region that we'll keep.
2302 * 2. While still holding the lock, encode a bufferlist that dumps all of the
2303 * in-memory fnodes and names. This will become the new beginning of the
2304 * log. The last event will jump to the log continuation extent from #1.
2306 * 3. Queue a write to a new extent for the new beginnging of the log.
2308 * 4. Drop lock and wait
2310 * 5. Retake the lock.
2312 * 6. Update the log_fnode to splice in the new beginning.
2314 * 7. Write the new superblock.
2316 * 8. Release the old log space. Clean up.
2318 void BlueFS::_compact_log_async(std::unique_lock
<ceph::mutex
>& l
)
2320 dout(10) << __func__
<< dendl
;
2321 File
*log_file
= log_writer
->file
.get();
2322 ceph_assert(!new_log
);
2323 ceph_assert(!new_log_writer
);
2325 // create a new log [writer] so that we know compaction is in progress
2326 // (see _should_compact_log)
2327 new_log
= ceph::make_ref
<File
>();
2328 new_log
->fnode
.ino
= 0; // so that _flush_range won't try to log the fnode
2330 // 0. wait for any racing flushes to complete. (We do not want to block
2331 // in _flush_sync_log with jump_to set or else a racing thread might flush
2332 // our entries and our jump_to update won't be correct.)
2333 while (log_flushing
) {
2334 dout(10) << __func__
<< " log is currently flushing, waiting" << dendl
;
2338 vselector
->sub_usage(log_file
->vselector_hint
, log_file
->fnode
);
2340 // 1. allocate new log space and jump to it.
2341 old_log_jump_to
= log_file
->fnode
.get_allocated();
2342 dout(10) << __func__
<< " old_log_jump_to 0x" << std::hex
<< old_log_jump_to
2343 << " need 0x" << (old_log_jump_to
+ cct
->_conf
->bluefs_max_log_runway
) << std::dec
<< dendl
;
2344 int r
= _allocate(vselector
->select_prefer_bdev(log_file
->vselector_hint
),
2345 cct
->_conf
->bluefs_max_log_runway
,
2347 ceph_assert(r
== 0);
2348 //adjust usage as flush below will need it
2349 vselector
->add_usage(log_file
->vselector_hint
, log_file
->fnode
);
2350 dout(10) << __func__
<< " log extents " << log_file
->fnode
.extents
<< dendl
;
2352 // update the log file change and log a jump to the offset where we want to
2353 // write the new entries
2354 log_t
.op_file_update(log_file
->fnode
);
2355 log_t
.op_jump(log_seq
, old_log_jump_to
);
2357 flush_bdev(); // FIXME?
2359 _flush_and_sync_log(l
, 0, old_log_jump_to
);
2361 // 2. prepare compacted log
2362 bluefs_transaction_t t
;
2363 //avoid record two times in log_t and _compact_log_dump_metadata.
2365 _compact_log_dump_metadata(&t
, 0);
2367 uint64_t max_alloc_size
= std::max(alloc_size
[BDEV_WAL
],
2368 std::max(alloc_size
[BDEV_DB
],
2369 alloc_size
[BDEV_SLOW
]));
2371 // conservative estimate for final encoded size
2372 new_log_jump_to
= round_up_to(t
.op_bl
.length() + super
.block_size
* 2,
2374 t
.op_jump(log_seq
, new_log_jump_to
);
2377 //FIXME: check if we want DB here?
2378 r
= _allocate(BlueFS::BDEV_DB
, new_log_jump_to
,
2380 ceph_assert(r
== 0);
2382 // we might have some more ops in log_t due to _allocate call
2389 dout(10) << __func__
<< " new_log_jump_to 0x" << std::hex
<< new_log_jump_to
2390 << std::dec
<< dendl
;
2392 new_log_writer
= _create_writer(new_log
);
2393 new_log_writer
->append(bl
);
2396 r
= _flush(new_log_writer
, true);
2397 ceph_assert(r
== 0);
2400 _flush_bdev_safely(new_log_writer
);
2402 // 5. update our log fnode
2403 // discard first old_log_jump_to extents
2405 dout(10) << __func__
<< " remove 0x" << std::hex
<< old_log_jump_to
<< std::dec
2406 << " of " << log_file
->fnode
.extents
<< dendl
;
2407 uint64_t discarded
= 0;
2408 mempool::bluefs::vector
<bluefs_extent_t
> old_extents
;
2409 while (discarded
< old_log_jump_to
) {
2410 ceph_assert(!log_file
->fnode
.extents
.empty());
2411 bluefs_extent_t
& e
= log_file
->fnode
.extents
.front();
2412 bluefs_extent_t temp
= e
;
2413 if (discarded
+ e
.length
<= old_log_jump_to
) {
2414 dout(10) << __func__
<< " remove old log extent " << e
<< dendl
;
2415 discarded
+= e
.length
;
2416 log_file
->fnode
.pop_front_extent();
2418 dout(10) << __func__
<< " remove front of old log extent " << e
<< dendl
;
2419 uint64_t drop
= old_log_jump_to
- discarded
;
2424 dout(10) << __func__
<< " kept " << e
<< " removed " << temp
<< dendl
;
2426 old_extents
.push_back(temp
);
2428 auto from
= log_file
->fnode
.extents
.begin();
2429 auto to
= log_file
->fnode
.extents
.end();
2430 while (from
!= to
) {
2431 new_log
->fnode
.append_extent(*from
);
2435 vselector
->sub_usage(log_file
->vselector_hint
, log_file
->fnode
);
2437 // clear the extents from old log file, they are added to new log
2438 log_file
->fnode
.clear_extents();
2439 // swap the log files. New log file is the log file now.
2440 new_log
->fnode
.swap_extents(log_file
->fnode
);
2442 log_writer
->pos
= log_writer
->file
->fnode
.size
=
2443 log_writer
->pos
- old_log_jump_to
+ new_log_jump_to
;
2445 vselector
->add_usage(log_file
->vselector_hint
, log_file
->fnode
);
2447 // 6. write the super block to reflect the changes
2448 dout(10) << __func__
<< " writing super" << dendl
;
2449 super
.log_fnode
= log_file
->fnode
;
2451 _write_super(BDEV_DB
);
2457 // 7. release old space
2458 dout(10) << __func__
<< " release old log extents " << old_extents
<< dendl
;
2459 for (auto& r
: old_extents
) {
2460 pending_release
[r
.bdev
].insert(r
.offset
, r
.length
);
2463 // delete the new log, remove from the dirty files list
2464 _close_writer(new_log_writer
);
2465 if (new_log
->dirty_seq
) {
2466 ceph_assert(dirty_files
.count(new_log
->dirty_seq
));
2467 auto it
= dirty_files
[new_log
->dirty_seq
].iterator_to(*new_log
);
2468 dirty_files
[new_log
->dirty_seq
].erase(it
);
2470 new_log_writer
= nullptr;
2472 log_cond
.notify_all();
2474 dout(10) << __func__
<< " log extents " << log_file
->fnode
.extents
<< dendl
;
2475 logger
->inc(l_bluefs_log_compactions
);
2478 void BlueFS::_pad_bl(bufferlist
& bl
)
2480 uint64_t partial
= bl
.length() % super
.block_size
;
2482 dout(10) << __func__
<< " padding with 0x" << std::hex
2483 << super
.block_size
- partial
<< " zeros" << std::dec
<< dendl
;
2484 bl
.append_zero(super
.block_size
- partial
);
2489 int BlueFS::_flush_and_sync_log(std::unique_lock
<ceph::mutex
>& l
,
2493 while (log_flushing
) {
2494 dout(10) << __func__
<< " want_seq " << want_seq
2495 << " log is currently flushing, waiting" << dendl
;
2496 ceph_assert(!jump_to
);
2499 if (want_seq
&& want_seq
<= log_seq_stable
) {
2500 dout(10) << __func__
<< " want_seq " << want_seq
<< " <= log_seq_stable "
2501 << log_seq_stable
<< ", done" << dendl
;
2502 ceph_assert(!jump_to
);
2505 if (log_t
.empty() && dirty_files
.empty()) {
2506 dout(10) << __func__
<< " want_seq " << want_seq
2507 << " " << log_t
<< " not dirty, dirty_files empty, no-op" << dendl
;
2508 ceph_assert(!jump_to
);
2512 vector
<interval_set
<uint64_t>> to_release(pending_release
.size());
2513 to_release
.swap(pending_release
);
2515 uint64_t seq
= log_t
.seq
= ++log_seq
;
2516 ceph_assert(want_seq
== 0 || want_seq
<= seq
);
2517 log_t
.uuid
= super
.uuid
;
2520 auto lsi
= dirty_files
.find(seq
);
2521 if (lsi
!= dirty_files
.end()) {
2522 dout(20) << __func__
<< " " << lsi
->second
.size() << " dirty_files" << dendl
;
2523 for (auto &f
: lsi
->second
) {
2524 dout(20) << __func__
<< " op_file_update " << f
.fnode
<< dendl
;
2525 log_t
.op_file_update(f
.fnode
);
2529 dout(10) << __func__
<< " " << log_t
<< dendl
;
2530 ceph_assert(!log_t
.empty());
2532 // allocate some more space (before we run out)?
2533 int64_t runway
= log_writer
->file
->fnode
.get_allocated() -
2534 log_writer
->get_effective_write_pos();
2535 if (runway
< (int64_t)cct
->_conf
->bluefs_min_log_runway
) {
2536 dout(10) << __func__
<< " allocating more log runway (0x"
2537 << std::hex
<< runway
<< std::dec
<< " remaining)" << dendl
;
2538 while (new_log_writer
) {
2539 dout(10) << __func__
<< " waiting for async compaction" << dendl
;
2542 vselector
->sub_usage(log_writer
->file
->vselector_hint
, log_writer
->file
->fnode
);
2544 vselector
->select_prefer_bdev(log_writer
->file
->vselector_hint
),
2545 cct
->_conf
->bluefs_max_log_runway
,
2546 &log_writer
->file
->fnode
);
2547 ceph_assert(r
== 0);
2548 vselector
->add_usage(log_writer
->file
->vselector_hint
, log_writer
->file
->fnode
);
2549 log_t
.op_file_update(log_writer
->file
->fnode
);
2553 bl
.reserve(super
.block_size
);
2555 // pad to block boundary
2556 size_t realign
= super
.block_size
- (bl
.length() % super
.block_size
);
2557 if (realign
&& realign
!= super
.block_size
)
2558 bl
.append_zero(realign
);
2560 logger
->inc(l_bluefs_logged_bytes
, bl
.length());
2562 log_writer
->append(bl
);
2565 log_t
.seq
= 0; // just so debug output is less confusing
2566 log_flushing
= true;
2568 int r
= _flush(log_writer
, true);
2569 ceph_assert(r
== 0);
2572 dout(10) << __func__
<< " jumping log offset from 0x" << std::hex
2573 << log_writer
->pos
<< " -> 0x" << jump_to
<< std::dec
<< dendl
;
2574 log_writer
->pos
= jump_to
;
2575 vselector
->sub_usage(log_writer
->file
->vselector_hint
, log_writer
->file
->fnode
.size
);
2576 log_writer
->file
->fnode
.size
= jump_to
;
2577 vselector
->add_usage(log_writer
->file
->vselector_hint
, log_writer
->file
->fnode
.size
);
2580 _flush_bdev_safely(log_writer
);
2582 log_flushing
= false;
2583 log_cond
.notify_all();
2585 // clean dirty files
2586 if (seq
> log_seq_stable
) {
2587 log_seq_stable
= seq
;
2588 dout(20) << __func__
<< " log_seq_stable " << log_seq_stable
<< dendl
;
2590 auto p
= dirty_files
.begin();
2591 while (p
!= dirty_files
.end()) {
2592 if (p
->first
> log_seq_stable
) {
2593 dout(20) << __func__
<< " done cleaning up dirty files" << dendl
;
2597 auto l
= p
->second
.begin();
2598 while (l
!= p
->second
.end()) {
2600 ceph_assert(file
->dirty_seq
> 0);
2601 ceph_assert(file
->dirty_seq
<= log_seq_stable
);
2602 dout(20) << __func__
<< " cleaned file " << file
->fnode
<< dendl
;
2603 file
->dirty_seq
= 0;
2604 p
->second
.erase(l
++);
2607 ceph_assert(p
->second
.empty());
2608 dirty_files
.erase(p
++);
2611 dout(20) << __func__
<< " log_seq_stable " << log_seq_stable
2612 << " already >= out seq " << seq
2613 << ", we lost a race against another log flush, done" << dendl
;
2616 for (unsigned i
= 0; i
< to_release
.size(); ++i
) {
2617 if (!to_release
[i
].empty()) {
2618 /* OK, now we have the guarantee alloc[i] won't be null. */
2620 if (cct
->_conf
->bdev_enable_discard
&& cct
->_conf
->bdev_async_discard
) {
2621 r
= bdev
[i
]->queue_discard(to_release
[i
]);
2624 } else if (cct
->_conf
->bdev_enable_discard
) {
2625 for (auto p
= to_release
[i
].begin(); p
!= to_release
[i
].end(); ++p
) {
2626 bdev
[i
]->discard(p
.get_start(), p
.get_len());
2629 alloc
[i
]->release(to_release
[i
]);
2633 _update_logger_stats();
2638 int BlueFS::_flush_range(FileWriter
*h
, uint64_t offset
, uint64_t length
)
2640 dout(10) << __func__
<< " " << h
<< " pos 0x" << std::hex
<< h
->pos
2641 << " 0x" << offset
<< "~" << length
<< std::dec
2642 << " to " << h
->file
->fnode
<< dendl
;
2643 ceph_assert(!h
->file
->deleted
);
2644 ceph_assert(h
->file
->num_readers
.load() == 0);
2646 h
->buffer_appender
.flush();
2649 if (h
->file
->fnode
.ino
== 1)
2652 buffered
= cct
->_conf
->bluefs_buffered_io
;
2654 if (offset
+ length
<= h
->pos
)
2656 if (offset
< h
->pos
) {
2657 length
-= h
->pos
- offset
;
2659 dout(10) << " still need 0x"
2660 << std::hex
<< offset
<< "~" << length
<< std::dec
2663 ceph_assert(offset
<= h
->file
->fnode
.size
);
2665 uint64_t allocated
= h
->file
->fnode
.get_allocated();
2666 vselector
->sub_usage(h
->file
->vselector_hint
, h
->file
->fnode
);
2667 // do not bother to dirty the file if we are overwriting
2668 // previously allocated extents.
2669 bool must_dirty
= false;
2670 uint64_t clear_upto
= 0;
2671 if (allocated
< offset
+ length
) {
2672 // we should never run out of log space here; see the min runway check
2673 // in _flush_and_sync_log.
2674 ceph_assert(h
->file
->fnode
.ino
!= 1);
2675 int r
= _allocate(vselector
->select_prefer_bdev(h
->file
->vselector_hint
),
2676 offset
+ length
- allocated
,
2679 derr
<< __func__
<< " allocated: 0x" << std::hex
<< allocated
2680 << " offset: 0x" << offset
<< " length: 0x" << length
<< std::dec
2682 vselector
->add_usage(h
->file
->vselector_hint
, h
->file
->fnode
); // undo
2683 ceph_abort_msg("bluefs enospc");
2686 if (cct
->_conf
->bluefs_preextend_wal_files
&&
2687 h
->writer_type
== WRITER_WAL
) {
2688 // NOTE: this *requires* that rocksdb also has log recycling
2689 // enabled and is therefore doing robust CRCs on the log
2690 // records. otherwise, we will fail to reply the rocksdb log
2691 // properly due to garbage on the device.
2692 h
->file
->fnode
.size
= h
->file
->fnode
.get_allocated();
2693 clear_upto
= h
->file
->fnode
.size
;
2694 dout(10) << __func__
<< " extending WAL size to 0x" << std::hex
2695 << h
->file
->fnode
.size
<< std::dec
<< " to include allocated"
2700 if (h
->file
->fnode
.size
< offset
+ length
) {
2701 h
->file
->fnode
.size
= offset
+ length
;
2702 if (h
->file
->fnode
.ino
> 1) {
2703 // we do not need to dirty the log file (or it's compacting
2704 // replacement) when the file size changes because replay is
2705 // smart enough to discover it on its own.
2710 h
->file
->fnode
.mtime
= ceph_clock_now();
2711 ceph_assert(h
->file
->fnode
.ino
>= 1);
2712 if (h
->file
->dirty_seq
== 0) {
2713 h
->file
->dirty_seq
= log_seq
+ 1;
2714 dirty_files
[h
->file
->dirty_seq
].push_back(*h
->file
);
2715 dout(20) << __func__
<< " dirty_seq = " << log_seq
+ 1
2716 << " (was clean)" << dendl
;
2718 if (h
->file
->dirty_seq
!= log_seq
+ 1) {
2719 // need re-dirty, erase from list first
2720 ceph_assert(dirty_files
.count(h
->file
->dirty_seq
));
2721 auto it
= dirty_files
[h
->file
->dirty_seq
].iterator_to(*h
->file
);
2722 dirty_files
[h
->file
->dirty_seq
].erase(it
);
2723 h
->file
->dirty_seq
= log_seq
+ 1;
2724 dirty_files
[h
->file
->dirty_seq
].push_back(*h
->file
);
2725 dout(20) << __func__
<< " dirty_seq = " << log_seq
+ 1
2726 << " (was " << h
->file
->dirty_seq
<< ")" << dendl
;
2728 dout(20) << __func__
<< " dirty_seq = " << log_seq
+ 1
2729 << " (unchanged, do nothing) " << dendl
;
2733 dout(20) << __func__
<< " file now " << h
->file
->fnode
<< dendl
;
2736 auto p
= h
->file
->fnode
.seek(offset
, &x_off
);
2737 ceph_assert(p
!= h
->file
->fnode
.extents
.end());
2738 dout(20) << __func__
<< " in " << *p
<< " x_off 0x"
2739 << std::hex
<< x_off
<< std::dec
<< dendl
;
2741 unsigned partial
= x_off
& ~super
.block_mask();
2744 dout(20) << __func__
<< " using partial tail 0x"
2745 << std::hex
<< partial
<< std::dec
<< dendl
;
2746 ceph_assert(h
->tail_block
.length() == partial
);
2747 bl
.claim_append_piecewise(h
->tail_block
);
2751 dout(20) << __func__
<< " waiting for previous aio to complete" << dendl
;
2752 for (auto p
: h
->iocv
) {
2758 if (length
== partial
+ h
->buffer
.length() || clear_upto
!= 0) {
2759 /* in case of inital allocation and need to zero, limited flush is unacceptable */
2760 bl
.claim_append_piecewise(h
->buffer
);
2763 h
->buffer
.splice(0, length
, &t
);
2764 bl
.claim_append_piecewise(t
);
2765 t
.substr_of(h
->buffer
, length
, h
->buffer
.length() - length
);
2767 dout(20) << " leaving 0x" << std::hex
<< h
->buffer
.length() << std::dec
2768 << " unflushed" << dendl
;
2770 ceph_assert(bl
.length() == length
);
2772 h
->pos
= offset
+ length
;
2774 unsigned tail
= bl
.length() & ~super
.block_mask();
2776 dout(20) << __func__
<< " caching tail of 0x"
2778 << " and padding block with 0x" << (super
.block_size
- tail
)
2779 << std::dec
<< dendl
;
2780 h
->tail_block
.substr_of(bl
, bl
.length() - tail
, tail
);
2781 bl
.append_zero(super
.block_size
- tail
);
2782 length
+= super
.block_size
- tail
;
2784 h
->tail_block
.clear();
2786 if (clear_upto
!= 0) {
2787 if (offset
+ length
< clear_upto
) {
2788 dout(20) << __func__
<< " zeroing WAL log up to 0x"
2789 << std::hex
<< clear_upto
2790 << std::dec
<< dendl
;
2791 bl
.append_zero(clear_upto
- (offset
+ length
));
2792 length
+= clear_upto
- (offset
+ length
);
2795 ceph_assert(bl
.length() == length
);
2797 switch (h
->writer_type
) {
2799 logger
->inc(l_bluefs_bytes_written_wal
, length
);
2802 logger
->inc(l_bluefs_bytes_written_sst
, length
);
2806 dout(30) << "dump:\n";
2811 uint64_t bytes_written_slow
= 0;
2812 while (length
> 0) {
2813 uint64_t x_len
= std::min(p
->length
- x_off
, length
);
2815 t
.substr_of(bl
, bloff
, x_len
);
2816 if (cct
->_conf
->bluefs_sync_write
) {
2817 bdev
[p
->bdev
]->write(p
->offset
+ x_off
, t
, buffered
, h
->write_hint
);
2819 bdev
[p
->bdev
]->aio_write(p
->offset
+ x_off
, t
, h
->iocv
[p
->bdev
], buffered
, h
->write_hint
);
2821 h
->dirty_devs
[p
->bdev
] = true;
2822 if (p
->bdev
== BDEV_SLOW
) {
2823 bytes_written_slow
+= t
.length();
2831 logger
->inc(l_bluefs_bytes_written_slow
, bytes_written_slow
);
2832 for (unsigned i
= 0; i
< MAX_BDEV
; ++i
) {
2834 if (h
->iocv
[i
] && h
->iocv
[i
]->has_pending_aios()) {
2835 bdev
[i
]->aio_submit(h
->iocv
[i
]);
2839 vselector
->add_usage(h
->file
->vselector_hint
, h
->file
->fnode
);
2840 dout(20) << __func__
<< " h " << h
<< " pos now 0x"
2841 << std::hex
<< h
->pos
<< std::dec
<< dendl
;
2846 // we need to retire old completed aios so they don't stick around in
2847 // memory indefinitely (along with their bufferlist refs).
2848 void BlueFS::_claim_completed_aios(FileWriter
*h
, list
<aio_t
> *ls
)
2850 for (auto p
: h
->iocv
) {
2852 ls
->splice(ls
->end(), p
->running_aios
);
2855 dout(10) << __func__
<< " got " << ls
->size() << " aios" << dendl
;
2858 void BlueFS::wait_for_aio(FileWriter
*h
)
2860 // NOTE: this is safe to call without a lock, as long as our reference is
2862 dout(10) << __func__
<< " " << h
<< dendl
;
2863 utime_t start
= ceph_clock_now();
2864 for (auto p
: h
->iocv
) {
2869 dout(10) << __func__
<< " " << h
<< " done in " << (ceph_clock_now() - start
) << dendl
;
2873 int BlueFS::_flush(FileWriter
*h
, bool force
)
2875 h
->buffer_appender
.flush();
2876 uint64_t length
= h
->buffer
.length();
2877 uint64_t offset
= h
->pos
;
2879 length
< cct
->_conf
->bluefs_min_flush_size
) {
2880 dout(10) << __func__
<< " " << h
<< " ignoring, length " << length
2881 << " < min_flush_size " << cct
->_conf
->bluefs_min_flush_size
2886 dout(10) << __func__
<< " " << h
<< " no dirty data on "
2887 << h
->file
->fnode
<< dendl
;
2890 dout(10) << __func__
<< " " << h
<< " 0x"
2891 << std::hex
<< offset
<< "~" << length
<< std::dec
2892 << " to " << h
->file
->fnode
<< dendl
;
2893 ceph_assert(h
->pos
<= h
->file
->fnode
.size
);
2894 return _flush_range(h
, offset
, length
);
2897 int BlueFS::_truncate(FileWriter
*h
, uint64_t offset
)
2899 dout(10) << __func__
<< " 0x" << std::hex
<< offset
<< std::dec
2900 << " file " << h
->file
->fnode
<< dendl
;
2901 if (h
->file
->deleted
) {
2902 dout(10) << __func__
<< " deleted, no-op" << dendl
;
2906 // we never truncate internal log files
2907 ceph_assert(h
->file
->fnode
.ino
> 1);
2909 h
->buffer_appender
.flush();
2911 // truncate off unflushed data?
2912 if (h
->pos
< offset
&&
2913 h
->pos
+ h
->buffer
.length() > offset
) {
2915 dout(20) << __func__
<< " tossing out last " << offset
- h
->pos
2916 << " unflushed bytes" << dendl
;
2917 t
.substr_of(h
->buffer
, 0, offset
- h
->pos
);
2919 ceph_abort_msg("actually this shouldn't happen");
2921 if (h
->buffer
.length()) {
2922 int r
= _flush(h
, true);
2926 if (offset
== h
->file
->fnode
.size
) {
2929 if (offset
> h
->file
->fnode
.size
) {
2930 ceph_abort_msg("truncate up not supported");
2932 ceph_assert(h
->file
->fnode
.size
>= offset
);
2933 vselector
->sub_usage(h
->file
->vselector_hint
, h
->file
->fnode
.size
);
2934 h
->file
->fnode
.size
= offset
;
2935 vselector
->add_usage(h
->file
->vselector_hint
, h
->file
->fnode
.size
);
2936 log_t
.op_file_update(h
->file
->fnode
);
2940 int BlueFS::_fsync(FileWriter
*h
, std::unique_lock
<ceph::mutex
>& l
)
2942 dout(10) << __func__
<< " " << h
<< " " << h
->file
->fnode
<< dendl
;
2943 int r
= _flush(h
, true);
2946 uint64_t old_dirty_seq
= h
->file
->dirty_seq
;
2948 _flush_bdev_safely(h
);
2950 if (old_dirty_seq
) {
2951 uint64_t s
= log_seq
;
2952 dout(20) << __func__
<< " file metadata was dirty (" << old_dirty_seq
2953 << ") on " << h
->file
->fnode
<< ", flushing log" << dendl
;
2954 _flush_and_sync_log(l
, old_dirty_seq
);
2955 ceph_assert(h
->file
->dirty_seq
== 0 || // cleaned
2956 h
->file
->dirty_seq
> s
); // or redirtied by someone else
2961 void BlueFS::_flush_bdev_safely(FileWriter
*h
)
2963 std::array
<bool, MAX_BDEV
> flush_devs
= h
->dirty_devs
;
2964 h
->dirty_devs
.fill(false);
2966 if (!cct
->_conf
->bluefs_sync_write
) {
2967 list
<aio_t
> completed_ios
;
2968 _claim_completed_aios(h
, &completed_ios
);
2971 completed_ios
.clear();
2972 flush_bdev(flush_devs
);
2978 flush_bdev(flush_devs
);
2983 void BlueFS::flush_bdev(std::array
<bool, MAX_BDEV
>& dirty_bdevs
)
2985 // NOTE: this is safe to call without a lock.
2986 dout(20) << __func__
<< dendl
;
2987 for (unsigned i
= 0; i
< MAX_BDEV
; i
++) {
2993 void BlueFS::flush_bdev()
2995 // NOTE: this is safe to call without a lock.
2996 dout(20) << __func__
<< dendl
;
2997 for (auto p
: bdev
) {
3003 const char* BlueFS::get_device_name(unsigned id
)
3005 if (id
>= MAX_BDEV
) return "BDEV_INV";
3006 const char* names
[] = {"BDEV_WAL", "BDEV_DB", "BDEV_SLOW", "BDEV_NEWWAL", "BDEV_NEWDB"};
3010 int BlueFS::_expand_slow_device(uint64_t need
, PExtentVector
& extents
)
3013 if (slow_dev_expander
) {
3014 int id
= _get_slow_device_id();
3015 auto min_alloc_size
= alloc_size
[id
];
3016 ceph_assert(id
<= (int)alloc
.size() && alloc
[id
]);
3017 auto min_need
= round_up_to(need
, min_alloc_size
);
3018 need
= std::max(need
,
3019 slow_dev_expander
->get_recommended_expansion_delta(
3020 alloc
[id
]->get_free(), block_all
[id
].size()));
3022 need
= round_up_to(need
, min_alloc_size
);
3023 dout(10) << __func__
<< " expanding slow device by 0x"
3024 << std::hex
<< need
<< std::dec
3026 r
= slow_dev_expander
->allocate_freespace(min_need
, need
, extents
);
3031 int BlueFS::_allocate_without_fallback(uint8_t id
, uint64_t len
,
3032 PExtentVector
* extents
)
3034 dout(10) << __func__
<< " len 0x" << std::hex
<< len
<< std::dec
3035 << " from " << (int)id
<< dendl
;
3036 assert(id
< alloc
.size());
3040 extents
->reserve(4); // 4 should be (more than) enough for most allocations
3041 uint64_t min_alloc_size
= alloc_size
[id
];
3042 uint64_t left
= round_up_to(len
, min_alloc_size
);
3043 int64_t alloc_len
= alloc
[id
]->allocate(left
, min_alloc_size
, 0, extents
);
3044 if (alloc_len
< 0 || alloc_len
< (int64_t)left
) {
3045 if (alloc_len
> 0) {
3046 alloc
[id
]->release(*extents
);
3049 derr
<< __func__
<< " failed to allocate 0x" << std::hex
<< left
3050 << " on bdev " << (int)id
3051 << ", free 0x" << alloc
[id
]->get_free() << std::dec
<< dendl
;
3053 derr
<< __func__
<< " failed to allocate 0x" << std::hex
<< left
3054 << " on bdev " << (int)id
<< ", dne" << std::dec
<< dendl
;
3063 int BlueFS::_allocate(uint8_t id
, uint64_t len
,
3064 bluefs_fnode_t
* node
)
3066 dout(10) << __func__
<< " len 0x" << std::hex
<< len
<< std::dec
3067 << " from " << (int)id
<< dendl
;
3068 ceph_assert(id
< alloc
.size());
3069 int64_t alloc_len
= 0;
3070 PExtentVector extents
;
3073 if (!node
->extents
.empty() && node
->extents
.back().bdev
== id
) {
3074 hint
= node
->extents
.back().end();
3076 extents
.reserve(4); // 4 should be (more than) enough for most allocations
3077 alloc_len
= alloc
[id
]->allocate(round_up_to(len
, alloc_size
[id
]),
3078 alloc_size
[id
], hint
, &extents
);
3082 alloc_len
< (int64_t)round_up_to(len
, alloc_size
[id
])) {
3083 if (alloc_len
> 0) {
3084 alloc
[id
]->release(extents
);
3086 if (id
!= BDEV_SLOW
) {
3088 dout(1) << __func__
<< " failed to allocate 0x" << std::hex
<< len
3089 << " on bdev " << (int)id
3090 << ", free 0x" << alloc
[id
]->get_free()
3091 << "; fallback to bdev " << (int)id
+ 1
3092 << std::dec
<< dendl
;
3094 return _allocate(id
+ 1, len
, node
);
3096 dout(1) << __func__
<< " unable to allocate 0x" << std::hex
<< len
3097 << " on bdev " << (int)id
<< ", free 0x"
3098 << (alloc
[id
] ? alloc
[id
]->get_free() : (uint64_t)-1)
3099 << "; fallback to slow device expander "
3100 << std::dec
<< dendl
;
3102 if (_expand_slow_device(len
, extents
) == 0) {
3103 id
= _get_slow_device_id();
3104 for (auto& e
: extents
) {
3105 _add_block_extent(id
, e
.offset
, e
.length
);
3108 auto* last_alloc
= alloc
[id
];
3109 ceph_assert(last_alloc
);
3111 alloc_len
= last_alloc
->allocate(round_up_to(len
, alloc_size
[id
]),
3112 alloc_size
[id
], hint
, &extents
);
3113 if (alloc_len
< 0 || alloc_len
< (int64_t)len
) {
3114 if (alloc_len
> 0) {
3115 last_alloc
->release(extents
);
3117 derr
<< __func__
<< " failed to allocate 0x" << std::hex
<< len
3118 << " on bdev " << (int)id
3119 << ", free 0x" << last_alloc
->get_free() << std::dec
<< dendl
;
3123 derr
<< __func__
<< " failed to expand slow device to fit +0x"
3124 << std::hex
<< len
<< std::dec
3129 uint64_t total_allocated
=
3130 block_all
[id
].size() - alloc
[id
]->get_free();
3131 if (max_bytes
[id
] < total_allocated
) {
3132 logger
->set(max_bytes_pcounters
[id
], total_allocated
);
3133 max_bytes
[id
] = total_allocated
;
3137 for (auto& p
: extents
) {
3138 node
->append_extent(bluefs_extent_t(id
, p
.offset
, p
.length
));
3144 int BlueFS::_preallocate(FileRef f
, uint64_t off
, uint64_t len
)
3146 dout(10) << __func__
<< " file " << f
->fnode
<< " 0x"
3147 << std::hex
<< off
<< "~" << len
<< std::dec
<< dendl
;
3149 dout(10) << __func__
<< " deleted, no-op" << dendl
;
3152 ceph_assert(f
->fnode
.ino
> 1);
3153 uint64_t allocated
= f
->fnode
.get_allocated();
3154 if (off
+ len
> allocated
) {
3155 uint64_t want
= off
+ len
- allocated
;
3156 vselector
->sub_usage(f
->vselector_hint
, f
->fnode
);
3158 int r
= _allocate(vselector
->select_prefer_bdev(f
->vselector_hint
),
3161 vselector
->add_usage(f
->vselector_hint
, f
->fnode
);
3164 log_t
.op_file_update(f
->fnode
);
3169 void BlueFS::sync_metadata()
3171 std::unique_lock
l(lock
);
3172 if (log_t
.empty() && dirty_files
.empty()) {
3173 dout(10) << __func__
<< " - no pending log events" << dendl
;
3175 dout(10) << __func__
<< dendl
;
3176 utime_t start
= ceph_clock_now();
3177 flush_bdev(); // FIXME?
3178 _flush_and_sync_log(l
);
3179 dout(10) << __func__
<< " done in " << (ceph_clock_now() - start
) << dendl
;
3182 if (_should_compact_log()) {
3183 if (cct
->_conf
->bluefs_compact_log_sync
) {
3184 _compact_log_sync();
3186 _compact_log_async(l
);
3191 int BlueFS::open_for_write(
3192 const string
& dirname
,
3193 const string
& filename
,
3197 std::lock_guard
l(lock
);
3198 dout(10) << __func__
<< " " << dirname
<< "/" << filename
<< dendl
;
3199 map
<string
,DirRef
>::iterator p
= dir_map
.find(dirname
);
3201 if (p
== dir_map
.end()) {
3202 // implicitly create the dir
3203 dout(20) << __func__
<< " dir " << dirname
3204 << " does not exist" << dendl
;
3211 bool create
= false;
3212 map
<string
,FileRef
>::iterator q
= dir
->file_map
.find(filename
);
3213 if (q
== dir
->file_map
.end()) {
3215 dout(20) << __func__
<< " dir " << dirname
<< " (" << dir
3216 << ") file " << filename
3217 << " does not exist" << dendl
;
3220 file
= ceph::make_ref
<File
>();
3221 file
->fnode
.ino
= ++ino_last
;
3222 file_map
[ino_last
] = file
;
3223 dir
->file_map
[filename
] = file
;
3227 // overwrite existing file?
3230 dout(20) << __func__
<< " dir " << dirname
<< " (" << dir
3231 << ") file " << filename
3232 << " already exists, overwrite in place" << dendl
;
3234 dout(20) << __func__
<< " dir " << dirname
<< " (" << dir
3235 << ") file " << filename
3236 << " already exists, truncate + overwrite" << dendl
;
3237 vselector
->sub_usage(file
->vselector_hint
, file
->fnode
);
3238 file
->fnode
.size
= 0;
3239 for (auto& p
: file
->fnode
.extents
) {
3240 pending_release
[p
.bdev
].insert(p
.offset
, p
.length
);
3243 file
->fnode
.clear_extents();
3246 ceph_assert(file
->fnode
.ino
> 1);
3248 file
->fnode
.mtime
= ceph_clock_now();
3249 file
->vselector_hint
= vselector
->get_hint_by_dir(dirname
);
3251 dout(20) << __func__
<< " mapping " << dirname
<< "/" << filename
3252 << " vsel_hint " << file
->vselector_hint
3255 log_t
.op_file_update(file
->fnode
);
3257 log_t
.op_dir_link(dirname
, filename
, file
->fnode
.ino
);
3259 *h
= _create_writer(file
);
3261 if (boost::algorithm::ends_with(filename
, ".log")) {
3262 (*h
)->writer_type
= BlueFS::WRITER_WAL
;
3263 if (logger
&& !overwrite
) {
3264 logger
->inc(l_bluefs_files_written_wal
);
3266 } else if (boost::algorithm::ends_with(filename
, ".sst")) {
3267 (*h
)->writer_type
= BlueFS::WRITER_SST
;
3269 logger
->inc(l_bluefs_files_written_sst
);
3273 dout(10) << __func__
<< " h " << *h
<< " on " << file
->fnode
<< dendl
;
3277 BlueFS::FileWriter
*BlueFS::_create_writer(FileRef f
)
3279 FileWriter
*w
= new FileWriter(f
);
3280 for (unsigned i
= 0; i
< MAX_BDEV
; ++i
) {
3282 w
->iocv
[i
] = new IOContext(cct
, NULL
);
3288 void BlueFS::_close_writer(FileWriter
*h
)
3290 dout(10) << __func__
<< " " << h
<< " type " << h
->writer_type
<< dendl
;
3291 for (unsigned i
=0; i
<MAX_BDEV
; ++i
) {
3294 h
->iocv
[i
]->aio_wait();
3295 bdev
[i
]->queue_reap_ioc(h
->iocv
[i
]);
3302 int BlueFS::open_for_read(
3303 const string
& dirname
,
3304 const string
& filename
,
3308 std::lock_guard
l(lock
);
3309 dout(10) << __func__
<< " " << dirname
<< "/" << filename
3310 << (random
? " (random)":" (sequential)") << dendl
;
3311 map
<string
,DirRef
>::iterator p
= dir_map
.find(dirname
);
3312 if (p
== dir_map
.end()) {
3313 dout(20) << __func__
<< " dir " << dirname
<< " not found" << dendl
;
3316 DirRef dir
= p
->second
;
3318 map
<string
,FileRef
>::iterator q
= dir
->file_map
.find(filename
);
3319 if (q
== dir
->file_map
.end()) {
3320 dout(20) << __func__
<< " dir " << dirname
<< " (" << dir
3321 << ") file " << filename
3322 << " not found" << dendl
;
3325 File
*file
= q
->second
.get();
3327 *h
= new FileReader(file
, random
? 4096 : cct
->_conf
->bluefs_max_prefetch
,
3329 dout(10) << __func__
<< " h " << *h
<< " on " << file
->fnode
<< dendl
;
3334 const string
& old_dirname
, const string
& old_filename
,
3335 const string
& new_dirname
, const string
& new_filename
)
3337 std::lock_guard
l(lock
);
3338 dout(10) << __func__
<< " " << old_dirname
<< "/" << old_filename
3339 << " -> " << new_dirname
<< "/" << new_filename
<< dendl
;
3340 map
<string
,DirRef
>::iterator p
= dir_map
.find(old_dirname
);
3341 if (p
== dir_map
.end()) {
3342 dout(20) << __func__
<< " dir " << old_dirname
<< " not found" << dendl
;
3345 DirRef old_dir
= p
->second
;
3346 map
<string
,FileRef
>::iterator q
= old_dir
->file_map
.find(old_filename
);
3347 if (q
== old_dir
->file_map
.end()) {
3348 dout(20) << __func__
<< " dir " << old_dirname
<< " (" << old_dir
3349 << ") file " << old_filename
3350 << " not found" << dendl
;
3353 FileRef file
= q
->second
;
3355 p
= dir_map
.find(new_dirname
);
3356 if (p
== dir_map
.end()) {
3357 dout(20) << __func__
<< " dir " << new_dirname
<< " not found" << dendl
;
3360 DirRef new_dir
= p
->second
;
3361 q
= new_dir
->file_map
.find(new_filename
);
3362 if (q
!= new_dir
->file_map
.end()) {
3363 dout(20) << __func__
<< " dir " << new_dirname
<< " (" << old_dir
3364 << ") file " << new_filename
3365 << " already exists, unlinking" << dendl
;
3366 ceph_assert(q
->second
!= file
);
3367 log_t
.op_dir_unlink(new_dirname
, new_filename
);
3368 _drop_link(q
->second
);
3371 dout(10) << __func__
<< " " << new_dirname
<< "/" << new_filename
<< " "
3372 << " " << file
->fnode
<< dendl
;
3374 new_dir
->file_map
[new_filename
] = file
;
3375 old_dir
->file_map
.erase(old_filename
);
3377 log_t
.op_dir_link(new_dirname
, new_filename
, file
->fnode
.ino
);
3378 log_t
.op_dir_unlink(old_dirname
, old_filename
);
3382 int BlueFS::mkdir(const string
& dirname
)
3384 std::lock_guard
l(lock
);
3385 dout(10) << __func__
<< " " << dirname
<< dendl
;
3386 map
<string
,DirRef
>::iterator p
= dir_map
.find(dirname
);
3387 if (p
!= dir_map
.end()) {
3388 dout(20) << __func__
<< " dir " << dirname
<< " exists" << dendl
;
3391 dir_map
[dirname
] = ceph::make_ref
<Dir
>();
3392 log_t
.op_dir_create(dirname
);
3396 int BlueFS::rmdir(const string
& dirname
)
3398 std::lock_guard
l(lock
);
3399 dout(10) << __func__
<< " " << dirname
<< dendl
;
3400 map
<string
,DirRef
>::iterator p
= dir_map
.find(dirname
);
3401 if (p
== dir_map
.end()) {
3402 dout(20) << __func__
<< " dir " << dirname
<< " does not exist" << dendl
;
3405 DirRef dir
= p
->second
;
3406 if (!dir
->file_map
.empty()) {
3407 dout(20) << __func__
<< " dir " << dirname
<< " not empty" << dendl
;
3410 dir_map
.erase(dirname
);
3411 log_t
.op_dir_remove(dirname
);
3415 bool BlueFS::dir_exists(const string
& dirname
)
3417 std::lock_guard
l(lock
);
3418 map
<string
,DirRef
>::iterator p
= dir_map
.find(dirname
);
3419 bool exists
= p
!= dir_map
.end();
3420 dout(10) << __func__
<< " " << dirname
<< " = " << (int)exists
<< dendl
;
3424 int BlueFS::stat(const string
& dirname
, const string
& filename
,
3425 uint64_t *size
, utime_t
*mtime
)
3427 std::lock_guard
l(lock
);
3428 dout(10) << __func__
<< " " << dirname
<< "/" << filename
<< dendl
;
3429 map
<string
,DirRef
>::iterator p
= dir_map
.find(dirname
);
3430 if (p
== dir_map
.end()) {
3431 dout(20) << __func__
<< " dir " << dirname
<< " not found" << dendl
;
3434 DirRef dir
= p
->second
;
3435 map
<string
,FileRef
>::iterator q
= dir
->file_map
.find(filename
);
3436 if (q
== dir
->file_map
.end()) {
3437 dout(20) << __func__
<< " dir " << dirname
<< " (" << dir
3438 << ") file " << filename
3439 << " not found" << dendl
;
3442 File
*file
= q
->second
.get();
3443 dout(10) << __func__
<< " " << dirname
<< "/" << filename
3444 << " " << file
->fnode
<< dendl
;
3446 *size
= file
->fnode
.size
;
3448 *mtime
= file
->fnode
.mtime
;
3452 int BlueFS::lock_file(const string
& dirname
, const string
& filename
,
3455 std::lock_guard
l(lock
);
3456 dout(10) << __func__
<< " " << dirname
<< "/" << filename
<< dendl
;
3457 map
<string
,DirRef
>::iterator p
= dir_map
.find(dirname
);
3458 if (p
== dir_map
.end()) {
3459 dout(20) << __func__
<< " dir " << dirname
<< " not found" << dendl
;
3462 DirRef dir
= p
->second
;
3463 map
<string
,FileRef
>::iterator q
= dir
->file_map
.find(filename
);
3465 if (q
== dir
->file_map
.end()) {
3466 dout(20) << __func__
<< " dir " << dirname
<< " (" << dir
3467 << ") file " << filename
3468 << " not found, creating" << dendl
;
3469 file
= ceph::make_ref
<File
>();
3470 file
->fnode
.ino
= ++ino_last
;
3471 file
->fnode
.mtime
= ceph_clock_now();
3472 file_map
[ino_last
] = file
;
3473 dir
->file_map
[filename
] = file
;
3475 log_t
.op_file_update(file
->fnode
);
3476 log_t
.op_dir_link(dirname
, filename
, file
->fnode
.ino
);
3480 dout(10) << __func__
<< " already locked" << dendl
;
3484 file
->locked
= true;
3485 *plock
= new FileLock(file
);
3486 dout(10) << __func__
<< " locked " << file
->fnode
3487 << " with " << *plock
<< dendl
;
3491 int BlueFS::unlock_file(FileLock
*fl
)
3493 std::lock_guard
l(lock
);
3494 dout(10) << __func__
<< " " << fl
<< " on " << fl
->file
->fnode
<< dendl
;
3495 ceph_assert(fl
->file
->locked
);
3496 fl
->file
->locked
= false;
3501 int BlueFS::readdir(const string
& dirname
, vector
<string
> *ls
)
3503 std::lock_guard
l(lock
);
3504 dout(10) << __func__
<< " " << dirname
<< dendl
;
3505 if (dirname
.empty()) {
3507 ls
->reserve(dir_map
.size() + 2);
3508 for (auto& q
: dir_map
) {
3509 ls
->push_back(q
.first
);
3512 // list files in dir
3513 map
<string
,DirRef
>::iterator p
= dir_map
.find(dirname
);
3514 if (p
== dir_map
.end()) {
3515 dout(20) << __func__
<< " dir " << dirname
<< " not found" << dendl
;
3518 DirRef dir
= p
->second
;
3519 ls
->reserve(dir
->file_map
.size() + 2);
3520 for (auto& q
: dir
->file_map
) {
3521 ls
->push_back(q
.first
);
3525 ls
->push_back("..");
3529 int BlueFS::unlink(const string
& dirname
, const string
& filename
)
3531 std::lock_guard
l(lock
);
3532 dout(10) << __func__
<< " " << dirname
<< "/" << filename
<< dendl
;
3533 map
<string
,DirRef
>::iterator p
= dir_map
.find(dirname
);
3534 if (p
== dir_map
.end()) {
3535 dout(20) << __func__
<< " dir " << dirname
<< " not found" << dendl
;
3538 DirRef dir
= p
->second
;
3539 map
<string
,FileRef
>::iterator q
= dir
->file_map
.find(filename
);
3540 if (q
== dir
->file_map
.end()) {
3541 dout(20) << __func__
<< " file " << dirname
<< "/" << filename
3542 << " not found" << dendl
;
3545 FileRef file
= q
->second
;
3547 dout(20) << __func__
<< " file " << dirname
<< "/" << filename
3548 << " is locked" << dendl
;
3551 dir
->file_map
.erase(filename
);
3552 log_t
.op_dir_unlink(dirname
, filename
);
3557 bool BlueFS::wal_is_rotational()
3559 if (bdev
[BDEV_WAL
]) {
3560 return bdev
[BDEV_WAL
]->is_rotational();
3561 } else if (bdev
[BDEV_DB
]) {
3562 return bdev
[BDEV_DB
]->is_rotational();
3564 return bdev
[BDEV_SLOW
]->is_rotational();
3567 void BlueFS::debug_inject_duplicate_gift(unsigned id
,
3571 dout(0) << __func__
<< dendl
;
3572 if (id
< alloc
.size() && alloc
[id
]) {
3573 alloc
[id
]->init_add_free(offset
, len
);
3577 // ===============================================
3578 // OriginalVolumeSelector
3580 void* OriginalVolumeSelector::get_hint_by_device(uint8_t dev
) const {
3581 return reinterpret_cast<void*>(dev
);
3583 void* OriginalVolumeSelector::get_hint_by_dir(const string
& dirname
) const {
3584 uint8_t res
= BlueFS::BDEV_DB
;
3585 if (dirname
.length() > 5) {
3586 // the "db.slow" and "db.wal" directory names are hard-coded at
3587 // match up with bluestore. the slow device is always the second
3588 // one (when a dedicated block.db device is present and used at
3589 // bdev 0). the wal device is always last.
3590 if (boost::algorithm::ends_with(dirname
, ".slow")) {
3591 res
= BlueFS::BDEV_SLOW
;
3593 else if (boost::algorithm::ends_with(dirname
, ".wal")) {
3594 res
= BlueFS::BDEV_WAL
;
3597 return reinterpret_cast<void*>(res
);
3600 uint8_t OriginalVolumeSelector::select_prefer_bdev(void* hint
)
3602 return (uint8_t)(reinterpret_cast<uint64_t>(hint
));
3605 void OriginalVolumeSelector::get_paths(const std::string
& base
, paths
& res
) const
3607 res
.emplace_back(base
, db_total
);
3608 res
.emplace_back(base
+ ".slow", slow_total
);
3612 #define dout_prefix *_dout << "OriginalVolumeSelector: "
3614 void OriginalVolumeSelector::dump(ostream
& sout
) {
3615 sout
<< "wal_total:" << wal_total
3616 << ", db_total:" << db_total
3617 << ", slow_total:" << slow_total