1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 #include "boost/algorithm/string.hpp"
5 #include "bluestore_common.h"
8 #include "common/debug.h"
9 #include "common/errno.h"
10 #include "common/perf_counters.h"
11 #include "BlockDevice.h"
12 #include "Allocator.h"
13 #include "include/ceph_assert.h"
14 #include "common/admin_socket.h"
16 #define dout_context cct
17 #define dout_subsys ceph_subsys_bluefs
19 #define dout_prefix *_dout << "bluefs "
20 using TOPNSPC::common::cmd_getval
;
21 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::File
, bluefs_file
, bluefs
);
22 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::Dir
, bluefs_dir
, bluefs
);
23 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileWriter
, bluefs_file_writer
, bluefs_file_writer
);
24 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileReaderBuffer
,
25 bluefs_file_reader_buffer
, bluefs_file_reader
);
26 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileReader
, bluefs_file_reader
, bluefs_file_reader
);
27 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileLock
, bluefs_file_lock
, bluefs
);
29 static void wal_discard_cb(void *priv
, void* priv2
) {
30 BlueFS
*bluefs
= static_cast<BlueFS
*>(priv
);
31 interval_set
<uint64_t> *tmp
= static_cast<interval_set
<uint64_t>*>(priv2
);
32 bluefs
->handle_discard(BlueFS::BDEV_WAL
, *tmp
);
35 static void db_discard_cb(void *priv
, void* priv2
) {
36 BlueFS
*bluefs
= static_cast<BlueFS
*>(priv
);
37 interval_set
<uint64_t> *tmp
= static_cast<interval_set
<uint64_t>*>(priv2
);
38 bluefs
->handle_discard(BlueFS::BDEV_DB
, *tmp
);
41 static void slow_discard_cb(void *priv
, void* priv2
) {
42 BlueFS
*bluefs
= static_cast<BlueFS
*>(priv
);
43 interval_set
<uint64_t> *tmp
= static_cast<interval_set
<uint64_t>*>(priv2
);
44 bluefs
->handle_discard(BlueFS::BDEV_SLOW
, *tmp
);
47 class BlueFS::SocketHook
: public AdminSocketHook
{
50 static BlueFS::SocketHook
* create(BlueFS
* bluefs
)
52 BlueFS::SocketHook
* hook
= nullptr;
53 AdminSocket
* admin_socket
= bluefs
->cct
->get_admin_socket();
55 hook
= new BlueFS::SocketHook(bluefs
);
56 int r
= admin_socket
->register_command("bluestore bluefs available "
57 "name=alloc_size,type=CephInt,req=false",
59 "Report available space for bluefs. "
60 "If alloc_size set, make simulation.");
62 ldout(bluefs
->cct
, 1) << __func__
<< " cannot register SocketHook" << dendl
;
66 r
= admin_socket
->register_command("bluefs stats",
68 "Dump internal statistics for bluefs."
71 r
= admin_socket
->register_command("bluefs debug_inject_read_zeros", hook
,
72 "Injects 8K zeros into next BlueFS read. Debug only.");
80 AdminSocket
* admin_socket
= bluefs
->cct
->get_admin_socket();
81 admin_socket
->unregister_commands(this);
84 SocketHook(BlueFS
* bluefs
) :
86 int call(std::string_view command
, const cmdmap_t
& cmdmap
,
89 bufferlist
& out
) override
{
90 if (command
== "bluestore bluefs available") {
91 int64_t alloc_size
= 0;
92 cmd_getval(cmdmap
, "alloc_size", alloc_size
);
93 if ((alloc_size
& (alloc_size
- 1)) != 0) {
94 errss
<< "Invalid allocation size:'" << alloc_size
<< std::endl
;
98 alloc_size
= bluefs
->cct
->_conf
->bluefs_alloc_size
;
99 f
->open_object_section("bluefs_available_space");
100 for (unsigned dev
= BDEV_WAL
; dev
<= BDEV_SLOW
; dev
++) {
101 if (bluefs
->bdev
[dev
]) {
102 f
->open_object_section("dev");
103 f
->dump_string("device", bluefs
->get_device_name(dev
));
104 ceph_assert(bluefs
->alloc
[dev
]);
105 f
->dump_int("free", bluefs
->alloc
[dev
]->get_free());
109 size_t extra_space
= 0;
110 if (bluefs
->slow_dev_expander
) {
111 extra_space
= bluefs
->slow_dev_expander
->available_freespace(alloc_size
);
113 f
->dump_int("available_from_bluestore", extra_space
);
115 } else if (command
== "bluefs stats") {
116 std::stringstream ss
;
117 bluefs
->dump_block_extents(ss
);
118 bluefs
->dump_volume_selector(ss
);
120 } else if (command
== "bluefs debug_inject_read_zeros") {
121 bluefs
->inject_read_zeros
++;
123 errss
<< "Invalid command" << std::endl
;
// BlueFS constructor: wires the per-device discard callbacks and registers
// the admin-socket hook.
// NOTE(review): extraction artifact — the member-initializer list and several
// body lines (original lines 131-135, 140+) are elided from this chunk;
// verify against the upstream file before editing.
130 BlueFS::BlueFS(CephContext
* cct
)
// Route discard completions for each device class to the matching
// free-standing callback defined at the top of this file.
136 discard_cb
[BDEV_WAL
] = wal_discard_cb
;
137 discard_cb
[BDEV_DB
] = db_discard_cb
;
138 discard_cb
[BDEV_SLOW
] = slow_discard_cb
;
// Expose the "bluestore bluefs available" / "bluefs stats" admin commands.
139 asok_hook
= SocketHook::create(this);
149 for (auto p
: bdev
) {
160 void BlueFS::_init_logger()
162 PerfCountersBuilder
b(cct
, "bluefs",
163 l_bluefs_first
, l_bluefs_last
);
164 b
.add_u64_counter(l_bluefs_gift_bytes
, "gift_bytes",
165 "Bytes gifted from BlueStore", NULL
, 0, unit_t(UNIT_BYTES
));
166 b
.add_u64_counter(l_bluefs_reclaim_bytes
, "reclaim_bytes",
167 "Bytes reclaimed by BlueStore", NULL
, 0, unit_t(UNIT_BYTES
));
168 b
.add_u64(l_bluefs_db_total_bytes
, "db_total_bytes",
169 "Total bytes (main db device)",
170 "b", PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
171 b
.add_u64(l_bluefs_db_used_bytes
, "db_used_bytes",
172 "Used bytes (main db device)",
173 "u", PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
174 b
.add_u64(l_bluefs_wal_total_bytes
, "wal_total_bytes",
175 "Total bytes (wal device)",
176 "walb", PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
177 b
.add_u64(l_bluefs_wal_used_bytes
, "wal_used_bytes",
178 "Used bytes (wal device)",
179 "walu", PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
180 b
.add_u64(l_bluefs_slow_total_bytes
, "slow_total_bytes",
181 "Total bytes (slow device)",
182 "slob", PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
183 b
.add_u64(l_bluefs_slow_used_bytes
, "slow_used_bytes",
184 "Used bytes (slow device)",
185 "slou", PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
186 b
.add_u64(l_bluefs_num_files
, "num_files", "File count",
187 "f", PerfCountersBuilder::PRIO_USEFUL
);
188 b
.add_u64(l_bluefs_log_bytes
, "log_bytes", "Size of the metadata log",
189 "jlen", PerfCountersBuilder::PRIO_INTERESTING
, unit_t(UNIT_BYTES
));
190 b
.add_u64_counter(l_bluefs_log_compactions
, "log_compactions",
191 "Compactions of the metadata log");
192 b
.add_u64_counter(l_bluefs_logged_bytes
, "logged_bytes",
193 "Bytes written to the metadata log", "j",
194 PerfCountersBuilder::PRIO_CRITICAL
, unit_t(UNIT_BYTES
));
195 b
.add_u64_counter(l_bluefs_files_written_wal
, "files_written_wal",
196 "Files written to WAL");
197 b
.add_u64_counter(l_bluefs_files_written_sst
, "files_written_sst",
198 "Files written to SSTs");
199 b
.add_u64_counter(l_bluefs_bytes_written_wal
, "bytes_written_wal",
200 "Bytes written to WAL", "wal",
201 PerfCountersBuilder::PRIO_CRITICAL
);
202 b
.add_u64_counter(l_bluefs_bytes_written_sst
, "bytes_written_sst",
203 "Bytes written to SSTs", "sst",
204 PerfCountersBuilder::PRIO_CRITICAL
, unit_t(UNIT_BYTES
));
205 b
.add_u64_counter(l_bluefs_bytes_written_slow
, "bytes_written_slow",
206 "Bytes written to WAL/SSTs at slow device", NULL
,
207 PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
208 b
.add_u64_counter(l_bluefs_max_bytes_wal
, "max_bytes_wal",
209 "Maximum bytes allocated from WAL");
210 b
.add_u64_counter(l_bluefs_max_bytes_db
, "max_bytes_db",
211 "Maximum bytes allocated from DB");
212 b
.add_u64_counter(l_bluefs_max_bytes_slow
, "max_bytes_slow",
213 "Maximum bytes allocated from SLOW");
215 b
.add_u64_counter(l_bluefs_read_random_count
, "read_random_count",
216 "random read requests processed");
217 b
.add_u64_counter(l_bluefs_read_random_bytes
, "read_random_bytes",
218 "Bytes requested in random read mode", NULL
,
219 PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
220 b
.add_u64_counter(l_bluefs_read_random_disk_count
, "read_random_disk_count",
221 "random reads requests going to disk");
222 b
.add_u64_counter(l_bluefs_read_random_disk_bytes
, "read_random_disk_bytes",
223 "Bytes read from disk in random read mode", NULL
,
224 PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
225 b
.add_u64_counter(l_bluefs_read_random_buffer_count
, "read_random_buffer_count",
226 "random read requests processed using prefetch buffer");
227 b
.add_u64_counter(l_bluefs_read_random_buffer_bytes
, "read_random_buffer_bytes",
228 "Bytes read from prefetch buffer in random read mode", NULL
,
229 PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
231 b
.add_u64_counter(l_bluefs_read_count
, "read_count",
232 "buffered read requests processed");
233 b
.add_u64_counter(l_bluefs_read_bytes
, "read_bytes",
234 "Bytes requested in buffered read mode", NULL
,
235 PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
237 b
.add_u64_counter(l_bluefs_read_prefetch_count
, "read_prefetch_count",
238 "prefetch read requests processed");
239 b
.add_u64_counter(l_bluefs_read_prefetch_bytes
, "read_prefetch_bytes",
240 "Bytes requested in prefetch read mode", NULL
,
241 PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
242 b
.add_u64(l_bluefs_read_zeros_candidate
, "read_zeros_candidate",
243 "How many times bluefs read found page with all 0s");
244 b
.add_u64(l_bluefs_read_zeros_errors
, "read_zeros_errors",
245 "How many times bluefs read found transient page with all 0s");
247 logger
= b
.create_perf_counters();
248 cct
->get_perfcounters_collection()->add(logger
);
// Unregister the bluefs perf counters from the global collection on shutdown.
// NOTE(review): extraction artifact — the enclosing braces and the logger
// teardown lines (original 252, 254-255) appear to be elided here; confirm
// against upstream.
251 void BlueFS::_shutdown_logger()
253 cct
->get_perfcounters_collection()->remove(logger
);
// Refresh the gauge-style perf counters (file count, log size, and per-device
// total/used bytes). Caller must hold the bluefs lock (see comment below).
// NOTE(review): extraction artifact — braces and some lines are elided.
257 void BlueFS::_update_logger_stats()
259 // we must be holding the lock
260 logger
->set(l_bluefs_num_files
, file_map
.size());
261 logger
->set(l_bluefs_log_bytes
, log_writer
->file
->fnode
.size
);
// WAL device: total = extents owned by bluefs, used = total - allocator free.
263 if (alloc
[BDEV_WAL
]) {
264 logger
->set(l_bluefs_wal_total_bytes
, block_all
[BDEV_WAL
].size());
265 logger
->set(l_bluefs_wal_used_bytes
,
266 block_all
[BDEV_WAL
].size() - alloc
[BDEV_WAL
]->get_free());
// DB device: same computation as WAL above.
268 if (alloc
[BDEV_DB
]) {
269 logger
->set(l_bluefs_db_total_bytes
, block_all
[BDEV_DB
].size());
270 logger
->set(l_bluefs_db_used_bytes
,
271 block_all
[BDEV_DB
].size() - alloc
[BDEV_DB
]->get_free());
// Slow (shared) device: same computation again.
273 if (alloc
[BDEV_SLOW
]) {
274 logger
->set(l_bluefs_slow_total_bytes
, block_all
[BDEV_SLOW
].size());
275 logger
->set(l_bluefs_slow_used_bytes
,
276 block_all
[BDEV_SLOW
].size() - alloc
[BDEV_SLOW
]->get_free());
// Attach a block device to slot `id` (must be empty). Creates the device
// with this instance's discard callback, opens it, optionally discards its
// whole range, and allocates its IOContext.
// NOTE(review): extraction artifact — the open-error check (orig. 292-296),
// trim condition (orig. ~295/298) and `bdev[id] = b` / return lines are
// elided from this chunk.
280 int BlueFS::add_block_device(unsigned id
, const string
& path
, bool trim
,
281 bool shared_with_bluestore
)
283 dout(10) << __func__
<< " bdev " << id
<< " path " << path
<< dendl
;
284 ceph_assert(id
< bdev
.size());
285 ceph_assert(bdev
[id
] == NULL
);
// Discard completions for this device are routed back through discard_cb[id].
286 BlockDevice
*b
= BlockDevice::create(cct
, path
, NULL
, NULL
,
287 discard_cb
[id
], static_cast<void*>(this));
// A device shared with bluestore must not take an exclusive advisory lock.
288 if (shared_with_bluestore
) {
289 b
->set_no_exclusive_lock();
291 int r
= b
->open(path
);
// Whole-device discard (TRIM) before first use.
297 b
->discard(0, b
->get_size());
300 dout(1) << __func__
<< " bdev " << id
<< " path " << path
301 << " size " << byte_u_t(b
->get_size()) << dendl
;
303 ioc
[id
] = new IOContext(cct
, NULL
);
307 bool BlueFS::bdev_support_label(unsigned id
)
309 ceph_assert(id
< bdev
.size());
310 ceph_assert(bdev
[id
]);
311 return bdev
[id
]->supported_bdev_label();
314 uint64_t BlueFS::get_block_device_size(unsigned id
)
316 if (id
< bdev
.size() && bdev
[id
])
317 return bdev
[id
]->get_size();
// Gift the extent [offset, offset+length) on device `id` to bluefs: record
// it in block_all, journal an op_alloc_add, and hand it to the allocator.
// NOTE(review): extraction artifact — trailing parameter(s) (orig. 322-323),
// some conditionals and the closing braces are elided from this chunk.
321 void BlueFS::_add_block_extent(unsigned id
, uint64_t offset
, uint64_t length
,
324 dout(1) << __func__
<< " bdev " << id
325 << " 0x" << std::hex
<< offset
<< "~" << length
<< std::dec
// Extent must lie entirely within the (existing) device.
329 ceph_assert(id
< bdev
.size());
330 ceph_assert(bdev
[id
]);
331 ceph_assert(bdev
[id
]->get_size() >= offset
+ length
);
332 block_all
[id
].insert(offset
, length
);
// Only journal/feed the allocator if this device already has one.
334 if (id
< alloc
.size() && alloc
[id
]) {
336 log_t
.op_alloc_add(id
, offset
, length
);
338 alloc
[id
]->init_add_free(offset
, length
);
// Account the gifted bytes in the perf counters.
342 logger
->inc(l_bluefs_gift_bytes
, length
);
343 dout(10) << __func__
<< " done" << dendl
;
346 int BlueFS::reclaim_blocks(unsigned id
, uint64_t want
,
347 PExtentVector
*extents
)
349 std::unique_lock
l(lock
);
350 dout(1) << __func__
<< " bdev " << id
351 << " want 0x" << std::hex
<< want
<< std::dec
<< dendl
;
352 ceph_assert(id
< alloc
.size());
353 ceph_assert(alloc
[id
]);
356 interval_set
<uint64_t> granular
;
357 while (want
> 0 && !block_unused_too_granular
[id
].empty()) {
358 auto p
= block_unused_too_granular
[id
].begin();
359 dout(20) << __func__
<< " unused " << (int)id
<< ":"
360 << std::hex
<< p
.get_start() << "~" << p
.get_len() << dendl
;
361 extents
->push_back({p
.get_start(), p
.get_len()});
362 granular
.insert(p
.get_start(), p
.get_len());
363 if (want
>= p
.get_len()) {
369 block_unused_too_granular
[id
].erase(p
);
373 got
+= alloc
[id
]->allocate(want
, alloc_size
[id
], 0, extents
);
374 ceph_assert(got
!= 0);
376 derr
<< __func__
<< " failed to allocate space to return to bluestore"
379 block_unused_too_granular
[id
].insert(granular
);
383 for (auto& p
: *extents
) {
384 block_all
[id
].erase(p
.offset
, p
.length
);
385 log_t
.op_alloc_rm(id
, p
.offset
, p
.length
);
389 int r
= _flush_and_sync_log(l
);
393 logger
->inc(l_bluefs_reclaim_bytes
, got
);
394 dout(1) << __func__
<< " bdev " << id
<< " want 0x" << std::hex
<< want
395 << " got " << *extents
<< dendl
;
399 void BlueFS::handle_discard(unsigned id
, interval_set
<uint64_t>& to_release
)
401 dout(10) << __func__
<< " bdev " << id
<< dendl
;
402 ceph_assert(alloc
[id
]);
403 alloc
[id
]->release(to_release
);
406 uint64_t BlueFS::get_used()
408 std::lock_guard
l(lock
);
410 for (unsigned id
= 0; id
< MAX_BDEV
; ++id
) {
412 used
+= block_all
[id
].size() - alloc
[id
]->get_free();
418 uint64_t BlueFS::get_total(unsigned id
)
420 std::lock_guard
l(lock
);
421 ceph_assert(id
< block_all
.size());
422 return block_all
[id
].size();
425 uint64_t BlueFS::get_free(unsigned id
)
427 std::lock_guard
l(lock
);
428 ceph_assert(id
< alloc
.size());
429 return alloc
[id
]->get_free();
432 void BlueFS::dump_perf_counters(Formatter
*f
)
434 f
->open_object_section("bluefs_perf_counters");
435 logger
->dump_formatted(f
,0);
// Human-readable per-device summary: device size, extents owned by bluefs,
// bytes in use, and (for the slow device) how much more bluestore could gift.
// NOTE(review): extraction artifact — guard conditions, stream prefixes and
// closing braces (orig. 442-444, 447, 450, 457, 460-462) are elided here.
439 void BlueFS::dump_block_extents(ostream
& out
)
441 for (unsigned i
= 0; i
< MAX_BDEV
; ++i
) {
445 auto owned
= get_total(i
);
446 auto free
= get_free(i
);
448 out
<< i
<< " : device size 0x" << std::hex
<< bdev
[i
]->get_size()
449 << " : own 0x" << block_all
[i
]
451 << " : using 0x" << owned
- free
452 << std::dec
<< "(" << byte_u_t(owned
- free
) << ")";
// For the slow device, also report space bluestore could still hand over.
453 if (i
== _get_slow_device_id()) {
454 ceph_assert(slow_dev_expander
);
455 ceph_assert(alloc
[i
]);
456 free
= slow_dev_expander
->available_freespace(alloc_size
[i
]);
458 << " : bluestore has 0x" << free
459 << std::dec
<< "(" << byte_u_t(free
) << ") available";
465 void BlueFS::get_usage(vector
<pair
<uint64_t,uint64_t>> *usage
)
467 std::lock_guard
l(lock
);
468 usage
->resize(bdev
.size());
469 for (unsigned id
= 0; id
< bdev
.size(); ++id
) {
471 (*usage
)[id
] = make_pair(0, 0);
474 (*usage
)[id
].first
= alloc
[id
]->get_free();
475 (*usage
)[id
].second
= block_all
[id
].size();
477 (block_all
[id
].size() - (*usage
)[id
].first
) * 100 / block_all
[id
].size();
478 dout(10) << __func__
<< " bdev " << id
479 << " free " << (*usage
)[id
].first
480 << " (" << byte_u_t((*usage
)[id
].first
) << ")"
481 << " / " << (*usage
)[id
].second
482 << " (" << byte_u_t((*usage
)[id
].second
) << ")"
483 << ", used " << used
<< "%"
488 int BlueFS::get_block_extents(unsigned id
, interval_set
<uint64_t> *extents
)
490 std::lock_guard
l(lock
);
491 dout(10) << __func__
<< " bdev " << id
<< dendl
;
492 if (id
>= block_all
.size())
494 *extents
= block_all
[id
];
498 int BlueFS::mkfs(uuid_d osd_uuid
, const bluefs_layout_t
& layout
)
500 std::unique_lock
l(lock
);
502 << " osd_uuid " << osd_uuid
505 // set volume selector if not provided before/outside
506 if (vselector
== nullptr) {
508 new OriginalVolumeSelector(
509 get_block_device_size(BlueFS::BDEV_WAL
) * 95 / 100,
510 get_block_device_size(BlueFS::BDEV_DB
) * 95 / 100,
511 get_block_device_size(BlueFS::BDEV_SLOW
) * 95 / 100));
518 super
.block_size
= bdev
[BDEV_DB
]->get_block_size();
519 super
.osd_uuid
= osd_uuid
;
520 super
.uuid
.generate_random();
521 dout(1) << __func__
<< " uuid " << super
.uuid
<< dendl
;
524 FileRef log_file
= ceph::make_ref
<File
>();
525 log_file
->fnode
.ino
= 1;
526 log_file
->vselector_hint
= vselector
->get_hint_for_log();
528 vselector
->select_prefer_bdev(log_file
->vselector_hint
),
529 cct
->_conf
->bluefs_max_log_runway
,
531 vselector
->add_usage(log_file
->vselector_hint
, log_file
->fnode
);
533 log_writer
= _create_writer(log_file
);
537 for (unsigned bdev
= 0; bdev
< MAX_BDEV
; ++bdev
) {
538 interval_set
<uint64_t>& p
= block_all
[bdev
];
541 for (interval_set
<uint64_t>::iterator q
= p
.begin(); q
!= p
.end(); ++q
) {
542 dout(20) << __func__
<< " op_alloc_add " << bdev
<< " 0x"
543 << std::hex
<< q
.get_start() << "~" << q
.get_len() << std::dec
545 log_t
.op_alloc_add(bdev
, q
.get_start(), q
.get_len());
548 _flush_and_sync_log(l
);
551 super
.log_fnode
= log_file
->fnode
;
552 super
.memorized_layout
= layout
;
553 _write_super(BDEV_DB
);
557 super
= bluefs_super_t();
558 _close_writer(log_writer
);
561 vselector
.reset(nullptr);
565 dout(10) << __func__
<< " success" << dendl
;
569 void BlueFS::_init_alloc()
571 dout(20) << __func__
<< dendl
;
572 alloc
.resize(MAX_BDEV
);
573 alloc_size
.resize(MAX_BDEV
, 0);
574 pending_release
.resize(MAX_BDEV
);
575 block_unused_too_granular
.resize(MAX_BDEV
);
577 if (bdev
[BDEV_WAL
]) {
578 alloc_size
[BDEV_WAL
] = cct
->_conf
->bluefs_alloc_size
;
580 if (bdev
[BDEV_SLOW
]) {
581 alloc_size
[BDEV_DB
] = cct
->_conf
->bluefs_alloc_size
;
582 alloc_size
[BDEV_SLOW
] = cct
->_conf
->bluefs_shared_alloc_size
;
584 alloc_size
[BDEV_DB
] = cct
->_conf
->bluefs_shared_alloc_size
;
586 // new wal and db devices are never shared
587 if (bdev
[BDEV_NEWWAL
]) {
588 alloc_size
[BDEV_NEWWAL
] = cct
->_conf
->bluefs_alloc_size
;
590 if (bdev
[BDEV_NEWDB
]) {
591 alloc_size
[BDEV_NEWDB
] = cct
->_conf
->bluefs_alloc_size
;
594 for (unsigned id
= 0; id
< bdev
.size(); ++id
) {
598 ceph_assert(bdev
[id
]->get_size());
599 std::string name
= "bluefs-";
600 const char* devnames
[] = {"wal","db","slow"};
602 name
+= devnames
[id
];
604 name
+= to_string(uintptr_t(this));
605 ceph_assert(alloc_size
[id
]);
606 dout(1) << __func__
<< " id " << id
607 << " alloc_size 0x" << std::hex
<< alloc_size
[id
]
608 << " size 0x" << bdev
[id
]->get_size() << std::dec
<< dendl
;
609 alloc
[id
] = Allocator::create(cct
, cct
->_conf
->bluefs_allocator
,
610 bdev
[id
]->get_size(),
611 alloc_size
[id
], name
);
612 interval_set
<uint64_t>& p
= block_all
[id
];
613 for (interval_set
<uint64_t>::iterator q
= p
.begin(); q
!= p
.end(); ++q
) {
614 alloc
[id
]->init_add_free(q
.get_start(), q
.get_len());
// Tear down allocation state: iterate devices and allocators (loop bodies
// elided in this chunk — presumably discard-drain and shutdown/delete of
// each allocator; verify against upstream) and drop the too-granular
// bookkeeping.
619 void BlueFS::_stop_alloc()
621 dout(20) << __func__
<< dendl
;
622 for (auto p
: bdev
) {
627 for (auto p
: alloc
) {
634 block_unused_too_granular
.clear();
637 int BlueFS::read(uint8_t ndev
, uint64_t off
, uint64_t len
,
638 ceph::buffer::list
*pbl
, IOContext
*ioc
, bool buffered
)
640 dout(10) << __func__
<< " dev " << int(ndev
)
641 << ": 0x" << std::hex
<< off
<< "~" << len
<< std::dec
642 << (buffered
? " buffered" : "")
646 r
= bdev
[ndev
]->read(off
, len
, &bl
, ioc
, buffered
);
650 uint64_t block_size
= bdev
[ndev
]->get_block_size();
651 if (inject_read_zeros
) {
652 if (len
>= block_size
* 2) {
653 derr
<< __func__
<< " injecting error, zeros at "
654 << int(ndev
) << ": 0x" << std::hex
<< (off
+ len
/ 2)
655 << "~" << (block_size
* 2) << std::dec
<< dendl
;
656 //use beginning, replace 8K in the middle with zeros, use tail
658 bl
.splice(0, len
/ 2 - block_size
, &temp
);
659 temp
.append_zero(block_size
* 2);
660 bl
.splice(block_size
* 2, len
/ 2 - block_size
, &temp
);
665 //make a check if there is a block with all 0
666 uint64_t to_check_len
= len
;
667 uint64_t skip
= p2nphase(off
, block_size
);
668 if (skip
>= to_check_len
) {
671 auto it
= bl
.begin(skip
);
672 to_check_len
-= skip
;
673 bool all_zeros
= false;
674 while (all_zeros
== false && to_check_len
>= block_size
) {
676 unsigned block_left
= block_size
;
680 while (all_zeros
&& block_left
> 0) {
681 avail
= it
.get_ptr_and_advance(block_left
, &data
);
683 all_zeros
= mem_is_zero(data
, avail
);
686 while (block_left
> 0) {
687 avail
= it
.get_ptr_and_advance(block_left
, &data
);
690 to_check_len
-= block_size
;
693 logger
->inc(l_bluefs_read_zeros_candidate
, 1);
694 bufferlist bl_reread
;
695 r
= bdev
[ndev
]->read(off
, len
, &bl_reread
, ioc
, buffered
);
699 // check if both read gave the same
700 if (!bl
.contents_equal(bl_reread
)) {
701 // report problems to log, but continue, maybe it will be good now...
702 derr
<< __func__
<< " initial read of " << int(ndev
)
703 << ": 0x" << std::hex
<< off
<< "~" << len
704 << std::dec
<< ": different then re-read " << dendl
;
705 logger
->inc(l_bluefs_read_zeros_errors
, 1);
707 // use second read will be better if is different
708 pbl
->append(bl_reread
);
715 int BlueFS::read_random(uint8_t ndev
, uint64_t off
, uint64_t len
, char *buf
, bool buffered
)
717 dout(10) << __func__
<< " dev " << int(ndev
)
718 << ": 0x" << std::hex
<< off
<< "~" << len
<< std::dec
719 << (buffered
? " buffered" : "")
722 r
= bdev
[ndev
]->read_random(off
, len
, buf
, buffered
);
726 uint64_t block_size
= bdev
[ndev
]->get_block_size();
727 if (inject_read_zeros
) {
728 if (len
>= block_size
* 2) {
729 derr
<< __func__
<< " injecting error, zeros at "
730 << int(ndev
) << ": 0x" << std::hex
<< (off
+ len
/ 2)
731 << "~" << (block_size
* 2) << std::dec
<< dendl
;
733 memset(buf
+ len
/ 2 - block_size
, 0, block_size
* 2);
737 //make a check if there is a block with all 0
738 uint64_t to_check_len
= len
;
739 const char* data
= buf
;
740 uint64_t skip
= p2nphase(off
, block_size
);
741 if (skip
>= to_check_len
) {
744 to_check_len
-= skip
;
747 bool all_zeros
= false;
748 while (all_zeros
== false && to_check_len
>= block_size
) {
749 if (mem_is_zero(data
, block_size
)) {
750 // at least one block is all zeros
755 to_check_len
-= block_size
;
758 logger
->inc(l_bluefs_read_zeros_candidate
, 1);
759 std::unique_ptr
<char[]> data_reread(new char[len
]);
760 r
= bdev
[ndev
]->read_random(off
, len
, &data_reread
[0], buffered
);
764 // check if both read gave the same
765 if (memcmp(buf
, &data_reread
[0], len
) != 0) {
766 derr
<< __func__
<< " initial read of " << int(ndev
)
767 << ": 0x" << std::hex
<< off
<< "~" << len
768 << std::dec
<< ": different then re-read " << dendl
;
769 logger
->inc(l_bluefs_read_zeros_errors
, 1);
770 // second read is probably better
771 memcpy(buf
, &data_reread
[0], len
);
779 dout(1) << __func__
<< dendl
;
781 int r
= _open_super();
783 derr
<< __func__
<< " failed to open super: " << cpp_strerror(r
) << dendl
;
787 // set volume selector if not provided before/outside
788 if (vselector
== nullptr) {
790 new OriginalVolumeSelector(
791 get_block_device_size(BlueFS::BDEV_WAL
) * 95 / 100,
792 get_block_device_size(BlueFS::BDEV_DB
) * 95 / 100,
793 get_block_device_size(BlueFS::BDEV_SLOW
) * 95 / 100));
797 block_all
.resize(MAX_BDEV
);
801 r
= _replay(false, false);
803 derr
<< __func__
<< " failed to replay log: " << cpp_strerror(r
) << dendl
;
809 for (auto& p
: file_map
) {
810 dout(30) << __func__
<< " noting alloc for " << p
.second
->fnode
<< dendl
;
811 for (auto& q
: p
.second
->fnode
.extents
) {
812 alloc
[q
.bdev
]->init_rm_free(q
.offset
, q
.length
);
816 // set up the log for future writes
817 log_writer
= _create_writer(_get_file(1));
818 ceph_assert(log_writer
->file
->fnode
.ino
== 1);
819 log_writer
->pos
= log_writer
->file
->fnode
.size
;
820 dout(10) << __func__
<< " log write pos set to 0x"
821 << std::hex
<< log_writer
->pos
<< std::dec
827 super
= bluefs_super_t();
// Compare the caller's expected device layout against the layout memorized
// in the superblock, if any; logs a positive/negative verdict.
// NOTE(review): extraction artifact — the return statements (orig. 836,
// 838-840, 842-845) are elided from this chunk.
831 int BlueFS::maybe_verify_layout(const bluefs_layout_t
& layout
) const
833 if (super
.memorized_layout
) {
834 if (layout
== *super
.memorized_layout
) {
835 dout(10) << __func__
<< " bluefs layout verified positively" << dendl
;
837 derr
<< __func__
<< " memorized layout doesn't fit current one" << dendl
;
// No layout memorized: nothing to verify against.
841 dout(10) << __func__
<< " no memorized_layout in bluefs superblock"
// Unmount: flush metadata (optionally skipping compaction), close the log
// writer, drop the volume selector, and reset the superblock.
// NOTE(review): extraction artifact — several teardown lines (orig. 853,
// 855-856, 858-860, 862+) are elided from this chunk.
848 void BlueFS::umount(bool avoid_compact
)
850 dout(1) << __func__
<< dendl
;
852 sync_metadata(avoid_compact
);
854 _close_writer(log_writer
);
857 vselector
.reset(nullptr);
// Clear the in-memory superblock so a stale mount state can't be reused.
861 super
= bluefs_super_t();
// Prepare migration to a new DB or WAL device by rewriting the bluefs log
// onto the appropriate device. For BDEV_NEWDB the new log device defaults to
// the WAL unless no WAL exists, in which case the new DB device is used now
// and the (renamed) DB device afterwards.
// NOTE(review): extraction artifact — the argument lists of both
// _rewrite_log_and_layout_sync calls (orig. 878-883, 886-893) and the return
// are elided from this chunk.
866 int BlueFS::prepare_new_device(int id
, const bluefs_layout_t
& layout
)
868 dout(1) << __func__
<< dendl
;
870 if(id
== BDEV_NEWDB
) {
871 int new_log_dev_cur
= BDEV_WAL
;
872 int new_log_dev_next
= BDEV_WAL
;
// No standalone WAL: log lives on the new DB device for now, DB after rename.
873 if (!bdev
[BDEV_WAL
]) {
874 new_log_dev_cur
= BDEV_NEWDB
;
875 new_log_dev_next
= BDEV_DB
;
877 _rewrite_log_and_layout_sync(false,
884 } else if(id
== BDEV_NEWWAL
) {
885 _rewrite_log_and_layout_sync(false,
897 void BlueFS::collect_metadata(map
<string
,string
> *pm
, unsigned skip_bdev_id
)
899 if (skip_bdev_id
!= BDEV_DB
&& bdev
[BDEV_DB
])
900 bdev
[BDEV_DB
]->collect_metadata("bluefs_db_", pm
);
902 bdev
[BDEV_WAL
]->collect_metadata("bluefs_wal_", pm
);
905 void BlueFS::get_devices(set
<string
> *ls
)
907 for (unsigned i
= 0; i
< MAX_BDEV
; ++i
) {
909 bdev
[i
]->get_devices(ls
);
916 std::lock_guard
l(lock
);
917 dout(1) << __func__
<< dendl
;
918 // hrm, i think we check everything on mount...
// Serialize the in-memory superblock, pad it to the fixed superblock length,
// and write it at the fixed superblock offset on device `dev`.
// NOTE(review): extraction artifact — the bufferlist declaration / encode
// step (orig. 923-926, 928) and the flush/return tail (orig. 939-941) are
// elided from this chunk; `bl` and `crc` are computed in the missing lines.
922 int BlueFS::_write_super(int dev
)
927 uint32_t crc
= bl
.crc32c(-1);
929 dout(10) << __func__
<< " super block length(encoded): " << bl
.length() << dendl
;
930 dout(10) << __func__
<< " superblock " << super
.version
<< dendl
;
931 dout(10) << __func__
<< " log_fnode " << super
.log_fnode
<< dendl
;
// The encoded superblock must fit the reserved on-disk slot; zero-pad so the
// write always covers the full slot.
932 ceph_assert_always(bl
.length() <= get_super_length());
933 bl
.append_zero(get_super_length() - bl
.length());
935 bdev
[dev
]->write(get_super_offset(), bl
, false, WRITE_LIFE_SHORT
);
936 dout(20) << __func__
<< " v " << super
.version
937 << " crc 0x" << std::hex
<< crc
938 << " offset 0x" << get_super_offset() << std::dec
943 int BlueFS::_open_super()
945 dout(10) << __func__
<< dendl
;
948 uint32_t expected_crc
, crc
;
951 // always the second block
952 r
= bdev
[BDEV_DB
]->read(get_super_offset(), get_super_length(),
953 &bl
, ioc
[BDEV_DB
], false);
957 auto p
= bl
.cbegin();
961 t
.substr_of(bl
, 0, p
.get_off());
964 decode(expected_crc
, p
);
965 if (crc
!= expected_crc
) {
966 derr
<< __func__
<< " bad crc on superblock, expected 0x"
967 << std::hex
<< expected_crc
<< " != actual 0x" << crc
<< std::dec
971 dout(10) << __func__
<< " superblock " << super
.version
<< dendl
;
972 dout(10) << __func__
<< " log_fnode " << super
.log_fnode
<< dendl
;
976 int BlueFS::_check_new_allocations(const bluefs_fnode_t
& fnode
,
978 boost::dynamic_bitset
<uint64_t>* owned_blocks
,
979 boost::dynamic_bitset
<uint64_t>* used_blocks
)
981 auto& fnode_extents
= fnode
.extents
;
982 for (auto e
: fnode_extents
) {
985 ceph_assert(id
< dev_count
);
986 apply_for_bitset_range(e
.offset
, e
.length
, alloc_size
[id
], owned_blocks
[id
],
987 [&](uint64_t pos
, boost::dynamic_bitset
<uint64_t> &bs
) {
994 derr
<< __func__
<< " invalid extent " << int(id
)
995 << ": 0x" << std::hex
<< e
.offset
<< "~" << e
.length
997 << ": wasn't given but allocated for ino " << fnode
.ino
1002 apply_for_bitset_range(e
.offset
, e
.length
, alloc_size
[id
], used_blocks
[id
],
1003 [&](uint64_t pos
, boost::dynamic_bitset
<uint64_t> &bs
) {
1011 derr
<< __func__
<< " invalid extent " << int(e
.bdev
)
1012 << ": 0x" << std::hex
<< e
.offset
<< "~" << e
.length
1013 << std::dec
<< ": duplicate reference, ino " << fnode
.ino
1021 int BlueFS::_adjust_granularity(
1022 __u8 id
, uint64_t *offset
, uint64_t *length
, bool alloc
)
1024 const char *op
= alloc
? "op_alloc_add" : "op_alloc_rm";
1025 auto oldo
= *offset
;
1026 auto oldl
= *length
;
1027 if (*offset
& (alloc_size
[id
] - 1)) {
1028 *offset
&= ~(alloc_size
[id
] - 1);
1029 *offset
+= alloc_size
[id
];
1030 if (*length
> *offset
- oldo
) {
1032 block_unused_too_granular
[id
].insert(oldo
, *offset
- oldo
);
1034 block_unused_too_granular
[id
].erase(oldo
, *offset
- oldo
);
1036 *length
-= (*offset
- oldo
);
1039 block_unused_too_granular
[id
].insert(oldo
, *length
);
1041 block_unused_too_granular
[id
].erase(oldo
, *length
);
1046 if (*length
& (alloc_size
[id
] - 1)) {
1047 *length
&= ~(alloc_size
[id
] - 1);
1049 block_unused_too_granular
[id
].insert(
1051 oldo
+ oldl
- *offset
- *length
);
1053 block_unused_too_granular
[id
].erase(
1055 oldo
+ oldl
- *offset
- *length
);
1058 if (oldo
!= *offset
|| oldl
!= *length
) {
1059 dout(10) << __func__
<< " " << op
<< " "
1060 << (int)id
<< ":" << std::hex
<< oldo
<< "~" << oldl
1061 << " -> " << (int)id
<< ":" << *offset
<< "~" << *length
<< dendl
;
1066 int BlueFS::_verify_alloc_granularity(
1067 __u8 id
, uint64_t offset
, uint64_t length
, const char *op
)
1069 if ((offset
& (alloc_size
[id
] - 1)) ||
1070 (length
& (alloc_size
[id
] - 1))) {
1071 derr
<< __func__
<< " " << op
<< " of " << (int)id
1072 << ":0x" << std::hex
<< offset
<< "~" << length
<< std::dec
1073 << " does not align to alloc_size 0x"
1074 << std::hex
<< alloc_size
[id
] << std::dec
<< dendl
;
1076 auto need
= alloc_size
[id
];
1077 while (need
&& ((offset
& (need
- 1)) ||
1078 (length
& (need
- 1)))) {
1083 if (id
== BDEV_SLOW
||
1084 (id
== BDEV_DB
&& !bdev
[BDEV_SLOW
])) {
1085 which
= "bluefs_shared_alloc_size";
1087 which
= "bluefs_alloc_size";
1089 derr
<< "work-around by setting " << which
<< " = " << need
1090 << " for this OSD" << dendl
;
1097 int BlueFS::_replay(bool noop
, bool to_stdout
)
1099 dout(10) << __func__
<< (noop
? " NO-OP" : "") << dendl
;
1100 ino_last
= 1; // by the log
1104 log_file
= _get_file(1);
1107 for (auto& a
: block_unused_too_granular
) {
1108 ceph_assert(a
.empty());
1112 log_file
->fnode
= super
.log_fnode
;
1113 log_file
->vselector_hint
=
1114 vselector
->get_hint_for_log();
1116 // do not use fnode from superblock in 'noop' mode - log_file's one should
1117 // be fine and up-to-date
1118 ceph_assert(log_file
->fnode
.ino
== 1);
1119 ceph_assert(log_file
->fnode
.extents
.size() != 0);
1121 dout(10) << __func__
<< " log_fnode " << super
.log_fnode
<< dendl
;
1122 if (unlikely(to_stdout
)) {
1123 std::cout
<< " log_fnode " << super
.log_fnode
<< std::endl
;
1126 FileReader
*log_reader
= new FileReader(
1127 log_file
, cct
->_conf
->bluefs_max_prefetch
,
1129 true); // ignore eof
1131 bool seen_recs
= false;
1133 boost::dynamic_bitset
<uint64_t> used_blocks
[MAX_BDEV
];
1134 boost::dynamic_bitset
<uint64_t> owned_blocks
[MAX_BDEV
];
1136 if (cct
->_conf
->bluefs_log_replay_check_allocations
) {
1137 for (size_t i
= 0; i
< MAX_BDEV
; ++i
) {
1138 if (alloc_size
[i
] != 0 && bdev
[i
] != nullptr) {
1139 used_blocks
[i
].resize(round_up_to(bdev
[i
]->get_size(), alloc_size
[i
]) / alloc_size
[i
]);
1140 owned_blocks
[i
].resize(round_up_to(bdev
[i
]->get_size(), alloc_size
[i
]) / alloc_size
[i
]);
1145 bool first_log_check
= true;
1148 ceph_assert((log_reader
->buf
.pos
& ~super
.block_mask()) == 0);
1149 uint64_t pos
= log_reader
->buf
.pos
;
1150 uint64_t read_pos
= pos
;
1153 int r
= _read(log_reader
, &log_reader
->buf
, read_pos
, super
.block_size
,
1155 if (r
!= (int)super
.block_size
&& cct
->_conf
->bluefs_replay_recovery
) {
1156 r
+= do_replay_recovery_read(log_reader
, pos
, read_pos
+ r
, super
.block_size
- r
, &bl
);
1158 assert(r
== (int)super
.block_size
);
1165 auto p
= bl
.cbegin();
1173 if (len
+ 6 > bl
.length()) {
1174 more
= round_up_to(len
+ 6 - bl
.length(), super
.block_size
);
1177 if (uuid
!= super
.uuid
) {
1179 dout(10) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
1180 << ": stop: uuid " << uuid
<< " != super.uuid " << super
.uuid
1183 derr
<< __func__
<< " 0x" << std::hex
<< pos
<< std::dec
1184 << ": stop: uuid " << uuid
<< " != super.uuid " << super
.uuid
1185 << ", block dump: \n";
1187 t
.substr_of(bl
, 0, super
.block_size
);
1193 if (seq
!= log_seq
+ 1) {
1195 dout(10) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
1196 << ": stop: seq " << seq
<< " != expected " << log_seq
+ 1
1199 derr
<< __func__
<< " 0x" << std::hex
<< pos
<< std::dec
1200 << ": stop: seq " << seq
<< " != expected " << log_seq
+ 1
1206 dout(20) << __func__
<< " need 0x" << std::hex
<< more
<< std::dec
1207 << " more bytes" << dendl
;
1209 int r
= _read(log_reader
, &log_reader
->buf
, read_pos
, more
, &t
, NULL
);
1210 if (r
< (int)more
) {
1211 dout(10) << __func__
<< " 0x" << std::hex
<< pos
1212 << ": stop: len is 0x" << bl
.length() + more
<< std::dec
1213 << ", which is past eof" << dendl
;
1214 if (cct
->_conf
->bluefs_replay_recovery
) {
1215 //try to search for more data
1216 r
+= do_replay_recovery_read(log_reader
, pos
, read_pos
+ r
, more
- r
, &t
);
1217 if (r
< (int)more
) {
1218 //in normal mode we must read r==more, for recovery it is too strict
1223 ceph_assert(r
== (int)more
);
1228 bluefs_transaction_t t
;
1230 auto p
= bl
.cbegin();
1233 catch (buffer::error
& e
) {
1234 derr
<< __func__
<< " 0x" << std::hex
<< pos
<< std::dec
1235 << ": stop: failed to decode: " << e
.what()
1240 ceph_assert(seq
== t
.seq
);
1241 dout(10) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
1242 << ": " << t
<< dendl
;
1243 if (unlikely(to_stdout
)) {
1244 std::cout
<< " 0x" << std::hex
<< pos
<< std::dec
1245 << ": " << t
<< std::endl
;
1248 auto p
= t
.op_bl
.cbegin();
1254 case bluefs_transaction_t::OP_INIT
:
1255 dout(20) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
1256 << ": op_init" << dendl
;
1257 if (unlikely(to_stdout
)) {
1258 std::cout
<< " 0x" << std::hex
<< pos
<< std::dec
1259 << ": op_init" << std::endl
;
1262 ceph_assert(t
.seq
== 1);
1265 case bluefs_transaction_t::OP_JUMP
:
1269 decode(next_seq
, p
);
1271 dout(20) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
1272 << ": op_jump seq " << next_seq
1273 << " offset 0x" << std::hex
<< offset
<< std::dec
<< dendl
;
1274 if (unlikely(to_stdout
)) {
1275 std::cout
<< " 0x" << std::hex
<< pos
<< std::dec
1276 << ": op_jump seq " << next_seq
1277 << " offset 0x" << std::hex
<< offset
<< std::dec
1281 ceph_assert(next_seq
>= log_seq
);
1282 log_seq
= next_seq
- 1; // we will increment it below
1283 uint64_t skip
= offset
- read_pos
;
1286 int r
= _read(log_reader
, &log_reader
->buf
, read_pos
, skip
, &junk
,
1288 if (r
!= (int)skip
) {
1289 dout(10) << __func__
<< " 0x" << std::hex
<< read_pos
1290 << ": stop: failed to skip to " << offset
1291 << std::dec
<< dendl
;
1292 ceph_abort_msg("problem with op_jump");
1298 case bluefs_transaction_t::OP_JUMP_SEQ
:
1301 decode(next_seq
, p
);
1302 dout(20) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
1303 << ": op_jump_seq " << next_seq
<< dendl
;
1304 if (unlikely(to_stdout
)) {
1305 std::cout
<< " 0x" << std::hex
<< pos
<< std::dec
1306 << ": op_jump_seq " << next_seq
<< std::endl
;
1309 ceph_assert(next_seq
>= log_seq
);
1310 log_seq
= next_seq
- 1; // we will increment it below
1314 case bluefs_transaction_t::OP_ALLOC_ADD
:
1317 uint64_t offset
, length
;
1321 dout(20) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
1322 << ": op_alloc_add " << " " << (int)id
1323 << ":0x" << std::hex
<< offset
<< "~" << length
<< std::dec
1325 if (unlikely(to_stdout
)) {
1326 std::cout
<< " 0x" << std::hex
<< pos
<< std::dec
1327 << ": op_alloc_add " << " " << (int)id
1328 << ":0x" << std::hex
<< offset
<< "~" << length
<< std::dec
1332 block_all
[id
].insert(offset
, length
);
1333 _adjust_granularity(id
, &offset
, &length
, true);
1335 alloc
[id
]->init_add_free(offset
, length
);
1338 if (cct
->_conf
->bluefs_log_replay_check_allocations
) {
1340 apply_for_bitset_range(offset
, length
, alloc_size
[id
], owned_blocks
[id
],
1341 [&](uint64_t pos
, boost::dynamic_bitset
<uint64_t> &bs
) {
1350 derr
<< __func__
<< " invalid extent " << (int)id
1351 << ": 0x" << std::hex
<< offset
<< "~" << length
1352 << std::dec
<< ": already given" << dendl
;
1355 apply_for_bitset_range(offset
, length
, alloc_size
[id
], used_blocks
[id
],
1356 [&](uint64_t pos
, boost::dynamic_bitset
<uint64_t> &bs
) {
1363 derr
<< __func__
<< " invalid extent " << int(id
)
1364 << ": 0x" << std::hex
<< offset
<< "~" << length
1365 << std::dec
<< ": already in use" << dendl
;
1373 case bluefs_transaction_t::OP_ALLOC_RM
:
1376 uint64_t offset
, length
;
1380 dout(20) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
1381 << ": op_alloc_rm " << " " << (int)id
1382 << ":0x" << std::hex
<< offset
<< "~" << length
<< std::dec
1384 if (unlikely(to_stdout
)) {
1385 std::cout
<< " 0x" << std::hex
<< pos
<< std::dec
1386 << ": op_alloc_rm " << " " << (int)id
1387 << ":0x" << std::hex
<< offset
<< "~" << length
<< std::dec
1391 block_all
[id
].erase(offset
, length
);
1392 _adjust_granularity(id
, &offset
, &length
, false);
1394 alloc
[id
]->init_rm_free(offset
, length
);
1396 if (cct
->_conf
->bluefs_log_replay_check_allocations
) {
1398 apply_for_bitset_range(offset
, length
, alloc_size
[id
], owned_blocks
[id
],
1399 [&](uint64_t pos
, boost::dynamic_bitset
<uint64_t> &bs
) {
1400 if (!bs
.test(pos
)) {
1408 derr
<< __func__
<< " invalid extent " << int(id
)
1409 << ": 0x" << std::hex
<< offset
<< "~" << length
1410 << std::dec
<< ": wasn't given" << dendl
;
1414 apply_for_bitset_range(offset
, length
, alloc_size
[id
], used_blocks
[id
],
1415 [&](uint64_t pos
, boost::dynamic_bitset
<uint64_t> &bs
) {
1422 derr
<< __func__
<< " invalid extent " << (int)id
1423 << ": 0x" << std::hex
<< offset
<< "~" << length
1424 << std::dec
<< ": still in use" << dendl
;
1432 case bluefs_transaction_t::OP_DIR_LINK
:
1434 string dirname
, filename
;
1437 decode(filename
, p
);
1439 dout(20) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
1440 << ": op_dir_link " << " " << dirname
<< "/" << filename
1443 if (unlikely(to_stdout
)) {
1444 std::cout
<< " 0x" << std::hex
<< pos
<< std::dec
1445 << ": op_dir_link " << " " << dirname
<< "/" << filename
1451 FileRef file
= _get_file(ino
);
1452 ceph_assert(file
->fnode
.ino
);
1453 map
<string
,DirRef
>::iterator q
= dir_map
.find(dirname
);
1454 ceph_assert(q
!= dir_map
.end());
1455 map
<string
,FileRef
>::iterator r
= q
->second
->file_map
.find(filename
);
1456 ceph_assert(r
== q
->second
->file_map
.end());
1458 vselector
->sub_usage(file
->vselector_hint
, file
->fnode
);
1459 file
->vselector_hint
=
1460 vselector
->get_hint_by_dir(dirname
);
1461 vselector
->add_usage(file
->vselector_hint
, file
->fnode
);
1463 q
->second
->file_map
[filename
] = file
;
1469 case bluefs_transaction_t::OP_DIR_UNLINK
:
1471 string dirname
, filename
;
1473 decode(filename
, p
);
1474 dout(20) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
1475 << ": op_dir_unlink " << " " << dirname
<< "/" << filename
1477 if (unlikely(to_stdout
)) {
1478 std::cout
<< " 0x" << std::hex
<< pos
<< std::dec
1479 << ": op_dir_unlink " << " " << dirname
<< "/" << filename
1484 map
<string
,DirRef
>::iterator q
= dir_map
.find(dirname
);
1485 ceph_assert(q
!= dir_map
.end());
1486 map
<string
,FileRef
>::iterator r
= q
->second
->file_map
.find(filename
);
1487 ceph_assert(r
!= q
->second
->file_map
.end());
1488 ceph_assert(r
->second
->refs
> 0);
1490 q
->second
->file_map
.erase(r
);
1495 case bluefs_transaction_t::OP_DIR_CREATE
:
1499 dout(20) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
1500 << ": op_dir_create " << dirname
<< dendl
;
1501 if (unlikely(to_stdout
)) {
1502 std::cout
<< " 0x" << std::hex
<< pos
<< std::dec
1503 << ": op_dir_create " << dirname
<< std::endl
;
1507 map
<string
,DirRef
>::iterator q
= dir_map
.find(dirname
);
1508 ceph_assert(q
== dir_map
.end());
1509 dir_map
[dirname
] = ceph::make_ref
<Dir
>();
1514 case bluefs_transaction_t::OP_DIR_REMOVE
:
1518 dout(20) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
1519 << ": op_dir_remove " << dirname
<< dendl
;
1520 if (unlikely(to_stdout
)) {
1521 std::cout
<< " 0x" << std::hex
<< pos
<< std::dec
1522 << ": op_dir_remove " << dirname
<< std::endl
;
1526 map
<string
,DirRef
>::iterator q
= dir_map
.find(dirname
);
1527 ceph_assert(q
!= dir_map
.end());
1528 ceph_assert(q
->second
->file_map
.empty());
1534 case bluefs_transaction_t::OP_FILE_UPDATE
:
1536 bluefs_fnode_t fnode
;
1538 dout(20) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
1539 << ": op_file_update " << " " << fnode
<< " " << dendl
;
1540 if (unlikely(to_stdout
)) {
1541 std::cout
<< " 0x" << std::hex
<< pos
<< std::dec
1542 << ": op_file_update " << " " << fnode
<< std::endl
;
1545 FileRef f
= _get_file(fnode
.ino
);
1546 if (cct
->_conf
->bluefs_log_replay_check_allocations
) {
1547 // check initial log layout
1548 if (first_log_check
) {
1549 first_log_check
= false;
1550 int r
= _check_new_allocations(log_file
->fnode
,
1551 MAX_BDEV
, owned_blocks
, used_blocks
);
1557 auto& fnode_extents
= f
->fnode
.extents
;
1558 for (auto e
: fnode_extents
) {
1560 if (int r
= _verify_alloc_granularity(id
, e
.offset
, e
.length
,
1561 "OP_FILE_UPDATE"); r
< 0) {
1564 apply_for_bitset_range(e
.offset
, e
.length
, alloc_size
[id
],
1566 [&](uint64_t pos
, boost::dynamic_bitset
<uint64_t> &bs
) {
1567 ceph_assert(bs
.test(pos
));
1574 if (fnode
.ino
!= 1) {
1575 vselector
->sub_usage(f
->vselector_hint
, f
->fnode
);
1578 if (fnode
.ino
!= 1) {
1579 vselector
->add_usage(f
->vselector_hint
, f
->fnode
);
1582 if (fnode
.ino
> ino_last
) {
1583 ino_last
= fnode
.ino
;
1585 if (cct
->_conf
->bluefs_log_replay_check_allocations
) {
1586 int r
= _check_new_allocations(f
->fnode
,
1587 MAX_BDEV
, owned_blocks
, used_blocks
);
1596 case bluefs_transaction_t::OP_FILE_REMOVE
:
1600 dout(20) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
1601 << ": op_file_remove " << ino
<< dendl
;
1602 if (unlikely(to_stdout
)) {
1603 std::cout
<< " 0x" << std::hex
<< pos
<< std::dec
1604 << ": op_file_remove " << ino
<< std::endl
;
1608 auto p
= file_map
.find(ino
);
1609 ceph_assert(p
!= file_map
.end());
1610 vselector
->sub_usage(p
->second
->vselector_hint
, p
->second
->fnode
);
1611 if (cct
->_conf
->bluefs_log_replay_check_allocations
) {
1612 auto& fnode_extents
= p
->second
->fnode
.extents
;
1613 for (auto e
: fnode_extents
) {
1616 apply_for_bitset_range(e
.offset
, e
.length
, alloc_size
[id
], owned_blocks
[id
],
1617 [&](uint64_t pos
, boost::dynamic_bitset
<uint64_t> &bs
) {
1618 if (!bs
.test(pos
)) {
1624 derr
<< __func__
<< " invalid extent " << int(id
)
1625 << ": 0x" << std::hex
<< e
.offset
<< "~" << e
.length
1627 << ": wasn't given but is allocated for removed ino " << ino
1632 apply_for_bitset_range(e
.offset
, e
.length
, alloc_size
[id
], used_blocks
[id
],
1633 [&](uint64_t pos
, boost::dynamic_bitset
<uint64_t> &bs
) {
1634 if (!bs
.test(pos
)) {
1641 derr
<< __func__
<< " invalid extent " << int(id
)
1642 << ": 0x" << std::hex
<< e
.offset
<< "~" << e
.length
1644 << ": not in use but is allocated for removed ino " << ino
1656 derr
<< __func__
<< " 0x" << std::hex
<< pos
<< std::dec
1657 << ": stop: unrecognized op " << (int)op
<< dendl
;
1662 ceph_assert(p
.end());
1664 // we successfully replayed the transaction; bump the seq and log size
1666 log_file
->fnode
.size
= log_reader
->buf
.pos
;
1668 vselector
->add_usage(log_file
->vselector_hint
, log_file
->fnode
);
1670 if (!noop
&& first_log_check
&&
1671 cct
->_conf
->bluefs_log_replay_check_allocations
) {
1672 int r
= _check_new_allocations(log_file
->fnode
,
1673 MAX_BDEV
, owned_blocks
, used_blocks
);
1679 dout(10) << __func__
<< " log file size was 0x"
1680 << std::hex
<< log_file
->fnode
.size
<< std::dec
<< dendl
;
1681 if (unlikely(to_stdout
)) {
1682 std::cout
<< " log file size was 0x"
1683 << std::hex
<< log_file
->fnode
.size
<< std::dec
<< std::endl
;
1689 // verify file link counts are all >0
1690 for (auto& p
: file_map
) {
1691 if (p
.second
->refs
== 0 &&
1692 p
.second
->fnode
.ino
> 1) {
1693 derr
<< __func__
<< " file with link count 0: " << p
.second
->fnode
1700 for (unsigned id
= 0; id
< MAX_BDEV
; ++id
) {
1701 dout(10) << __func__
<< " block_unused_too_granular " << id
<< ": "
1702 << block_unused_too_granular
[id
] << dendl
;
1704 dout(10) << __func__
<< " done" << dendl
;
1708 int BlueFS::log_dump()
1710 // only dump log file's content
1711 int r
= _replay(true, true);
1713 derr
<< __func__
<< " failed to replay log: " << cpp_strerror(r
) << dendl
;
1720 int BlueFS::device_migrate_to_existing(
1722 const set
<int>& devs_source
,
1724 const bluefs_layout_t
& layout
)
1727 bool buffered
= cct
->_conf
->bluefs_buffered_io
;
1729 dout(10) << __func__
<< " devs_source " << devs_source
1730 << " dev_target " << dev_target
<< dendl
;
1731 assert(dev_target
< (int)MAX_BDEV
);
1734 flags
|= devs_source
.count(BDEV_DB
) ?
1735 (REMOVE_DB
| RENAME_SLOW2DB
) : 0;
1736 flags
|= devs_source
.count(BDEV_WAL
) ? REMOVE_WAL
: 0;
1737 int dev_target_new
= dev_target
;
1739 // Slow device without separate DB one is addressed via BDEV_DB
1740 // Hence need renaming.
1741 if ((flags
& REMOVE_DB
) && dev_target
== BDEV_SLOW
) {
1742 dev_target_new
= BDEV_DB
;
1743 dout(0) << __func__
<< " super to be written to " << dev_target
<< dendl
;
1746 for (auto& [ino
, file_ref
] : file_map
) {
1748 if (file_ref
->fnode
.ino
== 1) {
1751 dout(10) << __func__
<< " " << ino
<< " " << file_ref
->fnode
<< dendl
;
1753 auto& fnode_extents
= file_ref
->fnode
.extents
;
1755 bool rewrite
= std::any_of(
1756 fnode_extents
.begin(),
1757 fnode_extents
.end(),
1759 return ext
.bdev
!= dev_target
&& devs_source
.count(ext
.bdev
);
1762 dout(10) << __func__
<< " migrating" << dendl
;
1766 for (auto old_ext
: fnode_extents
) {
1767 buf
.resize(old_ext
.length
);
1768 int r
= bdev
[old_ext
.bdev
]->read_random(
1774 derr
<< __func__
<< " failed to read 0x" << std::hex
1775 << old_ext
.offset
<< "~" << old_ext
.length
<< std::dec
1776 << " from " << (int)dev_target
<< dendl
;
1779 bl
.append((char*)&buf
[0], old_ext
.length
);
1782 // write entire file
1783 PExtentVector extents
;
1784 auto l
= _allocate_without_fallback(dev_target
, bl
.length(), &extents
);
1786 derr
<< __func__
<< " unable to allocate len 0x" << std::hex
1787 << bl
.length() << std::dec
<< " from " << (int)dev_target
1788 << ": " << cpp_strerror(l
) << dendl
;
1793 for (auto& i
: extents
) {
1795 uint64_t cur_len
= std::min
<uint64_t>(i
.length
, bl
.length() - off
);
1796 ceph_assert(cur_len
> 0);
1797 cur
.substr_of(bl
, off
, cur_len
);
1798 int r
= bdev
[dev_target
]->write(i
.offset
, cur
, buffered
);
1799 ceph_assert(r
== 0);
1803 // release old extents
1804 for (auto old_ext
: fnode_extents
) {
1805 PExtentVector to_release
;
1806 to_release
.emplace_back(old_ext
.offset
, old_ext
.length
);
1807 alloc
[old_ext
.bdev
]->release(to_release
);
1811 fnode_extents
.clear();
1812 for (auto& i
: extents
) {
1813 fnode_extents
.emplace_back(dev_target_new
, i
.offset
, i
.length
);
1816 for (auto& ext
: fnode_extents
) {
1817 if (dev_target
!= dev_target_new
&& ext
.bdev
== dev_target
) {
1818 dout(20) << __func__
<< " " << " ... adjusting extent 0x"
1819 << std::hex
<< ext
.offset
<< std::dec
1820 << " bdev " << dev_target
<< " -> " << dev_target_new
1822 ext
.bdev
= dev_target_new
;
1827 // new logging device in the current naming scheme
1828 int new_log_dev_cur
= bdev
[BDEV_WAL
] ?
1830 bdev
[BDEV_DB
] ? BDEV_DB
: BDEV_SLOW
;
1832 // new logging device in new naming scheme
1833 int new_log_dev_next
= new_log_dev_cur
;
1835 if (devs_source
.count(new_log_dev_cur
)) {
1836 // SLOW device is addressed via BDEV_DB too hence either WAL or DB
1837 new_log_dev_next
= (flags
& REMOVE_WAL
) || !bdev
[BDEV_WAL
] ?
1841 dout(0) << __func__
<< " log moved from " << new_log_dev_cur
1842 << " to " << new_log_dev_next
<< dendl
;
1845 (flags
& REMOVE_DB
) && new_log_dev_next
== BDEV_DB
?
1850 _rewrite_log_and_layout_sync(
1852 (flags
& REMOVE_DB
) ? BDEV_SLOW
: BDEV_DB
,
1860 int BlueFS::device_migrate_to_new(
1862 const set
<int>& devs_source
,
1864 const bluefs_layout_t
& layout
)
1867 bool buffered
= cct
->_conf
->bluefs_buffered_io
;
1869 dout(10) << __func__
<< " devs_source " << devs_source
1870 << " dev_target " << dev_target
<< dendl
;
1871 assert(dev_target
== (int)BDEV_NEWDB
|| (int)BDEV_NEWWAL
);
1875 flags
|= devs_source
.count(BDEV_DB
) ?
1876 (!bdev
[BDEV_SLOW
] ? RENAME_DB2SLOW
: REMOVE_DB
) :
1878 flags
|= devs_source
.count(BDEV_WAL
) ? REMOVE_WAL
: 0;
1879 int dev_target_new
= dev_target
; //FIXME: remove, makes no sense
1881 for (auto& p
: file_map
) {
1883 if (p
.second
->fnode
.ino
== 1) {
1886 dout(10) << __func__
<< " " << p
.first
<< " " << p
.second
->fnode
<< dendl
;
1888 auto& fnode_extents
= p
.second
->fnode
.extents
;
1890 bool rewrite
= false;
1891 for (auto ext_it
= fnode_extents
.begin();
1892 ext_it
!= p
.second
->fnode
.extents
.end();
1894 if (ext_it
->bdev
!= dev_target
&& devs_source
.count(ext_it
->bdev
)) {
1900 dout(10) << __func__
<< " migrating" << dendl
;
1904 for (auto old_ext
: fnode_extents
) {
1905 buf
.resize(old_ext
.length
);
1906 int r
= bdev
[old_ext
.bdev
]->read_random(
1912 derr
<< __func__
<< " failed to read 0x" << std::hex
1913 << old_ext
.offset
<< "~" << old_ext
.length
<< std::dec
1914 << " from " << (int)dev_target
<< dendl
;
1917 bl
.append((char*)&buf
[0], old_ext
.length
);
1920 // write entire file
1921 PExtentVector extents
;
1922 auto l
= _allocate_without_fallback(dev_target
, bl
.length(), &extents
);
1924 derr
<< __func__
<< " unable to allocate len 0x" << std::hex
1925 << bl
.length() << std::dec
<< " from " << (int)dev_target
1926 << ": " << cpp_strerror(l
) << dendl
;
1931 for (auto& i
: extents
) {
1933 uint64_t cur_len
= std::min
<uint64_t>(i
.length
, bl
.length() - off
);
1934 ceph_assert(cur_len
> 0);
1935 cur
.substr_of(bl
, off
, cur_len
);
1936 int r
= bdev
[dev_target
]->write(i
.offset
, cur
, buffered
);
1937 ceph_assert(r
== 0);
1941 // release old extents
1942 for (auto old_ext
: fnode_extents
) {
1943 PExtentVector to_release
;
1944 to_release
.emplace_back(old_ext
.offset
, old_ext
.length
);
1945 alloc
[old_ext
.bdev
]->release(to_release
);
1949 fnode_extents
.clear();
1950 for (auto& i
: extents
) {
1951 fnode_extents
.emplace_back(dev_target_new
, i
.offset
, i
.length
);
1955 // new logging device in the current naming scheme
1956 int new_log_dev_cur
=
1959 bdev
[BDEV_WAL
] && !(flags
& REMOVE_WAL
) ?
1963 bdev
[BDEV_DB
] && !(flags
& REMOVE_DB
)?
1967 // new logging device in new naming scheme
1968 int new_log_dev_next
=
1969 new_log_dev_cur
== BDEV_NEWWAL
?
1971 new_log_dev_cur
== BDEV_NEWDB
?
1976 dev_target
== BDEV_NEWDB
?
1982 _rewrite_log_and_layout_sync(
1992 BlueFS::FileRef
BlueFS::_get_file(uint64_t ino
)
1994 auto p
= file_map
.find(ino
);
1995 if (p
== file_map
.end()) {
1996 FileRef f
= ceph::make_ref
<File
>();
1998 dout(30) << __func__
<< " ino " << ino
<< " = " << f
1999 << " (new)" << dendl
;
2002 dout(30) << __func__
<< " ino " << ino
<< " = " << p
->second
<< dendl
;
2007 void BlueFS::_drop_link(FileRef file
)
2009 dout(20) << __func__
<< " had refs " << file
->refs
2010 << " on " << file
->fnode
<< dendl
;
2011 ceph_assert(file
->refs
> 0);
2013 if (file
->refs
== 0) {
2014 dout(20) << __func__
<< " destroying " << file
->fnode
<< dendl
;
2015 ceph_assert(file
->num_reading
.load() == 0);
2016 vselector
->sub_usage(file
->vselector_hint
, file
->fnode
);
2017 log_t
.op_file_remove(file
->fnode
.ino
);
2018 for (auto& r
: file
->fnode
.extents
) {
2019 pending_release
[r
.bdev
].insert(r
.offset
, r
.length
);
2021 file_map
.erase(file
->fnode
.ino
);
2022 file
->deleted
= true;
2024 if (file
->dirty_seq
) {
2025 ceph_assert(file
->dirty_seq
> log_seq_stable
);
2026 ceph_assert(dirty_files
.count(file
->dirty_seq
));
2027 auto it
= dirty_files
[file
->dirty_seq
].iterator_to(*file
);
2028 dirty_files
[file
->dirty_seq
].erase(it
);
2029 file
->dirty_seq
= 0;
2034 int64_t BlueFS::_read_random(
2035 FileReader
*h
, ///< [in] read from here
2036 uint64_t off
, ///< [in] offset
2037 uint64_t len
, ///< [in] this many bytes
2038 char *out
) ///< [out] optional: or copy it here
2040 auto* buf
= &h
->buf
;
2043 dout(10) << __func__
<< " h " << h
2044 << " 0x" << std::hex
<< off
<< "~" << len
<< std::dec
2045 << " from " << h
->file
->fnode
<< dendl
;
2047 ++h
->file
->num_reading
;
2049 if (!h
->ignore_eof
&&
2050 off
+ len
> h
->file
->fnode
.size
) {
2051 if (off
> h
->file
->fnode
.size
)
2054 len
= h
->file
->fnode
.size
- off
;
2055 dout(20) << __func__
<< " reaching (or past) eof, len clipped to 0x"
2056 << std::hex
<< len
<< std::dec
<< dendl
;
2058 logger
->inc(l_bluefs_read_random_count
, 1);
2059 logger
->inc(l_bluefs_read_random_bytes
, len
);
2061 std::shared_lock
s_lock(h
->lock
);
2062 buf
->bl
.reassign_to_mempool(mempool::mempool_bluefs_file_reader
);
2064 if (off
< buf
->bl_off
|| off
>= buf
->get_buf_end()) {
2067 auto p
= h
->file
->fnode
.seek(off
, &x_off
);
2068 ceph_assert(p
!= h
->file
->fnode
.extents
.end());
2069 uint64_t l
= std::min(p
->length
- x_off
, len
);
2071 l
= std::min(l
, uint64_t(1) << 30);
2072 dout(20) << __func__
<< " read random 0x"
2073 << std::hex
<< x_off
<< "~" << l
<< std::dec
2074 << " of " << *p
<< dendl
;
2076 if (!cct
->_conf
->bluefs_check_for_zeros
) {
2077 r
= bdev
[p
->bdev
]->read_random(p
->offset
+ x_off
, l
, out
,
2078 cct
->_conf
->bluefs_buffered_io
);
2080 r
= read_random(p
->bdev
, p
->offset
+ x_off
, l
, out
,
2081 cct
->_conf
->bluefs_buffered_io
);
2083 ceph_assert(r
== 0);
2089 logger
->inc(l_bluefs_read_random_disk_count
, 1);
2090 logger
->inc(l_bluefs_read_random_disk_bytes
, l
);
2095 auto left
= buf
->get_buf_remaining(off
);
2096 int64_t r
= std::min(len
, left
);
2097 logger
->inc(l_bluefs_read_random_buffer_count
, 1);
2098 logger
->inc(l_bluefs_read_random_buffer_bytes
, r
);
2099 dout(20) << __func__
<< " left 0x" << std::hex
<< left
2100 << " 0x" << off
<< "~" << len
<< std::dec
2104 auto p
= buf
->bl
.begin();
2105 p
.seek(off
- buf
->bl_off
);
2110 dout(30) << __func__
<< " result chunk (0x"
2111 << std::hex
<< r
<< std::dec
<< " bytes):\n";
2113 t
.substr_of(buf
->bl
, off
- buf
->bl_off
, r
);
2123 dout(20) << __func__
<< " got " << ret
<< dendl
;
2124 --h
->file
->num_reading
;
2128 int64_t BlueFS::_read(
2129 FileReader
*h
, ///< [in] read from here
2130 FileReaderBuffer
*buf
, ///< [in] reader state
2131 uint64_t off
, ///< [in] offset
2132 size_t len
, ///< [in] this many bytes
2133 bufferlist
*outbl
, ///< [out] optional: reference the result here
2134 char *out
) ///< [out] optional: or copy it here
2136 bool prefetch
= !outbl
&& !out
;
2137 dout(10) << __func__
<< " h " << h
2138 << " 0x" << std::hex
<< off
<< "~" << len
<< std::dec
2139 << " from " << h
->file
->fnode
2140 << (prefetch
? " prefetch" : "")
2143 ++h
->file
->num_reading
;
2145 if (!h
->ignore_eof
&&
2146 off
+ len
> h
->file
->fnode
.size
) {
2147 if (off
> h
->file
->fnode
.size
)
2150 len
= h
->file
->fnode
.size
- off
;
2151 dout(20) << __func__
<< " reaching (or past) eof, len clipped to 0x"
2152 << std::hex
<< len
<< std::dec
<< dendl
;
2154 logger
->inc(l_bluefs_read_count
, 1);
2155 logger
->inc(l_bluefs_read_bytes
, len
);
2157 logger
->inc(l_bluefs_read_prefetch_count
, 1);
2158 logger
->inc(l_bluefs_read_prefetch_bytes
, len
);
2165 std::shared_lock
s_lock(h
->lock
);
2168 if (off
< buf
->bl_off
|| off
>= buf
->get_buf_end()) {
2170 std::unique_lock
u_lock(h
->lock
);
2171 buf
->bl
.reassign_to_mempool(mempool::mempool_bluefs_file_reader
);
2172 if (off
< buf
->bl_off
|| off
>= buf
->get_buf_end()) {
2173 // if precondition hasn't changed during locking upgrade.
2175 buf
->bl_off
= off
& super
.block_mask();
2177 auto p
= h
->file
->fnode
.seek(buf
->bl_off
, &x_off
);
2178 if (p
== h
->file
->fnode
.extents
.end()) {
2179 dout(5) << __func__
<< " reading less then required "
2180 << ret
<< "<" << ret
+ len
<< dendl
;
2184 uint64_t want
= round_up_to(len
+ (off
& ~super
.block_mask()),
2186 want
= std::max(want
, buf
->max_prefetch
);
2187 uint64_t l
= std::min(p
->length
- x_off
, want
);
2189 l
= std::min(l
, uint64_t(1) << 30);
2190 uint64_t eof_offset
= round_up_to(h
->file
->fnode
.size
, super
.block_size
);
2191 if (!h
->ignore_eof
&&
2192 buf
->bl_off
+ l
> eof_offset
) {
2193 l
= eof_offset
- buf
->bl_off
;
2195 dout(20) << __func__
<< " fetching 0x"
2196 << std::hex
<< x_off
<< "~" << l
<< std::dec
2197 << " of " << *p
<< dendl
;
2199 if (!cct
->_conf
->bluefs_check_for_zeros
) {
2200 r
= bdev
[p
->bdev
]->read(p
->offset
+ x_off
, l
, &buf
->bl
, ioc
[p
->bdev
],
2201 cct
->_conf
->bluefs_buffered_io
);
2203 r
= read(p
->bdev
, p
->offset
+ x_off
, l
, &buf
->bl
, ioc
[p
->bdev
],
2204 cct
->_conf
->bluefs_buffered_io
);
2206 ceph_assert(r
== 0);
2210 // we should recheck if buffer is valid after lock downgrade
2213 left
= buf
->get_buf_remaining(off
);
2214 dout(20) << __func__
<< " left 0x" << std::hex
<< left
2215 << " len 0x" << len
<< std::dec
<< dendl
;
2217 int64_t r
= std::min(len
, left
);
2220 t
.substr_of(buf
->bl
, off
- buf
->bl_off
, r
);
2221 outbl
->claim_append(t
);
2224 auto p
= buf
->bl
.begin();
2225 p
.seek(off
- buf
->bl_off
);
2230 dout(30) << __func__
<< " result chunk (0x"
2231 << std::hex
<< r
<< std::dec
<< " bytes):\n";
2233 t
.substr_of(buf
->bl
, off
- buf
->bl_off
, r
);
2242 dout(20) << __func__
<< " got " << ret
<< dendl
;
2243 ceph_assert(!outbl
|| (int)outbl
->length() == ret
);
2244 --h
->file
->num_reading
;
2248 void BlueFS::_invalidate_cache(FileRef f
, uint64_t offset
, uint64_t length
)
2250 dout(10) << __func__
<< " file " << f
->fnode
2251 << " 0x" << std::hex
<< offset
<< "~" << length
<< std::dec
2253 if (offset
& ~super
.block_mask()) {
2254 offset
&= super
.block_mask();
2255 length
= round_up_to(length
, super
.block_size
);
2258 auto p
= f
->fnode
.seek(offset
, &x_off
);
2259 while (length
> 0 && p
!= f
->fnode
.extents
.end()) {
2260 uint64_t x_len
= std::min(p
->length
- x_off
, length
);
2261 bdev
[p
->bdev
]->invalidate_cache(p
->offset
+ x_off
, x_len
);
2262 dout(20) << __func__
<< " 0x" << std::hex
<< x_off
<< "~" << x_len
2263 << std:: dec
<< " of " << *p
<< dendl
;
2269 uint64_t BlueFS::_estimate_log_size()
2271 int avg_dir_size
= 40; // fixme
2272 int avg_file_size
= 12;
2273 uint64_t size
= 4096 * 2;
2274 size
+= file_map
.size() * (1 + sizeof(bluefs_fnode_t
));
2275 for (auto& p
: block_all
)
2276 size
+= p
.num_intervals() * (1 + 1 + sizeof(uint64_t) * 2);
2277 size
+= dir_map
.size() + (1 + avg_dir_size
);
2278 size
+= file_map
.size() * (1 + avg_dir_size
+ avg_file_size
);
2279 return round_up_to(size
, super
.block_size
);
2282 void BlueFS::compact_log()
2284 std::unique_lock
<ceph::mutex
> l(lock
);
2285 if (!cct
->_conf
->bluefs_replay_recovery_disable_compact
) {
2286 if (cct
->_conf
->bluefs_compact_log_sync
) {
2287 _compact_log_sync();
2289 _compact_log_async(l
);
2294 bool BlueFS::_should_compact_log()
2296 uint64_t current
= log_writer
->file
->fnode
.size
;
2297 uint64_t expected
= _estimate_log_size();
2298 float ratio
= (float)current
/ (float)expected
;
2299 dout(10) << __func__
<< " current 0x" << std::hex
<< current
2300 << " expected " << expected
<< std::dec
2301 << " ratio " << ratio
2302 << (new_log
? " (async compaction in progress)" : "")
2305 current
< cct
->_conf
->bluefs_log_compact_min_size
||
2306 ratio
< cct
->_conf
->bluefs_log_compact_min_ratio
) {
2312 void BlueFS::_compact_log_dump_metadata(bluefs_transaction_t
*t
,
2316 t
->uuid
= super
.uuid
;
2317 dout(20) << __func__
<< " op_init" << dendl
;
2320 for (unsigned bdev
= 0; bdev
< MAX_BDEV
; ++bdev
) {
2321 interval_set
<uint64_t>& p
= block_all
[bdev
];
2322 for (interval_set
<uint64_t>::iterator q
= p
.begin(); q
!= p
.end(); ++q
) {
2323 auto bdev_new
= bdev
;
2324 if ((flags
& REMOVE_WAL
) && bdev
== BDEV_WAL
) {
2327 if ((flags
& REMOVE_DB
) && bdev
== BDEV_DB
) {
2330 if ((flags
& RENAME_SLOW2DB
) && bdev
== BDEV_SLOW
) {
2333 if ((flags
& RENAME_DB2SLOW
) && bdev
== BDEV_DB
) {
2334 bdev_new
= BDEV_SLOW
;
2336 if (bdev
== BDEV_NEWDB
) {
2337 // REMOVE_DB xor RENAME_DB
2338 ceph_assert(!(flags
& REMOVE_DB
) != !(flags
& RENAME_DB2SLOW
));
2339 ceph_assert(!(flags
& RENAME_SLOW2DB
));
2342 if (bdev
== BDEV_NEWWAL
) {
2343 ceph_assert(flags
& REMOVE_WAL
);
2344 bdev_new
= BDEV_WAL
;
2346 dout(20) << __func__
<< " op_alloc_add " << bdev_new
<< " 0x"
2347 << std::hex
<< q
.get_start() << "~" << q
.get_len() << std::dec
2349 t
->op_alloc_add(bdev_new
, q
.get_start(), q
.get_len());
2352 for (auto& [ino
, file_ref
] : file_map
) {
2355 ceph_assert(ino
> 1);
2357 for(auto& e
: file_ref
->fnode
.extents
) {
2359 auto bdev_new
= bdev
;
2360 ceph_assert(!((flags
& REMOVE_WAL
) && bdev
== BDEV_WAL
));
2361 if ((flags
& RENAME_SLOW2DB
) && bdev
== BDEV_SLOW
) {
2364 if ((flags
& RENAME_DB2SLOW
) && bdev
== BDEV_DB
) {
2365 bdev_new
= BDEV_SLOW
;
2367 if (bdev
== BDEV_NEWDB
) {
2368 // REMOVE_DB xor RENAME_DB
2369 ceph_assert(!(flags
& REMOVE_DB
) != !(flags
& RENAME_DB2SLOW
));
2370 ceph_assert(!(flags
& RENAME_SLOW2DB
));
2373 if (bdev
== BDEV_NEWWAL
) {
2374 ceph_assert(flags
& REMOVE_WAL
);
2375 bdev_new
= BDEV_WAL
;
2379 dout(20) << __func__
<< " op_file_update " << file_ref
->fnode
<< dendl
;
2380 t
->op_file_update(file_ref
->fnode
);
2382 for (auto& [path
, dir_ref
] : dir_map
) {
2383 dout(20) << __func__
<< " op_dir_create " << path
<< dendl
;
2384 t
->op_dir_create(path
);
2385 for (auto& [fname
, file_ref
] : dir_ref
->file_map
) {
2386 dout(20) << __func__
<< " op_dir_link " << path
<< "/" << fname
2387 << " to " << file_ref
->fnode
.ino
<< dendl
;
2388 t
->op_dir_link(path
, fname
, file_ref
->fnode
.ino
);
2393 void BlueFS::_compact_log_sync()
2395 dout(10) << __func__
<< dendl
;
2397 vselector
->select_prefer_bdev(log_writer
->file
->vselector_hint
);
2398 _rewrite_log_and_layout_sync(true,
2403 super
.memorized_layout
);
2404 logger
->inc(l_bluefs_log_compactions
);
2407 void BlueFS::_rewrite_log_and_layout_sync(bool allocate_with_fallback
,
2412 std::optional
<bluefs_layout_t
> layout
)
2414 File
*log_file
= log_writer
->file
.get();
2416 // clear out log (be careful who calls us!!!)
2419 dout(20) << __func__
<< " super_dev:" << super_dev
2420 << " log_dev:" << log_dev
2421 << " log_dev_new:" << log_dev_new
2422 << " flags:" << flags
2424 bluefs_transaction_t t
;
2425 _compact_log_dump_metadata(&t
, flags
);
2427 dout(20) << __func__
<< " op_jump_seq " << log_seq
<< dendl
;
2428 t
.op_jump_seq(log_seq
);
2434 uint64_t need
= bl
.length() + cct
->_conf
->bluefs_max_log_runway
;
2435 dout(20) << __func__
<< " need " << need
<< dendl
;
2437 bluefs_fnode_t old_fnode
;
2439 log_file
->fnode
.swap_extents(old_fnode
);
2440 if (allocate_with_fallback
) {
2441 r
= _allocate(log_dev
, need
, &log_file
->fnode
);
2442 ceph_assert(r
== 0);
2444 PExtentVector extents
;
2445 r
= _allocate_without_fallback(log_dev
,
2448 ceph_assert(r
== 0);
2449 for (auto& p
: extents
) {
2450 log_file
->fnode
.append_extent(
2451 bluefs_extent_t(log_dev
, p
.offset
, p
.length
));
2455 _close_writer(log_writer
);
2457 log_file
->fnode
.size
= bl
.length();
2458 vselector
->sub_usage(log_file
->vselector_hint
, old_fnode
);
2459 vselector
->add_usage(log_file
->vselector_hint
, log_file
->fnode
);
2461 log_writer
= _create_writer(log_file
);
2462 log_writer
->append(bl
);
2463 r
= _flush(log_writer
, true);
2464 ceph_assert(r
== 0);
2466 if (!cct
->_conf
->bluefs_sync_write
) {
2467 list
<aio_t
> completed_ios
;
2468 _claim_completed_aios(log_writer
, &completed_ios
);
2469 wait_for_aio(log_writer
);
2470 completed_ios
.clear();
2475 super
.memorized_layout
= layout
;
2476 super
.log_fnode
= log_file
->fnode
;
2477 // rename device if needed
2478 if (log_dev
!= log_dev_new
) {
2479 dout(10) << __func__
<< " renaming log extents to " << log_dev_new
<< dendl
;
2480 for (auto& p
: super
.log_fnode
.extents
) {
2481 p
.bdev
= log_dev_new
;
2484 dout(10) << __func__
<< " writing super, log fnode: " << super
.log_fnode
<< dendl
;
2487 _write_super(super_dev
);
2490 dout(10) << __func__
<< " release old log extents " << old_fnode
.extents
<< dendl
;
2491 for (auto& r
: old_fnode
.extents
) {
2492 pending_release
[r
.bdev
].insert(r
.offset
, r
.length
);
2497 * 1. Allocate a new extent to continue the log, and then log an event
2498 * that jumps the log write position to the new extent. At this point, the
2499 * old extent(s) won't be written to, and reflect everything to compact.
2500 * New events will be written to the new region that we'll keep.
2502 * 2. While still holding the lock, encode a bufferlist that dumps all of the
2503 * in-memory fnodes and names. This will become the new beginning of the
2504 * log. The last event will jump to the log continuation extent from #1.
2506 * 3. Queue a write to a new extent for the new beginnging of the log.
2508 * 4. Drop lock and wait
2510 * 5. Retake the lock.
2512 * 6. Update the log_fnode to splice in the new beginning.
2514 * 7. Write the new superblock.
2516 * 8. Release the old log space. Clean up.
2518 void BlueFS::_compact_log_async(std::unique_lock
<ceph::mutex
>& l
)
2520 dout(10) << __func__
<< dendl
;
2521 File
*log_file
= log_writer
->file
.get();
2522 ceph_assert(!new_log
);
2523 ceph_assert(!new_log_writer
);
2525 // create a new log [writer] so that we know compaction is in progress
2526 // (see _should_compact_log)
2527 new_log
= ceph::make_ref
<File
>();
2528 new_log
->fnode
.ino
= 0; // so that _flush_range won't try to log the fnode
2530 // 0. wait for any racing flushes to complete. (We do not want to block
2531 // in _flush_sync_log with jump_to set or else a racing thread might flush
2532 // our entries and our jump_to update won't be correct.)
2533 while (log_flushing
) {
2534 dout(10) << __func__
<< " log is currently flushing, waiting" << dendl
;
2538 vselector
->sub_usage(log_file
->vselector_hint
, log_file
->fnode
);
2540 // 1. allocate new log space and jump to it.
2541 old_log_jump_to
= log_file
->fnode
.get_allocated();
2542 dout(10) << __func__
<< " old_log_jump_to 0x" << std::hex
<< old_log_jump_to
2543 << " need 0x" << (old_log_jump_to
+ cct
->_conf
->bluefs_max_log_runway
) << std::dec
<< dendl
;
2544 int r
= _allocate(vselector
->select_prefer_bdev(log_file
->vselector_hint
),
2545 cct
->_conf
->bluefs_max_log_runway
,
2547 ceph_assert(r
== 0);
2548 //adjust usage as flush below will need it
2549 vselector
->add_usage(log_file
->vselector_hint
, log_file
->fnode
);
2550 dout(10) << __func__
<< " log extents " << log_file
->fnode
.extents
<< dendl
;
2552 // update the log file change and log a jump to the offset where we want to
2553 // write the new entries
2554 log_t
.op_file_update(log_file
->fnode
);
2555 log_t
.op_jump(log_seq
, old_log_jump_to
);
2557 flush_bdev(); // FIXME?
2559 _flush_and_sync_log(l
, 0, old_log_jump_to
);
2561 // 2. prepare compacted log
2562 bluefs_transaction_t t
;
2563 //avoid record two times in log_t and _compact_log_dump_metadata.
2565 _compact_log_dump_metadata(&t
, 0);
2567 uint64_t max_alloc_size
= std::max(alloc_size
[BDEV_WAL
],
2568 std::max(alloc_size
[BDEV_DB
],
2569 alloc_size
[BDEV_SLOW
]));
2571 // conservative estimate for final encoded size
2572 new_log_jump_to
= round_up_to(t
.op_bl
.length() + super
.block_size
* 2,
2574 t
.op_jump(log_seq
, new_log_jump_to
);
2577 //FIXME: check if we want DB here?
2578 r
= _allocate(BlueFS::BDEV_DB
, new_log_jump_to
,
2580 ceph_assert(r
== 0);
2582 // we might have some more ops in log_t due to _allocate call
2589 dout(10) << __func__
<< " new_log_jump_to 0x" << std::hex
<< new_log_jump_to
2590 << std::dec
<< dendl
;
2592 new_log_writer
= _create_writer(new_log
);
2593 new_log_writer
->append(bl
);
2596 r
= _flush(new_log_writer
, true);
2597 ceph_assert(r
== 0);
2600 _flush_bdev_safely(new_log_writer
);
2602 // 5. update our log fnode
2603 // discard first old_log_jump_to extents
2605 dout(10) << __func__
<< " remove 0x" << std::hex
<< old_log_jump_to
<< std::dec
2606 << " of " << log_file
->fnode
.extents
<< dendl
;
2607 uint64_t discarded
= 0;
2608 mempool::bluefs::vector
<bluefs_extent_t
> old_extents
;
2609 while (discarded
< old_log_jump_to
) {
2610 ceph_assert(!log_file
->fnode
.extents
.empty());
2611 bluefs_extent_t
& e
= log_file
->fnode
.extents
.front();
2612 bluefs_extent_t temp
= e
;
2613 if (discarded
+ e
.length
<= old_log_jump_to
) {
2614 dout(10) << __func__
<< " remove old log extent " << e
<< dendl
;
2615 discarded
+= e
.length
;
2616 log_file
->fnode
.pop_front_extent();
2618 dout(10) << __func__
<< " remove front of old log extent " << e
<< dendl
;
2619 uint64_t drop
= old_log_jump_to
- discarded
;
2624 dout(10) << __func__
<< " kept " << e
<< " removed " << temp
<< dendl
;
2626 old_extents
.push_back(temp
);
2628 auto from
= log_file
->fnode
.extents
.begin();
2629 auto to
= log_file
->fnode
.extents
.end();
2630 while (from
!= to
) {
2631 new_log
->fnode
.append_extent(*from
);
2635 vselector
->sub_usage(log_file
->vselector_hint
, log_file
->fnode
);
2637 // clear the extents from old log file, they are added to new log
2638 log_file
->fnode
.clear_extents();
2639 // swap the log files. New log file is the log file now.
2640 new_log
->fnode
.swap_extents(log_file
->fnode
);
2642 log_writer
->pos
= log_writer
->file
->fnode
.size
=
2643 log_writer
->pos
- old_log_jump_to
+ new_log_jump_to
;
2645 vselector
->add_usage(log_file
->vselector_hint
, log_file
->fnode
);
2647 // 6. write the super block to reflect the changes
2648 dout(10) << __func__
<< " writing super" << dendl
;
2649 super
.log_fnode
= log_file
->fnode
;
2651 _write_super(BDEV_DB
);
2657 // 7. release old space
2658 dout(10) << __func__
<< " release old log extents " << old_extents
<< dendl
;
2659 for (auto& r
: old_extents
) {
2660 pending_release
[r
.bdev
].insert(r
.offset
, r
.length
);
2663 // delete the new log, remove from the dirty files list
2664 _close_writer(new_log_writer
);
2665 if (new_log
->dirty_seq
) {
2666 ceph_assert(dirty_files
.count(new_log
->dirty_seq
));
2667 auto it
= dirty_files
[new_log
->dirty_seq
].iterator_to(*new_log
);
2668 dirty_files
[new_log
->dirty_seq
].erase(it
);
2670 new_log_writer
= nullptr;
2672 log_cond
.notify_all();
2674 dout(10) << __func__
<< " log extents " << log_file
->fnode
.extents
<< dendl
;
2675 logger
->inc(l_bluefs_log_compactions
);
2678 void BlueFS::_pad_bl(bufferlist
& bl
)
2680 uint64_t partial
= bl
.length() % super
.block_size
;
2682 dout(10) << __func__
<< " padding with 0x" << std::hex
2683 << super
.block_size
- partial
<< " zeros" << std::dec
<< dendl
;
2684 bl
.append_zero(super
.block_size
- partial
);
2689 int BlueFS::_flush_and_sync_log(std::unique_lock
<ceph::mutex
>& l
,
2693 while (log_flushing
) {
2694 dout(10) << __func__
<< " want_seq " << want_seq
2695 << " log is currently flushing, waiting" << dendl
;
2696 ceph_assert(!jump_to
);
2699 if (want_seq
&& want_seq
<= log_seq_stable
) {
2700 dout(10) << __func__
<< " want_seq " << want_seq
<< " <= log_seq_stable "
2701 << log_seq_stable
<< ", done" << dendl
;
2702 ceph_assert(!jump_to
);
2705 if (log_t
.empty() && dirty_files
.empty()) {
2706 dout(10) << __func__
<< " want_seq " << want_seq
2707 << " " << log_t
<< " not dirty, dirty_files empty, no-op" << dendl
;
2708 ceph_assert(!jump_to
);
2712 vector
<interval_set
<uint64_t>> to_release(pending_release
.size());
2713 to_release
.swap(pending_release
);
2715 uint64_t seq
= log_t
.seq
= ++log_seq
;
2716 ceph_assert(want_seq
== 0 || want_seq
<= seq
);
2717 log_t
.uuid
= super
.uuid
;
2720 auto lsi
= dirty_files
.find(seq
);
2721 if (lsi
!= dirty_files
.end()) {
2722 dout(20) << __func__
<< " " << lsi
->second
.size() << " dirty_files" << dendl
;
2723 for (auto &f
: lsi
->second
) {
2724 dout(20) << __func__
<< " op_file_update " << f
.fnode
<< dendl
;
2725 log_t
.op_file_update(f
.fnode
);
2729 dout(10) << __func__
<< " " << log_t
<< dendl
;
2730 ceph_assert(!log_t
.empty());
2732 // allocate some more space (before we run out)?
2733 int64_t runway
= log_writer
->file
->fnode
.get_allocated() -
2734 log_writer
->get_effective_write_pos();
2735 bool just_expanded_log
= false;
2736 if (runway
< (int64_t)cct
->_conf
->bluefs_min_log_runway
) {
2737 dout(10) << __func__
<< " allocating more log runway (0x"
2738 << std::hex
<< runway
<< std::dec
<< " remaining)" << dendl
;
2739 while (new_log_writer
) {
2740 dout(10) << __func__
<< " waiting for async compaction" << dendl
;
2743 vselector
->sub_usage(log_writer
->file
->vselector_hint
, log_writer
->file
->fnode
);
2745 vselector
->select_prefer_bdev(log_writer
->file
->vselector_hint
),
2746 cct
->_conf
->bluefs_max_log_runway
,
2747 &log_writer
->file
->fnode
);
2748 ceph_assert(r
== 0);
2749 vselector
->add_usage(log_writer
->file
->vselector_hint
, log_writer
->file
->fnode
);
2750 log_t
.op_file_update(log_writer
->file
->fnode
);
2751 just_expanded_log
= true;
2755 bl
.reserve(super
.block_size
);
2757 // pad to block boundary
2758 size_t realign
= super
.block_size
- (bl
.length() % super
.block_size
);
2759 if (realign
&& realign
!= super
.block_size
)
2760 bl
.append_zero(realign
);
2762 logger
->inc(l_bluefs_logged_bytes
, bl
.length());
2764 if (just_expanded_log
) {
2765 ceph_assert(bl
.length() <= runway
); // if we write this, we will have an unrecoverable data loss
2768 log_writer
->append(bl
);
2771 log_t
.seq
= 0; // just so debug output is less confusing
2772 log_flushing
= true;
2774 int r
= _flush(log_writer
, true);
2775 ceph_assert(r
== 0);
2778 dout(10) << __func__
<< " jumping log offset from 0x" << std::hex
2779 << log_writer
->pos
<< " -> 0x" << jump_to
<< std::dec
<< dendl
;
2780 log_writer
->pos
= jump_to
;
2781 vselector
->sub_usage(log_writer
->file
->vselector_hint
, log_writer
->file
->fnode
.size
);
2782 log_writer
->file
->fnode
.size
= jump_to
;
2783 vselector
->add_usage(log_writer
->file
->vselector_hint
, log_writer
->file
->fnode
.size
);
2786 _flush_bdev_safely(log_writer
);
2788 log_flushing
= false;
2789 log_cond
.notify_all();
2791 // clean dirty files
2792 if (seq
> log_seq_stable
) {
2793 log_seq_stable
= seq
;
2794 dout(20) << __func__
<< " log_seq_stable " << log_seq_stable
<< dendl
;
2796 auto p
= dirty_files
.begin();
2797 while (p
!= dirty_files
.end()) {
2798 if (p
->first
> log_seq_stable
) {
2799 dout(20) << __func__
<< " done cleaning up dirty files" << dendl
;
2803 auto l
= p
->second
.begin();
2804 while (l
!= p
->second
.end()) {
2806 ceph_assert(file
->dirty_seq
> 0);
2807 ceph_assert(file
->dirty_seq
<= log_seq_stable
);
2808 dout(20) << __func__
<< " cleaned file " << file
->fnode
<< dendl
;
2809 file
->dirty_seq
= 0;
2810 p
->second
.erase(l
++);
2813 ceph_assert(p
->second
.empty());
2814 dirty_files
.erase(p
++);
2817 dout(20) << __func__
<< " log_seq_stable " << log_seq_stable
2818 << " already >= out seq " << seq
2819 << ", we lost a race against another log flush, done" << dendl
;
2822 for (unsigned i
= 0; i
< to_release
.size(); ++i
) {
2823 if (!to_release
[i
].empty()) {
2824 /* OK, now we have the guarantee alloc[i] won't be null. */
2826 if (cct
->_conf
->bdev_enable_discard
&& cct
->_conf
->bdev_async_discard
) {
2827 r
= bdev
[i
]->queue_discard(to_release
[i
]);
2830 } else if (cct
->_conf
->bdev_enable_discard
) {
2831 for (auto p
= to_release
[i
].begin(); p
!= to_release
[i
].end(); ++p
) {
2832 bdev
[i
]->discard(p
.get_start(), p
.get_len());
2835 alloc
[i
]->release(to_release
[i
]);
2839 _update_logger_stats();
2844 int BlueFS::_signal_dirty_to_log(FileWriter
*h
)
2846 h
->file
->fnode
.mtime
= ceph_clock_now();
2847 ceph_assert(h
->file
->fnode
.ino
>= 1);
2848 if (h
->file
->dirty_seq
== 0) {
2849 h
->file
->dirty_seq
= log_seq
+ 1;
2850 dirty_files
[h
->file
->dirty_seq
].push_back(*h
->file
);
2851 dout(20) << __func__
<< " dirty_seq = " << log_seq
+ 1
2852 << " (was clean)" << dendl
;
2854 if (h
->file
->dirty_seq
!= log_seq
+ 1) {
2855 // need re-dirty, erase from list first
2856 ceph_assert(dirty_files
.count(h
->file
->dirty_seq
));
2857 auto it
= dirty_files
[h
->file
->dirty_seq
].iterator_to(*h
->file
);
2858 dirty_files
[h
->file
->dirty_seq
].erase(it
);
2859 h
->file
->dirty_seq
= log_seq
+ 1;
2860 dirty_files
[h
->file
->dirty_seq
].push_back(*h
->file
);
2861 dout(20) << __func__
<< " dirty_seq = " << log_seq
+ 1
2862 << " (was " << h
->file
->dirty_seq
<< ")" << dendl
;
2864 dout(20) << __func__
<< " dirty_seq = " << log_seq
+ 1
2865 << " (unchanged, do nothing) " << dendl
;
2871 int BlueFS::_flush_range(FileWriter
*h
, uint64_t offset
, uint64_t length
)
2873 dout(10) << __func__
<< " " << h
<< " pos 0x" << std::hex
<< h
->pos
2874 << " 0x" << offset
<< "~" << length
<< std::dec
2875 << " to " << h
->file
->fnode
<< dendl
;
2876 if (h
->file
->deleted
) {
2877 dout(10) << __func__
<< " deleted, no-op" << dendl
;
2881 ceph_assert(h
->file
->num_readers
.load() == 0);
2883 h
->buffer_appender
.flush();
2886 if (h
->file
->fnode
.ino
== 1)
2889 buffered
= cct
->_conf
->bluefs_buffered_io
;
2891 if (offset
+ length
<= h
->pos
)
2893 if (offset
< h
->pos
) {
2894 length
-= h
->pos
- offset
;
2896 dout(10) << " still need 0x"
2897 << std::hex
<< offset
<< "~" << length
<< std::dec
2900 ceph_assert(offset
<= h
->file
->fnode
.size
);
2902 uint64_t allocated
= h
->file
->fnode
.get_allocated();
2903 vselector
->sub_usage(h
->file
->vselector_hint
, h
->file
->fnode
);
2904 // do not bother to dirty the file if we are overwriting
2905 // previously allocated extents.
2907 if (allocated
< offset
+ length
) {
2908 // we should never run out of log space here; see the min runway check
2909 // in _flush_and_sync_log.
2910 ceph_assert(h
->file
->fnode
.ino
!= 1);
2911 int r
= _allocate(vselector
->select_prefer_bdev(h
->file
->vselector_hint
),
2912 offset
+ length
- allocated
,
2915 derr
<< __func__
<< " allocated: 0x" << std::hex
<< allocated
2916 << " offset: 0x" << offset
<< " length: 0x" << length
<< std::dec
2918 vselector
->add_usage(h
->file
->vselector_hint
, h
->file
->fnode
); // undo
2919 ceph_abort_msg("bluefs enospc");
2922 h
->file
->is_dirty
= true;
2924 if (h
->file
->fnode
.size
< offset
+ length
) {
2925 h
->file
->fnode
.size
= offset
+ length
;
2926 if (h
->file
->fnode
.ino
> 1) {
2927 // we do not need to dirty the log file (or it's compacting
2928 // replacement) when the file size changes because replay is
2929 // smart enough to discover it on its own.
2930 h
->file
->is_dirty
= true;
2933 dout(20) << __func__
<< " file now, unflushed " << h
->file
->fnode
<< dendl
;
2936 auto p
= h
->file
->fnode
.seek(offset
, &x_off
);
2937 ceph_assert(p
!= h
->file
->fnode
.extents
.end());
2938 dout(20) << __func__
<< " in " << *p
<< " x_off 0x"
2939 << std::hex
<< x_off
<< std::dec
<< dendl
;
2941 unsigned partial
= x_off
& ~super
.block_mask();
2944 dout(20) << __func__
<< " using partial tail 0x"
2945 << std::hex
<< partial
<< std::dec
<< dendl
;
2946 ceph_assert(h
->tail_block
.length() == partial
);
2947 bl
.claim_append_piecewise(h
->tail_block
);
2951 dout(20) << __func__
<< " waiting for previous aio to complete" << dendl
;
2952 for (auto p
: h
->iocv
) {
2958 if (length
== partial
+ h
->buffer
.length()) {
2959 /* in case of inital allocation and need to zero, limited flush is unacceptable */
2960 bl
.claim_append_piecewise(h
->buffer
);
2963 h
->buffer
.splice(0, length
, &t
);
2964 bl
.claim_append_piecewise(t
);
2965 t
.substr_of(h
->buffer
, length
, h
->buffer
.length() - length
);
2967 dout(20) << " leaving 0x" << std::hex
<< h
->buffer
.length() << std::dec
2968 << " unflushed" << dendl
;
2970 ceph_assert(bl
.length() == length
);
2972 h
->pos
= offset
+ length
;
2974 unsigned tail
= bl
.length() & ~super
.block_mask();
2976 dout(20) << __func__
<< " caching tail of 0x"
2978 << " and padding block with 0x" << (super
.block_size
- tail
)
2979 << std::dec
<< dendl
;
2980 h
->tail_block
.substr_of(bl
, bl
.length() - tail
, tail
);
2981 bl
.append_zero(super
.block_size
- tail
);
2982 length
+= super
.block_size
- tail
;
2984 h
->tail_block
.clear();
2986 ceph_assert(bl
.length() == length
);
2988 switch (h
->writer_type
) {
2990 logger
->inc(l_bluefs_bytes_written_wal
, length
);
2993 logger
->inc(l_bluefs_bytes_written_sst
, length
);
2997 dout(30) << "dump:\n";
3002 uint64_t bytes_written_slow
= 0;
3003 while (length
> 0) {
3004 uint64_t x_len
= std::min(p
->length
- x_off
, length
);
3006 t
.substr_of(bl
, bloff
, x_len
);
3007 if (cct
->_conf
->bluefs_sync_write
) {
3008 bdev
[p
->bdev
]->write(p
->offset
+ x_off
, t
, buffered
, h
->write_hint
);
3010 bdev
[p
->bdev
]->aio_write(p
->offset
+ x_off
, t
, h
->iocv
[p
->bdev
], buffered
, h
->write_hint
);
3012 h
->dirty_devs
[p
->bdev
] = true;
3013 if (p
->bdev
== BDEV_SLOW
) {
3014 bytes_written_slow
+= t
.length();
3022 logger
->inc(l_bluefs_bytes_written_slow
, bytes_written_slow
);
3023 for (unsigned i
= 0; i
< MAX_BDEV
; ++i
) {
3025 if (h
->iocv
[i
] && h
->iocv
[i
]->has_pending_aios()) {
3026 bdev
[i
]->aio_submit(h
->iocv
[i
]);
3030 vselector
->add_usage(h
->file
->vselector_hint
, h
->file
->fnode
);
3031 dout(20) << __func__
<< " h " << h
<< " pos now 0x"
3032 << std::hex
<< h
->pos
<< std::dec
<< dendl
;
3037 // we need to retire old completed aios so they don't stick around in
3038 // memory indefinitely (along with their bufferlist refs).
3039 void BlueFS::_claim_completed_aios(FileWriter
*h
, list
<aio_t
> *ls
)
3041 for (auto p
: h
->iocv
) {
3043 ls
->splice(ls
->end(), p
->running_aios
);
3046 dout(10) << __func__
<< " got " << ls
->size() << " aios" << dendl
;
3049 void BlueFS::wait_for_aio(FileWriter
*h
)
3051 // NOTE: this is safe to call without a lock, as long as our reference is
3053 dout(10) << __func__
<< " " << h
<< dendl
;
3054 utime_t start
= ceph_clock_now();
3055 for (auto p
: h
->iocv
) {
3060 dout(10) << __func__
<< " " << h
<< " done in " << (ceph_clock_now() - start
) << dendl
;
3064 int BlueFS::_flush(FileWriter
*h
, bool force
, std::unique_lock
<ceph::mutex
>& l
)
3066 bool flushed
= false;
3067 int r
= _flush(h
, force
, &flushed
);
3068 if (r
== 0 && flushed
) {
3069 _maybe_compact_log(l
);
3074 int BlueFS::_flush(FileWriter
*h
, bool force
, bool *flushed
)
3076 h
->buffer_appender
.flush();
3077 uint64_t length
= h
->buffer
.length();
3078 uint64_t offset
= h
->pos
;
3083 length
< cct
->_conf
->bluefs_min_flush_size
) {
3084 dout(10) << __func__
<< " " << h
<< " ignoring, length " << length
3085 << " < min_flush_size " << cct
->_conf
->bluefs_min_flush_size
3090 dout(10) << __func__
<< " " << h
<< " no dirty data on "
3091 << h
->file
->fnode
<< dendl
;
3094 dout(10) << __func__
<< " " << h
<< " 0x"
3095 << std::hex
<< offset
<< "~" << length
<< std::dec
3096 << " to " << h
->file
->fnode
<< dendl
;
3097 ceph_assert(h
->pos
<= h
->file
->fnode
.size
);
3098 int r
= _flush_range(h
, offset
, length
);
3105 int BlueFS::_truncate(FileWriter
*h
, uint64_t offset
)
3107 dout(10) << __func__
<< " 0x" << std::hex
<< offset
<< std::dec
3108 << " file " << h
->file
->fnode
<< dendl
;
3109 if (h
->file
->deleted
) {
3110 dout(10) << __func__
<< " deleted, no-op" << dendl
;
3114 // we never truncate internal log files
3115 ceph_assert(h
->file
->fnode
.ino
> 1);
3117 h
->buffer_appender
.flush();
3119 // truncate off unflushed data?
3120 if (h
->pos
< offset
&&
3121 h
->pos
+ h
->buffer
.length() > offset
) {
3123 dout(20) << __func__
<< " tossing out last " << offset
- h
->pos
3124 << " unflushed bytes" << dendl
;
3125 t
.substr_of(h
->buffer
, 0, offset
- h
->pos
);
3127 ceph_abort_msg("actually this shouldn't happen");
3129 if (h
->buffer
.length()) {
3130 int r
= _flush(h
, true);
3134 if (offset
== h
->file
->fnode
.size
) {
3137 if (offset
> h
->file
->fnode
.size
) {
3138 ceph_abort_msg("truncate up not supported");
3140 ceph_assert(h
->file
->fnode
.size
>= offset
);
3141 vselector
->sub_usage(h
->file
->vselector_hint
, h
->file
->fnode
.size
);
3142 h
->file
->fnode
.size
= offset
;
3143 vselector
->add_usage(h
->file
->vselector_hint
, h
->file
->fnode
.size
);
3144 log_t
.op_file_update(h
->file
->fnode
);
3148 int BlueFS::_fsync(FileWriter
*h
, std::unique_lock
<ceph::mutex
>& l
)
3150 dout(10) << __func__
<< " " << h
<< " " << h
->file
->fnode
<< dendl
;
3151 int r
= _flush(h
, true);
3154 if (h
->file
->is_dirty
) {
3155 _signal_dirty_to_log(h
);
3156 h
->file
->is_dirty
= false;
3158 uint64_t old_dirty_seq
= h
->file
->dirty_seq
;
3160 _flush_bdev_safely(h
);
3162 if (old_dirty_seq
) {
3163 uint64_t s
= log_seq
;
3164 dout(20) << __func__
<< " file metadata was dirty (" << old_dirty_seq
3165 << ") on " << h
->file
->fnode
<< ", flushing log" << dendl
;
3166 _flush_and_sync_log(l
, old_dirty_seq
);
3167 ceph_assert(h
->file
->dirty_seq
== 0 || // cleaned
3168 h
->file
->dirty_seq
> s
); // or redirtied by someone else
3173 void BlueFS::_flush_bdev_safely(FileWriter
*h
)
3175 std::array
<bool, MAX_BDEV
> flush_devs
= h
->dirty_devs
;
3176 h
->dirty_devs
.fill(false);
3178 if (!cct
->_conf
->bluefs_sync_write
) {
3179 list
<aio_t
> completed_ios
;
3180 _claim_completed_aios(h
, &completed_ios
);
3183 completed_ios
.clear();
3184 flush_bdev(flush_devs
);
3190 flush_bdev(flush_devs
);
3195 void BlueFS::flush_bdev(std::array
<bool, MAX_BDEV
>& dirty_bdevs
)
3197 // NOTE: this is safe to call without a lock.
3198 dout(20) << __func__
<< dendl
;
3199 for (unsigned i
= 0; i
< MAX_BDEV
; i
++) {
3205 void BlueFS::flush_bdev()
3207 // NOTE: this is safe to call without a lock.
3208 dout(20) << __func__
<< dendl
;
3209 for (auto p
: bdev
) {
3215 const char* BlueFS::get_device_name(unsigned id
)
3217 if (id
>= MAX_BDEV
) return "BDEV_INV";
3218 const char* names
[] = {"BDEV_WAL", "BDEV_DB", "BDEV_SLOW", "BDEV_NEWWAL", "BDEV_NEWDB"};
3222 int BlueFS::_expand_slow_device(uint64_t need
, PExtentVector
& extents
)
3225 if (slow_dev_expander
) {
3226 auto id
= _get_slow_device_id();
3227 auto min_alloc_size
= alloc_size
[id
];
3228 ceph_assert(id
<= alloc
.size() && alloc
[id
]);
3229 auto min_need
= round_up_to(need
, min_alloc_size
);
3230 need
= std::max(need
,
3231 slow_dev_expander
->get_recommended_expansion_delta(
3232 alloc
[id
]->get_free(), block_all
[id
].size()));
3234 need
= round_up_to(need
, min_alloc_size
);
3235 dout(10) << __func__
<< " expanding slow device by 0x"
3236 << std::hex
<< need
<< std::dec
3238 r
= slow_dev_expander
->allocate_freespace(min_need
, need
, extents
);
3243 int BlueFS::_allocate_without_fallback(uint8_t id
, uint64_t len
,
3244 PExtentVector
* extents
)
3246 dout(10) << __func__
<< " len 0x" << std::hex
<< len
<< std::dec
3247 << " from " << (int)id
<< dendl
;
3248 assert(id
< alloc
.size());
3252 extents
->reserve(4); // 4 should be (more than) enough for most allocations
3253 uint64_t min_alloc_size
= alloc_size
[id
];
3254 uint64_t left
= round_up_to(len
, min_alloc_size
);
3255 int64_t alloc_len
= alloc
[id
]->allocate(left
, min_alloc_size
, 0, extents
);
3256 if (alloc_len
< 0 || alloc_len
< (int64_t)left
) {
3257 if (alloc_len
> 0) {
3258 alloc
[id
]->release(*extents
);
3261 derr
<< __func__
<< " failed to allocate 0x" << std::hex
<< left
3262 << " on bdev " << (int)id
3263 << ", free 0x" << alloc
[id
]->get_free() << std::dec
<< dendl
;
3265 derr
<< __func__
<< " failed to allocate 0x" << std::hex
<< left
3266 << " on bdev " << (int)id
<< ", dne" << std::dec
<< dendl
;
3275 int BlueFS::_allocate(uint8_t id
, uint64_t len
,
3276 bluefs_fnode_t
* node
)
3278 dout(10) << __func__
<< " len 0x" << std::hex
<< len
<< std::dec
3279 << " from " << (int)id
<< dendl
;
3280 ceph_assert(id
< alloc
.size());
3281 int64_t alloc_len
= 0;
3282 PExtentVector extents
;
3285 if (!node
->extents
.empty() && node
->extents
.back().bdev
== id
) {
3286 hint
= node
->extents
.back().end();
3288 extents
.reserve(4); // 4 should be (more than) enough for most allocations
3289 alloc_len
= alloc
[id
]->allocate(round_up_to(len
, alloc_size
[id
]),
3290 alloc_size
[id
], hint
, &extents
);
3294 alloc_len
< (int64_t)round_up_to(len
, alloc_size
[id
])) {
3295 if (alloc_len
> 0) {
3296 alloc
[id
]->release(extents
);
3298 if (id
!= BDEV_SLOW
) {
3300 dout(1) << __func__
<< " failed to allocate 0x" << std::hex
<< len
3301 << " on bdev " << (int)id
3302 << ", free 0x" << alloc
[id
]->get_free()
3303 << "; fallback to bdev " << (int)id
+ 1
3304 << std::dec
<< dendl
;
3306 return _allocate(id
+ 1, len
, node
);
3308 dout(1) << __func__
<< " unable to allocate 0x" << std::hex
<< len
3309 << " on bdev " << (int)id
<< ", free 0x"
3310 << (alloc
[id
] ? alloc
[id
]->get_free() : (uint64_t)-1)
3311 << "; fallback to slow device expander "
3312 << std::dec
<< dendl
;
3314 if (_expand_slow_device(len
, extents
) == 0) {
3315 id
= _get_slow_device_id();
3316 for (auto& e
: extents
) {
3317 _add_block_extent(id
, e
.offset
, e
.length
);
3320 auto* last_alloc
= alloc
[id
];
3321 ceph_assert(last_alloc
);
3323 alloc_len
= last_alloc
->allocate(round_up_to(len
, alloc_size
[id
]),
3324 alloc_size
[id
], hint
, &extents
);
3325 if (alloc_len
< 0 || alloc_len
< (int64_t)len
) {
3326 if (alloc_len
> 0) {
3327 last_alloc
->release(extents
);
3329 derr
<< __func__
<< " failed to allocate 0x" << std::hex
<< len
3330 << " on bdev " << (int)id
3331 << ", free 0x" << last_alloc
->get_free() << std::dec
<< dendl
;
3335 derr
<< __func__
<< " failed to expand slow device to fit +0x"
3336 << std::hex
<< len
<< std::dec
3341 uint64_t total_allocated
=
3342 block_all
[id
].size() - alloc
[id
]->get_free();
3343 if (max_bytes
[id
] < total_allocated
) {
3344 logger
->set(max_bytes_pcounters
[id
], total_allocated
);
3345 max_bytes
[id
] = total_allocated
;
3349 for (auto& p
: extents
) {
3350 node
->append_extent(bluefs_extent_t(id
, p
.offset
, p
.length
));
3356 int BlueFS::_preallocate(FileRef f
, uint64_t off
, uint64_t len
)
3358 dout(10) << __func__
<< " file " << f
->fnode
<< " 0x"
3359 << std::hex
<< off
<< "~" << len
<< std::dec
<< dendl
;
3361 dout(10) << __func__
<< " deleted, no-op" << dendl
;
3364 ceph_assert(f
->fnode
.ino
> 1);
3365 uint64_t allocated
= f
->fnode
.get_allocated();
3366 if (off
+ len
> allocated
) {
3367 uint64_t want
= off
+ len
- allocated
;
3368 vselector
->sub_usage(f
->vselector_hint
, f
->fnode
);
3370 int r
= _allocate(vselector
->select_prefer_bdev(f
->vselector_hint
),
3373 vselector
->add_usage(f
->vselector_hint
, f
->fnode
);
3376 log_t
.op_file_update(f
->fnode
);
3381 void BlueFS::sync_metadata(bool avoid_compact
)
3383 std::unique_lock
<ceph::mutex
> l(lock
);
3384 if (log_t
.empty() && dirty_files
.empty()) {
3385 dout(10) << __func__
<< " - no pending log events" << dendl
;
3387 dout(10) << __func__
<< dendl
;
3388 utime_t start
= ceph_clock_now();
3389 flush_bdev(); // FIXME?
3390 _flush_and_sync_log(l
);
3391 dout(10) << __func__
<< " done in " << (ceph_clock_now() - start
) << dendl
;
3394 if (!avoid_compact
) {
3395 _maybe_compact_log(l
);
3399 void BlueFS::_maybe_compact_log(std::unique_lock
<ceph::mutex
>& l
)
3401 if (!cct
->_conf
->bluefs_replay_recovery_disable_compact
&&
3402 _should_compact_log()) {
3403 if (cct
->_conf
->bluefs_compact_log_sync
) {
3404 _compact_log_sync();
3406 _compact_log_async(l
);
3411 int BlueFS::open_for_write(
3412 std::string_view dirname
,
3413 std::string_view filename
,
3417 std::lock_guard
l(lock
);
3418 dout(10) << __func__
<< " " << dirname
<< "/" << filename
<< dendl
;
3419 map
<string
,DirRef
>::iterator p
= dir_map
.find(dirname
);
3421 if (p
== dir_map
.end()) {
3422 // implicitly create the dir
3423 dout(20) << __func__
<< " dir " << dirname
3424 << " does not exist" << dendl
;
3431 bool create
= false;
3432 bool truncate
= false;
3433 map
<string
,FileRef
>::iterator q
= dir
->file_map
.find(filename
);
3434 if (q
== dir
->file_map
.end()) {
3436 dout(20) << __func__
<< " dir " << dirname
<< " (" << dir
3437 << ") file " << filename
3438 << " does not exist" << dendl
;
3441 file
= ceph::make_ref
<File
>();
3442 file
->fnode
.ino
= ++ino_last
;
3443 file_map
[ino_last
] = file
;
3444 dir
->file_map
[string
{filename
}] = file
;
3448 // overwrite existing file?
3451 dout(20) << __func__
<< " dir " << dirname
<< " (" << dir
3452 << ") file " << filename
3453 << " already exists, overwrite in place" << dendl
;
3455 dout(20) << __func__
<< " dir " << dirname
<< " (" << dir
3456 << ") file " << filename
3457 << " already exists, truncate + overwrite" << dendl
;
3458 vselector
->sub_usage(file
->vselector_hint
, file
->fnode
);
3459 file
->fnode
.size
= 0;
3460 for (auto& p
: file
->fnode
.extents
) {
3461 pending_release
[p
.bdev
].insert(p
.offset
, p
.length
);
3465 file
->fnode
.clear_extents();
3468 ceph_assert(file
->fnode
.ino
> 1);
3470 file
->fnode
.mtime
= ceph_clock_now();
3471 file
->vselector_hint
= vselector
->get_hint_by_dir(dirname
);
3472 if (create
|| truncate
) {
3473 vselector
->add_usage(file
->vselector_hint
, file
->fnode
); // update file count
3476 dout(20) << __func__
<< " mapping " << dirname
<< "/" << filename
3477 << " vsel_hint " << file
->vselector_hint
3480 log_t
.op_file_update(file
->fnode
);
3482 log_t
.op_dir_link(dirname
, filename
, file
->fnode
.ino
);
3484 *h
= _create_writer(file
);
3486 if (boost::algorithm::ends_with(filename
, ".log")) {
3487 (*h
)->writer_type
= BlueFS::WRITER_WAL
;
3488 if (logger
&& !overwrite
) {
3489 logger
->inc(l_bluefs_files_written_wal
);
3491 } else if (boost::algorithm::ends_with(filename
, ".sst")) {
3492 (*h
)->writer_type
= BlueFS::WRITER_SST
;
3494 logger
->inc(l_bluefs_files_written_sst
);
3498 dout(10) << __func__
<< " h " << *h
<< " on " << file
->fnode
<< dendl
;
3502 BlueFS::FileWriter
*BlueFS::_create_writer(FileRef f
)
3504 FileWriter
*w
= new FileWriter(f
);
3505 for (unsigned i
= 0; i
< MAX_BDEV
; ++i
) {
3507 w
->iocv
[i
] = new IOContext(cct
, NULL
);
3513 void BlueFS::_close_writer(FileWriter
*h
)
3515 dout(10) << __func__
<< " " << h
<< " type " << h
->writer_type
<< dendl
;
3516 h
->buffer
.reassign_to_mempool(mempool::mempool_bluefs_file_writer
);
3517 for (unsigned i
=0; i
<MAX_BDEV
; ++i
) {
3520 h
->iocv
[i
]->aio_wait();
3521 bdev
[i
]->queue_reap_ioc(h
->iocv
[i
]);
3528 uint64_t BlueFS::debug_get_dirty_seq(FileWriter
*h
)
3530 std::lock_guard
l(lock
);
3531 return h
->file
->dirty_seq
;
3534 bool BlueFS::debug_get_is_dev_dirty(FileWriter
*h
, uint8_t dev
)
3536 std::lock_guard
l(lock
);
3537 return h
->dirty_devs
[dev
];
3540 int BlueFS::open_for_read(
3541 std::string_view dirname
,
3542 std::string_view filename
,
3546 std::lock_guard
l(lock
);
3547 dout(10) << __func__
<< " " << dirname
<< "/" << filename
3548 << (random
? " (random)":" (sequential)") << dendl
;
3549 map
<string
,DirRef
>::iterator p
= dir_map
.find(dirname
);
3550 if (p
== dir_map
.end()) {
3551 dout(20) << __func__
<< " dir " << dirname
<< " not found" << dendl
;
3554 DirRef dir
= p
->second
;
3556 map
<string
,FileRef
>::iterator q
= dir
->file_map
.find(filename
);
3557 if (q
== dir
->file_map
.end()) {
3558 dout(20) << __func__
<< " dir " << dirname
<< " (" << dir
3559 << ") file " << filename
3560 << " not found" << dendl
;
3563 File
*file
= q
->second
.get();
3565 *h
= new FileReader(file
, random
? 4096 : cct
->_conf
->bluefs_max_prefetch
,
3567 dout(10) << __func__
<< " h " << *h
<< " on " << file
->fnode
<< dendl
;
3572 std::string_view old_dirname
, std::string_view old_filename
,
3573 std::string_view new_dirname
, std::string_view new_filename
)
3575 std::lock_guard
l(lock
);
3576 dout(10) << __func__
<< " " << old_dirname
<< "/" << old_filename
3577 << " -> " << new_dirname
<< "/" << new_filename
<< dendl
;
3578 map
<string
,DirRef
>::iterator p
= dir_map
.find(old_dirname
);
3579 if (p
== dir_map
.end()) {
3580 dout(20) << __func__
<< " dir " << old_dirname
<< " not found" << dendl
;
3583 DirRef old_dir
= p
->second
;
3584 map
<string
,FileRef
>::iterator q
= old_dir
->file_map
.find(old_filename
);
3585 if (q
== old_dir
->file_map
.end()) {
3586 dout(20) << __func__
<< " dir " << old_dirname
<< " (" << old_dir
3587 << ") file " << old_filename
3588 << " not found" << dendl
;
3591 FileRef file
= q
->second
;
3593 p
= dir_map
.find(new_dirname
);
3594 if (p
== dir_map
.end()) {
3595 dout(20) << __func__
<< " dir " << new_dirname
<< " not found" << dendl
;
3598 DirRef new_dir
= p
->second
;
3599 q
= new_dir
->file_map
.find(new_filename
);
3600 if (q
!= new_dir
->file_map
.end()) {
3601 dout(20) << __func__
<< " dir " << new_dirname
<< " (" << old_dir
3602 << ") file " << new_filename
3603 << " already exists, unlinking" << dendl
;
3604 ceph_assert(q
->second
!= file
);
3605 log_t
.op_dir_unlink(new_dirname
, new_filename
);
3606 _drop_link(q
->second
);
3609 dout(10) << __func__
<< " " << new_dirname
<< "/" << new_filename
<< " "
3610 << " " << file
->fnode
<< dendl
;
3612 new_dir
->file_map
[string
{new_filename
}] = file
;
3613 old_dir
->file_map
.erase(string
{old_filename
});
3615 log_t
.op_dir_link(new_dirname
, new_filename
, file
->fnode
.ino
);
3616 log_t
.op_dir_unlink(old_dirname
, old_filename
);
3620 int BlueFS::mkdir(std::string_view dirname
)
3622 std::lock_guard
l(lock
);
3623 dout(10) << __func__
<< " " << dirname
<< dendl
;
3624 map
<string
,DirRef
>::iterator p
= dir_map
.find(dirname
);
3625 if (p
!= dir_map
.end()) {
3626 dout(20) << __func__
<< " dir " << dirname
<< " exists" << dendl
;
3629 dir_map
[string
{dirname
}] = ceph::make_ref
<Dir
>();
3630 log_t
.op_dir_create(dirname
);
3634 int BlueFS::rmdir(std::string_view dirname
)
3636 std::lock_guard
l(lock
);
3637 dout(10) << __func__
<< " " << dirname
<< dendl
;
3638 auto p
= dir_map
.find(dirname
);
3639 if (p
== dir_map
.end()) {
3640 dout(20) << __func__
<< " dir " << dirname
<< " does not exist" << dendl
;
3643 DirRef dir
= p
->second
;
3644 if (!dir
->file_map
.empty()) {
3645 dout(20) << __func__
<< " dir " << dirname
<< " not empty" << dendl
;
3648 dir_map
.erase(string
{dirname
});
3649 log_t
.op_dir_remove(dirname
);
3653 bool BlueFS::dir_exists(std::string_view dirname
)
3655 std::lock_guard
l(lock
);
3656 map
<string
,DirRef
>::iterator p
= dir_map
.find(dirname
);
3657 bool exists
= p
!= dir_map
.end();
3658 dout(10) << __func__
<< " " << dirname
<< " = " << (int)exists
<< dendl
;
3662 int BlueFS::stat(std::string_view dirname
, std::string_view filename
,
3663 uint64_t *size
, utime_t
*mtime
)
3665 std::lock_guard
l(lock
);
3666 dout(10) << __func__
<< " " << dirname
<< "/" << filename
<< dendl
;
3667 map
<string
,DirRef
>::iterator p
= dir_map
.find(dirname
);
3668 if (p
== dir_map
.end()) {
3669 dout(20) << __func__
<< " dir " << dirname
<< " not found" << dendl
;
3672 DirRef dir
= p
->second
;
3673 map
<string
,FileRef
>::iterator q
= dir
->file_map
.find(filename
);
3674 if (q
== dir
->file_map
.end()) {
3675 dout(20) << __func__
<< " dir " << dirname
<< " (" << dir
3676 << ") file " << filename
3677 << " not found" << dendl
;
3680 File
*file
= q
->second
.get();
3681 dout(10) << __func__
<< " " << dirname
<< "/" << filename
3682 << " " << file
->fnode
<< dendl
;
3684 *size
= file
->fnode
.size
;
3686 *mtime
= file
->fnode
.mtime
;
3690 int BlueFS::lock_file(std::string_view dirname
, std::string_view filename
,
3693 std::lock_guard
l(lock
);
3694 dout(10) << __func__
<< " " << dirname
<< "/" << filename
<< dendl
;
3695 map
<string
,DirRef
>::iterator p
= dir_map
.find(dirname
);
3696 if (p
== dir_map
.end()) {
3697 dout(20) << __func__
<< " dir " << dirname
<< " not found" << dendl
;
3700 DirRef dir
= p
->second
;
3701 auto q
= dir
->file_map
.find(filename
);
3703 if (q
== dir
->file_map
.end()) {
3704 dout(20) << __func__
<< " dir " << dirname
<< " (" << dir
3705 << ") file " << filename
3706 << " not found, creating" << dendl
;
3707 file
= ceph::make_ref
<File
>();
3708 file
->fnode
.ino
= ++ino_last
;
3709 file
->fnode
.mtime
= ceph_clock_now();
3710 file_map
[ino_last
] = file
;
3711 dir
->file_map
[string
{filename
}] = file
;
3713 log_t
.op_file_update(file
->fnode
);
3714 log_t
.op_dir_link(dirname
, filename
, file
->fnode
.ino
);
3718 dout(10) << __func__
<< " already locked" << dendl
;
3722 file
->locked
= true;
3723 *plock
= new FileLock(file
);
3724 dout(10) << __func__
<< " locked " << file
->fnode
3725 << " with " << *plock
<< dendl
;
3729 int BlueFS::unlock_file(FileLock
*fl
)
3731 std::lock_guard
l(lock
);
3732 dout(10) << __func__
<< " " << fl
<< " on " << fl
->file
->fnode
<< dendl
;
3733 ceph_assert(fl
->file
->locked
);
3734 fl
->file
->locked
= false;
3739 int BlueFS::readdir(std::string_view dirname
, vector
<string
> *ls
)
3741 // dirname may contain a trailing /
3742 if (!dirname
.empty() && dirname
.back() == '/') {
3743 dirname
.remove_suffix(1);
3745 std::lock_guard
l(lock
);
3746 dout(10) << __func__
<< " " << dirname
<< dendl
;
3747 if (dirname
.empty()) {
3749 ls
->reserve(dir_map
.size() + 2);
3750 for (auto& q
: dir_map
) {
3751 ls
->push_back(q
.first
);
3754 // list files in dir
3755 map
<string
,DirRef
>::iterator p
= dir_map
.find(dirname
);
3756 if (p
== dir_map
.end()) {
3757 dout(20) << __func__
<< " dir " << dirname
<< " not found" << dendl
;
3760 DirRef dir
= p
->second
;
3761 ls
->reserve(dir
->file_map
.size() + 2);
3762 for (auto& q
: dir
->file_map
) {
3763 ls
->push_back(q
.first
);
3767 ls
->push_back("..");
3771 int BlueFS::unlink(std::string_view dirname
, std::string_view filename
)
3773 std::lock_guard
l(lock
);
3774 dout(10) << __func__
<< " " << dirname
<< "/" << filename
<< dendl
;
3775 map
<string
,DirRef
>::iterator p
= dir_map
.find(dirname
);
3776 if (p
== dir_map
.end()) {
3777 dout(20) << __func__
<< " dir " << dirname
<< " not found" << dendl
;
3780 DirRef dir
= p
->second
;
3781 map
<string
,FileRef
>::iterator q
= dir
->file_map
.find(filename
);
3782 if (q
== dir
->file_map
.end()) {
3783 dout(20) << __func__
<< " file " << dirname
<< "/" << filename
3784 << " not found" << dendl
;
3787 FileRef file
= q
->second
;
3789 dout(20) << __func__
<< " file " << dirname
<< "/" << filename
3790 << " is locked" << dendl
;
3793 dir
->file_map
.erase(string
{filename
});
3794 log_t
.op_dir_unlink(dirname
, filename
);
3799 bool BlueFS::wal_is_rotational()
3801 if (bdev
[BDEV_WAL
]) {
3802 return bdev
[BDEV_WAL
]->is_rotational();
3803 } else if (bdev
[BDEV_DB
]) {
3804 return bdev
[BDEV_DB
]->is_rotational();
3806 return bdev
[BDEV_SLOW
]->is_rotational();
3811 do_replay_recovery_read is used when bluefs log abruptly ends, but it seems that more data should be there.
3812 Idea is to search disk for definiton of extents that will be accompanied with bluefs log in future,
3813 and try if using it will produce healthy bluefs transaction.
3814 We encode already known bluefs log extents and search disk for these bytes.
3815 When we find it, we decode following bytes as extent.
3816 We read that whole extent and then check if merged with existing log part gives a proper bluefs transaction.
3818 int BlueFS::do_replay_recovery_read(FileReader
*log_reader
,
3823 dout(1) << __func__
<< " replay_pos=0x" << std::hex
<< replay_pos
<<
3824 " needs 0x" << read_offset
<< "~" << read_len
<< std::dec
<< dendl
;
3826 bluefs_fnode_t
& log_fnode
= log_reader
->file
->fnode
;
3827 bufferlist bin_extents
;
3828 ceph::encode(log_fnode
.extents
, bin_extents
);
3829 dout(2) << __func__
<< " log file encoded extents length = " << bin_extents
.length() << dendl
;
3831 // cannot process if too small to effectively search
3832 ceph_assert(bin_extents
.length() >= 32);
3834 last_32
.substr_of(bin_extents
, bin_extents
.length() - 32, 32);
3836 //read fixed part from replay_pos to end of bluefs_log extents
3839 auto e
= log_fnode
.seek(replay_pos
, &e_off
);
3840 ceph_assert(e
!= log_fnode
.extents
.end());
3841 int r
= bdev
[e
->bdev
]->read(e
->offset
+ e_off
, e
->length
- e_off
, &fixed
, ioc
[e
->bdev
],
3842 cct
->_conf
->bluefs_buffered_io
);
3843 ceph_assert(r
== 0);
3844 //capture dev of last good extent
3845 uint8_t last_e_dev
= e
->bdev
;
3846 uint64_t last_e_off
= e
->offset
;
3848 while (e
!= log_fnode
.extents
.end()) {
3849 r
= bdev
[e
->bdev
]->read(e
->offset
, e
->length
, &fixed
, ioc
[e
->bdev
],
3850 cct
->_conf
->bluefs_buffered_io
);
3851 ceph_assert(r
== 0);
3852 last_e_dev
= e
->bdev
;
3855 ceph_assert(replay_pos
+ fixed
.length() == read_offset
);
3857 dout(2) << __func__
<< " valid data in log = " << fixed
.length() << dendl
;
3860 bool operator()(const bluefs_extent_t
& a
, const bluefs_extent_t
& b
) const {
3861 if (a
.bdev
< b
.bdev
) return true;
3862 if (a
.offset
< b
.offset
) return true;
3863 return a
.length
< b
.length
;
3866 std::set
<bluefs_extent_t
, compare
> extents_rejected
;
3867 for (int dcnt
= 0; dcnt
< 3; dcnt
++) {
3868 uint8_t dev
= (last_e_dev
+ dcnt
) % MAX_BDEV
;
3869 if (bdev
[dev
] == nullptr) continue;
3870 dout(2) << __func__
<< " processing " << get_device_name(dev
) << dendl
;
3871 interval_set
<uint64_t> disk_regions
;
3872 disk_regions
.insert(0, bdev
[dev
]->get_size());
3873 for (auto f
: file_map
) {
3874 auto& e
= f
.second
->fnode
.extents
;
3876 if (p
.bdev
== dev
) {
3877 disk_regions
.erase(p
.offset
, p
.length
);
3881 size_t disk_regions_count
= disk_regions
.num_intervals();
3882 dout(5) << __func__
<< " " << disk_regions_count
<< " regions to scan on " << get_device_name(dev
) << dendl
;
3884 auto reg
= disk_regions
.lower_bound(last_e_off
);
3885 //for all except first, start from beginning
3887 if (reg
== disk_regions
.end()) {
3888 reg
= disk_regions
.begin();
3890 const uint64_t chunk_size
= 4 * 1024 * 1024;
3891 const uint64_t page_size
= 4096;
3892 const uint64_t max_extent_size
= 16;
3893 uint64_t overlay_size
= last_32
.length() + max_extent_size
;
3894 for (size_t i
= 0; i
< disk_regions_count
; reg
++, i
++) {
3895 if (reg
== disk_regions
.end()) {
3896 reg
= disk_regions
.begin();
3898 uint64_t pos
= reg
.get_start();
3899 uint64_t len
= reg
.get_len();
3901 std::unique_ptr
<char[]> raw_data_p
{new char[page_size
+ chunk_size
]};
3902 char* raw_data
= raw_data_p
.get();
3903 memset(raw_data
, 0, page_size
);
3905 while (len
> last_32
.length()) {
3906 uint64_t chunk_len
= len
> chunk_size
? chunk_size
: len
;
3907 dout(5) << __func__
<< " read "
3908 << get_device_name(dev
) << ":0x" << std::hex
<< pos
<< "+" << chunk_len
<< std::dec
<< dendl
;
3909 r
= bdev
[dev
]->read_random(pos
, chunk_len
, raw_data
+ page_size
, cct
->_conf
->bluefs_buffered_io
);
3910 ceph_assert(r
== 0);
3912 //search for fixed_last_32
3913 char* chunk_b
= raw_data
+ page_size
;
3914 char* chunk_e
= chunk_b
+ chunk_len
;
3916 char* search_b
= chunk_b
- overlay_size
;
3917 char* search_e
= chunk_e
;
3919 for (char* sp
= search_b
; ; sp
+= last_32
.length()) {
3920 sp
= (char*)memmem(sp
, search_e
- sp
, last_32
.c_str(), last_32
.length());
3921 if (sp
== nullptr) {
3925 char* n
= sp
+ last_32
.length();
3926 dout(5) << __func__
<< " checking location 0x" << std::hex
<< pos
+ (n
- chunk_b
) << std::dec
<< dendl
;
3928 test
.append(n
, std::min
<size_t>(max_extent_size
, chunk_e
- n
));
3931 bufferlist::const_iterator p
= test
.begin();
3932 ceph::decode(ne
, p
);
3933 } catch (buffer::error
& e
) {
3936 if (extents_rejected
.count(ne
) != 0) {
3937 dout(5) << __func__
<< " extent " << ne
<< " already refected" <<dendl
;
3940 //insert as rejected already. if we succeed, it wouldn't make difference.
3941 extents_rejected
.insert(ne
);
3943 if (ne
.bdev
>= MAX_BDEV
||
3944 bdev
[ne
.bdev
] == nullptr ||
3945 ne
.length
> 16 * 1024 * 1024 ||
3946 (ne
.length
& 4095) != 0 ||
3947 ne
.offset
+ ne
.length
> bdev
[ne
.bdev
]->get_size() ||
3948 (ne
.offset
& 4095) != 0) {
3949 dout(5) << __func__
<< " refusing extent " << ne
<< dendl
;
3952 dout(5) << __func__
<< " checking extent " << ne
<< dendl
;
3954 //read candidate extent - whole
3955 bufferlist candidate
;
3956 candidate
.append(fixed
);
3957 r
= bdev
[ne
.bdev
]->read(ne
.offset
, ne
.length
, &candidate
, ioc
[ne
.bdev
],
3958 cct
->_conf
->bluefs_buffered_io
);
3959 ceph_assert(r
== 0);
3961 //check if transaction & crc is ok
3962 bluefs_transaction_t t
;
3964 bufferlist::const_iterator p
= candidate
.cbegin();
3967 catch (buffer::error
& e
) {
3968 dout(5) << __func__
<< " failed match" << dendl
;
3972 //success, it seems a probable candidate
3973 uint64_t l
= std::min
<uint64_t>(ne
.length
, read_len
);
3974 //trim to required size
3975 bufferlist requested_read
;
3976 requested_read
.substr_of(candidate
, fixed
.length(), l
);
3977 bl
->append(requested_read
);
3978 dout(5) << __func__
<< " successful extension of log " << l
<< "/" << read_len
<< dendl
;
3979 log_fnode
.append_extent(ne
);
3980 log_fnode
.recalc_allocated();
3981 log_reader
->buf
.pos
+= l
;
3984 //save overlay for next search
3985 memcpy(search_b
, chunk_e
- overlay_size
, overlay_size
);
3994 void BlueFS::debug_inject_duplicate_gift(unsigned id
,
3998 dout(0) << __func__
<< dendl
;
3999 if (id
< alloc
.size() && alloc
[id
]) {
4000 alloc
[id
]->init_add_free(offset
, len
);
4004 // ===============================================
4005 // OriginalVolumeSelector
4007 void* OriginalVolumeSelector::get_hint_for_log() const {
4008 return reinterpret_cast<void*>(BlueFS::BDEV_WAL
);
4010 void* OriginalVolumeSelector::get_hint_by_dir(std::string_view dirname
) const {
4011 uint8_t res
= BlueFS::BDEV_DB
;
4012 if (dirname
.length() > 5) {
4013 // the "db.slow" and "db.wal" directory names are hard-coded at
4014 // match up with bluestore. the slow device is always the second
4015 // one (when a dedicated block.db device is present and used at
4016 // bdev 0). the wal device is always last.
4017 if (boost::algorithm::ends_with(dirname
, ".slow")) {
4018 res
= BlueFS::BDEV_SLOW
;
4020 else if (boost::algorithm::ends_with(dirname
, ".wal")) {
4021 res
= BlueFS::BDEV_WAL
;
4024 return reinterpret_cast<void*>(res
);
4027 uint8_t OriginalVolumeSelector::select_prefer_bdev(void* hint
)
4029 return (uint8_t)(reinterpret_cast<uint64_t>(hint
));
4032 void OriginalVolumeSelector::get_paths(const std::string
& base
, paths
& res
) const
4034 res
.emplace_back(base
, db_total
);
4035 res
.emplace_back(base
+ ".slow", slow_total
);
4039 #define dout_prefix *_dout << "OriginalVolumeSelector: "
4041 void OriginalVolumeSelector::dump(ostream
& sout
) {
4042 sout
<< "wal_total:" << wal_total
4043 << ", db_total:" << db_total
4044 << ", slow_total:" << slow_total