]> git.proxmox.com Git - ceph.git/blame - ceph/src/os/bluestore/BlueFS.cc
Import ceph 15.2.8
[ceph.git] / ceph / src / os / bluestore / BlueFS.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3
4#include "boost/algorithm/string.hpp"
9f95a23c 5#include "bluestore_common.h"
7c673cae
FG
6#include "BlueFS.h"
7
8#include "common/debug.h"
9#include "common/errno.h"
10#include "common/perf_counters.h"
11#include "BlockDevice.h"
12#include "Allocator.h"
11fdf7f2 13#include "include/ceph_assert.h"
eafe8130 14#include "common/admin_socket.h"
7c673cae
FG
15
16#define dout_context cct
17#define dout_subsys ceph_subsys_bluefs
18#undef dout_prefix
19#define dout_prefix *_dout << "bluefs "
9f95a23c 20using TOPNSPC::common::cmd_getval;
7c673cae
FG
21MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::File, bluefs_file, bluefs);
22MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::Dir, bluefs_dir, bluefs);
f91f0fd5 23MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileWriter, bluefs_file_writer, bluefs_file_writer);
7c673cae 24MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileReaderBuffer,
f91f0fd5
TL
25 bluefs_file_reader_buffer, bluefs_file_reader);
26MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileReader, bluefs_file_reader, bluefs_file_reader);
7c673cae
FG
27MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileLock, bluefs_file_lock, bluefs);
28
11fdf7f2
TL
29static void wal_discard_cb(void *priv, void* priv2) {
30 BlueFS *bluefs = static_cast<BlueFS*>(priv);
31 interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2);
32 bluefs->handle_discard(BlueFS::BDEV_WAL, *tmp);
33}
34
35static void db_discard_cb(void *priv, void* priv2) {
36 BlueFS *bluefs = static_cast<BlueFS*>(priv);
37 interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2);
38 bluefs->handle_discard(BlueFS::BDEV_DB, *tmp);
39}
40
41static void slow_discard_cb(void *priv, void* priv2) {
42 BlueFS *bluefs = static_cast<BlueFS*>(priv);
43 interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2);
44 bluefs->handle_discard(BlueFS::BDEV_SLOW, *tmp);
45}
7c673cae 46
eafe8130
TL
47class BlueFS::SocketHook : public AdminSocketHook {
48 BlueFS* bluefs;
49public:
50 static BlueFS::SocketHook* create(BlueFS* bluefs)
51 {
52 BlueFS::SocketHook* hook = nullptr;
53 AdminSocket* admin_socket = bluefs->cct->get_admin_socket();
54 if (admin_socket) {
55 hook = new BlueFS::SocketHook(bluefs);
9f95a23c 56 int r = admin_socket->register_command("bluestore bluefs available "
eafe8130
TL
57 "name=alloc_size,type=CephInt,req=false",
58 hook,
59 "Report available space for bluefs. "
60 "If alloc_size set, make simulation.");
61 if (r != 0) {
62 ldout(bluefs->cct, 1) << __func__ << " cannot register SocketHook" << dendl;
63 delete hook;
64 hook = nullptr;
9f95a23c 65 } else {
f6b5b4d7 66 r = admin_socket->register_command("bluefs stats",
9f95a23c
TL
67 hook,
68 "Dump internal statistics for bluefs."
69 "");
70 ceph_assert(r == 0);
eafe8130
TL
71 }
72 }
73 return hook;
74 }
75
76 ~SocketHook() {
77 AdminSocket* admin_socket = bluefs->cct->get_admin_socket();
9f95a23c 78 admin_socket->unregister_commands(this);
eafe8130
TL
79 }
80private:
81 SocketHook(BlueFS* bluefs) :
82 bluefs(bluefs) {}
9f95a23c
TL
83 int call(std::string_view command, const cmdmap_t& cmdmap,
84 Formatter *f,
85 std::ostream& errss,
86 bufferlist& out) override {
87 if (command == "bluestore bluefs available") {
88 int64_t alloc_size = 0;
89 cmd_getval(cmdmap, "alloc_size", alloc_size);
90 if ((alloc_size & (alloc_size - 1)) != 0) {
91 errss << "Invalid allocation size:'" << alloc_size << std::endl;
92 return -EINVAL;
93 }
94 if (alloc_size == 0)
95 alloc_size = bluefs->cct->_conf->bluefs_alloc_size;
96 f->open_object_section("bluefs_available_space");
97 for (unsigned dev = BDEV_WAL; dev <= BDEV_SLOW; dev++) {
98 if (bluefs->bdev[dev]) {
99 f->open_object_section("dev");
100 f->dump_string("device", bluefs->get_device_name(dev));
101 ceph_assert(bluefs->alloc[dev]);
102 f->dump_int("free", bluefs->alloc[dev]->get_free());
103 f->close_section();
104 }
105 }
106 size_t extra_space = 0;
107 if (bluefs->slow_dev_expander) {
108 extra_space = bluefs->slow_dev_expander->available_freespace(alloc_size);
eafe8130 109 }
9f95a23c
TL
110 f->dump_int("available_from_bluestore", extra_space);
111 f->close_section();
112 } else if (command == "bluefs stats") {
113 std::stringstream ss;
114 bluefs->dump_block_extents(ss);
115 bluefs->dump_volume_selector(ss);
eafe8130 116 out.append(ss);
9f95a23c
TL
117 } else {
118 errss << "Invalid command" << std::endl;
119 return -ENOSYS;
eafe8130 120 }
9f95a23c
TL
121 return 0;
122 }
eafe8130
TL
123};
124
7c673cae
FG
125BlueFS::BlueFS(CephContext* cct)
126 : cct(cct),
127 bdev(MAX_BDEV),
128 ioc(MAX_BDEV),
11fdf7f2 129 block_all(MAX_BDEV)
7c673cae 130{
11fdf7f2
TL
131 discard_cb[BDEV_WAL] = wal_discard_cb;
132 discard_cb[BDEV_DB] = db_discard_cb;
133 discard_cb[BDEV_SLOW] = slow_discard_cb;
eafe8130 134 asok_hook = SocketHook::create(this);
7c673cae
FG
135}
136
137BlueFS::~BlueFS()
138{
eafe8130 139 delete asok_hook;
7c673cae
FG
140 for (auto p : ioc) {
141 if (p)
142 p->aio_wait();
143 }
144 for (auto p : bdev) {
145 if (p) {
146 p->close();
147 delete p;
148 }
149 }
150 for (auto p : ioc) {
151 delete p;
152 }
153}
154
155void BlueFS::_init_logger()
156{
157 PerfCountersBuilder b(cct, "bluefs",
158 l_bluefs_first, l_bluefs_last);
159 b.add_u64_counter(l_bluefs_gift_bytes, "gift_bytes",
11fdf7f2 160 "Bytes gifted from BlueStore", NULL, 0, unit_t(UNIT_BYTES));
7c673cae 161 b.add_u64_counter(l_bluefs_reclaim_bytes, "reclaim_bytes",
11fdf7f2 162 "Bytes reclaimed by BlueStore", NULL, 0, unit_t(UNIT_BYTES));
7c673cae
FG
163 b.add_u64(l_bluefs_db_total_bytes, "db_total_bytes",
164 "Total bytes (main db device)",
11fdf7f2 165 "b", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
7c673cae
FG
166 b.add_u64(l_bluefs_db_used_bytes, "db_used_bytes",
167 "Used bytes (main db device)",
11fdf7f2 168 "u", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
7c673cae
FG
169 b.add_u64(l_bluefs_wal_total_bytes, "wal_total_bytes",
170 "Total bytes (wal device)",
11fdf7f2 171 "walb", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
7c673cae
FG
172 b.add_u64(l_bluefs_wal_used_bytes, "wal_used_bytes",
173 "Used bytes (wal device)",
11fdf7f2 174 "walu", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
7c673cae
FG
175 b.add_u64(l_bluefs_slow_total_bytes, "slow_total_bytes",
176 "Total bytes (slow device)",
11fdf7f2 177 "slob", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
7c673cae
FG
178 b.add_u64(l_bluefs_slow_used_bytes, "slow_used_bytes",
179 "Used bytes (slow device)",
11fdf7f2 180 "slou", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
7c673cae
FG
181 b.add_u64(l_bluefs_num_files, "num_files", "File count",
182 "f", PerfCountersBuilder::PRIO_USEFUL);
183 b.add_u64(l_bluefs_log_bytes, "log_bytes", "Size of the metadata log",
11fdf7f2 184 "jlen", PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));
7c673cae
FG
185 b.add_u64_counter(l_bluefs_log_compactions, "log_compactions",
186 "Compactions of the metadata log");
187 b.add_u64_counter(l_bluefs_logged_bytes, "logged_bytes",
188 "Bytes written to the metadata log", "j",
11fdf7f2 189 PerfCountersBuilder::PRIO_CRITICAL, unit_t(UNIT_BYTES));
7c673cae
FG
190 b.add_u64_counter(l_bluefs_files_written_wal, "files_written_wal",
191 "Files written to WAL");
192 b.add_u64_counter(l_bluefs_files_written_sst, "files_written_sst",
193 "Files written to SSTs");
194 b.add_u64_counter(l_bluefs_bytes_written_wal, "bytes_written_wal",
195 "Bytes written to WAL", "wal",
196 PerfCountersBuilder::PRIO_CRITICAL);
197 b.add_u64_counter(l_bluefs_bytes_written_sst, "bytes_written_sst",
198 "Bytes written to SSTs", "sst",
11fdf7f2
TL
199 PerfCountersBuilder::PRIO_CRITICAL, unit_t(UNIT_BYTES));
200 b.add_u64_counter(l_bluefs_bytes_written_slow, "bytes_written_slow",
201 "Bytes written to WAL/SSTs at slow device", NULL,
202 PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
203 b.add_u64_counter(l_bluefs_max_bytes_wal, "max_bytes_wal",
204 "Maximum bytes allocated from WAL");
205 b.add_u64_counter(l_bluefs_max_bytes_db, "max_bytes_db",
206 "Maximum bytes allocated from DB");
207 b.add_u64_counter(l_bluefs_max_bytes_slow, "max_bytes_slow",
208 "Maximum bytes allocated from SLOW");
494da23a
TL
209
210 b.add_u64_counter(l_bluefs_read_random_count, "read_random_count",
211 "random read requests processed");
212 b.add_u64_counter(l_bluefs_read_random_bytes, "read_random_bytes",
213 "Bytes requested in random read mode", NULL,
214 PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
215 b.add_u64_counter(l_bluefs_read_random_disk_count, "read_random_disk_count",
216 "random reads requests going to disk");
217 b.add_u64_counter(l_bluefs_read_random_disk_bytes, "read_random_disk_bytes",
218 "Bytes read from disk in random read mode", NULL,
219 PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
220 b.add_u64_counter(l_bluefs_read_random_buffer_count, "read_random_buffer_count",
221 "random read requests processed using prefetch buffer");
222 b.add_u64_counter(l_bluefs_read_random_buffer_bytes, "read_random_buffer_bytes",
223 "Bytes read from prefetch buffer in random read mode", NULL,
224 PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
225
226 b.add_u64_counter(l_bluefs_read_count, "read_count",
227 "buffered read requests processed");
228 b.add_u64_counter(l_bluefs_read_bytes, "read_bytes",
229 "Bytes requested in buffered read mode", NULL,
230 PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
231
232 b.add_u64_counter(l_bluefs_read_prefetch_count, "read_prefetch_count",
233 "prefetch read requests processed");
234 b.add_u64_counter(l_bluefs_read_prefetch_bytes, "read_prefetch_bytes",
235 "Bytes requested in prefetch read mode", NULL,
236 PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
237
7c673cae
FG
238 logger = b.create_perf_counters();
239 cct->get_perfcounters_collection()->add(logger);
240}
241
242void BlueFS::_shutdown_logger()
243{
244 cct->get_perfcounters_collection()->remove(logger);
245 delete logger;
246}
247
248void BlueFS::_update_logger_stats()
249{
250 // we must be holding the lock
251 logger->set(l_bluefs_num_files, file_map.size());
252 logger->set(l_bluefs_log_bytes, log_writer->file->fnode.size);
253
254 if (alloc[BDEV_WAL]) {
11fdf7f2 255 logger->set(l_bluefs_wal_total_bytes, block_all[BDEV_WAL].size());
7c673cae 256 logger->set(l_bluefs_wal_used_bytes,
11fdf7f2 257 block_all[BDEV_WAL].size() - alloc[BDEV_WAL]->get_free());
7c673cae
FG
258 }
259 if (alloc[BDEV_DB]) {
11fdf7f2 260 logger->set(l_bluefs_db_total_bytes, block_all[BDEV_DB].size());
7c673cae 261 logger->set(l_bluefs_db_used_bytes,
11fdf7f2 262 block_all[BDEV_DB].size() - alloc[BDEV_DB]->get_free());
7c673cae
FG
263 }
264 if (alloc[BDEV_SLOW]) {
11fdf7f2 265 logger->set(l_bluefs_slow_total_bytes, block_all[BDEV_SLOW].size());
7c673cae 266 logger->set(l_bluefs_slow_used_bytes,
11fdf7f2 267 block_all[BDEV_SLOW].size() - alloc[BDEV_SLOW]->get_free());
7c673cae
FG
268 }
269}
270
11fdf7f2
TL
271int BlueFS::add_block_device(unsigned id, const string& path, bool trim,
272 bool shared_with_bluestore)
7c673cae
FG
273{
274 dout(10) << __func__ << " bdev " << id << " path " << path << dendl;
11fdf7f2
TL
275 ceph_assert(id < bdev.size());
276 ceph_assert(bdev[id] == NULL);
277 BlockDevice *b = BlockDevice::create(cct, path, NULL, NULL,
278 discard_cb[id], static_cast<void*>(this));
279 if (shared_with_bluestore) {
280 b->set_no_exclusive_lock();
281 }
7c673cae
FG
282 int r = b->open(path);
283 if (r < 0) {
284 delete b;
285 return r;
286 }
11fdf7f2
TL
287 if (trim) {
288 b->discard(0, b->get_size());
289 }
290
7c673cae 291 dout(1) << __func__ << " bdev " << id << " path " << path
1adf2230 292 << " size " << byte_u_t(b->get_size()) << dendl;
7c673cae
FG
293 bdev[id] = b;
294 ioc[id] = new IOContext(cct, NULL);
295 return 0;
296}
297
298bool BlueFS::bdev_support_label(unsigned id)
299{
11fdf7f2
TL
300 ceph_assert(id < bdev.size());
301 ceph_assert(bdev[id]);
7c673cae
FG
302 return bdev[id]->supported_bdev_label();
303}
304
305uint64_t BlueFS::get_block_device_size(unsigned id)
306{
307 if (id < bdev.size() && bdev[id])
308 return bdev[id]->get_size();
309 return 0;
310}
311
1911f103
TL
312void BlueFS::_add_block_extent(unsigned id, uint64_t offset, uint64_t length,
313 bool skip)
7c673cae 314{
7c673cae 315 dout(1) << __func__ << " bdev " << id
11fdf7f2 316 << " 0x" << std::hex << offset << "~" << length << std::dec
1911f103 317 << " skip " << skip
7c673cae 318 << dendl;
11fdf7f2
TL
319
320 ceph_assert(id < bdev.size());
321 ceph_assert(bdev[id]);
322 ceph_assert(bdev[id]->get_size() >= offset + length);
7c673cae 323 block_all[id].insert(offset, length);
7c673cae
FG
324
325 if (id < alloc.size() && alloc[id]) {
1911f103
TL
326 if (!skip)
327 log_t.op_alloc_add(id, offset, length);
328
7c673cae
FG
329 alloc[id]->init_add_free(offset, length);
330 }
331
332 if (logger)
333 logger->inc(l_bluefs_gift_bytes, length);
334 dout(10) << __func__ << " done" << dendl;
335}
336
337int BlueFS::reclaim_blocks(unsigned id, uint64_t want,
a8e16298 338 PExtentVector *extents)
7c673cae 339{
11fdf7f2 340 std::unique_lock l(lock);
7c673cae
FG
341 dout(1) << __func__ << " bdev " << id
342 << " want 0x" << std::hex << want << std::dec << dendl;
11fdf7f2
TL
343 ceph_assert(id < alloc.size());
344 ceph_assert(alloc[id]);
9f95a23c
TL
345 int64_t got = 0;
346
347 interval_set<uint64_t> granular;
348 while (want > 0 && !block_unused_too_granular[id].empty()) {
349 auto p = block_unused_too_granular[id].begin();
350 dout(20) << __func__ << " unused " << (int)id << ":"
351 << std::hex << p.get_start() << "~" << p.get_len() << dendl;
352 extents->push_back({p.get_start(), p.get_len()});
353 granular.insert(p.get_start(), p.get_len());
354 if (want >= p.get_len()) {
355 want -= p.get_len();
356 } else {
357 want = 0;
358 }
359 got += p.get_len();
360 block_unused_too_granular[id].erase(p);
7c673cae
FG
361 }
362
9f95a23c
TL
363 if (want > 0) {
364 got += alloc[id]->allocate(want, alloc_size[id], 0, extents);
365 ceph_assert(got != 0);
366 if (got < 0) {
367 derr << __func__ << " failed to allocate space to return to bluestore"
368 << dendl;
369 alloc[id]->dump();
370 block_unused_too_granular[id].insert(granular);
371 return got;
372 }
7c673cae 373
9f95a23c
TL
374 for (auto& p : *extents) {
375 block_all[id].erase(p.offset, p.length);
376 log_t.op_alloc_rm(id, p.offset, p.length);
377 }
378
379 flush_bdev();
380 int r = _flush_and_sync_log(l);
381 ceph_assert(r == 0);
382 }
7c673cae 383
11fdf7f2 384 logger->inc(l_bluefs_reclaim_bytes, got);
7c673cae
FG
385 dout(1) << __func__ << " bdev " << id << " want 0x" << std::hex << want
386 << " got " << *extents << dendl;
387 return 0;
388}
389
11fdf7f2 390void BlueFS::handle_discard(unsigned id, interval_set<uint64_t>& to_release)
7c673cae 391{
11fdf7f2
TL
392 dout(10) << __func__ << " bdev " << id << dendl;
393 ceph_assert(alloc[id]);
394 alloc[id]->release(to_release);
395}
396
397uint64_t BlueFS::get_used()
398{
399 std::lock_guard l(lock);
400 uint64_t used = 0;
401 for (unsigned id = 0; id < MAX_BDEV; ++id) {
402 if (alloc[id]) {
403 used += block_all[id].size() - alloc[id]->get_free();
404 }
7c673cae 405 }
11fdf7f2 406 return used;
7c673cae
FG
407}
408
409uint64_t BlueFS::get_total(unsigned id)
410{
11fdf7f2
TL
411 std::lock_guard l(lock);
412 ceph_assert(id < block_all.size());
413 return block_all[id].size();
7c673cae
FG
414}
415
416uint64_t BlueFS::get_free(unsigned id)
417{
11fdf7f2
TL
418 std::lock_guard l(lock);
419 ceph_assert(id < alloc.size());
7c673cae
FG
420 return alloc[id]->get_free();
421}
422
423void BlueFS::dump_perf_counters(Formatter *f)
424{
425 f->open_object_section("bluefs_perf_counters");
426 logger->dump_formatted(f,0);
427 f->close_section();
428}
429
3efd9988
FG
430void BlueFS::dump_block_extents(ostream& out)
431{
432 for (unsigned i = 0; i < MAX_BDEV; ++i) {
433 if (!bdev[i]) {
434 continue;
435 }
11fdf7f2
TL
436 auto owned = get_total(i);
437 auto free = get_free(i);
1911f103 438
11fdf7f2
TL
439 out << i << " : device size 0x" << std::hex << bdev[i]->get_size()
440 << " : own 0x" << block_all[i]
441 << " = 0x" << owned
442 << " : using 0x" << owned - free
1911f103
TL
443 << std::dec << "(" << byte_u_t(owned - free) << ")";
444 if (i == _get_slow_device_id()) {
445 ceph_assert(slow_dev_expander);
446 ceph_assert(alloc[i]);
447 free = slow_dev_expander->available_freespace(alloc_size[i]);
448 out << std::hex
449 << " : bluestore has 0x" << free
450 << std::dec << "(" << byte_u_t(free) << ") available";
451 }
452 out << "\n";
3efd9988
FG
453 }
454}
7c673cae
FG
455
456void BlueFS::get_usage(vector<pair<uint64_t,uint64_t>> *usage)
457{
11fdf7f2 458 std::lock_guard l(lock);
7c673cae
FG
459 usage->resize(bdev.size());
460 for (unsigned id = 0; id < bdev.size(); ++id) {
461 if (!bdev[id]) {
462 (*usage)[id] = make_pair(0, 0);
463 continue;
464 }
465 (*usage)[id].first = alloc[id]->get_free();
11fdf7f2 466 (*usage)[id].second = block_all[id].size();
7c673cae 467 uint64_t used =
11fdf7f2 468 (block_all[id].size() - (*usage)[id].first) * 100 / block_all[id].size();
7c673cae
FG
469 dout(10) << __func__ << " bdev " << id
470 << " free " << (*usage)[id].first
1adf2230 471 << " (" << byte_u_t((*usage)[id].first) << ")"
7c673cae 472 << " / " << (*usage)[id].second
1adf2230 473 << " (" << byte_u_t((*usage)[id].second) << ")"
7c673cae
FG
474 << ", used " << used << "%"
475 << dendl;
476 }
477}
478
479int BlueFS::get_block_extents(unsigned id, interval_set<uint64_t> *extents)
480{
11fdf7f2 481 std::lock_guard l(lock);
7c673cae
FG
482 dout(10) << __func__ << " bdev " << id << dendl;
483 if (id >= block_all.size())
484 return -EINVAL;
485 *extents = block_all[id];
486 return 0;
487}
488
9f95a23c 489int BlueFS::mkfs(uuid_d osd_uuid, const bluefs_layout_t& layout)
7c673cae 490{
11fdf7f2 491 std::unique_lock l(lock);
7c673cae
FG
492 dout(1) << __func__
493 << " osd_uuid " << osd_uuid
494 << dendl;
495
9f95a23c
TL
496 // set volume selector if not provided before/outside
497 if (vselector == nullptr) {
498 vselector.reset(
499 new OriginalVolumeSelector(
500 get_block_device_size(BlueFS::BDEV_WAL) * 95 / 100,
501 get_block_device_size(BlueFS::BDEV_DB) * 95 / 100,
502 get_block_device_size(BlueFS::BDEV_SLOW) * 95 / 100));
503 }
504
7c673cae
FG
505 _init_alloc();
506 _init_logger();
507
508 super.version = 1;
509 super.block_size = bdev[BDEV_DB]->get_block_size();
510 super.osd_uuid = osd_uuid;
511 super.uuid.generate_random();
512 dout(1) << __func__ << " uuid " << super.uuid << dendl;
513
514 // init log
9f95a23c 515 FileRef log_file = ceph::make_ref<File>();
7c673cae 516 log_file->fnode.ino = 1;
f6b5b4d7 517 log_file->vselector_hint = vselector->get_hint_for_log();
7c673cae 518 int r = _allocate(
9f95a23c 519 vselector->select_prefer_bdev(log_file->vselector_hint),
7c673cae 520 cct->_conf->bluefs_max_log_runway,
94b18763 521 &log_file->fnode);
9f95a23c 522 vselector->add_usage(log_file->vselector_hint, log_file->fnode);
11fdf7f2 523 ceph_assert(r == 0);
7c673cae
FG
524 log_writer = _create_writer(log_file);
525
526 // initial txn
527 log_t.op_init();
528 for (unsigned bdev = 0; bdev < MAX_BDEV; ++bdev) {
529 interval_set<uint64_t>& p = block_all[bdev];
530 if (p.empty())
531 continue;
532 for (interval_set<uint64_t>::iterator q = p.begin(); q != p.end(); ++q) {
533 dout(20) << __func__ << " op_alloc_add " << bdev << " 0x"
534 << std::hex << q.get_start() << "~" << q.get_len() << std::dec
535 << dendl;
536 log_t.op_alloc_add(bdev, q.get_start(), q.get_len());
537 }
538 }
539 _flush_and_sync_log(l);
540
541 // write supers
542 super.log_fnode = log_file->fnode;
9f95a23c 543 super.memorized_layout = layout;
11fdf7f2 544 _write_super(BDEV_DB);
7c673cae
FG
545 flush_bdev();
546
547 // clean up
548 super = bluefs_super_t();
549 _close_writer(log_writer);
550 log_writer = NULL;
551 block_all.clear();
9f95a23c 552 vselector.reset(nullptr);
7c673cae
FG
553 _stop_alloc();
554 _shutdown_logger();
555
556 dout(10) << __func__ << " success" << dendl;
557 return 0;
558}
559
560void BlueFS::_init_alloc()
561{
562 dout(20) << __func__ << dendl;
563 alloc.resize(MAX_BDEV);
eafe8130 564 alloc_size.resize(MAX_BDEV, 0);
7c673cae 565 pending_release.resize(MAX_BDEV);
9f95a23c 566 block_unused_too_granular.resize(MAX_BDEV);
eafe8130
TL
567
568 if (bdev[BDEV_WAL]) {
569 alloc_size[BDEV_WAL] = cct->_conf->bluefs_alloc_size;
570 }
571 if (bdev[BDEV_SLOW]) {
572 alloc_size[BDEV_DB] = cct->_conf->bluefs_alloc_size;
573 alloc_size[BDEV_SLOW] = cct->_conf->bluefs_shared_alloc_size;
574 } else {
575 alloc_size[BDEV_DB] = cct->_conf->bluefs_shared_alloc_size;
576 }
577 // new wal and db devices are never shared
578 if (bdev[BDEV_NEWWAL]) {
579 alloc_size[BDEV_NEWWAL] = cct->_conf->bluefs_alloc_size;
580 }
581 if (bdev[BDEV_NEWDB]) {
582 alloc_size[BDEV_NEWDB] = cct->_conf->bluefs_alloc_size;
583 }
584
7c673cae
FG
585 for (unsigned id = 0; id < bdev.size(); ++id) {
586 if (!bdev[id]) {
587 continue;
588 }
11fdf7f2 589 ceph_assert(bdev[id]->get_size());
eafe8130
TL
590 std::string name = "bluefs-";
591 const char* devnames[] = {"wal","db","slow"};
592 if (id <= BDEV_SLOW)
593 name += devnames[id];
594 else
595 name += to_string(uintptr_t(this));
596 ceph_assert(alloc_size[id]);
597 dout(1) << __func__ << " id " << id
598 << " alloc_size 0x" << std::hex << alloc_size[id]
599 << " size 0x" << bdev[id]->get_size() << std::dec << dendl;
7c673cae
FG
600 alloc[id] = Allocator::create(cct, cct->_conf->bluefs_allocator,
601 bdev[id]->get_size(),
eafe8130 602 alloc_size[id], name);
7c673cae
FG
603 interval_set<uint64_t>& p = block_all[id];
604 for (interval_set<uint64_t>::iterator q = p.begin(); q != p.end(); ++q) {
605 alloc[id]->init_add_free(q.get_start(), q.get_len());
606 }
607 }
608}
609
610void BlueFS::_stop_alloc()
611{
612 dout(20) << __func__ << dendl;
11fdf7f2
TL
613 for (auto p : bdev) {
614 if (p)
615 p->discard_drain();
616 }
617
7c673cae
FG
618 for (auto p : alloc) {
619 if (p != nullptr) {
620 p->shutdown();
621 delete p;
622 }
623 }
624 alloc.clear();
9f95a23c 625 block_unused_too_granular.clear();
7c673cae
FG
626}
627
628int BlueFS::mount()
629{
630 dout(1) << __func__ << dendl;
631
632 int r = _open_super();
633 if (r < 0) {
634 derr << __func__ << " failed to open super: " << cpp_strerror(r) << dendl;
635 goto out;
636 }
637
9f95a23c
TL
638 // set volume selector if not provided before/outside
639 if (vselector == nullptr) {
640 vselector.reset(
641 new OriginalVolumeSelector(
642 get_block_device_size(BlueFS::BDEV_WAL) * 95 / 100,
643 get_block_device_size(BlueFS::BDEV_DB) * 95 / 100,
644 get_block_device_size(BlueFS::BDEV_SLOW) * 95 / 100));
645 }
646
7c673cae
FG
647 block_all.clear();
648 block_all.resize(MAX_BDEV);
7c673cae 649 _init_alloc();
494da23a 650 _init_logger();
7c673cae 651
11fdf7f2 652 r = _replay(false, false);
7c673cae
FG
653 if (r < 0) {
654 derr << __func__ << " failed to replay log: " << cpp_strerror(r) << dendl;
655 _stop_alloc();
656 goto out;
657 }
658
659 // init freelist
660 for (auto& p : file_map) {
661 dout(30) << __func__ << " noting alloc for " << p.second->fnode << dendl;
662 for (auto& q : p.second->fnode.extents) {
663 alloc[q.bdev]->init_rm_free(q.offset, q.length);
664 }
665 }
666
667 // set up the log for future writes
668 log_writer = _create_writer(_get_file(1));
11fdf7f2 669 ceph_assert(log_writer->file->fnode.ino == 1);
7c673cae
FG
670 log_writer->pos = log_writer->file->fnode.size;
671 dout(10) << __func__ << " log write pos set to 0x"
672 << std::hex << log_writer->pos << std::dec
673 << dendl;
674
7c673cae
FG
675 return 0;
676
677 out:
678 super = bluefs_super_t();
679 return r;
680}
681
9f95a23c
TL
682int BlueFS::maybe_verify_layout(const bluefs_layout_t& layout) const
683{
684 if (super.memorized_layout) {
685 if (layout == *super.memorized_layout) {
686 dout(10) << __func__ << " bluefs layout verified positively" << dendl;
687 } else {
688 derr << __func__ << " memorized layout doesn't fit current one" << dendl;
689 return -EIO;
690 }
691 } else {
692 dout(10) << __func__ << " no memorized_layout in bluefs superblock"
693 << dendl;
694 }
695
696 return 0;
697}
698
1911f103 699void BlueFS::umount(bool avoid_compact)
7c673cae
FG
700{
701 dout(1) << __func__ << dendl;
702
1911f103 703 sync_metadata(avoid_compact);
7c673cae
FG
704
705 _close_writer(log_writer);
706 log_writer = NULL;
707
9f95a23c 708 vselector.reset(nullptr);
7c673cae
FG
709 _stop_alloc();
710 file_map.clear();
711 dir_map.clear();
712 super = bluefs_super_t();
713 log_t.clear();
714 _shutdown_logger();
715}
716
9f95a23c 717int BlueFS::prepare_new_device(int id, const bluefs_layout_t& layout)
7c673cae 718{
11fdf7f2
TL
719 dout(1) << __func__ << dendl;
720
721 if(id == BDEV_NEWDB) {
722 int new_log_dev_cur = BDEV_WAL;
723 int new_log_dev_next = BDEV_WAL;
724 if (!bdev[BDEV_WAL]) {
725 new_log_dev_cur = BDEV_NEWDB;
726 new_log_dev_next = BDEV_DB;
727 }
9f95a23c 728 _rewrite_log_and_layout_sync(false,
11fdf7f2
TL
729 BDEV_NEWDB,
730 new_log_dev_cur,
731 new_log_dev_next,
9f95a23c
TL
732 RENAME_DB2SLOW,
733 layout);
11fdf7f2
TL
734 //}
735 } else if(id == BDEV_NEWWAL) {
9f95a23c
TL
736 _rewrite_log_and_layout_sync(false,
737 BDEV_DB,
738 BDEV_NEWWAL,
739 BDEV_WAL,
740 REMOVE_WAL,
741 layout);
11fdf7f2
TL
742 } else {
743 assert(false);
744 }
745 return 0;
746}
747
748void BlueFS::collect_metadata(map<string,string> *pm, unsigned skip_bdev_id)
749{
750 if (skip_bdev_id != BDEV_DB && bdev[BDEV_DB])
7c673cae
FG
751 bdev[BDEV_DB]->collect_metadata("bluefs_db_", pm);
752 if (bdev[BDEV_WAL])
753 bdev[BDEV_WAL]->collect_metadata("bluefs_wal_", pm);
11fdf7f2
TL
754}
755
756void BlueFS::get_devices(set<string> *ls)
757{
758 for (unsigned i = 0; i < MAX_BDEV; ++i) {
759 if (bdev[i]) {
760 bdev[i]->get_devices(ls);
761 }
762 }
7c673cae
FG
763}
764
765int BlueFS::fsck()
766{
11fdf7f2 767 std::lock_guard l(lock);
7c673cae
FG
768 dout(1) << __func__ << dendl;
769 // hrm, i think we check everything on mount...
770 return 0;
771}
772
11fdf7f2 773int BlueFS::_write_super(int dev)
7c673cae
FG
774{
775 // build superblock
776 bufferlist bl;
11fdf7f2 777 encode(super, bl);
7c673cae 778 uint32_t crc = bl.crc32c(-1);
11fdf7f2 779 encode(crc, bl);
7c673cae
FG
780 dout(10) << __func__ << " super block length(encoded): " << bl.length() << dendl;
781 dout(10) << __func__ << " superblock " << super.version << dendl;
782 dout(10) << __func__ << " log_fnode " << super.log_fnode << dendl;
9f95a23c 783 ceph_assert_always(bl.length() <= get_super_length());
7c673cae
FG
784 bl.append_zero(get_super_length() - bl.length());
785
11fdf7f2 786 bdev[dev]->write(get_super_offset(), bl, false, WRITE_LIFE_SHORT);
7c673cae
FG
787 dout(20) << __func__ << " v " << super.version
788 << " crc 0x" << std::hex << crc
789 << " offset 0x" << get_super_offset() << std::dec
790 << dendl;
791 return 0;
792}
793
794int BlueFS::_open_super()
795{
796 dout(10) << __func__ << dendl;
797
798 bufferlist bl;
799 uint32_t expected_crc, crc;
800 int r;
801
802 // always the second block
803 r = bdev[BDEV_DB]->read(get_super_offset(), get_super_length(),
804 &bl, ioc[BDEV_DB], false);
805 if (r < 0)
806 return r;
807
11fdf7f2
TL
808 auto p = bl.cbegin();
809 decode(super, p);
7c673cae
FG
810 {
811 bufferlist t;
812 t.substr_of(bl, 0, p.get_off());
813 crc = t.crc32c(-1);
814 }
11fdf7f2 815 decode(expected_crc, p);
7c673cae
FG
816 if (crc != expected_crc) {
817 derr << __func__ << " bad crc on superblock, expected 0x"
818 << std::hex << expected_crc << " != actual 0x" << crc << std::dec
819 << dendl;
820 return -EIO;
821 }
822 dout(10) << __func__ << " superblock " << super.version << dendl;
823 dout(10) << __func__ << " log_fnode " << super.log_fnode << dendl;
824 return 0;
825}
826
9f95a23c
TL
827int BlueFS::_check_new_allocations(const bluefs_fnode_t& fnode,
828 size_t dev_count,
829 boost::dynamic_bitset<uint64_t>* owned_blocks,
830 boost::dynamic_bitset<uint64_t>* used_blocks)
831{
832 auto& fnode_extents = fnode.extents;
833 for (auto e : fnode_extents) {
834 auto id = e.bdev;
835 bool fail = false;
836 ceph_assert(id < dev_count);
837 apply_for_bitset_range(e.offset, e.length, alloc_size[id], owned_blocks[id],
838 [&](uint64_t pos, boost::dynamic_bitset<uint64_t> &bs) {
839 if (!bs.test(pos)) {
840 fail = true;
841 }
842 }
843 );
844 if (fail) {
845 derr << __func__ << " invalid extent " << int(id)
846 << ": 0x" << std::hex << e.offset << "~" << e.length
847 << std::dec
848 << ": wasn't given but allocated for ino " << fnode.ino
849 << dendl;
850 return -EFAULT;
851 }
852
853 apply_for_bitset_range(e.offset, e.length, alloc_size[id], used_blocks[id],
854 [&](uint64_t pos, boost::dynamic_bitset<uint64_t> &bs) {
855 if (bs.test(pos)) {
856 fail = true;
857 }
858 bs.set(pos);
859 }
860 );
861 if (fail) {
862 derr << __func__ << " invalid extent " << int(e.bdev)
863 << ": 0x" << std::hex << e.offset << "~" << e.length
864 << std::dec << ": duplicate reference, ino " << fnode.ino
865 << dendl;
866 return -EFAULT;
867 }
868 }
869 return 0;
870}
871
872int BlueFS::_adjust_granularity(
873 __u8 id, uint64_t *offset, uint64_t *length, bool alloc)
874{
875 const char *op = alloc ? "op_alloc_add" : "op_alloc_rm";
876 auto oldo = *offset;
877 auto oldl = *length;
878 if (*offset & (alloc_size[id] - 1)) {
879 *offset &= ~(alloc_size[id] - 1);
880 *offset += alloc_size[id];
881 if (*length > *offset - oldo) {
882 if (alloc) {
883 block_unused_too_granular[id].insert(oldo, *offset - oldo);
884 } else {
885 block_unused_too_granular[id].erase(oldo, *offset - oldo);
886 }
887 *length -= (*offset - oldo);
888 } else {
889 if (alloc) {
890 block_unused_too_granular[id].insert(oldo, *length);
891 } else {
892 block_unused_too_granular[id].erase(oldo, *length);
893 }
894 *length = 0;
895 }
896 }
897 if (*length & (alloc_size[id] - 1)) {
898 *length &= ~(alloc_size[id] - 1);
899 if (alloc) {
900 block_unused_too_granular[id].insert(
901 *offset + *length,
902 oldo + oldl - *offset - *length);
903 } else {
904 block_unused_too_granular[id].erase(
905 *offset + *length,
906 oldo + oldl - *offset - *length);
907 }
908 }
909 if (oldo != *offset || oldl != *length) {
910 dout(10) << __func__ << " " << op << " "
911 << (int)id << ":" << std::hex << oldo << "~" << oldl
912 << " -> " << (int)id << ":" << *offset << "~" << *length << dendl;
913 }
914 return 0;
915}
916
917int BlueFS::_verify_alloc_granularity(
918 __u8 id, uint64_t offset, uint64_t length, const char *op)
919{
920 if ((offset & (alloc_size[id] - 1)) ||
921 (length & (alloc_size[id] - 1))) {
922 derr << __func__ << " " << op << " of " << (int)id
923 << ":0x" << std::hex << offset << "~" << length << std::dec
924 << " does not align to alloc_size 0x"
925 << std::hex << alloc_size[id] << std::dec << dendl;
926 // be helpful
927 auto need = alloc_size[id];
928 while (need && ((offset & (need - 1)) ||
929 (length & (need - 1)))) {
930 need >>= 1;
931 }
932 if (need) {
933 const char *which;
934 if (id == BDEV_SLOW ||
935 (id == BDEV_DB && !bdev[BDEV_SLOW])) {
936 which = "bluefs_shared_alloc_size";
937 } else {
938 which = "bluefs_alloc_size";
939 }
940 derr << "work-around by setting " << which << " = " << need
941 << " for this OSD" << dendl;
942 }
943 return -EFAULT;
944 }
945 return 0;
946}
947
11fdf7f2 948int BlueFS::_replay(bool noop, bool to_stdout)
7c673cae
FG
949{
950 dout(10) << __func__ << (noop ? " NO-OP" : "") << dendl;
951 ino_last = 1; // by the log
952 log_seq = 0;
953
954 FileRef log_file;
11fdf7f2 955 log_file = _get_file(1);
9f95a23c
TL
956
957 // sanity check
958 for (auto& a : block_unused_too_granular) {
959 ceph_assert(a.empty());
960 }
961
11fdf7f2
TL
962 if (!noop) {
963 log_file->fnode = super.log_fnode;
9f95a23c 964 log_file->vselector_hint =
f6b5b4d7 965 vselector->get_hint_for_log();
7c673cae 966 } else {
11fdf7f2
TL
967 // do not use fnode from superblock in 'noop' mode - log_file's one should
968 // be fine and up-to-date
969 ceph_assert(log_file->fnode.ino == 1);
970 ceph_assert(log_file->fnode.extents.size() != 0);
7c673cae 971 }
7c673cae 972 dout(10) << __func__ << " log_fnode " << super.log_fnode << dendl;
11fdf7f2
TL
973 if (unlikely(to_stdout)) {
974 std::cout << " log_fnode " << super.log_fnode << std::endl;
975 }
7c673cae
FG
976
977 FileReader *log_reader = new FileReader(
978 log_file, cct->_conf->bluefs_max_prefetch,
979 false, // !random
980 true); // ignore eof
9f95a23c
TL
981
982 bool seen_recs = false;
983
984 boost::dynamic_bitset<uint64_t> used_blocks[MAX_BDEV];
985 boost::dynamic_bitset<uint64_t> owned_blocks[MAX_BDEV];
986
987 if (cct->_conf->bluefs_log_replay_check_allocations) {
988 for (size_t i = 0; i < MAX_BDEV; ++i) {
989 if (alloc_size[i] != 0 && bdev[i] != nullptr) {
990 used_blocks[i].resize(round_up_to(bdev[i]->get_size(), alloc_size[i]) / alloc_size[i]);
991 owned_blocks[i].resize(round_up_to(bdev[i]->get_size(), alloc_size[i]) / alloc_size[i]);
992 }
993 }
994 }
995
996 bool first_log_check = true;
997
7c673cae 998 while (true) {
11fdf7f2 999 ceph_assert((log_reader->buf.pos & ~super.block_mask()) == 0);
7c673cae
FG
1000 uint64_t pos = log_reader->buf.pos;
1001 uint64_t read_pos = pos;
1002 bufferlist bl;
1003 {
1004 int r = _read(log_reader, &log_reader->buf, read_pos, super.block_size,
1005 &bl, NULL);
f6b5b4d7
TL
1006 if (r != (int)super.block_size && cct->_conf->bluefs_replay_recovery) {
1007 r += do_replay_recovery_read(log_reader, pos, read_pos + r, super.block_size - r, &bl);
1008 }
1009 assert(r == (int)super.block_size);
7c673cae
FG
1010 read_pos += r;
1011 }
1012 uint64_t more = 0;
1013 uint64_t seq;
1014 uuid_d uuid;
1015 {
11fdf7f2 1016 auto p = bl.cbegin();
7c673cae
FG
1017 __u8 a, b;
1018 uint32_t len;
11fdf7f2
TL
1019 decode(a, p);
1020 decode(b, p);
1021 decode(len, p);
1022 decode(uuid, p);
1023 decode(seq, p);
7c673cae 1024 if (len + 6 > bl.length()) {
11fdf7f2 1025 more = round_up_to(len + 6 - bl.length(), super.block_size);
7c673cae
FG
1026 }
1027 }
1028 if (uuid != super.uuid) {
9f95a23c
TL
1029 if (seen_recs) {
1030 dout(10) << __func__ << " 0x" << std::hex << pos << std::dec
1031 << ": stop: uuid " << uuid << " != super.uuid " << super.uuid
1032 << dendl;
1033 } else {
1034 derr << __func__ << " 0x" << std::hex << pos << std::dec
1035 << ": stop: uuid " << uuid << " != super.uuid " << super.uuid
1036 << ", block dump: \n";
1037 bufferlist t;
1038 t.substr_of(bl, 0, super.block_size);
1039 t.hexdump(*_dout);
1040 *_dout << dendl;
1041 }
7c673cae
FG
1042 break;
1043 }
1044 if (seq != log_seq + 1) {
9f95a23c
TL
1045 if (seen_recs) {
1046 dout(10) << __func__ << " 0x" << std::hex << pos << std::dec
1047 << ": stop: seq " << seq << " != expected " << log_seq + 1
1048 << dendl;;
1049 } else {
1050 derr << __func__ << " 0x" << std::hex << pos << std::dec
1051 << ": stop: seq " << seq << " != expected " << log_seq + 1
1052 << dendl;;
1053 }
7c673cae
FG
1054 break;
1055 }
1056 if (more) {
1057 dout(20) << __func__ << " need 0x" << std::hex << more << std::dec
1058 << " more bytes" << dendl;
1059 bufferlist t;
1060 int r = _read(log_reader, &log_reader->buf, read_pos, more, &t, NULL);
1061 if (r < (int)more) {
f6b5b4d7
TL
1062 dout(10) << __func__ << " 0x" << std::hex << pos
1063 << ": stop: len is 0x" << bl.length() + more << std::dec
1064 << ", which is past eof" << dendl;
1065 if (cct->_conf->bluefs_replay_recovery) {
1066 //try to search for more data
1067 r += do_replay_recovery_read(log_reader, pos, read_pos + r, more - r, &t);
1068 if (r < (int)more) {
1069 //in normal mode we must read r==more, for recovery it is too strict
1070 break;
1071 }
1072 }
7c673cae 1073 }
11fdf7f2 1074 ceph_assert(r == (int)more);
7c673cae
FG
1075 bl.claim_append(t);
1076 read_pos += r;
1077 }
9f95a23c 1078 seen_recs = true;
7c673cae
FG
1079 bluefs_transaction_t t;
1080 try {
11fdf7f2
TL
1081 auto p = bl.cbegin();
1082 decode(t, p);
7c673cae
FG
1083 }
1084 catch (buffer::error& e) {
9f95a23c
TL
1085 derr << __func__ << " 0x" << std::hex << pos << std::dec
1086 << ": stop: failed to decode: " << e.what()
1087 << dendl;
7c673cae
FG
1088 delete log_reader;
1089 return -EIO;
1090 }
11fdf7f2 1091 ceph_assert(seq == t.seq);
7c673cae
FG
1092 dout(10) << __func__ << " 0x" << std::hex << pos << std::dec
1093 << ": " << t << dendl;
11fdf7f2
TL
1094 if (unlikely(to_stdout)) {
1095 std::cout << " 0x" << std::hex << pos << std::dec
1096 << ": " << t << std::endl;
1097 }
7c673cae 1098
11fdf7f2 1099 auto p = t.op_bl.cbegin();
7c673cae
FG
1100 while (!p.end()) {
1101 __u8 op;
11fdf7f2 1102 decode(op, p);
7c673cae
FG
1103 switch (op) {
1104
1105 case bluefs_transaction_t::OP_INIT:
1106 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1107 << ": op_init" << dendl;
11fdf7f2
TL
1108 if (unlikely(to_stdout)) {
1109 std::cout << " 0x" << std::hex << pos << std::dec
1110 << ": op_init" << std::endl;
1111 }
1112
1113 ceph_assert(t.seq == 1);
7c673cae
FG
1114 break;
1115
1116 case bluefs_transaction_t::OP_JUMP:
1117 {
1118 uint64_t next_seq;
1119 uint64_t offset;
11fdf7f2
TL
1120 decode(next_seq, p);
1121 decode(offset, p);
7c673cae
FG
1122 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1123 << ": op_jump seq " << next_seq
1124 << " offset 0x" << std::hex << offset << std::dec << dendl;
11fdf7f2
TL
1125 if (unlikely(to_stdout)) {
1126 std::cout << " 0x" << std::hex << pos << std::dec
1127 << ": op_jump seq " << next_seq
1128 << " offset 0x" << std::hex << offset << std::dec
1129 << std::endl;
1130 }
1131
1132 ceph_assert(next_seq >= log_seq);
7c673cae
FG
1133 log_seq = next_seq - 1; // we will increment it below
1134 uint64_t skip = offset - read_pos;
1135 if (skip) {
1136 bufferlist junk;
1137 int r = _read(log_reader, &log_reader->buf, read_pos, skip, &junk,
1138 NULL);
1139 if (r != (int)skip) {
1140 dout(10) << __func__ << " 0x" << std::hex << read_pos
1141 << ": stop: failed to skip to " << offset
1142 << std::dec << dendl;
11fdf7f2 1143 ceph_abort_msg("problem with op_jump");
7c673cae
FG
1144 }
1145 }
1146 }
1147 break;
1148
1149 case bluefs_transaction_t::OP_JUMP_SEQ:
1150 {
1151 uint64_t next_seq;
11fdf7f2 1152 decode(next_seq, p);
7c673cae
FG
1153 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1154 << ": op_jump_seq " << next_seq << dendl;
11fdf7f2
TL
1155 if (unlikely(to_stdout)) {
1156 std::cout << " 0x" << std::hex << pos << std::dec
1157 << ": op_jump_seq " << next_seq << std::endl;
1158 }
1159
1160 ceph_assert(next_seq >= log_seq);
7c673cae
FG
1161 log_seq = next_seq - 1; // we will increment it below
1162 }
1163 break;
1164
1165 case bluefs_transaction_t::OP_ALLOC_ADD:
1166 {
1167 __u8 id;
1168 uint64_t offset, length;
11fdf7f2
TL
1169 decode(id, p);
1170 decode(offset, p);
1171 decode(length, p);
7c673cae
FG
1172 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1173 << ": op_alloc_add " << " " << (int)id
1174 << ":0x" << std::hex << offset << "~" << length << std::dec
1175 << dendl;
11fdf7f2
TL
1176 if (unlikely(to_stdout)) {
1177 std::cout << " 0x" << std::hex << pos << std::dec
1178 << ": op_alloc_add " << " " << (int)id
1179 << ":0x" << std::hex << offset << "~" << length << std::dec
1180 << std::endl;
1181 }
7c673cae
FG
1182 if (!noop) {
1183 block_all[id].insert(offset, length);
9f95a23c
TL
1184 _adjust_granularity(id, &offset, &length, true);
1185 if (length) {
1186 alloc[id]->init_add_free(offset, length);
1187 }
1188
1189 if (cct->_conf->bluefs_log_replay_check_allocations) {
1190 bool fail = false;
1191 apply_for_bitset_range(offset, length, alloc_size[id], owned_blocks[id],
1192 [&](uint64_t pos, boost::dynamic_bitset<uint64_t> &bs) {
1193 if (bs.test(pos)) {
1194 fail = true;
1195 } else {
1196 bs.set(pos);
1197 }
1198 }
1199 );
1200 if (fail) {
1201 derr << __func__ << " invalid extent " << (int)id
1202 << ": 0x" << std::hex << offset << "~" << length
1203 << std::dec << ": already given" << dendl;
1204 return -EFAULT;
1205 }
1206 apply_for_bitset_range(offset, length, alloc_size[id], used_blocks[id],
1207 [&](uint64_t pos, boost::dynamic_bitset<uint64_t> &bs) {
1208 if (bs.test(pos)) {
1209 fail = true;
1210 }
1211 }
1212 );
1213 if (fail) {
1214 derr << __func__ << " invalid extent " << int(id)
1215 << ": 0x" << std::hex << offset << "~" << length
1216 << std::dec << ": already in use" << dendl;
1217 return -EFAULT;
1218 }
1219 }
7c673cae
FG
1220 }
1221 }
1222 break;
1223
1224 case bluefs_transaction_t::OP_ALLOC_RM:
1225 {
1226 __u8 id;
1227 uint64_t offset, length;
11fdf7f2
TL
1228 decode(id, p);
1229 decode(offset, p);
1230 decode(length, p);
7c673cae
FG
1231 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1232 << ": op_alloc_rm " << " " << (int)id
1233 << ":0x" << std::hex << offset << "~" << length << std::dec
1234 << dendl;
11fdf7f2
TL
1235 if (unlikely(to_stdout)) {
1236 std::cout << " 0x" << std::hex << pos << std::dec
1237 << ": op_alloc_rm " << " " << (int)id
1238 << ":0x" << std::hex << offset << "~" << length << std::dec
1239 << std::endl;
1240 }
7c673cae
FG
1241 if (!noop) {
1242 block_all[id].erase(offset, length);
9f95a23c
TL
1243 _adjust_granularity(id, &offset, &length, false);
1244 if (length) {
1245 alloc[id]->init_rm_free(offset, length);
1246 }
1247 if (cct->_conf->bluefs_log_replay_check_allocations) {
1248 bool fail = false;
1249 apply_for_bitset_range(offset, length, alloc_size[id], owned_blocks[id],
1250 [&](uint64_t pos, boost::dynamic_bitset<uint64_t> &bs) {
1251 if (!bs.test(pos)) {
1252 fail = true;
1253 } else {
1254 bs.reset(pos);
1255 }
1256 }
1257 );
1258 if (fail) {
1259 derr << __func__ << " invalid extent " << int(id)
1260 << ": 0x" << std::hex << offset << "~" << length
1261 << std::dec << ": wasn't given" << dendl;
1262 return -EFAULT;
1263 }
1264
1265 apply_for_bitset_range(offset, length, alloc_size[id], used_blocks[id],
1266 [&](uint64_t pos, boost::dynamic_bitset<uint64_t> &bs) {
1267 if (bs.test(pos)) {
1268 fail = true;
1269 }
1270 }
1271 );
1272 if (fail) {
1273 derr << __func__ << " invalid extent " << (int)id
1274 << ": 0x" << std::hex << offset << "~" << length
1275 << std::dec << ": still in use" << dendl;
1276 return -EFAULT;
1277 }
1278 }
1279 }
7c673cae
FG
1280 }
1281 break;
1282
1283 case bluefs_transaction_t::OP_DIR_LINK:
1284 {
1285 string dirname, filename;
1286 uint64_t ino;
11fdf7f2
TL
1287 decode(dirname, p);
1288 decode(filename, p);
1289 decode(ino, p);
7c673cae
FG
1290 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1291 << ": op_dir_link " << " " << dirname << "/" << filename
1292 << " to " << ino
1293 << dendl;
11fdf7f2
TL
1294 if (unlikely(to_stdout)) {
1295 std::cout << " 0x" << std::hex << pos << std::dec
1296 << ": op_dir_link " << " " << dirname << "/" << filename
1297 << " to " << ino
1298 << std::endl;
1299 }
1300
7c673cae
FG
1301 if (!noop) {
1302 FileRef file = _get_file(ino);
11fdf7f2 1303 ceph_assert(file->fnode.ino);
7c673cae 1304 map<string,DirRef>::iterator q = dir_map.find(dirname);
11fdf7f2 1305 ceph_assert(q != dir_map.end());
7c673cae 1306 map<string,FileRef>::iterator r = q->second->file_map.find(filename);
11fdf7f2 1307 ceph_assert(r == q->second->file_map.end());
9f95a23c
TL
1308
1309 vselector->sub_usage(file->vselector_hint, file->fnode);
1310 file->vselector_hint =
1311 vselector->get_hint_by_dir(dirname);
1312 vselector->add_usage(file->vselector_hint, file->fnode);
1313
7c673cae
FG
1314 q->second->file_map[filename] = file;
1315 ++file->refs;
1316 }
1317 }
1318 break;
1319
1320 case bluefs_transaction_t::OP_DIR_UNLINK:
1321 {
1322 string dirname, filename;
11fdf7f2
TL
1323 decode(dirname, p);
1324 decode(filename, p);
7c673cae
FG
1325 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1326 << ": op_dir_unlink " << " " << dirname << "/" << filename
1327 << dendl;
11fdf7f2
TL
1328 if (unlikely(to_stdout)) {
1329 std::cout << " 0x" << std::hex << pos << std::dec
1330 << ": op_dir_unlink " << " " << dirname << "/" << filename
1331 << std::endl;
1332 }
1333
7c673cae
FG
1334 if (!noop) {
1335 map<string,DirRef>::iterator q = dir_map.find(dirname);
11fdf7f2 1336 ceph_assert(q != dir_map.end());
7c673cae 1337 map<string,FileRef>::iterator r = q->second->file_map.find(filename);
11fdf7f2
TL
1338 ceph_assert(r != q->second->file_map.end());
1339 ceph_assert(r->second->refs > 0);
7c673cae
FG
1340 --r->second->refs;
1341 q->second->file_map.erase(r);
1342 }
1343 }
1344 break;
1345
1346 case bluefs_transaction_t::OP_DIR_CREATE:
1347 {
1348 string dirname;
11fdf7f2 1349 decode(dirname, p);
7c673cae
FG
1350 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1351 << ": op_dir_create " << dirname << dendl;
11fdf7f2
TL
1352 if (unlikely(to_stdout)) {
1353 std::cout << " 0x" << std::hex << pos << std::dec
1354 << ": op_dir_create " << dirname << std::endl;
1355 }
1356
7c673cae
FG
1357 if (!noop) {
1358 map<string,DirRef>::iterator q = dir_map.find(dirname);
11fdf7f2 1359 ceph_assert(q == dir_map.end());
9f95a23c 1360 dir_map[dirname] = ceph::make_ref<Dir>();
7c673cae
FG
1361 }
1362 }
1363 break;
1364
1365 case bluefs_transaction_t::OP_DIR_REMOVE:
1366 {
1367 string dirname;
11fdf7f2 1368 decode(dirname, p);
7c673cae
FG
1369 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1370 << ": op_dir_remove " << dirname << dendl;
11fdf7f2
TL
1371 if (unlikely(to_stdout)) {
1372 std::cout << " 0x" << std::hex << pos << std::dec
1373 << ": op_dir_remove " << dirname << std::endl;
1374 }
1375
7c673cae
FG
1376 if (!noop) {
1377 map<string,DirRef>::iterator q = dir_map.find(dirname);
11fdf7f2
TL
1378 ceph_assert(q != dir_map.end());
1379 ceph_assert(q->second->file_map.empty());
7c673cae
FG
1380 dir_map.erase(q);
1381 }
1382 }
1383 break;
1384
1385 case bluefs_transaction_t::OP_FILE_UPDATE:
1386 {
1387 bluefs_fnode_t fnode;
11fdf7f2 1388 decode(fnode, p);
7c673cae 1389 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
9f95a23c 1390 << ": op_file_update " << " " << fnode << " " << dendl;
11fdf7f2
TL
1391 if (unlikely(to_stdout)) {
1392 std::cout << " 0x" << std::hex << pos << std::dec
1393 << ": op_file_update " << " " << fnode << std::endl;
1394 }
9f95a23c 1395 if (!noop) {
7c673cae 1396 FileRef f = _get_file(fnode.ino);
9f95a23c
TL
1397 if (cct->_conf->bluefs_log_replay_check_allocations) {
1398 // check initial log layout
1399 if (first_log_check) {
1400 first_log_check = false;
1401 int r = _check_new_allocations(log_file->fnode,
1402 MAX_BDEV, owned_blocks, used_blocks);
1403 if (r < 0) {
1404 return r;
1405 }
1406 }
1407
1408 auto& fnode_extents = f->fnode.extents;
1409 for (auto e : fnode_extents) {
1410 auto id = e.bdev;
1411 if (int r = _verify_alloc_granularity(id, e.offset, e.length,
1412 "OP_FILE_UPDATE"); r < 0) {
1413 return r;
1414 }
1415 apply_for_bitset_range(e.offset, e.length, alloc_size[id],
1416 used_blocks[id],
1417 [&](uint64_t pos, boost::dynamic_bitset<uint64_t> &bs) {
1418 ceph_assert(bs.test(pos));
1419 bs.reset(pos);
1420 }
1421 );
1422 }
1423 }
1424
1425 if (fnode.ino != 1) {
1426 vselector->sub_usage(f->vselector_hint, f->fnode);
1427 }
1428 f->fnode = fnode;
1429 if (fnode.ino != 1) {
1430 vselector->add_usage(f->vselector_hint, f->fnode);
1431 }
1432
7c673cae
FG
1433 if (fnode.ino > ino_last) {
1434 ino_last = fnode.ino;
1435 }
9f95a23c
TL
1436 if (cct->_conf->bluefs_log_replay_check_allocations) {
1437 int r = _check_new_allocations(f->fnode,
1438 MAX_BDEV, owned_blocks, used_blocks);
1439 if (r < 0) {
1440 return r;
1441 }
1442 }
7c673cae 1443 }
9f95a23c 1444 }
7c673cae
FG
1445 break;
1446
1447 case bluefs_transaction_t::OP_FILE_REMOVE:
1448 {
1449 uint64_t ino;
11fdf7f2 1450 decode(ino, p);
7c673cae
FG
1451 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1452 << ": op_file_remove " << ino << dendl;
11fdf7f2
TL
1453 if (unlikely(to_stdout)) {
1454 std::cout << " 0x" << std::hex << pos << std::dec
1455 << ": op_file_remove " << ino << std::endl;
1456 }
1457
9f95a23c
TL
1458 if (!noop) {
1459 auto p = file_map.find(ino);
1460 ceph_assert(p != file_map.end());
1461 vselector->sub_usage(p->second->vselector_hint, p->second->fnode);
1462 if (cct->_conf->bluefs_log_replay_check_allocations) {
1463 auto& fnode_extents = p->second->fnode.extents;
1464 for (auto e : fnode_extents) {
1465 auto id = e.bdev;
1466 bool fail = false;
1467 apply_for_bitset_range(e.offset, e.length, alloc_size[id], owned_blocks[id],
1468 [&](uint64_t pos, boost::dynamic_bitset<uint64_t> &bs) {
1469 if (!bs.test(pos)) {
1470 fail = true;
1471 }
1472 }
1473 );
1474 if (fail) {
1475 derr << __func__ << " invalid extent " << int(id)
1476 << ": 0x" << std::hex << e.offset << "~" << e.length
1477 << std::dec
1478 << ": wasn't given but is allocated for removed ino " << ino
1479 << dendl;
1480 return -EFAULT;
1481 }
1482
1483 apply_for_bitset_range(e.offset, e.length, alloc_size[id], used_blocks[id],
1484 [&](uint64_t pos, boost::dynamic_bitset<uint64_t> &bs) {
1485 if (!bs.test(pos)) {
1486 fail = true;
1487 }
1488 bs.reset(pos);
1489 }
1490 );
1491 if (fail) {
1492 derr << __func__ << " invalid extent " << int(id)
1493 << ": 0x" << std::hex << e.offset << "~" << e.length
1494 << std::dec
1495 << ": not in use but is allocated for removed ino " << ino
1496 << dendl;
1497 return -EFAULT;
1498 }
1499 }
1500 }
1501 file_map.erase(p);
1502 }
1503 }
7c673cae
FG
1504 break;
1505
1506 default:
1507 derr << __func__ << " 0x" << std::hex << pos << std::dec
1508 << ": stop: unrecognized op " << (int)op << dendl;
1509 delete log_reader;
1510 return -EIO;
1511 }
1512 }
11fdf7f2 1513 ceph_assert(p.end());
7c673cae
FG
1514
1515 // we successfully replayed the transaction; bump the seq and log size
1516 ++log_seq;
1517 log_file->fnode.size = log_reader->buf.pos;
1518 }
9f95a23c
TL
1519 vselector->add_usage(log_file->vselector_hint, log_file->fnode);
1520
1521 if (!noop && first_log_check &&
1522 cct->_conf->bluefs_log_replay_check_allocations) {
1523 int r = _check_new_allocations(log_file->fnode,
1524 MAX_BDEV, owned_blocks, used_blocks);
1525 if (r < 0) {
1526 return r;
1527 }
1528 }
7c673cae
FG
1529
1530 dout(10) << __func__ << " log file size was 0x"
1531 << std::hex << log_file->fnode.size << std::dec << dendl;
11fdf7f2
TL
1532 if (unlikely(to_stdout)) {
1533 std::cout << " log file size was 0x"
1534 << std::hex << log_file->fnode.size << std::dec << std::endl;
1535 }
1536
7c673cae
FG
1537 delete log_reader;
1538
1539 if (!noop) {
1540 // verify file link counts are all >0
1541 for (auto& p : file_map) {
1542 if (p.second->refs == 0 &&
1543 p.second->fnode.ino > 1) {
1544 derr << __func__ << " file with link count 0: " << p.second->fnode
1545 << dendl;
1546 return -EIO;
1547 }
1548 }
1549 }
1550
9f95a23c
TL
1551 for (unsigned id = 0; id < MAX_BDEV; ++id) {
1552 dout(10) << __func__ << " block_unused_too_granular " << id << ": "
1553 << block_unused_too_granular[id] << dendl;
1554 }
7c673cae
FG
1555 dout(10) << __func__ << " done" << dendl;
1556 return 0;
1557}
1558
11fdf7f2
TL
1559int BlueFS::log_dump()
1560{
1561 // only dump log file's content
1562 int r = _replay(true, true);
1563 if (r < 0) {
1564 derr << __func__ << " failed to replay log: " << cpp_strerror(r) << dendl;
1565 return r;
1566 }
1567
1568 return 0;
1569}
1570
1571int BlueFS::device_migrate_to_existing(
1572 CephContext *cct,
1573 const set<int>& devs_source,
9f95a23c
TL
1574 int dev_target,
1575 const bluefs_layout_t& layout)
11fdf7f2
TL
1576{
1577 vector<byte> buf;
1578 bool buffered = cct->_conf->bluefs_buffered_io;
1579
eafe8130
TL
1580 dout(10) << __func__ << " devs_source " << devs_source
1581 << " dev_target " << dev_target << dendl;
11fdf7f2
TL
1582 assert(dev_target < (int)MAX_BDEV);
1583
1584 int flags = 0;
1585 flags |= devs_source.count(BDEV_DB) ?
1586 (REMOVE_DB | RENAME_SLOW2DB) : 0;
1587 flags |= devs_source.count(BDEV_WAL) ? REMOVE_WAL : 0;
1588 int dev_target_new = dev_target;
1589
1590 // Slow device without separate DB one is addressed via BDEV_DB
1591 // Hence need renaming.
1592 if ((flags & REMOVE_DB) && dev_target == BDEV_SLOW) {
1593 dev_target_new = BDEV_DB;
1594 dout(0) << __func__ << " super to be written to " << dev_target << dendl;
1595 }
1596
9f95a23c 1597 for (auto& [ino, file_ref] : file_map) {
11fdf7f2 1598 //do not copy log
9f95a23c 1599 if (file_ref->fnode.ino == 1) {
11fdf7f2
TL
1600 continue;
1601 }
9f95a23c 1602 dout(10) << __func__ << " " << ino << " " << file_ref->fnode << dendl;
eafe8130 1603
9f95a23c 1604 auto& fnode_extents = file_ref->fnode.extents;
11fdf7f2 1605
9f95a23c
TL
1606 bool rewrite = std::any_of(
1607 fnode_extents.begin(),
1608 fnode_extents.end(),
1609 [=](auto& ext) {
1610 return ext.bdev != dev_target && devs_source.count(ext.bdev);
1611 });
eafe8130
TL
1612 if (rewrite) {
1613 dout(10) << __func__ << " migrating" << dendl;
1614
1615 // read entire file
1616 bufferlist bl;
1617 for (auto old_ext : fnode_extents) {
1618 buf.resize(old_ext.length);
1619 int r = bdev[old_ext.bdev]->read_random(
1620 old_ext.offset,
1621 old_ext.length,
1622 (char*)&buf.at(0),
1623 buffered);
1624 if (r != 0) {
1625 derr << __func__ << " failed to read 0x" << std::hex
1626 << old_ext.offset << "~" << old_ext.length << std::dec
1627 << " from " << (int)dev_target << dendl;
1628 return -EIO;
1629 }
1630 bl.append((char*)&buf[0], old_ext.length);
1631 }
11fdf7f2 1632
eafe8130
TL
1633 // write entire file
1634 PExtentVector extents;
1635 auto l = _allocate_without_fallback(dev_target, bl.length(), &extents);
1636 if (l < 0) {
1637 derr << __func__ << " unable to allocate len 0x" << std::hex
1638 << bl.length() << std::dec << " from " << (int)dev_target
1639 << ": " << cpp_strerror(l) << dendl;
1640 return -ENOSPC;
1641 }
11fdf7f2 1642
eafe8130
TL
1643 uint64_t off = 0;
1644 for (auto& i : extents) {
1645 bufferlist cur;
1646 uint64_t cur_len = std::min<uint64_t>(i.length, bl.length() - off);
1647 ceph_assert(cur_len > 0);
1648 cur.substr_of(bl, off, cur_len);
1649 int r = bdev[dev_target]->write(i.offset, cur, buffered);
1650 ceph_assert(r == 0);
1651 off += cur_len;
1652 }
1653
1654 // release old extents
1655 for (auto old_ext : fnode_extents) {
1656 PExtentVector to_release;
1657 to_release.emplace_back(old_ext.offset, old_ext.length);
1658 alloc[old_ext.bdev]->release(to_release);
1659 }
1660
1661 // update fnode
1662 fnode_extents.clear();
1663 for (auto& i : extents) {
1664 fnode_extents.emplace_back(dev_target_new, i.offset, i.length);
1665 }
1666 } else {
9f95a23c
TL
1667 for (auto& ext : fnode_extents) {
1668 if (dev_target != dev_target_new && ext.bdev == dev_target) {
eafe8130 1669 dout(20) << __func__ << " " << " ... adjusting extent 0x"
9f95a23c 1670 << std::hex << ext.offset << std::dec
eafe8130
TL
1671 << " bdev " << dev_target << " -> " << dev_target_new
1672 << dendl;
9f95a23c 1673 ext.bdev = dev_target_new;
11fdf7f2 1674 }
11fdf7f2
TL
1675 }
1676 }
11fdf7f2
TL
1677 }
1678 // new logging device in the current naming scheme
1679 int new_log_dev_cur = bdev[BDEV_WAL] ?
1680 BDEV_WAL :
1681 bdev[BDEV_DB] ? BDEV_DB : BDEV_SLOW;
1682
1683 // new logging device in new naming scheme
1684 int new_log_dev_next = new_log_dev_cur;
1685
1686 if (devs_source.count(new_log_dev_cur)) {
1687 // SLOW device is addressed via BDEV_DB too hence either WAL or DB
1688 new_log_dev_next = (flags & REMOVE_WAL) || !bdev[BDEV_WAL] ?
1689 BDEV_DB :
1690 BDEV_WAL;
1691
1692 dout(0) << __func__ << " log moved from " << new_log_dev_cur
1693 << " to " << new_log_dev_next << dendl;
1694
1695 new_log_dev_cur =
1696 (flags & REMOVE_DB) && new_log_dev_next == BDEV_DB ?
1697 BDEV_SLOW :
1698 new_log_dev_next;
1699 }
1700
9f95a23c 1701 _rewrite_log_and_layout_sync(
11fdf7f2
TL
1702 false,
1703 (flags & REMOVE_DB) ? BDEV_SLOW : BDEV_DB,
1704 new_log_dev_cur,
1705 new_log_dev_next,
9f95a23c
TL
1706 flags,
1707 layout);
11fdf7f2
TL
1708 return 0;
1709}
1710
1711int BlueFS::device_migrate_to_new(
1712 CephContext *cct,
1713 const set<int>& devs_source,
9f95a23c
TL
1714 int dev_target,
1715 const bluefs_layout_t& layout)
11fdf7f2
TL
1716{
1717 vector<byte> buf;
1718 bool buffered = cct->_conf->bluefs_buffered_io;
1719
eafe8130
TL
1720 dout(10) << __func__ << " devs_source " << devs_source
1721 << " dev_target " << dev_target << dendl;
11fdf7f2
TL
1722 assert(dev_target == (int)BDEV_NEWDB || (int)BDEV_NEWWAL);
1723
1724 int flags = 0;
1725
1726 flags |= devs_source.count(BDEV_DB) ?
1727 (!bdev[BDEV_SLOW] ? RENAME_DB2SLOW: REMOVE_DB) :
1728 0;
1729 flags |= devs_source.count(BDEV_WAL) ? REMOVE_WAL : 0;
9f95a23c 1730 int dev_target_new = dev_target; //FIXME: remove, makes no sense
11fdf7f2
TL
1731
1732 for (auto& p : file_map) {
1733 //do not copy log
1734 if (p.second->fnode.ino == 1) {
1735 continue;
1736 }
eafe8130
TL
1737 dout(10) << __func__ << " " << p.first << " " << p.second->fnode << dendl;
1738
11fdf7f2
TL
1739 auto& fnode_extents = p.second->fnode.extents;
1740
eafe8130 1741 bool rewrite = false;
11fdf7f2 1742 for (auto ext_it = fnode_extents.begin();
eafe8130
TL
1743 ext_it != p.second->fnode.extents.end();
1744 ++ext_it) {
11fdf7f2 1745 if (ext_it->bdev != dev_target && devs_source.count(ext_it->bdev)) {
eafe8130
TL
1746 rewrite = true;
1747 break;
1748 }
1749 }
1750 if (rewrite) {
1751 dout(10) << __func__ << " migrating" << dendl;
1752
1753 // read entire file
1754 bufferlist bl;
1755 for (auto old_ext : fnode_extents) {
1756 buf.resize(old_ext.length);
1757 int r = bdev[old_ext.bdev]->read_random(
1758 old_ext.offset,
1759 old_ext.length,
1760 (char*)&buf.at(0),
1761 buffered);
1762 if (r != 0) {
1763 derr << __func__ << " failed to read 0x" << std::hex
1764 << old_ext.offset << "~" << old_ext.length << std::dec
1765 << " from " << (int)dev_target << dendl;
1766 return -EIO;
11fdf7f2 1767 }
eafe8130
TL
1768 bl.append((char*)&buf[0], old_ext.length);
1769 }
1770
1771 // write entire file
1772 PExtentVector extents;
1773 auto l = _allocate_without_fallback(dev_target, bl.length(), &extents);
1774 if (l < 0) {
1775 derr << __func__ << " unable to allocate len 0x" << std::hex
1776 << bl.length() << std::dec << " from " << (int)dev_target
1777 << ": " << cpp_strerror(l) << dendl;
1778 return -ENOSPC;
1779 }
1780
1781 uint64_t off = 0;
1782 for (auto& i : extents) {
1783 bufferlist cur;
1784 uint64_t cur_len = std::min<uint64_t>(i.length, bl.length() - off);
1785 ceph_assert(cur_len > 0);
1786 cur.substr_of(bl, off, cur_len);
1787 int r = bdev[dev_target]->write(i.offset, cur, buffered);
1788 ceph_assert(r == 0);
1789 off += cur_len;
1790 }
1791
1792 // release old extents
1793 for (auto old_ext : fnode_extents) {
1794 PExtentVector to_release;
1795 to_release.emplace_back(old_ext.offset, old_ext.length);
1796 alloc[old_ext.bdev]->release(to_release);
1797 }
1798
1799 // update fnode
1800 fnode_extents.clear();
1801 for (auto& i : extents) {
1802 fnode_extents.emplace_back(dev_target_new, i.offset, i.length);
11fdf7f2
TL
1803 }
1804 }
11fdf7f2
TL
1805 }
1806 // new logging device in the current naming scheme
1807 int new_log_dev_cur =
1808 bdev[BDEV_NEWWAL] ?
1809 BDEV_NEWWAL :
1810 bdev[BDEV_WAL] && !(flags & REMOVE_WAL) ?
1811 BDEV_WAL :
1812 bdev[BDEV_NEWDB] ?
1813 BDEV_NEWDB :
1814 bdev[BDEV_DB] && !(flags & REMOVE_DB)?
1815 BDEV_DB :
1816 BDEV_SLOW;
1817
1818 // new logging device in new naming scheme
1819 int new_log_dev_next =
1820 new_log_dev_cur == BDEV_NEWWAL ?
1821 BDEV_WAL :
1822 new_log_dev_cur == BDEV_NEWDB ?
1823 BDEV_DB :
1824 new_log_dev_cur;
1825
1826 int super_dev =
1827 dev_target == BDEV_NEWDB ?
1828 BDEV_NEWDB :
1829 bdev[BDEV_DB] ?
1830 BDEV_DB :
1831 BDEV_SLOW;
1832
9f95a23c 1833 _rewrite_log_and_layout_sync(
11fdf7f2
TL
1834 false,
1835 super_dev,
1836 new_log_dev_cur,
1837 new_log_dev_next,
9f95a23c
TL
1838 flags,
1839 layout);
11fdf7f2
TL
1840 return 0;
1841}
1842
7c673cae
FG
1843BlueFS::FileRef BlueFS::_get_file(uint64_t ino)
1844{
1845 auto p = file_map.find(ino);
1846 if (p == file_map.end()) {
9f95a23c 1847 FileRef f = ceph::make_ref<File>();
7c673cae
FG
1848 file_map[ino] = f;
1849 dout(30) << __func__ << " ino " << ino << " = " << f
1850 << " (new)" << dendl;
1851 return f;
1852 } else {
1853 dout(30) << __func__ << " ino " << ino << " = " << p->second << dendl;
1854 return p->second;
1855 }
1856}
1857
1858void BlueFS::_drop_link(FileRef file)
1859{
1860 dout(20) << __func__ << " had refs " << file->refs
1861 << " on " << file->fnode << dendl;
11fdf7f2 1862 ceph_assert(file->refs > 0);
7c673cae
FG
1863 --file->refs;
1864 if (file->refs == 0) {
1865 dout(20) << __func__ << " destroying " << file->fnode << dendl;
11fdf7f2 1866 ceph_assert(file->num_reading.load() == 0);
9f95a23c 1867 vselector->sub_usage(file->vselector_hint, file->fnode);
7c673cae
FG
1868 log_t.op_file_remove(file->fnode.ino);
1869 for (auto& r : file->fnode.extents) {
1870 pending_release[r.bdev].insert(r.offset, r.length);
1871 }
1872 file_map.erase(file->fnode.ino);
1873 file->deleted = true;
94b18763 1874
7c673cae 1875 if (file->dirty_seq) {
11fdf7f2
TL
1876 ceph_assert(file->dirty_seq > log_seq_stable);
1877 ceph_assert(dirty_files.count(file->dirty_seq));
7c673cae
FG
1878 auto it = dirty_files[file->dirty_seq].iterator_to(*file);
1879 dirty_files[file->dirty_seq].erase(it);
1880 file->dirty_seq = 0;
1881 }
1882 }
1883}
1884
1885int BlueFS::_read_random(
1886 FileReader *h, ///< [in] read from here
1887 uint64_t off, ///< [in] offset
9f95a23c 1888 uint64_t len, ///< [in] this many bytes
7c673cae
FG
1889 char *out) ///< [out] optional: or copy it here
1890{
494da23a
TL
1891 auto* buf = &h->buf;
1892
1893 int ret = 0;
7c673cae
FG
1894 dout(10) << __func__ << " h " << h
1895 << " 0x" << std::hex << off << "~" << len << std::dec
1896 << " from " << h->file->fnode << dendl;
1897
1898 ++h->file->num_reading;
1899
1900 if (!h->ignore_eof &&
1901 off + len > h->file->fnode.size) {
1902 if (off > h->file->fnode.size)
1903 len = 0;
1904 else
1905 len = h->file->fnode.size - off;
1906 dout(20) << __func__ << " reaching (or past) eof, len clipped to 0x"
1907 << std::hex << len << std::dec << dendl;
1908 }
494da23a
TL
1909 logger->inc(l_bluefs_read_random_count, 1);
1910 logger->inc(l_bluefs_read_random_bytes, len);
7c673cae 1911
494da23a 1912 std::shared_lock s_lock(h->lock);
f91f0fd5 1913 buf->bl.reassign_to_mempool(mempool::mempool_bluefs_file_reader);
7c673cae 1914 while (len > 0) {
494da23a
TL
1915 if (off < buf->bl_off || off >= buf->get_buf_end()) {
1916 s_lock.unlock();
1917 uint64_t x_off = 0;
1918 auto p = h->file->fnode.seek(off, &x_off);
f6b5b4d7 1919 ceph_assert(p != h->file->fnode.extents.end());
9f95a23c 1920 uint64_t l = std::min(p->length - x_off, len);
494da23a
TL
1921 dout(20) << __func__ << " read random 0x"
1922 << std::hex << x_off << "~" << l << std::dec
1923 << " of " << *p << dendl;
1924 int r = bdev[p->bdev]->read_random(p->offset + x_off, l, out,
1925 cct->_conf->bluefs_buffered_io);
1926 ceph_assert(r == 0);
1927 off += l;
1928 len -= l;
1929 ret += l;
1930 out += l;
1931
1932 logger->inc(l_bluefs_read_random_disk_count, 1);
1933 logger->inc(l_bluefs_read_random_disk_bytes, l);
1934 if (len > 0) {
1935 s_lock.lock();
1936 }
1937 } else {
1938 auto left = buf->get_buf_remaining(off);
1939 int r = std::min(len, left);
1940 logger->inc(l_bluefs_read_random_buffer_count, 1);
1941 logger->inc(l_bluefs_read_random_buffer_bytes, r);
1942 dout(20) << __func__ << " left 0x" << std::hex << left
1943 << " 0x" << off << "~" << len << std::dec
1944 << dendl;
1945
1946 if (out) {
1947 // NOTE: h->bl is normally a contiguous buffer so c_str() is free.
1948 memcpy(out, buf->bl.c_str() + off - buf->bl_off, r);
1949 out += r;
1950 }
7c673cae 1951
494da23a
TL
1952 dout(30) << __func__ << " result chunk (0x"
1953 << std::hex << r << std::dec << " bytes):\n";
1954 bufferlist t;
1955 t.substr_of(buf->bl, off - buf->bl_off, r);
1956 t.hexdump(*_dout);
1957 *_dout << dendl;
1958
1959 off += r;
1960 len -= r;
1961 ret += r;
1962 buf->pos += r;
1963 }
1964 }
7c673cae
FG
1965 dout(20) << __func__ << " got " << ret << dendl;
1966 --h->file->num_reading;
1967 return ret;
1968}
1969
1970int BlueFS::_read(
1971 FileReader *h, ///< [in] read from here
1972 FileReaderBuffer *buf, ///< [in] reader state
1973 uint64_t off, ///< [in] offset
1974 size_t len, ///< [in] this many bytes
1975 bufferlist *outbl, ///< [out] optional: reference the result here
1976 char *out) ///< [out] optional: or copy it here
1977{
494da23a 1978 bool prefetch = !outbl && !out;
7c673cae
FG
1979 dout(10) << __func__ << " h " << h
1980 << " 0x" << std::hex << off << "~" << len << std::dec
494da23a
TL
1981 << " from " << h->file->fnode
1982 << (prefetch ? " prefetch" : "")
1983 << dendl;
7c673cae
FG
1984
1985 ++h->file->num_reading;
1986
1987 if (!h->ignore_eof &&
1988 off + len > h->file->fnode.size) {
1989 if (off > h->file->fnode.size)
1990 len = 0;
1991 else
1992 len = h->file->fnode.size - off;
1993 dout(20) << __func__ << " reaching (or past) eof, len clipped to 0x"
1994 << std::hex << len << std::dec << dendl;
1995 }
494da23a
TL
1996 logger->inc(l_bluefs_read_count, 1);
1997 logger->inc(l_bluefs_read_bytes, len);
1998 if (prefetch) {
1999 logger->inc(l_bluefs_read_prefetch_count, 1);
2000 logger->inc(l_bluefs_read_prefetch_bytes, len);
2001 }
2002
7c673cae
FG
2003 if (outbl)
2004 outbl->clear();
2005
2006 int ret = 0;
494da23a 2007 std::shared_lock s_lock(h->lock);
7c673cae
FG
2008 while (len > 0) {
2009 size_t left;
2010 if (off < buf->bl_off || off >= buf->get_buf_end()) {
494da23a
TL
2011 s_lock.unlock();
2012 std::unique_lock u_lock(h->lock);
f91f0fd5 2013 buf->bl.reassign_to_mempool(mempool::mempool_bluefs_file_reader);
494da23a
TL
2014 if (off < buf->bl_off || off >= buf->get_buf_end()) {
2015 // if precondition hasn't changed during locking upgrade.
2016 buf->bl.clear();
2017 buf->bl_off = off & super.block_mask();
2018 uint64_t x_off = 0;
2019 auto p = h->file->fnode.seek(buf->bl_off, &x_off);
f6b5b4d7
TL
2020 if (p == h->file->fnode.extents.end()) {
2021 dout(5) << __func__ << " reading less then required "
2022 << ret << "<" << ret + len << dendl;
2023 break;
2024 }
2025
494da23a
TL
2026 uint64_t want = round_up_to(len + (off & ~super.block_mask()),
2027 super.block_size);
2028 want = std::max(want, buf->max_prefetch);
2029 uint64_t l = std::min(p->length - x_off, want);
2030 uint64_t eof_offset = round_up_to(h->file->fnode.size, super.block_size);
2031 if (!h->ignore_eof &&
2032 buf->bl_off + l > eof_offset) {
2033 l = eof_offset - buf->bl_off;
2034 }
2035 dout(20) << __func__ << " fetching 0x"
2036 << std::hex << x_off << "~" << l << std::dec
2037 << " of " << *p << dendl;
2038 int r = bdev[p->bdev]->read(p->offset + x_off, l, &buf->bl, ioc[p->bdev],
2039 cct->_conf->bluefs_buffered_io);
2040 ceph_assert(r == 0);
7c673cae 2041 }
494da23a
TL
2042 u_lock.unlock();
2043 s_lock.lock();
2044 // we should recheck if buffer is valid after lock downgrade
2045 continue;
7c673cae
FG
2046 }
2047 left = buf->get_buf_remaining(off);
2048 dout(20) << __func__ << " left 0x" << std::hex << left
2049 << " len 0x" << len << std::dec << dendl;
2050
11fdf7f2 2051 int r = std::min(len, left);
7c673cae
FG
2052 if (outbl) {
2053 bufferlist t;
2054 t.substr_of(buf->bl, off - buf->bl_off, r);
2055 outbl->claim_append(t);
2056 }
2057 if (out) {
2058 // NOTE: h->bl is normally a contiguous buffer so c_str() is free.
2059 memcpy(out, buf->bl.c_str() + off - buf->bl_off, r);
2060 out += r;
2061 }
2062
2063 dout(30) << __func__ << " result chunk (0x"
2064 << std::hex << r << std::dec << " bytes):\n";
2065 bufferlist t;
2066 t.substr_of(buf->bl, off - buf->bl_off, r);
2067 t.hexdump(*_dout);
2068 *_dout << dendl;
2069
2070 off += r;
2071 len -= r;
2072 ret += r;
2073 buf->pos += r;
2074 }
2075
2076 dout(20) << __func__ << " got " << ret << dendl;
11fdf7f2 2077 ceph_assert(!outbl || (int)outbl->length() == ret);
7c673cae
FG
2078 --h->file->num_reading;
2079 return ret;
2080}
2081
2082void BlueFS::_invalidate_cache(FileRef f, uint64_t offset, uint64_t length)
2083{
2084 dout(10) << __func__ << " file " << f->fnode
2085 << " 0x" << std::hex << offset << "~" << length << std::dec
2086 << dendl;
2087 if (offset & ~super.block_mask()) {
2088 offset &= super.block_mask();
11fdf7f2 2089 length = round_up_to(length, super.block_size);
7c673cae
FG
2090 }
2091 uint64_t x_off = 0;
2092 auto p = f->fnode.seek(offset, &x_off);
2093 while (length > 0 && p != f->fnode.extents.end()) {
11fdf7f2 2094 uint64_t x_len = std::min(p->length - x_off, length);
7c673cae
FG
2095 bdev[p->bdev]->invalidate_cache(p->offset + x_off, x_len);
2096 dout(20) << __func__ << " 0x" << std::hex << x_off << "~" << x_len
2097 << std:: dec << " of " << *p << dendl;
2098 offset += x_len;
2099 length -= x_len;
2100 }
2101}
2102
2103uint64_t BlueFS::_estimate_log_size()
2104{
2105 int avg_dir_size = 40; // fixme
2106 int avg_file_size = 12;
2107 uint64_t size = 4096 * 2;
2108 size += file_map.size() * (1 + sizeof(bluefs_fnode_t));
2109 for (auto& p : block_all)
2110 size += p.num_intervals() * (1 + 1 + sizeof(uint64_t) * 2);
2111 size += dir_map.size() + (1 + avg_dir_size);
2112 size += file_map.size() * (1 + avg_dir_size + avg_file_size);
11fdf7f2 2113 return round_up_to(size, super.block_size);
7c673cae
FG
2114}
2115
2116void BlueFS::compact_log()
2117{
f6b5b4d7
TL
2118 std::unique_lock<ceph::mutex> l(lock);
2119 if (!cct->_conf->bluefs_replay_recovery_disable_compact) {
2120 if (cct->_conf->bluefs_compact_log_sync) {
2121 _compact_log_sync();
2122 } else {
2123 _compact_log_async(l);
2124 }
7c673cae
FG
2125 }
2126}
2127
2128bool BlueFS::_should_compact_log()
2129{
2130 uint64_t current = log_writer->file->fnode.size;
2131 uint64_t expected = _estimate_log_size();
2132 float ratio = (float)current / (float)expected;
2133 dout(10) << __func__ << " current 0x" << std::hex << current
2134 << " expected " << expected << std::dec
2135 << " ratio " << ratio
2136 << (new_log ? " (async compaction in progress)" : "")
2137 << dendl;
2138 if (new_log ||
2139 current < cct->_conf->bluefs_log_compact_min_size ||
2140 ratio < cct->_conf->bluefs_log_compact_min_ratio) {
2141 return false;
2142 }
2143 return true;
2144}
2145
11fdf7f2
TL
2146void BlueFS::_compact_log_dump_metadata(bluefs_transaction_t *t,
2147 int flags)
7c673cae
FG
2148{
2149 t->seq = 1;
2150 t->uuid = super.uuid;
2151 dout(20) << __func__ << " op_init" << dendl;
2152
2153 t->op_init();
2154 for (unsigned bdev = 0; bdev < MAX_BDEV; ++bdev) {
2155 interval_set<uint64_t>& p = block_all[bdev];
2156 for (interval_set<uint64_t>::iterator q = p.begin(); q != p.end(); ++q) {
11fdf7f2
TL
2157 auto bdev_new = bdev;
2158 if ((flags & REMOVE_WAL) && bdev == BDEV_WAL) {
2159 continue;
2160 }
2161 if ((flags & REMOVE_DB) && bdev == BDEV_DB) {
2162 continue;
2163 }
2164 if ((flags & RENAME_SLOW2DB) && bdev == BDEV_SLOW) {
2165 bdev_new = BDEV_DB;
2166 }
2167 if ((flags & RENAME_DB2SLOW) && bdev == BDEV_DB) {
2168 bdev_new = BDEV_SLOW;
2169 }
2170 if (bdev == BDEV_NEWDB) {
2171 // REMOVE_DB xor RENAME_DB
2172 ceph_assert(!(flags & REMOVE_DB) != !(flags & RENAME_DB2SLOW));
2173 ceph_assert(!(flags & RENAME_SLOW2DB));
2174 bdev_new = BDEV_DB;
2175 }
2176 if (bdev == BDEV_NEWWAL) {
2177 ceph_assert(flags & REMOVE_WAL);
2178 bdev_new = BDEV_WAL;
2179 }
2180 dout(20) << __func__ << " op_alloc_add " << bdev_new << " 0x"
7c673cae
FG
2181 << std::hex << q.get_start() << "~" << q.get_len() << std::dec
2182 << dendl;
11fdf7f2 2183 t->op_alloc_add(bdev_new, q.get_start(), q.get_len());
7c673cae
FG
2184 }
2185 }
9f95a23c
TL
2186 for (auto& [ino, file_ref] : file_map) {
2187 if (ino == 1)
7c673cae 2188 continue;
9f95a23c 2189 ceph_assert(ino > 1);
11fdf7f2 2190
9f95a23c 2191 for(auto& e : file_ref->fnode.extents) {
11fdf7f2
TL
2192 auto bdev = e.bdev;
2193 auto bdev_new = bdev;
2194 ceph_assert(!((flags & REMOVE_WAL) && bdev == BDEV_WAL));
2195 if ((flags & RENAME_SLOW2DB) && bdev == BDEV_SLOW) {
2196 bdev_new = BDEV_DB;
2197 }
2198 if ((flags & RENAME_DB2SLOW) && bdev == BDEV_DB) {
2199 bdev_new = BDEV_SLOW;
2200 }
2201 if (bdev == BDEV_NEWDB) {
2202 // REMOVE_DB xor RENAME_DB
2203 ceph_assert(!(flags & REMOVE_DB) != !(flags & RENAME_DB2SLOW));
2204 ceph_assert(!(flags & RENAME_SLOW2DB));
2205 bdev_new = BDEV_DB;
2206 }
2207 if (bdev == BDEV_NEWWAL) {
2208 ceph_assert(flags & REMOVE_WAL);
2209 bdev_new = BDEV_WAL;
2210 }
2211 e.bdev = bdev_new;
2212 }
9f95a23c
TL
2213 dout(20) << __func__ << " op_file_update " << file_ref->fnode << dendl;
2214 t->op_file_update(file_ref->fnode);
7c673cae 2215 }
9f95a23c
TL
2216 for (auto& [path, dir_ref] : dir_map) {
2217 dout(20) << __func__ << " op_dir_create " << path << dendl;
2218 t->op_dir_create(path);
2219 for (auto& [fname, file_ref] : dir_ref->file_map) {
2220 dout(20) << __func__ << " op_dir_link " << path << "/" << fname
2221 << " to " << file_ref->fnode.ino << dendl;
2222 t->op_dir_link(path, fname, file_ref->fnode.ino);
7c673cae
FG
2223 }
2224 }
2225}
2226
2227void BlueFS::_compact_log_sync()
2228{
2229 dout(10) << __func__ << dendl;
9f95a23c
TL
2230 auto prefer_bdev =
2231 vselector->select_prefer_bdev(log_writer->file->vselector_hint);
2232 _rewrite_log_and_layout_sync(true,
11fdf7f2 2233 BDEV_DB,
9f95a23c
TL
2234 prefer_bdev,
2235 prefer_bdev,
2236 0,
2237 super.memorized_layout);
11fdf7f2
TL
2238 logger->inc(l_bluefs_log_compactions);
2239}
2240
9f95a23c
TL
2241void BlueFS::_rewrite_log_and_layout_sync(bool allocate_with_fallback,
2242 int super_dev,
2243 int log_dev,
2244 int log_dev_new,
2245 int flags,
2246 std::optional<bluefs_layout_t> layout)
11fdf7f2 2247{
7c673cae
FG
2248 File *log_file = log_writer->file.get();
2249
2250 // clear out log (be careful who calls us!!!)
2251 log_t.clear();
2252
11fdf7f2
TL
2253 dout(20) << __func__ << " super_dev:" << super_dev
2254 << " log_dev:" << log_dev
2255 << " log_dev_new:" << log_dev_new
2256 << " flags:" << flags
2257 << dendl;
7c673cae 2258 bluefs_transaction_t t;
11fdf7f2 2259 _compact_log_dump_metadata(&t, flags);
7c673cae
FG
2260
2261 dout(20) << __func__ << " op_jump_seq " << log_seq << dendl;
2262 t.op_jump_seq(log_seq);
2263
2264 bufferlist bl;
11fdf7f2 2265 encode(t, bl);
7c673cae
FG
2266 _pad_bl(bl);
2267
2268 uint64_t need = bl.length() + cct->_conf->bluefs_max_log_runway;
2269 dout(20) << __func__ << " need " << need << dendl;
2270
494da23a 2271 bluefs_fnode_t old_fnode;
11fdf7f2 2272 int r;
494da23a 2273 log_file->fnode.swap_extents(old_fnode);
11fdf7f2
TL
2274 if (allocate_with_fallback) {
2275 r = _allocate(log_dev, need, &log_file->fnode);
2276 ceph_assert(r == 0);
2277 } else {
2278 PExtentVector extents;
2279 r = _allocate_without_fallback(log_dev,
2280 need,
2281 &extents);
2282 ceph_assert(r == 0);
2283 for (auto& p : extents) {
2284 log_file->fnode.append_extent(
2285 bluefs_extent_t(log_dev, p.offset, p.length));
2286 }
7c673cae
FG
2287 }
2288
2289 _close_writer(log_writer);
2290
2291 log_file->fnode.size = bl.length();
9f95a23c
TL
2292 vselector->sub_usage(log_file->vselector_hint, old_fnode);
2293 vselector->add_usage(log_file->vselector_hint, log_file->fnode);
2294
7c673cae
FG
2295 log_writer = _create_writer(log_file);
2296 log_writer->append(bl);
11fdf7f2
TL
2297 r = _flush(log_writer, true);
2298 ceph_assert(r == 0);
2299#ifdef HAVE_LIBAIO
2300 if (!cct->_conf->bluefs_sync_write) {
2301 list<aio_t> completed_ios;
2302 _claim_completed_aios(log_writer, &completed_ios);
2303 wait_for_aio(log_writer);
2304 completed_ios.clear();
2305 }
2306#endif
224ce89b 2307 flush_bdev();
224ce89b 2308
9f95a23c 2309 super.memorized_layout = layout;
7c673cae 2310 super.log_fnode = log_file->fnode;
11fdf7f2
TL
2311 // rename device if needed
2312 if (log_dev != log_dev_new) {
2313 dout(10) << __func__ << " renaming log extents to " << log_dev_new << dendl;
2314 for (auto& p : super.log_fnode.extents) {
2315 p.bdev = log_dev_new;
2316 }
2317 }
2318 dout(10) << __func__ << " writing super, log fnode: " << super.log_fnode << dendl;
2319
7c673cae 2320 ++super.version;
11fdf7f2 2321 _write_super(super_dev);
7c673cae
FG
2322 flush_bdev();
2323
494da23a
TL
2324 dout(10) << __func__ << " release old log extents " << old_fnode.extents << dendl;
2325 for (auto& r : old_fnode.extents) {
7c673cae
FG
2326 pending_release[r.bdev].insert(r.offset, r.length);
2327 }
7c673cae
FG
2328}
2329
2330/*
2331 * 1. Allocate a new extent to continue the log, and then log an event
2332 * that jumps the log write position to the new extent. At this point, the
2333 * old extent(s) won't be written to, and reflect everything to compact.
2334 * New events will be written to the new region that we'll keep.
2335 *
2336 * 2. While still holding the lock, encode a bufferlist that dumps all of the
2337 * in-memory fnodes and names. This will become the new beginning of the
2338 * log. The last event will jump to the log continuation extent from #1.
2339 *
2340 * 3. Queue a write to a new extent for the new beginnging of the log.
2341 *
2342 * 4. Drop lock and wait
2343 *
2344 * 5. Retake the lock.
2345 *
2346 * 6. Update the log_fnode to splice in the new beginning.
2347 *
2348 * 7. Write the new superblock.
2349 *
2350 * 8. Release the old log space. Clean up.
2351 */
11fdf7f2 2352void BlueFS::_compact_log_async(std::unique_lock<ceph::mutex>& l)
7c673cae
FG
2353{
2354 dout(10) << __func__ << dendl;
2355 File *log_file = log_writer->file.get();
11fdf7f2
TL
2356 ceph_assert(!new_log);
2357 ceph_assert(!new_log_writer);
7c673cae 2358
181888fb
FG
2359 // create a new log [writer] so that we know compaction is in progress
2360 // (see _should_compact_log)
9f95a23c 2361 new_log = ceph::make_ref<File>();
181888fb
FG
2362 new_log->fnode.ino = 0; // so that _flush_range won't try to log the fnode
2363
3efd9988
FG
2364 // 0. wait for any racing flushes to complete. (We do not want to block
2365 // in _flush_sync_log with jump_to set or else a racing thread might flush
2366 // our entries and our jump_to update won't be correct.)
2367 while (log_flushing) {
2368 dout(10) << __func__ << " log is currently flushing, waiting" << dendl;
2369 log_cond.wait(l);
2370 }
2371
9f95a23c
TL
2372 vselector->sub_usage(log_file->vselector_hint, log_file->fnode);
2373
7c673cae
FG
2374 // 1. allocate new log space and jump to it.
2375 old_log_jump_to = log_file->fnode.get_allocated();
7c673cae 2376 dout(10) << __func__ << " old_log_jump_to 0x" << std::hex << old_log_jump_to
11fdf7f2 2377 << " need 0x" << (old_log_jump_to + cct->_conf->bluefs_max_log_runway) << std::dec << dendl;
9f95a23c
TL
2378 int r = _allocate(vselector->select_prefer_bdev(log_file->vselector_hint),
2379 cct->_conf->bluefs_max_log_runway,
2380 &log_file->fnode);
11fdf7f2 2381 ceph_assert(r == 0);
9f95a23c
TL
2382 //adjust usage as flush below will need it
2383 vselector->add_usage(log_file->vselector_hint, log_file->fnode);
7c673cae
FG
2384 dout(10) << __func__ << " log extents " << log_file->fnode.extents << dendl;
2385
2386 // update the log file change and log a jump to the offset where we want to
2387 // write the new entries
2388 log_t.op_file_update(log_file->fnode);
2389 log_t.op_jump(log_seq, old_log_jump_to);
2390
2391 flush_bdev(); // FIXME?
2392
2393 _flush_and_sync_log(l, 0, old_log_jump_to);
2394
2395 // 2. prepare compacted log
2396 bluefs_transaction_t t;
224ce89b
WB
2397 //avoid record two times in log_t and _compact_log_dump_metadata.
2398 log_t.clear();
11fdf7f2 2399 _compact_log_dump_metadata(&t, 0);
7c673cae 2400
eafe8130
TL
2401 uint64_t max_alloc_size = std::max(alloc_size[BDEV_WAL],
2402 std::max(alloc_size[BDEV_DB],
2403 alloc_size[BDEV_SLOW]));
2404
7c673cae 2405 // conservative estimate for final encoded size
11fdf7f2 2406 new_log_jump_to = round_up_to(t.op_bl.length() + super.block_size * 2,
eafe8130 2407 max_alloc_size);
7c673cae
FG
2408 t.op_jump(log_seq, new_log_jump_to);
2409
11fdf7f2 2410 // allocate
9f95a23c 2411 //FIXME: check if we want DB here?
11fdf7f2
TL
2412 r = _allocate(BlueFS::BDEV_DB, new_log_jump_to,
2413 &new_log->fnode);
2414 ceph_assert(r == 0);
2415
2416 // we might have some more ops in log_t due to _allocate call
2417 t.claim_ops(log_t);
2418
7c673cae 2419 bufferlist bl;
11fdf7f2 2420 encode(t, bl);
7c673cae
FG
2421 _pad_bl(bl);
2422
2423 dout(10) << __func__ << " new_log_jump_to 0x" << std::hex << new_log_jump_to
2424 << std::dec << dendl;
2425
7c673cae
FG
2426 new_log_writer = _create_writer(new_log);
2427 new_log_writer->append(bl);
2428
2429 // 3. flush
2430 r = _flush(new_log_writer, true);
11fdf7f2 2431 ceph_assert(r == 0);
7c673cae
FG
2432
2433 // 4. wait
11fdf7f2 2434 _flush_bdev_safely(new_log_writer);
7c673cae 2435
11fdf7f2 2436 // 5. update our log fnode
7c673cae 2437 // discard first old_log_jump_to extents
9f95a23c 2438
7c673cae
FG
2439 dout(10) << __func__ << " remove 0x" << std::hex << old_log_jump_to << std::dec
2440 << " of " << log_file->fnode.extents << dendl;
2441 uint64_t discarded = 0;
2442 mempool::bluefs::vector<bluefs_extent_t> old_extents;
2443 while (discarded < old_log_jump_to) {
11fdf7f2 2444 ceph_assert(!log_file->fnode.extents.empty());
7c673cae
FG
2445 bluefs_extent_t& e = log_file->fnode.extents.front();
2446 bluefs_extent_t temp = e;
2447 if (discarded + e.length <= old_log_jump_to) {
2448 dout(10) << __func__ << " remove old log extent " << e << dendl;
2449 discarded += e.length;
94b18763 2450 log_file->fnode.pop_front_extent();
7c673cae
FG
2451 } else {
2452 dout(10) << __func__ << " remove front of old log extent " << e << dendl;
2453 uint64_t drop = old_log_jump_to - discarded;
2454 temp.length = drop;
2455 e.offset += drop;
2456 e.length -= drop;
2457 discarded += drop;
2458 dout(10) << __func__ << " kept " << e << " removed " << temp << dendl;
2459 }
2460 old_extents.push_back(temp);
2461 }
94b18763
FG
2462 auto from = log_file->fnode.extents.begin();
2463 auto to = log_file->fnode.extents.end();
2464 while (from != to) {
2465 new_log->fnode.append_extent(*from);
2466 ++from;
2467 }
7c673cae 2468
9f95a23c
TL
2469 vselector->sub_usage(log_file->vselector_hint, log_file->fnode);
2470
7c673cae 2471 // clear the extents from old log file, they are added to new log
94b18763 2472 log_file->fnode.clear_extents();
7c673cae 2473 // swap the log files. New log file is the log file now.
94b18763
FG
2474 new_log->fnode.swap_extents(log_file->fnode);
2475
7c673cae
FG
2476 log_writer->pos = log_writer->file->fnode.size =
2477 log_writer->pos - old_log_jump_to + new_log_jump_to;
2478
9f95a23c
TL
2479 vselector->add_usage(log_file->vselector_hint, log_file->fnode);
2480
11fdf7f2 2481 // 6. write the super block to reflect the changes
7c673cae
FG
2482 dout(10) << __func__ << " writing super" << dendl;
2483 super.log_fnode = log_file->fnode;
2484 ++super.version;
11fdf7f2 2485 _write_super(BDEV_DB);
7c673cae
FG
2486
2487 lock.unlock();
2488 flush_bdev();
2489 lock.lock();
2490
11fdf7f2 2491 // 7. release old space
7c673cae
FG
2492 dout(10) << __func__ << " release old log extents " << old_extents << dendl;
2493 for (auto& r : old_extents) {
2494 pending_release[r.bdev].insert(r.offset, r.length);
2495 }
2496
2497 // delete the new log, remove from the dirty files list
2498 _close_writer(new_log_writer);
2499 if (new_log->dirty_seq) {
11fdf7f2 2500 ceph_assert(dirty_files.count(new_log->dirty_seq));
7c673cae
FG
2501 auto it = dirty_files[new_log->dirty_seq].iterator_to(*new_log);
2502 dirty_files[new_log->dirty_seq].erase(it);
2503 }
2504 new_log_writer = nullptr;
2505 new_log = nullptr;
2506 log_cond.notify_all();
2507
2508 dout(10) << __func__ << " log extents " << log_file->fnode.extents << dendl;
2509 logger->inc(l_bluefs_log_compactions);
2510}
2511
2512void BlueFS::_pad_bl(bufferlist& bl)
2513{
2514 uint64_t partial = bl.length() % super.block_size;
2515 if (partial) {
2516 dout(10) << __func__ << " padding with 0x" << std::hex
2517 << super.block_size - partial << " zeros" << std::dec << dendl;
2518 bl.append_zero(super.block_size - partial);
2519 }
2520}
2521
7c673cae 2522
11fdf7f2 2523int BlueFS::_flush_and_sync_log(std::unique_lock<ceph::mutex>& l,
7c673cae
FG
2524 uint64_t want_seq,
2525 uint64_t jump_to)
2526{
2527 while (log_flushing) {
2528 dout(10) << __func__ << " want_seq " << want_seq
2529 << " log is currently flushing, waiting" << dendl;
11fdf7f2 2530 ceph_assert(!jump_to);
7c673cae
FG
2531 log_cond.wait(l);
2532 }
2533 if (want_seq && want_seq <= log_seq_stable) {
2534 dout(10) << __func__ << " want_seq " << want_seq << " <= log_seq_stable "
2535 << log_seq_stable << ", done" << dendl;
11fdf7f2 2536 ceph_assert(!jump_to);
7c673cae
FG
2537 return 0;
2538 }
2539 if (log_t.empty() && dirty_files.empty()) {
2540 dout(10) << __func__ << " want_seq " << want_seq
2541 << " " << log_t << " not dirty, dirty_files empty, no-op" << dendl;
11fdf7f2 2542 ceph_assert(!jump_to);
7c673cae
FG
2543 return 0;
2544 }
2545
a8e16298
TL
2546 vector<interval_set<uint64_t>> to_release(pending_release.size());
2547 to_release.swap(pending_release);
2548
7c673cae 2549 uint64_t seq = log_t.seq = ++log_seq;
11fdf7f2 2550 ceph_assert(want_seq == 0 || want_seq <= seq);
7c673cae
FG
2551 log_t.uuid = super.uuid;
2552
2553 // log dirty files
2554 auto lsi = dirty_files.find(seq);
2555 if (lsi != dirty_files.end()) {
2556 dout(20) << __func__ << " " << lsi->second.size() << " dirty_files" << dendl;
2557 for (auto &f : lsi->second) {
2558 dout(20) << __func__ << " op_file_update " << f.fnode << dendl;
2559 log_t.op_file_update(f.fnode);
2560 }
2561 }
2562
2563 dout(10) << __func__ << " " << log_t << dendl;
11fdf7f2 2564 ceph_assert(!log_t.empty());
7c673cae
FG
2565
2566 // allocate some more space (before we run out)?
2567 int64_t runway = log_writer->file->fnode.get_allocated() -
2568 log_writer->get_effective_write_pos();
f6b5b4d7 2569 bool just_expanded_log = false;
7c673cae
FG
2570 if (runway < (int64_t)cct->_conf->bluefs_min_log_runway) {
2571 dout(10) << __func__ << " allocating more log runway (0x"
2572 << std::hex << runway << std::dec << " remaining)" << dendl;
2573 while (new_log_writer) {
2574 dout(10) << __func__ << " waiting for async compaction" << dendl;
2575 log_cond.wait(l);
2576 }
9f95a23c
TL
2577 vselector->sub_usage(log_writer->file->vselector_hint, log_writer->file->fnode);
2578 int r = _allocate(
2579 vselector->select_prefer_bdev(log_writer->file->vselector_hint),
2580 cct->_conf->bluefs_max_log_runway,
2581 &log_writer->file->fnode);
11fdf7f2 2582 ceph_assert(r == 0);
9f95a23c 2583 vselector->add_usage(log_writer->file->vselector_hint, log_writer->file->fnode);
7c673cae 2584 log_t.op_file_update(log_writer->file->fnode);
f6b5b4d7 2585 just_expanded_log = true;
7c673cae
FG
2586 }
2587
2588 bufferlist bl;
11fdf7f2
TL
2589 bl.reserve(super.block_size);
2590 encode(log_t, bl);
7c673cae 2591 // pad to block boundary
11fdf7f2
TL
2592 size_t realign = super.block_size - (bl.length() % super.block_size);
2593 if (realign && realign != super.block_size)
2594 bl.append_zero(realign);
2595
7c673cae
FG
2596 logger->inc(l_bluefs_logged_bytes, bl.length());
2597
f6b5b4d7
TL
2598 if (just_expanded_log) {
2599 ceph_assert(bl.length() <= runway); // if we write this, we will have an unrecoverable data loss
2600 }
2601
7c673cae
FG
2602 log_writer->append(bl);
2603
2604 log_t.clear();
2605 log_t.seq = 0; // just so debug output is less confusing
2606 log_flushing = true;
2607
2608 int r = _flush(log_writer, true);
11fdf7f2 2609 ceph_assert(r == 0);
7c673cae
FG
2610
2611 if (jump_to) {
2612 dout(10) << __func__ << " jumping log offset from 0x" << std::hex
2613 << log_writer->pos << " -> 0x" << jump_to << std::dec << dendl;
2614 log_writer->pos = jump_to;
9f95a23c 2615 vselector->sub_usage(log_writer->file->vselector_hint, log_writer->file->fnode.size);
7c673cae 2616 log_writer->file->fnode.size = jump_to;
9f95a23c 2617 vselector->add_usage(log_writer->file->vselector_hint, log_writer->file->fnode.size);
7c673cae
FG
2618 }
2619
2620 _flush_bdev_safely(log_writer);
2621
2622 log_flushing = false;
2623 log_cond.notify_all();
2624
2625 // clean dirty files
2626 if (seq > log_seq_stable) {
2627 log_seq_stable = seq;
2628 dout(20) << __func__ << " log_seq_stable " << log_seq_stable << dendl;
2629
2630 auto p = dirty_files.begin();
2631 while (p != dirty_files.end()) {
2632 if (p->first > log_seq_stable) {
2633 dout(20) << __func__ << " done cleaning up dirty files" << dendl;
2634 break;
2635 }
2636
2637 auto l = p->second.begin();
2638 while (l != p->second.end()) {
2639 File *file = &*l;
11fdf7f2
TL
2640 ceph_assert(file->dirty_seq > 0);
2641 ceph_assert(file->dirty_seq <= log_seq_stable);
7c673cae
FG
2642 dout(20) << __func__ << " cleaned file " << file->fnode << dendl;
2643 file->dirty_seq = 0;
2644 p->second.erase(l++);
2645 }
2646
11fdf7f2 2647 ceph_assert(p->second.empty());
7c673cae
FG
2648 dirty_files.erase(p++);
2649 }
2650 } else {
2651 dout(20) << __func__ << " log_seq_stable " << log_seq_stable
2652 << " already >= out seq " << seq
2653 << ", we lost a race against another log flush, done" << dendl;
2654 }
a8e16298
TL
2655
2656 for (unsigned i = 0; i < to_release.size(); ++i) {
2657 if (!to_release[i].empty()) {
2658 /* OK, now we have the guarantee alloc[i] won't be null. */
11fdf7f2
TL
2659 int r = 0;
2660 if (cct->_conf->bdev_enable_discard && cct->_conf->bdev_async_discard) {
2661 r = bdev[i]->queue_discard(to_release[i]);
2662 if (r == 0)
2663 continue;
2664 } else if (cct->_conf->bdev_enable_discard) {
2665 for (auto p = to_release[i].begin(); p != to_release[i].end(); ++p) {
2666 bdev[i]->discard(p.get_start(), p.get_len());
2667 }
2668 }
a8e16298
TL
2669 alloc[i]->release(to_release[i]);
2670 }
2671 }
2672
7c673cae
FG
2673 _update_logger_stats();
2674
2675 return 0;
2676}
2677
2678int BlueFS::_flush_range(FileWriter *h, uint64_t offset, uint64_t length)
2679{
2680 dout(10) << __func__ << " " << h << " pos 0x" << std::hex << h->pos
2681 << " 0x" << offset << "~" << length << std::dec
2682 << " to " << h->file->fnode << dendl;
11fdf7f2
TL
2683 ceph_assert(!h->file->deleted);
2684 ceph_assert(h->file->num_readers.load() == 0);
7c673cae
FG
2685
2686 h->buffer_appender.flush();
2687
2688 bool buffered;
2689 if (h->file->fnode.ino == 1)
2690 buffered = false;
2691 else
2692 buffered = cct->_conf->bluefs_buffered_io;
2693
2694 if (offset + length <= h->pos)
2695 return 0;
2696 if (offset < h->pos) {
2697 length -= h->pos - offset;
2698 offset = h->pos;
2699 dout(10) << " still need 0x"
2700 << std::hex << offset << "~" << length << std::dec
2701 << dendl;
2702 }
11fdf7f2 2703 ceph_assert(offset <= h->file->fnode.size);
7c673cae
FG
2704
2705 uint64_t allocated = h->file->fnode.get_allocated();
9f95a23c 2706 vselector->sub_usage(h->file->vselector_hint, h->file->fnode);
7c673cae
FG
2707 // do not bother to dirty the file if we are overwriting
2708 // previously allocated extents.
2709 bool must_dirty = false;
2710 if (allocated < offset + length) {
2711 // we should never run out of log space here; see the min runway check
2712 // in _flush_and_sync_log.
11fdf7f2 2713 ceph_assert(h->file->fnode.ino != 1);
9f95a23c 2714 int r = _allocate(vselector->select_prefer_bdev(h->file->vselector_hint),
7c673cae 2715 offset + length - allocated,
94b18763 2716 &h->file->fnode);
7c673cae
FG
2717 if (r < 0) {
2718 derr << __func__ << " allocated: 0x" << std::hex << allocated
2719 << " offset: 0x" << offset << " length: 0x" << length << std::dec
2720 << dendl;
9f95a23c 2721 vselector->add_usage(h->file->vselector_hint, h->file->fnode); // undo
11fdf7f2 2722 ceph_abort_msg("bluefs enospc");
7c673cae
FG
2723 return r;
2724 }
7c673cae
FG
2725 must_dirty = true;
2726 }
2727 if (h->file->fnode.size < offset + length) {
2728 h->file->fnode.size = offset + length;
2729 if (h->file->fnode.ino > 1) {
2730 // we do not need to dirty the log file (or it's compacting
2731 // replacement) when the file size changes because replay is
2732 // smart enough to discover it on its own.
2733 must_dirty = true;
2734 }
2735 }
2736 if (must_dirty) {
2737 h->file->fnode.mtime = ceph_clock_now();
11fdf7f2 2738 ceph_assert(h->file->fnode.ino >= 1);
7c673cae
FG
2739 if (h->file->dirty_seq == 0) {
2740 h->file->dirty_seq = log_seq + 1;
2741 dirty_files[h->file->dirty_seq].push_back(*h->file);
2742 dout(20) << __func__ << " dirty_seq = " << log_seq + 1
2743 << " (was clean)" << dendl;
2744 } else {
2745 if (h->file->dirty_seq != log_seq + 1) {
2746 // need re-dirty, erase from list first
11fdf7f2 2747 ceph_assert(dirty_files.count(h->file->dirty_seq));
7c673cae
FG
2748 auto it = dirty_files[h->file->dirty_seq].iterator_to(*h->file);
2749 dirty_files[h->file->dirty_seq].erase(it);
2750 h->file->dirty_seq = log_seq + 1;
2751 dirty_files[h->file->dirty_seq].push_back(*h->file);
2752 dout(20) << __func__ << " dirty_seq = " << log_seq + 1
2753 << " (was " << h->file->dirty_seq << ")" << dendl;
2754 } else {
2755 dout(20) << __func__ << " dirty_seq = " << log_seq + 1
2756 << " (unchanged, do nothing) " << dendl;
2757 }
2758 }
2759 }
2760 dout(20) << __func__ << " file now " << h->file->fnode << dendl;
2761
2762 uint64_t x_off = 0;
2763 auto p = h->file->fnode.seek(offset, &x_off);
11fdf7f2 2764 ceph_assert(p != h->file->fnode.extents.end());
7c673cae
FG
2765 dout(20) << __func__ << " in " << *p << " x_off 0x"
2766 << std::hex << x_off << std::dec << dendl;
2767
2768 unsigned partial = x_off & ~super.block_mask();
2769 bufferlist bl;
2770 if (partial) {
2771 dout(20) << __func__ << " using partial tail 0x"
2772 << std::hex << partial << std::dec << dendl;
11fdf7f2 2773 ceph_assert(h->tail_block.length() == partial);
31f18b77 2774 bl.claim_append_piecewise(h->tail_block);
7c673cae
FG
2775 x_off -= partial;
2776 offset -= partial;
2777 length += partial;
2778 dout(20) << __func__ << " waiting for previous aio to complete" << dendl;
2779 for (auto p : h->iocv) {
2780 if (p) {
2781 p->aio_wait();
2782 }
2783 }
2784 }
f91f0fd5 2785 if (length == partial + h->buffer.length()) {
9f95a23c 2786 /* in case of inital allocation and need to zero, limited flush is unacceptable */
31f18b77 2787 bl.claim_append_piecewise(h->buffer);
7c673cae
FG
2788 } else {
2789 bufferlist t;
31f18b77
FG
2790 h->buffer.splice(0, length, &t);
2791 bl.claim_append_piecewise(t);
7c673cae
FG
2792 t.substr_of(h->buffer, length, h->buffer.length() - length);
2793 h->buffer.swap(t);
2794 dout(20) << " leaving 0x" << std::hex << h->buffer.length() << std::dec
2795 << " unflushed" << dendl;
2796 }
11fdf7f2 2797 ceph_assert(bl.length() == length);
7c673cae 2798
9f95a23c
TL
2799 h->pos = offset + length;
2800
2801 unsigned tail = bl.length() & ~super.block_mask();
2802 if (tail) {
2803 dout(20) << __func__ << " caching tail of 0x"
2804 << std::hex << tail
2805 << " and padding block with 0x" << (super.block_size - tail)
2806 << std::dec << dendl;
2807 h->tail_block.substr_of(bl, bl.length() - tail, tail);
2808 bl.append_zero(super.block_size - tail);
2809 length += super.block_size - tail;
2810 } else {
2811 h->tail_block.clear();
2812 }
9f95a23c
TL
2813 ceph_assert(bl.length() == length);
2814
7c673cae
FG
2815 switch (h->writer_type) {
2816 case WRITER_WAL:
2817 logger->inc(l_bluefs_bytes_written_wal, length);
2818 break;
2819 case WRITER_SST:
2820 logger->inc(l_bluefs_bytes_written_sst, length);
2821 break;
2822 }
2823
2824 dout(30) << "dump:\n";
2825 bl.hexdump(*_dout);
2826 *_dout << dendl;
2827
7c673cae 2828 uint64_t bloff = 0;
11fdf7f2 2829 uint64_t bytes_written_slow = 0;
7c673cae 2830 while (length > 0) {
11fdf7f2 2831 uint64_t x_len = std::min(p->length - x_off, length);
7c673cae
FG
2832 bufferlist t;
2833 t.substr_of(bl, bloff, x_len);
7c673cae 2834 if (cct->_conf->bluefs_sync_write) {
11fdf7f2 2835 bdev[p->bdev]->write(p->offset + x_off, t, buffered, h->write_hint);
7c673cae 2836 } else {
11fdf7f2
TL
2837 bdev[p->bdev]->aio_write(p->offset + x_off, t, h->iocv[p->bdev], buffered, h->write_hint);
2838 }
2839 h->dirty_devs[p->bdev] = true;
2840 if (p->bdev == BDEV_SLOW) {
2841 bytes_written_slow += t.length();
7c673cae 2842 }
11fdf7f2 2843
7c673cae
FG
2844 bloff += x_len;
2845 length -= x_len;
2846 ++p;
2847 x_off = 0;
2848 }
11fdf7f2 2849 logger->inc(l_bluefs_bytes_written_slow, bytes_written_slow);
7c673cae
FG
2850 for (unsigned i = 0; i < MAX_BDEV; ++i) {
2851 if (bdev[i]) {
11fdf7f2 2852 if (h->iocv[i] && h->iocv[i]->has_pending_aios()) {
7c673cae
FG
2853 bdev[i]->aio_submit(h->iocv[i]);
2854 }
2855 }
2856 }
9f95a23c 2857 vselector->add_usage(h->file->vselector_hint, h->file->fnode);
7c673cae
FG
2858 dout(20) << __func__ << " h " << h << " pos now 0x"
2859 << std::hex << h->pos << std::dec << dendl;
2860 return 0;
2861}
2862
11fdf7f2 2863#ifdef HAVE_LIBAIO
7c673cae
FG
2864// we need to retire old completed aios so they don't stick around in
2865// memory indefinitely (along with their bufferlist refs).
2866void BlueFS::_claim_completed_aios(FileWriter *h, list<aio_t> *ls)
2867{
2868 for (auto p : h->iocv) {
2869 if (p) {
2870 ls->splice(ls->end(), p->running_aios);
2871 }
2872 }
2873 dout(10) << __func__ << " got " << ls->size() << " aios" << dendl;
2874}
2875
2876void BlueFS::wait_for_aio(FileWriter *h)
2877{
2878 // NOTE: this is safe to call without a lock, as long as our reference is
2879 // stable.
2880 dout(10) << __func__ << " " << h << dendl;
2881 utime_t start = ceph_clock_now();
2882 for (auto p : h->iocv) {
2883 if (p) {
2884 p->aio_wait();
2885 }
2886 }
11fdf7f2 2887 dout(10) << __func__ << " " << h << " done in " << (ceph_clock_now() - start) << dendl;
7c673cae 2888}
11fdf7f2 2889#endif
7c673cae 2890
f6b5b4d7
TL
2891int BlueFS::_flush(FileWriter *h, bool force, std::unique_lock<ceph::mutex>& l)
2892{
2893 bool flushed = false;
2894 int r = _flush(h, force, &flushed);
2895 if (r == 0 && flushed) {
2896 _maybe_compact_log(l);
2897 }
2898 return r;
2899}
2900
2901int BlueFS::_flush(FileWriter *h, bool force, bool *flushed)
7c673cae
FG
2902{
2903 h->buffer_appender.flush();
2904 uint64_t length = h->buffer.length();
2905 uint64_t offset = h->pos;
f6b5b4d7
TL
2906 if (flushed) {
2907 *flushed = false;
2908 }
7c673cae
FG
2909 if (!force &&
2910 length < cct->_conf->bluefs_min_flush_size) {
2911 dout(10) << __func__ << " " << h << " ignoring, length " << length
2912 << " < min_flush_size " << cct->_conf->bluefs_min_flush_size
2913 << dendl;
2914 return 0;
2915 }
2916 if (length == 0) {
2917 dout(10) << __func__ << " " << h << " no dirty data on "
2918 << h->file->fnode << dendl;
2919 return 0;
2920 }
2921 dout(10) << __func__ << " " << h << " 0x"
2922 << std::hex << offset << "~" << length << std::dec
2923 << " to " << h->file->fnode << dendl;
11fdf7f2 2924 ceph_assert(h->pos <= h->file->fnode.size);
f6b5b4d7
TL
2925 int r = _flush_range(h, offset, length);
2926 if (flushed) {
2927 *flushed = true;
2928 }
2929 return r;
7c673cae
FG
2930}
2931
2932int BlueFS::_truncate(FileWriter *h, uint64_t offset)
2933{
2934 dout(10) << __func__ << " 0x" << std::hex << offset << std::dec
2935 << " file " << h->file->fnode << dendl;
2936 if (h->file->deleted) {
2937 dout(10) << __func__ << " deleted, no-op" << dendl;
2938 return 0;
2939 }
2940
2941 // we never truncate internal log files
11fdf7f2 2942 ceph_assert(h->file->fnode.ino > 1);
7c673cae
FG
2943
2944 h->buffer_appender.flush();
2945
2946 // truncate off unflushed data?
2947 if (h->pos < offset &&
2948 h->pos + h->buffer.length() > offset) {
2949 bufferlist t;
2950 dout(20) << __func__ << " tossing out last " << offset - h->pos
2951 << " unflushed bytes" << dendl;
2952 t.substr_of(h->buffer, 0, offset - h->pos);
2953 h->buffer.swap(t);
11fdf7f2 2954 ceph_abort_msg("actually this shouldn't happen");
7c673cae
FG
2955 }
2956 if (h->buffer.length()) {
2957 int r = _flush(h, true);
2958 if (r < 0)
2959 return r;
2960 }
2961 if (offset == h->file->fnode.size) {
2962 return 0; // no-op!
2963 }
2964 if (offset > h->file->fnode.size) {
11fdf7f2 2965 ceph_abort_msg("truncate up not supported");
7c673cae 2966 }
11fdf7f2 2967 ceph_assert(h->file->fnode.size >= offset);
9f95a23c 2968 vselector->sub_usage(h->file->vselector_hint, h->file->fnode.size);
7c673cae 2969 h->file->fnode.size = offset;
9f95a23c 2970 vselector->add_usage(h->file->vselector_hint, h->file->fnode.size);
7c673cae
FG
2971 log_t.op_file_update(h->file->fnode);
2972 return 0;
2973}
2974
11fdf7f2 2975int BlueFS::_fsync(FileWriter *h, std::unique_lock<ceph::mutex>& l)
7c673cae
FG
2976{
2977 dout(10) << __func__ << " " << h << " " << h->file->fnode << dendl;
2978 int r = _flush(h, true);
2979 if (r < 0)
2980 return r;
2981 uint64_t old_dirty_seq = h->file->dirty_seq;
2982
2983 _flush_bdev_safely(h);
2984
2985 if (old_dirty_seq) {
2986 uint64_t s = log_seq;
2987 dout(20) << __func__ << " file metadata was dirty (" << old_dirty_seq
2988 << ") on " << h->file->fnode << ", flushing log" << dendl;
2989 _flush_and_sync_log(l, old_dirty_seq);
11fdf7f2 2990 ceph_assert(h->file->dirty_seq == 0 || // cleaned
7c673cae
FG
2991 h->file->dirty_seq > s); // or redirtied by someone else
2992 }
2993 return 0;
2994}
2995
2996void BlueFS::_flush_bdev_safely(FileWriter *h)
2997{
11fdf7f2
TL
2998 std::array<bool, MAX_BDEV> flush_devs = h->dirty_devs;
2999 h->dirty_devs.fill(false);
3000#ifdef HAVE_LIBAIO
7c673cae
FG
3001 if (!cct->_conf->bluefs_sync_write) {
3002 list<aio_t> completed_ios;
3003 _claim_completed_aios(h, &completed_ios);
3004 lock.unlock();
3005 wait_for_aio(h);
3006 completed_ios.clear();
11fdf7f2 3007 flush_bdev(flush_devs);
7c673cae 3008 lock.lock();
11fdf7f2
TL
3009 } else
3010#endif
3011 {
7c673cae 3012 lock.unlock();
11fdf7f2 3013 flush_bdev(flush_devs);
7c673cae
FG
3014 lock.lock();
3015 }
3016}
3017
11fdf7f2
TL
3018void BlueFS::flush_bdev(std::array<bool, MAX_BDEV>& dirty_bdevs)
3019{
3020 // NOTE: this is safe to call without a lock.
3021 dout(20) << __func__ << dendl;
3022 for (unsigned i = 0; i < MAX_BDEV; i++) {
3023 if (dirty_bdevs[i])
3024 bdev[i]->flush();
3025 }
3026}
3027
7c673cae
FG
3028void BlueFS::flush_bdev()
3029{
3030 // NOTE: this is safe to call without a lock.
3031 dout(20) << __func__ << dendl;
3032 for (auto p : bdev) {
3033 if (p)
3034 p->flush();
3035 }
3036}
3037
eafe8130
TL
3038const char* BlueFS::get_device_name(unsigned id)
3039{
3040 if (id >= MAX_BDEV) return "BDEV_INV";
3041 const char* names[] = {"BDEV_WAL", "BDEV_DB", "BDEV_SLOW", "BDEV_NEWWAL", "BDEV_NEWDB"};
3042 return names[id];
3043}
3044
11fdf7f2
TL
3045int BlueFS::_expand_slow_device(uint64_t need, PExtentVector& extents)
3046{
3047 int r = -ENOSPC;
3048 if (slow_dev_expander) {
1911f103 3049 auto id = _get_slow_device_id();
eafe8130 3050 auto min_alloc_size = alloc_size[id];
1911f103 3051 ceph_assert(id <= alloc.size() && alloc[id]);
11fdf7f2
TL
3052 auto min_need = round_up_to(need, min_alloc_size);
3053 need = std::max(need,
3054 slow_dev_expander->get_recommended_expansion_delta(
3055 alloc[id]->get_free(), block_all[id].size()));
3056
3057 need = round_up_to(need, min_alloc_size);
3058 dout(10) << __func__ << " expanding slow device by 0x"
3059 << std::hex << need << std::dec
3060 << dendl;
3061 r = slow_dev_expander->allocate_freespace(min_need, need, extents);
3062 }
3063 return r;
3064}
3065
3066int BlueFS::_allocate_without_fallback(uint8_t id, uint64_t len,
3067 PExtentVector* extents)
3068{
3069 dout(10) << __func__ << " len 0x" << std::hex << len << std::dec
3070 << " from " << (int)id << dendl;
3071 assert(id < alloc.size());
11fdf7f2
TL
3072 if (!alloc[id]) {
3073 return -ENOENT;
3074 }
3075 extents->reserve(4); // 4 should be (more than) enough for most allocations
eafe8130
TL
3076 uint64_t min_alloc_size = alloc_size[id];
3077 uint64_t left = round_up_to(len, min_alloc_size);
11fdf7f2 3078 int64_t alloc_len = alloc[id]->allocate(left, min_alloc_size, 0, extents);
eafe8130
TL
3079 if (alloc_len < 0 || alloc_len < (int64_t)left) {
3080 if (alloc_len > 0) {
11fdf7f2
TL
3081 alloc[id]->release(*extents);
3082 }
3083 if (bdev[id])
3084 derr << __func__ << " failed to allocate 0x" << std::hex << left
3085 << " on bdev " << (int)id
3086 << ", free 0x" << alloc[id]->get_free() << std::dec << dendl;
3087 else
3088 derr << __func__ << " failed to allocate 0x" << std::hex << left
3089 << " on bdev " << (int)id << ", dne" << std::dec << dendl;
3090 if (alloc[id])
3091 alloc[id]->dump();
3092 return -ENOSPC;
3093 }
3094
3095 return 0;
3096}
3097
7c673cae 3098int BlueFS::_allocate(uint8_t id, uint64_t len,
94b18763 3099 bluefs_fnode_t* node)
7c673cae
FG
3100{
3101 dout(10) << __func__ << " len 0x" << std::hex << len << std::dec
3102 << " from " << (int)id << dendl;
11fdf7f2 3103 ceph_assert(id < alloc.size());
b32b8144 3104 int64_t alloc_len = 0;
a8e16298 3105 PExtentVector extents;
11fdf7f2 3106 uint64_t hint = 0;
7c673cae 3107 if (alloc[id]) {
94b18763
FG
3108 if (!node->extents.empty() && node->extents.back().bdev == id) {
3109 hint = node->extents.back().end();
11fdf7f2 3110 }
b32b8144 3111 extents.reserve(4); // 4 should be (more than) enough for most allocations
eafe8130
TL
3112 alloc_len = alloc[id]->allocate(round_up_to(len, alloc_size[id]),
3113 alloc_size[id], hint, &extents);
b32b8144 3114 }
eafe8130
TL
3115 if (!alloc[id] ||
3116 alloc_len < 0 ||
3117 alloc_len < (int64_t)round_up_to(len, alloc_size[id])) {
11fdf7f2 3118 if (alloc_len > 0) {
a8e16298 3119 alloc[id]->release(extents);
b32b8144 3120 }
7c673cae
FG
3121 if (id != BDEV_SLOW) {
3122 if (bdev[id]) {
eafe8130 3123 dout(1) << __func__ << " failed to allocate 0x" << std::hex << len
7c673cae
FG
3124 << " on bdev " << (int)id
3125 << ", free 0x" << alloc[id]->get_free()
3126 << "; fallback to bdev " << (int)id + 1
3127 << std::dec << dendl;
3128 }
94b18763 3129 return _allocate(id + 1, len, node);
7c673cae 3130 }
eafe8130 3131 dout(1) << __func__ << " unable to allocate 0x" << std::hex << len
11fdf7f2
TL
3132 << " on bdev " << (int)id << ", free 0x"
3133 << (alloc[id] ? alloc[id]->get_free() : (uint64_t)-1)
3134 << "; fallback to slow device expander "
3135 << std::dec << dendl;
3136 extents.clear();
eafe8130 3137 if (_expand_slow_device(len, extents) == 0) {
11fdf7f2
TL
3138 id = _get_slow_device_id();
3139 for (auto& e : extents) {
3140 _add_block_extent(id, e.offset, e.length);
3141 }
3142 extents.clear();
3143 auto* last_alloc = alloc[id];
3144 ceph_assert(last_alloc);
3145 // try again
eafe8130
TL
3146 alloc_len = last_alloc->allocate(round_up_to(len, alloc_size[id]),
3147 alloc_size[id], hint, &extents);
3148 if (alloc_len < 0 || alloc_len < (int64_t)len) {
11fdf7f2
TL
3149 if (alloc_len > 0) {
3150 last_alloc->release(extents);
3151 }
eafe8130 3152 derr << __func__ << " failed to allocate 0x" << std::hex << len
11fdf7f2
TL
3153 << " on bdev " << (int)id
3154 << ", free 0x" << last_alloc->get_free() << std::dec << dendl;
3155 return -ENOSPC;
3156 }
3157 } else {
3158 derr << __func__ << " failed to expand slow device to fit +0x"
eafe8130 3159 << std::hex << len << std::dec
11fdf7f2
TL
3160 << dendl;
3161 return -ENOSPC;
3162 }
3163 } else {
3164 uint64_t total_allocated =
3165 block_all[id].size() - alloc[id]->get_free();
3166 if (max_bytes[id] < total_allocated) {
3167 logger->set(max_bytes_pcounters[id], total_allocated);
3168 max_bytes[id] = total_allocated;
3169 }
7c673cae
FG
3170 }
3171
3172 for (auto& p : extents) {
94b18763 3173 node->append_extent(bluefs_extent_t(id, p.offset, p.length));
7c673cae
FG
3174 }
3175
3176 return 0;
3177}
3178
3179int BlueFS::_preallocate(FileRef f, uint64_t off, uint64_t len)
3180{
3181 dout(10) << __func__ << " file " << f->fnode << " 0x"
3182 << std::hex << off << "~" << len << std::dec << dendl;
3183 if (f->deleted) {
3184 dout(10) << __func__ << " deleted, no-op" << dendl;
3185 return 0;
3186 }
11fdf7f2 3187 ceph_assert(f->fnode.ino > 1);
7c673cae
FG
3188 uint64_t allocated = f->fnode.get_allocated();
3189 if (off + len > allocated) {
3190 uint64_t want = off + len - allocated;
9f95a23c
TL
3191 vselector->sub_usage(f->vselector_hint, f->fnode);
3192
3193 int r = _allocate(vselector->select_prefer_bdev(f->vselector_hint),
3194 want,
3195 &f->fnode);
3196 vselector->add_usage(f->vselector_hint, f->fnode);
7c673cae
FG
3197 if (r < 0)
3198 return r;
7c673cae
FG
3199 log_t.op_file_update(f->fnode);
3200 }
3201 return 0;
3202}
3203
1911f103 3204void BlueFS::sync_metadata(bool avoid_compact)
7c673cae 3205{
f6b5b4d7 3206 std::unique_lock<ceph::mutex> l(lock);
9f95a23c 3207 if (log_t.empty() && dirty_files.empty()) {
7c673cae 3208 dout(10) << __func__ << " - no pending log events" << dendl;
11fdf7f2
TL
3209 } else {
3210 dout(10) << __func__ << dendl;
3211 utime_t start = ceph_clock_now();
3212 flush_bdev(); // FIXME?
3213 _flush_and_sync_log(l);
3214 dout(10) << __func__ << " done in " << (ceph_clock_now() - start) << dendl;
7c673cae 3215 }
7c673cae 3216
f6b5b4d7
TL
3217 if (!avoid_compact) {
3218 _maybe_compact_log(l);
3219 }
3220}
3221
3222void BlueFS::_maybe_compact_log(std::unique_lock<ceph::mutex>& l)
3223{
3224 if (!cct->_conf->bluefs_replay_recovery_disable_compact &&
3225 _should_compact_log()) {
7c673cae
FG
3226 if (cct->_conf->bluefs_compact_log_sync) {
3227 _compact_log_sync();
3228 } else {
3229 _compact_log_async(l);
3230 }
3231 }
7c673cae
FG
3232}
3233
3234int BlueFS::open_for_write(
3235 const string& dirname,
3236 const string& filename,
3237 FileWriter **h,
3238 bool overwrite)
3239{
11fdf7f2 3240 std::lock_guard l(lock);
7c673cae
FG
3241 dout(10) << __func__ << " " << dirname << "/" << filename << dendl;
3242 map<string,DirRef>::iterator p = dir_map.find(dirname);
3243 DirRef dir;
3244 if (p == dir_map.end()) {
3245 // implicitly create the dir
3246 dout(20) << __func__ << " dir " << dirname
3247 << " does not exist" << dendl;
3248 return -ENOENT;
3249 } else {
3250 dir = p->second;
3251 }
3252
3253 FileRef file;
3254 bool create = false;
f6b5b4d7 3255 bool truncate = false;
7c673cae
FG
3256 map<string,FileRef>::iterator q = dir->file_map.find(filename);
3257 if (q == dir->file_map.end()) {
3258 if (overwrite) {
3259 dout(20) << __func__ << " dir " << dirname << " (" << dir
3260 << ") file " << filename
3261 << " does not exist" << dendl;
3262 return -ENOENT;
3263 }
9f95a23c 3264 file = ceph::make_ref<File>();
7c673cae
FG
3265 file->fnode.ino = ++ino_last;
3266 file_map[ino_last] = file;
3267 dir->file_map[filename] = file;
3268 ++file->refs;
3269 create = true;
3270 } else {
3271 // overwrite existing file?
3272 file = q->second;
3273 if (overwrite) {
3274 dout(20) << __func__ << " dir " << dirname << " (" << dir
3275 << ") file " << filename
3276 << " already exists, overwrite in place" << dendl;
3277 } else {
3278 dout(20) << __func__ << " dir " << dirname << " (" << dir
3279 << ") file " << filename
3280 << " already exists, truncate + overwrite" << dendl;
9f95a23c 3281 vselector->sub_usage(file->vselector_hint, file->fnode);
7c673cae
FG
3282 file->fnode.size = 0;
3283 for (auto& p : file->fnode.extents) {
3284 pending_release[p.bdev].insert(p.offset, p.length);
3285 }
f6b5b4d7 3286 truncate = true;
94b18763
FG
3287
3288 file->fnode.clear_extents();
7c673cae
FG
3289 }
3290 }
11fdf7f2 3291 ceph_assert(file->fnode.ino > 1);
7c673cae
FG
3292
3293 file->fnode.mtime = ceph_clock_now();
9f95a23c 3294 file->vselector_hint = vselector->get_hint_by_dir(dirname);
f6b5b4d7
TL
3295 if (create || truncate) {
3296 vselector->add_usage(file->vselector_hint, file->fnode); // update file count
3297 }
9f95a23c 3298
7c673cae 3299 dout(20) << __func__ << " mapping " << dirname << "/" << filename
9f95a23c
TL
3300 << " vsel_hint " << file->vselector_hint
3301 << dendl;
7c673cae
FG
3302
3303 log_t.op_file_update(file->fnode);
3304 if (create)
3305 log_t.op_dir_link(dirname, filename, file->fnode.ino);
3306
3307 *h = _create_writer(file);
3308
3309 if (boost::algorithm::ends_with(filename, ".log")) {
3310 (*h)->writer_type = BlueFS::WRITER_WAL;
3311 if (logger && !overwrite) {
3312 logger->inc(l_bluefs_files_written_wal);
3313 }
3314 } else if (boost::algorithm::ends_with(filename, ".sst")) {
3315 (*h)->writer_type = BlueFS::WRITER_SST;
3316 if (logger) {
3317 logger->inc(l_bluefs_files_written_sst);
3318 }
3319 }
3320
3321 dout(10) << __func__ << " h " << *h << " on " << file->fnode << dendl;
3322 return 0;
3323}
3324
3325BlueFS::FileWriter *BlueFS::_create_writer(FileRef f)
3326{
3327 FileWriter *w = new FileWriter(f);
3328 for (unsigned i = 0; i < MAX_BDEV; ++i) {
3329 if (bdev[i]) {
3330 w->iocv[i] = new IOContext(cct, NULL);
7c673cae
FG
3331 }
3332 }
3333 return w;
3334}
3335
3336void BlueFS::_close_writer(FileWriter *h)
3337{
3338 dout(10) << __func__ << " " << h << " type " << h->writer_type << dendl;
f91f0fd5 3339 h->buffer.reassign_to_mempool(mempool::mempool_bluefs_file_writer);
7c673cae
FG
3340 for (unsigned i=0; i<MAX_BDEV; ++i) {
3341 if (bdev[i]) {
11fdf7f2
TL
3342 if (h->iocv[i]) {
3343 h->iocv[i]->aio_wait();
3344 bdev[i]->queue_reap_ioc(h->iocv[i]);
3345 }
7c673cae
FG
3346 }
3347 }
3348 delete h;
3349}
3350
3351int BlueFS::open_for_read(
3352 const string& dirname,
3353 const string& filename,
3354 FileReader **h,
3355 bool random)
3356{
11fdf7f2 3357 std::lock_guard l(lock);
7c673cae
FG
3358 dout(10) << __func__ << " " << dirname << "/" << filename
3359 << (random ? " (random)":" (sequential)") << dendl;
3360 map<string,DirRef>::iterator p = dir_map.find(dirname);
3361 if (p == dir_map.end()) {
3362 dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
3363 return -ENOENT;
3364 }
3365 DirRef dir = p->second;
3366
3367 map<string,FileRef>::iterator q = dir->file_map.find(filename);
3368 if (q == dir->file_map.end()) {
3369 dout(20) << __func__ << " dir " << dirname << " (" << dir
3370 << ") file " << filename
3371 << " not found" << dendl;
3372 return -ENOENT;
3373 }
3374 File *file = q->second.get();
3375
3376 *h = new FileReader(file, random ? 4096 : cct->_conf->bluefs_max_prefetch,
3377 random, false);
3378 dout(10) << __func__ << " h " << *h << " on " << file->fnode << dendl;
3379 return 0;
3380}
3381
3382int BlueFS::rename(
3383 const string& old_dirname, const string& old_filename,
3384 const string& new_dirname, const string& new_filename)
3385{
11fdf7f2 3386 std::lock_guard l(lock);
7c673cae
FG
3387 dout(10) << __func__ << " " << old_dirname << "/" << old_filename
3388 << " -> " << new_dirname << "/" << new_filename << dendl;
3389 map<string,DirRef>::iterator p = dir_map.find(old_dirname);
3390 if (p == dir_map.end()) {
3391 dout(20) << __func__ << " dir " << old_dirname << " not found" << dendl;
3392 return -ENOENT;
3393 }
3394 DirRef old_dir = p->second;
3395 map<string,FileRef>::iterator q = old_dir->file_map.find(old_filename);
3396 if (q == old_dir->file_map.end()) {
3397 dout(20) << __func__ << " dir " << old_dirname << " (" << old_dir
3398 << ") file " << old_filename
3399 << " not found" << dendl;
3400 return -ENOENT;
3401 }
3402 FileRef file = q->second;
3403
3404 p = dir_map.find(new_dirname);
3405 if (p == dir_map.end()) {
3406 dout(20) << __func__ << " dir " << new_dirname << " not found" << dendl;
3407 return -ENOENT;
3408 }
3409 DirRef new_dir = p->second;
3410 q = new_dir->file_map.find(new_filename);
3411 if (q != new_dir->file_map.end()) {
3412 dout(20) << __func__ << " dir " << new_dirname << " (" << old_dir
3413 << ") file " << new_filename
3414 << " already exists, unlinking" << dendl;
11fdf7f2 3415 ceph_assert(q->second != file);
7c673cae
FG
3416 log_t.op_dir_unlink(new_dirname, new_filename);
3417 _drop_link(q->second);
3418 }
3419
3420 dout(10) << __func__ << " " << new_dirname << "/" << new_filename << " "
3421 << " " << file->fnode << dendl;
3422
3423 new_dir->file_map[new_filename] = file;
3424 old_dir->file_map.erase(old_filename);
3425
3426 log_t.op_dir_link(new_dirname, new_filename, file->fnode.ino);
3427 log_t.op_dir_unlink(old_dirname, old_filename);
3428 return 0;
3429}
3430
3431int BlueFS::mkdir(const string& dirname)
3432{
11fdf7f2 3433 std::lock_guard l(lock);
7c673cae
FG
3434 dout(10) << __func__ << " " << dirname << dendl;
3435 map<string,DirRef>::iterator p = dir_map.find(dirname);
3436 if (p != dir_map.end()) {
3437 dout(20) << __func__ << " dir " << dirname << " exists" << dendl;
3438 return -EEXIST;
3439 }
9f95a23c 3440 dir_map[dirname] = ceph::make_ref<Dir>();
7c673cae
FG
3441 log_t.op_dir_create(dirname);
3442 return 0;
3443}
3444
3445int BlueFS::rmdir(const string& dirname)
3446{
11fdf7f2 3447 std::lock_guard l(lock);
7c673cae
FG
3448 dout(10) << __func__ << " " << dirname << dendl;
3449 map<string,DirRef>::iterator p = dir_map.find(dirname);
3450 if (p == dir_map.end()) {
3451 dout(20) << __func__ << " dir " << dirname << " does not exist" << dendl;
3452 return -ENOENT;
3453 }
3454 DirRef dir = p->second;
3455 if (!dir->file_map.empty()) {
3456 dout(20) << __func__ << " dir " << dirname << " not empty" << dendl;
3457 return -ENOTEMPTY;
3458 }
3459 dir_map.erase(dirname);
3460 log_t.op_dir_remove(dirname);
3461 return 0;
3462}
3463
3464bool BlueFS::dir_exists(const string& dirname)
3465{
11fdf7f2 3466 std::lock_guard l(lock);
7c673cae
FG
3467 map<string,DirRef>::iterator p = dir_map.find(dirname);
3468 bool exists = p != dir_map.end();
3469 dout(10) << __func__ << " " << dirname << " = " << (int)exists << dendl;
3470 return exists;
3471}
3472
3473int BlueFS::stat(const string& dirname, const string& filename,
3474 uint64_t *size, utime_t *mtime)
3475{
11fdf7f2 3476 std::lock_guard l(lock);
7c673cae
FG
3477 dout(10) << __func__ << " " << dirname << "/" << filename << dendl;
3478 map<string,DirRef>::iterator p = dir_map.find(dirname);
3479 if (p == dir_map.end()) {
3480 dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
3481 return -ENOENT;
3482 }
3483 DirRef dir = p->second;
3484 map<string,FileRef>::iterator q = dir->file_map.find(filename);
3485 if (q == dir->file_map.end()) {
3486 dout(20) << __func__ << " dir " << dirname << " (" << dir
3487 << ") file " << filename
3488 << " not found" << dendl;
3489 return -ENOENT;
3490 }
3491 File *file = q->second.get();
3492 dout(10) << __func__ << " " << dirname << "/" << filename
3493 << " " << file->fnode << dendl;
3494 if (size)
3495 *size = file->fnode.size;
3496 if (mtime)
3497 *mtime = file->fnode.mtime;
3498 return 0;
3499}
3500
3501int BlueFS::lock_file(const string& dirname, const string& filename,
3502 FileLock **plock)
3503{
11fdf7f2 3504 std::lock_guard l(lock);
7c673cae
FG
3505 dout(10) << __func__ << " " << dirname << "/" << filename << dendl;
3506 map<string,DirRef>::iterator p = dir_map.find(dirname);
3507 if (p == dir_map.end()) {
3508 dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
3509 return -ENOENT;
3510 }
3511 DirRef dir = p->second;
3512 map<string,FileRef>::iterator q = dir->file_map.find(filename);
9f95a23c 3513 FileRef file;
7c673cae
FG
3514 if (q == dir->file_map.end()) {
3515 dout(20) << __func__ << " dir " << dirname << " (" << dir
3516 << ") file " << filename
3517 << " not found, creating" << dendl;
9f95a23c 3518 file = ceph::make_ref<File>();
7c673cae
FG
3519 file->fnode.ino = ++ino_last;
3520 file->fnode.mtime = ceph_clock_now();
3521 file_map[ino_last] = file;
3522 dir->file_map[filename] = file;
3523 ++file->refs;
3524 log_t.op_file_update(file->fnode);
3525 log_t.op_dir_link(dirname, filename, file->fnode.ino);
3526 } else {
9f95a23c 3527 file = q->second;
7c673cae
FG
3528 if (file->locked) {
3529 dout(10) << __func__ << " already locked" << dendl;
11fdf7f2 3530 return -ENOLCK;
7c673cae
FG
3531 }
3532 }
3533 file->locked = true;
3534 *plock = new FileLock(file);
3535 dout(10) << __func__ << " locked " << file->fnode
3536 << " with " << *plock << dendl;
3537 return 0;
3538}
3539
3540int BlueFS::unlock_file(FileLock *fl)
3541{
11fdf7f2 3542 std::lock_guard l(lock);
7c673cae 3543 dout(10) << __func__ << " " << fl << " on " << fl->file->fnode << dendl;
11fdf7f2 3544 ceph_assert(fl->file->locked);
7c673cae
FG
3545 fl->file->locked = false;
3546 delete fl;
3547 return 0;
3548}
3549
3550int BlueFS::readdir(const string& dirname, vector<string> *ls)
3551{
11fdf7f2 3552 std::lock_guard l(lock);
7c673cae
FG
3553 dout(10) << __func__ << " " << dirname << dendl;
3554 if (dirname.empty()) {
3555 // list dirs
3556 ls->reserve(dir_map.size() + 2);
3557 for (auto& q : dir_map) {
3558 ls->push_back(q.first);
3559 }
3560 } else {
3561 // list files in dir
3562 map<string,DirRef>::iterator p = dir_map.find(dirname);
3563 if (p == dir_map.end()) {
3564 dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
3565 return -ENOENT;
3566 }
3567 DirRef dir = p->second;
3568 ls->reserve(dir->file_map.size() + 2);
3569 for (auto& q : dir->file_map) {
3570 ls->push_back(q.first);
3571 }
3572 }
3573 ls->push_back(".");
3574 ls->push_back("..");
3575 return 0;
3576}
3577
3578int BlueFS::unlink(const string& dirname, const string& filename)
3579{
11fdf7f2 3580 std::lock_guard l(lock);
7c673cae
FG
3581 dout(10) << __func__ << " " << dirname << "/" << filename << dendl;
3582 map<string,DirRef>::iterator p = dir_map.find(dirname);
3583 if (p == dir_map.end()) {
3584 dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
3585 return -ENOENT;
3586 }
3587 DirRef dir = p->second;
3588 map<string,FileRef>::iterator q = dir->file_map.find(filename);
3589 if (q == dir->file_map.end()) {
3590 dout(20) << __func__ << " file " << dirname << "/" << filename
3591 << " not found" << dendl;
3592 return -ENOENT;
3593 }
3594 FileRef file = q->second;
3595 if (file->locked) {
3596 dout(20) << __func__ << " file " << dirname << "/" << filename
3597 << " is locked" << dendl;
3598 return -EBUSY;
3599 }
3600 dir->file_map.erase(filename);
3601 log_t.op_dir_unlink(dirname, filename);
3602 _drop_link(file);
3603 return 0;
3604}
d2e6a577
FG
3605
3606bool BlueFS::wal_is_rotational()
3607{
94b18763
FG
3608 if (bdev[BDEV_WAL]) {
3609 return bdev[BDEV_WAL]->is_rotational();
3610 } else if (bdev[BDEV_DB]) {
3611 return bdev[BDEV_DB]->is_rotational();
3612 }
3613 return bdev[BDEV_SLOW]->is_rotational();
d2e6a577 3614}
9f95a23c 3615
f6b5b4d7
TL
3616/*
3617 Algorithm.
3618 do_replay_recovery_read is used when the bluefs log ends abruptly, but it seems that more data should be there.
3619 The idea is to search the disk for the definition of extents that will be added to the bluefs log in future,
3620 and to check whether using them produces a healthy bluefs transaction.
3621 We encode the already known bluefs log extents and search the disk for these bytes.
3622 When we find them, we decode the following bytes as an extent.
3623 We read that whole extent and then check whether, merged with the existing log part, it gives a proper bluefs transaction.
3624 */
3625int BlueFS::do_replay_recovery_read(FileReader *log_reader,
3626 size_t replay_pos,
3627 size_t read_offset,
3628 size_t read_len,
3629 bufferlist* bl) {
3630 dout(1) << __func__ << " replay_pos=0x" << std::hex << replay_pos <<
3631 " needs 0x" << read_offset << "~" << read_len << std::dec << dendl;
3632
3633 bluefs_fnode_t& log_fnode = log_reader->file->fnode;
3634 bufferlist bin_extents;
3635 ceph::encode(log_fnode.extents, bin_extents);
3636 dout(2) << __func__ << " log file encoded extents length = " << bin_extents.length() << dendl;
3637
3638 // cannot process if too small to effectively search
3639 ceph_assert(bin_extents.length() >= 32);
3640 bufferlist last_32;
3641 last_32.substr_of(bin_extents, bin_extents.length() - 32, 32);
3642
3643 //read fixed part from replay_pos to end of bluefs_log extents
3644 bufferlist fixed;
3645 uint64_t e_off = 0;
3646 auto e = log_fnode.seek(replay_pos, &e_off);
3647 ceph_assert(e != log_fnode.extents.end());
3648 int r = bdev[e->bdev]->read(e->offset + e_off, e->length - e_off, &fixed, ioc[e->bdev],
3649 cct->_conf->bluefs_buffered_io);
3650 ceph_assert(r == 0);
3651 //capture dev of last good extent
3652 uint8_t last_e_dev = e->bdev;
3653 uint64_t last_e_off = e->offset;
3654 ++e;
3655 while (e != log_fnode.extents.end()) {
3656 r = bdev[e->bdev]->read(e->offset, e->length, &fixed, ioc[e->bdev],
3657 cct->_conf->bluefs_buffered_io);
3658 ceph_assert(r == 0);
3659 last_e_dev = e->bdev;
3660 ++e;
3661 }
3662 ceph_assert(replay_pos + fixed.length() == read_offset);
3663
3664 dout(2) << __func__ << " valid data in log = " << fixed.length() << dendl;
3665
3666 struct compare {
3667 bool operator()(const bluefs_extent_t& a, const bluefs_extent_t& b) const {
3668 if (a.bdev < b.bdev) return true;
3669 if (a.offset < b.offset) return true;
3670 return a.length < b.length;
3671 }
3672 };
3673 std::set<bluefs_extent_t, compare> extents_rejected;
3674 for (int dcnt = 0; dcnt < 3; dcnt++) {
3675 uint8_t dev = (last_e_dev + dcnt) % MAX_BDEV;
3676 if (bdev[dev] == nullptr) continue;
3677 dout(2) << __func__ << " processing " << get_device_name(dev) << dendl;
3678 interval_set<uint64_t> disk_regions;
3679 disk_regions.insert(0, bdev[dev]->get_size());
3680 for (auto f : file_map) {
3681 auto& e = f.second->fnode.extents;
3682 for (auto& p : e) {
3683 if (p.bdev == dev) {
3684 disk_regions.erase(p.offset, p.length);
3685 }
3686 }
3687 }
3688 size_t disk_regions_count = disk_regions.num_intervals();
3689 dout(5) << __func__ << " " << disk_regions_count << " regions to scan on " << get_device_name(dev) << dendl;
3690
3691 auto reg = disk_regions.lower_bound(last_e_off);
3692 //for all except first, start from beginning
3693 last_e_off = 0;
3694 if (reg == disk_regions.end()) {
3695 reg = disk_regions.begin();
3696 }
3697 const uint64_t chunk_size = 4 * 1024 * 1024;
3698 const uint64_t page_size = 4096;
3699 const uint64_t max_extent_size = 16;
3700 uint64_t overlay_size = last_32.length() + max_extent_size;
3701 for (size_t i = 0; i < disk_regions_count; reg++, i++) {
3702 if (reg == disk_regions.end()) {
3703 reg = disk_regions.begin();
3704 }
3705 uint64_t pos = reg.get_start();
3706 uint64_t len = reg.get_len();
3707
3708 std::unique_ptr<char[]> raw_data_p{new char[page_size + chunk_size]};
3709 char* raw_data = raw_data_p.get();
3710 memset(raw_data, 0, page_size);
3711
3712 while (len > last_32.length()) {
3713 uint64_t chunk_len = len > chunk_size ? chunk_size : len;
3714 dout(5) << __func__ << " read "
3715 << get_device_name(dev) << ":0x" << std::hex << pos << "+" << chunk_len << std::dec << dendl;
3716 r = bdev[dev]->read_random(pos, chunk_len, raw_data + page_size, cct->_conf->bluefs_buffered_io);
3717 ceph_assert(r == 0);
3718
3719 //search for fixed_last_32
3720 char* chunk_b = raw_data + page_size;
3721 char* chunk_e = chunk_b + chunk_len;
3722
3723 char* search_b = chunk_b - overlay_size;
3724 char* search_e = chunk_e;
3725
3726 for (char* sp = search_b; ; sp += last_32.length()) {
3727 sp = (char*)memmem(sp, search_e - sp, last_32.c_str(), last_32.length());
3728 if (sp == nullptr) {
3729 break;
3730 }
3731
3732 char* n = sp + last_32.length();
3733 dout(5) << __func__ << " checking location 0x" << std::hex << pos + (n - chunk_b) << std::dec << dendl;
3734 bufferlist test;
3735 test.append(n, std::min<size_t>(max_extent_size, chunk_e - n));
3736 bluefs_extent_t ne;
3737 try {
3738 bufferlist::const_iterator p = test.begin();
3739 ceph::decode(ne, p);
3740 } catch (buffer::error& e) {
3741 continue;
3742 }
3743 if (extents_rejected.count(ne) != 0) {
3744 dout(5) << __func__ << " extent " << ne << " already refected" <<dendl;
3745 continue;
3746 }
3747 //insert as rejected already. if we succeed, it wouldn't make difference.
3748 extents_rejected.insert(ne);
3749
3750 if (ne.bdev >= MAX_BDEV ||
3751 bdev[ne.bdev] == nullptr ||
3752 ne.length > 16 * 1024 * 1024 ||
3753 (ne.length & 4095) != 0 ||
3754 ne.offset + ne.length > bdev[ne.bdev]->get_size() ||
3755 (ne.offset & 4095) != 0) {
3756 dout(5) << __func__ << " refusing extent " << ne << dendl;
3757 continue;
3758 }
3759 dout(5) << __func__ << " checking extent " << ne << dendl;
3760
3761 //read candidate extent - whole
3762 bufferlist candidate;
3763 candidate.append(fixed);
3764 r = bdev[ne.bdev]->read(ne.offset, ne.length, &candidate, ioc[ne.bdev],
3765 cct->_conf->bluefs_buffered_io);
3766 ceph_assert(r == 0);
3767
3768 //check if transaction & crc is ok
3769 bluefs_transaction_t t;
3770 try {
3771 bufferlist::const_iterator p = candidate.cbegin();
3772 decode(t, p);
3773 }
3774 catch (buffer::error& e) {
3775 dout(5) << __func__ << " failed match" << dendl;
3776 continue;
3777 }
3778
3779 //success, it seems a probable candidate
3780 uint64_t l = std::min<uint64_t>(ne.length, read_len);
3781 //trim to required size
3782 bufferlist requested_read;
3783 requested_read.substr_of(candidate, fixed.length(), l);
3784 bl->append(requested_read);
3785 dout(5) << __func__ << " successful extension of log " << l << "/" << read_len << dendl;
3786 log_fnode.append_extent(ne);
3787 log_fnode.recalc_allocated();
3788 log_reader->buf.pos += l;
3789 return l;
3790 }
3791 //save overlay for next search
3792 memcpy(search_b, chunk_e - overlay_size, overlay_size);
3793 pos += chunk_len;
3794 len -= chunk_len;
3795 }
3796 }
3797 }
3798 return 0;
3799}
3800
9f95a23c
TL
3801void BlueFS::debug_inject_duplicate_gift(unsigned id,
3802 uint64_t offset,
3803 uint64_t len)
3804{
3805 dout(0) << __func__ << dendl;
3806 if (id < alloc.size() && alloc[id]) {
3807 alloc[id]->init_add_free(offset, len);
3808 }
3809}
3810
3811// ===============================================
3812// OriginalVolumeSelector
3813
f6b5b4d7
TL
3814void* OriginalVolumeSelector::get_hint_for_log() const {
3815 return reinterpret_cast<void*>(BlueFS::BDEV_WAL);
9f95a23c
TL
3816}
3817void* OriginalVolumeSelector::get_hint_by_dir(const string& dirname) const {
3818 uint8_t res = BlueFS::BDEV_DB;
3819 if (dirname.length() > 5) {
3820 // the "db.slow" and "db.wal" directory names are hard-coded at
3821 // match up with bluestore. the slow device is always the second
3822 // one (when a dedicated block.db device is present and used at
3823 // bdev 0). the wal device is always last.
3824 if (boost::algorithm::ends_with(dirname, ".slow")) {
3825 res = BlueFS::BDEV_SLOW;
3826 }
3827 else if (boost::algorithm::ends_with(dirname, ".wal")) {
3828 res = BlueFS::BDEV_WAL;
3829 }
3830 }
3831 return reinterpret_cast<void*>(res);
3832}
3833
3834uint8_t OriginalVolumeSelector::select_prefer_bdev(void* hint)
3835{
3836 return (uint8_t)(reinterpret_cast<uint64_t>(hint));
3837}
3838
3839void OriginalVolumeSelector::get_paths(const std::string& base, paths& res) const
3840{
3841 res.emplace_back(base, db_total);
3842 res.emplace_back(base + ".slow", slow_total);
3843}
3844
3845#undef dout_prefix
3846#define dout_prefix *_dout << "OriginalVolumeSelector: "
3847
3848void OriginalVolumeSelector::dump(ostream& sout) {
3849 sout<< "wal_total:" << wal_total
3850 << ", db_total:" << db_total
3851 << ", slow_total:" << slow_total
3852 << std::endl;
3853}