]> git.proxmox.com Git - ceph.git/blob - ceph/src/os/bluestore/BlueFS.cc
bump version to 15.2.1-pve1
[ceph.git] / ceph / src / os / bluestore / BlueFS.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3
4 #include "boost/algorithm/string.hpp"
5 #include "bluestore_common.h"
6 #include "BlueFS.h"
7
8 #include "common/debug.h"
9 #include "common/errno.h"
10 #include "common/perf_counters.h"
11 #include "BlockDevice.h"
12 #include "Allocator.h"
13 #include "include/ceph_assert.h"
14 #include "common/admin_socket.h"
15
16 #define dout_context cct
17 #define dout_subsys ceph_subsys_bluefs
18 #undef dout_prefix
19 #define dout_prefix *_dout << "bluefs "
20 using TOPNSPC::common::cmd_getval;
21 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::File, bluefs_file, bluefs);
22 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::Dir, bluefs_dir, bluefs);
23 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileWriter, bluefs_file_writer, bluefs);
24 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileReaderBuffer,
25 bluefs_file_reader_buffer, bluefs);
26 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileReader, bluefs_file_reader, bluefs);
27 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileLock, bluefs_file_lock, bluefs);
28
29 static void wal_discard_cb(void *priv, void* priv2) {
30 BlueFS *bluefs = static_cast<BlueFS*>(priv);
31 interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2);
32 bluefs->handle_discard(BlueFS::BDEV_WAL, *tmp);
33 }
34
35 static void db_discard_cb(void *priv, void* priv2) {
36 BlueFS *bluefs = static_cast<BlueFS*>(priv);
37 interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2);
38 bluefs->handle_discard(BlueFS::BDEV_DB, *tmp);
39 }
40
41 static void slow_discard_cb(void *priv, void* priv2) {
42 BlueFS *bluefs = static_cast<BlueFS*>(priv);
43 interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2);
44 bluefs->handle_discard(BlueFS::BDEV_SLOW, *tmp);
45 }
46
47 class BlueFS::SocketHook : public AdminSocketHook {
48 BlueFS* bluefs;
49 public:
50 static BlueFS::SocketHook* create(BlueFS* bluefs)
51 {
52 BlueFS::SocketHook* hook = nullptr;
53 AdminSocket* admin_socket = bluefs->cct->get_admin_socket();
54 if (admin_socket) {
55 hook = new BlueFS::SocketHook(bluefs);
56 int r = admin_socket->register_command("bluestore bluefs available "
57 "name=alloc_size,type=CephInt,req=false",
58 hook,
59 "Report available space for bluefs. "
60 "If alloc_size set, make simulation.");
61 if (r != 0) {
62 ldout(bluefs->cct, 1) << __func__ << " cannot register SocketHook" << dendl;
63 delete hook;
64 hook = nullptr;
65 } else {
66 r = admin_socket->register_command("bluestore bluefs stats",
67 hook,
68 "Dump internal statistics for bluefs."
69 "");
70 ceph_assert(r == 0);
71 }
72 }
73 return hook;
74 }
75
76 ~SocketHook() {
77 AdminSocket* admin_socket = bluefs->cct->get_admin_socket();
78 admin_socket->unregister_commands(this);
79 }
80 private:
81 SocketHook(BlueFS* bluefs) :
82 bluefs(bluefs) {}
83 int call(std::string_view command, const cmdmap_t& cmdmap,
84 Formatter *f,
85 std::ostream& errss,
86 bufferlist& out) override {
87 if (command == "bluestore bluefs available") {
88 int64_t alloc_size = 0;
89 cmd_getval(cmdmap, "alloc_size", alloc_size);
90 if ((alloc_size & (alloc_size - 1)) != 0) {
91 errss << "Invalid allocation size:'" << alloc_size << std::endl;
92 return -EINVAL;
93 }
94 if (alloc_size == 0)
95 alloc_size = bluefs->cct->_conf->bluefs_alloc_size;
96 f->open_object_section("bluefs_available_space");
97 for (unsigned dev = BDEV_WAL; dev <= BDEV_SLOW; dev++) {
98 if (bluefs->bdev[dev]) {
99 f->open_object_section("dev");
100 f->dump_string("device", bluefs->get_device_name(dev));
101 ceph_assert(bluefs->alloc[dev]);
102 f->dump_int("free", bluefs->alloc[dev]->get_free());
103 f->close_section();
104 }
105 }
106 size_t extra_space = 0;
107 if (bluefs->slow_dev_expander) {
108 extra_space = bluefs->slow_dev_expander->available_freespace(alloc_size);
109 }
110 f->dump_int("available_from_bluestore", extra_space);
111 f->close_section();
112 } else if (command == "bluefs stats") {
113 std::stringstream ss;
114 bluefs->dump_block_extents(ss);
115 bluefs->dump_volume_selector(ss);
116 out.append(ss);
117 } else {
118 errss << "Invalid command" << std::endl;
119 return -ENOSYS;
120 }
121 return 0;
122 }
123 };
124
// Construct a BlueFS bound to the given Ceph context.  The per-device
// vectors are sized for every possible slot up front (MAX_BDEV); slots
// stay null until add_block_device() fills them in.
BlueFS::BlueFS(CephContext* cct)
  : cct(cct),
    bdev(MAX_BDEV),
    ioc(MAX_BDEV),
    block_all(MAX_BDEV)
{
  // Wire discard completions for the three primary devices back into
  // handle_discard() via the static trampolines above.
  discard_cb[BDEV_WAL] = wal_discard_cb;
  discard_cb[BDEV_DB] = db_discard_cb;
  discard_cb[BDEV_SLOW] = slow_discard_cb;
  // May be nullptr when no admin socket is available (see SocketHook::create).
  asok_hook = SocketHook::create(this);
}
136
// Tear down in dependency order: unregister the admin hook first, then
// drain all in-flight aio before closing/deleting the devices, and only
// then free the IOContexts (the devices may still reference them while
// aio is pending).
BlueFS::~BlueFS()
{
  delete asok_hook;
  for (auto p : ioc) {
    if (p)
      p->aio_wait();
  }
  for (auto p : bdev) {
    if (p) {
      p->close();
      delete p;
    }
  }
  for (auto p : ioc) {
    delete p;
  }
}
154
// Build and register the "bluefs" perf-counter set.  Counters are declared
// in the l_bluefs_first..l_bluefs_last range; order here must match the
// enum declaration in the header.
void BlueFS::_init_logger()
{
  PerfCountersBuilder b(cct, "bluefs",
                        l_bluefs_first, l_bluefs_last);
  // space exchanged with bluestore
  b.add_u64_counter(l_bluefs_gift_bytes, "gift_bytes",
		    "Bytes gifted from BlueStore", NULL, 0, unit_t(UNIT_BYTES));
  b.add_u64_counter(l_bluefs_reclaim_bytes, "reclaim_bytes",
		    "Bytes reclaimed by BlueStore", NULL, 0, unit_t(UNIT_BYTES));
  // per-device capacity/usage gauges
  b.add_u64(l_bluefs_db_total_bytes, "db_total_bytes",
	    "Total bytes (main db device)",
	    "b", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
  b.add_u64(l_bluefs_db_used_bytes, "db_used_bytes",
	    "Used bytes (main db device)",
	    "u", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
  b.add_u64(l_bluefs_wal_total_bytes, "wal_total_bytes",
	    "Total bytes (wal device)",
	    "walb", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
  b.add_u64(l_bluefs_wal_used_bytes, "wal_used_bytes",
	    "Used bytes (wal device)",
	    "walu", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
  b.add_u64(l_bluefs_slow_total_bytes, "slow_total_bytes",
	    "Total bytes (slow device)",
	    "slob", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
  b.add_u64(l_bluefs_slow_used_bytes, "slow_used_bytes",
	    "Used bytes (slow device)",
	    "slou", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
  b.add_u64(l_bluefs_num_files, "num_files", "File count",
	    "f", PerfCountersBuilder::PRIO_USEFUL);
  // metadata log activity
  b.add_u64(l_bluefs_log_bytes, "log_bytes", "Size of the metadata log",
	    "jlen", PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));
  b.add_u64_counter(l_bluefs_log_compactions, "log_compactions",
		    "Compactions of the metadata log");
  b.add_u64_counter(l_bluefs_logged_bytes, "logged_bytes",
		    "Bytes written to the metadata log", "j",
		    PerfCountersBuilder::PRIO_CRITICAL, unit_t(UNIT_BYTES));
  // write traffic by file type
  b.add_u64_counter(l_bluefs_files_written_wal, "files_written_wal",
		    "Files written to WAL");
  b.add_u64_counter(l_bluefs_files_written_sst, "files_written_sst",
		    "Files written to SSTs");
  // NOTE(review): unlike its sst sibling this counter is registered without
  // unit_t(UNIT_BYTES) — looks like an omission; confirm before changing,
  // since it alters the counter's reported unit metadata.
  b.add_u64_counter(l_bluefs_bytes_written_wal, "bytes_written_wal",
		    "Bytes written to WAL", "wal",
		    PerfCountersBuilder::PRIO_CRITICAL);
  b.add_u64_counter(l_bluefs_bytes_written_sst, "bytes_written_sst",
		    "Bytes written to SSTs", "sst",
		    PerfCountersBuilder::PRIO_CRITICAL, unit_t(UNIT_BYTES));
  b.add_u64_counter(l_bluefs_bytes_written_slow, "bytes_written_slow",
		    "Bytes written to WAL/SSTs at slow device", NULL,
		    PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
  b.add_u64_counter(l_bluefs_max_bytes_wal, "max_bytes_wal",
		    "Maximum bytes allocated from WAL");
  b.add_u64_counter(l_bluefs_max_bytes_db, "max_bytes_db",
		    "Maximum bytes allocated from DB");
  b.add_u64_counter(l_bluefs_max_bytes_slow, "max_bytes_slow",
		    "Maximum bytes allocated from SLOW");

  // random-read statistics (requests served from disk vs prefetch buffer)
  b.add_u64_counter(l_bluefs_read_random_count, "read_random_count",
		    "random read requests processed");
  b.add_u64_counter(l_bluefs_read_random_bytes, "read_random_bytes",
		    "Bytes requested in random read mode", NULL,
		    PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
  b.add_u64_counter(l_bluefs_read_random_disk_count, "read_random_disk_count",
		    "random reads requests going to disk");
  b.add_u64_counter(l_bluefs_read_random_disk_bytes, "read_random_disk_bytes",
		    "Bytes read from disk in random read mode", NULL,
		    PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
  b.add_u64_counter(l_bluefs_read_random_buffer_count, "read_random_buffer_count",
		    "random read requests processed using prefetch buffer");
  b.add_u64_counter(l_bluefs_read_random_buffer_bytes, "read_random_buffer_bytes",
		    "Bytes read from prefetch buffer in random read mode", NULL,
		    PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));

  // buffered-read statistics
  b.add_u64_counter(l_bluefs_read_count, "read_count",
		    "buffered read requests processed");
  b.add_u64_counter(l_bluefs_read_bytes, "read_bytes",
		    "Bytes requested in buffered read mode", NULL,
		    PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));

  // prefetch-read statistics
  b.add_u64_counter(l_bluefs_read_prefetch_count, "read_prefetch_count",
		    "prefetch read requests processed");
  b.add_u64_counter(l_bluefs_read_prefetch_bytes, "read_prefetch_bytes",
		    "Bytes requested in prefetch read mode", NULL,
		    PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));

  logger = b.create_perf_counters();
  cct->get_perfcounters_collection()->add(logger);
}
241
242 void BlueFS::_shutdown_logger()
243 {
244 cct->get_perfcounters_collection()->remove(logger);
245 delete logger;
246 }
247
248 void BlueFS::_update_logger_stats()
249 {
250 // we must be holding the lock
251 logger->set(l_bluefs_num_files, file_map.size());
252 logger->set(l_bluefs_log_bytes, log_writer->file->fnode.size);
253
254 if (alloc[BDEV_WAL]) {
255 logger->set(l_bluefs_wal_total_bytes, block_all[BDEV_WAL].size());
256 logger->set(l_bluefs_wal_used_bytes,
257 block_all[BDEV_WAL].size() - alloc[BDEV_WAL]->get_free());
258 }
259 if (alloc[BDEV_DB]) {
260 logger->set(l_bluefs_db_total_bytes, block_all[BDEV_DB].size());
261 logger->set(l_bluefs_db_used_bytes,
262 block_all[BDEV_DB].size() - alloc[BDEV_DB]->get_free());
263 }
264 if (alloc[BDEV_SLOW]) {
265 logger->set(l_bluefs_slow_total_bytes, block_all[BDEV_SLOW].size());
266 logger->set(l_bluefs_slow_used_bytes,
267 block_all[BDEV_SLOW].size() - alloc[BDEV_SLOW]->get_free());
268 }
269 }
270
// Attach the block device at `path` to slot `id`.
//  trim                 — discard the whole device after opening
//  shared_with_bluestore — skip the exclusive advisory lock (device is
//                          also opened by bluestore)
// Returns 0 or a negative errno from open(); on failure nothing is
// recorded and the device object is destroyed.
int BlueFS::add_block_device(unsigned id, const string& path, bool trim,
                             bool shared_with_bluestore)
{
  dout(10) << __func__ << " bdev " << id << " path " << path << dendl;
  ceph_assert(id < bdev.size());
  ceph_assert(bdev[id] == NULL);  // a slot may only be filled once
  BlockDevice *b = BlockDevice::create(cct, path, NULL, NULL,
				       discard_cb[id], static_cast<void*>(this));
  if (shared_with_bluestore) {
    b->set_no_exclusive_lock();
  }
  int r = b->open(path);
  if (r < 0) {
    delete b;
    return r;
  }
  if (trim) {
    b->discard(0, b->get_size());
  }

  dout(1) << __func__ << " bdev " << id << " path " << path
	  << " size " << byte_u_t(b->get_size()) << dendl;
  // Only publish the device (and its IOContext) once it opened successfully.
  bdev[id] = b;
  ioc[id] = new IOContext(cct, NULL);
  return 0;
}
297
298 bool BlueFS::bdev_support_label(unsigned id)
299 {
300 ceph_assert(id < bdev.size());
301 ceph_assert(bdev[id]);
302 return bdev[id]->supported_bdev_label();
303 }
304
305 uint64_t BlueFS::get_block_device_size(unsigned id)
306 {
307 if (id < bdev.size() && bdev[id])
308 return bdev[id]->get_size();
309 return 0;
310 }
311
// Record that bluefs now owns [offset, offset+length) on device `id`.
// The extent is added to block_all, and — if an allocator already exists
// for the device — journaled (op_alloc_add) and made available for
// allocation.  During mkfs/mount the allocator may not exist yet; the
// journal entry is then emitted later from the recorded block_all.
void BlueFS::_add_block_extent(unsigned id, uint64_t offset, uint64_t length)
{
  dout(1) << __func__ << " bdev " << id
	  << " 0x" << std::hex << offset << "~" << length << std::dec
	  << dendl;

  ceph_assert(id < bdev.size());
  ceph_assert(bdev[id]);
  ceph_assert(bdev[id]->get_size() >= offset + length);  // extent must fit on the device
  block_all[id].insert(offset, length);

  if (id < alloc.size() && alloc[id]) {
    log_t.op_alloc_add(id, offset, length);
    alloc[id]->init_add_free(offset, length);
  }

  // logger may be null before _init_logger() has run
  if (logger)
    logger->inc(l_bluefs_gift_bytes, length);
  dout(10) << __func__ << " done" << dendl;
}
332
// Give back up to `want` bytes of device `id` to bluestore.  Extents that
// were previously set aside as "too granular" are returned first; any
// remainder is taken from the allocator.  Returned extents are removed
// from block_all and journaled as op_alloc_rm, then the log is synced so
// ownership transfer is durable before the caller reuses the space.
int BlueFS::reclaim_blocks(unsigned id, uint64_t want,
			   PExtentVector *extents)
{
  std::unique_lock l(lock);
  dout(1) << __func__ << " bdev " << id
          << " want 0x" << std::hex << want << std::dec << dendl;
  ceph_assert(id < alloc.size());
  ceph_assert(alloc[id]);
  int64_t got = 0;

  // `granular` remembers what we took from the too-granular pool so it can
  // be put back if the allocator later fails.
  interval_set<uint64_t> granular;
  while (want > 0 && !block_unused_too_granular[id].empty()) {
    auto p = block_unused_too_granular[id].begin();
    dout(20) << __func__ << " unused " << (int)id << ":"
	     << std::hex << p.get_start() << "~" << p.get_len() << dendl;
    extents->push_back({p.get_start(), p.get_len()});
    granular.insert(p.get_start(), p.get_len());
    if (want >= p.get_len()) {
      want -= p.get_len();
    } else {
      want = 0;
    }
    got += p.get_len();
    block_unused_too_granular[id].erase(p);
  }

  if (want > 0) {
    got += alloc[id]->allocate(want, alloc_size[id], 0, extents);
    // NOTE(review): this assert fires before the got < 0 branch below can
    // run when the allocator returns a negative error and nothing came from
    // the granular pool — confirm whether the error path is reachable.
    ceph_assert(got != 0);
    if (got < 0) {
      derr << __func__ << " failed to allocate space to return to bluestore"
	   << dendl;
      alloc[id]->dump();
      // restore the too-granular extents we optimistically handed out
      block_unused_too_granular[id].insert(granular);
      return got;
    }

    // bluefs no longer owns these ranges; journal the removal
    for (auto& p : *extents) {
      block_all[id].erase(p.offset, p.length);
      log_t.op_alloc_rm(id, p.offset, p.length);
    }

    flush_bdev();
    int r = _flush_and_sync_log(l);
    ceph_assert(r == 0);
  }

  logger->inc(l_bluefs_reclaim_bytes, got);
  // NOTE: `want` has been decremented above, so this logs the unmet
  // remainder (usually 0), not the original request.
  dout(1) << __func__ << " bdev " << id << " want 0x" << std::hex << want
	  << " got " << *extents << dendl;
  return 0;
}
385
386 void BlueFS::handle_discard(unsigned id, interval_set<uint64_t>& to_release)
387 {
388 dout(10) << __func__ << " bdev " << id << dendl;
389 ceph_assert(alloc[id]);
390 alloc[id]->release(to_release);
391 }
392
393 uint64_t BlueFS::get_used()
394 {
395 std::lock_guard l(lock);
396 uint64_t used = 0;
397 for (unsigned id = 0; id < MAX_BDEV; ++id) {
398 if (alloc[id]) {
399 used += block_all[id].size() - alloc[id]->get_free();
400 }
401 }
402 return used;
403 }
404
405 uint64_t BlueFS::get_total(unsigned id)
406 {
407 std::lock_guard l(lock);
408 ceph_assert(id < block_all.size());
409 return block_all[id].size();
410 }
411
412 uint64_t BlueFS::get_free(unsigned id)
413 {
414 std::lock_guard l(lock);
415 ceph_assert(id < alloc.size());
416 return alloc[id]->get_free();
417 }
418
419 void BlueFS::dump_perf_counters(Formatter *f)
420 {
421 f->open_object_section("bluefs_perf_counters");
422 logger->dump_formatted(f,0);
423 f->close_section();
424 }
425
426 void BlueFS::dump_block_extents(ostream& out)
427 {
428 for (unsigned i = 0; i < MAX_BDEV; ++i) {
429 if (!bdev[i]) {
430 continue;
431 }
432 auto owned = get_total(i);
433 auto free = get_free(i);
434 out << i << " : device size 0x" << std::hex << bdev[i]->get_size()
435 << " : own 0x" << block_all[i]
436 << " = 0x" << owned
437 << " : using 0x" << owned - free
438 << std::dec << "(" << byte_u_t(owned - free) << ")"
439 << "\n";
440 }
441 }
442
443 void BlueFS::get_usage(vector<pair<uint64_t,uint64_t>> *usage)
444 {
445 std::lock_guard l(lock);
446 usage->resize(bdev.size());
447 for (unsigned id = 0; id < bdev.size(); ++id) {
448 if (!bdev[id]) {
449 (*usage)[id] = make_pair(0, 0);
450 continue;
451 }
452 (*usage)[id].first = alloc[id]->get_free();
453 (*usage)[id].second = block_all[id].size();
454 uint64_t used =
455 (block_all[id].size() - (*usage)[id].first) * 100 / block_all[id].size();
456 dout(10) << __func__ << " bdev " << id
457 << " free " << (*usage)[id].first
458 << " (" << byte_u_t((*usage)[id].first) << ")"
459 << " / " << (*usage)[id].second
460 << " (" << byte_u_t((*usage)[id].second) << ")"
461 << ", used " << used << "%"
462 << dendl;
463 }
464 }
465
466 int BlueFS::get_block_extents(unsigned id, interval_set<uint64_t> *extents)
467 {
468 std::lock_guard l(lock);
469 dout(10) << __func__ << " bdev " << id << dendl;
470 if (id >= block_all.size())
471 return -EINVAL;
472 *extents = block_all[id];
473 return 0;
474 }
475
// Initialize a fresh bluefs on the configured devices: set up allocators
// and a superblock, create the metadata log (ino 1), journal the initial
// block ownership, write the super to the DB device, then tear all of the
// transient state back down.  The sequence is order-critical; callers
// re-mount afterwards.
int BlueFS::mkfs(uuid_d osd_uuid, const bluefs_layout_t& layout)
{
  std::unique_lock l(lock);
  dout(1) << __func__
	  << " osd_uuid " << osd_uuid
	  << dendl;

  // set volume selector if not provided before/outside
  if (vselector == nullptr) {
    vselector.reset(
        new OriginalVolumeSelector(
            get_block_device_size(BlueFS::BDEV_WAL) * 95 / 100,
            get_block_device_size(BlueFS::BDEV_DB) * 95 / 100,
            get_block_device_size(BlueFS::BDEV_SLOW) * 95 / 100));
  }

  _init_alloc();
  _init_logger();

  super.version = 1;
  super.block_size = bdev[BDEV_DB]->get_block_size();
  super.osd_uuid = osd_uuid;
  super.uuid.generate_random();
  dout(1) << __func__ << " uuid " << super.uuid << dendl;

  // init log: the metadata log is itself a file with the reserved ino 1
  FileRef log_file = ceph::make_ref<File>();
  log_file->fnode.ino = 1;
  log_file->vselector_hint = vselector->get_hint_by_device(BDEV_WAL);
  int r = _allocate(
    vselector->select_prefer_bdev(log_file->vselector_hint),
    cct->_conf->bluefs_max_log_runway,
    &log_file->fnode);
  vselector->add_usage(log_file->vselector_hint, log_file->fnode);
  ceph_assert(r == 0);
  log_writer = _create_writer(log_file);

  // initial txn: journal every extent bluefs owns so replay can rebuild
  // the allocators
  log_t.op_init();
  for (unsigned bdev = 0; bdev < MAX_BDEV; ++bdev) {
    interval_set<uint64_t>& p = block_all[bdev];
    if (p.empty())
      continue;
    for (interval_set<uint64_t>::iterator q = p.begin(); q != p.end(); ++q) {
      dout(20) << __func__ << " op_alloc_add " << bdev << " 0x"
               << std::hex << q.get_start() << "~" << q.get_len() << std::dec
               << dendl;
      log_t.op_alloc_add(bdev, q.get_start(), q.get_len());
    }
  }
  _flush_and_sync_log(l);

  // write supers — only after the log is durable, since the super points
  // at the log fnode
  super.log_fnode = log_file->fnode;
  super.memorized_layout = layout;
  _write_super(BDEV_DB);
  flush_bdev();

  // clean up: mkfs leaves the object unmounted
  super = bluefs_super_t();
  _close_writer(log_writer);
  log_writer = NULL;
  block_all.clear();
  vselector.reset(nullptr);
  _stop_alloc();
  _shutdown_logger();

  dout(10) << __func__ << " success" << dendl;
  return 0;
}
546
// Create one allocator per configured device, choosing the allocation
// unit per device role, and seed each allocator with the extents bluefs
// already owns (block_all).
void BlueFS::_init_alloc()
{
  dout(20) << __func__ << dendl;
  alloc.resize(MAX_BDEV);
  alloc_size.resize(MAX_BDEV, 0);
  pending_release.resize(MAX_BDEV);
  block_unused_too_granular.resize(MAX_BDEV);

  if (bdev[BDEV_WAL]) {
    alloc_size[BDEV_WAL] = cct->_conf->bluefs_alloc_size;
  }
  // With a separate slow device, DB is bluefs-private; without one, the DB
  // device is shared with bluestore and must use the shared alloc size.
  if (bdev[BDEV_SLOW]) {
    alloc_size[BDEV_DB] = cct->_conf->bluefs_alloc_size;
    alloc_size[BDEV_SLOW] = cct->_conf->bluefs_shared_alloc_size;
  } else {
    alloc_size[BDEV_DB] = cct->_conf->bluefs_shared_alloc_size;
  }
  // new wal and db devices are never shared
  if (bdev[BDEV_NEWWAL]) {
    alloc_size[BDEV_NEWWAL] = cct->_conf->bluefs_alloc_size;
  }
  if (bdev[BDEV_NEWDB]) {
    alloc_size[BDEV_NEWDB] = cct->_conf->bluefs_alloc_size;
  }

  for (unsigned id = 0; id < bdev.size(); ++id) {
    if (!bdev[id]) {
      continue;
    }
    ceph_assert(bdev[id]->get_size());
    // allocator name: role name for the primary slots, otherwise a
    // per-instance unique suffix
    std::string name = "bluefs-";
    const char* devnames[] = {"wal","db","slow"};
    if (id <= BDEV_SLOW)
      name += devnames[id];
    else
      name += to_string(uintptr_t(this));
    ceph_assert(alloc_size[id]);
    dout(1) << __func__ << " id " << id
	     << " alloc_size 0x" << std::hex << alloc_size[id]
	     << " size 0x" << bdev[id]->get_size() << std::dec << dendl;
    alloc[id] = Allocator::create(cct, cct->_conf->bluefs_allocator,
				  bdev[id]->get_size(),
				  alloc_size[id], name);
    // everything bluefs owns starts out free
    interval_set<uint64_t>& p = block_all[id];
    for (interval_set<uint64_t>::iterator q = p.begin(); q != p.end(); ++q) {
      alloc[id]->init_add_free(q.get_start(), q.get_len());
    }
  }
}
596
597 void BlueFS::_stop_alloc()
598 {
599 dout(20) << __func__ << dendl;
600 for (auto p : bdev) {
601 if (p)
602 p->discard_drain();
603 }
604
605 for (auto p : alloc) {
606 if (p != nullptr) {
607 p->shutdown();
608 delete p;
609 }
610 }
611 alloc.clear();
612 block_unused_too_granular.clear();
613 }
614
// Mount an existing bluefs: read the superblock, build allocators, replay
// the metadata log to reconstruct the file/dir maps, mark every file's
// extents as in-use, and open the log for appending.  On any failure the
// in-memory super is reset and the error returned.
int BlueFS::mount()
{
  dout(1) << __func__ << dendl;

  int r = _open_super();
  if (r < 0) {
    derr << __func__ << " failed to open super: " << cpp_strerror(r) << dendl;
    goto out;
  }

  // set volume selector if not provided before/outside
  if (vselector == nullptr) {
    vselector.reset(
        new OriginalVolumeSelector(
            get_block_device_size(BlueFS::BDEV_WAL) * 95 / 100,
            get_block_device_size(BlueFS::BDEV_DB) * 95 / 100,
            get_block_device_size(BlueFS::BDEV_SLOW) * 95 / 100));
  }

  // ownership info is rebuilt from the log's op_alloc_add records
  block_all.clear();
  block_all.resize(MAX_BDEV);
  _init_alloc();
  _init_logger();

  r = _replay(false, false);
  if (r < 0) {
    derr << __func__ << " failed to replay log: " << cpp_strerror(r) << dendl;
    _stop_alloc();
    goto out;
  }

  // init freelist: carve every live file's extents out of the allocators
  for (auto& p : file_map) {
    dout(30) << __func__ << " noting alloc for " << p.second->fnode << dendl;
    for (auto& q : p.second->fnode.extents) {
      alloc[q.bdev]->init_rm_free(q.offset, q.length);
    }
  }

  // set up the log for future writes: append at the current end
  log_writer = _create_writer(_get_file(1));
  ceph_assert(log_writer->file->fnode.ino == 1);
  log_writer->pos = log_writer->file->fnode.size;
  dout(10) << __func__ << " log write pos set to 0x"
           << std::hex << log_writer->pos << std::dec
           << dendl;

  return 0;

 out:
  super = bluefs_super_t();
  return r;
}
668
669 int BlueFS::maybe_verify_layout(const bluefs_layout_t& layout) const
670 {
671 if (super.memorized_layout) {
672 if (layout == *super.memorized_layout) {
673 dout(10) << __func__ << " bluefs layout verified positively" << dendl;
674 } else {
675 derr << __func__ << " memorized layout doesn't fit current one" << dendl;
676 return -EIO;
677 }
678 } else {
679 dout(10) << __func__ << " no memorized_layout in bluefs superblock"
680 << dendl;
681 }
682
683 return 0;
684 }
685
// Unmount: flush/compact metadata to disk, then release all in-memory
// state in dependency order (writer before allocators, maps before the
// logger the destructors might touch).
void BlueFS::umount()
{
  dout(1) << __func__ << dendl;

  sync_metadata();

  _close_writer(log_writer);
  log_writer = NULL;

  vselector.reset(nullptr);
  _stop_alloc();
  file_map.clear();
  dir_map.clear();
  super = bluefs_super_t();
  log_t.clear();
  _shutdown_logger();
}
703
704 int BlueFS::prepare_new_device(int id, const bluefs_layout_t& layout)
705 {
706 dout(1) << __func__ << dendl;
707
708 if(id == BDEV_NEWDB) {
709 int new_log_dev_cur = BDEV_WAL;
710 int new_log_dev_next = BDEV_WAL;
711 if (!bdev[BDEV_WAL]) {
712 new_log_dev_cur = BDEV_NEWDB;
713 new_log_dev_next = BDEV_DB;
714 }
715 _rewrite_log_and_layout_sync(false,
716 BDEV_NEWDB,
717 new_log_dev_cur,
718 new_log_dev_next,
719 RENAME_DB2SLOW,
720 layout);
721 //}
722 } else if(id == BDEV_NEWWAL) {
723 _rewrite_log_and_layout_sync(false,
724 BDEV_DB,
725 BDEV_NEWWAL,
726 BDEV_WAL,
727 REMOVE_WAL,
728 layout);
729 } else {
730 assert(false);
731 }
732 return 0;
733 }
734
735 void BlueFS::collect_metadata(map<string,string> *pm, unsigned skip_bdev_id)
736 {
737 if (skip_bdev_id != BDEV_DB && bdev[BDEV_DB])
738 bdev[BDEV_DB]->collect_metadata("bluefs_db_", pm);
739 if (bdev[BDEV_WAL])
740 bdev[BDEV_WAL]->collect_metadata("bluefs_wal_", pm);
741 }
742
743 void BlueFS::get_devices(set<string> *ls)
744 {
745 for (unsigned i = 0; i < MAX_BDEV; ++i) {
746 if (bdev[i]) {
747 bdev[i]->get_devices(ls);
748 }
749 }
750 }
751
752 int BlueFS::fsck()
753 {
754 std::lock_guard l(lock);
755 dout(1) << __func__ << dendl;
756 // hrm, i think we check everything on mount...
757 return 0;
758 }
759
// Encode the in-memory superblock (followed by its crc32c), pad it to the
// fixed super slot length, and write it synchronously to device `dev` at
// the reserved super offset.
int BlueFS::_write_super(int dev)
{
  // build superblock
  bufferlist bl;
  encode(super, bl);
  uint32_t crc = bl.crc32c(-1);
  encode(crc, bl);
  dout(10) << __func__ << " super block length(encoded): " << bl.length() << dendl;
  dout(10) << __func__ << " superblock " << super.version << dendl;
  dout(10) << __func__ << " log_fnode " << super.log_fnode << dendl;
  // the encoded super must fit in its fixed on-disk slot
  ceph_assert_always(bl.length() <= get_super_length());
  bl.append_zero(get_super_length() - bl.length());

  bdev[dev]->write(get_super_offset(), bl, false, WRITE_LIFE_SHORT);
  dout(20) << __func__ << " v " << super.version
           << " crc 0x" << std::hex << crc
           << " offset 0x" << get_super_offset() << std::dec
           << dendl;
  return 0;
}
780
// Read and decode the superblock from the DB device, verifying the
// trailing crc32c against the bytes actually decoded.
// Returns 0, a negative read error, or -EIO on crc mismatch.
int BlueFS::_open_super()
{
  dout(10) << __func__ << dendl;

  bufferlist bl;
  uint32_t expected_crc, crc;
  int r;

  // always the second block
  r = bdev[BDEV_DB]->read(get_super_offset(), get_super_length(),
			  &bl, ioc[BDEV_DB], false);
  if (r < 0)
    return r;

  auto p = bl.cbegin();
  decode(super, p);
  {
    // crc covers exactly the bytes consumed by decoding the super
    bufferlist t;
    t.substr_of(bl, 0, p.get_off());
    crc = t.crc32c(-1);
  }
  decode(expected_crc, p);
  if (crc != expected_crc) {
    derr << __func__ << " bad crc on superblock, expected 0x"
	 << std::hex << expected_crc << " != actual 0x" << crc << std::dec
	 << dendl;
    return -EIO;
  }
  dout(10) << __func__ << " superblock " << super.version << dendl;
  dout(10) << __func__ << " log_fnode " << super.log_fnode << dendl;
  return 0;
}
813
// Replay-time validation of a file's extents against per-device bitmaps:
//  owned_blocks — blocks bluefs owns (op_alloc_add has been seen)
//  used_blocks  — blocks already claimed by some file
// Each extent must lie fully inside owned_blocks and must not overlap
// used_blocks; valid extents are marked used.  Returns -EFAULT on any
// violation.
int BlueFS::_check_new_allocations(const bluefs_fnode_t& fnode,
  size_t dev_count,
  boost::dynamic_bitset<uint64_t>* owned_blocks,
  boost::dynamic_bitset<uint64_t>* used_blocks)
{
  auto& fnode_extents = fnode.extents;
  for (auto e : fnode_extents) {
    auto id = e.bdev;
    bool fail = false;
    ceph_assert(id < dev_count);
    // every alloc-unit-sized chunk of the extent must be owned by bluefs
    apply_for_bitset_range(e.offset, e.length, alloc_size[id], owned_blocks[id],
      [&](uint64_t pos, boost::dynamic_bitset<uint64_t> &bs) {
	if (!bs.test(pos)) {
	  fail = true;
	}
      }
    );
    if (fail) {
      derr << __func__ << " invalid extent " << int(id)
	   << ": 0x" << std::hex << e.offset << "~" << e.length
	   << std::dec
	   << ": wasn't given but allocated for ino " << fnode.ino
	   << dendl;
      return -EFAULT;
    }

    // no chunk may be referenced by more than one file; mark as used
    apply_for_bitset_range(e.offset, e.length, alloc_size[id], used_blocks[id],
      [&](uint64_t pos, boost::dynamic_bitset<uint64_t> &bs) {
	if (bs.test(pos)) {
	  fail = true;
	}
	bs.set(pos);
      }
    );
    if (fail) {
      derr << __func__ << " invalid extent " << int(e.bdev)
	   << ": 0x" << std::hex << e.offset << "~" << e.length
	   << std::dec << ": duplicate reference, ino " << fnode.ino
	   << dendl;
      return -EFAULT;
    }
  }
  return 0;
}
858
// Trim an extent to alloc-unit alignment for device `id`, in place.
// The clipped head/tail fragments are tracked in (alloc=true) or removed
// from (alloc=false) block_unused_too_granular so they can be returned to
// bluestore later via reclaim_blocks().  `op` is only used for logging.
int BlueFS::_adjust_granularity(
  __u8 id, uint64_t *offset, uint64_t *length, bool alloc)
{
  const char *op = alloc ? "op_alloc_add" : "op_alloc_rm";
  auto oldo = *offset;
  auto oldl = *length;
  if (*offset & (alloc_size[id] - 1)) {
    // round offset up to the next alloc-unit boundary
    *offset &= ~(alloc_size[id] - 1);
    *offset += alloc_size[id];
    if (*length > *offset - oldo) {
      // head fragment [oldo, *offset) is sub-unit; remember/forget it
      if (alloc) {
	block_unused_too_granular[id].insert(oldo, *offset - oldo);
      } else {
	block_unused_too_granular[id].erase(oldo, *offset - oldo);
      }
      *length -= (*offset - oldo);
    } else {
      // the whole extent fits inside one alloc unit; nothing aligned remains
      if (alloc) {
	block_unused_too_granular[id].insert(oldo, *length);
      } else {
	block_unused_too_granular[id].erase(oldo, *length);
      }
      *length = 0;
    }
  }
  if (*length & (alloc_size[id] - 1)) {
    // round length down; the tail fragment is tracked separately
    *length &= ~(alloc_size[id] - 1);
    if (alloc) {
      block_unused_too_granular[id].insert(
	*offset + *length,
	oldo + oldl - *offset - *length);
    } else {
      block_unused_too_granular[id].erase(
	*offset + *length,
	oldo + oldl - *offset - *length);
    }
  }
  if (oldo != *offset || oldl != *length) {
    dout(10) << __func__ << " " << op << " "
	     << (int)id << ":" << std::hex << oldo << "~" << oldl
	     << " -> " << (int)id << ":" << *offset << "~" << *length << dendl;
  }
  return 0;
}
903
904 int BlueFS::_verify_alloc_granularity(
905 __u8 id, uint64_t offset, uint64_t length, const char *op)
906 {
907 if ((offset & (alloc_size[id] - 1)) ||
908 (length & (alloc_size[id] - 1))) {
909 derr << __func__ << " " << op << " of " << (int)id
910 << ":0x" << std::hex << offset << "~" << length << std::dec
911 << " does not align to alloc_size 0x"
912 << std::hex << alloc_size[id] << std::dec << dendl;
913 // be helpful
914 auto need = alloc_size[id];
915 while (need && ((offset & (need - 1)) ||
916 (length & (need - 1)))) {
917 need >>= 1;
918 }
919 if (need) {
920 const char *which;
921 if (id == BDEV_SLOW ||
922 (id == BDEV_DB && !bdev[BDEV_SLOW])) {
923 which = "bluefs_shared_alloc_size";
924 } else {
925 which = "bluefs_alloc_size";
926 }
927 derr << "work-around by setting " << which << " = " << need
928 << " for this OSD" << dendl;
929 }
930 return -EFAULT;
931 }
932 return 0;
933 }
934
935 int BlueFS::_replay(bool noop, bool to_stdout)
936 {
937 dout(10) << __func__ << (noop ? " NO-OP" : "") << dendl;
938 ino_last = 1; // by the log
939 log_seq = 0;
940
941 FileRef log_file;
942 log_file = _get_file(1);
943
944 // sanity check
945 for (auto& a : block_unused_too_granular) {
946 ceph_assert(a.empty());
947 }
948
949 if (!noop) {
950 log_file->fnode = super.log_fnode;
951 log_file->vselector_hint =
952 vselector->get_hint_by_device(BDEV_WAL);
953 } else {
954 // do not use fnode from superblock in 'noop' mode - log_file's one should
955 // be fine and up-to-date
956 ceph_assert(log_file->fnode.ino == 1);
957 ceph_assert(log_file->fnode.extents.size() != 0);
958 }
959 dout(10) << __func__ << " log_fnode " << super.log_fnode << dendl;
960 if (unlikely(to_stdout)) {
961 std::cout << " log_fnode " << super.log_fnode << std::endl;
962 }
963
964 FileReader *log_reader = new FileReader(
965 log_file, cct->_conf->bluefs_max_prefetch,
966 false, // !random
967 true); // ignore eof
968
969 bool seen_recs = false;
970
971 boost::dynamic_bitset<uint64_t> used_blocks[MAX_BDEV];
972 boost::dynamic_bitset<uint64_t> owned_blocks[MAX_BDEV];
973
974 if (cct->_conf->bluefs_log_replay_check_allocations) {
975 for (size_t i = 0; i < MAX_BDEV; ++i) {
976 if (alloc_size[i] != 0 && bdev[i] != nullptr) {
977 used_blocks[i].resize(round_up_to(bdev[i]->get_size(), alloc_size[i]) / alloc_size[i]);
978 owned_blocks[i].resize(round_up_to(bdev[i]->get_size(), alloc_size[i]) / alloc_size[i]);
979 }
980 }
981 }
982
983 bool first_log_check = true;
984
985 while (true) {
986 ceph_assert((log_reader->buf.pos & ~super.block_mask()) == 0);
987 uint64_t pos = log_reader->buf.pos;
988 uint64_t read_pos = pos;
989 bufferlist bl;
990 {
991 int r = _read(log_reader, &log_reader->buf, read_pos, super.block_size,
992 &bl, NULL);
993 ceph_assert(r == (int)super.block_size);
994 read_pos += r;
995 }
996 uint64_t more = 0;
997 uint64_t seq;
998 uuid_d uuid;
999 {
1000 auto p = bl.cbegin();
1001 __u8 a, b;
1002 uint32_t len;
1003 decode(a, p);
1004 decode(b, p);
1005 decode(len, p);
1006 decode(uuid, p);
1007 decode(seq, p);
1008 if (len + 6 > bl.length()) {
1009 more = round_up_to(len + 6 - bl.length(), super.block_size);
1010 }
1011 }
1012 if (uuid != super.uuid) {
1013 if (seen_recs) {
1014 dout(10) << __func__ << " 0x" << std::hex << pos << std::dec
1015 << ": stop: uuid " << uuid << " != super.uuid " << super.uuid
1016 << dendl;
1017 } else {
1018 derr << __func__ << " 0x" << std::hex << pos << std::dec
1019 << ": stop: uuid " << uuid << " != super.uuid " << super.uuid
1020 << ", block dump: \n";
1021 bufferlist t;
1022 t.substr_of(bl, 0, super.block_size);
1023 t.hexdump(*_dout);
1024 *_dout << dendl;
1025 }
1026 break;
1027 }
1028 if (seq != log_seq + 1) {
1029 if (seen_recs) {
1030 dout(10) << __func__ << " 0x" << std::hex << pos << std::dec
1031 << ": stop: seq " << seq << " != expected " << log_seq + 1
1032 << dendl;;
1033 } else {
1034 derr << __func__ << " 0x" << std::hex << pos << std::dec
1035 << ": stop: seq " << seq << " != expected " << log_seq + 1
1036 << dendl;;
1037 }
1038 break;
1039 }
1040 if (more) {
1041 dout(20) << __func__ << " need 0x" << std::hex << more << std::dec
1042 << " more bytes" << dendl;
1043 bufferlist t;
1044 int r = _read(log_reader, &log_reader->buf, read_pos, more, &t, NULL);
1045 if (r < (int)more) {
1046 derr << __func__ << " 0x" << std::hex << pos
1047 << ": stop: len is 0x" << bl.length() + more << std::dec
1048 << ", which is past eof" << dendl;
1049 break;
1050 }
1051 ceph_assert(r == (int)more);
1052 bl.claim_append(t);
1053 read_pos += r;
1054 }
1055 seen_recs = true;
1056 bluefs_transaction_t t;
1057 try {
1058 auto p = bl.cbegin();
1059 decode(t, p);
1060 }
1061 catch (buffer::error& e) {
1062 derr << __func__ << " 0x" << std::hex << pos << std::dec
1063 << ": stop: failed to decode: " << e.what()
1064 << dendl;
1065 delete log_reader;
1066 return -EIO;
1067 }
1068 ceph_assert(seq == t.seq);
1069 dout(10) << __func__ << " 0x" << std::hex << pos << std::dec
1070 << ": " << t << dendl;
1071 if (unlikely(to_stdout)) {
1072 std::cout << " 0x" << std::hex << pos << std::dec
1073 << ": " << t << std::endl;
1074 }
1075
1076 auto p = t.op_bl.cbegin();
1077 while (!p.end()) {
1078 __u8 op;
1079 decode(op, p);
1080 switch (op) {
1081
1082 case bluefs_transaction_t::OP_INIT:
1083 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1084 << ": op_init" << dendl;
1085 if (unlikely(to_stdout)) {
1086 std::cout << " 0x" << std::hex << pos << std::dec
1087 << ": op_init" << std::endl;
1088 }
1089
1090 ceph_assert(t.seq == 1);
1091 break;
1092
1093 case bluefs_transaction_t::OP_JUMP:
1094 {
1095 uint64_t next_seq;
1096 uint64_t offset;
1097 decode(next_seq, p);
1098 decode(offset, p);
1099 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1100 << ": op_jump seq " << next_seq
1101 << " offset 0x" << std::hex << offset << std::dec << dendl;
1102 if (unlikely(to_stdout)) {
1103 std::cout << " 0x" << std::hex << pos << std::dec
1104 << ": op_jump seq " << next_seq
1105 << " offset 0x" << std::hex << offset << std::dec
1106 << std::endl;
1107 }
1108
1109 ceph_assert(next_seq >= log_seq);
1110 log_seq = next_seq - 1; // we will increment it below
1111 uint64_t skip = offset - read_pos;
1112 if (skip) {
1113 bufferlist junk;
1114 int r = _read(log_reader, &log_reader->buf, read_pos, skip, &junk,
1115 NULL);
1116 if (r != (int)skip) {
1117 dout(10) << __func__ << " 0x" << std::hex << read_pos
1118 << ": stop: failed to skip to " << offset
1119 << std::dec << dendl;
1120 ceph_abort_msg("problem with op_jump");
1121 }
1122 }
1123 }
1124 break;
1125
1126 case bluefs_transaction_t::OP_JUMP_SEQ:
1127 {
1128 uint64_t next_seq;
1129 decode(next_seq, p);
1130 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1131 << ": op_jump_seq " << next_seq << dendl;
1132 if (unlikely(to_stdout)) {
1133 std::cout << " 0x" << std::hex << pos << std::dec
1134 << ": op_jump_seq " << next_seq << std::endl;
1135 }
1136
1137 ceph_assert(next_seq >= log_seq);
1138 log_seq = next_seq - 1; // we will increment it below
1139 }
1140 break;
1141
1142 case bluefs_transaction_t::OP_ALLOC_ADD:
1143 {
1144 __u8 id;
1145 uint64_t offset, length;
1146 decode(id, p);
1147 decode(offset, p);
1148 decode(length, p);
1149 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1150 << ": op_alloc_add " << " " << (int)id
1151 << ":0x" << std::hex << offset << "~" << length << std::dec
1152 << dendl;
1153 if (unlikely(to_stdout)) {
1154 std::cout << " 0x" << std::hex << pos << std::dec
1155 << ": op_alloc_add " << " " << (int)id
1156 << ":0x" << std::hex << offset << "~" << length << std::dec
1157 << std::endl;
1158 }
1159 if (!noop) {
1160 block_all[id].insert(offset, length);
1161 _adjust_granularity(id, &offset, &length, true);
1162 if (length) {
1163 alloc[id]->init_add_free(offset, length);
1164 }
1165
1166 if (cct->_conf->bluefs_log_replay_check_allocations) {
1167 bool fail = false;
1168 apply_for_bitset_range(offset, length, alloc_size[id], owned_blocks[id],
1169 [&](uint64_t pos, boost::dynamic_bitset<uint64_t> &bs) {
1170 if (bs.test(pos)) {
1171 fail = true;
1172 } else {
1173 bs.set(pos);
1174 }
1175 }
1176 );
1177 if (fail) {
1178 derr << __func__ << " invalid extent " << (int)id
1179 << ": 0x" << std::hex << offset << "~" << length
1180 << std::dec << ": already given" << dendl;
1181 return -EFAULT;
1182 }
1183 apply_for_bitset_range(offset, length, alloc_size[id], used_blocks[id],
1184 [&](uint64_t pos, boost::dynamic_bitset<uint64_t> &bs) {
1185 if (bs.test(pos)) {
1186 fail = true;
1187 }
1188 }
1189 );
1190 if (fail) {
1191 derr << __func__ << " invalid extent " << int(id)
1192 << ": 0x" << std::hex << offset << "~" << length
1193 << std::dec << ": already in use" << dendl;
1194 return -EFAULT;
1195 }
1196 }
1197 }
1198 }
1199 break;
1200
1201 case bluefs_transaction_t::OP_ALLOC_RM:
1202 {
1203 __u8 id;
1204 uint64_t offset, length;
1205 decode(id, p);
1206 decode(offset, p);
1207 decode(length, p);
1208 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1209 << ": op_alloc_rm " << " " << (int)id
1210 << ":0x" << std::hex << offset << "~" << length << std::dec
1211 << dendl;
1212 if (unlikely(to_stdout)) {
1213 std::cout << " 0x" << std::hex << pos << std::dec
1214 << ": op_alloc_rm " << " " << (int)id
1215 << ":0x" << std::hex << offset << "~" << length << std::dec
1216 << std::endl;
1217 }
1218 if (!noop) {
1219 block_all[id].erase(offset, length);
1220 _adjust_granularity(id, &offset, &length, false);
1221 if (length) {
1222 alloc[id]->init_rm_free(offset, length);
1223 }
1224 if (cct->_conf->bluefs_log_replay_check_allocations) {
1225 bool fail = false;
1226 apply_for_bitset_range(offset, length, alloc_size[id], owned_blocks[id],
1227 [&](uint64_t pos, boost::dynamic_bitset<uint64_t> &bs) {
1228 if (!bs.test(pos)) {
1229 fail = true;
1230 } else {
1231 bs.reset(pos);
1232 }
1233 }
1234 );
1235 if (fail) {
1236 derr << __func__ << " invalid extent " << int(id)
1237 << ": 0x" << std::hex << offset << "~" << length
1238 << std::dec << ": wasn't given" << dendl;
1239 return -EFAULT;
1240 }
1241
1242 apply_for_bitset_range(offset, length, alloc_size[id], used_blocks[id],
1243 [&](uint64_t pos, boost::dynamic_bitset<uint64_t> &bs) {
1244 if (bs.test(pos)) {
1245 fail = true;
1246 }
1247 }
1248 );
1249 if (fail) {
1250 derr << __func__ << " invalid extent " << (int)id
1251 << ": 0x" << std::hex << offset << "~" << length
1252 << std::dec << ": still in use" << dendl;
1253 return -EFAULT;
1254 }
1255 }
1256 }
1257 }
1258 break;
1259
1260 case bluefs_transaction_t::OP_DIR_LINK:
1261 {
1262 string dirname, filename;
1263 uint64_t ino;
1264 decode(dirname, p);
1265 decode(filename, p);
1266 decode(ino, p);
1267 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1268 << ": op_dir_link " << " " << dirname << "/" << filename
1269 << " to " << ino
1270 << dendl;
1271 if (unlikely(to_stdout)) {
1272 std::cout << " 0x" << std::hex << pos << std::dec
1273 << ": op_dir_link " << " " << dirname << "/" << filename
1274 << " to " << ino
1275 << std::endl;
1276 }
1277
1278 if (!noop) {
1279 FileRef file = _get_file(ino);
1280 ceph_assert(file->fnode.ino);
1281 map<string,DirRef>::iterator q = dir_map.find(dirname);
1282 ceph_assert(q != dir_map.end());
1283 map<string,FileRef>::iterator r = q->second->file_map.find(filename);
1284 ceph_assert(r == q->second->file_map.end());
1285
1286 vselector->sub_usage(file->vselector_hint, file->fnode);
1287 file->vselector_hint =
1288 vselector->get_hint_by_dir(dirname);
1289 vselector->add_usage(file->vselector_hint, file->fnode);
1290
1291 q->second->file_map[filename] = file;
1292 ++file->refs;
1293 }
1294 }
1295 break;
1296
1297 case bluefs_transaction_t::OP_DIR_UNLINK:
1298 {
1299 string dirname, filename;
1300 decode(dirname, p);
1301 decode(filename, p);
1302 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1303 << ": op_dir_unlink " << " " << dirname << "/" << filename
1304 << dendl;
1305 if (unlikely(to_stdout)) {
1306 std::cout << " 0x" << std::hex << pos << std::dec
1307 << ": op_dir_unlink " << " " << dirname << "/" << filename
1308 << std::endl;
1309 }
1310
1311 if (!noop) {
1312 map<string,DirRef>::iterator q = dir_map.find(dirname);
1313 ceph_assert(q != dir_map.end());
1314 map<string,FileRef>::iterator r = q->second->file_map.find(filename);
1315 ceph_assert(r != q->second->file_map.end());
1316 ceph_assert(r->second->refs > 0);
1317 --r->second->refs;
1318 q->second->file_map.erase(r);
1319 }
1320 }
1321 break;
1322
1323 case bluefs_transaction_t::OP_DIR_CREATE:
1324 {
1325 string dirname;
1326 decode(dirname, p);
1327 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1328 << ": op_dir_create " << dirname << dendl;
1329 if (unlikely(to_stdout)) {
1330 std::cout << " 0x" << std::hex << pos << std::dec
1331 << ": op_dir_create " << dirname << std::endl;
1332 }
1333
1334 if (!noop) {
1335 map<string,DirRef>::iterator q = dir_map.find(dirname);
1336 ceph_assert(q == dir_map.end());
1337 dir_map[dirname] = ceph::make_ref<Dir>();
1338 }
1339 }
1340 break;
1341
1342 case bluefs_transaction_t::OP_DIR_REMOVE:
1343 {
1344 string dirname;
1345 decode(dirname, p);
1346 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1347 << ": op_dir_remove " << dirname << dendl;
1348 if (unlikely(to_stdout)) {
1349 std::cout << " 0x" << std::hex << pos << std::dec
1350 << ": op_dir_remove " << dirname << std::endl;
1351 }
1352
1353 if (!noop) {
1354 map<string,DirRef>::iterator q = dir_map.find(dirname);
1355 ceph_assert(q != dir_map.end());
1356 ceph_assert(q->second->file_map.empty());
1357 dir_map.erase(q);
1358 }
1359 }
1360 break;
1361
1362 case bluefs_transaction_t::OP_FILE_UPDATE:
1363 {
1364 bluefs_fnode_t fnode;
1365 decode(fnode, p);
1366 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1367 << ": op_file_update " << " " << fnode << " " << dendl;
1368 if (unlikely(to_stdout)) {
1369 std::cout << " 0x" << std::hex << pos << std::dec
1370 << ": op_file_update " << " " << fnode << std::endl;
1371 }
1372 if (!noop) {
1373 FileRef f = _get_file(fnode.ino);
1374 if (cct->_conf->bluefs_log_replay_check_allocations) {
1375 // check initial log layout
1376 if (first_log_check) {
1377 first_log_check = false;
1378 int r = _check_new_allocations(log_file->fnode,
1379 MAX_BDEV, owned_blocks, used_blocks);
1380 if (r < 0) {
1381 return r;
1382 }
1383 }
1384
1385 auto& fnode_extents = f->fnode.extents;
1386 for (auto e : fnode_extents) {
1387 auto id = e.bdev;
1388 if (int r = _verify_alloc_granularity(id, e.offset, e.length,
1389 "OP_FILE_UPDATE"); r < 0) {
1390 return r;
1391 }
1392 apply_for_bitset_range(e.offset, e.length, alloc_size[id],
1393 used_blocks[id],
1394 [&](uint64_t pos, boost::dynamic_bitset<uint64_t> &bs) {
1395 ceph_assert(bs.test(pos));
1396 bs.reset(pos);
1397 }
1398 );
1399 }
1400 }
1401
1402 if (fnode.ino != 1) {
1403 vselector->sub_usage(f->vselector_hint, f->fnode);
1404 }
1405 f->fnode = fnode;
1406 if (fnode.ino != 1) {
1407 vselector->add_usage(f->vselector_hint, f->fnode);
1408 }
1409
1410 if (fnode.ino > ino_last) {
1411 ino_last = fnode.ino;
1412 }
1413 if (cct->_conf->bluefs_log_replay_check_allocations) {
1414 int r = _check_new_allocations(f->fnode,
1415 MAX_BDEV, owned_blocks, used_blocks);
1416 if (r < 0) {
1417 return r;
1418 }
1419 }
1420 }
1421 }
1422 break;
1423
1424 case bluefs_transaction_t::OP_FILE_REMOVE:
1425 {
1426 uint64_t ino;
1427 decode(ino, p);
1428 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1429 << ": op_file_remove " << ino << dendl;
1430 if (unlikely(to_stdout)) {
1431 std::cout << " 0x" << std::hex << pos << std::dec
1432 << ": op_file_remove " << ino << std::endl;
1433 }
1434
1435 if (!noop) {
1436 auto p = file_map.find(ino);
1437 ceph_assert(p != file_map.end());
1438 vselector->sub_usage(p->second->vselector_hint, p->second->fnode);
1439 if (cct->_conf->bluefs_log_replay_check_allocations) {
1440 auto& fnode_extents = p->second->fnode.extents;
1441 for (auto e : fnode_extents) {
1442 auto id = e.bdev;
1443 bool fail = false;
1444 apply_for_bitset_range(e.offset, e.length, alloc_size[id], owned_blocks[id],
1445 [&](uint64_t pos, boost::dynamic_bitset<uint64_t> &bs) {
1446 if (!bs.test(pos)) {
1447 fail = true;
1448 }
1449 }
1450 );
1451 if (fail) {
1452 derr << __func__ << " invalid extent " << int(id)
1453 << ": 0x" << std::hex << e.offset << "~" << e.length
1454 << std::dec
1455 << ": wasn't given but is allocated for removed ino " << ino
1456 << dendl;
1457 return -EFAULT;
1458 }
1459
1460 apply_for_bitset_range(e.offset, e.length, alloc_size[id], used_blocks[id],
1461 [&](uint64_t pos, boost::dynamic_bitset<uint64_t> &bs) {
1462 if (!bs.test(pos)) {
1463 fail = true;
1464 }
1465 bs.reset(pos);
1466 }
1467 );
1468 if (fail) {
1469 derr << __func__ << " invalid extent " << int(id)
1470 << ": 0x" << std::hex << e.offset << "~" << e.length
1471 << std::dec
1472 << ": not in use but is allocated for removed ino " << ino
1473 << dendl;
1474 return -EFAULT;
1475 }
1476 }
1477 }
1478 file_map.erase(p);
1479 }
1480 }
1481 break;
1482
1483 default:
1484 derr << __func__ << " 0x" << std::hex << pos << std::dec
1485 << ": stop: unrecognized op " << (int)op << dendl;
1486 delete log_reader;
1487 return -EIO;
1488 }
1489 }
1490 ceph_assert(p.end());
1491
1492 // we successfully replayed the transaction; bump the seq and log size
1493 ++log_seq;
1494 log_file->fnode.size = log_reader->buf.pos;
1495 }
1496 vselector->add_usage(log_file->vselector_hint, log_file->fnode);
1497
1498 if (!noop && first_log_check &&
1499 cct->_conf->bluefs_log_replay_check_allocations) {
1500 int r = _check_new_allocations(log_file->fnode,
1501 MAX_BDEV, owned_blocks, used_blocks);
1502 if (r < 0) {
1503 return r;
1504 }
1505 }
1506
1507 dout(10) << __func__ << " log file size was 0x"
1508 << std::hex << log_file->fnode.size << std::dec << dendl;
1509 if (unlikely(to_stdout)) {
1510 std::cout << " log file size was 0x"
1511 << std::hex << log_file->fnode.size << std::dec << std::endl;
1512 }
1513
1514 delete log_reader;
1515
1516 if (!noop) {
1517 // verify file link counts are all >0
1518 for (auto& p : file_map) {
1519 if (p.second->refs == 0 &&
1520 p.second->fnode.ino > 1) {
1521 derr << __func__ << " file with link count 0: " << p.second->fnode
1522 << dendl;
1523 return -EIO;
1524 }
1525 }
1526 }
1527
1528 for (unsigned id = 0; id < MAX_BDEV; ++id) {
1529 dout(10) << __func__ << " block_unused_too_granular " << id << ": "
1530 << block_unused_too_granular[id] << dendl;
1531 }
1532 dout(10) << __func__ << " done" << dendl;
1533 return 0;
1534 }
1535
1536 int BlueFS::log_dump()
1537 {
1538 // only dump log file's content
1539 int r = _replay(true, true);
1540 if (r < 0) {
1541 derr << __func__ << " failed to replay log: " << cpp_strerror(r) << dendl;
1542 return r;
1543 }
1544
1545 return 0;
1546 }
1547
1548 int BlueFS::device_migrate_to_existing(
1549 CephContext *cct,
1550 const set<int>& devs_source,
1551 int dev_target,
1552 const bluefs_layout_t& layout)
1553 {
1554 vector<byte> buf;
1555 bool buffered = cct->_conf->bluefs_buffered_io;
1556
1557 dout(10) << __func__ << " devs_source " << devs_source
1558 << " dev_target " << dev_target << dendl;
1559 assert(dev_target < (int)MAX_BDEV);
1560
1561 int flags = 0;
1562 flags |= devs_source.count(BDEV_DB) ?
1563 (REMOVE_DB | RENAME_SLOW2DB) : 0;
1564 flags |= devs_source.count(BDEV_WAL) ? REMOVE_WAL : 0;
1565 int dev_target_new = dev_target;
1566
1567 // Slow device without separate DB one is addressed via BDEV_DB
1568 // Hence need renaming.
1569 if ((flags & REMOVE_DB) && dev_target == BDEV_SLOW) {
1570 dev_target_new = BDEV_DB;
1571 dout(0) << __func__ << " super to be written to " << dev_target << dendl;
1572 }
1573
1574 for (auto& [ino, file_ref] : file_map) {
1575 //do not copy log
1576 if (file_ref->fnode.ino == 1) {
1577 continue;
1578 }
1579 dout(10) << __func__ << " " << ino << " " << file_ref->fnode << dendl;
1580
1581 auto& fnode_extents = file_ref->fnode.extents;
1582
1583 bool rewrite = std::any_of(
1584 fnode_extents.begin(),
1585 fnode_extents.end(),
1586 [=](auto& ext) {
1587 return ext.bdev != dev_target && devs_source.count(ext.bdev);
1588 });
1589 if (rewrite) {
1590 dout(10) << __func__ << " migrating" << dendl;
1591
1592 // read entire file
1593 bufferlist bl;
1594 for (auto old_ext : fnode_extents) {
1595 buf.resize(old_ext.length);
1596 int r = bdev[old_ext.bdev]->read_random(
1597 old_ext.offset,
1598 old_ext.length,
1599 (char*)&buf.at(0),
1600 buffered);
1601 if (r != 0) {
1602 derr << __func__ << " failed to read 0x" << std::hex
1603 << old_ext.offset << "~" << old_ext.length << std::dec
1604 << " from " << (int)dev_target << dendl;
1605 return -EIO;
1606 }
1607 bl.append((char*)&buf[0], old_ext.length);
1608 }
1609
1610 // write entire file
1611 PExtentVector extents;
1612 auto l = _allocate_without_fallback(dev_target, bl.length(), &extents);
1613 if (l < 0) {
1614 derr << __func__ << " unable to allocate len 0x" << std::hex
1615 << bl.length() << std::dec << " from " << (int)dev_target
1616 << ": " << cpp_strerror(l) << dendl;
1617 return -ENOSPC;
1618 }
1619
1620 uint64_t off = 0;
1621 for (auto& i : extents) {
1622 bufferlist cur;
1623 uint64_t cur_len = std::min<uint64_t>(i.length, bl.length() - off);
1624 ceph_assert(cur_len > 0);
1625 cur.substr_of(bl, off, cur_len);
1626 int r = bdev[dev_target]->write(i.offset, cur, buffered);
1627 ceph_assert(r == 0);
1628 off += cur_len;
1629 }
1630
1631 // release old extents
1632 for (auto old_ext : fnode_extents) {
1633 PExtentVector to_release;
1634 to_release.emplace_back(old_ext.offset, old_ext.length);
1635 alloc[old_ext.bdev]->release(to_release);
1636 }
1637
1638 // update fnode
1639 fnode_extents.clear();
1640 for (auto& i : extents) {
1641 fnode_extents.emplace_back(dev_target_new, i.offset, i.length);
1642 }
1643 } else {
1644 for (auto& ext : fnode_extents) {
1645 if (dev_target != dev_target_new && ext.bdev == dev_target) {
1646 dout(20) << __func__ << " " << " ... adjusting extent 0x"
1647 << std::hex << ext.offset << std::dec
1648 << " bdev " << dev_target << " -> " << dev_target_new
1649 << dendl;
1650 ext.bdev = dev_target_new;
1651 }
1652 }
1653 }
1654 }
1655 // new logging device in the current naming scheme
1656 int new_log_dev_cur = bdev[BDEV_WAL] ?
1657 BDEV_WAL :
1658 bdev[BDEV_DB] ? BDEV_DB : BDEV_SLOW;
1659
1660 // new logging device in new naming scheme
1661 int new_log_dev_next = new_log_dev_cur;
1662
1663 if (devs_source.count(new_log_dev_cur)) {
1664 // SLOW device is addressed via BDEV_DB too hence either WAL or DB
1665 new_log_dev_next = (flags & REMOVE_WAL) || !bdev[BDEV_WAL] ?
1666 BDEV_DB :
1667 BDEV_WAL;
1668
1669 dout(0) << __func__ << " log moved from " << new_log_dev_cur
1670 << " to " << new_log_dev_next << dendl;
1671
1672 new_log_dev_cur =
1673 (flags & REMOVE_DB) && new_log_dev_next == BDEV_DB ?
1674 BDEV_SLOW :
1675 new_log_dev_next;
1676 }
1677
1678 _rewrite_log_and_layout_sync(
1679 false,
1680 (flags & REMOVE_DB) ? BDEV_SLOW : BDEV_DB,
1681 new_log_dev_cur,
1682 new_log_dev_next,
1683 flags,
1684 layout);
1685 return 0;
1686 }
1687
1688 int BlueFS::device_migrate_to_new(
1689 CephContext *cct,
1690 const set<int>& devs_source,
1691 int dev_target,
1692 const bluefs_layout_t& layout)
1693 {
1694 vector<byte> buf;
1695 bool buffered = cct->_conf->bluefs_buffered_io;
1696
1697 dout(10) << __func__ << " devs_source " << devs_source
1698 << " dev_target " << dev_target << dendl;
1699 assert(dev_target == (int)BDEV_NEWDB || (int)BDEV_NEWWAL);
1700
1701 int flags = 0;
1702
1703 flags |= devs_source.count(BDEV_DB) ?
1704 (!bdev[BDEV_SLOW] ? RENAME_DB2SLOW: REMOVE_DB) :
1705 0;
1706 flags |= devs_source.count(BDEV_WAL) ? REMOVE_WAL : 0;
1707 int dev_target_new = dev_target; //FIXME: remove, makes no sense
1708
1709 for (auto& p : file_map) {
1710 //do not copy log
1711 if (p.second->fnode.ino == 1) {
1712 continue;
1713 }
1714 dout(10) << __func__ << " " << p.first << " " << p.second->fnode << dendl;
1715
1716 auto& fnode_extents = p.second->fnode.extents;
1717
1718 bool rewrite = false;
1719 for (auto ext_it = fnode_extents.begin();
1720 ext_it != p.second->fnode.extents.end();
1721 ++ext_it) {
1722 if (ext_it->bdev != dev_target && devs_source.count(ext_it->bdev)) {
1723 rewrite = true;
1724 break;
1725 }
1726 }
1727 if (rewrite) {
1728 dout(10) << __func__ << " migrating" << dendl;
1729
1730 // read entire file
1731 bufferlist bl;
1732 for (auto old_ext : fnode_extents) {
1733 buf.resize(old_ext.length);
1734 int r = bdev[old_ext.bdev]->read_random(
1735 old_ext.offset,
1736 old_ext.length,
1737 (char*)&buf.at(0),
1738 buffered);
1739 if (r != 0) {
1740 derr << __func__ << " failed to read 0x" << std::hex
1741 << old_ext.offset << "~" << old_ext.length << std::dec
1742 << " from " << (int)dev_target << dendl;
1743 return -EIO;
1744 }
1745 bl.append((char*)&buf[0], old_ext.length);
1746 }
1747
1748 // write entire file
1749 PExtentVector extents;
1750 auto l = _allocate_without_fallback(dev_target, bl.length(), &extents);
1751 if (l < 0) {
1752 derr << __func__ << " unable to allocate len 0x" << std::hex
1753 << bl.length() << std::dec << " from " << (int)dev_target
1754 << ": " << cpp_strerror(l) << dendl;
1755 return -ENOSPC;
1756 }
1757
1758 uint64_t off = 0;
1759 for (auto& i : extents) {
1760 bufferlist cur;
1761 uint64_t cur_len = std::min<uint64_t>(i.length, bl.length() - off);
1762 ceph_assert(cur_len > 0);
1763 cur.substr_of(bl, off, cur_len);
1764 int r = bdev[dev_target]->write(i.offset, cur, buffered);
1765 ceph_assert(r == 0);
1766 off += cur_len;
1767 }
1768
1769 // release old extents
1770 for (auto old_ext : fnode_extents) {
1771 PExtentVector to_release;
1772 to_release.emplace_back(old_ext.offset, old_ext.length);
1773 alloc[old_ext.bdev]->release(to_release);
1774 }
1775
1776 // update fnode
1777 fnode_extents.clear();
1778 for (auto& i : extents) {
1779 fnode_extents.emplace_back(dev_target_new, i.offset, i.length);
1780 }
1781 }
1782 }
1783 // new logging device in the current naming scheme
1784 int new_log_dev_cur =
1785 bdev[BDEV_NEWWAL] ?
1786 BDEV_NEWWAL :
1787 bdev[BDEV_WAL] && !(flags & REMOVE_WAL) ?
1788 BDEV_WAL :
1789 bdev[BDEV_NEWDB] ?
1790 BDEV_NEWDB :
1791 bdev[BDEV_DB] && !(flags & REMOVE_DB)?
1792 BDEV_DB :
1793 BDEV_SLOW;
1794
1795 // new logging device in new naming scheme
1796 int new_log_dev_next =
1797 new_log_dev_cur == BDEV_NEWWAL ?
1798 BDEV_WAL :
1799 new_log_dev_cur == BDEV_NEWDB ?
1800 BDEV_DB :
1801 new_log_dev_cur;
1802
1803 int super_dev =
1804 dev_target == BDEV_NEWDB ?
1805 BDEV_NEWDB :
1806 bdev[BDEV_DB] ?
1807 BDEV_DB :
1808 BDEV_SLOW;
1809
1810 _rewrite_log_and_layout_sync(
1811 false,
1812 super_dev,
1813 new_log_dev_cur,
1814 new_log_dev_next,
1815 flags,
1816 layout);
1817 return 0;
1818 }
1819
1820 BlueFS::FileRef BlueFS::_get_file(uint64_t ino)
1821 {
1822 auto p = file_map.find(ino);
1823 if (p == file_map.end()) {
1824 FileRef f = ceph::make_ref<File>();
1825 file_map[ino] = f;
1826 dout(30) << __func__ << " ino " << ino << " = " << f
1827 << " (new)" << dendl;
1828 return f;
1829 } else {
1830 dout(30) << __func__ << " ino " << ino << " = " << p->second << dendl;
1831 return p->second;
1832 }
1833 }
1834
// Decrement the file's link count; when it reaches zero, tear the file
// down: log the removal, queue its extents for release, drop it from
// file_map and unhook it from the dirty-files bookkeeping.
void BlueFS::_drop_link(FileRef file)
{
  dout(20) << __func__ << " had refs " << file->refs
	   << " on " << file->fnode << dendl;
  ceph_assert(file->refs > 0);
  --file->refs;
  if (file->refs == 0) {
    dout(20) << __func__ << " destroying " << file->fnode << dendl;
    // no reader may still be inside _read()/_read_random() on this file
    ceph_assert(file->num_reading.load() == 0);
    vselector->sub_usage(file->vselector_hint, file->fnode);
    // record the removal in the pending log transaction
    log_t.op_file_remove(file->fnode.ino);
    // extents are queued, not freed immediately; actual release happens later
    for (auto& r : file->fnode.extents) {
      pending_release[r.bdev].insert(r.offset, r.length);
    }
    file_map.erase(file->fnode.ino);
    // flag for anyone still holding a FileRef to this file
    file->deleted = true;

    if (file->dirty_seq) {
      // unhook from the intrusive dirty list for its sequence number
      ceph_assert(file->dirty_seq > log_seq_stable);
      ceph_assert(dirty_files.count(file->dirty_seq));
      auto it = dirty_files[file->dirty_seq].iterator_to(*file);
      dirty_files[file->dirty_seq].erase(it);
      file->dirty_seq = 0;
    }
  }
}
1861
// Random-access read of [off, off+len) from h's file into 'out'.
// Data already sitting in the reader's prefetch buffer is served from
// memory under a shared lock; everything else is read straight from the
// device without refilling the buffer.  Returns the number of bytes read
// (possibly clipped at EOF).
int BlueFS::_read_random(
  FileReader *h,               ///< [in] read from here
  uint64_t off,                ///< [in] offset
  uint64_t len,                ///< [in] this many bytes
  char *out)                   ///< [out] optional: or copy it here
{
  auto* buf = &h->buf;

  int ret = 0;
  dout(10) << __func__ << " h " << h
	   << " 0x" << std::hex << off << "~" << len << std::dec
	   << " from " << h->file->fnode << dendl;

  // mark the file as in-use by a reader; _drop_link() asserts this is zero
  ++h->file->num_reading;

  // clip the request at EOF unless the reader explicitly ignores it
  if (!h->ignore_eof &&
      off + len > h->file->fnode.size) {
    if (off > h->file->fnode.size)
      len = 0;
    else
      len = h->file->fnode.size - off;
    dout(20) << __func__ << " reaching (or past) eof, len clipped to 0x"
	     << std::hex << len << std::dec << dendl;
  }
  logger->inc(l_bluefs_read_random_count, 1);
  logger->inc(l_bluefs_read_random_bytes, len);

  std::shared_lock s_lock(h->lock);
  while (len > 0) {
    if (off < buf->bl_off || off >= buf->get_buf_end()) {
      // buffer miss: drop the lock and read directly from the device
      // (the prefetch buffer is not consulted or refilled here)
      s_lock.unlock();
      uint64_t x_off = 0;
      auto p = h->file->fnode.seek(off, &x_off);
      // limit to the current extent; a request spanning extents takes
      // another trip around the loop
      uint64_t l = std::min(p->length - x_off, len);
      dout(20) << __func__ << " read random 0x"
	       << std::hex << x_off << "~" << l << std::dec
	       << " of " << *p << dendl;
      int r = bdev[p->bdev]->read_random(p->offset + x_off, l, out,
					 cct->_conf->bluefs_buffered_io);
      ceph_assert(r == 0);
      off += l;
      len -= l;
      ret += l;
      out += l;

      logger->inc(l_bluefs_read_random_disk_count, 1);
      logger->inc(l_bluefs_read_random_disk_bytes, l);
      if (len > 0) {
	// re-take the shared lock before the next buffer check
	s_lock.lock();
      }
    } else {
      // buffer hit: copy out of the prefetch buffer under the shared lock
      auto left = buf->get_buf_remaining(off);
      int r = std::min(len, left);
      logger->inc(l_bluefs_read_random_buffer_count, 1);
      logger->inc(l_bluefs_read_random_buffer_bytes, r);
      dout(20) << __func__ << " left 0x" << std::hex << left
	       << " 0x" << off << "~" << len << std::dec
	       << dendl;

      if (out) {
	// NOTE: h->bl is normally a contiguous buffer so c_str() is free.
	memcpy(out, buf->bl.c_str() + off - buf->bl_off, r);
	out += r;
      }

      dout(30) << __func__ << " result chunk (0x"
	       << std::hex << r << std::dec << " bytes):\n";
      bufferlist t;
      t.substr_of(buf->bl, off - buf->bl_off, r);
      t.hexdump(*_dout);
      *_dout << dendl;

      off += r;
      len -= r;
      ret += r;
      buf->pos += r;
    }
  }
  dout(20) << __func__ << " got " << ret << dendl;
  --h->file->num_reading;
  return ret;
}
1944
// Sequential (prefetching) read of [off, off+len) from h's file.  The
// result is appended to 'outbl' and/or copied into 'out'; with neither
// supplied the call acts as a pure prefetch into the reader's buffer.
// Returns the number of bytes read (possibly clipped at EOF).
int BlueFS::_read(
  FileReader *h,         ///< [in] read from here
  FileReaderBuffer *buf, ///< [in] reader state
  uint64_t off,          ///< [in] offset
  size_t len,            ///< [in] this many bytes
  bufferlist *outbl,     ///< [out] optional: reference the result here
  char *out)             ///< [out] optional: or copy it here
{
  bool prefetch = !outbl && !out;
  dout(10) << __func__ << " h " << h
           << " 0x" << std::hex << off << "~" << len << std::dec
	   << " from " << h->file->fnode
	   << (prefetch ? " prefetch" : "")
	   << dendl;

  // mark the file as in-use by a reader; _drop_link() asserts this is zero
  ++h->file->num_reading;

  // clip the request at EOF unless the reader explicitly ignores it
  if (!h->ignore_eof &&
      off + len > h->file->fnode.size) {
    if (off > h->file->fnode.size)
      len = 0;
    else
      len = h->file->fnode.size - off;
    dout(20) << __func__ << " reaching (or past) eof, len clipped to 0x"
	     << std::hex << len << std::dec << dendl;
  }
  logger->inc(l_bluefs_read_count, 1);
  logger->inc(l_bluefs_read_bytes, len);
  if (prefetch) {
    logger->inc(l_bluefs_read_prefetch_count, 1);
    logger->inc(l_bluefs_read_prefetch_bytes, len);
  }

  if (outbl)
    outbl->clear();

  int ret = 0;
  std::shared_lock s_lock(h->lock);
  while (len > 0) {
    size_t left;
    if (off < buf->bl_off || off >= buf->get_buf_end()) {
      // buffer miss: upgrade to an exclusive lock so we can refill the
      // prefetch buffer
      s_lock.unlock();
      std::unique_lock u_lock(h->lock);
      // recheck: another reader may have refilled the buffer while we
      // were between locks
      if (off < buf->bl_off || off >= buf->get_buf_end()) {
        // if precondition hasn't changed during locking upgrade.
        buf->bl.clear();
        buf->bl_off = off & super.block_mask();
        uint64_t x_off = 0;
        auto p = h->file->fnode.seek(buf->bl_off, &x_off);
	// read at least the block-aligned request, opportunistically up
	// to max_prefetch, but never past the current extent
	uint64_t want = round_up_to(len + (off & ~super.block_mask()),
				    super.block_size);
	want = std::max(want, buf->max_prefetch);
	uint64_t l = std::min(p->length - x_off, want);
	// don't prefetch beyond the (block-rounded) end of the file
	uint64_t eof_offset = round_up_to(h->file->fnode.size, super.block_size);
	if (!h->ignore_eof &&
	    buf->bl_off + l > eof_offset) {
	  l = eof_offset - buf->bl_off;
	}
        dout(20) << __func__ << " fetching 0x"
                 << std::hex << x_off << "~" << l << std::dec
                 << " of " << *p << dendl;
        int r = bdev[p->bdev]->read(p->offset + x_off, l, &buf->bl, ioc[p->bdev],
				    cct->_conf->bluefs_buffered_io);
        ceph_assert(r == 0);
      }
      u_lock.unlock();
      s_lock.lock();
      // we should recheck if buffer is valid after lock downgrade
      continue;
    }
    left = buf->get_buf_remaining(off);
    dout(20) << __func__ << " left 0x" << std::hex << left
             << " len 0x" << len << std::dec << dendl;

    int r = std::min(len, left);
    if (outbl) {
      bufferlist t;
      t.substr_of(buf->bl, off - buf->bl_off, r);
      outbl->claim_append(t);
    }
    if (out) {
      // NOTE: h->bl is normally a contiguous buffer so c_str() is free.
      memcpy(out, buf->bl.c_str() + off - buf->bl_off, r);
      out += r;
    }

    dout(30) << __func__ << " result chunk (0x"
             << std::hex << r << std::dec << " bytes):\n";
    bufferlist t;
    t.substr_of(buf->bl, off - buf->bl_off, r);
    t.hexdump(*_dout);
    *_dout << dendl;

    off += r;
    len -= r;
    ret += r;
    buf->pos += r;
  }

  dout(20) << __func__ << " got " << ret << dendl;
  ceph_assert(!outbl || (int)outbl->length() == ret);
  --h->file->num_reading;
  return ret;
}
2049
2050 void BlueFS::_invalidate_cache(FileRef f, uint64_t offset, uint64_t length)
2051 {
2052 dout(10) << __func__ << " file " << f->fnode
2053 << " 0x" << std::hex << offset << "~" << length << std::dec
2054 << dendl;
2055 if (offset & ~super.block_mask()) {
2056 offset &= super.block_mask();
2057 length = round_up_to(length, super.block_size);
2058 }
2059 uint64_t x_off = 0;
2060 auto p = f->fnode.seek(offset, &x_off);
2061 while (length > 0 && p != f->fnode.extents.end()) {
2062 uint64_t x_len = std::min(p->length - x_off, length);
2063 bdev[p->bdev]->invalidate_cache(p->offset + x_off, x_len);
2064 dout(20) << __func__ << " 0x" << std::hex << x_off << "~" << x_len
2065 << std:: dec << " of " << *p << dendl;
2066 offset += x_len;
2067 length -= x_len;
2068 }
2069 }
2070
2071 uint64_t BlueFS::_estimate_log_size()
2072 {
2073 int avg_dir_size = 40; // fixme
2074 int avg_file_size = 12;
2075 uint64_t size = 4096 * 2;
2076 size += file_map.size() * (1 + sizeof(bluefs_fnode_t));
2077 for (auto& p : block_all)
2078 size += p.num_intervals() * (1 + 1 + sizeof(uint64_t) * 2);
2079 size += dir_map.size() + (1 + avg_dir_size);
2080 size += file_map.size() * (1 + avg_dir_size + avg_file_size);
2081 return round_up_to(size, super.block_size);
2082 }
2083
2084 void BlueFS::compact_log()
2085 {
2086 std::unique_lock l(lock);
2087 if (cct->_conf->bluefs_compact_log_sync) {
2088 _compact_log_sync();
2089 } else {
2090 _compact_log_async(l);
2091 }
2092 }
2093
2094 bool BlueFS::_should_compact_log()
2095 {
2096 uint64_t current = log_writer->file->fnode.size;
2097 uint64_t expected = _estimate_log_size();
2098 float ratio = (float)current / (float)expected;
2099 dout(10) << __func__ << " current 0x" << std::hex << current
2100 << " expected " << expected << std::dec
2101 << " ratio " << ratio
2102 << (new_log ? " (async compaction in progress)" : "")
2103 << dendl;
2104 if (new_log ||
2105 current < cct->_conf->bluefs_log_compact_min_size ||
2106 ratio < cct->_conf->bluefs_log_compact_min_ratio) {
2107 return false;
2108 }
2109 return true;
2110 }
2111
// Serialize the entire in-memory metadata (block allocations, fnodes,
// directories and links) into transaction t as a fresh journal image.
// `flags` (REMOVE_*/RENAME_*) translate device ids on the fly, which is
// how device migration (e.g. NEWDB/NEWWAL promotion) is committed.
void BlueFS::_compact_log_dump_metadata(bluefs_transaction_t *t,
					int flags)
{
  t->seq = 1;
  t->uuid = super.uuid;
  dout(20) << __func__ << " op_init" << dendl;

  t->op_init();
  // 1. re-emit every allocated region, remapping device ids per flags
  for (unsigned bdev = 0; bdev < MAX_BDEV; ++bdev) {
    interval_set<uint64_t>& p = block_all[bdev];
    for (interval_set<uint64_t>::iterator q = p.begin(); q != p.end(); ++q) {
      auto bdev_new = bdev;
      // devices being removed are simply not re-recorded
      if ((flags & REMOVE_WAL) && bdev == BDEV_WAL) {
	continue;
      }
      if ((flags & REMOVE_DB) && bdev == BDEV_DB) {
	continue;
      }
      if ((flags & RENAME_SLOW2DB) && bdev == BDEV_SLOW) {
	bdev_new = BDEV_DB;
      }
      if ((flags & RENAME_DB2SLOW) && bdev == BDEV_DB) {
	bdev_new = BDEV_SLOW;
      }
      if (bdev == BDEV_NEWDB) {
	// REMOVE_DB xor RENAME_DB
	ceph_assert(!(flags & REMOVE_DB) != !(flags & RENAME_DB2SLOW));
	ceph_assert(!(flags & RENAME_SLOW2DB));
	// the staged new DB device becomes the DB device
	bdev_new = BDEV_DB;
      }
      if (bdev == BDEV_NEWWAL) {
	ceph_assert(flags & REMOVE_WAL);
	// the staged new WAL device becomes the WAL device
	bdev_new = BDEV_WAL;
      }
      dout(20) << __func__ << " op_alloc_add " << bdev_new << " 0x"
               << std::hex << q.get_start() << "~" << q.get_len() << std::dec
               << dendl;
      t->op_alloc_add(bdev_new, q.get_start(), q.get_len());
    }
  }
  // 2. re-emit every fnode (except the journal itself, ino 1), rewriting
  // each extent's device id with the same translation as above.  NOTE:
  // this mutates the live fnodes' extents in place.
  for (auto& [ino, file_ref] : file_map) {
    if (ino == 1)
      continue;
    ceph_assert(ino > 1);

    for(auto& e : file_ref->fnode.extents) {
      auto bdev = e.bdev;
      auto bdev_new = bdev;
      // a device slated for removal must not still hold file data
      ceph_assert(!((flags & REMOVE_WAL) && bdev == BDEV_WAL));
      if ((flags & RENAME_SLOW2DB) && bdev == BDEV_SLOW) {
	bdev_new = BDEV_DB;
      }
      if ((flags & RENAME_DB2SLOW) && bdev == BDEV_DB) {
	bdev_new = BDEV_SLOW;
      }
      if (bdev == BDEV_NEWDB) {
	// REMOVE_DB xor RENAME_DB
	ceph_assert(!(flags & REMOVE_DB) != !(flags & RENAME_DB2SLOW));
	ceph_assert(!(flags & RENAME_SLOW2DB));
	bdev_new = BDEV_DB;
      }
      if (bdev == BDEV_NEWWAL) {
	ceph_assert(flags & REMOVE_WAL);
	bdev_new = BDEV_WAL;
      }
      e.bdev = bdev_new;
    }
    dout(20) << __func__ << " op_file_update " << file_ref->fnode << dendl;
    t->op_file_update(file_ref->fnode);
  }
  // 3. re-emit the directory tree and its file links
  for (auto& [path, dir_ref] : dir_map) {
    dout(20) << __func__ << " op_dir_create " << path << dendl;
    t->op_dir_create(path);
    for (auto& [fname, file_ref] : dir_ref->file_map) {
      dout(20) << __func__ << " op_dir_link " << path << "/" << fname
	       << " to " << file_ref->fnode.ino << dendl;
      t->op_dir_link(path, fname, file_ref->fnode.ino);
    }
  }
}
2192
2193 void BlueFS::_compact_log_sync()
2194 {
2195 dout(10) << __func__ << dendl;
2196 auto prefer_bdev =
2197 vselector->select_prefer_bdev(log_writer->file->vselector_hint);
2198 _rewrite_log_and_layout_sync(true,
2199 BDEV_DB,
2200 prefer_bdev,
2201 prefer_bdev,
2202 0,
2203 super.memorized_layout);
2204 logger->inc(l_bluefs_log_compactions);
2205 }
2206
// Synchronously replace the journal with a freshly dumped metadata image,
// optionally retargeting it to a different device (log_dev -> log_dev_new)
// and/or recording a new layout in the superblock.  Also used by device
// migration; `flags` is forwarded to _compact_log_dump_metadata.
// Caller must guarantee no concurrent journal activity (see warning below).
void BlueFS::_rewrite_log_and_layout_sync(bool allocate_with_fallback,
					  int super_dev,
					  int log_dev,
					  int log_dev_new,
					  int flags,
					  std::optional<bluefs_layout_t> layout)
{
  File *log_file = log_writer->file.get();

  // clear out log (be careful who calls us!!!)
  log_t.clear();

  dout(20) << __func__ << " super_dev:" << super_dev
                       << " log_dev:" << log_dev
                       << " log_dev_new:" << log_dev_new
		       << " flags:" << flags
		       << dendl;
  // dump all metadata into a single transaction ...
  bluefs_transaction_t t;
  _compact_log_dump_metadata(&t, flags);

  // ... and make replay continue from the current sequence number
  dout(20) << __func__ << " op_jump_seq " << log_seq << dendl;
  t.op_jump_seq(log_seq);

  bufferlist bl;
  encode(t, bl);
  _pad_bl(bl);

  // new log gets the encoded image plus a full runway for future entries
  uint64_t need = bl.length() + cct->_conf->bluefs_max_log_runway;
  dout(20) << __func__ << " need " << need << dendl;

  // swap out the old extents so we can allocate a fresh set (and release
  // the old ones at the very end, once the new super is durable)
  bluefs_fnode_t old_fnode;
  int r;
  log_file->fnode.swap_extents(old_fnode);
  if (allocate_with_fallback) {
    r = _allocate(log_dev, need, &log_file->fnode);
    ceph_assert(r == 0);
  } else {
    PExtentVector extents;
    r = _allocate_without_fallback(log_dev,
			       need,
			       &extents);
    ceph_assert(r == 0);
    for (auto& p : extents) {
      log_file->fnode.append_extent(
	bluefs_extent_t(log_dev, p.offset, p.length));
    }
  }

  _close_writer(log_writer);

  log_file->fnode.size = bl.length();
  vselector->sub_usage(log_file->vselector_hint, old_fnode);
  vselector->add_usage(log_file->vselector_hint, log_file->fnode);

  // write the compacted image into the new log extents
  log_writer = _create_writer(log_file);
  log_writer->append(bl);
  r = _flush(log_writer, true);
  ceph_assert(r == 0);
#ifdef HAVE_LIBAIO
  if (!cct->_conf->bluefs_sync_write) {
    // drain in-flight aio before declaring the new log durable
    list<aio_t> completed_ios;
    _claim_completed_aios(log_writer, &completed_ios);
    wait_for_aio(log_writer);
    completed_ios.clear();
  }
#endif
  flush_bdev();

  super.memorized_layout = layout;
  super.log_fnode = log_file->fnode;
  // rename device if needed
  if (log_dev != log_dev_new) {
    dout(10) << __func__ << " renaming log extents to " << log_dev_new << dendl;
    for (auto& p : super.log_fnode.extents) {
      p.bdev = log_dev_new;
    }
  }
  dout(10) << __func__ << " writing super, log fnode: " << super.log_fnode << dendl;

  // persist the superblock pointing at the new log before freeing the old
  ++super.version;
  _write_super(super_dev);
  flush_bdev();

  dout(10) << __func__ << " release old log extents " << old_fnode.extents << dendl;
  for (auto& r : old_fnode.extents) {
    pending_release[r.bdev].insert(r.offset, r.length);
  }
}
2295
2296 /*
2297 * 1. Allocate a new extent to continue the log, and then log an event
2298 * that jumps the log write position to the new extent. At this point, the
2299 * old extent(s) won't be written to, and reflect everything to compact.
2300 * New events will be written to the new region that we'll keep.
2301 *
2302 * 2. While still holding the lock, encode a bufferlist that dumps all of the
2303 * in-memory fnodes and names. This will become the new beginning of the
2304 * log. The last event will jump to the log continuation extent from #1.
2305 *
 * 3. Queue a write to a new extent for the new beginning of the log.
2307 *
2308 * 4. Drop lock and wait
2309 *
2310 * 5. Retake the lock.
2311 *
2312 * 6. Update the log_fnode to splice in the new beginning.
2313 *
2314 * 7. Write the new superblock.
2315 *
2316 * 8. Release the old log space. Clean up.
2317 */
// Asynchronous journal compaction; the algorithm is described step by step
// in the comment block above.  Runs with the BlueFS lock held via `l`
// (briefly dropped around the final flush_bdev).
void BlueFS::_compact_log_async(std::unique_lock<ceph::mutex>& l)
{
  dout(10) << __func__ << dendl;
  File *log_file = log_writer->file.get();
  ceph_assert(!new_log);
  ceph_assert(!new_log_writer);

  // create a new log [writer] so that we know compaction is in progress
  // (see _should_compact_log)
  new_log = ceph::make_ref<File>();
  new_log->fnode.ino = 0;   // so that _flush_range won't try to log the fnode

  // 0. wait for any racing flushes to complete.  (We do not want to block
  // in _flush_sync_log with jump_to set or else a racing thread might flush
  // our entries and our jump_to update won't be correct.)
  while (log_flushing) {
    dout(10) << __func__ << " log is currently flushing, waiting" << dendl;
    log_cond.wait(l);
  }

  vselector->sub_usage(log_file->vselector_hint, log_file->fnode);

  // 1. allocate new log space and jump to it.
  old_log_jump_to = log_file->fnode.get_allocated();
  dout(10) << __func__ << " old_log_jump_to 0x" << std::hex << old_log_jump_to
           << " need 0x" << (old_log_jump_to + cct->_conf->bluefs_max_log_runway) << std::dec << dendl;
  int r = _allocate(vselector->select_prefer_bdev(log_file->vselector_hint),
		    cct->_conf->bluefs_max_log_runway,
		    &log_file->fnode);
  ceph_assert(r == 0);
  //adjust usage as flush below will need it
  vselector->add_usage(log_file->vselector_hint, log_file->fnode);
  dout(10) << __func__ << " log extents " << log_file->fnode.extents << dendl;

  // update the log file change and log a jump to the offset where we want to
  // write the new entries
  log_t.op_file_update(log_file->fnode);
  log_t.op_jump(log_seq, old_log_jump_to);

  flush_bdev();  // FIXME?

  // flush everything up to the jump point; entries beyond it belong to the
  // continuation region we just allocated
  _flush_and_sync_log(l, 0, old_log_jump_to);

  // 2. prepare compacted log
  bluefs_transaction_t t;
  //avoid record two times in log_t and _compact_log_dump_metadata.
  log_t.clear();
  _compact_log_dump_metadata(&t, 0);

  uint64_t max_alloc_size = std::max(alloc_size[BDEV_WAL],
				     std::max(alloc_size[BDEV_DB],
					      alloc_size[BDEV_SLOW]));

  // conservative estimate for final encoded size
  new_log_jump_to = round_up_to(t.op_bl.length() + super.block_size * 2,
				max_alloc_size);
  t.op_jump(log_seq, new_log_jump_to);

  // allocate
  //FIXME: check if we want DB here?
  r = _allocate(BlueFS::BDEV_DB, new_log_jump_to,
		&new_log->fnode);
  ceph_assert(r == 0);

  // we might have some more ops in log_t due to _allocate call
  t.claim_ops(log_t);

  bufferlist bl;
  encode(t, bl);
  _pad_bl(bl);

  dout(10) << __func__ << " new_log_jump_to 0x" << std::hex << new_log_jump_to
	   << std::dec << dendl;

  new_log_writer = _create_writer(new_log);
  new_log_writer->append(bl);

  // 3. flush
  r = _flush(new_log_writer, true);
  ceph_assert(r == 0);

  // 4. wait
  _flush_bdev_safely(new_log_writer);

  // 5. update our log fnode
  // discard first old_log_jump_to extents
  // (everything before the jump point is covered by the compacted image)

  dout(10) << __func__ << " remove 0x" << std::hex << old_log_jump_to << std::dec
	   << " of " << log_file->fnode.extents << dendl;
  uint64_t discarded = 0;
  mempool::bluefs::vector<bluefs_extent_t> old_extents;
  while (discarded < old_log_jump_to) {
    ceph_assert(!log_file->fnode.extents.empty());
    bluefs_extent_t& e = log_file->fnode.extents.front();
    bluefs_extent_t temp = e;
    if (discarded + e.length <= old_log_jump_to) {
      // whole extent falls before the jump point: drop it entirely
      dout(10) << __func__ << " remove old log extent " << e << dendl;
      discarded += e.length;
      log_file->fnode.pop_front_extent();
    } else {
      // extent straddles the jump point: trim its front only
      dout(10) << __func__ << " remove front of old log extent " << e << dendl;
      uint64_t drop = old_log_jump_to - discarded;
      temp.length = drop;
      e.offset += drop;
      e.length -= drop;
      discarded += drop;
      dout(10) << __func__ << " kept " << e << " removed " << temp << dendl;
    }
    old_extents.push_back(temp);
  }
  // splice the surviving continuation extents onto the new log
  auto from = log_file->fnode.extents.begin();
  auto to = log_file->fnode.extents.end();
  while (from != to) {
    new_log->fnode.append_extent(*from);
    ++from;
  }

  vselector->sub_usage(log_file->vselector_hint, log_file->fnode);

  // clear the extents from old log file, they are added to new log
  log_file->fnode.clear_extents();
  // swap the log files. New log file is the log file now.
  new_log->fnode.swap_extents(log_file->fnode);

  // translate the write position from the old log's offset space into the
  // new one's (the compacted image replaces the first old_log_jump_to bytes)
  log_writer->pos = log_writer->file->fnode.size =
    log_writer->pos - old_log_jump_to + new_log_jump_to;

  vselector->add_usage(log_file->vselector_hint, log_file->fnode);

  // 6. write the super block to reflect the changes
  dout(10) << __func__ << " writing super" << dendl;
  super.log_fnode = log_file->fnode;
  ++super.version;
  _write_super(BDEV_DB);

  // flushing devices may block; drop the lock while we wait
  lock.unlock();
  flush_bdev();
  lock.lock();

  // 7. release old space
  dout(10) << __func__ << " release old log extents " << old_extents << dendl;
  for (auto& r : old_extents) {
    pending_release[r.bdev].insert(r.offset, r.length);
  }

  // delete the new log, remove from the dirty files list
  _close_writer(new_log_writer);
  if (new_log->dirty_seq) {
    ceph_assert(dirty_files.count(new_log->dirty_seq));
    auto it = dirty_files[new_log->dirty_seq].iterator_to(*new_log);
    dirty_files[new_log->dirty_seq].erase(it);
  }
  new_log_writer = nullptr;
  new_log = nullptr;
  // wake any threads waiting in _flush_and_sync_log for the compaction
  log_cond.notify_all();

  dout(10) << __func__ << " log extents " << log_file->fnode.extents << dendl;
  logger->inc(l_bluefs_log_compactions);
}
2477
2478 void BlueFS::_pad_bl(bufferlist& bl)
2479 {
2480 uint64_t partial = bl.length() % super.block_size;
2481 if (partial) {
2482 dout(10) << __func__ << " padding with 0x" << std::hex
2483 << super.block_size - partial << " zeros" << std::dec << dendl;
2484 bl.append_zero(super.block_size - partial);
2485 }
2486 }
2487
2488
// Encode the pending journal transaction (log_t) plus fnode updates for
// files dirtied at this sequence, append it to the journal, sync the
// device(s), then retire now-stable dirty files and release pending extents.
//
// @param l        held BlueFS lock; dropped/retaken while waiting on I/O
// @param want_seq if non-zero, return early once that seq is already stable
// @param jump_to  if non-zero, reset log position/size to this offset after
//                 the flush (used by async log compaction)
int BlueFS::_flush_and_sync_log(std::unique_lock<ceph::mutex>& l,
				uint64_t want_seq,
				uint64_t jump_to)
{
  // only one thread flushes the log at a time
  while (log_flushing) {
    dout(10) << __func__ << " want_seq " << want_seq
	     << " log is currently flushing, waiting" << dendl;
    ceph_assert(!jump_to);
    log_cond.wait(l);
  }
  if (want_seq && want_seq <= log_seq_stable) {
    dout(10) << __func__ << " want_seq " << want_seq << " <= log_seq_stable "
	     << log_seq_stable << ", done" << dendl;
    ceph_assert(!jump_to);
    return 0;
  }
  if (log_t.empty() && dirty_files.empty()) {
    dout(10) << __func__ << " want_seq " << want_seq
	     << " " << log_t << " not dirty, dirty_files empty, no-op" << dendl;
    ceph_assert(!jump_to);
    return 0;
  }

  // take ownership of the extents waiting to be freed; they are released
  // only after this log entry is durable (see loop at the bottom)
  vector<interval_set<uint64_t>> to_release(pending_release.size());
  to_release.swap(pending_release);

  uint64_t seq = log_t.seq = ++log_seq;
  ceph_assert(want_seq == 0 || want_seq <= seq);
  log_t.uuid = super.uuid;

  // log dirty files
  auto lsi = dirty_files.find(seq);
  if (lsi != dirty_files.end()) {
    dout(20) << __func__ << " " << lsi->second.size() << " dirty_files" << dendl;
    for (auto &f : lsi->second) {
      dout(20) << __func__ << "   op_file_update " << f.fnode << dendl;
      log_t.op_file_update(f.fnode);
    }
  }

  dout(10) << __func__ << " " << log_t << dendl;
  ceph_assert(!log_t.empty());

  // allocate some more space (before we run out)?
  int64_t runway = log_writer->file->fnode.get_allocated() -
    log_writer->get_effective_write_pos();
  if (runway < (int64_t)cct->_conf->bluefs_min_log_runway) {
    dout(10) << __func__ << " allocating more log runway (0x"
	     << std::hex << runway << std::dec  << " remaining)" << dendl;
    // cannot grow the log while an async compaction is splicing it
    while (new_log_writer) {
      dout(10) << __func__ << " waiting for async compaction" << dendl;
      log_cond.wait(l);
    }
    vselector->sub_usage(log_writer->file->vselector_hint, log_writer->file->fnode);
    int r = _allocate(
      vselector->select_prefer_bdev(log_writer->file->vselector_hint),
      cct->_conf->bluefs_max_log_runway,
      &log_writer->file->fnode);
    ceph_assert(r == 0);
    vselector->add_usage(log_writer->file->vselector_hint, log_writer->file->fnode);
    log_t.op_file_update(log_writer->file->fnode);
  }

  bufferlist bl;
  bl.reserve(super.block_size);
  encode(log_t, bl);
  // pad to block boundary
  size_t realign = super.block_size - (bl.length() % super.block_size);
  if (realign && realign != super.block_size)
    bl.append_zero(realign);

  logger->inc(l_bluefs_logged_bytes, bl.length());

  log_writer->append(bl);

  log_t.clear();
  log_t.seq = 0;  // just so debug output is less confusing
  log_flushing = true;

  int r = _flush(log_writer, true);
  ceph_assert(r == 0);

  if (jump_to) {
    dout(10) << __func__ << " jumping log offset from 0x" << std::hex
	     << log_writer->pos << " -> 0x" << jump_to << std::dec << dendl;
    log_writer->pos = jump_to;
    vselector->sub_usage(log_writer->file->vselector_hint, log_writer->file->fnode.size);
    log_writer->file->fnode.size = jump_to;
    vselector->add_usage(log_writer->file->vselector_hint, log_writer->file->fnode.size);
  }

  // wait for the log write to become durable (drops the lock internally)
  _flush_bdev_safely(log_writer);

  log_flushing = false;
  log_cond.notify_all();

  // clean dirty files
  if (seq > log_seq_stable) {
    log_seq_stable = seq;
    dout(20) << __func__ << " log_seq_stable " << log_seq_stable << dendl;

    // retire every dirty-file bucket at or below the stable seq
    auto p = dirty_files.begin();
    while (p != dirty_files.end()) {
      if (p->first > log_seq_stable) {
        dout(20) << __func__ << " done cleaning up dirty files" << dendl;
        break;
      }

      auto l = p->second.begin();
      while (l != p->second.end()) {
        File *file = &*l;
        ceph_assert(file->dirty_seq > 0);
        ceph_assert(file->dirty_seq <= log_seq_stable);
        dout(20) << __func__ << " cleaned file " << file->fnode << dendl;
        file->dirty_seq = 0;
        p->second.erase(l++);
      }

      ceph_assert(p->second.empty());
      dirty_files.erase(p++);
    }
  } else {
    dout(20) << __func__ << " log_seq_stable " << log_seq_stable
	     << " already >= out seq " << seq
	     << ", we lost a race against another log flush, done" << dendl;
  }

  // now that the entry is durable, discard/release the extents we claimed
  for (unsigned i = 0; i < to_release.size(); ++i) {
    if (!to_release[i].empty()) {
      /* OK, now we have the guarantee alloc[i] won't be null. */
      int r = 0;
      if (cct->_conf->bdev_enable_discard && cct->_conf->bdev_async_discard) {
	r = bdev[i]->queue_discard(to_release[i]);
	if (r == 0)
	  continue;
      } else if (cct->_conf->bdev_enable_discard) {
	for (auto p = to_release[i].begin(); p != to_release[i].end(); ++p) {
	  bdev[i]->discard(p.get_start(), p.get_len());
	}
      }
      alloc[i]->release(to_release[i]);
    }
  }

  _update_logger_stats();

  return 0;
}
2637
2638 int BlueFS::_flush_range(FileWriter *h, uint64_t offset, uint64_t length)
2639 {
2640 dout(10) << __func__ << " " << h << " pos 0x" << std::hex << h->pos
2641 << " 0x" << offset << "~" << length << std::dec
2642 << " to " << h->file->fnode << dendl;
2643 ceph_assert(!h->file->deleted);
2644 ceph_assert(h->file->num_readers.load() == 0);
2645
2646 h->buffer_appender.flush();
2647
2648 bool buffered;
2649 if (h->file->fnode.ino == 1)
2650 buffered = false;
2651 else
2652 buffered = cct->_conf->bluefs_buffered_io;
2653
2654 if (offset + length <= h->pos)
2655 return 0;
2656 if (offset < h->pos) {
2657 length -= h->pos - offset;
2658 offset = h->pos;
2659 dout(10) << " still need 0x"
2660 << std::hex << offset << "~" << length << std::dec
2661 << dendl;
2662 }
2663 ceph_assert(offset <= h->file->fnode.size);
2664
2665 uint64_t allocated = h->file->fnode.get_allocated();
2666 vselector->sub_usage(h->file->vselector_hint, h->file->fnode);
2667 // do not bother to dirty the file if we are overwriting
2668 // previously allocated extents.
2669 bool must_dirty = false;
2670 uint64_t clear_upto = 0;
2671 if (allocated < offset + length) {
2672 // we should never run out of log space here; see the min runway check
2673 // in _flush_and_sync_log.
2674 ceph_assert(h->file->fnode.ino != 1);
2675 int r = _allocate(vselector->select_prefer_bdev(h->file->vselector_hint),
2676 offset + length - allocated,
2677 &h->file->fnode);
2678 if (r < 0) {
2679 derr << __func__ << " allocated: 0x" << std::hex << allocated
2680 << " offset: 0x" << offset << " length: 0x" << length << std::dec
2681 << dendl;
2682 vselector->add_usage(h->file->vselector_hint, h->file->fnode); // undo
2683 ceph_abort_msg("bluefs enospc");
2684 return r;
2685 }
2686 if (cct->_conf->bluefs_preextend_wal_files &&
2687 h->writer_type == WRITER_WAL) {
2688 // NOTE: this *requires* that rocksdb also has log recycling
2689 // enabled and is therefore doing robust CRCs on the log
2690 // records. otherwise, we will fail to reply the rocksdb log
2691 // properly due to garbage on the device.
2692 h->file->fnode.size = h->file->fnode.get_allocated();
2693 clear_upto = h->file->fnode.size;
2694 dout(10) << __func__ << " extending WAL size to 0x" << std::hex
2695 << h->file->fnode.size << std::dec << " to include allocated"
2696 << dendl;
2697 }
2698 must_dirty = true;
2699 }
2700 if (h->file->fnode.size < offset + length) {
2701 h->file->fnode.size = offset + length;
2702 if (h->file->fnode.ino > 1) {
2703 // we do not need to dirty the log file (or it's compacting
2704 // replacement) when the file size changes because replay is
2705 // smart enough to discover it on its own.
2706 must_dirty = true;
2707 }
2708 }
2709 if (must_dirty) {
2710 h->file->fnode.mtime = ceph_clock_now();
2711 ceph_assert(h->file->fnode.ino >= 1);
2712 if (h->file->dirty_seq == 0) {
2713 h->file->dirty_seq = log_seq + 1;
2714 dirty_files[h->file->dirty_seq].push_back(*h->file);
2715 dout(20) << __func__ << " dirty_seq = " << log_seq + 1
2716 << " (was clean)" << dendl;
2717 } else {
2718 if (h->file->dirty_seq != log_seq + 1) {
2719 // need re-dirty, erase from list first
2720 ceph_assert(dirty_files.count(h->file->dirty_seq));
2721 auto it = dirty_files[h->file->dirty_seq].iterator_to(*h->file);
2722 dirty_files[h->file->dirty_seq].erase(it);
2723 h->file->dirty_seq = log_seq + 1;
2724 dirty_files[h->file->dirty_seq].push_back(*h->file);
2725 dout(20) << __func__ << " dirty_seq = " << log_seq + 1
2726 << " (was " << h->file->dirty_seq << ")" << dendl;
2727 } else {
2728 dout(20) << __func__ << " dirty_seq = " << log_seq + 1
2729 << " (unchanged, do nothing) " << dendl;
2730 }
2731 }
2732 }
2733 dout(20) << __func__ << " file now " << h->file->fnode << dendl;
2734
2735 uint64_t x_off = 0;
2736 auto p = h->file->fnode.seek(offset, &x_off);
2737 ceph_assert(p != h->file->fnode.extents.end());
2738 dout(20) << __func__ << " in " << *p << " x_off 0x"
2739 << std::hex << x_off << std::dec << dendl;
2740
2741 unsigned partial = x_off & ~super.block_mask();
2742 bufferlist bl;
2743 if (partial) {
2744 dout(20) << __func__ << " using partial tail 0x"
2745 << std::hex << partial << std::dec << dendl;
2746 ceph_assert(h->tail_block.length() == partial);
2747 bl.claim_append_piecewise(h->tail_block);
2748 x_off -= partial;
2749 offset -= partial;
2750 length += partial;
2751 dout(20) << __func__ << " waiting for previous aio to complete" << dendl;
2752 for (auto p : h->iocv) {
2753 if (p) {
2754 p->aio_wait();
2755 }
2756 }
2757 }
2758 if (length == partial + h->buffer.length() || clear_upto != 0) {
2759 /* in case of inital allocation and need to zero, limited flush is unacceptable */
2760 bl.claim_append_piecewise(h->buffer);
2761 } else {
2762 bufferlist t;
2763 h->buffer.splice(0, length, &t);
2764 bl.claim_append_piecewise(t);
2765 t.substr_of(h->buffer, length, h->buffer.length() - length);
2766 h->buffer.swap(t);
2767 dout(20) << " leaving 0x" << std::hex << h->buffer.length() << std::dec
2768 << " unflushed" << dendl;
2769 }
2770 ceph_assert(bl.length() == length);
2771
2772 h->pos = offset + length;
2773
2774 unsigned tail = bl.length() & ~super.block_mask();
2775 if (tail) {
2776 dout(20) << __func__ << " caching tail of 0x"
2777 << std::hex << tail
2778 << " and padding block with 0x" << (super.block_size - tail)
2779 << std::dec << dendl;
2780 h->tail_block.substr_of(bl, bl.length() - tail, tail);
2781 bl.append_zero(super.block_size - tail);
2782 length += super.block_size - tail;
2783 } else {
2784 h->tail_block.clear();
2785 }
2786 if (clear_upto != 0) {
2787 if (offset + length < clear_upto) {
2788 dout(20) << __func__ << " zeroing WAL log up to 0x"
2789 << std::hex << clear_upto
2790 << std::dec << dendl;
2791 bl.append_zero(clear_upto - (offset + length));
2792 length += clear_upto - (offset + length);
2793 }
2794 }
2795 ceph_assert(bl.length() == length);
2796
2797 switch (h->writer_type) {
2798 case WRITER_WAL:
2799 logger->inc(l_bluefs_bytes_written_wal, length);
2800 break;
2801 case WRITER_SST:
2802 logger->inc(l_bluefs_bytes_written_sst, length);
2803 break;
2804 }
2805
2806 dout(30) << "dump:\n";
2807 bl.hexdump(*_dout);
2808 *_dout << dendl;
2809
2810 uint64_t bloff = 0;
2811 uint64_t bytes_written_slow = 0;
2812 while (length > 0) {
2813 uint64_t x_len = std::min(p->length - x_off, length);
2814 bufferlist t;
2815 t.substr_of(bl, bloff, x_len);
2816 if (cct->_conf->bluefs_sync_write) {
2817 bdev[p->bdev]->write(p->offset + x_off, t, buffered, h->write_hint);
2818 } else {
2819 bdev[p->bdev]->aio_write(p->offset + x_off, t, h->iocv[p->bdev], buffered, h->write_hint);
2820 }
2821 h->dirty_devs[p->bdev] = true;
2822 if (p->bdev == BDEV_SLOW) {
2823 bytes_written_slow += t.length();
2824 }
2825
2826 bloff += x_len;
2827 length -= x_len;
2828 ++p;
2829 x_off = 0;
2830 }
2831 logger->inc(l_bluefs_bytes_written_slow, bytes_written_slow);
2832 for (unsigned i = 0; i < MAX_BDEV; ++i) {
2833 if (bdev[i]) {
2834 if (h->iocv[i] && h->iocv[i]->has_pending_aios()) {
2835 bdev[i]->aio_submit(h->iocv[i]);
2836 }
2837 }
2838 }
2839 vselector->add_usage(h->file->vselector_hint, h->file->fnode);
2840 dout(20) << __func__ << " h " << h << " pos now 0x"
2841 << std::hex << h->pos << std::dec << dendl;
2842 return 0;
2843 }
2844
2845 #ifdef HAVE_LIBAIO
2846 // we need to retire old completed aios so they don't stick around in
2847 // memory indefinitely (along with their bufferlist refs).
2848 void BlueFS::_claim_completed_aios(FileWriter *h, list<aio_t> *ls)
2849 {
2850 for (auto p : h->iocv) {
2851 if (p) {
2852 ls->splice(ls->end(), p->running_aios);
2853 }
2854 }
2855 dout(10) << __func__ << " got " << ls->size() << " aios" << dendl;
2856 }
2857
2858 void BlueFS::wait_for_aio(FileWriter *h)
2859 {
2860 // NOTE: this is safe to call without a lock, as long as our reference is
2861 // stable.
2862 dout(10) << __func__ << " " << h << dendl;
2863 utime_t start = ceph_clock_now();
2864 for (auto p : h->iocv) {
2865 if (p) {
2866 p->aio_wait();
2867 }
2868 }
2869 dout(10) << __func__ << " " << h << " done in " << (ceph_clock_now() - start) << dendl;
2870 }
2871 #endif
2872
2873 int BlueFS::_flush(FileWriter *h, bool force)
2874 {
2875 h->buffer_appender.flush();
2876 uint64_t length = h->buffer.length();
2877 uint64_t offset = h->pos;
2878 if (!force &&
2879 length < cct->_conf->bluefs_min_flush_size) {
2880 dout(10) << __func__ << " " << h << " ignoring, length " << length
2881 << " < min_flush_size " << cct->_conf->bluefs_min_flush_size
2882 << dendl;
2883 return 0;
2884 }
2885 if (length == 0) {
2886 dout(10) << __func__ << " " << h << " no dirty data on "
2887 << h->file->fnode << dendl;
2888 return 0;
2889 }
2890 dout(10) << __func__ << " " << h << " 0x"
2891 << std::hex << offset << "~" << length << std::dec
2892 << " to " << h->file->fnode << dendl;
2893 ceph_assert(h->pos <= h->file->fnode.size);
2894 return _flush_range(h, offset, length);
2895 }
2896
// Shrink the file behind writer h to exactly `offset` bytes.  Growing via
// truncate is not supported (aborts), and internal log files (ino 1) are
// never truncated.  Any buffered data is flushed first; the size change is
// recorded in the journal via op_file_update.
int BlueFS::_truncate(FileWriter *h, uint64_t offset)
{
  dout(10) << __func__ << " 0x" << std::hex << offset << std::dec
           << " file " << h->file->fnode << dendl;
  if (h->file->deleted) {
    dout(10) << __func__ << " deleted, no-op" << dendl;
    return 0;
  }

  // we never truncate internal log files
  ceph_assert(h->file->fnode.ino > 1);

  h->buffer_appender.flush();

  // truncate off unflushed data?
  // (callers are not expected to hit this; kept defensively and aborts)
  if (h->pos < offset &&
      h->pos + h->buffer.length() > offset) {
    bufferlist t;
    dout(20) << __func__ << " tossing out last " << offset - h->pos
	     << " unflushed bytes" << dendl;
    t.substr_of(h->buffer, 0, offset - h->pos);
    h->buffer.swap(t);
    ceph_abort_msg("actually this shouldn't happen");
  }
  // make sure everything buffered is on disk before changing the size
  if (h->buffer.length()) {
    int r = _flush(h, true);
    if (r < 0)
      return r;
  }
  if (offset == h->file->fnode.size) {
    return 0;  // no-op!
  }
  if (offset > h->file->fnode.size) {
    ceph_abort_msg("truncate up not supported");
  }
  ceph_assert(h->file->fnode.size >= offset);
  // keep the volume selector's accounting in step with the size change
  vselector->sub_usage(h->file->vselector_hint, h->file->fnode.size);
  h->file->fnode.size = offset;
  vselector->add_usage(h->file->vselector_hint, h->file->fnode.size);
  log_t.op_file_update(h->file->fnode);
  return 0;
}
2939
2940 int BlueFS::_fsync(FileWriter *h, std::unique_lock<ceph::mutex>& l)
2941 {
2942 dout(10) << __func__ << " " << h << " " << h->file->fnode << dendl;
2943 int r = _flush(h, true);
2944 if (r < 0)
2945 return r;
2946 uint64_t old_dirty_seq = h->file->dirty_seq;
2947
2948 _flush_bdev_safely(h);
2949
2950 if (old_dirty_seq) {
2951 uint64_t s = log_seq;
2952 dout(20) << __func__ << " file metadata was dirty (" << old_dirty_seq
2953 << ") on " << h->file->fnode << ", flushing log" << dendl;
2954 _flush_and_sync_log(l, old_dirty_seq);
2955 ceph_assert(h->file->dirty_seq == 0 || // cleaned
2956 h->file->dirty_seq > s); // or redirtied by someone else
2957 }
2958 return 0;
2959 }
2960
// Flush the devices writer h has dirtied, dropping the BlueFS lock while
// waiting so other threads can make progress.  In AIO mode the writer's
// completed aios are claimed first (keeping their bufferlists alive until
// the wait returns) before the device flush.
void BlueFS::_flush_bdev_safely(FileWriter *h)
{
  // snapshot-and-reset the dirty set under the lock; we flush the snapshot
  std::array<bool, MAX_BDEV> flush_devs = h->dirty_devs;
  h->dirty_devs.fill(false);
#ifdef HAVE_LIBAIO
  if (!cct->_conf->bluefs_sync_write) {
    list<aio_t> completed_ios;
    _claim_completed_aios(h, &completed_ios);
    lock.unlock();
    wait_for_aio(h);
    completed_ios.clear();
    flush_bdev(flush_devs);
    lock.lock();
  } else
#endif
  {
    lock.unlock();
    flush_bdev(flush_devs);
    lock.lock();
  }
}
2982
2983 void BlueFS::flush_bdev(std::array<bool, MAX_BDEV>& dirty_bdevs)
2984 {
2985 // NOTE: this is safe to call without a lock.
2986 dout(20) << __func__ << dendl;
2987 for (unsigned i = 0; i < MAX_BDEV; i++) {
2988 if (dirty_bdevs[i])
2989 bdev[i]->flush();
2990 }
2991 }
2992
2993 void BlueFS::flush_bdev()
2994 {
2995 // NOTE: this is safe to call without a lock.
2996 dout(20) << __func__ << dendl;
2997 for (auto p : bdev) {
2998 if (p)
2999 p->flush();
3000 }
3001 }
3002
3003 const char* BlueFS::get_device_name(unsigned id)
3004 {
3005 if (id >= MAX_BDEV) return "BDEV_INV";
3006 const char* names[] = {"BDEV_WAL", "BDEV_DB", "BDEV_SLOW", "BDEV_NEWWAL", "BDEV_NEWDB"};
3007 return names[id];
3008 }
3009
3010 int BlueFS::_expand_slow_device(uint64_t need, PExtentVector& extents)
3011 {
3012 int r = -ENOSPC;
3013 if (slow_dev_expander) {
3014 int id = _get_slow_device_id();
3015 auto min_alloc_size = alloc_size[id];
3016 ceph_assert(id <= (int)alloc.size() && alloc[id]);
3017 auto min_need = round_up_to(need, min_alloc_size);
3018 need = std::max(need,
3019 slow_dev_expander->get_recommended_expansion_delta(
3020 alloc[id]->get_free(), block_all[id].size()));
3021
3022 need = round_up_to(need, min_alloc_size);
3023 dout(10) << __func__ << " expanding slow device by 0x"
3024 << std::hex << need << std::dec
3025 << dendl;
3026 r = slow_dev_expander->allocate_freespace(min_need, need, extents);
3027 }
3028 return r;
3029 }
3030
3031 int BlueFS::_allocate_without_fallback(uint8_t id, uint64_t len,
3032 PExtentVector* extents)
3033 {
3034 dout(10) << __func__ << " len 0x" << std::hex << len << std::dec
3035 << " from " << (int)id << dendl;
3036 assert(id < alloc.size());
3037 if (!alloc[id]) {
3038 return -ENOENT;
3039 }
3040 extents->reserve(4); // 4 should be (more than) enough for most allocations
3041 uint64_t min_alloc_size = alloc_size[id];
3042 uint64_t left = round_up_to(len, min_alloc_size);
3043 int64_t alloc_len = alloc[id]->allocate(left, min_alloc_size, 0, extents);
3044 if (alloc_len < 0 || alloc_len < (int64_t)left) {
3045 if (alloc_len > 0) {
3046 alloc[id]->release(*extents);
3047 }
3048 if (bdev[id])
3049 derr << __func__ << " failed to allocate 0x" << std::hex << left
3050 << " on bdev " << (int)id
3051 << ", free 0x" << alloc[id]->get_free() << std::dec << dendl;
3052 else
3053 derr << __func__ << " failed to allocate 0x" << std::hex << left
3054 << " on bdev " << (int)id << ", dne" << std::dec << dendl;
3055 if (alloc[id])
3056 alloc[id]->dump();
3057 return -ENOSPC;
3058 }
3059
3060 return 0;
3061 }
3062
3063 int BlueFS::_allocate(uint8_t id, uint64_t len,
3064 bluefs_fnode_t* node)
3065 {
3066 dout(10) << __func__ << " len 0x" << std::hex << len << std::dec
3067 << " from " << (int)id << dendl;
3068 ceph_assert(id < alloc.size());
3069 int64_t alloc_len = 0;
3070 PExtentVector extents;
3071 uint64_t hint = 0;
3072 if (alloc[id]) {
3073 if (!node->extents.empty() && node->extents.back().bdev == id) {
3074 hint = node->extents.back().end();
3075 }
3076 extents.reserve(4); // 4 should be (more than) enough for most allocations
3077 alloc_len = alloc[id]->allocate(round_up_to(len, alloc_size[id]),
3078 alloc_size[id], hint, &extents);
3079 }
3080 if (!alloc[id] ||
3081 alloc_len < 0 ||
3082 alloc_len < (int64_t)round_up_to(len, alloc_size[id])) {
3083 if (alloc_len > 0) {
3084 alloc[id]->release(extents);
3085 }
3086 if (id != BDEV_SLOW) {
3087 if (bdev[id]) {
3088 dout(1) << __func__ << " failed to allocate 0x" << std::hex << len
3089 << " on bdev " << (int)id
3090 << ", free 0x" << alloc[id]->get_free()
3091 << "; fallback to bdev " << (int)id + 1
3092 << std::dec << dendl;
3093 }
3094 return _allocate(id + 1, len, node);
3095 }
3096 dout(1) << __func__ << " unable to allocate 0x" << std::hex << len
3097 << " on bdev " << (int)id << ", free 0x"
3098 << (alloc[id] ? alloc[id]->get_free() : (uint64_t)-1)
3099 << "; fallback to slow device expander "
3100 << std::dec << dendl;
3101 extents.clear();
3102 if (_expand_slow_device(len, extents) == 0) {
3103 id = _get_slow_device_id();
3104 for (auto& e : extents) {
3105 _add_block_extent(id, e.offset, e.length);
3106 }
3107 extents.clear();
3108 auto* last_alloc = alloc[id];
3109 ceph_assert(last_alloc);
3110 // try again
3111 alloc_len = last_alloc->allocate(round_up_to(len, alloc_size[id]),
3112 alloc_size[id], hint, &extents);
3113 if (alloc_len < 0 || alloc_len < (int64_t)len) {
3114 if (alloc_len > 0) {
3115 last_alloc->release(extents);
3116 }
3117 derr << __func__ << " failed to allocate 0x" << std::hex << len
3118 << " on bdev " << (int)id
3119 << ", free 0x" << last_alloc->get_free() << std::dec << dendl;
3120 return -ENOSPC;
3121 }
3122 } else {
3123 derr << __func__ << " failed to expand slow device to fit +0x"
3124 << std::hex << len << std::dec
3125 << dendl;
3126 return -ENOSPC;
3127 }
3128 } else {
3129 uint64_t total_allocated =
3130 block_all[id].size() - alloc[id]->get_free();
3131 if (max_bytes[id] < total_allocated) {
3132 logger->set(max_bytes_pcounters[id], total_allocated);
3133 max_bytes[id] = total_allocated;
3134 }
3135 }
3136
3137 for (auto& p : extents) {
3138 node->append_extent(bluefs_extent_t(id, p.offset, p.length));
3139 }
3140
3141 return 0;
3142 }
3143
3144 int BlueFS::_preallocate(FileRef f, uint64_t off, uint64_t len)
3145 {
3146 dout(10) << __func__ << " file " << f->fnode << " 0x"
3147 << std::hex << off << "~" << len << std::dec << dendl;
3148 if (f->deleted) {
3149 dout(10) << __func__ << " deleted, no-op" << dendl;
3150 return 0;
3151 }
3152 ceph_assert(f->fnode.ino > 1);
3153 uint64_t allocated = f->fnode.get_allocated();
3154 if (off + len > allocated) {
3155 uint64_t want = off + len - allocated;
3156 vselector->sub_usage(f->vselector_hint, f->fnode);
3157
3158 int r = _allocate(vselector->select_prefer_bdev(f->vselector_hint),
3159 want,
3160 &f->fnode);
3161 vselector->add_usage(f->vselector_hint, f->fnode);
3162 if (r < 0)
3163 return r;
3164 log_t.op_file_update(f->fnode);
3165 }
3166 return 0;
3167 }
3168
// Flush any pending metadata (journal events and dirty files) and fsync the
// BlueFS log to disk; afterwards, compact the log if it has grown too large.
// Takes the global BlueFS lock; the unique_lock is handed to helpers that
// may need to drop/reacquire it while waiting on I/O.
void BlueFS::sync_metadata()
{
  std::unique_lock l(lock);
  if (log_t.empty() && dirty_files.empty()) {
    dout(10) << __func__ << " - no pending log events" << dendl;
  } else {
    dout(10) << __func__ << dendl;
    utime_t start = ceph_clock_now();
    flush_bdev(); // FIXME?
    _flush_and_sync_log(l);
    dout(10) << __func__ << " done in " << (ceph_clock_now() - start) << dendl;
  }

  // compact synchronously or asynchronously, per config
  if (_should_compact_log()) {
    if (cct->_conf->bluefs_compact_log_sync) {
      _compact_log_sync();
    } else {
      _compact_log_async(l);
    }
  }
}
3190
// Open (and possibly create) dirname/filename for writing.
//
// Semantics of `overwrite`:
//  - file absent,  overwrite=true  -> -ENOENT (caller expected it to exist)
//  - file absent,  overwrite=false -> create a new file
//  - file present, overwrite=true  -> reuse the fnode in place (no truncate)
//  - file present, overwrite=false -> truncate, release extents, rewrite
//
// On success *h is a new FileWriter; the caller releases it via
// _close_writer().  The directory must already exist.
int BlueFS::open_for_write(
  const string& dirname,
  const string& filename,
  FileWriter **h,
  bool overwrite)
{
  std::lock_guard l(lock);
  dout(10) << __func__ << " " << dirname << "/" << filename << dendl;
  map<string,DirRef>::iterator p = dir_map.find(dirname);
  DirRef dir;
  if (p == dir_map.end()) {
    // directories are NOT implicitly created; caller must mkdir() first
    dout(20) << __func__ << " dir " << dirname
	     << " does not exist" << dendl;
    return -ENOENT;
  } else {
    dir = p->second;
  }

  FileRef file;
  bool create = false;
  map<string,FileRef>::iterator q = dir->file_map.find(filename);
  if (q == dir->file_map.end()) {
    if (overwrite) {
      // overwrite requested, but there is nothing to overwrite
      dout(20) << __func__ << " dir " << dirname << " (" << dir
	       << ") file " << filename
	       << " does not exist" << dendl;
      return -ENOENT;
    }
    file = ceph::make_ref<File>();
    file->fnode.ino = ++ino_last;
    file_map[ino_last] = file;
    dir->file_map[filename] = file;
    ++file->refs;
    create = true;
  } else {
    // overwrite existing file?
    file = q->second;
    if (overwrite) {
      dout(20) << __func__ << " dir " << dirname << " (" << dir
	       << ") file " << filename
	       << " already exists, overwrite in place" << dendl;
    } else {
      dout(20) << __func__ << " dir " << dirname << " (" << dir
	       << ") file " << filename
	       << " already exists, truncate + overwrite" << dendl;
      vselector->sub_usage(file->vselector_hint, file->fnode);
      file->fnode.size = 0;
      // old extents are released once the log transaction commits
      for (auto& p : file->fnode.extents) {
	pending_release[p.bdev].insert(p.offset, p.length);
      }

      file->fnode.clear_extents();
    }
  }
  ceph_assert(file->fnode.ino > 1);

  file->fnode.mtime = ceph_clock_now();
  file->vselector_hint = vselector->get_hint_by_dir(dirname);

  dout(20) << __func__ << " mapping " << dirname << "/" << filename
	   << " vsel_hint " << file->vselector_hint
	   << dendl;

  log_t.op_file_update(file->fnode);
  if (create)
    log_t.op_dir_link(dirname, filename, file->fnode.ino);

  *h = _create_writer(file);

  // classify the writer by filename extension for perf counters
  if (boost::algorithm::ends_with(filename, ".log")) {
    (*h)->writer_type = BlueFS::WRITER_WAL;
    if (logger && !overwrite) {
      logger->inc(l_bluefs_files_written_wal);
    }
  } else if (boost::algorithm::ends_with(filename, ".sst")) {
    (*h)->writer_type = BlueFS::WRITER_SST;
    if (logger) {
      logger->inc(l_bluefs_files_written_sst);
    }
  }

  dout(10) << __func__ << " h " << *h << " on " << file->fnode << dendl;
  return 0;
}
3276
3277 BlueFS::FileWriter *BlueFS::_create_writer(FileRef f)
3278 {
3279 FileWriter *w = new FileWriter(f);
3280 for (unsigned i = 0; i < MAX_BDEV; ++i) {
3281 if (bdev[i]) {
3282 w->iocv[i] = new IOContext(cct, NULL);
3283 }
3284 }
3285 return w;
3286 }
3287
3288 void BlueFS::_close_writer(FileWriter *h)
3289 {
3290 dout(10) << __func__ << " " << h << " type " << h->writer_type << dendl;
3291 for (unsigned i=0; i<MAX_BDEV; ++i) {
3292 if (bdev[i]) {
3293 if (h->iocv[i]) {
3294 h->iocv[i]->aio_wait();
3295 bdev[i]->queue_reap_ioc(h->iocv[i]);
3296 }
3297 }
3298 }
3299 delete h;
3300 }
3301
3302 int BlueFS::open_for_read(
3303 const string& dirname,
3304 const string& filename,
3305 FileReader **h,
3306 bool random)
3307 {
3308 std::lock_guard l(lock);
3309 dout(10) << __func__ << " " << dirname << "/" << filename
3310 << (random ? " (random)":" (sequential)") << dendl;
3311 map<string,DirRef>::iterator p = dir_map.find(dirname);
3312 if (p == dir_map.end()) {
3313 dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
3314 return -ENOENT;
3315 }
3316 DirRef dir = p->second;
3317
3318 map<string,FileRef>::iterator q = dir->file_map.find(filename);
3319 if (q == dir->file_map.end()) {
3320 dout(20) << __func__ << " dir " << dirname << " (" << dir
3321 << ") file " << filename
3322 << " not found" << dendl;
3323 return -ENOENT;
3324 }
3325 File *file = q->second.get();
3326
3327 *h = new FileReader(file, random ? 4096 : cct->_conf->bluefs_max_prefetch,
3328 random, false);
3329 dout(10) << __func__ << " h " << *h << " on " << file->fnode << dendl;
3330 return 0;
3331 }
3332
3333 int BlueFS::rename(
3334 const string& old_dirname, const string& old_filename,
3335 const string& new_dirname, const string& new_filename)
3336 {
3337 std::lock_guard l(lock);
3338 dout(10) << __func__ << " " << old_dirname << "/" << old_filename
3339 << " -> " << new_dirname << "/" << new_filename << dendl;
3340 map<string,DirRef>::iterator p = dir_map.find(old_dirname);
3341 if (p == dir_map.end()) {
3342 dout(20) << __func__ << " dir " << old_dirname << " not found" << dendl;
3343 return -ENOENT;
3344 }
3345 DirRef old_dir = p->second;
3346 map<string,FileRef>::iterator q = old_dir->file_map.find(old_filename);
3347 if (q == old_dir->file_map.end()) {
3348 dout(20) << __func__ << " dir " << old_dirname << " (" << old_dir
3349 << ") file " << old_filename
3350 << " not found" << dendl;
3351 return -ENOENT;
3352 }
3353 FileRef file = q->second;
3354
3355 p = dir_map.find(new_dirname);
3356 if (p == dir_map.end()) {
3357 dout(20) << __func__ << " dir " << new_dirname << " not found" << dendl;
3358 return -ENOENT;
3359 }
3360 DirRef new_dir = p->second;
3361 q = new_dir->file_map.find(new_filename);
3362 if (q != new_dir->file_map.end()) {
3363 dout(20) << __func__ << " dir " << new_dirname << " (" << old_dir
3364 << ") file " << new_filename
3365 << " already exists, unlinking" << dendl;
3366 ceph_assert(q->second != file);
3367 log_t.op_dir_unlink(new_dirname, new_filename);
3368 _drop_link(q->second);
3369 }
3370
3371 dout(10) << __func__ << " " << new_dirname << "/" << new_filename << " "
3372 << " " << file->fnode << dendl;
3373
3374 new_dir->file_map[new_filename] = file;
3375 old_dir->file_map.erase(old_filename);
3376
3377 log_t.op_dir_link(new_dirname, new_filename, file->fnode.ino);
3378 log_t.op_dir_unlink(old_dirname, old_filename);
3379 return 0;
3380 }
3381
3382 int BlueFS::mkdir(const string& dirname)
3383 {
3384 std::lock_guard l(lock);
3385 dout(10) << __func__ << " " << dirname << dendl;
3386 map<string,DirRef>::iterator p = dir_map.find(dirname);
3387 if (p != dir_map.end()) {
3388 dout(20) << __func__ << " dir " << dirname << " exists" << dendl;
3389 return -EEXIST;
3390 }
3391 dir_map[dirname] = ceph::make_ref<Dir>();
3392 log_t.op_dir_create(dirname);
3393 return 0;
3394 }
3395
3396 int BlueFS::rmdir(const string& dirname)
3397 {
3398 std::lock_guard l(lock);
3399 dout(10) << __func__ << " " << dirname << dendl;
3400 map<string,DirRef>::iterator p = dir_map.find(dirname);
3401 if (p == dir_map.end()) {
3402 dout(20) << __func__ << " dir " << dirname << " does not exist" << dendl;
3403 return -ENOENT;
3404 }
3405 DirRef dir = p->second;
3406 if (!dir->file_map.empty()) {
3407 dout(20) << __func__ << " dir " << dirname << " not empty" << dendl;
3408 return -ENOTEMPTY;
3409 }
3410 dir_map.erase(dirname);
3411 log_t.op_dir_remove(dirname);
3412 return 0;
3413 }
3414
3415 bool BlueFS::dir_exists(const string& dirname)
3416 {
3417 std::lock_guard l(lock);
3418 map<string,DirRef>::iterator p = dir_map.find(dirname);
3419 bool exists = p != dir_map.end();
3420 dout(10) << __func__ << " " << dirname << " = " << (int)exists << dendl;
3421 return exists;
3422 }
3423
3424 int BlueFS::stat(const string& dirname, const string& filename,
3425 uint64_t *size, utime_t *mtime)
3426 {
3427 std::lock_guard l(lock);
3428 dout(10) << __func__ << " " << dirname << "/" << filename << dendl;
3429 map<string,DirRef>::iterator p = dir_map.find(dirname);
3430 if (p == dir_map.end()) {
3431 dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
3432 return -ENOENT;
3433 }
3434 DirRef dir = p->second;
3435 map<string,FileRef>::iterator q = dir->file_map.find(filename);
3436 if (q == dir->file_map.end()) {
3437 dout(20) << __func__ << " dir " << dirname << " (" << dir
3438 << ") file " << filename
3439 << " not found" << dendl;
3440 return -ENOENT;
3441 }
3442 File *file = q->second.get();
3443 dout(10) << __func__ << " " << dirname << "/" << filename
3444 << " " << file->fnode << dendl;
3445 if (size)
3446 *size = file->fnode.size;
3447 if (mtime)
3448 *mtime = file->fnode.mtime;
3449 return 0;
3450 }
3451
3452 int BlueFS::lock_file(const string& dirname, const string& filename,
3453 FileLock **plock)
3454 {
3455 std::lock_guard l(lock);
3456 dout(10) << __func__ << " " << dirname << "/" << filename << dendl;
3457 map<string,DirRef>::iterator p = dir_map.find(dirname);
3458 if (p == dir_map.end()) {
3459 dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
3460 return -ENOENT;
3461 }
3462 DirRef dir = p->second;
3463 map<string,FileRef>::iterator q = dir->file_map.find(filename);
3464 FileRef file;
3465 if (q == dir->file_map.end()) {
3466 dout(20) << __func__ << " dir " << dirname << " (" << dir
3467 << ") file " << filename
3468 << " not found, creating" << dendl;
3469 file = ceph::make_ref<File>();
3470 file->fnode.ino = ++ino_last;
3471 file->fnode.mtime = ceph_clock_now();
3472 file_map[ino_last] = file;
3473 dir->file_map[filename] = file;
3474 ++file->refs;
3475 log_t.op_file_update(file->fnode);
3476 log_t.op_dir_link(dirname, filename, file->fnode.ino);
3477 } else {
3478 file = q->second;
3479 if (file->locked) {
3480 dout(10) << __func__ << " already locked" << dendl;
3481 return -ENOLCK;
3482 }
3483 }
3484 file->locked = true;
3485 *plock = new FileLock(file);
3486 dout(10) << __func__ << " locked " << file->fnode
3487 << " with " << *plock << dendl;
3488 return 0;
3489 }
3490
// Release an advisory lock taken by lock_file().  The FileLock must be
// locked (asserted); the lock object itself is freed here.
int BlueFS::unlock_file(FileLock *fl)
{
  std::lock_guard l(lock);
  dout(10) << __func__ << " " << fl << " on " << fl->file->fnode << dendl;
  ceph_assert(fl->file->locked);
  fl->file->locked = false;
  delete fl;
  return 0;
}
3500
3501 int BlueFS::readdir(const string& dirname, vector<string> *ls)
3502 {
3503 std::lock_guard l(lock);
3504 dout(10) << __func__ << " " << dirname << dendl;
3505 if (dirname.empty()) {
3506 // list dirs
3507 ls->reserve(dir_map.size() + 2);
3508 for (auto& q : dir_map) {
3509 ls->push_back(q.first);
3510 }
3511 } else {
3512 // list files in dir
3513 map<string,DirRef>::iterator p = dir_map.find(dirname);
3514 if (p == dir_map.end()) {
3515 dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
3516 return -ENOENT;
3517 }
3518 DirRef dir = p->second;
3519 ls->reserve(dir->file_map.size() + 2);
3520 for (auto& q : dir->file_map) {
3521 ls->push_back(q.first);
3522 }
3523 }
3524 ls->push_back(".");
3525 ls->push_back("..");
3526 return 0;
3527 }
3528
3529 int BlueFS::unlink(const string& dirname, const string& filename)
3530 {
3531 std::lock_guard l(lock);
3532 dout(10) << __func__ << " " << dirname << "/" << filename << dendl;
3533 map<string,DirRef>::iterator p = dir_map.find(dirname);
3534 if (p == dir_map.end()) {
3535 dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
3536 return -ENOENT;
3537 }
3538 DirRef dir = p->second;
3539 map<string,FileRef>::iterator q = dir->file_map.find(filename);
3540 if (q == dir->file_map.end()) {
3541 dout(20) << __func__ << " file " << dirname << "/" << filename
3542 << " not found" << dendl;
3543 return -ENOENT;
3544 }
3545 FileRef file = q->second;
3546 if (file->locked) {
3547 dout(20) << __func__ << " file " << dirname << "/" << filename
3548 << " is locked" << dendl;
3549 return -EBUSY;
3550 }
3551 dir->file_map.erase(filename);
3552 log_t.op_dir_unlink(dirname, filename);
3553 _drop_link(file);
3554 return 0;
3555 }
3556
3557 bool BlueFS::wal_is_rotational()
3558 {
3559 if (bdev[BDEV_WAL]) {
3560 return bdev[BDEV_WAL]->is_rotational();
3561 } else if (bdev[BDEV_DB]) {
3562 return bdev[BDEV_DB]->is_rotational();
3563 }
3564 return bdev[BDEV_SLOW]->is_rotational();
3565 }
3566
3567 void BlueFS::debug_inject_duplicate_gift(unsigned id,
3568 uint64_t offset,
3569 uint64_t len)
3570 {
3571 dout(0) << __func__ << dendl;
3572 if (id < alloc.size() && alloc[id]) {
3573 alloc[id]->init_add_free(offset, len);
3574 }
3575 }
3576
3577 // ===============================================
3578 // OriginalVolumeSelector
3579
3580 void* OriginalVolumeSelector::get_hint_by_device(uint8_t dev) const {
3581 return reinterpret_cast<void*>(dev);
3582 }
3583 void* OriginalVolumeSelector::get_hint_by_dir(const string& dirname) const {
3584 uint8_t res = BlueFS::BDEV_DB;
3585 if (dirname.length() > 5) {
3586 // the "db.slow" and "db.wal" directory names are hard-coded at
3587 // match up with bluestore. the slow device is always the second
3588 // one (when a dedicated block.db device is present and used at
3589 // bdev 0). the wal device is always last.
3590 if (boost::algorithm::ends_with(dirname, ".slow")) {
3591 res = BlueFS::BDEV_SLOW;
3592 }
3593 else if (boost::algorithm::ends_with(dirname, ".wal")) {
3594 res = BlueFS::BDEV_WAL;
3595 }
3596 }
3597 return reinterpret_cast<void*>(res);
3598 }
3599
3600 uint8_t OriginalVolumeSelector::select_prefer_bdev(void* hint)
3601 {
3602 return (uint8_t)(reinterpret_cast<uint64_t>(hint));
3603 }
3604
// Report (path, capacity) pairs: the primary db volume at `base` and the
// spillover volume at "<base>.slow".
void OriginalVolumeSelector::get_paths(const std::string& base, paths& res) const
{
  res.emplace_back(base, db_total);
  res.emplace_back(base + ".slow", slow_total);
}
3610
3611 #undef dout_prefix
3612 #define dout_prefix *_dout << "OriginalVolumeSelector: "
3613
// Dump the selector's per-volume capacity totals (for debug/admin output).
void OriginalVolumeSelector::dump(ostream& sout) {
  sout<< "wal_total:" << wal_total
    << ", db_total:" << db_total
    << ", slow_total:" << slow_total
    << std::endl;
}