]> git.proxmox.com Git - ceph.git/blame - ceph/src/os/bluestore/BlueFS.cc
update download target update for octopus release
[ceph.git] / ceph / src / os / bluestore / BlueFS.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3
4#include "boost/algorithm/string.hpp"
5#include "BlueFS.h"
6
7#include "common/debug.h"
8#include "common/errno.h"
9#include "common/perf_counters.h"
10#include "BlockDevice.h"
11#include "Allocator.h"
11fdf7f2 12#include "include/ceph_assert.h"
eafe8130 13#include "common/admin_socket.h"
7c673cae
FG
14
15#define dout_context cct
16#define dout_subsys ceph_subsys_bluefs
17#undef dout_prefix
18#define dout_prefix *_dout << "bluefs "
19
20MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::File, bluefs_file, bluefs);
21MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::Dir, bluefs_dir, bluefs);
22MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileWriter, bluefs_file_writer, bluefs);
23MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileReaderBuffer,
24 bluefs_file_reader_buffer, bluefs);
25MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileReader, bluefs_file_reader, bluefs);
26MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileLock, bluefs_file_lock, bluefs);
27
11fdf7f2
TL
28static void wal_discard_cb(void *priv, void* priv2) {
29 BlueFS *bluefs = static_cast<BlueFS*>(priv);
30 interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2);
31 bluefs->handle_discard(BlueFS::BDEV_WAL, *tmp);
32}
33
34static void db_discard_cb(void *priv, void* priv2) {
35 BlueFS *bluefs = static_cast<BlueFS*>(priv);
36 interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2);
37 bluefs->handle_discard(BlueFS::BDEV_DB, *tmp);
38}
39
40static void slow_discard_cb(void *priv, void* priv2) {
41 BlueFS *bluefs = static_cast<BlueFS*>(priv);
42 interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2);
43 bluefs->handle_discard(BlueFS::BDEV_SLOW, *tmp);
44}
7c673cae 45
eafe8130
TL
46class BlueFS::SocketHook : public AdminSocketHook {
47 BlueFS* bluefs;
48public:
49 static BlueFS::SocketHook* create(BlueFS* bluefs)
50 {
51 BlueFS::SocketHook* hook = nullptr;
52 AdminSocket* admin_socket = bluefs->cct->get_admin_socket();
53 if (admin_socket) {
54 hook = new BlueFS::SocketHook(bluefs);
55 int r = admin_socket->register_command("bluestore bluefs available",
56 "bluestore bluefs available "
57 "name=alloc_size,type=CephInt,req=false",
58 hook,
59 "Report available space for bluefs. "
60 "If alloc_size set, make simulation.");
61 if (r != 0) {
62 ldout(bluefs->cct, 1) << __func__ << " cannot register SocketHook" << dendl;
63 delete hook;
64 hook = nullptr;
65 }
66 }
67 return hook;
68 }
69
70 ~SocketHook() {
71 AdminSocket* admin_socket = bluefs->cct->get_admin_socket();
72 int r = admin_socket->unregister_command("bluestore bluefs available");
73 ceph_assert(r == 0);
74 }
75private:
76 SocketHook(BlueFS* bluefs) :
77 bluefs(bluefs) {}
78 bool call(std::string_view command, const cmdmap_t& cmdmap,
79 std::string_view format, bufferlist& out) override {
80 stringstream ss;
81 bool r = true;
82 if (command == "bluestore bluefs available") {
83 int64_t alloc_size = 0;
84 cmd_getval(bluefs->cct, cmdmap, "alloc_size", alloc_size);
85 if ((alloc_size & (alloc_size - 1)) != 0) {
86 ss << "Invalid allocation size:'" << alloc_size << std::endl;
87 }
88 if (alloc_size == 0)
89 alloc_size = bluefs->cct->_conf->bluefs_alloc_size;
90 Formatter *f = Formatter::create(format, "json-pretty", "json-pretty");
91 f->open_object_section("bluefs_available_space");
92 for (unsigned dev = BDEV_WAL; dev <= BDEV_SLOW; dev++) {
93 if (bluefs->bdev[dev]) {
94 f->open_object_section("dev");
95 f->dump_string("device", bluefs->get_device_name(dev));
96 ceph_assert(bluefs->alloc[dev]);
97 f->dump_int("free", bluefs->alloc[dev]->get_free());
98 f->close_section();
99 }
100 }
101 size_t extra_space = 0;
102 if (bluefs->slow_dev_expander) {
103 extra_space = bluefs->slow_dev_expander->available_freespace(alloc_size);
104 }
105 f->dump_int("available_from_bluestore", extra_space);
106 f->close_section();
107 f->flush(ss);
108 delete f;
109 } else {
110 ss << "Invalid command" << std::endl;
111 r = false;
112 }
113 out.append(ss);
114 return r;
115 }
116};
117
7c673cae
FG
118BlueFS::BlueFS(CephContext* cct)
119 : cct(cct),
120 bdev(MAX_BDEV),
121 ioc(MAX_BDEV),
11fdf7f2 122 block_all(MAX_BDEV)
7c673cae 123{
11fdf7f2
TL
124 discard_cb[BDEV_WAL] = wal_discard_cb;
125 discard_cb[BDEV_DB] = db_discard_cb;
126 discard_cb[BDEV_SLOW] = slow_discard_cb;
eafe8130 127 asok_hook = SocketHook::create(this);
7c673cae
FG
128}
129
130BlueFS::~BlueFS()
131{
eafe8130 132 delete asok_hook;
7c673cae
FG
133 for (auto p : ioc) {
134 if (p)
135 p->aio_wait();
136 }
137 for (auto p : bdev) {
138 if (p) {
139 p->close();
140 delete p;
141 }
142 }
143 for (auto p : ioc) {
144 delete p;
145 }
146}
147
148void BlueFS::_init_logger()
149{
150 PerfCountersBuilder b(cct, "bluefs",
151 l_bluefs_first, l_bluefs_last);
152 b.add_u64_counter(l_bluefs_gift_bytes, "gift_bytes",
11fdf7f2 153 "Bytes gifted from BlueStore", NULL, 0, unit_t(UNIT_BYTES));
7c673cae 154 b.add_u64_counter(l_bluefs_reclaim_bytes, "reclaim_bytes",
11fdf7f2 155 "Bytes reclaimed by BlueStore", NULL, 0, unit_t(UNIT_BYTES));
7c673cae
FG
156 b.add_u64(l_bluefs_db_total_bytes, "db_total_bytes",
157 "Total bytes (main db device)",
11fdf7f2 158 "b", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
7c673cae
FG
159 b.add_u64(l_bluefs_db_used_bytes, "db_used_bytes",
160 "Used bytes (main db device)",
11fdf7f2 161 "u", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
7c673cae
FG
162 b.add_u64(l_bluefs_wal_total_bytes, "wal_total_bytes",
163 "Total bytes (wal device)",
11fdf7f2 164 "walb", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
7c673cae
FG
165 b.add_u64(l_bluefs_wal_used_bytes, "wal_used_bytes",
166 "Used bytes (wal device)",
11fdf7f2 167 "walu", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
7c673cae
FG
168 b.add_u64(l_bluefs_slow_total_bytes, "slow_total_bytes",
169 "Total bytes (slow device)",
11fdf7f2 170 "slob", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
7c673cae
FG
171 b.add_u64(l_bluefs_slow_used_bytes, "slow_used_bytes",
172 "Used bytes (slow device)",
11fdf7f2 173 "slou", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
7c673cae
FG
174 b.add_u64(l_bluefs_num_files, "num_files", "File count",
175 "f", PerfCountersBuilder::PRIO_USEFUL);
176 b.add_u64(l_bluefs_log_bytes, "log_bytes", "Size of the metadata log",
11fdf7f2 177 "jlen", PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));
7c673cae
FG
178 b.add_u64_counter(l_bluefs_log_compactions, "log_compactions",
179 "Compactions of the metadata log");
180 b.add_u64_counter(l_bluefs_logged_bytes, "logged_bytes",
181 "Bytes written to the metadata log", "j",
11fdf7f2 182 PerfCountersBuilder::PRIO_CRITICAL, unit_t(UNIT_BYTES));
7c673cae
FG
183 b.add_u64_counter(l_bluefs_files_written_wal, "files_written_wal",
184 "Files written to WAL");
185 b.add_u64_counter(l_bluefs_files_written_sst, "files_written_sst",
186 "Files written to SSTs");
187 b.add_u64_counter(l_bluefs_bytes_written_wal, "bytes_written_wal",
188 "Bytes written to WAL", "wal",
189 PerfCountersBuilder::PRIO_CRITICAL);
190 b.add_u64_counter(l_bluefs_bytes_written_sst, "bytes_written_sst",
191 "Bytes written to SSTs", "sst",
11fdf7f2
TL
192 PerfCountersBuilder::PRIO_CRITICAL, unit_t(UNIT_BYTES));
193 b.add_u64_counter(l_bluefs_bytes_written_slow, "bytes_written_slow",
194 "Bytes written to WAL/SSTs at slow device", NULL,
195 PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
196 b.add_u64_counter(l_bluefs_max_bytes_wal, "max_bytes_wal",
197 "Maximum bytes allocated from WAL");
198 b.add_u64_counter(l_bluefs_max_bytes_db, "max_bytes_db",
199 "Maximum bytes allocated from DB");
200 b.add_u64_counter(l_bluefs_max_bytes_slow, "max_bytes_slow",
201 "Maximum bytes allocated from SLOW");
494da23a
TL
202
203 b.add_u64_counter(l_bluefs_read_random_count, "read_random_count",
204 "random read requests processed");
205 b.add_u64_counter(l_bluefs_read_random_bytes, "read_random_bytes",
206 "Bytes requested in random read mode", NULL,
207 PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
208 b.add_u64_counter(l_bluefs_read_random_disk_count, "read_random_disk_count",
209 "random reads requests going to disk");
210 b.add_u64_counter(l_bluefs_read_random_disk_bytes, "read_random_disk_bytes",
211 "Bytes read from disk in random read mode", NULL,
212 PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
213 b.add_u64_counter(l_bluefs_read_random_buffer_count, "read_random_buffer_count",
214 "random read requests processed using prefetch buffer");
215 b.add_u64_counter(l_bluefs_read_random_buffer_bytes, "read_random_buffer_bytes",
216 "Bytes read from prefetch buffer in random read mode", NULL,
217 PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
218
219 b.add_u64_counter(l_bluefs_read_count, "read_count",
220 "buffered read requests processed");
221 b.add_u64_counter(l_bluefs_read_bytes, "read_bytes",
222 "Bytes requested in buffered read mode", NULL,
223 PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
224
225 b.add_u64_counter(l_bluefs_read_prefetch_count, "read_prefetch_count",
226 "prefetch read requests processed");
227 b.add_u64_counter(l_bluefs_read_prefetch_bytes, "read_prefetch_bytes",
228 "Bytes requested in prefetch read mode", NULL,
229 PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
230
7c673cae
FG
231 logger = b.create_perf_counters();
232 cct->get_perfcounters_collection()->add(logger);
233}
234
235void BlueFS::_shutdown_logger()
236{
237 cct->get_perfcounters_collection()->remove(logger);
238 delete logger;
239}
240
241void BlueFS::_update_logger_stats()
242{
243 // we must be holding the lock
244 logger->set(l_bluefs_num_files, file_map.size());
245 logger->set(l_bluefs_log_bytes, log_writer->file->fnode.size);
246
247 if (alloc[BDEV_WAL]) {
11fdf7f2 248 logger->set(l_bluefs_wal_total_bytes, block_all[BDEV_WAL].size());
7c673cae 249 logger->set(l_bluefs_wal_used_bytes,
11fdf7f2 250 block_all[BDEV_WAL].size() - alloc[BDEV_WAL]->get_free());
7c673cae
FG
251 }
252 if (alloc[BDEV_DB]) {
11fdf7f2 253 logger->set(l_bluefs_db_total_bytes, block_all[BDEV_DB].size());
7c673cae 254 logger->set(l_bluefs_db_used_bytes,
11fdf7f2 255 block_all[BDEV_DB].size() - alloc[BDEV_DB]->get_free());
7c673cae
FG
256 }
257 if (alloc[BDEV_SLOW]) {
11fdf7f2 258 logger->set(l_bluefs_slow_total_bytes, block_all[BDEV_SLOW].size());
7c673cae 259 logger->set(l_bluefs_slow_used_bytes,
11fdf7f2 260 block_all[BDEV_SLOW].size() - alloc[BDEV_SLOW]->get_free());
7c673cae
FG
261 }
262}
263
11fdf7f2
TL
264int BlueFS::add_block_device(unsigned id, const string& path, bool trim,
265 bool shared_with_bluestore)
7c673cae
FG
266{
267 dout(10) << __func__ << " bdev " << id << " path " << path << dendl;
11fdf7f2
TL
268 ceph_assert(id < bdev.size());
269 ceph_assert(bdev[id] == NULL);
270 BlockDevice *b = BlockDevice::create(cct, path, NULL, NULL,
271 discard_cb[id], static_cast<void*>(this));
272 if (shared_with_bluestore) {
273 b->set_no_exclusive_lock();
274 }
7c673cae
FG
275 int r = b->open(path);
276 if (r < 0) {
277 delete b;
278 return r;
279 }
11fdf7f2
TL
280 if (trim) {
281 b->discard(0, b->get_size());
282 }
283
7c673cae 284 dout(1) << __func__ << " bdev " << id << " path " << path
1adf2230 285 << " size " << byte_u_t(b->get_size()) << dendl;
7c673cae
FG
286 bdev[id] = b;
287 ioc[id] = new IOContext(cct, NULL);
288 return 0;
289}
290
291bool BlueFS::bdev_support_label(unsigned id)
292{
11fdf7f2
TL
293 ceph_assert(id < bdev.size());
294 ceph_assert(bdev[id]);
7c673cae
FG
295 return bdev[id]->supported_bdev_label();
296}
297
298uint64_t BlueFS::get_block_device_size(unsigned id)
299{
300 if (id < bdev.size() && bdev[id])
301 return bdev[id]->get_size();
302 return 0;
303}
304
11fdf7f2 305void BlueFS::_add_block_extent(unsigned id, uint64_t offset, uint64_t length)
7c673cae 306{
7c673cae 307 dout(1) << __func__ << " bdev " << id
11fdf7f2 308 << " 0x" << std::hex << offset << "~" << length << std::dec
7c673cae 309 << dendl;
11fdf7f2
TL
310
311 ceph_assert(id < bdev.size());
312 ceph_assert(bdev[id]);
313 ceph_assert(bdev[id]->get_size() >= offset + length);
7c673cae 314 block_all[id].insert(offset, length);
7c673cae
FG
315
316 if (id < alloc.size() && alloc[id]) {
317 log_t.op_alloc_add(id, offset, length);
7c673cae
FG
318 alloc[id]->init_add_free(offset, length);
319 }
320
321 if (logger)
322 logger->inc(l_bluefs_gift_bytes, length);
323 dout(10) << __func__ << " done" << dendl;
324}
325
326int BlueFS::reclaim_blocks(unsigned id, uint64_t want,
a8e16298 327 PExtentVector *extents)
7c673cae 328{
11fdf7f2 329 std::unique_lock l(lock);
7c673cae
FG
330 dout(1) << __func__ << " bdev " << id
331 << " want 0x" << std::hex << want << std::dec << dendl;
11fdf7f2
TL
332 ceph_assert(id < alloc.size());
333 ceph_assert(alloc[id]);
a8e16298 334
eafe8130 335 int64_t got = alloc[id]->allocate(want, alloc_size[id], 0, extents);
11fdf7f2 336 ceph_assert(got != 0);
a8e16298 337 if (got < 0) {
7c673cae 338 derr << __func__ << " failed to allocate space to return to bluestore"
a8e16298 339 << dendl;
7c673cae
FG
340 alloc[id]->dump();
341 return got;
342 }
343
344 for (auto& p : *extents) {
345 block_all[id].erase(p.offset, p.length);
7c673cae
FG
346 log_t.op_alloc_rm(id, p.offset, p.length);
347 }
348
349 flush_bdev();
a8e16298 350 int r = _flush_and_sync_log(l);
11fdf7f2 351 ceph_assert(r == 0);
7c673cae 352
11fdf7f2 353 logger->inc(l_bluefs_reclaim_bytes, got);
7c673cae
FG
354 dout(1) << __func__ << " bdev " << id << " want 0x" << std::hex << want
355 << " got " << *extents << dendl;
356 return 0;
357}
358
11fdf7f2 359void BlueFS::handle_discard(unsigned id, interval_set<uint64_t>& to_release)
7c673cae 360{
11fdf7f2
TL
361 dout(10) << __func__ << " bdev " << id << dendl;
362 ceph_assert(alloc[id]);
363 alloc[id]->release(to_release);
364}
365
366uint64_t BlueFS::get_used()
367{
368 std::lock_guard l(lock);
369 uint64_t used = 0;
370 for (unsigned id = 0; id < MAX_BDEV; ++id) {
371 if (alloc[id]) {
372 used += block_all[id].size() - alloc[id]->get_free();
373 }
7c673cae 374 }
11fdf7f2 375 return used;
7c673cae
FG
376}
377
378uint64_t BlueFS::get_total(unsigned id)
379{
11fdf7f2
TL
380 std::lock_guard l(lock);
381 ceph_assert(id < block_all.size());
382 return block_all[id].size();
7c673cae
FG
383}
384
385uint64_t BlueFS::get_free(unsigned id)
386{
11fdf7f2
TL
387 std::lock_guard l(lock);
388 ceph_assert(id < alloc.size());
7c673cae
FG
389 return alloc[id]->get_free();
390}
391
392void BlueFS::dump_perf_counters(Formatter *f)
393{
394 f->open_object_section("bluefs_perf_counters");
395 logger->dump_formatted(f,0);
396 f->close_section();
397}
398
3efd9988
FG
399void BlueFS::dump_block_extents(ostream& out)
400{
401 for (unsigned i = 0; i < MAX_BDEV; ++i) {
402 if (!bdev[i]) {
403 continue;
404 }
11fdf7f2
TL
405 auto owned = get_total(i);
406 auto free = get_free(i);
407 out << i << " : device size 0x" << std::hex << bdev[i]->get_size()
408 << " : own 0x" << block_all[i]
409 << " = 0x" << owned
410 << " : using 0x" << owned - free
494da23a
TL
411 << std::dec << "(" << byte_u_t(owned - free) << ")"
412 << "\n";
3efd9988
FG
413 }
414}
7c673cae
FG
415
416void BlueFS::get_usage(vector<pair<uint64_t,uint64_t>> *usage)
417{
11fdf7f2 418 std::lock_guard l(lock);
7c673cae
FG
419 usage->resize(bdev.size());
420 for (unsigned id = 0; id < bdev.size(); ++id) {
421 if (!bdev[id]) {
422 (*usage)[id] = make_pair(0, 0);
423 continue;
424 }
425 (*usage)[id].first = alloc[id]->get_free();
11fdf7f2 426 (*usage)[id].second = block_all[id].size();
7c673cae 427 uint64_t used =
11fdf7f2 428 (block_all[id].size() - (*usage)[id].first) * 100 / block_all[id].size();
7c673cae
FG
429 dout(10) << __func__ << " bdev " << id
430 << " free " << (*usage)[id].first
1adf2230 431 << " (" << byte_u_t((*usage)[id].first) << ")"
7c673cae 432 << " / " << (*usage)[id].second
1adf2230 433 << " (" << byte_u_t((*usage)[id].second) << ")"
7c673cae
FG
434 << ", used " << used << "%"
435 << dendl;
436 }
437}
438
439int BlueFS::get_block_extents(unsigned id, interval_set<uint64_t> *extents)
440{
11fdf7f2 441 std::lock_guard l(lock);
7c673cae
FG
442 dout(10) << __func__ << " bdev " << id << dendl;
443 if (id >= block_all.size())
444 return -EINVAL;
445 *extents = block_all[id];
446 return 0;
447}
448
449int BlueFS::mkfs(uuid_d osd_uuid)
450{
11fdf7f2 451 std::unique_lock l(lock);
7c673cae
FG
452 dout(1) << __func__
453 << " osd_uuid " << osd_uuid
454 << dendl;
455
456 _init_alloc();
457 _init_logger();
458
459 super.version = 1;
460 super.block_size = bdev[BDEV_DB]->get_block_size();
461 super.osd_uuid = osd_uuid;
462 super.uuid.generate_random();
463 dout(1) << __func__ << " uuid " << super.uuid << dendl;
464
465 // init log
466 FileRef log_file = new File;
467 log_file->fnode.ino = 1;
468 log_file->fnode.prefer_bdev = BDEV_WAL;
469 int r = _allocate(
470 log_file->fnode.prefer_bdev,
471 cct->_conf->bluefs_max_log_runway,
94b18763 472 &log_file->fnode);
11fdf7f2 473 ceph_assert(r == 0);
7c673cae
FG
474 log_writer = _create_writer(log_file);
475
476 // initial txn
477 log_t.op_init();
478 for (unsigned bdev = 0; bdev < MAX_BDEV; ++bdev) {
479 interval_set<uint64_t>& p = block_all[bdev];
480 if (p.empty())
481 continue;
482 for (interval_set<uint64_t>::iterator q = p.begin(); q != p.end(); ++q) {
483 dout(20) << __func__ << " op_alloc_add " << bdev << " 0x"
484 << std::hex << q.get_start() << "~" << q.get_len() << std::dec
485 << dendl;
486 log_t.op_alloc_add(bdev, q.get_start(), q.get_len());
487 }
488 }
489 _flush_and_sync_log(l);
490
491 // write supers
492 super.log_fnode = log_file->fnode;
11fdf7f2 493 _write_super(BDEV_DB);
7c673cae
FG
494 flush_bdev();
495
496 // clean up
497 super = bluefs_super_t();
498 _close_writer(log_writer);
499 log_writer = NULL;
500 block_all.clear();
7c673cae
FG
501 _stop_alloc();
502 _shutdown_logger();
503
504 dout(10) << __func__ << " success" << dendl;
505 return 0;
506}
507
508void BlueFS::_init_alloc()
509{
510 dout(20) << __func__ << dendl;
511 alloc.resize(MAX_BDEV);
eafe8130 512 alloc_size.resize(MAX_BDEV, 0);
7c673cae 513 pending_release.resize(MAX_BDEV);
eafe8130
TL
514
515 if (bdev[BDEV_WAL]) {
516 alloc_size[BDEV_WAL] = cct->_conf->bluefs_alloc_size;
517 }
518 if (bdev[BDEV_SLOW]) {
519 alloc_size[BDEV_DB] = cct->_conf->bluefs_alloc_size;
520 alloc_size[BDEV_SLOW] = cct->_conf->bluefs_shared_alloc_size;
521 } else {
522 alloc_size[BDEV_DB] = cct->_conf->bluefs_shared_alloc_size;
523 }
524 // new wal and db devices are never shared
525 if (bdev[BDEV_NEWWAL]) {
526 alloc_size[BDEV_NEWWAL] = cct->_conf->bluefs_alloc_size;
527 }
528 if (bdev[BDEV_NEWDB]) {
529 alloc_size[BDEV_NEWDB] = cct->_conf->bluefs_alloc_size;
530 }
531
7c673cae
FG
532 for (unsigned id = 0; id < bdev.size(); ++id) {
533 if (!bdev[id]) {
534 continue;
535 }
11fdf7f2 536 ceph_assert(bdev[id]->get_size());
eafe8130
TL
537 std::string name = "bluefs-";
538 const char* devnames[] = {"wal","db","slow"};
539 if (id <= BDEV_SLOW)
540 name += devnames[id];
541 else
542 name += to_string(uintptr_t(this));
543 ceph_assert(alloc_size[id]);
544 dout(1) << __func__ << " id " << id
545 << " alloc_size 0x" << std::hex << alloc_size[id]
546 << " size 0x" << bdev[id]->get_size() << std::dec << dendl;
7c673cae
FG
547 alloc[id] = Allocator::create(cct, cct->_conf->bluefs_allocator,
548 bdev[id]->get_size(),
eafe8130 549 alloc_size[id], name);
7c673cae
FG
550 interval_set<uint64_t>& p = block_all[id];
551 for (interval_set<uint64_t>::iterator q = p.begin(); q != p.end(); ++q) {
552 alloc[id]->init_add_free(q.get_start(), q.get_len());
553 }
554 }
555}
556
557void BlueFS::_stop_alloc()
558{
559 dout(20) << __func__ << dendl;
11fdf7f2
TL
560 for (auto p : bdev) {
561 if (p)
562 p->discard_drain();
563 }
564
7c673cae
FG
565 for (auto p : alloc) {
566 if (p != nullptr) {
567 p->shutdown();
568 delete p;
569 }
570 }
571 alloc.clear();
572}
573
574int BlueFS::mount()
575{
576 dout(1) << __func__ << dendl;
577
578 int r = _open_super();
579 if (r < 0) {
580 derr << __func__ << " failed to open super: " << cpp_strerror(r) << dendl;
581 goto out;
582 }
583
584 block_all.clear();
585 block_all.resize(MAX_BDEV);
7c673cae 586 _init_alloc();
494da23a 587 _init_logger();
7c673cae 588
11fdf7f2 589 r = _replay(false, false);
7c673cae
FG
590 if (r < 0) {
591 derr << __func__ << " failed to replay log: " << cpp_strerror(r) << dendl;
592 _stop_alloc();
593 goto out;
594 }
595
596 // init freelist
597 for (auto& p : file_map) {
598 dout(30) << __func__ << " noting alloc for " << p.second->fnode << dendl;
599 for (auto& q : p.second->fnode.extents) {
600 alloc[q.bdev]->init_rm_free(q.offset, q.length);
601 }
602 }
603
604 // set up the log for future writes
605 log_writer = _create_writer(_get_file(1));
11fdf7f2 606 ceph_assert(log_writer->file->fnode.ino == 1);
7c673cae
FG
607 log_writer->pos = log_writer->file->fnode.size;
608 dout(10) << __func__ << " log write pos set to 0x"
609 << std::hex << log_writer->pos << std::dec
610 << dendl;
611
7c673cae
FG
612 return 0;
613
614 out:
615 super = bluefs_super_t();
616 return r;
617}
618
619void BlueFS::umount()
620{
621 dout(1) << __func__ << dendl;
622
623 sync_metadata();
624
625 _close_writer(log_writer);
626 log_writer = NULL;
627
628 _stop_alloc();
629 file_map.clear();
630 dir_map.clear();
631 super = bluefs_super_t();
632 log_t.clear();
633 _shutdown_logger();
634}
635
11fdf7f2 636int BlueFS::prepare_new_device(int id)
7c673cae 637{
11fdf7f2
TL
638 dout(1) << __func__ << dendl;
639
640 if(id == BDEV_NEWDB) {
641 int new_log_dev_cur = BDEV_WAL;
642 int new_log_dev_next = BDEV_WAL;
643 if (!bdev[BDEV_WAL]) {
644 new_log_dev_cur = BDEV_NEWDB;
645 new_log_dev_next = BDEV_DB;
646 }
647 _rewrite_log_sync(false,
648 BDEV_NEWDB,
649 new_log_dev_cur,
650 new_log_dev_next,
651 RENAME_DB2SLOW);
652 //}
653 } else if(id == BDEV_NEWWAL) {
654 _rewrite_log_sync(false, BDEV_DB, BDEV_NEWWAL, BDEV_WAL, REMOVE_WAL);
655 } else {
656 assert(false);
657 }
658 return 0;
659}
660
661void BlueFS::collect_metadata(map<string,string> *pm, unsigned skip_bdev_id)
662{
663 if (skip_bdev_id != BDEV_DB && bdev[BDEV_DB])
7c673cae
FG
664 bdev[BDEV_DB]->collect_metadata("bluefs_db_", pm);
665 if (bdev[BDEV_WAL])
666 bdev[BDEV_WAL]->collect_metadata("bluefs_wal_", pm);
11fdf7f2
TL
667}
668
669void BlueFS::get_devices(set<string> *ls)
670{
671 for (unsigned i = 0; i < MAX_BDEV; ++i) {
672 if (bdev[i]) {
673 bdev[i]->get_devices(ls);
674 }
675 }
7c673cae
FG
676}
677
678int BlueFS::fsck()
679{
11fdf7f2 680 std::lock_guard l(lock);
7c673cae
FG
681 dout(1) << __func__ << dendl;
682 // hrm, i think we check everything on mount...
683 return 0;
684}
685
11fdf7f2 686int BlueFS::_write_super(int dev)
7c673cae
FG
687{
688 // build superblock
689 bufferlist bl;
11fdf7f2 690 encode(super, bl);
7c673cae 691 uint32_t crc = bl.crc32c(-1);
11fdf7f2 692 encode(crc, bl);
7c673cae
FG
693 dout(10) << __func__ << " super block length(encoded): " << bl.length() << dendl;
694 dout(10) << __func__ << " superblock " << super.version << dendl;
695 dout(10) << __func__ << " log_fnode " << super.log_fnode << dendl;
11fdf7f2 696 ceph_assert(bl.length() <= get_super_length());
7c673cae
FG
697 bl.append_zero(get_super_length() - bl.length());
698
11fdf7f2 699 bdev[dev]->write(get_super_offset(), bl, false, WRITE_LIFE_SHORT);
7c673cae
FG
700 dout(20) << __func__ << " v " << super.version
701 << " crc 0x" << std::hex << crc
702 << " offset 0x" << get_super_offset() << std::dec
703 << dendl;
704 return 0;
705}
706
707int BlueFS::_open_super()
708{
709 dout(10) << __func__ << dendl;
710
711 bufferlist bl;
712 uint32_t expected_crc, crc;
713 int r;
714
715 // always the second block
716 r = bdev[BDEV_DB]->read(get_super_offset(), get_super_length(),
717 &bl, ioc[BDEV_DB], false);
718 if (r < 0)
719 return r;
720
11fdf7f2
TL
721 auto p = bl.cbegin();
722 decode(super, p);
7c673cae
FG
723 {
724 bufferlist t;
725 t.substr_of(bl, 0, p.get_off());
726 crc = t.crc32c(-1);
727 }
11fdf7f2 728 decode(expected_crc, p);
7c673cae
FG
729 if (crc != expected_crc) {
730 derr << __func__ << " bad crc on superblock, expected 0x"
731 << std::hex << expected_crc << " != actual 0x" << crc << std::dec
732 << dendl;
733 return -EIO;
734 }
735 dout(10) << __func__ << " superblock " << super.version << dendl;
736 dout(10) << __func__ << " log_fnode " << super.log_fnode << dendl;
737 return 0;
738}
739
11fdf7f2 740int BlueFS::_replay(bool noop, bool to_stdout)
7c673cae
FG
741{
742 dout(10) << __func__ << (noop ? " NO-OP" : "") << dendl;
743 ino_last = 1; // by the log
744 log_seq = 0;
745
746 FileRef log_file;
11fdf7f2
TL
747 log_file = _get_file(1);
748 if (!noop) {
749 log_file->fnode = super.log_fnode;
7c673cae 750 } else {
11fdf7f2
TL
751 // do not use fnode from superblock in 'noop' mode - log_file's one should
752 // be fine and up-to-date
753 ceph_assert(log_file->fnode.ino == 1);
754 ceph_assert(log_file->fnode.extents.size() != 0);
7c673cae 755 }
7c673cae 756 dout(10) << __func__ << " log_fnode " << super.log_fnode << dendl;
11fdf7f2
TL
757 if (unlikely(to_stdout)) {
758 std::cout << " log_fnode " << super.log_fnode << std::endl;
759 }
7c673cae
FG
760
761 FileReader *log_reader = new FileReader(
762 log_file, cct->_conf->bluefs_max_prefetch,
763 false, // !random
764 true); // ignore eof
765 while (true) {
11fdf7f2 766 ceph_assert((log_reader->buf.pos & ~super.block_mask()) == 0);
7c673cae
FG
767 uint64_t pos = log_reader->buf.pos;
768 uint64_t read_pos = pos;
769 bufferlist bl;
770 {
771 int r = _read(log_reader, &log_reader->buf, read_pos, super.block_size,
772 &bl, NULL);
11fdf7f2 773 ceph_assert(r == (int)super.block_size);
7c673cae
FG
774 read_pos += r;
775 }
776 uint64_t more = 0;
777 uint64_t seq;
778 uuid_d uuid;
779 {
11fdf7f2 780 auto p = bl.cbegin();
7c673cae
FG
781 __u8 a, b;
782 uint32_t len;
11fdf7f2
TL
783 decode(a, p);
784 decode(b, p);
785 decode(len, p);
786 decode(uuid, p);
787 decode(seq, p);
7c673cae 788 if (len + 6 > bl.length()) {
11fdf7f2 789 more = round_up_to(len + 6 - bl.length(), super.block_size);
7c673cae
FG
790 }
791 }
792 if (uuid != super.uuid) {
793 dout(10) << __func__ << " 0x" << std::hex << pos << std::dec
794 << ": stop: uuid " << uuid << " != super.uuid " << super.uuid
795 << dendl;
796 break;
797 }
798 if (seq != log_seq + 1) {
799 dout(10) << __func__ << " 0x" << std::hex << pos << std::dec
800 << ": stop: seq " << seq << " != expected " << log_seq + 1
801 << dendl;
802 break;
803 }
804 if (more) {
805 dout(20) << __func__ << " need 0x" << std::hex << more << std::dec
806 << " more bytes" << dendl;
807 bufferlist t;
808 int r = _read(log_reader, &log_reader->buf, read_pos, more, &t, NULL);
809 if (r < (int)more) {
810 dout(10) << __func__ << " 0x" << std::hex << pos
811 << ": stop: len is 0x" << bl.length() + more << std::dec
812 << ", which is past eof" << dendl;
813 break;
814 }
11fdf7f2 815 ceph_assert(r == (int)more);
7c673cae
FG
816 bl.claim_append(t);
817 read_pos += r;
818 }
819 bluefs_transaction_t t;
820 try {
11fdf7f2
TL
821 auto p = bl.cbegin();
822 decode(t, p);
7c673cae
FG
823 }
824 catch (buffer::error& e) {
825 dout(10) << __func__ << " 0x" << std::hex << pos << std::dec
826 << ": stop: failed to decode: " << e.what()
827 << dendl;
828 delete log_reader;
829 return -EIO;
830 }
11fdf7f2 831 ceph_assert(seq == t.seq);
7c673cae
FG
832 dout(10) << __func__ << " 0x" << std::hex << pos << std::dec
833 << ": " << t << dendl;
11fdf7f2
TL
834 if (unlikely(to_stdout)) {
835 std::cout << " 0x" << std::hex << pos << std::dec
836 << ": " << t << std::endl;
837 }
7c673cae 838
11fdf7f2 839 auto p = t.op_bl.cbegin();
7c673cae
FG
840 while (!p.end()) {
841 __u8 op;
11fdf7f2 842 decode(op, p);
7c673cae
FG
843 switch (op) {
844
845 case bluefs_transaction_t::OP_INIT:
846 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
847 << ": op_init" << dendl;
11fdf7f2
TL
848 if (unlikely(to_stdout)) {
849 std::cout << " 0x" << std::hex << pos << std::dec
850 << ": op_init" << std::endl;
851 }
852
853 ceph_assert(t.seq == 1);
7c673cae
FG
854 break;
855
856 case bluefs_transaction_t::OP_JUMP:
857 {
858 uint64_t next_seq;
859 uint64_t offset;
11fdf7f2
TL
860 decode(next_seq, p);
861 decode(offset, p);
7c673cae
FG
862 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
863 << ": op_jump seq " << next_seq
864 << " offset 0x" << std::hex << offset << std::dec << dendl;
11fdf7f2
TL
865 if (unlikely(to_stdout)) {
866 std::cout << " 0x" << std::hex << pos << std::dec
867 << ": op_jump seq " << next_seq
868 << " offset 0x" << std::hex << offset << std::dec
869 << std::endl;
870 }
871
872 ceph_assert(next_seq >= log_seq);
7c673cae
FG
873 log_seq = next_seq - 1; // we will increment it below
874 uint64_t skip = offset - read_pos;
875 if (skip) {
876 bufferlist junk;
877 int r = _read(log_reader, &log_reader->buf, read_pos, skip, &junk,
878 NULL);
879 if (r != (int)skip) {
880 dout(10) << __func__ << " 0x" << std::hex << read_pos
881 << ": stop: failed to skip to " << offset
882 << std::dec << dendl;
11fdf7f2 883 ceph_abort_msg("problem with op_jump");
7c673cae
FG
884 }
885 }
886 }
887 break;
888
889 case bluefs_transaction_t::OP_JUMP_SEQ:
890 {
891 uint64_t next_seq;
11fdf7f2 892 decode(next_seq, p);
7c673cae
FG
893 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
894 << ": op_jump_seq " << next_seq << dendl;
11fdf7f2
TL
895 if (unlikely(to_stdout)) {
896 std::cout << " 0x" << std::hex << pos << std::dec
897 << ": op_jump_seq " << next_seq << std::endl;
898 }
899
900 ceph_assert(next_seq >= log_seq);
7c673cae
FG
901 log_seq = next_seq - 1; // we will increment it below
902 }
903 break;
904
905 case bluefs_transaction_t::OP_ALLOC_ADD:
906 {
907 __u8 id;
908 uint64_t offset, length;
11fdf7f2
TL
909 decode(id, p);
910 decode(offset, p);
911 decode(length, p);
7c673cae
FG
912 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
913 << ": op_alloc_add " << " " << (int)id
914 << ":0x" << std::hex << offset << "~" << length << std::dec
915 << dendl;
11fdf7f2
TL
916 if (unlikely(to_stdout)) {
917 std::cout << " 0x" << std::hex << pos << std::dec
918 << ": op_alloc_add " << " " << (int)id
919 << ":0x" << std::hex << offset << "~" << length << std::dec
920 << std::endl;
921 }
922
7c673cae
FG
923 if (!noop) {
924 block_all[id].insert(offset, length);
7c673cae
FG
925 alloc[id]->init_add_free(offset, length);
926 }
927 }
928 break;
929
930 case bluefs_transaction_t::OP_ALLOC_RM:
931 {
932 __u8 id;
933 uint64_t offset, length;
11fdf7f2
TL
934 decode(id, p);
935 decode(offset, p);
936 decode(length, p);
7c673cae
FG
937 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
938 << ": op_alloc_rm " << " " << (int)id
939 << ":0x" << std::hex << offset << "~" << length << std::dec
940 << dendl;
11fdf7f2
TL
941 if (unlikely(to_stdout)) {
942 std::cout << " 0x" << std::hex << pos << std::dec
943 << ": op_alloc_rm " << " " << (int)id
944 << ":0x" << std::hex << offset << "~" << length << std::dec
945 << std::endl;
946 }
947
7c673cae
FG
948 if (!noop) {
949 block_all[id].erase(offset, length);
7c673cae
FG
950 alloc[id]->init_rm_free(offset, length);
951 }
952 }
953 break;
954
955 case bluefs_transaction_t::OP_DIR_LINK:
956 {
957 string dirname, filename;
958 uint64_t ino;
11fdf7f2
TL
959 decode(dirname, p);
960 decode(filename, p);
961 decode(ino, p);
7c673cae
FG
962 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
963 << ": op_dir_link " << " " << dirname << "/" << filename
964 << " to " << ino
965 << dendl;
11fdf7f2
TL
966 if (unlikely(to_stdout)) {
967 std::cout << " 0x" << std::hex << pos << std::dec
968 << ": op_dir_link " << " " << dirname << "/" << filename
969 << " to " << ino
970 << std::endl;
971 }
972
7c673cae
FG
973 if (!noop) {
974 FileRef file = _get_file(ino);
11fdf7f2 975 ceph_assert(file->fnode.ino);
7c673cae 976 map<string,DirRef>::iterator q = dir_map.find(dirname);
11fdf7f2 977 ceph_assert(q != dir_map.end());
7c673cae 978 map<string,FileRef>::iterator r = q->second->file_map.find(filename);
11fdf7f2 979 ceph_assert(r == q->second->file_map.end());
7c673cae
FG
980 q->second->file_map[filename] = file;
981 ++file->refs;
982 }
983 }
984 break;
985
986 case bluefs_transaction_t::OP_DIR_UNLINK:
987 {
988 string dirname, filename;
11fdf7f2
TL
989 decode(dirname, p);
990 decode(filename, p);
7c673cae
FG
991 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
992 << ": op_dir_unlink " << " " << dirname << "/" << filename
993 << dendl;
11fdf7f2
TL
994 if (unlikely(to_stdout)) {
995 std::cout << " 0x" << std::hex << pos << std::dec
996 << ": op_dir_unlink " << " " << dirname << "/" << filename
997 << std::endl;
998 }
999
7c673cae
FG
1000 if (!noop) {
1001 map<string,DirRef>::iterator q = dir_map.find(dirname);
11fdf7f2 1002 ceph_assert(q != dir_map.end());
7c673cae 1003 map<string,FileRef>::iterator r = q->second->file_map.find(filename);
11fdf7f2
TL
1004 ceph_assert(r != q->second->file_map.end());
1005 ceph_assert(r->second->refs > 0);
7c673cae
FG
1006 --r->second->refs;
1007 q->second->file_map.erase(r);
1008 }
1009 }
1010 break;
1011
1012 case bluefs_transaction_t::OP_DIR_CREATE:
1013 {
1014 string dirname;
11fdf7f2 1015 decode(dirname, p);
7c673cae
FG
1016 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1017 << ": op_dir_create " << dirname << dendl;
11fdf7f2
TL
1018 if (unlikely(to_stdout)) {
1019 std::cout << " 0x" << std::hex << pos << std::dec
1020 << ": op_dir_create " << dirname << std::endl;
1021 }
1022
7c673cae
FG
1023 if (!noop) {
1024 map<string,DirRef>::iterator q = dir_map.find(dirname);
11fdf7f2 1025 ceph_assert(q == dir_map.end());
7c673cae
FG
1026 dir_map[dirname] = new Dir;
1027 }
1028 }
1029 break;
1030
1031 case bluefs_transaction_t::OP_DIR_REMOVE:
1032 {
1033 string dirname;
11fdf7f2 1034 decode(dirname, p);
7c673cae
FG
1035 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1036 << ": op_dir_remove " << dirname << dendl;
11fdf7f2
TL
1037 if (unlikely(to_stdout)) {
1038 std::cout << " 0x" << std::hex << pos << std::dec
1039 << ": op_dir_remove " << dirname << std::endl;
1040 }
1041
7c673cae
FG
1042 if (!noop) {
1043 map<string,DirRef>::iterator q = dir_map.find(dirname);
11fdf7f2
TL
1044 ceph_assert(q != dir_map.end());
1045 ceph_assert(q->second->file_map.empty());
7c673cae
FG
1046 dir_map.erase(q);
1047 }
1048 }
1049 break;
1050
1051 case bluefs_transaction_t::OP_FILE_UPDATE:
1052 {
1053 bluefs_fnode_t fnode;
11fdf7f2 1054 decode(fnode, p);
7c673cae
FG
1055 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1056 << ": op_file_update " << " " << fnode << dendl;
11fdf7f2
TL
1057 if (unlikely(to_stdout)) {
1058 std::cout << " 0x" << std::hex << pos << std::dec
1059 << ": op_file_update " << " " << fnode << std::endl;
1060 }
1061
7c673cae
FG
1062 if (!noop) {
1063 FileRef f = _get_file(fnode.ino);
1064 f->fnode = fnode;
1065 if (fnode.ino > ino_last) {
1066 ino_last = fnode.ino;
1067 }
1068 }
1069 }
1070 break;
1071
1072 case bluefs_transaction_t::OP_FILE_REMOVE:
1073 {
1074 uint64_t ino;
11fdf7f2 1075 decode(ino, p);
7c673cae
FG
1076 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1077 << ": op_file_remove " << ino << dendl;
11fdf7f2
TL
1078 if (unlikely(to_stdout)) {
1079 std::cout << " 0x" << std::hex << pos << std::dec
1080 << ": op_file_remove " << ino << std::endl;
1081 }
1082
7c673cae
FG
1083 if (!noop) {
1084 auto p = file_map.find(ino);
11fdf7f2 1085 ceph_assert(p != file_map.end());
7c673cae
FG
1086 file_map.erase(p);
1087 }
1088 }
1089 break;
1090
1091 default:
1092 derr << __func__ << " 0x" << std::hex << pos << std::dec
1093 << ": stop: unrecognized op " << (int)op << dendl;
1094 delete log_reader;
1095 return -EIO;
1096 }
1097 }
11fdf7f2 1098 ceph_assert(p.end());
7c673cae
FG
1099
1100 // we successfully replayed the transaction; bump the seq and log size
1101 ++log_seq;
1102 log_file->fnode.size = log_reader->buf.pos;
1103 }
1104
1105 dout(10) << __func__ << " log file size was 0x"
1106 << std::hex << log_file->fnode.size << std::dec << dendl;
11fdf7f2
TL
1107 if (unlikely(to_stdout)) {
1108 std::cout << " log file size was 0x"
1109 << std::hex << log_file->fnode.size << std::dec << std::endl;
1110 }
1111
7c673cae
FG
1112 delete log_reader;
1113
1114 if (!noop) {
1115 // verify file link counts are all >0
1116 for (auto& p : file_map) {
1117 if (p.second->refs == 0 &&
1118 p.second->fnode.ino > 1) {
1119 derr << __func__ << " file with link count 0: " << p.second->fnode
1120 << dendl;
1121 return -EIO;
1122 }
1123 }
1124 }
1125
1126 dout(10) << __func__ << " done" << dendl;
1127 return 0;
1128}
1129
11fdf7f2
TL
1130int BlueFS::log_dump()
1131{
1132 // only dump log file's content
1133 int r = _replay(true, true);
1134 if (r < 0) {
1135 derr << __func__ << " failed to replay log: " << cpp_strerror(r) << dendl;
1136 return r;
1137 }
1138
1139 return 0;
1140}
1141
1142int BlueFS::device_migrate_to_existing(
1143 CephContext *cct,
1144 const set<int>& devs_source,
1145 int dev_target)
1146{
1147 vector<byte> buf;
1148 bool buffered = cct->_conf->bluefs_buffered_io;
1149
eafe8130
TL
1150 dout(10) << __func__ << " devs_source " << devs_source
1151 << " dev_target " << dev_target << dendl;
11fdf7f2
TL
1152 assert(dev_target < (int)MAX_BDEV);
1153
1154 int flags = 0;
1155 flags |= devs_source.count(BDEV_DB) ?
1156 (REMOVE_DB | RENAME_SLOW2DB) : 0;
1157 flags |= devs_source.count(BDEV_WAL) ? REMOVE_WAL : 0;
1158 int dev_target_new = dev_target;
1159
1160 // Slow device without separate DB one is addressed via BDEV_DB
1161 // Hence need renaming.
1162 if ((flags & REMOVE_DB) && dev_target == BDEV_SLOW) {
1163 dev_target_new = BDEV_DB;
1164 dout(0) << __func__ << " super to be written to " << dev_target << dendl;
1165 }
1166
1167 for (auto& p : file_map) {
1168 //do not copy log
1169 if (p.second->fnode.ino == 1) {
1170 continue;
1171 }
eafe8130
TL
1172 dout(10) << __func__ << " " << p.first << " " << p.second->fnode << dendl;
1173
11fdf7f2
TL
1174 auto& fnode_extents = p.second->fnode.extents;
1175
eafe8130 1176 bool rewrite = false;
11fdf7f2 1177 for (auto ext_it = fnode_extents.begin();
eafe8130
TL
1178 ext_it != p.second->fnode.extents.end();
1179 ++ext_it) {
11fdf7f2 1180 if (ext_it->bdev != dev_target && devs_source.count(ext_it->bdev)) {
eafe8130
TL
1181 rewrite = true;
1182 break;
1183 }
1184 }
1185 if (rewrite) {
1186 dout(10) << __func__ << " migrating" << dendl;
1187
1188 // read entire file
1189 bufferlist bl;
1190 for (auto old_ext : fnode_extents) {
1191 buf.resize(old_ext.length);
1192 int r = bdev[old_ext.bdev]->read_random(
1193 old_ext.offset,
1194 old_ext.length,
1195 (char*)&buf.at(0),
1196 buffered);
1197 if (r != 0) {
1198 derr << __func__ << " failed to read 0x" << std::hex
1199 << old_ext.offset << "~" << old_ext.length << std::dec
1200 << " from " << (int)dev_target << dendl;
1201 return -EIO;
1202 }
1203 bl.append((char*)&buf[0], old_ext.length);
1204 }
11fdf7f2 1205
eafe8130
TL
1206 // write entire file
1207 PExtentVector extents;
1208 auto l = _allocate_without_fallback(dev_target, bl.length(), &extents);
1209 if (l < 0) {
1210 derr << __func__ << " unable to allocate len 0x" << std::hex
1211 << bl.length() << std::dec << " from " << (int)dev_target
1212 << ": " << cpp_strerror(l) << dendl;
1213 return -ENOSPC;
1214 }
11fdf7f2 1215
eafe8130
TL
1216 uint64_t off = 0;
1217 for (auto& i : extents) {
1218 bufferlist cur;
1219 uint64_t cur_len = std::min<uint64_t>(i.length, bl.length() - off);
1220 ceph_assert(cur_len > 0);
1221 cur.substr_of(bl, off, cur_len);
1222 int r = bdev[dev_target]->write(i.offset, cur, buffered);
1223 ceph_assert(r == 0);
1224 off += cur_len;
1225 }
1226
1227 // release old extents
1228 for (auto old_ext : fnode_extents) {
1229 PExtentVector to_release;
1230 to_release.emplace_back(old_ext.offset, old_ext.length);
1231 alloc[old_ext.bdev]->release(to_release);
1232 }
1233
1234 // update fnode
1235 fnode_extents.clear();
1236 for (auto& i : extents) {
1237 fnode_extents.emplace_back(dev_target_new, i.offset, i.length);
1238 }
1239 } else {
1240 for (auto ext_it = fnode_extents.begin();
1241 ext_it != p.second->fnode.extents.end();
1242 ++ext_it) {
1243 if (dev_target != dev_target_new && ext_it->bdev == dev_target) {
1244 dout(20) << __func__ << " " << " ... adjusting extent 0x"
1245 << std::hex << ext_it->offset << std::dec
1246 << " bdev " << dev_target << " -> " << dev_target_new
1247 << dendl;
1248 ext_it->bdev = dev_target_new;
11fdf7f2 1249 }
11fdf7f2
TL
1250 }
1251 }
1252 auto& prefer_bdev = p.second->fnode.prefer_bdev;
1253 if (prefer_bdev != dev_target && devs_source.count(prefer_bdev)) {
eafe8130
TL
1254 dout(20) << __func__ << " " << " ... adjusting prefer_bdev "
1255 << prefer_bdev << " -> " << dev_target_new << dendl;
11fdf7f2
TL
1256 prefer_bdev = dev_target_new;
1257 }
1258 }
1259 // new logging device in the current naming scheme
1260 int new_log_dev_cur = bdev[BDEV_WAL] ?
1261 BDEV_WAL :
1262 bdev[BDEV_DB] ? BDEV_DB : BDEV_SLOW;
1263
1264 // new logging device in new naming scheme
1265 int new_log_dev_next = new_log_dev_cur;
1266
1267 if (devs_source.count(new_log_dev_cur)) {
1268 // SLOW device is addressed via BDEV_DB too hence either WAL or DB
1269 new_log_dev_next = (flags & REMOVE_WAL) || !bdev[BDEV_WAL] ?
1270 BDEV_DB :
1271 BDEV_WAL;
1272
1273 dout(0) << __func__ << " log moved from " << new_log_dev_cur
1274 << " to " << new_log_dev_next << dendl;
1275
1276 new_log_dev_cur =
1277 (flags & REMOVE_DB) && new_log_dev_next == BDEV_DB ?
1278 BDEV_SLOW :
1279 new_log_dev_next;
1280 }
1281
1282 _rewrite_log_sync(
1283 false,
1284 (flags & REMOVE_DB) ? BDEV_SLOW : BDEV_DB,
1285 new_log_dev_cur,
1286 new_log_dev_next,
1287 flags);
1288 return 0;
1289}
1290
1291int BlueFS::device_migrate_to_new(
1292 CephContext *cct,
1293 const set<int>& devs_source,
1294 int dev_target)
1295{
1296 vector<byte> buf;
1297 bool buffered = cct->_conf->bluefs_buffered_io;
1298
eafe8130
TL
1299 dout(10) << __func__ << " devs_source " << devs_source
1300 << " dev_target " << dev_target << dendl;
11fdf7f2
TL
1301 assert(dev_target == (int)BDEV_NEWDB || (int)BDEV_NEWWAL);
1302
1303 int flags = 0;
1304
1305 flags |= devs_source.count(BDEV_DB) ?
1306 (!bdev[BDEV_SLOW] ? RENAME_DB2SLOW: REMOVE_DB) :
1307 0;
1308 flags |= devs_source.count(BDEV_WAL) ? REMOVE_WAL : 0;
1309 int dev_target_new = dev_target;
1310
1311 for (auto& p : file_map) {
1312 //do not copy log
1313 if (p.second->fnode.ino == 1) {
1314 continue;
1315 }
eafe8130
TL
1316 dout(10) << __func__ << " " << p.first << " " << p.second->fnode << dendl;
1317
11fdf7f2
TL
1318 auto& fnode_extents = p.second->fnode.extents;
1319
eafe8130 1320 bool rewrite = false;
11fdf7f2 1321 for (auto ext_it = fnode_extents.begin();
eafe8130
TL
1322 ext_it != p.second->fnode.extents.end();
1323 ++ext_it) {
11fdf7f2 1324 if (ext_it->bdev != dev_target && devs_source.count(ext_it->bdev)) {
eafe8130
TL
1325 rewrite = true;
1326 break;
1327 }
1328 }
1329 if (rewrite) {
1330 dout(10) << __func__ << " migrating" << dendl;
1331
1332 // read entire file
1333 bufferlist bl;
1334 for (auto old_ext : fnode_extents) {
1335 buf.resize(old_ext.length);
1336 int r = bdev[old_ext.bdev]->read_random(
1337 old_ext.offset,
1338 old_ext.length,
1339 (char*)&buf.at(0),
1340 buffered);
1341 if (r != 0) {
1342 derr << __func__ << " failed to read 0x" << std::hex
1343 << old_ext.offset << "~" << old_ext.length << std::dec
1344 << " from " << (int)dev_target << dendl;
1345 return -EIO;
11fdf7f2 1346 }
eafe8130
TL
1347 bl.append((char*)&buf[0], old_ext.length);
1348 }
1349
1350 // write entire file
1351 PExtentVector extents;
1352 auto l = _allocate_without_fallback(dev_target, bl.length(), &extents);
1353 if (l < 0) {
1354 derr << __func__ << " unable to allocate len 0x" << std::hex
1355 << bl.length() << std::dec << " from " << (int)dev_target
1356 << ": " << cpp_strerror(l) << dendl;
1357 return -ENOSPC;
1358 }
1359
1360 uint64_t off = 0;
1361 for (auto& i : extents) {
1362 bufferlist cur;
1363 uint64_t cur_len = std::min<uint64_t>(i.length, bl.length() - off);
1364 ceph_assert(cur_len > 0);
1365 cur.substr_of(bl, off, cur_len);
1366 int r = bdev[dev_target]->write(i.offset, cur, buffered);
1367 ceph_assert(r == 0);
1368 off += cur_len;
1369 }
1370
1371 // release old extents
1372 for (auto old_ext : fnode_extents) {
1373 PExtentVector to_release;
1374 to_release.emplace_back(old_ext.offset, old_ext.length);
1375 alloc[old_ext.bdev]->release(to_release);
1376 }
1377
1378 // update fnode
1379 fnode_extents.clear();
1380 for (auto& i : extents) {
1381 fnode_extents.emplace_back(dev_target_new, i.offset, i.length);
11fdf7f2
TL
1382 }
1383 }
1384 auto& prefer_bdev = p.second->fnode.prefer_bdev;
1385 if (prefer_bdev != dev_target && devs_source.count(prefer_bdev)) {
eafe8130
TL
1386 dout(20) << __func__ << " " << " ... adjusting prefer_bdev "
1387 << prefer_bdev << " -> " << dev_target_new << dendl;
11fdf7f2
TL
1388 prefer_bdev = dev_target_new;
1389 }
1390 }
1391 // new logging device in the current naming scheme
1392 int new_log_dev_cur =
1393 bdev[BDEV_NEWWAL] ?
1394 BDEV_NEWWAL :
1395 bdev[BDEV_WAL] && !(flags & REMOVE_WAL) ?
1396 BDEV_WAL :
1397 bdev[BDEV_NEWDB] ?
1398 BDEV_NEWDB :
1399 bdev[BDEV_DB] && !(flags & REMOVE_DB)?
1400 BDEV_DB :
1401 BDEV_SLOW;
1402
1403 // new logging device in new naming scheme
1404 int new_log_dev_next =
1405 new_log_dev_cur == BDEV_NEWWAL ?
1406 BDEV_WAL :
1407 new_log_dev_cur == BDEV_NEWDB ?
1408 BDEV_DB :
1409 new_log_dev_cur;
1410
1411 int super_dev =
1412 dev_target == BDEV_NEWDB ?
1413 BDEV_NEWDB :
1414 bdev[BDEV_DB] ?
1415 BDEV_DB :
1416 BDEV_SLOW;
1417
1418 _rewrite_log_sync(
1419 false,
1420 super_dev,
1421 new_log_dev_cur,
1422 new_log_dev_next,
1423 flags);
1424 return 0;
1425}
1426
7c673cae
FG
1427BlueFS::FileRef BlueFS::_get_file(uint64_t ino)
1428{
1429 auto p = file_map.find(ino);
1430 if (p == file_map.end()) {
1431 FileRef f = new File;
1432 file_map[ino] = f;
1433 dout(30) << __func__ << " ino " << ino << " = " << f
1434 << " (new)" << dendl;
1435 return f;
1436 } else {
1437 dout(30) << __func__ << " ino " << ino << " = " << p->second << dendl;
1438 return p->second;
1439 }
1440}
1441
1442void BlueFS::_drop_link(FileRef file)
1443{
1444 dout(20) << __func__ << " had refs " << file->refs
1445 << " on " << file->fnode << dendl;
11fdf7f2 1446 ceph_assert(file->refs > 0);
7c673cae
FG
1447 --file->refs;
1448 if (file->refs == 0) {
1449 dout(20) << __func__ << " destroying " << file->fnode << dendl;
11fdf7f2 1450 ceph_assert(file->num_reading.load() == 0);
7c673cae
FG
1451 log_t.op_file_remove(file->fnode.ino);
1452 for (auto& r : file->fnode.extents) {
1453 pending_release[r.bdev].insert(r.offset, r.length);
1454 }
1455 file_map.erase(file->fnode.ino);
1456 file->deleted = true;
94b18763 1457
7c673cae 1458 if (file->dirty_seq) {
11fdf7f2
TL
1459 ceph_assert(file->dirty_seq > log_seq_stable);
1460 ceph_assert(dirty_files.count(file->dirty_seq));
7c673cae
FG
1461 auto it = dirty_files[file->dirty_seq].iterator_to(*file);
1462 dirty_files[file->dirty_seq].erase(it);
1463 file->dirty_seq = 0;
1464 }
1465 }
1466}
1467
1468int BlueFS::_read_random(
1469 FileReader *h, ///< [in] read from here
1470 uint64_t off, ///< [in] offset
1471 size_t len, ///< [in] this many bytes
1472 char *out) ///< [out] optional: or copy it here
1473{
494da23a
TL
1474 auto* buf = &h->buf;
1475
1476 int ret = 0;
7c673cae
FG
1477 dout(10) << __func__ << " h " << h
1478 << " 0x" << std::hex << off << "~" << len << std::dec
1479 << " from " << h->file->fnode << dendl;
1480
1481 ++h->file->num_reading;
1482
1483 if (!h->ignore_eof &&
1484 off + len > h->file->fnode.size) {
1485 if (off > h->file->fnode.size)
1486 len = 0;
1487 else
1488 len = h->file->fnode.size - off;
1489 dout(20) << __func__ << " reaching (or past) eof, len clipped to 0x"
1490 << std::hex << len << std::dec << dendl;
1491 }
494da23a
TL
1492 logger->inc(l_bluefs_read_random_count, 1);
1493 logger->inc(l_bluefs_read_random_bytes, len);
7c673cae 1494
494da23a 1495 std::shared_lock s_lock(h->lock);
7c673cae 1496 while (len > 0) {
494da23a
TL
1497 if (off < buf->bl_off || off >= buf->get_buf_end()) {
1498 s_lock.unlock();
1499 uint64_t x_off = 0;
1500 auto p = h->file->fnode.seek(off, &x_off);
1501 uint64_t l = std::min(p->length - x_off, static_cast<uint64_t>(len));
1502 dout(20) << __func__ << " read random 0x"
1503 << std::hex << x_off << "~" << l << std::dec
1504 << " of " << *p << dendl;
1505 int r = bdev[p->bdev]->read_random(p->offset + x_off, l, out,
1506 cct->_conf->bluefs_buffered_io);
1507 ceph_assert(r == 0);
1508 off += l;
1509 len -= l;
1510 ret += l;
1511 out += l;
1512
1513 logger->inc(l_bluefs_read_random_disk_count, 1);
1514 logger->inc(l_bluefs_read_random_disk_bytes, l);
1515 if (len > 0) {
1516 s_lock.lock();
1517 }
1518 } else {
1519 auto left = buf->get_buf_remaining(off);
1520 int r = std::min(len, left);
1521 logger->inc(l_bluefs_read_random_buffer_count, 1);
1522 logger->inc(l_bluefs_read_random_buffer_bytes, r);
1523 dout(20) << __func__ << " left 0x" << std::hex << left
1524 << " 0x" << off << "~" << len << std::dec
1525 << dendl;
1526
1527 if (out) {
1528 // NOTE: h->bl is normally a contiguous buffer so c_str() is free.
1529 memcpy(out, buf->bl.c_str() + off - buf->bl_off, r);
1530 out += r;
1531 }
7c673cae 1532
494da23a
TL
1533 dout(30) << __func__ << " result chunk (0x"
1534 << std::hex << r << std::dec << " bytes):\n";
1535 bufferlist t;
1536 t.substr_of(buf->bl, off - buf->bl_off, r);
1537 t.hexdump(*_dout);
1538 *_dout << dendl;
1539
1540 off += r;
1541 len -= r;
1542 ret += r;
1543 buf->pos += r;
1544 }
1545 }
7c673cae
FG
1546 dout(20) << __func__ << " got " << ret << dendl;
1547 --h->file->num_reading;
1548 return ret;
1549}
1550
1551int BlueFS::_read(
1552 FileReader *h, ///< [in] read from here
1553 FileReaderBuffer *buf, ///< [in] reader state
1554 uint64_t off, ///< [in] offset
1555 size_t len, ///< [in] this many bytes
1556 bufferlist *outbl, ///< [out] optional: reference the result here
1557 char *out) ///< [out] optional: or copy it here
1558{
494da23a 1559 bool prefetch = !outbl && !out;
7c673cae
FG
1560 dout(10) << __func__ << " h " << h
1561 << " 0x" << std::hex << off << "~" << len << std::dec
494da23a
TL
1562 << " from " << h->file->fnode
1563 << (prefetch ? " prefetch" : "")
1564 << dendl;
7c673cae
FG
1565
1566 ++h->file->num_reading;
1567
1568 if (!h->ignore_eof &&
1569 off + len > h->file->fnode.size) {
1570 if (off > h->file->fnode.size)
1571 len = 0;
1572 else
1573 len = h->file->fnode.size - off;
1574 dout(20) << __func__ << " reaching (or past) eof, len clipped to 0x"
1575 << std::hex << len << std::dec << dendl;
1576 }
494da23a
TL
1577 logger->inc(l_bluefs_read_count, 1);
1578 logger->inc(l_bluefs_read_bytes, len);
1579 if (prefetch) {
1580 logger->inc(l_bluefs_read_prefetch_count, 1);
1581 logger->inc(l_bluefs_read_prefetch_bytes, len);
1582 }
1583
7c673cae
FG
1584 if (outbl)
1585 outbl->clear();
1586
1587 int ret = 0;
494da23a 1588 std::shared_lock s_lock(h->lock);
7c673cae
FG
1589 while (len > 0) {
1590 size_t left;
1591 if (off < buf->bl_off || off >= buf->get_buf_end()) {
494da23a
TL
1592 s_lock.unlock();
1593 std::unique_lock u_lock(h->lock);
1594 if (off < buf->bl_off || off >= buf->get_buf_end()) {
1595 // if precondition hasn't changed during locking upgrade.
1596 buf->bl.clear();
1597 buf->bl_off = off & super.block_mask();
1598 uint64_t x_off = 0;
1599 auto p = h->file->fnode.seek(buf->bl_off, &x_off);
1600 uint64_t want = round_up_to(len + (off & ~super.block_mask()),
1601 super.block_size);
1602 want = std::max(want, buf->max_prefetch);
1603 uint64_t l = std::min(p->length - x_off, want);
1604 uint64_t eof_offset = round_up_to(h->file->fnode.size, super.block_size);
1605 if (!h->ignore_eof &&
1606 buf->bl_off + l > eof_offset) {
1607 l = eof_offset - buf->bl_off;
1608 }
1609 dout(20) << __func__ << " fetching 0x"
1610 << std::hex << x_off << "~" << l << std::dec
1611 << " of " << *p << dendl;
1612 int r = bdev[p->bdev]->read(p->offset + x_off, l, &buf->bl, ioc[p->bdev],
1613 cct->_conf->bluefs_buffered_io);
1614 ceph_assert(r == 0);
7c673cae 1615 }
494da23a
TL
1616 u_lock.unlock();
1617 s_lock.lock();
1618 // we should recheck if buffer is valid after lock downgrade
1619 continue;
7c673cae
FG
1620 }
1621 left = buf->get_buf_remaining(off);
1622 dout(20) << __func__ << " left 0x" << std::hex << left
1623 << " len 0x" << len << std::dec << dendl;
1624
11fdf7f2 1625 int r = std::min(len, left);
7c673cae
FG
1626 if (outbl) {
1627 bufferlist t;
1628 t.substr_of(buf->bl, off - buf->bl_off, r);
1629 outbl->claim_append(t);
1630 }
1631 if (out) {
1632 // NOTE: h->bl is normally a contiguous buffer so c_str() is free.
1633 memcpy(out, buf->bl.c_str() + off - buf->bl_off, r);
1634 out += r;
1635 }
1636
1637 dout(30) << __func__ << " result chunk (0x"
1638 << std::hex << r << std::dec << " bytes):\n";
1639 bufferlist t;
1640 t.substr_of(buf->bl, off - buf->bl_off, r);
1641 t.hexdump(*_dout);
1642 *_dout << dendl;
1643
1644 off += r;
1645 len -= r;
1646 ret += r;
1647 buf->pos += r;
1648 }
1649
1650 dout(20) << __func__ << " got " << ret << dendl;
11fdf7f2 1651 ceph_assert(!outbl || (int)outbl->length() == ret);
7c673cae
FG
1652 --h->file->num_reading;
1653 return ret;
1654}
1655
1656void BlueFS::_invalidate_cache(FileRef f, uint64_t offset, uint64_t length)
1657{
1658 dout(10) << __func__ << " file " << f->fnode
1659 << " 0x" << std::hex << offset << "~" << length << std::dec
1660 << dendl;
1661 if (offset & ~super.block_mask()) {
1662 offset &= super.block_mask();
11fdf7f2 1663 length = round_up_to(length, super.block_size);
7c673cae
FG
1664 }
1665 uint64_t x_off = 0;
1666 auto p = f->fnode.seek(offset, &x_off);
1667 while (length > 0 && p != f->fnode.extents.end()) {
11fdf7f2 1668 uint64_t x_len = std::min(p->length - x_off, length);
7c673cae
FG
1669 bdev[p->bdev]->invalidate_cache(p->offset + x_off, x_len);
1670 dout(20) << __func__ << " 0x" << std::hex << x_off << "~" << x_len
1671 << std:: dec << " of " << *p << dendl;
1672 offset += x_len;
1673 length -= x_len;
1674 }
1675}
1676
1677uint64_t BlueFS::_estimate_log_size()
1678{
1679 int avg_dir_size = 40; // fixme
1680 int avg_file_size = 12;
1681 uint64_t size = 4096 * 2;
1682 size += file_map.size() * (1 + sizeof(bluefs_fnode_t));
1683 for (auto& p : block_all)
1684 size += p.num_intervals() * (1 + 1 + sizeof(uint64_t) * 2);
1685 size += dir_map.size() + (1 + avg_dir_size);
1686 size += file_map.size() * (1 + avg_dir_size + avg_file_size);
11fdf7f2 1687 return round_up_to(size, super.block_size);
7c673cae
FG
1688}
1689
1690void BlueFS::compact_log()
1691{
11fdf7f2 1692 std::unique_lock l(lock);
7c673cae
FG
1693 if (cct->_conf->bluefs_compact_log_sync) {
1694 _compact_log_sync();
1695 } else {
1696 _compact_log_async(l);
1697 }
1698}
1699
1700bool BlueFS::_should_compact_log()
1701{
1702 uint64_t current = log_writer->file->fnode.size;
1703 uint64_t expected = _estimate_log_size();
1704 float ratio = (float)current / (float)expected;
1705 dout(10) << __func__ << " current 0x" << std::hex << current
1706 << " expected " << expected << std::dec
1707 << " ratio " << ratio
1708 << (new_log ? " (async compaction in progress)" : "")
1709 << dendl;
1710 if (new_log ||
1711 current < cct->_conf->bluefs_log_compact_min_size ||
1712 ratio < cct->_conf->bluefs_log_compact_min_ratio) {
1713 return false;
1714 }
1715 return true;
1716}
1717
11fdf7f2
TL
1718void BlueFS::_compact_log_dump_metadata(bluefs_transaction_t *t,
1719 int flags)
7c673cae
FG
1720{
1721 t->seq = 1;
1722 t->uuid = super.uuid;
1723 dout(20) << __func__ << " op_init" << dendl;
1724
1725 t->op_init();
1726 for (unsigned bdev = 0; bdev < MAX_BDEV; ++bdev) {
1727 interval_set<uint64_t>& p = block_all[bdev];
1728 for (interval_set<uint64_t>::iterator q = p.begin(); q != p.end(); ++q) {
11fdf7f2
TL
1729 auto bdev_new = bdev;
1730 if ((flags & REMOVE_WAL) && bdev == BDEV_WAL) {
1731 continue;
1732 }
1733 if ((flags & REMOVE_DB) && bdev == BDEV_DB) {
1734 continue;
1735 }
1736 if ((flags & RENAME_SLOW2DB) && bdev == BDEV_SLOW) {
1737 bdev_new = BDEV_DB;
1738 }
1739 if ((flags & RENAME_DB2SLOW) && bdev == BDEV_DB) {
1740 bdev_new = BDEV_SLOW;
1741 }
1742 if (bdev == BDEV_NEWDB) {
1743 // REMOVE_DB xor RENAME_DB
1744 ceph_assert(!(flags & REMOVE_DB) != !(flags & RENAME_DB2SLOW));
1745 ceph_assert(!(flags & RENAME_SLOW2DB));
1746 bdev_new = BDEV_DB;
1747 }
1748 if (bdev == BDEV_NEWWAL) {
1749 ceph_assert(flags & REMOVE_WAL);
1750 bdev_new = BDEV_WAL;
1751 }
1752 dout(20) << __func__ << " op_alloc_add " << bdev_new << " 0x"
7c673cae
FG
1753 << std::hex << q.get_start() << "~" << q.get_len() << std::dec
1754 << dendl;
11fdf7f2 1755 t->op_alloc_add(bdev_new, q.get_start(), q.get_len());
7c673cae
FG
1756 }
1757 }
1758 for (auto& p : file_map) {
1759 if (p.first == 1)
1760 continue;
11fdf7f2
TL
1761 ceph_assert(p.first > 1);
1762
1763 for(auto& e : p.second->fnode.extents) {
1764 auto bdev = e.bdev;
1765 auto bdev_new = bdev;
1766 ceph_assert(!((flags & REMOVE_WAL) && bdev == BDEV_WAL));
1767 if ((flags & RENAME_SLOW2DB) && bdev == BDEV_SLOW) {
1768 bdev_new = BDEV_DB;
1769 }
1770 if ((flags & RENAME_DB2SLOW) && bdev == BDEV_DB) {
1771 bdev_new = BDEV_SLOW;
1772 }
1773 if (bdev == BDEV_NEWDB) {
1774 // REMOVE_DB xor RENAME_DB
1775 ceph_assert(!(flags & REMOVE_DB) != !(flags & RENAME_DB2SLOW));
1776 ceph_assert(!(flags & RENAME_SLOW2DB));
1777 bdev_new = BDEV_DB;
1778 }
1779 if (bdev == BDEV_NEWWAL) {
1780 ceph_assert(flags & REMOVE_WAL);
1781 bdev_new = BDEV_WAL;
1782 }
1783 e.bdev = bdev_new;
1784 }
7c673cae 1785 dout(20) << __func__ << " op_file_update " << p.second->fnode << dendl;
7c673cae
FG
1786 t->op_file_update(p.second->fnode);
1787 }
1788 for (auto& p : dir_map) {
1789 dout(20) << __func__ << " op_dir_create " << p.first << dendl;
1790 t->op_dir_create(p.first);
1791 for (auto& q : p.second->file_map) {
1792 dout(20) << __func__ << " op_dir_link " << p.first << "/" << q.first
1793 << " to " << q.second->fnode.ino << dendl;
1794 t->op_dir_link(p.first, q.first, q.second->fnode.ino);
1795 }
1796 }
1797}
1798
1799void BlueFS::_compact_log_sync()
1800{
1801 dout(10) << __func__ << dendl;
11fdf7f2
TL
1802 _rewrite_log_sync(true,
1803 BDEV_DB,
1804 log_writer->file->fnode.prefer_bdev,
1805 log_writer->file->fnode.prefer_bdev,
1806 0);
1807 logger->inc(l_bluefs_log_compactions);
1808}
1809
1810void BlueFS::_rewrite_log_sync(bool allocate_with_fallback,
1811 int super_dev,
1812 int log_dev,
1813 int log_dev_new,
1814 int flags)
1815{
7c673cae
FG
1816 File *log_file = log_writer->file.get();
1817
1818 // clear out log (be careful who calls us!!!)
1819 log_t.clear();
1820
11fdf7f2
TL
1821 dout(20) << __func__ << " super_dev:" << super_dev
1822 << " log_dev:" << log_dev
1823 << " log_dev_new:" << log_dev_new
1824 << " flags:" << flags
1825 << dendl;
7c673cae 1826 bluefs_transaction_t t;
11fdf7f2 1827 _compact_log_dump_metadata(&t, flags);
7c673cae
FG
1828
1829 dout(20) << __func__ << " op_jump_seq " << log_seq << dendl;
1830 t.op_jump_seq(log_seq);
1831
1832 bufferlist bl;
11fdf7f2 1833 encode(t, bl);
7c673cae
FG
1834 _pad_bl(bl);
1835
1836 uint64_t need = bl.length() + cct->_conf->bluefs_max_log_runway;
1837 dout(20) << __func__ << " need " << need << dendl;
1838
494da23a 1839 bluefs_fnode_t old_fnode;
11fdf7f2 1840 int r;
494da23a 1841 log_file->fnode.swap_extents(old_fnode);
11fdf7f2
TL
1842 if (allocate_with_fallback) {
1843 r = _allocate(log_dev, need, &log_file->fnode);
1844 ceph_assert(r == 0);
1845 } else {
1846 PExtentVector extents;
1847 r = _allocate_without_fallback(log_dev,
1848 need,
1849 &extents);
1850 ceph_assert(r == 0);
1851 for (auto& p : extents) {
1852 log_file->fnode.append_extent(
1853 bluefs_extent_t(log_dev, p.offset, p.length));
1854 }
7c673cae
FG
1855 }
1856
1857 _close_writer(log_writer);
1858
1859 log_file->fnode.size = bl.length();
1860 log_writer = _create_writer(log_file);
1861 log_writer->append(bl);
11fdf7f2
TL
1862 r = _flush(log_writer, true);
1863 ceph_assert(r == 0);
1864#ifdef HAVE_LIBAIO
1865 if (!cct->_conf->bluefs_sync_write) {
1866 list<aio_t> completed_ios;
1867 _claim_completed_aios(log_writer, &completed_ios);
1868 wait_for_aio(log_writer);
1869 completed_ios.clear();
1870 }
1871#endif
224ce89b 1872 flush_bdev();
224ce89b 1873
7c673cae 1874 super.log_fnode = log_file->fnode;
11fdf7f2
TL
1875 // rename device if needed
1876 if (log_dev != log_dev_new) {
1877 dout(10) << __func__ << " renaming log extents to " << log_dev_new << dendl;
1878 for (auto& p : super.log_fnode.extents) {
1879 p.bdev = log_dev_new;
1880 }
1881 }
1882 dout(10) << __func__ << " writing super, log fnode: " << super.log_fnode << dendl;
1883
7c673cae 1884 ++super.version;
11fdf7f2 1885 _write_super(super_dev);
7c673cae
FG
1886 flush_bdev();
1887
494da23a
TL
1888 dout(10) << __func__ << " release old log extents " << old_fnode.extents << dendl;
1889 for (auto& r : old_fnode.extents) {
7c673cae
FG
1890 pending_release[r.bdev].insert(r.offset, r.length);
1891 }
7c673cae
FG
1892}
1893
1894/*
1895 * 1. Allocate a new extent to continue the log, and then log an event
1896 * that jumps the log write position to the new extent. At this point, the
1897 * old extent(s) won't be written to, and reflect everything to compact.
1898 * New events will be written to the new region that we'll keep.
1899 *
1900 * 2. While still holding the lock, encode a bufferlist that dumps all of the
1901 * in-memory fnodes and names. This will become the new beginning of the
1902 * log. The last event will jump to the log continuation extent from #1.
1903 *
1904 * 3. Queue a write to a new extent for the new beginning of the log.
1905 *
1906 * 4. Drop lock and wait
1907 *
1908 * 5. Retake the lock.
1909 *
1910 * 6. Update the log_fnode to splice in the new beginning.
1911 *
1912 * 7. Write the new superblock.
1913 *
1914 * 8. Release the old log space. Clean up.
1915 */
// Compact the BlueFS log while other threads may keep appending to it.
//
// Outline (the numbered comments in the body follow the same steps):
//  - create new_log / new_log_writer, which also marks compaction as being
//    in progress, and wait for any in-flight log flush to finish;
//  - allocate bluefs_max_log_runway more space for the current log, record
//    an op_file_update plus an op_jump to old_log_jump_to, and flush it;
//  - dump the current metadata into a fresh transaction, allocate space for
//    it on BDEV_DB, write it through new_log_writer and wait for the I/O;
//  - drop the first old_log_jump_to bytes worth of extents from the old log,
//    append what remains behind the new log's extents and swap the result
//    into the live log fnode;
//  - write the super block, flush, queue the dropped extents in
//    pending_release and tear down the temporary writer.
//
// l: lock handle used when waiting on log_cond and passed on to
//    _flush_and_sync_log(). Allocation and flush failures are asserted on.
11fdf7f2 1916void BlueFS::_compact_log_async(std::unique_lock<ceph::mutex>& l)
7c673cae
FG
1917{
1918 dout(10) << __func__ << dendl;
1919 File *log_file = log_writer->file.get();
11fdf7f2
TL
1920 ceph_assert(!new_log);
1921 ceph_assert(!new_log_writer);
7c673cae 1922
181888fb
FG
1923 // create a new log [writer] so that we know compaction is in progress
1924 // (see _should_compact_log)
1925 new_log = new File;
1926 new_log->fnode.ino = 0; // so that _flush_range won't try to log the fnode
1927
3efd9988
FG
1928 // 0. wait for any racing flushes to complete. (We do not want to block
1929 // in _flush_sync_log with jump_to set or else a racing thread might flush
1930 // our entries and our jump_to update won't be correct.)
1931 while (log_flushing) {
1932 dout(10) << __func__ << " log is currently flushing, waiting" << dendl;
1933 log_cond.wait(l);
1934 }
1935
7c673cae
FG
1936 // 1. allocate new log space and jump to it.
1937 old_log_jump_to = log_file->fnode.get_allocated();
7c673cae 1938 dout(10) << __func__ << " old_log_jump_to 0x" << std::hex << old_log_jump_to
11fdf7f2
TL
1939 << " need 0x" << (old_log_jump_to + cct->_conf->bluefs_max_log_runway) << std::dec << dendl;
1940 int r = _allocate(log_file->fnode.prefer_bdev,
1941 cct->_conf->bluefs_max_log_runway, &log_file->fnode);
1942 ceph_assert(r == 0);
7c673cae
FG
1943 dout(10) << __func__ << " log extents " << log_file->fnode.extents << dendl;
1944
1945 // update the log file change and log a jump to the offset where we want to
1946 // write the new entries
1947 log_t.op_file_update(log_file->fnode);
1948 log_t.op_jump(log_seq, old_log_jump_to);
1949
1950 flush_bdev(); // FIXME?
1951
1952 _flush_and_sync_log(l, 0, old_log_jump_to);
1953
1954 // 2. prepare compacted log
1955 bluefs_transaction_t t;
224ce89b
WB
1956 // avoid recording the same ops twice: once from log_t and once from
// _compact_log_dump_metadata.
1957 log_t.clear();
11fdf7f2 1958 _compact_log_dump_metadata(&t, 0);
7c673cae 1959
eafe8130
TL
// round the new log's size up to the largest allocation unit of the three
// devices
1960 uint64_t max_alloc_size = std::max(alloc_size[BDEV_WAL],
1961 std::max(alloc_size[BDEV_DB],
1962 alloc_size[BDEV_SLOW]));
1963
7c673cae 1964 // conservative estimate for final encoded size
11fdf7f2 1965 new_log_jump_to = round_up_to(t.op_bl.length() + super.block_size * 2,
eafe8130 1966 max_alloc_size);
7c673cae
FG
1967 t.op_jump(log_seq, new_log_jump_to);
1968
11fdf7f2
TL
1969 // allocate
1970 r = _allocate(BlueFS::BDEV_DB, new_log_jump_to,
1971 &new_log->fnode);
1972 ceph_assert(r == 0);
1973
1974 // we might have some more ops in log_t due to _allocate call
1975 t.claim_ops(log_t);
1976
7c673cae 1977 bufferlist bl;
11fdf7f2 1978 encode(t, bl);
7c673cae
FG
1979 _pad_bl(bl);
1980
1981 dout(10) << __func__ << " new_log_jump_to 0x" << std::hex << new_log_jump_to
1982 << std::dec << dendl;
1983
7c673cae
FG
1984 new_log_writer = _create_writer(new_log);
1985 new_log_writer->append(bl);
1986
1987 // 3. flush
1988 r = _flush(new_log_writer, true);
11fdf7f2 1989 ceph_assert(r == 0);
7c673cae
FG
1990
1991 // 4. wait
11fdf7f2 1992 _flush_bdev_safely(new_log_writer);
7c673cae 1993
11fdf7f2 1994 // 5. update our log fnode
7c673cae
FG
1995 // discard first old_log_jump_to extents
1996 dout(10) << __func__ << " remove 0x" << std::hex << old_log_jump_to << std::dec
1997 << " of " << log_file->fnode.extents << dendl;
1998 uint64_t discarded = 0;
1999 mempool::bluefs::vector<bluefs_extent_t> old_extents;
2000 while (discarded < old_log_jump_to) {
11fdf7f2 2001 ceph_assert(!log_file->fnode.extents.empty());
7c673cae
FG
2002 bluefs_extent_t& e = log_file->fnode.extents.front();
2003 bluefs_extent_t temp = e;
2004 if (discarded + e.length <= old_log_jump_to) {
2005 dout(10) << __func__ << " remove old log extent " << e << dendl;
2006 discarded += e.length;
94b18763 2007 log_file->fnode.pop_front_extent();
7c673cae
FG
2008 } else {
// the jump offset falls inside this extent: trim its front in place and
// remember only the trimmed-off part (temp) for release
2009 dout(10) << __func__ << " remove front of old log extent " << e << dendl;
2010 uint64_t drop = old_log_jump_to - discarded;
2011 temp.length = drop;
2012 e.offset += drop;
2013 e.length -= drop;
2014 discarded += drop;
2015 dout(10) << __func__ << " kept " << e << " removed " << temp << dendl;
2016 }
2017 old_extents.push_back(temp);
2018 }
94b18763
FG
// whatever is left of the old log's extents is appended after the extents
// already allocated for the new log
2019 auto from = log_file->fnode.extents.begin();
2020 auto to = log_file->fnode.extents.end();
2021 while (from != to) {
2022 new_log->fnode.append_extent(*from);
2023 ++from;
2024 }
7c673cae
FG
2025
2026 // clear the extents from old log file, they are added to new log
94b18763 2027 log_file->fnode.clear_extents();
7c673cae 2028 // swap the log files. New log file is the log file now.
94b18763
FG
2029 new_log->fnode.swap_extents(log_file->fnode);
2030
7c673cae
FG
// rebase the writer position and log size: old_log_jump_to bytes were
// dropped from the front and new_log_jump_to bytes were put in their place
2031 log_writer->pos = log_writer->file->fnode.size =
2032 log_writer->pos - old_log_jump_to + new_log_jump_to;
2033
11fdf7f2 2034 // 6. write the super block to reflect the changes
7c673cae
FG
2035 dout(10) << __func__ << " writing super" << dendl;
2036 super.log_fnode = log_file->fnode;
2037 ++super.version;
11fdf7f2 2038 _write_super(BDEV_DB);
7c673cae
FG
2039
// NOTE: this unlocks/relocks the member 'lock' directly rather than going
// through the 'l' handle.
2040 lock.unlock();
2041 flush_bdev();
2042 lock.lock();
2043
11fdf7f2 2044 // 7. release old space
7c673cae
FG
2045 dout(10) << __func__ << " release old log extents " << old_extents << dendl;
2046 for (auto& r : old_extents) {
2047 pending_release[r.bdev].insert(r.offset, r.length);
2048 }
2049
2050 // delete the new log, remove from the dirty files list
2051 _close_writer(new_log_writer);
2052 if (new_log->dirty_seq) {
11fdf7f2 2053 ceph_assert(dirty_files.count(new_log->dirty_seq));
7c673cae
FG
2054 auto it = dirty_files[new_log->dirty_seq].iterator_to(*new_log);
2055 dirty_files[new_log->dirty_seq].erase(it);
2056 }
2057 new_log_writer = nullptr;
2058 new_log = nullptr;
// wake threads waiting on log_cond (e.g. those waiting for new_log_writer to
// be cleared in _flush_and_sync_log)
2059 log_cond.notify_all();
2060
2061 dout(10) << __func__ << " log extents " << log_file->fnode.extents << dendl;
2062 logger->inc(l_bluefs_log_compactions);
2063}
2064
2065void BlueFS::_pad_bl(bufferlist& bl)
2066{
2067 uint64_t partial = bl.length() % super.block_size;
2068 if (partial) {
2069 dout(10) << __func__ << " padding with 0x" << std::hex
2070 << super.block_size - partial << " zeros" << std::dec << dendl;
2071 bl.append_zero(super.block_size - partial);
2072 }
2073}
2074
2075void BlueFS::flush_log()
2076{
11fdf7f2 2077 std::unique_lock l(lock);
7c673cae
FG
2078 flush_bdev();
2079 _flush_and_sync_log(l);
2080}
2081
// Append the pending transaction log_t (plus an op_file_update for every
// file dirtied at the new sequence number) to the log, flush it, then mark
// the covered files clean and release the extents that were queued in
// pending_release when this call started.
//
// l:        lock handle used for waiting on log_cond.
// want_seq: if nonzero and already <= log_seq_stable there is nothing to do.
// jump_to:  if nonzero, once the flush has been issued the log writer's
//           position and the log fnode size are set to this offset. The
//           early-return and wait paths assert that jump_to is zero.
// Returns 0; allocation and flush failures are asserted on.
11fdf7f2 2082int BlueFS::_flush_and_sync_log(std::unique_lock<ceph::mutex>& l,
7c673cae
FG
2083 uint64_t want_seq,
2084 uint64_t jump_to)
2085{
// only one flush at a time: wait for a concurrent one to finish
2086 while (log_flushing) {
2087 dout(10) << __func__ << " want_seq " << want_seq
2088 << " log is currently flushing, waiting" << dendl;
11fdf7f2 2089 ceph_assert(!jump_to);
7c673cae
FG
2090 log_cond.wait(l);
2091 }
2092 if (want_seq && want_seq <= log_seq_stable) {
2093 dout(10) << __func__ << " want_seq " << want_seq << " <= log_seq_stable "
2094 << log_seq_stable << ", done" << dendl;
11fdf7f2 2095 ceph_assert(!jump_to);
7c673cae
FG
2096 return 0;
2097 }
2098 if (log_t.empty() && dirty_files.empty()) {
2099 dout(10) << __func__ << " want_seq " << want_seq
2100 << " " << log_t << " not dirty, dirty_files empty, no-op" << dendl;
11fdf7f2 2101 ceph_assert(!jump_to);
7c673cae
FG
2102 return 0;
2103 }
2104
a8e16298
TL
// take ownership of the extents queued for release so far; pending_release
// is left holding the same number of (empty) interval sets
2105 vector<interval_set<uint64_t>> to_release(pending_release.size());
2106 to_release.swap(pending_release);
2107
7c673cae 2108 uint64_t seq = log_t.seq = ++log_seq;
11fdf7f2 2109 ceph_assert(want_seq == 0 || want_seq <= seq);
7c673cae
FG
2110 log_t.uuid = super.uuid;
2111
2112 // log dirty files
2113 auto lsi = dirty_files.find(seq);
2114 if (lsi != dirty_files.end()) {
2115 dout(20) << __func__ << " " << lsi->second.size() << " dirty_files" << dendl;
2116 for (auto &f : lsi->second) {
2117 dout(20) << __func__ << " op_file_update " << f.fnode << dendl;
2118 log_t.op_file_update(f.fnode);
2119 }
2120 }
2121
2122 dout(10) << __func__ << " " << log_t << dendl;
11fdf7f2 2123 ceph_assert(!log_t.empty());
7c673cae
FG
2124
2125 // allocate some more space (before we run out)?
2126 int64_t runway = log_writer->file->fnode.get_allocated() -
2127 log_writer->get_effective_write_pos();
2128 if (runway < (int64_t)cct->_conf->bluefs_min_log_runway) {
2129 dout(10) << __func__ << " allocating more log runway (0x"
2130 << std::hex << runway << std::dec << " remaining)" << dendl;
// do not extend the log while an async compaction has a new log writer open
2131 while (new_log_writer) {
2132 dout(10) << __func__ << " waiting for async compaction" << dendl;
2133 log_cond.wait(l);
2134 }
2135 int r = _allocate(log_writer->file->fnode.prefer_bdev,
2136 cct->_conf->bluefs_max_log_runway,
94b18763 2137 &log_writer->file->fnode);
11fdf7f2 2138 ceph_assert(r == 0);
7c673cae
FG
2139 log_t.op_file_update(log_writer->file->fnode);
2140 }
2141
2142 bufferlist bl;
11fdf7f2
TL
2143 bl.reserve(super.block_size);
2144 encode(log_t, bl);
7c673cae 2145 // pad to block boundary
11fdf7f2
TL
// realign == block_size means the encoded length is already aligned
2146 size_t realign = super.block_size - (bl.length() % super.block_size);
2147 if (realign && realign != super.block_size)
2148 bl.append_zero(realign);
2149
7c673cae
FG
2150 logger->inc(l_bluefs_logged_bytes, bl.length());
2151
2152 log_writer->append(bl);
2153
2154 log_t.clear();
2155 log_t.seq = 0; // just so debug output is less confusing
2156 log_flushing = true;
2157
2158 int r = _flush(log_writer, true);
11fdf7f2 2159 ceph_assert(r == 0);
7c673cae
FG
2160
2161 if (jump_to) {
2162 dout(10) << __func__ << " jumping log offset from 0x" << std::hex
2163 << log_writer->pos << " -> 0x" << jump_to << std::dec << dendl;
2164 log_writer->pos = jump_to;
2165 log_writer->file->fnode.size = jump_to;
2166 }
2167
2168 _flush_bdev_safely(log_writer);
2169
2170 log_flushing = false;
2171 log_cond.notify_all();
2172
2173 // clean dirty files
2174 if (seq > log_seq_stable) {
2175 log_seq_stable = seq;
2176 dout(20) << __func__ << " log_seq_stable " << log_seq_stable << dendl;
2177
2178 auto p = dirty_files.begin();
2179 while (p != dirty_files.end()) {
2180 if (p->first > log_seq_stable) {
2181 dout(20) << __func__ << " done cleaning up dirty files" << dendl;
2182 break;
2183 }
2184
// NOTE: this iterator named 'l' shadows the lock parameter 'l' for the rest
// of the enclosing loop body.
2185 auto l = p->second.begin();
2186 while (l != p->second.end()) {
2187 File *file = &*l;
11fdf7f2
TL
2188 ceph_assert(file->dirty_seq > 0);
2189 ceph_assert(file->dirty_seq <= log_seq_stable);
7c673cae
FG
2190 dout(20) << __func__ << " cleaned file " << file->fnode << dendl;
2191 file->dirty_seq = 0;
2192 p->second.erase(l++);
2193 }
2194
11fdf7f2 2195 ceph_assert(p->second.empty());
7c673cae
FG
2196 dirty_files.erase(p++);
2197 }
2198 } else {
2199 dout(20) << __func__ << " log_seq_stable " << log_seq_stable
2200 << " already >= out seq " << seq
2201 << ", we lost a race against another log flush, done" << dendl;
2202 }
a8e16298
TL
2203
// hand the extents captured at the top back to the allocators, discarding
// them on the device first when discard is enabled
2204 for (unsigned i = 0; i < to_release.size(); ++i) {
2205 if (!to_release[i].empty()) {
2206 /* OK, now we have the guarantee alloc[i] won't be null. */
11fdf7f2
TL
2207 int r = 0;
2208 if (cct->_conf->bdev_enable_discard && cct->_conf->bdev_async_discard) {
2209 r = bdev[i]->queue_discard(to_release[i]);
// a successfully queued async discard skips the release below; presumably
// the extents are released later from the discard callback (handle_discard)
// -- TODO confirm
2210 if (r == 0)
2211 continue;
2212 } else if (cct->_conf->bdev_enable_discard) {
2213 for (auto p = to_release[i].begin(); p != to_release[i].end(); ++p) {
2214 bdev[i]->discard(p.get_start(), p.get_len());
2215 }
2216 }
a8e16298
TL
2217 alloc[i]->release(to_release[i]);
2218 }
2219 }
2220
7c673cae
FG
2221 _update_logger_stats();
2222
2223 return 0;
2224}
2225
// Write buffered data of writer h covering [offset, offset + length) to the
// block device(s) backing its file.
//
// The part of the range that lies below h->pos is skipped. If the file does
// not have enough space allocated, more is allocated (never for ino 1, which
// is asserted on) and the file is put on the dirty_files list for the next
// log sequence number. Writes are issued block-aligned: a partial tail block
// is cached in h->tail_block and re-written, zero padded, by the next call.
//
// Returns 0 on success or when there is nothing to do. An allocation failure
// calls ceph_abort_msg("bluefs enospc").
2226int BlueFS::_flush_range(FileWriter *h, uint64_t offset, uint64_t length)
2227{
2228 dout(10) << __func__ << " " << h << " pos 0x" << std::hex << h->pos
2229 << " 0x" << offset << "~" << length << std::dec
2230 << " to " << h->file->fnode << dendl;
11fdf7f2
TL
2231 ceph_assert(!h->file->deleted);
2232 ceph_assert(h->file->num_readers.load() == 0);
7c673cae
FG
2233
2234 h->buffer_appender.flush();
2235
// ino 1 is never written with buffered I/O; every other file follows the
// bluefs_buffered_io option
2236 bool buffered;
2237 if (h->file->fnode.ino == 1)
2238 buffered = false;
2239 else
2240 buffered = cct->_conf->bluefs_buffered_io;
2241
// everything requested is already below the writer position
2242 if (offset + length <= h->pos)
2243 return 0;
2244 if (offset < h->pos) {
2245 length -= h->pos - offset;
2246 offset = h->pos;
2247 dout(10) << " still need 0x"
2248 << std::hex << offset << "~" << length << std::dec
2249 << dendl;
2250 }
11fdf7f2 2251 ceph_assert(offset <= h->file->fnode.size);
7c673cae
FG
2252
2253 uint64_t allocated = h->file->fnode.get_allocated();
2254
2255 // do not bother to dirty the file if we are overwriting
2256 // previously allocated extents.
2257 bool must_dirty = false;
2258 if (allocated < offset + length) {
2259 // we should never run out of log space here; see the min runway check
2260 // in _flush_and_sync_log.
11fdf7f2 2261 ceph_assert(h->file->fnode.ino != 1);
7c673cae
FG
2262 int r = _allocate(h->file->fnode.prefer_bdev,
2263 offset + length - allocated,
94b18763 2264 &h->file->fnode);
7c673cae
FG
2265 if (r < 0) {
2266 derr << __func__ << " allocated: 0x" << std::hex << allocated
2267 << " offset: 0x" << offset << " length: 0x" << length << std::dec
2268 << dendl;
11fdf7f2 2269 ceph_abort_msg("bluefs enospc");
7c673cae
FG
2270 return r;
2271 }
7c673cae
FG
2272 if (cct->_conf->bluefs_preextend_wal_files &&
2273 h->writer_type == WRITER_WAL) {
2274 // NOTE: this *requires* that rocksdb also has log recycling
2275 // enabled and is therefore doing robust CRCs on the log
2276 // records. otherwise, we will fail to replay the rocksdb log
2277 // properly due to garbage on the device.
2278 h->file->fnode.size = h->file->fnode.get_allocated();
2279 dout(10) << __func__ << " extending WAL size to 0x" << std::hex
2280 << h->file->fnode.size << std::dec << " to include allocated"
2281 << dendl;
2282 }
2283 must_dirty = true;
2284 }
2285 if (h->file->fnode.size < offset + length) {
2286 h->file->fnode.size = offset + length;
2287 if (h->file->fnode.ino > 1) {
2288 // we do not need to dirty the log file (or it's compacting
2289 // replacement) when the file size changes because replay is
2290 // smart enough to discover it on its own.
2291 must_dirty = true;
2292 }
2293 }
2294 if (must_dirty) {
2295 h->file->fnode.mtime = ceph_clock_now();
11fdf7f2 2296 ceph_assert(h->file->fnode.ino >= 1);
7c673cae
FG
// (re)file the inode under the next log sequence number so that the next
// log flush records its fnode
2297 if (h->file->dirty_seq == 0) {
2298 h->file->dirty_seq = log_seq + 1;
2299 dirty_files[h->file->dirty_seq].push_back(*h->file);
2300 dout(20) << __func__ << " dirty_seq = " << log_seq + 1
2301 << " (was clean)" << dendl;
2302 } else {
2303 if (h->file->dirty_seq != log_seq + 1) {
2304 // need re-dirty, erase from list first
11fdf7f2 2305 ceph_assert(dirty_files.count(h->file->dirty_seq));
7c673cae
FG
2306 auto it = dirty_files[h->file->dirty_seq].iterator_to(*h->file);
2307 dirty_files[h->file->dirty_seq].erase(it);
2308 h->file->dirty_seq = log_seq + 1;
2309 dirty_files[h->file->dirty_seq].push_back(*h->file);
// NOTE: dirty_seq was already overwritten above, so the "(was ...)" value
// logged here is the new sequence number, not the previous one.
2310 dout(20) << __func__ << " dirty_seq = " << log_seq + 1
2311 << " (was " << h->file->dirty_seq << ")" << dendl;
2312 } else {
2313 dout(20) << __func__ << " dirty_seq = " << log_seq + 1
2314 << " (unchanged, do nothing) " << dendl;
2315 }
2316 }
2317 }
2318 dout(20) << __func__ << " file now " << h->file->fnode << dendl;
2319
// locate the extent containing 'offset'; x_off is the offset within it
2320 uint64_t x_off = 0;
2321 auto p = h->file->fnode.seek(offset, &x_off);
11fdf7f2 2322 ceph_assert(p != h->file->fnode.extents.end());
7c673cae
FG
2323 dout(20) << __func__ << " in " << *p << " x_off 0x"
2324 << std::hex << x_off << std::dec << dendl;
2325
// if the write starts mid-block, prepend the cached tail of the previous
// write and move the start back to the block boundary
2326 unsigned partial = x_off & ~super.block_mask();
2327 bufferlist bl;
2328 if (partial) {
2329 dout(20) << __func__ << " using partial tail 0x"
2330 << std::hex << partial << std::dec << dendl;
11fdf7f2 2331 ceph_assert(h->tail_block.length() == partial);
31f18b77 2332 bl.claim_append_piecewise(h->tail_block);
7c673cae
FG
2333 x_off -= partial;
2334 offset -= partial;
2335 length += partial;
2336 dout(20) << __func__ << " waiting for previous aio to complete" << dendl;
2337 for (auto p : h->iocv) {
2338 if (p) {
2339 p->aio_wait();
2340 }
2341 }
2342 }
2343 if (length == partial + h->buffer.length()) {
31f18b77 2344 bl.claim_append_piecewise(h->buffer);
7c673cae
FG
2345 } else {
// NOTE(review): if splice() removes the spliced bytes from h->buffer (as
// bufferlist::splice is expected to), h->buffer already holds only the
// unflushed remainder at this point, and the substr_of() below would then
// skip another 'length' bytes of it (or underflow when fewer remain). Also,
// 'length' includes 'partial' here. Confirm whether this branch is
// reachable and behaves as intended.
2346 bufferlist t;
31f18b77
FG
2347 h->buffer.splice(0, length, &t);
2348 bl.claim_append_piecewise(t);
7c673cae
FG
2349 t.substr_of(h->buffer, length, h->buffer.length() - length);
2350 h->buffer.swap(t);
2351 dout(20) << " leaving 0x" << std::hex << h->buffer.length() << std::dec
2352 << " unflushed" << dendl;
2353 }
11fdf7f2 2354 ceph_assert(bl.length() == length);
7c673cae
FG
2355
2356 switch (h->writer_type) {
2357 case WRITER_WAL:
2358 logger->inc(l_bluefs_bytes_written_wal, length);
2359 break;
2360 case WRITER_SST:
2361 logger->inc(l_bluefs_bytes_written_sst, length);
2362 break;
2363 }
2364
2365 dout(30) << "dump:\n";
2366 bl.hexdump(*_dout);
2367 *_dout << dendl;
2368
2369 h->pos = offset + length;
2370 h->tail_block.clear();
2371
// walk the extents, issuing one write per extent piece
2372 uint64_t bloff = 0;
11fdf7f2 2373 uint64_t bytes_written_slow = 0;
7c673cae 2374 while (length > 0) {
11fdf7f2 2375 uint64_t x_len = std::min(p->length - x_off, length);
7c673cae
FG
2376 bufferlist t;
2377 t.substr_of(bl, bloff, x_len);
2378 unsigned tail = x_len & ~super.block_mask();
2379 if (tail) {
2380 size_t zlen = super.block_size - tail;
2381 dout(20) << __func__ << " caching tail of 0x"
2382 << std::hex << tail
2383 << " and padding block with 0x" << zlen
2384 << std::dec << dendl;
2385 h->tail_block.substr_of(bl, bl.length() - tail, tail);
2386 if (h->file->fnode.ino > 1) {
2387 // we are using the page_aligned_appender, and can safely use
2388 // the tail of the raw buffer.
2389 const bufferptr &last = t.back();
2390 if (last.unused_tail_length() < zlen) {
2391 derr << " wtf, last is " << last << " from " << t << dendl;
11fdf7f2 2392 ceph_assert(last.unused_tail_length() >= zlen);
7c673cae
FG
2393 }
2394 bufferptr z = last;
2395 z.set_offset(last.offset() + last.length());
2396 z.set_length(zlen);
2397 z.zero();
2398 t.append(z, 0, zlen);
2399 } else {
2400 t.append_zero(zlen);
2401 }
2402 }
2403 if (cct->_conf->bluefs_sync_write) {
11fdf7f2 2404 bdev[p->bdev]->write(p->offset + x_off, t, buffered, h->write_hint);
7c673cae 2405 } else {
11fdf7f2
TL
2406 bdev[p->bdev]->aio_write(p->offset + x_off, t, h->iocv[p->bdev], buffered, h->write_hint);
2407 }
// remember which devices need a flush (consumed by _flush_bdev_safely)
2408 h->dirty_devs[p->bdev] = true;
2409 if (p->bdev == BDEV_SLOW) {
2410 bytes_written_slow += t.length();
7c673cae 2411 }
11fdf7f2 2412
7c673cae
FG
2413 bloff += x_len;
2414 length -= x_len;
2415 ++p;
2416 x_off = 0;
2417 }
11fdf7f2 2418 logger->inc(l_bluefs_bytes_written_slow, bytes_written_slow);
7c673cae
FG
// submit whatever aio was queued above, per device
2419 for (unsigned i = 0; i < MAX_BDEV; ++i) {
2420 if (bdev[i]) {
11fdf7f2 2421 if (h->iocv[i] && h->iocv[i]->has_pending_aios()) {
7c673cae
FG
2422 bdev[i]->aio_submit(h->iocv[i]);
2423 }
2424 }
2425 }
2426 dout(20) << __func__ << " h " << h << " pos now 0x"
2427 << std::hex << h->pos << std::dec << dendl;
2428 return 0;
2429}
2430
11fdf7f2 2431#ifdef HAVE_LIBAIO
7c673cae
FG
2432// we need to retire old completed aios so they don't stick around in
2433// memory indefinitely (along with their bufferlist refs).
2434void BlueFS::_claim_completed_aios(FileWriter *h, list<aio_t> *ls)
2435{
2436 for (auto p : h->iocv) {
2437 if (p) {
2438 ls->splice(ls->end(), p->running_aios);
2439 }
2440 }
2441 dout(10) << __func__ << " got " << ls->size() << " aios" << dendl;
2442}
2443
2444void BlueFS::wait_for_aio(FileWriter *h)
2445{
2446 // NOTE: this is safe to call without a lock, as long as our reference is
2447 // stable.
2448 dout(10) << __func__ << " " << h << dendl;
2449 utime_t start = ceph_clock_now();
2450 for (auto p : h->iocv) {
2451 if (p) {
2452 p->aio_wait();
2453 }
2454 }
11fdf7f2 2455 dout(10) << __func__ << " " << h << " done in " << (ceph_clock_now() - start) << dendl;
7c673cae 2456}
11fdf7f2 2457#endif
7c673cae
FG
2458
2459int BlueFS::_flush(FileWriter *h, bool force)
2460{
2461 h->buffer_appender.flush();
2462 uint64_t length = h->buffer.length();
2463 uint64_t offset = h->pos;
2464 if (!force &&
2465 length < cct->_conf->bluefs_min_flush_size) {
2466 dout(10) << __func__ << " " << h << " ignoring, length " << length
2467 << " < min_flush_size " << cct->_conf->bluefs_min_flush_size
2468 << dendl;
2469 return 0;
2470 }
2471 if (length == 0) {
2472 dout(10) << __func__ << " " << h << " no dirty data on "
2473 << h->file->fnode << dendl;
2474 return 0;
2475 }
2476 dout(10) << __func__ << " " << h << " 0x"
2477 << std::hex << offset << "~" << length << std::dec
2478 << " to " << h->file->fnode << dendl;
11fdf7f2 2479 ceph_assert(h->pos <= h->file->fnode.size);
7c673cae
FG
2480 return _flush_range(h, offset, length);
2481}
2482
2483int BlueFS::_truncate(FileWriter *h, uint64_t offset)
2484{
2485 dout(10) << __func__ << " 0x" << std::hex << offset << std::dec
2486 << " file " << h->file->fnode << dendl;
2487 if (h->file->deleted) {
2488 dout(10) << __func__ << " deleted, no-op" << dendl;
2489 return 0;
2490 }
2491
2492 // we never truncate internal log files
11fdf7f2 2493 ceph_assert(h->file->fnode.ino > 1);
7c673cae
FG
2494
2495 h->buffer_appender.flush();
2496
2497 // truncate off unflushed data?
2498 if (h->pos < offset &&
2499 h->pos + h->buffer.length() > offset) {
2500 bufferlist t;
2501 dout(20) << __func__ << " tossing out last " << offset - h->pos
2502 << " unflushed bytes" << dendl;
2503 t.substr_of(h->buffer, 0, offset - h->pos);
2504 h->buffer.swap(t);
11fdf7f2 2505 ceph_abort_msg("actually this shouldn't happen");
7c673cae
FG
2506 }
2507 if (h->buffer.length()) {
2508 int r = _flush(h, true);
2509 if (r < 0)
2510 return r;
2511 }
2512 if (offset == h->file->fnode.size) {
2513 return 0; // no-op!
2514 }
2515 if (offset > h->file->fnode.size) {
11fdf7f2 2516 ceph_abort_msg("truncate up not supported");
7c673cae 2517 }
11fdf7f2 2518 ceph_assert(h->file->fnode.size >= offset);
7c673cae
FG
2519 h->file->fnode.size = offset;
2520 log_t.op_file_update(h->file->fnode);
2521 return 0;
2522}
2523
11fdf7f2 2524int BlueFS::_fsync(FileWriter *h, std::unique_lock<ceph::mutex>& l)
7c673cae
FG
2525{
2526 dout(10) << __func__ << " " << h << " " << h->file->fnode << dendl;
2527 int r = _flush(h, true);
2528 if (r < 0)
2529 return r;
2530 uint64_t old_dirty_seq = h->file->dirty_seq;
2531
2532 _flush_bdev_safely(h);
2533
2534 if (old_dirty_seq) {
2535 uint64_t s = log_seq;
2536 dout(20) << __func__ << " file metadata was dirty (" << old_dirty_seq
2537 << ") on " << h->file->fnode << ", flushing log" << dendl;
2538 _flush_and_sync_log(l, old_dirty_seq);
11fdf7f2 2539 ceph_assert(h->file->dirty_seq == 0 || // cleaned
7c673cae
FG
2540 h->file->dirty_seq > s); // or redirtied by someone else
2541 }
2542 return 0;
2543}
2544
// Wait for the outstanding I/O of writer h and flush the devices it has
// written to, dropping the member 'lock' while doing so.
//
// The set of dirty devices is snapshotted and cleared up front. With libaio
// and bluefs_sync_write disabled, the writer's running aios are claimed
// first, waited for with the lock released, and only then are the devices
// flushed; otherwise only the device flush is done. The lock is re-acquired
// before returning in both cases.
2545void BlueFS::_flush_bdev_safely(FileWriter *h)
2546{
11fdf7f2
TL
2547 std::array<bool, MAX_BDEV> flush_devs = h->dirty_devs;
2548 h->dirty_devs.fill(false);
2549#ifdef HAVE_LIBAIO
7c673cae
FG
2550 if (!cct->_conf->bluefs_sync_write) {
2551 list<aio_t> completed_ios;
2552 _claim_completed_aios(h, &completed_ios);
2553 lock.unlock();
2554 wait_for_aio(h);
// the claimed aio_t objects are destroyed here, outside the lock
2555 completed_ios.clear();
11fdf7f2 2556 flush_bdev(flush_devs);
7c673cae 2557 lock.lock();
11fdf7f2
TL
// the 'else' below pairs with the unconditional block after #endif, so that
// block is the only code path when HAVE_LIBAIO is not defined
2558 } else
2559#endif
2560 {
7c673cae 2561 lock.unlock();
11fdf7f2 2562 flush_bdev(flush_devs);
7c673cae
FG
2563 lock.lock();
2564 }
2565}
2566
11fdf7f2
TL
2567void BlueFS::flush_bdev(std::array<bool, MAX_BDEV>& dirty_bdevs)
2568{
2569 // NOTE: this is safe to call without a lock.
2570 dout(20) << __func__ << dendl;
2571 for (unsigned i = 0; i < MAX_BDEV; i++) {
2572 if (dirty_bdevs[i])
2573 bdev[i]->flush();
2574 }
2575}
2576
7c673cae
FG
2577void BlueFS::flush_bdev()
2578{
2579 // NOTE: this is safe to call without a lock.
2580 dout(20) << __func__ << dendl;
2581 for (auto p : bdev) {
2582 if (p)
2583 p->flush();
2584 }
2585}
2586
eafe8130
TL
2587const char* BlueFS::get_device_name(unsigned id)
2588{
2589 if (id >= MAX_BDEV) return "BDEV_INV";
2590 const char* names[] = {"BDEV_WAL", "BDEV_DB", "BDEV_SLOW", "BDEV_NEWWAL", "BDEV_NEWDB"};
2591 return names[id];
2592}
2593
11fdf7f2
TL
2594int BlueFS::_expand_slow_device(uint64_t need, PExtentVector& extents)
2595{
2596 int r = -ENOSPC;
2597 if (slow_dev_expander) {
11fdf7f2 2598 int id = _get_slow_device_id();
eafe8130 2599 auto min_alloc_size = alloc_size[id];
11fdf7f2
TL
2600 ceph_assert(id <= (int)alloc.size() && alloc[id]);
2601 auto min_need = round_up_to(need, min_alloc_size);
2602 need = std::max(need,
2603 slow_dev_expander->get_recommended_expansion_delta(
2604 alloc[id]->get_free(), block_all[id].size()));
2605
2606 need = round_up_to(need, min_alloc_size);
2607 dout(10) << __func__ << " expanding slow device by 0x"
2608 << std::hex << need << std::dec
2609 << dendl;
2610 r = slow_dev_expander->allocate_freespace(min_need, need, extents);
2611 }
2612 return r;
2613}
2614
2615int BlueFS::_allocate_without_fallback(uint8_t id, uint64_t len,
2616 PExtentVector* extents)
2617{
2618 dout(10) << __func__ << " len 0x" << std::hex << len << std::dec
2619 << " from " << (int)id << dendl;
2620 assert(id < alloc.size());
11fdf7f2
TL
2621 if (!alloc[id]) {
2622 return -ENOENT;
2623 }
2624 extents->reserve(4); // 4 should be (more than) enough for most allocations
eafe8130
TL
2625 uint64_t min_alloc_size = alloc_size[id];
2626 uint64_t left = round_up_to(len, min_alloc_size);
11fdf7f2 2627 int64_t alloc_len = alloc[id]->allocate(left, min_alloc_size, 0, extents);
eafe8130
TL
2628 if (alloc_len < 0 || alloc_len < (int64_t)left) {
2629 if (alloc_len > 0) {
11fdf7f2
TL
2630 alloc[id]->release(*extents);
2631 }
2632 if (bdev[id])
2633 derr << __func__ << " failed to allocate 0x" << std::hex << left
2634 << " on bdev " << (int)id
2635 << ", free 0x" << alloc[id]->get_free() << std::dec << dendl;
2636 else
2637 derr << __func__ << " failed to allocate 0x" << std::hex << left
2638 << " on bdev " << (int)id << ", dne" << std::dec << dendl;
2639 if (alloc[id])
2640 alloc[id]->dump();
2641 return -ENOSPC;
2642 }
2643
2644 return 0;
2645}
2646
// Allocate at least 'len' bytes for 'node', preferring device 'id', and
// append the resulting extents to the node.
//
// If device 'id' has no allocator or cannot satisfy the request (rounded up
// to its allocation unit), any partial allocation is released and the
// request is retried on device id + 1. Once BDEV_SLOW fails as well, the
// slow device expander is asked for more space and the allocation is retried
// once on the slow device.
//
// Returns 0 on success and -ENOSPC if nothing could satisfy the request.
7c673cae 2647int BlueFS::_allocate(uint8_t id, uint64_t len,
94b18763 2648 bluefs_fnode_t* node)
7c673cae
FG
2649{
2650 dout(10) << __func__ << " len 0x" << std::hex << len << std::dec
2651 << " from " << (int)id << dendl;
11fdf7f2 2652 ceph_assert(id < alloc.size());
b32b8144 2653 int64_t alloc_len = 0;
a8e16298 2654 PExtentVector extents;
11fdf7f2 2655 uint64_t hint = 0;
7c673cae 2656 if (alloc[id]) {
94b18763
FG
// hint at the end of the node's last extent when it lives on this device
2657 if (!node->extents.empty() && node->extents.back().bdev == id) {
2658 hint = node->extents.back().end();
11fdf7f2 2659 }
b32b8144 2660 extents.reserve(4); // 4 should be (more than) enough for most allocations
eafe8130
TL
2661 alloc_len = alloc[id]->allocate(round_up_to(len, alloc_size[id]),
2662 alloc_size[id], hint, &extents);
b32b8144 2663 }
eafe8130
TL
2664 if (!alloc[id] ||
2665 alloc_len < 0 ||
2666 alloc_len < (int64_t)round_up_to(len, alloc_size[id])) {
// alloc_len stays 0 when there is no allocator, so this release is only
// reached with a non-null alloc[id]
11fdf7f2 2667 if (alloc_len > 0) {
a8e16298 2668 alloc[id]->release(extents);
b32b8144 2669 }
7c673cae
FG
2670 if (id != BDEV_SLOW) {
2671 if (bdev[id]) {
// NOTE(review): this dereferences alloc[id] whenever bdev[id] is set;
// presumably a device that is present always has an allocator -- confirm
eafe8130 2672 dout(1) << __func__ << " failed to allocate 0x" << std::hex << len
7c673cae
FG
2673 << " on bdev " << (int)id
2674 << ", free 0x" << alloc[id]->get_free()
2675 << "; fallback to bdev " << (int)id + 1
2676 << std::dec << dendl;
2677 }
// fall back to the next device id; that call appends to 'node' itself
94b18763 2678 return _allocate(id + 1, len, node);
7c673cae 2679 }
eafe8130 2680 dout(1) << __func__ << " unable to allocate 0x" << std::hex << len
11fdf7f2
TL
2681 << " on bdev " << (int)id << ", free 0x"
2682 << (alloc[id] ? alloc[id]->get_free() : (uint64_t)-1)
2683 << "; fallback to slow device expander "
2684 << std::dec << dendl;
2685 extents.clear();
eafe8130 2686 if (_expand_slow_device(len, extents) == 0) {
11fdf7f2
TL
// hand the space obtained from the expander to the slow device, then reuse
// 'extents' for the retried allocation
2687 id = _get_slow_device_id();
2688 for (auto& e : extents) {
2689 _add_block_extent(id, e.offset, e.length);
2690 }
2691 extents.clear();
2692 auto* last_alloc = alloc[id];
2693 ceph_assert(last_alloc);
2694 // try again
eafe8130
TL
2695 alloc_len = last_alloc->allocate(round_up_to(len, alloc_size[id]),
2696 alloc_size[id], hint, &extents);
// NOTE: this check compares against 'len', whereas the first attempt above
// compares against the rounded-up length
2697 if (alloc_len < 0 || alloc_len < (int64_t)len) {
11fdf7f2
TL
2698 if (alloc_len > 0) {
2699 last_alloc->release(extents);
2700 }
eafe8130 2701 derr << __func__ << " failed to allocate 0x" << std::hex << len
11fdf7f2
TL
2702 << " on bdev " << (int)id
2703 << ", free 0x" << last_alloc->get_free() << std::dec << dendl;
2704 return -ENOSPC;
2705 }
2706 } else {
2707 derr << __func__ << " failed to expand slow device to fit +0x"
eafe8130 2708 << std::hex << len << std::dec
11fdf7f2
TL
2709 << dendl;
2710 return -ENOSPC;
2711 }
2712 } else {
// first attempt succeeded: track the high-water mark of space in use on
// this device
2713 uint64_t total_allocated =
2714 block_all[id].size() - alloc[id]->get_free();
2715 if (max_bytes[id] < total_allocated) {
2716 logger->set(max_bytes_pcounters[id], total_allocated);
2717 max_bytes[id] = total_allocated;
2718 }
7c673cae
FG
2719 }
2720
2721 for (auto& p : extents) {
94b18763 2722 node->append_extent(bluefs_extent_t(id, p.offset, p.length));
7c673cae
FG
2723 }
2724
2725 return 0;
2726}
2727
2728int BlueFS::_preallocate(FileRef f, uint64_t off, uint64_t len)
2729{
2730 dout(10) << __func__ << " file " << f->fnode << " 0x"
2731 << std::hex << off << "~" << len << std::dec << dendl;
2732 if (f->deleted) {
2733 dout(10) << __func__ << " deleted, no-op" << dendl;
2734 return 0;
2735 }
11fdf7f2 2736 ceph_assert(f->fnode.ino > 1);
7c673cae
FG
2737 uint64_t allocated = f->fnode.get_allocated();
2738 if (off + len > allocated) {
2739 uint64_t want = off + len - allocated;
94b18763 2740 int r = _allocate(f->fnode.prefer_bdev, want, &f->fnode);
7c673cae
FG
2741 if (r < 0)
2742 return r;
7c673cae
FG
2743 log_t.op_file_update(f->fnode);
2744 }
2745 return 0;
2746}
2747
2748void BlueFS::sync_metadata()
2749{
11fdf7f2 2750 std::unique_lock l(lock);
7c673cae
FG
2751 if (log_t.empty()) {
2752 dout(10) << __func__ << " - no pending log events" << dendl;
11fdf7f2
TL
2753 } else {
2754 dout(10) << __func__ << dendl;
2755 utime_t start = ceph_clock_now();
2756 flush_bdev(); // FIXME?
2757 _flush_and_sync_log(l);
2758 dout(10) << __func__ << " done in " << (ceph_clock_now() - start) << dendl;
7c673cae 2759 }
7c673cae
FG
2760
2761 if (_should_compact_log()) {
2762 if (cct->_conf->bluefs_compact_log_sync) {
2763 _compact_log_sync();
2764 } else {
2765 _compact_log_async(l);
2766 }
2767 }
7c673cae
FG
2768}
2769
2770int BlueFS::open_for_write(
2771 const string& dirname,
2772 const string& filename,
2773 FileWriter **h,
2774 bool overwrite)
2775{
11fdf7f2 2776 std::lock_guard l(lock);
7c673cae
FG
2777 dout(10) << __func__ << " " << dirname << "/" << filename << dendl;
2778 map<string,DirRef>::iterator p = dir_map.find(dirname);
2779 DirRef dir;
2780 if (p == dir_map.end()) {
2781 // implicitly create the dir
2782 dout(20) << __func__ << " dir " << dirname
2783 << " does not exist" << dendl;
2784 return -ENOENT;
2785 } else {
2786 dir = p->second;
2787 }
2788
2789 FileRef file;
2790 bool create = false;
2791 map<string,FileRef>::iterator q = dir->file_map.find(filename);
2792 if (q == dir->file_map.end()) {
2793 if (overwrite) {
2794 dout(20) << __func__ << " dir " << dirname << " (" << dir
2795 << ") file " << filename
2796 << " does not exist" << dendl;
2797 return -ENOENT;
2798 }
2799 file = new File;
2800 file->fnode.ino = ++ino_last;
2801 file_map[ino_last] = file;
2802 dir->file_map[filename] = file;
2803 ++file->refs;
2804 create = true;
2805 } else {
2806 // overwrite existing file?
2807 file = q->second;
2808 if (overwrite) {
2809 dout(20) << __func__ << " dir " << dirname << " (" << dir
2810 << ") file " << filename
2811 << " already exists, overwrite in place" << dendl;
2812 } else {
2813 dout(20) << __func__ << " dir " << dirname << " (" << dir
2814 << ") file " << filename
2815 << " already exists, truncate + overwrite" << dendl;
2816 file->fnode.size = 0;
2817 for (auto& p : file->fnode.extents) {
2818 pending_release[p.bdev].insert(p.offset, p.length);
2819 }
94b18763
FG
2820
2821 file->fnode.clear_extents();
7c673cae
FG
2822 }
2823 }
11fdf7f2 2824 ceph_assert(file->fnode.ino > 1);
7c673cae
FG
2825
2826 file->fnode.mtime = ceph_clock_now();
2827 file->fnode.prefer_bdev = BlueFS::BDEV_DB;
2828 if (dirname.length() > 5) {
2829 // the "db.slow" and "db.wal" directory names are hard-coded at
2830 // match up with bluestore. the slow device is always the second
2831 // one (when a dedicated block.db device is present and used at
2832 // bdev 0). the wal device is always last.
31f18b77 2833 if (boost::algorithm::ends_with(dirname, ".slow")) {
7c673cae
FG
2834 file->fnode.prefer_bdev = BlueFS::BDEV_SLOW;
2835 } else if (boost::algorithm::ends_with(dirname, ".wal")) {
2836 file->fnode.prefer_bdev = BlueFS::BDEV_WAL;
2837 }
2838 }
2839 dout(20) << __func__ << " mapping " << dirname << "/" << filename
2840 << " to bdev " << (int)file->fnode.prefer_bdev << dendl;
2841
2842 log_t.op_file_update(file->fnode);
2843 if (create)
2844 log_t.op_dir_link(dirname, filename, file->fnode.ino);
2845
2846 *h = _create_writer(file);
2847
2848 if (boost::algorithm::ends_with(filename, ".log")) {
2849 (*h)->writer_type = BlueFS::WRITER_WAL;
2850 if (logger && !overwrite) {
2851 logger->inc(l_bluefs_files_written_wal);
2852 }
2853 } else if (boost::algorithm::ends_with(filename, ".sst")) {
2854 (*h)->writer_type = BlueFS::WRITER_SST;
2855 if (logger) {
2856 logger->inc(l_bluefs_files_written_sst);
2857 }
2858 }
2859
2860 dout(10) << __func__ << " h " << *h << " on " << file->fnode << dendl;
2861 return 0;
2862}
2863
2864BlueFS::FileWriter *BlueFS::_create_writer(FileRef f)
2865{
2866 FileWriter *w = new FileWriter(f);
2867 for (unsigned i = 0; i < MAX_BDEV; ++i) {
2868 if (bdev[i]) {
2869 w->iocv[i] = new IOContext(cct, NULL);
7c673cae
FG
2870 }
2871 }
2872 return w;
2873}
2874
2875void BlueFS::_close_writer(FileWriter *h)
2876{
2877 dout(10) << __func__ << " " << h << " type " << h->writer_type << dendl;
2878 for (unsigned i=0; i<MAX_BDEV; ++i) {
2879 if (bdev[i]) {
11fdf7f2
TL
2880 if (h->iocv[i]) {
2881 h->iocv[i]->aio_wait();
2882 bdev[i]->queue_reap_ioc(h->iocv[i]);
2883 }
7c673cae
FG
2884 }
2885 }
2886 delete h;
2887}
2888
2889int BlueFS::open_for_read(
2890 const string& dirname,
2891 const string& filename,
2892 FileReader **h,
2893 bool random)
2894{
11fdf7f2 2895 std::lock_guard l(lock);
7c673cae
FG
2896 dout(10) << __func__ << " " << dirname << "/" << filename
2897 << (random ? " (random)":" (sequential)") << dendl;
2898 map<string,DirRef>::iterator p = dir_map.find(dirname);
2899 if (p == dir_map.end()) {
2900 dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
2901 return -ENOENT;
2902 }
2903 DirRef dir = p->second;
2904
2905 map<string,FileRef>::iterator q = dir->file_map.find(filename);
2906 if (q == dir->file_map.end()) {
2907 dout(20) << __func__ << " dir " << dirname << " (" << dir
2908 << ") file " << filename
2909 << " not found" << dendl;
2910 return -ENOENT;
2911 }
2912 File *file = q->second.get();
2913
2914 *h = new FileReader(file, random ? 4096 : cct->_conf->bluefs_max_prefetch,
2915 random, false);
2916 dout(10) << __func__ << " h " << *h << " on " << file->fnode << dendl;
2917 return 0;
2918}
2919
2920int BlueFS::rename(
2921 const string& old_dirname, const string& old_filename,
2922 const string& new_dirname, const string& new_filename)
2923{
11fdf7f2 2924 std::lock_guard l(lock);
7c673cae
FG
2925 dout(10) << __func__ << " " << old_dirname << "/" << old_filename
2926 << " -> " << new_dirname << "/" << new_filename << dendl;
2927 map<string,DirRef>::iterator p = dir_map.find(old_dirname);
2928 if (p == dir_map.end()) {
2929 dout(20) << __func__ << " dir " << old_dirname << " not found" << dendl;
2930 return -ENOENT;
2931 }
2932 DirRef old_dir = p->second;
2933 map<string,FileRef>::iterator q = old_dir->file_map.find(old_filename);
2934 if (q == old_dir->file_map.end()) {
2935 dout(20) << __func__ << " dir " << old_dirname << " (" << old_dir
2936 << ") file " << old_filename
2937 << " not found" << dendl;
2938 return -ENOENT;
2939 }
2940 FileRef file = q->second;
2941
2942 p = dir_map.find(new_dirname);
2943 if (p == dir_map.end()) {
2944 dout(20) << __func__ << " dir " << new_dirname << " not found" << dendl;
2945 return -ENOENT;
2946 }
2947 DirRef new_dir = p->second;
2948 q = new_dir->file_map.find(new_filename);
2949 if (q != new_dir->file_map.end()) {
2950 dout(20) << __func__ << " dir " << new_dirname << " (" << old_dir
2951 << ") file " << new_filename
2952 << " already exists, unlinking" << dendl;
11fdf7f2 2953 ceph_assert(q->second != file);
7c673cae
FG
2954 log_t.op_dir_unlink(new_dirname, new_filename);
2955 _drop_link(q->second);
2956 }
2957
2958 dout(10) << __func__ << " " << new_dirname << "/" << new_filename << " "
2959 << " " << file->fnode << dendl;
2960
2961 new_dir->file_map[new_filename] = file;
2962 old_dir->file_map.erase(old_filename);
2963
2964 log_t.op_dir_link(new_dirname, new_filename, file->fnode.ino);
2965 log_t.op_dir_unlink(old_dirname, old_filename);
2966 return 0;
2967}
2968
2969int BlueFS::mkdir(const string& dirname)
2970{
11fdf7f2 2971 std::lock_guard l(lock);
7c673cae
FG
2972 dout(10) << __func__ << " " << dirname << dendl;
2973 map<string,DirRef>::iterator p = dir_map.find(dirname);
2974 if (p != dir_map.end()) {
2975 dout(20) << __func__ << " dir " << dirname << " exists" << dendl;
2976 return -EEXIST;
2977 }
2978 dir_map[dirname] = new Dir;
2979 log_t.op_dir_create(dirname);
2980 return 0;
2981}
2982
2983int BlueFS::rmdir(const string& dirname)
2984{
11fdf7f2 2985 std::lock_guard l(lock);
7c673cae
FG
2986 dout(10) << __func__ << " " << dirname << dendl;
2987 map<string,DirRef>::iterator p = dir_map.find(dirname);
2988 if (p == dir_map.end()) {
2989 dout(20) << __func__ << " dir " << dirname << " does not exist" << dendl;
2990 return -ENOENT;
2991 }
2992 DirRef dir = p->second;
2993 if (!dir->file_map.empty()) {
2994 dout(20) << __func__ << " dir " << dirname << " not empty" << dendl;
2995 return -ENOTEMPTY;
2996 }
2997 dir_map.erase(dirname);
2998 log_t.op_dir_remove(dirname);
2999 return 0;
3000}
3001
3002bool BlueFS::dir_exists(const string& dirname)
3003{
11fdf7f2 3004 std::lock_guard l(lock);
7c673cae
FG
3005 map<string,DirRef>::iterator p = dir_map.find(dirname);
3006 bool exists = p != dir_map.end();
3007 dout(10) << __func__ << " " << dirname << " = " << (int)exists << dendl;
3008 return exists;
3009}
3010
3011int BlueFS::stat(const string& dirname, const string& filename,
3012 uint64_t *size, utime_t *mtime)
3013{
11fdf7f2 3014 std::lock_guard l(lock);
7c673cae
FG
3015 dout(10) << __func__ << " " << dirname << "/" << filename << dendl;
3016 map<string,DirRef>::iterator p = dir_map.find(dirname);
3017 if (p == dir_map.end()) {
3018 dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
3019 return -ENOENT;
3020 }
3021 DirRef dir = p->second;
3022 map<string,FileRef>::iterator q = dir->file_map.find(filename);
3023 if (q == dir->file_map.end()) {
3024 dout(20) << __func__ << " dir " << dirname << " (" << dir
3025 << ") file " << filename
3026 << " not found" << dendl;
3027 return -ENOENT;
3028 }
3029 File *file = q->second.get();
3030 dout(10) << __func__ << " " << dirname << "/" << filename
3031 << " " << file->fnode << dendl;
3032 if (size)
3033 *size = file->fnode.size;
3034 if (mtime)
3035 *mtime = file->fnode.mtime;
3036 return 0;
3037}
3038
3039int BlueFS::lock_file(const string& dirname, const string& filename,
3040 FileLock **plock)
3041{
11fdf7f2 3042 std::lock_guard l(lock);
7c673cae
FG
3043 dout(10) << __func__ << " " << dirname << "/" << filename << dendl;
3044 map<string,DirRef>::iterator p = dir_map.find(dirname);
3045 if (p == dir_map.end()) {
3046 dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
3047 return -ENOENT;
3048 }
3049 DirRef dir = p->second;
3050 map<string,FileRef>::iterator q = dir->file_map.find(filename);
3051 File *file;
3052 if (q == dir->file_map.end()) {
3053 dout(20) << __func__ << " dir " << dirname << " (" << dir
3054 << ") file " << filename
3055 << " not found, creating" << dendl;
3056 file = new File;
3057 file->fnode.ino = ++ino_last;
3058 file->fnode.mtime = ceph_clock_now();
3059 file_map[ino_last] = file;
3060 dir->file_map[filename] = file;
3061 ++file->refs;
3062 log_t.op_file_update(file->fnode);
3063 log_t.op_dir_link(dirname, filename, file->fnode.ino);
3064 } else {
3065 file = q->second.get();
3066 if (file->locked) {
3067 dout(10) << __func__ << " already locked" << dendl;
11fdf7f2 3068 return -ENOLCK;
7c673cae
FG
3069 }
3070 }
3071 file->locked = true;
3072 *plock = new FileLock(file);
3073 dout(10) << __func__ << " locked " << file->fnode
3074 << " with " << *plock << dendl;
3075 return 0;
3076}
3077
3078int BlueFS::unlock_file(FileLock *fl)
3079{
11fdf7f2 3080 std::lock_guard l(lock);
7c673cae 3081 dout(10) << __func__ << " " << fl << " on " << fl->file->fnode << dendl;
11fdf7f2 3082 ceph_assert(fl->file->locked);
7c673cae
FG
3083 fl->file->locked = false;
3084 delete fl;
3085 return 0;
3086}
3087
3088int BlueFS::readdir(const string& dirname, vector<string> *ls)
3089{
11fdf7f2 3090 std::lock_guard l(lock);
7c673cae
FG
3091 dout(10) << __func__ << " " << dirname << dendl;
3092 if (dirname.empty()) {
3093 // list dirs
3094 ls->reserve(dir_map.size() + 2);
3095 for (auto& q : dir_map) {
3096 ls->push_back(q.first);
3097 }
3098 } else {
3099 // list files in dir
3100 map<string,DirRef>::iterator p = dir_map.find(dirname);
3101 if (p == dir_map.end()) {
3102 dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
3103 return -ENOENT;
3104 }
3105 DirRef dir = p->second;
3106 ls->reserve(dir->file_map.size() + 2);
3107 for (auto& q : dir->file_map) {
3108 ls->push_back(q.first);
3109 }
3110 }
3111 ls->push_back(".");
3112 ls->push_back("..");
3113 return 0;
3114}
3115
3116int BlueFS::unlink(const string& dirname, const string& filename)
3117{
11fdf7f2 3118 std::lock_guard l(lock);
7c673cae
FG
3119 dout(10) << __func__ << " " << dirname << "/" << filename << dendl;
3120 map<string,DirRef>::iterator p = dir_map.find(dirname);
3121 if (p == dir_map.end()) {
3122 dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
3123 return -ENOENT;
3124 }
3125 DirRef dir = p->second;
3126 map<string,FileRef>::iterator q = dir->file_map.find(filename);
3127 if (q == dir->file_map.end()) {
3128 dout(20) << __func__ << " file " << dirname << "/" << filename
3129 << " not found" << dendl;
3130 return -ENOENT;
3131 }
3132 FileRef file = q->second;
3133 if (file->locked) {
3134 dout(20) << __func__ << " file " << dirname << "/" << filename
3135 << " is locked" << dendl;
3136 return -EBUSY;
3137 }
3138 dir->file_map.erase(filename);
3139 log_t.op_dir_unlink(dirname, filename);
3140 _drop_link(file);
3141 return 0;
3142}
d2e6a577
FG
3143
3144bool BlueFS::wal_is_rotational()
3145{
94b18763
FG
3146 if (bdev[BDEV_WAL]) {
3147 return bdev[BDEV_WAL]->is_rotational();
3148 } else if (bdev[BDEV_DB]) {
3149 return bdev[BDEV_DB]->is_rotational();
3150 }
3151 return bdev[BDEV_SLOW]->is_rotational();
d2e6a577 3152}