]> git.proxmox.com Git - ceph.git/blame - ceph/src/os/bluestore/BlueFS.cc
import 14.2.4 nautilus point release
[ceph.git] / ceph / src / os / bluestore / BlueFS.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3
4#include "boost/algorithm/string.hpp"
5#include "BlueFS.h"
6
7#include "common/debug.h"
8#include "common/errno.h"
9#include "common/perf_counters.h"
10#include "BlockDevice.h"
11#include "Allocator.h"
11fdf7f2 12#include "include/ceph_assert.h"
7c673cae
FG
13
14#define dout_context cct
15#define dout_subsys ceph_subsys_bluefs
16#undef dout_prefix
17#define dout_prefix *_dout << "bluefs "
18
19MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::File, bluefs_file, bluefs);
20MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::Dir, bluefs_dir, bluefs);
21MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileWriter, bluefs_file_writer, bluefs);
22MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileReaderBuffer,
23 bluefs_file_reader_buffer, bluefs);
24MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileReader, bluefs_file_reader, bluefs);
25MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileLock, bluefs_file_lock, bluefs);
26
11fdf7f2
TL
27static void wal_discard_cb(void *priv, void* priv2) {
28 BlueFS *bluefs = static_cast<BlueFS*>(priv);
29 interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2);
30 bluefs->handle_discard(BlueFS::BDEV_WAL, *tmp);
31}
32
33static void db_discard_cb(void *priv, void* priv2) {
34 BlueFS *bluefs = static_cast<BlueFS*>(priv);
35 interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2);
36 bluefs->handle_discard(BlueFS::BDEV_DB, *tmp);
37}
38
39static void slow_discard_cb(void *priv, void* priv2) {
40 BlueFS *bluefs = static_cast<BlueFS*>(priv);
41 interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2);
42 bluefs->handle_discard(BlueFS::BDEV_SLOW, *tmp);
43}
7c673cae
FG
44
45BlueFS::BlueFS(CephContext* cct)
46 : cct(cct),
47 bdev(MAX_BDEV),
48 ioc(MAX_BDEV),
11fdf7f2 49 block_all(MAX_BDEV)
7c673cae 50{
11fdf7f2
TL
51 discard_cb[BDEV_WAL] = wal_discard_cb;
52 discard_cb[BDEV_DB] = db_discard_cb;
53 discard_cb[BDEV_SLOW] = slow_discard_cb;
7c673cae
FG
54}
55
56BlueFS::~BlueFS()
57{
58 for (auto p : ioc) {
59 if (p)
60 p->aio_wait();
61 }
62 for (auto p : bdev) {
63 if (p) {
64 p->close();
65 delete p;
66 }
67 }
68 for (auto p : ioc) {
69 delete p;
70 }
71}
72
73void BlueFS::_init_logger()
74{
75 PerfCountersBuilder b(cct, "bluefs",
76 l_bluefs_first, l_bluefs_last);
77 b.add_u64_counter(l_bluefs_gift_bytes, "gift_bytes",
11fdf7f2 78 "Bytes gifted from BlueStore", NULL, 0, unit_t(UNIT_BYTES));
7c673cae 79 b.add_u64_counter(l_bluefs_reclaim_bytes, "reclaim_bytes",
11fdf7f2 80 "Bytes reclaimed by BlueStore", NULL, 0, unit_t(UNIT_BYTES));
7c673cae
FG
81 b.add_u64(l_bluefs_db_total_bytes, "db_total_bytes",
82 "Total bytes (main db device)",
11fdf7f2 83 "b", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
7c673cae
FG
84 b.add_u64(l_bluefs_db_used_bytes, "db_used_bytes",
85 "Used bytes (main db device)",
11fdf7f2 86 "u", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
7c673cae
FG
87 b.add_u64(l_bluefs_wal_total_bytes, "wal_total_bytes",
88 "Total bytes (wal device)",
11fdf7f2 89 "walb", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
7c673cae
FG
90 b.add_u64(l_bluefs_wal_used_bytes, "wal_used_bytes",
91 "Used bytes (wal device)",
11fdf7f2 92 "walu", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
7c673cae
FG
93 b.add_u64(l_bluefs_slow_total_bytes, "slow_total_bytes",
94 "Total bytes (slow device)",
11fdf7f2 95 "slob", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
7c673cae
FG
96 b.add_u64(l_bluefs_slow_used_bytes, "slow_used_bytes",
97 "Used bytes (slow device)",
11fdf7f2 98 "slou", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
7c673cae
FG
99 b.add_u64(l_bluefs_num_files, "num_files", "File count",
100 "f", PerfCountersBuilder::PRIO_USEFUL);
101 b.add_u64(l_bluefs_log_bytes, "log_bytes", "Size of the metadata log",
11fdf7f2 102 "jlen", PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));
7c673cae
FG
103 b.add_u64_counter(l_bluefs_log_compactions, "log_compactions",
104 "Compactions of the metadata log");
105 b.add_u64_counter(l_bluefs_logged_bytes, "logged_bytes",
106 "Bytes written to the metadata log", "j",
11fdf7f2 107 PerfCountersBuilder::PRIO_CRITICAL, unit_t(UNIT_BYTES));
7c673cae
FG
108 b.add_u64_counter(l_bluefs_files_written_wal, "files_written_wal",
109 "Files written to WAL");
110 b.add_u64_counter(l_bluefs_files_written_sst, "files_written_sst",
111 "Files written to SSTs");
112 b.add_u64_counter(l_bluefs_bytes_written_wal, "bytes_written_wal",
113 "Bytes written to WAL", "wal",
114 PerfCountersBuilder::PRIO_CRITICAL);
115 b.add_u64_counter(l_bluefs_bytes_written_sst, "bytes_written_sst",
116 "Bytes written to SSTs", "sst",
11fdf7f2
TL
117 PerfCountersBuilder::PRIO_CRITICAL, unit_t(UNIT_BYTES));
118 b.add_u64_counter(l_bluefs_bytes_written_slow, "bytes_written_slow",
119 "Bytes written to WAL/SSTs at slow device", NULL,
120 PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
121 b.add_u64_counter(l_bluefs_max_bytes_wal, "max_bytes_wal",
122 "Maximum bytes allocated from WAL");
123 b.add_u64_counter(l_bluefs_max_bytes_db, "max_bytes_db",
124 "Maximum bytes allocated from DB");
125 b.add_u64_counter(l_bluefs_max_bytes_slow, "max_bytes_slow",
126 "Maximum bytes allocated from SLOW");
494da23a
TL
127
128 b.add_u64_counter(l_bluefs_read_random_count, "read_random_count",
129 "random read requests processed");
130 b.add_u64_counter(l_bluefs_read_random_bytes, "read_random_bytes",
131 "Bytes requested in random read mode", NULL,
132 PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
133 b.add_u64_counter(l_bluefs_read_random_disk_count, "read_random_disk_count",
134 "random reads requests going to disk");
135 b.add_u64_counter(l_bluefs_read_random_disk_bytes, "read_random_disk_bytes",
136 "Bytes read from disk in random read mode", NULL,
137 PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
138 b.add_u64_counter(l_bluefs_read_random_buffer_count, "read_random_buffer_count",
139 "random read requests processed using prefetch buffer");
140 b.add_u64_counter(l_bluefs_read_random_buffer_bytes, "read_random_buffer_bytes",
141 "Bytes read from prefetch buffer in random read mode", NULL,
142 PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
143
144 b.add_u64_counter(l_bluefs_read_count, "read_count",
145 "buffered read requests processed");
146 b.add_u64_counter(l_bluefs_read_bytes, "read_bytes",
147 "Bytes requested in buffered read mode", NULL,
148 PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
149
150 b.add_u64_counter(l_bluefs_read_prefetch_count, "read_prefetch_count",
151 "prefetch read requests processed");
152 b.add_u64_counter(l_bluefs_read_prefetch_bytes, "read_prefetch_bytes",
153 "Bytes requested in prefetch read mode", NULL,
154 PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
155
7c673cae
FG
156 logger = b.create_perf_counters();
157 cct->get_perfcounters_collection()->add(logger);
158}
159
160void BlueFS::_shutdown_logger()
161{
162 cct->get_perfcounters_collection()->remove(logger);
163 delete logger;
164}
165
166void BlueFS::_update_logger_stats()
167{
168 // we must be holding the lock
169 logger->set(l_bluefs_num_files, file_map.size());
170 logger->set(l_bluefs_log_bytes, log_writer->file->fnode.size);
171
172 if (alloc[BDEV_WAL]) {
11fdf7f2 173 logger->set(l_bluefs_wal_total_bytes, block_all[BDEV_WAL].size());
7c673cae 174 logger->set(l_bluefs_wal_used_bytes,
11fdf7f2 175 block_all[BDEV_WAL].size() - alloc[BDEV_WAL]->get_free());
7c673cae
FG
176 }
177 if (alloc[BDEV_DB]) {
11fdf7f2 178 logger->set(l_bluefs_db_total_bytes, block_all[BDEV_DB].size());
7c673cae 179 logger->set(l_bluefs_db_used_bytes,
11fdf7f2 180 block_all[BDEV_DB].size() - alloc[BDEV_DB]->get_free());
7c673cae
FG
181 }
182 if (alloc[BDEV_SLOW]) {
11fdf7f2 183 logger->set(l_bluefs_slow_total_bytes, block_all[BDEV_SLOW].size());
7c673cae 184 logger->set(l_bluefs_slow_used_bytes,
11fdf7f2 185 block_all[BDEV_SLOW].size() - alloc[BDEV_SLOW]->get_free());
7c673cae
FG
186 }
187}
188
11fdf7f2
TL
189int BlueFS::add_block_device(unsigned id, const string& path, bool trim,
190 bool shared_with_bluestore)
7c673cae
FG
191{
192 dout(10) << __func__ << " bdev " << id << " path " << path << dendl;
11fdf7f2
TL
193 ceph_assert(id < bdev.size());
194 ceph_assert(bdev[id] == NULL);
195 BlockDevice *b = BlockDevice::create(cct, path, NULL, NULL,
196 discard_cb[id], static_cast<void*>(this));
197 if (shared_with_bluestore) {
198 b->set_no_exclusive_lock();
199 }
7c673cae
FG
200 int r = b->open(path);
201 if (r < 0) {
202 delete b;
203 return r;
204 }
11fdf7f2
TL
205 if (trim) {
206 b->discard(0, b->get_size());
207 }
208
7c673cae 209 dout(1) << __func__ << " bdev " << id << " path " << path
1adf2230 210 << " size " << byte_u_t(b->get_size()) << dendl;
7c673cae
FG
211 bdev[id] = b;
212 ioc[id] = new IOContext(cct, NULL);
213 return 0;
214}
215
216bool BlueFS::bdev_support_label(unsigned id)
217{
11fdf7f2
TL
218 ceph_assert(id < bdev.size());
219 ceph_assert(bdev[id]);
7c673cae
FG
220 return bdev[id]->supported_bdev_label();
221}
222
223uint64_t BlueFS::get_block_device_size(unsigned id)
224{
225 if (id < bdev.size() && bdev[id])
226 return bdev[id]->get_size();
227 return 0;
228}
229
11fdf7f2 230void BlueFS::_add_block_extent(unsigned id, uint64_t offset, uint64_t length)
7c673cae 231{
7c673cae 232 dout(1) << __func__ << " bdev " << id
11fdf7f2 233 << " 0x" << std::hex << offset << "~" << length << std::dec
7c673cae 234 << dendl;
11fdf7f2
TL
235
236 ceph_assert(id < bdev.size());
237 ceph_assert(bdev[id]);
238 ceph_assert(bdev[id]->get_size() >= offset + length);
7c673cae 239 block_all[id].insert(offset, length);
7c673cae
FG
240
241 if (id < alloc.size() && alloc[id]) {
242 log_t.op_alloc_add(id, offset, length);
7c673cae
FG
243 alloc[id]->init_add_free(offset, length);
244 }
245
246 if (logger)
247 logger->inc(l_bluefs_gift_bytes, length);
248 dout(10) << __func__ << " done" << dendl;
249}
250
251int BlueFS::reclaim_blocks(unsigned id, uint64_t want,
a8e16298 252 PExtentVector *extents)
7c673cae 253{
11fdf7f2 254 std::unique_lock l(lock);
7c673cae
FG
255 dout(1) << __func__ << " bdev " << id
256 << " want 0x" << std::hex << want << std::dec << dendl;
11fdf7f2
TL
257 ceph_assert(id < alloc.size());
258 ceph_assert(alloc[id]);
a8e16298 259
7c673cae
FG
260 int64_t got = alloc[id]->allocate(want, cct->_conf->bluefs_alloc_size, 0,
261 extents);
11fdf7f2 262 ceph_assert(got != 0);
a8e16298 263 if (got < 0) {
7c673cae 264 derr << __func__ << " failed to allocate space to return to bluestore"
a8e16298 265 << dendl;
7c673cae
FG
266 alloc[id]->dump();
267 return got;
268 }
269
270 for (auto& p : *extents) {
271 block_all[id].erase(p.offset, p.length);
7c673cae
FG
272 log_t.op_alloc_rm(id, p.offset, p.length);
273 }
274
275 flush_bdev();
a8e16298 276 int r = _flush_and_sync_log(l);
11fdf7f2 277 ceph_assert(r == 0);
7c673cae 278
11fdf7f2 279 logger->inc(l_bluefs_reclaim_bytes, got);
7c673cae
FG
280 dout(1) << __func__ << " bdev " << id << " want 0x" << std::hex << want
281 << " got " << *extents << dendl;
282 return 0;
283}
284
11fdf7f2 285void BlueFS::handle_discard(unsigned id, interval_set<uint64_t>& to_release)
7c673cae 286{
11fdf7f2
TL
287 dout(10) << __func__ << " bdev " << id << dendl;
288 ceph_assert(alloc[id]);
289 alloc[id]->release(to_release);
290}
291
292uint64_t BlueFS::get_used()
293{
294 std::lock_guard l(lock);
295 uint64_t used = 0;
296 for (unsigned id = 0; id < MAX_BDEV; ++id) {
297 if (alloc[id]) {
298 used += block_all[id].size() - alloc[id]->get_free();
299 }
7c673cae 300 }
11fdf7f2 301 return used;
7c673cae
FG
302}
303
304uint64_t BlueFS::get_total(unsigned id)
305{
11fdf7f2
TL
306 std::lock_guard l(lock);
307 ceph_assert(id < block_all.size());
308 return block_all[id].size();
7c673cae
FG
309}
310
311uint64_t BlueFS::get_free(unsigned id)
312{
11fdf7f2
TL
313 std::lock_guard l(lock);
314 ceph_assert(id < alloc.size());
7c673cae
FG
315 return alloc[id]->get_free();
316}
317
318void BlueFS::dump_perf_counters(Formatter *f)
319{
320 f->open_object_section("bluefs_perf_counters");
321 logger->dump_formatted(f,0);
322 f->close_section();
323}
324
3efd9988
FG
325void BlueFS::dump_block_extents(ostream& out)
326{
327 for (unsigned i = 0; i < MAX_BDEV; ++i) {
328 if (!bdev[i]) {
329 continue;
330 }
11fdf7f2
TL
331 auto owned = get_total(i);
332 auto free = get_free(i);
333 out << i << " : device size 0x" << std::hex << bdev[i]->get_size()
334 << " : own 0x" << block_all[i]
335 << " = 0x" << owned
336 << " : using 0x" << owned - free
494da23a
TL
337 << std::dec << "(" << byte_u_t(owned - free) << ")"
338 << "\n";
3efd9988
FG
339 }
340}
7c673cae
FG
341
342void BlueFS::get_usage(vector<pair<uint64_t,uint64_t>> *usage)
343{
11fdf7f2 344 std::lock_guard l(lock);
7c673cae
FG
345 usage->resize(bdev.size());
346 for (unsigned id = 0; id < bdev.size(); ++id) {
347 if (!bdev[id]) {
348 (*usage)[id] = make_pair(0, 0);
349 continue;
350 }
351 (*usage)[id].first = alloc[id]->get_free();
11fdf7f2 352 (*usage)[id].second = block_all[id].size();
7c673cae 353 uint64_t used =
11fdf7f2 354 (block_all[id].size() - (*usage)[id].first) * 100 / block_all[id].size();
7c673cae
FG
355 dout(10) << __func__ << " bdev " << id
356 << " free " << (*usage)[id].first
1adf2230 357 << " (" << byte_u_t((*usage)[id].first) << ")"
7c673cae 358 << " / " << (*usage)[id].second
1adf2230 359 << " (" << byte_u_t((*usage)[id].second) << ")"
7c673cae
FG
360 << ", used " << used << "%"
361 << dendl;
362 }
363}
364
365int BlueFS::get_block_extents(unsigned id, interval_set<uint64_t> *extents)
366{
11fdf7f2 367 std::lock_guard l(lock);
7c673cae
FG
368 dout(10) << __func__ << " bdev " << id << dendl;
369 if (id >= block_all.size())
370 return -EINVAL;
371 *extents = block_all[id];
372 return 0;
373}
374
375int BlueFS::mkfs(uuid_d osd_uuid)
376{
11fdf7f2 377 std::unique_lock l(lock);
7c673cae
FG
378 dout(1) << __func__
379 << " osd_uuid " << osd_uuid
380 << dendl;
381
382 _init_alloc();
383 _init_logger();
384
385 super.version = 1;
386 super.block_size = bdev[BDEV_DB]->get_block_size();
387 super.osd_uuid = osd_uuid;
388 super.uuid.generate_random();
389 dout(1) << __func__ << " uuid " << super.uuid << dendl;
390
391 // init log
392 FileRef log_file = new File;
393 log_file->fnode.ino = 1;
394 log_file->fnode.prefer_bdev = BDEV_WAL;
395 int r = _allocate(
396 log_file->fnode.prefer_bdev,
397 cct->_conf->bluefs_max_log_runway,
94b18763 398 &log_file->fnode);
11fdf7f2 399 ceph_assert(r == 0);
7c673cae
FG
400 log_writer = _create_writer(log_file);
401
402 // initial txn
403 log_t.op_init();
404 for (unsigned bdev = 0; bdev < MAX_BDEV; ++bdev) {
405 interval_set<uint64_t>& p = block_all[bdev];
406 if (p.empty())
407 continue;
408 for (interval_set<uint64_t>::iterator q = p.begin(); q != p.end(); ++q) {
409 dout(20) << __func__ << " op_alloc_add " << bdev << " 0x"
410 << std::hex << q.get_start() << "~" << q.get_len() << std::dec
411 << dendl;
412 log_t.op_alloc_add(bdev, q.get_start(), q.get_len());
413 }
414 }
415 _flush_and_sync_log(l);
416
417 // write supers
418 super.log_fnode = log_file->fnode;
11fdf7f2 419 _write_super(BDEV_DB);
7c673cae
FG
420 flush_bdev();
421
422 // clean up
423 super = bluefs_super_t();
424 _close_writer(log_writer);
425 log_writer = NULL;
426 block_all.clear();
7c673cae
FG
427 _stop_alloc();
428 _shutdown_logger();
429
430 dout(10) << __func__ << " success" << dendl;
431 return 0;
432}
433
434void BlueFS::_init_alloc()
435{
436 dout(20) << __func__ << dendl;
437 alloc.resize(MAX_BDEV);
438 pending_release.resize(MAX_BDEV);
439 for (unsigned id = 0; id < bdev.size(); ++id) {
440 if (!bdev[id]) {
441 continue;
442 }
11fdf7f2 443 ceph_assert(bdev[id]->get_size());
7c673cae
FG
444 alloc[id] = Allocator::create(cct, cct->_conf->bluefs_allocator,
445 bdev[id]->get_size(),
446 cct->_conf->bluefs_alloc_size);
447 interval_set<uint64_t>& p = block_all[id];
448 for (interval_set<uint64_t>::iterator q = p.begin(); q != p.end(); ++q) {
449 alloc[id]->init_add_free(q.get_start(), q.get_len());
450 }
451 }
452}
453
454void BlueFS::_stop_alloc()
455{
456 dout(20) << __func__ << dendl;
11fdf7f2
TL
457 for (auto p : bdev) {
458 if (p)
459 p->discard_drain();
460 }
461
7c673cae
FG
462 for (auto p : alloc) {
463 if (p != nullptr) {
464 p->shutdown();
465 delete p;
466 }
467 }
468 alloc.clear();
469}
470
471int BlueFS::mount()
472{
473 dout(1) << __func__ << dendl;
474
475 int r = _open_super();
476 if (r < 0) {
477 derr << __func__ << " failed to open super: " << cpp_strerror(r) << dendl;
478 goto out;
479 }
480
481 block_all.clear();
482 block_all.resize(MAX_BDEV);
7c673cae 483 _init_alloc();
494da23a 484 _init_logger();
7c673cae 485
11fdf7f2 486 r = _replay(false, false);
7c673cae
FG
487 if (r < 0) {
488 derr << __func__ << " failed to replay log: " << cpp_strerror(r) << dendl;
489 _stop_alloc();
490 goto out;
491 }
492
493 // init freelist
494 for (auto& p : file_map) {
495 dout(30) << __func__ << " noting alloc for " << p.second->fnode << dendl;
496 for (auto& q : p.second->fnode.extents) {
497 alloc[q.bdev]->init_rm_free(q.offset, q.length);
498 }
499 }
500
501 // set up the log for future writes
502 log_writer = _create_writer(_get_file(1));
11fdf7f2 503 ceph_assert(log_writer->file->fnode.ino == 1);
7c673cae
FG
504 log_writer->pos = log_writer->file->fnode.size;
505 dout(10) << __func__ << " log write pos set to 0x"
506 << std::hex << log_writer->pos << std::dec
507 << dendl;
508
7c673cae
FG
509 return 0;
510
511 out:
512 super = bluefs_super_t();
513 return r;
514}
515
516void BlueFS::umount()
517{
518 dout(1) << __func__ << dendl;
519
520 sync_metadata();
521
522 _close_writer(log_writer);
523 log_writer = NULL;
524
525 _stop_alloc();
526 file_map.clear();
527 dir_map.clear();
528 super = bluefs_super_t();
529 log_t.clear();
530 _shutdown_logger();
531}
532
11fdf7f2 533int BlueFS::prepare_new_device(int id)
7c673cae 534{
11fdf7f2
TL
535 dout(1) << __func__ << dendl;
536
537 if(id == BDEV_NEWDB) {
538 int new_log_dev_cur = BDEV_WAL;
539 int new_log_dev_next = BDEV_WAL;
540 if (!bdev[BDEV_WAL]) {
541 new_log_dev_cur = BDEV_NEWDB;
542 new_log_dev_next = BDEV_DB;
543 }
544 _rewrite_log_sync(false,
545 BDEV_NEWDB,
546 new_log_dev_cur,
547 new_log_dev_next,
548 RENAME_DB2SLOW);
549 //}
550 } else if(id == BDEV_NEWWAL) {
551 _rewrite_log_sync(false, BDEV_DB, BDEV_NEWWAL, BDEV_WAL, REMOVE_WAL);
552 } else {
553 assert(false);
554 }
555 return 0;
556}
557
558void BlueFS::collect_metadata(map<string,string> *pm, unsigned skip_bdev_id)
559{
560 if (skip_bdev_id != BDEV_DB && bdev[BDEV_DB])
7c673cae
FG
561 bdev[BDEV_DB]->collect_metadata("bluefs_db_", pm);
562 if (bdev[BDEV_WAL])
563 bdev[BDEV_WAL]->collect_metadata("bluefs_wal_", pm);
11fdf7f2
TL
564}
565
566void BlueFS::get_devices(set<string> *ls)
567{
568 for (unsigned i = 0; i < MAX_BDEV; ++i) {
569 if (bdev[i]) {
570 bdev[i]->get_devices(ls);
571 }
572 }
7c673cae
FG
573}
574
575int BlueFS::fsck()
576{
11fdf7f2 577 std::lock_guard l(lock);
7c673cae
FG
578 dout(1) << __func__ << dendl;
579 // hrm, i think we check everything on mount...
580 return 0;
581}
582
11fdf7f2 583int BlueFS::_write_super(int dev)
7c673cae
FG
584{
585 // build superblock
586 bufferlist bl;
11fdf7f2 587 encode(super, bl);
7c673cae 588 uint32_t crc = bl.crc32c(-1);
11fdf7f2 589 encode(crc, bl);
7c673cae
FG
590 dout(10) << __func__ << " super block length(encoded): " << bl.length() << dendl;
591 dout(10) << __func__ << " superblock " << super.version << dendl;
592 dout(10) << __func__ << " log_fnode " << super.log_fnode << dendl;
11fdf7f2 593 ceph_assert(bl.length() <= get_super_length());
7c673cae
FG
594 bl.append_zero(get_super_length() - bl.length());
595
11fdf7f2 596 bdev[dev]->write(get_super_offset(), bl, false, WRITE_LIFE_SHORT);
7c673cae
FG
597 dout(20) << __func__ << " v " << super.version
598 << " crc 0x" << std::hex << crc
599 << " offset 0x" << get_super_offset() << std::dec
600 << dendl;
601 return 0;
602}
603
604int BlueFS::_open_super()
605{
606 dout(10) << __func__ << dendl;
607
608 bufferlist bl;
609 uint32_t expected_crc, crc;
610 int r;
611
612 // always the second block
613 r = bdev[BDEV_DB]->read(get_super_offset(), get_super_length(),
614 &bl, ioc[BDEV_DB], false);
615 if (r < 0)
616 return r;
617
11fdf7f2
TL
618 auto p = bl.cbegin();
619 decode(super, p);
7c673cae
FG
620 {
621 bufferlist t;
622 t.substr_of(bl, 0, p.get_off());
623 crc = t.crc32c(-1);
624 }
11fdf7f2 625 decode(expected_crc, p);
7c673cae
FG
626 if (crc != expected_crc) {
627 derr << __func__ << " bad crc on superblock, expected 0x"
628 << std::hex << expected_crc << " != actual 0x" << crc << std::dec
629 << dendl;
630 return -EIO;
631 }
632 dout(10) << __func__ << " superblock " << super.version << dendl;
633 dout(10) << __func__ << " log_fnode " << super.log_fnode << dendl;
634 return 0;
635}
636
11fdf7f2 637int BlueFS::_replay(bool noop, bool to_stdout)
7c673cae
FG
638{
639 dout(10) << __func__ << (noop ? " NO-OP" : "") << dendl;
640 ino_last = 1; // by the log
641 log_seq = 0;
642
643 FileRef log_file;
11fdf7f2
TL
644 log_file = _get_file(1);
645 if (!noop) {
646 log_file->fnode = super.log_fnode;
7c673cae 647 } else {
11fdf7f2
TL
648 // do not use fnode from superblock in 'noop' mode - log_file's one should
649 // be fine and up-to-date
650 ceph_assert(log_file->fnode.ino == 1);
651 ceph_assert(log_file->fnode.extents.size() != 0);
7c673cae 652 }
7c673cae 653 dout(10) << __func__ << " log_fnode " << super.log_fnode << dendl;
11fdf7f2
TL
654 if (unlikely(to_stdout)) {
655 std::cout << " log_fnode " << super.log_fnode << std::endl;
656 }
7c673cae
FG
657
658 FileReader *log_reader = new FileReader(
659 log_file, cct->_conf->bluefs_max_prefetch,
660 false, // !random
661 true); // ignore eof
662 while (true) {
11fdf7f2 663 ceph_assert((log_reader->buf.pos & ~super.block_mask()) == 0);
7c673cae
FG
664 uint64_t pos = log_reader->buf.pos;
665 uint64_t read_pos = pos;
666 bufferlist bl;
667 {
668 int r = _read(log_reader, &log_reader->buf, read_pos, super.block_size,
669 &bl, NULL);
11fdf7f2 670 ceph_assert(r == (int)super.block_size);
7c673cae
FG
671 read_pos += r;
672 }
673 uint64_t more = 0;
674 uint64_t seq;
675 uuid_d uuid;
676 {
11fdf7f2 677 auto p = bl.cbegin();
7c673cae
FG
678 __u8 a, b;
679 uint32_t len;
11fdf7f2
TL
680 decode(a, p);
681 decode(b, p);
682 decode(len, p);
683 decode(uuid, p);
684 decode(seq, p);
7c673cae 685 if (len + 6 > bl.length()) {
11fdf7f2 686 more = round_up_to(len + 6 - bl.length(), super.block_size);
7c673cae
FG
687 }
688 }
689 if (uuid != super.uuid) {
690 dout(10) << __func__ << " 0x" << std::hex << pos << std::dec
691 << ": stop: uuid " << uuid << " != super.uuid " << super.uuid
692 << dendl;
693 break;
694 }
695 if (seq != log_seq + 1) {
696 dout(10) << __func__ << " 0x" << std::hex << pos << std::dec
697 << ": stop: seq " << seq << " != expected " << log_seq + 1
698 << dendl;
699 break;
700 }
701 if (more) {
702 dout(20) << __func__ << " need 0x" << std::hex << more << std::dec
703 << " more bytes" << dendl;
704 bufferlist t;
705 int r = _read(log_reader, &log_reader->buf, read_pos, more, &t, NULL);
706 if (r < (int)more) {
707 dout(10) << __func__ << " 0x" << std::hex << pos
708 << ": stop: len is 0x" << bl.length() + more << std::dec
709 << ", which is past eof" << dendl;
710 break;
711 }
11fdf7f2 712 ceph_assert(r == (int)more);
7c673cae
FG
713 bl.claim_append(t);
714 read_pos += r;
715 }
716 bluefs_transaction_t t;
717 try {
11fdf7f2
TL
718 auto p = bl.cbegin();
719 decode(t, p);
7c673cae
FG
720 }
721 catch (buffer::error& e) {
722 dout(10) << __func__ << " 0x" << std::hex << pos << std::dec
723 << ": stop: failed to decode: " << e.what()
724 << dendl;
725 delete log_reader;
726 return -EIO;
727 }
11fdf7f2 728 ceph_assert(seq == t.seq);
7c673cae
FG
729 dout(10) << __func__ << " 0x" << std::hex << pos << std::dec
730 << ": " << t << dendl;
11fdf7f2
TL
731 if (unlikely(to_stdout)) {
732 std::cout << " 0x" << std::hex << pos << std::dec
733 << ": " << t << std::endl;
734 }
7c673cae 735
11fdf7f2 736 auto p = t.op_bl.cbegin();
7c673cae
FG
737 while (!p.end()) {
738 __u8 op;
11fdf7f2 739 decode(op, p);
7c673cae
FG
740 switch (op) {
741
742 case bluefs_transaction_t::OP_INIT:
743 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
744 << ": op_init" << dendl;
11fdf7f2
TL
745 if (unlikely(to_stdout)) {
746 std::cout << " 0x" << std::hex << pos << std::dec
747 << ": op_init" << std::endl;
748 }
749
750 ceph_assert(t.seq == 1);
7c673cae
FG
751 break;
752
753 case bluefs_transaction_t::OP_JUMP:
754 {
755 uint64_t next_seq;
756 uint64_t offset;
11fdf7f2
TL
757 decode(next_seq, p);
758 decode(offset, p);
7c673cae
FG
759 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
760 << ": op_jump seq " << next_seq
761 << " offset 0x" << std::hex << offset << std::dec << dendl;
11fdf7f2
TL
762 if (unlikely(to_stdout)) {
763 std::cout << " 0x" << std::hex << pos << std::dec
764 << ": op_jump seq " << next_seq
765 << " offset 0x" << std::hex << offset << std::dec
766 << std::endl;
767 }
768
769 ceph_assert(next_seq >= log_seq);
7c673cae
FG
770 log_seq = next_seq - 1; // we will increment it below
771 uint64_t skip = offset - read_pos;
772 if (skip) {
773 bufferlist junk;
774 int r = _read(log_reader, &log_reader->buf, read_pos, skip, &junk,
775 NULL);
776 if (r != (int)skip) {
777 dout(10) << __func__ << " 0x" << std::hex << read_pos
778 << ": stop: failed to skip to " << offset
779 << std::dec << dendl;
11fdf7f2 780 ceph_abort_msg("problem with op_jump");
7c673cae
FG
781 }
782 }
783 }
784 break;
785
786 case bluefs_transaction_t::OP_JUMP_SEQ:
787 {
788 uint64_t next_seq;
11fdf7f2 789 decode(next_seq, p);
7c673cae
FG
790 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
791 << ": op_jump_seq " << next_seq << dendl;
11fdf7f2
TL
792 if (unlikely(to_stdout)) {
793 std::cout << " 0x" << std::hex << pos << std::dec
794 << ": op_jump_seq " << next_seq << std::endl;
795 }
796
797 ceph_assert(next_seq >= log_seq);
7c673cae
FG
798 log_seq = next_seq - 1; // we will increment it below
799 }
800 break;
801
802 case bluefs_transaction_t::OP_ALLOC_ADD:
803 {
804 __u8 id;
805 uint64_t offset, length;
11fdf7f2
TL
806 decode(id, p);
807 decode(offset, p);
808 decode(length, p);
7c673cae
FG
809 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
810 << ": op_alloc_add " << " " << (int)id
811 << ":0x" << std::hex << offset << "~" << length << std::dec
812 << dendl;
11fdf7f2
TL
813 if (unlikely(to_stdout)) {
814 std::cout << " 0x" << std::hex << pos << std::dec
815 << ": op_alloc_add " << " " << (int)id
816 << ":0x" << std::hex << offset << "~" << length << std::dec
817 << std::endl;
818 }
819
7c673cae
FG
820 if (!noop) {
821 block_all[id].insert(offset, length);
7c673cae
FG
822 alloc[id]->init_add_free(offset, length);
823 }
824 }
825 break;
826
827 case bluefs_transaction_t::OP_ALLOC_RM:
828 {
829 __u8 id;
830 uint64_t offset, length;
11fdf7f2
TL
831 decode(id, p);
832 decode(offset, p);
833 decode(length, p);
7c673cae
FG
834 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
835 << ": op_alloc_rm " << " " << (int)id
836 << ":0x" << std::hex << offset << "~" << length << std::dec
837 << dendl;
11fdf7f2
TL
838 if (unlikely(to_stdout)) {
839 std::cout << " 0x" << std::hex << pos << std::dec
840 << ": op_alloc_rm " << " " << (int)id
841 << ":0x" << std::hex << offset << "~" << length << std::dec
842 << std::endl;
843 }
844
7c673cae
FG
845 if (!noop) {
846 block_all[id].erase(offset, length);
7c673cae
FG
847 alloc[id]->init_rm_free(offset, length);
848 }
849 }
850 break;
851
852 case bluefs_transaction_t::OP_DIR_LINK:
853 {
854 string dirname, filename;
855 uint64_t ino;
11fdf7f2
TL
856 decode(dirname, p);
857 decode(filename, p);
858 decode(ino, p);
7c673cae
FG
859 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
860 << ": op_dir_link " << " " << dirname << "/" << filename
861 << " to " << ino
862 << dendl;
11fdf7f2
TL
863 if (unlikely(to_stdout)) {
864 std::cout << " 0x" << std::hex << pos << std::dec
865 << ": op_dir_link " << " " << dirname << "/" << filename
866 << " to " << ino
867 << std::endl;
868 }
869
7c673cae
FG
870 if (!noop) {
871 FileRef file = _get_file(ino);
11fdf7f2 872 ceph_assert(file->fnode.ino);
7c673cae 873 map<string,DirRef>::iterator q = dir_map.find(dirname);
11fdf7f2 874 ceph_assert(q != dir_map.end());
7c673cae 875 map<string,FileRef>::iterator r = q->second->file_map.find(filename);
11fdf7f2 876 ceph_assert(r == q->second->file_map.end());
7c673cae
FG
877 q->second->file_map[filename] = file;
878 ++file->refs;
879 }
880 }
881 break;
882
883 case bluefs_transaction_t::OP_DIR_UNLINK:
884 {
885 string dirname, filename;
11fdf7f2
TL
886 decode(dirname, p);
887 decode(filename, p);
7c673cae
FG
888 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
889 << ": op_dir_unlink " << " " << dirname << "/" << filename
890 << dendl;
11fdf7f2
TL
891 if (unlikely(to_stdout)) {
892 std::cout << " 0x" << std::hex << pos << std::dec
893 << ": op_dir_unlink " << " " << dirname << "/" << filename
894 << std::endl;
895 }
896
7c673cae
FG
897 if (!noop) {
898 map<string,DirRef>::iterator q = dir_map.find(dirname);
11fdf7f2 899 ceph_assert(q != dir_map.end());
7c673cae 900 map<string,FileRef>::iterator r = q->second->file_map.find(filename);
11fdf7f2
TL
901 ceph_assert(r != q->second->file_map.end());
902 ceph_assert(r->second->refs > 0);
7c673cae
FG
903 --r->second->refs;
904 q->second->file_map.erase(r);
905 }
906 }
907 break;
908
909 case bluefs_transaction_t::OP_DIR_CREATE:
910 {
911 string dirname;
11fdf7f2 912 decode(dirname, p);
7c673cae
FG
913 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
914 << ": op_dir_create " << dirname << dendl;
11fdf7f2
TL
915 if (unlikely(to_stdout)) {
916 std::cout << " 0x" << std::hex << pos << std::dec
917 << ": op_dir_create " << dirname << std::endl;
918 }
919
7c673cae
FG
920 if (!noop) {
921 map<string,DirRef>::iterator q = dir_map.find(dirname);
11fdf7f2 922 ceph_assert(q == dir_map.end());
7c673cae
FG
923 dir_map[dirname] = new Dir;
924 }
925 }
926 break;
927
928 case bluefs_transaction_t::OP_DIR_REMOVE:
929 {
930 string dirname;
11fdf7f2 931 decode(dirname, p);
7c673cae
FG
932 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
933 << ": op_dir_remove " << dirname << dendl;
11fdf7f2
TL
934 if (unlikely(to_stdout)) {
935 std::cout << " 0x" << std::hex << pos << std::dec
936 << ": op_dir_remove " << dirname << std::endl;
937 }
938
7c673cae
FG
939 if (!noop) {
940 map<string,DirRef>::iterator q = dir_map.find(dirname);
11fdf7f2
TL
941 ceph_assert(q != dir_map.end());
942 ceph_assert(q->second->file_map.empty());
7c673cae
FG
943 dir_map.erase(q);
944 }
945 }
946 break;
947
948 case bluefs_transaction_t::OP_FILE_UPDATE:
949 {
950 bluefs_fnode_t fnode;
11fdf7f2 951 decode(fnode, p);
7c673cae
FG
952 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
953 << ": op_file_update " << " " << fnode << dendl;
11fdf7f2
TL
954 if (unlikely(to_stdout)) {
955 std::cout << " 0x" << std::hex << pos << std::dec
956 << ": op_file_update " << " " << fnode << std::endl;
957 }
958
7c673cae
FG
959 if (!noop) {
960 FileRef f = _get_file(fnode.ino);
961 f->fnode = fnode;
962 if (fnode.ino > ino_last) {
963 ino_last = fnode.ino;
964 }
965 }
966 }
967 break;
968
969 case bluefs_transaction_t::OP_FILE_REMOVE:
970 {
971 uint64_t ino;
11fdf7f2 972 decode(ino, p);
7c673cae
FG
973 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
974 << ": op_file_remove " << ino << dendl;
11fdf7f2
TL
975 if (unlikely(to_stdout)) {
976 std::cout << " 0x" << std::hex << pos << std::dec
977 << ": op_file_remove " << ino << std::endl;
978 }
979
7c673cae
FG
980 if (!noop) {
981 auto p = file_map.find(ino);
11fdf7f2 982 ceph_assert(p != file_map.end());
7c673cae
FG
983 file_map.erase(p);
984 }
985 }
986 break;
987
988 default:
989 derr << __func__ << " 0x" << std::hex << pos << std::dec
990 << ": stop: unrecognized op " << (int)op << dendl;
991 delete log_reader;
992 return -EIO;
993 }
994 }
11fdf7f2 995 ceph_assert(p.end());
7c673cae
FG
996
997 // we successfully replayed the transaction; bump the seq and log size
998 ++log_seq;
999 log_file->fnode.size = log_reader->buf.pos;
1000 }
1001
1002 dout(10) << __func__ << " log file size was 0x"
1003 << std::hex << log_file->fnode.size << std::dec << dendl;
11fdf7f2
TL
1004 if (unlikely(to_stdout)) {
1005 std::cout << " log file size was 0x"
1006 << std::hex << log_file->fnode.size << std::dec << std::endl;
1007 }
1008
7c673cae
FG
1009 delete log_reader;
1010
1011 if (!noop) {
1012 // verify file link counts are all >0
1013 for (auto& p : file_map) {
1014 if (p.second->refs == 0 &&
1015 p.second->fnode.ino > 1) {
1016 derr << __func__ << " file with link count 0: " << p.second->fnode
1017 << dendl;
1018 return -EIO;
1019 }
1020 }
1021 }
1022
1023 dout(10) << __func__ << " done" << dendl;
1024 return 0;
1025}
1026
11fdf7f2
TL
1027int BlueFS::log_dump()
1028{
1029 // only dump log file's content
1030 int r = _replay(true, true);
1031 if (r < 0) {
1032 derr << __func__ << " failed to replay log: " << cpp_strerror(r) << dendl;
1033 return r;
1034 }
1035
1036 return 0;
1037}
1038
1039int BlueFS::device_migrate_to_existing(
1040 CephContext *cct,
1041 const set<int>& devs_source,
1042 int dev_target)
1043{
1044 vector<byte> buf;
1045 bool buffered = cct->_conf->bluefs_buffered_io;
1046
1047 assert(dev_target < (int)MAX_BDEV);
1048
1049 int flags = 0;
1050 flags |= devs_source.count(BDEV_DB) ?
1051 (REMOVE_DB | RENAME_SLOW2DB) : 0;
1052 flags |= devs_source.count(BDEV_WAL) ? REMOVE_WAL : 0;
1053 int dev_target_new = dev_target;
1054
1055 // Slow device without separate DB one is addressed via BDEV_DB
1056 // Hence need renaming.
1057 if ((flags & REMOVE_DB) && dev_target == BDEV_SLOW) {
1058 dev_target_new = BDEV_DB;
1059 dout(0) << __func__ << " super to be written to " << dev_target << dendl;
1060 }
1061
1062 for (auto& p : file_map) {
1063 //do not copy log
1064 if (p.second->fnode.ino == 1) {
1065 continue;
1066 }
1067 auto& fnode_extents = p.second->fnode.extents;
1068
1069 for (auto ext_it = fnode_extents.begin();
1070 ext_it != p.second->fnode.extents.end();
1071 ++ext_it) {
1072 if (ext_it->bdev != dev_target && devs_source.count(ext_it->bdev)) {
1073 bluefs_extent_t old_ext = *ext_it;
1074 PExtentVector extents;
1075 auto l =
1076 _allocate_without_fallback(dev_target, old_ext.length, &extents);
1077 if (l == 0) {
1078 buf.resize(old_ext.length);
1079 int r = bdev[old_ext.bdev]->read_random(
1080 old_ext.offset,
1081 old_ext.length,
1082 (char*)&buf.at(0),
1083 buffered);
1084 if (r != 0) {
1085 derr << __func__ << " failed to read 0x" << std::hex
1086 << old_ext.offset << "~" <<old_ext.length << std::dec
1087 << " from " << (int)dev_target << dendl;
1088 return -EIO;
1089 }
1090
1091 assert(extents.size() > 0);
1092 uint64_t src_buf_pos = 0;
1093 {
1094 // overwrite existing extent
1095 *ext_it=
1096 bluefs_extent_t(dev_target_new, extents[0].offset, extents[0].length);
1097 bufferlist bl;
1098 bl.append((char*)&buf.at(src_buf_pos), extents[0].length);
1099 int r = bdev[dev_target]->write(extents[0].offset, bl, buffered);
1100 ceph_assert(r == 0);
1101 src_buf_pos += extents[0].length;
1102 }
1103 // then insert more extents if needed
1104 for( size_t i = 1; i < extents.size(); ++i) {
1105 bufferlist bl;
1106 bl.append((char*)&buf.at(src_buf_pos), extents[i].length);
1107 ++ext_it;
1108 ext_it = fnode_extents.emplace(ext_it, dev_target_new,
1109 extents[i].offset, extents[i].length);
1110 int r = bdev[dev_target]->write(extents[i].offset, bl, buffered);
1111 ceph_assert(r == 0);
1112 src_buf_pos += extents[i].length;
1113 }
1114 {
1115 PExtentVector to_release;
1116 to_release.emplace_back(old_ext.offset, old_ext.length);
1117 alloc[old_ext.bdev]->release(to_release);
1118 }
1119
1120 } else {
1121 derr << __func__ << " unable to allocate len 0x" << std::hex
1122 << old_ext.length << std::dec << " from " << (int)dev_target
1123 << dendl;
1124 return -ENOSPC;
1125 }
1126 } else if (dev_target != dev_target_new && ext_it->bdev == dev_target) {
1127 ext_it->bdev = dev_target_new;
1128 }
1129 }
1130 auto& prefer_bdev = p.second->fnode.prefer_bdev;
1131 if (prefer_bdev != dev_target && devs_source.count(prefer_bdev)) {
1132 prefer_bdev = dev_target_new;
1133 }
1134 }
1135 // new logging device in the current naming scheme
1136 int new_log_dev_cur = bdev[BDEV_WAL] ?
1137 BDEV_WAL :
1138 bdev[BDEV_DB] ? BDEV_DB : BDEV_SLOW;
1139
1140 // new logging device in new naming scheme
1141 int new_log_dev_next = new_log_dev_cur;
1142
1143 if (devs_source.count(new_log_dev_cur)) {
1144 // SLOW device is addressed via BDEV_DB too hence either WAL or DB
1145 new_log_dev_next = (flags & REMOVE_WAL) || !bdev[BDEV_WAL] ?
1146 BDEV_DB :
1147 BDEV_WAL;
1148
1149 dout(0) << __func__ << " log moved from " << new_log_dev_cur
1150 << " to " << new_log_dev_next << dendl;
1151
1152 new_log_dev_cur =
1153 (flags & REMOVE_DB) && new_log_dev_next == BDEV_DB ?
1154 BDEV_SLOW :
1155 new_log_dev_next;
1156 }
1157
1158 _rewrite_log_sync(
1159 false,
1160 (flags & REMOVE_DB) ? BDEV_SLOW : BDEV_DB,
1161 new_log_dev_cur,
1162 new_log_dev_next,
1163 flags);
1164 return 0;
1165}
1166
1167int BlueFS::device_migrate_to_new(
1168 CephContext *cct,
1169 const set<int>& devs_source,
1170 int dev_target)
1171{
1172 vector<byte> buf;
1173 bool buffered = cct->_conf->bluefs_buffered_io;
1174
1175 assert(dev_target == (int)BDEV_NEWDB || (int)BDEV_NEWWAL);
1176
1177 int flags = 0;
1178
1179 flags |= devs_source.count(BDEV_DB) ?
1180 (!bdev[BDEV_SLOW] ? RENAME_DB2SLOW: REMOVE_DB) :
1181 0;
1182 flags |= devs_source.count(BDEV_WAL) ? REMOVE_WAL : 0;
1183 int dev_target_new = dev_target;
1184
1185 for (auto& p : file_map) {
1186 //do not copy log
1187 if (p.second->fnode.ino == 1) {
1188 continue;
1189 }
1190 auto& fnode_extents = p.second->fnode.extents;
1191
1192 for (auto ext_it = fnode_extents.begin();
1193 ext_it != p.second->fnode.extents.end();
1194 ++ext_it) {
1195 if (ext_it->bdev != dev_target && devs_source.count(ext_it->bdev)) {
1196 bluefs_extent_t old_ext = *ext_it;
1197 PExtentVector extents;
1198 auto l =
1199 _allocate_without_fallback(dev_target, old_ext.length, &extents);
1200 if (l == 0) {
1201 buf.resize(old_ext.length);
1202 int r = bdev[old_ext.bdev]->read_random(
1203 old_ext.offset,
1204 old_ext.length,
1205 (char*)&buf.at(0),
1206 buffered);
1207 dout(10)<<__func__<<" read = "<<r<<dendl;
1208 if (r != 0) {
1209 derr << __func__ << " failed to read 0x" << std::hex
1210 << old_ext.offset << "~" <<old_ext.length << std::dec
1211 << " from " << (int)dev_target << dendl;
1212 return -EIO;
1213 }
1214
1215 assert(extents.size() > 0);
1216 uint64_t src_buf_pos = 0;
1217 {
1218 // overwrite existing extent
1219 *ext_it=
1220 bluefs_extent_t(dev_target_new, extents[0].offset, extents[0].length);
1221 bufferlist bl;
1222 bl.append((char*)&buf.at(src_buf_pos), extents[0].length);
1223 int r = bdev[dev_target]->write(extents[0].offset, bl, buffered);
1224 ceph_assert(r == 0);
1225 src_buf_pos += extents[0].length;
1226 }
1227 // then insert more extents if needed
1228 for( size_t i = 1; i < extents.size(); ++i) {
1229 bufferlist bl;
1230 bl.append((char*)&buf.at(src_buf_pos), extents[i].length);
1231 ++ext_it;
1232 ext_it = fnode_extents.emplace(ext_it, dev_target_new,
1233 extents[i].offset, extents[i].length);
1234 int r = bdev[dev_target]->write(extents[i].offset, bl, buffered);
1235 ceph_assert(r == 0);
1236 src_buf_pos += extents[i].length;
1237 }
1238 {
1239 PExtentVector to_release;
1240 to_release.emplace_back(old_ext.offset, old_ext.length);
1241 alloc[old_ext.bdev]->release(to_release);
1242 }
1243 } else {
1244 derr << __func__ << " unable to allocate len 0x" << std::hex
1245 << old_ext.length << std::dec << " from " << (int)dev_target
1246 << dendl;
1247 return -ENOSPC;
1248 }
1249 } else if (dev_target != dev_target_new && ext_it->bdev == dev_target) {
1250 ext_it->bdev = dev_target_new;
1251 }
1252 }
1253 auto& prefer_bdev = p.second->fnode.prefer_bdev;
1254 if (prefer_bdev != dev_target && devs_source.count(prefer_bdev)) {
1255 prefer_bdev = dev_target_new;
1256 }
1257 }
1258 // new logging device in the current naming scheme
1259 int new_log_dev_cur =
1260 bdev[BDEV_NEWWAL] ?
1261 BDEV_NEWWAL :
1262 bdev[BDEV_WAL] && !(flags & REMOVE_WAL) ?
1263 BDEV_WAL :
1264 bdev[BDEV_NEWDB] ?
1265 BDEV_NEWDB :
1266 bdev[BDEV_DB] && !(flags & REMOVE_DB)?
1267 BDEV_DB :
1268 BDEV_SLOW;
1269
1270 // new logging device in new naming scheme
1271 int new_log_dev_next =
1272 new_log_dev_cur == BDEV_NEWWAL ?
1273 BDEV_WAL :
1274 new_log_dev_cur == BDEV_NEWDB ?
1275 BDEV_DB :
1276 new_log_dev_cur;
1277
1278 int super_dev =
1279 dev_target == BDEV_NEWDB ?
1280 BDEV_NEWDB :
1281 bdev[BDEV_DB] ?
1282 BDEV_DB :
1283 BDEV_SLOW;
1284
1285 _rewrite_log_sync(
1286 false,
1287 super_dev,
1288 new_log_dev_cur,
1289 new_log_dev_next,
1290 flags);
1291 return 0;
1292}
1293
7c673cae
FG
1294BlueFS::FileRef BlueFS::_get_file(uint64_t ino)
1295{
1296 auto p = file_map.find(ino);
1297 if (p == file_map.end()) {
1298 FileRef f = new File;
1299 file_map[ino] = f;
1300 dout(30) << __func__ << " ino " << ino << " = " << f
1301 << " (new)" << dendl;
1302 return f;
1303 } else {
1304 dout(30) << __func__ << " ino " << ino << " = " << p->second << dendl;
1305 return p->second;
1306 }
1307}
1308
1309void BlueFS::_drop_link(FileRef file)
1310{
1311 dout(20) << __func__ << " had refs " << file->refs
1312 << " on " << file->fnode << dendl;
11fdf7f2 1313 ceph_assert(file->refs > 0);
7c673cae
FG
1314 --file->refs;
1315 if (file->refs == 0) {
1316 dout(20) << __func__ << " destroying " << file->fnode << dendl;
11fdf7f2 1317 ceph_assert(file->num_reading.load() == 0);
7c673cae
FG
1318 log_t.op_file_remove(file->fnode.ino);
1319 for (auto& r : file->fnode.extents) {
1320 pending_release[r.bdev].insert(r.offset, r.length);
1321 }
1322 file_map.erase(file->fnode.ino);
1323 file->deleted = true;
94b18763 1324
7c673cae 1325 if (file->dirty_seq) {
11fdf7f2
TL
1326 ceph_assert(file->dirty_seq > log_seq_stable);
1327 ceph_assert(dirty_files.count(file->dirty_seq));
7c673cae
FG
1328 auto it = dirty_files[file->dirty_seq].iterator_to(*file);
1329 dirty_files[file->dirty_seq].erase(it);
1330 file->dirty_seq = 0;
1331 }
1332 }
1333}
1334
1335int BlueFS::_read_random(
1336 FileReader *h, ///< [in] read from here
1337 uint64_t off, ///< [in] offset
1338 size_t len, ///< [in] this many bytes
1339 char *out) ///< [out] optional: or copy it here
1340{
494da23a
TL
1341 auto* buf = &h->buf;
1342
1343 int ret = 0;
7c673cae
FG
1344 dout(10) << __func__ << " h " << h
1345 << " 0x" << std::hex << off << "~" << len << std::dec
1346 << " from " << h->file->fnode << dendl;
1347
1348 ++h->file->num_reading;
1349
1350 if (!h->ignore_eof &&
1351 off + len > h->file->fnode.size) {
1352 if (off > h->file->fnode.size)
1353 len = 0;
1354 else
1355 len = h->file->fnode.size - off;
1356 dout(20) << __func__ << " reaching (or past) eof, len clipped to 0x"
1357 << std::hex << len << std::dec << dendl;
1358 }
494da23a
TL
1359 logger->inc(l_bluefs_read_random_count, 1);
1360 logger->inc(l_bluefs_read_random_bytes, len);
7c673cae 1361
494da23a 1362 std::shared_lock s_lock(h->lock);
7c673cae 1363 while (len > 0) {
494da23a
TL
1364 if (off < buf->bl_off || off >= buf->get_buf_end()) {
1365 s_lock.unlock();
1366 uint64_t x_off = 0;
1367 auto p = h->file->fnode.seek(off, &x_off);
1368 uint64_t l = std::min(p->length - x_off, static_cast<uint64_t>(len));
1369 dout(20) << __func__ << " read random 0x"
1370 << std::hex << x_off << "~" << l << std::dec
1371 << " of " << *p << dendl;
1372 int r = bdev[p->bdev]->read_random(p->offset + x_off, l, out,
1373 cct->_conf->bluefs_buffered_io);
1374 ceph_assert(r == 0);
1375 off += l;
1376 len -= l;
1377 ret += l;
1378 out += l;
1379
1380 logger->inc(l_bluefs_read_random_disk_count, 1);
1381 logger->inc(l_bluefs_read_random_disk_bytes, l);
1382 if (len > 0) {
1383 s_lock.lock();
1384 }
1385 } else {
1386 auto left = buf->get_buf_remaining(off);
1387 int r = std::min(len, left);
1388 logger->inc(l_bluefs_read_random_buffer_count, 1);
1389 logger->inc(l_bluefs_read_random_buffer_bytes, r);
1390 dout(20) << __func__ << " left 0x" << std::hex << left
1391 << " 0x" << off << "~" << len << std::dec
1392 << dendl;
1393
1394 if (out) {
1395 // NOTE: h->bl is normally a contiguous buffer so c_str() is free.
1396 memcpy(out, buf->bl.c_str() + off - buf->bl_off, r);
1397 out += r;
1398 }
7c673cae 1399
494da23a
TL
1400 dout(30) << __func__ << " result chunk (0x"
1401 << std::hex << r << std::dec << " bytes):\n";
1402 bufferlist t;
1403 t.substr_of(buf->bl, off - buf->bl_off, r);
1404 t.hexdump(*_dout);
1405 *_dout << dendl;
1406
1407 off += r;
1408 len -= r;
1409 ret += r;
1410 buf->pos += r;
1411 }
1412 }
7c673cae
FG
1413 dout(20) << __func__ << " got " << ret << dendl;
1414 --h->file->num_reading;
1415 return ret;
1416}
1417
1418int BlueFS::_read(
1419 FileReader *h, ///< [in] read from here
1420 FileReaderBuffer *buf, ///< [in] reader state
1421 uint64_t off, ///< [in] offset
1422 size_t len, ///< [in] this many bytes
1423 bufferlist *outbl, ///< [out] optional: reference the result here
1424 char *out) ///< [out] optional: or copy it here
1425{
494da23a 1426 bool prefetch = !outbl && !out;
7c673cae
FG
1427 dout(10) << __func__ << " h " << h
1428 << " 0x" << std::hex << off << "~" << len << std::dec
494da23a
TL
1429 << " from " << h->file->fnode
1430 << (prefetch ? " prefetch" : "")
1431 << dendl;
7c673cae
FG
1432
1433 ++h->file->num_reading;
1434
1435 if (!h->ignore_eof &&
1436 off + len > h->file->fnode.size) {
1437 if (off > h->file->fnode.size)
1438 len = 0;
1439 else
1440 len = h->file->fnode.size - off;
1441 dout(20) << __func__ << " reaching (or past) eof, len clipped to 0x"
1442 << std::hex << len << std::dec << dendl;
1443 }
494da23a
TL
1444 logger->inc(l_bluefs_read_count, 1);
1445 logger->inc(l_bluefs_read_bytes, len);
1446 if (prefetch) {
1447 logger->inc(l_bluefs_read_prefetch_count, 1);
1448 logger->inc(l_bluefs_read_prefetch_bytes, len);
1449 }
1450
7c673cae
FG
1451 if (outbl)
1452 outbl->clear();
1453
1454 int ret = 0;
494da23a 1455 std::shared_lock s_lock(h->lock);
7c673cae
FG
1456 while (len > 0) {
1457 size_t left;
1458 if (off < buf->bl_off || off >= buf->get_buf_end()) {
494da23a
TL
1459 s_lock.unlock();
1460 std::unique_lock u_lock(h->lock);
1461 if (off < buf->bl_off || off >= buf->get_buf_end()) {
1462 // if precondition hasn't changed during locking upgrade.
1463 buf->bl.clear();
1464 buf->bl_off = off & super.block_mask();
1465 uint64_t x_off = 0;
1466 auto p = h->file->fnode.seek(buf->bl_off, &x_off);
1467 uint64_t want = round_up_to(len + (off & ~super.block_mask()),
1468 super.block_size);
1469 want = std::max(want, buf->max_prefetch);
1470 uint64_t l = std::min(p->length - x_off, want);
1471 uint64_t eof_offset = round_up_to(h->file->fnode.size, super.block_size);
1472 if (!h->ignore_eof &&
1473 buf->bl_off + l > eof_offset) {
1474 l = eof_offset - buf->bl_off;
1475 }
1476 dout(20) << __func__ << " fetching 0x"
1477 << std::hex << x_off << "~" << l << std::dec
1478 << " of " << *p << dendl;
1479 int r = bdev[p->bdev]->read(p->offset + x_off, l, &buf->bl, ioc[p->bdev],
1480 cct->_conf->bluefs_buffered_io);
1481 ceph_assert(r == 0);
7c673cae 1482 }
494da23a
TL
1483 u_lock.unlock();
1484 s_lock.lock();
1485 // we should recheck if buffer is valid after lock downgrade
1486 continue;
7c673cae
FG
1487 }
1488 left = buf->get_buf_remaining(off);
1489 dout(20) << __func__ << " left 0x" << std::hex << left
1490 << " len 0x" << len << std::dec << dendl;
1491
11fdf7f2 1492 int r = std::min(len, left);
7c673cae
FG
1493 if (outbl) {
1494 bufferlist t;
1495 t.substr_of(buf->bl, off - buf->bl_off, r);
1496 outbl->claim_append(t);
1497 }
1498 if (out) {
1499 // NOTE: h->bl is normally a contiguous buffer so c_str() is free.
1500 memcpy(out, buf->bl.c_str() + off - buf->bl_off, r);
1501 out += r;
1502 }
1503
1504 dout(30) << __func__ << " result chunk (0x"
1505 << std::hex << r << std::dec << " bytes):\n";
1506 bufferlist t;
1507 t.substr_of(buf->bl, off - buf->bl_off, r);
1508 t.hexdump(*_dout);
1509 *_dout << dendl;
1510
1511 off += r;
1512 len -= r;
1513 ret += r;
1514 buf->pos += r;
1515 }
1516
1517 dout(20) << __func__ << " got " << ret << dendl;
11fdf7f2 1518 ceph_assert(!outbl || (int)outbl->length() == ret);
7c673cae
FG
1519 --h->file->num_reading;
1520 return ret;
1521}
1522
1523void BlueFS::_invalidate_cache(FileRef f, uint64_t offset, uint64_t length)
1524{
1525 dout(10) << __func__ << " file " << f->fnode
1526 << " 0x" << std::hex << offset << "~" << length << std::dec
1527 << dendl;
1528 if (offset & ~super.block_mask()) {
1529 offset &= super.block_mask();
11fdf7f2 1530 length = round_up_to(length, super.block_size);
7c673cae
FG
1531 }
1532 uint64_t x_off = 0;
1533 auto p = f->fnode.seek(offset, &x_off);
1534 while (length > 0 && p != f->fnode.extents.end()) {
11fdf7f2 1535 uint64_t x_len = std::min(p->length - x_off, length);
7c673cae
FG
1536 bdev[p->bdev]->invalidate_cache(p->offset + x_off, x_len);
1537 dout(20) << __func__ << " 0x" << std::hex << x_off << "~" << x_len
1538 << std:: dec << " of " << *p << dendl;
1539 offset += x_len;
1540 length -= x_len;
1541 }
1542}
1543
1544uint64_t BlueFS::_estimate_log_size()
1545{
1546 int avg_dir_size = 40; // fixme
1547 int avg_file_size = 12;
1548 uint64_t size = 4096 * 2;
1549 size += file_map.size() * (1 + sizeof(bluefs_fnode_t));
1550 for (auto& p : block_all)
1551 size += p.num_intervals() * (1 + 1 + sizeof(uint64_t) * 2);
1552 size += dir_map.size() + (1 + avg_dir_size);
1553 size += file_map.size() * (1 + avg_dir_size + avg_file_size);
11fdf7f2 1554 return round_up_to(size, super.block_size);
7c673cae
FG
1555}
1556
1557void BlueFS::compact_log()
1558{
11fdf7f2 1559 std::unique_lock l(lock);
7c673cae
FG
1560 if (cct->_conf->bluefs_compact_log_sync) {
1561 _compact_log_sync();
1562 } else {
1563 _compact_log_async(l);
1564 }
1565}
1566
1567bool BlueFS::_should_compact_log()
1568{
1569 uint64_t current = log_writer->file->fnode.size;
1570 uint64_t expected = _estimate_log_size();
1571 float ratio = (float)current / (float)expected;
1572 dout(10) << __func__ << " current 0x" << std::hex << current
1573 << " expected " << expected << std::dec
1574 << " ratio " << ratio
1575 << (new_log ? " (async compaction in progress)" : "")
1576 << dendl;
1577 if (new_log ||
1578 current < cct->_conf->bluefs_log_compact_min_size ||
1579 ratio < cct->_conf->bluefs_log_compact_min_ratio) {
1580 return false;
1581 }
1582 return true;
1583}
1584
11fdf7f2
TL
1585void BlueFS::_compact_log_dump_metadata(bluefs_transaction_t *t,
1586 int flags)
7c673cae
FG
1587{
1588 t->seq = 1;
1589 t->uuid = super.uuid;
1590 dout(20) << __func__ << " op_init" << dendl;
1591
1592 t->op_init();
1593 for (unsigned bdev = 0; bdev < MAX_BDEV; ++bdev) {
1594 interval_set<uint64_t>& p = block_all[bdev];
1595 for (interval_set<uint64_t>::iterator q = p.begin(); q != p.end(); ++q) {
11fdf7f2
TL
1596 auto bdev_new = bdev;
1597 if ((flags & REMOVE_WAL) && bdev == BDEV_WAL) {
1598 continue;
1599 }
1600 if ((flags & REMOVE_DB) && bdev == BDEV_DB) {
1601 continue;
1602 }
1603 if ((flags & RENAME_SLOW2DB) && bdev == BDEV_SLOW) {
1604 bdev_new = BDEV_DB;
1605 }
1606 if ((flags & RENAME_DB2SLOW) && bdev == BDEV_DB) {
1607 bdev_new = BDEV_SLOW;
1608 }
1609 if (bdev == BDEV_NEWDB) {
1610 // REMOVE_DB xor RENAME_DB
1611 ceph_assert(!(flags & REMOVE_DB) != !(flags & RENAME_DB2SLOW));
1612 ceph_assert(!(flags & RENAME_SLOW2DB));
1613 bdev_new = BDEV_DB;
1614 }
1615 if (bdev == BDEV_NEWWAL) {
1616 ceph_assert(flags & REMOVE_WAL);
1617 bdev_new = BDEV_WAL;
1618 }
1619 dout(20) << __func__ << " op_alloc_add " << bdev_new << " 0x"
7c673cae
FG
1620 << std::hex << q.get_start() << "~" << q.get_len() << std::dec
1621 << dendl;
11fdf7f2 1622 t->op_alloc_add(bdev_new, q.get_start(), q.get_len());
7c673cae
FG
1623 }
1624 }
1625 for (auto& p : file_map) {
1626 if (p.first == 1)
1627 continue;
11fdf7f2
TL
1628 ceph_assert(p.first > 1);
1629
1630 for(auto& e : p.second->fnode.extents) {
1631 auto bdev = e.bdev;
1632 auto bdev_new = bdev;
1633 ceph_assert(!((flags & REMOVE_WAL) && bdev == BDEV_WAL));
1634 if ((flags & RENAME_SLOW2DB) && bdev == BDEV_SLOW) {
1635 bdev_new = BDEV_DB;
1636 }
1637 if ((flags & RENAME_DB2SLOW) && bdev == BDEV_DB) {
1638 bdev_new = BDEV_SLOW;
1639 }
1640 if (bdev == BDEV_NEWDB) {
1641 // REMOVE_DB xor RENAME_DB
1642 ceph_assert(!(flags & REMOVE_DB) != !(flags & RENAME_DB2SLOW));
1643 ceph_assert(!(flags & RENAME_SLOW2DB));
1644 bdev_new = BDEV_DB;
1645 }
1646 if (bdev == BDEV_NEWWAL) {
1647 ceph_assert(flags & REMOVE_WAL);
1648 bdev_new = BDEV_WAL;
1649 }
1650 e.bdev = bdev_new;
1651 }
7c673cae 1652 dout(20) << __func__ << " op_file_update " << p.second->fnode << dendl;
7c673cae
FG
1653 t->op_file_update(p.second->fnode);
1654 }
1655 for (auto& p : dir_map) {
1656 dout(20) << __func__ << " op_dir_create " << p.first << dendl;
1657 t->op_dir_create(p.first);
1658 for (auto& q : p.second->file_map) {
1659 dout(20) << __func__ << " op_dir_link " << p.first << "/" << q.first
1660 << " to " << q.second->fnode.ino << dendl;
1661 t->op_dir_link(p.first, q.first, q.second->fnode.ino);
1662 }
1663 }
1664}
1665
1666void BlueFS::_compact_log_sync()
1667{
1668 dout(10) << __func__ << dendl;
11fdf7f2
TL
1669 _rewrite_log_sync(true,
1670 BDEV_DB,
1671 log_writer->file->fnode.prefer_bdev,
1672 log_writer->file->fnode.prefer_bdev,
1673 0);
1674 logger->inc(l_bluefs_log_compactions);
1675}
1676
1677void BlueFS::_rewrite_log_sync(bool allocate_with_fallback,
1678 int super_dev,
1679 int log_dev,
1680 int log_dev_new,
1681 int flags)
1682{
7c673cae
FG
1683 File *log_file = log_writer->file.get();
1684
1685 // clear out log (be careful who calls us!!!)
1686 log_t.clear();
1687
11fdf7f2
TL
1688 dout(20) << __func__ << " super_dev:" << super_dev
1689 << " log_dev:" << log_dev
1690 << " log_dev_new:" << log_dev_new
1691 << " flags:" << flags
1692 << dendl;
7c673cae 1693 bluefs_transaction_t t;
11fdf7f2 1694 _compact_log_dump_metadata(&t, flags);
7c673cae
FG
1695
1696 dout(20) << __func__ << " op_jump_seq " << log_seq << dendl;
1697 t.op_jump_seq(log_seq);
1698
1699 bufferlist bl;
11fdf7f2 1700 encode(t, bl);
7c673cae
FG
1701 _pad_bl(bl);
1702
1703 uint64_t need = bl.length() + cct->_conf->bluefs_max_log_runway;
1704 dout(20) << __func__ << " need " << need << dendl;
1705
494da23a 1706 bluefs_fnode_t old_fnode;
11fdf7f2 1707 int r;
494da23a 1708 log_file->fnode.swap_extents(old_fnode);
11fdf7f2
TL
1709 if (allocate_with_fallback) {
1710 r = _allocate(log_dev, need, &log_file->fnode);
1711 ceph_assert(r == 0);
1712 } else {
1713 PExtentVector extents;
1714 r = _allocate_without_fallback(log_dev,
1715 need,
1716 &extents);
1717 ceph_assert(r == 0);
1718 for (auto& p : extents) {
1719 log_file->fnode.append_extent(
1720 bluefs_extent_t(log_dev, p.offset, p.length));
1721 }
7c673cae
FG
1722 }
1723
1724 _close_writer(log_writer);
1725
1726 log_file->fnode.size = bl.length();
1727 log_writer = _create_writer(log_file);
1728 log_writer->append(bl);
11fdf7f2
TL
1729 r = _flush(log_writer, true);
1730 ceph_assert(r == 0);
1731#ifdef HAVE_LIBAIO
1732 if (!cct->_conf->bluefs_sync_write) {
1733 list<aio_t> completed_ios;
1734 _claim_completed_aios(log_writer, &completed_ios);
1735 wait_for_aio(log_writer);
1736 completed_ios.clear();
1737 }
1738#endif
224ce89b 1739 flush_bdev();
224ce89b 1740
7c673cae 1741 super.log_fnode = log_file->fnode;
11fdf7f2
TL
1742 // rename device if needed
1743 if (log_dev != log_dev_new) {
1744 dout(10) << __func__ << " renaming log extents to " << log_dev_new << dendl;
1745 for (auto& p : super.log_fnode.extents) {
1746 p.bdev = log_dev_new;
1747 }
1748 }
1749 dout(10) << __func__ << " writing super, log fnode: " << super.log_fnode << dendl;
1750
7c673cae 1751 ++super.version;
11fdf7f2 1752 _write_super(super_dev);
7c673cae
FG
1753 flush_bdev();
1754
494da23a
TL
1755 dout(10) << __func__ << " release old log extents " << old_fnode.extents << dendl;
1756 for (auto& r : old_fnode.extents) {
7c673cae
FG
1757 pending_release[r.bdev].insert(r.offset, r.length);
1758 }
7c673cae
FG
1759}
1760
1761/*
1762 * 1. Allocate a new extent to continue the log, and then log an event
1763 * that jumps the log write position to the new extent. At this point, the
1764 * old extent(s) won't be written to, and reflect everything to compact.
1765 * New events will be written to the new region that we'll keep.
1766 *
1767 * 2. While still holding the lock, encode a bufferlist that dumps all of the
1768 * in-memory fnodes and names. This will become the new beginning of the
1769 * log. The last event will jump to the log continuation extent from #1.
1770 *
1771 * 3. Queue a write to a new extent for the new beginnging of the log.
1772 *
1773 * 4. Drop lock and wait
1774 *
1775 * 5. Retake the lock.
1776 *
1777 * 6. Update the log_fnode to splice in the new beginning.
1778 *
1779 * 7. Write the new superblock.
1780 *
1781 * 8. Release the old log space. Clean up.
1782 */
11fdf7f2 1783void BlueFS::_compact_log_async(std::unique_lock<ceph::mutex>& l)
7c673cae
FG
1784{
1785 dout(10) << __func__ << dendl;
1786 File *log_file = log_writer->file.get();
11fdf7f2
TL
1787 ceph_assert(!new_log);
1788 ceph_assert(!new_log_writer);
7c673cae 1789
181888fb
FG
1790 // create a new log [writer] so that we know compaction is in progress
1791 // (see _should_compact_log)
1792 new_log = new File;
1793 new_log->fnode.ino = 0; // so that _flush_range won't try to log the fnode
1794
3efd9988
FG
1795 // 0. wait for any racing flushes to complete. (We do not want to block
1796 // in _flush_sync_log with jump_to set or else a racing thread might flush
1797 // our entries and our jump_to update won't be correct.)
1798 while (log_flushing) {
1799 dout(10) << __func__ << " log is currently flushing, waiting" << dendl;
1800 log_cond.wait(l);
1801 }
1802
7c673cae
FG
1803 // 1. allocate new log space and jump to it.
1804 old_log_jump_to = log_file->fnode.get_allocated();
7c673cae 1805 dout(10) << __func__ << " old_log_jump_to 0x" << std::hex << old_log_jump_to
11fdf7f2
TL
1806 << " need 0x" << (old_log_jump_to + cct->_conf->bluefs_max_log_runway) << std::dec << dendl;
1807 int r = _allocate(log_file->fnode.prefer_bdev,
1808 cct->_conf->bluefs_max_log_runway, &log_file->fnode);
1809 ceph_assert(r == 0);
7c673cae
FG
1810 dout(10) << __func__ << " log extents " << log_file->fnode.extents << dendl;
1811
1812 // update the log file change and log a jump to the offset where we want to
1813 // write the new entries
1814 log_t.op_file_update(log_file->fnode);
1815 log_t.op_jump(log_seq, old_log_jump_to);
1816
1817 flush_bdev(); // FIXME?
1818
1819 _flush_and_sync_log(l, 0, old_log_jump_to);
1820
1821 // 2. prepare compacted log
1822 bluefs_transaction_t t;
224ce89b
WB
1823 //avoid record two times in log_t and _compact_log_dump_metadata.
1824 log_t.clear();
11fdf7f2 1825 _compact_log_dump_metadata(&t, 0);
7c673cae
FG
1826
1827 // conservative estimate for final encoded size
11fdf7f2 1828 new_log_jump_to = round_up_to(t.op_bl.length() + super.block_size * 2,
7c673cae
FG
1829 cct->_conf->bluefs_alloc_size);
1830 t.op_jump(log_seq, new_log_jump_to);
1831
11fdf7f2
TL
1832 // allocate
1833 r = _allocate(BlueFS::BDEV_DB, new_log_jump_to,
1834 &new_log->fnode);
1835 ceph_assert(r == 0);
1836
1837 // we might have some more ops in log_t due to _allocate call
1838 t.claim_ops(log_t);
1839
7c673cae 1840 bufferlist bl;
11fdf7f2 1841 encode(t, bl);
7c673cae
FG
1842 _pad_bl(bl);
1843
1844 dout(10) << __func__ << " new_log_jump_to 0x" << std::hex << new_log_jump_to
1845 << std::dec << dendl;
1846
7c673cae
FG
1847 new_log_writer = _create_writer(new_log);
1848 new_log_writer->append(bl);
1849
1850 // 3. flush
1851 r = _flush(new_log_writer, true);
11fdf7f2 1852 ceph_assert(r == 0);
7c673cae
FG
1853
1854 // 4. wait
11fdf7f2 1855 _flush_bdev_safely(new_log_writer);
7c673cae 1856
11fdf7f2 1857 // 5. update our log fnode
7c673cae
FG
1858 // discard first old_log_jump_to extents
1859 dout(10) << __func__ << " remove 0x" << std::hex << old_log_jump_to << std::dec
1860 << " of " << log_file->fnode.extents << dendl;
1861 uint64_t discarded = 0;
1862 mempool::bluefs::vector<bluefs_extent_t> old_extents;
1863 while (discarded < old_log_jump_to) {
11fdf7f2 1864 ceph_assert(!log_file->fnode.extents.empty());
7c673cae
FG
1865 bluefs_extent_t& e = log_file->fnode.extents.front();
1866 bluefs_extent_t temp = e;
1867 if (discarded + e.length <= old_log_jump_to) {
1868 dout(10) << __func__ << " remove old log extent " << e << dendl;
1869 discarded += e.length;
94b18763 1870 log_file->fnode.pop_front_extent();
7c673cae
FG
1871 } else {
1872 dout(10) << __func__ << " remove front of old log extent " << e << dendl;
1873 uint64_t drop = old_log_jump_to - discarded;
1874 temp.length = drop;
1875 e.offset += drop;
1876 e.length -= drop;
1877 discarded += drop;
1878 dout(10) << __func__ << " kept " << e << " removed " << temp << dendl;
1879 }
1880 old_extents.push_back(temp);
1881 }
94b18763
FG
1882 auto from = log_file->fnode.extents.begin();
1883 auto to = log_file->fnode.extents.end();
1884 while (from != to) {
1885 new_log->fnode.append_extent(*from);
1886 ++from;
1887 }
7c673cae
FG
1888
1889 // clear the extents from old log file, they are added to new log
94b18763 1890 log_file->fnode.clear_extents();
7c673cae 1891 // swap the log files. New log file is the log file now.
94b18763
FG
1892 new_log->fnode.swap_extents(log_file->fnode);
1893
7c673cae
FG
1894 log_writer->pos = log_writer->file->fnode.size =
1895 log_writer->pos - old_log_jump_to + new_log_jump_to;
1896
11fdf7f2 1897 // 6. write the super block to reflect the changes
7c673cae
FG
1898 dout(10) << __func__ << " writing super" << dendl;
1899 super.log_fnode = log_file->fnode;
1900 ++super.version;
11fdf7f2 1901 _write_super(BDEV_DB);
7c673cae
FG
1902
1903 lock.unlock();
1904 flush_bdev();
1905 lock.lock();
1906
11fdf7f2 1907 // 7. release old space
7c673cae
FG
1908 dout(10) << __func__ << " release old log extents " << old_extents << dendl;
1909 for (auto& r : old_extents) {
1910 pending_release[r.bdev].insert(r.offset, r.length);
1911 }
1912
1913 // delete the new log, remove from the dirty files list
1914 _close_writer(new_log_writer);
1915 if (new_log->dirty_seq) {
11fdf7f2 1916 ceph_assert(dirty_files.count(new_log->dirty_seq));
7c673cae
FG
1917 auto it = dirty_files[new_log->dirty_seq].iterator_to(*new_log);
1918 dirty_files[new_log->dirty_seq].erase(it);
1919 }
1920 new_log_writer = nullptr;
1921 new_log = nullptr;
1922 log_cond.notify_all();
1923
1924 dout(10) << __func__ << " log extents " << log_file->fnode.extents << dendl;
1925 logger->inc(l_bluefs_log_compactions);
1926}
1927
1928void BlueFS::_pad_bl(bufferlist& bl)
1929{
1930 uint64_t partial = bl.length() % super.block_size;
1931 if (partial) {
1932 dout(10) << __func__ << " padding with 0x" << std::hex
1933 << super.block_size - partial << " zeros" << std::dec << dendl;
1934 bl.append_zero(super.block_size - partial);
1935 }
1936}
1937
1938void BlueFS::flush_log()
1939{
11fdf7f2 1940 std::unique_lock l(lock);
7c673cae
FG
1941 flush_bdev();
1942 _flush_and_sync_log(l);
1943}
1944
// Append the pending log transaction (log_t), plus an op_file_update for every
// file in dirty_files[seq], to the log writer and flush it to the device.
//   l        - lock handle handed to log_cond.wait() whenever this has to wait
//   want_seq - when non-zero and already <= log_seq_stable, return immediately
//   jump_to  - when non-zero, the log writer position and the log file size are
//              set to this offset after the flush; asserted to be zero on the
//              wait path and on both early returns
// Returns 0 on every path; a failed _allocate or _flush trips ceph_assert.
11fdf7f2 1945int BlueFS::_flush_and_sync_log(std::unique_lock<ceph::mutex>& l,
7c673cae
FG
1946 uint64_t want_seq,
1947 uint64_t jump_to)
1948{
// Only one flush runs at a time: wait until log_flushing has been cleared.
1949 while (log_flushing) {
1950 dout(10) << __func__ << " want_seq " << want_seq
1951 << " log is currently flushing, waiting" << dendl;
11fdf7f2 1952 ceph_assert(!jump_to);
7c673cae
FG
1953 log_cond.wait(l);
1954 }
// Nothing to do if the requested sequence is already stable.
1955 if (want_seq && want_seq <= log_seq_stable) {
1956 dout(10) << __func__ << " want_seq " << want_seq << " <= log_seq_stable "
1957 << log_seq_stable << ", done" << dendl;
11fdf7f2 1958 ceph_assert(!jump_to);
7c673cae
FG
1959 return 0;
1960 }
// Nothing to do if there are neither pending log ops nor dirty files.
1961 if (log_t.empty() && dirty_files.empty()) {
1962 dout(10) << __func__ << " want_seq " << want_seq
1963 << " " << log_t << " not dirty, dirty_files empty, no-op" << dendl;
11fdf7f2 1964 ceph_assert(!jump_to);
7c673cae
FG
1965 return 0;
1966 }
1967
a8e16298
TL
// Take over everything queued in pending_release (leaving it with empty
// sets); these extents are discarded/released at the end of this function,
// after the log has been flushed.
1968 vector<interval_set<uint64_t>> to_release(pending_release.size());
1969 to_release.swap(pending_release);
1970
// Assign the next log sequence number to this transaction.
7c673cae 1971 uint64_t seq = log_t.seq = ++log_seq;
11fdf7f2 1972 ceph_assert(want_seq == 0 || want_seq <= seq);
7c673cae
FG
1973 log_t.uuid = super.uuid;
1974
1975 // log dirty files
1976 auto lsi = dirty_files.find(seq);
1977 if (lsi != dirty_files.end()) {
1978 dout(20) << __func__ << " " << lsi->second.size() << " dirty_files" << dendl;
1979 for (auto &f : lsi->second) {
1980 dout(20) << __func__ << " op_file_update " << f.fnode << dendl;
1981 log_t.op_file_update(f.fnode);
1982 }
1983 }
1984
1985 dout(10) << __func__ << " " << log_t << dendl;
11fdf7f2 1986 ceph_assert(!log_t.empty());
7c673cae
FG
1987
1988 // allocate some more space (before we run out)?
// runway = allocated log space not yet consumed by the writer; when it drops
// below bluefs_min_log_runway, bluefs_max_log_runway more bytes are allocated
// and the changed log fnode is recorded in this same transaction.
1989 int64_t runway = log_writer->file->fnode.get_allocated() -
1990 log_writer->get_effective_write_pos();
1991 if (runway < (int64_t)cct->_conf->bluefs_min_log_runway) {
1992 dout(10) << __func__ << " allocating more log runway (0x"
1993 << std::hex << runway << std::dec << " remaining)" << dendl;
1994 while (new_log_writer) {
1995 dout(10) << __func__ << " waiting for async compaction" << dendl;
1996 log_cond.wait(l);
1997 }
1998 int r = _allocate(log_writer->file->fnode.prefer_bdev,
1999 cct->_conf->bluefs_max_log_runway,
94b18763 2000 &log_writer->file->fnode);
11fdf7f2 2001 ceph_assert(r == 0);
7c673cae
FG
2002 log_t.op_file_update(log_writer->file->fnode);
2003 }
2004
// Encode the transaction and zero-pad it up to a multiple of super.block_size.
2005 bufferlist bl;
11fdf7f2
TL
2006 bl.reserve(super.block_size);
2007 encode(log_t, bl);
7c673cae 2008 // pad to block boundary
11fdf7f2
TL
2009 size_t realign = super.block_size - (bl.length() % super.block_size);
2010 if (realign && realign != super.block_size)
2011 bl.append_zero(realign);
2012
7c673cae
FG
2013 logger->inc(l_bluefs_logged_bytes, bl.length());
2014
2015 log_writer->append(bl);
2016
// The transaction now lives in the writer's buffer; reset log_t for new ops
// and mark the flush as in progress.
2017 log_t.clear();
2018 log_t.seq = 0; // just so debug output is less confusing
2019 log_flushing = true;
2020
2021 int r = _flush(log_writer, true);
11fdf7f2 2022 ceph_assert(r == 0);
7c673cae
FG
2023
2024 if (jump_to) {
2025 dout(10) << __func__ << " jumping log offset from 0x" << std::hex
2026 << log_writer->pos << " -> 0x" << jump_to << std::dec << dendl;
2027 log_writer->pos = jump_to;
2028 log_writer->file->fnode.size = jump_to;
2029 }
2030
// _flush_bdev_safely() (defined in this file) unlocks `lock` while it waits
// for the device flush and re-locks it before returning.
2031 _flush_bdev_safely(log_writer);
2032
2033 log_flushing = false;
2034 log_cond.notify_all();
2035
2036 // clean dirty files
// Every dirty_files entry with a key <= the new log_seq_stable is emptied:
// each file gets dirty_seq = 0 and is unlinked from its list.
2037 if (seq > log_seq_stable) {
2038 log_seq_stable = seq;
2039 dout(20) << __func__ << " log_seq_stable " << log_seq_stable << dendl;
2040
2041 auto p = dirty_files.begin();
2042 while (p != dirty_files.end()) {
2043 if (p->first > log_seq_stable) {
2044 dout(20) << __func__ << " done cleaning up dirty files" << dendl;
2045 break;
2046 }
2047
// NOTE: this iterator `l` shadows the lock parameter `l` inside this loop.
2048 auto l = p->second.begin();
2049 while (l != p->second.end()) {
2050 File *file = &*l;
11fdf7f2
TL
2051 ceph_assert(file->dirty_seq > 0);
2052 ceph_assert(file->dirty_seq <= log_seq_stable);
7c673cae
FG
2053 dout(20) << __func__ << " cleaned file " << file->fnode << dendl;
2054 file->dirty_seq = 0;
2055 p->second.erase(l++);
2056 }
2057
11fdf7f2 2058 ceph_assert(p->second.empty());
7c673cae
FG
2059 dirty_files.erase(p++);
2060 }
2061 } else {
2062 dout(20) << __func__ << " log_seq_stable " << log_seq_stable
2063 << " already >= out seq " << seq
2064 << ", we lost a race against another log flush, done" << dendl;
2065 }
a8e16298
TL
2066
// Discard and/or release the extents taken from pending_release above.
// When an async discard is queued successfully (queue_discard returns 0) the
// allocator release is skipped here.
// NOTE(review): presumably the discard callback (handle_discard) performs the
// release later -- confirm.
2067 for (unsigned i = 0; i < to_release.size(); ++i) {
2068 if (!to_release[i].empty()) {
2069 /* OK, now we have the guarantee alloc[i] won't be null. */
11fdf7f2
TL
2070 int r = 0;
2071 if (cct->_conf->bdev_enable_discard && cct->_conf->bdev_async_discard) {
2072 r = bdev[i]->queue_discard(to_release[i]);
2073 if (r == 0)
2074 continue;
2075 } else if (cct->_conf->bdev_enable_discard) {
2076 for (auto p = to_release[i].begin(); p != to_release[i].end(); ++p) {
2077 bdev[i]->discard(p.get_start(), p.get_len());
2078 }
2079 }
a8e16298
TL
2080 alloc[i]->release(to_release[i]);
2081 }
2082 }
2083
7c673cae
FG
2084 _update_logger_stats();
2085
2086 return 0;
2087}
2088
// Write `length` bytes of h's buffered data, starting at file offset `offset`,
// to the block device extents backing h->file.
// A range that ends at or before h->pos is a no-op (returns 0); a range that
// starts before h->pos is trimmed so that it begins at h->pos.
// Allocates more extents when the range runs past the allocated space, and
// puts the file on dirty_files when its extents or (for ino > 1) size change.
// Returns 0 on success. The allocation-failure path calls
// ceph_abort_msg("bluefs enospc") before its `return r`.
2089int BlueFS::_flush_range(FileWriter *h, uint64_t offset, uint64_t length)
2090{
2091 dout(10) << __func__ << " " << h << " pos 0x" << std::hex << h->pos
2092 << " 0x" << offset << "~" << length << std::dec
2093 << " to " << h->file->fnode << dendl;
11fdf7f2
TL
2094 ceph_assert(!h->file->deleted);
2095 ceph_assert(h->file->num_readers.load() == 0);
7c673cae
FG
2096
2097 h->buffer_appender.flush();
2098
// ino 1 is always written unbuffered; every other file follows the
// bluefs_buffered_io option.
2099 bool buffered;
2100 if (h->file->fnode.ino == 1)
2101 buffered = false;
2102 else
2103 buffered = cct->_conf->bluefs_buffered_io;
2104
2105 if (offset + length <= h->pos)
2106 return 0;
2107 if (offset < h->pos) {
2108 length -= h->pos - offset;
2109 offset = h->pos;
2110 dout(10) << " still need 0x"
2111 << std::hex << offset << "~" << length << std::dec
2112 << dendl;
2113 }
11fdf7f2 2114 ceph_assert(offset <= h->file->fnode.size);
7c673cae
FG
2115
2116 uint64_t allocated = h->file->fnode.get_allocated();
2117
2118 // do not bother to dirty the file if we are overwriting
2119 // previously allocated extents.
2120 bool must_dirty = false;
2121 if (allocated < offset + length) {
2122 // we should never run out of log space here; see the min runway check
2123 // in _flush_and_sync_log.
11fdf7f2 2124 ceph_assert(h->file->fnode.ino != 1);
7c673cae
FG
2125 int r = _allocate(h->file->fnode.prefer_bdev,
2126 offset + length - allocated,
94b18763 2127 &h->file->fnode);
7c673cae
FG
2128 if (r < 0) {
2129 derr << __func__ << " allocated: 0x" << std::hex << allocated
2130 << " offset: 0x" << offset << " length: 0x" << length << std::dec
2131 << dendl;
11fdf7f2 2132 ceph_abort_msg("bluefs enospc");
7c673cae
FG
2133 return r;
2134 }
7c673cae
FG
2135 if (cct->_conf->bluefs_preextend_wal_files &&
2136 h->writer_type == WRITER_WAL) {
2137 // NOTE: this *requires* that rocksdb also has log recycling
2138 // enabled and is therefore doing robust CRCs on the log
2139 // records. otherwise, we will fail to reply the rocksdb log
2140 // properly due to garbage on the device.
2141 h->file->fnode.size = h->file->fnode.get_allocated();
2142 dout(10) << __func__ << " extending WAL size to 0x" << std::hex
2143 << h->file->fnode.size << std::dec << " to include allocated"
2144 << dendl;
2145 }
2146 must_dirty = true;
2147 }
2148 if (h->file->fnode.size < offset + length) {
2149 h->file->fnode.size = offset + length;
2150 if (h->file->fnode.ino > 1) {
2151 // we do not need to dirty the log file (or it's compacting
2152 // replacement) when the file size changes because replay is
2153 // smart enough to discover it on its own.
2154 must_dirty = true;
2155 }
2156 }
// A dirty file is filed under dirty_files[log_seq + 1]. If it is already
// filed under a different sequence it is moved to the new one.
2157 if (must_dirty) {
2158 h->file->fnode.mtime = ceph_clock_now();
11fdf7f2 2159 ceph_assert(h->file->fnode.ino >= 1);
7c673cae
FG
2160 if (h->file->dirty_seq == 0) {
2161 h->file->dirty_seq = log_seq + 1;
2162 dirty_files[h->file->dirty_seq].push_back(*h->file);
2163 dout(20) << __func__ << " dirty_seq = " << log_seq + 1
2164 << " (was clean)" << dendl;
2165 } else {
2166 if (h->file->dirty_seq != log_seq + 1) {
2167 // need re-dirty, erase from list first
11fdf7f2 2168 ceph_assert(dirty_files.count(h->file->dirty_seq));
7c673cae
FG
2169 auto it = dirty_files[h->file->dirty_seq].iterator_to(*h->file);
2170 dirty_files[h->file->dirty_seq].erase(it);
2171 h->file->dirty_seq = log_seq + 1;
2172 dirty_files[h->file->dirty_seq].push_back(*h->file);
// NOTE(review): dirty_seq was overwritten two statements above, so the
// "(was ...)" value logged below is the new sequence, not the previous one.
2173 dout(20) << __func__ << " dirty_seq = " << log_seq + 1
2174 << " (was " << h->file->dirty_seq << ")" << dendl;
2175 } else {
2176 dout(20) << __func__ << " dirty_seq = " << log_seq + 1
2177 << " (unchanged, do nothing) " << dendl;
2178 }
2179 }
2180 }
2181 dout(20) << __func__ << " file now " << h->file->fnode << dendl;
2182
// Locate the extent containing `offset`; x_off is the offset inside it.
2183 uint64_t x_off = 0;
2184 auto p = h->file->fnode.seek(offset, &x_off);
11fdf7f2 2185 ceph_assert(p != h->file->fnode.extents.end());
7c673cae
FG
2186 dout(20) << __func__ << " in " << *p << " x_off 0x"
2187 << std::hex << x_off << std::dec << dendl;
2188
// If the write does not start on a block boundary, the cached tail of the
// previous write (h->tail_block) is put in front of the new data and the
// range is moved back to the block start. Outstanding aios on h->iocv are
// waited for first.
2189 unsigned partial = x_off & ~super.block_mask();
2190 bufferlist bl;
2191 if (partial) {
2192 dout(20) << __func__ << " using partial tail 0x"
2193 << std::hex << partial << std::dec << dendl;
11fdf7f2 2194 ceph_assert(h->tail_block.length() == partial);
31f18b77 2195 bl.claim_append_piecewise(h->tail_block);
7c673cae
FG
2196 x_off -= partial;
2197 offset -= partial;
2198 length += partial;
2199 dout(20) << __func__ << " waiting for previous aio to complete" << dendl;
2200 for (auto p : h->iocv) {
2201 if (p) {
2202 p->aio_wait();
2203 }
2204 }
2205 }
// Move the data to write from h->buffer into bl: either all of it, or only
// the leading part, keeping the rest buffered.
// NOTE(review): in the else branch `length` already includes `partial`, and
// substr_of() is applied at offset `length` to a buffer whose head was just
// removed by splice() -- verify that this partial-flush path is correct.
2206 if (length == partial + h->buffer.length()) {
31f18b77 2207 bl.claim_append_piecewise(h->buffer);
7c673cae
FG
2208 } else {
2209 bufferlist t;
31f18b77
FG
2210 h->buffer.splice(0, length, &t);
2211 bl.claim_append_piecewise(t);
7c673cae
FG
2212 t.substr_of(h->buffer, length, h->buffer.length() - length);
2213 h->buffer.swap(t);
2214 dout(20) << " leaving 0x" << std::hex << h->buffer.length() << std::dec
2215 << " unflushed" << dendl;
2216 }
11fdf7f2 2217 ceph_assert(bl.length() == length);
7c673cae
FG
2218
// Per-writer-type byte counters; other writer types are not counted here.
2219 switch (h->writer_type) {
2220 case WRITER_WAL:
2221 logger->inc(l_bluefs_bytes_written_wal, length);
2222 break;
2223 case WRITER_SST:
2224 logger->inc(l_bluefs_bytes_written_sst, length);
2225 break;
2226 }
2227
2228 dout(30) << "dump:\n";
2229 bl.hexdump(*_dout);
2230 *_dout << dendl;
2231
2232 h->pos = offset + length;
2233 h->tail_block.clear();
2234
// Issue one write per extent. A chunk that does not end on a block boundary
// is zero-padded up to the block size and its unpadded tail is cached in
// h->tail_block.
2235 uint64_t bloff = 0;
11fdf7f2 2236 uint64_t bytes_written_slow = 0;
7c673cae 2237 while (length > 0) {
11fdf7f2 2238 uint64_t x_len = std::min(p->length - x_off, length);
7c673cae
FG
2239 bufferlist t;
2240 t.substr_of(bl, bloff, x_len);
2241 unsigned tail = x_len & ~super.block_mask();
2242 if (tail) {
2243 size_t zlen = super.block_size - tail;
2244 dout(20) << __func__ << " caching tail of 0x"
2245 << std::hex << tail
2246 << " and padding block with 0x" << zlen
2247 << std::dec << dendl;
2248 h->tail_block.substr_of(bl, bl.length() - tail, tail);
2249 if (h->file->fnode.ino > 1) {
2250 // we are using the page_aligned_appender, and can safely use
2251 // the tail of the raw buffer.
2252 const bufferptr &last = t.back();
2253 if (last.unused_tail_length() < zlen) {
2254 derr << " wtf, last is " << last << " from " << t << dendl;
11fdf7f2 2255 ceph_assert(last.unused_tail_length() >= zlen);
7c673cae
FG
2256 }
2257 bufferptr z = last;
2258 z.set_offset(last.offset() + last.length());
2259 z.set_length(zlen);
2260 z.zero();
2261 t.append(z, 0, zlen);
2262 } else {
2263 t.append_zero(zlen);
2264 }
2265 }
// Synchronous or aio write, selected by bluefs_sync_write.
2266 if (cct->_conf->bluefs_sync_write) {
11fdf7f2 2267 bdev[p->bdev]->write(p->offset + x_off, t, buffered, h->write_hint);
7c673cae 2268 } else {
11fdf7f2
TL
2269 bdev[p->bdev]->aio_write(p->offset + x_off, t, h->iocv[p->bdev], buffered, h->write_hint);
2270 }
// Remember which devices were written so they can be flushed later.
2271 h->dirty_devs[p->bdev] = true;
2272 if (p->bdev == BDEV_SLOW) {
2273 bytes_written_slow += t.length();
7c673cae 2274 }
11fdf7f2 2275
7c673cae
FG
2276 bloff += x_len;
2277 length -= x_len;
2278 ++p;
2279 x_off = 0;
2280 }
11fdf7f2 2281 logger->inc(l_bluefs_bytes_written_slow, bytes_written_slow);
7c673cae
FG
// Submit the aios queued above, per device.
2282 for (unsigned i = 0; i < MAX_BDEV; ++i) {
2283 if (bdev[i]) {
11fdf7f2 2284 if (h->iocv[i] && h->iocv[i]->has_pending_aios()) {
7c673cae
FG
2285 bdev[i]->aio_submit(h->iocv[i]);
2286 }
2287 }
2288 }
2289 dout(20) << __func__ << " h " << h << " pos now 0x"
2290 << std::hex << h->pos << std::dec << dendl;
2291 return 0;
2292}
2293
11fdf7f2 2294#ifdef HAVE_LIBAIO
7c673cae
FG
2295// we need to retire old completed aios so they don't stick around in
2296// memory indefinitely (along with their bufferlist refs).
2297void BlueFS::_claim_completed_aios(FileWriter *h, list<aio_t> *ls)
2298{
2299 for (auto p : h->iocv) {
2300 if (p) {
2301 ls->splice(ls->end(), p->running_aios);
2302 }
2303 }
2304 dout(10) << __func__ << " got " << ls->size() << " aios" << dendl;
2305}
2306
2307void BlueFS::wait_for_aio(FileWriter *h)
2308{
2309 // NOTE: this is safe to call without a lock, as long as our reference is
2310 // stable.
2311 dout(10) << __func__ << " " << h << dendl;
2312 utime_t start = ceph_clock_now();
2313 for (auto p : h->iocv) {
2314 if (p) {
2315 p->aio_wait();
2316 }
2317 }
11fdf7f2 2318 dout(10) << __func__ << " " << h << " done in " << (ceph_clock_now() - start) << dendl;
7c673cae 2319}
11fdf7f2 2320#endif
7c673cae
FG
2321
2322int BlueFS::_flush(FileWriter *h, bool force)
2323{
2324 h->buffer_appender.flush();
2325 uint64_t length = h->buffer.length();
2326 uint64_t offset = h->pos;
2327 if (!force &&
2328 length < cct->_conf->bluefs_min_flush_size) {
2329 dout(10) << __func__ << " " << h << " ignoring, length " << length
2330 << " < min_flush_size " << cct->_conf->bluefs_min_flush_size
2331 << dendl;
2332 return 0;
2333 }
2334 if (length == 0) {
2335 dout(10) << __func__ << " " << h << " no dirty data on "
2336 << h->file->fnode << dendl;
2337 return 0;
2338 }
2339 dout(10) << __func__ << " " << h << " 0x"
2340 << std::hex << offset << "~" << length << std::dec
2341 << " to " << h->file->fnode << dendl;
11fdf7f2 2342 ceph_assert(h->pos <= h->file->fnode.size);
7c673cae
FG
2343 return _flush_range(h, offset, length);
2344}
2345
2346int BlueFS::_truncate(FileWriter *h, uint64_t offset)
2347{
2348 dout(10) << __func__ << " 0x" << std::hex << offset << std::dec
2349 << " file " << h->file->fnode << dendl;
2350 if (h->file->deleted) {
2351 dout(10) << __func__ << " deleted, no-op" << dendl;
2352 return 0;
2353 }
2354
2355 // we never truncate internal log files
11fdf7f2 2356 ceph_assert(h->file->fnode.ino > 1);
7c673cae
FG
2357
2358 h->buffer_appender.flush();
2359
2360 // truncate off unflushed data?
2361 if (h->pos < offset &&
2362 h->pos + h->buffer.length() > offset) {
2363 bufferlist t;
2364 dout(20) << __func__ << " tossing out last " << offset - h->pos
2365 << " unflushed bytes" << dendl;
2366 t.substr_of(h->buffer, 0, offset - h->pos);
2367 h->buffer.swap(t);
11fdf7f2 2368 ceph_abort_msg("actually this shouldn't happen");
7c673cae
FG
2369 }
2370 if (h->buffer.length()) {
2371 int r = _flush(h, true);
2372 if (r < 0)
2373 return r;
2374 }
2375 if (offset == h->file->fnode.size) {
2376 return 0; // no-op!
2377 }
2378 if (offset > h->file->fnode.size) {
11fdf7f2 2379 ceph_abort_msg("truncate up not supported");
7c673cae 2380 }
11fdf7f2 2381 ceph_assert(h->file->fnode.size >= offset);
7c673cae
FG
2382 h->file->fnode.size = offset;
2383 log_t.op_file_update(h->file->fnode);
2384 return 0;
2385}
2386
11fdf7f2 2387int BlueFS::_fsync(FileWriter *h, std::unique_lock<ceph::mutex>& l)
7c673cae
FG
2388{
2389 dout(10) << __func__ << " " << h << " " << h->file->fnode << dendl;
2390 int r = _flush(h, true);
2391 if (r < 0)
2392 return r;
2393 uint64_t old_dirty_seq = h->file->dirty_seq;
2394
2395 _flush_bdev_safely(h);
2396
2397 if (old_dirty_seq) {
2398 uint64_t s = log_seq;
2399 dout(20) << __func__ << " file metadata was dirty (" << old_dirty_seq
2400 << ") on " << h->file->fnode << ", flushing log" << dendl;
2401 _flush_and_sync_log(l, old_dirty_seq);
11fdf7f2 2402 ceph_assert(h->file->dirty_seq == 0 || // cleaned
7c673cae
FG
2403 h->file->dirty_seq > s); // or redirtied by someone else
2404 }
2405 return 0;
2406}
2407
2408void BlueFS::_flush_bdev_safely(FileWriter *h)
2409{
11fdf7f2
TL
2410 std::array<bool, MAX_BDEV> flush_devs = h->dirty_devs;
2411 h->dirty_devs.fill(false);
2412#ifdef HAVE_LIBAIO
7c673cae
FG
2413 if (!cct->_conf->bluefs_sync_write) {
2414 list<aio_t> completed_ios;
2415 _claim_completed_aios(h, &completed_ios);
2416 lock.unlock();
2417 wait_for_aio(h);
2418 completed_ios.clear();
11fdf7f2 2419 flush_bdev(flush_devs);
7c673cae 2420 lock.lock();
11fdf7f2
TL
2421 } else
2422#endif
2423 {
7c673cae 2424 lock.unlock();
11fdf7f2 2425 flush_bdev(flush_devs);
7c673cae
FG
2426 lock.lock();
2427 }
2428}
2429
11fdf7f2
TL
2430void BlueFS::flush_bdev(std::array<bool, MAX_BDEV>& dirty_bdevs)
2431{
2432 // NOTE: this is safe to call without a lock.
2433 dout(20) << __func__ << dendl;
2434 for (unsigned i = 0; i < MAX_BDEV; i++) {
2435 if (dirty_bdevs[i])
2436 bdev[i]->flush();
2437 }
2438}
2439
7c673cae
FG
2440void BlueFS::flush_bdev()
2441{
2442 // NOTE: this is safe to call without a lock.
2443 dout(20) << __func__ << dendl;
2444 for (auto p : bdev) {
2445 if (p)
2446 p->flush();
2447 }
2448}
2449
11fdf7f2
TL
2450int BlueFS::_expand_slow_device(uint64_t need, PExtentVector& extents)
2451{
2452 int r = -ENOSPC;
2453 if (slow_dev_expander) {
2454 auto min_alloc_size = cct->_conf->bluefs_alloc_size;
2455 int id = _get_slow_device_id();
2456 ceph_assert(id <= (int)alloc.size() && alloc[id]);
2457 auto min_need = round_up_to(need, min_alloc_size);
2458 need = std::max(need,
2459 slow_dev_expander->get_recommended_expansion_delta(
2460 alloc[id]->get_free(), block_all[id].size()));
2461
2462 need = round_up_to(need, min_alloc_size);
2463 dout(10) << __func__ << " expanding slow device by 0x"
2464 << std::hex << need << std::dec
2465 << dendl;
2466 r = slow_dev_expander->allocate_freespace(min_need, need, extents);
2467 }
2468 return r;
2469}
2470
2471int BlueFS::_allocate_without_fallback(uint8_t id, uint64_t len,
2472 PExtentVector* extents)
2473{
2474 dout(10) << __func__ << " len 0x" << std::hex << len << std::dec
2475 << " from " << (int)id << dendl;
2476 assert(id < alloc.size());
2477 uint64_t min_alloc_size = cct->_conf->bluefs_alloc_size;
2478
2479 uint64_t left = round_up_to(len, min_alloc_size);
2480
2481 if (!alloc[id]) {
2482 return -ENOENT;
2483 }
2484 extents->reserve(4); // 4 should be (more than) enough for most allocations
2485 int64_t alloc_len = alloc[id]->allocate(left, min_alloc_size, 0, extents);
2486 if (alloc_len < (int64_t)left) {
2487 if (alloc_len != 0) {
2488 alloc[id]->release(*extents);
2489 }
2490 if (bdev[id])
2491 derr << __func__ << " failed to allocate 0x" << std::hex << left
2492 << " on bdev " << (int)id
2493 << ", free 0x" << alloc[id]->get_free() << std::dec << dendl;
2494 else
2495 derr << __func__ << " failed to allocate 0x" << std::hex << left
2496 << " on bdev " << (int)id << ", dne" << std::dec << dendl;
2497 if (alloc[id])
2498 alloc[id]->dump();
2499 return -ENOSPC;
2500 }
2501
2502 return 0;
2503}
2504
7c673cae 2505int BlueFS::_allocate(uint8_t id, uint64_t len,
94b18763 2506 bluefs_fnode_t* node)
7c673cae
FG
2507{
2508 dout(10) << __func__ << " len 0x" << std::hex << len << std::dec
2509 << " from " << (int)id << dendl;
11fdf7f2 2510 ceph_assert(id < alloc.size());
7c673cae
FG
2511 uint64_t min_alloc_size = cct->_conf->bluefs_alloc_size;
2512
11fdf7f2 2513 uint64_t left = round_up_to(len, min_alloc_size);
b32b8144 2514 int64_t alloc_len = 0;
a8e16298 2515 PExtentVector extents;
b32b8144 2516
11fdf7f2 2517 uint64_t hint = 0;
7c673cae 2518 if (alloc[id]) {
94b18763
FG
2519 if (!node->extents.empty() && node->extents.back().bdev == id) {
2520 hint = node->extents.back().end();
11fdf7f2 2521 }
b32b8144
FG
2522 extents.reserve(4); // 4 should be (more than) enough for most allocations
2523 alloc_len = alloc[id]->allocate(left, min_alloc_size, hint, &extents);
2524 }
a8e16298 2525 if (alloc_len < (int64_t)left) {
11fdf7f2 2526 if (alloc_len > 0) {
a8e16298 2527 alloc[id]->release(extents);
b32b8144 2528 }
7c673cae
FG
2529 if (id != BDEV_SLOW) {
2530 if (bdev[id]) {
2531 dout(1) << __func__ << " failed to allocate 0x" << std::hex << left
2532 << " on bdev " << (int)id
2533 << ", free 0x" << alloc[id]->get_free()
2534 << "; fallback to bdev " << (int)id + 1
2535 << std::dec << dendl;
2536 }
94b18763 2537 return _allocate(id + 1, len, node);
7c673cae 2538 }
11fdf7f2
TL
2539 dout(1) << __func__ << " unable to allocate 0x" << std::hex << left
2540 << " on bdev " << (int)id << ", free 0x"
2541 << (alloc[id] ? alloc[id]->get_free() : (uint64_t)-1)
2542 << "; fallback to slow device expander "
2543 << std::dec << dendl;
2544 extents.clear();
2545 if (_expand_slow_device(left, extents) == 0) {
2546 id = _get_slow_device_id();
2547 for (auto& e : extents) {
2548 _add_block_extent(id, e.offset, e.length);
2549 }
2550 extents.clear();
2551 auto* last_alloc = alloc[id];
2552 ceph_assert(last_alloc);
2553 // try again
2554 alloc_len = last_alloc->allocate(left, min_alloc_size, hint, &extents);
2555 if (alloc_len < (int64_t)left) {
2556 if (alloc_len > 0) {
2557 last_alloc->release(extents);
2558 }
2559 derr << __func__ << " failed to allocate 0x" << std::hex << left
2560 << " on bdev " << (int)id
2561 << ", free 0x" << last_alloc->get_free() << std::dec << dendl;
2562 return -ENOSPC;
2563 }
2564 } else {
2565 derr << __func__ << " failed to expand slow device to fit +0x"
2566 << std::hex << left << std::dec
2567 << dendl;
2568 return -ENOSPC;
2569 }
2570 } else {
2571 uint64_t total_allocated =
2572 block_all[id].size() - alloc[id]->get_free();
2573 if (max_bytes[id] < total_allocated) {
2574 logger->set(max_bytes_pcounters[id], total_allocated);
2575 max_bytes[id] = total_allocated;
2576 }
7c673cae
FG
2577 }
2578
2579 for (auto& p : extents) {
94b18763 2580 node->append_extent(bluefs_extent_t(id, p.offset, p.length));
7c673cae
FG
2581 }
2582
2583 return 0;
2584}
2585
2586int BlueFS::_preallocate(FileRef f, uint64_t off, uint64_t len)
2587{
2588 dout(10) << __func__ << " file " << f->fnode << " 0x"
2589 << std::hex << off << "~" << len << std::dec << dendl;
2590 if (f->deleted) {
2591 dout(10) << __func__ << " deleted, no-op" << dendl;
2592 return 0;
2593 }
11fdf7f2 2594 ceph_assert(f->fnode.ino > 1);
7c673cae
FG
2595 uint64_t allocated = f->fnode.get_allocated();
2596 if (off + len > allocated) {
2597 uint64_t want = off + len - allocated;
94b18763 2598 int r = _allocate(f->fnode.prefer_bdev, want, &f->fnode);
7c673cae
FG
2599 if (r < 0)
2600 return r;
7c673cae
FG
2601 log_t.op_file_update(f->fnode);
2602 }
2603 return 0;
2604}
2605
2606void BlueFS::sync_metadata()
2607{
11fdf7f2 2608 std::unique_lock l(lock);
7c673cae
FG
2609 if (log_t.empty()) {
2610 dout(10) << __func__ << " - no pending log events" << dendl;
11fdf7f2
TL
2611 } else {
2612 dout(10) << __func__ << dendl;
2613 utime_t start = ceph_clock_now();
2614 flush_bdev(); // FIXME?
2615 _flush_and_sync_log(l);
2616 dout(10) << __func__ << " done in " << (ceph_clock_now() - start) << dendl;
7c673cae 2617 }
7c673cae
FG
2618
2619 if (_should_compact_log()) {
2620 if (cct->_conf->bluefs_compact_log_sync) {
2621 _compact_log_sync();
2622 } else {
2623 _compact_log_async(l);
2624 }
2625 }
7c673cae
FG
2626}
2627
2628int BlueFS::open_for_write(
2629 const string& dirname,
2630 const string& filename,
2631 FileWriter **h,
2632 bool overwrite)
2633{
11fdf7f2 2634 std::lock_guard l(lock);
7c673cae
FG
2635 dout(10) << __func__ << " " << dirname << "/" << filename << dendl;
2636 map<string,DirRef>::iterator p = dir_map.find(dirname);
2637 DirRef dir;
2638 if (p == dir_map.end()) {
2639 // implicitly create the dir
2640 dout(20) << __func__ << " dir " << dirname
2641 << " does not exist" << dendl;
2642 return -ENOENT;
2643 } else {
2644 dir = p->second;
2645 }
2646
2647 FileRef file;
2648 bool create = false;
2649 map<string,FileRef>::iterator q = dir->file_map.find(filename);
2650 if (q == dir->file_map.end()) {
2651 if (overwrite) {
2652 dout(20) << __func__ << " dir " << dirname << " (" << dir
2653 << ") file " << filename
2654 << " does not exist" << dendl;
2655 return -ENOENT;
2656 }
2657 file = new File;
2658 file->fnode.ino = ++ino_last;
2659 file_map[ino_last] = file;
2660 dir->file_map[filename] = file;
2661 ++file->refs;
2662 create = true;
2663 } else {
2664 // overwrite existing file?
2665 file = q->second;
2666 if (overwrite) {
2667 dout(20) << __func__ << " dir " << dirname << " (" << dir
2668 << ") file " << filename
2669 << " already exists, overwrite in place" << dendl;
2670 } else {
2671 dout(20) << __func__ << " dir " << dirname << " (" << dir
2672 << ") file " << filename
2673 << " already exists, truncate + overwrite" << dendl;
2674 file->fnode.size = 0;
2675 for (auto& p : file->fnode.extents) {
2676 pending_release[p.bdev].insert(p.offset, p.length);
2677 }
94b18763
FG
2678
2679 file->fnode.clear_extents();
7c673cae
FG
2680 }
2681 }
11fdf7f2 2682 ceph_assert(file->fnode.ino > 1);
7c673cae
FG
2683
2684 file->fnode.mtime = ceph_clock_now();
2685 file->fnode.prefer_bdev = BlueFS::BDEV_DB;
2686 if (dirname.length() > 5) {
2687 // the "db.slow" and "db.wal" directory names are hard-coded at
2688 // match up with bluestore. the slow device is always the second
2689 // one (when a dedicated block.db device is present and used at
2690 // bdev 0). the wal device is always last.
31f18b77 2691 if (boost::algorithm::ends_with(dirname, ".slow")) {
7c673cae
FG
2692 file->fnode.prefer_bdev = BlueFS::BDEV_SLOW;
2693 } else if (boost::algorithm::ends_with(dirname, ".wal")) {
2694 file->fnode.prefer_bdev = BlueFS::BDEV_WAL;
2695 }
2696 }
2697 dout(20) << __func__ << " mapping " << dirname << "/" << filename
2698 << " to bdev " << (int)file->fnode.prefer_bdev << dendl;
2699
2700 log_t.op_file_update(file->fnode);
2701 if (create)
2702 log_t.op_dir_link(dirname, filename, file->fnode.ino);
2703
2704 *h = _create_writer(file);
2705
2706 if (boost::algorithm::ends_with(filename, ".log")) {
2707 (*h)->writer_type = BlueFS::WRITER_WAL;
2708 if (logger && !overwrite) {
2709 logger->inc(l_bluefs_files_written_wal);
2710 }
2711 } else if (boost::algorithm::ends_with(filename, ".sst")) {
2712 (*h)->writer_type = BlueFS::WRITER_SST;
2713 if (logger) {
2714 logger->inc(l_bluefs_files_written_sst);
2715 }
2716 }
2717
2718 dout(10) << __func__ << " h " << *h << " on " << file->fnode << dendl;
2719 return 0;
2720}
2721
2722BlueFS::FileWriter *BlueFS::_create_writer(FileRef f)
2723{
2724 FileWriter *w = new FileWriter(f);
2725 for (unsigned i = 0; i < MAX_BDEV; ++i) {
2726 if (bdev[i]) {
2727 w->iocv[i] = new IOContext(cct, NULL);
7c673cae
FG
2728 }
2729 }
2730 return w;
2731}
2732
2733void BlueFS::_close_writer(FileWriter *h)
2734{
2735 dout(10) << __func__ << " " << h << " type " << h->writer_type << dendl;
2736 for (unsigned i=0; i<MAX_BDEV; ++i) {
2737 if (bdev[i]) {
11fdf7f2
TL
2738 if (h->iocv[i]) {
2739 h->iocv[i]->aio_wait();
2740 bdev[i]->queue_reap_ioc(h->iocv[i]);
2741 }
7c673cae
FG
2742 }
2743 }
2744 delete h;
2745}
2746
2747int BlueFS::open_for_read(
2748 const string& dirname,
2749 const string& filename,
2750 FileReader **h,
2751 bool random)
2752{
11fdf7f2 2753 std::lock_guard l(lock);
7c673cae
FG
2754 dout(10) << __func__ << " " << dirname << "/" << filename
2755 << (random ? " (random)":" (sequential)") << dendl;
2756 map<string,DirRef>::iterator p = dir_map.find(dirname);
2757 if (p == dir_map.end()) {
2758 dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
2759 return -ENOENT;
2760 }
2761 DirRef dir = p->second;
2762
2763 map<string,FileRef>::iterator q = dir->file_map.find(filename);
2764 if (q == dir->file_map.end()) {
2765 dout(20) << __func__ << " dir " << dirname << " (" << dir
2766 << ") file " << filename
2767 << " not found" << dendl;
2768 return -ENOENT;
2769 }
2770 File *file = q->second.get();
2771
2772 *h = new FileReader(file, random ? 4096 : cct->_conf->bluefs_max_prefetch,
2773 random, false);
2774 dout(10) << __func__ << " h " << *h << " on " << file->fnode << dendl;
2775 return 0;
2776}
2777
2778int BlueFS::rename(
2779 const string& old_dirname, const string& old_filename,
2780 const string& new_dirname, const string& new_filename)
2781{
11fdf7f2 2782 std::lock_guard l(lock);
7c673cae
FG
2783 dout(10) << __func__ << " " << old_dirname << "/" << old_filename
2784 << " -> " << new_dirname << "/" << new_filename << dendl;
2785 map<string,DirRef>::iterator p = dir_map.find(old_dirname);
2786 if (p == dir_map.end()) {
2787 dout(20) << __func__ << " dir " << old_dirname << " not found" << dendl;
2788 return -ENOENT;
2789 }
2790 DirRef old_dir = p->second;
2791 map<string,FileRef>::iterator q = old_dir->file_map.find(old_filename);
2792 if (q == old_dir->file_map.end()) {
2793 dout(20) << __func__ << " dir " << old_dirname << " (" << old_dir
2794 << ") file " << old_filename
2795 << " not found" << dendl;
2796 return -ENOENT;
2797 }
2798 FileRef file = q->second;
2799
2800 p = dir_map.find(new_dirname);
2801 if (p == dir_map.end()) {
2802 dout(20) << __func__ << " dir " << new_dirname << " not found" << dendl;
2803 return -ENOENT;
2804 }
2805 DirRef new_dir = p->second;
2806 q = new_dir->file_map.find(new_filename);
2807 if (q != new_dir->file_map.end()) {
2808 dout(20) << __func__ << " dir " << new_dirname << " (" << old_dir
2809 << ") file " << new_filename
2810 << " already exists, unlinking" << dendl;
11fdf7f2 2811 ceph_assert(q->second != file);
7c673cae
FG
2812 log_t.op_dir_unlink(new_dirname, new_filename);
2813 _drop_link(q->second);
2814 }
2815
2816 dout(10) << __func__ << " " << new_dirname << "/" << new_filename << " "
2817 << " " << file->fnode << dendl;
2818
2819 new_dir->file_map[new_filename] = file;
2820 old_dir->file_map.erase(old_filename);
2821
2822 log_t.op_dir_link(new_dirname, new_filename, file->fnode.ino);
2823 log_t.op_dir_unlink(old_dirname, old_filename);
2824 return 0;
2825}
2826
2827int BlueFS::mkdir(const string& dirname)
2828{
11fdf7f2 2829 std::lock_guard l(lock);
7c673cae
FG
2830 dout(10) << __func__ << " " << dirname << dendl;
2831 map<string,DirRef>::iterator p = dir_map.find(dirname);
2832 if (p != dir_map.end()) {
2833 dout(20) << __func__ << " dir " << dirname << " exists" << dendl;
2834 return -EEXIST;
2835 }
2836 dir_map[dirname] = new Dir;
2837 log_t.op_dir_create(dirname);
2838 return 0;
2839}
2840
2841int BlueFS::rmdir(const string& dirname)
2842{
11fdf7f2 2843 std::lock_guard l(lock);
7c673cae
FG
2844 dout(10) << __func__ << " " << dirname << dendl;
2845 map<string,DirRef>::iterator p = dir_map.find(dirname);
2846 if (p == dir_map.end()) {
2847 dout(20) << __func__ << " dir " << dirname << " does not exist" << dendl;
2848 return -ENOENT;
2849 }
2850 DirRef dir = p->second;
2851 if (!dir->file_map.empty()) {
2852 dout(20) << __func__ << " dir " << dirname << " not empty" << dendl;
2853 return -ENOTEMPTY;
2854 }
2855 dir_map.erase(dirname);
2856 log_t.op_dir_remove(dirname);
2857 return 0;
2858}
2859
2860bool BlueFS::dir_exists(const string& dirname)
2861{
11fdf7f2 2862 std::lock_guard l(lock);
7c673cae
FG
2863 map<string,DirRef>::iterator p = dir_map.find(dirname);
2864 bool exists = p != dir_map.end();
2865 dout(10) << __func__ << " " << dirname << " = " << (int)exists << dendl;
2866 return exists;
2867}
2868
2869int BlueFS::stat(const string& dirname, const string& filename,
2870 uint64_t *size, utime_t *mtime)
2871{
11fdf7f2 2872 std::lock_guard l(lock);
7c673cae
FG
2873 dout(10) << __func__ << " " << dirname << "/" << filename << dendl;
2874 map<string,DirRef>::iterator p = dir_map.find(dirname);
2875 if (p == dir_map.end()) {
2876 dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
2877 return -ENOENT;
2878 }
2879 DirRef dir = p->second;
2880 map<string,FileRef>::iterator q = dir->file_map.find(filename);
2881 if (q == dir->file_map.end()) {
2882 dout(20) << __func__ << " dir " << dirname << " (" << dir
2883 << ") file " << filename
2884 << " not found" << dendl;
2885 return -ENOENT;
2886 }
2887 File *file = q->second.get();
2888 dout(10) << __func__ << " " << dirname << "/" << filename
2889 << " " << file->fnode << dendl;
2890 if (size)
2891 *size = file->fnode.size;
2892 if (mtime)
2893 *mtime = file->fnode.mtime;
2894 return 0;
2895}
2896
2897int BlueFS::lock_file(const string& dirname, const string& filename,
2898 FileLock **plock)
2899{
11fdf7f2 2900 std::lock_guard l(lock);
7c673cae
FG
2901 dout(10) << __func__ << " " << dirname << "/" << filename << dendl;
2902 map<string,DirRef>::iterator p = dir_map.find(dirname);
2903 if (p == dir_map.end()) {
2904 dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
2905 return -ENOENT;
2906 }
2907 DirRef dir = p->second;
2908 map<string,FileRef>::iterator q = dir->file_map.find(filename);
2909 File *file;
2910 if (q == dir->file_map.end()) {
2911 dout(20) << __func__ << " dir " << dirname << " (" << dir
2912 << ") file " << filename
2913 << " not found, creating" << dendl;
2914 file = new File;
2915 file->fnode.ino = ++ino_last;
2916 file->fnode.mtime = ceph_clock_now();
2917 file_map[ino_last] = file;
2918 dir->file_map[filename] = file;
2919 ++file->refs;
2920 log_t.op_file_update(file->fnode);
2921 log_t.op_dir_link(dirname, filename, file->fnode.ino);
2922 } else {
2923 file = q->second.get();
2924 if (file->locked) {
2925 dout(10) << __func__ << " already locked" << dendl;
11fdf7f2 2926 return -ENOLCK;
7c673cae
FG
2927 }
2928 }
2929 file->locked = true;
2930 *plock = new FileLock(file);
2931 dout(10) << __func__ << " locked " << file->fnode
2932 << " with " << *plock << dendl;
2933 return 0;
2934}
2935
2936int BlueFS::unlock_file(FileLock *fl)
2937{
11fdf7f2 2938 std::lock_guard l(lock);
7c673cae 2939 dout(10) << __func__ << " " << fl << " on " << fl->file->fnode << dendl;
11fdf7f2 2940 ceph_assert(fl->file->locked);
7c673cae
FG
2941 fl->file->locked = false;
2942 delete fl;
2943 return 0;
2944}
2945
2946int BlueFS::readdir(const string& dirname, vector<string> *ls)
2947{
11fdf7f2 2948 std::lock_guard l(lock);
7c673cae
FG
2949 dout(10) << __func__ << " " << dirname << dendl;
2950 if (dirname.empty()) {
2951 // list dirs
2952 ls->reserve(dir_map.size() + 2);
2953 for (auto& q : dir_map) {
2954 ls->push_back(q.first);
2955 }
2956 } else {
2957 // list files in dir
2958 map<string,DirRef>::iterator p = dir_map.find(dirname);
2959 if (p == dir_map.end()) {
2960 dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
2961 return -ENOENT;
2962 }
2963 DirRef dir = p->second;
2964 ls->reserve(dir->file_map.size() + 2);
2965 for (auto& q : dir->file_map) {
2966 ls->push_back(q.first);
2967 }
2968 }
2969 ls->push_back(".");
2970 ls->push_back("..");
2971 return 0;
2972}
2973
2974int BlueFS::unlink(const string& dirname, const string& filename)
2975{
11fdf7f2 2976 std::lock_guard l(lock);
7c673cae
FG
2977 dout(10) << __func__ << " " << dirname << "/" << filename << dendl;
2978 map<string,DirRef>::iterator p = dir_map.find(dirname);
2979 if (p == dir_map.end()) {
2980 dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
2981 return -ENOENT;
2982 }
2983 DirRef dir = p->second;
2984 map<string,FileRef>::iterator q = dir->file_map.find(filename);
2985 if (q == dir->file_map.end()) {
2986 dout(20) << __func__ << " file " << dirname << "/" << filename
2987 << " not found" << dendl;
2988 return -ENOENT;
2989 }
2990 FileRef file = q->second;
2991 if (file->locked) {
2992 dout(20) << __func__ << " file " << dirname << "/" << filename
2993 << " is locked" << dendl;
2994 return -EBUSY;
2995 }
2996 dir->file_map.erase(filename);
2997 log_t.op_dir_unlink(dirname, filename);
2998 _drop_link(file);
2999 return 0;
3000}
d2e6a577
FG
3001
3002bool BlueFS::wal_is_rotational()
3003{
94b18763
FG
3004 if (bdev[BDEV_WAL]) {
3005 return bdev[BDEV_WAL]->is_rotational();
3006 } else if (bdev[BDEV_DB]) {
3007 return bdev[BDEV_DB]->is_rotational();
3008 }
3009 return bdev[BDEV_SLOW]->is_rotational();
d2e6a577 3010}