]> git.proxmox.com Git - ceph.git/blame - ceph/src/os/bluestore/BlueFS.cc
update sources to v12.2.3
[ceph.git] / ceph / src / os / bluestore / BlueFS.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3
4#include "boost/algorithm/string.hpp"
5#include "BlueFS.h"
6
7#include "common/debug.h"
8#include "common/errno.h"
9#include "common/perf_counters.h"
10#include "BlockDevice.h"
11#include "Allocator.h"
12#include "include/assert.h"
13
14#define dout_context cct
15#define dout_subsys ceph_subsys_bluefs
16#undef dout_prefix
17#define dout_prefix *_dout << "bluefs "
18
19MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::File, bluefs_file, bluefs);
20MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::Dir, bluefs_dir, bluefs);
21MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileWriter, bluefs_file_writer, bluefs);
22MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileReaderBuffer,
23 bluefs_file_reader_buffer, bluefs);
24MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileReader, bluefs_file_reader, bluefs);
25MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileLock, bluefs_file_lock, bluefs);
26
27
28BlueFS::BlueFS(CephContext* cct)
29 : cct(cct),
30 bdev(MAX_BDEV),
31 ioc(MAX_BDEV),
32 block_all(MAX_BDEV),
33 block_total(MAX_BDEV, 0)
34{
35}
36
37BlueFS::~BlueFS()
38{
39 for (auto p : ioc) {
40 if (p)
41 p->aio_wait();
42 }
43 for (auto p : bdev) {
44 if (p) {
45 p->close();
46 delete p;
47 }
48 }
49 for (auto p : ioc) {
50 delete p;
51 }
52}
53
54void BlueFS::_init_logger()
55{
56 PerfCountersBuilder b(cct, "bluefs",
57 l_bluefs_first, l_bluefs_last);
58 b.add_u64_counter(l_bluefs_gift_bytes, "gift_bytes",
59 "Bytes gifted from BlueStore");
60 b.add_u64_counter(l_bluefs_reclaim_bytes, "reclaim_bytes",
61 "Bytes reclaimed by BlueStore");
62 b.add_u64(l_bluefs_db_total_bytes, "db_total_bytes",
63 "Total bytes (main db device)",
64 "b", PerfCountersBuilder::PRIO_USEFUL);
65 b.add_u64(l_bluefs_db_used_bytes, "db_used_bytes",
66 "Used bytes (main db device)",
67 "u", PerfCountersBuilder::PRIO_USEFUL);
68 b.add_u64(l_bluefs_wal_total_bytes, "wal_total_bytes",
69 "Total bytes (wal device)",
70 "walb", PerfCountersBuilder::PRIO_USEFUL);
71 b.add_u64(l_bluefs_wal_used_bytes, "wal_used_bytes",
72 "Used bytes (wal device)",
73 "walu", PerfCountersBuilder::PRIO_USEFUL);
74 b.add_u64(l_bluefs_slow_total_bytes, "slow_total_bytes",
75 "Total bytes (slow device)",
76 "slob", PerfCountersBuilder::PRIO_USEFUL);
77 b.add_u64(l_bluefs_slow_used_bytes, "slow_used_bytes",
78 "Used bytes (slow device)",
79 "slou", PerfCountersBuilder::PRIO_USEFUL);
80 b.add_u64(l_bluefs_num_files, "num_files", "File count",
81 "f", PerfCountersBuilder::PRIO_USEFUL);
82 b.add_u64(l_bluefs_log_bytes, "log_bytes", "Size of the metadata log",
83 "jlen", PerfCountersBuilder::PRIO_INTERESTING);
84 b.add_u64_counter(l_bluefs_log_compactions, "log_compactions",
85 "Compactions of the metadata log");
86 b.add_u64_counter(l_bluefs_logged_bytes, "logged_bytes",
87 "Bytes written to the metadata log", "j",
88 PerfCountersBuilder::PRIO_CRITICAL);
89 b.add_u64_counter(l_bluefs_files_written_wal, "files_written_wal",
90 "Files written to WAL");
91 b.add_u64_counter(l_bluefs_files_written_sst, "files_written_sst",
92 "Files written to SSTs");
93 b.add_u64_counter(l_bluefs_bytes_written_wal, "bytes_written_wal",
94 "Bytes written to WAL", "wal",
95 PerfCountersBuilder::PRIO_CRITICAL);
96 b.add_u64_counter(l_bluefs_bytes_written_sst, "bytes_written_sst",
97 "Bytes written to SSTs", "sst",
98 PerfCountersBuilder::PRIO_CRITICAL);
99 logger = b.create_perf_counters();
100 cct->get_perfcounters_collection()->add(logger);
101}
102
103void BlueFS::_shutdown_logger()
104{
105 cct->get_perfcounters_collection()->remove(logger);
106 delete logger;
107}
108
109void BlueFS::_update_logger_stats()
110{
111 // we must be holding the lock
112 logger->set(l_bluefs_num_files, file_map.size());
113 logger->set(l_bluefs_log_bytes, log_writer->file->fnode.size);
114
115 if (alloc[BDEV_WAL]) {
116 logger->set(l_bluefs_wal_total_bytes, block_total[BDEV_WAL]);
117 logger->set(l_bluefs_wal_used_bytes,
118 block_total[BDEV_WAL] - alloc[BDEV_WAL]->get_free());
119 }
120 if (alloc[BDEV_DB]) {
121 logger->set(l_bluefs_db_total_bytes, block_total[BDEV_DB]);
122 logger->set(l_bluefs_db_used_bytes,
123 block_total[BDEV_DB] - alloc[BDEV_DB]->get_free());
124 }
125 if (alloc[BDEV_SLOW]) {
126 logger->set(l_bluefs_slow_total_bytes, block_total[BDEV_SLOW]);
127 logger->set(l_bluefs_slow_used_bytes,
128 block_total[BDEV_SLOW] - alloc[BDEV_SLOW]->get_free());
129 }
130}
131
c07f9fc5 132int BlueFS::add_block_device(unsigned id, const string& path)
7c673cae
FG
133{
134 dout(10) << __func__ << " bdev " << id << " path " << path << dendl;
135 assert(id < bdev.size());
136 assert(bdev[id] == NULL);
137 BlockDevice *b = BlockDevice::create(cct, path, NULL, NULL);
138 int r = b->open(path);
139 if (r < 0) {
140 delete b;
141 return r;
142 }
143 dout(1) << __func__ << " bdev " << id << " path " << path
144 << " size " << pretty_si_t(b->get_size()) << "B" << dendl;
145 bdev[id] = b;
146 ioc[id] = new IOContext(cct, NULL);
147 return 0;
148}
149
150bool BlueFS::bdev_support_label(unsigned id)
151{
152 assert(id < bdev.size());
153 assert(bdev[id]);
154 return bdev[id]->supported_bdev_label();
155}
156
157uint64_t BlueFS::get_block_device_size(unsigned id)
158{
159 if (id < bdev.size() && bdev[id])
160 return bdev[id]->get_size();
161 return 0;
162}
163
164void BlueFS::add_block_extent(unsigned id, uint64_t offset, uint64_t length)
165{
166 std::unique_lock<std::mutex> l(lock);
167 dout(1) << __func__ << " bdev " << id
168 << " 0x" << std::hex << offset << "~" << length << std::dec
169 << dendl;
170 assert(id < bdev.size());
171 assert(bdev[id]);
172 assert(bdev[id]->get_size() >= offset + length);
173 block_all[id].insert(offset, length);
174 block_total[id] += length;
175
176 if (id < alloc.size() && alloc[id]) {
177 log_t.op_alloc_add(id, offset, length);
178 int r = _flush_and_sync_log(l);
179 assert(r == 0);
180 alloc[id]->init_add_free(offset, length);
181 }
182
183 if (logger)
184 logger->inc(l_bluefs_gift_bytes, length);
185 dout(10) << __func__ << " done" << dendl;
186}
187
188int BlueFS::reclaim_blocks(unsigned id, uint64_t want,
189 AllocExtentVector *extents)
190{
191 std::unique_lock<std::mutex> l(lock);
192 dout(1) << __func__ << " bdev " << id
193 << " want 0x" << std::hex << want << std::dec << dendl;
194 assert(id < alloc.size());
195 assert(alloc[id]);
196 int r = alloc[id]->reserve(want);
197 assert(r == 0); // caller shouldn't ask for more than they can get
198 int64_t got = alloc[id]->allocate(want, cct->_conf->bluefs_alloc_size, 0,
199 extents);
200 if (got < (int64_t)want) {
201 alloc[id]->unreserve(want - MAX(0, got));
202 }
203 if (got <= 0) {
204 derr << __func__ << " failed to allocate space to return to bluestore"
205 << dendl;
206 alloc[id]->dump();
207 return got;
208 }
209
210 for (auto& p : *extents) {
211 block_all[id].erase(p.offset, p.length);
212 block_total[id] -= p.length;
213 log_t.op_alloc_rm(id, p.offset, p.length);
214 }
215
216 flush_bdev();
217 r = _flush_and_sync_log(l);
218 assert(r == 0);
219
220 if (logger)
221 logger->inc(l_bluefs_reclaim_bytes, got);
222 dout(1) << __func__ << " bdev " << id << " want 0x" << std::hex << want
223 << " got " << *extents << dendl;
224 return 0;
225}
226
227uint64_t BlueFS::get_fs_usage()
228{
229 std::lock_guard<std::mutex> l(lock);
230 uint64_t total_bytes = 0;
231 for (auto& p : file_map) {
232 total_bytes += p.second->fnode.get_allocated();
233 }
234 return total_bytes;
235}
236
237uint64_t BlueFS::get_total(unsigned id)
238{
239 std::lock_guard<std::mutex> l(lock);
240 assert(id < block_all.size());
241 return block_total[id];
242}
243
244uint64_t BlueFS::get_free(unsigned id)
245{
246 std::lock_guard<std::mutex> l(lock);
247 assert(id < alloc.size());
248 return alloc[id]->get_free();
249}
250
251void BlueFS::dump_perf_counters(Formatter *f)
252{
253 f->open_object_section("bluefs_perf_counters");
254 logger->dump_formatted(f,0);
255 f->close_section();
256}
257
3efd9988
FG
258void BlueFS::dump_block_extents(ostream& out)
259{
260 for (unsigned i = 0; i < MAX_BDEV; ++i) {
261 if (!bdev[i]) {
262 continue;
263 }
264 out << i << " : size 0x" << std::hex << bdev[i]->get_size()
265 << " : own 0x" << block_all[i] << std::dec << "\n";
266 }
267}
7c673cae
FG
268
269void BlueFS::get_usage(vector<pair<uint64_t,uint64_t>> *usage)
270{
271 std::lock_guard<std::mutex> l(lock);
272 usage->resize(bdev.size());
273 for (unsigned id = 0; id < bdev.size(); ++id) {
274 if (!bdev[id]) {
275 (*usage)[id] = make_pair(0, 0);
276 continue;
277 }
278 (*usage)[id].first = alloc[id]->get_free();
279 (*usage)[id].second = block_total[id];
280 uint64_t used =
281 (block_total[id] - (*usage)[id].first) * 100 / block_total[id];
282 dout(10) << __func__ << " bdev " << id
283 << " free " << (*usage)[id].first
284 << " (" << pretty_si_t((*usage)[id].first) << "B)"
285 << " / " << (*usage)[id].second
286 << " (" << pretty_si_t((*usage)[id].second) << "B)"
287 << ", used " << used << "%"
288 << dendl;
289 }
290}
291
292int BlueFS::get_block_extents(unsigned id, interval_set<uint64_t> *extents)
293{
294 std::lock_guard<std::mutex> l(lock);
295 dout(10) << __func__ << " bdev " << id << dendl;
296 if (id >= block_all.size())
297 return -EINVAL;
298 *extents = block_all[id];
299 return 0;
300}
301
302int BlueFS::mkfs(uuid_d osd_uuid)
303{
304 std::unique_lock<std::mutex> l(lock);
305 dout(1) << __func__
306 << " osd_uuid " << osd_uuid
307 << dendl;
308
309 _init_alloc();
310 _init_logger();
311
312 super.version = 1;
313 super.block_size = bdev[BDEV_DB]->get_block_size();
314 super.osd_uuid = osd_uuid;
315 super.uuid.generate_random();
316 dout(1) << __func__ << " uuid " << super.uuid << dendl;
317
318 // init log
319 FileRef log_file = new File;
320 log_file->fnode.ino = 1;
321 log_file->fnode.prefer_bdev = BDEV_WAL;
322 int r = _allocate(
323 log_file->fnode.prefer_bdev,
324 cct->_conf->bluefs_max_log_runway,
325 &log_file->fnode.extents);
326 log_file->fnode.recalc_allocated();
327 assert(r == 0);
328 log_writer = _create_writer(log_file);
329
330 // initial txn
331 log_t.op_init();
332 for (unsigned bdev = 0; bdev < MAX_BDEV; ++bdev) {
333 interval_set<uint64_t>& p = block_all[bdev];
334 if (p.empty())
335 continue;
336 for (interval_set<uint64_t>::iterator q = p.begin(); q != p.end(); ++q) {
337 dout(20) << __func__ << " op_alloc_add " << bdev << " 0x"
338 << std::hex << q.get_start() << "~" << q.get_len() << std::dec
339 << dendl;
340 log_t.op_alloc_add(bdev, q.get_start(), q.get_len());
341 }
342 }
343 _flush_and_sync_log(l);
344
345 // write supers
346 super.log_fnode = log_file->fnode;
347 _write_super();
348 flush_bdev();
349
350 // clean up
351 super = bluefs_super_t();
352 _close_writer(log_writer);
353 log_writer = NULL;
354 block_all.clear();
355 block_total.clear();
356 _stop_alloc();
357 _shutdown_logger();
358
359 dout(10) << __func__ << " success" << dendl;
360 return 0;
361}
362
363void BlueFS::_init_alloc()
364{
365 dout(20) << __func__ << dendl;
366 alloc.resize(MAX_BDEV);
367 pending_release.resize(MAX_BDEV);
368 for (unsigned id = 0; id < bdev.size(); ++id) {
369 if (!bdev[id]) {
370 continue;
371 }
372 assert(bdev[id]->get_size());
373 alloc[id] = Allocator::create(cct, cct->_conf->bluefs_allocator,
374 bdev[id]->get_size(),
375 cct->_conf->bluefs_alloc_size);
376 interval_set<uint64_t>& p = block_all[id];
377 for (interval_set<uint64_t>::iterator q = p.begin(); q != p.end(); ++q) {
378 alloc[id]->init_add_free(q.get_start(), q.get_len());
379 }
380 }
381}
382
383void BlueFS::_stop_alloc()
384{
385 dout(20) << __func__ << dendl;
386 for (auto p : alloc) {
387 if (p != nullptr) {
388 p->shutdown();
389 delete p;
390 }
391 }
392 alloc.clear();
393}
394
395int BlueFS::mount()
396{
397 dout(1) << __func__ << dendl;
398
399 int r = _open_super();
400 if (r < 0) {
401 derr << __func__ << " failed to open super: " << cpp_strerror(r) << dendl;
402 goto out;
403 }
404
405 block_all.clear();
406 block_all.resize(MAX_BDEV);
407 block_total.clear();
408 block_total.resize(MAX_BDEV, 0);
409 _init_alloc();
410
411 r = _replay(false);
412 if (r < 0) {
413 derr << __func__ << " failed to replay log: " << cpp_strerror(r) << dendl;
414 _stop_alloc();
415 goto out;
416 }
417
418 // init freelist
419 for (auto& p : file_map) {
420 dout(30) << __func__ << " noting alloc for " << p.second->fnode << dendl;
421 for (auto& q : p.second->fnode.extents) {
422 alloc[q.bdev]->init_rm_free(q.offset, q.length);
423 }
424 }
425
426 // set up the log for future writes
427 log_writer = _create_writer(_get_file(1));
428 assert(log_writer->file->fnode.ino == 1);
429 log_writer->pos = log_writer->file->fnode.size;
430 dout(10) << __func__ << " log write pos set to 0x"
431 << std::hex << log_writer->pos << std::dec
432 << dendl;
433
434 _init_logger();
435 return 0;
436
437 out:
438 super = bluefs_super_t();
439 return r;
440}
441
442void BlueFS::umount()
443{
444 dout(1) << __func__ << dendl;
445
446 sync_metadata();
447
448 _close_writer(log_writer);
449 log_writer = NULL;
450
451 _stop_alloc();
452 file_map.clear();
453 dir_map.clear();
454 super = bluefs_super_t();
455 log_t.clear();
456 _shutdown_logger();
457}
458
459void BlueFS::collect_metadata(map<string,string> *pm)
460{
461 if (bdev[BDEV_DB])
462 bdev[BDEV_DB]->collect_metadata("bluefs_db_", pm);
463 if (bdev[BDEV_WAL])
464 bdev[BDEV_WAL]->collect_metadata("bluefs_wal_", pm);
465 if (bdev[BDEV_SLOW])
466 bdev[BDEV_SLOW]->collect_metadata("bluefs_slow_", pm);
467}
468
469int BlueFS::fsck()
470{
471 std::lock_guard<std::mutex> l(lock);
472 dout(1) << __func__ << dendl;
473 // hrm, i think we check everything on mount...
474 return 0;
475}
476
477int BlueFS::_write_super()
478{
479 // build superblock
480 bufferlist bl;
481 ::encode(super, bl);
482 uint32_t crc = bl.crc32c(-1);
483 ::encode(crc, bl);
484 dout(10) << __func__ << " super block length(encoded): " << bl.length() << dendl;
485 dout(10) << __func__ << " superblock " << super.version << dendl;
486 dout(10) << __func__ << " log_fnode " << super.log_fnode << dendl;
487 assert(bl.length() <= get_super_length());
488 bl.append_zero(get_super_length() - bl.length());
489
490 bdev[BDEV_DB]->write(get_super_offset(), bl, false);
491 dout(20) << __func__ << " v " << super.version
492 << " crc 0x" << std::hex << crc
493 << " offset 0x" << get_super_offset() << std::dec
494 << dendl;
495 return 0;
496}
497
498int BlueFS::_open_super()
499{
500 dout(10) << __func__ << dendl;
501
502 bufferlist bl;
503 uint32_t expected_crc, crc;
504 int r;
505
506 // always the second block
507 r = bdev[BDEV_DB]->read(get_super_offset(), get_super_length(),
508 &bl, ioc[BDEV_DB], false);
509 if (r < 0)
510 return r;
511
512 bufferlist::iterator p = bl.begin();
513 ::decode(super, p);
514 {
515 bufferlist t;
516 t.substr_of(bl, 0, p.get_off());
517 crc = t.crc32c(-1);
518 }
519 ::decode(expected_crc, p);
520 if (crc != expected_crc) {
521 derr << __func__ << " bad crc on superblock, expected 0x"
522 << std::hex << expected_crc << " != actual 0x" << crc << std::dec
523 << dendl;
524 return -EIO;
525 }
526 dout(10) << __func__ << " superblock " << super.version << dendl;
527 dout(10) << __func__ << " log_fnode " << super.log_fnode << dendl;
528 return 0;
529}
530
531int BlueFS::_replay(bool noop)
532{
533 dout(10) << __func__ << (noop ? " NO-OP" : "") << dendl;
534 ino_last = 1; // by the log
535 log_seq = 0;
536
537 FileRef log_file;
538 if (noop) {
539 log_file = new File;
540 } else {
541 log_file = _get_file(1);
542 }
543 log_file->fnode = super.log_fnode;
544 dout(10) << __func__ << " log_fnode " << super.log_fnode << dendl;
545
546 FileReader *log_reader = new FileReader(
547 log_file, cct->_conf->bluefs_max_prefetch,
548 false, // !random
549 true); // ignore eof
550 while (true) {
551 assert((log_reader->buf.pos & ~super.block_mask()) == 0);
552 uint64_t pos = log_reader->buf.pos;
553 uint64_t read_pos = pos;
554 bufferlist bl;
555 {
556 int r = _read(log_reader, &log_reader->buf, read_pos, super.block_size,
557 &bl, NULL);
558 assert(r == (int)super.block_size);
559 read_pos += r;
560 }
561 uint64_t more = 0;
562 uint64_t seq;
563 uuid_d uuid;
564 {
565 bufferlist::iterator p = bl.begin();
566 __u8 a, b;
567 uint32_t len;
568 ::decode(a, p);
569 ::decode(b, p);
570 ::decode(len, p);
571 ::decode(uuid, p);
572 ::decode(seq, p);
573 if (len + 6 > bl.length()) {
574 more = ROUND_UP_TO(len + 6 - bl.length(), super.block_size);
575 }
576 }
577 if (uuid != super.uuid) {
578 dout(10) << __func__ << " 0x" << std::hex << pos << std::dec
579 << ": stop: uuid " << uuid << " != super.uuid " << super.uuid
580 << dendl;
581 break;
582 }
583 if (seq != log_seq + 1) {
584 dout(10) << __func__ << " 0x" << std::hex << pos << std::dec
585 << ": stop: seq " << seq << " != expected " << log_seq + 1
586 << dendl;
587 break;
588 }
589 if (more) {
590 dout(20) << __func__ << " need 0x" << std::hex << more << std::dec
591 << " more bytes" << dendl;
592 bufferlist t;
593 int r = _read(log_reader, &log_reader->buf, read_pos, more, &t, NULL);
594 if (r < (int)more) {
595 dout(10) << __func__ << " 0x" << std::hex << pos
596 << ": stop: len is 0x" << bl.length() + more << std::dec
597 << ", which is past eof" << dendl;
598 break;
599 }
600 assert(r == (int)more);
601 bl.claim_append(t);
602 read_pos += r;
603 }
604 bluefs_transaction_t t;
605 try {
606 bufferlist::iterator p = bl.begin();
607 ::decode(t, p);
608 }
609 catch (buffer::error& e) {
610 dout(10) << __func__ << " 0x" << std::hex << pos << std::dec
611 << ": stop: failed to decode: " << e.what()
612 << dendl;
613 delete log_reader;
614 return -EIO;
615 }
616 assert(seq == t.seq);
617 dout(10) << __func__ << " 0x" << std::hex << pos << std::dec
618 << ": " << t << dendl;
619
620 bufferlist::iterator p = t.op_bl.begin();
621 while (!p.end()) {
622 __u8 op;
623 ::decode(op, p);
624 switch (op) {
625
626 case bluefs_transaction_t::OP_INIT:
627 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
628 << ": op_init" << dendl;
629 assert(t.seq == 1);
630 break;
631
632 case bluefs_transaction_t::OP_JUMP:
633 {
634 uint64_t next_seq;
635 uint64_t offset;
636 ::decode(next_seq, p);
637 ::decode(offset, p);
638 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
639 << ": op_jump seq " << next_seq
640 << " offset 0x" << std::hex << offset << std::dec << dendl;
641 assert(next_seq >= log_seq);
642 log_seq = next_seq - 1; // we will increment it below
643 uint64_t skip = offset - read_pos;
644 if (skip) {
645 bufferlist junk;
646 int r = _read(log_reader, &log_reader->buf, read_pos, skip, &junk,
647 NULL);
648 if (r != (int)skip) {
649 dout(10) << __func__ << " 0x" << std::hex << read_pos
650 << ": stop: failed to skip to " << offset
651 << std::dec << dendl;
652 assert(0 == "problem with op_jump");
653 }
654 }
655 }
656 break;
657
658 case bluefs_transaction_t::OP_JUMP_SEQ:
659 {
660 uint64_t next_seq;
661 ::decode(next_seq, p);
662 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
663 << ": op_jump_seq " << next_seq << dendl;
664 assert(next_seq >= log_seq);
665 log_seq = next_seq - 1; // we will increment it below
666 }
667 break;
668
669 case bluefs_transaction_t::OP_ALLOC_ADD:
670 {
671 __u8 id;
672 uint64_t offset, length;
673 ::decode(id, p);
674 ::decode(offset, p);
675 ::decode(length, p);
676 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
677 << ": op_alloc_add " << " " << (int)id
678 << ":0x" << std::hex << offset << "~" << length << std::dec
679 << dendl;
680 if (!noop) {
681 block_all[id].insert(offset, length);
682 block_total[id] += length;
683 alloc[id]->init_add_free(offset, length);
684 }
685 }
686 break;
687
688 case bluefs_transaction_t::OP_ALLOC_RM:
689 {
690 __u8 id;
691 uint64_t offset, length;
692 ::decode(id, p);
693 ::decode(offset, p);
694 ::decode(length, p);
695 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
696 << ": op_alloc_rm " << " " << (int)id
697 << ":0x" << std::hex << offset << "~" << length << std::dec
698 << dendl;
699 if (!noop) {
700 block_all[id].erase(offset, length);
701 block_total[id] -= length;
702 alloc[id]->init_rm_free(offset, length);
703 }
704 }
705 break;
706
707 case bluefs_transaction_t::OP_DIR_LINK:
708 {
709 string dirname, filename;
710 uint64_t ino;
711 ::decode(dirname, p);
712 ::decode(filename, p);
713 ::decode(ino, p);
714 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
715 << ": op_dir_link " << " " << dirname << "/" << filename
716 << " to " << ino
717 << dendl;
718 if (!noop) {
719 FileRef file = _get_file(ino);
720 assert(file->fnode.ino);
721 map<string,DirRef>::iterator q = dir_map.find(dirname);
722 assert(q != dir_map.end());
723 map<string,FileRef>::iterator r = q->second->file_map.find(filename);
724 assert(r == q->second->file_map.end());
725 q->second->file_map[filename] = file;
726 ++file->refs;
727 }
728 }
729 break;
730
731 case bluefs_transaction_t::OP_DIR_UNLINK:
732 {
733 string dirname, filename;
734 ::decode(dirname, p);
735 ::decode(filename, p);
736 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
737 << ": op_dir_unlink " << " " << dirname << "/" << filename
738 << dendl;
739 if (!noop) {
740 map<string,DirRef>::iterator q = dir_map.find(dirname);
741 assert(q != dir_map.end());
742 map<string,FileRef>::iterator r = q->second->file_map.find(filename);
743 assert(r != q->second->file_map.end());
744 assert(r->second->refs > 0);
745 --r->second->refs;
746 q->second->file_map.erase(r);
747 }
748 }
749 break;
750
751 case bluefs_transaction_t::OP_DIR_CREATE:
752 {
753 string dirname;
754 ::decode(dirname, p);
755 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
756 << ": op_dir_create " << dirname << dendl;
757 if (!noop) {
758 map<string,DirRef>::iterator q = dir_map.find(dirname);
759 assert(q == dir_map.end());
760 dir_map[dirname] = new Dir;
761 }
762 }
763 break;
764
765 case bluefs_transaction_t::OP_DIR_REMOVE:
766 {
767 string dirname;
768 ::decode(dirname, p);
769 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
770 << ": op_dir_remove " << dirname << dendl;
771 if (!noop) {
772 map<string,DirRef>::iterator q = dir_map.find(dirname);
773 assert(q != dir_map.end());
774 assert(q->second->file_map.empty());
775 dir_map.erase(q);
776 }
777 }
778 break;
779
780 case bluefs_transaction_t::OP_FILE_UPDATE:
781 {
782 bluefs_fnode_t fnode;
783 ::decode(fnode, p);
784 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
785 << ": op_file_update " << " " << fnode << dendl;
786 if (!noop) {
787 FileRef f = _get_file(fnode.ino);
788 f->fnode = fnode;
789 if (fnode.ino > ino_last) {
790 ino_last = fnode.ino;
791 }
792 }
793 }
794 break;
795
796 case bluefs_transaction_t::OP_FILE_REMOVE:
797 {
798 uint64_t ino;
799 ::decode(ino, p);
800 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
801 << ": op_file_remove " << ino << dendl;
802 if (!noop) {
803 auto p = file_map.find(ino);
804 assert(p != file_map.end());
805 file_map.erase(p);
806 }
807 }
808 break;
809
810 default:
811 derr << __func__ << " 0x" << std::hex << pos << std::dec
812 << ": stop: unrecognized op " << (int)op << dendl;
813 delete log_reader;
814 return -EIO;
815 }
816 }
817 assert(p.end());
818
819 // we successfully replayed the transaction; bump the seq and log size
820 ++log_seq;
821 log_file->fnode.size = log_reader->buf.pos;
822 }
823
824 dout(10) << __func__ << " log file size was 0x"
825 << std::hex << log_file->fnode.size << std::dec << dendl;
826 delete log_reader;
827
828 if (!noop) {
829 // verify file link counts are all >0
830 for (auto& p : file_map) {
831 if (p.second->refs == 0 &&
832 p.second->fnode.ino > 1) {
833 derr << __func__ << " file with link count 0: " << p.second->fnode
834 << dendl;
835 return -EIO;
836 }
837 }
838 }
839
840 dout(10) << __func__ << " done" << dendl;
841 return 0;
842}
843
844BlueFS::FileRef BlueFS::_get_file(uint64_t ino)
845{
846 auto p = file_map.find(ino);
847 if (p == file_map.end()) {
848 FileRef f = new File;
849 file_map[ino] = f;
850 dout(30) << __func__ << " ino " << ino << " = " << f
851 << " (new)" << dendl;
852 return f;
853 } else {
854 dout(30) << __func__ << " ino " << ino << " = " << p->second << dendl;
855 return p->second;
856 }
857}
858
859void BlueFS::_drop_link(FileRef file)
860{
861 dout(20) << __func__ << " had refs " << file->refs
862 << " on " << file->fnode << dendl;
863 assert(file->refs > 0);
864 --file->refs;
865 if (file->refs == 0) {
866 dout(20) << __func__ << " destroying " << file->fnode << dendl;
867 assert(file->num_reading.load() == 0);
868 log_t.op_file_remove(file->fnode.ino);
869 for (auto& r : file->fnode.extents) {
870 pending_release[r.bdev].insert(r.offset, r.length);
871 }
872 file_map.erase(file->fnode.ino);
873 file->deleted = true;
874 file->fnode.recalc_allocated();
875 if (file->dirty_seq) {
876 assert(file->dirty_seq > log_seq_stable);
877 assert(dirty_files.count(file->dirty_seq));
878 auto it = dirty_files[file->dirty_seq].iterator_to(*file);
879 dirty_files[file->dirty_seq].erase(it);
880 file->dirty_seq = 0;
881 }
882 }
883}
884
885int BlueFS::_read_random(
886 FileReader *h, ///< [in] read from here
887 uint64_t off, ///< [in] offset
888 size_t len, ///< [in] this many bytes
889 char *out) ///< [out] optional: or copy it here
890{
891 dout(10) << __func__ << " h " << h
892 << " 0x" << std::hex << off << "~" << len << std::dec
893 << " from " << h->file->fnode << dendl;
894
895 ++h->file->num_reading;
896
897 if (!h->ignore_eof &&
898 off + len > h->file->fnode.size) {
899 if (off > h->file->fnode.size)
900 len = 0;
901 else
902 len = h->file->fnode.size - off;
903 dout(20) << __func__ << " reaching (or past) eof, len clipped to 0x"
904 << std::hex << len << std::dec << dendl;
905 }
906
907 int ret = 0;
908 while (len > 0) {
909 uint64_t x_off = 0;
910 auto p = h->file->fnode.seek(off, &x_off);
911 uint64_t l = MIN(p->length - x_off, len);
912 dout(20) << __func__ << " read buffered 0x"
913 << std::hex << x_off << "~" << l << std::dec
914 << " of " << *p << dendl;
915 int r = bdev[p->bdev]->read_random(p->offset + x_off, l, out,
916 cct->_conf->bluefs_buffered_io);
917 assert(r == 0);
918 off += l;
919 len -= l;
920 ret += l;
921 out += l;
922 }
923
924 dout(20) << __func__ << " got " << ret << dendl;
925 --h->file->num_reading;
926 return ret;
927}
928
929int BlueFS::_read(
930 FileReader *h, ///< [in] read from here
931 FileReaderBuffer *buf, ///< [in] reader state
932 uint64_t off, ///< [in] offset
933 size_t len, ///< [in] this many bytes
934 bufferlist *outbl, ///< [out] optional: reference the result here
935 char *out) ///< [out] optional: or copy it here
936{
937 dout(10) << __func__ << " h " << h
938 << " 0x" << std::hex << off << "~" << len << std::dec
939 << " from " << h->file->fnode << dendl;
940
941 ++h->file->num_reading;
942
943 if (!h->ignore_eof &&
944 off + len > h->file->fnode.size) {
945 if (off > h->file->fnode.size)
946 len = 0;
947 else
948 len = h->file->fnode.size - off;
949 dout(20) << __func__ << " reaching (or past) eof, len clipped to 0x"
950 << std::hex << len << std::dec << dendl;
951 }
952 if (outbl)
953 outbl->clear();
954
955 int ret = 0;
956 while (len > 0) {
957 size_t left;
958 if (off < buf->bl_off || off >= buf->get_buf_end()) {
959 buf->bl.clear();
960 buf->bl_off = off & super.block_mask();
961 uint64_t x_off = 0;
962 auto p = h->file->fnode.seek(buf->bl_off, &x_off);
963 uint64_t want = ROUND_UP_TO(len + (off & ~super.block_mask()),
964 super.block_size);
965 want = MAX(want, buf->max_prefetch);
966 uint64_t l = MIN(p->length - x_off, want);
967 uint64_t eof_offset = ROUND_UP_TO(h->file->fnode.size, super.block_size);
968 if (!h->ignore_eof &&
969 buf->bl_off + l > eof_offset) {
970 l = eof_offset - buf->bl_off;
971 }
972 dout(20) << __func__ << " fetching 0x"
973 << std::hex << x_off << "~" << l << std::dec
974 << " of " << *p << dendl;
975 int r = bdev[p->bdev]->read(p->offset + x_off, l, &buf->bl, ioc[p->bdev],
976 cct->_conf->bluefs_buffered_io);
977 assert(r == 0);
978 }
979 left = buf->get_buf_remaining(off);
980 dout(20) << __func__ << " left 0x" << std::hex << left
981 << " len 0x" << len << std::dec << dendl;
982
983 int r = MIN(len, left);
984 if (outbl) {
985 bufferlist t;
986 t.substr_of(buf->bl, off - buf->bl_off, r);
987 outbl->claim_append(t);
988 }
989 if (out) {
990 // NOTE: h->bl is normally a contiguous buffer so c_str() is free.
991 memcpy(out, buf->bl.c_str() + off - buf->bl_off, r);
992 out += r;
993 }
994
995 dout(30) << __func__ << " result chunk (0x"
996 << std::hex << r << std::dec << " bytes):\n";
997 bufferlist t;
998 t.substr_of(buf->bl, off - buf->bl_off, r);
999 t.hexdump(*_dout);
1000 *_dout << dendl;
1001
1002 off += r;
1003 len -= r;
1004 ret += r;
1005 buf->pos += r;
1006 }
1007
1008 dout(20) << __func__ << " got " << ret << dendl;
1009 assert(!outbl || (int)outbl->length() == ret);
1010 --h->file->num_reading;
1011 return ret;
1012}
1013
1014void BlueFS::_invalidate_cache(FileRef f, uint64_t offset, uint64_t length)
1015{
1016 dout(10) << __func__ << " file " << f->fnode
1017 << " 0x" << std::hex << offset << "~" << length << std::dec
1018 << dendl;
1019 if (offset & ~super.block_mask()) {
1020 offset &= super.block_mask();
1021 length = ROUND_UP_TO(length, super.block_size);
1022 }
1023 uint64_t x_off = 0;
1024 auto p = f->fnode.seek(offset, &x_off);
1025 while (length > 0 && p != f->fnode.extents.end()) {
1026 uint64_t x_len = MIN(p->length - x_off, length);
1027 bdev[p->bdev]->invalidate_cache(p->offset + x_off, x_len);
1028 dout(20) << __func__ << " 0x" << std::hex << x_off << "~" << x_len
1029 << std:: dec << " of " << *p << dendl;
1030 offset += x_len;
1031 length -= x_len;
1032 }
1033}
1034
1035uint64_t BlueFS::_estimate_log_size()
1036{
1037 int avg_dir_size = 40; // fixme
1038 int avg_file_size = 12;
1039 uint64_t size = 4096 * 2;
1040 size += file_map.size() * (1 + sizeof(bluefs_fnode_t));
1041 for (auto& p : block_all)
1042 size += p.num_intervals() * (1 + 1 + sizeof(uint64_t) * 2);
1043 size += dir_map.size() + (1 + avg_dir_size);
1044 size += file_map.size() * (1 + avg_dir_size + avg_file_size);
1045 return ROUND_UP_TO(size, super.block_size);
1046}
1047
1048void BlueFS::compact_log()
1049{
1050 std::unique_lock<std::mutex> l(lock);
1051 if (cct->_conf->bluefs_compact_log_sync) {
1052 _compact_log_sync();
1053 } else {
1054 _compact_log_async(l);
1055 }
1056}
1057
1058bool BlueFS::_should_compact_log()
1059{
1060 uint64_t current = log_writer->file->fnode.size;
1061 uint64_t expected = _estimate_log_size();
1062 float ratio = (float)current / (float)expected;
1063 dout(10) << __func__ << " current 0x" << std::hex << current
1064 << " expected " << expected << std::dec
1065 << " ratio " << ratio
1066 << (new_log ? " (async compaction in progress)" : "")
1067 << dendl;
1068 if (new_log ||
1069 current < cct->_conf->bluefs_log_compact_min_size ||
1070 ratio < cct->_conf->bluefs_log_compact_min_ratio) {
1071 return false;
1072 }
1073 return true;
1074}
1075
1076void BlueFS::_compact_log_dump_metadata(bluefs_transaction_t *t)
1077{
1078 t->seq = 1;
1079 t->uuid = super.uuid;
1080 dout(20) << __func__ << " op_init" << dendl;
1081
1082 t->op_init();
1083 for (unsigned bdev = 0; bdev < MAX_BDEV; ++bdev) {
1084 interval_set<uint64_t>& p = block_all[bdev];
1085 for (interval_set<uint64_t>::iterator q = p.begin(); q != p.end(); ++q) {
1086 dout(20) << __func__ << " op_alloc_add " << bdev << " 0x"
1087 << std::hex << q.get_start() << "~" << q.get_len() << std::dec
1088 << dendl;
1089 t->op_alloc_add(bdev, q.get_start(), q.get_len());
1090 }
1091 }
1092 for (auto& p : file_map) {
1093 if (p.first == 1)
1094 continue;
1095 dout(20) << __func__ << " op_file_update " << p.second->fnode << dendl;
1096 assert(p.first > 1);
1097 t->op_file_update(p.second->fnode);
1098 }
1099 for (auto& p : dir_map) {
1100 dout(20) << __func__ << " op_dir_create " << p.first << dendl;
1101 t->op_dir_create(p.first);
1102 for (auto& q : p.second->file_map) {
1103 dout(20) << __func__ << " op_dir_link " << p.first << "/" << q.first
1104 << " to " << q.second->fnode.ino << dendl;
1105 t->op_dir_link(p.first, q.first, q.second->fnode.ino);
1106 }
1107 }
1108}
1109
1110void BlueFS::_compact_log_sync()
1111{
1112 dout(10) << __func__ << dendl;
1113 File *log_file = log_writer->file.get();
1114
1115 // clear out log (be careful who calls us!!!)
1116 log_t.clear();
1117
1118 bluefs_transaction_t t;
1119 _compact_log_dump_metadata(&t);
1120
1121 dout(20) << __func__ << " op_jump_seq " << log_seq << dendl;
1122 t.op_jump_seq(log_seq);
1123
1124 bufferlist bl;
1125 ::encode(t, bl);
1126 _pad_bl(bl);
1127
1128 uint64_t need = bl.length() + cct->_conf->bluefs_max_log_runway;
1129 dout(20) << __func__ << " need " << need << dendl;
1130
1131 mempool::bluefs::vector<bluefs_extent_t> old_extents;
1132 old_extents.swap(log_file->fnode.extents);
1133 log_file->fnode.recalc_allocated();
1134 while (log_file->fnode.get_allocated() < need) {
1135 int r = _allocate(log_file->fnode.prefer_bdev,
1136 need - log_file->fnode.get_allocated(),
1137 &log_file->fnode.extents);
1138 log_file->fnode.recalc_allocated();
1139 assert(r == 0);
1140 }
1141
1142 _close_writer(log_writer);
1143
1144 log_file->fnode.size = bl.length();
1145 log_writer = _create_writer(log_file);
1146 log_writer->append(bl);
1147 int r = _flush(log_writer, true);
1148 assert(r == 0);
1149 wait_for_aio(log_writer);
1150
224ce89b
WB
1151 list<aio_t> completed_ios;
1152 _claim_completed_aios(log_writer, &completed_ios);
1153 flush_bdev();
1154 completed_ios.clear();
1155
7c673cae
FG
1156 dout(10) << __func__ << " writing super" << dendl;
1157 super.log_fnode = log_file->fnode;
1158 ++super.version;
1159 _write_super();
1160 flush_bdev();
1161
1162 dout(10) << __func__ << " release old log extents " << old_extents << dendl;
1163 for (auto& r : old_extents) {
1164 pending_release[r.bdev].insert(r.offset, r.length);
1165 }
1166
1167 logger->inc(l_bluefs_log_compactions);
1168}
1169
1170/*
1171 * 1. Allocate a new extent to continue the log, and then log an event
1172 * that jumps the log write position to the new extent. At this point, the
1173 * old extent(s) won't be written to, and reflect everything to compact.
1174 * New events will be written to the new region that we'll keep.
1175 *
1176 * 2. While still holding the lock, encode a bufferlist that dumps all of the
1177 * in-memory fnodes and names. This will become the new beginning of the
1178 * log. The last event will jump to the log continuation extent from #1.
1179 *
1180 * 3. Queue a write to a new extent for the new beginning of the log.
1181 *
1182 * 4. Drop lock and wait
1183 *
1184 * 5. Retake the lock.
1185 *
1186 * 6. Update the log_fnode to splice in the new beginning.
1187 *
1188 * 7. Write the new superblock.
1189 *
1190 * 8. Release the old log space. Clean up.
1191 */
1192void BlueFS::_compact_log_async(std::unique_lock<std::mutex>& l)
1193{
1194 dout(10) << __func__ << dendl;
1195 File *log_file = log_writer->file.get();
1196 assert(!new_log);
1197 assert(!new_log_writer);
1198
181888fb
FG
1199 // create a new log [writer] so that we know compaction is in progress
1200 // (see _should_compact_log)
1201 new_log = new File;
1202 new_log->fnode.ino = 0; // so that _flush_range won't try to log the fnode
1203
3efd9988
FG
1204 // 0. wait for any racing flushes to complete. (We do not want to block
1205 // in _flush_sync_log with jump_to set or else a racing thread might flush
1206 // our entries and our jump_to update won't be correct.)
1207 while (log_flushing) {
1208 dout(10) << __func__ << " log is currently flushing, waiting" << dendl;
1209 log_cond.wait(l);
1210 }
1211
7c673cae
FG
1212 // 1. allocate new log space and jump to it.
1213 old_log_jump_to = log_file->fnode.get_allocated();
1214 uint64_t need = old_log_jump_to + cct->_conf->bluefs_max_log_runway;
1215 dout(10) << __func__ << " old_log_jump_to 0x" << std::hex << old_log_jump_to
1216 << " need 0x" << need << std::dec << dendl;
1217 while (log_file->fnode.get_allocated() < need) {
1218 int r = _allocate(log_file->fnode.prefer_bdev,
1219 cct->_conf->bluefs_max_log_runway,
1220 &log_file->fnode.extents);
1221 assert(r == 0);
1222 log_file->fnode.recalc_allocated();
1223 }
1224 dout(10) << __func__ << " log extents " << log_file->fnode.extents << dendl;
1225
1226 // update the log file change and log a jump to the offset where we want to
1227 // write the new entries
1228 log_t.op_file_update(log_file->fnode);
1229 log_t.op_jump(log_seq, old_log_jump_to);
1230
1231 flush_bdev(); // FIXME?
1232
1233 _flush_and_sync_log(l, 0, old_log_jump_to);
1234
1235 // 2. prepare compacted log
1236 bluefs_transaction_t t;
224ce89b
WB
1237 //avoid record two times in log_t and _compact_log_dump_metadata.
1238 log_t.clear();
7c673cae
FG
1239 _compact_log_dump_metadata(&t);
1240
1241 // conservative estimate for final encoded size
1242 new_log_jump_to = ROUND_UP_TO(t.op_bl.length() + super.block_size * 2,
1243 cct->_conf->bluefs_alloc_size);
1244 t.op_jump(log_seq, new_log_jump_to);
1245
1246 bufferlist bl;
1247 ::encode(t, bl);
1248 _pad_bl(bl);
1249
1250 dout(10) << __func__ << " new_log_jump_to 0x" << std::hex << new_log_jump_to
1251 << std::dec << dendl;
1252
181888fb 1253 // allocate
7c673cae
FG
1254 int r = _allocate(BlueFS::BDEV_DB, new_log_jump_to,
1255 &new_log->fnode.extents);
1256 assert(r == 0);
1257 new_log->fnode.recalc_allocated();
1258 new_log_writer = _create_writer(new_log);
1259 new_log_writer->append(bl);
1260
1261 // 3. flush
1262 r = _flush(new_log_writer, true);
1263 assert(r == 0);
1264 lock.unlock();
1265
1266 // 4. wait
1267 dout(10) << __func__ << " waiting for compacted log to sync" << dendl;
1268 wait_for_aio(new_log_writer);
224ce89b
WB
1269
1270 list<aio_t> completed_ios;
1271 _claim_completed_aios(new_log_writer, &completed_ios);
7c673cae 1272 flush_bdev();
224ce89b 1273 completed_ios.clear();
7c673cae
FG
1274
1275 // 5. retake lock
1276 lock.lock();
1277
1278 // 6. update our log fnode
1279 // discard first old_log_jump_to extents
1280 dout(10) << __func__ << " remove 0x" << std::hex << old_log_jump_to << std::dec
1281 << " of " << log_file->fnode.extents << dendl;
1282 uint64_t discarded = 0;
1283 mempool::bluefs::vector<bluefs_extent_t> old_extents;
1284 while (discarded < old_log_jump_to) {
1285 assert(!log_file->fnode.extents.empty());
1286 bluefs_extent_t& e = log_file->fnode.extents.front();
1287 bluefs_extent_t temp = e;
1288 if (discarded + e.length <= old_log_jump_to) {
1289 dout(10) << __func__ << " remove old log extent " << e << dendl;
1290 discarded += e.length;
1291 log_file->fnode.extents.erase(log_file->fnode.extents.begin());
1292 } else {
1293 dout(10) << __func__ << " remove front of old log extent " << e << dendl;
1294 uint64_t drop = old_log_jump_to - discarded;
1295 temp.length = drop;
1296 e.offset += drop;
1297 e.length -= drop;
1298 discarded += drop;
1299 dout(10) << __func__ << " kept " << e << " removed " << temp << dendl;
1300 }
1301 old_extents.push_back(temp);
1302 }
1303 new_log->fnode.extents.insert(new_log->fnode.extents.end(),
1304 log_file->fnode.extents.begin(),
1305 log_file->fnode.extents.end());
1306
1307 // clear the extents from old log file, they are added to new log
1308 log_file->fnode.extents.clear();
1309
1310 // swap the log files. New log file is the log file now.
1311 log_file->fnode.extents.swap(new_log->fnode.extents);
1312 log_file->fnode.recalc_allocated();
1313 new_log->fnode.recalc_allocated();
1314 log_writer->pos = log_writer->file->fnode.size =
1315 log_writer->pos - old_log_jump_to + new_log_jump_to;
1316
1317 // 7. write the super block to reflect the changes
1318 dout(10) << __func__ << " writing super" << dendl;
1319 super.log_fnode = log_file->fnode;
1320 ++super.version;
1321 _write_super();
1322
1323 lock.unlock();
1324 flush_bdev();
1325 lock.lock();
1326
1327 // 8. release old space
1328 dout(10) << __func__ << " release old log extents " << old_extents << dendl;
1329 for (auto& r : old_extents) {
1330 pending_release[r.bdev].insert(r.offset, r.length);
1331 }
1332
1333 // delete the new log, remove from the dirty files list
1334 _close_writer(new_log_writer);
1335 if (new_log->dirty_seq) {
1336 assert(dirty_files.count(new_log->dirty_seq));
1337 auto it = dirty_files[new_log->dirty_seq].iterator_to(*new_log);
1338 dirty_files[new_log->dirty_seq].erase(it);
1339 }
1340 new_log_writer = nullptr;
1341 new_log = nullptr;
1342 log_cond.notify_all();
1343
1344 dout(10) << __func__ << " log extents " << log_file->fnode.extents << dendl;
1345 logger->inc(l_bluefs_log_compactions);
1346}
1347
1348void BlueFS::_pad_bl(bufferlist& bl)
1349{
1350 uint64_t partial = bl.length() % super.block_size;
1351 if (partial) {
1352 dout(10) << __func__ << " padding with 0x" << std::hex
1353 << super.block_size - partial << " zeros" << std::dec << dendl;
1354 bl.append_zero(super.block_size - partial);
1355 }
1356}
1357
1358void BlueFS::flush_log()
1359{
1360 std::unique_lock<std::mutex> l(lock);
1361 flush_bdev();
1362 _flush_and_sync_log(l);
1363}
1364
1365int BlueFS::_flush_and_sync_log(std::unique_lock<std::mutex>& l,
1366 uint64_t want_seq,
1367 uint64_t jump_to)
1368{
1369 while (log_flushing) {
1370 dout(10) << __func__ << " want_seq " << want_seq
1371 << " log is currently flushing, waiting" << dendl;
3efd9988 1372 assert(!jump_to);
7c673cae
FG
1373 log_cond.wait(l);
1374 }
1375 if (want_seq && want_seq <= log_seq_stable) {
1376 dout(10) << __func__ << " want_seq " << want_seq << " <= log_seq_stable "
1377 << log_seq_stable << ", done" << dendl;
3efd9988 1378 assert(!jump_to);
7c673cae
FG
1379 return 0;
1380 }
1381 if (log_t.empty() && dirty_files.empty()) {
1382 dout(10) << __func__ << " want_seq " << want_seq
1383 << " " << log_t << " not dirty, dirty_files empty, no-op" << dendl;
3efd9988 1384 assert(!jump_to);
7c673cae
FG
1385 return 0;
1386 }
1387
1388 uint64_t seq = log_t.seq = ++log_seq;
1389 assert(want_seq == 0 || want_seq <= seq);
1390 log_t.uuid = super.uuid;
1391
1392 // log dirty files
1393 auto lsi = dirty_files.find(seq);
1394 if (lsi != dirty_files.end()) {
1395 dout(20) << __func__ << " " << lsi->second.size() << " dirty_files" << dendl;
1396 for (auto &f : lsi->second) {
1397 dout(20) << __func__ << " op_file_update " << f.fnode << dendl;
1398 log_t.op_file_update(f.fnode);
1399 }
1400 }
1401
1402 dout(10) << __func__ << " " << log_t << dendl;
1403 assert(!log_t.empty());
1404
1405 // allocate some more space (before we run out)?
1406 int64_t runway = log_writer->file->fnode.get_allocated() -
1407 log_writer->get_effective_write_pos();
1408 if (runway < (int64_t)cct->_conf->bluefs_min_log_runway) {
1409 dout(10) << __func__ << " allocating more log runway (0x"
1410 << std::hex << runway << std::dec << " remaining)" << dendl;
1411 while (new_log_writer) {
1412 dout(10) << __func__ << " waiting for async compaction" << dendl;
1413 log_cond.wait(l);
1414 }
1415 int r = _allocate(log_writer->file->fnode.prefer_bdev,
1416 cct->_conf->bluefs_max_log_runway,
1417 &log_writer->file->fnode.extents);
1418 assert(r == 0);
1419 log_writer->file->fnode.recalc_allocated();
1420 log_t.op_file_update(log_writer->file->fnode);
1421 }
1422
1423 bufferlist bl;
1424 ::encode(log_t, bl);
1425
1426 // pad to block boundary
1427 _pad_bl(bl);
1428 logger->inc(l_bluefs_logged_bytes, bl.length());
1429
1430 log_writer->append(bl);
1431
1432 log_t.clear();
1433 log_t.seq = 0; // just so debug output is less confusing
1434 log_flushing = true;
1435
1436 int r = _flush(log_writer, true);
1437 assert(r == 0);
1438
1439 if (jump_to) {
1440 dout(10) << __func__ << " jumping log offset from 0x" << std::hex
1441 << log_writer->pos << " -> 0x" << jump_to << std::dec << dendl;
1442 log_writer->pos = jump_to;
1443 log_writer->file->fnode.size = jump_to;
1444 }
1445
1446 _flush_bdev_safely(log_writer);
1447
1448 log_flushing = false;
1449 log_cond.notify_all();
1450
1451 // clean dirty files
1452 if (seq > log_seq_stable) {
1453 log_seq_stable = seq;
1454 dout(20) << __func__ << " log_seq_stable " << log_seq_stable << dendl;
1455
1456 auto p = dirty_files.begin();
1457 while (p != dirty_files.end()) {
1458 if (p->first > log_seq_stable) {
1459 dout(20) << __func__ << " done cleaning up dirty files" << dendl;
1460 break;
1461 }
1462
1463 auto l = p->second.begin();
1464 while (l != p->second.end()) {
1465 File *file = &*l;
1466 assert(file->dirty_seq > 0);
1467 assert(file->dirty_seq <= log_seq_stable);
1468 dout(20) << __func__ << " cleaned file " << file->fnode << dendl;
1469 file->dirty_seq = 0;
1470 p->second.erase(l++);
1471 }
1472
1473 assert(p->second.empty());
1474 dirty_files.erase(p++);
1475 }
1476 } else {
1477 dout(20) << __func__ << " log_seq_stable " << log_seq_stable
1478 << " already >= out seq " << seq
1479 << ", we lost a race against another log flush, done" << dendl;
1480 }
1481 _update_logger_stats();
1482
1483 return 0;
1484}
1485
1486int BlueFS::_flush_range(FileWriter *h, uint64_t offset, uint64_t length)
1487{
1488 dout(10) << __func__ << " " << h << " pos 0x" << std::hex << h->pos
1489 << " 0x" << offset << "~" << length << std::dec
1490 << " to " << h->file->fnode << dendl;
1491 assert(!h->file->deleted);
1492 assert(h->file->num_readers.load() == 0);
1493
1494 h->buffer_appender.flush();
1495
1496 bool buffered;
1497 if (h->file->fnode.ino == 1)
1498 buffered = false;
1499 else
1500 buffered = cct->_conf->bluefs_buffered_io;
1501
1502 if (offset + length <= h->pos)
1503 return 0;
1504 if (offset < h->pos) {
1505 length -= h->pos - offset;
1506 offset = h->pos;
1507 dout(10) << " still need 0x"
1508 << std::hex << offset << "~" << length << std::dec
1509 << dendl;
1510 }
1511 assert(offset <= h->file->fnode.size);
1512
1513 uint64_t allocated = h->file->fnode.get_allocated();
1514
1515 // do not bother to dirty the file if we are overwriting
1516 // previously allocated extents.
1517 bool must_dirty = false;
1518 if (allocated < offset + length) {
1519 // we should never run out of log space here; see the min runway check
1520 // in _flush_and_sync_log.
1521 assert(h->file->fnode.ino != 1);
1522 int r = _allocate(h->file->fnode.prefer_bdev,
1523 offset + length - allocated,
1524 &h->file->fnode.extents);
1525 if (r < 0) {
1526 derr << __func__ << " allocated: 0x" << std::hex << allocated
1527 << " offset: 0x" << offset << " length: 0x" << length << std::dec
1528 << dendl;
3efd9988 1529 assert(0 == "bluefs enospc");
7c673cae
FG
1530 return r;
1531 }
1532 h->file->fnode.recalc_allocated();
1533 if (cct->_conf->bluefs_preextend_wal_files &&
1534 h->writer_type == WRITER_WAL) {
1535 // NOTE: this *requires* that rocksdb also has log recycling
1536 // enabled and is therefore doing robust CRCs on the log
1537 // records. otherwise, we will fail to reply the rocksdb log
1538 // properly due to garbage on the device.
1539 h->file->fnode.size = h->file->fnode.get_allocated();
1540 dout(10) << __func__ << " extending WAL size to 0x" << std::hex
1541 << h->file->fnode.size << std::dec << " to include allocated"
1542 << dendl;
1543 }
1544 must_dirty = true;
1545 }
1546 if (h->file->fnode.size < offset + length) {
1547 h->file->fnode.size = offset + length;
1548 if (h->file->fnode.ino > 1) {
1549 // we do not need to dirty the log file (or it's compacting
1550 // replacement) when the file size changes because replay is
1551 // smart enough to discover it on its own.
1552 must_dirty = true;
1553 }
1554 }
1555 if (must_dirty) {
1556 h->file->fnode.mtime = ceph_clock_now();
1557 assert(h->file->fnode.ino >= 1);
1558 if (h->file->dirty_seq == 0) {
1559 h->file->dirty_seq = log_seq + 1;
1560 dirty_files[h->file->dirty_seq].push_back(*h->file);
1561 dout(20) << __func__ << " dirty_seq = " << log_seq + 1
1562 << " (was clean)" << dendl;
1563 } else {
1564 if (h->file->dirty_seq != log_seq + 1) {
1565 // need re-dirty, erase from list first
1566 assert(dirty_files.count(h->file->dirty_seq));
1567 auto it = dirty_files[h->file->dirty_seq].iterator_to(*h->file);
1568 dirty_files[h->file->dirty_seq].erase(it);
1569 h->file->dirty_seq = log_seq + 1;
1570 dirty_files[h->file->dirty_seq].push_back(*h->file);
1571 dout(20) << __func__ << " dirty_seq = " << log_seq + 1
1572 << " (was " << h->file->dirty_seq << ")" << dendl;
1573 } else {
1574 dout(20) << __func__ << " dirty_seq = " << log_seq + 1
1575 << " (unchanged, do nothing) " << dendl;
1576 }
1577 }
1578 }
1579 dout(20) << __func__ << " file now " << h->file->fnode << dendl;
1580
1581 uint64_t x_off = 0;
1582 auto p = h->file->fnode.seek(offset, &x_off);
1583 assert(p != h->file->fnode.extents.end());
1584 dout(20) << __func__ << " in " << *p << " x_off 0x"
1585 << std::hex << x_off << std::dec << dendl;
1586
1587 unsigned partial = x_off & ~super.block_mask();
1588 bufferlist bl;
1589 if (partial) {
1590 dout(20) << __func__ << " using partial tail 0x"
1591 << std::hex << partial << std::dec << dendl;
1592 assert(h->tail_block.length() == partial);
31f18b77 1593 bl.claim_append_piecewise(h->tail_block);
7c673cae
FG
1594 x_off -= partial;
1595 offset -= partial;
1596 length += partial;
1597 dout(20) << __func__ << " waiting for previous aio to complete" << dendl;
1598 for (auto p : h->iocv) {
1599 if (p) {
1600 p->aio_wait();
1601 }
1602 }
1603 }
1604 if (length == partial + h->buffer.length()) {
31f18b77 1605 bl.claim_append_piecewise(h->buffer);
7c673cae
FG
1606 } else {
1607 bufferlist t;
31f18b77
FG
1608 h->buffer.splice(0, length, &t);
1609 bl.claim_append_piecewise(t);
7c673cae
FG
1610 t.substr_of(h->buffer, length, h->buffer.length() - length);
1611 h->buffer.swap(t);
1612 dout(20) << " leaving 0x" << std::hex << h->buffer.length() << std::dec
1613 << " unflushed" << dendl;
1614 }
1615 assert(bl.length() == length);
1616
1617 switch (h->writer_type) {
1618 case WRITER_WAL:
1619 logger->inc(l_bluefs_bytes_written_wal, length);
1620 break;
1621 case WRITER_SST:
1622 logger->inc(l_bluefs_bytes_written_sst, length);
1623 break;
1624 }
1625
1626 dout(30) << "dump:\n";
1627 bl.hexdump(*_dout);
1628 *_dout << dendl;
1629
1630 h->pos = offset + length;
1631 h->tail_block.clear();
1632
1633 uint64_t bloff = 0;
1634 while (length > 0) {
1635 uint64_t x_len = MIN(p->length - x_off, length);
1636 bufferlist t;
1637 t.substr_of(bl, bloff, x_len);
1638 unsigned tail = x_len & ~super.block_mask();
1639 if (tail) {
1640 size_t zlen = super.block_size - tail;
1641 dout(20) << __func__ << " caching tail of 0x"
1642 << std::hex << tail
1643 << " and padding block with 0x" << zlen
1644 << std::dec << dendl;
1645 h->tail_block.substr_of(bl, bl.length() - tail, tail);
1646 if (h->file->fnode.ino > 1) {
1647 // we are using the page_aligned_appender, and can safely use
1648 // the tail of the raw buffer.
1649 const bufferptr &last = t.back();
1650 if (last.unused_tail_length() < zlen) {
1651 derr << " wtf, last is " << last << " from " << t << dendl;
1652 assert(last.unused_tail_length() >= zlen);
1653 }
1654 bufferptr z = last;
1655 z.set_offset(last.offset() + last.length());
1656 z.set_length(zlen);
1657 z.zero();
1658 t.append(z, 0, zlen);
1659 } else {
1660 t.append_zero(zlen);
1661 }
1662 }
1663 if (cct->_conf->bluefs_sync_write) {
1664 bdev[p->bdev]->write(p->offset + x_off, t, buffered);
1665 } else {
1666 bdev[p->bdev]->aio_write(p->offset + x_off, t, h->iocv[p->bdev], buffered);
1667 }
1668 bloff += x_len;
1669 length -= x_len;
1670 ++p;
1671 x_off = 0;
1672 }
1673 for (unsigned i = 0; i < MAX_BDEV; ++i) {
1674 if (bdev[i]) {
1675 assert(h->iocv[i]);
1676 if (h->iocv[i]->has_pending_aios()) {
1677 bdev[i]->aio_submit(h->iocv[i]);
1678 }
1679 }
1680 }
1681 dout(20) << __func__ << " h " << h << " pos now 0x"
1682 << std::hex << h->pos << std::dec << dendl;
1683 return 0;
1684}
1685
1686// we need to retire old completed aios so they don't stick around in
1687// memory indefinitely (along with their bufferlist refs).
1688void BlueFS::_claim_completed_aios(FileWriter *h, list<aio_t> *ls)
1689{
1690 for (auto p : h->iocv) {
1691 if (p) {
1692 ls->splice(ls->end(), p->running_aios);
1693 }
1694 }
1695 dout(10) << __func__ << " got " << ls->size() << " aios" << dendl;
1696}
1697
1698void BlueFS::wait_for_aio(FileWriter *h)
1699{
1700 // NOTE: this is safe to call without a lock, as long as our reference is
1701 // stable.
1702 dout(10) << __func__ << " " << h << dendl;
1703 utime_t start = ceph_clock_now();
1704 for (auto p : h->iocv) {
1705 if (p) {
1706 p->aio_wait();
1707 }
1708 }
1709 utime_t end = ceph_clock_now();
1710 utime_t dur = end - start;
1711 dout(10) << __func__ << " " << h << " done in " << dur << dendl;
1712}
1713
1714int BlueFS::_flush(FileWriter *h, bool force)
1715{
1716 h->buffer_appender.flush();
1717 uint64_t length = h->buffer.length();
1718 uint64_t offset = h->pos;
1719 if (!force &&
1720 length < cct->_conf->bluefs_min_flush_size) {
1721 dout(10) << __func__ << " " << h << " ignoring, length " << length
1722 << " < min_flush_size " << cct->_conf->bluefs_min_flush_size
1723 << dendl;
1724 return 0;
1725 }
1726 if (length == 0) {
1727 dout(10) << __func__ << " " << h << " no dirty data on "
1728 << h->file->fnode << dendl;
1729 return 0;
1730 }
1731 dout(10) << __func__ << " " << h << " 0x"
1732 << std::hex << offset << "~" << length << std::dec
1733 << " to " << h->file->fnode << dendl;
1734 assert(h->pos <= h->file->fnode.size);
1735 return _flush_range(h, offset, length);
1736}
1737
1738int BlueFS::_truncate(FileWriter *h, uint64_t offset)
1739{
1740 dout(10) << __func__ << " 0x" << std::hex << offset << std::dec
1741 << " file " << h->file->fnode << dendl;
1742 if (h->file->deleted) {
1743 dout(10) << __func__ << " deleted, no-op" << dendl;
1744 return 0;
1745 }
1746
1747 // we never truncate internal log files
1748 assert(h->file->fnode.ino > 1);
1749
1750 h->buffer_appender.flush();
1751
1752 // truncate off unflushed data?
1753 if (h->pos < offset &&
1754 h->pos + h->buffer.length() > offset) {
1755 bufferlist t;
1756 dout(20) << __func__ << " tossing out last " << offset - h->pos
1757 << " unflushed bytes" << dendl;
1758 t.substr_of(h->buffer, 0, offset - h->pos);
1759 h->buffer.swap(t);
1760 assert(0 == "actually this shouldn't happen");
1761 }
1762 if (h->buffer.length()) {
1763 int r = _flush(h, true);
1764 if (r < 0)
1765 return r;
1766 }
1767 if (offset == h->file->fnode.size) {
1768 return 0; // no-op!
1769 }
1770 if (offset > h->file->fnode.size) {
1771 assert(0 == "truncate up not supported");
1772 }
1773 assert(h->file->fnode.size >= offset);
1774 h->file->fnode.size = offset;
1775 log_t.op_file_update(h->file->fnode);
1776 return 0;
1777}
1778
1779int BlueFS::_fsync(FileWriter *h, std::unique_lock<std::mutex>& l)
1780{
1781 dout(10) << __func__ << " " << h << " " << h->file->fnode << dendl;
1782 int r = _flush(h, true);
1783 if (r < 0)
1784 return r;
1785 uint64_t old_dirty_seq = h->file->dirty_seq;
1786
1787 _flush_bdev_safely(h);
1788
1789 if (old_dirty_seq) {
1790 uint64_t s = log_seq;
1791 dout(20) << __func__ << " file metadata was dirty (" << old_dirty_seq
1792 << ") on " << h->file->fnode << ", flushing log" << dendl;
1793 _flush_and_sync_log(l, old_dirty_seq);
1794 assert(h->file->dirty_seq == 0 || // cleaned
1795 h->file->dirty_seq > s); // or redirtied by someone else
1796 }
1797 return 0;
1798}
1799
1800void BlueFS::_flush_bdev_safely(FileWriter *h)
1801{
1802 if (!cct->_conf->bluefs_sync_write) {
1803 list<aio_t> completed_ios;
1804 _claim_completed_aios(h, &completed_ios);
1805 lock.unlock();
1806 wait_for_aio(h);
1807 completed_ios.clear();
1808 flush_bdev();
1809 lock.lock();
1810 } else {
1811 lock.unlock();
1812 flush_bdev();
1813 lock.lock();
1814 }
1815}
1816
1817void BlueFS::flush_bdev()
1818{
1819 // NOTE: this is safe to call without a lock.
1820 dout(20) << __func__ << dendl;
1821 for (auto p : bdev) {
1822 if (p)
1823 p->flush();
1824 }
1825}
1826
1827int BlueFS::_allocate(uint8_t id, uint64_t len,
1828 mempool::bluefs::vector<bluefs_extent_t> *ev)
1829{
1830 dout(10) << __func__ << " len 0x" << std::hex << len << std::dec
1831 << " from " << (int)id << dendl;
1832 assert(id < alloc.size());
1833 uint64_t min_alloc_size = cct->_conf->bluefs_alloc_size;
1834
1835 uint64_t left = ROUND_UP_TO(len, min_alloc_size);
1836 int r = -ENOSPC;
b32b8144
FG
1837 int64_t alloc_len = 0;
1838 AllocExtentVector extents;
1839
7c673cae
FG
1840 if (alloc[id]) {
1841 r = alloc[id]->reserve(left);
1842 }
b32b8144
FG
1843
1844 if (r == 0) {
1845 uint64_t hint = 0;
1846 if (!ev->empty()) {
1847 hint = ev->back().end();
1848 }
1849 extents.reserve(4); // 4 should be (more than) enough for most allocations
1850 alloc_len = alloc[id]->allocate(left, min_alloc_size, hint, &extents);
1851 }
1852 if (r < 0 || (alloc_len < (int64_t)left)) {
1853 if (r == 0) {
1854 alloc[id]->unreserve(left - alloc_len);
1855 for (auto& p : extents) {
1856 alloc[id]->release(p.offset, p.length);
1857 }
1858 }
7c673cae
FG
1859 if (id != BDEV_SLOW) {
1860 if (bdev[id]) {
1861 dout(1) << __func__ << " failed to allocate 0x" << std::hex << left
1862 << " on bdev " << (int)id
1863 << ", free 0x" << alloc[id]->get_free()
1864 << "; fallback to bdev " << (int)id + 1
1865 << std::dec << dendl;
1866 }
1867 return _allocate(id + 1, len, ev);
1868 }
1869 if (bdev[id])
1870 derr << __func__ << " failed to allocate 0x" << std::hex << left
1871 << " on bdev " << (int)id
1872 << ", free 0x" << alloc[id]->get_free() << std::dec << dendl;
1873 else
1874 derr << __func__ << " failed to allocate 0x" << std::hex << left
1875 << " on bdev " << (int)id << ", dne" << std::dec << dendl;
b32b8144
FG
1876 if (alloc[id])
1877 alloc[id]->dump();
7c673cae
FG
1878 return -ENOSPC;
1879 }
1880
1881 for (auto& p : extents) {
1882 bluefs_extent_t e = bluefs_extent_t(id, p.offset, p.length);
1883 if (!ev->empty() &&
1884 ev->back().bdev == e.bdev &&
1885 ev->back().end() == (uint64_t) e.offset) {
1886 ev->back().length += e.length;
1887 } else {
1888 ev->push_back(e);
1889 }
1890 }
1891
1892 return 0;
1893}
1894
1895int BlueFS::_preallocate(FileRef f, uint64_t off, uint64_t len)
1896{
1897 dout(10) << __func__ << " file " << f->fnode << " 0x"
1898 << std::hex << off << "~" << len << std::dec << dendl;
1899 if (f->deleted) {
1900 dout(10) << __func__ << " deleted, no-op" << dendl;
1901 return 0;
1902 }
1903 assert(f->fnode.ino > 1);
1904 uint64_t allocated = f->fnode.get_allocated();
1905 if (off + len > allocated) {
1906 uint64_t want = off + len - allocated;
1907 int r = _allocate(f->fnode.prefer_bdev, want, &f->fnode.extents);
1908 if (r < 0)
1909 return r;
1910 f->fnode.recalc_allocated();
1911 log_t.op_file_update(f->fnode);
1912 }
1913 return 0;
1914}
1915
1916void BlueFS::sync_metadata()
1917{
1918 std::unique_lock<std::mutex> l(lock);
1919 if (log_t.empty()) {
1920 dout(10) << __func__ << " - no pending log events" << dendl;
1921 return;
1922 }
1923 dout(10) << __func__ << dendl;
1924 utime_t start = ceph_clock_now();
1925 vector<interval_set<uint64_t>> to_release(pending_release.size());
1926 to_release.swap(pending_release);
1927 flush_bdev(); // FIXME?
1928 _flush_and_sync_log(l);
1929 for (unsigned i = 0; i < to_release.size(); ++i) {
1930 for (auto p = to_release[i].begin(); p != to_release[i].end(); ++p) {
1931 alloc[i]->release(p.get_start(), p.get_len());
1932 }
1933 }
1934
1935 if (_should_compact_log()) {
1936 if (cct->_conf->bluefs_compact_log_sync) {
1937 _compact_log_sync();
1938 } else {
1939 _compact_log_async(l);
1940 }
1941 }
1942
1943 utime_t end = ceph_clock_now();
1944 utime_t dur = end - start;
1945 dout(10) << __func__ << " done in " << dur << dendl;
1946}
1947
1948int BlueFS::open_for_write(
1949 const string& dirname,
1950 const string& filename,
1951 FileWriter **h,
1952 bool overwrite)
1953{
1954 std::lock_guard<std::mutex> l(lock);
1955 dout(10) << __func__ << " " << dirname << "/" << filename << dendl;
1956 map<string,DirRef>::iterator p = dir_map.find(dirname);
1957 DirRef dir;
1958 if (p == dir_map.end()) {
1959 // implicitly create the dir
1960 dout(20) << __func__ << " dir " << dirname
1961 << " does not exist" << dendl;
1962 return -ENOENT;
1963 } else {
1964 dir = p->second;
1965 }
1966
1967 FileRef file;
1968 bool create = false;
1969 map<string,FileRef>::iterator q = dir->file_map.find(filename);
1970 if (q == dir->file_map.end()) {
1971 if (overwrite) {
1972 dout(20) << __func__ << " dir " << dirname << " (" << dir
1973 << ") file " << filename
1974 << " does not exist" << dendl;
1975 return -ENOENT;
1976 }
1977 file = new File;
1978 file->fnode.ino = ++ino_last;
1979 file_map[ino_last] = file;
1980 dir->file_map[filename] = file;
1981 ++file->refs;
1982 create = true;
1983 } else {
1984 // overwrite existing file?
1985 file = q->second;
1986 if (overwrite) {
1987 dout(20) << __func__ << " dir " << dirname << " (" << dir
1988 << ") file " << filename
1989 << " already exists, overwrite in place" << dendl;
1990 } else {
1991 dout(20) << __func__ << " dir " << dirname << " (" << dir
1992 << ") file " << filename
1993 << " already exists, truncate + overwrite" << dendl;
1994 file->fnode.size = 0;
1995 for (auto& p : file->fnode.extents) {
1996 pending_release[p.bdev].insert(p.offset, p.length);
1997 }
1998 file->fnode.extents.clear();
1999 file->fnode.recalc_allocated();
2000 }
2001 }
2002 assert(file->fnode.ino > 1);
2003
2004 file->fnode.mtime = ceph_clock_now();
2005 file->fnode.prefer_bdev = BlueFS::BDEV_DB;
2006 if (dirname.length() > 5) {
2007 // the "db.slow" and "db.wal" directory names are hard-coded at
2008 // match up with bluestore. the slow device is always the second
2009 // one (when a dedicated block.db device is present and used at
2010 // bdev 0). the wal device is always last.
31f18b77 2011 if (boost::algorithm::ends_with(dirname, ".slow")) {
7c673cae
FG
2012 file->fnode.prefer_bdev = BlueFS::BDEV_SLOW;
2013 } else if (boost::algorithm::ends_with(dirname, ".wal")) {
2014 file->fnode.prefer_bdev = BlueFS::BDEV_WAL;
2015 }
2016 }
2017 dout(20) << __func__ << " mapping " << dirname << "/" << filename
2018 << " to bdev " << (int)file->fnode.prefer_bdev << dendl;
2019
2020 log_t.op_file_update(file->fnode);
2021 if (create)
2022 log_t.op_dir_link(dirname, filename, file->fnode.ino);
2023
2024 *h = _create_writer(file);
2025
2026 if (boost::algorithm::ends_with(filename, ".log")) {
2027 (*h)->writer_type = BlueFS::WRITER_WAL;
2028 if (logger && !overwrite) {
2029 logger->inc(l_bluefs_files_written_wal);
2030 }
2031 } else if (boost::algorithm::ends_with(filename, ".sst")) {
2032 (*h)->writer_type = BlueFS::WRITER_SST;
2033 if (logger) {
2034 logger->inc(l_bluefs_files_written_sst);
2035 }
2036 }
2037
2038 dout(10) << __func__ << " h " << *h << " on " << file->fnode << dendl;
2039 return 0;
2040}
2041
2042BlueFS::FileWriter *BlueFS::_create_writer(FileRef f)
2043{
2044 FileWriter *w = new FileWriter(f);
2045 for (unsigned i = 0; i < MAX_BDEV; ++i) {
2046 if (bdev[i]) {
2047 w->iocv[i] = new IOContext(cct, NULL);
2048 } else {
2049 w->iocv[i] = NULL;
2050 }
2051 }
2052 return w;
2053}
2054
2055void BlueFS::_close_writer(FileWriter *h)
2056{
2057 dout(10) << __func__ << " " << h << " type " << h->writer_type << dendl;
2058 for (unsigned i=0; i<MAX_BDEV; ++i) {
2059 if (bdev[i]) {
2060 assert(h->iocv[i]);
2061 h->iocv[i]->aio_wait();
2062 bdev[i]->queue_reap_ioc(h->iocv[i]);
2063 }
2064 }
2065 delete h;
2066}
2067
2068int BlueFS::open_for_read(
2069 const string& dirname,
2070 const string& filename,
2071 FileReader **h,
2072 bool random)
2073{
2074 std::lock_guard<std::mutex> l(lock);
2075 dout(10) << __func__ << " " << dirname << "/" << filename
2076 << (random ? " (random)":" (sequential)") << dendl;
2077 map<string,DirRef>::iterator p = dir_map.find(dirname);
2078 if (p == dir_map.end()) {
2079 dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
2080 return -ENOENT;
2081 }
2082 DirRef dir = p->second;
2083
2084 map<string,FileRef>::iterator q = dir->file_map.find(filename);
2085 if (q == dir->file_map.end()) {
2086 dout(20) << __func__ << " dir " << dirname << " (" << dir
2087 << ") file " << filename
2088 << " not found" << dendl;
2089 return -ENOENT;
2090 }
2091 File *file = q->second.get();
2092
2093 *h = new FileReader(file, random ? 4096 : cct->_conf->bluefs_max_prefetch,
2094 random, false);
2095 dout(10) << __func__ << " h " << *h << " on " << file->fnode << dendl;
2096 return 0;
2097}
2098
2099int BlueFS::rename(
2100 const string& old_dirname, const string& old_filename,
2101 const string& new_dirname, const string& new_filename)
2102{
2103 std::lock_guard<std::mutex> l(lock);
2104 dout(10) << __func__ << " " << old_dirname << "/" << old_filename
2105 << " -> " << new_dirname << "/" << new_filename << dendl;
2106 map<string,DirRef>::iterator p = dir_map.find(old_dirname);
2107 if (p == dir_map.end()) {
2108 dout(20) << __func__ << " dir " << old_dirname << " not found" << dendl;
2109 return -ENOENT;
2110 }
2111 DirRef old_dir = p->second;
2112 map<string,FileRef>::iterator q = old_dir->file_map.find(old_filename);
2113 if (q == old_dir->file_map.end()) {
2114 dout(20) << __func__ << " dir " << old_dirname << " (" << old_dir
2115 << ") file " << old_filename
2116 << " not found" << dendl;
2117 return -ENOENT;
2118 }
2119 FileRef file = q->second;
2120
2121 p = dir_map.find(new_dirname);
2122 if (p == dir_map.end()) {
2123 dout(20) << __func__ << " dir " << new_dirname << " not found" << dendl;
2124 return -ENOENT;
2125 }
2126 DirRef new_dir = p->second;
2127 q = new_dir->file_map.find(new_filename);
2128 if (q != new_dir->file_map.end()) {
2129 dout(20) << __func__ << " dir " << new_dirname << " (" << old_dir
2130 << ") file " << new_filename
2131 << " already exists, unlinking" << dendl;
2132 assert(q->second != file);
2133 log_t.op_dir_unlink(new_dirname, new_filename);
2134 _drop_link(q->second);
2135 }
2136
2137 dout(10) << __func__ << " " << new_dirname << "/" << new_filename << " "
2138 << " " << file->fnode << dendl;
2139
2140 new_dir->file_map[new_filename] = file;
2141 old_dir->file_map.erase(old_filename);
2142
2143 log_t.op_dir_link(new_dirname, new_filename, file->fnode.ino);
2144 log_t.op_dir_unlink(old_dirname, old_filename);
2145 return 0;
2146}
2147
2148int BlueFS::mkdir(const string& dirname)
2149{
2150 std::lock_guard<std::mutex> l(lock);
2151 dout(10) << __func__ << " " << dirname << dendl;
2152 map<string,DirRef>::iterator p = dir_map.find(dirname);
2153 if (p != dir_map.end()) {
2154 dout(20) << __func__ << " dir " << dirname << " exists" << dendl;
2155 return -EEXIST;
2156 }
2157 dir_map[dirname] = new Dir;
2158 log_t.op_dir_create(dirname);
2159 return 0;
2160}
2161
2162int BlueFS::rmdir(const string& dirname)
2163{
2164 std::lock_guard<std::mutex> l(lock);
2165 dout(10) << __func__ << " " << dirname << dendl;
2166 map<string,DirRef>::iterator p = dir_map.find(dirname);
2167 if (p == dir_map.end()) {
2168 dout(20) << __func__ << " dir " << dirname << " does not exist" << dendl;
2169 return -ENOENT;
2170 }
2171 DirRef dir = p->second;
2172 if (!dir->file_map.empty()) {
2173 dout(20) << __func__ << " dir " << dirname << " not empty" << dendl;
2174 return -ENOTEMPTY;
2175 }
2176 dir_map.erase(dirname);
2177 log_t.op_dir_remove(dirname);
2178 return 0;
2179}
2180
2181bool BlueFS::dir_exists(const string& dirname)
2182{
2183 std::lock_guard<std::mutex> l(lock);
2184 map<string,DirRef>::iterator p = dir_map.find(dirname);
2185 bool exists = p != dir_map.end();
2186 dout(10) << __func__ << " " << dirname << " = " << (int)exists << dendl;
2187 return exists;
2188}
2189
2190int BlueFS::stat(const string& dirname, const string& filename,
2191 uint64_t *size, utime_t *mtime)
2192{
2193 std::lock_guard<std::mutex> l(lock);
2194 dout(10) << __func__ << " " << dirname << "/" << filename << dendl;
2195 map<string,DirRef>::iterator p = dir_map.find(dirname);
2196 if (p == dir_map.end()) {
2197 dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
2198 return -ENOENT;
2199 }
2200 DirRef dir = p->second;
2201 map<string,FileRef>::iterator q = dir->file_map.find(filename);
2202 if (q == dir->file_map.end()) {
2203 dout(20) << __func__ << " dir " << dirname << " (" << dir
2204 << ") file " << filename
2205 << " not found" << dendl;
2206 return -ENOENT;
2207 }
2208 File *file = q->second.get();
2209 dout(10) << __func__ << " " << dirname << "/" << filename
2210 << " " << file->fnode << dendl;
2211 if (size)
2212 *size = file->fnode.size;
2213 if (mtime)
2214 *mtime = file->fnode.mtime;
2215 return 0;
2216}
2217
2218int BlueFS::lock_file(const string& dirname, const string& filename,
2219 FileLock **plock)
2220{
2221 std::lock_guard<std::mutex> l(lock);
2222 dout(10) << __func__ << " " << dirname << "/" << filename << dendl;
2223 map<string,DirRef>::iterator p = dir_map.find(dirname);
2224 if (p == dir_map.end()) {
2225 dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
2226 return -ENOENT;
2227 }
2228 DirRef dir = p->second;
2229 map<string,FileRef>::iterator q = dir->file_map.find(filename);
2230 File *file;
2231 if (q == dir->file_map.end()) {
2232 dout(20) << __func__ << " dir " << dirname << " (" << dir
2233 << ") file " << filename
2234 << " not found, creating" << dendl;
2235 file = new File;
2236 file->fnode.ino = ++ino_last;
2237 file->fnode.mtime = ceph_clock_now();
2238 file_map[ino_last] = file;
2239 dir->file_map[filename] = file;
2240 ++file->refs;
2241 log_t.op_file_update(file->fnode);
2242 log_t.op_dir_link(dirname, filename, file->fnode.ino);
2243 } else {
2244 file = q->second.get();
2245 if (file->locked) {
2246 dout(10) << __func__ << " already locked" << dendl;
2247 return -EBUSY;
2248 }
2249 }
2250 file->locked = true;
2251 *plock = new FileLock(file);
2252 dout(10) << __func__ << " locked " << file->fnode
2253 << " with " << *plock << dendl;
2254 return 0;
2255}
2256
2257int BlueFS::unlock_file(FileLock *fl)
2258{
2259 std::lock_guard<std::mutex> l(lock);
2260 dout(10) << __func__ << " " << fl << " on " << fl->file->fnode << dendl;
2261 assert(fl->file->locked);
2262 fl->file->locked = false;
2263 delete fl;
2264 return 0;
2265}
2266
2267int BlueFS::readdir(const string& dirname, vector<string> *ls)
2268{
2269 std::lock_guard<std::mutex> l(lock);
2270 dout(10) << __func__ << " " << dirname << dendl;
2271 if (dirname.empty()) {
2272 // list dirs
2273 ls->reserve(dir_map.size() + 2);
2274 for (auto& q : dir_map) {
2275 ls->push_back(q.first);
2276 }
2277 } else {
2278 // list files in dir
2279 map<string,DirRef>::iterator p = dir_map.find(dirname);
2280 if (p == dir_map.end()) {
2281 dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
2282 return -ENOENT;
2283 }
2284 DirRef dir = p->second;
2285 ls->reserve(dir->file_map.size() + 2);
2286 for (auto& q : dir->file_map) {
2287 ls->push_back(q.first);
2288 }
2289 }
2290 ls->push_back(".");
2291 ls->push_back("..");
2292 return 0;
2293}
2294
2295int BlueFS::unlink(const string& dirname, const string& filename)
2296{
2297 std::lock_guard<std::mutex> l(lock);
2298 dout(10) << __func__ << " " << dirname << "/" << filename << dendl;
2299 map<string,DirRef>::iterator p = dir_map.find(dirname);
2300 if (p == dir_map.end()) {
2301 dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
2302 return -ENOENT;
2303 }
2304 DirRef dir = p->second;
2305 map<string,FileRef>::iterator q = dir->file_map.find(filename);
2306 if (q == dir->file_map.end()) {
2307 dout(20) << __func__ << " file " << dirname << "/" << filename
2308 << " not found" << dendl;
2309 return -ENOENT;
2310 }
2311 FileRef file = q->second;
2312 if (file->locked) {
2313 dout(20) << __func__ << " file " << dirname << "/" << filename
2314 << " is locked" << dendl;
2315 return -EBUSY;
2316 }
2317 dir->file_map.erase(filename);
2318 log_t.op_dir_unlink(dirname, filename);
2319 _drop_link(file);
2320 return 0;
2321}
d2e6a577
FG
2322
2323bool BlueFS::wal_is_rotational()
2324{
2325 if (!bdev[BDEV_WAL] || bdev[BDEV_WAL]->is_rotational())
2326 return true;
2327 return false;
2328}