]> git.proxmox.com Git - ceph.git/blame - ceph/src/os/bluestore/BlueFS.cc
update sources to v12.1.1
[ceph.git] / ceph / src / os / bluestore / BlueFS.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3
4#include "boost/algorithm/string.hpp"
5#include "BlueFS.h"
6
7#include "common/debug.h"
8#include "common/errno.h"
9#include "common/perf_counters.h"
10#include "BlockDevice.h"
11#include "Allocator.h"
12#include "include/assert.h"
13
14#define dout_context cct
15#define dout_subsys ceph_subsys_bluefs
16#undef dout_prefix
17#define dout_prefix *_dout << "bluefs "
18
// Mempool object-factory definitions, one per BlueFS helper type
// (File, Dir, FileWriter, FileReaderBuffer, FileReader, FileLock).
// NOTE(review): presumably the third argument names the "bluefs" mempool the
// objects are accounted against -- confirm against the mempool header.
19MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::File, bluefs_file, bluefs);
20MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::Dir, bluefs_dir, bluefs);
21MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileWriter, bluefs_file_writer, bluefs);
22MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileReaderBuffer,
23 bluefs_file_reader_buffer, bluefs);
24MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileReader, bluefs_file_reader, bluefs);
25MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileLock, bluefs_file_lock, bluefs);
26
27
28BlueFS::BlueFS(CephContext* cct)
29 : cct(cct),
30 bdev(MAX_BDEV),
31 ioc(MAX_BDEV),
32 block_all(MAX_BDEV),
33 block_total(MAX_BDEV, 0)
34{
35}
36
37BlueFS::~BlueFS()
38{
39 for (auto p : ioc) {
40 if (p)
41 p->aio_wait();
42 }
43 for (auto p : bdev) {
44 if (p) {
45 p->close();
46 delete p;
47 }
48 }
49 for (auto p : ioc) {
50 delete p;
51 }
52}
53
54void BlueFS::_init_logger()
55{
56 PerfCountersBuilder b(cct, "bluefs",
57 l_bluefs_first, l_bluefs_last);
58 b.add_u64_counter(l_bluefs_gift_bytes, "gift_bytes",
59 "Bytes gifted from BlueStore");
60 b.add_u64_counter(l_bluefs_reclaim_bytes, "reclaim_bytes",
61 "Bytes reclaimed by BlueStore");
62 b.add_u64(l_bluefs_db_total_bytes, "db_total_bytes",
63 "Total bytes (main db device)",
64 "b", PerfCountersBuilder::PRIO_USEFUL);
65 b.add_u64(l_bluefs_db_used_bytes, "db_used_bytes",
66 "Used bytes (main db device)",
67 "u", PerfCountersBuilder::PRIO_USEFUL);
68 b.add_u64(l_bluefs_wal_total_bytes, "wal_total_bytes",
69 "Total bytes (wal device)",
70 "walb", PerfCountersBuilder::PRIO_USEFUL);
71 b.add_u64(l_bluefs_wal_used_bytes, "wal_used_bytes",
72 "Used bytes (wal device)",
73 "walu", PerfCountersBuilder::PRIO_USEFUL);
74 b.add_u64(l_bluefs_slow_total_bytes, "slow_total_bytes",
75 "Total bytes (slow device)",
76 "slob", PerfCountersBuilder::PRIO_USEFUL);
77 b.add_u64(l_bluefs_slow_used_bytes, "slow_used_bytes",
78 "Used bytes (slow device)",
79 "slou", PerfCountersBuilder::PRIO_USEFUL);
80 b.add_u64(l_bluefs_num_files, "num_files", "File count",
81 "f", PerfCountersBuilder::PRIO_USEFUL);
82 b.add_u64(l_bluefs_log_bytes, "log_bytes", "Size of the metadata log",
83 "jlen", PerfCountersBuilder::PRIO_INTERESTING);
84 b.add_u64_counter(l_bluefs_log_compactions, "log_compactions",
85 "Compactions of the metadata log");
86 b.add_u64_counter(l_bluefs_logged_bytes, "logged_bytes",
87 "Bytes written to the metadata log", "j",
88 PerfCountersBuilder::PRIO_CRITICAL);
89 b.add_u64_counter(l_bluefs_files_written_wal, "files_written_wal",
90 "Files written to WAL");
91 b.add_u64_counter(l_bluefs_files_written_sst, "files_written_sst",
92 "Files written to SSTs");
93 b.add_u64_counter(l_bluefs_bytes_written_wal, "bytes_written_wal",
94 "Bytes written to WAL", "wal",
95 PerfCountersBuilder::PRIO_CRITICAL);
96 b.add_u64_counter(l_bluefs_bytes_written_sst, "bytes_written_sst",
97 "Bytes written to SSTs", "sst",
98 PerfCountersBuilder::PRIO_CRITICAL);
99 logger = b.create_perf_counters();
100 cct->get_perfcounters_collection()->add(logger);
101}
102
103void BlueFS::_shutdown_logger()
104{
105 cct->get_perfcounters_collection()->remove(logger);
106 delete logger;
107}
108
109void BlueFS::_update_logger_stats()
110{
111 // we must be holding the lock
112 logger->set(l_bluefs_num_files, file_map.size());
113 logger->set(l_bluefs_log_bytes, log_writer->file->fnode.size);
114
115 if (alloc[BDEV_WAL]) {
116 logger->set(l_bluefs_wal_total_bytes, block_total[BDEV_WAL]);
117 logger->set(l_bluefs_wal_used_bytes,
118 block_total[BDEV_WAL] - alloc[BDEV_WAL]->get_free());
119 }
120 if (alloc[BDEV_DB]) {
121 logger->set(l_bluefs_db_total_bytes, block_total[BDEV_DB]);
122 logger->set(l_bluefs_db_used_bytes,
123 block_total[BDEV_DB] - alloc[BDEV_DB]->get_free());
124 }
125 if (alloc[BDEV_SLOW]) {
126 logger->set(l_bluefs_slow_total_bytes, block_total[BDEV_SLOW]);
127 logger->set(l_bluefs_slow_used_bytes,
128 block_total[BDEV_SLOW] - alloc[BDEV_SLOW]->get_free());
129 }
130}
131
132int BlueFS::add_block_device(unsigned id, string path)
133{
134 dout(10) << __func__ << " bdev " << id << " path " << path << dendl;
135 assert(id < bdev.size());
136 assert(bdev[id] == NULL);
137 BlockDevice *b = BlockDevice::create(cct, path, NULL, NULL);
138 int r = b->open(path);
139 if (r < 0) {
140 delete b;
141 return r;
142 }
143 dout(1) << __func__ << " bdev " << id << " path " << path
144 << " size " << pretty_si_t(b->get_size()) << "B" << dendl;
145 bdev[id] = b;
146 ioc[id] = new IOContext(cct, NULL);
147 return 0;
148}
149
150bool BlueFS::bdev_support_label(unsigned id)
151{
152 assert(id < bdev.size());
153 assert(bdev[id]);
154 return bdev[id]->supported_bdev_label();
155}
156
157uint64_t BlueFS::get_block_device_size(unsigned id)
158{
159 if (id < bdev.size() && bdev[id])
160 return bdev[id]->get_size();
161 return 0;
162}
163
164void BlueFS::add_block_extent(unsigned id, uint64_t offset, uint64_t length)
165{
166 std::unique_lock<std::mutex> l(lock);
167 dout(1) << __func__ << " bdev " << id
168 << " 0x" << std::hex << offset << "~" << length << std::dec
169 << dendl;
170 assert(id < bdev.size());
171 assert(bdev[id]);
172 assert(bdev[id]->get_size() >= offset + length);
173 block_all[id].insert(offset, length);
174 block_total[id] += length;
175
176 if (id < alloc.size() && alloc[id]) {
177 log_t.op_alloc_add(id, offset, length);
178 int r = _flush_and_sync_log(l);
179 assert(r == 0);
180 alloc[id]->init_add_free(offset, length);
181 }
182
183 if (logger)
184 logger->inc(l_bluefs_gift_bytes, length);
185 dout(10) << __func__ << " done" << dendl;
186}
187
188int BlueFS::reclaim_blocks(unsigned id, uint64_t want,
189 AllocExtentVector *extents)
190{
191 std::unique_lock<std::mutex> l(lock);
192 dout(1) << __func__ << " bdev " << id
193 << " want 0x" << std::hex << want << std::dec << dendl;
194 assert(id < alloc.size());
195 assert(alloc[id]);
196 int r = alloc[id]->reserve(want);
197 assert(r == 0); // caller shouldn't ask for more than they can get
198 int64_t got = alloc[id]->allocate(want, cct->_conf->bluefs_alloc_size, 0,
199 extents);
200 if (got < (int64_t)want) {
201 alloc[id]->unreserve(want - MAX(0, got));
202 }
203 if (got <= 0) {
204 derr << __func__ << " failed to allocate space to return to bluestore"
205 << dendl;
206 alloc[id]->dump();
207 return got;
208 }
209
210 for (auto& p : *extents) {
211 block_all[id].erase(p.offset, p.length);
212 block_total[id] -= p.length;
213 log_t.op_alloc_rm(id, p.offset, p.length);
214 }
215
216 flush_bdev();
217 r = _flush_and_sync_log(l);
218 assert(r == 0);
219
220 if (logger)
221 logger->inc(l_bluefs_reclaim_bytes, got);
222 dout(1) << __func__ << " bdev " << id << " want 0x" << std::hex << want
223 << " got " << *extents << dendl;
224 return 0;
225}
226
227uint64_t BlueFS::get_fs_usage()
228{
229 std::lock_guard<std::mutex> l(lock);
230 uint64_t total_bytes = 0;
231 for (auto& p : file_map) {
232 total_bytes += p.second->fnode.get_allocated();
233 }
234 return total_bytes;
235}
236
237uint64_t BlueFS::get_total(unsigned id)
238{
239 std::lock_guard<std::mutex> l(lock);
240 assert(id < block_all.size());
241 return block_total[id];
242}
243
244uint64_t BlueFS::get_free(unsigned id)
245{
246 std::lock_guard<std::mutex> l(lock);
247 assert(id < alloc.size());
248 return alloc[id]->get_free();
249}
250
251void BlueFS::dump_perf_counters(Formatter *f)
252{
253 f->open_object_section("bluefs_perf_counters");
254 logger->dump_formatted(f,0);
255 f->close_section();
256}
257
258
259void BlueFS::get_usage(vector<pair<uint64_t,uint64_t>> *usage)
260{
261 std::lock_guard<std::mutex> l(lock);
262 usage->resize(bdev.size());
263 for (unsigned id = 0; id < bdev.size(); ++id) {
264 if (!bdev[id]) {
265 (*usage)[id] = make_pair(0, 0);
266 continue;
267 }
268 (*usage)[id].first = alloc[id]->get_free();
269 (*usage)[id].second = block_total[id];
270 uint64_t used =
271 (block_total[id] - (*usage)[id].first) * 100 / block_total[id];
272 dout(10) << __func__ << " bdev " << id
273 << " free " << (*usage)[id].first
274 << " (" << pretty_si_t((*usage)[id].first) << "B)"
275 << " / " << (*usage)[id].second
276 << " (" << pretty_si_t((*usage)[id].second) << "B)"
277 << ", used " << used << "%"
278 << dendl;
279 }
280}
281
282int BlueFS::get_block_extents(unsigned id, interval_set<uint64_t> *extents)
283{
284 std::lock_guard<std::mutex> l(lock);
285 dout(10) << __func__ << " bdev " << id << dendl;
286 if (id >= block_all.size())
287 return -EINVAL;
288 *extents = block_all[id];
289 return 0;
290}
291
292int BlueFS::mkfs(uuid_d osd_uuid)
293{
294 std::unique_lock<std::mutex> l(lock);
295 dout(1) << __func__
296 << " osd_uuid " << osd_uuid
297 << dendl;
298
299 _init_alloc();
300 _init_logger();
301
302 super.version = 1;
303 super.block_size = bdev[BDEV_DB]->get_block_size();
304 super.osd_uuid = osd_uuid;
305 super.uuid.generate_random();
306 dout(1) << __func__ << " uuid " << super.uuid << dendl;
307
308 // init log
309 FileRef log_file = new File;
310 log_file->fnode.ino = 1;
311 log_file->fnode.prefer_bdev = BDEV_WAL;
312 int r = _allocate(
313 log_file->fnode.prefer_bdev,
314 cct->_conf->bluefs_max_log_runway,
315 &log_file->fnode.extents);
316 log_file->fnode.recalc_allocated();
317 assert(r == 0);
318 log_writer = _create_writer(log_file);
319
320 // initial txn
321 log_t.op_init();
322 for (unsigned bdev = 0; bdev < MAX_BDEV; ++bdev) {
323 interval_set<uint64_t>& p = block_all[bdev];
324 if (p.empty())
325 continue;
326 for (interval_set<uint64_t>::iterator q = p.begin(); q != p.end(); ++q) {
327 dout(20) << __func__ << " op_alloc_add " << bdev << " 0x"
328 << std::hex << q.get_start() << "~" << q.get_len() << std::dec
329 << dendl;
330 log_t.op_alloc_add(bdev, q.get_start(), q.get_len());
331 }
332 }
333 _flush_and_sync_log(l);
334
335 // write supers
336 super.log_fnode = log_file->fnode;
337 _write_super();
338 flush_bdev();
339
340 // clean up
341 super = bluefs_super_t();
342 _close_writer(log_writer);
343 log_writer = NULL;
344 block_all.clear();
345 block_total.clear();
346 _stop_alloc();
347 _shutdown_logger();
348
349 dout(10) << __func__ << " success" << dendl;
350 return 0;
351}
352
353void BlueFS::_init_alloc()
354{
355 dout(20) << __func__ << dendl;
356 alloc.resize(MAX_BDEV);
357 pending_release.resize(MAX_BDEV);
358 for (unsigned id = 0; id < bdev.size(); ++id) {
359 if (!bdev[id]) {
360 continue;
361 }
362 assert(bdev[id]->get_size());
363 alloc[id] = Allocator::create(cct, cct->_conf->bluefs_allocator,
364 bdev[id]->get_size(),
365 cct->_conf->bluefs_alloc_size);
366 interval_set<uint64_t>& p = block_all[id];
367 for (interval_set<uint64_t>::iterator q = p.begin(); q != p.end(); ++q) {
368 alloc[id]->init_add_free(q.get_start(), q.get_len());
369 }
370 }
371}
372
373void BlueFS::_stop_alloc()
374{
375 dout(20) << __func__ << dendl;
376 for (auto p : alloc) {
377 if (p != nullptr) {
378 p->shutdown();
379 delete p;
380 }
381 }
382 alloc.clear();
383}
384
385int BlueFS::mount()
386{
387 dout(1) << __func__ << dendl;
388
389 int r = _open_super();
390 if (r < 0) {
391 derr << __func__ << " failed to open super: " << cpp_strerror(r) << dendl;
392 goto out;
393 }
394
395 block_all.clear();
396 block_all.resize(MAX_BDEV);
397 block_total.clear();
398 block_total.resize(MAX_BDEV, 0);
399 _init_alloc();
400
401 r = _replay(false);
402 if (r < 0) {
403 derr << __func__ << " failed to replay log: " << cpp_strerror(r) << dendl;
404 _stop_alloc();
405 goto out;
406 }
407
408 // init freelist
409 for (auto& p : file_map) {
410 dout(30) << __func__ << " noting alloc for " << p.second->fnode << dendl;
411 for (auto& q : p.second->fnode.extents) {
412 alloc[q.bdev]->init_rm_free(q.offset, q.length);
413 }
414 }
415
416 // set up the log for future writes
417 log_writer = _create_writer(_get_file(1));
418 assert(log_writer->file->fnode.ino == 1);
419 log_writer->pos = log_writer->file->fnode.size;
420 dout(10) << __func__ << " log write pos set to 0x"
421 << std::hex << log_writer->pos << std::dec
422 << dendl;
423
424 _init_logger();
425 return 0;
426
427 out:
428 super = bluefs_super_t();
429 return r;
430}
431
432void BlueFS::umount()
433{
434 dout(1) << __func__ << dendl;
435
436 sync_metadata();
437
438 _close_writer(log_writer);
439 log_writer = NULL;
440
441 _stop_alloc();
442 file_map.clear();
443 dir_map.clear();
444 super = bluefs_super_t();
445 log_t.clear();
446 _shutdown_logger();
447}
448
449void BlueFS::collect_metadata(map<string,string> *pm)
450{
451 if (bdev[BDEV_DB])
452 bdev[BDEV_DB]->collect_metadata("bluefs_db_", pm);
453 if (bdev[BDEV_WAL])
454 bdev[BDEV_WAL]->collect_metadata("bluefs_wal_", pm);
455 if (bdev[BDEV_SLOW])
456 bdev[BDEV_SLOW]->collect_metadata("bluefs_slow_", pm);
457}
458
459int BlueFS::fsck()
460{
461 std::lock_guard<std::mutex> l(lock);
462 dout(1) << __func__ << dendl;
463 // hrm, i think we check everything on mount...
464 return 0;
465}
466
467int BlueFS::_write_super()
468{
469 // build superblock
470 bufferlist bl;
471 ::encode(super, bl);
472 uint32_t crc = bl.crc32c(-1);
473 ::encode(crc, bl);
474 dout(10) << __func__ << " super block length(encoded): " << bl.length() << dendl;
475 dout(10) << __func__ << " superblock " << super.version << dendl;
476 dout(10) << __func__ << " log_fnode " << super.log_fnode << dendl;
477 assert(bl.length() <= get_super_length());
478 bl.append_zero(get_super_length() - bl.length());
479
480 bdev[BDEV_DB]->write(get_super_offset(), bl, false);
481 dout(20) << __func__ << " v " << super.version
482 << " crc 0x" << std::hex << crc
483 << " offset 0x" << get_super_offset() << std::dec
484 << dendl;
485 return 0;
486}
487
488int BlueFS::_open_super()
489{
490 dout(10) << __func__ << dendl;
491
492 bufferlist bl;
493 uint32_t expected_crc, crc;
494 int r;
495
496 // always the second block
497 r = bdev[BDEV_DB]->read(get_super_offset(), get_super_length(),
498 &bl, ioc[BDEV_DB], false);
499 if (r < 0)
500 return r;
501
502 bufferlist::iterator p = bl.begin();
503 ::decode(super, p);
504 {
505 bufferlist t;
506 t.substr_of(bl, 0, p.get_off());
507 crc = t.crc32c(-1);
508 }
509 ::decode(expected_crc, p);
510 if (crc != expected_crc) {
511 derr << __func__ << " bad crc on superblock, expected 0x"
512 << std::hex << expected_crc << " != actual 0x" << crc << std::dec
513 << dendl;
514 return -EIO;
515 }
516 dout(10) << __func__ << " superblock " << super.version << dendl;
517 dout(10) << __func__ << " log_fnode " << super.log_fnode << dendl;
518 return 0;
519}
520
521int BlueFS::_replay(bool noop)
522{
523 dout(10) << __func__ << (noop ? " NO-OP" : "") << dendl;
524 ino_last = 1; // by the log
525 log_seq = 0;
526
527 FileRef log_file;
528 if (noop) {
529 log_file = new File;
530 } else {
531 log_file = _get_file(1);
532 }
533 log_file->fnode = super.log_fnode;
534 dout(10) << __func__ << " log_fnode " << super.log_fnode << dendl;
535
536 FileReader *log_reader = new FileReader(
537 log_file, cct->_conf->bluefs_max_prefetch,
538 false, // !random
539 true); // ignore eof
540 while (true) {
541 assert((log_reader->buf.pos & ~super.block_mask()) == 0);
542 uint64_t pos = log_reader->buf.pos;
543 uint64_t read_pos = pos;
544 bufferlist bl;
545 {
546 int r = _read(log_reader, &log_reader->buf, read_pos, super.block_size,
547 &bl, NULL);
548 assert(r == (int)super.block_size);
549 read_pos += r;
550 }
551 uint64_t more = 0;
552 uint64_t seq;
553 uuid_d uuid;
554 {
555 bufferlist::iterator p = bl.begin();
556 __u8 a, b;
557 uint32_t len;
558 ::decode(a, p);
559 ::decode(b, p);
560 ::decode(len, p);
561 ::decode(uuid, p);
562 ::decode(seq, p);
563 if (len + 6 > bl.length()) {
564 more = ROUND_UP_TO(len + 6 - bl.length(), super.block_size);
565 }
566 }
567 if (uuid != super.uuid) {
568 dout(10) << __func__ << " 0x" << std::hex << pos << std::dec
569 << ": stop: uuid " << uuid << " != super.uuid " << super.uuid
570 << dendl;
571 break;
572 }
573 if (seq != log_seq + 1) {
574 dout(10) << __func__ << " 0x" << std::hex << pos << std::dec
575 << ": stop: seq " << seq << " != expected " << log_seq + 1
576 << dendl;
577 break;
578 }
579 if (more) {
580 dout(20) << __func__ << " need 0x" << std::hex << more << std::dec
581 << " more bytes" << dendl;
582 bufferlist t;
583 int r = _read(log_reader, &log_reader->buf, read_pos, more, &t, NULL);
584 if (r < (int)more) {
585 dout(10) << __func__ << " 0x" << std::hex << pos
586 << ": stop: len is 0x" << bl.length() + more << std::dec
587 << ", which is past eof" << dendl;
588 break;
589 }
590 assert(r == (int)more);
591 bl.claim_append(t);
592 read_pos += r;
593 }
594 bluefs_transaction_t t;
595 try {
596 bufferlist::iterator p = bl.begin();
597 ::decode(t, p);
598 }
599 catch (buffer::error& e) {
600 dout(10) << __func__ << " 0x" << std::hex << pos << std::dec
601 << ": stop: failed to decode: " << e.what()
602 << dendl;
603 delete log_reader;
604 return -EIO;
605 }
606 assert(seq == t.seq);
607 dout(10) << __func__ << " 0x" << std::hex << pos << std::dec
608 << ": " << t << dendl;
609
610 bufferlist::iterator p = t.op_bl.begin();
611 while (!p.end()) {
612 __u8 op;
613 ::decode(op, p);
614 switch (op) {
615
616 case bluefs_transaction_t::OP_INIT:
617 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
618 << ": op_init" << dendl;
619 assert(t.seq == 1);
620 break;
621
622 case bluefs_transaction_t::OP_JUMP:
623 {
624 uint64_t next_seq;
625 uint64_t offset;
626 ::decode(next_seq, p);
627 ::decode(offset, p);
628 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
629 << ": op_jump seq " << next_seq
630 << " offset 0x" << std::hex << offset << std::dec << dendl;
631 assert(next_seq >= log_seq);
632 log_seq = next_seq - 1; // we will increment it below
633 uint64_t skip = offset - read_pos;
634 if (skip) {
635 bufferlist junk;
636 int r = _read(log_reader, &log_reader->buf, read_pos, skip, &junk,
637 NULL);
638 if (r != (int)skip) {
639 dout(10) << __func__ << " 0x" << std::hex << read_pos
640 << ": stop: failed to skip to " << offset
641 << std::dec << dendl;
642 assert(0 == "problem with op_jump");
643 }
644 }
645 }
646 break;
647
648 case bluefs_transaction_t::OP_JUMP_SEQ:
649 {
650 uint64_t next_seq;
651 ::decode(next_seq, p);
652 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
653 << ": op_jump_seq " << next_seq << dendl;
654 assert(next_seq >= log_seq);
655 log_seq = next_seq - 1; // we will increment it below
656 }
657 break;
658
659 case bluefs_transaction_t::OP_ALLOC_ADD:
660 {
661 __u8 id;
662 uint64_t offset, length;
663 ::decode(id, p);
664 ::decode(offset, p);
665 ::decode(length, p);
666 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
667 << ": op_alloc_add " << " " << (int)id
668 << ":0x" << std::hex << offset << "~" << length << std::dec
669 << dendl;
670 if (!noop) {
671 block_all[id].insert(offset, length);
672 block_total[id] += length;
673 alloc[id]->init_add_free(offset, length);
674 }
675 }
676 break;
677
678 case bluefs_transaction_t::OP_ALLOC_RM:
679 {
680 __u8 id;
681 uint64_t offset, length;
682 ::decode(id, p);
683 ::decode(offset, p);
684 ::decode(length, p);
685 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
686 << ": op_alloc_rm " << " " << (int)id
687 << ":0x" << std::hex << offset << "~" << length << std::dec
688 << dendl;
689 if (!noop) {
690 block_all[id].erase(offset, length);
691 block_total[id] -= length;
692 alloc[id]->init_rm_free(offset, length);
693 }
694 }
695 break;
696
697 case bluefs_transaction_t::OP_DIR_LINK:
698 {
699 string dirname, filename;
700 uint64_t ino;
701 ::decode(dirname, p);
702 ::decode(filename, p);
703 ::decode(ino, p);
704 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
705 << ": op_dir_link " << " " << dirname << "/" << filename
706 << " to " << ino
707 << dendl;
708 if (!noop) {
709 FileRef file = _get_file(ino);
710 assert(file->fnode.ino);
711 map<string,DirRef>::iterator q = dir_map.find(dirname);
712 assert(q != dir_map.end());
713 map<string,FileRef>::iterator r = q->second->file_map.find(filename);
714 assert(r == q->second->file_map.end());
715 q->second->file_map[filename] = file;
716 ++file->refs;
717 }
718 }
719 break;
720
721 case bluefs_transaction_t::OP_DIR_UNLINK:
722 {
723 string dirname, filename;
724 ::decode(dirname, p);
725 ::decode(filename, p);
726 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
727 << ": op_dir_unlink " << " " << dirname << "/" << filename
728 << dendl;
729 if (!noop) {
730 map<string,DirRef>::iterator q = dir_map.find(dirname);
731 assert(q != dir_map.end());
732 map<string,FileRef>::iterator r = q->second->file_map.find(filename);
733 assert(r != q->second->file_map.end());
734 assert(r->second->refs > 0);
735 --r->second->refs;
736 q->second->file_map.erase(r);
737 }
738 }
739 break;
740
741 case bluefs_transaction_t::OP_DIR_CREATE:
742 {
743 string dirname;
744 ::decode(dirname, p);
745 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
746 << ": op_dir_create " << dirname << dendl;
747 if (!noop) {
748 map<string,DirRef>::iterator q = dir_map.find(dirname);
749 assert(q == dir_map.end());
750 dir_map[dirname] = new Dir;
751 }
752 }
753 break;
754
755 case bluefs_transaction_t::OP_DIR_REMOVE:
756 {
757 string dirname;
758 ::decode(dirname, p);
759 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
760 << ": op_dir_remove " << dirname << dendl;
761 if (!noop) {
762 map<string,DirRef>::iterator q = dir_map.find(dirname);
763 assert(q != dir_map.end());
764 assert(q->second->file_map.empty());
765 dir_map.erase(q);
766 }
767 }
768 break;
769
770 case bluefs_transaction_t::OP_FILE_UPDATE:
771 {
772 bluefs_fnode_t fnode;
773 ::decode(fnode, p);
774 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
775 << ": op_file_update " << " " << fnode << dendl;
776 if (!noop) {
777 FileRef f = _get_file(fnode.ino);
778 f->fnode = fnode;
779 if (fnode.ino > ino_last) {
780 ino_last = fnode.ino;
781 }
782 }
783 }
784 break;
785
786 case bluefs_transaction_t::OP_FILE_REMOVE:
787 {
788 uint64_t ino;
789 ::decode(ino, p);
790 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
791 << ": op_file_remove " << ino << dendl;
792 if (!noop) {
793 auto p = file_map.find(ino);
794 assert(p != file_map.end());
795 file_map.erase(p);
796 }
797 }
798 break;
799
800 default:
801 derr << __func__ << " 0x" << std::hex << pos << std::dec
802 << ": stop: unrecognized op " << (int)op << dendl;
803 delete log_reader;
804 return -EIO;
805 }
806 }
807 assert(p.end());
808
809 // we successfully replayed the transaction; bump the seq and log size
810 ++log_seq;
811 log_file->fnode.size = log_reader->buf.pos;
812 }
813
814 dout(10) << __func__ << " log file size was 0x"
815 << std::hex << log_file->fnode.size << std::dec << dendl;
816 delete log_reader;
817
818 if (!noop) {
819 // verify file link counts are all >0
820 for (auto& p : file_map) {
821 if (p.second->refs == 0 &&
822 p.second->fnode.ino > 1) {
823 derr << __func__ << " file with link count 0: " << p.second->fnode
824 << dendl;
825 return -EIO;
826 }
827 }
828 }
829
830 dout(10) << __func__ << " done" << dendl;
831 return 0;
832}
833
834BlueFS::FileRef BlueFS::_get_file(uint64_t ino)
835{
836 auto p = file_map.find(ino);
837 if (p == file_map.end()) {
838 FileRef f = new File;
839 file_map[ino] = f;
840 dout(30) << __func__ << " ino " << ino << " = " << f
841 << " (new)" << dendl;
842 return f;
843 } else {
844 dout(30) << __func__ << " ino " << ino << " = " << p->second << dendl;
845 return p->second;
846 }
847}
848
849void BlueFS::_drop_link(FileRef file)
850{
851 dout(20) << __func__ << " had refs " << file->refs
852 << " on " << file->fnode << dendl;
853 assert(file->refs > 0);
854 --file->refs;
855 if (file->refs == 0) {
856 dout(20) << __func__ << " destroying " << file->fnode << dendl;
857 assert(file->num_reading.load() == 0);
858 log_t.op_file_remove(file->fnode.ino);
859 for (auto& r : file->fnode.extents) {
860 pending_release[r.bdev].insert(r.offset, r.length);
861 }
862 file_map.erase(file->fnode.ino);
863 file->deleted = true;
864 file->fnode.recalc_allocated();
865 if (file->dirty_seq) {
866 assert(file->dirty_seq > log_seq_stable);
867 assert(dirty_files.count(file->dirty_seq));
868 auto it = dirty_files[file->dirty_seq].iterator_to(*file);
869 dirty_files[file->dirty_seq].erase(it);
870 file->dirty_seq = 0;
871 }
872 }
873}
874
875int BlueFS::_read_random(
876 FileReader *h, ///< [in] read from here
877 uint64_t off, ///< [in] offset
878 size_t len, ///< [in] this many bytes
879 char *out) ///< [out] optional: or copy it here
880{
881 dout(10) << __func__ << " h " << h
882 << " 0x" << std::hex << off << "~" << len << std::dec
883 << " from " << h->file->fnode << dendl;
884
885 ++h->file->num_reading;
886
887 if (!h->ignore_eof &&
888 off + len > h->file->fnode.size) {
889 if (off > h->file->fnode.size)
890 len = 0;
891 else
892 len = h->file->fnode.size - off;
893 dout(20) << __func__ << " reaching (or past) eof, len clipped to 0x"
894 << std::hex << len << std::dec << dendl;
895 }
896
897 int ret = 0;
898 while (len > 0) {
899 uint64_t x_off = 0;
900 auto p = h->file->fnode.seek(off, &x_off);
901 uint64_t l = MIN(p->length - x_off, len);
902 dout(20) << __func__ << " read buffered 0x"
903 << std::hex << x_off << "~" << l << std::dec
904 << " of " << *p << dendl;
905 int r = bdev[p->bdev]->read_random(p->offset + x_off, l, out,
906 cct->_conf->bluefs_buffered_io);
907 assert(r == 0);
908 off += l;
909 len -= l;
910 ret += l;
911 out += l;
912 }
913
914 dout(20) << __func__ << " got " << ret << dendl;
915 --h->file->num_reading;
916 return ret;
917}
918
// Buffered read of `len` bytes at file offset `off` through `buf`.
//
// Unless h->ignore_eof is set, the request is clipped to the file size.
// Whenever `off` falls outside the range currently held in buf->bl, the
// buffer is refilled from the extent containing the block-aligned offset.
// The bytes are then handed out via `outbl` (appended) and/or `out`
// (copied); either may be null.  buf->pos advances by the bytes returned.
// The file's num_reading count is raised for the duration of the call.
// Returns the number of bytes read.
919int BlueFS::_read(
920 FileReader *h, ///< [in] read from here
921 FileReaderBuffer *buf, ///< [in] reader state
922 uint64_t off, ///< [in] offset
923 size_t len, ///< [in] this many bytes
924 bufferlist *outbl, ///< [out] optional: reference the result here
925 char *out) ///< [out] optional: or copy it here
926{
927 dout(10) << __func__ << " h " << h
928 << " 0x" << std::hex << off << "~" << len << std::dec
929 << " from " << h->file->fnode << dendl;
930
931 ++h->file->num_reading;
932
// clip the request at eof: nothing past the file size, zero if off is beyond
933 if (!h->ignore_eof &&
934 off + len > h->file->fnode.size) {
935 if (off > h->file->fnode.size)
936 len = 0;
937 else
938 len = h->file->fnode.size - off;
939 dout(20) << __func__ << " reaching (or past) eof, len clipped to 0x"
940 << std::hex << len << std::dec << dendl;
941 }
942 if (outbl)
943 outbl->clear();
944
945 int ret = 0;
946 while (len > 0) {
947 size_t left;
// refill when off is not covered by the buffered range
948 if (off < buf->bl_off || off >= buf->get_buf_end()) {
949 buf->bl.clear();
// the buffer always starts on a block boundary
950 buf->bl_off = off & super.block_mask();
951 uint64_t x_off = 0;
952 auto p = h->file->fnode.seek(buf->bl_off, &x_off);
// want: the request rounded out to whole blocks, but at least max_prefetch
953 uint64_t want = ROUND_UP_TO(len + (off & ~super.block_mask()),
954 super.block_size);
955 want = MAX(want, buf->max_prefetch);
// a single device read never crosses the end of the current extent
956 uint64_t l = MIN(p->length - x_off, want);
957 uint64_t eof_offset = ROUND_UP_TO(h->file->fnode.size, super.block_size);
// and, unless eof is ignored, never goes past the block-rounded file size
958 if (!h->ignore_eof &&
959 buf->bl_off + l > eof_offset) {
960 l = eof_offset - buf->bl_off;
961 }
962 dout(20) << __func__ << " fetching 0x"
963 << std::hex << x_off << "~" << l << std::dec
964 << " of " << *p << dendl;
965 int r = bdev[p->bdev]->read(p->offset + x_off, l, &buf->bl, ioc[p->bdev],
966 cct->_conf->bluefs_buffered_io);
967 assert(r == 0);
968 }
969 left = buf->get_buf_remaining(off);
970 dout(20) << __func__ << " left 0x" << std::hex << left
971 << " len 0x" << len << std::dec << dendl;
972
// hand out whatever the buffer can satisfy in this round
973 int r = MIN(len, left);
974 if (outbl) {
975 bufferlist t;
976 t.substr_of(buf->bl, off - buf->bl_off, r);
977 outbl->claim_append(t);
978 }
979 if (out) {
980 // NOTE: h->bl is normally a contiguous buffer so c_str() is free.
981 memcpy(out, buf->bl.c_str() + off - buf->bl_off, r);
982 out += r;
983 }
984
// NOTE(review): the statements from here to `*_dout << dendl` appear to rely
// on dout() opening a scope that dendl closes, so the hexdump would only be
// built at debug level 30 -- confirm against common/debug.h before
// reordering anything in this section.
985 dout(30) << __func__ << " result chunk (0x"
986 << std::hex << r << std::dec << " bytes):\n";
987 bufferlist t;
988 t.substr_of(buf->bl, off - buf->bl_off, r);
989 t.hexdump(*_dout);
990 *_dout << dendl;
991
992 off += r;
993 len -= r;
994 ret += r;
995 buf->pos += r;
996 }
997
998 dout(20) << __func__ << " got " << ret << dendl;
999 assert(!outbl || (int)outbl->length() == ret);
1000 --h->file->num_reading;
1001 return ret;
1002}
1003
1004void BlueFS::_invalidate_cache(FileRef f, uint64_t offset, uint64_t length)
1005{
1006 dout(10) << __func__ << " file " << f->fnode
1007 << " 0x" << std::hex << offset << "~" << length << std::dec
1008 << dendl;
1009 if (offset & ~super.block_mask()) {
1010 offset &= super.block_mask();
1011 length = ROUND_UP_TO(length, super.block_size);
1012 }
1013 uint64_t x_off = 0;
1014 auto p = f->fnode.seek(offset, &x_off);
1015 while (length > 0 && p != f->fnode.extents.end()) {
1016 uint64_t x_len = MIN(p->length - x_off, length);
1017 bdev[p->bdev]->invalidate_cache(p->offset + x_off, x_len);
1018 dout(20) << __func__ << " 0x" << std::hex << x_off << "~" << x_len
1019 << std:: dec << " of " << *p << dendl;
1020 offset += x_len;
1021 length -= x_len;
1022 }
1023}
1024
1025uint64_t BlueFS::_estimate_log_size()
1026{
1027 int avg_dir_size = 40; // fixme
1028 int avg_file_size = 12;
1029 uint64_t size = 4096 * 2;
1030 size += file_map.size() * (1 + sizeof(bluefs_fnode_t));
1031 for (auto& p : block_all)
1032 size += p.num_intervals() * (1 + 1 + sizeof(uint64_t) * 2);
1033 size += dir_map.size() + (1 + avg_dir_size);
1034 size += file_map.size() * (1 + avg_dir_size + avg_file_size);
1035 return ROUND_UP_TO(size, super.block_size);
1036}
1037
1038void BlueFS::compact_log()
1039{
1040 std::unique_lock<std::mutex> l(lock);
1041 if (cct->_conf->bluefs_compact_log_sync) {
1042 _compact_log_sync();
1043 } else {
1044 _compact_log_async(l);
1045 }
1046}
1047
1048bool BlueFS::_should_compact_log()
1049{
1050 uint64_t current = log_writer->file->fnode.size;
1051 uint64_t expected = _estimate_log_size();
1052 float ratio = (float)current / (float)expected;
1053 dout(10) << __func__ << " current 0x" << std::hex << current
1054 << " expected " << expected << std::dec
1055 << " ratio " << ratio
1056 << (new_log ? " (async compaction in progress)" : "")
1057 << dendl;
1058 if (new_log ||
1059 current < cct->_conf->bluefs_log_compact_min_size ||
1060 ratio < cct->_conf->bluefs_log_compact_min_ratio) {
1061 return false;
1062 }
1063 return true;
1064}
1065
1066void BlueFS::_compact_log_dump_metadata(bluefs_transaction_t *t)
1067{
1068 t->seq = 1;
1069 t->uuid = super.uuid;
1070 dout(20) << __func__ << " op_init" << dendl;
1071
1072 t->op_init();
1073 for (unsigned bdev = 0; bdev < MAX_BDEV; ++bdev) {
1074 interval_set<uint64_t>& p = block_all[bdev];
1075 for (interval_set<uint64_t>::iterator q = p.begin(); q != p.end(); ++q) {
1076 dout(20) << __func__ << " op_alloc_add " << bdev << " 0x"
1077 << std::hex << q.get_start() << "~" << q.get_len() << std::dec
1078 << dendl;
1079 t->op_alloc_add(bdev, q.get_start(), q.get_len());
1080 }
1081 }
1082 for (auto& p : file_map) {
1083 if (p.first == 1)
1084 continue;
1085 dout(20) << __func__ << " op_file_update " << p.second->fnode << dendl;
1086 assert(p.first > 1);
1087 t->op_file_update(p.second->fnode);
1088 }
1089 for (auto& p : dir_map) {
1090 dout(20) << __func__ << " op_dir_create " << p.first << dendl;
1091 t->op_dir_create(p.first);
1092 for (auto& q : p.second->file_map) {
1093 dout(20) << __func__ << " op_dir_link " << p.first << "/" << q.first
1094 << " to " << q.second->fnode.ino << dendl;
1095 t->op_dir_link(p.first, q.first, q.second->fnode.ino);
1096 }
1097 }
1098}
1099
1100void BlueFS::_compact_log_sync()
1101{
1102 dout(10) << __func__ << dendl;
1103 File *log_file = log_writer->file.get();
1104
1105 // clear out log (be careful who calls us!!!)
1106 log_t.clear();
1107
1108 bluefs_transaction_t t;
1109 _compact_log_dump_metadata(&t);
1110
1111 dout(20) << __func__ << " op_jump_seq " << log_seq << dendl;
1112 t.op_jump_seq(log_seq);
1113
1114 bufferlist bl;
1115 ::encode(t, bl);
1116 _pad_bl(bl);
1117
1118 uint64_t need = bl.length() + cct->_conf->bluefs_max_log_runway;
1119 dout(20) << __func__ << " need " << need << dendl;
1120
1121 mempool::bluefs::vector<bluefs_extent_t> old_extents;
1122 old_extents.swap(log_file->fnode.extents);
1123 log_file->fnode.recalc_allocated();
1124 while (log_file->fnode.get_allocated() < need) {
1125 int r = _allocate(log_file->fnode.prefer_bdev,
1126 need - log_file->fnode.get_allocated(),
1127 &log_file->fnode.extents);
1128 log_file->fnode.recalc_allocated();
1129 assert(r == 0);
1130 }
1131
1132 _close_writer(log_writer);
1133
1134 log_file->fnode.size = bl.length();
1135 log_writer = _create_writer(log_file);
1136 log_writer->append(bl);
1137 int r = _flush(log_writer, true);
1138 assert(r == 0);
1139 wait_for_aio(log_writer);
1140
224ce89b
WB
1141 list<aio_t> completed_ios;
1142 _claim_completed_aios(log_writer, &completed_ios);
1143 flush_bdev();
1144 completed_ios.clear();
1145
7c673cae
FG
1146 dout(10) << __func__ << " writing super" << dendl;
1147 super.log_fnode = log_file->fnode;
1148 ++super.version;
1149 _write_super();
1150 flush_bdev();
1151
1152 dout(10) << __func__ << " release old log extents " << old_extents << dendl;
1153 for (auto& r : old_extents) {
1154 pending_release[r.bdev].insert(r.offset, r.length);
1155 }
1156
1157 logger->inc(l_bluefs_log_compactions);
1158}
1159
1160/*
1161 * 1. Allocate a new extent to continue the log, and then log an event
1162 * that jumps the log write position to the new extent. At this point, the
1163 * old extent(s) won't be written to, and reflect everything to compact.
1164 * New events will be written to the new region that we'll keep.
1165 *
1166 * 2. While still holding the lock, encode a bufferlist that dumps all of the
1167 * in-memory fnodes and names. This will become the new beginning of the
1168 * log. The last event will jump to the log continuation extent from #1.
1169 *
1170 * 3. Queue a write to a new extent for the new beginnging of the log.
1171 *
1172 * 4. Drop lock and wait
1173 *
1174 * 5. Retake the lock.
1175 *
1176 * 6. Update the log_fnode to splice in the new beginning.
1177 *
1178 * 7. Write the new superblock.
1179 *
1180 * 8. Release the old log space. Clean up.
1181 */
1182void BlueFS::_compact_log_async(std::unique_lock<std::mutex>& l)
1183{
1184 dout(10) << __func__ << dendl;
1185 File *log_file = log_writer->file.get();
1186 assert(!new_log);
1187 assert(!new_log_writer);
1188
1189 // 1. allocate new log space and jump to it.
1190 old_log_jump_to = log_file->fnode.get_allocated();
1191 uint64_t need = old_log_jump_to + cct->_conf->bluefs_max_log_runway;
1192 dout(10) << __func__ << " old_log_jump_to 0x" << std::hex << old_log_jump_to
1193 << " need 0x" << need << std::dec << dendl;
1194 while (log_file->fnode.get_allocated() < need) {
1195 int r = _allocate(log_file->fnode.prefer_bdev,
1196 cct->_conf->bluefs_max_log_runway,
1197 &log_file->fnode.extents);
1198 assert(r == 0);
1199 log_file->fnode.recalc_allocated();
1200 }
1201 dout(10) << __func__ << " log extents " << log_file->fnode.extents << dendl;
1202
1203 // update the log file change and log a jump to the offset where we want to
1204 // write the new entries
1205 log_t.op_file_update(log_file->fnode);
1206 log_t.op_jump(log_seq, old_log_jump_to);
1207
1208 flush_bdev(); // FIXME?
1209
1210 _flush_and_sync_log(l, 0, old_log_jump_to);
1211
1212 // 2. prepare compacted log
1213 bluefs_transaction_t t;
224ce89b
WB
1214 //avoid record two times in log_t and _compact_log_dump_metadata.
1215 log_t.clear();
7c673cae
FG
1216 _compact_log_dump_metadata(&t);
1217
1218 // conservative estimate for final encoded size
1219 new_log_jump_to = ROUND_UP_TO(t.op_bl.length() + super.block_size * 2,
1220 cct->_conf->bluefs_alloc_size);
1221 t.op_jump(log_seq, new_log_jump_to);
1222
1223 bufferlist bl;
1224 ::encode(t, bl);
1225 _pad_bl(bl);
1226
1227 dout(10) << __func__ << " new_log_jump_to 0x" << std::hex << new_log_jump_to
1228 << std::dec << dendl;
1229
1230 // create a new log [writer]
1231 new_log = new File;
1232 new_log->fnode.ino = 0; // so that _flush_range won't try to log the fnode
1233 int r = _allocate(BlueFS::BDEV_DB, new_log_jump_to,
1234 &new_log->fnode.extents);
1235 assert(r == 0);
1236 new_log->fnode.recalc_allocated();
1237 new_log_writer = _create_writer(new_log);
1238 new_log_writer->append(bl);
1239
1240 // 3. flush
1241 r = _flush(new_log_writer, true);
1242 assert(r == 0);
1243 lock.unlock();
1244
1245 // 4. wait
1246 dout(10) << __func__ << " waiting for compacted log to sync" << dendl;
1247 wait_for_aio(new_log_writer);
224ce89b
WB
1248
1249 list<aio_t> completed_ios;
1250 _claim_completed_aios(new_log_writer, &completed_ios);
7c673cae 1251 flush_bdev();
224ce89b 1252 completed_ios.clear();
7c673cae
FG
1253
1254 // 5. retake lock
1255 lock.lock();
1256
1257 // 6. update our log fnode
1258 // discard first old_log_jump_to extents
1259 dout(10) << __func__ << " remove 0x" << std::hex << old_log_jump_to << std::dec
1260 << " of " << log_file->fnode.extents << dendl;
1261 uint64_t discarded = 0;
1262 mempool::bluefs::vector<bluefs_extent_t> old_extents;
1263 while (discarded < old_log_jump_to) {
1264 assert(!log_file->fnode.extents.empty());
1265 bluefs_extent_t& e = log_file->fnode.extents.front();
1266 bluefs_extent_t temp = e;
1267 if (discarded + e.length <= old_log_jump_to) {
1268 dout(10) << __func__ << " remove old log extent " << e << dendl;
1269 discarded += e.length;
1270 log_file->fnode.extents.erase(log_file->fnode.extents.begin());
1271 } else {
1272 dout(10) << __func__ << " remove front of old log extent " << e << dendl;
1273 uint64_t drop = old_log_jump_to - discarded;
1274 temp.length = drop;
1275 e.offset += drop;
1276 e.length -= drop;
1277 discarded += drop;
1278 dout(10) << __func__ << " kept " << e << " removed " << temp << dendl;
1279 }
1280 old_extents.push_back(temp);
1281 }
1282 new_log->fnode.extents.insert(new_log->fnode.extents.end(),
1283 log_file->fnode.extents.begin(),
1284 log_file->fnode.extents.end());
1285
1286 // clear the extents from old log file, they are added to new log
1287 log_file->fnode.extents.clear();
1288
1289 // swap the log files. New log file is the log file now.
1290 log_file->fnode.extents.swap(new_log->fnode.extents);
1291 log_file->fnode.recalc_allocated();
1292 new_log->fnode.recalc_allocated();
1293 log_writer->pos = log_writer->file->fnode.size =
1294 log_writer->pos - old_log_jump_to + new_log_jump_to;
1295
1296 // 7. write the super block to reflect the changes
1297 dout(10) << __func__ << " writing super" << dendl;
1298 super.log_fnode = log_file->fnode;
1299 ++super.version;
1300 _write_super();
1301
1302 lock.unlock();
1303 flush_bdev();
1304 lock.lock();
1305
1306 // 8. release old space
1307 dout(10) << __func__ << " release old log extents " << old_extents << dendl;
1308 for (auto& r : old_extents) {
1309 pending_release[r.bdev].insert(r.offset, r.length);
1310 }
1311
1312 // delete the new log, remove from the dirty files list
1313 _close_writer(new_log_writer);
1314 if (new_log->dirty_seq) {
1315 assert(dirty_files.count(new_log->dirty_seq));
1316 auto it = dirty_files[new_log->dirty_seq].iterator_to(*new_log);
1317 dirty_files[new_log->dirty_seq].erase(it);
1318 }
1319 new_log_writer = nullptr;
1320 new_log = nullptr;
1321 log_cond.notify_all();
1322
1323 dout(10) << __func__ << " log extents " << log_file->fnode.extents << dendl;
1324 logger->inc(l_bluefs_log_compactions);
1325}
1326
1327void BlueFS::_pad_bl(bufferlist& bl)
1328{
1329 uint64_t partial = bl.length() % super.block_size;
1330 if (partial) {
1331 dout(10) << __func__ << " padding with 0x" << std::hex
1332 << super.block_size - partial << " zeros" << std::dec << dendl;
1333 bl.append_zero(super.block_size - partial);
1334 }
1335}
1336
1337void BlueFS::flush_log()
1338{
1339 std::unique_lock<std::mutex> l(lock);
1340 flush_bdev();
1341 _flush_and_sync_log(l);
1342}
1343
1344int BlueFS::_flush_and_sync_log(std::unique_lock<std::mutex>& l,
1345 uint64_t want_seq,
1346 uint64_t jump_to)
1347{
1348 while (log_flushing) {
1349 dout(10) << __func__ << " want_seq " << want_seq
1350 << " log is currently flushing, waiting" << dendl;
1351 log_cond.wait(l);
1352 }
1353 if (want_seq && want_seq <= log_seq_stable) {
1354 dout(10) << __func__ << " want_seq " << want_seq << " <= log_seq_stable "
1355 << log_seq_stable << ", done" << dendl;
1356 return 0;
1357 }
1358 if (log_t.empty() && dirty_files.empty()) {
1359 dout(10) << __func__ << " want_seq " << want_seq
1360 << " " << log_t << " not dirty, dirty_files empty, no-op" << dendl;
1361 return 0;
1362 }
1363
1364 uint64_t seq = log_t.seq = ++log_seq;
1365 assert(want_seq == 0 || want_seq <= seq);
1366 log_t.uuid = super.uuid;
1367
1368 // log dirty files
1369 auto lsi = dirty_files.find(seq);
1370 if (lsi != dirty_files.end()) {
1371 dout(20) << __func__ << " " << lsi->second.size() << " dirty_files" << dendl;
1372 for (auto &f : lsi->second) {
1373 dout(20) << __func__ << " op_file_update " << f.fnode << dendl;
1374 log_t.op_file_update(f.fnode);
1375 }
1376 }
1377
1378 dout(10) << __func__ << " " << log_t << dendl;
1379 assert(!log_t.empty());
1380
1381 // allocate some more space (before we run out)?
1382 int64_t runway = log_writer->file->fnode.get_allocated() -
1383 log_writer->get_effective_write_pos();
1384 if (runway < (int64_t)cct->_conf->bluefs_min_log_runway) {
1385 dout(10) << __func__ << " allocating more log runway (0x"
1386 << std::hex << runway << std::dec << " remaining)" << dendl;
1387 while (new_log_writer) {
1388 dout(10) << __func__ << " waiting for async compaction" << dendl;
1389 log_cond.wait(l);
1390 }
1391 int r = _allocate(log_writer->file->fnode.prefer_bdev,
1392 cct->_conf->bluefs_max_log_runway,
1393 &log_writer->file->fnode.extents);
1394 assert(r == 0);
1395 log_writer->file->fnode.recalc_allocated();
1396 log_t.op_file_update(log_writer->file->fnode);
1397 }
1398
1399 bufferlist bl;
1400 ::encode(log_t, bl);
1401
1402 // pad to block boundary
1403 _pad_bl(bl);
1404 logger->inc(l_bluefs_logged_bytes, bl.length());
1405
1406 log_writer->append(bl);
1407
1408 log_t.clear();
1409 log_t.seq = 0; // just so debug output is less confusing
1410 log_flushing = true;
1411
1412 int r = _flush(log_writer, true);
1413 assert(r == 0);
1414
1415 if (jump_to) {
1416 dout(10) << __func__ << " jumping log offset from 0x" << std::hex
1417 << log_writer->pos << " -> 0x" << jump_to << std::dec << dendl;
1418 log_writer->pos = jump_to;
1419 log_writer->file->fnode.size = jump_to;
1420 }
1421
1422 _flush_bdev_safely(log_writer);
1423
1424 log_flushing = false;
1425 log_cond.notify_all();
1426
1427 // clean dirty files
1428 if (seq > log_seq_stable) {
1429 log_seq_stable = seq;
1430 dout(20) << __func__ << " log_seq_stable " << log_seq_stable << dendl;
1431
1432 auto p = dirty_files.begin();
1433 while (p != dirty_files.end()) {
1434 if (p->first > log_seq_stable) {
1435 dout(20) << __func__ << " done cleaning up dirty files" << dendl;
1436 break;
1437 }
1438
1439 auto l = p->second.begin();
1440 while (l != p->second.end()) {
1441 File *file = &*l;
1442 assert(file->dirty_seq > 0);
1443 assert(file->dirty_seq <= log_seq_stable);
1444 dout(20) << __func__ << " cleaned file " << file->fnode << dendl;
1445 file->dirty_seq = 0;
1446 p->second.erase(l++);
1447 }
1448
1449 assert(p->second.empty());
1450 dirty_files.erase(p++);
1451 }
1452 } else {
1453 dout(20) << __func__ << " log_seq_stable " << log_seq_stable
1454 << " already >= out seq " << seq
1455 << ", we lost a race against another log flush, done" << dendl;
1456 }
1457 _update_logger_stats();
1458
1459 return 0;
1460}
1461
1462int BlueFS::_flush_range(FileWriter *h, uint64_t offset, uint64_t length)
1463{
1464 dout(10) << __func__ << " " << h << " pos 0x" << std::hex << h->pos
1465 << " 0x" << offset << "~" << length << std::dec
1466 << " to " << h->file->fnode << dendl;
1467 assert(!h->file->deleted);
1468 assert(h->file->num_readers.load() == 0);
1469
1470 h->buffer_appender.flush();
1471
1472 bool buffered;
1473 if (h->file->fnode.ino == 1)
1474 buffered = false;
1475 else
1476 buffered = cct->_conf->bluefs_buffered_io;
1477
1478 if (offset + length <= h->pos)
1479 return 0;
1480 if (offset < h->pos) {
1481 length -= h->pos - offset;
1482 offset = h->pos;
1483 dout(10) << " still need 0x"
1484 << std::hex << offset << "~" << length << std::dec
1485 << dendl;
1486 }
1487 assert(offset <= h->file->fnode.size);
1488
1489 uint64_t allocated = h->file->fnode.get_allocated();
1490
1491 // do not bother to dirty the file if we are overwriting
1492 // previously allocated extents.
1493 bool must_dirty = false;
1494 if (allocated < offset + length) {
1495 // we should never run out of log space here; see the min runway check
1496 // in _flush_and_sync_log.
1497 assert(h->file->fnode.ino != 1);
1498 int r = _allocate(h->file->fnode.prefer_bdev,
1499 offset + length - allocated,
1500 &h->file->fnode.extents);
1501 if (r < 0) {
1502 derr << __func__ << " allocated: 0x" << std::hex << allocated
1503 << " offset: 0x" << offset << " length: 0x" << length << std::dec
1504 << dendl;
1505 return r;
1506 }
1507 h->file->fnode.recalc_allocated();
1508 if (cct->_conf->bluefs_preextend_wal_files &&
1509 h->writer_type == WRITER_WAL) {
1510 // NOTE: this *requires* that rocksdb also has log recycling
1511 // enabled and is therefore doing robust CRCs on the log
1512 // records. otherwise, we will fail to reply the rocksdb log
1513 // properly due to garbage on the device.
1514 h->file->fnode.size = h->file->fnode.get_allocated();
1515 dout(10) << __func__ << " extending WAL size to 0x" << std::hex
1516 << h->file->fnode.size << std::dec << " to include allocated"
1517 << dendl;
1518 }
1519 must_dirty = true;
1520 }
1521 if (h->file->fnode.size < offset + length) {
1522 h->file->fnode.size = offset + length;
1523 if (h->file->fnode.ino > 1) {
1524 // we do not need to dirty the log file (or it's compacting
1525 // replacement) when the file size changes because replay is
1526 // smart enough to discover it on its own.
1527 must_dirty = true;
1528 }
1529 }
1530 if (must_dirty) {
1531 h->file->fnode.mtime = ceph_clock_now();
1532 assert(h->file->fnode.ino >= 1);
1533 if (h->file->dirty_seq == 0) {
1534 h->file->dirty_seq = log_seq + 1;
1535 dirty_files[h->file->dirty_seq].push_back(*h->file);
1536 dout(20) << __func__ << " dirty_seq = " << log_seq + 1
1537 << " (was clean)" << dendl;
1538 } else {
1539 if (h->file->dirty_seq != log_seq + 1) {
1540 // need re-dirty, erase from list first
1541 assert(dirty_files.count(h->file->dirty_seq));
1542 auto it = dirty_files[h->file->dirty_seq].iterator_to(*h->file);
1543 dirty_files[h->file->dirty_seq].erase(it);
1544 h->file->dirty_seq = log_seq + 1;
1545 dirty_files[h->file->dirty_seq].push_back(*h->file);
1546 dout(20) << __func__ << " dirty_seq = " << log_seq + 1
1547 << " (was " << h->file->dirty_seq << ")" << dendl;
1548 } else {
1549 dout(20) << __func__ << " dirty_seq = " << log_seq + 1
1550 << " (unchanged, do nothing) " << dendl;
1551 }
1552 }
1553 }
1554 dout(20) << __func__ << " file now " << h->file->fnode << dendl;
1555
1556 uint64_t x_off = 0;
1557 auto p = h->file->fnode.seek(offset, &x_off);
1558 assert(p != h->file->fnode.extents.end());
1559 dout(20) << __func__ << " in " << *p << " x_off 0x"
1560 << std::hex << x_off << std::dec << dendl;
1561
1562 unsigned partial = x_off & ~super.block_mask();
1563 bufferlist bl;
1564 if (partial) {
1565 dout(20) << __func__ << " using partial tail 0x"
1566 << std::hex << partial << std::dec << dendl;
1567 assert(h->tail_block.length() == partial);
31f18b77 1568 bl.claim_append_piecewise(h->tail_block);
7c673cae
FG
1569 x_off -= partial;
1570 offset -= partial;
1571 length += partial;
1572 dout(20) << __func__ << " waiting for previous aio to complete" << dendl;
1573 for (auto p : h->iocv) {
1574 if (p) {
1575 p->aio_wait();
1576 }
1577 }
1578 }
1579 if (length == partial + h->buffer.length()) {
31f18b77 1580 bl.claim_append_piecewise(h->buffer);
7c673cae
FG
1581 } else {
1582 bufferlist t;
31f18b77
FG
1583 h->buffer.splice(0, length, &t);
1584 bl.claim_append_piecewise(t);
7c673cae
FG
1585 t.substr_of(h->buffer, length, h->buffer.length() - length);
1586 h->buffer.swap(t);
1587 dout(20) << " leaving 0x" << std::hex << h->buffer.length() << std::dec
1588 << " unflushed" << dendl;
1589 }
1590 assert(bl.length() == length);
1591
1592 switch (h->writer_type) {
1593 case WRITER_WAL:
1594 logger->inc(l_bluefs_bytes_written_wal, length);
1595 break;
1596 case WRITER_SST:
1597 logger->inc(l_bluefs_bytes_written_sst, length);
1598 break;
1599 }
1600
1601 dout(30) << "dump:\n";
1602 bl.hexdump(*_dout);
1603 *_dout << dendl;
1604
1605 h->pos = offset + length;
1606 h->tail_block.clear();
1607
1608 uint64_t bloff = 0;
1609 while (length > 0) {
1610 uint64_t x_len = MIN(p->length - x_off, length);
1611 bufferlist t;
1612 t.substr_of(bl, bloff, x_len);
1613 unsigned tail = x_len & ~super.block_mask();
1614 if (tail) {
1615 size_t zlen = super.block_size - tail;
1616 dout(20) << __func__ << " caching tail of 0x"
1617 << std::hex << tail
1618 << " and padding block with 0x" << zlen
1619 << std::dec << dendl;
1620 h->tail_block.substr_of(bl, bl.length() - tail, tail);
1621 if (h->file->fnode.ino > 1) {
1622 // we are using the page_aligned_appender, and can safely use
1623 // the tail of the raw buffer.
1624 const bufferptr &last = t.back();
1625 if (last.unused_tail_length() < zlen) {
1626 derr << " wtf, last is " << last << " from " << t << dendl;
1627 assert(last.unused_tail_length() >= zlen);
1628 }
1629 bufferptr z = last;
1630 z.set_offset(last.offset() + last.length());
1631 z.set_length(zlen);
1632 z.zero();
1633 t.append(z, 0, zlen);
1634 } else {
1635 t.append_zero(zlen);
1636 }
1637 }
1638 if (cct->_conf->bluefs_sync_write) {
1639 bdev[p->bdev]->write(p->offset + x_off, t, buffered);
1640 } else {
1641 bdev[p->bdev]->aio_write(p->offset + x_off, t, h->iocv[p->bdev], buffered);
1642 }
1643 bloff += x_len;
1644 length -= x_len;
1645 ++p;
1646 x_off = 0;
1647 }
1648 for (unsigned i = 0; i < MAX_BDEV; ++i) {
1649 if (bdev[i]) {
1650 assert(h->iocv[i]);
1651 if (h->iocv[i]->has_pending_aios()) {
1652 bdev[i]->aio_submit(h->iocv[i]);
1653 }
1654 }
1655 }
1656 dout(20) << __func__ << " h " << h << " pos now 0x"
1657 << std::hex << h->pos << std::dec << dendl;
1658 return 0;
1659}
1660
1661// we need to retire old completed aios so they don't stick around in
1662// memory indefinitely (along with their bufferlist refs).
1663void BlueFS::_claim_completed_aios(FileWriter *h, list<aio_t> *ls)
1664{
1665 for (auto p : h->iocv) {
1666 if (p) {
1667 ls->splice(ls->end(), p->running_aios);
1668 }
1669 }
1670 dout(10) << __func__ << " got " << ls->size() << " aios" << dendl;
1671}
1672
1673void BlueFS::wait_for_aio(FileWriter *h)
1674{
1675 // NOTE: this is safe to call without a lock, as long as our reference is
1676 // stable.
1677 dout(10) << __func__ << " " << h << dendl;
1678 utime_t start = ceph_clock_now();
1679 for (auto p : h->iocv) {
1680 if (p) {
1681 p->aio_wait();
1682 }
1683 }
1684 utime_t end = ceph_clock_now();
1685 utime_t dur = end - start;
1686 dout(10) << __func__ << " " << h << " done in " << dur << dendl;
1687}
1688
1689int BlueFS::_flush(FileWriter *h, bool force)
1690{
1691 h->buffer_appender.flush();
1692 uint64_t length = h->buffer.length();
1693 uint64_t offset = h->pos;
1694 if (!force &&
1695 length < cct->_conf->bluefs_min_flush_size) {
1696 dout(10) << __func__ << " " << h << " ignoring, length " << length
1697 << " < min_flush_size " << cct->_conf->bluefs_min_flush_size
1698 << dendl;
1699 return 0;
1700 }
1701 if (length == 0) {
1702 dout(10) << __func__ << " " << h << " no dirty data on "
1703 << h->file->fnode << dendl;
1704 return 0;
1705 }
1706 dout(10) << __func__ << " " << h << " 0x"
1707 << std::hex << offset << "~" << length << std::dec
1708 << " to " << h->file->fnode << dendl;
1709 assert(h->pos <= h->file->fnode.size);
1710 return _flush_range(h, offset, length);
1711}
1712
1713int BlueFS::_truncate(FileWriter *h, uint64_t offset)
1714{
1715 dout(10) << __func__ << " 0x" << std::hex << offset << std::dec
1716 << " file " << h->file->fnode << dendl;
1717 if (h->file->deleted) {
1718 dout(10) << __func__ << " deleted, no-op" << dendl;
1719 return 0;
1720 }
1721
1722 // we never truncate internal log files
1723 assert(h->file->fnode.ino > 1);
1724
1725 h->buffer_appender.flush();
1726
1727 // truncate off unflushed data?
1728 if (h->pos < offset &&
1729 h->pos + h->buffer.length() > offset) {
1730 bufferlist t;
1731 dout(20) << __func__ << " tossing out last " << offset - h->pos
1732 << " unflushed bytes" << dendl;
1733 t.substr_of(h->buffer, 0, offset - h->pos);
1734 h->buffer.swap(t);
1735 assert(0 == "actually this shouldn't happen");
1736 }
1737 if (h->buffer.length()) {
1738 int r = _flush(h, true);
1739 if (r < 0)
1740 return r;
1741 }
1742 if (offset == h->file->fnode.size) {
1743 return 0; // no-op!
1744 }
1745 if (offset > h->file->fnode.size) {
1746 assert(0 == "truncate up not supported");
1747 }
1748 assert(h->file->fnode.size >= offset);
1749 h->file->fnode.size = offset;
1750 log_t.op_file_update(h->file->fnode);
1751 return 0;
1752}
1753
1754int BlueFS::_fsync(FileWriter *h, std::unique_lock<std::mutex>& l)
1755{
1756 dout(10) << __func__ << " " << h << " " << h->file->fnode << dendl;
1757 int r = _flush(h, true);
1758 if (r < 0)
1759 return r;
1760 uint64_t old_dirty_seq = h->file->dirty_seq;
1761
1762 _flush_bdev_safely(h);
1763
1764 if (old_dirty_seq) {
1765 uint64_t s = log_seq;
1766 dout(20) << __func__ << " file metadata was dirty (" << old_dirty_seq
1767 << ") on " << h->file->fnode << ", flushing log" << dendl;
1768 _flush_and_sync_log(l, old_dirty_seq);
1769 assert(h->file->dirty_seq == 0 || // cleaned
1770 h->file->dirty_seq > s); // or redirtied by someone else
1771 }
1772 return 0;
1773}
1774
1775void BlueFS::_flush_bdev_safely(FileWriter *h)
1776{
1777 if (!cct->_conf->bluefs_sync_write) {
1778 list<aio_t> completed_ios;
1779 _claim_completed_aios(h, &completed_ios);
1780 lock.unlock();
1781 wait_for_aio(h);
1782 completed_ios.clear();
1783 flush_bdev();
1784 lock.lock();
1785 } else {
1786 lock.unlock();
1787 flush_bdev();
1788 lock.lock();
1789 }
1790}
1791
1792void BlueFS::flush_bdev()
1793{
1794 // NOTE: this is safe to call without a lock.
1795 dout(20) << __func__ << dendl;
1796 for (auto p : bdev) {
1797 if (p)
1798 p->flush();
1799 }
1800}
1801
1802int BlueFS::_allocate(uint8_t id, uint64_t len,
1803 mempool::bluefs::vector<bluefs_extent_t> *ev)
1804{
1805 dout(10) << __func__ << " len 0x" << std::hex << len << std::dec
1806 << " from " << (int)id << dendl;
1807 assert(id < alloc.size());
1808 uint64_t min_alloc_size = cct->_conf->bluefs_alloc_size;
1809
1810 uint64_t left = ROUND_UP_TO(len, min_alloc_size);
1811 int r = -ENOSPC;
1812 if (alloc[id]) {
1813 r = alloc[id]->reserve(left);
1814 }
1815 if (r < 0) {
1816 if (id != BDEV_SLOW) {
1817 if (bdev[id]) {
1818 dout(1) << __func__ << " failed to allocate 0x" << std::hex << left
1819 << " on bdev " << (int)id
1820 << ", free 0x" << alloc[id]->get_free()
1821 << "; fallback to bdev " << (int)id + 1
1822 << std::dec << dendl;
1823 }
1824 return _allocate(id + 1, len, ev);
1825 }
1826 if (bdev[id])
1827 derr << __func__ << " failed to allocate 0x" << std::hex << left
1828 << " on bdev " << (int)id
1829 << ", free 0x" << alloc[id]->get_free() << std::dec << dendl;
1830 else
1831 derr << __func__ << " failed to allocate 0x" << std::hex << left
1832 << " on bdev " << (int)id << ", dne" << std::dec << dendl;
1833 return r;
1834 }
1835
1836 uint64_t hint = 0;
1837 if (!ev->empty()) {
1838 hint = ev->back().end();
1839 }
1840
1841 AllocExtentVector extents;
1842 extents.reserve(4); // 4 should be (more than) enough for most allocations
1843 int64_t alloc_len = alloc[id]->allocate(left, min_alloc_size, hint,
1844 &extents);
1845 if (alloc_len < (int64_t)left) {
1846 derr << __func__ << " allocate failed on 0x" << std::hex << left
31f18b77
FG
1847 << " min_alloc_size 0x" << min_alloc_size
1848 << " hint 0x" << hint << std::dec << dendl;
7c673cae
FG
1849 alloc[id]->dump();
1850 assert(0 == "allocate failed... wtf");
1851 return -ENOSPC;
1852 }
1853
1854 for (auto& p : extents) {
1855 bluefs_extent_t e = bluefs_extent_t(id, p.offset, p.length);
1856 if (!ev->empty() &&
1857 ev->back().bdev == e.bdev &&
1858 ev->back().end() == (uint64_t) e.offset) {
1859 ev->back().length += e.length;
1860 } else {
1861 ev->push_back(e);
1862 }
1863 }
1864
1865 return 0;
1866}
1867
1868int BlueFS::_preallocate(FileRef f, uint64_t off, uint64_t len)
1869{
1870 dout(10) << __func__ << " file " << f->fnode << " 0x"
1871 << std::hex << off << "~" << len << std::dec << dendl;
1872 if (f->deleted) {
1873 dout(10) << __func__ << " deleted, no-op" << dendl;
1874 return 0;
1875 }
1876 assert(f->fnode.ino > 1);
1877 uint64_t allocated = f->fnode.get_allocated();
1878 if (off + len > allocated) {
1879 uint64_t want = off + len - allocated;
1880 int r = _allocate(f->fnode.prefer_bdev, want, &f->fnode.extents);
1881 if (r < 0)
1882 return r;
1883 f->fnode.recalc_allocated();
1884 log_t.op_file_update(f->fnode);
1885 }
1886 return 0;
1887}
1888
1889void BlueFS::sync_metadata()
1890{
1891 std::unique_lock<std::mutex> l(lock);
1892 if (log_t.empty()) {
1893 dout(10) << __func__ << " - no pending log events" << dendl;
1894 return;
1895 }
1896 dout(10) << __func__ << dendl;
1897 utime_t start = ceph_clock_now();
1898 vector<interval_set<uint64_t>> to_release(pending_release.size());
1899 to_release.swap(pending_release);
1900 flush_bdev(); // FIXME?
1901 _flush_and_sync_log(l);
1902 for (unsigned i = 0; i < to_release.size(); ++i) {
1903 for (auto p = to_release[i].begin(); p != to_release[i].end(); ++p) {
1904 alloc[i]->release(p.get_start(), p.get_len());
1905 }
1906 }
1907
1908 if (_should_compact_log()) {
1909 if (cct->_conf->bluefs_compact_log_sync) {
1910 _compact_log_sync();
1911 } else {
1912 _compact_log_async(l);
1913 }
1914 }
1915
1916 utime_t end = ceph_clock_now();
1917 utime_t dur = end - start;
1918 dout(10) << __func__ << " done in " << dur << dendl;
1919}
1920
1921int BlueFS::open_for_write(
1922 const string& dirname,
1923 const string& filename,
1924 FileWriter **h,
1925 bool overwrite)
1926{
1927 std::lock_guard<std::mutex> l(lock);
1928 dout(10) << __func__ << " " << dirname << "/" << filename << dendl;
1929 map<string,DirRef>::iterator p = dir_map.find(dirname);
1930 DirRef dir;
1931 if (p == dir_map.end()) {
1932 // implicitly create the dir
1933 dout(20) << __func__ << " dir " << dirname
1934 << " does not exist" << dendl;
1935 return -ENOENT;
1936 } else {
1937 dir = p->second;
1938 }
1939
1940 FileRef file;
1941 bool create = false;
1942 map<string,FileRef>::iterator q = dir->file_map.find(filename);
1943 if (q == dir->file_map.end()) {
1944 if (overwrite) {
1945 dout(20) << __func__ << " dir " << dirname << " (" << dir
1946 << ") file " << filename
1947 << " does not exist" << dendl;
1948 return -ENOENT;
1949 }
1950 file = new File;
1951 file->fnode.ino = ++ino_last;
1952 file_map[ino_last] = file;
1953 dir->file_map[filename] = file;
1954 ++file->refs;
1955 create = true;
1956 } else {
1957 // overwrite existing file?
1958 file = q->second;
1959 if (overwrite) {
1960 dout(20) << __func__ << " dir " << dirname << " (" << dir
1961 << ") file " << filename
1962 << " already exists, overwrite in place" << dendl;
1963 } else {
1964 dout(20) << __func__ << " dir " << dirname << " (" << dir
1965 << ") file " << filename
1966 << " already exists, truncate + overwrite" << dendl;
1967 file->fnode.size = 0;
1968 for (auto& p : file->fnode.extents) {
1969 pending_release[p.bdev].insert(p.offset, p.length);
1970 }
1971 file->fnode.extents.clear();
1972 file->fnode.recalc_allocated();
1973 }
1974 }
1975 assert(file->fnode.ino > 1);
1976
1977 file->fnode.mtime = ceph_clock_now();
1978 file->fnode.prefer_bdev = BlueFS::BDEV_DB;
1979 if (dirname.length() > 5) {
1980 // the "db.slow" and "db.wal" directory names are hard-coded at
1981 // match up with bluestore. the slow device is always the second
1982 // one (when a dedicated block.db device is present and used at
1983 // bdev 0). the wal device is always last.
31f18b77 1984 if (boost::algorithm::ends_with(dirname, ".slow")) {
7c673cae
FG
1985 file->fnode.prefer_bdev = BlueFS::BDEV_SLOW;
1986 } else if (boost::algorithm::ends_with(dirname, ".wal")) {
1987 file->fnode.prefer_bdev = BlueFS::BDEV_WAL;
1988 }
1989 }
1990 dout(20) << __func__ << " mapping " << dirname << "/" << filename
1991 << " to bdev " << (int)file->fnode.prefer_bdev << dendl;
1992
1993 log_t.op_file_update(file->fnode);
1994 if (create)
1995 log_t.op_dir_link(dirname, filename, file->fnode.ino);
1996
1997 *h = _create_writer(file);
1998
1999 if (boost::algorithm::ends_with(filename, ".log")) {
2000 (*h)->writer_type = BlueFS::WRITER_WAL;
2001 if (logger && !overwrite) {
2002 logger->inc(l_bluefs_files_written_wal);
2003 }
2004 } else if (boost::algorithm::ends_with(filename, ".sst")) {
2005 (*h)->writer_type = BlueFS::WRITER_SST;
2006 if (logger) {
2007 logger->inc(l_bluefs_files_written_sst);
2008 }
2009 }
2010
2011 dout(10) << __func__ << " h " << *h << " on " << file->fnode << dendl;
2012 return 0;
2013}
2014
2015BlueFS::FileWriter *BlueFS::_create_writer(FileRef f)
2016{
2017 FileWriter *w = new FileWriter(f);
2018 for (unsigned i = 0; i < MAX_BDEV; ++i) {
2019 if (bdev[i]) {
2020 w->iocv[i] = new IOContext(cct, NULL);
2021 } else {
2022 w->iocv[i] = NULL;
2023 }
2024 }
2025 return w;
2026}
2027
2028void BlueFS::_close_writer(FileWriter *h)
2029{
2030 dout(10) << __func__ << " " << h << " type " << h->writer_type << dendl;
2031 for (unsigned i=0; i<MAX_BDEV; ++i) {
2032 if (bdev[i]) {
2033 assert(h->iocv[i]);
2034 h->iocv[i]->aio_wait();
2035 bdev[i]->queue_reap_ioc(h->iocv[i]);
2036 }
2037 }
2038 delete h;
2039}
2040
2041int BlueFS::open_for_read(
2042 const string& dirname,
2043 const string& filename,
2044 FileReader **h,
2045 bool random)
2046{
2047 std::lock_guard<std::mutex> l(lock);
2048 dout(10) << __func__ << " " << dirname << "/" << filename
2049 << (random ? " (random)":" (sequential)") << dendl;
2050 map<string,DirRef>::iterator p = dir_map.find(dirname);
2051 if (p == dir_map.end()) {
2052 dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
2053 return -ENOENT;
2054 }
2055 DirRef dir = p->second;
2056
2057 map<string,FileRef>::iterator q = dir->file_map.find(filename);
2058 if (q == dir->file_map.end()) {
2059 dout(20) << __func__ << " dir " << dirname << " (" << dir
2060 << ") file " << filename
2061 << " not found" << dendl;
2062 return -ENOENT;
2063 }
2064 File *file = q->second.get();
2065
2066 *h = new FileReader(file, random ? 4096 : cct->_conf->bluefs_max_prefetch,
2067 random, false);
2068 dout(10) << __func__ << " h " << *h << " on " << file->fnode << dendl;
2069 return 0;
2070}
2071
2072int BlueFS::rename(
2073 const string& old_dirname, const string& old_filename,
2074 const string& new_dirname, const string& new_filename)
2075{
2076 std::lock_guard<std::mutex> l(lock);
2077 dout(10) << __func__ << " " << old_dirname << "/" << old_filename
2078 << " -> " << new_dirname << "/" << new_filename << dendl;
2079 map<string,DirRef>::iterator p = dir_map.find(old_dirname);
2080 if (p == dir_map.end()) {
2081 dout(20) << __func__ << " dir " << old_dirname << " not found" << dendl;
2082 return -ENOENT;
2083 }
2084 DirRef old_dir = p->second;
2085 map<string,FileRef>::iterator q = old_dir->file_map.find(old_filename);
2086 if (q == old_dir->file_map.end()) {
2087 dout(20) << __func__ << " dir " << old_dirname << " (" << old_dir
2088 << ") file " << old_filename
2089 << " not found" << dendl;
2090 return -ENOENT;
2091 }
2092 FileRef file = q->second;
2093
2094 p = dir_map.find(new_dirname);
2095 if (p == dir_map.end()) {
2096 dout(20) << __func__ << " dir " << new_dirname << " not found" << dendl;
2097 return -ENOENT;
2098 }
2099 DirRef new_dir = p->second;
2100 q = new_dir->file_map.find(new_filename);
2101 if (q != new_dir->file_map.end()) {
2102 dout(20) << __func__ << " dir " << new_dirname << " (" << old_dir
2103 << ") file " << new_filename
2104 << " already exists, unlinking" << dendl;
2105 assert(q->second != file);
2106 log_t.op_dir_unlink(new_dirname, new_filename);
2107 _drop_link(q->second);
2108 }
2109
2110 dout(10) << __func__ << " " << new_dirname << "/" << new_filename << " "
2111 << " " << file->fnode << dendl;
2112
2113 new_dir->file_map[new_filename] = file;
2114 old_dir->file_map.erase(old_filename);
2115
2116 log_t.op_dir_link(new_dirname, new_filename, file->fnode.ino);
2117 log_t.op_dir_unlink(old_dirname, old_filename);
2118 return 0;
2119}
2120
2121int BlueFS::mkdir(const string& dirname)
2122{
2123 std::lock_guard<std::mutex> l(lock);
2124 dout(10) << __func__ << " " << dirname << dendl;
2125 map<string,DirRef>::iterator p = dir_map.find(dirname);
2126 if (p != dir_map.end()) {
2127 dout(20) << __func__ << " dir " << dirname << " exists" << dendl;
2128 return -EEXIST;
2129 }
2130 dir_map[dirname] = new Dir;
2131 log_t.op_dir_create(dirname);
2132 return 0;
2133}
2134
2135int BlueFS::rmdir(const string& dirname)
2136{
2137 std::lock_guard<std::mutex> l(lock);
2138 dout(10) << __func__ << " " << dirname << dendl;
2139 map<string,DirRef>::iterator p = dir_map.find(dirname);
2140 if (p == dir_map.end()) {
2141 dout(20) << __func__ << " dir " << dirname << " does not exist" << dendl;
2142 return -ENOENT;
2143 }
2144 DirRef dir = p->second;
2145 if (!dir->file_map.empty()) {
2146 dout(20) << __func__ << " dir " << dirname << " not empty" << dendl;
2147 return -ENOTEMPTY;
2148 }
2149 dir_map.erase(dirname);
2150 log_t.op_dir_remove(dirname);
2151 return 0;
2152}
2153
2154bool BlueFS::dir_exists(const string& dirname)
2155{
2156 std::lock_guard<std::mutex> l(lock);
2157 map<string,DirRef>::iterator p = dir_map.find(dirname);
2158 bool exists = p != dir_map.end();
2159 dout(10) << __func__ << " " << dirname << " = " << (int)exists << dendl;
2160 return exists;
2161}
2162
2163int BlueFS::stat(const string& dirname, const string& filename,
2164 uint64_t *size, utime_t *mtime)
2165{
2166 std::lock_guard<std::mutex> l(lock);
2167 dout(10) << __func__ << " " << dirname << "/" << filename << dendl;
2168 map<string,DirRef>::iterator p = dir_map.find(dirname);
2169 if (p == dir_map.end()) {
2170 dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
2171 return -ENOENT;
2172 }
2173 DirRef dir = p->second;
2174 map<string,FileRef>::iterator q = dir->file_map.find(filename);
2175 if (q == dir->file_map.end()) {
2176 dout(20) << __func__ << " dir " << dirname << " (" << dir
2177 << ") file " << filename
2178 << " not found" << dendl;
2179 return -ENOENT;
2180 }
2181 File *file = q->second.get();
2182 dout(10) << __func__ << " " << dirname << "/" << filename
2183 << " " << file->fnode << dendl;
2184 if (size)
2185 *size = file->fnode.size;
2186 if (mtime)
2187 *mtime = file->fnode.mtime;
2188 return 0;
2189}
2190
2191int BlueFS::lock_file(const string& dirname, const string& filename,
2192 FileLock **plock)
2193{
2194 std::lock_guard<std::mutex> l(lock);
2195 dout(10) << __func__ << " " << dirname << "/" << filename << dendl;
2196 map<string,DirRef>::iterator p = dir_map.find(dirname);
2197 if (p == dir_map.end()) {
2198 dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
2199 return -ENOENT;
2200 }
2201 DirRef dir = p->second;
2202 map<string,FileRef>::iterator q = dir->file_map.find(filename);
2203 File *file;
2204 if (q == dir->file_map.end()) {
2205 dout(20) << __func__ << " dir " << dirname << " (" << dir
2206 << ") file " << filename
2207 << " not found, creating" << dendl;
2208 file = new File;
2209 file->fnode.ino = ++ino_last;
2210 file->fnode.mtime = ceph_clock_now();
2211 file_map[ino_last] = file;
2212 dir->file_map[filename] = file;
2213 ++file->refs;
2214 log_t.op_file_update(file->fnode);
2215 log_t.op_dir_link(dirname, filename, file->fnode.ino);
2216 } else {
2217 file = q->second.get();
2218 if (file->locked) {
2219 dout(10) << __func__ << " already locked" << dendl;
2220 return -EBUSY;
2221 }
2222 }
2223 file->locked = true;
2224 *plock = new FileLock(file);
2225 dout(10) << __func__ << " locked " << file->fnode
2226 << " with " << *plock << dendl;
2227 return 0;
2228}
2229
2230int BlueFS::unlock_file(FileLock *fl)
2231{
2232 std::lock_guard<std::mutex> l(lock);
2233 dout(10) << __func__ << " " << fl << " on " << fl->file->fnode << dendl;
2234 assert(fl->file->locked);
2235 fl->file->locked = false;
2236 delete fl;
2237 return 0;
2238}
2239
2240int BlueFS::readdir(const string& dirname, vector<string> *ls)
2241{
2242 std::lock_guard<std::mutex> l(lock);
2243 dout(10) << __func__ << " " << dirname << dendl;
2244 if (dirname.empty()) {
2245 // list dirs
2246 ls->reserve(dir_map.size() + 2);
2247 for (auto& q : dir_map) {
2248 ls->push_back(q.first);
2249 }
2250 } else {
2251 // list files in dir
2252 map<string,DirRef>::iterator p = dir_map.find(dirname);
2253 if (p == dir_map.end()) {
2254 dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
2255 return -ENOENT;
2256 }
2257 DirRef dir = p->second;
2258 ls->reserve(dir->file_map.size() + 2);
2259 for (auto& q : dir->file_map) {
2260 ls->push_back(q.first);
2261 }
2262 }
2263 ls->push_back(".");
2264 ls->push_back("..");
2265 return 0;
2266}
2267
2268int BlueFS::unlink(const string& dirname, const string& filename)
2269{
2270 std::lock_guard<std::mutex> l(lock);
2271 dout(10) << __func__ << " " << dirname << "/" << filename << dendl;
2272 map<string,DirRef>::iterator p = dir_map.find(dirname);
2273 if (p == dir_map.end()) {
2274 dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
2275 return -ENOENT;
2276 }
2277 DirRef dir = p->second;
2278 map<string,FileRef>::iterator q = dir->file_map.find(filename);
2279 if (q == dir->file_map.end()) {
2280 dout(20) << __func__ << " file " << dirname << "/" << filename
2281 << " not found" << dendl;
2282 return -ENOENT;
2283 }
2284 FileRef file = q->second;
2285 if (file->locked) {
2286 dout(20) << __func__ << " file " << dirname << "/" << filename
2287 << " is locked" << dendl;
2288 return -EBUSY;
2289 }
2290 dir->file_map.erase(filename);
2291 log_t.op_dir_unlink(dirname, filename);
2292 _drop_link(file);
2293 return 0;
2294}