1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 #include "boost/algorithm/string.hpp"
7 #include "common/debug.h"
8 #include "common/errno.h"
9 #include "common/perf_counters.h"
10 #include "BlockDevice.h"
11 #include "Allocator.h"
12 #include "include/assert.h"
14 #define dout_context cct
15 #define dout_subsys ceph_subsys_bluefs
17 #define dout_prefix *_dout << "bluefs "
19 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::File
, bluefs_file
, bluefs
);
20 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::Dir
, bluefs_dir
, bluefs
);
21 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileWriter
, bluefs_file_writer
, bluefs
);
22 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileReaderBuffer
,
23 bluefs_file_reader_buffer
, bluefs
);
24 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileReader
, bluefs_file_reader
, bluefs
);
25 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileLock
, bluefs_file_lock
, bluefs
);
28 BlueFS::BlueFS(CephContext
* cct
)
33 block_total(MAX_BDEV
, 0)
54 void BlueFS::_init_logger()
56 PerfCountersBuilder
b(cct
, "bluefs",
57 l_bluefs_first
, l_bluefs_last
);
58 b
.add_u64_counter(l_bluefs_gift_bytes
, "gift_bytes",
59 "Bytes gifted from BlueStore", NULL
, 0, unit_t(BYTES
));
60 b
.add_u64_counter(l_bluefs_reclaim_bytes
, "reclaim_bytes",
61 "Bytes reclaimed by BlueStore", NULL
, 0, unit_t(BYTES
));
62 b
.add_u64(l_bluefs_db_total_bytes
, "db_total_bytes",
63 "Total bytes (main db device)",
64 "b", PerfCountersBuilder::PRIO_USEFUL
, unit_t(BYTES
));
65 b
.add_u64(l_bluefs_db_used_bytes
, "db_used_bytes",
66 "Used bytes (main db device)",
67 "u", PerfCountersBuilder::PRIO_USEFUL
, unit_t(BYTES
));
68 b
.add_u64(l_bluefs_wal_total_bytes
, "wal_total_bytes",
69 "Total bytes (wal device)",
70 "walb", PerfCountersBuilder::PRIO_USEFUL
, unit_t(BYTES
));
71 b
.add_u64(l_bluefs_wal_used_bytes
, "wal_used_bytes",
72 "Used bytes (wal device)",
73 "walu", PerfCountersBuilder::PRIO_USEFUL
, unit_t(BYTES
));
74 b
.add_u64(l_bluefs_slow_total_bytes
, "slow_total_bytes",
75 "Total bytes (slow device)",
76 "slob", PerfCountersBuilder::PRIO_USEFUL
, unit_t(BYTES
));
77 b
.add_u64(l_bluefs_slow_used_bytes
, "slow_used_bytes",
78 "Used bytes (slow device)",
79 "slou", PerfCountersBuilder::PRIO_USEFUL
, unit_t(BYTES
));
80 b
.add_u64(l_bluefs_num_files
, "num_files", "File count",
81 "f", PerfCountersBuilder::PRIO_USEFUL
);
82 b
.add_u64(l_bluefs_log_bytes
, "log_bytes", "Size of the metadata log",
83 "jlen", PerfCountersBuilder::PRIO_INTERESTING
, unit_t(BYTES
));
84 b
.add_u64_counter(l_bluefs_log_compactions
, "log_compactions",
85 "Compactions of the metadata log");
86 b
.add_u64_counter(l_bluefs_logged_bytes
, "logged_bytes",
87 "Bytes written to the metadata log", "j",
88 PerfCountersBuilder::PRIO_CRITICAL
, unit_t(BYTES
));
89 b
.add_u64_counter(l_bluefs_files_written_wal
, "files_written_wal",
90 "Files written to WAL");
91 b
.add_u64_counter(l_bluefs_files_written_sst
, "files_written_sst",
92 "Files written to SSTs");
93 b
.add_u64_counter(l_bluefs_bytes_written_wal
, "bytes_written_wal",
94 "Bytes written to WAL", "wal",
95 PerfCountersBuilder::PRIO_CRITICAL
);
96 b
.add_u64_counter(l_bluefs_bytes_written_sst
, "bytes_written_sst",
97 "Bytes written to SSTs", "sst",
98 PerfCountersBuilder::PRIO_CRITICAL
, unit_t(BYTES
));
99 logger
= b
.create_perf_counters();
100 cct
->get_perfcounters_collection()->add(logger
);
103 void BlueFS::_shutdown_logger()
105 cct
->get_perfcounters_collection()->remove(logger
);
109 void BlueFS::_update_logger_stats()
111 // we must be holding the lock
112 logger
->set(l_bluefs_num_files
, file_map
.size());
113 logger
->set(l_bluefs_log_bytes
, log_writer
->file
->fnode
.size
);
115 if (alloc
[BDEV_WAL
]) {
116 logger
->set(l_bluefs_wal_total_bytes
, block_total
[BDEV_WAL
]);
117 logger
->set(l_bluefs_wal_used_bytes
,
118 block_total
[BDEV_WAL
] - alloc
[BDEV_WAL
]->get_free());
120 if (alloc
[BDEV_DB
]) {
121 logger
->set(l_bluefs_db_total_bytes
, block_total
[BDEV_DB
]);
122 logger
->set(l_bluefs_db_used_bytes
,
123 block_total
[BDEV_DB
] - alloc
[BDEV_DB
]->get_free());
125 if (alloc
[BDEV_SLOW
]) {
126 logger
->set(l_bluefs_slow_total_bytes
, block_total
[BDEV_SLOW
]);
127 logger
->set(l_bluefs_slow_used_bytes
,
128 block_total
[BDEV_SLOW
] - alloc
[BDEV_SLOW
]->get_free());
132 int BlueFS::add_block_device(unsigned id
, const string
& path
)
134 dout(10) << __func__
<< " bdev " << id
<< " path " << path
<< dendl
;
135 assert(id
< bdev
.size());
136 assert(bdev
[id
] == NULL
);
137 BlockDevice
*b
= BlockDevice::create(cct
, path
, NULL
, NULL
);
138 int r
= b
->open(path
);
143 dout(1) << __func__
<< " bdev " << id
<< " path " << path
144 << " size " << byte_u_t(b
->get_size()) << dendl
;
146 ioc
[id
] = new IOContext(cct
, NULL
);
150 bool BlueFS::bdev_support_label(unsigned id
)
152 assert(id
< bdev
.size());
154 return bdev
[id
]->supported_bdev_label();
157 uint64_t BlueFS::get_block_device_size(unsigned id
)
159 if (id
< bdev
.size() && bdev
[id
])
160 return bdev
[id
]->get_size();
164 void BlueFS::add_block_extent(unsigned id
, uint64_t offset
, uint64_t length
)
166 std::unique_lock
<std::mutex
> l(lock
);
167 dout(1) << __func__
<< " bdev " << id
168 << " 0x" << std::hex
<< offset
<< "~" << length
<< std::dec
170 assert(id
< bdev
.size());
172 assert(bdev
[id
]->get_size() >= offset
+ length
);
173 block_all
[id
].insert(offset
, length
);
174 block_total
[id
] += length
;
176 if (id
< alloc
.size() && alloc
[id
]) {
177 log_t
.op_alloc_add(id
, offset
, length
);
178 int r
= _flush_and_sync_log(l
);
180 alloc
[id
]->init_add_free(offset
, length
);
184 logger
->inc(l_bluefs_gift_bytes
, length
);
185 dout(10) << __func__
<< " done" << dendl
;
188 int BlueFS::reclaim_blocks(unsigned id
, uint64_t want
,
189 AllocExtentVector
*extents
)
191 std::unique_lock
<std::mutex
> l(lock
);
192 dout(1) << __func__
<< " bdev " << id
193 << " want 0x" << std::hex
<< want
<< std::dec
<< dendl
;
194 assert(id
< alloc
.size());
196 int r
= alloc
[id
]->reserve(want
);
197 assert(r
== 0); // caller shouldn't ask for more than they can get
198 int64_t got
= alloc
[id
]->allocate(want
, cct
->_conf
->bluefs_alloc_size
, 0,
200 if (got
< (int64_t)want
) {
201 alloc
[id
]->unreserve(want
- MAX(0, got
));
204 derr
<< __func__
<< " failed to allocate space to return to bluestore"
210 for (auto& p
: *extents
) {
211 block_all
[id
].erase(p
.offset
, p
.length
);
212 block_total
[id
] -= p
.length
;
213 log_t
.op_alloc_rm(id
, p
.offset
, p
.length
);
217 r
= _flush_and_sync_log(l
);
221 logger
->inc(l_bluefs_reclaim_bytes
, got
);
222 dout(1) << __func__
<< " bdev " << id
<< " want 0x" << std::hex
<< want
223 << " got " << *extents
<< dendl
;
227 uint64_t BlueFS::get_fs_usage()
229 std::lock_guard
<std::mutex
> l(lock
);
230 uint64_t total_bytes
= 0;
231 for (auto& p
: file_map
) {
232 total_bytes
+= p
.second
->fnode
.get_allocated();
237 uint64_t BlueFS::get_total(unsigned id
)
239 std::lock_guard
<std::mutex
> l(lock
);
240 assert(id
< block_all
.size());
241 return block_total
[id
];
244 uint64_t BlueFS::get_free(unsigned id
)
246 std::lock_guard
<std::mutex
> l(lock
);
247 assert(id
< alloc
.size());
248 return alloc
[id
]->get_free();
251 void BlueFS::dump_perf_counters(Formatter
*f
)
253 f
->open_object_section("bluefs_perf_counters");
254 logger
->dump_formatted(f
,0);
258 void BlueFS::dump_block_extents(ostream
& out
)
260 for (unsigned i
= 0; i
< MAX_BDEV
; ++i
) {
264 out
<< i
<< " : size 0x" << std::hex
<< bdev
[i
]->get_size()
265 << " : own 0x" << block_all
[i
] << std::dec
<< "\n";
269 void BlueFS::get_usage(vector
<pair
<uint64_t,uint64_t>> *usage
)
271 std::lock_guard
<std::mutex
> l(lock
);
272 usage
->resize(bdev
.size());
273 for (unsigned id
= 0; id
< bdev
.size(); ++id
) {
275 (*usage
)[id
] = make_pair(0, 0);
278 (*usage
)[id
].first
= alloc
[id
]->get_free();
279 (*usage
)[id
].second
= block_total
[id
];
281 (block_total
[id
] - (*usage
)[id
].first
) * 100 / block_total
[id
];
282 dout(10) << __func__
<< " bdev " << id
283 << " free " << (*usage
)[id
].first
284 << " (" << byte_u_t((*usage
)[id
].first
) << ")"
285 << " / " << (*usage
)[id
].second
286 << " (" << byte_u_t((*usage
)[id
].second
) << ")"
287 << ", used " << used
<< "%"
292 int BlueFS::get_block_extents(unsigned id
, interval_set
<uint64_t> *extents
)
294 std::lock_guard
<std::mutex
> l(lock
);
295 dout(10) << __func__
<< " bdev " << id
<< dendl
;
296 if (id
>= block_all
.size())
298 *extents
= block_all
[id
];
302 // returns true if specified device is attached
303 bool BlueFS::is_device(unsigned id
)
305 return !(id
>= MAX_BDEV
|| bdev
[id
] == nullptr);
308 // returns true if specified device is under full bluefs control
309 // and hence can be expanded
310 bool BlueFS::is_device_expandable(unsigned id
)
312 if (id
>= MAX_BDEV
|| bdev
[id
] == nullptr) {
320 // true if DB volume is non-shared
321 return bdev
[BDEV_SLOW
] != nullptr;
326 int BlueFS::mkfs(uuid_d osd_uuid
)
328 std::unique_lock
<std::mutex
> l(lock
);
330 << " osd_uuid " << osd_uuid
337 super
.block_size
= bdev
[BDEV_DB
]->get_block_size();
338 super
.osd_uuid
= osd_uuid
;
339 super
.uuid
.generate_random();
340 dout(1) << __func__
<< " uuid " << super
.uuid
<< dendl
;
343 FileRef log_file
= new File
;
344 log_file
->fnode
.ino
= 1;
345 log_file
->fnode
.prefer_bdev
= BDEV_WAL
;
347 log_file
->fnode
.prefer_bdev
,
348 cct
->_conf
->bluefs_max_log_runway
,
351 log_writer
= _create_writer(log_file
);
355 for (unsigned bdev
= 0; bdev
< MAX_BDEV
; ++bdev
) {
356 interval_set
<uint64_t>& p
= block_all
[bdev
];
359 for (interval_set
<uint64_t>::iterator q
= p
.begin(); q
!= p
.end(); ++q
) {
360 dout(20) << __func__
<< " op_alloc_add " << bdev
<< " 0x"
361 << std::hex
<< q
.get_start() << "~" << q
.get_len() << std::dec
363 log_t
.op_alloc_add(bdev
, q
.get_start(), q
.get_len());
366 _flush_and_sync_log(l
);
369 super
.log_fnode
= log_file
->fnode
;
374 super
= bluefs_super_t();
375 _close_writer(log_writer
);
382 dout(10) << __func__
<< " success" << dendl
;
386 void BlueFS::_init_alloc()
388 dout(20) << __func__
<< dendl
;
389 alloc
.resize(MAX_BDEV
);
390 pending_release
.resize(MAX_BDEV
);
391 for (unsigned id
= 0; id
< bdev
.size(); ++id
) {
395 assert(bdev
[id
]->get_size());
396 alloc
[id
] = Allocator::create(cct
, cct
->_conf
->bluefs_allocator
,
397 bdev
[id
]->get_size(),
398 cct
->_conf
->bluefs_alloc_size
);
399 interval_set
<uint64_t>& p
= block_all
[id
];
400 for (interval_set
<uint64_t>::iterator q
= p
.begin(); q
!= p
.end(); ++q
) {
401 alloc
[id
]->init_add_free(q
.get_start(), q
.get_len());
406 void BlueFS::_stop_alloc()
408 dout(20) << __func__
<< dendl
;
409 for (auto p
: alloc
) {
420 dout(1) << __func__
<< dendl
;
422 int r
= _open_super();
424 derr
<< __func__
<< " failed to open super: " << cpp_strerror(r
) << dendl
;
429 block_all
.resize(MAX_BDEV
);
431 block_total
.resize(MAX_BDEV
, 0);
436 derr
<< __func__
<< " failed to replay log: " << cpp_strerror(r
) << dendl
;
442 for (auto& p
: file_map
) {
443 dout(30) << __func__
<< " noting alloc for " << p
.second
->fnode
<< dendl
;
444 for (auto& q
: p
.second
->fnode
.extents
) {
445 alloc
[q
.bdev
]->init_rm_free(q
.offset
, q
.length
);
449 // set up the log for future writes
450 log_writer
= _create_writer(_get_file(1));
451 assert(log_writer
->file
->fnode
.ino
== 1);
452 log_writer
->pos
= log_writer
->file
->fnode
.size
;
453 dout(10) << __func__
<< " log write pos set to 0x"
454 << std::hex
<< log_writer
->pos
<< std::dec
461 super
= bluefs_super_t();
465 void BlueFS::umount()
467 dout(1) << __func__
<< dendl
;
471 _close_writer(log_writer
);
477 super
= bluefs_super_t();
482 void BlueFS::collect_metadata(map
<string
,string
> *pm
)
485 bdev
[BDEV_DB
]->collect_metadata("bluefs_db_", pm
);
487 bdev
[BDEV_WAL
]->collect_metadata("bluefs_wal_", pm
);
489 bdev
[BDEV_SLOW
]->collect_metadata("bluefs_slow_", pm
);
494 std::lock_guard
<std::mutex
> l(lock
);
495 dout(1) << __func__
<< dendl
;
496 // hrm, i think we check everything on mount...
500 int BlueFS::_write_super()
505 uint32_t crc
= bl
.crc32c(-1);
507 dout(10) << __func__
<< " super block length(encoded): " << bl
.length() << dendl
;
508 dout(10) << __func__
<< " superblock " << super
.version
<< dendl
;
509 dout(10) << __func__
<< " log_fnode " << super
.log_fnode
<< dendl
;
510 assert(bl
.length() <= get_super_length());
511 bl
.append_zero(get_super_length() - bl
.length());
513 bdev
[BDEV_DB
]->write(get_super_offset(), bl
, false);
514 dout(20) << __func__
<< " v " << super
.version
515 << " crc 0x" << std::hex
<< crc
516 << " offset 0x" << get_super_offset() << std::dec
521 int BlueFS::_open_super()
523 dout(10) << __func__
<< dendl
;
526 uint32_t expected_crc
, crc
;
529 // always the second block
530 r
= bdev
[BDEV_DB
]->read(get_super_offset(), get_super_length(),
531 &bl
, ioc
[BDEV_DB
], false);
535 bufferlist::iterator p
= bl
.begin();
539 t
.substr_of(bl
, 0, p
.get_off());
542 ::decode(expected_crc
, p
);
543 if (crc
!= expected_crc
) {
544 derr
<< __func__
<< " bad crc on superblock, expected 0x"
545 << std::hex
<< expected_crc
<< " != actual 0x" << crc
<< std::dec
549 dout(10) << __func__
<< " superblock " << super
.version
<< dendl
;
550 dout(10) << __func__
<< " log_fnode " << super
.log_fnode
<< dendl
;
554 int BlueFS::_replay(bool noop
)
556 dout(10) << __func__
<< (noop
? " NO-OP" : "") << dendl
;
557 ino_last
= 1; // by the log
564 log_file
= _get_file(1);
566 log_file
->fnode
= super
.log_fnode
;
567 dout(10) << __func__
<< " log_fnode " << super
.log_fnode
<< dendl
;
569 FileReader
*log_reader
= new FileReader(
570 log_file
, cct
->_conf
->bluefs_max_prefetch
,
574 assert((log_reader
->buf
.pos
& ~super
.block_mask()) == 0);
575 uint64_t pos
= log_reader
->buf
.pos
;
576 uint64_t read_pos
= pos
;
579 int r
= _read(log_reader
, &log_reader
->buf
, read_pos
, super
.block_size
,
581 assert(r
== (int)super
.block_size
);
588 bufferlist::iterator p
= bl
.begin();
596 if (len
+ 6 > bl
.length()) {
597 more
= ROUND_UP_TO(len
+ 6 - bl
.length(), super
.block_size
);
600 if (uuid
!= super
.uuid
) {
601 dout(10) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
602 << ": stop: uuid " << uuid
<< " != super.uuid " << super
.uuid
606 if (seq
!= log_seq
+ 1) {
607 dout(10) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
608 << ": stop: seq " << seq
<< " != expected " << log_seq
+ 1
613 dout(20) << __func__
<< " need 0x" << std::hex
<< more
<< std::dec
614 << " more bytes" << dendl
;
616 int r
= _read(log_reader
, &log_reader
->buf
, read_pos
, more
, &t
, NULL
);
618 dout(10) << __func__
<< " 0x" << std::hex
<< pos
619 << ": stop: len is 0x" << bl
.length() + more
<< std::dec
620 << ", which is past eof" << dendl
;
623 assert(r
== (int)more
);
627 bluefs_transaction_t t
;
629 bufferlist::iterator p
= bl
.begin();
632 catch (buffer::error
& e
) {
633 dout(10) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
634 << ": stop: failed to decode: " << e
.what()
639 assert(seq
== t
.seq
);
640 dout(10) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
641 << ": " << t
<< dendl
;
643 bufferlist::iterator p
= t
.op_bl
.begin();
649 case bluefs_transaction_t::OP_INIT
:
650 dout(20) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
651 << ": op_init" << dendl
;
655 case bluefs_transaction_t::OP_JUMP
:
659 ::decode(next_seq
, p
);
661 dout(20) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
662 << ": op_jump seq " << next_seq
663 << " offset 0x" << std::hex
<< offset
<< std::dec
<< dendl
;
664 assert(next_seq
>= log_seq
);
665 log_seq
= next_seq
- 1; // we will increment it below
666 uint64_t skip
= offset
- read_pos
;
669 int r
= _read(log_reader
, &log_reader
->buf
, read_pos
, skip
, &junk
,
671 if (r
!= (int)skip
) {
672 dout(10) << __func__
<< " 0x" << std::hex
<< read_pos
673 << ": stop: failed to skip to " << offset
674 << std::dec
<< dendl
;
675 assert(0 == "problem with op_jump");
681 case bluefs_transaction_t::OP_JUMP_SEQ
:
684 ::decode(next_seq
, p
);
685 dout(20) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
686 << ": op_jump_seq " << next_seq
<< dendl
;
687 assert(next_seq
>= log_seq
);
688 log_seq
= next_seq
- 1; // we will increment it below
692 case bluefs_transaction_t::OP_ALLOC_ADD
:
695 uint64_t offset
, length
;
699 dout(20) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
700 << ": op_alloc_add " << " " << (int)id
701 << ":0x" << std::hex
<< offset
<< "~" << length
<< std::dec
704 block_all
[id
].insert(offset
, length
);
705 block_total
[id
] += length
;
706 alloc
[id
]->init_add_free(offset
, length
);
711 case bluefs_transaction_t::OP_ALLOC_RM
:
714 uint64_t offset
, length
;
718 dout(20) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
719 << ": op_alloc_rm " << " " << (int)id
720 << ":0x" << std::hex
<< offset
<< "~" << length
<< std::dec
723 block_all
[id
].erase(offset
, length
);
724 block_total
[id
] -= length
;
725 alloc
[id
]->init_rm_free(offset
, length
);
730 case bluefs_transaction_t::OP_DIR_LINK
:
732 string dirname
, filename
;
734 ::decode(dirname
, p
);
735 ::decode(filename
, p
);
737 dout(20) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
738 << ": op_dir_link " << " " << dirname
<< "/" << filename
742 FileRef file
= _get_file(ino
);
743 assert(file
->fnode
.ino
);
744 map
<string
,DirRef
>::iterator q
= dir_map
.find(dirname
);
745 assert(q
!= dir_map
.end());
746 map
<string
,FileRef
>::iterator r
= q
->second
->file_map
.find(filename
);
747 assert(r
== q
->second
->file_map
.end());
748 q
->second
->file_map
[filename
] = file
;
754 case bluefs_transaction_t::OP_DIR_UNLINK
:
756 string dirname
, filename
;
757 ::decode(dirname
, p
);
758 ::decode(filename
, p
);
759 dout(20) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
760 << ": op_dir_unlink " << " " << dirname
<< "/" << filename
763 map
<string
,DirRef
>::iterator q
= dir_map
.find(dirname
);
764 assert(q
!= dir_map
.end());
765 map
<string
,FileRef
>::iterator r
= q
->second
->file_map
.find(filename
);
766 assert(r
!= q
->second
->file_map
.end());
767 assert(r
->second
->refs
> 0);
769 q
->second
->file_map
.erase(r
);
774 case bluefs_transaction_t::OP_DIR_CREATE
:
777 ::decode(dirname
, p
);
778 dout(20) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
779 << ": op_dir_create " << dirname
<< dendl
;
781 map
<string
,DirRef
>::iterator q
= dir_map
.find(dirname
);
782 assert(q
== dir_map
.end());
783 dir_map
[dirname
] = new Dir
;
788 case bluefs_transaction_t::OP_DIR_REMOVE
:
791 ::decode(dirname
, p
);
792 dout(20) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
793 << ": op_dir_remove " << dirname
<< dendl
;
795 map
<string
,DirRef
>::iterator q
= dir_map
.find(dirname
);
796 assert(q
!= dir_map
.end());
797 assert(q
->second
->file_map
.empty());
803 case bluefs_transaction_t::OP_FILE_UPDATE
:
805 bluefs_fnode_t fnode
;
807 dout(20) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
808 << ": op_file_update " << " " << fnode
<< dendl
;
810 FileRef f
= _get_file(fnode
.ino
);
812 if (fnode
.ino
> ino_last
) {
813 ino_last
= fnode
.ino
;
819 case bluefs_transaction_t::OP_FILE_REMOVE
:
823 dout(20) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
824 << ": op_file_remove " << ino
<< dendl
;
826 auto p
= file_map
.find(ino
);
827 assert(p
!= file_map
.end());
834 derr
<< __func__
<< " 0x" << std::hex
<< pos
<< std::dec
835 << ": stop: unrecognized op " << (int)op
<< dendl
;
842 // we successfully replayed the transaction; bump the seq and log size
844 log_file
->fnode
.size
= log_reader
->buf
.pos
;
847 dout(10) << __func__
<< " log file size was 0x"
848 << std::hex
<< log_file
->fnode
.size
<< std::dec
<< dendl
;
852 // verify file link counts are all >0
853 for (auto& p
: file_map
) {
854 if (p
.second
->refs
== 0 &&
855 p
.second
->fnode
.ino
> 1) {
856 derr
<< __func__
<< " file with link count 0: " << p
.second
->fnode
863 dout(10) << __func__
<< " done" << dendl
;
867 BlueFS::FileRef
BlueFS::_get_file(uint64_t ino
)
869 auto p
= file_map
.find(ino
);
870 if (p
== file_map
.end()) {
871 FileRef f
= new File
;
873 dout(30) << __func__
<< " ino " << ino
<< " = " << f
874 << " (new)" << dendl
;
877 dout(30) << __func__
<< " ino " << ino
<< " = " << p
->second
<< dendl
;
882 void BlueFS::_drop_link(FileRef file
)
884 dout(20) << __func__
<< " had refs " << file
->refs
885 << " on " << file
->fnode
<< dendl
;
886 assert(file
->refs
> 0);
888 if (file
->refs
== 0) {
889 dout(20) << __func__
<< " destroying " << file
->fnode
<< dendl
;
890 assert(file
->num_reading
.load() == 0);
891 log_t
.op_file_remove(file
->fnode
.ino
);
892 for (auto& r
: file
->fnode
.extents
) {
893 pending_release
[r
.bdev
].insert(r
.offset
, r
.length
);
895 file_map
.erase(file
->fnode
.ino
);
896 file
->deleted
= true;
898 if (file
->dirty_seq
) {
899 assert(file
->dirty_seq
> log_seq_stable
);
900 assert(dirty_files
.count(file
->dirty_seq
));
901 auto it
= dirty_files
[file
->dirty_seq
].iterator_to(*file
);
902 dirty_files
[file
->dirty_seq
].erase(it
);
908 int BlueFS::_read_random(
909 FileReader
*h
, ///< [in] read from here
910 uint64_t off
, ///< [in] offset
911 size_t len
, ///< [in] this many bytes
912 char *out
) ///< [out] optional: or copy it here
914 dout(10) << __func__
<< " h " << h
915 << " 0x" << std::hex
<< off
<< "~" << len
<< std::dec
916 << " from " << h
->file
->fnode
<< dendl
;
918 ++h
->file
->num_reading
;
920 if (!h
->ignore_eof
&&
921 off
+ len
> h
->file
->fnode
.size
) {
922 if (off
> h
->file
->fnode
.size
)
925 len
= h
->file
->fnode
.size
- off
;
926 dout(20) << __func__
<< " reaching (or past) eof, len clipped to 0x"
927 << std::hex
<< len
<< std::dec
<< dendl
;
933 auto p
= h
->file
->fnode
.seek(off
, &x_off
);
934 uint64_t l
= MIN(p
->length
- x_off
, len
);
935 dout(20) << __func__
<< " read buffered 0x"
936 << std::hex
<< x_off
<< "~" << l
<< std::dec
937 << " of " << *p
<< dendl
;
938 int r
= bdev
[p
->bdev
]->read_random(p
->offset
+ x_off
, l
, out
,
939 cct
->_conf
->bluefs_buffered_io
);
947 dout(20) << __func__
<< " got " << ret
<< dendl
;
948 --h
->file
->num_reading
;
953 FileReader
*h
, ///< [in] read from here
954 FileReaderBuffer
*buf
, ///< [in] reader state
955 uint64_t off
, ///< [in] offset
956 size_t len
, ///< [in] this many bytes
957 bufferlist
*outbl
, ///< [out] optional: reference the result here
958 char *out
) ///< [out] optional: or copy it here
960 dout(10) << __func__
<< " h " << h
961 << " 0x" << std::hex
<< off
<< "~" << len
<< std::dec
962 << " from " << h
->file
->fnode
<< dendl
;
964 ++h
->file
->num_reading
;
966 if (!h
->ignore_eof
&&
967 off
+ len
> h
->file
->fnode
.size
) {
968 if (off
> h
->file
->fnode
.size
)
971 len
= h
->file
->fnode
.size
- off
;
972 dout(20) << __func__
<< " reaching (or past) eof, len clipped to 0x"
973 << std::hex
<< len
<< std::dec
<< dendl
;
981 if (off
< buf
->bl_off
|| off
>= buf
->get_buf_end()) {
983 buf
->bl_off
= off
& super
.block_mask();
985 auto p
= h
->file
->fnode
.seek(buf
->bl_off
, &x_off
);
986 uint64_t want
= ROUND_UP_TO(len
+ (off
& ~super
.block_mask()),
988 want
= MAX(want
, buf
->max_prefetch
);
989 uint64_t l
= MIN(p
->length
- x_off
, want
);
990 uint64_t eof_offset
= ROUND_UP_TO(h
->file
->fnode
.size
, super
.block_size
);
991 if (!h
->ignore_eof
&&
992 buf
->bl_off
+ l
> eof_offset
) {
993 l
= eof_offset
- buf
->bl_off
;
995 dout(20) << __func__
<< " fetching 0x"
996 << std::hex
<< x_off
<< "~" << l
<< std::dec
997 << " of " << *p
<< dendl
;
998 int r
= bdev
[p
->bdev
]->read(p
->offset
+ x_off
, l
, &buf
->bl
, ioc
[p
->bdev
],
999 cct
->_conf
->bluefs_buffered_io
);
1002 left
= buf
->get_buf_remaining(off
);
1003 dout(20) << __func__
<< " left 0x" << std::hex
<< left
1004 << " len 0x" << len
<< std::dec
<< dendl
;
1006 int r
= MIN(len
, left
);
1009 t
.substr_of(buf
->bl
, off
- buf
->bl_off
, r
);
1010 outbl
->claim_append(t
);
1013 // NOTE: h->bl is normally a contiguous buffer so c_str() is free.
1014 memcpy(out
, buf
->bl
.c_str() + off
- buf
->bl_off
, r
);
1018 dout(30) << __func__
<< " result chunk (0x"
1019 << std::hex
<< r
<< std::dec
<< " bytes):\n";
1021 t
.substr_of(buf
->bl
, off
- buf
->bl_off
, r
);
1031 dout(20) << __func__
<< " got " << ret
<< dendl
;
1032 assert(!outbl
|| (int)outbl
->length() == ret
);
1033 --h
->file
->num_reading
;
1037 void BlueFS::_invalidate_cache(FileRef f
, uint64_t offset
, uint64_t length
)
1039 dout(10) << __func__
<< " file " << f
->fnode
1040 << " 0x" << std::hex
<< offset
<< "~" << length
<< std::dec
1042 if (offset
& ~super
.block_mask()) {
1043 offset
&= super
.block_mask();
1044 length
= ROUND_UP_TO(length
, super
.block_size
);
1047 auto p
= f
->fnode
.seek(offset
, &x_off
);
1048 while (length
> 0 && p
!= f
->fnode
.extents
.end()) {
1049 uint64_t x_len
= MIN(p
->length
- x_off
, length
);
1050 bdev
[p
->bdev
]->invalidate_cache(p
->offset
+ x_off
, x_len
);
1051 dout(20) << __func__
<< " 0x" << std::hex
<< x_off
<< "~" << x_len
1052 << std:: dec
<< " of " << *p
<< dendl
;
1058 uint64_t BlueFS::_estimate_log_size()
1060 int avg_dir_size
= 40; // fixme
1061 int avg_file_size
= 12;
1062 uint64_t size
= 4096 * 2;
1063 size
+= file_map
.size() * (1 + sizeof(bluefs_fnode_t
));
1064 for (auto& p
: block_all
)
1065 size
+= p
.num_intervals() * (1 + 1 + sizeof(uint64_t) * 2);
1066 size
+= dir_map
.size() + (1 + avg_dir_size
);
1067 size
+= file_map
.size() * (1 + avg_dir_size
+ avg_file_size
);
1068 return ROUND_UP_TO(size
, super
.block_size
);
1071 void BlueFS::compact_log()
1073 std::unique_lock
<std::mutex
> l(lock
);
1074 if (cct
->_conf
->bluefs_compact_log_sync
) {
1075 _compact_log_sync();
1077 _compact_log_async(l
);
1081 bool BlueFS::_should_compact_log()
1083 uint64_t current
= log_writer
->file
->fnode
.size
;
1084 uint64_t expected
= _estimate_log_size();
1085 float ratio
= (float)current
/ (float)expected
;
1086 dout(10) << __func__
<< " current 0x" << std::hex
<< current
1087 << " expected " << expected
<< std::dec
1088 << " ratio " << ratio
1089 << (new_log
? " (async compaction in progress)" : "")
1092 current
< cct
->_conf
->bluefs_log_compact_min_size
||
1093 ratio
< cct
->_conf
->bluefs_log_compact_min_ratio
) {
1099 void BlueFS::_compact_log_dump_metadata(bluefs_transaction_t
*t
)
1102 t
->uuid
= super
.uuid
;
1103 dout(20) << __func__
<< " op_init" << dendl
;
1106 for (unsigned bdev
= 0; bdev
< MAX_BDEV
; ++bdev
) {
1107 interval_set
<uint64_t>& p
= block_all
[bdev
];
1108 for (interval_set
<uint64_t>::iterator q
= p
.begin(); q
!= p
.end(); ++q
) {
1109 dout(20) << __func__
<< " op_alloc_add " << bdev
<< " 0x"
1110 << std::hex
<< q
.get_start() << "~" << q
.get_len() << std::dec
1112 t
->op_alloc_add(bdev
, q
.get_start(), q
.get_len());
1115 for (auto& p
: file_map
) {
1118 dout(20) << __func__
<< " op_file_update " << p
.second
->fnode
<< dendl
;
1119 assert(p
.first
> 1);
1120 t
->op_file_update(p
.second
->fnode
);
1122 for (auto& p
: dir_map
) {
1123 dout(20) << __func__
<< " op_dir_create " << p
.first
<< dendl
;
1124 t
->op_dir_create(p
.first
);
1125 for (auto& q
: p
.second
->file_map
) {
1126 dout(20) << __func__
<< " op_dir_link " << p
.first
<< "/" << q
.first
1127 << " to " << q
.second
->fnode
.ino
<< dendl
;
1128 t
->op_dir_link(p
.first
, q
.first
, q
.second
->fnode
.ino
);
1133 void BlueFS::_compact_log_sync()
1135 dout(10) << __func__
<< dendl
;
1136 File
*log_file
= log_writer
->file
.get();
1138 // clear out log (be careful who calls us!!!)
1141 bluefs_transaction_t t
;
1142 _compact_log_dump_metadata(&t
);
1144 dout(20) << __func__
<< " op_jump_seq " << log_seq
<< dendl
;
1145 t
.op_jump_seq(log_seq
);
1151 uint64_t need
= bl
.length() + cct
->_conf
->bluefs_max_log_runway
;
1152 dout(20) << __func__
<< " need " << need
<< dendl
;
1154 mempool::bluefs::vector
<bluefs_extent_t
> old_extents
;
1155 uint64_t old_allocated
= 0;
1156 log_file
->fnode
.swap_extents(old_extents
, old_allocated
);
1157 while (log_file
->fnode
.get_allocated() < need
) {
1158 int r
= _allocate(log_file
->fnode
.prefer_bdev
,
1159 need
- log_file
->fnode
.get_allocated(),
1164 _close_writer(log_writer
);
1166 log_file
->fnode
.size
= bl
.length();
1167 log_writer
= _create_writer(log_file
);
1168 log_writer
->append(bl
);
1169 int r
= _flush(log_writer
, true);
1171 wait_for_aio(log_writer
);
1173 list
<aio_t
> completed_ios
;
1174 _claim_completed_aios(log_writer
, &completed_ios
);
1176 completed_ios
.clear();
1178 dout(10) << __func__
<< " writing super" << dendl
;
1179 super
.log_fnode
= log_file
->fnode
;
1184 dout(10) << __func__
<< " release old log extents " << old_extents
<< dendl
;
1185 for (auto& r
: old_extents
) {
1186 pending_release
[r
.bdev
].insert(r
.offset
, r
.length
);
1189 logger
->inc(l_bluefs_log_compactions
);
1193 * 1. Allocate a new extent to continue the log, and then log an event
1194 * that jumps the log write position to the new extent. At this point, the
1195 * old extent(s) won't be written to, and reflect everything to compact.
1196 * New events will be written to the new region that we'll keep.
1198 * 2. While still holding the lock, encode a bufferlist that dumps all of the
1199 * in-memory fnodes and names. This will become the new beginning of the
1200 * log. The last event will jump to the log continuation extent from #1.
1202 * 3. Queue a write to a new extent for the new beginning of the log.
1204 * 4. Drop lock and wait
1206 * 5. Retake the lock.
1208 * 6. Update the log_fnode to splice in the new beginning.
1210 * 7. Write the new superblock.
1212 * 8. Release the old log space. Clean up.
1214 void BlueFS::_compact_log_async(std::unique_lock
<std::mutex
>& l
)
1216 dout(10) << __func__
<< dendl
;
1217 File
*log_file
= log_writer
->file
.get();
1219 assert(!new_log_writer
);
1221 // create a new log [writer] so that we know compaction is in progress
1222 // (see _should_compact_log)
1224 new_log
->fnode
.ino
= 0; // so that _flush_range won't try to log the fnode
1226 // 0. wait for any racing flushes to complete. (We do not want to block
1227 // in _flush_sync_log with jump_to set or else a racing thread might flush
1228 // our entries and our jump_to update won't be correct.)
1229 while (log_flushing
) {
1230 dout(10) << __func__
<< " log is currently flushing, waiting" << dendl
;
1234 // 1. allocate new log space and jump to it.
1235 old_log_jump_to
= log_file
->fnode
.get_allocated();
1236 uint64_t need
= old_log_jump_to
+ cct
->_conf
->bluefs_max_log_runway
;
1237 dout(10) << __func__
<< " old_log_jump_to 0x" << std::hex
<< old_log_jump_to
1238 << " need 0x" << need
<< std::dec
<< dendl
;
1239 while (log_file
->fnode
.get_allocated() < need
) {
1240 int r
= _allocate(log_file
->fnode
.prefer_bdev
,
1241 cct
->_conf
->bluefs_max_log_runway
,
1245 dout(10) << __func__
<< " log extents " << log_file
->fnode
.extents
<< dendl
;
1247 // update the log file change and log a jump to the offset where we want to
1248 // write the new entries
1249 log_t
.op_file_update(log_file
->fnode
);
1250 log_t
.op_jump(log_seq
, old_log_jump_to
);
1252 flush_bdev(); // FIXME?
1254 _flush_and_sync_log(l
, 0, old_log_jump_to
);
1256 // 2. prepare compacted log
1257 bluefs_transaction_t t
;
1258 // avoid recording the same entries twice, in both log_t and _compact_log_dump_metadata.
1260 _compact_log_dump_metadata(&t
);
1262 // conservative estimate for final encoded size
1263 new_log_jump_to
= ROUND_UP_TO(t
.op_bl
.length() + super
.block_size
* 2,
1264 cct
->_conf
->bluefs_alloc_size
);
1265 t
.op_jump(log_seq
, new_log_jump_to
);
1271 dout(10) << __func__
<< " new_log_jump_to 0x" << std::hex
<< new_log_jump_to
1272 << std::dec
<< dendl
;
1275 int r
= _allocate(BlueFS::BDEV_DB
, new_log_jump_to
,
1278 new_log_writer
= _create_writer(new_log
);
1279 new_log_writer
->append(bl
);
1282 r
= _flush(new_log_writer
, true);
1287 dout(10) << __func__
<< " waiting for compacted log to sync" << dendl
;
1288 wait_for_aio(new_log_writer
);
1290 list
<aio_t
> completed_ios
;
1291 _claim_completed_aios(new_log_writer
, &completed_ios
);
1293 completed_ios
.clear();
1298 // 6. update our log fnode
1299 // discard first old_log_jump_to extents
1300 dout(10) << __func__
<< " remove 0x" << std::hex
<< old_log_jump_to
<< std::dec
1301 << " of " << log_file
->fnode
.extents
<< dendl
;
1302 uint64_t discarded
= 0;
1303 mempool::bluefs::vector
<bluefs_extent_t
> old_extents
;
1304 while (discarded
< old_log_jump_to
) {
1305 assert(!log_file
->fnode
.extents
.empty());
1306 bluefs_extent_t
& e
= log_file
->fnode
.extents
.front();
1307 bluefs_extent_t temp
= e
;
1308 if (discarded
+ e
.length
<= old_log_jump_to
) {
1309 dout(10) << __func__
<< " remove old log extent " << e
<< dendl
;
1310 discarded
+= e
.length
;
1311 log_file
->fnode
.pop_front_extent();
1313 dout(10) << __func__
<< " remove front of old log extent " << e
<< dendl
;
1314 uint64_t drop
= old_log_jump_to
- discarded
;
1319 dout(10) << __func__
<< " kept " << e
<< " removed " << temp
<< dendl
;
1321 old_extents
.push_back(temp
);
1323 auto from
= log_file
->fnode
.extents
.begin();
1324 auto to
= log_file
->fnode
.extents
.end();
1325 while (from
!= to
) {
1326 new_log
->fnode
.append_extent(*from
);
1330 // clear the extents from old log file, they are added to new log
1331 log_file
->fnode
.clear_extents();
1332 // swap the log files. New log file is the log file now.
1333 new_log
->fnode
.swap_extents(log_file
->fnode
);
1335 log_writer
->pos
= log_writer
->file
->fnode
.size
=
1336 log_writer
->pos
- old_log_jump_to
+ new_log_jump_to
;
1338 // 7. write the super block to reflect the changes
1339 dout(10) << __func__
<< " writing super" << dendl
;
1340 super
.log_fnode
= log_file
->fnode
;
1348 // 8. release old space
1349 dout(10) << __func__
<< " release old log extents " << old_extents
<< dendl
;
1350 for (auto& r
: old_extents
) {
1351 pending_release
[r
.bdev
].insert(r
.offset
, r
.length
);
1354 // delete the new log, remove from the dirty files list
1355 _close_writer(new_log_writer
);
1356 if (new_log
->dirty_seq
) {
1357 assert(dirty_files
.count(new_log
->dirty_seq
));
1358 auto it
= dirty_files
[new_log
->dirty_seq
].iterator_to(*new_log
);
1359 dirty_files
[new_log
->dirty_seq
].erase(it
);
1361 new_log_writer
= nullptr;
1363 log_cond
.notify_all();
1365 dout(10) << __func__
<< " log extents " << log_file
->fnode
.extents
<< dendl
;
1366 logger
->inc(l_bluefs_log_compactions
);
1369 void BlueFS::_pad_bl(bufferlist
& bl
)
1371 uint64_t partial
= bl
.length() % super
.block_size
;
1373 dout(10) << __func__
<< " padding with 0x" << std::hex
1374 << super
.block_size
- partial
<< " zeros" << std::dec
<< dendl
;
1375 bl
.append_zero(super
.block_size
- partial
);
1379 void BlueFS::flush_log()
1381 std::unique_lock
<std::mutex
> l(lock
);
1383 _flush_and_sync_log(l
);
1386 int BlueFS::_flush_and_sync_log(std::unique_lock
<std::mutex
>& l
,
1390 while (log_flushing
) {
1391 dout(10) << __func__
<< " want_seq " << want_seq
1392 << " log is currently flushing, waiting" << dendl
;
1396 if (want_seq
&& want_seq
<= log_seq_stable
) {
1397 dout(10) << __func__
<< " want_seq " << want_seq
<< " <= log_seq_stable "
1398 << log_seq_stable
<< ", done" << dendl
;
1402 if (log_t
.empty() && dirty_files
.empty()) {
1403 dout(10) << __func__
<< " want_seq " << want_seq
1404 << " " << log_t
<< " not dirty, dirty_files empty, no-op" << dendl
;
1409 uint64_t seq
= log_t
.seq
= ++log_seq
;
1410 assert(want_seq
== 0 || want_seq
<= seq
);
1411 log_t
.uuid
= super
.uuid
;
1414 auto lsi
= dirty_files
.find(seq
);
1415 if (lsi
!= dirty_files
.end()) {
1416 dout(20) << __func__
<< " " << lsi
->second
.size() << " dirty_files" << dendl
;
1417 for (auto &f
: lsi
->second
) {
1418 dout(20) << __func__
<< " op_file_update " << f
.fnode
<< dendl
;
1419 log_t
.op_file_update(f
.fnode
);
1423 dout(10) << __func__
<< " " << log_t
<< dendl
;
1424 assert(!log_t
.empty());
1426 // allocate some more space (before we run out)?
1427 int64_t runway
= log_writer
->file
->fnode
.get_allocated() -
1428 log_writer
->get_effective_write_pos();
1429 if (runway
< (int64_t)cct
->_conf
->bluefs_min_log_runway
) {
1430 dout(10) << __func__
<< " allocating more log runway (0x"
1431 << std::hex
<< runway
<< std::dec
<< " remaining)" << dendl
;
1432 while (new_log_writer
) {
1433 dout(10) << __func__
<< " waiting for async compaction" << dendl
;
1436 int r
= _allocate(log_writer
->file
->fnode
.prefer_bdev
,
1437 cct
->_conf
->bluefs_max_log_runway
,
1438 &log_writer
->file
->fnode
);
1440 log_t
.op_file_update(log_writer
->file
->fnode
);
1444 ::encode(log_t
, bl
);
1446 // pad to block boundary
1448 logger
->inc(l_bluefs_logged_bytes
, bl
.length());
1450 log_writer
->append(bl
);
1453 log_t
.seq
= 0; // just so debug output is less confusing
1454 log_flushing
= true;
1456 int r
= _flush(log_writer
, true);
1460 dout(10) << __func__
<< " jumping log offset from 0x" << std::hex
1461 << log_writer
->pos
<< " -> 0x" << jump_to
<< std::dec
<< dendl
;
1462 log_writer
->pos
= jump_to
;
1463 log_writer
->file
->fnode
.size
= jump_to
;
1466 _flush_bdev_safely(log_writer
);
1468 log_flushing
= false;
1469 log_cond
.notify_all();
1471 // clean dirty files
1472 if (seq
> log_seq_stable
) {
1473 log_seq_stable
= seq
;
1474 dout(20) << __func__
<< " log_seq_stable " << log_seq_stable
<< dendl
;
1476 auto p
= dirty_files
.begin();
1477 while (p
!= dirty_files
.end()) {
1478 if (p
->first
> log_seq_stable
) {
1479 dout(20) << __func__
<< " done cleaning up dirty files" << dendl
;
1483 auto l
= p
->second
.begin();
1484 while (l
!= p
->second
.end()) {
1486 assert(file
->dirty_seq
> 0);
1487 assert(file
->dirty_seq
<= log_seq_stable
);
1488 dout(20) << __func__
<< " cleaned file " << file
->fnode
<< dendl
;
1489 file
->dirty_seq
= 0;
1490 p
->second
.erase(l
++);
1493 assert(p
->second
.empty());
1494 dirty_files
.erase(p
++);
1497 dout(20) << __func__
<< " log_seq_stable " << log_seq_stable
1498 << " already >= out seq " << seq
1499 << ", we lost a race against another log flush, done" << dendl
;
1501 _update_logger_stats();
1506 int BlueFS::_flush_range(FileWriter
*h
, uint64_t offset
, uint64_t length
)
1508 dout(10) << __func__
<< " " << h
<< " pos 0x" << std::hex
<< h
->pos
1509 << " 0x" << offset
<< "~" << length
<< std::dec
1510 << " to " << h
->file
->fnode
<< dendl
;
1511 assert(!h
->file
->deleted
);
1512 assert(h
->file
->num_readers
.load() == 0);
1514 h
->buffer_appender
.flush();
1517 if (h
->file
->fnode
.ino
== 1)
1520 buffered
= cct
->_conf
->bluefs_buffered_io
;
1522 if (offset
+ length
<= h
->pos
)
1524 if (offset
< h
->pos
) {
1525 length
-= h
->pos
- offset
;
1527 dout(10) << " still need 0x"
1528 << std::hex
<< offset
<< "~" << length
<< std::dec
1531 assert(offset
<= h
->file
->fnode
.size
);
1533 uint64_t allocated
= h
->file
->fnode
.get_allocated();
1535 // do not bother to dirty the file if we are overwriting
1536 // previously allocated extents.
1537 bool must_dirty
= false;
1538 if (allocated
< offset
+ length
) {
1539 // we should never run out of log space here; see the min runway check
1540 // in _flush_and_sync_log.
1541 assert(h
->file
->fnode
.ino
!= 1);
1542 int r
= _allocate(h
->file
->fnode
.prefer_bdev
,
1543 offset
+ length
- allocated
,
1546 derr
<< __func__
<< " allocated: 0x" << std::hex
<< allocated
1547 << " offset: 0x" << offset
<< " length: 0x" << length
<< std::dec
1549 assert(0 == "bluefs enospc");
1552 if (cct
->_conf
->bluefs_preextend_wal_files
&&
1553 h
->writer_type
== WRITER_WAL
) {
1554 // NOTE: this *requires* that rocksdb also has log recycling
1555 // enabled and is therefore doing robust CRCs on the log
1556 // records. otherwise, we will fail to replay the rocksdb log
1557 // properly due to garbage on the device.
1558 h
->file
->fnode
.size
= h
->file
->fnode
.get_allocated();
1559 dout(10) << __func__
<< " extending WAL size to 0x" << std::hex
1560 << h
->file
->fnode
.size
<< std::dec
<< " to include allocated"
1565 if (h
->file
->fnode
.size
< offset
+ length
) {
1566 h
->file
->fnode
.size
= offset
+ length
;
1567 if (h
->file
->fnode
.ino
> 1) {
1568 // we do not need to dirty the log file (or its compacting
1569 // replacement) when the file size changes because replay is
1570 // smart enough to discover it on its own.
1575 h
->file
->fnode
.mtime
= ceph_clock_now();
1576 assert(h
->file
->fnode
.ino
>= 1);
1577 if (h
->file
->dirty_seq
== 0) {
1578 h
->file
->dirty_seq
= log_seq
+ 1;
1579 dirty_files
[h
->file
->dirty_seq
].push_back(*h
->file
);
1580 dout(20) << __func__
<< " dirty_seq = " << log_seq
+ 1
1581 << " (was clean)" << dendl
;
1583 if (h
->file
->dirty_seq
!= log_seq
+ 1) {
1584 // need re-dirty, erase from list first
1585 assert(dirty_files
.count(h
->file
->dirty_seq
));
1586 auto it
= dirty_files
[h
->file
->dirty_seq
].iterator_to(*h
->file
);
1587 dirty_files
[h
->file
->dirty_seq
].erase(it
);
1588 h
->file
->dirty_seq
= log_seq
+ 1;
1589 dirty_files
[h
->file
->dirty_seq
].push_back(*h
->file
);
1590 dout(20) << __func__
<< " dirty_seq = " << log_seq
+ 1
1591 << " (was " << h
->file
->dirty_seq
<< ")" << dendl
;
1593 dout(20) << __func__
<< " dirty_seq = " << log_seq
+ 1
1594 << " (unchanged, do nothing) " << dendl
;
1598 dout(20) << __func__
<< " file now " << h
->file
->fnode
<< dendl
;
1601 auto p
= h
->file
->fnode
.seek(offset
, &x_off
);
1602 assert(p
!= h
->file
->fnode
.extents
.end());
1603 dout(20) << __func__
<< " in " << *p
<< " x_off 0x"
1604 << std::hex
<< x_off
<< std::dec
<< dendl
;
1606 unsigned partial
= x_off
& ~super
.block_mask();
1609 dout(20) << __func__
<< " using partial tail 0x"
1610 << std::hex
<< partial
<< std::dec
<< dendl
;
1611 assert(h
->tail_block
.length() == partial
);
1612 bl
.claim_append_piecewise(h
->tail_block
);
1616 dout(20) << __func__
<< " waiting for previous aio to complete" << dendl
;
1617 for (auto p
: h
->iocv
) {
1623 if (length
== partial
+ h
->buffer
.length()) {
1624 bl
.claim_append_piecewise(h
->buffer
);
1627 h
->buffer
.splice(0, length
, &t
);
1628 bl
.claim_append_piecewise(t
);
1629 t
.substr_of(h
->buffer
, length
, h
->buffer
.length() - length
);
1631 dout(20) << " leaving 0x" << std::hex
<< h
->buffer
.length() << std::dec
1632 << " unflushed" << dendl
;
1634 assert(bl
.length() == length
);
1636 switch (h
->writer_type
) {
1638 logger
->inc(l_bluefs_bytes_written_wal
, length
);
1641 logger
->inc(l_bluefs_bytes_written_sst
, length
);
1645 dout(30) << "dump:\n";
1649 h
->pos
= offset
+ length
;
1650 h
->tail_block
.clear();
1653 while (length
> 0) {
1654 uint64_t x_len
= MIN(p
->length
- x_off
, length
);
1656 t
.substr_of(bl
, bloff
, x_len
);
1657 unsigned tail
= x_len
& ~super
.block_mask();
1659 size_t zlen
= super
.block_size
- tail
;
1660 dout(20) << __func__
<< " caching tail of 0x"
1662 << " and padding block with 0x" << zlen
1663 << std::dec
<< dendl
;
1664 h
->tail_block
.substr_of(bl
, bl
.length() - tail
, tail
);
1665 if (h
->file
->fnode
.ino
> 1) {
1666 // we are using the page_aligned_appender, and can safely use
1667 // the tail of the raw buffer.
1668 const bufferptr
&last
= t
.back();
1669 if (last
.unused_tail_length() < zlen
) {
1670 derr
<< " wtf, last is " << last
<< " from " << t
<< dendl
;
1671 assert(last
.unused_tail_length() >= zlen
);
1674 z
.set_offset(last
.offset() + last
.length());
1677 t
.append(z
, 0, zlen
);
1679 t
.append_zero(zlen
);
1682 if (cct
->_conf
->bluefs_sync_write
) {
1683 bdev
[p
->bdev
]->write(p
->offset
+ x_off
, t
, buffered
);
1685 bdev
[p
->bdev
]->aio_write(p
->offset
+ x_off
, t
, h
->iocv
[p
->bdev
], buffered
);
1692 for (unsigned i
= 0; i
< MAX_BDEV
; ++i
) {
1695 if (h
->iocv
[i
]->has_pending_aios()) {
1696 bdev
[i
]->aio_submit(h
->iocv
[i
]);
1700 dout(20) << __func__
<< " h " << h
<< " pos now 0x"
1701 << std::hex
<< h
->pos
<< std::dec
<< dendl
;
1705 // we need to retire old completed aios so they don't stick around in
1706 // memory indefinitely (along with their bufferlist refs).
1707 void BlueFS::_claim_completed_aios(FileWriter
*h
, list
<aio_t
> *ls
)
1709 for (auto p
: h
->iocv
) {
1711 ls
->splice(ls
->end(), p
->running_aios
);
1714 dout(10) << __func__
<< " got " << ls
->size() << " aios" << dendl
;
1717 void BlueFS::wait_for_aio(FileWriter
*h
)
1719 // NOTE: this is safe to call without a lock, as long as our reference is
1721 dout(10) << __func__
<< " " << h
<< dendl
;
1722 utime_t start
= ceph_clock_now();
1723 for (auto p
: h
->iocv
) {
1728 utime_t end
= ceph_clock_now();
1729 utime_t dur
= end
- start
;
1730 dout(10) << __func__
<< " " << h
<< " done in " << dur
<< dendl
;
1733 int BlueFS::_flush(FileWriter
*h
, bool force
)
1735 h
->buffer_appender
.flush();
1736 uint64_t length
= h
->buffer
.length();
1737 uint64_t offset
= h
->pos
;
1739 length
< cct
->_conf
->bluefs_min_flush_size
) {
1740 dout(10) << __func__
<< " " << h
<< " ignoring, length " << length
1741 << " < min_flush_size " << cct
->_conf
->bluefs_min_flush_size
1746 dout(10) << __func__
<< " " << h
<< " no dirty data on "
1747 << h
->file
->fnode
<< dendl
;
1750 dout(10) << __func__
<< " " << h
<< " 0x"
1751 << std::hex
<< offset
<< "~" << length
<< std::dec
1752 << " to " << h
->file
->fnode
<< dendl
;
1753 assert(h
->pos
<= h
->file
->fnode
.size
);
1754 return _flush_range(h
, offset
, length
);
1757 int BlueFS::_truncate(FileWriter
*h
, uint64_t offset
)
1759 dout(10) << __func__
<< " 0x" << std::hex
<< offset
<< std::dec
1760 << " file " << h
->file
->fnode
<< dendl
;
1761 if (h
->file
->deleted
) {
1762 dout(10) << __func__
<< " deleted, no-op" << dendl
;
1766 // we never truncate internal log files
1767 assert(h
->file
->fnode
.ino
> 1);
1769 h
->buffer_appender
.flush();
1771 // truncate off unflushed data?
1772 if (h
->pos
< offset
&&
1773 h
->pos
+ h
->buffer
.length() > offset
) {
1775 dout(20) << __func__
<< " tossing out last " << offset
- h
->pos
1776 << " unflushed bytes" << dendl
;
1777 t
.substr_of(h
->buffer
, 0, offset
- h
->pos
);
1779 assert(0 == "actually this shouldn't happen");
1781 if (h
->buffer
.length()) {
1782 int r
= _flush(h
, true);
1786 if (offset
== h
->file
->fnode
.size
) {
1789 if (offset
> h
->file
->fnode
.size
) {
1790 assert(0 == "truncate up not supported");
1792 assert(h
->file
->fnode
.size
>= offset
);
1793 h
->file
->fnode
.size
= offset
;
1794 log_t
.op_file_update(h
->file
->fnode
);
1798 int BlueFS::_fsync(FileWriter
*h
, std::unique_lock
<std::mutex
>& l
)
1800 dout(10) << __func__
<< " " << h
<< " " << h
->file
->fnode
<< dendl
;
1801 int r
= _flush(h
, true);
1804 uint64_t old_dirty_seq
= h
->file
->dirty_seq
;
1806 _flush_bdev_safely(h
);
1808 if (old_dirty_seq
) {
1809 uint64_t s
= log_seq
;
1810 dout(20) << __func__
<< " file metadata was dirty (" << old_dirty_seq
1811 << ") on " << h
->file
->fnode
<< ", flushing log" << dendl
;
1812 _flush_and_sync_log(l
, old_dirty_seq
);
1813 assert(h
->file
->dirty_seq
== 0 || // cleaned
1814 h
->file
->dirty_seq
> s
); // or redirtied by someone else
1819 void BlueFS::_flush_bdev_safely(FileWriter
*h
)
1821 if (!cct
->_conf
->bluefs_sync_write
) {
1822 list
<aio_t
> completed_ios
;
1823 _claim_completed_aios(h
, &completed_ios
);
1826 completed_ios
.clear();
1836 void BlueFS::flush_bdev()
1838 // NOTE: this is safe to call without a lock.
1839 dout(20) << __func__
<< dendl
;
1840 for (auto p
: bdev
) {
1846 int BlueFS::_allocate(uint8_t id
, uint64_t len
,
1847 bluefs_fnode_t
* node
)
1849 dout(10) << __func__
<< " len 0x" << std::hex
<< len
<< std::dec
1850 << " from " << (int)id
<< dendl
;
1851 assert(id
< alloc
.size());
1852 uint64_t min_alloc_size
= cct
->_conf
->bluefs_alloc_size
;
1854 uint64_t left
= ROUND_UP_TO(len
, min_alloc_size
);
1856 int64_t alloc_len
= 0;
1857 AllocExtentVector extents
;
1860 r
= alloc
[id
]->reserve(left
);
1865 if (!node
->extents
.empty() && node
->extents
.back().bdev
== id
) {
1866 hint
= node
->extents
.back().end();
1868 extents
.reserve(4); // 4 should be (more than) enough for most allocations
1869 alloc_len
= alloc
[id
]->allocate(left
, min_alloc_size
, hint
, &extents
);
1871 if (r
< 0 || (alloc_len
< (int64_t)left
)) {
1873 alloc
[id
]->unreserve(left
- alloc_len
);
1874 for (auto& p
: extents
) {
1875 alloc
[id
]->release(p
.offset
, p
.length
);
1878 if (id
!= BDEV_SLOW
) {
1880 dout(1) << __func__
<< " failed to allocate 0x" << std::hex
<< left
1881 << " on bdev " << (int)id
1882 << ", free 0x" << alloc
[id
]->get_free()
1883 << "; fallback to bdev " << (int)id
+ 1
1884 << std::dec
<< dendl
;
1886 return _allocate(id
+ 1, len
, node
);
1889 derr
<< __func__
<< " failed to allocate 0x" << std::hex
<< left
1890 << " on bdev " << (int)id
1891 << ", free 0x" << alloc
[id
]->get_free() << std::dec
<< dendl
;
1893 derr
<< __func__
<< " failed to allocate 0x" << std::hex
<< left
1894 << " on bdev " << (int)id
<< ", dne" << std::dec
<< dendl
;
1900 for (auto& p
: extents
) {
1901 node
->append_extent(bluefs_extent_t(id
, p
.offset
, p
.length
));
1907 int BlueFS::_preallocate(FileRef f
, uint64_t off
, uint64_t len
)
1909 dout(10) << __func__
<< " file " << f
->fnode
<< " 0x"
1910 << std::hex
<< off
<< "~" << len
<< std::dec
<< dendl
;
1912 dout(10) << __func__
<< " deleted, no-op" << dendl
;
1915 assert(f
->fnode
.ino
> 1);
1916 uint64_t allocated
= f
->fnode
.get_allocated();
1917 if (off
+ len
> allocated
) {
1918 uint64_t want
= off
+ len
- allocated
;
1919 int r
= _allocate(f
->fnode
.prefer_bdev
, want
, &f
->fnode
);
1922 log_t
.op_file_update(f
->fnode
);
1927 void BlueFS::sync_metadata()
1929 std::unique_lock
<std::mutex
> l(lock
);
1930 if (log_t
.empty()) {
1931 dout(10) << __func__
<< " - no pending log events" << dendl
;
1934 dout(10) << __func__
<< dendl
;
1935 utime_t start
= ceph_clock_now();
1936 vector
<interval_set
<uint64_t>> to_release(pending_release
.size());
1937 to_release
.swap(pending_release
);
1938 flush_bdev(); // FIXME?
1939 _flush_and_sync_log(l
);
1940 for (unsigned i
= 0; i
< to_release
.size(); ++i
) {
1941 for (auto p
= to_release
[i
].begin(); p
!= to_release
[i
].end(); ++p
) {
1942 alloc
[i
]->release(p
.get_start(), p
.get_len());
1946 if (_should_compact_log()) {
1947 if (cct
->_conf
->bluefs_compact_log_sync
) {
1948 _compact_log_sync();
1950 _compact_log_async(l
);
1954 utime_t end
= ceph_clock_now();
1955 utime_t dur
= end
- start
;
1956 dout(10) << __func__
<< " done in " << dur
<< dendl
;
1959 int BlueFS::open_for_write(
1960 const string
& dirname
,
1961 const string
& filename
,
1965 std::lock_guard
<std::mutex
> l(lock
);
1966 dout(10) << __func__
<< " " << dirname
<< "/" << filename
<< dendl
;
1967 map
<string
,DirRef
>::iterator p
= dir_map
.find(dirname
);
1969 if (p
== dir_map
.end()) {
1970 // implicitly create the dir
1971 dout(20) << __func__
<< " dir " << dirname
1972 << " does not exist" << dendl
;
1979 bool create
= false;
1980 map
<string
,FileRef
>::iterator q
= dir
->file_map
.find(filename
);
1981 if (q
== dir
->file_map
.end()) {
1983 dout(20) << __func__
<< " dir " << dirname
<< " (" << dir
1984 << ") file " << filename
1985 << " does not exist" << dendl
;
1989 file
->fnode
.ino
= ++ino_last
;
1990 file_map
[ino_last
] = file
;
1991 dir
->file_map
[filename
] = file
;
1995 // overwrite existing file?
1998 dout(20) << __func__
<< " dir " << dirname
<< " (" << dir
1999 << ") file " << filename
2000 << " already exists, overwrite in place" << dendl
;
2002 dout(20) << __func__
<< " dir " << dirname
<< " (" << dir
2003 << ") file " << filename
2004 << " already exists, truncate + overwrite" << dendl
;
2005 file
->fnode
.size
= 0;
2006 for (auto& p
: file
->fnode
.extents
) {
2007 pending_release
[p
.bdev
].insert(p
.offset
, p
.length
);
2010 file
->fnode
.clear_extents();
2013 assert(file
->fnode
.ino
> 1);
2015 file
->fnode
.mtime
= ceph_clock_now();
2016 file
->fnode
.prefer_bdev
= BlueFS::BDEV_DB
;
2017 if (dirname
.length() > 5) {
2018 // the "db.slow" and "db.wal" directory names are hard-coded to
2019 // match up with bluestore. the slow device is always the second
2020 // one (when a dedicated block.db device is present and used at
2021 // bdev 0). the wal device is always last.
2022 if (boost::algorithm::ends_with(dirname
, ".slow")) {
2023 file
->fnode
.prefer_bdev
= BlueFS::BDEV_SLOW
;
2024 } else if (boost::algorithm::ends_with(dirname
, ".wal")) {
2025 file
->fnode
.prefer_bdev
= BlueFS::BDEV_WAL
;
2028 dout(20) << __func__
<< " mapping " << dirname
<< "/" << filename
2029 << " to bdev " << (int)file
->fnode
.prefer_bdev
<< dendl
;
2031 log_t
.op_file_update(file
->fnode
);
2033 log_t
.op_dir_link(dirname
, filename
, file
->fnode
.ino
);
2035 *h
= _create_writer(file
);
2037 if (boost::algorithm::ends_with(filename
, ".log")) {
2038 (*h
)->writer_type
= BlueFS::WRITER_WAL
;
2039 if (logger
&& !overwrite
) {
2040 logger
->inc(l_bluefs_files_written_wal
);
2042 } else if (boost::algorithm::ends_with(filename
, ".sst")) {
2043 (*h
)->writer_type
= BlueFS::WRITER_SST
;
2045 logger
->inc(l_bluefs_files_written_sst
);
2049 dout(10) << __func__
<< " h " << *h
<< " on " << file
->fnode
<< dendl
;
2053 BlueFS::FileWriter
*BlueFS::_create_writer(FileRef f
)
2055 FileWriter
*w
= new FileWriter(f
);
2056 for (unsigned i
= 0; i
< MAX_BDEV
; ++i
) {
2058 w
->iocv
[i
] = new IOContext(cct
, NULL
);
2066 void BlueFS::_close_writer(FileWriter
*h
)
2068 dout(10) << __func__
<< " " << h
<< " type " << h
->writer_type
<< dendl
;
2069 for (unsigned i
=0; i
<MAX_BDEV
; ++i
) {
2072 h
->iocv
[i
]->aio_wait();
2073 bdev
[i
]->queue_reap_ioc(h
->iocv
[i
]);
2079 int BlueFS::open_for_read(
2080 const string
& dirname
,
2081 const string
& filename
,
2085 std::lock_guard
<std::mutex
> l(lock
);
2086 dout(10) << __func__
<< " " << dirname
<< "/" << filename
2087 << (random
? " (random)":" (sequential)") << dendl
;
2088 map
<string
,DirRef
>::iterator p
= dir_map
.find(dirname
);
2089 if (p
== dir_map
.end()) {
2090 dout(20) << __func__
<< " dir " << dirname
<< " not found" << dendl
;
2093 DirRef dir
= p
->second
;
2095 map
<string
,FileRef
>::iterator q
= dir
->file_map
.find(filename
);
2096 if (q
== dir
->file_map
.end()) {
2097 dout(20) << __func__
<< " dir " << dirname
<< " (" << dir
2098 << ") file " << filename
2099 << " not found" << dendl
;
2102 File
*file
= q
->second
.get();
2104 *h
= new FileReader(file
, random
? 4096 : cct
->_conf
->bluefs_max_prefetch
,
2106 dout(10) << __func__
<< " h " << *h
<< " on " << file
->fnode
<< dendl
;
2111 const string
& old_dirname
, const string
& old_filename
,
2112 const string
& new_dirname
, const string
& new_filename
)
2114 std::lock_guard
<std::mutex
> l(lock
);
2115 dout(10) << __func__
<< " " << old_dirname
<< "/" << old_filename
2116 << " -> " << new_dirname
<< "/" << new_filename
<< dendl
;
2117 map
<string
,DirRef
>::iterator p
= dir_map
.find(old_dirname
);
2118 if (p
== dir_map
.end()) {
2119 dout(20) << __func__
<< " dir " << old_dirname
<< " not found" << dendl
;
2122 DirRef old_dir
= p
->second
;
2123 map
<string
,FileRef
>::iterator q
= old_dir
->file_map
.find(old_filename
);
2124 if (q
== old_dir
->file_map
.end()) {
2125 dout(20) << __func__
<< " dir " << old_dirname
<< " (" << old_dir
2126 << ") file " << old_filename
2127 << " not found" << dendl
;
2130 FileRef file
= q
->second
;
2132 p
= dir_map
.find(new_dirname
);
2133 if (p
== dir_map
.end()) {
2134 dout(20) << __func__
<< " dir " << new_dirname
<< " not found" << dendl
;
2137 DirRef new_dir
= p
->second
;
2138 q
= new_dir
->file_map
.find(new_filename
);
2139 if (q
!= new_dir
->file_map
.end()) {
2140 dout(20) << __func__
<< " dir " << new_dirname
<< " (" << old_dir
2141 << ") file " << new_filename
2142 << " already exists, unlinking" << dendl
;
2143 assert(q
->second
!= file
);
2144 log_t
.op_dir_unlink(new_dirname
, new_filename
);
2145 _drop_link(q
->second
);
2148 dout(10) << __func__
<< " " << new_dirname
<< "/" << new_filename
<< " "
2149 << " " << file
->fnode
<< dendl
;
2151 new_dir
->file_map
[new_filename
] = file
;
2152 old_dir
->file_map
.erase(old_filename
);
2154 log_t
.op_dir_link(new_dirname
, new_filename
, file
->fnode
.ino
);
2155 log_t
.op_dir_unlink(old_dirname
, old_filename
);
2159 int BlueFS::mkdir(const string
& dirname
)
2161 std::lock_guard
<std::mutex
> l(lock
);
2162 dout(10) << __func__
<< " " << dirname
<< dendl
;
2163 map
<string
,DirRef
>::iterator p
= dir_map
.find(dirname
);
2164 if (p
!= dir_map
.end()) {
2165 dout(20) << __func__
<< " dir " << dirname
<< " exists" << dendl
;
2168 dir_map
[dirname
] = new Dir
;
2169 log_t
.op_dir_create(dirname
);
2173 int BlueFS::rmdir(const string
& dirname
)
2175 std::lock_guard
<std::mutex
> l(lock
);
2176 dout(10) << __func__
<< " " << dirname
<< dendl
;
2177 map
<string
,DirRef
>::iterator p
= dir_map
.find(dirname
);
2178 if (p
== dir_map
.end()) {
2179 dout(20) << __func__
<< " dir " << dirname
<< " does not exist" << dendl
;
2182 DirRef dir
= p
->second
;
2183 if (!dir
->file_map
.empty()) {
2184 dout(20) << __func__
<< " dir " << dirname
<< " not empty" << dendl
;
2187 dir_map
.erase(dirname
);
2188 log_t
.op_dir_remove(dirname
);
2192 bool BlueFS::dir_exists(const string
& dirname
)
2194 std::lock_guard
<std::mutex
> l(lock
);
2195 map
<string
,DirRef
>::iterator p
= dir_map
.find(dirname
);
2196 bool exists
= p
!= dir_map
.end();
2197 dout(10) << __func__
<< " " << dirname
<< " = " << (int)exists
<< dendl
;
2201 int BlueFS::stat(const string
& dirname
, const string
& filename
,
2202 uint64_t *size
, utime_t
*mtime
)
2204 std::lock_guard
<std::mutex
> l(lock
);
2205 dout(10) << __func__
<< " " << dirname
<< "/" << filename
<< dendl
;
2206 map
<string
,DirRef
>::iterator p
= dir_map
.find(dirname
);
2207 if (p
== dir_map
.end()) {
2208 dout(20) << __func__
<< " dir " << dirname
<< " not found" << dendl
;
2211 DirRef dir
= p
->second
;
2212 map
<string
,FileRef
>::iterator q
= dir
->file_map
.find(filename
);
2213 if (q
== dir
->file_map
.end()) {
2214 dout(20) << __func__
<< " dir " << dirname
<< " (" << dir
2215 << ") file " << filename
2216 << " not found" << dendl
;
2219 File
*file
= q
->second
.get();
2220 dout(10) << __func__
<< " " << dirname
<< "/" << filename
2221 << " " << file
->fnode
<< dendl
;
2223 *size
= file
->fnode
.size
;
2225 *mtime
= file
->fnode
.mtime
;
2229 int BlueFS::lock_file(const string
& dirname
, const string
& filename
,
2232 std::lock_guard
<std::mutex
> l(lock
);
2233 dout(10) << __func__
<< " " << dirname
<< "/" << filename
<< dendl
;
2234 map
<string
,DirRef
>::iterator p
= dir_map
.find(dirname
);
2235 if (p
== dir_map
.end()) {
2236 dout(20) << __func__
<< " dir " << dirname
<< " not found" << dendl
;
2239 DirRef dir
= p
->second
;
2240 map
<string
,FileRef
>::iterator q
= dir
->file_map
.find(filename
);
2242 if (q
== dir
->file_map
.end()) {
2243 dout(20) << __func__
<< " dir " << dirname
<< " (" << dir
2244 << ") file " << filename
2245 << " not found, creating" << dendl
;
2247 file
->fnode
.ino
= ++ino_last
;
2248 file
->fnode
.mtime
= ceph_clock_now();
2249 file_map
[ino_last
] = file
;
2250 dir
->file_map
[filename
] = file
;
2252 log_t
.op_file_update(file
->fnode
);
2253 log_t
.op_dir_link(dirname
, filename
, file
->fnode
.ino
);
2255 file
= q
->second
.get();
2257 dout(10) << __func__
<< " already locked" << dendl
;
2261 file
->locked
= true;
2262 *plock
= new FileLock(file
);
2263 dout(10) << __func__
<< " locked " << file
->fnode
2264 << " with " << *plock
<< dendl
;
2268 int BlueFS::unlock_file(FileLock
*fl
)
2270 std::lock_guard
<std::mutex
> l(lock
);
2271 dout(10) << __func__
<< " " << fl
<< " on " << fl
->file
->fnode
<< dendl
;
2272 assert(fl
->file
->locked
);
2273 fl
->file
->locked
= false;
2278 int BlueFS::readdir(const string
& dirname
, vector
<string
> *ls
)
2280 std::lock_guard
<std::mutex
> l(lock
);
2281 dout(10) << __func__
<< " " << dirname
<< dendl
;
2282 if (dirname
.empty()) {
2284 ls
->reserve(dir_map
.size() + 2);
2285 for (auto& q
: dir_map
) {
2286 ls
->push_back(q
.first
);
2289 // list files in dir
2290 map
<string
,DirRef
>::iterator p
= dir_map
.find(dirname
);
2291 if (p
== dir_map
.end()) {
2292 dout(20) << __func__
<< " dir " << dirname
<< " not found" << dendl
;
2295 DirRef dir
= p
->second
;
2296 ls
->reserve(dir
->file_map
.size() + 2);
2297 for (auto& q
: dir
->file_map
) {
2298 ls
->push_back(q
.first
);
2302 ls
->push_back("..");
2306 int BlueFS::unlink(const string
& dirname
, const string
& filename
)
2308 std::lock_guard
<std::mutex
> l(lock
);
2309 dout(10) << __func__
<< " " << dirname
<< "/" << filename
<< dendl
;
2310 map
<string
,DirRef
>::iterator p
= dir_map
.find(dirname
);
2311 if (p
== dir_map
.end()) {
2312 dout(20) << __func__
<< " dir " << dirname
<< " not found" << dendl
;
2315 DirRef dir
= p
->second
;
2316 map
<string
,FileRef
>::iterator q
= dir
->file_map
.find(filename
);
2317 if (q
== dir
->file_map
.end()) {
2318 dout(20) << __func__
<< " file " << dirname
<< "/" << filename
2319 << " not found" << dendl
;
2322 FileRef file
= q
->second
;
2324 dout(20) << __func__
<< " file " << dirname
<< "/" << filename
2325 << " is locked" << dendl
;
2328 dir
->file_map
.erase(filename
);
2329 log_t
.op_dir_unlink(dirname
, filename
);
2334 bool BlueFS::wal_is_rotational()
2336 if (bdev
[BDEV_WAL
]) {
2337 return bdev
[BDEV_WAL
]->is_rotational();
2338 } else if (bdev
[BDEV_DB
]) {
2339 return bdev
[BDEV_DB
]->is_rotational();
2341 return bdev
[BDEV_SLOW
]->is_rotational();