1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 #include "boost/algorithm/string.hpp"
7 #include "common/debug.h"
8 #include "common/errno.h"
9 #include "common/perf_counters.h"
10 #include "BlockDevice.h"
11 #include "Allocator.h"
12 #include "include/ceph_assert.h"
14 #define dout_context cct
15 #define dout_subsys ceph_subsys_bluefs
17 #define dout_prefix *_dout << "bluefs "
19 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::File
, bluefs_file
, bluefs
);
20 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::Dir
, bluefs_dir
, bluefs
);
21 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileWriter
, bluefs_file_writer
, bluefs
);
22 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileReaderBuffer
,
23 bluefs_file_reader_buffer
, bluefs
);
24 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileReader
, bluefs_file_reader
, bluefs
);
25 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileLock
, bluefs_file_lock
, bluefs
);
27 static void wal_discard_cb(void *priv
, void* priv2
) {
28 BlueFS
*bluefs
= static_cast<BlueFS
*>(priv
);
29 interval_set
<uint64_t> *tmp
= static_cast<interval_set
<uint64_t>*>(priv2
);
30 bluefs
->handle_discard(BlueFS::BDEV_WAL
, *tmp
);
33 static void db_discard_cb(void *priv
, void* priv2
) {
34 BlueFS
*bluefs
= static_cast<BlueFS
*>(priv
);
35 interval_set
<uint64_t> *tmp
= static_cast<interval_set
<uint64_t>*>(priv2
);
36 bluefs
->handle_discard(BlueFS::BDEV_DB
, *tmp
);
39 static void slow_discard_cb(void *priv
, void* priv2
) {
40 BlueFS
*bluefs
= static_cast<BlueFS
*>(priv
);
41 interval_set
<uint64_t> *tmp
= static_cast<interval_set
<uint64_t>*>(priv2
);
42 bluefs
->handle_discard(BlueFS::BDEV_SLOW
, *tmp
);
45 BlueFS::BlueFS(CephContext
* cct
)
51 discard_cb
[BDEV_WAL
] = wal_discard_cb
;
52 discard_cb
[BDEV_DB
] = db_discard_cb
;
53 discard_cb
[BDEV_SLOW
] = slow_discard_cb
;
73 void BlueFS::_init_logger()
75 PerfCountersBuilder
b(cct
, "bluefs",
76 l_bluefs_first
, l_bluefs_last
);
77 b
.add_u64_counter(l_bluefs_gift_bytes
, "gift_bytes",
78 "Bytes gifted from BlueStore", NULL
, 0, unit_t(UNIT_BYTES
));
79 b
.add_u64_counter(l_bluefs_reclaim_bytes
, "reclaim_bytes",
80 "Bytes reclaimed by BlueStore", NULL
, 0, unit_t(UNIT_BYTES
));
81 b
.add_u64(l_bluefs_db_total_bytes
, "db_total_bytes",
82 "Total bytes (main db device)",
83 "b", PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
84 b
.add_u64(l_bluefs_db_used_bytes
, "db_used_bytes",
85 "Used bytes (main db device)",
86 "u", PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
87 b
.add_u64(l_bluefs_wal_total_bytes
, "wal_total_bytes",
88 "Total bytes (wal device)",
89 "walb", PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
90 b
.add_u64(l_bluefs_wal_used_bytes
, "wal_used_bytes",
91 "Used bytes (wal device)",
92 "walu", PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
93 b
.add_u64(l_bluefs_slow_total_bytes
, "slow_total_bytes",
94 "Total bytes (slow device)",
95 "slob", PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
96 b
.add_u64(l_bluefs_slow_used_bytes
, "slow_used_bytes",
97 "Used bytes (slow device)",
98 "slou", PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
99 b
.add_u64(l_bluefs_num_files
, "num_files", "File count",
100 "f", PerfCountersBuilder::PRIO_USEFUL
);
101 b
.add_u64(l_bluefs_log_bytes
, "log_bytes", "Size of the metadata log",
102 "jlen", PerfCountersBuilder::PRIO_INTERESTING
, unit_t(UNIT_BYTES
));
103 b
.add_u64_counter(l_bluefs_log_compactions
, "log_compactions",
104 "Compactions of the metadata log");
105 b
.add_u64_counter(l_bluefs_logged_bytes
, "logged_bytes",
106 "Bytes written to the metadata log", "j",
107 PerfCountersBuilder::PRIO_CRITICAL
, unit_t(UNIT_BYTES
));
108 b
.add_u64_counter(l_bluefs_files_written_wal
, "files_written_wal",
109 "Files written to WAL");
110 b
.add_u64_counter(l_bluefs_files_written_sst
, "files_written_sst",
111 "Files written to SSTs");
112 b
.add_u64_counter(l_bluefs_bytes_written_wal
, "bytes_written_wal",
113 "Bytes written to WAL", "wal",
114 PerfCountersBuilder::PRIO_CRITICAL
);
115 b
.add_u64_counter(l_bluefs_bytes_written_sst
, "bytes_written_sst",
116 "Bytes written to SSTs", "sst",
117 PerfCountersBuilder::PRIO_CRITICAL
, unit_t(UNIT_BYTES
));
118 b
.add_u64_counter(l_bluefs_bytes_written_slow
, "bytes_written_slow",
119 "Bytes written to WAL/SSTs at slow device", NULL
,
120 PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
121 b
.add_u64_counter(l_bluefs_max_bytes_wal
, "max_bytes_wal",
122 "Maximum bytes allocated from WAL");
123 b
.add_u64_counter(l_bluefs_max_bytes_db
, "max_bytes_db",
124 "Maximum bytes allocated from DB");
125 b
.add_u64_counter(l_bluefs_max_bytes_slow
, "max_bytes_slow",
126 "Maximum bytes allocated from SLOW");
128 b
.add_u64_counter(l_bluefs_read_random_count
, "read_random_count",
129 "random read requests processed");
130 b
.add_u64_counter(l_bluefs_read_random_bytes
, "read_random_bytes",
131 "Bytes requested in random read mode", NULL
,
132 PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
133 b
.add_u64_counter(l_bluefs_read_random_disk_count
, "read_random_disk_count",
134 "random reads requests going to disk");
135 b
.add_u64_counter(l_bluefs_read_random_disk_bytes
, "read_random_disk_bytes",
136 "Bytes read from disk in random read mode", NULL
,
137 PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
138 b
.add_u64_counter(l_bluefs_read_random_buffer_count
, "read_random_buffer_count",
139 "random read requests processed using prefetch buffer");
140 b
.add_u64_counter(l_bluefs_read_random_buffer_bytes
, "read_random_buffer_bytes",
141 "Bytes read from prefetch buffer in random read mode", NULL
,
142 PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
144 b
.add_u64_counter(l_bluefs_read_count
, "read_count",
145 "buffered read requests processed");
146 b
.add_u64_counter(l_bluefs_read_bytes
, "read_bytes",
147 "Bytes requested in buffered read mode", NULL
,
148 PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
150 b
.add_u64_counter(l_bluefs_read_prefetch_count
, "read_prefetch_count",
151 "prefetch read requests processed");
152 b
.add_u64_counter(l_bluefs_read_prefetch_bytes
, "read_prefetch_bytes",
153 "Bytes requested in prefetch read mode", NULL
,
154 PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
156 logger
= b
.create_perf_counters();
157 cct
->get_perfcounters_collection()->add(logger
);
160 void BlueFS::_shutdown_logger()
162 cct
->get_perfcounters_collection()->remove(logger
);
166 void BlueFS::_update_logger_stats()
168 // we must be holding the lock
169 logger
->set(l_bluefs_num_files
, file_map
.size());
170 logger
->set(l_bluefs_log_bytes
, log_writer
->file
->fnode
.size
);
172 if (alloc
[BDEV_WAL
]) {
173 logger
->set(l_bluefs_wal_total_bytes
, block_all
[BDEV_WAL
].size());
174 logger
->set(l_bluefs_wal_used_bytes
,
175 block_all
[BDEV_WAL
].size() - alloc
[BDEV_WAL
]->get_free());
177 if (alloc
[BDEV_DB
]) {
178 logger
->set(l_bluefs_db_total_bytes
, block_all
[BDEV_DB
].size());
179 logger
->set(l_bluefs_db_used_bytes
,
180 block_all
[BDEV_DB
].size() - alloc
[BDEV_DB
]->get_free());
182 if (alloc
[BDEV_SLOW
]) {
183 logger
->set(l_bluefs_slow_total_bytes
, block_all
[BDEV_SLOW
].size());
184 logger
->set(l_bluefs_slow_used_bytes
,
185 block_all
[BDEV_SLOW
].size() - alloc
[BDEV_SLOW
]->get_free());
189 int BlueFS::add_block_device(unsigned id
, const string
& path
, bool trim
,
190 bool shared_with_bluestore
)
192 dout(10) << __func__
<< " bdev " << id
<< " path " << path
<< dendl
;
193 ceph_assert(id
< bdev
.size());
194 ceph_assert(bdev
[id
] == NULL
);
195 BlockDevice
*b
= BlockDevice::create(cct
, path
, NULL
, NULL
,
196 discard_cb
[id
], static_cast<void*>(this));
197 if (shared_with_bluestore
) {
198 b
->set_no_exclusive_lock();
200 int r
= b
->open(path
);
206 b
->discard(0, b
->get_size());
209 dout(1) << __func__
<< " bdev " << id
<< " path " << path
210 << " size " << byte_u_t(b
->get_size()) << dendl
;
212 ioc
[id
] = new IOContext(cct
, NULL
);
216 bool BlueFS::bdev_support_label(unsigned id
)
218 ceph_assert(id
< bdev
.size());
219 ceph_assert(bdev
[id
]);
220 return bdev
[id
]->supported_bdev_label();
223 uint64_t BlueFS::get_block_device_size(unsigned id
)
225 if (id
< bdev
.size() && bdev
[id
])
226 return bdev
[id
]->get_size();
230 void BlueFS::_add_block_extent(unsigned id
, uint64_t offset
, uint64_t length
)
232 dout(1) << __func__
<< " bdev " << id
233 << " 0x" << std::hex
<< offset
<< "~" << length
<< std::dec
236 ceph_assert(id
< bdev
.size());
237 ceph_assert(bdev
[id
]);
238 ceph_assert(bdev
[id
]->get_size() >= offset
+ length
);
239 block_all
[id
].insert(offset
, length
);
241 if (id
< alloc
.size() && alloc
[id
]) {
242 log_t
.op_alloc_add(id
, offset
, length
);
243 alloc
[id
]->init_add_free(offset
, length
);
247 logger
->inc(l_bluefs_gift_bytes
, length
);
248 dout(10) << __func__
<< " done" << dendl
;
251 int BlueFS::reclaim_blocks(unsigned id
, uint64_t want
,
252 PExtentVector
*extents
)
254 std::unique_lock
l(lock
);
255 dout(1) << __func__
<< " bdev " << id
256 << " want 0x" << std::hex
<< want
<< std::dec
<< dendl
;
257 ceph_assert(id
< alloc
.size());
258 ceph_assert(alloc
[id
]);
260 int64_t got
= alloc
[id
]->allocate(want
, cct
->_conf
->bluefs_alloc_size
, 0,
262 ceph_assert(got
!= 0);
264 derr
<< __func__
<< " failed to allocate space to return to bluestore"
270 for (auto& p
: *extents
) {
271 block_all
[id
].erase(p
.offset
, p
.length
);
272 log_t
.op_alloc_rm(id
, p
.offset
, p
.length
);
276 int r
= _flush_and_sync_log(l
);
279 logger
->inc(l_bluefs_reclaim_bytes
, got
);
280 dout(1) << __func__
<< " bdev " << id
<< " want 0x" << std::hex
<< want
281 << " got " << *extents
<< dendl
;
285 void BlueFS::handle_discard(unsigned id
, interval_set
<uint64_t>& to_release
)
287 dout(10) << __func__
<< " bdev " << id
<< dendl
;
288 ceph_assert(alloc
[id
]);
289 alloc
[id
]->release(to_release
);
292 uint64_t BlueFS::get_used()
294 std::lock_guard
l(lock
);
296 for (unsigned id
= 0; id
< MAX_BDEV
; ++id
) {
298 used
+= block_all
[id
].size() - alloc
[id
]->get_free();
304 uint64_t BlueFS::get_total(unsigned id
)
306 std::lock_guard
l(lock
);
307 ceph_assert(id
< block_all
.size());
308 return block_all
[id
].size();
311 uint64_t BlueFS::get_free(unsigned id
)
313 std::lock_guard
l(lock
);
314 ceph_assert(id
< alloc
.size());
315 return alloc
[id
]->get_free();
318 void BlueFS::dump_perf_counters(Formatter
*f
)
320 f
->open_object_section("bluefs_perf_counters");
321 logger
->dump_formatted(f
,0);
325 void BlueFS::dump_block_extents(ostream
& out
)
327 for (unsigned i
= 0; i
< MAX_BDEV
; ++i
) {
331 auto owned
= get_total(i
);
332 auto free
= get_free(i
);
333 out
<< i
<< " : device size 0x" << std::hex
<< bdev
[i
]->get_size()
334 << " : own 0x" << block_all
[i
]
336 << " : using 0x" << owned
- free
337 << std::dec
<< "(" << byte_u_t(owned
- free
) << ")"
342 void BlueFS::get_usage(vector
<pair
<uint64_t,uint64_t>> *usage
)
344 std::lock_guard
l(lock
);
345 usage
->resize(bdev
.size());
346 for (unsigned id
= 0; id
< bdev
.size(); ++id
) {
348 (*usage
)[id
] = make_pair(0, 0);
351 (*usage
)[id
].first
= alloc
[id
]->get_free();
352 (*usage
)[id
].second
= block_all
[id
].size();
354 (block_all
[id
].size() - (*usage
)[id
].first
) * 100 / block_all
[id
].size();
355 dout(10) << __func__
<< " bdev " << id
356 << " free " << (*usage
)[id
].first
357 << " (" << byte_u_t((*usage
)[id
].first
) << ")"
358 << " / " << (*usage
)[id
].second
359 << " (" << byte_u_t((*usage
)[id
].second
) << ")"
360 << ", used " << used
<< "%"
365 int BlueFS::get_block_extents(unsigned id
, interval_set
<uint64_t> *extents
)
367 std::lock_guard
l(lock
);
368 dout(10) << __func__
<< " bdev " << id
<< dendl
;
369 if (id
>= block_all
.size())
371 *extents
= block_all
[id
];
375 int BlueFS::mkfs(uuid_d osd_uuid
)
377 std::unique_lock
l(lock
);
379 << " osd_uuid " << osd_uuid
386 super
.block_size
= bdev
[BDEV_DB
]->get_block_size();
387 super
.osd_uuid
= osd_uuid
;
388 super
.uuid
.generate_random();
389 dout(1) << __func__
<< " uuid " << super
.uuid
<< dendl
;
392 FileRef log_file
= new File
;
393 log_file
->fnode
.ino
= 1;
394 log_file
->fnode
.prefer_bdev
= BDEV_WAL
;
396 log_file
->fnode
.prefer_bdev
,
397 cct
->_conf
->bluefs_max_log_runway
,
400 log_writer
= _create_writer(log_file
);
404 for (unsigned bdev
= 0; bdev
< MAX_BDEV
; ++bdev
) {
405 interval_set
<uint64_t>& p
= block_all
[bdev
];
408 for (interval_set
<uint64_t>::iterator q
= p
.begin(); q
!= p
.end(); ++q
) {
409 dout(20) << __func__
<< " op_alloc_add " << bdev
<< " 0x"
410 << std::hex
<< q
.get_start() << "~" << q
.get_len() << std::dec
412 log_t
.op_alloc_add(bdev
, q
.get_start(), q
.get_len());
415 _flush_and_sync_log(l
);
418 super
.log_fnode
= log_file
->fnode
;
419 _write_super(BDEV_DB
);
423 super
= bluefs_super_t();
424 _close_writer(log_writer
);
430 dout(10) << __func__
<< " success" << dendl
;
434 void BlueFS::_init_alloc()
436 dout(20) << __func__
<< dendl
;
437 alloc
.resize(MAX_BDEV
);
438 pending_release
.resize(MAX_BDEV
);
439 for (unsigned id
= 0; id
< bdev
.size(); ++id
) {
443 ceph_assert(bdev
[id
]->get_size());
444 alloc
[id
] = Allocator::create(cct
, cct
->_conf
->bluefs_allocator
,
445 bdev
[id
]->get_size(),
446 cct
->_conf
->bluefs_alloc_size
);
447 interval_set
<uint64_t>& p
= block_all
[id
];
448 for (interval_set
<uint64_t>::iterator q
= p
.begin(); q
!= p
.end(); ++q
) {
449 alloc
[id
]->init_add_free(q
.get_start(), q
.get_len());
454 void BlueFS::_stop_alloc()
456 dout(20) << __func__
<< dendl
;
457 for (auto p
: bdev
) {
462 for (auto p
: alloc
) {
473 dout(1) << __func__
<< dendl
;
475 int r
= _open_super();
477 derr
<< __func__
<< " failed to open super: " << cpp_strerror(r
) << dendl
;
482 block_all
.resize(MAX_BDEV
);
486 r
= _replay(false, false);
488 derr
<< __func__
<< " failed to replay log: " << cpp_strerror(r
) << dendl
;
494 for (auto& p
: file_map
) {
495 dout(30) << __func__
<< " noting alloc for " << p
.second
->fnode
<< dendl
;
496 for (auto& q
: p
.second
->fnode
.extents
) {
497 alloc
[q
.bdev
]->init_rm_free(q
.offset
, q
.length
);
501 // set up the log for future writes
502 log_writer
= _create_writer(_get_file(1));
503 ceph_assert(log_writer
->file
->fnode
.ino
== 1);
504 log_writer
->pos
= log_writer
->file
->fnode
.size
;
505 dout(10) << __func__
<< " log write pos set to 0x"
506 << std::hex
<< log_writer
->pos
<< std::dec
512 super
= bluefs_super_t();
516 void BlueFS::umount()
518 dout(1) << __func__
<< dendl
;
522 _close_writer(log_writer
);
528 super
= bluefs_super_t();
533 int BlueFS::prepare_new_device(int id
)
535 dout(1) << __func__
<< dendl
;
537 if(id
== BDEV_NEWDB
) {
538 int new_log_dev_cur
= BDEV_WAL
;
539 int new_log_dev_next
= BDEV_WAL
;
540 if (!bdev
[BDEV_WAL
]) {
541 new_log_dev_cur
= BDEV_NEWDB
;
542 new_log_dev_next
= BDEV_DB
;
544 _rewrite_log_sync(false,
550 } else if(id
== BDEV_NEWWAL
) {
551 _rewrite_log_sync(false, BDEV_DB
, BDEV_NEWWAL
, BDEV_WAL
, REMOVE_WAL
);
558 void BlueFS::collect_metadata(map
<string
,string
> *pm
, unsigned skip_bdev_id
)
560 if (skip_bdev_id
!= BDEV_DB
&& bdev
[BDEV_DB
])
561 bdev
[BDEV_DB
]->collect_metadata("bluefs_db_", pm
);
563 bdev
[BDEV_WAL
]->collect_metadata("bluefs_wal_", pm
);
566 void BlueFS::get_devices(set
<string
> *ls
)
568 for (unsigned i
= 0; i
< MAX_BDEV
; ++i
) {
570 bdev
[i
]->get_devices(ls
);
577 std::lock_guard
l(lock
);
578 dout(1) << __func__
<< dendl
;
579 // hrm, i think we check everything on mount...
583 int BlueFS::_write_super(int dev
)
588 uint32_t crc
= bl
.crc32c(-1);
590 dout(10) << __func__
<< " super block length(encoded): " << bl
.length() << dendl
;
591 dout(10) << __func__
<< " superblock " << super
.version
<< dendl
;
592 dout(10) << __func__
<< " log_fnode " << super
.log_fnode
<< dendl
;
593 ceph_assert(bl
.length() <= get_super_length());
594 bl
.append_zero(get_super_length() - bl
.length());
596 bdev
[dev
]->write(get_super_offset(), bl
, false, WRITE_LIFE_SHORT
);
597 dout(20) << __func__
<< " v " << super
.version
598 << " crc 0x" << std::hex
<< crc
599 << " offset 0x" << get_super_offset() << std::dec
604 int BlueFS::_open_super()
606 dout(10) << __func__
<< dendl
;
609 uint32_t expected_crc
, crc
;
612 // always the second block
613 r
= bdev
[BDEV_DB
]->read(get_super_offset(), get_super_length(),
614 &bl
, ioc
[BDEV_DB
], false);
618 auto p
= bl
.cbegin();
622 t
.substr_of(bl
, 0, p
.get_off());
625 decode(expected_crc
, p
);
626 if (crc
!= expected_crc
) {
627 derr
<< __func__
<< " bad crc on superblock, expected 0x"
628 << std::hex
<< expected_crc
<< " != actual 0x" << crc
<< std::dec
632 dout(10) << __func__
<< " superblock " << super
.version
<< dendl
;
633 dout(10) << __func__
<< " log_fnode " << super
.log_fnode
<< dendl
;
637 int BlueFS::_replay(bool noop
, bool to_stdout
)
639 dout(10) << __func__
<< (noop
? " NO-OP" : "") << dendl
;
640 ino_last
= 1; // by the log
644 log_file
= _get_file(1);
646 log_file
->fnode
= super
.log_fnode
;
648 // do not use fnode from superblock in 'noop' mode - log_file's one should
649 // be fine and up-to-date
650 ceph_assert(log_file
->fnode
.ino
== 1);
651 ceph_assert(log_file
->fnode
.extents
.size() != 0);
653 dout(10) << __func__
<< " log_fnode " << super
.log_fnode
<< dendl
;
654 if (unlikely(to_stdout
)) {
655 std::cout
<< " log_fnode " << super
.log_fnode
<< std::endl
;
658 FileReader
*log_reader
= new FileReader(
659 log_file
, cct
->_conf
->bluefs_max_prefetch
,
663 ceph_assert((log_reader
->buf
.pos
& ~super
.block_mask()) == 0);
664 uint64_t pos
= log_reader
->buf
.pos
;
665 uint64_t read_pos
= pos
;
668 int r
= _read(log_reader
, &log_reader
->buf
, read_pos
, super
.block_size
,
670 ceph_assert(r
== (int)super
.block_size
);
677 auto p
= bl
.cbegin();
685 if (len
+ 6 > bl
.length()) {
686 more
= round_up_to(len
+ 6 - bl
.length(), super
.block_size
);
689 if (uuid
!= super
.uuid
) {
690 dout(10) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
691 << ": stop: uuid " << uuid
<< " != super.uuid " << super
.uuid
695 if (seq
!= log_seq
+ 1) {
696 dout(10) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
697 << ": stop: seq " << seq
<< " != expected " << log_seq
+ 1
702 dout(20) << __func__
<< " need 0x" << std::hex
<< more
<< std::dec
703 << " more bytes" << dendl
;
705 int r
= _read(log_reader
, &log_reader
->buf
, read_pos
, more
, &t
, NULL
);
707 dout(10) << __func__
<< " 0x" << std::hex
<< pos
708 << ": stop: len is 0x" << bl
.length() + more
<< std::dec
709 << ", which is past eof" << dendl
;
712 ceph_assert(r
== (int)more
);
716 bluefs_transaction_t t
;
718 auto p
= bl
.cbegin();
721 catch (buffer::error
& e
) {
722 dout(10) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
723 << ": stop: failed to decode: " << e
.what()
728 ceph_assert(seq
== t
.seq
);
729 dout(10) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
730 << ": " << t
<< dendl
;
731 if (unlikely(to_stdout
)) {
732 std::cout
<< " 0x" << std::hex
<< pos
<< std::dec
733 << ": " << t
<< std::endl
;
736 auto p
= t
.op_bl
.cbegin();
742 case bluefs_transaction_t::OP_INIT
:
743 dout(20) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
744 << ": op_init" << dendl
;
745 if (unlikely(to_stdout
)) {
746 std::cout
<< " 0x" << std::hex
<< pos
<< std::dec
747 << ": op_init" << std::endl
;
750 ceph_assert(t
.seq
== 1);
753 case bluefs_transaction_t::OP_JUMP
:
759 dout(20) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
760 << ": op_jump seq " << next_seq
761 << " offset 0x" << std::hex
<< offset
<< std::dec
<< dendl
;
762 if (unlikely(to_stdout
)) {
763 std::cout
<< " 0x" << std::hex
<< pos
<< std::dec
764 << ": op_jump seq " << next_seq
765 << " offset 0x" << std::hex
<< offset
<< std::dec
769 ceph_assert(next_seq
>= log_seq
);
770 log_seq
= next_seq
- 1; // we will increment it below
771 uint64_t skip
= offset
- read_pos
;
774 int r
= _read(log_reader
, &log_reader
->buf
, read_pos
, skip
, &junk
,
776 if (r
!= (int)skip
) {
777 dout(10) << __func__
<< " 0x" << std::hex
<< read_pos
778 << ": stop: failed to skip to " << offset
779 << std::dec
<< dendl
;
780 ceph_abort_msg("problem with op_jump");
786 case bluefs_transaction_t::OP_JUMP_SEQ
:
790 dout(20) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
791 << ": op_jump_seq " << next_seq
<< dendl
;
792 if (unlikely(to_stdout
)) {
793 std::cout
<< " 0x" << std::hex
<< pos
<< std::dec
794 << ": op_jump_seq " << next_seq
<< std::endl
;
797 ceph_assert(next_seq
>= log_seq
);
798 log_seq
= next_seq
- 1; // we will increment it below
802 case bluefs_transaction_t::OP_ALLOC_ADD
:
805 uint64_t offset
, length
;
809 dout(20) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
810 << ": op_alloc_add " << " " << (int)id
811 << ":0x" << std::hex
<< offset
<< "~" << length
<< std::dec
813 if (unlikely(to_stdout
)) {
814 std::cout
<< " 0x" << std::hex
<< pos
<< std::dec
815 << ": op_alloc_add " << " " << (int)id
816 << ":0x" << std::hex
<< offset
<< "~" << length
<< std::dec
821 block_all
[id
].insert(offset
, length
);
822 alloc
[id
]->init_add_free(offset
, length
);
827 case bluefs_transaction_t::OP_ALLOC_RM
:
830 uint64_t offset
, length
;
834 dout(20) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
835 << ": op_alloc_rm " << " " << (int)id
836 << ":0x" << std::hex
<< offset
<< "~" << length
<< std::dec
838 if (unlikely(to_stdout
)) {
839 std::cout
<< " 0x" << std::hex
<< pos
<< std::dec
840 << ": op_alloc_rm " << " " << (int)id
841 << ":0x" << std::hex
<< offset
<< "~" << length
<< std::dec
846 block_all
[id
].erase(offset
, length
);
847 alloc
[id
]->init_rm_free(offset
, length
);
852 case bluefs_transaction_t::OP_DIR_LINK
:
854 string dirname
, filename
;
859 dout(20) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
860 << ": op_dir_link " << " " << dirname
<< "/" << filename
863 if (unlikely(to_stdout
)) {
864 std::cout
<< " 0x" << std::hex
<< pos
<< std::dec
865 << ": op_dir_link " << " " << dirname
<< "/" << filename
871 FileRef file
= _get_file(ino
);
872 ceph_assert(file
->fnode
.ino
);
873 map
<string
,DirRef
>::iterator q
= dir_map
.find(dirname
);
874 ceph_assert(q
!= dir_map
.end());
875 map
<string
,FileRef
>::iterator r
= q
->second
->file_map
.find(filename
);
876 ceph_assert(r
== q
->second
->file_map
.end());
877 q
->second
->file_map
[filename
] = file
;
883 case bluefs_transaction_t::OP_DIR_UNLINK
:
885 string dirname
, filename
;
888 dout(20) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
889 << ": op_dir_unlink " << " " << dirname
<< "/" << filename
891 if (unlikely(to_stdout
)) {
892 std::cout
<< " 0x" << std::hex
<< pos
<< std::dec
893 << ": op_dir_unlink " << " " << dirname
<< "/" << filename
898 map
<string
,DirRef
>::iterator q
= dir_map
.find(dirname
);
899 ceph_assert(q
!= dir_map
.end());
900 map
<string
,FileRef
>::iterator r
= q
->second
->file_map
.find(filename
);
901 ceph_assert(r
!= q
->second
->file_map
.end());
902 ceph_assert(r
->second
->refs
> 0);
904 q
->second
->file_map
.erase(r
);
909 case bluefs_transaction_t::OP_DIR_CREATE
:
913 dout(20) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
914 << ": op_dir_create " << dirname
<< dendl
;
915 if (unlikely(to_stdout
)) {
916 std::cout
<< " 0x" << std::hex
<< pos
<< std::dec
917 << ": op_dir_create " << dirname
<< std::endl
;
921 map
<string
,DirRef
>::iterator q
= dir_map
.find(dirname
);
922 ceph_assert(q
== dir_map
.end());
923 dir_map
[dirname
] = new Dir
;
928 case bluefs_transaction_t::OP_DIR_REMOVE
:
932 dout(20) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
933 << ": op_dir_remove " << dirname
<< dendl
;
934 if (unlikely(to_stdout
)) {
935 std::cout
<< " 0x" << std::hex
<< pos
<< std::dec
936 << ": op_dir_remove " << dirname
<< std::endl
;
940 map
<string
,DirRef
>::iterator q
= dir_map
.find(dirname
);
941 ceph_assert(q
!= dir_map
.end());
942 ceph_assert(q
->second
->file_map
.empty());
948 case bluefs_transaction_t::OP_FILE_UPDATE
:
950 bluefs_fnode_t fnode
;
952 dout(20) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
953 << ": op_file_update " << " " << fnode
<< dendl
;
954 if (unlikely(to_stdout
)) {
955 std::cout
<< " 0x" << std::hex
<< pos
<< std::dec
956 << ": op_file_update " << " " << fnode
<< std::endl
;
960 FileRef f
= _get_file(fnode
.ino
);
962 if (fnode
.ino
> ino_last
) {
963 ino_last
= fnode
.ino
;
969 case bluefs_transaction_t::OP_FILE_REMOVE
:
973 dout(20) << __func__
<< " 0x" << std::hex
<< pos
<< std::dec
974 << ": op_file_remove " << ino
<< dendl
;
975 if (unlikely(to_stdout
)) {
976 std::cout
<< " 0x" << std::hex
<< pos
<< std::dec
977 << ": op_file_remove " << ino
<< std::endl
;
981 auto p
= file_map
.find(ino
);
982 ceph_assert(p
!= file_map
.end());
989 derr
<< __func__
<< " 0x" << std::hex
<< pos
<< std::dec
990 << ": stop: unrecognized op " << (int)op
<< dendl
;
995 ceph_assert(p
.end());
997 // we successfully replayed the transaction; bump the seq and log size
999 log_file
->fnode
.size
= log_reader
->buf
.pos
;
1002 dout(10) << __func__
<< " log file size was 0x"
1003 << std::hex
<< log_file
->fnode
.size
<< std::dec
<< dendl
;
1004 if (unlikely(to_stdout
)) {
1005 std::cout
<< " log file size was 0x"
1006 << std::hex
<< log_file
->fnode
.size
<< std::dec
<< std::endl
;
1012 // verify file link counts are all >0
1013 for (auto& p
: file_map
) {
1014 if (p
.second
->refs
== 0 &&
1015 p
.second
->fnode
.ino
> 1) {
1016 derr
<< __func__
<< " file with link count 0: " << p
.second
->fnode
1023 dout(10) << __func__
<< " done" << dendl
;
1027 int BlueFS::log_dump()
1029 // only dump log file's content
1030 int r
= _replay(true, true);
1032 derr
<< __func__
<< " failed to replay log: " << cpp_strerror(r
) << dendl
;
1039 int BlueFS::device_migrate_to_existing(
1041 const set
<int>& devs_source
,
1045 bool buffered
= cct
->_conf
->bluefs_buffered_io
;
1047 assert(dev_target
< (int)MAX_BDEV
);
1050 flags
|= devs_source
.count(BDEV_DB
) ?
1051 (REMOVE_DB
| RENAME_SLOW2DB
) : 0;
1052 flags
|= devs_source
.count(BDEV_WAL
) ? REMOVE_WAL
: 0;
1053 int dev_target_new
= dev_target
;
1055 // Slow device without separate DB one is addressed via BDEV_DB
1056 // Hence need renaming.
1057 if ((flags
& REMOVE_DB
) && dev_target
== BDEV_SLOW
) {
1058 dev_target_new
= BDEV_DB
;
1059 dout(0) << __func__
<< " super to be written to " << dev_target
<< dendl
;
1062 for (auto& p
: file_map
) {
1064 if (p
.second
->fnode
.ino
== 1) {
1067 auto& fnode_extents
= p
.second
->fnode
.extents
;
1069 for (auto ext_it
= fnode_extents
.begin();
1070 ext_it
!= p
.second
->fnode
.extents
.end();
1072 if (ext_it
->bdev
!= dev_target
&& devs_source
.count(ext_it
->bdev
)) {
1073 bluefs_extent_t old_ext
= *ext_it
;
1074 PExtentVector extents
;
1076 _allocate_without_fallback(dev_target
, old_ext
.length
, &extents
);
1078 buf
.resize(old_ext
.length
);
1079 int r
= bdev
[old_ext
.bdev
]->read_random(
1085 derr
<< __func__
<< " failed to read 0x" << std::hex
1086 << old_ext
.offset
<< "~" <<old_ext
.length
<< std::dec
1087 << " from " << (int)dev_target
<< dendl
;
1091 assert(extents
.size() > 0);
1092 uint64_t src_buf_pos
= 0;
1094 // overwrite existing extent
1096 bluefs_extent_t(dev_target_new
, extents
[0].offset
, extents
[0].length
);
1098 bl
.append((char*)&buf
.at(src_buf_pos
), extents
[0].length
);
1099 int r
= bdev
[dev_target
]->write(extents
[0].offset
, bl
, buffered
);
1100 ceph_assert(r
== 0);
1101 src_buf_pos
+= extents
[0].length
;
1103 // then insert more extents if needed
1104 for( size_t i
= 1; i
< extents
.size(); ++i
) {
1106 bl
.append((char*)&buf
.at(src_buf_pos
), extents
[i
].length
);
1108 ext_it
= fnode_extents
.emplace(ext_it
, dev_target_new
,
1109 extents
[i
].offset
, extents
[i
].length
);
1110 int r
= bdev
[dev_target
]->write(extents
[i
].offset
, bl
, buffered
);
1111 ceph_assert(r
== 0);
1112 src_buf_pos
+= extents
[i
].length
;
1115 PExtentVector to_release
;
1116 to_release
.emplace_back(old_ext
.offset
, old_ext
.length
);
1117 alloc
[old_ext
.bdev
]->release(to_release
);
1121 derr
<< __func__
<< " unable to allocate len 0x" << std::hex
1122 << old_ext
.length
<< std::dec
<< " from " << (int)dev_target
1126 } else if (dev_target
!= dev_target_new
&& ext_it
->bdev
== dev_target
) {
1127 ext_it
->bdev
= dev_target_new
;
1130 auto& prefer_bdev
= p
.second
->fnode
.prefer_bdev
;
1131 if (prefer_bdev
!= dev_target
&& devs_source
.count(prefer_bdev
)) {
1132 prefer_bdev
= dev_target_new
;
1135 // new logging device in the current naming scheme
1136 int new_log_dev_cur
= bdev
[BDEV_WAL
] ?
1138 bdev
[BDEV_DB
] ? BDEV_DB
: BDEV_SLOW
;
1140 // new logging device in new naming scheme
1141 int new_log_dev_next
= new_log_dev_cur
;
1143 if (devs_source
.count(new_log_dev_cur
)) {
1144 // SLOW device is addressed via BDEV_DB too hence either WAL or DB
1145 new_log_dev_next
= (flags
& REMOVE_WAL
) || !bdev
[BDEV_WAL
] ?
1149 dout(0) << __func__
<< " log moved from " << new_log_dev_cur
1150 << " to " << new_log_dev_next
<< dendl
;
1153 (flags
& REMOVE_DB
) && new_log_dev_next
== BDEV_DB
?
1160 (flags
& REMOVE_DB
) ? BDEV_SLOW
: BDEV_DB
,
1167 int BlueFS::device_migrate_to_new(
1169 const set
<int>& devs_source
,
1173 bool buffered
= cct
->_conf
->bluefs_buffered_io
;
1175 assert(dev_target
== (int)BDEV_NEWDB
|| (int)BDEV_NEWWAL
);
1179 flags
|= devs_source
.count(BDEV_DB
) ?
1180 (!bdev
[BDEV_SLOW
] ? RENAME_DB2SLOW
: REMOVE_DB
) :
1182 flags
|= devs_source
.count(BDEV_WAL
) ? REMOVE_WAL
: 0;
1183 int dev_target_new
= dev_target
;
1185 for (auto& p
: file_map
) {
1187 if (p
.second
->fnode
.ino
== 1) {
1190 auto& fnode_extents
= p
.second
->fnode
.extents
;
1192 for (auto ext_it
= fnode_extents
.begin();
1193 ext_it
!= p
.second
->fnode
.extents
.end();
1195 if (ext_it
->bdev
!= dev_target
&& devs_source
.count(ext_it
->bdev
)) {
1196 bluefs_extent_t old_ext
= *ext_it
;
1197 PExtentVector extents
;
1199 _allocate_without_fallback(dev_target
, old_ext
.length
, &extents
);
1201 buf
.resize(old_ext
.length
);
1202 int r
= bdev
[old_ext
.bdev
]->read_random(
1207 dout(10)<<__func__
<<" read = "<<r
<<dendl
;
1209 derr
<< __func__
<< " failed to read 0x" << std::hex
1210 << old_ext
.offset
<< "~" <<old_ext
.length
<< std::dec
1211 << " from " << (int)dev_target
<< dendl
;
1215 assert(extents
.size() > 0);
1216 uint64_t src_buf_pos
= 0;
1218 // overwrite existing extent
1220 bluefs_extent_t(dev_target_new
, extents
[0].offset
, extents
[0].length
);
1222 bl
.append((char*)&buf
.at(src_buf_pos
), extents
[0].length
);
1223 int r
= bdev
[dev_target
]->write(extents
[0].offset
, bl
, buffered
);
1224 ceph_assert(r
== 0);
1225 src_buf_pos
+= extents
[0].length
;
1227 // then insert more extents if needed
1228 for( size_t i
= 1; i
< extents
.size(); ++i
) {
1230 bl
.append((char*)&buf
.at(src_buf_pos
), extents
[i
].length
);
1232 ext_it
= fnode_extents
.emplace(ext_it
, dev_target_new
,
1233 extents
[i
].offset
, extents
[i
].length
);
1234 int r
= bdev
[dev_target
]->write(extents
[i
].offset
, bl
, buffered
);
1235 ceph_assert(r
== 0);
1236 src_buf_pos
+= extents
[i
].length
;
1239 PExtentVector to_release
;
1240 to_release
.emplace_back(old_ext
.offset
, old_ext
.length
);
1241 alloc
[old_ext
.bdev
]->release(to_release
);
1244 derr
<< __func__
<< " unable to allocate len 0x" << std::hex
1245 << old_ext
.length
<< std::dec
<< " from " << (int)dev_target
1249 } else if (dev_target
!= dev_target_new
&& ext_it
->bdev
== dev_target
) {
1250 ext_it
->bdev
= dev_target_new
;
1253 auto& prefer_bdev
= p
.second
->fnode
.prefer_bdev
;
1254 if (prefer_bdev
!= dev_target
&& devs_source
.count(prefer_bdev
)) {
1255 prefer_bdev
= dev_target_new
;
1258 // new logging device in the current naming scheme
1259 int new_log_dev_cur
=
1262 bdev
[BDEV_WAL
] && !(flags
& REMOVE_WAL
) ?
1266 bdev
[BDEV_DB
] && !(flags
& REMOVE_DB
)?
1270 // new logging device in new naming scheme
1271 int new_log_dev_next
=
1272 new_log_dev_cur
== BDEV_NEWWAL
?
1274 new_log_dev_cur
== BDEV_NEWDB
?
1279 dev_target
== BDEV_NEWDB
?
1294 BlueFS::FileRef
BlueFS::_get_file(uint64_t ino
)
1296 auto p
= file_map
.find(ino
);
1297 if (p
== file_map
.end()) {
1298 FileRef f
= new File
;
1300 dout(30) << __func__
<< " ino " << ino
<< " = " << f
1301 << " (new)" << dendl
;
1304 dout(30) << __func__
<< " ino " << ino
<< " = " << p
->second
<< dendl
;
1309 void BlueFS::_drop_link(FileRef file
)
1311 dout(20) << __func__
<< " had refs " << file
->refs
1312 << " on " << file
->fnode
<< dendl
;
1313 ceph_assert(file
->refs
> 0);
1315 if (file
->refs
== 0) {
1316 dout(20) << __func__
<< " destroying " << file
->fnode
<< dendl
;
1317 ceph_assert(file
->num_reading
.load() == 0);
1318 log_t
.op_file_remove(file
->fnode
.ino
);
1319 for (auto& r
: file
->fnode
.extents
) {
1320 pending_release
[r
.bdev
].insert(r
.offset
, r
.length
);
1322 file_map
.erase(file
->fnode
.ino
);
1323 file
->deleted
= true;
1325 if (file
->dirty_seq
) {
1326 ceph_assert(file
->dirty_seq
> log_seq_stable
);
1327 ceph_assert(dirty_files
.count(file
->dirty_seq
));
1328 auto it
= dirty_files
[file
->dirty_seq
].iterator_to(*file
);
1329 dirty_files
[file
->dirty_seq
].erase(it
);
1330 file
->dirty_seq
= 0;
// Random-access read of up to `len` bytes at `off` from the file behind `h`.
//
// As far as the visible code shows:
//  - unless h->ignore_eof is set, `len` is clipped so the read does not run
//    past fnode.size;
//  - the file's num_reading counter is incremented on entry and decremented
//    before the function finishes;
//  - a shared lock on h->lock is taken; offsets outside the range cached in
//    h->buf are read from the block device with read_random() (success is
//    asserted), offsets inside it are memcpy'd out of the cached bufferlist;
//  - the l_bluefs_read_random_* perf counters are updated.
// NOTE(review): the return statement is not part of this excerpt; the final
// log line reports `ret` bytes, presumably the return value -- confirm.
1335 int BlueFS::_read_random(
1336 FileReader
*h
, ///< [in] read from here
1337 uint64_t off
, ///< [in] offset
1338 size_t len
, ///< [in] this many bytes
1339 char *out
) ///< [out] optional: or copy it here
1341 auto* buf
= &h
->buf
;
1344 dout(10) << __func__
<< " h " << h
1345 << " 0x" << std::hex
<< off
<< "~" << len
<< std::dec
1346 << " from " << h
->file
->fnode
<< dendl
;
1348 ++h
->file
->num_reading
;
1350 if (!h
->ignore_eof
&&
1351 off
+ len
> h
->file
->fnode
.size
) {
1352 if (off
> h
->file
->fnode
.size
)
1355 len
= h
->file
->fnode
.size
- off
;
1356 dout(20) << __func__
<< " reaching (or past) eof, len clipped to 0x"
1357 << std::hex
<< len
<< std::dec
<< dendl
;
1359 logger
->inc(l_bluefs_read_random_count
, 1);
1360 logger
->inc(l_bluefs_read_random_bytes
, len
);
1362 std::shared_lock
s_lock(h
->lock
);
1364 if (off
< buf
->bl_off
|| off
>= buf
->get_buf_end()) {
1367 auto p
= h
->file
->fnode
.seek(off
, &x_off
);
1368 uint64_t l
= std::min(p
->length
- x_off
, static_cast<uint64_t>(len
));
1369 dout(20) << __func__
<< " read random 0x"
1370 << std::hex
<< x_off
<< "~" << l
<< std::dec
1371 << " of " << *p
<< dendl
;
1372 int r
= bdev
[p
->bdev
]->read_random(p
->offset
+ x_off
, l
, out
,
1373 cct
->_conf
->bluefs_buffered_io
);
1374 ceph_assert(r
== 0);
1380 logger
->inc(l_bluefs_read_random_disk_count
, 1);
1381 logger
->inc(l_bluefs_read_random_disk_bytes
, l
);
1386 auto left
= buf
->get_buf_remaining(off
);
1387 int r
= std::min(len
, left
);
1388 logger
->inc(l_bluefs_read_random_buffer_count
, 1);
1389 logger
->inc(l_bluefs_read_random_buffer_bytes
, r
);
1390 dout(20) << __func__
<< " left 0x" << std::hex
<< left
1391 << " 0x" << off
<< "~" << len
<< std::dec
1395 // NOTE: h->bl is normally a contiguous buffer so c_str() is free.
1396 memcpy(out
, buf
->bl
.c_str() + off
- buf
->bl_off
, r
);
1400 dout(30) << __func__
<< " result chunk (0x"
1401 << std::hex
<< r
<< std::dec
<< " bytes):\n";
1403 t
.substr_of(buf
->bl
, off
- buf
->bl_off
, r
);
1413 dout(20) << __func__
<< " got " << ret
<< dendl
;
1414 --h
->file
->num_reading
;
// Buffered read of up to `len` bytes at `off` through the reader state `buf`.
//
// As far as the visible code shows:
//  - the call counts as a prefetch when both `outbl` and `out` are null;
//  - unless h->ignore_eof is set, `len` is clipped at fnode.size;
//  - when `off` is outside the buffered range, the buffer is refilled under a
//    unique lock on h->lock (re-checking the condition after the lock
//    upgrade) with a block-aligned read of at least buf->max_prefetch bytes,
//    limited to the current extent and, unless ignore_eof, to the file size
//    rounded up to the block size;
//  - the result is appended to `outbl` and/or copied to `out`;
//  - on exit, outbl->length() is asserted to equal `ret`.
// NOTE(review): the opening signature line and the return statement are not
// part of this excerpt -- confirm against the full file.
1419 FileReader
*h
, ///< [in] read from here
1420 FileReaderBuffer
*buf
, ///< [in] reader state
1421 uint64_t off
, ///< [in] offset
1422 size_t len
, ///< [in] this many bytes
1423 bufferlist
*outbl
, ///< [out] optional: reference the result here
1424 char *out
) ///< [out] optional: or copy it here
1426 bool prefetch
= !outbl
&& !out
;
1427 dout(10) << __func__
<< " h " << h
1428 << " 0x" << std::hex
<< off
<< "~" << len
<< std::dec
1429 << " from " << h
->file
->fnode
1430 << (prefetch
? " prefetch" : "")
1433 ++h
->file
->num_reading
;
1435 if (!h
->ignore_eof
&&
1436 off
+ len
> h
->file
->fnode
.size
) {
1437 if (off
> h
->file
->fnode
.size
)
1440 len
= h
->file
->fnode
.size
- off
;
1441 dout(20) << __func__
<< " reaching (or past) eof, len clipped to 0x"
1442 << std::hex
<< len
<< std::dec
<< dendl
;
1444 logger
->inc(l_bluefs_read_count
, 1);
1445 logger
->inc(l_bluefs_read_bytes
, len
);
1447 logger
->inc(l_bluefs_read_prefetch_count
, 1);
1448 logger
->inc(l_bluefs_read_prefetch_bytes
, len
);
1455 std::shared_lock
s_lock(h
->lock
);
1458 if (off
< buf
->bl_off
|| off
>= buf
->get_buf_end()) {
1460 std::unique_lock
u_lock(h
->lock
);
1461 if (off
< buf
->bl_off
|| off
>= buf
->get_buf_end()) {
1462 // if precondition hasn't changed during locking upgrade.
1464 buf
->bl_off
= off
& super
.block_mask();
1466 auto p
= h
->file
->fnode
.seek(buf
->bl_off
, &x_off
);
1467 uint64_t want
= round_up_to(len
+ (off
& ~super
.block_mask()),
1469 want
= std::max(want
, buf
->max_prefetch
);
1470 uint64_t l
= std::min(p
->length
- x_off
, want
);
1471 uint64_t eof_offset
= round_up_to(h
->file
->fnode
.size
, super
.block_size
);
1472 if (!h
->ignore_eof
&&
1473 buf
->bl_off
+ l
> eof_offset
) {
1474 l
= eof_offset
- buf
->bl_off
;
1476 dout(20) << __func__
<< " fetching 0x"
1477 << std::hex
<< x_off
<< "~" << l
<< std::dec
1478 << " of " << *p
<< dendl
;
1479 int r
= bdev
[p
->bdev
]->read(p
->offset
+ x_off
, l
, &buf
->bl
, ioc
[p
->bdev
],
1480 cct
->_conf
->bluefs_buffered_io
);
1481 ceph_assert(r
== 0);
1485 // we should recheck if buffer is valid after lock downgrade
1488 left
= buf
->get_buf_remaining(off
);
1489 dout(20) << __func__
<< " left 0x" << std::hex
<< left
1490 << " len 0x" << len
<< std::dec
<< dendl
;
1492 int r
= std::min(len
, left
);
1495 t
.substr_of(buf
->bl
, off
- buf
->bl_off
, r
);
1496 outbl
->claim_append(t
);
1499 // NOTE: h->bl is normally a contiguous buffer so c_str() is free.
1500 memcpy(out
, buf
->bl
.c_str() + off
- buf
->bl_off
, r
);
1504 dout(30) << __func__
<< " result chunk (0x"
1505 << std::hex
<< r
<< std::dec
<< " bytes):\n";
1507 t
.substr_of(buf
->bl
, off
- buf
->bl_off
, r
);
1517 dout(20) << __func__
<< " got " << ret
<< dendl
;
1518 ceph_assert(!outbl
|| (int)outbl
->length() == ret
);
1519 --h
->file
->num_reading
;
1523 void BlueFS::_invalidate_cache(FileRef f
, uint64_t offset
, uint64_t length
)
1525 dout(10) << __func__
<< " file " << f
->fnode
1526 << " 0x" << std::hex
<< offset
<< "~" << length
<< std::dec
1528 if (offset
& ~super
.block_mask()) {
1529 offset
&= super
.block_mask();
1530 length
= round_up_to(length
, super
.block_size
);
1533 auto p
= f
->fnode
.seek(offset
, &x_off
);
1534 while (length
> 0 && p
!= f
->fnode
.extents
.end()) {
1535 uint64_t x_len
= std::min(p
->length
- x_off
, length
);
1536 bdev
[p
->bdev
]->invalidate_cache(p
->offset
+ x_off
, x_len
);
1537 dout(20) << __func__
<< " 0x" << std::hex
<< x_off
<< "~" << x_len
1538 << std:: dec
<< " of " << *p
<< dendl
;
1544 uint64_t BlueFS::_estimate_log_size()
1546 int avg_dir_size
= 40; // fixme
1547 int avg_file_size
= 12;
1548 uint64_t size
= 4096 * 2;
1549 size
+= file_map
.size() * (1 + sizeof(bluefs_fnode_t
));
1550 for (auto& p
: block_all
)
1551 size
+= p
.num_intervals() * (1 + 1 + sizeof(uint64_t) * 2);
1552 size
+= dir_map
.size() + (1 + avg_dir_size
);
1553 size
+= file_map
.size() * (1 + avg_dir_size
+ avg_file_size
);
1554 return round_up_to(size
, super
.block_size
);
1557 void BlueFS::compact_log()
1559 std::unique_lock
l(lock
);
1560 if (cct
->_conf
->bluefs_compact_log_sync
) {
1561 _compact_log_sync();
1563 _compact_log_async(l
);
1567 bool BlueFS::_should_compact_log()
1569 uint64_t current
= log_writer
->file
->fnode
.size
;
1570 uint64_t expected
= _estimate_log_size();
1571 float ratio
= (float)current
/ (float)expected
;
1572 dout(10) << __func__
<< " current 0x" << std::hex
<< current
1573 << " expected " << expected
<< std::dec
1574 << " ratio " << ratio
1575 << (new_log
? " (async compaction in progress)" : "")
1578 current
< cct
->_conf
->bluefs_log_compact_min_size
||
1579 ratio
< cct
->_conf
->bluefs_log_compact_min_ratio
) {
// Encode the complete in-memory metadata into transaction `t`.
//
// As far as the visible code shows, the transaction receives, in order:
//  - the superblock uuid;
//  - one op_alloc_add per interval in block_all for every device, with the
//    device id remapped according to `flags` (REMOVE_WAL, REMOVE_DB,
//    RENAME_SLOW2DB, RENAME_DB2SLOW, and the BDEV_NEWWAL / BDEV_NEWDB ids);
//  - one op_file_update per entry in file_map (inode numbers are asserted to
//    be > 1), after applying the same device remapping to its extents;
//  - one op_dir_create per directory and one op_dir_link per directory entry.
// NOTE(review): several remapping branches have lines missing from this
// excerpt -- confirm the exact mapping against the full file.
1585 void BlueFS::_compact_log_dump_metadata(bluefs_transaction_t
*t
,
1589 t
->uuid
= super
.uuid
;
1590 dout(20) << __func__
<< " op_init" << dendl
;
1593 for (unsigned bdev
= 0; bdev
< MAX_BDEV
; ++bdev
) {
1594 interval_set
<uint64_t>& p
= block_all
[bdev
];
1595 for (interval_set
<uint64_t>::iterator q
= p
.begin(); q
!= p
.end(); ++q
) {
1596 auto bdev_new
= bdev
;
1597 if ((flags
& REMOVE_WAL
) && bdev
== BDEV_WAL
) {
1600 if ((flags
& REMOVE_DB
) && bdev
== BDEV_DB
) {
1603 if ((flags
& RENAME_SLOW2DB
) && bdev
== BDEV_SLOW
) {
1606 if ((flags
& RENAME_DB2SLOW
) && bdev
== BDEV_DB
) {
1607 bdev_new
= BDEV_SLOW
;
1609 if (bdev
== BDEV_NEWDB
) {
1610 // REMOVE_DB xor RENAME_DB
1611 ceph_assert(!(flags
& REMOVE_DB
) != !(flags
& RENAME_DB2SLOW
));
1612 ceph_assert(!(flags
& RENAME_SLOW2DB
));
1615 if (bdev
== BDEV_NEWWAL
) {
1616 ceph_assert(flags
& REMOVE_WAL
);
1617 bdev_new
= BDEV_WAL
;
1619 dout(20) << __func__
<< " op_alloc_add " << bdev_new
<< " 0x"
1620 << std::hex
<< q
.get_start() << "~" << q
.get_len() << std::dec
1622 t
->op_alloc_add(bdev_new
, q
.get_start(), q
.get_len());
1625 for (auto& p
: file_map
) {
1628 ceph_assert(p
.first
> 1);
1630 for(auto& e
: p
.second
->fnode
.extents
) {
1632 auto bdev_new
= bdev
;
1633 ceph_assert(!((flags
& REMOVE_WAL
) && bdev
== BDEV_WAL
));
1634 if ((flags
& RENAME_SLOW2DB
) && bdev
== BDEV_SLOW
) {
1637 if ((flags
& RENAME_DB2SLOW
) && bdev
== BDEV_DB
) {
1638 bdev_new
= BDEV_SLOW
;
1640 if (bdev
== BDEV_NEWDB
) {
1641 // REMOVE_DB xor RENAME_DB
1642 ceph_assert(!(flags
& REMOVE_DB
) != !(flags
& RENAME_DB2SLOW
));
1643 ceph_assert(!(flags
& RENAME_SLOW2DB
));
1646 if (bdev
== BDEV_NEWWAL
) {
1647 ceph_assert(flags
& REMOVE_WAL
);
1648 bdev_new
= BDEV_WAL
;
1652 dout(20) << __func__
<< " op_file_update " << p
.second
->fnode
<< dendl
;
1653 t
->op_file_update(p
.second
->fnode
);
1655 for (auto& p
: dir_map
) {
1656 dout(20) << __func__
<< " op_dir_create " << p
.first
<< dendl
;
1657 t
->op_dir_create(p
.first
);
1658 for (auto& q
: p
.second
->file_map
) {
1659 dout(20) << __func__
<< " op_dir_link " << p
.first
<< "/" << q
.first
1660 << " to " << q
.second
->fnode
.ino
<< dendl
;
1661 t
->op_dir_link(p
.first
, q
.first
, q
.second
->fnode
.ino
);
1666 void BlueFS::_compact_log_sync()
1668 dout(10) << __func__
<< dendl
;
1669 _rewrite_log_sync(true,
1671 log_writer
->file
->fnode
.prefer_bdev
,
1672 log_writer
->file
->fnode
.prefer_bdev
,
1674 logger
->inc(l_bluefs_log_compactions
);
// Rewrite the whole log synchronously from the in-memory metadata.
//
// As far as the visible code shows:
//  - a transaction holding the full metadata dump plus op_jump_seq(log_seq)
//    is built;
//  - the log file's current extents are swapped out and new space of
//    (encoded size + bluefs_max_log_runway) is allocated on `log_dev`, via
//    _allocate() when `allocate_with_fallback` is true and via
//    _allocate_without_fallback() otherwise (success is asserted);
//  - the log writer is closed and recreated, the new contents are appended
//    and flushed, and outstanding aios are waited for unless
//    bluefs_sync_write is set;
//  - the log fnode is copied into the superblock, its extents are relabelled
//    to `log_dev_new` when that differs from `log_dev`, and the superblock
//    is written to `super_dev`;
//  - the old log extents are queued in pending_release.
// The existing in-body comment warns callers: the log is cleared here.
1677 void BlueFS::_rewrite_log_sync(bool allocate_with_fallback
,
1683 File
*log_file
= log_writer
->file
.get();
1685 // clear out log (be careful who calls us!!!)
1688 dout(20) << __func__
<< " super_dev:" << super_dev
1689 << " log_dev:" << log_dev
1690 << " log_dev_new:" << log_dev_new
1691 << " flags:" << flags
1693 bluefs_transaction_t t
;
1694 _compact_log_dump_metadata(&t
, flags
);
1696 dout(20) << __func__
<< " op_jump_seq " << log_seq
<< dendl
;
1697 t
.op_jump_seq(log_seq
);
1703 uint64_t need
= bl
.length() + cct
->_conf
->bluefs_max_log_runway
;
1704 dout(20) << __func__
<< " need " << need
<< dendl
;
1706 bluefs_fnode_t old_fnode
;
1708 log_file
->fnode
.swap_extents(old_fnode
);
1709 if (allocate_with_fallback
) {
1710 r
= _allocate(log_dev
, need
, &log_file
->fnode
);
1711 ceph_assert(r
== 0);
1713 PExtentVector extents
;
1714 r
= _allocate_without_fallback(log_dev
,
1717 ceph_assert(r
== 0);
1718 for (auto& p
: extents
) {
1719 log_file
->fnode
.append_extent(
1720 bluefs_extent_t(log_dev
, p
.offset
, p
.length
));
1724 _close_writer(log_writer
);
1726 log_file
->fnode
.size
= bl
.length();
1727 log_writer
= _create_writer(log_file
);
1728 log_writer
->append(bl
);
1729 r
= _flush(log_writer
, true);
1730 ceph_assert(r
== 0);
1732 if (!cct
->_conf
->bluefs_sync_write
) {
1733 list
<aio_t
> completed_ios
;
1734 _claim_completed_aios(log_writer
, &completed_ios
);
1735 wait_for_aio(log_writer
);
1736 completed_ios
.clear();
1741 super
.log_fnode
= log_file
->fnode
;
1742 // rename device if needed
1743 if (log_dev
!= log_dev_new
) {
1744 dout(10) << __func__
<< " renaming log extents to " << log_dev_new
<< dendl
;
1745 for (auto& p
: super
.log_fnode
.extents
) {
1746 p
.bdev
= log_dev_new
;
1749 dout(10) << __func__
<< " writing super, log fnode: " << super
.log_fnode
<< dendl
;
1752 _write_super(super_dev
);
1755 dout(10) << __func__
<< " release old log extents " << old_fnode
.extents
<< dendl
;
1756 for (auto& r
: old_fnode
.extents
) {
1757 pending_release
[r
.bdev
].insert(r
.offset
, r
.length
);
1762 * 1. Allocate a new extent to continue the log, and then log an event
1763 * that jumps the log write position to the new extent. At this point, the
1764 * old extent(s) won't be written to, and reflect everything to compact.
1765 * New events will be written to the new region that we'll keep.
1767 * 2. While still holding the lock, encode a bufferlist that dumps all of the
1768 * in-memory fnodes and names. This will become the new beginning of the
1769 * log. The last event will jump to the log continuation extent from #1.
1771 * 3. Queue a write to a new extent for the new beginning of the log.
1773 * 4. Drop lock and wait
1775 * 5. Retake the lock.
1777 * 6. Update the log_fnode to splice in the new beginning.
1779 * 7. Write the new superblock.
1781 * 8. Release the old log space. Clean up.
// Asynchronous log compaction; the numbered steps are described in the
// comment block preceding this function.
//
// `l` is the caller's lock and is handed on to _flush_and_sync_log().
// As far as the visible code shows, the function asserts that no other
// compaction is in progress (new_log and new_log_writer unset), waits while
// log_flushing is set, extends the current log and records a jump to
// old_log_jump_to, writes the compacted metadata through a temporary
// new_log_writer, splices the remaining old extents behind the new ones,
// writes the superblock to BDEV_DB, queues the discarded extents in
// pending_release, wakes log_cond waiters and bumps the compaction counter.
1783 void BlueFS::_compact_log_async(std::unique_lock
<ceph::mutex
>& l
)
1785 dout(10) << __func__
<< dendl
;
1786 File
*log_file
= log_writer
->file
.get();
1787 ceph_assert(!new_log
);
1788 ceph_assert(!new_log_writer
);
1790 // create a new log [writer] so that we know compaction is in progress
1791 // (see _should_compact_log)
1793 new_log
->fnode
.ino
= 0; // so that _flush_range won't try to log the fnode
1795 // 0. wait for any racing flushes to complete. (We do not want to block
1796 // in _flush_sync_log with jump_to set or else a racing thread might flush
1797 // our entries and our jump_to update won't be correct.)
1798 while (log_flushing
) {
1799 dout(10) << __func__
<< " log is currently flushing, waiting" << dendl
;
1803 // 1. allocate new log space and jump to it.
1804 old_log_jump_to
= log_file
->fnode
.get_allocated();
1805 dout(10) << __func__
<< " old_log_jump_to 0x" << std::hex
<< old_log_jump_to
1806 << " need 0x" << (old_log_jump_to
+ cct
->_conf
->bluefs_max_log_runway
) << std::dec
<< dendl
;
1807 int r
= _allocate(log_file
->fnode
.prefer_bdev
,
1808 cct
->_conf
->bluefs_max_log_runway
, &log_file
->fnode
);
1809 ceph_assert(r
== 0);
1810 dout(10) << __func__
<< " log extents " << log_file
->fnode
.extents
<< dendl
;
1812 // update the log file change and log a jump to the offset where we want to
1813 // write the new entries
1814 log_t
.op_file_update(log_file
->fnode
);
1815 log_t
.op_jump(log_seq
, old_log_jump_to
);
1817 flush_bdev(); // FIXME?
1819 _flush_and_sync_log(l
, 0, old_log_jump_to
);
1821 // 2. prepare compacted log
1822 bluefs_transaction_t t
;
1823 //avoid record two times in log_t and _compact_log_dump_metadata.
1825 _compact_log_dump_metadata(&t
, 0);
1827 // conservative estimate for final encoded size
1828 new_log_jump_to
= round_up_to(t
.op_bl
.length() + super
.block_size
* 2,
1829 cct
->_conf
->bluefs_alloc_size
);
1830 t
.op_jump(log_seq
, new_log_jump_to
);
1833 r
= _allocate(BlueFS::BDEV_DB
, new_log_jump_to
,
1835 ceph_assert(r
== 0);
1837 // we might have some more ops in log_t due to _allocate call
1844 dout(10) << __func__
<< " new_log_jump_to 0x" << std::hex
<< new_log_jump_to
1845 << std::dec
<< dendl
;
1847 new_log_writer
= _create_writer(new_log
);
1848 new_log_writer
->append(bl
);
1851 r
= _flush(new_log_writer
, true);
1852 ceph_assert(r
== 0);
1855 _flush_bdev_safely(new_log_writer
);
1857 // 5. update our log fnode
1858 // discard first old_log_jump_to extents
1859 dout(10) << __func__
<< " remove 0x" << std::hex
<< old_log_jump_to
<< std::dec
1860 << " of " << log_file
->fnode
.extents
<< dendl
;
1861 uint64_t discarded
= 0;
1862 mempool::bluefs::vector
<bluefs_extent_t
> old_extents
;
1863 while (discarded
< old_log_jump_to
) {
1864 ceph_assert(!log_file
->fnode
.extents
.empty());
1865 bluefs_extent_t
& e
= log_file
->fnode
.extents
.front();
1866 bluefs_extent_t temp
= e
;
1867 if (discarded
+ e
.length
<= old_log_jump_to
) {
1868 dout(10) << __func__
<< " remove old log extent " << e
<< dendl
;
1869 discarded
+= e
.length
;
1870 log_file
->fnode
.pop_front_extent();
1872 dout(10) << __func__
<< " remove front of old log extent " << e
<< dendl
;
1873 uint64_t drop
= old_log_jump_to
- discarded
;
1878 dout(10) << __func__
<< " kept " << e
<< " removed " << temp
<< dendl
;
1880 old_extents
.push_back(temp
);
1882 auto from
= log_file
->fnode
.extents
.begin();
1883 auto to
= log_file
->fnode
.extents
.end();
1884 while (from
!= to
) {
1885 new_log
->fnode
.append_extent(*from
);
1889 // clear the extents from old log file, they are added to new log
1890 log_file
->fnode
.clear_extents();
1891 // swap the log files. New log file is the log file now.
1892 new_log
->fnode
.swap_extents(log_file
->fnode
);
1894 log_writer
->pos
= log_writer
->file
->fnode
.size
=
1895 log_writer
->pos
- old_log_jump_to
+ new_log_jump_to
;
1897 // 6. write the super block to reflect the changes
1898 dout(10) << __func__
<< " writing super" << dendl
;
1899 super
.log_fnode
= log_file
->fnode
;
1901 _write_super(BDEV_DB
);
1907 // 7. release old space
1908 dout(10) << __func__
<< " release old log extents " << old_extents
<< dendl
;
1909 for (auto& r
: old_extents
) {
1910 pending_release
[r
.bdev
].insert(r
.offset
, r
.length
);
1913 // delete the new log, remove from the dirty files list
1914 _close_writer(new_log_writer
);
1915 if (new_log
->dirty_seq
) {
1916 ceph_assert(dirty_files
.count(new_log
->dirty_seq
));
1917 auto it
= dirty_files
[new_log
->dirty_seq
].iterator_to(*new_log
);
1918 dirty_files
[new_log
->dirty_seq
].erase(it
);
1920 new_log_writer
= nullptr;
1922 log_cond
.notify_all();
1924 dout(10) << __func__
<< " log extents " << log_file
->fnode
.extents
<< dendl
;
1925 logger
->inc(l_bluefs_log_compactions
);
1928 void BlueFS::_pad_bl(bufferlist
& bl
)
1930 uint64_t partial
= bl
.length() % super
.block_size
;
1932 dout(10) << __func__
<< " padding with 0x" << std::hex
1933 << super
.block_size
- partial
<< " zeros" << std::dec
<< dendl
;
1934 bl
.append_zero(super
.block_size
- partial
);
1938 void BlueFS::flush_log()
1940 std::unique_lock
l(lock
);
1942 _flush_and_sync_log(l
);
// Write the pending log transaction (log_t) to the log file and make it
// stable.
//
// As far as the visible code shows:
//  - waits while another flush is in progress (log_flushing);
//  - is a no-op when `want_seq` is already <= log_seq_stable, or when log_t
//    and dirty_files are both empty (`jump_to` must be unset in these cases);
//  - takes over pending_release, assigns the next log sequence number, and
//    adds an op_file_update for every file dirty at that sequence;
//  - allocates more log runway when less than bluefs_min_log_runway remains,
//    waiting for a running async compaction first;
//  - appends the block-padded transaction to the log writer and flushes it;
//    when `jump_to` is set, the writer position and log size are moved there;
//  - flushes the devices, advances log_seq_stable, cleans the dirty-file
//    lists up to that sequence, then discards (if enabled) and releases the
//    extents taken from pending_release and updates logger stats.
// NOTE(review): the remaining parameters (`want_seq`, `jump_to`) and the
// return statements are on lines missing from this excerpt.
1945 int BlueFS::_flush_and_sync_log(std::unique_lock
<ceph::mutex
>& l
,
1949 while (log_flushing
) {
1950 dout(10) << __func__
<< " want_seq " << want_seq
1951 << " log is currently flushing, waiting" << dendl
;
1952 ceph_assert(!jump_to
);
1955 if (want_seq
&& want_seq
<= log_seq_stable
) {
1956 dout(10) << __func__
<< " want_seq " << want_seq
<< " <= log_seq_stable "
1957 << log_seq_stable
<< ", done" << dendl
;
1958 ceph_assert(!jump_to
);
1961 if (log_t
.empty() && dirty_files
.empty()) {
1962 dout(10) << __func__
<< " want_seq " << want_seq
1963 << " " << log_t
<< " not dirty, dirty_files empty, no-op" << dendl
;
1964 ceph_assert(!jump_to
);
1968 vector
<interval_set
<uint64_t>> to_release(pending_release
.size());
1969 to_release
.swap(pending_release
);
1971 uint64_t seq
= log_t
.seq
= ++log_seq
;
1972 ceph_assert(want_seq
== 0 || want_seq
<= seq
);
1973 log_t
.uuid
= super
.uuid
;
1976 auto lsi
= dirty_files
.find(seq
);
1977 if (lsi
!= dirty_files
.end()) {
1978 dout(20) << __func__
<< " " << lsi
->second
.size() << " dirty_files" << dendl
;
1979 for (auto &f
: lsi
->second
) {
1980 dout(20) << __func__
<< " op_file_update " << f
.fnode
<< dendl
;
1981 log_t
.op_file_update(f
.fnode
);
1985 dout(10) << __func__
<< " " << log_t
<< dendl
;
1986 ceph_assert(!log_t
.empty());
1988 // allocate some more space (before we run out)?
1989 int64_t runway
= log_writer
->file
->fnode
.get_allocated() -
1990 log_writer
->get_effective_write_pos();
1991 if (runway
< (int64_t)cct
->_conf
->bluefs_min_log_runway
) {
1992 dout(10) << __func__
<< " allocating more log runway (0x"
1993 << std::hex
<< runway
<< std::dec
<< " remaining)" << dendl
;
1994 while (new_log_writer
) {
1995 dout(10) << __func__
<< " waiting for async compaction" << dendl
;
1998 int r
= _allocate(log_writer
->file
->fnode
.prefer_bdev
,
1999 cct
->_conf
->bluefs_max_log_runway
,
2000 &log_writer
->file
->fnode
);
2001 ceph_assert(r
== 0);
2002 log_t
.op_file_update(log_writer
->file
->fnode
);
2006 bl
.reserve(super
.block_size
);
2008 // pad to block boundary
2009 size_t realign
= super
.block_size
- (bl
.length() % super
.block_size
);
2010 if (realign
&& realign
!= super
.block_size
)
2011 bl
.append_zero(realign
);
2013 logger
->inc(l_bluefs_logged_bytes
, bl
.length());
2015 log_writer
->append(bl
);
2018 log_t
.seq
= 0; // just so debug output is less confusing
2019 log_flushing
= true;
2021 int r
= _flush(log_writer
, true);
2022 ceph_assert(r
== 0);
2025 dout(10) << __func__
<< " jumping log offset from 0x" << std::hex
2026 << log_writer
->pos
<< " -> 0x" << jump_to
<< std::dec
<< dendl
;
2027 log_writer
->pos
= jump_to
;
2028 log_writer
->file
->fnode
.size
= jump_to
;
2031 _flush_bdev_safely(log_writer
);
2033 log_flushing
= false;
2034 log_cond
.notify_all();
2036 // clean dirty files
2037 if (seq
> log_seq_stable
) {
2038 log_seq_stable
= seq
;
2039 dout(20) << __func__
<< " log_seq_stable " << log_seq_stable
<< dendl
;
2041 auto p
= dirty_files
.begin();
2042 while (p
!= dirty_files
.end()) {
2043 if (p
->first
> log_seq_stable
) {
2044 dout(20) << __func__
<< " done cleaning up dirty files" << dendl
;
2048 auto l
= p
->second
.begin();
2049 while (l
!= p
->second
.end()) {
2051 ceph_assert(file
->dirty_seq
> 0);
2052 ceph_assert(file
->dirty_seq
<= log_seq_stable
);
2053 dout(20) << __func__
<< " cleaned file " << file
->fnode
<< dendl
;
2054 file
->dirty_seq
= 0;
2055 p
->second
.erase(l
++);
2058 ceph_assert(p
->second
.empty());
2059 dirty_files
.erase(p
++);
2062 dout(20) << __func__
<< " log_seq_stable " << log_seq_stable
2063 << " already >= out seq " << seq
2064 << ", we lost a race against another log flush, done" << dendl
;
2067 for (unsigned i
= 0; i
< to_release
.size(); ++i
) {
2068 if (!to_release
[i
].empty()) {
2069 /* OK, now we have the guarantee alloc[i] won't be null. */
2071 if (cct
->_conf
->bdev_enable_discard
&& cct
->_conf
->bdev_async_discard
) {
2072 r
= bdev
[i
]->queue_discard(to_release
[i
]);
2075 } else if (cct
->_conf
->bdev_enable_discard
) {
2076 for (auto p
= to_release
[i
].begin(); p
!= to_release
[i
].end(); ++p
) {
2077 bdev
[i
]->discard(p
.get_start(), p
.get_len());
2080 alloc
[i
]->release(to_release
[i
]);
2084 _update_logger_stats();
// Write the buffered bytes [offset, offset + length) of writer `h` to the
// file's extents.
//
// As far as the visible code shows:
//  - the file must not be deleted and must have no readers (asserted);
//  - a range that lies entirely below h->pos needs no work, and a range that
//    starts below h->pos is shortened accordingly;
//  - when the range extends past the allocated space, more space is allocated
//    on the file's preferred device (never for inode 1, the log); a failed
//    allocation ends in ceph_abort_msg("bluefs enospc");
//  - with bluefs_preextend_wal_files, a WAL writer's file size is extended to
//    the allocated size;
//  - a file whose size or allocation changed is (re)queued in dirty_files
//    under log_seq + 1;
//  - a cached partial tail block is prepended, the data is written extent by
//    extent (write() with bluefs_sync_write, aio_write() otherwise), the last
//    block is zero-padded and its unpadded tail cached in h->tail_block;
//  - pending aios are submitted and the written-bytes counters updated.
// NOTE(review): return statements are on lines missing from this excerpt.
2089 int BlueFS::_flush_range(FileWriter
*h
, uint64_t offset
, uint64_t length
)
2091 dout(10) << __func__
<< " " << h
<< " pos 0x" << std::hex
<< h
->pos
2092 << " 0x" << offset
<< "~" << length
<< std::dec
2093 << " to " << h
->file
->fnode
<< dendl
;
2094 ceph_assert(!h
->file
->deleted
);
2095 ceph_assert(h
->file
->num_readers
.load() == 0);
2097 h
->buffer_appender
.flush();
2100 if (h
->file
->fnode
.ino
== 1)
2103 buffered
= cct
->_conf
->bluefs_buffered_io
;
2105 if (offset
+ length
<= h
->pos
)
2107 if (offset
< h
->pos
) {
2108 length
-= h
->pos
- offset
;
2110 dout(10) << " still need 0x"
2111 << std::hex
<< offset
<< "~" << length
<< std::dec
2114 ceph_assert(offset
<= h
->file
->fnode
.size
);
2116 uint64_t allocated
= h
->file
->fnode
.get_allocated();
2118 // do not bother to dirty the file if we are overwriting
2119 // previously allocated extents.
2120 bool must_dirty
= false;
2121 if (allocated
< offset
+ length
) {
2122 // we should never run out of log space here; see the min runway check
2123 // in _flush_and_sync_log.
2124 ceph_assert(h
->file
->fnode
.ino
!= 1);
2125 int r
= _allocate(h
->file
->fnode
.prefer_bdev
,
2126 offset
+ length
- allocated
,
2129 derr
<< __func__
<< " allocated: 0x" << std::hex
<< allocated
2130 << " offset: 0x" << offset
<< " length: 0x" << length
<< std::dec
2132 ceph_abort_msg("bluefs enospc");
2135 if (cct
->_conf
->bluefs_preextend_wal_files
&&
2136 h
->writer_type
== WRITER_WAL
) {
2137 // NOTE: this *requires* that rocksdb also has log recycling
2138 // enabled and is therefore doing robust CRCs on the log
2139 // records. otherwise, we will fail to replay the rocksdb log
2140 // properly due to garbage on the device.
2141 h
->file
->fnode
.size
= h
->file
->fnode
.get_allocated();
2142 dout(10) << __func__
<< " extending WAL size to 0x" << std::hex
2143 << h
->file
->fnode
.size
<< std::dec
<< " to include allocated"
2148 if (h
->file
->fnode
.size
< offset
+ length
) {
2149 h
->file
->fnode
.size
= offset
+ length
;
2150 if (h
->file
->fnode
.ino
> 1) {
2151 // we do not need to dirty the log file (or it's compacting
2152 // replacement) when the file size changes because replay is
2153 // smart enough to discover it on its own.
2158 h
->file
->fnode
.mtime
= ceph_clock_now();
2159 ceph_assert(h
->file
->fnode
.ino
>= 1);
2160 if (h
->file
->dirty_seq
== 0) {
2161 h
->file
->dirty_seq
= log_seq
+ 1;
2162 dirty_files
[h
->file
->dirty_seq
].push_back(*h
->file
);
2163 dout(20) << __func__
<< " dirty_seq = " << log_seq
+ 1
2164 << " (was clean)" << dendl
;
2166 if (h
->file
->dirty_seq
!= log_seq
+ 1) {
2167 // need re-dirty, erase from list first
2168 ceph_assert(dirty_files
.count(h
->file
->dirty_seq
));
2169 auto it
= dirty_files
[h
->file
->dirty_seq
].iterator_to(*h
->file
);
2170 dirty_files
[h
->file
->dirty_seq
].erase(it
);
2171 h
->file
->dirty_seq
= log_seq
+ 1;
2172 dirty_files
[h
->file
->dirty_seq
].push_back(*h
->file
);
2173 dout(20) << __func__
<< " dirty_seq = " << log_seq
+ 1
2174 << " (was " << h
->file
->dirty_seq
<< ")" << dendl
;
2176 dout(20) << __func__
<< " dirty_seq = " << log_seq
+ 1
2177 << " (unchanged, do nothing) " << dendl
;
2181 dout(20) << __func__
<< " file now " << h
->file
->fnode
<< dendl
;
2184 auto p
= h
->file
->fnode
.seek(offset
, &x_off
);
2185 ceph_assert(p
!= h
->file
->fnode
.extents
.end());
2186 dout(20) << __func__
<< " in " << *p
<< " x_off 0x"
2187 << std::hex
<< x_off
<< std::dec
<< dendl
;
2189 unsigned partial
= x_off
& ~super
.block_mask();
2192 dout(20) << __func__
<< " using partial tail 0x"
2193 << std::hex
<< partial
<< std::dec
<< dendl
;
2194 ceph_assert(h
->tail_block
.length() == partial
);
2195 bl
.claim_append_piecewise(h
->tail_block
);
2199 dout(20) << __func__
<< " waiting for previous aio to complete" << dendl
;
2200 for (auto p
: h
->iocv
) {
2206 if (length
== partial
+ h
->buffer
.length()) {
2207 bl
.claim_append_piecewise(h
->buffer
);
2210 h
->buffer
.splice(0, length
, &t
);
2211 bl
.claim_append_piecewise(t
);
2212 t
.substr_of(h
->buffer
, length
, h
->buffer
.length() - length
);
2214 dout(20) << " leaving 0x" << std::hex
<< h
->buffer
.length() << std::dec
2215 << " unflushed" << dendl
;
2217 ceph_assert(bl
.length() == length
);
2219 switch (h
->writer_type
) {
2221 logger
->inc(l_bluefs_bytes_written_wal
, length
);
2224 logger
->inc(l_bluefs_bytes_written_sst
, length
);
2228 dout(30) << "dump:\n";
2232 h
->pos
= offset
+ length
;
2233 h
->tail_block
.clear();
2236 uint64_t bytes_written_slow
= 0;
2237 while (length
> 0) {
2238 uint64_t x_len
= std::min(p
->length
- x_off
, length
);
2240 t
.substr_of(bl
, bloff
, x_len
);
2241 unsigned tail
= x_len
& ~super
.block_mask();
2243 size_t zlen
= super
.block_size
- tail
;
2244 dout(20) << __func__
<< " caching tail of 0x"
2246 << " and padding block with 0x" << zlen
2247 << std::dec
<< dendl
;
2248 h
->tail_block
.substr_of(bl
, bl
.length() - tail
, tail
);
2249 if (h
->file
->fnode
.ino
> 1) {
2250 // we are using the page_aligned_appender, and can safely use
2251 // the tail of the raw buffer.
2252 const bufferptr
&last
= t
.back();
2253 if (last
.unused_tail_length() < zlen
) {
2254 derr
<< " wtf, last is " << last
<< " from " << t
<< dendl
;
2255 ceph_assert(last
.unused_tail_length() >= zlen
);
2258 z
.set_offset(last
.offset() + last
.length());
2261 t
.append(z
, 0, zlen
);
2263 t
.append_zero(zlen
);
2266 if (cct
->_conf
->bluefs_sync_write
) {
2267 bdev
[p
->bdev
]->write(p
->offset
+ x_off
, t
, buffered
, h
->write_hint
);
2269 bdev
[p
->bdev
]->aio_write(p
->offset
+ x_off
, t
, h
->iocv
[p
->bdev
], buffered
, h
->write_hint
);
2271 h
->dirty_devs
[p
->bdev
] = true;
2272 if (p
->bdev
== BDEV_SLOW
) {
2273 bytes_written_slow
+= t
.length();
2281 logger
->inc(l_bluefs_bytes_written_slow
, bytes_written_slow
);
2282 for (unsigned i
= 0; i
< MAX_BDEV
; ++i
) {
2284 if (h
->iocv
[i
] && h
->iocv
[i
]->has_pending_aios()) {
2285 bdev
[i
]->aio_submit(h
->iocv
[i
]);
2289 dout(20) << __func__
<< " h " << h
<< " pos now 0x"
2290 << std::hex
<< h
->pos
<< std::dec
<< dendl
;
2295 // we need to retire old completed aios so they don't stick around in
2296 // memory indefinitely (along with their bufferlist refs).
2297 void BlueFS::_claim_completed_aios(FileWriter
*h
, list
<aio_t
> *ls
)
2299 for (auto p
: h
->iocv
) {
2301 ls
->splice(ls
->end(), p
->running_aios
);
2304 dout(10) << __func__
<< " got " << ls
->size() << " aios" << dendl
;
2307 void BlueFS::wait_for_aio(FileWriter
*h
)
2309 // NOTE: this is safe to call without a lock, as long as our reference is
2311 dout(10) << __func__
<< " " << h
<< dendl
;
2312 utime_t start
= ceph_clock_now();
2313 for (auto p
: h
->iocv
) {
2318 dout(10) << __func__
<< " " << h
<< " done in " << (ceph_clock_now() - start
) << dendl
;
2322 int BlueFS::_flush(FileWriter
*h
, bool force
)
2324 h
->buffer_appender
.flush();
2325 uint64_t length
= h
->buffer
.length();
2326 uint64_t offset
= h
->pos
;
2328 length
< cct
->_conf
->bluefs_min_flush_size
) {
2329 dout(10) << __func__
<< " " << h
<< " ignoring, length " << length
2330 << " < min_flush_size " << cct
->_conf
->bluefs_min_flush_size
2335 dout(10) << __func__
<< " " << h
<< " no dirty data on "
2336 << h
->file
->fnode
<< dendl
;
2339 dout(10) << __func__
<< " " << h
<< " 0x"
2340 << std::hex
<< offset
<< "~" << length
<< std::dec
2341 << " to " << h
->file
->fnode
<< dendl
;
2342 ceph_assert(h
->pos
<= h
->file
->fnode
.size
);
2343 return _flush_range(h
, offset
, length
);
2346 int BlueFS::_truncate(FileWriter
*h
, uint64_t offset
)
2348 dout(10) << __func__
<< " 0x" << std::hex
<< offset
<< std::dec
2349 << " file " << h
->file
->fnode
<< dendl
;
2350 if (h
->file
->deleted
) {
2351 dout(10) << __func__
<< " deleted, no-op" << dendl
;
2355 // we never truncate internal log files
2356 ceph_assert(h
->file
->fnode
.ino
> 1);
2358 h
->buffer_appender
.flush();
2360 // truncate off unflushed data?
2361 if (h
->pos
< offset
&&
2362 h
->pos
+ h
->buffer
.length() > offset
) {
2364 dout(20) << __func__
<< " tossing out last " << offset
- h
->pos
2365 << " unflushed bytes" << dendl
;
2366 t
.substr_of(h
->buffer
, 0, offset
- h
->pos
);
2368 ceph_abort_msg("actually this shouldn't happen");
2370 if (h
->buffer
.length()) {
2371 int r
= _flush(h
, true);
2375 if (offset
== h
->file
->fnode
.size
) {
2378 if (offset
> h
->file
->fnode
.size
) {
2379 ceph_abort_msg("truncate up not supported");
2381 ceph_assert(h
->file
->fnode
.size
>= offset
);
2382 h
->file
->fnode
.size
= offset
;
2383 log_t
.op_file_update(h
->file
->fnode
);
2387 int BlueFS::_fsync(FileWriter
*h
, std::unique_lock
<ceph::mutex
>& l
)
2389 dout(10) << __func__
<< " " << h
<< " " << h
->file
->fnode
<< dendl
;
2390 int r
= _flush(h
, true);
2393 uint64_t old_dirty_seq
= h
->file
->dirty_seq
;
2395 _flush_bdev_safely(h
);
2397 if (old_dirty_seq
) {
2398 uint64_t s
= log_seq
;
2399 dout(20) << __func__
<< " file metadata was dirty (" << old_dirty_seq
2400 << ") on " << h
->file
->fnode
<< ", flushing log" << dendl
;
2401 _flush_and_sync_log(l
, old_dirty_seq
);
2402 ceph_assert(h
->file
->dirty_seq
== 0 || // cleaned
2403 h
->file
->dirty_seq
> s
); // or redirtied by someone else
// Flush the block devices that writer `h` has written to.
//
// The writer's dirty-device flags are copied and then cleared, so only the
// devices dirtied up to this point are flushed.  When bluefs_sync_write is
// off, the writer's completed aios are claimed and dropped before the flush.
// NOTE(review): original lines 2412, 2416-2417 and 2420-2424 are not part of
// this excerpt (they sit between the visible statements) -- consult the full
// file for the complete control flow, including any locking done here.
2408 void BlueFS::_flush_bdev_safely(FileWriter
*h
)
// snapshot which devices this writer dirtied, then reset the flags
2410 std::array
<bool, MAX_BDEV
> flush_devs
= h
->dirty_devs
;
2411 h
->dirty_devs
.fill(false);
// aio path: collect the writer's aios so they can be released below
2413 if (!cct
->_conf
->bluefs_sync_write
) {
2414 list
<aio_t
> completed_ios
;
2415 _claim_completed_aios(h
, &completed_ios
);
2418 completed_ios
.clear();
2419 flush_bdev(flush_devs
);
// other path: just flush the dirtied devices
2425 flush_bdev(flush_devs
);
// Overload taking a per-device flag array of MAX_BDEV entries.
// Logs the call and loops over every device index 0..MAX_BDEV-1.
// NOTE(review): the loop body is not part of this listing; presumably it
// flushes the devices whose flag is set - confirm.
2430 void BlueFS::flush_bdev(std::array
<bool, MAX_BDEV
>& dirty_bdevs
)
2432 // NOTE: this is safe to call without a lock.
2433 dout(20) << __func__
<< dendl
;
2434 for (unsigned i
= 0; i
< MAX_BDEV
; i
++) {
// Overload without arguments: logs the call and walks every entry of `bdev`
// (each entry is copied into `p` by value).
// NOTE(review): the loop body is not part of this listing.
2440 void BlueFS::flush_bdev()
2442 // NOTE: this is safe to call without a lock.
2443 dout(20) << __func__
<< dendl
;
2444 for (auto p
: bdev
) {
// Ask the slow-device expander for additional space, if one is registered.
//
//   need    - bytes wanted by the caller
//   extents - forwarded to allocate_freespace(); presumably receives the
//             granted extents - confirm against the expander interface
//
// Two sizes are passed to the expander: `min_need`, the caller's request
// rounded up to bluefs_alloc_size, and `need`, the larger of the request and
// the expander's recommended expansion delta, also rounded up.
// NOTE(review): the declaration of `r` and the return statement are not part
// of this listing.
2450 int BlueFS::_expand_slow_device(uint64_t need
, PExtentVector
& extents
)
2453 if (slow_dev_expander
) {
2454 auto min_alloc_size
= cct
->_conf
->bluefs_alloc_size
;
2455 int id
= _get_slow_device_id();
// NOTE(review): `<=` lets id == alloc.size() pass the bound check, yet the
// same expression then evaluates alloc[id]; `<` looks intended - confirm.
2456 ceph_assert(id
<= (int)alloc
.size() && alloc
[id
]);
// Lower bound handed to the expander: the original request, rounded up.
2457 auto min_need
= round_up_to(need
, min_alloc_size
);
// Preferred amount: never less than the request, raised to the expander's
// recommendation computed from the device's free space and total size.
2458 need
= std::max(need
,
2459 slow_dev_expander
->get_recommended_expansion_delta(
2460 alloc
[id
]->get_free(), block_all
[id
].size()));
2462 need
= round_up_to(need
, min_alloc_size
);
2463 dout(10) << __func__
<< " expanding slow device by 0x"
2464 << std::hex
<< need
<< std::dec
2466 r
= slow_dev_expander
->allocate_freespace(min_need
, need
, extents
);
// Allocate space for `len` bytes from the allocator of device `id`.
//
//   id      - index into `alloc`
//   len     - requested bytes; rounded up to bluefs_alloc_size before use
//   extents - output vector handed to the allocator
//
// No other device is tried in the visible code. A short allocation is
// released again (when non-zero) and reported through derr.
// NOTE(review): the return statements and the condition that chooses between
// the two derr messages are not part of this listing.
2471 int BlueFS::_allocate_without_fallback(uint8_t id
, uint64_t len
,
2472 PExtentVector
* extents
)
2474 dout(10) << __func__
<< " len 0x" << std::hex
<< len
<< std::dec
2475 << " from " << (int)id
<< dendl
;
// NOTE(review): plain assert() here, whereas _allocate() performs the same
// bound check with ceph_assert(); plain assert is compiled out under NDEBUG -
// confirm which is intended.
2476 assert(id
< alloc
.size());
2477 uint64_t min_alloc_size
= cct
->_conf
->bluefs_alloc_size
;
2479 uint64_t left
= round_up_to(len
, min_alloc_size
);
2484 extents
->reserve(4); // 4 should be (more than) enough for most allocations
// Third argument (hint) is 0 here.
2485 int64_t alloc_len
= alloc
[id
]->allocate(left
, min_alloc_size
, 0, extents
);
// Anything short of the full rounded-up amount counts as a failure.
2486 if (alloc_len
< (int64_t)left
) {
2487 if (alloc_len
!= 0) {
2488 alloc
[id
]->release(*extents
);
// First message variant: includes the allocator's free space.
2491 derr
<< __func__
<< " failed to allocate 0x" << std::hex
<< left
2492 << " on bdev " << (int)id
2493 << ", free 0x" << alloc
[id
]->get_free() << std::dec
<< dendl
;
// Second message variant: ends in ", dne" instead of a free-space figure.
2495 derr
<< __func__
<< " failed to allocate 0x" << std::hex
<< left
2496 << " on bdev " << (int)id
<< ", dne" << std::dec
<< dendl
;
// Allocate space for `len` bytes (rounded up to bluefs_alloc_size) and append
// the resulting extents to `node`, starting with device `id`.
//
// Visible flow:
//  - allocate from alloc[id]; when node's last extent is on the same device,
//    its end is used as the allocation hint;
//  - on a short allocation, release whatever was obtained;
//  - for id != BDEV_SLOW, log the failure and retry via a recursive call with
//    id + 1;
//  - otherwise log, call _expand_slow_device(); on success register the new
//    extents on the slow device with _add_block_extent() and allocate again
//    from that device's allocator;
//  - update the per-device maximum of allocated bytes (max_bytes and its
//    perf counter);
//  - append every obtained extent to `node`.
// NOTE(review): this listing omits the declaration of `hint`, several
// conditions/else branches and all return statements except the recursive
// one; nesting described above is inferred from statement order.
2505 int BlueFS::_allocate(uint8_t id
, uint64_t len
,
2506 bluefs_fnode_t
* node
)
2508 dout(10) << __func__
<< " len 0x" << std::hex
<< len
<< std::dec
2509 << " from " << (int)id
<< dendl
;
2510 ceph_assert(id
< alloc
.size());
2511 uint64_t min_alloc_size
= cct
->_conf
->bluefs_alloc_size
;
2513 uint64_t left
= round_up_to(len
, min_alloc_size
);
2514 int64_t alloc_len
= 0;
2515 PExtentVector extents
;
// Continue right after the file's last extent when it is on this device.
2519 if (!node
->extents
.empty() && node
->extents
.back().bdev
== id
) {
2520 hint
= node
->extents
.back().end();
2522 extents
.reserve(4); // 4 should be (more than) enough for most allocations
2523 alloc_len
= alloc
[id
]->allocate(left
, min_alloc_size
, hint
, &extents
);
// Short allocation: give back the partial result before trying elsewhere.
2525 if (alloc_len
< (int64_t)left
) {
2526 if (alloc_len
> 0) {
2527 alloc
[id
]->release(extents
);
// Any device other than BDEV_SLOW falls back to the next device index.
2529 if (id
!= BDEV_SLOW
) {
2531 dout(1) << __func__
<< " failed to allocate 0x" << std::hex
<< left
2532 << " on bdev " << (int)id
2533 << ", free 0x" << alloc
[id
]->get_free()
2534 << "; fallback to bdev " << (int)id
+ 1
2535 << std::dec
<< dendl
;
2537 return _allocate(id
+ 1, len
, node
);
// Message for the case with no further device; (uint64_t)-1 is printed as the
// free figure when alloc[id] is null.
2539 dout(1) << __func__
<< " unable to allocate 0x" << std::hex
<< left
2540 << " on bdev " << (int)id
<< ", free 0x"
2541 << (alloc
[id
] ? alloc
[id
]->get_free() : (uint64_t)-1)
2542 << "; fallback to slow device expander "
2543 << std::dec
<< dendl
;
// A zero result from the expander means new extents are available in
// `extents`; they are added to the slow device before retrying.
2545 if (_expand_slow_device(left
, extents
) == 0) {
2546 id
= _get_slow_device_id();
2547 for (auto& e
: extents
) {
2548 _add_block_extent(id
, e
.offset
, e
.length
);
2551 auto* last_alloc
= alloc
[id
];
2552 ceph_assert(last_alloc
);
// Retry on the slow device's allocator.
2554 alloc_len
= last_alloc
->allocate(left
, min_alloc_size
, hint
, &extents
);
2555 if (alloc_len
< (int64_t)left
) {
2556 if (alloc_len
> 0) {
2557 last_alloc
->release(extents
);
2559 derr
<< __func__
<< " failed to allocate 0x" << std::hex
<< left
2560 << " on bdev " << (int)id
2561 << ", free 0x" << last_alloc
->get_free() << std::dec
<< dendl
;
// Message for a failed expansion (the rest of the statement is not shown).
2565 derr
<< __func__
<< " failed to expand slow device to fit +0x"
2566 << std::hex
<< left
<< std::dec
// Allocated bytes on the device = total registered size minus free space;
// max_bytes[id] and its perf counter keep the largest value seen.
2571 uint64_t total_allocated
=
2572 block_all
[id
].size() - alloc
[id
]->get_free();
2573 if (max_bytes
[id
] < total_allocated
) {
2574 logger
->set(max_bytes_pcounters
[id
], total_allocated
);
2575 max_bytes
[id
] = total_allocated
;
// Hand the new extents to the file node, tagged with the device they came from.
2579 for (auto& p
: extents
) {
2580 node
->append_extent(bluefs_extent_t(id
, p
.offset
, p
.length
));
// Make sure file `f` has space allocated for the byte range off~len.
//
// If off + len exceeds what the fnode reports as allocated, the shortfall is
// requested from _allocate() on the file's preferred device
// (fnode.prefer_bdev). The fnode is recorded in log_t afterwards.
// Internal log files are excluded by assertion (ino > 1).
// NOTE(review): the condition guarding the "deleted, no-op" message, the
// handling of `r` and the return statements are not part of this listing.
2586 int BlueFS::_preallocate(FileRef f
, uint64_t off
, uint64_t len
)
2588 dout(10) << __func__
<< " file " << f
->fnode
<< " 0x"
2589 << std::hex
<< off
<< "~" << len
<< std::dec
<< dendl
;
2591 dout(10) << __func__
<< " deleted, no-op" << dendl
;
2594 ceph_assert(f
->fnode
.ino
> 1);
2595 uint64_t allocated
= f
->fnode
.get_allocated();
// Only the part beyond the current allocation is requested.
2596 if (off
+ len
> allocated
) {
2597 uint64_t want
= off
+ len
- allocated
;
2598 int r
= _allocate(f
->fnode
.prefer_bdev
, want
, &f
->fnode
);
2601 log_t
.op_file_update(f
->fnode
);
// Write out pending metadata log events, then compact the log when
// _should_compact_log() asks for it.
//
// Holds `lock` through a std::unique_lock; that lock object is passed on to
// _flush_and_sync_log() and _compact_log_async().
// When log_t is empty only a "no pending log events" message is visible;
// otherwise all devices are flushed, the log is flushed and synced, and the
// elapsed time is logged.
// NOTE(review): the else/closing structure between the branches is not part
// of this listing.
2606 void BlueFS::sync_metadata()
2608 std::unique_lock
l(lock
);
2609 if (log_t
.empty()) {
2610 dout(10) << __func__
<< " - no pending log events" << dendl
;
2612 dout(10) << __func__
<< dendl
;
// Start time, used for the "done in" message below.
2613 utime_t start
= ceph_clock_now();
2614 flush_bdev(); // FIXME?
2615 _flush_and_sync_log(l
);
2616 dout(10) << __func__
<< " done in " << (ceph_clock_now() - start
) << dendl
;
// Compaction mode is chosen by the bluefs_compact_log_sync option.
2619 if (_should_compact_log()) {
2620 if (cct
->_conf
->bluefs_compact_log_sync
) {
2621 _compact_log_sync();
2623 _compact_log_async(l
);
// Open dirname/filename for writing and hand a new FileWriter back via *h.
//
// Visible behaviour, under `lock`:
//  - look up the directory, then the file inside it;
//  - a file that does not exist gets the next inode number (++ino_last) and
//    is registered in file_map and in the directory's file_map;
//  - an existing file is either overwritten in place or truncated: size set
//    to 0, every extent queued in pending_release for its device, extents
//    cleared;
//  - mtime is set to now and prefer_bdev is chosen from the directory name;
//  - the fnode (and a directory link) is recorded in log_t;
//  - the writer's type and a perf counter are chosen from the file name
//    suffix (".log" or ".sst").
// NOTE(review): the remaining parameters (`h` and `overwrite` are used below
// but their declarations are not shown), the handling of a missing directory,
// the conditions separating the branches and the return statements are not
// part of this listing.
2628 int BlueFS::open_for_write(
2629 const string
& dirname
,
2630 const string
& filename
,
2634 std::lock_guard
l(lock
);
2635 dout(10) << __func__
<< " " << dirname
<< "/" << filename
<< dendl
;
2636 map
<string
,DirRef
>::iterator p
= dir_map
.find(dirname
);
2638 if (p
== dir_map
.end()) {
2639 // implicitly create the dir
2640 dout(20) << __func__
<< " dir " << dirname
2641 << " does not exist" << dendl
;
2648 bool create
= false;
2649 map
<string
,FileRef
>::iterator q
= dir
->file_map
.find(filename
);
// New file: allocate an inode number and register it in both maps.
2650 if (q
== dir
->file_map
.end()) {
2652 dout(20) << __func__
<< " dir " << dirname
<< " (" << dir
2653 << ") file " << filename
2654 << " does not exist" << dendl
;
2658 file
->fnode
.ino
= ++ino_last
;
2659 file_map
[ino_last
] = file
;
2660 dir
->file_map
[filename
] = file
;
2664 // overwrite existing file?
2667 dout(20) << __func__
<< " dir " << dirname
<< " (" << dir
2668 << ") file " << filename
2669 << " already exists, overwrite in place" << dendl
;
2671 dout(20) << __func__
<< " dir " << dirname
<< " (" << dir
2672 << ") file " << filename
2673 << " already exists, truncate + overwrite" << dendl
;
// Truncate path: drop the size and queue all extents for release.
2674 file
->fnode
.size
= 0;
2675 for (auto& p
: file
->fnode
.extents
) {
2676 pending_release
[p
.bdev
].insert(p
.offset
, p
.length
);
2679 file
->fnode
.clear_extents();
2682 ceph_assert(file
->fnode
.ino
> 1);
2684 file
->fnode
.mtime
= ceph_clock_now();
// Default device is BDEV_DB; the suffix checks below may override it. Names
// of 5 characters or fewer skip the suffix checks.
2685 file
->fnode
.prefer_bdev
= BlueFS::BDEV_DB
;
2686 if (dirname
.length() > 5) {
2687 // the "db.slow" and "db.wal" directory names are hard-coded at
2688 // match up with bluestore. the slow device is always the second
2689 // one (when a dedicated block.db device is present and used at
2690 // bdev 0). the wal device is always last.
2691 if (boost::algorithm::ends_with(dirname
, ".slow")) {
2692 file
->fnode
.prefer_bdev
= BlueFS::BDEV_SLOW
;
2693 } else if (boost::algorithm::ends_with(dirname
, ".wal")) {
2694 file
->fnode
.prefer_bdev
= BlueFS::BDEV_WAL
;
2697 dout(20) << __func__
<< " mapping " << dirname
<< "/" << filename
2698 << " to bdev " << (int)file
->fnode
.prefer_bdev
<< dendl
;
2700 log_t
.op_file_update(file
->fnode
);
// Directory link record; presumably only for newly created files (`create`) -
// the guarding condition is not shown.
2702 log_t
.op_dir_link(dirname
, filename
, file
->fnode
.ino
);
2704 *h
= _create_writer(file
);
// Classify the writer by file name suffix and count newly written files.
2706 if (boost::algorithm::ends_with(filename
, ".log")) {
2707 (*h
)->writer_type
= BlueFS::WRITER_WAL
;
2708 if (logger
&& !overwrite
) {
2709 logger
->inc(l_bluefs_files_written_wal
);
2711 } else if (boost::algorithm::ends_with(filename
, ".sst")) {
2712 (*h
)->writer_type
= BlueFS::WRITER_SST
;
2714 logger
->inc(l_bluefs_files_written_sst
);
2718 dout(10) << __func__
<< " h " << *h
<< " on " << file
->fnode
<< dendl
;
// Build a FileWriter for file `f` and populate its iocv slots with new
// IOContext objects (constructed with cct and NULL), looping over all
// MAX_BDEV device indices.
// NOTE(review): any per-device condition inside the loop and the return
// statement are not part of this listing.
2722 BlueFS::FileWriter
*BlueFS::_create_writer(FileRef f
)
2724 FileWriter
*w
= new FileWriter(f
);
2725 for (unsigned i
= 0; i
< MAX_BDEV
; ++i
) {
2727 w
->iocv
[i
] = new IOContext(cct
, NULL
);
// Tear down writer `h`: for the device slots visited by the loop, wait for
// outstanding aio on the slot's IOContext and then queue that context for
// reaping on the matching block device.
// NOTE(review): guards around the loop body (e.g. for absent devices) and
// whatever happens to `h` after the loop are not part of this listing.
2733 void BlueFS::_close_writer(FileWriter
*h
)
2735 dout(10) << __func__
<< " " << h
<< " type " << h
->writer_type
<< dendl
;
2736 for (unsigned i
=0; i
<MAX_BDEV
; ++i
) {
// Wait first, then hand the context over to the device.
2739 h
->iocv
[i
]->aio_wait();
2740 bdev
[i
]->queue_reap_ioc(h
->iocv
[i
]);
// Open dirname/filename for reading; *h receives a new FileReader.
//
// Runs under `lock`. The directory and then the file are looked up; each miss
// is logged as "not found". The reader's second constructor argument is 4096
// when `random` is set and the bluefs_max_prefetch option otherwise.
// NOTE(review): the remaining parameters (`h` and `random` are used below but
// their declarations are not shown), the rest of the FileReader constructor
// call and all return statements are not part of this listing.
2747 int BlueFS::open_for_read(
2748 const string
& dirname
,
2749 const string
& filename
,
2753 std::lock_guard
l(lock
);
2754 dout(10) << __func__
<< " " << dirname
<< "/" << filename
2755 << (random
? " (random)":" (sequential)") << dendl
;
2756 map
<string
,DirRef
>::iterator p
= dir_map
.find(dirname
);
2757 if (p
== dir_map
.end()) {
2758 dout(20) << __func__
<< " dir " << dirname
<< " not found" << dendl
;
2761 DirRef dir
= p
->second
;
2763 map
<string
,FileRef
>::iterator q
= dir
->file_map
.find(filename
);
2764 if (q
== dir
->file_map
.end()) {
2765 dout(20) << __func__
<< " dir " << dirname
<< " (" << dir
2766 << ") file " << filename
2767 << " not found" << dendl
;
// Raw pointer taken from the FileRef stored in the directory.
2770 File
*file
= q
->second
.get();
2772 *h
= new FileReader(file
, random
? 4096 : cct
->_conf
->bluefs_max_prefetch
,
2774 dout(10) << __func__
<< " h " << *h
<< " on " << file
->fnode
<< dendl
;
// Move the directory entry old_dirname/old_filename to
// new_dirname/new_filename (presumably BlueFS::rename - the line carrying the
// function name is absent from this listing).
//
// Runs under `lock`. Visible steps: look up the old directory and file, look
// up the new directory; if the target name already exists, log an unlink for
// it and drop its link; then add the file under the new name, erase the old
// name, and record a dir_link followed by a dir_unlink in log_t.
// NOTE(review): the return statements after each "not found" message are not
// part of this listing.
2779 const string
& old_dirname
, const string
& old_filename
,
2780 const string
& new_dirname
, const string
& new_filename
)
2782 std::lock_guard
l(lock
);
2783 dout(10) << __func__
<< " " << old_dirname
<< "/" << old_filename
2784 << " -> " << new_dirname
<< "/" << new_filename
<< dendl
;
2785 map
<string
,DirRef
>::iterator p
= dir_map
.find(old_dirname
);
2786 if (p
== dir_map
.end()) {
2787 dout(20) << __func__
<< " dir " << old_dirname
<< " not found" << dendl
;
2790 DirRef old_dir
= p
->second
;
2791 map
<string
,FileRef
>::iterator q
= old_dir
->file_map
.find(old_filename
);
2792 if (q
== old_dir
->file_map
.end()) {
2793 dout(20) << __func__
<< " dir " << old_dirname
<< " (" << old_dir
2794 << ") file " << old_filename
2795 << " not found" << dendl
;
// Keep a reference to the file; `p` and `q` are reused for the target below.
2798 FileRef file
= q
->second
;
2800 p
= dir_map
.find(new_dirname
);
2801 if (p
== dir_map
.end()) {
2802 dout(20) << __func__
<< " dir " << new_dirname
<< " not found" << dendl
;
2805 DirRef new_dir
= p
->second
;
2806 q
= new_dir
->file_map
.find(new_filename
);
// Target name already taken: the existing entry is unlinked first.
2807 if (q
!= new_dir
->file_map
.end()) {
// NOTE(review): this message prints `old_dir` next to `new_dirname`;
// `new_dir` looks intended - confirm.
2808 dout(20) << __func__
<< " dir " << new_dirname
<< " (" << old_dir
2809 << ") file " << new_filename
2810 << " already exists, unlinking" << dendl
;
// The existing target must be a different file from the one being moved.
2811 ceph_assert(q
->second
!= file
);
2812 log_t
.op_dir_unlink(new_dirname
, new_filename
);
2813 _drop_link(q
->second
);
2816 dout(10) << __func__
<< " " << new_dirname
<< "/" << new_filename
<< " "
2817 << " " << file
->fnode
<< dendl
;
// In-memory update: insert under the new name, then erase the old name.
2819 new_dir
->file_map
[new_filename
] = file
;
2820 old_dir
->file_map
.erase(old_filename
);
// Log records in the same order: link first, unlink second.
2822 log_t
.op_dir_link(new_dirname
, new_filename
, file
->fnode
.ino
);
2823 log_t
.op_dir_unlink(old_dirname
, old_filename
);
// Create directory `dirname`.
//
// Runs under `lock`. An already existing directory is logged as "exists".
// Otherwise a new Dir is stored in dir_map under that name and the creation
// is recorded in log_t.
// NOTE(review): the return statements (including the value returned for an
// existing directory) are not part of this listing.
2827 int BlueFS::mkdir(const string
& dirname
)
2829 std::lock_guard
l(lock
);
2830 dout(10) << __func__
<< " " << dirname
<< dendl
;
2831 map
<string
,DirRef
>::iterator p
= dir_map
.find(dirname
);
2832 if (p
!= dir_map
.end()) {
2833 dout(20) << __func__
<< " dir " << dirname
<< " exists" << dendl
;
2836 dir_map
[dirname
] = new Dir
;
2837 log_t
.op_dir_create(dirname
);
// Remove directory `dirname`.
//
// Runs under `lock`. A missing directory is logged as "does not exist" and a
// directory that still has entries in its file_map as "not empty". Otherwise
// the name is erased from dir_map and the removal is recorded in log_t.
// NOTE(review): the return statements for the two logged cases and for
// success are not part of this listing.
2841 int BlueFS::rmdir(const string
& dirname
)
2843 std::lock_guard
l(lock
);
2844 dout(10) << __func__
<< " " << dirname
<< dendl
;
2845 map
<string
,DirRef
>::iterator p
= dir_map
.find(dirname
);
2846 if (p
== dir_map
.end()) {
2847 dout(20) << __func__
<< " dir " << dirname
<< " does not exist" << dendl
;
2850 DirRef dir
= p
->second
;
2851 if (!dir
->file_map
.empty()) {
2852 dout(20) << __func__
<< " dir " << dirname
<< " not empty" << dendl
;
2855 dir_map
.erase(dirname
);
2856 log_t
.op_dir_remove(dirname
);
// Check, under `lock`, whether `dirname` is a key of dir_map, and log the
// answer as 0 or 1.
// NOTE(review): the return statement is not part of this listing; presumably
// `exists` is returned - confirm.
2860 bool BlueFS::dir_exists(const string
& dirname
)
2862 std::lock_guard
l(lock
);
2863 map
<string
,DirRef
>::iterator p
= dir_map
.find(dirname
);
2864 bool exists
= p
!= dir_map
.end();
2865 dout(10) << __func__
<< " " << dirname
<< " = " << (int)exists
<< dendl
;
// Report size and modification time of dirname/filename.
//
//   size  - output; receives fnode.size
//   mtime - output; receives fnode.mtime
//
// Runs under `lock`. A missing directory or file is logged as "not found".
// NOTE(review): the return statements, and any null checks guarding the two
// output stores, are not part of this listing.
2869 int BlueFS::stat(const string
& dirname
, const string
& filename
,
2870 uint64_t *size
, utime_t
*mtime
)
2872 std::lock_guard
l(lock
);
2873 dout(10) << __func__
<< " " << dirname
<< "/" << filename
<< dendl
;
2874 map
<string
,DirRef
>::iterator p
= dir_map
.find(dirname
);
2875 if (p
== dir_map
.end()) {
2876 dout(20) << __func__
<< " dir " << dirname
<< " not found" << dendl
;
2879 DirRef dir
= p
->second
;
2880 map
<string
,FileRef
>::iterator q
= dir
->file_map
.find(filename
);
2881 if (q
== dir
->file_map
.end()) {
2882 dout(20) << __func__
<< " dir " << dirname
<< " (" << dir
2883 << ") file " << filename
2884 << " not found" << dendl
;
2887 File
*file
= q
->second
.get();
2888 dout(10) << __func__
<< " " << dirname
<< "/" << filename
2889 << " " << file
->fnode
<< dendl
;
// Copy the two attributes out to the caller.
2891 *size
= file
->fnode
.size
;
2893 *mtime
= file
->fnode
.mtime
;
// Take the lock flag of dirname/filename and return a FileLock via *plock.
//
// Runs under `lock`. The directory must exist (a miss is logged as
// "not found"). A file that does not exist is created on the spot: it gets
// the next inode number and the current time as mtime, is registered in
// file_map and the directory, and the creation is recorded in log_t.
// For an existing file an "already locked" message is visible. Finally the
// file's `locked` flag is set and a new FileLock is stored in *plock.
// NOTE(review): the remaining parameter (`plock` is used below but its
// declaration is not shown), the creation of the new File object, the
// condition for "already locked" and the return statements are not part of
// this listing.
2897 int BlueFS::lock_file(const string
& dirname
, const string
& filename
,
2900 std::lock_guard
l(lock
);
2901 dout(10) << __func__
<< " " << dirname
<< "/" << filename
<< dendl
;
2902 map
<string
,DirRef
>::iterator p
= dir_map
.find(dirname
);
2903 if (p
== dir_map
.end()) {
2904 dout(20) << __func__
<< " dir " << dirname
<< " not found" << dendl
;
2907 DirRef dir
= p
->second
;
2908 map
<string
,FileRef
>::iterator q
= dir
->file_map
.find(filename
);
// Lock file absent: create it and log the new file and its directory link.
2910 if (q
== dir
->file_map
.end()) {
2911 dout(20) << __func__
<< " dir " << dirname
<< " (" << dir
2912 << ") file " << filename
2913 << " not found, creating" << dendl
;
2915 file
->fnode
.ino
= ++ino_last
;
2916 file
->fnode
.mtime
= ceph_clock_now();
2917 file_map
[ino_last
] = file
;
2918 dir
->file_map
[filename
] = file
;
2920 log_t
.op_file_update(file
->fnode
);
2921 log_t
.op_dir_link(dirname
, filename
, file
->fnode
.ino
);
// Lock file present: use the existing File object.
2923 file
= q
->second
.get();
2925 dout(10) << __func__
<< " already locked" << dendl
;
// Mark the file locked and hand out the lock handle.
2929 file
->locked
= true;
2930 *plock
= new FileLock(file
);
2931 dout(10) << __func__
<< " locked " << file
->fnode
2932 << " with " << *plock
<< dendl
;
// Release the lock represented by `fl`: under `lock`, assert that the
// referenced file is currently flagged locked and clear that flag.
// NOTE(review): what happens to `fl` afterwards and the return statement are
// not part of this listing.
2936 int BlueFS::unlock_file(FileLock
*fl
)
2938 std::lock_guard
l(lock
);
2939 dout(10) << __func__
<< " " << fl
<< " on " << fl
->file
->fnode
<< dendl
;
2940 ceph_assert(fl
->file
->locked
);
2941 fl
->file
->locked
= false;
// List names into *ls.
//
//   dirname - empty: append the names of all directories in dir_map;
//             otherwise: append the names of all files in that directory
//   ls      - output vector; entries are appended with push_back
//
// Runs under `lock`. Capacity is reserved for the entry count plus 2; of the
// extra entries only the ".." push is visible in this listing. A directory
// that is not in dir_map is logged as "not found".
// NOTE(review): the else structure, the return statements and any second
// extra entry are not part of this listing.
2946 int BlueFS::readdir(const string
& dirname
, vector
<string
> *ls
)
2948 std::lock_guard
l(lock
);
2949 dout(10) << __func__
<< " " << dirname
<< dendl
;
// Empty name: list the directories themselves.
2950 if (dirname
.empty()) {
2952 ls
->reserve(dir_map
.size() + 2);
2953 for (auto& q
: dir_map
) {
2954 ls
->push_back(q
.first
);
2957 // list files in dir
2958 map
<string
,DirRef
>::iterator p
= dir_map
.find(dirname
);
2959 if (p
== dir_map
.end()) {
2960 dout(20) << __func__
<< " dir " << dirname
<< " not found" << dendl
;
2963 DirRef dir
= p
->second
;
2964 ls
->reserve(dir
->file_map
.size() + 2);
2965 for (auto& q
: dir
->file_map
) {
2966 ls
->push_back(q
.first
);
2970 ls
->push_back("..");
// Remove the directory entry dirname/filename.
//
// Runs under `lock`. A missing directory or file is logged as "not found";
// an "is locked" message is also visible. Otherwise the name is erased from
// the directory's file_map and the unlink is recorded in log_t.
// NOTE(review): the condition guarding the "is locked" message, the return
// statements and anything done with `file` after the log record are not part
// of this listing.
2974 int BlueFS::unlink(const string
& dirname
, const string
& filename
)
2976 std::lock_guard
l(lock
);
2977 dout(10) << __func__
<< " " << dirname
<< "/" << filename
<< dendl
;
2978 map
<string
,DirRef
>::iterator p
= dir_map
.find(dirname
);
2979 if (p
== dir_map
.end()) {
2980 dout(20) << __func__
<< " dir " << dirname
<< " not found" << dendl
;
2983 DirRef dir
= p
->second
;
2984 map
<string
,FileRef
>::iterator q
= dir
->file_map
.find(filename
);
2985 if (q
== dir
->file_map
.end()) {
2986 dout(20) << __func__
<< " file " << dirname
<< "/" << filename
2987 << " not found" << dendl
;
// Hold a reference so the file object outlives the erase below.
2990 FileRef file
= q
->second
;
2992 dout(20) << __func__
<< " file " << dirname
<< "/" << filename
2993 << " is locked" << dendl
;
2996 dir
->file_map
.erase(filename
);
2997 log_t
.op_dir_unlink(dirname
, filename
);
// Return is_rotational() of the first device available in the order
// BDEV_WAL, BDEV_DB, BDEV_SLOW.
// NOTE(review): the BDEV_SLOW entry is dereferenced without a visible null
// check, unlike the other two - confirm that it is always set when this is
// called.
3002 bool BlueFS::wal_is_rotational()
3004 if (bdev
[BDEV_WAL
]) {
3005 return bdev
[BDEV_WAL
]->is_rotational();
3006 } else if (bdev
[BDEV_DB
]) {
3007 return bdev
[BDEV_DB
]->is_rotational();
3009 return bdev
[BDEV_SLOW
]->is_rotational();