]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | ||
4 | #include "boost/algorithm/string.hpp" | |
9f95a23c | 5 | #include "bluestore_common.h" |
7c673cae FG |
6 | #include "BlueFS.h" |
7 | ||
8 | #include "common/debug.h" | |
9 | #include "common/errno.h" | |
10 | #include "common/perf_counters.h" | |
11 | #include "BlockDevice.h" | |
12 | #include "Allocator.h" | |
11fdf7f2 | 13 | #include "include/ceph_assert.h" |
eafe8130 | 14 | #include "common/admin_socket.h" |
7c673cae FG |
15 | |
16 | #define dout_context cct | |
17 | #define dout_subsys ceph_subsys_bluefs | |
18 | #undef dout_prefix | |
19 | #define dout_prefix *_dout << "bluefs " | |
9f95a23c | 20 | using TOPNSPC::common::cmd_getval; |
7c673cae FG |
21 | MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::File, bluefs_file, bluefs); |
22 | MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::Dir, bluefs_dir, bluefs); | |
f91f0fd5 | 23 | MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileWriter, bluefs_file_writer, bluefs_file_writer); |
7c673cae | 24 | MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileReaderBuffer, |
f91f0fd5 TL |
25 | bluefs_file_reader_buffer, bluefs_file_reader); |
26 | MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileReader, bluefs_file_reader, bluefs_file_reader); | |
7c673cae FG |
27 | MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileLock, bluefs_file_lock, bluefs); |
28 | ||
11fdf7f2 TL |
29 | static void wal_discard_cb(void *priv, void* priv2) { |
30 | BlueFS *bluefs = static_cast<BlueFS*>(priv); | |
31 | interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2); | |
32 | bluefs->handle_discard(BlueFS::BDEV_WAL, *tmp); | |
33 | } | |
34 | ||
35 | static void db_discard_cb(void *priv, void* priv2) { | |
36 | BlueFS *bluefs = static_cast<BlueFS*>(priv); | |
37 | interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2); | |
38 | bluefs->handle_discard(BlueFS::BDEV_DB, *tmp); | |
39 | } | |
40 | ||
41 | static void slow_discard_cb(void *priv, void* priv2) { | |
42 | BlueFS *bluefs = static_cast<BlueFS*>(priv); | |
43 | interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2); | |
44 | bluefs->handle_discard(BlueFS::BDEV_SLOW, *tmp); | |
45 | } | |
7c673cae | 46 | |
eafe8130 TL |
47 | class BlueFS::SocketHook : public AdminSocketHook { |
48 | BlueFS* bluefs; | |
49 | public: | |
50 | static BlueFS::SocketHook* create(BlueFS* bluefs) | |
51 | { | |
52 | BlueFS::SocketHook* hook = nullptr; | |
53 | AdminSocket* admin_socket = bluefs->cct->get_admin_socket(); | |
54 | if (admin_socket) { | |
55 | hook = new BlueFS::SocketHook(bluefs); | |
9f95a23c | 56 | int r = admin_socket->register_command("bluestore bluefs available " |
eafe8130 TL |
57 | "name=alloc_size,type=CephInt,req=false", |
58 | hook, | |
59 | "Report available space for bluefs. " | |
60 | "If alloc_size set, make simulation."); | |
61 | if (r != 0) { | |
62 | ldout(bluefs->cct, 1) << __func__ << " cannot register SocketHook" << dendl; | |
63 | delete hook; | |
64 | hook = nullptr; | |
9f95a23c | 65 | } else { |
f6b5b4d7 | 66 | r = admin_socket->register_command("bluefs stats", |
9f95a23c TL |
67 | hook, |
68 | "Dump internal statistics for bluefs." | |
69 | ""); | |
70 | ceph_assert(r == 0); | |
cd265ab1 TL |
71 | r = admin_socket->register_command("bluefs debug_inject_read_zeros", hook, |
72 | "Injects 8K zeros into next BlueFS read. Debug only."); | |
73 | ceph_assert(r == 0); | |
eafe8130 TL |
74 | } |
75 | } | |
76 | return hook; | |
77 | } | |
78 | ||
79 | ~SocketHook() { | |
80 | AdminSocket* admin_socket = bluefs->cct->get_admin_socket(); | |
9f95a23c | 81 | admin_socket->unregister_commands(this); |
eafe8130 TL |
82 | } |
83 | private: | |
84 | SocketHook(BlueFS* bluefs) : | |
85 | bluefs(bluefs) {} | |
9f95a23c TL |
86 | int call(std::string_view command, const cmdmap_t& cmdmap, |
87 | Formatter *f, | |
88 | std::ostream& errss, | |
89 | bufferlist& out) override { | |
90 | if (command == "bluestore bluefs available") { | |
91 | int64_t alloc_size = 0; | |
92 | cmd_getval(cmdmap, "alloc_size", alloc_size); | |
93 | if ((alloc_size & (alloc_size - 1)) != 0) { | |
94 | errss << "Invalid allocation size:'" << alloc_size << std::endl; | |
95 | return -EINVAL; | |
96 | } | |
97 | if (alloc_size == 0) | |
98 | alloc_size = bluefs->cct->_conf->bluefs_alloc_size; | |
99 | f->open_object_section("bluefs_available_space"); | |
100 | for (unsigned dev = BDEV_WAL; dev <= BDEV_SLOW; dev++) { | |
101 | if (bluefs->bdev[dev]) { | |
102 | f->open_object_section("dev"); | |
103 | f->dump_string("device", bluefs->get_device_name(dev)); | |
104 | ceph_assert(bluefs->alloc[dev]); | |
105 | f->dump_int("free", bluefs->alloc[dev]->get_free()); | |
106 | f->close_section(); | |
107 | } | |
108 | } | |
109 | size_t extra_space = 0; | |
110 | if (bluefs->slow_dev_expander) { | |
111 | extra_space = bluefs->slow_dev_expander->available_freespace(alloc_size); | |
eafe8130 | 112 | } |
9f95a23c TL |
113 | f->dump_int("available_from_bluestore", extra_space); |
114 | f->close_section(); | |
115 | } else if (command == "bluefs stats") { | |
116 | std::stringstream ss; | |
117 | bluefs->dump_block_extents(ss); | |
118 | bluefs->dump_volume_selector(ss); | |
eafe8130 | 119 | out.append(ss); |
cd265ab1 TL |
120 | } else if (command == "bluefs debug_inject_read_zeros") { |
121 | bluefs->inject_read_zeros++; | |
9f95a23c TL |
122 | } else { |
123 | errss << "Invalid command" << std::endl; | |
124 | return -ENOSYS; | |
eafe8130 | 125 | } |
9f95a23c TL |
126 | return 0; |
127 | } | |
eafe8130 TL |
128 | }; |
129 | ||
7c673cae FG |
130 | BlueFS::BlueFS(CephContext* cct) |
131 | : cct(cct), | |
132 | bdev(MAX_BDEV), | |
133 | ioc(MAX_BDEV), | |
11fdf7f2 | 134 | block_all(MAX_BDEV) |
7c673cae | 135 | { |
11fdf7f2 TL |
136 | discard_cb[BDEV_WAL] = wal_discard_cb; |
137 | discard_cb[BDEV_DB] = db_discard_cb; | |
138 | discard_cb[BDEV_SLOW] = slow_discard_cb; | |
eafe8130 | 139 | asok_hook = SocketHook::create(this); |
7c673cae FG |
140 | } |
141 | ||
142 | BlueFS::~BlueFS() | |
143 | { | |
eafe8130 | 144 | delete asok_hook; |
7c673cae FG |
145 | for (auto p : ioc) { |
146 | if (p) | |
147 | p->aio_wait(); | |
148 | } | |
149 | for (auto p : bdev) { | |
150 | if (p) { | |
151 | p->close(); | |
152 | delete p; | |
153 | } | |
154 | } | |
155 | for (auto p : ioc) { | |
156 | delete p; | |
157 | } | |
158 | } | |
159 | ||
160 | void BlueFS::_init_logger() | |
161 | { | |
162 | PerfCountersBuilder b(cct, "bluefs", | |
163 | l_bluefs_first, l_bluefs_last); | |
164 | b.add_u64_counter(l_bluefs_gift_bytes, "gift_bytes", | |
11fdf7f2 | 165 | "Bytes gifted from BlueStore", NULL, 0, unit_t(UNIT_BYTES)); |
7c673cae | 166 | b.add_u64_counter(l_bluefs_reclaim_bytes, "reclaim_bytes", |
11fdf7f2 | 167 | "Bytes reclaimed by BlueStore", NULL, 0, unit_t(UNIT_BYTES)); |
7c673cae FG |
168 | b.add_u64(l_bluefs_db_total_bytes, "db_total_bytes", |
169 | "Total bytes (main db device)", | |
11fdf7f2 | 170 | "b", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); |
7c673cae FG |
171 | b.add_u64(l_bluefs_db_used_bytes, "db_used_bytes", |
172 | "Used bytes (main db device)", | |
11fdf7f2 | 173 | "u", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); |
7c673cae FG |
174 | b.add_u64(l_bluefs_wal_total_bytes, "wal_total_bytes", |
175 | "Total bytes (wal device)", | |
11fdf7f2 | 176 | "walb", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); |
7c673cae FG |
177 | b.add_u64(l_bluefs_wal_used_bytes, "wal_used_bytes", |
178 | "Used bytes (wal device)", | |
11fdf7f2 | 179 | "walu", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); |
7c673cae FG |
180 | b.add_u64(l_bluefs_slow_total_bytes, "slow_total_bytes", |
181 | "Total bytes (slow device)", | |
11fdf7f2 | 182 | "slob", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); |
7c673cae FG |
183 | b.add_u64(l_bluefs_slow_used_bytes, "slow_used_bytes", |
184 | "Used bytes (slow device)", | |
11fdf7f2 | 185 | "slou", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); |
7c673cae FG |
186 | b.add_u64(l_bluefs_num_files, "num_files", "File count", |
187 | "f", PerfCountersBuilder::PRIO_USEFUL); | |
188 | b.add_u64(l_bluefs_log_bytes, "log_bytes", "Size of the metadata log", | |
11fdf7f2 | 189 | "jlen", PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES)); |
7c673cae FG |
190 | b.add_u64_counter(l_bluefs_log_compactions, "log_compactions", |
191 | "Compactions of the metadata log"); | |
192 | b.add_u64_counter(l_bluefs_logged_bytes, "logged_bytes", | |
193 | "Bytes written to the metadata log", "j", | |
11fdf7f2 | 194 | PerfCountersBuilder::PRIO_CRITICAL, unit_t(UNIT_BYTES)); |
7c673cae FG |
195 | b.add_u64_counter(l_bluefs_files_written_wal, "files_written_wal", |
196 | "Files written to WAL"); | |
197 | b.add_u64_counter(l_bluefs_files_written_sst, "files_written_sst", | |
198 | "Files written to SSTs"); | |
199 | b.add_u64_counter(l_bluefs_bytes_written_wal, "bytes_written_wal", | |
200 | "Bytes written to WAL", "wal", | |
201 | PerfCountersBuilder::PRIO_CRITICAL); | |
202 | b.add_u64_counter(l_bluefs_bytes_written_sst, "bytes_written_sst", | |
203 | "Bytes written to SSTs", "sst", | |
11fdf7f2 TL |
204 | PerfCountersBuilder::PRIO_CRITICAL, unit_t(UNIT_BYTES)); |
205 | b.add_u64_counter(l_bluefs_bytes_written_slow, "bytes_written_slow", | |
206 | "Bytes written to WAL/SSTs at slow device", NULL, | |
207 | PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); | |
208 | b.add_u64_counter(l_bluefs_max_bytes_wal, "max_bytes_wal", | |
209 | "Maximum bytes allocated from WAL"); | |
210 | b.add_u64_counter(l_bluefs_max_bytes_db, "max_bytes_db", | |
211 | "Maximum bytes allocated from DB"); | |
212 | b.add_u64_counter(l_bluefs_max_bytes_slow, "max_bytes_slow", | |
213 | "Maximum bytes allocated from SLOW"); | |
494da23a TL |
214 | |
215 | b.add_u64_counter(l_bluefs_read_random_count, "read_random_count", | |
216 | "random read requests processed"); | |
217 | b.add_u64_counter(l_bluefs_read_random_bytes, "read_random_bytes", | |
218 | "Bytes requested in random read mode", NULL, | |
219 | PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); | |
220 | b.add_u64_counter(l_bluefs_read_random_disk_count, "read_random_disk_count", | |
221 | "random reads requests going to disk"); | |
222 | b.add_u64_counter(l_bluefs_read_random_disk_bytes, "read_random_disk_bytes", | |
223 | "Bytes read from disk in random read mode", NULL, | |
224 | PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); | |
225 | b.add_u64_counter(l_bluefs_read_random_buffer_count, "read_random_buffer_count", | |
226 | "random read requests processed using prefetch buffer"); | |
227 | b.add_u64_counter(l_bluefs_read_random_buffer_bytes, "read_random_buffer_bytes", | |
228 | "Bytes read from prefetch buffer in random read mode", NULL, | |
229 | PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); | |
230 | ||
231 | b.add_u64_counter(l_bluefs_read_count, "read_count", | |
232 | "buffered read requests processed"); | |
233 | b.add_u64_counter(l_bluefs_read_bytes, "read_bytes", | |
234 | "Bytes requested in buffered read mode", NULL, | |
235 | PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); | |
236 | ||
237 | b.add_u64_counter(l_bluefs_read_prefetch_count, "read_prefetch_count", | |
238 | "prefetch read requests processed"); | |
239 | b.add_u64_counter(l_bluefs_read_prefetch_bytes, "read_prefetch_bytes", | |
240 | "Bytes requested in prefetch read mode", NULL, | |
241 | PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); | |
cd265ab1 TL |
242 | b.add_u64(l_bluefs_read_zeros_candidate, "read_zeros_candidate", |
243 | "How many times bluefs read found page with all 0s"); | |
244 | b.add_u64(l_bluefs_read_zeros_errors, "read_zeros_errors", | |
245 | "How many times bluefs read found transient page with all 0s"); | |
494da23a | 246 | |
7c673cae FG |
247 | logger = b.create_perf_counters(); |
248 | cct->get_perfcounters_collection()->add(logger); | |
249 | } | |
250 | ||
251 | void BlueFS::_shutdown_logger() | |
252 | { | |
253 | cct->get_perfcounters_collection()->remove(logger); | |
254 | delete logger; | |
255 | } | |
256 | ||
257 | void BlueFS::_update_logger_stats() | |
258 | { | |
259 | // we must be holding the lock | |
260 | logger->set(l_bluefs_num_files, file_map.size()); | |
261 | logger->set(l_bluefs_log_bytes, log_writer->file->fnode.size); | |
262 | ||
263 | if (alloc[BDEV_WAL]) { | |
11fdf7f2 | 264 | logger->set(l_bluefs_wal_total_bytes, block_all[BDEV_WAL].size()); |
7c673cae | 265 | logger->set(l_bluefs_wal_used_bytes, |
11fdf7f2 | 266 | block_all[BDEV_WAL].size() - alloc[BDEV_WAL]->get_free()); |
7c673cae FG |
267 | } |
268 | if (alloc[BDEV_DB]) { | |
11fdf7f2 | 269 | logger->set(l_bluefs_db_total_bytes, block_all[BDEV_DB].size()); |
7c673cae | 270 | logger->set(l_bluefs_db_used_bytes, |
11fdf7f2 | 271 | block_all[BDEV_DB].size() - alloc[BDEV_DB]->get_free()); |
7c673cae FG |
272 | } |
273 | if (alloc[BDEV_SLOW]) { | |
11fdf7f2 | 274 | logger->set(l_bluefs_slow_total_bytes, block_all[BDEV_SLOW].size()); |
7c673cae | 275 | logger->set(l_bluefs_slow_used_bytes, |
11fdf7f2 | 276 | block_all[BDEV_SLOW].size() - alloc[BDEV_SLOW]->get_free()); |
7c673cae FG |
277 | } |
278 | } | |
279 | ||
11fdf7f2 TL |
280 | int BlueFS::add_block_device(unsigned id, const string& path, bool trim, |
281 | bool shared_with_bluestore) | |
7c673cae FG |
282 | { |
283 | dout(10) << __func__ << " bdev " << id << " path " << path << dendl; | |
11fdf7f2 TL |
284 | ceph_assert(id < bdev.size()); |
285 | ceph_assert(bdev[id] == NULL); | |
286 | BlockDevice *b = BlockDevice::create(cct, path, NULL, NULL, | |
287 | discard_cb[id], static_cast<void*>(this)); | |
288 | if (shared_with_bluestore) { | |
289 | b->set_no_exclusive_lock(); | |
290 | } | |
7c673cae FG |
291 | int r = b->open(path); |
292 | if (r < 0) { | |
293 | delete b; | |
294 | return r; | |
295 | } | |
11fdf7f2 TL |
296 | if (trim) { |
297 | b->discard(0, b->get_size()); | |
298 | } | |
299 | ||
7c673cae | 300 | dout(1) << __func__ << " bdev " << id << " path " << path |
1adf2230 | 301 | << " size " << byte_u_t(b->get_size()) << dendl; |
7c673cae FG |
302 | bdev[id] = b; |
303 | ioc[id] = new IOContext(cct, NULL); | |
304 | return 0; | |
305 | } | |
306 | ||
307 | bool BlueFS::bdev_support_label(unsigned id) | |
308 | { | |
11fdf7f2 TL |
309 | ceph_assert(id < bdev.size()); |
310 | ceph_assert(bdev[id]); | |
7c673cae FG |
311 | return bdev[id]->supported_bdev_label(); |
312 | } | |
313 | ||
314 | uint64_t BlueFS::get_block_device_size(unsigned id) | |
315 | { | |
316 | if (id < bdev.size() && bdev[id]) | |
317 | return bdev[id]->get_size(); | |
318 | return 0; | |
319 | } | |
320 | ||
1911f103 TL |
321 | void BlueFS::_add_block_extent(unsigned id, uint64_t offset, uint64_t length, |
322 | bool skip) | |
7c673cae | 323 | { |
7c673cae | 324 | dout(1) << __func__ << " bdev " << id |
11fdf7f2 | 325 | << " 0x" << std::hex << offset << "~" << length << std::dec |
1911f103 | 326 | << " skip " << skip |
7c673cae | 327 | << dendl; |
11fdf7f2 TL |
328 | |
329 | ceph_assert(id < bdev.size()); | |
330 | ceph_assert(bdev[id]); | |
331 | ceph_assert(bdev[id]->get_size() >= offset + length); | |
7c673cae | 332 | block_all[id].insert(offset, length); |
7c673cae FG |
333 | |
334 | if (id < alloc.size() && alloc[id]) { | |
1911f103 TL |
335 | if (!skip) |
336 | log_t.op_alloc_add(id, offset, length); | |
337 | ||
7c673cae FG |
338 | alloc[id]->init_add_free(offset, length); |
339 | } | |
340 | ||
341 | if (logger) | |
342 | logger->inc(l_bluefs_gift_bytes, length); | |
343 | dout(10) << __func__ << " done" << dendl; | |
344 | } | |
345 | ||
346 | int BlueFS::reclaim_blocks(unsigned id, uint64_t want, | |
a8e16298 | 347 | PExtentVector *extents) |
7c673cae | 348 | { |
11fdf7f2 | 349 | std::unique_lock l(lock); |
7c673cae FG |
350 | dout(1) << __func__ << " bdev " << id |
351 | << " want 0x" << std::hex << want << std::dec << dendl; | |
11fdf7f2 TL |
352 | ceph_assert(id < alloc.size()); |
353 | ceph_assert(alloc[id]); | |
9f95a23c TL |
354 | int64_t got = 0; |
355 | ||
356 | interval_set<uint64_t> granular; | |
357 | while (want > 0 && !block_unused_too_granular[id].empty()) { | |
358 | auto p = block_unused_too_granular[id].begin(); | |
359 | dout(20) << __func__ << " unused " << (int)id << ":" | |
360 | << std::hex << p.get_start() << "~" << p.get_len() << dendl; | |
361 | extents->push_back({p.get_start(), p.get_len()}); | |
362 | granular.insert(p.get_start(), p.get_len()); | |
363 | if (want >= p.get_len()) { | |
364 | want -= p.get_len(); | |
365 | } else { | |
366 | want = 0; | |
367 | } | |
368 | got += p.get_len(); | |
369 | block_unused_too_granular[id].erase(p); | |
7c673cae FG |
370 | } |
371 | ||
9f95a23c TL |
372 | if (want > 0) { |
373 | got += alloc[id]->allocate(want, alloc_size[id], 0, extents); | |
374 | ceph_assert(got != 0); | |
375 | if (got < 0) { | |
376 | derr << __func__ << " failed to allocate space to return to bluestore" | |
377 | << dendl; | |
378 | alloc[id]->dump(); | |
379 | block_unused_too_granular[id].insert(granular); | |
380 | return got; | |
381 | } | |
7c673cae | 382 | |
9f95a23c TL |
383 | for (auto& p : *extents) { |
384 | block_all[id].erase(p.offset, p.length); | |
385 | log_t.op_alloc_rm(id, p.offset, p.length); | |
386 | } | |
387 | ||
388 | flush_bdev(); | |
389 | int r = _flush_and_sync_log(l); | |
390 | ceph_assert(r == 0); | |
391 | } | |
7c673cae | 392 | |
11fdf7f2 | 393 | logger->inc(l_bluefs_reclaim_bytes, got); |
7c673cae FG |
394 | dout(1) << __func__ << " bdev " << id << " want 0x" << std::hex << want |
395 | << " got " << *extents << dendl; | |
396 | return 0; | |
397 | } | |
398 | ||
11fdf7f2 | 399 | void BlueFS::handle_discard(unsigned id, interval_set<uint64_t>& to_release) |
7c673cae | 400 | { |
11fdf7f2 TL |
401 | dout(10) << __func__ << " bdev " << id << dendl; |
402 | ceph_assert(alloc[id]); | |
403 | alloc[id]->release(to_release); | |
404 | } | |
405 | ||
406 | uint64_t BlueFS::get_used() | |
407 | { | |
408 | std::lock_guard l(lock); | |
409 | uint64_t used = 0; | |
410 | for (unsigned id = 0; id < MAX_BDEV; ++id) { | |
411 | if (alloc[id]) { | |
412 | used += block_all[id].size() - alloc[id]->get_free(); | |
413 | } | |
7c673cae | 414 | } |
11fdf7f2 | 415 | return used; |
7c673cae FG |
416 | } |
417 | ||
418 | uint64_t BlueFS::get_total(unsigned id) | |
419 | { | |
11fdf7f2 TL |
420 | std::lock_guard l(lock); |
421 | ceph_assert(id < block_all.size()); | |
422 | return block_all[id].size(); | |
7c673cae FG |
423 | } |
424 | ||
425 | uint64_t BlueFS::get_free(unsigned id) | |
426 | { | |
11fdf7f2 TL |
427 | std::lock_guard l(lock); |
428 | ceph_assert(id < alloc.size()); | |
7c673cae FG |
429 | return alloc[id]->get_free(); |
430 | } | |
431 | ||
432 | void BlueFS::dump_perf_counters(Formatter *f) | |
433 | { | |
434 | f->open_object_section("bluefs_perf_counters"); | |
435 | logger->dump_formatted(f,0); | |
436 | f->close_section(); | |
437 | } | |
438 | ||
3efd9988 FG |
439 | void BlueFS::dump_block_extents(ostream& out) |
440 | { | |
441 | for (unsigned i = 0; i < MAX_BDEV; ++i) { | |
442 | if (!bdev[i]) { | |
443 | continue; | |
444 | } | |
11fdf7f2 TL |
445 | auto owned = get_total(i); |
446 | auto free = get_free(i); | |
1911f103 | 447 | |
11fdf7f2 TL |
448 | out << i << " : device size 0x" << std::hex << bdev[i]->get_size() |
449 | << " : own 0x" << block_all[i] | |
450 | << " = 0x" << owned | |
451 | << " : using 0x" << owned - free | |
1911f103 TL |
452 | << std::dec << "(" << byte_u_t(owned - free) << ")"; |
453 | if (i == _get_slow_device_id()) { | |
454 | ceph_assert(slow_dev_expander); | |
455 | ceph_assert(alloc[i]); | |
456 | free = slow_dev_expander->available_freespace(alloc_size[i]); | |
457 | out << std::hex | |
458 | << " : bluestore has 0x" << free | |
459 | << std::dec << "(" << byte_u_t(free) << ") available"; | |
460 | } | |
461 | out << "\n"; | |
3efd9988 FG |
462 | } |
463 | } | |
7c673cae FG |
464 | |
465 | void BlueFS::get_usage(vector<pair<uint64_t,uint64_t>> *usage) | |
466 | { | |
11fdf7f2 | 467 | std::lock_guard l(lock); |
7c673cae FG |
468 | usage->resize(bdev.size()); |
469 | for (unsigned id = 0; id < bdev.size(); ++id) { | |
470 | if (!bdev[id]) { | |
471 | (*usage)[id] = make_pair(0, 0); | |
472 | continue; | |
473 | } | |
474 | (*usage)[id].first = alloc[id]->get_free(); | |
11fdf7f2 | 475 | (*usage)[id].second = block_all[id].size(); |
7c673cae | 476 | uint64_t used = |
11fdf7f2 | 477 | (block_all[id].size() - (*usage)[id].first) * 100 / block_all[id].size(); |
7c673cae FG |
478 | dout(10) << __func__ << " bdev " << id |
479 | << " free " << (*usage)[id].first | |
1adf2230 | 480 | << " (" << byte_u_t((*usage)[id].first) << ")" |
7c673cae | 481 | << " / " << (*usage)[id].second |
1adf2230 | 482 | << " (" << byte_u_t((*usage)[id].second) << ")" |
7c673cae FG |
483 | << ", used " << used << "%" |
484 | << dendl; | |
485 | } | |
486 | } | |
487 | ||
488 | int BlueFS::get_block_extents(unsigned id, interval_set<uint64_t> *extents) | |
489 | { | |
11fdf7f2 | 490 | std::lock_guard l(lock); |
7c673cae FG |
491 | dout(10) << __func__ << " bdev " << id << dendl; |
492 | if (id >= block_all.size()) | |
493 | return -EINVAL; | |
494 | *extents = block_all[id]; | |
495 | return 0; | |
496 | } | |
497 | ||
9f95a23c | 498 | int BlueFS::mkfs(uuid_d osd_uuid, const bluefs_layout_t& layout) |
7c673cae | 499 | { |
11fdf7f2 | 500 | std::unique_lock l(lock); |
7c673cae FG |
501 | dout(1) << __func__ |
502 | << " osd_uuid " << osd_uuid | |
503 | << dendl; | |
504 | ||
9f95a23c TL |
505 | // set volume selector if not provided before/outside |
506 | if (vselector == nullptr) { | |
507 | vselector.reset( | |
508 | new OriginalVolumeSelector( | |
509 | get_block_device_size(BlueFS::BDEV_WAL) * 95 / 100, | |
510 | get_block_device_size(BlueFS::BDEV_DB) * 95 / 100, | |
511 | get_block_device_size(BlueFS::BDEV_SLOW) * 95 / 100)); | |
512 | } | |
513 | ||
7c673cae FG |
514 | _init_alloc(); |
515 | _init_logger(); | |
516 | ||
517 | super.version = 1; | |
518 | super.block_size = bdev[BDEV_DB]->get_block_size(); | |
519 | super.osd_uuid = osd_uuid; | |
520 | super.uuid.generate_random(); | |
521 | dout(1) << __func__ << " uuid " << super.uuid << dendl; | |
522 | ||
523 | // init log | |
9f95a23c | 524 | FileRef log_file = ceph::make_ref<File>(); |
7c673cae | 525 | log_file->fnode.ino = 1; |
f6b5b4d7 | 526 | log_file->vselector_hint = vselector->get_hint_for_log(); |
7c673cae | 527 | int r = _allocate( |
9f95a23c | 528 | vselector->select_prefer_bdev(log_file->vselector_hint), |
7c673cae | 529 | cct->_conf->bluefs_max_log_runway, |
94b18763 | 530 | &log_file->fnode); |
9f95a23c | 531 | vselector->add_usage(log_file->vselector_hint, log_file->fnode); |
11fdf7f2 | 532 | ceph_assert(r == 0); |
7c673cae FG |
533 | log_writer = _create_writer(log_file); |
534 | ||
535 | // initial txn | |
536 | log_t.op_init(); | |
537 | for (unsigned bdev = 0; bdev < MAX_BDEV; ++bdev) { | |
538 | interval_set<uint64_t>& p = block_all[bdev]; | |
539 | if (p.empty()) | |
540 | continue; | |
541 | for (interval_set<uint64_t>::iterator q = p.begin(); q != p.end(); ++q) { | |
542 | dout(20) << __func__ << " op_alloc_add " << bdev << " 0x" | |
543 | << std::hex << q.get_start() << "~" << q.get_len() << std::dec | |
544 | << dendl; | |
545 | log_t.op_alloc_add(bdev, q.get_start(), q.get_len()); | |
546 | } | |
547 | } | |
548 | _flush_and_sync_log(l); | |
549 | ||
550 | // write supers | |
551 | super.log_fnode = log_file->fnode; | |
9f95a23c | 552 | super.memorized_layout = layout; |
11fdf7f2 | 553 | _write_super(BDEV_DB); |
7c673cae FG |
554 | flush_bdev(); |
555 | ||
556 | // clean up | |
557 | super = bluefs_super_t(); | |
558 | _close_writer(log_writer); | |
559 | log_writer = NULL; | |
560 | block_all.clear(); | |
9f95a23c | 561 | vselector.reset(nullptr); |
7c673cae FG |
562 | _stop_alloc(); |
563 | _shutdown_logger(); | |
564 | ||
565 | dout(10) << __func__ << " success" << dendl; | |
566 | return 0; | |
567 | } | |
568 | ||
569 | void BlueFS::_init_alloc() | |
570 | { | |
571 | dout(20) << __func__ << dendl; | |
572 | alloc.resize(MAX_BDEV); | |
eafe8130 | 573 | alloc_size.resize(MAX_BDEV, 0); |
7c673cae | 574 | pending_release.resize(MAX_BDEV); |
9f95a23c | 575 | block_unused_too_granular.resize(MAX_BDEV); |
eafe8130 TL |
576 | |
577 | if (bdev[BDEV_WAL]) { | |
578 | alloc_size[BDEV_WAL] = cct->_conf->bluefs_alloc_size; | |
579 | } | |
580 | if (bdev[BDEV_SLOW]) { | |
581 | alloc_size[BDEV_DB] = cct->_conf->bluefs_alloc_size; | |
582 | alloc_size[BDEV_SLOW] = cct->_conf->bluefs_shared_alloc_size; | |
583 | } else { | |
584 | alloc_size[BDEV_DB] = cct->_conf->bluefs_shared_alloc_size; | |
585 | } | |
586 | // new wal and db devices are never shared | |
587 | if (bdev[BDEV_NEWWAL]) { | |
588 | alloc_size[BDEV_NEWWAL] = cct->_conf->bluefs_alloc_size; | |
589 | } | |
590 | if (bdev[BDEV_NEWDB]) { | |
591 | alloc_size[BDEV_NEWDB] = cct->_conf->bluefs_alloc_size; | |
592 | } | |
593 | ||
7c673cae FG |
594 | for (unsigned id = 0; id < bdev.size(); ++id) { |
595 | if (!bdev[id]) { | |
596 | continue; | |
597 | } | |
11fdf7f2 | 598 | ceph_assert(bdev[id]->get_size()); |
eafe8130 TL |
599 | std::string name = "bluefs-"; |
600 | const char* devnames[] = {"wal","db","slow"}; | |
601 | if (id <= BDEV_SLOW) | |
602 | name += devnames[id]; | |
603 | else | |
604 | name += to_string(uintptr_t(this)); | |
605 | ceph_assert(alloc_size[id]); | |
606 | dout(1) << __func__ << " id " << id | |
607 | << " alloc_size 0x" << std::hex << alloc_size[id] | |
608 | << " size 0x" << bdev[id]->get_size() << std::dec << dendl; | |
7c673cae FG |
609 | alloc[id] = Allocator::create(cct, cct->_conf->bluefs_allocator, |
610 | bdev[id]->get_size(), | |
eafe8130 | 611 | alloc_size[id], name); |
7c673cae FG |
612 | interval_set<uint64_t>& p = block_all[id]; |
613 | for (interval_set<uint64_t>::iterator q = p.begin(); q != p.end(); ++q) { | |
614 | alloc[id]->init_add_free(q.get_start(), q.get_len()); | |
615 | } | |
616 | } | |
617 | } | |
618 | ||
619 | void BlueFS::_stop_alloc() | |
620 | { | |
621 | dout(20) << __func__ << dendl; | |
11fdf7f2 TL |
622 | for (auto p : bdev) { |
623 | if (p) | |
624 | p->discard_drain(); | |
625 | } | |
626 | ||
7c673cae FG |
627 | for (auto p : alloc) { |
628 | if (p != nullptr) { | |
629 | p->shutdown(); | |
630 | delete p; | |
631 | } | |
632 | } | |
633 | alloc.clear(); | |
9f95a23c | 634 | block_unused_too_granular.clear(); |
7c673cae FG |
635 | } |
636 | ||
cd265ab1 TL |
637 | int BlueFS::read(uint8_t ndev, uint64_t off, uint64_t len, |
638 | ceph::buffer::list *pbl, IOContext *ioc, bool buffered) | |
639 | { | |
640 | dout(10) << __func__ << " dev " << int(ndev) | |
641 | << ": 0x" << std::hex << off << "~" << len << std::dec | |
642 | << (buffered ? " buffered" : "") | |
643 | << dendl; | |
644 | int r; | |
645 | bufferlist bl; | |
646 | r = bdev[ndev]->read(off, len, &bl, ioc, buffered); | |
647 | if (r != 0) { | |
648 | return r; | |
649 | } | |
650 | uint64_t block_size = bdev[ndev]->get_block_size(); | |
651 | if (inject_read_zeros) { | |
652 | if (len >= block_size * 2) { | |
653 | derr << __func__ << " injecting error, zeros at " | |
654 | << int(ndev) << ": 0x" << std::hex << (off + len / 2) | |
655 | << "~" << (block_size * 2) << std::dec << dendl; | |
656 | //use beginning, replace 8K in the middle with zeros, use tail | |
657 | bufferlist temp; | |
658 | bl.splice(0, len / 2 - block_size, &temp); | |
659 | temp.append_zero(block_size * 2); | |
660 | bl.splice(block_size * 2, len / 2 - block_size, &temp); | |
661 | bl = temp; | |
662 | inject_read_zeros--; | |
663 | } | |
664 | } | |
665 | //make a check if there is a block with all 0 | |
666 | uint64_t to_check_len = len; | |
667 | uint64_t skip = p2nphase(off, block_size); | |
668 | if (skip >= to_check_len) { | |
669 | return r; | |
670 | } | |
671 | auto it = bl.begin(skip); | |
672 | to_check_len -= skip; | |
673 | bool all_zeros = false; | |
674 | while (all_zeros == false && to_check_len >= block_size) { | |
675 | // checking 0s step | |
676 | unsigned block_left = block_size; | |
677 | unsigned avail; | |
678 | const char* data; | |
679 | all_zeros = true; | |
680 | while (all_zeros && block_left > 0) { | |
681 | avail = it.get_ptr_and_advance(block_left, &data); | |
682 | block_left -= avail; | |
683 | all_zeros = mem_is_zero(data, avail); | |
684 | } | |
685 | // skipping step | |
686 | while (block_left > 0) { | |
687 | avail = it.get_ptr_and_advance(block_left, &data); | |
688 | block_left -= avail; | |
689 | } | |
690 | to_check_len -= block_size; | |
691 | } | |
692 | if (all_zeros) { | |
693 | logger->inc(l_bluefs_read_zeros_candidate, 1); | |
694 | bufferlist bl_reread; | |
695 | r = bdev[ndev]->read(off, len, &bl_reread, ioc, buffered); | |
696 | if (r != 0) { | |
697 | return r; | |
698 | } | |
699 | // check if both read gave the same | |
700 | if (!bl.contents_equal(bl_reread)) { | |
701 | // report problems to log, but continue, maybe it will be good now... | |
702 | derr << __func__ << " initial read of " << int(ndev) | |
703 | << ": 0x" << std::hex << off << "~" << len | |
704 | << std::dec << ": different then re-read " << dendl; | |
705 | logger->inc(l_bluefs_read_zeros_errors, 1); | |
706 | } | |
707 | // use second read will be better if is different | |
708 | pbl->append(bl_reread); | |
709 | } else { | |
710 | pbl->append(bl); | |
711 | } | |
712 | return r; | |
713 | } | |
714 | ||
715 | int BlueFS::read_random(uint8_t ndev, uint64_t off, uint64_t len, char *buf, bool buffered) | |
716 | { | |
717 | dout(10) << __func__ << " dev " << int(ndev) | |
718 | << ": 0x" << std::hex << off << "~" << len << std::dec | |
719 | << (buffered ? " buffered" : "") | |
720 | << dendl; | |
721 | int r; | |
722 | r = bdev[ndev]->read_random(off, len, buf, buffered); | |
723 | if (r != 0) { | |
724 | return r; | |
725 | } | |
726 | uint64_t block_size = bdev[ndev]->get_block_size(); | |
727 | if (inject_read_zeros) { | |
728 | if (len >= block_size * 2) { | |
729 | derr << __func__ << " injecting error, zeros at " | |
730 | << int(ndev) << ": 0x" << std::hex << (off + len / 2) | |
731 | << "~" << (block_size * 2) << std::dec << dendl; | |
732 | //zero middle 8K | |
733 | memset(buf + len / 2 - block_size, 0, block_size * 2); | |
734 | inject_read_zeros--; | |
735 | } | |
736 | } | |
737 | //make a check if there is a block with all 0 | |
738 | uint64_t to_check_len = len; | |
739 | const char* data = buf; | |
740 | uint64_t skip = p2nphase(off, block_size); | |
741 | if (skip >= to_check_len) { | |
742 | return r; | |
743 | } | |
744 | to_check_len -= skip; | |
745 | data += skip; | |
746 | ||
747 | bool all_zeros = false; | |
748 | while (all_zeros == false && to_check_len >= block_size) { | |
749 | if (mem_is_zero(data, block_size)) { | |
750 | // at least one block is all zeros | |
751 | all_zeros = true; | |
752 | break; | |
753 | } | |
754 | data += block_size; | |
755 | to_check_len -= block_size; | |
756 | } | |
757 | if (all_zeros) { | |
758 | logger->inc(l_bluefs_read_zeros_candidate, 1); | |
759 | std::unique_ptr<char[]> data_reread(new char[len]); | |
760 | r = bdev[ndev]->read_random(off, len, &data_reread[0], buffered); | |
761 | if (r != 0) { | |
762 | return r; | |
763 | } | |
764 | // check if both read gave the same | |
765 | if (memcmp(buf, &data_reread[0], len) != 0) { | |
766 | derr << __func__ << " initial read of " << int(ndev) | |
767 | << ": 0x" << std::hex << off << "~" << len | |
768 | << std::dec << ": different then re-read " << dendl; | |
769 | logger->inc(l_bluefs_read_zeros_errors, 1); | |
770 | // second read is probably better | |
771 | memcpy(buf, &data_reread[0], len); | |
772 | } | |
773 | } | |
774 | return r; | |
775 | } | |
776 | ||
7c673cae FG |
777 | int BlueFS::mount() |
778 | { | |
779 | dout(1) << __func__ << dendl; | |
780 | ||
781 | int r = _open_super(); | |
782 | if (r < 0) { | |
783 | derr << __func__ << " failed to open super: " << cpp_strerror(r) << dendl; | |
784 | goto out; | |
785 | } | |
786 | ||
9f95a23c TL |
787 | // set volume selector if not provided before/outside |
788 | if (vselector == nullptr) { | |
789 | vselector.reset( | |
790 | new OriginalVolumeSelector( | |
791 | get_block_device_size(BlueFS::BDEV_WAL) * 95 / 100, | |
792 | get_block_device_size(BlueFS::BDEV_DB) * 95 / 100, | |
793 | get_block_device_size(BlueFS::BDEV_SLOW) * 95 / 100)); | |
794 | } | |
795 | ||
7c673cae FG |
796 | block_all.clear(); |
797 | block_all.resize(MAX_BDEV); | |
7c673cae | 798 | _init_alloc(); |
494da23a | 799 | _init_logger(); |
7c673cae | 800 | |
11fdf7f2 | 801 | r = _replay(false, false); |
7c673cae FG |
802 | if (r < 0) { |
803 | derr << __func__ << " failed to replay log: " << cpp_strerror(r) << dendl; | |
804 | _stop_alloc(); | |
805 | goto out; | |
806 | } | |
807 | ||
808 | // init freelist | |
809 | for (auto& p : file_map) { | |
810 | dout(30) << __func__ << " noting alloc for " << p.second->fnode << dendl; | |
811 | for (auto& q : p.second->fnode.extents) { | |
812 | alloc[q.bdev]->init_rm_free(q.offset, q.length); | |
813 | } | |
814 | } | |
815 | ||
816 | // set up the log for future writes | |
817 | log_writer = _create_writer(_get_file(1)); | |
11fdf7f2 | 818 | ceph_assert(log_writer->file->fnode.ino == 1); |
7c673cae FG |
819 | log_writer->pos = log_writer->file->fnode.size; |
820 | dout(10) << __func__ << " log write pos set to 0x" | |
821 | << std::hex << log_writer->pos << std::dec | |
822 | << dendl; | |
823 | ||
7c673cae FG |
824 | return 0; |
825 | ||
826 | out: | |
827 | super = bluefs_super_t(); | |
828 | return r; | |
829 | } | |
830 | ||
9f95a23c TL |
831 | int BlueFS::maybe_verify_layout(const bluefs_layout_t& layout) const |
832 | { | |
833 | if (super.memorized_layout) { | |
834 | if (layout == *super.memorized_layout) { | |
835 | dout(10) << __func__ << " bluefs layout verified positively" << dendl; | |
836 | } else { | |
837 | derr << __func__ << " memorized layout doesn't fit current one" << dendl; | |
838 | return -EIO; | |
839 | } | |
840 | } else { | |
841 | dout(10) << __func__ << " no memorized_layout in bluefs superblock" | |
842 | << dendl; | |
843 | } | |
844 | ||
845 | return 0; | |
846 | } | |
847 | ||
1911f103 | 848 | void BlueFS::umount(bool avoid_compact) |
7c673cae FG |
849 | { |
850 | dout(1) << __func__ << dendl; | |
851 | ||
1911f103 | 852 | sync_metadata(avoid_compact); |
7c673cae FG |
853 | |
854 | _close_writer(log_writer); | |
855 | log_writer = NULL; | |
856 | ||
9f95a23c | 857 | vselector.reset(nullptr); |
7c673cae FG |
858 | _stop_alloc(); |
859 | file_map.clear(); | |
860 | dir_map.clear(); | |
861 | super = bluefs_super_t(); | |
862 | log_t.clear(); | |
863 | _shutdown_logger(); | |
864 | } | |
865 | ||
9f95a23c | 866 | int BlueFS::prepare_new_device(int id, const bluefs_layout_t& layout) |
7c673cae | 867 | { |
11fdf7f2 TL |
868 | dout(1) << __func__ << dendl; |
869 | ||
870 | if(id == BDEV_NEWDB) { | |
871 | int new_log_dev_cur = BDEV_WAL; | |
872 | int new_log_dev_next = BDEV_WAL; | |
873 | if (!bdev[BDEV_WAL]) { | |
874 | new_log_dev_cur = BDEV_NEWDB; | |
875 | new_log_dev_next = BDEV_DB; | |
876 | } | |
9f95a23c | 877 | _rewrite_log_and_layout_sync(false, |
11fdf7f2 TL |
878 | BDEV_NEWDB, |
879 | new_log_dev_cur, | |
880 | new_log_dev_next, | |
9f95a23c TL |
881 | RENAME_DB2SLOW, |
882 | layout); | |
11fdf7f2 TL |
883 | //} |
884 | } else if(id == BDEV_NEWWAL) { | |
9f95a23c TL |
885 | _rewrite_log_and_layout_sync(false, |
886 | BDEV_DB, | |
887 | BDEV_NEWWAL, | |
888 | BDEV_WAL, | |
889 | REMOVE_WAL, | |
890 | layout); | |
11fdf7f2 TL |
891 | } else { |
892 | assert(false); | |
893 | } | |
894 | return 0; | |
895 | } | |
896 | ||
897 | void BlueFS::collect_metadata(map<string,string> *pm, unsigned skip_bdev_id) | |
898 | { | |
899 | if (skip_bdev_id != BDEV_DB && bdev[BDEV_DB]) | |
7c673cae FG |
900 | bdev[BDEV_DB]->collect_metadata("bluefs_db_", pm); |
901 | if (bdev[BDEV_WAL]) | |
902 | bdev[BDEV_WAL]->collect_metadata("bluefs_wal_", pm); | |
11fdf7f2 TL |
903 | } |
904 | ||
905 | void BlueFS::get_devices(set<string> *ls) | |
906 | { | |
907 | for (unsigned i = 0; i < MAX_BDEV; ++i) { | |
908 | if (bdev[i]) { | |
909 | bdev[i]->get_devices(ls); | |
910 | } | |
911 | } | |
7c673cae FG |
912 | } |
913 | ||
914 | int BlueFS::fsck() | |
915 | { | |
11fdf7f2 | 916 | std::lock_guard l(lock); |
7c673cae FG |
917 | dout(1) << __func__ << dendl; |
918 | // hrm, i think we check everything on mount... | |
919 | return 0; | |
920 | } | |
921 | ||
11fdf7f2 | 922 | int BlueFS::_write_super(int dev) |
7c673cae FG |
923 | { |
924 | // build superblock | |
925 | bufferlist bl; | |
11fdf7f2 | 926 | encode(super, bl); |
7c673cae | 927 | uint32_t crc = bl.crc32c(-1); |
11fdf7f2 | 928 | encode(crc, bl); |
7c673cae FG |
929 | dout(10) << __func__ << " super block length(encoded): " << bl.length() << dendl; |
930 | dout(10) << __func__ << " superblock " << super.version << dendl; | |
931 | dout(10) << __func__ << " log_fnode " << super.log_fnode << dendl; | |
9f95a23c | 932 | ceph_assert_always(bl.length() <= get_super_length()); |
7c673cae FG |
933 | bl.append_zero(get_super_length() - bl.length()); |
934 | ||
11fdf7f2 | 935 | bdev[dev]->write(get_super_offset(), bl, false, WRITE_LIFE_SHORT); |
7c673cae FG |
936 | dout(20) << __func__ << " v " << super.version |
937 | << " crc 0x" << std::hex << crc | |
938 | << " offset 0x" << get_super_offset() << std::dec | |
939 | << dendl; | |
940 | return 0; | |
941 | } | |
942 | ||
943 | int BlueFS::_open_super() | |
944 | { | |
945 | dout(10) << __func__ << dendl; | |
946 | ||
947 | bufferlist bl; | |
948 | uint32_t expected_crc, crc; | |
949 | int r; | |
950 | ||
951 | // always the second block | |
952 | r = bdev[BDEV_DB]->read(get_super_offset(), get_super_length(), | |
953 | &bl, ioc[BDEV_DB], false); | |
954 | if (r < 0) | |
955 | return r; | |
956 | ||
11fdf7f2 TL |
957 | auto p = bl.cbegin(); |
958 | decode(super, p); | |
7c673cae FG |
959 | { |
960 | bufferlist t; | |
961 | t.substr_of(bl, 0, p.get_off()); | |
962 | crc = t.crc32c(-1); | |
963 | } | |
11fdf7f2 | 964 | decode(expected_crc, p); |
7c673cae FG |
965 | if (crc != expected_crc) { |
966 | derr << __func__ << " bad crc on superblock, expected 0x" | |
967 | << std::hex << expected_crc << " != actual 0x" << crc << std::dec | |
968 | << dendl; | |
969 | return -EIO; | |
970 | } | |
971 | dout(10) << __func__ << " superblock " << super.version << dendl; | |
972 | dout(10) << __func__ << " log_fnode " << super.log_fnode << dendl; | |
973 | return 0; | |
974 | } | |
975 | ||
9f95a23c TL |
976 | int BlueFS::_check_new_allocations(const bluefs_fnode_t& fnode, |
977 | size_t dev_count, | |
978 | boost::dynamic_bitset<uint64_t>* owned_blocks, | |
979 | boost::dynamic_bitset<uint64_t>* used_blocks) | |
980 | { | |
981 | auto& fnode_extents = fnode.extents; | |
982 | for (auto e : fnode_extents) { | |
983 | auto id = e.bdev; | |
984 | bool fail = false; | |
985 | ceph_assert(id < dev_count); | |
986 | apply_for_bitset_range(e.offset, e.length, alloc_size[id], owned_blocks[id], | |
987 | [&](uint64_t pos, boost::dynamic_bitset<uint64_t> &bs) { | |
988 | if (!bs.test(pos)) { | |
989 | fail = true; | |
990 | } | |
991 | } | |
992 | ); | |
993 | if (fail) { | |
994 | derr << __func__ << " invalid extent " << int(id) | |
995 | << ": 0x" << std::hex << e.offset << "~" << e.length | |
996 | << std::dec | |
997 | << ": wasn't given but allocated for ino " << fnode.ino | |
998 | << dendl; | |
999 | return -EFAULT; | |
1000 | } | |
1001 | ||
1002 | apply_for_bitset_range(e.offset, e.length, alloc_size[id], used_blocks[id], | |
1003 | [&](uint64_t pos, boost::dynamic_bitset<uint64_t> &bs) { | |
1004 | if (bs.test(pos)) { | |
1005 | fail = true; | |
1006 | } | |
1007 | bs.set(pos); | |
1008 | } | |
1009 | ); | |
1010 | if (fail) { | |
1011 | derr << __func__ << " invalid extent " << int(e.bdev) | |
1012 | << ": 0x" << std::hex << e.offset << "~" << e.length | |
1013 | << std::dec << ": duplicate reference, ino " << fnode.ino | |
1014 | << dendl; | |
1015 | return -EFAULT; | |
1016 | } | |
1017 | } | |
1018 | return 0; | |
1019 | } | |
1020 | ||
1021 | int BlueFS::_adjust_granularity( | |
1022 | __u8 id, uint64_t *offset, uint64_t *length, bool alloc) | |
1023 | { | |
1024 | const char *op = alloc ? "op_alloc_add" : "op_alloc_rm"; | |
1025 | auto oldo = *offset; | |
1026 | auto oldl = *length; | |
1027 | if (*offset & (alloc_size[id] - 1)) { | |
1028 | *offset &= ~(alloc_size[id] - 1); | |
1029 | *offset += alloc_size[id]; | |
1030 | if (*length > *offset - oldo) { | |
1031 | if (alloc) { | |
1032 | block_unused_too_granular[id].insert(oldo, *offset - oldo); | |
1033 | } else { | |
1034 | block_unused_too_granular[id].erase(oldo, *offset - oldo); | |
1035 | } | |
1036 | *length -= (*offset - oldo); | |
1037 | } else { | |
1038 | if (alloc) { | |
1039 | block_unused_too_granular[id].insert(oldo, *length); | |
1040 | } else { | |
1041 | block_unused_too_granular[id].erase(oldo, *length); | |
1042 | } | |
1043 | *length = 0; | |
1044 | } | |
1045 | } | |
1046 | if (*length & (alloc_size[id] - 1)) { | |
1047 | *length &= ~(alloc_size[id] - 1); | |
1048 | if (alloc) { | |
1049 | block_unused_too_granular[id].insert( | |
1050 | *offset + *length, | |
1051 | oldo + oldl - *offset - *length); | |
1052 | } else { | |
1053 | block_unused_too_granular[id].erase( | |
1054 | *offset + *length, | |
1055 | oldo + oldl - *offset - *length); | |
1056 | } | |
1057 | } | |
1058 | if (oldo != *offset || oldl != *length) { | |
1059 | dout(10) << __func__ << " " << op << " " | |
1060 | << (int)id << ":" << std::hex << oldo << "~" << oldl | |
1061 | << " -> " << (int)id << ":" << *offset << "~" << *length << dendl; | |
1062 | } | |
1063 | return 0; | |
1064 | } | |
1065 | ||
1066 | int BlueFS::_verify_alloc_granularity( | |
1067 | __u8 id, uint64_t offset, uint64_t length, const char *op) | |
1068 | { | |
1069 | if ((offset & (alloc_size[id] - 1)) || | |
1070 | (length & (alloc_size[id] - 1))) { | |
1071 | derr << __func__ << " " << op << " of " << (int)id | |
1072 | << ":0x" << std::hex << offset << "~" << length << std::dec | |
1073 | << " does not align to alloc_size 0x" | |
1074 | << std::hex << alloc_size[id] << std::dec << dendl; | |
1075 | // be helpful | |
1076 | auto need = alloc_size[id]; | |
1077 | while (need && ((offset & (need - 1)) || | |
1078 | (length & (need - 1)))) { | |
1079 | need >>= 1; | |
1080 | } | |
1081 | if (need) { | |
1082 | const char *which; | |
1083 | if (id == BDEV_SLOW || | |
1084 | (id == BDEV_DB && !bdev[BDEV_SLOW])) { | |
1085 | which = "bluefs_shared_alloc_size"; | |
1086 | } else { | |
1087 | which = "bluefs_alloc_size"; | |
1088 | } | |
1089 | derr << "work-around by setting " << which << " = " << need | |
1090 | << " for this OSD" << dendl; | |
1091 | } | |
1092 | return -EFAULT; | |
1093 | } | |
1094 | return 0; | |
1095 | } | |
1096 | ||
11fdf7f2 | 1097 | int BlueFS::_replay(bool noop, bool to_stdout) |
7c673cae FG |
1098 | { |
1099 | dout(10) << __func__ << (noop ? " NO-OP" : "") << dendl; | |
1100 | ino_last = 1; // by the log | |
1101 | log_seq = 0; | |
1102 | ||
1103 | FileRef log_file; | |
11fdf7f2 | 1104 | log_file = _get_file(1); |
9f95a23c TL |
1105 | |
1106 | // sanity check | |
1107 | for (auto& a : block_unused_too_granular) { | |
1108 | ceph_assert(a.empty()); | |
1109 | } | |
1110 | ||
11fdf7f2 TL |
1111 | if (!noop) { |
1112 | log_file->fnode = super.log_fnode; | |
9f95a23c | 1113 | log_file->vselector_hint = |
f6b5b4d7 | 1114 | vselector->get_hint_for_log(); |
7c673cae | 1115 | } else { |
11fdf7f2 TL |
1116 | // do not use fnode from superblock in 'noop' mode - log_file's one should |
1117 | // be fine and up-to-date | |
1118 | ceph_assert(log_file->fnode.ino == 1); | |
1119 | ceph_assert(log_file->fnode.extents.size() != 0); | |
7c673cae | 1120 | } |
7c673cae | 1121 | dout(10) << __func__ << " log_fnode " << super.log_fnode << dendl; |
11fdf7f2 TL |
1122 | if (unlikely(to_stdout)) { |
1123 | std::cout << " log_fnode " << super.log_fnode << std::endl; | |
1124 | } | |
7c673cae FG |
1125 | |
1126 | FileReader *log_reader = new FileReader( | |
1127 | log_file, cct->_conf->bluefs_max_prefetch, | |
1128 | false, // !random | |
1129 | true); // ignore eof | |
9f95a23c TL |
1130 | |
1131 | bool seen_recs = false; | |
1132 | ||
1133 | boost::dynamic_bitset<uint64_t> used_blocks[MAX_BDEV]; | |
1134 | boost::dynamic_bitset<uint64_t> owned_blocks[MAX_BDEV]; | |
1135 | ||
1136 | if (cct->_conf->bluefs_log_replay_check_allocations) { | |
1137 | for (size_t i = 0; i < MAX_BDEV; ++i) { | |
1138 | if (alloc_size[i] != 0 && bdev[i] != nullptr) { | |
1139 | used_blocks[i].resize(round_up_to(bdev[i]->get_size(), alloc_size[i]) / alloc_size[i]); | |
1140 | owned_blocks[i].resize(round_up_to(bdev[i]->get_size(), alloc_size[i]) / alloc_size[i]); | |
1141 | } | |
1142 | } | |
1143 | } | |
1144 | ||
1145 | bool first_log_check = true; | |
1146 | ||
7c673cae | 1147 | while (true) { |
11fdf7f2 | 1148 | ceph_assert((log_reader->buf.pos & ~super.block_mask()) == 0); |
7c673cae FG |
1149 | uint64_t pos = log_reader->buf.pos; |
1150 | uint64_t read_pos = pos; | |
1151 | bufferlist bl; | |
1152 | { | |
1153 | int r = _read(log_reader, &log_reader->buf, read_pos, super.block_size, | |
1154 | &bl, NULL); | |
f6b5b4d7 TL |
1155 | if (r != (int)super.block_size && cct->_conf->bluefs_replay_recovery) { |
1156 | r += do_replay_recovery_read(log_reader, pos, read_pos + r, super.block_size - r, &bl); | |
1157 | } | |
1158 | assert(r == (int)super.block_size); | |
7c673cae FG |
1159 | read_pos += r; |
1160 | } | |
1161 | uint64_t more = 0; | |
1162 | uint64_t seq; | |
1163 | uuid_d uuid; | |
1164 | { | |
11fdf7f2 | 1165 | auto p = bl.cbegin(); |
7c673cae FG |
1166 | __u8 a, b; |
1167 | uint32_t len; | |
11fdf7f2 TL |
1168 | decode(a, p); |
1169 | decode(b, p); | |
1170 | decode(len, p); | |
1171 | decode(uuid, p); | |
1172 | decode(seq, p); | |
7c673cae | 1173 | if (len + 6 > bl.length()) { |
11fdf7f2 | 1174 | more = round_up_to(len + 6 - bl.length(), super.block_size); |
7c673cae FG |
1175 | } |
1176 | } | |
1177 | if (uuid != super.uuid) { | |
9f95a23c TL |
1178 | if (seen_recs) { |
1179 | dout(10) << __func__ << " 0x" << std::hex << pos << std::dec | |
1180 | << ": stop: uuid " << uuid << " != super.uuid " << super.uuid | |
1181 | << dendl; | |
1182 | } else { | |
1183 | derr << __func__ << " 0x" << std::hex << pos << std::dec | |
1184 | << ": stop: uuid " << uuid << " != super.uuid " << super.uuid | |
1185 | << ", block dump: \n"; | |
1186 | bufferlist t; | |
1187 | t.substr_of(bl, 0, super.block_size); | |
1188 | t.hexdump(*_dout); | |
1189 | *_dout << dendl; | |
1190 | } | |
7c673cae FG |
1191 | break; |
1192 | } | |
1193 | if (seq != log_seq + 1) { | |
9f95a23c TL |
1194 | if (seen_recs) { |
1195 | dout(10) << __func__ << " 0x" << std::hex << pos << std::dec | |
1196 | << ": stop: seq " << seq << " != expected " << log_seq + 1 | |
1197 | << dendl;; | |
1198 | } else { | |
1199 | derr << __func__ << " 0x" << std::hex << pos << std::dec | |
1200 | << ": stop: seq " << seq << " != expected " << log_seq + 1 | |
1201 | << dendl;; | |
1202 | } | |
7c673cae FG |
1203 | break; |
1204 | } | |
1205 | if (more) { | |
1206 | dout(20) << __func__ << " need 0x" << std::hex << more << std::dec | |
1207 | << " more bytes" << dendl; | |
1208 | bufferlist t; | |
1209 | int r = _read(log_reader, &log_reader->buf, read_pos, more, &t, NULL); | |
1210 | if (r < (int)more) { | |
f6b5b4d7 TL |
1211 | dout(10) << __func__ << " 0x" << std::hex << pos |
1212 | << ": stop: len is 0x" << bl.length() + more << std::dec | |
1213 | << ", which is past eof" << dendl; | |
1214 | if (cct->_conf->bluefs_replay_recovery) { | |
1215 | //try to search for more data | |
1216 | r += do_replay_recovery_read(log_reader, pos, read_pos + r, more - r, &t); | |
1217 | if (r < (int)more) { | |
1218 | //in normal mode we must read r==more, for recovery it is too strict | |
1219 | break; | |
1220 | } | |
1221 | } | |
7c673cae | 1222 | } |
11fdf7f2 | 1223 | ceph_assert(r == (int)more); |
7c673cae FG |
1224 | bl.claim_append(t); |
1225 | read_pos += r; | |
1226 | } | |
9f95a23c | 1227 | seen_recs = true; |
7c673cae FG |
1228 | bluefs_transaction_t t; |
1229 | try { | |
11fdf7f2 TL |
1230 | auto p = bl.cbegin(); |
1231 | decode(t, p); | |
7c673cae FG |
1232 | } |
1233 | catch (buffer::error& e) { | |
9f95a23c TL |
1234 | derr << __func__ << " 0x" << std::hex << pos << std::dec |
1235 | << ": stop: failed to decode: " << e.what() | |
1236 | << dendl; | |
7c673cae FG |
1237 | delete log_reader; |
1238 | return -EIO; | |
1239 | } | |
11fdf7f2 | 1240 | ceph_assert(seq == t.seq); |
7c673cae FG |
1241 | dout(10) << __func__ << " 0x" << std::hex << pos << std::dec |
1242 | << ": " << t << dendl; | |
11fdf7f2 TL |
1243 | if (unlikely(to_stdout)) { |
1244 | std::cout << " 0x" << std::hex << pos << std::dec | |
1245 | << ": " << t << std::endl; | |
1246 | } | |
7c673cae | 1247 | |
11fdf7f2 | 1248 | auto p = t.op_bl.cbegin(); |
7c673cae FG |
1249 | while (!p.end()) { |
1250 | __u8 op; | |
11fdf7f2 | 1251 | decode(op, p); |
7c673cae FG |
1252 | switch (op) { |
1253 | ||
1254 | case bluefs_transaction_t::OP_INIT: | |
1255 | dout(20) << __func__ << " 0x" << std::hex << pos << std::dec | |
1256 | << ": op_init" << dendl; | |
11fdf7f2 TL |
1257 | if (unlikely(to_stdout)) { |
1258 | std::cout << " 0x" << std::hex << pos << std::dec | |
1259 | << ": op_init" << std::endl; | |
1260 | } | |
1261 | ||
1262 | ceph_assert(t.seq == 1); | |
7c673cae FG |
1263 | break; |
1264 | ||
1265 | case bluefs_transaction_t::OP_JUMP: | |
1266 | { | |
1267 | uint64_t next_seq; | |
1268 | uint64_t offset; | |
11fdf7f2 TL |
1269 | decode(next_seq, p); |
1270 | decode(offset, p); | |
7c673cae FG |
1271 | dout(20) << __func__ << " 0x" << std::hex << pos << std::dec |
1272 | << ": op_jump seq " << next_seq | |
1273 | << " offset 0x" << std::hex << offset << std::dec << dendl; | |
11fdf7f2 TL |
1274 | if (unlikely(to_stdout)) { |
1275 | std::cout << " 0x" << std::hex << pos << std::dec | |
1276 | << ": op_jump seq " << next_seq | |
1277 | << " offset 0x" << std::hex << offset << std::dec | |
1278 | << std::endl; | |
1279 | } | |
1280 | ||
1281 | ceph_assert(next_seq >= log_seq); | |
7c673cae FG |
1282 | log_seq = next_seq - 1; // we will increment it below |
1283 | uint64_t skip = offset - read_pos; | |
1284 | if (skip) { | |
1285 | bufferlist junk; | |
1286 | int r = _read(log_reader, &log_reader->buf, read_pos, skip, &junk, | |
1287 | NULL); | |
1288 | if (r != (int)skip) { | |
1289 | dout(10) << __func__ << " 0x" << std::hex << read_pos | |
1290 | << ": stop: failed to skip to " << offset | |
1291 | << std::dec << dendl; | |
11fdf7f2 | 1292 | ceph_abort_msg("problem with op_jump"); |
7c673cae FG |
1293 | } |
1294 | } | |
1295 | } | |
1296 | break; | |
1297 | ||
1298 | case bluefs_transaction_t::OP_JUMP_SEQ: | |
1299 | { | |
1300 | uint64_t next_seq; | |
11fdf7f2 | 1301 | decode(next_seq, p); |
7c673cae FG |
1302 | dout(20) << __func__ << " 0x" << std::hex << pos << std::dec |
1303 | << ": op_jump_seq " << next_seq << dendl; | |
11fdf7f2 TL |
1304 | if (unlikely(to_stdout)) { |
1305 | std::cout << " 0x" << std::hex << pos << std::dec | |
1306 | << ": op_jump_seq " << next_seq << std::endl; | |
1307 | } | |
1308 | ||
1309 | ceph_assert(next_seq >= log_seq); | |
7c673cae FG |
1310 | log_seq = next_seq - 1; // we will increment it below |
1311 | } | |
1312 | break; | |
1313 | ||
1314 | case bluefs_transaction_t::OP_ALLOC_ADD: | |
1315 | { | |
1316 | __u8 id; | |
1317 | uint64_t offset, length; | |
11fdf7f2 TL |
1318 | decode(id, p); |
1319 | decode(offset, p); | |
1320 | decode(length, p); | |
7c673cae FG |
1321 | dout(20) << __func__ << " 0x" << std::hex << pos << std::dec |
1322 | << ": op_alloc_add " << " " << (int)id | |
1323 | << ":0x" << std::hex << offset << "~" << length << std::dec | |
1324 | << dendl; | |
11fdf7f2 TL |
1325 | if (unlikely(to_stdout)) { |
1326 | std::cout << " 0x" << std::hex << pos << std::dec | |
1327 | << ": op_alloc_add " << " " << (int)id | |
1328 | << ":0x" << std::hex << offset << "~" << length << std::dec | |
1329 | << std::endl; | |
1330 | } | |
7c673cae FG |
1331 | if (!noop) { |
1332 | block_all[id].insert(offset, length); | |
9f95a23c TL |
1333 | _adjust_granularity(id, &offset, &length, true); |
1334 | if (length) { | |
1335 | alloc[id]->init_add_free(offset, length); | |
1336 | } | |
1337 | ||
1338 | if (cct->_conf->bluefs_log_replay_check_allocations) { | |
1339 | bool fail = false; | |
1340 | apply_for_bitset_range(offset, length, alloc_size[id], owned_blocks[id], | |
1341 | [&](uint64_t pos, boost::dynamic_bitset<uint64_t> &bs) { | |
1342 | if (bs.test(pos)) { | |
1343 | fail = true; | |
1344 | } else { | |
1345 | bs.set(pos); | |
1346 | } | |
1347 | } | |
1348 | ); | |
1349 | if (fail) { | |
1350 | derr << __func__ << " invalid extent " << (int)id | |
1351 | << ": 0x" << std::hex << offset << "~" << length | |
1352 | << std::dec << ": already given" << dendl; | |
1353 | return -EFAULT; | |
1354 | } | |
1355 | apply_for_bitset_range(offset, length, alloc_size[id], used_blocks[id], | |
1356 | [&](uint64_t pos, boost::dynamic_bitset<uint64_t> &bs) { | |
1357 | if (bs.test(pos)) { | |
1358 | fail = true; | |
1359 | } | |
1360 | } | |
1361 | ); | |
1362 | if (fail) { | |
1363 | derr << __func__ << " invalid extent " << int(id) | |
1364 | << ": 0x" << std::hex << offset << "~" << length | |
1365 | << std::dec << ": already in use" << dendl; | |
1366 | return -EFAULT; | |
1367 | } | |
1368 | } | |
7c673cae FG |
1369 | } |
1370 | } | |
1371 | break; | |
1372 | ||
1373 | case bluefs_transaction_t::OP_ALLOC_RM: | |
1374 | { | |
1375 | __u8 id; | |
1376 | uint64_t offset, length; | |
11fdf7f2 TL |
1377 | decode(id, p); |
1378 | decode(offset, p); | |
1379 | decode(length, p); | |
7c673cae FG |
1380 | dout(20) << __func__ << " 0x" << std::hex << pos << std::dec |
1381 | << ": op_alloc_rm " << " " << (int)id | |
1382 | << ":0x" << std::hex << offset << "~" << length << std::dec | |
1383 | << dendl; | |
11fdf7f2 TL |
1384 | if (unlikely(to_stdout)) { |
1385 | std::cout << " 0x" << std::hex << pos << std::dec | |
1386 | << ": op_alloc_rm " << " " << (int)id | |
1387 | << ":0x" << std::hex << offset << "~" << length << std::dec | |
1388 | << std::endl; | |
1389 | } | |
7c673cae FG |
1390 | if (!noop) { |
1391 | block_all[id].erase(offset, length); | |
9f95a23c TL |
1392 | _adjust_granularity(id, &offset, &length, false); |
1393 | if (length) { | |
1394 | alloc[id]->init_rm_free(offset, length); | |
1395 | } | |
1396 | if (cct->_conf->bluefs_log_replay_check_allocations) { | |
1397 | bool fail = false; | |
1398 | apply_for_bitset_range(offset, length, alloc_size[id], owned_blocks[id], | |
1399 | [&](uint64_t pos, boost::dynamic_bitset<uint64_t> &bs) { | |
1400 | if (!bs.test(pos)) { | |
1401 | fail = true; | |
1402 | } else { | |
1403 | bs.reset(pos); | |
1404 | } | |
1405 | } | |
1406 | ); | |
1407 | if (fail) { | |
1408 | derr << __func__ << " invalid extent " << int(id) | |
1409 | << ": 0x" << std::hex << offset << "~" << length | |
1410 | << std::dec << ": wasn't given" << dendl; | |
1411 | return -EFAULT; | |
1412 | } | |
1413 | ||
1414 | apply_for_bitset_range(offset, length, alloc_size[id], used_blocks[id], | |
1415 | [&](uint64_t pos, boost::dynamic_bitset<uint64_t> &bs) { | |
1416 | if (bs.test(pos)) { | |
1417 | fail = true; | |
1418 | } | |
1419 | } | |
1420 | ); | |
1421 | if (fail) { | |
1422 | derr << __func__ << " invalid extent " << (int)id | |
1423 | << ": 0x" << std::hex << offset << "~" << length | |
1424 | << std::dec << ": still in use" << dendl; | |
1425 | return -EFAULT; | |
1426 | } | |
1427 | } | |
1428 | } | |
7c673cae FG |
1429 | } |
1430 | break; | |
1431 | ||
1432 | case bluefs_transaction_t::OP_DIR_LINK: | |
1433 | { | |
1434 | string dirname, filename; | |
1435 | uint64_t ino; | |
11fdf7f2 TL |
1436 | decode(dirname, p); |
1437 | decode(filename, p); | |
1438 | decode(ino, p); | |
7c673cae FG |
1439 | dout(20) << __func__ << " 0x" << std::hex << pos << std::dec |
1440 | << ": op_dir_link " << " " << dirname << "/" << filename | |
1441 | << " to " << ino | |
1442 | << dendl; | |
11fdf7f2 TL |
1443 | if (unlikely(to_stdout)) { |
1444 | std::cout << " 0x" << std::hex << pos << std::dec | |
1445 | << ": op_dir_link " << " " << dirname << "/" << filename | |
1446 | << " to " << ino | |
1447 | << std::endl; | |
1448 | } | |
1449 | ||
7c673cae FG |
1450 | if (!noop) { |
1451 | FileRef file = _get_file(ino); | |
11fdf7f2 | 1452 | ceph_assert(file->fnode.ino); |
7c673cae | 1453 | map<string,DirRef>::iterator q = dir_map.find(dirname); |
11fdf7f2 | 1454 | ceph_assert(q != dir_map.end()); |
7c673cae | 1455 | map<string,FileRef>::iterator r = q->second->file_map.find(filename); |
11fdf7f2 | 1456 | ceph_assert(r == q->second->file_map.end()); |
9f95a23c TL |
1457 | |
1458 | vselector->sub_usage(file->vselector_hint, file->fnode); | |
1459 | file->vselector_hint = | |
1460 | vselector->get_hint_by_dir(dirname); | |
1461 | vselector->add_usage(file->vselector_hint, file->fnode); | |
1462 | ||
7c673cae FG |
1463 | q->second->file_map[filename] = file; |
1464 | ++file->refs; | |
1465 | } | |
1466 | } | |
1467 | break; | |
1468 | ||
1469 | case bluefs_transaction_t::OP_DIR_UNLINK: | |
1470 | { | |
1471 | string dirname, filename; | |
11fdf7f2 TL |
1472 | decode(dirname, p); |
1473 | decode(filename, p); | |
7c673cae FG |
1474 | dout(20) << __func__ << " 0x" << std::hex << pos << std::dec |
1475 | << ": op_dir_unlink " << " " << dirname << "/" << filename | |
1476 | << dendl; | |
11fdf7f2 TL |
1477 | if (unlikely(to_stdout)) { |
1478 | std::cout << " 0x" << std::hex << pos << std::dec | |
1479 | << ": op_dir_unlink " << " " << dirname << "/" << filename | |
1480 | << std::endl; | |
1481 | } | |
1482 | ||
7c673cae FG |
1483 | if (!noop) { |
1484 | map<string,DirRef>::iterator q = dir_map.find(dirname); | |
11fdf7f2 | 1485 | ceph_assert(q != dir_map.end()); |
7c673cae | 1486 | map<string,FileRef>::iterator r = q->second->file_map.find(filename); |
11fdf7f2 TL |
1487 | ceph_assert(r != q->second->file_map.end()); |
1488 | ceph_assert(r->second->refs > 0); | |
7c673cae FG |
1489 | --r->second->refs; |
1490 | q->second->file_map.erase(r); | |
1491 | } | |
1492 | } | |
1493 | break; | |
1494 | ||
1495 | case bluefs_transaction_t::OP_DIR_CREATE: | |
1496 | { | |
1497 | string dirname; | |
11fdf7f2 | 1498 | decode(dirname, p); |
7c673cae FG |
1499 | dout(20) << __func__ << " 0x" << std::hex << pos << std::dec |
1500 | << ": op_dir_create " << dirname << dendl; | |
11fdf7f2 TL |
1501 | if (unlikely(to_stdout)) { |
1502 | std::cout << " 0x" << std::hex << pos << std::dec | |
1503 | << ": op_dir_create " << dirname << std::endl; | |
1504 | } | |
1505 | ||
7c673cae FG |
1506 | if (!noop) { |
1507 | map<string,DirRef>::iterator q = dir_map.find(dirname); | |
11fdf7f2 | 1508 | ceph_assert(q == dir_map.end()); |
9f95a23c | 1509 | dir_map[dirname] = ceph::make_ref<Dir>(); |
7c673cae FG |
1510 | } |
1511 | } | |
1512 | break; | |
1513 | ||
1514 | case bluefs_transaction_t::OP_DIR_REMOVE: | |
1515 | { | |
1516 | string dirname; | |
11fdf7f2 | 1517 | decode(dirname, p); |
7c673cae FG |
1518 | dout(20) << __func__ << " 0x" << std::hex << pos << std::dec |
1519 | << ": op_dir_remove " << dirname << dendl; | |
11fdf7f2 TL |
1520 | if (unlikely(to_stdout)) { |
1521 | std::cout << " 0x" << std::hex << pos << std::dec | |
1522 | << ": op_dir_remove " << dirname << std::endl; | |
1523 | } | |
1524 | ||
7c673cae FG |
1525 | if (!noop) { |
1526 | map<string,DirRef>::iterator q = dir_map.find(dirname); | |
11fdf7f2 TL |
1527 | ceph_assert(q != dir_map.end()); |
1528 | ceph_assert(q->second->file_map.empty()); | |
7c673cae FG |
1529 | dir_map.erase(q); |
1530 | } | |
1531 | } | |
1532 | break; | |
1533 | ||
1534 | case bluefs_transaction_t::OP_FILE_UPDATE: | |
1535 | { | |
1536 | bluefs_fnode_t fnode; | |
11fdf7f2 | 1537 | decode(fnode, p); |
7c673cae | 1538 | dout(20) << __func__ << " 0x" << std::hex << pos << std::dec |
9f95a23c | 1539 | << ": op_file_update " << " " << fnode << " " << dendl; |
11fdf7f2 TL |
1540 | if (unlikely(to_stdout)) { |
1541 | std::cout << " 0x" << std::hex << pos << std::dec | |
1542 | << ": op_file_update " << " " << fnode << std::endl; | |
1543 | } | |
9f95a23c | 1544 | if (!noop) { |
7c673cae | 1545 | FileRef f = _get_file(fnode.ino); |
9f95a23c TL |
1546 | if (cct->_conf->bluefs_log_replay_check_allocations) { |
1547 | // check initial log layout | |
1548 | if (first_log_check) { | |
1549 | first_log_check = false; | |
1550 | int r = _check_new_allocations(log_file->fnode, | |
1551 | MAX_BDEV, owned_blocks, used_blocks); | |
1552 | if (r < 0) { | |
1553 | return r; | |
1554 | } | |
1555 | } | |
1556 | ||
1557 | auto& fnode_extents = f->fnode.extents; | |
1558 | for (auto e : fnode_extents) { | |
1559 | auto id = e.bdev; | |
1560 | if (int r = _verify_alloc_granularity(id, e.offset, e.length, | |
1561 | "OP_FILE_UPDATE"); r < 0) { | |
1562 | return r; | |
1563 | } | |
1564 | apply_for_bitset_range(e.offset, e.length, alloc_size[id], | |
1565 | used_blocks[id], | |
1566 | [&](uint64_t pos, boost::dynamic_bitset<uint64_t> &bs) { | |
1567 | ceph_assert(bs.test(pos)); | |
1568 | bs.reset(pos); | |
1569 | } | |
1570 | ); | |
1571 | } | |
1572 | } | |
1573 | ||
1574 | if (fnode.ino != 1) { | |
1575 | vselector->sub_usage(f->vselector_hint, f->fnode); | |
1576 | } | |
1577 | f->fnode = fnode; | |
1578 | if (fnode.ino != 1) { | |
1579 | vselector->add_usage(f->vselector_hint, f->fnode); | |
1580 | } | |
1581 | ||
7c673cae FG |
1582 | if (fnode.ino > ino_last) { |
1583 | ino_last = fnode.ino; | |
1584 | } | |
9f95a23c TL |
1585 | if (cct->_conf->bluefs_log_replay_check_allocations) { |
1586 | int r = _check_new_allocations(f->fnode, | |
1587 | MAX_BDEV, owned_blocks, used_blocks); | |
1588 | if (r < 0) { | |
1589 | return r; | |
1590 | } | |
1591 | } | |
7c673cae | 1592 | } |
9f95a23c | 1593 | } |
7c673cae FG |
1594 | break; |
1595 | ||
1596 | case bluefs_transaction_t::OP_FILE_REMOVE: | |
1597 | { | |
1598 | uint64_t ino; | |
11fdf7f2 | 1599 | decode(ino, p); |
7c673cae FG |
1600 | dout(20) << __func__ << " 0x" << std::hex << pos << std::dec |
1601 | << ": op_file_remove " << ino << dendl; | |
11fdf7f2 TL |
1602 | if (unlikely(to_stdout)) { |
1603 | std::cout << " 0x" << std::hex << pos << std::dec | |
1604 | << ": op_file_remove " << ino << std::endl; | |
1605 | } | |
1606 | ||
9f95a23c TL |
1607 | if (!noop) { |
1608 | auto p = file_map.find(ino); | |
1609 | ceph_assert(p != file_map.end()); | |
1610 | vselector->sub_usage(p->second->vselector_hint, p->second->fnode); | |
1611 | if (cct->_conf->bluefs_log_replay_check_allocations) { | |
1612 | auto& fnode_extents = p->second->fnode.extents; | |
1613 | for (auto e : fnode_extents) { | |
1614 | auto id = e.bdev; | |
1615 | bool fail = false; | |
1616 | apply_for_bitset_range(e.offset, e.length, alloc_size[id], owned_blocks[id], | |
1617 | [&](uint64_t pos, boost::dynamic_bitset<uint64_t> &bs) { | |
1618 | if (!bs.test(pos)) { | |
1619 | fail = true; | |
1620 | } | |
1621 | } | |
1622 | ); | |
1623 | if (fail) { | |
1624 | derr << __func__ << " invalid extent " << int(id) | |
1625 | << ": 0x" << std::hex << e.offset << "~" << e.length | |
1626 | << std::dec | |
1627 | << ": wasn't given but is allocated for removed ino " << ino | |
1628 | << dendl; | |
1629 | return -EFAULT; | |
1630 | } | |
1631 | ||
1632 | apply_for_bitset_range(e.offset, e.length, alloc_size[id], used_blocks[id], | |
1633 | [&](uint64_t pos, boost::dynamic_bitset<uint64_t> &bs) { | |
1634 | if (!bs.test(pos)) { | |
1635 | fail = true; | |
1636 | } | |
1637 | bs.reset(pos); | |
1638 | } | |
1639 | ); | |
1640 | if (fail) { | |
1641 | derr << __func__ << " invalid extent " << int(id) | |
1642 | << ": 0x" << std::hex << e.offset << "~" << e.length | |
1643 | << std::dec | |
1644 | << ": not in use but is allocated for removed ino " << ino | |
1645 | << dendl; | |
1646 | return -EFAULT; | |
1647 | } | |
1648 | } | |
1649 | } | |
1650 | file_map.erase(p); | |
1651 | } | |
1652 | } | |
7c673cae FG |
1653 | break; |
1654 | ||
1655 | default: | |
1656 | derr << __func__ << " 0x" << std::hex << pos << std::dec | |
1657 | << ": stop: unrecognized op " << (int)op << dendl; | |
1658 | delete log_reader; | |
1659 | return -EIO; | |
1660 | } | |
1661 | } | |
11fdf7f2 | 1662 | ceph_assert(p.end()); |
7c673cae FG |
1663 | |
1664 | // we successfully replayed the transaction; bump the seq and log size | |
1665 | ++log_seq; | |
1666 | log_file->fnode.size = log_reader->buf.pos; | |
1667 | } | |
9f95a23c TL |
1668 | vselector->add_usage(log_file->vselector_hint, log_file->fnode); |
1669 | ||
1670 | if (!noop && first_log_check && | |
1671 | cct->_conf->bluefs_log_replay_check_allocations) { | |
1672 | int r = _check_new_allocations(log_file->fnode, | |
1673 | MAX_BDEV, owned_blocks, used_blocks); | |
1674 | if (r < 0) { | |
1675 | return r; | |
1676 | } | |
1677 | } | |
7c673cae FG |
1678 | |
1679 | dout(10) << __func__ << " log file size was 0x" | |
1680 | << std::hex << log_file->fnode.size << std::dec << dendl; | |
11fdf7f2 TL |
1681 | if (unlikely(to_stdout)) { |
1682 | std::cout << " log file size was 0x" | |
1683 | << std::hex << log_file->fnode.size << std::dec << std::endl; | |
1684 | } | |
1685 | ||
7c673cae FG |
1686 | delete log_reader; |
1687 | ||
1688 | if (!noop) { | |
1689 | // verify file link counts are all >0 | |
1690 | for (auto& p : file_map) { | |
1691 | if (p.second->refs == 0 && | |
1692 | p.second->fnode.ino > 1) { | |
1693 | derr << __func__ << " file with link count 0: " << p.second->fnode | |
1694 | << dendl; | |
1695 | return -EIO; | |
1696 | } | |
1697 | } | |
1698 | } | |
1699 | ||
9f95a23c TL |
1700 | for (unsigned id = 0; id < MAX_BDEV; ++id) { |
1701 | dout(10) << __func__ << " block_unused_too_granular " << id << ": " | |
1702 | << block_unused_too_granular[id] << dendl; | |
1703 | } | |
7c673cae FG |
1704 | dout(10) << __func__ << " done" << dendl; |
1705 | return 0; | |
1706 | } | |
1707 | ||
11fdf7f2 TL |
1708 | int BlueFS::log_dump() |
1709 | { | |
1710 | // only dump log file's content | |
1711 | int r = _replay(true, true); | |
1712 | if (r < 0) { | |
1713 | derr << __func__ << " failed to replay log: " << cpp_strerror(r) << dendl; | |
1714 | return r; | |
1715 | } | |
1716 | ||
1717 | return 0; | |
1718 | } | |
1719 | ||
1720 | int BlueFS::device_migrate_to_existing( | |
1721 | CephContext *cct, | |
1722 | const set<int>& devs_source, | |
9f95a23c TL |
1723 | int dev_target, |
1724 | const bluefs_layout_t& layout) | |
11fdf7f2 TL |
1725 | { |
1726 | vector<byte> buf; | |
1727 | bool buffered = cct->_conf->bluefs_buffered_io; | |
1728 | ||
eafe8130 TL |
1729 | dout(10) << __func__ << " devs_source " << devs_source |
1730 | << " dev_target " << dev_target << dendl; | |
11fdf7f2 TL |
1731 | assert(dev_target < (int)MAX_BDEV); |
1732 | ||
1733 | int flags = 0; | |
1734 | flags |= devs_source.count(BDEV_DB) ? | |
1735 | (REMOVE_DB | RENAME_SLOW2DB) : 0; | |
1736 | flags |= devs_source.count(BDEV_WAL) ? REMOVE_WAL : 0; | |
1737 | int dev_target_new = dev_target; | |
1738 | ||
1739 | // Slow device without separate DB one is addressed via BDEV_DB | |
1740 | // Hence need renaming. | |
1741 | if ((flags & REMOVE_DB) && dev_target == BDEV_SLOW) { | |
1742 | dev_target_new = BDEV_DB; | |
1743 | dout(0) << __func__ << " super to be written to " << dev_target << dendl; | |
1744 | } | |
1745 | ||
9f95a23c | 1746 | for (auto& [ino, file_ref] : file_map) { |
11fdf7f2 | 1747 | //do not copy log |
9f95a23c | 1748 | if (file_ref->fnode.ino == 1) { |
11fdf7f2 TL |
1749 | continue; |
1750 | } | |
9f95a23c | 1751 | dout(10) << __func__ << " " << ino << " " << file_ref->fnode << dendl; |
eafe8130 | 1752 | |
9f95a23c | 1753 | auto& fnode_extents = file_ref->fnode.extents; |
11fdf7f2 | 1754 | |
9f95a23c TL |
1755 | bool rewrite = std::any_of( |
1756 | fnode_extents.begin(), | |
1757 | fnode_extents.end(), | |
1758 | [=](auto& ext) { | |
1759 | return ext.bdev != dev_target && devs_source.count(ext.bdev); | |
1760 | }); | |
eafe8130 TL |
1761 | if (rewrite) { |
1762 | dout(10) << __func__ << " migrating" << dendl; | |
1763 | ||
1764 | // read entire file | |
1765 | bufferlist bl; | |
1766 | for (auto old_ext : fnode_extents) { | |
1767 | buf.resize(old_ext.length); | |
1768 | int r = bdev[old_ext.bdev]->read_random( | |
1769 | old_ext.offset, | |
1770 | old_ext.length, | |
1771 | (char*)&buf.at(0), | |
1772 | buffered); | |
1773 | if (r != 0) { | |
1774 | derr << __func__ << " failed to read 0x" << std::hex | |
1775 | << old_ext.offset << "~" << old_ext.length << std::dec | |
1776 | << " from " << (int)dev_target << dendl; | |
1777 | return -EIO; | |
1778 | } | |
1779 | bl.append((char*)&buf[0], old_ext.length); | |
1780 | } | |
11fdf7f2 | 1781 | |
eafe8130 TL |
1782 | // write entire file |
1783 | PExtentVector extents; | |
1784 | auto l = _allocate_without_fallback(dev_target, bl.length(), &extents); | |
1785 | if (l < 0) { | |
1786 | derr << __func__ << " unable to allocate len 0x" << std::hex | |
1787 | << bl.length() << std::dec << " from " << (int)dev_target | |
1788 | << ": " << cpp_strerror(l) << dendl; | |
1789 | return -ENOSPC; | |
1790 | } | |
11fdf7f2 | 1791 | |
eafe8130 TL |
1792 | uint64_t off = 0; |
1793 | for (auto& i : extents) { | |
1794 | bufferlist cur; | |
1795 | uint64_t cur_len = std::min<uint64_t>(i.length, bl.length() - off); | |
1796 | ceph_assert(cur_len > 0); | |
1797 | cur.substr_of(bl, off, cur_len); | |
1798 | int r = bdev[dev_target]->write(i.offset, cur, buffered); | |
1799 | ceph_assert(r == 0); | |
1800 | off += cur_len; | |
1801 | } | |
1802 | ||
1803 | // release old extents | |
1804 | for (auto old_ext : fnode_extents) { | |
1805 | PExtentVector to_release; | |
1806 | to_release.emplace_back(old_ext.offset, old_ext.length); | |
1807 | alloc[old_ext.bdev]->release(to_release); | |
1808 | } | |
1809 | ||
1810 | // update fnode | |
1811 | fnode_extents.clear(); | |
1812 | for (auto& i : extents) { | |
1813 | fnode_extents.emplace_back(dev_target_new, i.offset, i.length); | |
1814 | } | |
1815 | } else { | |
9f95a23c TL |
1816 | for (auto& ext : fnode_extents) { |
1817 | if (dev_target != dev_target_new && ext.bdev == dev_target) { | |
eafe8130 | 1818 | dout(20) << __func__ << " " << " ... adjusting extent 0x" |
9f95a23c | 1819 | << std::hex << ext.offset << std::dec |
eafe8130 TL |
1820 | << " bdev " << dev_target << " -> " << dev_target_new |
1821 | << dendl; | |
9f95a23c | 1822 | ext.bdev = dev_target_new; |
11fdf7f2 | 1823 | } |
11fdf7f2 TL |
1824 | } |
1825 | } | |
11fdf7f2 TL |
1826 | } |
1827 | // new logging device in the current naming scheme | |
1828 | int new_log_dev_cur = bdev[BDEV_WAL] ? | |
1829 | BDEV_WAL : | |
1830 | bdev[BDEV_DB] ? BDEV_DB : BDEV_SLOW; | |
1831 | ||
1832 | // new logging device in new naming scheme | |
1833 | int new_log_dev_next = new_log_dev_cur; | |
1834 | ||
1835 | if (devs_source.count(new_log_dev_cur)) { | |
1836 | // SLOW device is addressed via BDEV_DB too hence either WAL or DB | |
1837 | new_log_dev_next = (flags & REMOVE_WAL) || !bdev[BDEV_WAL] ? | |
1838 | BDEV_DB : | |
1839 | BDEV_WAL; | |
1840 | ||
1841 | dout(0) << __func__ << " log moved from " << new_log_dev_cur | |
1842 | << " to " << new_log_dev_next << dendl; | |
1843 | ||
1844 | new_log_dev_cur = | |
1845 | (flags & REMOVE_DB) && new_log_dev_next == BDEV_DB ? | |
1846 | BDEV_SLOW : | |
1847 | new_log_dev_next; | |
1848 | } | |
1849 | ||
9f95a23c | 1850 | _rewrite_log_and_layout_sync( |
11fdf7f2 TL |
1851 | false, |
1852 | (flags & REMOVE_DB) ? BDEV_SLOW : BDEV_DB, | |
1853 | new_log_dev_cur, | |
1854 | new_log_dev_next, | |
9f95a23c TL |
1855 | flags, |
1856 | layout); | |
11fdf7f2 TL |
1857 | return 0; |
1858 | } | |
1859 | ||
1860 | int BlueFS::device_migrate_to_new( | |
1861 | CephContext *cct, | |
1862 | const set<int>& devs_source, | |
9f95a23c TL |
1863 | int dev_target, |
1864 | const bluefs_layout_t& layout) | |
11fdf7f2 TL |
1865 | { |
1866 | vector<byte> buf; | |
1867 | bool buffered = cct->_conf->bluefs_buffered_io; | |
1868 | ||
eafe8130 TL |
1869 | dout(10) << __func__ << " devs_source " << devs_source |
1870 | << " dev_target " << dev_target << dendl; | |
11fdf7f2 TL |
1871 | assert(dev_target == (int)BDEV_NEWDB || (int)BDEV_NEWWAL); |
1872 | ||
1873 | int flags = 0; | |
1874 | ||
1875 | flags |= devs_source.count(BDEV_DB) ? | |
1876 | (!bdev[BDEV_SLOW] ? RENAME_DB2SLOW: REMOVE_DB) : | |
1877 | 0; | |
1878 | flags |= devs_source.count(BDEV_WAL) ? REMOVE_WAL : 0; | |
9f95a23c | 1879 | int dev_target_new = dev_target; //FIXME: remove, makes no sense |
11fdf7f2 TL |
1880 | |
1881 | for (auto& p : file_map) { | |
1882 | //do not copy log | |
1883 | if (p.second->fnode.ino == 1) { | |
1884 | continue; | |
1885 | } | |
eafe8130 TL |
1886 | dout(10) << __func__ << " " << p.first << " " << p.second->fnode << dendl; |
1887 | ||
11fdf7f2 TL |
1888 | auto& fnode_extents = p.second->fnode.extents; |
1889 | ||
eafe8130 | 1890 | bool rewrite = false; |
11fdf7f2 | 1891 | for (auto ext_it = fnode_extents.begin(); |
eafe8130 TL |
1892 | ext_it != p.second->fnode.extents.end(); |
1893 | ++ext_it) { | |
11fdf7f2 | 1894 | if (ext_it->bdev != dev_target && devs_source.count(ext_it->bdev)) { |
eafe8130 TL |
1895 | rewrite = true; |
1896 | break; | |
1897 | } | |
1898 | } | |
1899 | if (rewrite) { | |
1900 | dout(10) << __func__ << " migrating" << dendl; | |
1901 | ||
1902 | // read entire file | |
1903 | bufferlist bl; | |
1904 | for (auto old_ext : fnode_extents) { | |
1905 | buf.resize(old_ext.length); | |
1906 | int r = bdev[old_ext.bdev]->read_random( | |
1907 | old_ext.offset, | |
1908 | old_ext.length, | |
1909 | (char*)&buf.at(0), | |
1910 | buffered); | |
1911 | if (r != 0) { | |
1912 | derr << __func__ << " failed to read 0x" << std::hex | |
1913 | << old_ext.offset << "~" << old_ext.length << std::dec | |
1914 | << " from " << (int)dev_target << dendl; | |
1915 | return -EIO; | |
11fdf7f2 | 1916 | } |
eafe8130 TL |
1917 | bl.append((char*)&buf[0], old_ext.length); |
1918 | } | |
1919 | ||
1920 | // write entire file | |
1921 | PExtentVector extents; | |
1922 | auto l = _allocate_without_fallback(dev_target, bl.length(), &extents); | |
1923 | if (l < 0) { | |
1924 | derr << __func__ << " unable to allocate len 0x" << std::hex | |
1925 | << bl.length() << std::dec << " from " << (int)dev_target | |
1926 | << ": " << cpp_strerror(l) << dendl; | |
1927 | return -ENOSPC; | |
1928 | } | |
1929 | ||
1930 | uint64_t off = 0; | |
1931 | for (auto& i : extents) { | |
1932 | bufferlist cur; | |
1933 | uint64_t cur_len = std::min<uint64_t>(i.length, bl.length() - off); | |
1934 | ceph_assert(cur_len > 0); | |
1935 | cur.substr_of(bl, off, cur_len); | |
1936 | int r = bdev[dev_target]->write(i.offset, cur, buffered); | |
1937 | ceph_assert(r == 0); | |
1938 | off += cur_len; | |
1939 | } | |
1940 | ||
1941 | // release old extents | |
1942 | for (auto old_ext : fnode_extents) { | |
1943 | PExtentVector to_release; | |
1944 | to_release.emplace_back(old_ext.offset, old_ext.length); | |
1945 | alloc[old_ext.bdev]->release(to_release); | |
1946 | } | |
1947 | ||
1948 | // update fnode | |
1949 | fnode_extents.clear(); | |
1950 | for (auto& i : extents) { | |
1951 | fnode_extents.emplace_back(dev_target_new, i.offset, i.length); | |
11fdf7f2 TL |
1952 | } |
1953 | } | |
11fdf7f2 TL |
1954 | } |
1955 | // new logging device in the current naming scheme | |
1956 | int new_log_dev_cur = | |
1957 | bdev[BDEV_NEWWAL] ? | |
1958 | BDEV_NEWWAL : | |
1959 | bdev[BDEV_WAL] && !(flags & REMOVE_WAL) ? | |
1960 | BDEV_WAL : | |
1961 | bdev[BDEV_NEWDB] ? | |
1962 | BDEV_NEWDB : | |
1963 | bdev[BDEV_DB] && !(flags & REMOVE_DB)? | |
1964 | BDEV_DB : | |
1965 | BDEV_SLOW; | |
1966 | ||
1967 | // new logging device in new naming scheme | |
1968 | int new_log_dev_next = | |
1969 | new_log_dev_cur == BDEV_NEWWAL ? | |
1970 | BDEV_WAL : | |
1971 | new_log_dev_cur == BDEV_NEWDB ? | |
1972 | BDEV_DB : | |
1973 | new_log_dev_cur; | |
1974 | ||
1975 | int super_dev = | |
1976 | dev_target == BDEV_NEWDB ? | |
1977 | BDEV_NEWDB : | |
1978 | bdev[BDEV_DB] ? | |
1979 | BDEV_DB : | |
1980 | BDEV_SLOW; | |
1981 | ||
9f95a23c | 1982 | _rewrite_log_and_layout_sync( |
11fdf7f2 TL |
1983 | false, |
1984 | super_dev, | |
1985 | new_log_dev_cur, | |
1986 | new_log_dev_next, | |
9f95a23c TL |
1987 | flags, |
1988 | layout); | |
11fdf7f2 TL |
1989 | return 0; |
1990 | } | |
1991 | ||
7c673cae FG |
1992 | BlueFS::FileRef BlueFS::_get_file(uint64_t ino) |
1993 | { | |
1994 | auto p = file_map.find(ino); | |
1995 | if (p == file_map.end()) { | |
9f95a23c | 1996 | FileRef f = ceph::make_ref<File>(); |
7c673cae FG |
1997 | file_map[ino] = f; |
1998 | dout(30) << __func__ << " ino " << ino << " = " << f | |
1999 | << " (new)" << dendl; | |
2000 | return f; | |
2001 | } else { | |
2002 | dout(30) << __func__ << " ino " << ino << " = " << p->second << dendl; | |
2003 | return p->second; | |
2004 | } | |
2005 | } | |
2006 | ||
2007 | void BlueFS::_drop_link(FileRef file) | |
2008 | { | |
2009 | dout(20) << __func__ << " had refs " << file->refs | |
2010 | << " on " << file->fnode << dendl; | |
11fdf7f2 | 2011 | ceph_assert(file->refs > 0); |
7c673cae FG |
2012 | --file->refs; |
2013 | if (file->refs == 0) { | |
2014 | dout(20) << __func__ << " destroying " << file->fnode << dendl; | |
11fdf7f2 | 2015 | ceph_assert(file->num_reading.load() == 0); |
9f95a23c | 2016 | vselector->sub_usage(file->vselector_hint, file->fnode); |
7c673cae FG |
2017 | log_t.op_file_remove(file->fnode.ino); |
2018 | for (auto& r : file->fnode.extents) { | |
2019 | pending_release[r.bdev].insert(r.offset, r.length); | |
2020 | } | |
2021 | file_map.erase(file->fnode.ino); | |
2022 | file->deleted = true; | |
94b18763 | 2023 | |
7c673cae | 2024 | if (file->dirty_seq) { |
11fdf7f2 TL |
2025 | ceph_assert(file->dirty_seq > log_seq_stable); |
2026 | ceph_assert(dirty_files.count(file->dirty_seq)); | |
7c673cae FG |
2027 | auto it = dirty_files[file->dirty_seq].iterator_to(*file); |
2028 | dirty_files[file->dirty_seq].erase(it); | |
2029 | file->dirty_seq = 0; | |
2030 | } | |
2031 | } | |
2032 | } | |
2033 | ||
// Random read of up to `len` bytes at offset `off` of the file behind `h`,
// copied into `out`.
//
// Unless h->ignore_eof is set, the request is clipped to fnode.size (a read
// that starts past EOF becomes a zero-length read).  For each chunk the
// reader's buffer (h->buf) is consulted first: when `off` lies inside the
// buffered window the bytes are copied from it, otherwise they are read
// straight from the block device into `out` without touching the buffer.
// file->num_reading is incremented on entry and decremented before return.
//
// Returns the number of bytes read (the running total `ret`).
//
// NOTE(review): the device branch writes through `out` unconditionally while
// the buffered branch guards on `out`; the parameter comment calls it
// optional -- confirm that callers always pass a buffer.
adb31ebb | 2034 | int64_t BlueFS::_read_random( |
7c673cae FG |
2035 | FileReader *h, ///< [in] read from here |
2036 | uint64_t off, ///< [in] offset | |
9f95a23c | 2037 | uint64_t len, ///< [in] this many bytes |
7c673cae FG |
2038 | char *out) ///< [out] optional: or copy it here |
2039 | { | |
494da23a TL |
2040 | auto* buf = &h->buf; |
2041 | ||
adb31ebb | 2042 | int64_t ret = 0; |
7c673cae FG |
2043 | dout(10) << __func__ << " h " << h |
2044 | << " 0x" << std::hex << off << "~" << len << std::dec | |
2045 | << " from " << h->file->fnode << dendl; | |
2046 | ||
2047 | ++h->file->num_reading; | |
2048 | ||
// clip the request to the file size unless the reader ignores EOF
2049 | if (!h->ignore_eof && | |
2050 | off + len > h->file->fnode.size) { | |
2051 | if (off > h->file->fnode.size) | |
2052 | len = 0; | |
2053 | else | |
2054 | len = h->file->fnode.size - off; | |
2055 | dout(20) << __func__ << " reaching (or past) eof, len clipped to 0x" | |
2056 | << std::hex << len << std::dec << dendl; | |
2057 | } | |
494da23a TL |
2058 | logger->inc(l_bluefs_read_random_count, 1); |
2059 | logger->inc(l_bluefs_read_random_bytes, len); | |
7c673cae | 2060 | |
494da23a | 2061 | std::shared_lock s_lock(h->lock); |
f91f0fd5 | 2062 | buf->bl.reassign_to_mempool(mempool::mempool_bluefs_file_reader); |
7c673cae | 2063 | while (len > 0) { |
494da23a TL |
// buffer miss: `off` is outside [bl_off, get_buf_end()), go to the device.
// The shared lock is released for the device read and re-acquired below
// only when more data remains to be read.
2064 | if (off < buf->bl_off || off >= buf->get_buf_end()) { |
2065 | s_lock.unlock(); | |
2066 | uint64_t x_off = 0; | |
2067 | auto p = h->file->fnode.seek(off, &x_off); | |
f6b5b4d7 | 2068 | ceph_assert(p != h->file->fnode.extents.end()); |
// read at most up to the end of the current extent
9f95a23c | 2069 | uint64_t l = std::min(p->length - x_off, len); |
adb31ebb TL |
2070 | //hard cap to 1GB |
2071 | l = std::min(l, uint64_t(1) << 30); | |
494da23a TL |
2072 | dout(20) << __func__ << " read random 0x" |
2073 | << std::hex << x_off << "~" << l << std::dec | |
2074 | << " of " << *p << dendl; | |
cd265ab1 TL |
2075 | int r; |
// bluefs_check_for_zeros selects BlueFS's own read_random() in place of
// the raw device call
2076 | if (!cct->_conf->bluefs_check_for_zeros) { | |
2077 | r = bdev[p->bdev]->read_random(p->offset + x_off, l, out, | |
2078 | cct->_conf->bluefs_buffered_io); | |
2079 | } else { | |
2080 | r = read_random(p->bdev, p->offset + x_off, l, out, | |
2081 | cct->_conf->bluefs_buffered_io); | |
2082 | } | |
494da23a TL |
2083 | ceph_assert(r == 0); |
2084 | off += l; | |
2085 | len -= l; | |
2086 | ret += l; | |
2087 | out += l; | |
2088 | ||
2089 | logger->inc(l_bluefs_read_random_disk_count, 1); | |
2090 | logger->inc(l_bluefs_read_random_disk_bytes, l); | |
2091 | if (len > 0) { | |
2092 | s_lock.lock(); | |
2093 | } | |
2094 | } else { | |
// buffer hit: serve as much as the buffer holds from `off` onwards
2095 | auto left = buf->get_buf_remaining(off); | |
adb31ebb | 2096 | int64_t r = std::min(len, left); |
494da23a TL |
2097 | logger->inc(l_bluefs_read_random_buffer_count, 1); |
2098 | logger->inc(l_bluefs_read_random_buffer_bytes, r); | |
2099 | dout(20) << __func__ << " left 0x" << std::hex << left | |
2100 | << " 0x" << off << "~" << len << std::dec | |
2101 | << dendl; | |
2102 | ||
2103 | if (out) { | |
7f7e6c64 TL |
2104 | auto p = buf->bl.begin(); |
2105 | p.seek(off - buf->bl_off); | |
2106 | p.copy(r, out); | |
494da23a TL |
2107 | out += r; |
2108 | } | |
7c673cae | 2109 | |
494da23a TL |
// level-30 debug output: hexdump of the chunk just served
2110 | dout(30) << __func__ << " result chunk (0x" |
2111 | << std::hex << r << std::dec << " bytes):\n"; | |
2112 | bufferlist t; | |
2113 | t.substr_of(buf->bl, off - buf->bl_off, r); | |
2114 | t.hexdump(*_dout); | |
2115 | *_dout << dendl; | |
2116 | ||
2117 | off += r; | |
2118 | len -= r; | |
2119 | ret += r; | |
2120 | buf->pos += r; | |
2121 | } | |
2122 | } | |
7c673cae FG |
2123 | dout(20) << __func__ << " got " << ret << dendl; |
2124 | --h->file->num_reading; | |
2125 | return ret; | |
2126 | } | |
2127 | ||
// Buffered read of up to `len` bytes at offset `off` of the file behind `h`,
// going through the reader buffer `buf`.
//
// The result is referenced into `outbl` and/or copied into `out`; when both
// are null the call only fills the buffer and is counted as a prefetch.
// Unless h->ignore_eof is set, the request is clipped to fnode.size.
// file->num_reading is incremented on entry and decremented before return.
//
// Returns the number of bytes delivered.  This can be less than requested
// when the offset is beyond the file's last extent (the loop breaks out).
adb31ebb | 2128 | int64_t BlueFS::_read( |
7c673cae FG |
2129 | FileReader *h, ///< [in] read from here |
2130 | FileReaderBuffer *buf, ///< [in] reader state | |
2131 | uint64_t off, ///< [in] offset | |
2132 | size_t len, ///< [in] this many bytes | |
2133 | bufferlist *outbl, ///< [out] optional: reference the result here | |
2134 | char *out) ///< [out] optional: or copy it here | |
2135 | { | |
// no destination at all means the caller only wants the buffer warmed up
494da23a | 2136 | bool prefetch = !outbl && !out; |
7c673cae FG |
2137 | dout(10) << __func__ << " h " << h |
2138 | << " 0x" << std::hex << off << "~" << len << std::dec | |
494da23a TL |
2139 | << " from " << h->file->fnode |
2140 | << (prefetch ? " prefetch" : "") | |
2141 | << dendl; | |
7c673cae FG |
2142 | |
2143 | ++h->file->num_reading; | |
2144 | ||
// clip the request to the file size unless the reader ignores EOF
2145 | if (!h->ignore_eof && | |
2146 | off + len > h->file->fnode.size) { | |
2147 | if (off > h->file->fnode.size) | |
2148 | len = 0; | |
2149 | else | |
2150 | len = h->file->fnode.size - off; | |
2151 | dout(20) << __func__ << " reaching (or past) eof, len clipped to 0x" | |
2152 | << std::hex << len << std::dec << dendl; | |
2153 | } | |
494da23a TL |
2154 | logger->inc(l_bluefs_read_count, 1); |
2155 | logger->inc(l_bluefs_read_bytes, len); | |
2156 | if (prefetch) { | |
2157 | logger->inc(l_bluefs_read_prefetch_count, 1); | |
2158 | logger->inc(l_bluefs_read_prefetch_bytes, len); | |
2159 | } | |
2160 | ||
7c673cae FG |
2161 | if (outbl) |
2162 | outbl->clear(); | |
2163 | ||
adb31ebb | 2164 | int64_t ret = 0; |
494da23a | 2165 | std::shared_lock s_lock(h->lock); |
7c673cae FG |
2166 | while (len > 0) { |
2167 | size_t left; | |
// buffer miss: swap the shared lock for a unique one and refill the buffer
2168 | if (off < buf->bl_off || off >= buf->get_buf_end()) { | |
494da23a TL |
2169 | s_lock.unlock(); |
2170 | std::unique_lock u_lock(h->lock); | |
f91f0fd5 | 2171 | buf->bl.reassign_to_mempool(mempool::mempool_bluefs_file_reader); |
494da23a TL |
2172 | if (off < buf->bl_off || off >= buf->get_buf_end()) { |
2173 | // if precondition hasn't changed during locking upgrade. | |
2174 | buf->bl.clear(); | |
// the buffer window starts at the block boundary at or below `off`
2175 | buf->bl_off = off & super.block_mask(); | |
2176 | uint64_t x_off = 0; | |
2177 | auto p = h->file->fnode.seek(buf->bl_off, &x_off); | |
f6b5b4d7 TL |
// offset is not covered by any extent: give up and return a short read
2178 | if (p == h->file->fnode.extents.end()) { |
2179 | dout(5) << __func__ << " reading less then required " | |
2180 | << ret << "<" << ret + len << dendl; | |
2181 | break; | |
2182 | } | |
2183 | ||
494da23a TL |
// fetch size: the block-rounded request, but at least max_prefetch;
// then limited by the extent remainder, the 1GB cap and (unless
// ignore_eof) the block-rounded end of file
2184 | uint64_t want = round_up_to(len + (off & ~super.block_mask()), |
2185 | super.block_size); | |
2186 | want = std::max(want, buf->max_prefetch); | |
2187 | uint64_t l = std::min(p->length - x_off, want); | |
adb31ebb TL |
2188 | //hard cap to 1GB |
2189 | l = std::min(l, uint64_t(1) << 30); | |
494da23a TL |
2190 | uint64_t eof_offset = round_up_to(h->file->fnode.size, super.block_size); |
2191 | if (!h->ignore_eof && | |
2192 | buf->bl_off + l > eof_offset) { | |
2193 | l = eof_offset - buf->bl_off; | |
2194 | } | |
2195 | dout(20) << __func__ << " fetching 0x" | |
2196 | << std::hex << x_off << "~" << l << std::dec | |
2197 | << " of " << *p << dendl; | |
cd265ab1 TL |
2198 | int r; |
// bluefs_check_for_zeros selects BlueFS's own read() in place of the raw
// device call
2199 | if (!cct->_conf->bluefs_check_for_zeros) { | |
2200 | r = bdev[p->bdev]->read(p->offset + x_off, l, &buf->bl, ioc[p->bdev], | |
2201 | cct->_conf->bluefs_buffered_io); | |
2202 | } else { | |
2203 | r = read(p->bdev, p->offset + x_off, l, &buf->bl, ioc[p->bdev], | |
2204 | cct->_conf->bluefs_buffered_io); | |
2205 | } | |
494da23a | 2206 | ceph_assert(r == 0); |
7c673cae | 2207 | } |
494da23a TL |
2208 | u_lock.unlock(); |
2209 | s_lock.lock(); | |
2210 | // we should recheck if buffer is valid after lock downgrade | |
2211 | continue; | |
7c673cae FG |
2212 | } |
// buffer hit: hand out as much as the buffer holds from `off` onwards
2213 | left = buf->get_buf_remaining(off); | |
2214 | dout(20) << __func__ << " left 0x" << std::hex << left | |
2215 | << " len 0x" << len << std::dec << dendl; | |
2216 | ||
adb31ebb | 2217 | int64_t r = std::min(len, left); |
7c673cae FG |
2218 | if (outbl) { |
2219 | bufferlist t; | |
2220 | t.substr_of(buf->bl, off - buf->bl_off, r); | |
2221 | outbl->claim_append(t); | |
2222 | } | |
2223 | if (out) { | |
7f7e6c64 TL |
2224 | auto p = buf->bl.begin(); |
2225 | p.seek(off - buf->bl_off); | |
2226 | p.copy(r, out); | |
7c673cae FG |
2227 | out += r; |
2228 | } | |
2229 | ||
// level-30 debug output: hexdump of the chunk just served
2230 | dout(30) << __func__ << " result chunk (0x" | |
2231 | << std::hex << r << std::dec << " bytes):\n"; | |
2232 | bufferlist t; | |
2233 | t.substr_of(buf->bl, off - buf->bl_off, r); | |
2234 | t.hexdump(*_dout); | |
2235 | *_dout << dendl; | |
2236 | ||
2237 | off += r; | |
2238 | len -= r; | |
2239 | ret += r; | |
2240 | buf->pos += r; | |
2241 | } | |
7c673cae | 2242 | dout(20) << __func__ << " got " << ret << dendl; |
// when a bufferlist was requested it must hold exactly the bytes counted
11fdf7f2 | 2243 | ceph_assert(!outbl || (int)outbl->length() == ret); |
7c673cae FG |
2244 | --h->file->num_reading; |
2245 | return ret; | |
2246 | } | |
2247 | ||
2248 | void BlueFS::_invalidate_cache(FileRef f, uint64_t offset, uint64_t length) | |
2249 | { | |
2250 | dout(10) << __func__ << " file " << f->fnode | |
2251 | << " 0x" << std::hex << offset << "~" << length << std::dec | |
2252 | << dendl; | |
2253 | if (offset & ~super.block_mask()) { | |
2254 | offset &= super.block_mask(); | |
11fdf7f2 | 2255 | length = round_up_to(length, super.block_size); |
7c673cae FG |
2256 | } |
2257 | uint64_t x_off = 0; | |
2258 | auto p = f->fnode.seek(offset, &x_off); | |
2259 | while (length > 0 && p != f->fnode.extents.end()) { | |
11fdf7f2 | 2260 | uint64_t x_len = std::min(p->length - x_off, length); |
7c673cae FG |
2261 | bdev[p->bdev]->invalidate_cache(p->offset + x_off, x_len); |
2262 | dout(20) << __func__ << " 0x" << std::hex << x_off << "~" << x_len | |
2263 | << std:: dec << " of " << *p << dendl; | |
2264 | offset += x_len; | |
2265 | length -= x_len; | |
2266 | } | |
2267 | } | |
2268 | ||
2269 | uint64_t BlueFS::_estimate_log_size() | |
2270 | { | |
2271 | int avg_dir_size = 40; // fixme | |
2272 | int avg_file_size = 12; | |
2273 | uint64_t size = 4096 * 2; | |
2274 | size += file_map.size() * (1 + sizeof(bluefs_fnode_t)); | |
2275 | for (auto& p : block_all) | |
2276 | size += p.num_intervals() * (1 + 1 + sizeof(uint64_t) * 2); | |
2277 | size += dir_map.size() + (1 + avg_dir_size); | |
2278 | size += file_map.size() * (1 + avg_dir_size + avg_file_size); | |
11fdf7f2 | 2279 | return round_up_to(size, super.block_size); |
7c673cae FG |
2280 | } |
2281 | ||
2282 | void BlueFS::compact_log() | |
2283 | { | |
f6b5b4d7 TL |
2284 | std::unique_lock<ceph::mutex> l(lock); |
2285 | if (!cct->_conf->bluefs_replay_recovery_disable_compact) { | |
2286 | if (cct->_conf->bluefs_compact_log_sync) { | |
2287 | _compact_log_sync(); | |
2288 | } else { | |
2289 | _compact_log_async(l); | |
2290 | } | |
7c673cae FG |
2291 | } |
2292 | } | |
2293 | ||
2294 | bool BlueFS::_should_compact_log() | |
2295 | { | |
2296 | uint64_t current = log_writer->file->fnode.size; | |
2297 | uint64_t expected = _estimate_log_size(); | |
2298 | float ratio = (float)current / (float)expected; | |
2299 | dout(10) << __func__ << " current 0x" << std::hex << current | |
2300 | << " expected " << expected << std::dec | |
2301 | << " ratio " << ratio | |
2302 | << (new_log ? " (async compaction in progress)" : "") | |
2303 | << dendl; | |
2304 | if (new_log || | |
2305 | current < cct->_conf->bluefs_log_compact_min_size || | |
2306 | ratio < cct->_conf->bluefs_log_compact_min_ratio) { | |
2307 | return false; | |
2308 | } | |
2309 | return true; | |
2310 | } | |
2311 | ||
11fdf7f2 TL |
2312 | void BlueFS::_compact_log_dump_metadata(bluefs_transaction_t *t, |
2313 | int flags) | |
7c673cae FG |
2314 | { |
2315 | t->seq = 1; | |
2316 | t->uuid = super.uuid; | |
2317 | dout(20) << __func__ << " op_init" << dendl; | |
2318 | ||
2319 | t->op_init(); | |
2320 | for (unsigned bdev = 0; bdev < MAX_BDEV; ++bdev) { | |
2321 | interval_set<uint64_t>& p = block_all[bdev]; | |
2322 | for (interval_set<uint64_t>::iterator q = p.begin(); q != p.end(); ++q) { | |
11fdf7f2 TL |
2323 | auto bdev_new = bdev; |
2324 | if ((flags & REMOVE_WAL) && bdev == BDEV_WAL) { | |
2325 | continue; | |
2326 | } | |
2327 | if ((flags & REMOVE_DB) && bdev == BDEV_DB) { | |
2328 | continue; | |
2329 | } | |
2330 | if ((flags & RENAME_SLOW2DB) && bdev == BDEV_SLOW) { | |
2331 | bdev_new = BDEV_DB; | |
2332 | } | |
2333 | if ((flags & RENAME_DB2SLOW) && bdev == BDEV_DB) { | |
2334 | bdev_new = BDEV_SLOW; | |
2335 | } | |
2336 | if (bdev == BDEV_NEWDB) { | |
2337 | // REMOVE_DB xor RENAME_DB | |
2338 | ceph_assert(!(flags & REMOVE_DB) != !(flags & RENAME_DB2SLOW)); | |
2339 | ceph_assert(!(flags & RENAME_SLOW2DB)); | |
2340 | bdev_new = BDEV_DB; | |
2341 | } | |
2342 | if (bdev == BDEV_NEWWAL) { | |
2343 | ceph_assert(flags & REMOVE_WAL); | |
2344 | bdev_new = BDEV_WAL; | |
2345 | } | |
2346 | dout(20) << __func__ << " op_alloc_add " << bdev_new << " 0x" | |
7c673cae FG |
2347 | << std::hex << q.get_start() << "~" << q.get_len() << std::dec |
2348 | << dendl; | |
11fdf7f2 | 2349 | t->op_alloc_add(bdev_new, q.get_start(), q.get_len()); |
7c673cae FG |
2350 | } |
2351 | } | |
9f95a23c TL |
2352 | for (auto& [ino, file_ref] : file_map) { |
2353 | if (ino == 1) | |
7c673cae | 2354 | continue; |
9f95a23c | 2355 | ceph_assert(ino > 1); |
11fdf7f2 | 2356 | |
9f95a23c | 2357 | for(auto& e : file_ref->fnode.extents) { |
11fdf7f2 TL |
2358 | auto bdev = e.bdev; |
2359 | auto bdev_new = bdev; | |
2360 | ceph_assert(!((flags & REMOVE_WAL) && bdev == BDEV_WAL)); | |
2361 | if ((flags & RENAME_SLOW2DB) && bdev == BDEV_SLOW) { | |
2362 | bdev_new = BDEV_DB; | |
2363 | } | |
2364 | if ((flags & RENAME_DB2SLOW) && bdev == BDEV_DB) { | |
2365 | bdev_new = BDEV_SLOW; | |
2366 | } | |
2367 | if (bdev == BDEV_NEWDB) { | |
2368 | // REMOVE_DB xor RENAME_DB | |
2369 | ceph_assert(!(flags & REMOVE_DB) != !(flags & RENAME_DB2SLOW)); | |
2370 | ceph_assert(!(flags & RENAME_SLOW2DB)); | |
2371 | bdev_new = BDEV_DB; | |
2372 | } | |
2373 | if (bdev == BDEV_NEWWAL) { | |
2374 | ceph_assert(flags & REMOVE_WAL); | |
2375 | bdev_new = BDEV_WAL; | |
2376 | } | |
2377 | e.bdev = bdev_new; | |
2378 | } | |
9f95a23c TL |
2379 | dout(20) << __func__ << " op_file_update " << file_ref->fnode << dendl; |
2380 | t->op_file_update(file_ref->fnode); | |
7c673cae | 2381 | } |
9f95a23c TL |
2382 | for (auto& [path, dir_ref] : dir_map) { |
2383 | dout(20) << __func__ << " op_dir_create " << path << dendl; | |
2384 | t->op_dir_create(path); | |
2385 | for (auto& [fname, file_ref] : dir_ref->file_map) { | |
2386 | dout(20) << __func__ << " op_dir_link " << path << "/" << fname | |
2387 | << " to " << file_ref->fnode.ino << dendl; | |
2388 | t->op_dir_link(path, fname, file_ref->fnode.ino); | |
7c673cae FG |
2389 | } |
2390 | } | |
2391 | } | |
2392 | ||
2393 | void BlueFS::_compact_log_sync() | |
2394 | { | |
2395 | dout(10) << __func__ << dendl; | |
9f95a23c TL |
2396 | auto prefer_bdev = |
2397 | vselector->select_prefer_bdev(log_writer->file->vselector_hint); | |
2398 | _rewrite_log_and_layout_sync(true, | |
11fdf7f2 | 2399 | BDEV_DB, |
9f95a23c TL |
2400 | prefer_bdev, |
2401 | prefer_bdev, | |
2402 | 0, | |
2403 | super.memorized_layout); | |
11fdf7f2 TL |
2404 | logger->inc(l_bluefs_log_compactions); |
2405 | } | |
2406 | ||
9f95a23c TL |
// Synchronously rebuild the log from the in-memory metadata and write a
// superblock that points at it.
//
// Steps, in order: discard the pending transaction (log_t), encode all
// metadata plus an op_jump_seq into one transaction, allocate fresh log
// extents on log_dev, replace the log writer and write + flush the new log,
// update and write the superblock, and finally queue the old log extents in
// pending_release.
//
//   allocate_with_fallback  true: allocate via _allocate();
//                           false: via _allocate_without_fallback()
//   super_dev               device id passed to _write_super()
//   log_dev                 device id the new log extents are allocated on
//   log_dev_new             device id recorded for the log extents in the
//                           superblock when it differs from log_dev
//   flags                   forwarded to _compact_log_dump_metadata()
//   layout                  stored in super.memorized_layout
//
// Allocation and flush failures are fatal (ceph_assert).
2407 | void BlueFS::_rewrite_log_and_layout_sync(bool allocate_with_fallback, |
2408 | int super_dev, | |
2409 | int log_dev, | |
2410 | int log_dev_new, | |
2411 | int flags, | |
2412 | std::optional<bluefs_layout_t> layout) | |
11fdf7f2 | 2413 | { |
7c673cae FG |
2414 | File *log_file = log_writer->file.get(); |
2415 | ||
2416 | // clear out log (be careful who calls us!!!) | |
2417 | log_t.clear(); | |
2418 | ||
11fdf7f2 TL |
2419 | dout(20) << __func__ << " super_dev:" << super_dev |
2420 | << " log_dev:" << log_dev | |
2421 | << " log_dev_new:" << log_dev_new | |
2422 | << " flags:" << flags | |
2423 | << dendl; | |
7c673cae | 2424 | bluefs_transaction_t t; |
11fdf7f2 | 2425 | _compact_log_dump_metadata(&t, flags); |
7c673cae FG |
2426 | |
2427 | dout(20) << __func__ << " op_jump_seq " << log_seq << dendl; | |
2428 | t.op_jump_seq(log_seq); | |
2429 | ||
2430 | bufferlist bl; | |
11fdf7f2 | 2431 | encode(t, bl); |
7c673cae FG |
2432 | _pad_bl(bl); |
2433 | ||
// space for the encoded metadata plus the configured log runway
2434 | uint64_t need = bl.length() + cct->_conf->bluefs_max_log_runway; | |
2435 | dout(20) << __func__ << " need " << need << dendl; | |
2436 | ||
494da23a | 2437 | bluefs_fnode_t old_fnode; |
11fdf7f2 | 2438 | int r; |
// move the current log extents into old_fnode; they are released at the end
494da23a | 2439 | log_file->fnode.swap_extents(old_fnode); |
11fdf7f2 TL |
2440 | if (allocate_with_fallback) { |
2441 | r = _allocate(log_dev, need, &log_file->fnode); | |
2442 | ceph_assert(r == 0); | |
2443 | } else { | |
2444 | PExtentVector extents; | |
2445 | r = _allocate_without_fallback(log_dev, | |
2446 | need, | |
2447 | &extents); | |
2448 | ceph_assert(r == 0); | |
2449 | for (auto& p : extents) { | |
2450 | log_file->fnode.append_extent( | |
2451 | bluefs_extent_t(log_dev, p.offset, p.length)); | |
2452 | } | |
7c673cae FG |
2453 | } |
2454 | ||
2455 | _close_writer(log_writer); | |
2456 | ||
2457 | log_file->fnode.size = bl.length(); | |
9f95a23c TL |
// account the log file's usage against its new extents instead of the old
2458 | vselector->sub_usage(log_file->vselector_hint, old_fnode); |
2459 | vselector->add_usage(log_file->vselector_hint, log_file->fnode); | |
2460 | ||
7c673cae FG |
2461 | log_writer = _create_writer(log_file); |
2462 | log_writer->append(bl); | |
11fdf7f2 TL |
2463 | r = _flush(log_writer, true); |
2464 | ceph_assert(r == 0); | |
2465 | #ifdef HAVE_LIBAIO | |
// with aio writes enabled, wait for the log write to complete
2466 | if (!cct->_conf->bluefs_sync_write) { | |
2467 | list<aio_t> completed_ios; | |
2468 | _claim_completed_aios(log_writer, &completed_ios); | |
2469 | wait_for_aio(log_writer); | |
2470 | completed_ios.clear(); | |
2471 | } | |
2472 | #endif | |
// flush the devices before the superblock is updated to reference the new log
224ce89b | 2473 | flush_bdev(); |
224ce89b | 2474 | |
9f95a23c | 2475 | super.memorized_layout = layout; |
7c673cae | 2476 | super.log_fnode = log_file->fnode; |
11fdf7f2 TL |
2477 | // rename device if needed |
2478 | if (log_dev != log_dev_new) { | |
2479 | dout(10) << __func__ << " renaming log extents to " << log_dev_new << dendl; | |
2480 | for (auto& p : super.log_fnode.extents) { | |
2481 | p.bdev = log_dev_new; | |
2482 | } | |
2483 | } | |
2484 | dout(10) << __func__ << " writing super, log fnode: " << super.log_fnode << dendl; | |
2485 | ||
7c673cae | 2486 | ++super.version; |
11fdf7f2 | 2487 | _write_super(super_dev); |
7c673cae FG |
2488 | flush_bdev(); |
2489 | ||
// the old log extents are queued for release only after the new superblock
// has been written and flushed
494da23a TL |
2490 | dout(10) << __func__ << " release old log extents " << old_fnode.extents << dendl; |
2491 | for (auto& r : old_fnode.extents) { | |
7c673cae FG |
2492 | pending_release[r.bdev].insert(r.offset, r.length); |
2493 | } | |
7c673cae FG |
2494 | } |
2495 | ||
2496 | /* | |
2497 | * 1. Allocate a new extent to continue the log, and then log an event | |
2498 | * that jumps the log write position to the new extent. At this point, the | |
2499 | * old extent(s) won't be written to, and reflect everything to compact. | |
2500 | * New events will be written to the new region that we'll keep. | |
2501 | * | |
2502 | * 2. While still holding the lock, encode a bufferlist that dumps all of the | |
2503 | * in-memory fnodes and names. This will become the new beginning of the | |
2504 | * log. The last event will jump to the log continuation extent from #1. | |
2505 | * | |
2506 | * 3. Queue a write to a new extent for the new beginning of the log. | |
2507 | * | |
2508 | * 4. Drop lock and wait | |
2509 | * | |
2510 | * 5. Retake the lock. | |
2511 | * | |
2512 | * 6. Update the log_fnode to splice in the new beginning. | |
2513 | * | |
2514 | * 7. Write the new superblock. | |
2515 | * | |
2516 | * 8. Release the old log space. Clean up. | |
2517 | */ | |
11fdf7f2 | 2518 | void BlueFS::_compact_log_async(std::unique_lock<ceph::mutex>& l) |
7c673cae FG |
2519 | { |
2520 | dout(10) << __func__ << dendl; | |
2521 | File *log_file = log_writer->file.get(); | |
11fdf7f2 TL |
2522 | ceph_assert(!new_log); |
2523 | ceph_assert(!new_log_writer); | |
7c673cae | 2524 | |
181888fb FG |
2525 | // create a new log [writer] so that we know compaction is in progress |
2526 | // (see _should_compact_log) | |
9f95a23c | 2527 | new_log = ceph::make_ref<File>(); |
181888fb FG |
2528 | new_log->fnode.ino = 0; // so that _flush_range won't try to log the fnode |
2529 | ||
3efd9988 FG |
2530 | // 0. wait for any racing flushes to complete. (We do not want to block |
2531 | // in _flush_sync_log with jump_to set or else a racing thread might flush | |
2532 | // our entries and our jump_to update won't be correct.) | |
2533 | while (log_flushing) { | |
2534 | dout(10) << __func__ << " log is currently flushing, waiting" << dendl; | |
2535 | log_cond.wait(l); | |
2536 | } | |
2537 | ||
9f95a23c TL |
2538 | vselector->sub_usage(log_file->vselector_hint, log_file->fnode); |
2539 | ||
7c673cae FG |
2540 | // 1. allocate new log space and jump to it. |
2541 | old_log_jump_to = log_file->fnode.get_allocated(); | |
7c673cae | 2542 | dout(10) << __func__ << " old_log_jump_to 0x" << std::hex << old_log_jump_to |
11fdf7f2 | 2543 | << " need 0x" << (old_log_jump_to + cct->_conf->bluefs_max_log_runway) << std::dec << dendl; |
9f95a23c TL |
2544 | int r = _allocate(vselector->select_prefer_bdev(log_file->vselector_hint), |
2545 | cct->_conf->bluefs_max_log_runway, | |
2546 | &log_file->fnode); | |
11fdf7f2 | 2547 | ceph_assert(r == 0); |
9f95a23c TL |
2548 | //adjust usage as flush below will need it |
2549 | vselector->add_usage(log_file->vselector_hint, log_file->fnode); | |
7c673cae FG |
2550 | dout(10) << __func__ << " log extents " << log_file->fnode.extents << dendl; |
2551 | ||
2552 | // update the log file change and log a jump to the offset where we want to | |
2553 | // write the new entries | |
2554 | log_t.op_file_update(log_file->fnode); | |
2555 | log_t.op_jump(log_seq, old_log_jump_to); | |
2556 | ||
2557 | flush_bdev(); // FIXME? | |
2558 | ||
2559 | _flush_and_sync_log(l, 0, old_log_jump_to); | |
2560 | ||
2561 | // 2. prepare compacted log | |
2562 | bluefs_transaction_t t; | |
224ce89b WB |
2563 | //avoid record two times in log_t and _compact_log_dump_metadata. |
2564 | log_t.clear(); | |
11fdf7f2 | 2565 | _compact_log_dump_metadata(&t, 0); |
7c673cae | 2566 | |
eafe8130 TL |
2567 | uint64_t max_alloc_size = std::max(alloc_size[BDEV_WAL], |
2568 | std::max(alloc_size[BDEV_DB], | |
2569 | alloc_size[BDEV_SLOW])); | |
2570 | ||
7c673cae | 2571 | // conservative estimate for final encoded size |
11fdf7f2 | 2572 | new_log_jump_to = round_up_to(t.op_bl.length() + super.block_size * 2, |
eafe8130 | 2573 | max_alloc_size); |
7c673cae FG |
2574 | t.op_jump(log_seq, new_log_jump_to); |
2575 | ||
11fdf7f2 | 2576 | // allocate |
9f95a23c | 2577 | //FIXME: check if we want DB here? |
11fdf7f2 TL |
2578 | r = _allocate(BlueFS::BDEV_DB, new_log_jump_to, |
2579 | &new_log->fnode); | |
2580 | ceph_assert(r == 0); | |
2581 | ||
2582 | // we might have some more ops in log_t due to _allocate call | |
2583 | t.claim_ops(log_t); | |
2584 | ||
7c673cae | 2585 | bufferlist bl; |
11fdf7f2 | 2586 | encode(t, bl); |
7c673cae FG |
2587 | _pad_bl(bl); |
2588 | ||
2589 | dout(10) << __func__ << " new_log_jump_to 0x" << std::hex << new_log_jump_to | |
2590 | << std::dec << dendl; | |
2591 | ||
7c673cae FG |
2592 | new_log_writer = _create_writer(new_log); |
2593 | new_log_writer->append(bl); | |
2594 | ||
2595 | // 3. flush | |
2596 | r = _flush(new_log_writer, true); | |
11fdf7f2 | 2597 | ceph_assert(r == 0); |
7c673cae FG |
2598 | |
2599 | // 4. wait | |
11fdf7f2 | 2600 | _flush_bdev_safely(new_log_writer); |
7c673cae | 2601 | |
11fdf7f2 | 2602 | // 5. update our log fnode |
7c673cae | 2603 | // discard first old_log_jump_to extents |
9f95a23c | 2604 | |
7c673cae FG |
2605 | dout(10) << __func__ << " remove 0x" << std::hex << old_log_jump_to << std::dec |
2606 | << " of " << log_file->fnode.extents << dendl; | |
2607 | uint64_t discarded = 0; | |
2608 | mempool::bluefs::vector<bluefs_extent_t> old_extents; | |
2609 | while (discarded < old_log_jump_to) { | |
11fdf7f2 | 2610 | ceph_assert(!log_file->fnode.extents.empty()); |
7c673cae FG |
2611 | bluefs_extent_t& e = log_file->fnode.extents.front(); |
2612 | bluefs_extent_t temp = e; | |
2613 | if (discarded + e.length <= old_log_jump_to) { | |
2614 | dout(10) << __func__ << " remove old log extent " << e << dendl; | |
2615 | discarded += e.length; | |
94b18763 | 2616 | log_file->fnode.pop_front_extent(); |
7c673cae FG |
2617 | } else { |
2618 | dout(10) << __func__ << " remove front of old log extent " << e << dendl; | |
2619 | uint64_t drop = old_log_jump_to - discarded; | |
2620 | temp.length = drop; | |
2621 | e.offset += drop; | |
2622 | e.length -= drop; | |
2623 | discarded += drop; | |
2624 | dout(10) << __func__ << " kept " << e << " removed " << temp << dendl; | |
2625 | } | |
2626 | old_extents.push_back(temp); | |
2627 | } | |
94b18763 FG |
2628 | auto from = log_file->fnode.extents.begin(); |
2629 | auto to = log_file->fnode.extents.end(); | |
2630 | while (from != to) { | |
2631 | new_log->fnode.append_extent(*from); | |
2632 | ++from; | |
2633 | } | |
7c673cae | 2634 | |
9f95a23c TL |
2635 | vselector->sub_usage(log_file->vselector_hint, log_file->fnode); |
2636 | ||
7c673cae | 2637 | // clear the extents from old log file, they are added to new log |
94b18763 | 2638 | log_file->fnode.clear_extents(); |
7c673cae | 2639 | // swap the log files. New log file is the log file now. |
94b18763 FG |
2640 | new_log->fnode.swap_extents(log_file->fnode); |
2641 | ||
7c673cae FG |
2642 | log_writer->pos = log_writer->file->fnode.size = |
2643 | log_writer->pos - old_log_jump_to + new_log_jump_to; | |
2644 | ||
9f95a23c TL |
2645 | vselector->add_usage(log_file->vselector_hint, log_file->fnode); |
2646 | ||
11fdf7f2 | 2647 | // 6. write the super block to reflect the changes |
7c673cae FG |
2648 | dout(10) << __func__ << " writing super" << dendl; |
2649 | super.log_fnode = log_file->fnode; | |
2650 | ++super.version; | |
11fdf7f2 | 2651 | _write_super(BDEV_DB); |
7c673cae FG |
2652 | |
2653 | lock.unlock(); | |
2654 | flush_bdev(); | |
2655 | lock.lock(); | |
2656 | ||
11fdf7f2 | 2657 | // 7. release old space |
7c673cae FG |
2658 | dout(10) << __func__ << " release old log extents " << old_extents << dendl; |
2659 | for (auto& r : old_extents) { | |
2660 | pending_release[r.bdev].insert(r.offset, r.length); | |
2661 | } | |
2662 | ||
2663 | // delete the new log, remove from the dirty files list | |
2664 | _close_writer(new_log_writer); | |
2665 | if (new_log->dirty_seq) { | |
11fdf7f2 | 2666 | ceph_assert(dirty_files.count(new_log->dirty_seq)); |
7c673cae FG |
2667 | auto it = dirty_files[new_log->dirty_seq].iterator_to(*new_log); |
2668 | dirty_files[new_log->dirty_seq].erase(it); | |
2669 | } | |
2670 | new_log_writer = nullptr; | |
2671 | new_log = nullptr; | |
2672 | log_cond.notify_all(); | |
2673 | ||
2674 | dout(10) << __func__ << " log extents " << log_file->fnode.extents << dendl; | |
2675 | logger->inc(l_bluefs_log_compactions); | |
2676 | } | |
2677 | ||
2678 | void BlueFS::_pad_bl(bufferlist& bl) | |
2679 | { | |
2680 | uint64_t partial = bl.length() % super.block_size; | |
2681 | if (partial) { | |
2682 | dout(10) << __func__ << " padding with 0x" << std::hex | |
2683 | << super.block_size - partial << " zeros" << std::dec << dendl; | |
2684 | bl.append_zero(super.block_size - partial); | |
2685 | } | |
2686 | } | |
2687 | ||
7c673cae | 2688 | |
11fdf7f2 | 2689 | int BlueFS::_flush_and_sync_log(std::unique_lock<ceph::mutex>& l, |
7c673cae FG |
2690 | uint64_t want_seq, |
2691 | uint64_t jump_to) | |
2692 | { | |
2693 | while (log_flushing) { | |
2694 | dout(10) << __func__ << " want_seq " << want_seq | |
2695 | << " log is currently flushing, waiting" << dendl; | |
11fdf7f2 | 2696 | ceph_assert(!jump_to); |
7c673cae FG |
2697 | log_cond.wait(l); |
2698 | } | |
2699 | if (want_seq && want_seq <= log_seq_stable) { | |
2700 | dout(10) << __func__ << " want_seq " << want_seq << " <= log_seq_stable " | |
2701 | << log_seq_stable << ", done" << dendl; | |
11fdf7f2 | 2702 | ceph_assert(!jump_to); |
7c673cae FG |
2703 | return 0; |
2704 | } | |
2705 | if (log_t.empty() && dirty_files.empty()) { | |
2706 | dout(10) << __func__ << " want_seq " << want_seq | |
2707 | << " " << log_t << " not dirty, dirty_files empty, no-op" << dendl; | |
11fdf7f2 | 2708 | ceph_assert(!jump_to); |
7c673cae FG |
2709 | return 0; |
2710 | } | |
2711 | ||
a8e16298 TL |
2712 | vector<interval_set<uint64_t>> to_release(pending_release.size()); |
2713 | to_release.swap(pending_release); | |
2714 | ||
7c673cae | 2715 | uint64_t seq = log_t.seq = ++log_seq; |
11fdf7f2 | 2716 | ceph_assert(want_seq == 0 || want_seq <= seq); |
7c673cae FG |
2717 | log_t.uuid = super.uuid; |
2718 | ||
2719 | // log dirty files | |
2720 | auto lsi = dirty_files.find(seq); | |
2721 | if (lsi != dirty_files.end()) { | |
2722 | dout(20) << __func__ << " " << lsi->second.size() << " dirty_files" << dendl; | |
2723 | for (auto &f : lsi->second) { | |
2724 | dout(20) << __func__ << " op_file_update " << f.fnode << dendl; | |
2725 | log_t.op_file_update(f.fnode); | |
2726 | } | |
2727 | } | |
2728 | ||
2729 | dout(10) << __func__ << " " << log_t << dendl; | |
11fdf7f2 | 2730 | ceph_assert(!log_t.empty()); |
7c673cae FG |
2731 | |
2732 | // allocate some more space (before we run out)? | |
2733 | int64_t runway = log_writer->file->fnode.get_allocated() - | |
2734 | log_writer->get_effective_write_pos(); | |
f6b5b4d7 | 2735 | bool just_expanded_log = false; |
7c673cae FG |
2736 | if (runway < (int64_t)cct->_conf->bluefs_min_log_runway) { |
2737 | dout(10) << __func__ << " allocating more log runway (0x" | |
2738 | << std::hex << runway << std::dec << " remaining)" << dendl; | |
2739 | while (new_log_writer) { | |
2740 | dout(10) << __func__ << " waiting for async compaction" << dendl; | |
2741 | log_cond.wait(l); | |
2742 | } | |
9f95a23c TL |
2743 | vselector->sub_usage(log_writer->file->vselector_hint, log_writer->file->fnode); |
2744 | int r = _allocate( | |
2745 | vselector->select_prefer_bdev(log_writer->file->vselector_hint), | |
2746 | cct->_conf->bluefs_max_log_runway, | |
2747 | &log_writer->file->fnode); | |
11fdf7f2 | 2748 | ceph_assert(r == 0); |
9f95a23c | 2749 | vselector->add_usage(log_writer->file->vselector_hint, log_writer->file->fnode); |
7c673cae | 2750 | log_t.op_file_update(log_writer->file->fnode); |
f6b5b4d7 | 2751 | just_expanded_log = true; |
7c673cae FG |
2752 | } |
2753 | ||
2754 | bufferlist bl; | |
11fdf7f2 TL |
2755 | bl.reserve(super.block_size); |
2756 | encode(log_t, bl); | |
7c673cae | 2757 | // pad to block boundary |
11fdf7f2 TL |
2758 | size_t realign = super.block_size - (bl.length() % super.block_size); |
2759 | if (realign && realign != super.block_size) | |
2760 | bl.append_zero(realign); | |
2761 | ||
7c673cae FG |
2762 | logger->inc(l_bluefs_logged_bytes, bl.length()); |
2763 | ||
f6b5b4d7 TL |
2764 | if (just_expanded_log) { |
2765 | ceph_assert(bl.length() <= runway); // if we write this, we will have an unrecoverable data loss | |
2766 | } | |
2767 | ||
7c673cae FG |
2768 | log_writer->append(bl); |
2769 | ||
2770 | log_t.clear(); | |
2771 | log_t.seq = 0; // just so debug output is less confusing | |
2772 | log_flushing = true; | |
2773 | ||
2774 | int r = _flush(log_writer, true); | |
11fdf7f2 | 2775 | ceph_assert(r == 0); |
7c673cae FG |
2776 | |
2777 | if (jump_to) { | |
2778 | dout(10) << __func__ << " jumping log offset from 0x" << std::hex | |
2779 | << log_writer->pos << " -> 0x" << jump_to << std::dec << dendl; | |
2780 | log_writer->pos = jump_to; | |
9f95a23c | 2781 | vselector->sub_usage(log_writer->file->vselector_hint, log_writer->file->fnode.size); |
7c673cae | 2782 | log_writer->file->fnode.size = jump_to; |
9f95a23c | 2783 | vselector->add_usage(log_writer->file->vselector_hint, log_writer->file->fnode.size); |
7c673cae FG |
2784 | } |
2785 | ||
2786 | _flush_bdev_safely(log_writer); | |
2787 | ||
2788 | log_flushing = false; | |
2789 | log_cond.notify_all(); | |
2790 | ||
2791 | // clean dirty files | |
2792 | if (seq > log_seq_stable) { | |
2793 | log_seq_stable = seq; | |
2794 | dout(20) << __func__ << " log_seq_stable " << log_seq_stable << dendl; | |
2795 | ||
2796 | auto p = dirty_files.begin(); | |
2797 | while (p != dirty_files.end()) { | |
2798 | if (p->first > log_seq_stable) { | |
2799 | dout(20) << __func__ << " done cleaning up dirty files" << dendl; | |
2800 | break; | |
2801 | } | |
2802 | ||
2803 | auto l = p->second.begin(); | |
2804 | while (l != p->second.end()) { | |
2805 | File *file = &*l; | |
11fdf7f2 TL |
2806 | ceph_assert(file->dirty_seq > 0); |
2807 | ceph_assert(file->dirty_seq <= log_seq_stable); | |
7c673cae FG |
2808 | dout(20) << __func__ << " cleaned file " << file->fnode << dendl; |
2809 | file->dirty_seq = 0; | |
2810 | p->second.erase(l++); | |
2811 | } | |
2812 | ||
11fdf7f2 | 2813 | ceph_assert(p->second.empty()); |
7c673cae FG |
2814 | dirty_files.erase(p++); |
2815 | } | |
2816 | } else { | |
2817 | dout(20) << __func__ << " log_seq_stable " << log_seq_stable | |
2818 | << " already >= out seq " << seq | |
2819 | << ", we lost a race against another log flush, done" << dendl; | |
2820 | } | |
a8e16298 TL |
2821 | |
2822 | for (unsigned i = 0; i < to_release.size(); ++i) { | |
2823 | if (!to_release[i].empty()) { | |
2824 | /* OK, now we have the guarantee alloc[i] won't be null. */ | |
11fdf7f2 TL |
2825 | int r = 0; |
2826 | if (cct->_conf->bdev_enable_discard && cct->_conf->bdev_async_discard) { | |
2827 | r = bdev[i]->queue_discard(to_release[i]); | |
2828 | if (r == 0) | |
2829 | continue; | |
2830 | } else if (cct->_conf->bdev_enable_discard) { | |
2831 | for (auto p = to_release[i].begin(); p != to_release[i].end(); ++p) { | |
2832 | bdev[i]->discard(p.get_start(), p.get_len()); | |
2833 | } | |
2834 | } | |
a8e16298 TL |
2835 | alloc[i]->release(to_release[i]); |
2836 | } | |
2837 | } | |
2838 | ||
7c673cae FG |
2839 | _update_logger_stats(); |
2840 | ||
2841 | return 0; | |
2842 | } | |
2843 | ||
ec96510d FG |
2844 | int BlueFS::_signal_dirty_to_log(FileWriter *h) |
2845 | { | |
2846 | h->file->fnode.mtime = ceph_clock_now(); | |
2847 | ceph_assert(h->file->fnode.ino >= 1); | |
2848 | if (h->file->dirty_seq == 0) { | |
2849 | h->file->dirty_seq = log_seq + 1; | |
2850 | dirty_files[h->file->dirty_seq].push_back(*h->file); | |
2851 | dout(20) << __func__ << " dirty_seq = " << log_seq + 1 | |
2852 | << " (was clean)" << dendl; | |
2853 | } else { | |
2854 | if (h->file->dirty_seq != log_seq + 1) { | |
2855 | // need re-dirty, erase from list first | |
2856 | ceph_assert(dirty_files.count(h->file->dirty_seq)); | |
2857 | auto it = dirty_files[h->file->dirty_seq].iterator_to(*h->file); | |
2858 | dirty_files[h->file->dirty_seq].erase(it); | |
2859 | h->file->dirty_seq = log_seq + 1; | |
2860 | dirty_files[h->file->dirty_seq].push_back(*h->file); | |
2861 | dout(20) << __func__ << " dirty_seq = " << log_seq + 1 | |
2862 | << " (was " << h->file->dirty_seq << ")" << dendl; | |
2863 | } else { | |
2864 | dout(20) << __func__ << " dirty_seq = " << log_seq + 1 | |
2865 | << " (unchanged, do nothing) " << dendl; | |
2866 | } | |
2867 | } | |
2868 | return 0; | |
2869 | } | |
2870 | ||
7c673cae FG |
2871 | int BlueFS::_flush_range(FileWriter *h, uint64_t offset, uint64_t length) |
2872 | { | |
2873 | dout(10) << __func__ << " " << h << " pos 0x" << std::hex << h->pos | |
2874 | << " 0x" << offset << "~" << length << std::dec | |
2875 | << " to " << h->file->fnode << dendl; | |
7f7e6c64 TL |
2876 | if (h->file->deleted) { |
2877 | dout(10) << __func__ << " deleted, no-op" << dendl; | |
2878 | return 0; | |
2879 | } | |
2880 | ||
11fdf7f2 | 2881 | ceph_assert(h->file->num_readers.load() == 0); |
7c673cae FG |
2882 | |
2883 | h->buffer_appender.flush(); | |
2884 | ||
2885 | bool buffered; | |
2886 | if (h->file->fnode.ino == 1) | |
2887 | buffered = false; | |
2888 | else | |
2889 | buffered = cct->_conf->bluefs_buffered_io; | |
2890 | ||
2891 | if (offset + length <= h->pos) | |
2892 | return 0; | |
2893 | if (offset < h->pos) { | |
2894 | length -= h->pos - offset; | |
2895 | offset = h->pos; | |
2896 | dout(10) << " still need 0x" | |
2897 | << std::hex << offset << "~" << length << std::dec | |
2898 | << dendl; | |
2899 | } | |
11fdf7f2 | 2900 | ceph_assert(offset <= h->file->fnode.size); |
7c673cae FG |
2901 | |
2902 | uint64_t allocated = h->file->fnode.get_allocated(); | |
9f95a23c | 2903 | vselector->sub_usage(h->file->vselector_hint, h->file->fnode); |
7c673cae FG |
2904 | // do not bother to dirty the file if we are overwriting |
2905 | // previously allocated extents. | |
ec96510d | 2906 | |
7c673cae FG |
2907 | if (allocated < offset + length) { |
2908 | // we should never run out of log space here; see the min runway check | |
2909 | // in _flush_and_sync_log. | |
11fdf7f2 | 2910 | ceph_assert(h->file->fnode.ino != 1); |
9f95a23c | 2911 | int r = _allocate(vselector->select_prefer_bdev(h->file->vselector_hint), |
7c673cae | 2912 | offset + length - allocated, |
94b18763 | 2913 | &h->file->fnode); |
7c673cae FG |
2914 | if (r < 0) { |
2915 | derr << __func__ << " allocated: 0x" << std::hex << allocated | |
2916 | << " offset: 0x" << offset << " length: 0x" << length << std::dec | |
2917 | << dendl; | |
9f95a23c | 2918 | vselector->add_usage(h->file->vselector_hint, h->file->fnode); // undo |
11fdf7f2 | 2919 | ceph_abort_msg("bluefs enospc"); |
7c673cae FG |
2920 | return r; |
2921 | } | |
ec96510d | 2922 | h->file->is_dirty = true; |
7c673cae FG |
2923 | } |
2924 | if (h->file->fnode.size < offset + length) { | |
2925 | h->file->fnode.size = offset + length; | |
2926 | if (h->file->fnode.ino > 1) { | |
2927 | // we do not need to dirty the log file (or it's compacting | |
2928 | // replacement) when the file size changes because replay is | |
2929 | // smart enough to discover it on its own. | |
ec96510d | 2930 | h->file->is_dirty = true; |
7c673cae FG |
2931 | } |
2932 | } | |
ec96510d | 2933 | dout(20) << __func__ << " file now, unflushed " << h->file->fnode << dendl; |
7c673cae FG |
2934 | |
2935 | uint64_t x_off = 0; | |
2936 | auto p = h->file->fnode.seek(offset, &x_off); | |
11fdf7f2 | 2937 | ceph_assert(p != h->file->fnode.extents.end()); |
7c673cae FG |
2938 | dout(20) << __func__ << " in " << *p << " x_off 0x" |
2939 | << std::hex << x_off << std::dec << dendl; | |
2940 | ||
2941 | unsigned partial = x_off & ~super.block_mask(); | |
2942 | bufferlist bl; | |
2943 | if (partial) { | |
2944 | dout(20) << __func__ << " using partial tail 0x" | |
2945 | << std::hex << partial << std::dec << dendl; | |
11fdf7f2 | 2946 | ceph_assert(h->tail_block.length() == partial); |
31f18b77 | 2947 | bl.claim_append_piecewise(h->tail_block); |
7c673cae FG |
2948 | x_off -= partial; |
2949 | offset -= partial; | |
2950 | length += partial; | |
2951 | dout(20) << __func__ << " waiting for previous aio to complete" << dendl; | |
2952 | for (auto p : h->iocv) { | |
2953 | if (p) { | |
2954 | p->aio_wait(); | |
2955 | } | |
2956 | } | |
2957 | } | |
f91f0fd5 | 2958 | if (length == partial + h->buffer.length()) { |
9f95a23c | 2959 | /* in case of inital allocation and need to zero, limited flush is unacceptable */ |
31f18b77 | 2960 | bl.claim_append_piecewise(h->buffer); |
7c673cae FG |
2961 | } else { |
2962 | bufferlist t; | |
31f18b77 FG |
2963 | h->buffer.splice(0, length, &t); |
2964 | bl.claim_append_piecewise(t); | |
7c673cae FG |
2965 | t.substr_of(h->buffer, length, h->buffer.length() - length); |
2966 | h->buffer.swap(t); | |
2967 | dout(20) << " leaving 0x" << std::hex << h->buffer.length() << std::dec | |
2968 | << " unflushed" << dendl; | |
2969 | } | |
11fdf7f2 | 2970 | ceph_assert(bl.length() == length); |
7c673cae | 2971 | |
9f95a23c TL |
2972 | h->pos = offset + length; |
2973 | ||
2974 | unsigned tail = bl.length() & ~super.block_mask(); | |
2975 | if (tail) { | |
2976 | dout(20) << __func__ << " caching tail of 0x" | |
2977 | << std::hex << tail | |
2978 | << " and padding block with 0x" << (super.block_size - tail) | |
2979 | << std::dec << dendl; | |
2980 | h->tail_block.substr_of(bl, bl.length() - tail, tail); | |
2981 | bl.append_zero(super.block_size - tail); | |
2982 | length += super.block_size - tail; | |
2983 | } else { | |
2984 | h->tail_block.clear(); | |
2985 | } | |
9f95a23c TL |
2986 | ceph_assert(bl.length() == length); |
2987 | ||
7c673cae FG |
2988 | switch (h->writer_type) { |
2989 | case WRITER_WAL: | |
2990 | logger->inc(l_bluefs_bytes_written_wal, length); | |
2991 | break; | |
2992 | case WRITER_SST: | |
2993 | logger->inc(l_bluefs_bytes_written_sst, length); | |
2994 | break; | |
2995 | } | |
2996 | ||
2997 | dout(30) << "dump:\n"; | |
2998 | bl.hexdump(*_dout); | |
2999 | *_dout << dendl; | |
3000 | ||
7c673cae | 3001 | uint64_t bloff = 0; |
11fdf7f2 | 3002 | uint64_t bytes_written_slow = 0; |
7c673cae | 3003 | while (length > 0) { |
11fdf7f2 | 3004 | uint64_t x_len = std::min(p->length - x_off, length); |
7c673cae FG |
3005 | bufferlist t; |
3006 | t.substr_of(bl, bloff, x_len); | |
7c673cae | 3007 | if (cct->_conf->bluefs_sync_write) { |
11fdf7f2 | 3008 | bdev[p->bdev]->write(p->offset + x_off, t, buffered, h->write_hint); |
7c673cae | 3009 | } else { |
11fdf7f2 TL |
3010 | bdev[p->bdev]->aio_write(p->offset + x_off, t, h->iocv[p->bdev], buffered, h->write_hint); |
3011 | } | |
3012 | h->dirty_devs[p->bdev] = true; | |
3013 | if (p->bdev == BDEV_SLOW) { | |
3014 | bytes_written_slow += t.length(); | |
7c673cae | 3015 | } |
11fdf7f2 | 3016 | |
7c673cae FG |
3017 | bloff += x_len; |
3018 | length -= x_len; | |
3019 | ++p; | |
3020 | x_off = 0; | |
3021 | } | |
11fdf7f2 | 3022 | logger->inc(l_bluefs_bytes_written_slow, bytes_written_slow); |
7c673cae FG |
3023 | for (unsigned i = 0; i < MAX_BDEV; ++i) { |
3024 | if (bdev[i]) { | |
11fdf7f2 | 3025 | if (h->iocv[i] && h->iocv[i]->has_pending_aios()) { |
7c673cae FG |
3026 | bdev[i]->aio_submit(h->iocv[i]); |
3027 | } | |
3028 | } | |
3029 | } | |
9f95a23c | 3030 | vselector->add_usage(h->file->vselector_hint, h->file->fnode); |
7c673cae FG |
3031 | dout(20) << __func__ << " h " << h << " pos now 0x" |
3032 | << std::hex << h->pos << std::dec << dendl; | |
3033 | return 0; | |
3034 | } | |
3035 | ||
11fdf7f2 | 3036 | #ifdef HAVE_LIBAIO |
7c673cae FG |
3037 | // we need to retire old completed aios so they don't stick around in |
3038 | // memory indefinitely (along with their bufferlist refs). | |
3039 | void BlueFS::_claim_completed_aios(FileWriter *h, list<aio_t> *ls) | |
3040 | { | |
3041 | for (auto p : h->iocv) { | |
3042 | if (p) { | |
3043 | ls->splice(ls->end(), p->running_aios); | |
3044 | } | |
3045 | } | |
3046 | dout(10) << __func__ << " got " << ls->size() << " aios" << dendl; | |
3047 | } | |
3048 | ||
3049 | void BlueFS::wait_for_aio(FileWriter *h) | |
3050 | { | |
3051 | // NOTE: this is safe to call without a lock, as long as our reference is | |
3052 | // stable. | |
3053 | dout(10) << __func__ << " " << h << dendl; | |
3054 | utime_t start = ceph_clock_now(); | |
3055 | for (auto p : h->iocv) { | |
3056 | if (p) { | |
3057 | p->aio_wait(); | |
3058 | } | |
3059 | } | |
11fdf7f2 | 3060 | dout(10) << __func__ << " " << h << " done in " << (ceph_clock_now() - start) << dendl; |
7c673cae | 3061 | } |
11fdf7f2 | 3062 | #endif |
7c673cae | 3063 | |
f6b5b4d7 TL |
3064 | int BlueFS::_flush(FileWriter *h, bool force, std::unique_lock<ceph::mutex>& l) |
3065 | { | |
3066 | bool flushed = false; | |
3067 | int r = _flush(h, force, &flushed); | |
3068 | if (r == 0 && flushed) { | |
3069 | _maybe_compact_log(l); | |
3070 | } | |
3071 | return r; | |
3072 | } | |
3073 | ||
3074 | int BlueFS::_flush(FileWriter *h, bool force, bool *flushed) | |
7c673cae FG |
3075 | { |
3076 | h->buffer_appender.flush(); | |
3077 | uint64_t length = h->buffer.length(); | |
3078 | uint64_t offset = h->pos; | |
f6b5b4d7 TL |
3079 | if (flushed) { |
3080 | *flushed = false; | |
3081 | } | |
7c673cae FG |
3082 | if (!force && |
3083 | length < cct->_conf->bluefs_min_flush_size) { | |
3084 | dout(10) << __func__ << " " << h << " ignoring, length " << length | |
3085 | << " < min_flush_size " << cct->_conf->bluefs_min_flush_size | |
3086 | << dendl; | |
3087 | return 0; | |
3088 | } | |
3089 | if (length == 0) { | |
3090 | dout(10) << __func__ << " " << h << " no dirty data on " | |
3091 | << h->file->fnode << dendl; | |
3092 | return 0; | |
3093 | } | |
3094 | dout(10) << __func__ << " " << h << " 0x" | |
3095 | << std::hex << offset << "~" << length << std::dec | |
3096 | << " to " << h->file->fnode << dendl; | |
11fdf7f2 | 3097 | ceph_assert(h->pos <= h->file->fnode.size); |
f6b5b4d7 TL |
3098 | int r = _flush_range(h, offset, length); |
3099 | if (flushed) { | |
3100 | *flushed = true; | |
3101 | } | |
3102 | return r; | |
7c673cae FG |
3103 | } |
3104 | ||
3105 | int BlueFS::_truncate(FileWriter *h, uint64_t offset) | |
3106 | { | |
3107 | dout(10) << __func__ << " 0x" << std::hex << offset << std::dec | |
3108 | << " file " << h->file->fnode << dendl; | |
3109 | if (h->file->deleted) { | |
3110 | dout(10) << __func__ << " deleted, no-op" << dendl; | |
3111 | return 0; | |
3112 | } | |
3113 | ||
3114 | // we never truncate internal log files | |
11fdf7f2 | 3115 | ceph_assert(h->file->fnode.ino > 1); |
7c673cae FG |
3116 | |
3117 | h->buffer_appender.flush(); | |
3118 | ||
3119 | // truncate off unflushed data? | |
3120 | if (h->pos < offset && | |
3121 | h->pos + h->buffer.length() > offset) { | |
3122 | bufferlist t; | |
3123 | dout(20) << __func__ << " tossing out last " << offset - h->pos | |
3124 | << " unflushed bytes" << dendl; | |
3125 | t.substr_of(h->buffer, 0, offset - h->pos); | |
3126 | h->buffer.swap(t); | |
11fdf7f2 | 3127 | ceph_abort_msg("actually this shouldn't happen"); |
7c673cae FG |
3128 | } |
3129 | if (h->buffer.length()) { | |
3130 | int r = _flush(h, true); | |
3131 | if (r < 0) | |
3132 | return r; | |
3133 | } | |
3134 | if (offset == h->file->fnode.size) { | |
3135 | return 0; // no-op! | |
3136 | } | |
3137 | if (offset > h->file->fnode.size) { | |
11fdf7f2 | 3138 | ceph_abort_msg("truncate up not supported"); |
7c673cae | 3139 | } |
11fdf7f2 | 3140 | ceph_assert(h->file->fnode.size >= offset); |
9f95a23c | 3141 | vselector->sub_usage(h->file->vselector_hint, h->file->fnode.size); |
7c673cae | 3142 | h->file->fnode.size = offset; |
9f95a23c | 3143 | vselector->add_usage(h->file->vselector_hint, h->file->fnode.size); |
7c673cae FG |
3144 | log_t.op_file_update(h->file->fnode); |
3145 | return 0; | |
3146 | } | |
3147 | ||
11fdf7f2 | 3148 | int BlueFS::_fsync(FileWriter *h, std::unique_lock<ceph::mutex>& l) |
7c673cae FG |
3149 | { |
3150 | dout(10) << __func__ << " " << h << " " << h->file->fnode << dendl; | |
3151 | int r = _flush(h, true); | |
3152 | if (r < 0) | |
3153 | return r; | |
ec96510d FG |
3154 | if (h->file->is_dirty) { |
3155 | _signal_dirty_to_log(h); | |
3156 | h->file->is_dirty = false; | |
3157 | } | |
7c673cae FG |
3158 | uint64_t old_dirty_seq = h->file->dirty_seq; |
3159 | ||
3160 | _flush_bdev_safely(h); | |
3161 | ||
3162 | if (old_dirty_seq) { | |
3163 | uint64_t s = log_seq; | |
3164 | dout(20) << __func__ << " file metadata was dirty (" << old_dirty_seq | |
3165 | << ") on " << h->file->fnode << ", flushing log" << dendl; | |
3166 | _flush_and_sync_log(l, old_dirty_seq); | |
11fdf7f2 | 3167 | ceph_assert(h->file->dirty_seq == 0 || // cleaned |
7c673cae FG |
3168 | h->file->dirty_seq > s); // or redirtied by someone else |
3169 | } | |
3170 | return 0; | |
3171 | } | |
3172 | ||
3173 | void BlueFS::_flush_bdev_safely(FileWriter *h) | |
3174 | { | |
11fdf7f2 TL |
3175 | std::array<bool, MAX_BDEV> flush_devs = h->dirty_devs; |
3176 | h->dirty_devs.fill(false); | |
3177 | #ifdef HAVE_LIBAIO | |
7c673cae FG |
3178 | if (!cct->_conf->bluefs_sync_write) { |
3179 | list<aio_t> completed_ios; | |
3180 | _claim_completed_aios(h, &completed_ios); | |
3181 | lock.unlock(); | |
3182 | wait_for_aio(h); | |
3183 | completed_ios.clear(); | |
11fdf7f2 | 3184 | flush_bdev(flush_devs); |
7c673cae | 3185 | lock.lock(); |
11fdf7f2 TL |
3186 | } else |
3187 | #endif | |
3188 | { | |
7c673cae | 3189 | lock.unlock(); |
11fdf7f2 | 3190 | flush_bdev(flush_devs); |
7c673cae FG |
3191 | lock.lock(); |
3192 | } | |
3193 | } | |
3194 | ||
11fdf7f2 TL |
3195 | void BlueFS::flush_bdev(std::array<bool, MAX_BDEV>& dirty_bdevs) |
3196 | { | |
3197 | // NOTE: this is safe to call without a lock. | |
3198 | dout(20) << __func__ << dendl; | |
3199 | for (unsigned i = 0; i < MAX_BDEV; i++) { | |
3200 | if (dirty_bdevs[i]) | |
3201 | bdev[i]->flush(); | |
3202 | } | |
3203 | } | |
3204 | ||
7c673cae FG |
3205 | void BlueFS::flush_bdev() |
3206 | { | |
3207 | // NOTE: this is safe to call without a lock. | |
3208 | dout(20) << __func__ << dendl; | |
3209 | for (auto p : bdev) { | |
3210 | if (p) | |
3211 | p->flush(); | |
3212 | } | |
3213 | } | |
3214 | ||
eafe8130 TL |
3215 | const char* BlueFS::get_device_name(unsigned id) |
3216 | { | |
3217 | if (id >= MAX_BDEV) return "BDEV_INV"; | |
3218 | const char* names[] = {"BDEV_WAL", "BDEV_DB", "BDEV_SLOW", "BDEV_NEWWAL", "BDEV_NEWDB"}; | |
3219 | return names[id]; | |
3220 | } | |
3221 | ||
11fdf7f2 TL |
3222 | int BlueFS::_expand_slow_device(uint64_t need, PExtentVector& extents) |
3223 | { | |
3224 | int r = -ENOSPC; | |
3225 | if (slow_dev_expander) { | |
1911f103 | 3226 | auto id = _get_slow_device_id(); |
eafe8130 | 3227 | auto min_alloc_size = alloc_size[id]; |
1911f103 | 3228 | ceph_assert(id <= alloc.size() && alloc[id]); |
11fdf7f2 TL |
3229 | auto min_need = round_up_to(need, min_alloc_size); |
3230 | need = std::max(need, | |
3231 | slow_dev_expander->get_recommended_expansion_delta( | |
3232 | alloc[id]->get_free(), block_all[id].size())); | |
3233 | ||
3234 | need = round_up_to(need, min_alloc_size); | |
3235 | dout(10) << __func__ << " expanding slow device by 0x" | |
3236 | << std::hex << need << std::dec | |
3237 | << dendl; | |
3238 | r = slow_dev_expander->allocate_freespace(min_need, need, extents); | |
3239 | } | |
3240 | return r; | |
3241 | } | |
3242 | ||
3243 | int BlueFS::_allocate_without_fallback(uint8_t id, uint64_t len, | |
3244 | PExtentVector* extents) | |
3245 | { | |
3246 | dout(10) << __func__ << " len 0x" << std::hex << len << std::dec | |
3247 | << " from " << (int)id << dendl; | |
3248 | assert(id < alloc.size()); | |
11fdf7f2 TL |
3249 | if (!alloc[id]) { |
3250 | return -ENOENT; | |
3251 | } | |
3252 | extents->reserve(4); // 4 should be (more than) enough for most allocations | |
eafe8130 TL |
3253 | uint64_t min_alloc_size = alloc_size[id]; |
3254 | uint64_t left = round_up_to(len, min_alloc_size); | |
11fdf7f2 | 3255 | int64_t alloc_len = alloc[id]->allocate(left, min_alloc_size, 0, extents); |
eafe8130 TL |
3256 | if (alloc_len < 0 || alloc_len < (int64_t)left) { |
3257 | if (alloc_len > 0) { | |
11fdf7f2 TL |
3258 | alloc[id]->release(*extents); |
3259 | } | |
3260 | if (bdev[id]) | |
3261 | derr << __func__ << " failed to allocate 0x" << std::hex << left | |
3262 | << " on bdev " << (int)id | |
3263 | << ", free 0x" << alloc[id]->get_free() << std::dec << dendl; | |
3264 | else | |
3265 | derr << __func__ << " failed to allocate 0x" << std::hex << left | |
3266 | << " on bdev " << (int)id << ", dne" << std::dec << dendl; | |
3267 | if (alloc[id]) | |
3268 | alloc[id]->dump(); | |
3269 | return -ENOSPC; | |
3270 | } | |
3271 | ||
3272 | return 0; | |
3273 | } | |
3274 | ||
7c673cae | 3275 | int BlueFS::_allocate(uint8_t id, uint64_t len, |
94b18763 | 3276 | bluefs_fnode_t* node) |
7c673cae FG |
3277 | { |
3278 | dout(10) << __func__ << " len 0x" << std::hex << len << std::dec | |
3279 | << " from " << (int)id << dendl; | |
11fdf7f2 | 3280 | ceph_assert(id < alloc.size()); |
b32b8144 | 3281 | int64_t alloc_len = 0; |
a8e16298 | 3282 | PExtentVector extents; |
11fdf7f2 | 3283 | uint64_t hint = 0; |
7c673cae | 3284 | if (alloc[id]) { |
94b18763 FG |
3285 | if (!node->extents.empty() && node->extents.back().bdev == id) { |
3286 | hint = node->extents.back().end(); | |
11fdf7f2 | 3287 | } |
b32b8144 | 3288 | extents.reserve(4); // 4 should be (more than) enough for most allocations |
eafe8130 TL |
3289 | alloc_len = alloc[id]->allocate(round_up_to(len, alloc_size[id]), |
3290 | alloc_size[id], hint, &extents); | |
b32b8144 | 3291 | } |
eafe8130 TL |
3292 | if (!alloc[id] || |
3293 | alloc_len < 0 || | |
3294 | alloc_len < (int64_t)round_up_to(len, alloc_size[id])) { | |
11fdf7f2 | 3295 | if (alloc_len > 0) { |
a8e16298 | 3296 | alloc[id]->release(extents); |
b32b8144 | 3297 | } |
7c673cae FG |
3298 | if (id != BDEV_SLOW) { |
3299 | if (bdev[id]) { | |
eafe8130 | 3300 | dout(1) << __func__ << " failed to allocate 0x" << std::hex << len |
7c673cae FG |
3301 | << " on bdev " << (int)id |
3302 | << ", free 0x" << alloc[id]->get_free() | |
3303 | << "; fallback to bdev " << (int)id + 1 | |
3304 | << std::dec << dendl; | |
3305 | } | |
94b18763 | 3306 | return _allocate(id + 1, len, node); |
7c673cae | 3307 | } |
eafe8130 | 3308 | dout(1) << __func__ << " unable to allocate 0x" << std::hex << len |
11fdf7f2 TL |
3309 | << " on bdev " << (int)id << ", free 0x" |
3310 | << (alloc[id] ? alloc[id]->get_free() : (uint64_t)-1) | |
3311 | << "; fallback to slow device expander " | |
3312 | << std::dec << dendl; | |
3313 | extents.clear(); | |
eafe8130 | 3314 | if (_expand_slow_device(len, extents) == 0) { |
11fdf7f2 TL |
3315 | id = _get_slow_device_id(); |
3316 | for (auto& e : extents) { | |
3317 | _add_block_extent(id, e.offset, e.length); | |
3318 | } | |
3319 | extents.clear(); | |
3320 | auto* last_alloc = alloc[id]; | |
3321 | ceph_assert(last_alloc); | |
3322 | // try again | |
eafe8130 TL |
3323 | alloc_len = last_alloc->allocate(round_up_to(len, alloc_size[id]), |
3324 | alloc_size[id], hint, &extents); | |
3325 | if (alloc_len < 0 || alloc_len < (int64_t)len) { | |
11fdf7f2 TL |
3326 | if (alloc_len > 0) { |
3327 | last_alloc->release(extents); | |
3328 | } | |
eafe8130 | 3329 | derr << __func__ << " failed to allocate 0x" << std::hex << len |
11fdf7f2 TL |
3330 | << " on bdev " << (int)id |
3331 | << ", free 0x" << last_alloc->get_free() << std::dec << dendl; | |
3332 | return -ENOSPC; | |
3333 | } | |
3334 | } else { | |
3335 | derr << __func__ << " failed to expand slow device to fit +0x" | |
eafe8130 | 3336 | << std::hex << len << std::dec |
11fdf7f2 TL |
3337 | << dendl; |
3338 | return -ENOSPC; | |
3339 | } | |
3340 | } else { | |
3341 | uint64_t total_allocated = | |
3342 | block_all[id].size() - alloc[id]->get_free(); | |
3343 | if (max_bytes[id] < total_allocated) { | |
3344 | logger->set(max_bytes_pcounters[id], total_allocated); | |
3345 | max_bytes[id] = total_allocated; | |
3346 | } | |
7c673cae FG |
3347 | } |
3348 | ||
3349 | for (auto& p : extents) { | |
94b18763 | 3350 | node->append_extent(bluefs_extent_t(id, p.offset, p.length)); |
7c673cae FG |
3351 | } |
3352 | ||
3353 | return 0; | |
3354 | } | |
3355 | ||
3356 | int BlueFS::_preallocate(FileRef f, uint64_t off, uint64_t len) | |
3357 | { | |
3358 | dout(10) << __func__ << " file " << f->fnode << " 0x" | |
3359 | << std::hex << off << "~" << len << std::dec << dendl; | |
3360 | if (f->deleted) { | |
3361 | dout(10) << __func__ << " deleted, no-op" << dendl; | |
3362 | return 0; | |
3363 | } | |
11fdf7f2 | 3364 | ceph_assert(f->fnode.ino > 1); |
7c673cae FG |
3365 | uint64_t allocated = f->fnode.get_allocated(); |
3366 | if (off + len > allocated) { | |
3367 | uint64_t want = off + len - allocated; | |
9f95a23c TL |
3368 | vselector->sub_usage(f->vselector_hint, f->fnode); |
3369 | ||
3370 | int r = _allocate(vselector->select_prefer_bdev(f->vselector_hint), | |
3371 | want, | |
3372 | &f->fnode); | |
3373 | vselector->add_usage(f->vselector_hint, f->fnode); | |
7c673cae FG |
3374 | if (r < 0) |
3375 | return r; | |
7c673cae FG |
3376 | log_t.op_file_update(f->fnode); |
3377 | } | |
3378 | return 0; | |
3379 | } | |
3380 | ||
1911f103 | 3381 | void BlueFS::sync_metadata(bool avoid_compact) |
7c673cae | 3382 | { |
f6b5b4d7 | 3383 | std::unique_lock<ceph::mutex> l(lock); |
9f95a23c | 3384 | if (log_t.empty() && dirty_files.empty()) { |
7c673cae | 3385 | dout(10) << __func__ << " - no pending log events" << dendl; |
11fdf7f2 TL |
3386 | } else { |
3387 | dout(10) << __func__ << dendl; | |
3388 | utime_t start = ceph_clock_now(); | |
3389 | flush_bdev(); // FIXME? | |
3390 | _flush_and_sync_log(l); | |
3391 | dout(10) << __func__ << " done in " << (ceph_clock_now() - start) << dendl; | |
7c673cae | 3392 | } |
7c673cae | 3393 | |
f6b5b4d7 TL |
3394 | if (!avoid_compact) { |
3395 | _maybe_compact_log(l); | |
3396 | } | |
3397 | } | |
3398 | ||
3399 | void BlueFS::_maybe_compact_log(std::unique_lock<ceph::mutex>& l) | |
3400 | { | |
3401 | if (!cct->_conf->bluefs_replay_recovery_disable_compact && | |
3402 | _should_compact_log()) { | |
7c673cae FG |
3403 | if (cct->_conf->bluefs_compact_log_sync) { |
3404 | _compact_log_sync(); | |
3405 | } else { | |
3406 | _compact_log_async(l); | |
3407 | } | |
3408 | } | |
7c673cae FG |
3409 | } |
3410 | ||
3411 | int BlueFS::open_for_write( | |
ec96510d FG |
3412 | std::string_view dirname, |
3413 | std::string_view filename, | |
7c673cae FG |
3414 | FileWriter **h, |
3415 | bool overwrite) | |
3416 | { | |
11fdf7f2 | 3417 | std::lock_guard l(lock); |
7c673cae FG |
3418 | dout(10) << __func__ << " " << dirname << "/" << filename << dendl; |
3419 | map<string,DirRef>::iterator p = dir_map.find(dirname); | |
3420 | DirRef dir; | |
3421 | if (p == dir_map.end()) { | |
3422 | // implicitly create the dir | |
3423 | dout(20) << __func__ << " dir " << dirname | |
3424 | << " does not exist" << dendl; | |
3425 | return -ENOENT; | |
3426 | } else { | |
3427 | dir = p->second; | |
3428 | } | |
3429 | ||
3430 | FileRef file; | |
3431 | bool create = false; | |
f6b5b4d7 | 3432 | bool truncate = false; |
7c673cae FG |
3433 | map<string,FileRef>::iterator q = dir->file_map.find(filename); |
3434 | if (q == dir->file_map.end()) { | |
3435 | if (overwrite) { | |
3436 | dout(20) << __func__ << " dir " << dirname << " (" << dir | |
3437 | << ") file " << filename | |
3438 | << " does not exist" << dendl; | |
3439 | return -ENOENT; | |
3440 | } | |
9f95a23c | 3441 | file = ceph::make_ref<File>(); |
7c673cae FG |
3442 | file->fnode.ino = ++ino_last; |
3443 | file_map[ino_last] = file; | |
ec96510d | 3444 | dir->file_map[string{filename}] = file; |
7c673cae FG |
3445 | ++file->refs; |
3446 | create = true; | |
3447 | } else { | |
3448 | // overwrite existing file? | |
3449 | file = q->second; | |
3450 | if (overwrite) { | |
3451 | dout(20) << __func__ << " dir " << dirname << " (" << dir | |
3452 | << ") file " << filename | |
3453 | << " already exists, overwrite in place" << dendl; | |
3454 | } else { | |
3455 | dout(20) << __func__ << " dir " << dirname << " (" << dir | |
3456 | << ") file " << filename | |
3457 | << " already exists, truncate + overwrite" << dendl; | |
9f95a23c | 3458 | vselector->sub_usage(file->vselector_hint, file->fnode); |
7c673cae FG |
3459 | file->fnode.size = 0; |
3460 | for (auto& p : file->fnode.extents) { | |
3461 | pending_release[p.bdev].insert(p.offset, p.length); | |
3462 | } | |
f6b5b4d7 | 3463 | truncate = true; |
94b18763 FG |
3464 | |
3465 | file->fnode.clear_extents(); | |
7c673cae FG |
3466 | } |
3467 | } | |
11fdf7f2 | 3468 | ceph_assert(file->fnode.ino > 1); |
7c673cae FG |
3469 | |
3470 | file->fnode.mtime = ceph_clock_now(); | |
9f95a23c | 3471 | file->vselector_hint = vselector->get_hint_by_dir(dirname); |
f6b5b4d7 TL |
3472 | if (create || truncate) { |
3473 | vselector->add_usage(file->vselector_hint, file->fnode); // update file count | |
3474 | } | |
9f95a23c | 3475 | |
7c673cae | 3476 | dout(20) << __func__ << " mapping " << dirname << "/" << filename |
9f95a23c TL |
3477 | << " vsel_hint " << file->vselector_hint |
3478 | << dendl; | |
7c673cae FG |
3479 | |
3480 | log_t.op_file_update(file->fnode); | |
3481 | if (create) | |
3482 | log_t.op_dir_link(dirname, filename, file->fnode.ino); | |
3483 | ||
3484 | *h = _create_writer(file); | |
3485 | ||
3486 | if (boost::algorithm::ends_with(filename, ".log")) { | |
3487 | (*h)->writer_type = BlueFS::WRITER_WAL; | |
3488 | if (logger && !overwrite) { | |
3489 | logger->inc(l_bluefs_files_written_wal); | |
3490 | } | |
3491 | } else if (boost::algorithm::ends_with(filename, ".sst")) { | |
3492 | (*h)->writer_type = BlueFS::WRITER_SST; | |
3493 | if (logger) { | |
3494 | logger->inc(l_bluefs_files_written_sst); | |
3495 | } | |
3496 | } | |
3497 | ||
3498 | dout(10) << __func__ << " h " << *h << " on " << file->fnode << dendl; | |
3499 | return 0; | |
3500 | } | |
3501 | ||
3502 | BlueFS::FileWriter *BlueFS::_create_writer(FileRef f) | |
3503 | { | |
3504 | FileWriter *w = new FileWriter(f); | |
3505 | for (unsigned i = 0; i < MAX_BDEV; ++i) { | |
3506 | if (bdev[i]) { | |
3507 | w->iocv[i] = new IOContext(cct, NULL); | |
7c673cae FG |
3508 | } |
3509 | } | |
3510 | return w; | |
3511 | } | |
3512 | ||
3513 | void BlueFS::_close_writer(FileWriter *h) | |
3514 | { | |
3515 | dout(10) << __func__ << " " << h << " type " << h->writer_type << dendl; | |
f91f0fd5 | 3516 | h->buffer.reassign_to_mempool(mempool::mempool_bluefs_file_writer); |
7c673cae FG |
3517 | for (unsigned i=0; i<MAX_BDEV; ++i) { |
3518 | if (bdev[i]) { | |
11fdf7f2 TL |
3519 | if (h->iocv[i]) { |
3520 | h->iocv[i]->aio_wait(); | |
3521 | bdev[i]->queue_reap_ioc(h->iocv[i]); | |
3522 | } | |
7c673cae FG |
3523 | } |
3524 | } | |
3525 | delete h; | |
3526 | } | |
3527 | ||
ec96510d FG |
3528 | uint64_t BlueFS::debug_get_dirty_seq(FileWriter *h) |
3529 | { | |
3530 | std::lock_guard l(lock); | |
3531 | return h->file->dirty_seq; | |
3532 | } | |
3533 | ||
3534 | bool BlueFS::debug_get_is_dev_dirty(FileWriter *h, uint8_t dev) | |
3535 | { | |
3536 | std::lock_guard l(lock); | |
3537 | return h->dirty_devs[dev]; | |
3538 | } | |
3539 | ||
7c673cae | 3540 | int BlueFS::open_for_read( |
ec96510d FG |
3541 | std::string_view dirname, |
3542 | std::string_view filename, | |
7c673cae FG |
3543 | FileReader **h, |
3544 | bool random) | |
3545 | { | |
11fdf7f2 | 3546 | std::lock_guard l(lock); |
7c673cae FG |
3547 | dout(10) << __func__ << " " << dirname << "/" << filename |
3548 | << (random ? " (random)":" (sequential)") << dendl; | |
3549 | map<string,DirRef>::iterator p = dir_map.find(dirname); | |
3550 | if (p == dir_map.end()) { | |
3551 | dout(20) << __func__ << " dir " << dirname << " not found" << dendl; | |
3552 | return -ENOENT; | |
3553 | } | |
3554 | DirRef dir = p->second; | |
3555 | ||
3556 | map<string,FileRef>::iterator q = dir->file_map.find(filename); | |
3557 | if (q == dir->file_map.end()) { | |
3558 | dout(20) << __func__ << " dir " << dirname << " (" << dir | |
3559 | << ") file " << filename | |
3560 | << " not found" << dendl; | |
3561 | return -ENOENT; | |
3562 | } | |
3563 | File *file = q->second.get(); | |
3564 | ||
3565 | *h = new FileReader(file, random ? 4096 : cct->_conf->bluefs_max_prefetch, | |
3566 | random, false); | |
3567 | dout(10) << __func__ << " h " << *h << " on " << file->fnode << dendl; | |
3568 | return 0; | |
3569 | } | |
3570 | ||
3571 | int BlueFS::rename( | |
ec96510d FG |
3572 | std::string_view old_dirname, std::string_view old_filename, |
3573 | std::string_view new_dirname, std::string_view new_filename) | |
7c673cae | 3574 | { |
11fdf7f2 | 3575 | std::lock_guard l(lock); |
7c673cae FG |
3576 | dout(10) << __func__ << " " << old_dirname << "/" << old_filename |
3577 | << " -> " << new_dirname << "/" << new_filename << dendl; | |
3578 | map<string,DirRef>::iterator p = dir_map.find(old_dirname); | |
3579 | if (p == dir_map.end()) { | |
3580 | dout(20) << __func__ << " dir " << old_dirname << " not found" << dendl; | |
3581 | return -ENOENT; | |
3582 | } | |
3583 | DirRef old_dir = p->second; | |
3584 | map<string,FileRef>::iterator q = old_dir->file_map.find(old_filename); | |
3585 | if (q == old_dir->file_map.end()) { | |
3586 | dout(20) << __func__ << " dir " << old_dirname << " (" << old_dir | |
3587 | << ") file " << old_filename | |
3588 | << " not found" << dendl; | |
3589 | return -ENOENT; | |
3590 | } | |
3591 | FileRef file = q->second; | |
3592 | ||
3593 | p = dir_map.find(new_dirname); | |
3594 | if (p == dir_map.end()) { | |
3595 | dout(20) << __func__ << " dir " << new_dirname << " not found" << dendl; | |
3596 | return -ENOENT; | |
3597 | } | |
3598 | DirRef new_dir = p->second; | |
3599 | q = new_dir->file_map.find(new_filename); | |
3600 | if (q != new_dir->file_map.end()) { | |
3601 | dout(20) << __func__ << " dir " << new_dirname << " (" << old_dir | |
3602 | << ") file " << new_filename | |
3603 | << " already exists, unlinking" << dendl; | |
11fdf7f2 | 3604 | ceph_assert(q->second != file); |
7c673cae FG |
3605 | log_t.op_dir_unlink(new_dirname, new_filename); |
3606 | _drop_link(q->second); | |
3607 | } | |
3608 | ||
3609 | dout(10) << __func__ << " " << new_dirname << "/" << new_filename << " " | |
3610 | << " " << file->fnode << dendl; | |
3611 | ||
ec96510d FG |
3612 | new_dir->file_map[string{new_filename}] = file; |
3613 | old_dir->file_map.erase(string{old_filename}); | |
7c673cae FG |
3614 | |
3615 | log_t.op_dir_link(new_dirname, new_filename, file->fnode.ino); | |
3616 | log_t.op_dir_unlink(old_dirname, old_filename); | |
3617 | return 0; | |
3618 | } | |
3619 | ||
ec96510d | 3620 | int BlueFS::mkdir(std::string_view dirname) |
7c673cae | 3621 | { |
11fdf7f2 | 3622 | std::lock_guard l(lock); |
7c673cae FG |
3623 | dout(10) << __func__ << " " << dirname << dendl; |
3624 | map<string,DirRef>::iterator p = dir_map.find(dirname); | |
3625 | if (p != dir_map.end()) { | |
3626 | dout(20) << __func__ << " dir " << dirname << " exists" << dendl; | |
3627 | return -EEXIST; | |
3628 | } | |
ec96510d | 3629 | dir_map[string{dirname}] = ceph::make_ref<Dir>(); |
7c673cae FG |
3630 | log_t.op_dir_create(dirname); |
3631 | return 0; | |
3632 | } | |
3633 | ||
ec96510d | 3634 | int BlueFS::rmdir(std::string_view dirname) |
7c673cae | 3635 | { |
11fdf7f2 | 3636 | std::lock_guard l(lock); |
7c673cae | 3637 | dout(10) << __func__ << " " << dirname << dendl; |
ec96510d | 3638 | auto p = dir_map.find(dirname); |
7c673cae FG |
3639 | if (p == dir_map.end()) { |
3640 | dout(20) << __func__ << " dir " << dirname << " does not exist" << dendl; | |
3641 | return -ENOENT; | |
3642 | } | |
3643 | DirRef dir = p->second; | |
3644 | if (!dir->file_map.empty()) { | |
3645 | dout(20) << __func__ << " dir " << dirname << " not empty" << dendl; | |
3646 | return -ENOTEMPTY; | |
3647 | } | |
ec96510d | 3648 | dir_map.erase(string{dirname}); |
7c673cae FG |
3649 | log_t.op_dir_remove(dirname); |
3650 | return 0; | |
3651 | } | |
3652 | ||
ec96510d | 3653 | bool BlueFS::dir_exists(std::string_view dirname) |
7c673cae | 3654 | { |
11fdf7f2 | 3655 | std::lock_guard l(lock); |
7c673cae FG |
3656 | map<string,DirRef>::iterator p = dir_map.find(dirname); |
3657 | bool exists = p != dir_map.end(); | |
3658 | dout(10) << __func__ << " " << dirname << " = " << (int)exists << dendl; | |
3659 | return exists; | |
3660 | } | |
3661 | ||
ec96510d | 3662 | int BlueFS::stat(std::string_view dirname, std::string_view filename, |
7c673cae FG |
3663 | uint64_t *size, utime_t *mtime) |
3664 | { | |
11fdf7f2 | 3665 | std::lock_guard l(lock); |
7c673cae FG |
3666 | dout(10) << __func__ << " " << dirname << "/" << filename << dendl; |
3667 | map<string,DirRef>::iterator p = dir_map.find(dirname); | |
3668 | if (p == dir_map.end()) { | |
3669 | dout(20) << __func__ << " dir " << dirname << " not found" << dendl; | |
3670 | return -ENOENT; | |
3671 | } | |
3672 | DirRef dir = p->second; | |
3673 | map<string,FileRef>::iterator q = dir->file_map.find(filename); | |
3674 | if (q == dir->file_map.end()) { | |
3675 | dout(20) << __func__ << " dir " << dirname << " (" << dir | |
3676 | << ") file " << filename | |
3677 | << " not found" << dendl; | |
3678 | return -ENOENT; | |
3679 | } | |
3680 | File *file = q->second.get(); | |
3681 | dout(10) << __func__ << " " << dirname << "/" << filename | |
3682 | << " " << file->fnode << dendl; | |
3683 | if (size) | |
3684 | *size = file->fnode.size; | |
3685 | if (mtime) | |
3686 | *mtime = file->fnode.mtime; | |
3687 | return 0; | |
3688 | } | |
3689 | ||
ec96510d | 3690 | int BlueFS::lock_file(std::string_view dirname, std::string_view filename, |
7c673cae FG |
3691 | FileLock **plock) |
3692 | { | |
11fdf7f2 | 3693 | std::lock_guard l(lock); |
7c673cae FG |
3694 | dout(10) << __func__ << " " << dirname << "/" << filename << dendl; |
3695 | map<string,DirRef>::iterator p = dir_map.find(dirname); | |
3696 | if (p == dir_map.end()) { | |
3697 | dout(20) << __func__ << " dir " << dirname << " not found" << dendl; | |
3698 | return -ENOENT; | |
3699 | } | |
3700 | DirRef dir = p->second; | |
ec96510d | 3701 | auto q = dir->file_map.find(filename); |
9f95a23c | 3702 | FileRef file; |
7c673cae FG |
3703 | if (q == dir->file_map.end()) { |
3704 | dout(20) << __func__ << " dir " << dirname << " (" << dir | |
3705 | << ") file " << filename | |
3706 | << " not found, creating" << dendl; | |
9f95a23c | 3707 | file = ceph::make_ref<File>(); |
7c673cae FG |
3708 | file->fnode.ino = ++ino_last; |
3709 | file->fnode.mtime = ceph_clock_now(); | |
3710 | file_map[ino_last] = file; | |
ec96510d | 3711 | dir->file_map[string{filename}] = file; |
7c673cae FG |
3712 | ++file->refs; |
3713 | log_t.op_file_update(file->fnode); | |
3714 | log_t.op_dir_link(dirname, filename, file->fnode.ino); | |
3715 | } else { | |
9f95a23c | 3716 | file = q->second; |
7c673cae FG |
3717 | if (file->locked) { |
3718 | dout(10) << __func__ << " already locked" << dendl; | |
11fdf7f2 | 3719 | return -ENOLCK; |
7c673cae FG |
3720 | } |
3721 | } | |
3722 | file->locked = true; | |
3723 | *plock = new FileLock(file); | |
3724 | dout(10) << __func__ << " locked " << file->fnode | |
3725 | << " with " << *plock << dendl; | |
3726 | return 0; | |
3727 | } | |
3728 | ||
3729 | int BlueFS::unlock_file(FileLock *fl) | |
3730 | { | |
11fdf7f2 | 3731 | std::lock_guard l(lock); |
7c673cae | 3732 | dout(10) << __func__ << " " << fl << " on " << fl->file->fnode << dendl; |
11fdf7f2 | 3733 | ceph_assert(fl->file->locked); |
7c673cae FG |
3734 | fl->file->locked = false; |
3735 | delete fl; | |
3736 | return 0; | |
3737 | } | |
3738 | ||
ec96510d | 3739 | int BlueFS::readdir(std::string_view dirname, vector<string> *ls) |
7c673cae | 3740 | { |
ec96510d FG |
3741 | // dirname may contain a trailing / |
3742 | if (!dirname.empty() && dirname.back() == '/') { | |
3743 | dirname.remove_suffix(1); | |
3744 | } | |
11fdf7f2 | 3745 | std::lock_guard l(lock); |
7c673cae FG |
3746 | dout(10) << __func__ << " " << dirname << dendl; |
3747 | if (dirname.empty()) { | |
3748 | // list dirs | |
3749 | ls->reserve(dir_map.size() + 2); | |
3750 | for (auto& q : dir_map) { | |
3751 | ls->push_back(q.first); | |
3752 | } | |
3753 | } else { | |
3754 | // list files in dir | |
3755 | map<string,DirRef>::iterator p = dir_map.find(dirname); | |
3756 | if (p == dir_map.end()) { | |
3757 | dout(20) << __func__ << " dir " << dirname << " not found" << dendl; | |
3758 | return -ENOENT; | |
3759 | } | |
3760 | DirRef dir = p->second; | |
3761 | ls->reserve(dir->file_map.size() + 2); | |
3762 | for (auto& q : dir->file_map) { | |
3763 | ls->push_back(q.first); | |
3764 | } | |
3765 | } | |
3766 | ls->push_back("."); | |
3767 | ls->push_back(".."); | |
3768 | return 0; | |
3769 | } | |
3770 | ||
ec96510d | 3771 | int BlueFS::unlink(std::string_view dirname, std::string_view filename) |
7c673cae | 3772 | { |
11fdf7f2 | 3773 | std::lock_guard l(lock); |
7c673cae FG |
3774 | dout(10) << __func__ << " " << dirname << "/" << filename << dendl; |
3775 | map<string,DirRef>::iterator p = dir_map.find(dirname); | |
3776 | if (p == dir_map.end()) { | |
3777 | dout(20) << __func__ << " dir " << dirname << " not found" << dendl; | |
3778 | return -ENOENT; | |
3779 | } | |
3780 | DirRef dir = p->second; | |
3781 | map<string,FileRef>::iterator q = dir->file_map.find(filename); | |
3782 | if (q == dir->file_map.end()) { | |
3783 | dout(20) << __func__ << " file " << dirname << "/" << filename | |
3784 | << " not found" << dendl; | |
3785 | return -ENOENT; | |
3786 | } | |
3787 | FileRef file = q->second; | |
3788 | if (file->locked) { | |
3789 | dout(20) << __func__ << " file " << dirname << "/" << filename | |
3790 | << " is locked" << dendl; | |
3791 | return -EBUSY; | |
3792 | } | |
ec96510d | 3793 | dir->file_map.erase(string{filename}); |
7c673cae FG |
3794 | log_t.op_dir_unlink(dirname, filename); |
3795 | _drop_link(file); | |
3796 | return 0; | |
3797 | } | |
d2e6a577 FG |
3798 | |
3799 | bool BlueFS::wal_is_rotational() | |
3800 | { | |
94b18763 FG |
3801 | if (bdev[BDEV_WAL]) { |
3802 | return bdev[BDEV_WAL]->is_rotational(); | |
3803 | } else if (bdev[BDEV_DB]) { | |
3804 | return bdev[BDEV_DB]->is_rotational(); | |
3805 | } | |
3806 | return bdev[BDEV_SLOW]->is_rotational(); | |
d2e6a577 | 3807 | } |
9f95a23c | 3808 | |
f6b5b4d7 TL |
3809 | /* |
3810 | Algorithm. | |
3811 | do_replay_recovery_read is used when bluefs log abruptly ends, but it seems that more data should be there. | |
3812 | Idea is to search disk for definiton of extents that will be accompanied with bluefs log in future, | |
3813 | and try if using it will produce healthy bluefs transaction. | |
3814 | We encode already known bluefs log extents and search disk for these bytes. | |
3815 | When we find it, we decode following bytes as extent. | |
3816 | We read that whole extent and then check if merged with existing log part gives a proper bluefs transaction. | |
3817 | */ | |
3818 | int BlueFS::do_replay_recovery_read(FileReader *log_reader, | |
3819 | size_t replay_pos, | |
3820 | size_t read_offset, | |
3821 | size_t read_len, | |
3822 | bufferlist* bl) { | |
3823 | dout(1) << __func__ << " replay_pos=0x" << std::hex << replay_pos << | |
3824 | " needs 0x" << read_offset << "~" << read_len << std::dec << dendl; | |
3825 | ||
3826 | bluefs_fnode_t& log_fnode = log_reader->file->fnode; | |
3827 | bufferlist bin_extents; | |
3828 | ceph::encode(log_fnode.extents, bin_extents); | |
3829 | dout(2) << __func__ << " log file encoded extents length = " << bin_extents.length() << dendl; | |
3830 | ||
3831 | // cannot process if too small to effectively search | |
3832 | ceph_assert(bin_extents.length() >= 32); | |
3833 | bufferlist last_32; | |
3834 | last_32.substr_of(bin_extents, bin_extents.length() - 32, 32); | |
3835 | ||
3836 | //read fixed part from replay_pos to end of bluefs_log extents | |
3837 | bufferlist fixed; | |
3838 | uint64_t e_off = 0; | |
3839 | auto e = log_fnode.seek(replay_pos, &e_off); | |
3840 | ceph_assert(e != log_fnode.extents.end()); | |
3841 | int r = bdev[e->bdev]->read(e->offset + e_off, e->length - e_off, &fixed, ioc[e->bdev], | |
3842 | cct->_conf->bluefs_buffered_io); | |
3843 | ceph_assert(r == 0); | |
3844 | //capture dev of last good extent | |
3845 | uint8_t last_e_dev = e->bdev; | |
3846 | uint64_t last_e_off = e->offset; | |
3847 | ++e; | |
3848 | while (e != log_fnode.extents.end()) { | |
3849 | r = bdev[e->bdev]->read(e->offset, e->length, &fixed, ioc[e->bdev], | |
3850 | cct->_conf->bluefs_buffered_io); | |
3851 | ceph_assert(r == 0); | |
3852 | last_e_dev = e->bdev; | |
3853 | ++e; | |
3854 | } | |
3855 | ceph_assert(replay_pos + fixed.length() == read_offset); | |
3856 | ||
3857 | dout(2) << __func__ << " valid data in log = " << fixed.length() << dendl; | |
3858 | ||
3859 | struct compare { | |
3860 | bool operator()(const bluefs_extent_t& a, const bluefs_extent_t& b) const { | |
3861 | if (a.bdev < b.bdev) return true; | |
3862 | if (a.offset < b.offset) return true; | |
3863 | return a.length < b.length; | |
3864 | } | |
3865 | }; | |
3866 | std::set<bluefs_extent_t, compare> extents_rejected; | |
3867 | for (int dcnt = 0; dcnt < 3; dcnt++) { | |
3868 | uint8_t dev = (last_e_dev + dcnt) % MAX_BDEV; | |
3869 | if (bdev[dev] == nullptr) continue; | |
3870 | dout(2) << __func__ << " processing " << get_device_name(dev) << dendl; | |
3871 | interval_set<uint64_t> disk_regions; | |
3872 | disk_regions.insert(0, bdev[dev]->get_size()); | |
3873 | for (auto f : file_map) { | |
3874 | auto& e = f.second->fnode.extents; | |
3875 | for (auto& p : e) { | |
3876 | if (p.bdev == dev) { | |
3877 | disk_regions.erase(p.offset, p.length); | |
3878 | } | |
3879 | } | |
3880 | } | |
3881 | size_t disk_regions_count = disk_regions.num_intervals(); | |
3882 | dout(5) << __func__ << " " << disk_regions_count << " regions to scan on " << get_device_name(dev) << dendl; | |
3883 | ||
3884 | auto reg = disk_regions.lower_bound(last_e_off); | |
3885 | //for all except first, start from beginning | |
3886 | last_e_off = 0; | |
3887 | if (reg == disk_regions.end()) { | |
3888 | reg = disk_regions.begin(); | |
3889 | } | |
3890 | const uint64_t chunk_size = 4 * 1024 * 1024; | |
3891 | const uint64_t page_size = 4096; | |
3892 | const uint64_t max_extent_size = 16; | |
3893 | uint64_t overlay_size = last_32.length() + max_extent_size; | |
3894 | for (size_t i = 0; i < disk_regions_count; reg++, i++) { | |
3895 | if (reg == disk_regions.end()) { | |
3896 | reg = disk_regions.begin(); | |
3897 | } | |
3898 | uint64_t pos = reg.get_start(); | |
3899 | uint64_t len = reg.get_len(); | |
3900 | ||
3901 | std::unique_ptr<char[]> raw_data_p{new char[page_size + chunk_size]}; | |
3902 | char* raw_data = raw_data_p.get(); | |
3903 | memset(raw_data, 0, page_size); | |
3904 | ||
3905 | while (len > last_32.length()) { | |
3906 | uint64_t chunk_len = len > chunk_size ? chunk_size : len; | |
3907 | dout(5) << __func__ << " read " | |
3908 | << get_device_name(dev) << ":0x" << std::hex << pos << "+" << chunk_len << std::dec << dendl; | |
3909 | r = bdev[dev]->read_random(pos, chunk_len, raw_data + page_size, cct->_conf->bluefs_buffered_io); | |
3910 | ceph_assert(r == 0); | |
3911 | ||
3912 | //search for fixed_last_32 | |
3913 | char* chunk_b = raw_data + page_size; | |
3914 | char* chunk_e = chunk_b + chunk_len; | |
3915 | ||
3916 | char* search_b = chunk_b - overlay_size; | |
3917 | char* search_e = chunk_e; | |
3918 | ||
3919 | for (char* sp = search_b; ; sp += last_32.length()) { | |
3920 | sp = (char*)memmem(sp, search_e - sp, last_32.c_str(), last_32.length()); | |
3921 | if (sp == nullptr) { | |
3922 | break; | |
3923 | } | |
3924 | ||
3925 | char* n = sp + last_32.length(); | |
3926 | dout(5) << __func__ << " checking location 0x" << std::hex << pos + (n - chunk_b) << std::dec << dendl; | |
3927 | bufferlist test; | |
3928 | test.append(n, std::min<size_t>(max_extent_size, chunk_e - n)); | |
3929 | bluefs_extent_t ne; | |
3930 | try { | |
3931 | bufferlist::const_iterator p = test.begin(); | |
3932 | ceph::decode(ne, p); | |
3933 | } catch (buffer::error& e) { | |
3934 | continue; | |
3935 | } | |
3936 | if (extents_rejected.count(ne) != 0) { | |
3937 | dout(5) << __func__ << " extent " << ne << " already refected" <<dendl; | |
3938 | continue; | |
3939 | } | |
3940 | //insert as rejected already. if we succeed, it wouldn't make difference. | |
3941 | extents_rejected.insert(ne); | |
3942 | ||
3943 | if (ne.bdev >= MAX_BDEV || | |
3944 | bdev[ne.bdev] == nullptr || | |
3945 | ne.length > 16 * 1024 * 1024 || | |
3946 | (ne.length & 4095) != 0 || | |
3947 | ne.offset + ne.length > bdev[ne.bdev]->get_size() || | |
3948 | (ne.offset & 4095) != 0) { | |
3949 | dout(5) << __func__ << " refusing extent " << ne << dendl; | |
3950 | continue; | |
3951 | } | |
3952 | dout(5) << __func__ << " checking extent " << ne << dendl; | |
3953 | ||
3954 | //read candidate extent - whole | |
3955 | bufferlist candidate; | |
3956 | candidate.append(fixed); | |
3957 | r = bdev[ne.bdev]->read(ne.offset, ne.length, &candidate, ioc[ne.bdev], | |
3958 | cct->_conf->bluefs_buffered_io); | |
3959 | ceph_assert(r == 0); | |
3960 | ||
3961 | //check if transaction & crc is ok | |
3962 | bluefs_transaction_t t; | |
3963 | try { | |
3964 | bufferlist::const_iterator p = candidate.cbegin(); | |
3965 | decode(t, p); | |
3966 | } | |
3967 | catch (buffer::error& e) { | |
3968 | dout(5) << __func__ << " failed match" << dendl; | |
3969 | continue; | |
3970 | } | |
3971 | ||
3972 | //success, it seems a probable candidate | |
3973 | uint64_t l = std::min<uint64_t>(ne.length, read_len); | |
3974 | //trim to required size | |
3975 | bufferlist requested_read; | |
3976 | requested_read.substr_of(candidate, fixed.length(), l); | |
3977 | bl->append(requested_read); | |
3978 | dout(5) << __func__ << " successful extension of log " << l << "/" << read_len << dendl; | |
3979 | log_fnode.append_extent(ne); | |
3980 | log_fnode.recalc_allocated(); | |
3981 | log_reader->buf.pos += l; | |
3982 | return l; | |
3983 | } | |
3984 | //save overlay for next search | |
3985 | memcpy(search_b, chunk_e - overlay_size, overlay_size); | |
3986 | pos += chunk_len; | |
3987 | len -= chunk_len; | |
3988 | } | |
3989 | } | |
3990 | } | |
3991 | return 0; | |
3992 | } | |
3993 | ||
9f95a23c TL |
3994 | void BlueFS::debug_inject_duplicate_gift(unsigned id, |
3995 | uint64_t offset, | |
3996 | uint64_t len) | |
3997 | { | |
3998 | dout(0) << __func__ << dendl; | |
3999 | if (id < alloc.size() && alloc[id]) { | |
4000 | alloc[id]->init_add_free(offset, len); | |
4001 | } | |
4002 | } | |
4003 | ||
4004 | // =============================================== | |
4005 | // OriginalVolumeSelector | |
4006 | ||
f6b5b4d7 TL |
4007 | void* OriginalVolumeSelector::get_hint_for_log() const { |
4008 | return reinterpret_cast<void*>(BlueFS::BDEV_WAL); | |
9f95a23c | 4009 | } |
ec96510d | 4010 | void* OriginalVolumeSelector::get_hint_by_dir(std::string_view dirname) const { |
9f95a23c TL |
4011 | uint8_t res = BlueFS::BDEV_DB; |
4012 | if (dirname.length() > 5) { | |
4013 | // the "db.slow" and "db.wal" directory names are hard-coded at | |
4014 | // match up with bluestore. the slow device is always the second | |
4015 | // one (when a dedicated block.db device is present and used at | |
4016 | // bdev 0). the wal device is always last. | |
4017 | if (boost::algorithm::ends_with(dirname, ".slow")) { | |
4018 | res = BlueFS::BDEV_SLOW; | |
4019 | } | |
4020 | else if (boost::algorithm::ends_with(dirname, ".wal")) { | |
4021 | res = BlueFS::BDEV_WAL; | |
4022 | } | |
4023 | } | |
4024 | return reinterpret_cast<void*>(res); | |
4025 | } | |
4026 | ||
4027 | uint8_t OriginalVolumeSelector::select_prefer_bdev(void* hint) | |
4028 | { | |
4029 | return (uint8_t)(reinterpret_cast<uint64_t>(hint)); | |
4030 | } | |
4031 | ||
4032 | void OriginalVolumeSelector::get_paths(const std::string& base, paths& res) const | |
4033 | { | |
4034 | res.emplace_back(base, db_total); | |
4035 | res.emplace_back(base + ".slow", slow_total); | |
4036 | } | |
4037 | ||
4038 | #undef dout_prefix | |
4039 | #define dout_prefix *_dout << "OriginalVolumeSelector: " | |
4040 | ||
4041 | void OriginalVolumeSelector::dump(ostream& sout) { | |
4042 | sout<< "wal_total:" << wal_total | |
4043 | << ", db_total:" << db_total | |
4044 | << ", slow_total:" << slow_total | |
4045 | << std::endl; | |
4046 | } |