]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | ||
4 | #include "boost/algorithm/string.hpp" | |
9f95a23c | 5 | #include "bluestore_common.h" |
7c673cae FG |
6 | #include "BlueFS.h" |
7 | ||
8 | #include "common/debug.h" | |
9 | #include "common/errno.h" | |
10 | #include "common/perf_counters.h" | |
7c673cae | 11 | #include "Allocator.h" |
11fdf7f2 | 12 | #include "include/ceph_assert.h" |
eafe8130 | 13 | #include "common/admin_socket.h" |
7c673cae FG |
14 | |
15 | #define dout_context cct | |
16 | #define dout_subsys ceph_subsys_bluefs | |
17 | #undef dout_prefix | |
18 | #define dout_prefix *_dout << "bluefs " | |
9f95a23c | 19 | using TOPNSPC::common::cmd_getval; |
f67539c2 TL |
20 | |
21 | using std::byte; | |
22 | using std::list; | |
23 | using std::make_pair; | |
24 | using std::map; | |
25 | using std::ostream; | |
26 | using std::pair; | |
27 | using std::set; | |
28 | using std::string; | |
29 | using std::to_string; | |
30 | using std::vector; | |
31 | ||
32 | using ceph::bufferlist; | |
33 | using ceph::decode; | |
34 | using ceph::encode; | |
35 | using ceph::Formatter; | |
36 | ||
37 | ||
7c673cae FG |
38 | MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::File, bluefs_file, bluefs); |
39 | MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::Dir, bluefs_dir, bluefs); | |
f91f0fd5 | 40 | MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileWriter, bluefs_file_writer, bluefs_file_writer); |
7c673cae | 41 | MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileReaderBuffer, |
f91f0fd5 TL |
42 | bluefs_file_reader_buffer, bluefs_file_reader); |
43 | MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileReader, bluefs_file_reader, bluefs_file_reader); | |
7c673cae FG |
44 | MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileLock, bluefs_file_lock, bluefs); |
45 | ||
11fdf7f2 TL |
46 | static void wal_discard_cb(void *priv, void* priv2) { |
47 | BlueFS *bluefs = static_cast<BlueFS*>(priv); | |
48 | interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2); | |
49 | bluefs->handle_discard(BlueFS::BDEV_WAL, *tmp); | |
50 | } | |
51 | ||
52 | static void db_discard_cb(void *priv, void* priv2) { | |
53 | BlueFS *bluefs = static_cast<BlueFS*>(priv); | |
54 | interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2); | |
55 | bluefs->handle_discard(BlueFS::BDEV_DB, *tmp); | |
56 | } | |
57 | ||
58 | static void slow_discard_cb(void *priv, void* priv2) { | |
59 | BlueFS *bluefs = static_cast<BlueFS*>(priv); | |
60 | interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2); | |
61 | bluefs->handle_discard(BlueFS::BDEV_SLOW, *tmp); | |
62 | } | |
7c673cae | 63 | |
eafe8130 TL |
64 | class BlueFS::SocketHook : public AdminSocketHook { |
65 | BlueFS* bluefs; | |
66 | public: | |
67 | static BlueFS::SocketHook* create(BlueFS* bluefs) | |
68 | { | |
69 | BlueFS::SocketHook* hook = nullptr; | |
70 | AdminSocket* admin_socket = bluefs->cct->get_admin_socket(); | |
71 | if (admin_socket) { | |
72 | hook = new BlueFS::SocketHook(bluefs); | |
f67539c2 | 73 | int r = admin_socket->register_command("bluestore bluefs device info " |
eafe8130 TL |
74 | "name=alloc_size,type=CephInt,req=false", |
75 | hook, | |
f67539c2 TL |
76 | "Shows space report for bluefs devices. " |
77 | "This also includes an estimation for space " | |
78 | "available to bluefs at main device. " | |
79 | "alloc_size, if set, specifies the custom bluefs " | |
80 | "allocation unit size for the estimation above."); | |
eafe8130 TL |
81 | if (r != 0) { |
82 | ldout(bluefs->cct, 1) << __func__ << " cannot register SocketHook" << dendl; | |
83 | delete hook; | |
84 | hook = nullptr; | |
9f95a23c | 85 | } else { |
f6b5b4d7 | 86 | r = admin_socket->register_command("bluefs stats", |
9f95a23c TL |
87 | hook, |
88 | "Dump internal statistics for bluefs." | |
89 | ""); | |
90 | ceph_assert(r == 0); | |
f67539c2 TL |
91 | r = admin_socket->register_command("bluefs files list", hook, |
92 | "print files in bluefs"); | |
93 | ceph_assert(r == 0); | |
cd265ab1 TL |
94 | r = admin_socket->register_command("bluefs debug_inject_read_zeros", hook, |
95 | "Injects 8K zeros into next BlueFS read. Debug only."); | |
96 | ceph_assert(r == 0); | |
eafe8130 TL |
97 | } |
98 | } | |
99 | return hook; | |
100 | } | |
101 | ||
102 | ~SocketHook() { | |
103 | AdminSocket* admin_socket = bluefs->cct->get_admin_socket(); | |
9f95a23c | 104 | admin_socket->unregister_commands(this); |
eafe8130 TL |
105 | } |
106 | private: | |
107 | SocketHook(BlueFS* bluefs) : | |
108 | bluefs(bluefs) {} | |
9f95a23c TL |
109 | int call(std::string_view command, const cmdmap_t& cmdmap, |
110 | Formatter *f, | |
111 | std::ostream& errss, | |
112 | bufferlist& out) override { | |
f67539c2 | 113 | if (command == "bluestore bluefs device info") { |
9f95a23c TL |
114 | int64_t alloc_size = 0; |
115 | cmd_getval(cmdmap, "alloc_size", alloc_size); | |
116 | if ((alloc_size & (alloc_size - 1)) != 0) { | |
117 | errss << "Invalid allocation size:'" << alloc_size << std::endl; | |
118 | return -EINVAL; | |
119 | } | |
120 | if (alloc_size == 0) | |
f67539c2 TL |
121 | alloc_size = bluefs->cct->_conf->bluefs_shared_alloc_size; |
122 | f->open_object_section("bluefs_device_info"); | |
9f95a23c TL |
123 | for (unsigned dev = BDEV_WAL; dev <= BDEV_SLOW; dev++) { |
124 | if (bluefs->bdev[dev]) { | |
125 | f->open_object_section("dev"); | |
126 | f->dump_string("device", bluefs->get_device_name(dev)); | |
127 | ceph_assert(bluefs->alloc[dev]); | |
f67539c2 TL |
128 | auto total = bluefs->get_total(dev); |
129 | auto free = bluefs->get_free(dev); | |
130 | auto used = bluefs->get_used(dev); | |
131 | ||
132 | f->dump_int("total", total); | |
133 | f->dump_int("free", free); | |
134 | f->dump_int("bluefs_used", used); | |
135 | if (bluefs->is_shared_alloc(dev)) { | |
136 | size_t avail = bluefs->probe_alloc_avail(dev, alloc_size); | |
137 | f->dump_int("bluefs max available", avail); | |
138 | } | |
139 | f->close_section(); | |
140 | } | |
eafe8130 | 141 | } |
f67539c2 | 142 | |
9f95a23c TL |
143 | f->close_section(); |
144 | } else if (command == "bluefs stats") { | |
145 | std::stringstream ss; | |
146 | bluefs->dump_block_extents(ss); | |
147 | bluefs->dump_volume_selector(ss); | |
eafe8130 | 148 | out.append(ss); |
f67539c2 TL |
149 | } else if (command == "bluefs files list") { |
150 | const char* devnames[3] = {"wal","db","slow"}; | |
20effc67 | 151 | std::lock_guard l(bluefs->nodes.lock); |
f67539c2 | 152 | f->open_array_section("files"); |
20effc67 | 153 | for (auto &d : bluefs->nodes.dir_map) { |
f67539c2 TL |
154 | std::string dir = d.first; |
155 | for (auto &r : d.second->file_map) { | |
156 | f->open_object_section("file"); | |
157 | f->dump_string("name", (dir + "/" + r.first).c_str()); | |
158 | std::vector<size_t> sizes; | |
159 | sizes.resize(bluefs->bdev.size()); | |
160 | for(auto& i : r.second->fnode.extents) { | |
161 | sizes[i.bdev] += i.length; | |
162 | } | |
163 | for (size_t i = 0; i < sizes.size(); i++) { | |
164 | if (sizes[i]>0) { | |
165 | if (i < sizeof(devnames) / sizeof(*devnames)) | |
166 | f->dump_int(devnames[i], sizes[i]); | |
167 | else | |
168 | f->dump_int(("dev-"+to_string(i)).c_str(), sizes[i]); | |
169 | } | |
170 | } | |
171 | f->close_section(); | |
172 | } | |
173 | } | |
174 | f->close_section(); | |
175 | f->flush(out); | |
cd265ab1 TL |
176 | } else if (command == "bluefs debug_inject_read_zeros") { |
177 | bluefs->inject_read_zeros++; | |
9f95a23c TL |
178 | } else { |
179 | errss << "Invalid command" << std::endl; | |
180 | return -ENOSYS; | |
eafe8130 | 181 | } |
9f95a23c TL |
182 | return 0; |
183 | } | |
eafe8130 TL |
184 | }; |
185 | ||
7c673cae FG |
186 | BlueFS::BlueFS(CephContext* cct) |
187 | : cct(cct), | |
188 | bdev(MAX_BDEV), | |
189 | ioc(MAX_BDEV), | |
f67539c2 TL |
190 | block_reserved(MAX_BDEV), |
191 | alloc(MAX_BDEV), | |
20effc67 | 192 | alloc_size(MAX_BDEV, 0) |
7c673cae | 193 | { |
20effc67 | 194 | dirty.pending_release.resize(MAX_BDEV); |
11fdf7f2 TL |
195 | discard_cb[BDEV_WAL] = wal_discard_cb; |
196 | discard_cb[BDEV_DB] = db_discard_cb; | |
197 | discard_cb[BDEV_SLOW] = slow_discard_cb; | |
eafe8130 | 198 | asok_hook = SocketHook::create(this); |
7c673cae FG |
199 | } |
200 | ||
201 | BlueFS::~BlueFS() | |
202 | { | |
eafe8130 | 203 | delete asok_hook; |
7c673cae FG |
204 | for (auto p : ioc) { |
205 | if (p) | |
206 | p->aio_wait(); | |
207 | } | |
208 | for (auto p : bdev) { | |
209 | if (p) { | |
210 | p->close(); | |
211 | delete p; | |
212 | } | |
213 | } | |
214 | for (auto p : ioc) { | |
215 | delete p; | |
216 | } | |
217 | } | |
218 | ||
219 | void BlueFS::_init_logger() | |
220 | { | |
221 | PerfCountersBuilder b(cct, "bluefs", | |
222 | l_bluefs_first, l_bluefs_last); | |
7c673cae FG |
223 | b.add_u64(l_bluefs_db_total_bytes, "db_total_bytes", |
224 | "Total bytes (main db device)", | |
11fdf7f2 | 225 | "b", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); |
7c673cae FG |
226 | b.add_u64(l_bluefs_db_used_bytes, "db_used_bytes", |
227 | "Used bytes (main db device)", | |
11fdf7f2 | 228 | "u", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); |
7c673cae FG |
229 | b.add_u64(l_bluefs_wal_total_bytes, "wal_total_bytes", |
230 | "Total bytes (wal device)", | |
11fdf7f2 | 231 | "walb", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); |
7c673cae FG |
232 | b.add_u64(l_bluefs_wal_used_bytes, "wal_used_bytes", |
233 | "Used bytes (wal device)", | |
11fdf7f2 | 234 | "walu", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); |
7c673cae FG |
235 | b.add_u64(l_bluefs_slow_total_bytes, "slow_total_bytes", |
236 | "Total bytes (slow device)", | |
11fdf7f2 | 237 | "slob", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); |
7c673cae FG |
238 | b.add_u64(l_bluefs_slow_used_bytes, "slow_used_bytes", |
239 | "Used bytes (slow device)", | |
11fdf7f2 | 240 | "slou", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); |
7c673cae FG |
241 | b.add_u64(l_bluefs_num_files, "num_files", "File count", |
242 | "f", PerfCountersBuilder::PRIO_USEFUL); | |
243 | b.add_u64(l_bluefs_log_bytes, "log_bytes", "Size of the metadata log", | |
11fdf7f2 | 244 | "jlen", PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES)); |
7c673cae FG |
245 | b.add_u64_counter(l_bluefs_log_compactions, "log_compactions", |
246 | "Compactions of the metadata log"); | |
247 | b.add_u64_counter(l_bluefs_logged_bytes, "logged_bytes", | |
20effc67 TL |
248 | "Bytes written to the metadata log", |
249 | "j", | |
11fdf7f2 | 250 | PerfCountersBuilder::PRIO_CRITICAL, unit_t(UNIT_BYTES)); |
7c673cae FG |
251 | b.add_u64_counter(l_bluefs_files_written_wal, "files_written_wal", |
252 | "Files written to WAL"); | |
253 | b.add_u64_counter(l_bluefs_files_written_sst, "files_written_sst", | |
254 | "Files written to SSTs"); | |
255 | b.add_u64_counter(l_bluefs_bytes_written_wal, "bytes_written_wal", | |
20effc67 TL |
256 | "Bytes written to WAL", |
257 | "walb", | |
7c673cae FG |
258 | PerfCountersBuilder::PRIO_CRITICAL); |
259 | b.add_u64_counter(l_bluefs_bytes_written_sst, "bytes_written_sst", | |
20effc67 TL |
260 | "Bytes written to SSTs", |
261 | "sstb", | |
11fdf7f2 TL |
262 | PerfCountersBuilder::PRIO_CRITICAL, unit_t(UNIT_BYTES)); |
263 | b.add_u64_counter(l_bluefs_bytes_written_slow, "bytes_written_slow", | |
20effc67 TL |
264 | "Bytes written to WAL/SSTs at slow device", |
265 | "slwb", | |
266 | PerfCountersBuilder::PRIO_CRITICAL, unit_t(UNIT_BYTES)); | |
11fdf7f2 | 267 | b.add_u64_counter(l_bluefs_max_bytes_wal, "max_bytes_wal", |
20effc67 TL |
268 | "Maximum bytes allocated from WAL", |
269 | "mxwb", | |
270 | PerfCountersBuilder::PRIO_INTERESTING, | |
271 | unit_t(UNIT_BYTES)); | |
11fdf7f2 | 272 | b.add_u64_counter(l_bluefs_max_bytes_db, "max_bytes_db", |
20effc67 TL |
273 | "Maximum bytes allocated from DB", |
274 | "mxdb", | |
275 | PerfCountersBuilder::PRIO_INTERESTING, | |
276 | unit_t(UNIT_BYTES)); | |
11fdf7f2 | 277 | b.add_u64_counter(l_bluefs_max_bytes_slow, "max_bytes_slow", |
20effc67 TL |
278 | "Maximum bytes allocated from SLOW", |
279 | "mxwb", | |
280 | PerfCountersBuilder::PRIO_INTERESTING, | |
281 | unit_t(UNIT_BYTES)); | |
282 | b.add_u64_counter(l_bluefs_main_alloc_unit, "alloc_unit_main", | |
283 | "Allocation unit size (in bytes) for primary/shared device", | |
284 | "aumb", | |
285 | PerfCountersBuilder::PRIO_CRITICAL, | |
286 | unit_t(UNIT_BYTES)); | |
287 | b.add_u64_counter(l_bluefs_db_alloc_unit, "alloc_unit_db", | |
288 | "Allocation unit size (in bytes) for standalone DB device", | |
289 | "audb", | |
290 | PerfCountersBuilder::PRIO_CRITICAL, | |
291 | unit_t(UNIT_BYTES)); | |
292 | b.add_u64_counter(l_bluefs_wal_alloc_unit, "alloc_unit_wal", | |
293 | "Allocation unit size (in bytes) for standalone WAL device", | |
294 | "auwb", | |
295 | PerfCountersBuilder::PRIO_CRITICAL, | |
296 | unit_t(UNIT_BYTES)); | |
494da23a | 297 | b.add_u64_counter(l_bluefs_read_random_count, "read_random_count", |
20effc67 TL |
298 | "random read requests processed", |
299 | NULL, | |
300 | PerfCountersBuilder::PRIO_USEFUL); | |
494da23a | 301 | b.add_u64_counter(l_bluefs_read_random_bytes, "read_random_bytes", |
20effc67 TL |
302 | "Bytes requested in random read mode", |
303 | NULL, | |
494da23a TL |
304 | PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); |
305 | b.add_u64_counter(l_bluefs_read_random_disk_count, "read_random_disk_count", | |
20effc67 TL |
306 | "random reads requests going to disk", |
307 | NULL, | |
308 | PerfCountersBuilder::PRIO_USEFUL); | |
494da23a | 309 | b.add_u64_counter(l_bluefs_read_random_disk_bytes, "read_random_disk_bytes", |
20effc67 TL |
310 | "Bytes read from disk in random read mode", |
311 | "rrb", | |
312 | PerfCountersBuilder::PRIO_INTERESTING, | |
313 | unit_t(UNIT_BYTES)); | |
314 | b.add_u64_counter(l_bluefs_read_random_disk_bytes_wal, "read_random_disk_bytes_wal", | |
315 | "random reads requests going to WAL disk", | |
316 | NULL, | |
317 | PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); | |
318 | b.add_u64_counter(l_bluefs_read_random_disk_bytes_db, "read_random_disk_bytes_db", | |
319 | "random reads requests going to DB disk", | |
320 | NULL, | |
494da23a | 321 | PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); |
20effc67 TL |
322 | b.add_u64_counter(l_bluefs_read_random_disk_bytes_slow, "read_random_disk_bytes_slow", |
323 | "random reads requests going to main disk", | |
324 | "rrsb", | |
325 | PerfCountersBuilder::PRIO_INTERESTING, | |
326 | unit_t(UNIT_BYTES)); | |
494da23a | 327 | b.add_u64_counter(l_bluefs_read_random_buffer_count, "read_random_buffer_count", |
20effc67 TL |
328 | "random read requests processed using prefetch buffer", |
329 | NULL, | |
330 | PerfCountersBuilder::PRIO_USEFUL); | |
494da23a | 331 | b.add_u64_counter(l_bluefs_read_random_buffer_bytes, "read_random_buffer_bytes", |
20effc67 TL |
332 | "Bytes read from prefetch buffer in random read mode", |
333 | NULL, | |
494da23a | 334 | PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); |
494da23a | 335 | b.add_u64_counter(l_bluefs_read_count, "read_count", |
20effc67 TL |
336 | "buffered read requests processed", |
337 | NULL, | |
338 | PerfCountersBuilder::PRIO_USEFUL); | |
494da23a | 339 | b.add_u64_counter(l_bluefs_read_bytes, "read_bytes", |
20effc67 TL |
340 | "Bytes requested in buffered read mode", |
341 | NULL, | |
494da23a | 342 | PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); |
20effc67 TL |
343 | b.add_u64_counter(l_bluefs_read_disk_count, "read_disk_count", |
344 | "buffered reads requests going to disk", | |
345 | NULL, | |
346 | PerfCountersBuilder::PRIO_USEFUL); | |
347 | b.add_u64_counter(l_bluefs_read_disk_bytes, "read_disk_bytes", | |
348 | "Bytes read in buffered mode from disk", | |
349 | "rb", | |
350 | PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES)); | |
351 | b.add_u64_counter(l_bluefs_read_disk_bytes_wal, "read_disk_bytes_wal", | |
352 | "reads requests going to WAL disk", | |
353 | NULL, | |
354 | PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); | |
355 | b.add_u64_counter(l_bluefs_read_disk_bytes_db, "read_disk_bytes_db", | |
356 | "reads requests going to DB disk", | |
357 | NULL, | |
358 | PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); | |
359 | b.add_u64_counter(l_bluefs_read_disk_bytes_slow, "read_disk_bytes_slow", | |
360 | "reads requests going to main disk", | |
361 | "rsb", | |
362 | PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES)); | |
494da23a | 363 | b.add_u64_counter(l_bluefs_read_prefetch_count, "read_prefetch_count", |
20effc67 TL |
364 | "prefetch read requests processed", |
365 | NULL, | |
366 | PerfCountersBuilder::PRIO_USEFUL); | |
494da23a | 367 | b.add_u64_counter(l_bluefs_read_prefetch_bytes, "read_prefetch_bytes", |
20effc67 TL |
368 | "Bytes requested in prefetch read mode", |
369 | NULL, | |
494da23a | 370 | PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); |
cd265ab1 TL |
371 | b.add_u64(l_bluefs_read_zeros_candidate, "read_zeros_candidate", |
372 | "How many times bluefs read found page with all 0s"); | |
373 | b.add_u64(l_bluefs_read_zeros_errors, "read_zeros_errors", | |
374 | "How many times bluefs read found transient page with all 0s"); | |
494da23a | 375 | |
7c673cae FG |
376 | logger = b.create_perf_counters(); |
377 | cct->get_perfcounters_collection()->add(logger); | |
378 | } | |
379 | ||
380 | void BlueFS::_shutdown_logger() | |
381 | { | |
382 | cct->get_perfcounters_collection()->remove(logger); | |
383 | delete logger; | |
384 | } | |
385 | ||
386 | void BlueFS::_update_logger_stats() | |
387 | { | |
7c673cae | 388 | if (alloc[BDEV_WAL]) { |
f67539c2 TL |
389 | logger->set(l_bluefs_wal_total_bytes, _get_total(BDEV_WAL)); |
390 | logger->set(l_bluefs_wal_used_bytes, _get_used(BDEV_WAL)); | |
7c673cae FG |
391 | } |
392 | if (alloc[BDEV_DB]) { | |
f67539c2 TL |
393 | logger->set(l_bluefs_db_total_bytes, _get_total(BDEV_DB)); |
394 | logger->set(l_bluefs_db_used_bytes, _get_used(BDEV_DB)); | |
7c673cae FG |
395 | } |
396 | if (alloc[BDEV_SLOW]) { | |
f67539c2 TL |
397 | logger->set(l_bluefs_slow_total_bytes, _get_total(BDEV_SLOW)); |
398 | logger->set(l_bluefs_slow_used_bytes, _get_used(BDEV_SLOW)); | |
7c673cae FG |
399 | } |
400 | } | |
401 | ||
11fdf7f2 | 402 | int BlueFS::add_block_device(unsigned id, const string& path, bool trim, |
f67539c2 TL |
403 | uint64_t reserved, |
404 | bluefs_shared_alloc_context_t* _shared_alloc) | |
7c673cae | 405 | { |
f67539c2 TL |
406 | dout(10) << __func__ << " bdev " << id << " path " << path << " " |
407 | << reserved << dendl; | |
11fdf7f2 TL |
408 | ceph_assert(id < bdev.size()); |
409 | ceph_assert(bdev[id] == NULL); | |
410 | BlockDevice *b = BlockDevice::create(cct, path, NULL, NULL, | |
411 | discard_cb[id], static_cast<void*>(this)); | |
f67539c2 TL |
412 | block_reserved[id] = reserved; |
413 | if (_shared_alloc) { | |
11fdf7f2 TL |
414 | b->set_no_exclusive_lock(); |
415 | } | |
7c673cae FG |
416 | int r = b->open(path); |
417 | if (r < 0) { | |
418 | delete b; | |
419 | return r; | |
420 | } | |
11fdf7f2 TL |
421 | if (trim) { |
422 | b->discard(0, b->get_size()); | |
423 | } | |
424 | ||
7c673cae | 425 | dout(1) << __func__ << " bdev " << id << " path " << path |
1adf2230 | 426 | << " size " << byte_u_t(b->get_size()) << dendl; |
7c673cae FG |
427 | bdev[id] = b; |
428 | ioc[id] = new IOContext(cct, NULL); | |
f67539c2 TL |
429 | if (_shared_alloc) { |
430 | ceph_assert(!shared_alloc); | |
431 | shared_alloc = _shared_alloc; | |
432 | alloc[id] = shared_alloc->a; | |
433 | shared_alloc_id = id; | |
434 | } | |
7c673cae FG |
435 | return 0; |
436 | } | |
437 | ||
438 | bool BlueFS::bdev_support_label(unsigned id) | |
439 | { | |
11fdf7f2 TL |
440 | ceph_assert(id < bdev.size()); |
441 | ceph_assert(bdev[id]); | |
7c673cae FG |
442 | return bdev[id]->supported_bdev_label(); |
443 | } | |
444 | ||
f67539c2 | 445 | uint64_t BlueFS::get_block_device_size(unsigned id) const |
7c673cae FG |
446 | { |
447 | if (id < bdev.size() && bdev[id]) | |
448 | return bdev[id]->get_size(); | |
449 | return 0; | |
450 | } | |
451 | ||
f67539c2 | 452 | void BlueFS::handle_discard(unsigned id, interval_set<uint64_t>& to_release) |
7c673cae | 453 | { |
f67539c2 TL |
454 | dout(10) << __func__ << " bdev " << id << dendl; |
455 | ceph_assert(alloc[id]); | |
456 | alloc[id]->release(to_release); | |
457 | if (is_shared_alloc(id)) { | |
458 | shared_alloc->bluefs_used -= to_release.size(); | |
7c673cae | 459 | } |
7c673cae FG |
460 | } |
461 | ||
f67539c2 | 462 | uint64_t BlueFS::get_used() |
7c673cae | 463 | { |
f67539c2 TL |
464 | uint64_t used = 0; |
465 | for (unsigned id = 0; id < MAX_BDEV; ++id) { | |
466 | used += _get_used(id); | |
7c673cae | 467 | } |
f67539c2 TL |
468 | return used; |
469 | } | |
7c673cae | 470 | |
f67539c2 TL |
471 | uint64_t BlueFS::_get_used(unsigned id) const |
472 | { | |
473 | uint64_t used = 0; | |
474 | if (!alloc[id]) | |
475 | return 0; | |
9f95a23c | 476 | |
f67539c2 TL |
477 | if (is_shared_alloc(id)) { |
478 | used = shared_alloc->bluefs_used; | |
479 | } else { | |
480 | used = _get_total(id) - alloc[id]->get_free(); | |
9f95a23c | 481 | } |
f67539c2 | 482 | return used; |
7c673cae FG |
483 | } |
484 | ||
f67539c2 | 485 | uint64_t BlueFS::get_used(unsigned id) |
7c673cae | 486 | { |
f67539c2 | 487 | ceph_assert(id < alloc.size()); |
11fdf7f2 | 488 | ceph_assert(alloc[id]); |
f67539c2 | 489 | return _get_used(id); |
11fdf7f2 TL |
490 | } |
491 | ||
f67539c2 | 492 | uint64_t BlueFS::_get_total(unsigned id) const |
11fdf7f2 | 493 | { |
f67539c2 TL |
494 | ceph_assert(id < bdev.size()); |
495 | ceph_assert(id < block_reserved.size()); | |
496 | return get_block_device_size(id) - block_reserved[id]; | |
7c673cae FG |
497 | } |
498 | ||
499 | uint64_t BlueFS::get_total(unsigned id) | |
500 | { | |
f67539c2 | 501 | return _get_total(id); |
7c673cae FG |
502 | } |
503 | ||
504 | uint64_t BlueFS::get_free(unsigned id) | |
505 | { | |
11fdf7f2 | 506 | ceph_assert(id < alloc.size()); |
7c673cae FG |
507 | return alloc[id]->get_free(); |
508 | } | |
509 | ||
510 | void BlueFS::dump_perf_counters(Formatter *f) | |
511 | { | |
512 | f->open_object_section("bluefs_perf_counters"); | |
513 | logger->dump_formatted(f,0); | |
514 | f->close_section(); | |
515 | } | |
516 | ||
3efd9988 FG |
517 | void BlueFS::dump_block_extents(ostream& out) |
518 | { | |
519 | for (unsigned i = 0; i < MAX_BDEV; ++i) { | |
520 | if (!bdev[i]) { | |
521 | continue; | |
522 | } | |
f67539c2 | 523 | auto total = get_total(i); |
11fdf7f2 | 524 | auto free = get_free(i); |
1911f103 | 525 | |
f67539c2 TL |
526 | out << i << " : device size 0x" << std::hex << total |
527 | << " : using 0x" << total - free | |
528 | << std::dec << "(" << byte_u_t(total - free) << ")"; | |
1911f103 | 529 | out << "\n"; |
3efd9988 FG |
530 | } |
531 | } | |
7c673cae | 532 | |
7c673cae FG |
533 | int BlueFS::get_block_extents(unsigned id, interval_set<uint64_t> *extents) |
534 | { | |
20effc67 | 535 | std::lock_guard nl(nodes.lock); |
7c673cae | 536 | dout(10) << __func__ << " bdev " << id << dendl; |
f67539c2 | 537 | ceph_assert(id < alloc.size()); |
20effc67 | 538 | for (auto& p : nodes.file_map) { |
f67539c2 TL |
539 | for (auto& q : p.second->fnode.extents) { |
540 | if (q.bdev == id) { | |
541 | extents->insert(q.offset, q.length); | |
542 | } | |
543 | } | |
544 | } | |
7c673cae FG |
545 | return 0; |
546 | } | |
547 | ||
9f95a23c | 548 | int BlueFS::mkfs(uuid_d osd_uuid, const bluefs_layout_t& layout) |
7c673cae | 549 | { |
7c673cae FG |
550 | dout(1) << __func__ |
551 | << " osd_uuid " << osd_uuid | |
552 | << dendl; | |
553 | ||
9f95a23c TL |
554 | // set volume selector if not provided before/outside |
555 | if (vselector == nullptr) { | |
556 | vselector.reset( | |
557 | new OriginalVolumeSelector( | |
558 | get_block_device_size(BlueFS::BDEV_WAL) * 95 / 100, | |
559 | get_block_device_size(BlueFS::BDEV_DB) * 95 / 100, | |
560 | get_block_device_size(BlueFS::BDEV_SLOW) * 95 / 100)); | |
561 | } | |
562 | ||
7c673cae | 563 | _init_logger(); |
20effc67 | 564 | _init_alloc(); |
7c673cae FG |
565 | |
566 | super.version = 1; | |
567 | super.block_size = bdev[BDEV_DB]->get_block_size(); | |
568 | super.osd_uuid = osd_uuid; | |
569 | super.uuid.generate_random(); | |
570 | dout(1) << __func__ << " uuid " << super.uuid << dendl; | |
571 | ||
572 | // init log | |
9f95a23c | 573 | FileRef log_file = ceph::make_ref<File>(); |
7c673cae | 574 | log_file->fnode.ino = 1; |
f6b5b4d7 | 575 | log_file->vselector_hint = vselector->get_hint_for_log(); |
7c673cae | 576 | int r = _allocate( |
9f95a23c | 577 | vselector->select_prefer_bdev(log_file->vselector_hint), |
7c673cae | 578 | cct->_conf->bluefs_max_log_runway, |
94b18763 | 579 | &log_file->fnode); |
9f95a23c | 580 | vselector->add_usage(log_file->vselector_hint, log_file->fnode); |
11fdf7f2 | 581 | ceph_assert(r == 0); |
20effc67 | 582 | log.writer = _create_writer(log_file); |
7c673cae FG |
583 | |
584 | // initial txn | |
20effc67 TL |
585 | ceph_assert(log.seq_live == 1); |
586 | log.t.seq = 1; | |
587 | log.t.op_init(); | |
588 | _flush_and_sync_log_LD(); | |
7c673cae FG |
589 | |
590 | // write supers | |
591 | super.log_fnode = log_file->fnode; | |
9f95a23c | 592 | super.memorized_layout = layout; |
11fdf7f2 | 593 | _write_super(BDEV_DB); |
20effc67 | 594 | _flush_bdev(); |
7c673cae FG |
595 | |
596 | // clean up | |
597 | super = bluefs_super_t(); | |
20effc67 TL |
598 | _close_writer(log.writer); |
599 | log.writer = NULL; | |
9f95a23c | 600 | vselector.reset(nullptr); |
7c673cae FG |
601 | _stop_alloc(); |
602 | _shutdown_logger(); | |
f67539c2 TL |
603 | if (shared_alloc) { |
604 | ceph_assert(shared_alloc->need_init); | |
605 | shared_alloc->need_init = false; | |
606 | } | |
7c673cae FG |
607 | |
608 | dout(10) << __func__ << " success" << dendl; | |
609 | return 0; | |
610 | } | |
611 | ||
612 | void BlueFS::_init_alloc() | |
613 | { | |
614 | dout(20) << __func__ << dendl; | |
eafe8130 | 615 | |
20effc67 | 616 | size_t wal_alloc_size = 0; |
eafe8130 | 617 | if (bdev[BDEV_WAL]) { |
20effc67 TL |
618 | wal_alloc_size = cct->_conf->bluefs_alloc_size; |
619 | alloc_size[BDEV_WAL] = wal_alloc_size; | |
eafe8130 | 620 | } |
20effc67 TL |
621 | logger->set(l_bluefs_wal_alloc_unit, wal_alloc_size); |
622 | ||
eafe8130 TL |
623 | if (bdev[BDEV_SLOW]) { |
624 | alloc_size[BDEV_DB] = cct->_conf->bluefs_alloc_size; | |
625 | alloc_size[BDEV_SLOW] = cct->_conf->bluefs_shared_alloc_size; | |
20effc67 TL |
626 | logger->set(l_bluefs_db_alloc_unit, cct->_conf->bluefs_alloc_size); |
627 | logger->set(l_bluefs_main_alloc_unit, cct->_conf->bluefs_shared_alloc_size); | |
eafe8130 TL |
628 | } else { |
629 | alloc_size[BDEV_DB] = cct->_conf->bluefs_shared_alloc_size; | |
20effc67 TL |
630 | logger->set(l_bluefs_main_alloc_unit, 0); |
631 | logger->set(l_bluefs_db_alloc_unit, cct->_conf->bluefs_shared_alloc_size); | |
eafe8130 TL |
632 | } |
633 | // new wal and db devices are never shared | |
634 | if (bdev[BDEV_NEWWAL]) { | |
635 | alloc_size[BDEV_NEWWAL] = cct->_conf->bluefs_alloc_size; | |
636 | } | |
637 | if (bdev[BDEV_NEWDB]) { | |
638 | alloc_size[BDEV_NEWDB] = cct->_conf->bluefs_alloc_size; | |
639 | } | |
640 | ||
7c673cae FG |
641 | for (unsigned id = 0; id < bdev.size(); ++id) { |
642 | if (!bdev[id]) { | |
643 | continue; | |
644 | } | |
11fdf7f2 | 645 | ceph_assert(bdev[id]->get_size()); |
eafe8130 | 646 | ceph_assert(alloc_size[id]); |
f67539c2 TL |
647 | if (is_shared_alloc(id)) { |
648 | dout(1) << __func__ << " shared, id " << id << std::hex | |
649 | << ", capacity 0x" << bdev[id]->get_size() | |
650 | << ", block size 0x" << alloc_size[id] | |
651 | << std::dec << dendl; | |
652 | } else { | |
653 | std::string name = "bluefs-"; | |
654 | const char* devnames[] = { "wal","db","slow" }; | |
655 | if (id <= BDEV_SLOW) | |
656 | name += devnames[id]; | |
657 | else | |
658 | name += to_string(uintptr_t(this)); | |
659 | dout(1) << __func__ << " new, id " << id << std::hex | |
660 | << ", allocator name " << name | |
661 | << ", allocator type " << cct->_conf->bluefs_allocator | |
662 | << ", capacity 0x" << bdev[id]->get_size() | |
663 | << ", block size 0x" << alloc_size[id] | |
664 | << std::dec << dendl; | |
665 | alloc[id] = Allocator::create(cct, cct->_conf->bluefs_allocator, | |
666 | bdev[id]->get_size(), | |
20effc67 TL |
667 | alloc_size[id], |
668 | 0, 0, | |
669 | name); | |
f67539c2 TL |
670 | alloc[id]->init_add_free( |
671 | block_reserved[id], | |
672 | _get_total(id)); | |
7c673cae FG |
673 | } |
674 | } | |
675 | } | |
676 | ||
677 | void BlueFS::_stop_alloc() | |
678 | { | |
679 | dout(20) << __func__ << dendl; | |
11fdf7f2 TL |
680 | for (auto p : bdev) { |
681 | if (p) | |
682 | p->discard_drain(); | |
683 | } | |
684 | ||
f67539c2 TL |
685 | for (size_t i = 0; i < alloc.size(); ++i) { |
686 | if (alloc[i] && !is_shared_alloc(i)) { | |
687 | alloc[i]->shutdown(); | |
688 | delete alloc[i]; | |
689 | alloc[i] = nullptr; | |
7c673cae FG |
690 | } |
691 | } | |
7c673cae FG |
692 | } |
693 | ||
20effc67 TL |
694 | int BlueFS::_read_and_check(uint8_t ndev, uint64_t off, uint64_t len, |
695 | ceph::buffer::list *pbl, IOContext *ioc, bool buffered) | |
cd265ab1 TL |
696 | { |
697 | dout(10) << __func__ << " dev " << int(ndev) | |
698 | << ": 0x" << std::hex << off << "~" << len << std::dec | |
699 | << (buffered ? " buffered" : "") | |
700 | << dendl; | |
701 | int r; | |
702 | bufferlist bl; | |
20effc67 | 703 | r = _bdev_read(ndev, off, len, &bl, ioc, buffered); |
cd265ab1 TL |
704 | if (r != 0) { |
705 | return r; | |
706 | } | |
707 | uint64_t block_size = bdev[ndev]->get_block_size(); | |
708 | if (inject_read_zeros) { | |
709 | if (len >= block_size * 2) { | |
710 | derr << __func__ << " injecting error, zeros at " | |
711 | << int(ndev) << ": 0x" << std::hex << (off + len / 2) | |
712 | << "~" << (block_size * 2) << std::dec << dendl; | |
713 | //use beginning, replace 8K in the middle with zeros, use tail | |
714 | bufferlist temp; | |
715 | bl.splice(0, len / 2 - block_size, &temp); | |
f67539c2 | 716 | temp.append(buffer::create(block_size * 2, 0)); |
cd265ab1 TL |
717 | bl.splice(block_size * 2, len / 2 - block_size, &temp); |
718 | bl = temp; | |
719 | inject_read_zeros--; | |
720 | } | |
721 | } | |
722 | //make a check if there is a block with all 0 | |
723 | uint64_t to_check_len = len; | |
724 | uint64_t skip = p2nphase(off, block_size); | |
725 | if (skip >= to_check_len) { | |
726 | return r; | |
727 | } | |
728 | auto it = bl.begin(skip); | |
729 | to_check_len -= skip; | |
730 | bool all_zeros = false; | |
731 | while (all_zeros == false && to_check_len >= block_size) { | |
732 | // checking 0s step | |
733 | unsigned block_left = block_size; | |
734 | unsigned avail; | |
735 | const char* data; | |
736 | all_zeros = true; | |
737 | while (all_zeros && block_left > 0) { | |
738 | avail = it.get_ptr_and_advance(block_left, &data); | |
739 | block_left -= avail; | |
740 | all_zeros = mem_is_zero(data, avail); | |
741 | } | |
742 | // skipping step | |
743 | while (block_left > 0) { | |
744 | avail = it.get_ptr_and_advance(block_left, &data); | |
745 | block_left -= avail; | |
746 | } | |
747 | to_check_len -= block_size; | |
748 | } | |
749 | if (all_zeros) { | |
750 | logger->inc(l_bluefs_read_zeros_candidate, 1); | |
751 | bufferlist bl_reread; | |
20effc67 | 752 | r = _bdev_read(ndev, off, len, &bl_reread, ioc, buffered); |
cd265ab1 TL |
753 | if (r != 0) { |
754 | return r; | |
755 | } | |
756 | // check if both read gave the same | |
757 | if (!bl.contents_equal(bl_reread)) { | |
758 | // report problems to log, but continue, maybe it will be good now... | |
759 | derr << __func__ << " initial read of " << int(ndev) | |
760 | << ": 0x" << std::hex << off << "~" << len | |
761 | << std::dec << ": different then re-read " << dendl; | |
762 | logger->inc(l_bluefs_read_zeros_errors, 1); | |
763 | } | |
764 | // use second read will be better if is different | |
765 | pbl->append(bl_reread); | |
766 | } else { | |
767 | pbl->append(bl); | |
768 | } | |
769 | return r; | |
770 | } | |
771 | ||
20effc67 TL |
772 | int BlueFS::_read_random_and_check( |
773 | uint8_t ndev, uint64_t off, uint64_t len, char *buf, bool buffered) | |
cd265ab1 TL |
774 | { |
775 | dout(10) << __func__ << " dev " << int(ndev) | |
776 | << ": 0x" << std::hex << off << "~" << len << std::dec | |
777 | << (buffered ? " buffered" : "") | |
778 | << dendl; | |
779 | int r; | |
20effc67 | 780 | r = _bdev_read_random(ndev, off, len, buf, buffered); |
cd265ab1 TL |
781 | if (r != 0) { |
782 | return r; | |
783 | } | |
784 | uint64_t block_size = bdev[ndev]->get_block_size(); | |
785 | if (inject_read_zeros) { | |
786 | if (len >= block_size * 2) { | |
787 | derr << __func__ << " injecting error, zeros at " | |
788 | << int(ndev) << ": 0x" << std::hex << (off + len / 2) | |
789 | << "~" << (block_size * 2) << std::dec << dendl; | |
790 | //zero middle 8K | |
791 | memset(buf + len / 2 - block_size, 0, block_size * 2); | |
792 | inject_read_zeros--; | |
793 | } | |
794 | } | |
795 | //make a check if there is a block with all 0 | |
796 | uint64_t to_check_len = len; | |
797 | const char* data = buf; | |
798 | uint64_t skip = p2nphase(off, block_size); | |
799 | if (skip >= to_check_len) { | |
800 | return r; | |
801 | } | |
802 | to_check_len -= skip; | |
803 | data += skip; | |
804 | ||
805 | bool all_zeros = false; | |
806 | while (all_zeros == false && to_check_len >= block_size) { | |
807 | if (mem_is_zero(data, block_size)) { | |
808 | // at least one block is all zeros | |
809 | all_zeros = true; | |
810 | break; | |
811 | } | |
812 | data += block_size; | |
813 | to_check_len -= block_size; | |
814 | } | |
815 | if (all_zeros) { | |
816 | logger->inc(l_bluefs_read_zeros_candidate, 1); | |
817 | std::unique_ptr<char[]> data_reread(new char[len]); | |
20effc67 | 818 | r = _bdev_read_random(ndev, off, len, &data_reread[0], buffered); |
cd265ab1 TL |
819 | if (r != 0) { |
820 | return r; | |
821 | } | |
822 | // check if both read gave the same | |
823 | if (memcmp(buf, &data_reread[0], len) != 0) { | |
824 | derr << __func__ << " initial read of " << int(ndev) | |
825 | << ": 0x" << std::hex << off << "~" << len | |
826 | << std::dec << ": different then re-read " << dendl; | |
827 | logger->inc(l_bluefs_read_zeros_errors, 1); | |
828 | // second read is probably better | |
829 | memcpy(buf, &data_reread[0], len); | |
830 | } | |
831 | } | |
832 | return r; | |
833 | } | |
834 | ||
20effc67 TL |
835 | int BlueFS::_bdev_read(uint8_t ndev, uint64_t off, uint64_t len, |
836 | ceph::buffer::list* pbl, IOContext* ioc, bool buffered) | |
837 | { | |
838 | int cnt = 0; | |
839 | switch (ndev) { | |
840 | case BDEV_WAL: cnt = l_bluefs_read_disk_bytes_wal; break; | |
841 | case BDEV_DB: cnt = l_bluefs_read_disk_bytes_db; break; | |
842 | case BDEV_SLOW: cnt = l_bluefs_read_disk_bytes_slow; break; | |
843 | ||
844 | } | |
845 | if (cnt) { | |
846 | logger->inc(cnt, len); | |
847 | } | |
848 | return bdev[ndev]->read(off, len, pbl, ioc, buffered); | |
849 | } | |
850 | ||
851 | int BlueFS::_bdev_read_random(uint8_t ndev, uint64_t off, uint64_t len, | |
852 | char* buf, bool buffered) | |
853 | { | |
854 | int cnt = 0; | |
855 | switch (ndev) { | |
856 | case BDEV_WAL: cnt = l_bluefs_read_random_disk_bytes_wal; break; | |
857 | case BDEV_DB: cnt = l_bluefs_read_random_disk_bytes_db; break; | |
858 | case BDEV_SLOW: cnt = l_bluefs_read_random_disk_bytes_slow; break; | |
859 | } | |
860 | if (cnt) { | |
861 | logger->inc(cnt, len); | |
862 | } | |
863 | return bdev[ndev]->read_random(off, len, buf, buffered); | |
864 | } | |
865 | ||
7c673cae FG |
866 | int BlueFS::mount() |
867 | { | |
868 | dout(1) << __func__ << dendl; | |
869 | ||
20effc67 | 870 | _init_logger(); |
7c673cae FG |
871 | int r = _open_super(); |
872 | if (r < 0) { | |
873 | derr << __func__ << " failed to open super: " << cpp_strerror(r) << dendl; | |
874 | goto out; | |
875 | } | |
876 | ||
9f95a23c TL |
877 | // set volume selector if not provided before/outside |
878 | if (vselector == nullptr) { | |
879 | vselector.reset( | |
880 | new OriginalVolumeSelector( | |
881 | get_block_device_size(BlueFS::BDEV_WAL) * 95 / 100, | |
882 | get_block_device_size(BlueFS::BDEV_DB) * 95 / 100, | |
883 | get_block_device_size(BlueFS::BDEV_SLOW) * 95 / 100)); | |
884 | } | |
885 | ||
7c673cae FG |
886 | _init_alloc(); |
887 | ||
11fdf7f2 | 888 | r = _replay(false, false); |
7c673cae FG |
889 | if (r < 0) { |
890 | derr << __func__ << " failed to replay log: " << cpp_strerror(r) << dendl; | |
891 | _stop_alloc(); | |
892 | goto out; | |
893 | } | |
894 | ||
895 | // init freelist | |
20effc67 | 896 | for (auto& p : nodes.file_map) { |
7c673cae FG |
897 | dout(30) << __func__ << " noting alloc for " << p.second->fnode << dendl; |
898 | for (auto& q : p.second->fnode.extents) { | |
f67539c2 TL |
899 | bool is_shared = is_shared_alloc(q.bdev); |
900 | ceph_assert(!is_shared || (is_shared && shared_alloc)); | |
901 | if (is_shared && shared_alloc->need_init && shared_alloc->a) { | |
902 | shared_alloc->bluefs_used += q.length; | |
903 | alloc[q.bdev]->init_rm_free(q.offset, q.length); | |
904 | } else if (!is_shared) { | |
905 | alloc[q.bdev]->init_rm_free(q.offset, q.length); | |
906 | } | |
7c673cae FG |
907 | } |
908 | } | |
f67539c2 TL |
909 | if (shared_alloc) { |
910 | shared_alloc->need_init = false; | |
911 | dout(1) << __func__ << " shared_bdev_used = " | |
912 | << shared_alloc->bluefs_used << dendl; | |
913 | } else { | |
914 | dout(1) << __func__ << " shared bdev not used" | |
915 | << dendl; | |
916 | } | |
7c673cae FG |
917 | |
918 | // set up the log for future writes | |
20effc67 TL |
919 | log.writer = _create_writer(_get_file(1)); |
920 | ceph_assert(log.writer->file->fnode.ino == 1); | |
921 | log.writer->pos = log.writer->file->fnode.size; | |
922 | log.writer->file->fnode.reset_delta(); | |
7c673cae | 923 | dout(10) << __func__ << " log write pos set to 0x" |
20effc67 | 924 | << std::hex << log.writer->pos << std::dec |
7c673cae | 925 | << dendl; |
20effc67 TL |
926 | // update log size |
927 | logger->set(l_bluefs_log_bytes, log.writer->file->fnode.size); | |
7c673cae FG |
928 | return 0; |
929 | ||
930 | out: | |
931 | super = bluefs_super_t(); | |
932 | return r; | |
933 | } | |
934 | ||
9f95a23c TL |
935 | int BlueFS::maybe_verify_layout(const bluefs_layout_t& layout) const |
936 | { | |
937 | if (super.memorized_layout) { | |
938 | if (layout == *super.memorized_layout) { | |
939 | dout(10) << __func__ << " bluefs layout verified positively" << dendl; | |
940 | } else { | |
941 | derr << __func__ << " memorized layout doesn't fit current one" << dendl; | |
942 | return -EIO; | |
943 | } | |
944 | } else { | |
945 | dout(10) << __func__ << " no memorized_layout in bluefs superblock" | |
946 | << dendl; | |
947 | } | |
948 | ||
949 | return 0; | |
950 | } | |
951 | ||
1911f103 | 952 | void BlueFS::umount(bool avoid_compact) |
7c673cae FG |
953 | { |
954 | dout(1) << __func__ << dendl; | |
955 | ||
1911f103 | 956 | sync_metadata(avoid_compact); |
20effc67 TL |
957 | if (cct->_conf->bluefs_check_volume_selector_on_umount) { |
958 | _check_vselector_LNF(); | |
959 | } | |
960 | _close_writer(log.writer); | |
961 | log.writer = NULL; | |
962 | log.t.clear(); | |
7c673cae | 963 | |
9f95a23c | 964 | vselector.reset(nullptr); |
7c673cae | 965 | _stop_alloc(); |
20effc67 TL |
966 | nodes.file_map.clear(); |
967 | nodes.dir_map.clear(); | |
7c673cae | 968 | super = bluefs_super_t(); |
7c673cae FG |
969 | _shutdown_logger(); |
970 | } | |
971 | ||
9f95a23c | 972 | int BlueFS::prepare_new_device(int id, const bluefs_layout_t& layout) |
7c673cae | 973 | { |
11fdf7f2 TL |
974 | dout(1) << __func__ << dendl; |
975 | ||
976 | if(id == BDEV_NEWDB) { | |
977 | int new_log_dev_cur = BDEV_WAL; | |
978 | int new_log_dev_next = BDEV_WAL; | |
979 | if (!bdev[BDEV_WAL]) { | |
980 | new_log_dev_cur = BDEV_NEWDB; | |
981 | new_log_dev_next = BDEV_DB; | |
982 | } | |
20effc67 | 983 | _rewrite_log_and_layout_sync_LNF_LD(false, |
11fdf7f2 TL |
984 | BDEV_NEWDB, |
985 | new_log_dev_cur, | |
986 | new_log_dev_next, | |
9f95a23c TL |
987 | RENAME_DB2SLOW, |
988 | layout); | |
11fdf7f2 TL |
989 | //} |
990 | } else if(id == BDEV_NEWWAL) { | |
20effc67 | 991 | _rewrite_log_and_layout_sync_LNF_LD(false, |
9f95a23c TL |
992 | BDEV_DB, |
993 | BDEV_NEWWAL, | |
994 | BDEV_WAL, | |
995 | REMOVE_WAL, | |
996 | layout); | |
11fdf7f2 TL |
997 | } else { |
998 | assert(false); | |
999 | } | |
1000 | return 0; | |
1001 | } | |
1002 | ||
1003 | void BlueFS::collect_metadata(map<string,string> *pm, unsigned skip_bdev_id) | |
1004 | { | |
1005 | if (skip_bdev_id != BDEV_DB && bdev[BDEV_DB]) | |
7c673cae FG |
1006 | bdev[BDEV_DB]->collect_metadata("bluefs_db_", pm); |
1007 | if (bdev[BDEV_WAL]) | |
1008 | bdev[BDEV_WAL]->collect_metadata("bluefs_wal_", pm); | |
11fdf7f2 TL |
1009 | } |
1010 | ||
1011 | void BlueFS::get_devices(set<string> *ls) | |
1012 | { | |
1013 | for (unsigned i = 0; i < MAX_BDEV; ++i) { | |
1014 | if (bdev[i]) { | |
1015 | bdev[i]->get_devices(ls); | |
1016 | } | |
1017 | } | |
7c673cae FG |
1018 | } |
1019 | ||
1020 | int BlueFS::fsck() | |
1021 | { | |
7c673cae FG |
1022 | dout(1) << __func__ << dendl; |
1023 | // hrm, i think we check everything on mount... | |
1024 | return 0; | |
1025 | } | |
1026 | ||
11fdf7f2 | 1027 | int BlueFS::_write_super(int dev) |
7c673cae FG |
1028 | { |
1029 | // build superblock | |
1030 | bufferlist bl; | |
11fdf7f2 | 1031 | encode(super, bl); |
7c673cae | 1032 | uint32_t crc = bl.crc32c(-1); |
11fdf7f2 | 1033 | encode(crc, bl); |
7c673cae FG |
1034 | dout(10) << __func__ << " super block length(encoded): " << bl.length() << dendl; |
1035 | dout(10) << __func__ << " superblock " << super.version << dendl; | |
1036 | dout(10) << __func__ << " log_fnode " << super.log_fnode << dendl; | |
9f95a23c | 1037 | ceph_assert_always(bl.length() <= get_super_length()); |
7c673cae FG |
1038 | bl.append_zero(get_super_length() - bl.length()); |
1039 | ||
11fdf7f2 | 1040 | bdev[dev]->write(get_super_offset(), bl, false, WRITE_LIFE_SHORT); |
7c673cae FG |
1041 | dout(20) << __func__ << " v " << super.version |
1042 | << " crc 0x" << std::hex << crc | |
1043 | << " offset 0x" << get_super_offset() << std::dec | |
1044 | << dendl; | |
1045 | return 0; | |
1046 | } | |
1047 | ||
1048 | int BlueFS::_open_super() | |
1049 | { | |
1050 | dout(10) << __func__ << dendl; | |
1051 | ||
1052 | bufferlist bl; | |
1053 | uint32_t expected_crc, crc; | |
1054 | int r; | |
1055 | ||
1056 | // always the second block | |
20effc67 TL |
1057 | r = _bdev_read(BDEV_DB, get_super_offset(), get_super_length(), |
1058 | &bl, ioc[BDEV_DB], false); | |
7c673cae FG |
1059 | if (r < 0) |
1060 | return r; | |
1061 | ||
11fdf7f2 TL |
1062 | auto p = bl.cbegin(); |
1063 | decode(super, p); | |
7c673cae FG |
1064 | { |
1065 | bufferlist t; | |
1066 | t.substr_of(bl, 0, p.get_off()); | |
1067 | crc = t.crc32c(-1); | |
1068 | } | |
11fdf7f2 | 1069 | decode(expected_crc, p); |
7c673cae FG |
1070 | if (crc != expected_crc) { |
1071 | derr << __func__ << " bad crc on superblock, expected 0x" | |
1072 | << std::hex << expected_crc << " != actual 0x" << crc << std::dec | |
1073 | << dendl; | |
1074 | return -EIO; | |
1075 | } | |
1076 | dout(10) << __func__ << " superblock " << super.version << dendl; | |
1077 | dout(10) << __func__ << " log_fnode " << super.log_fnode << dendl; | |
1078 | return 0; | |
1079 | } | |
1080 | ||
20effc67 TL |
1081 | int BlueFS::_check_allocations(const bluefs_fnode_t& fnode, |
1082 | boost::dynamic_bitset<uint64_t>* used_blocks, | |
1083 | bool is_alloc, //true when allocating, false when deallocating | |
1084 | const char* op_name) | |
9f95a23c TL |
1085 | { |
1086 | auto& fnode_extents = fnode.extents; | |
1087 | for (auto e : fnode_extents) { | |
1088 | auto id = e.bdev; | |
1089 | bool fail = false; | |
20effc67 TL |
1090 | ceph_assert(id < MAX_BDEV); |
1091 | if (int r = _verify_alloc_granularity(id, e.offset, e.length, | |
1092 | op_name); r < 0) { | |
1093 | return r; | |
1094 | } | |
9f95a23c TL |
1095 | |
1096 | apply_for_bitset_range(e.offset, e.length, alloc_size[id], used_blocks[id], | |
1097 | [&](uint64_t pos, boost::dynamic_bitset<uint64_t> &bs) { | |
20effc67 TL |
1098 | if (is_alloc == bs.test(pos)) { |
1099 | fail = true; | |
1100 | } else { | |
1101 | bs.flip(pos); | |
1102 | } | |
9f95a23c TL |
1103 | } |
1104 | ); | |
1105 | if (fail) { | |
20effc67 TL |
1106 | derr << __func__ << " " << op_name << " invalid extent " << int(e.bdev) |
1107 | << ": 0x" << std::hex << e.offset << "~" << e.length << std::dec | |
1108 | << (is_alloc == true ? | |
1109 | ": duplicate reference, ino " : ": double free, ino ") | |
1110 | << fnode.ino << dendl; | |
9f95a23c TL |
1111 | return -EFAULT; |
1112 | } | |
1113 | } | |
1114 | return 0; | |
1115 | } | |
1116 | ||
9f95a23c TL |
1117 | int BlueFS::_verify_alloc_granularity( |
1118 | __u8 id, uint64_t offset, uint64_t length, const char *op) | |
1119 | { | |
1120 | if ((offset & (alloc_size[id] - 1)) || | |
1121 | (length & (alloc_size[id] - 1))) { | |
1122 | derr << __func__ << " " << op << " of " << (int)id | |
1123 | << ":0x" << std::hex << offset << "~" << length << std::dec | |
1124 | << " does not align to alloc_size 0x" | |
1125 | << std::hex << alloc_size[id] << std::dec << dendl; | |
1126 | // be helpful | |
1127 | auto need = alloc_size[id]; | |
1128 | while (need && ((offset & (need - 1)) || | |
1129 | (length & (need - 1)))) { | |
1130 | need >>= 1; | |
1131 | } | |
1132 | if (need) { | |
1133 | const char *which; | |
1134 | if (id == BDEV_SLOW || | |
1135 | (id == BDEV_DB && !bdev[BDEV_SLOW])) { | |
1136 | which = "bluefs_shared_alloc_size"; | |
1137 | } else { | |
1138 | which = "bluefs_alloc_size"; | |
1139 | } | |
1140 | derr << "work-around by setting " << which << " = " << need | |
1141 | << " for this OSD" << dendl; | |
1142 | } | |
1143 | return -EFAULT; | |
1144 | } | |
1145 | return 0; | |
1146 | } | |
1147 | ||
11fdf7f2 | 1148 | int BlueFS::_replay(bool noop, bool to_stdout) |
7c673cae FG |
1149 | { |
1150 | dout(10) << __func__ << (noop ? " NO-OP" : "") << dendl; | |
1151 | ino_last = 1; // by the log | |
20effc67 | 1152 | uint64_t log_seq = 0; |
7c673cae FG |
1153 | |
1154 | FileRef log_file; | |
11fdf7f2 | 1155 | log_file = _get_file(1); |
9f95a23c | 1156 | |
f67539c2 | 1157 | log_file->fnode = super.log_fnode; |
11fdf7f2 | 1158 | if (!noop) { |
9f95a23c | 1159 | log_file->vselector_hint = |
f6b5b4d7 | 1160 | vselector->get_hint_for_log(); |
7c673cae | 1161 | } else { |
11fdf7f2 TL |
1162 | // do not use fnode from superblock in 'noop' mode - log_file's one should |
1163 | // be fine and up-to-date | |
1164 | ceph_assert(log_file->fnode.ino == 1); | |
1165 | ceph_assert(log_file->fnode.extents.size() != 0); | |
7c673cae | 1166 | } |
7c673cae | 1167 | dout(10) << __func__ << " log_fnode " << super.log_fnode << dendl; |
11fdf7f2 TL |
1168 | if (unlikely(to_stdout)) { |
1169 | std::cout << " log_fnode " << super.log_fnode << std::endl; | |
1170 | } | |
7c673cae FG |
1171 | |
1172 | FileReader *log_reader = new FileReader( | |
1173 | log_file, cct->_conf->bluefs_max_prefetch, | |
1174 | false, // !random | |
1175 | true); // ignore eof | |
9f95a23c TL |
1176 | |
1177 | bool seen_recs = false; | |
1178 | ||
1179 | boost::dynamic_bitset<uint64_t> used_blocks[MAX_BDEV]; | |
9f95a23c | 1180 | |
f67539c2 TL |
1181 | if (!noop) { |
1182 | if (cct->_conf->bluefs_log_replay_check_allocations) { | |
1183 | for (size_t i = 0; i < MAX_BDEV; ++i) { | |
1184 | if (alloc_size[i] != 0 && bdev[i] != nullptr) { | |
1185 | used_blocks[i].resize(round_up_to(bdev[i]->get_size(), alloc_size[i]) / alloc_size[i]); | |
1186 | } | |
9f95a23c | 1187 | } |
20effc67 TL |
1188 | // check initial log layout |
1189 | int r = _check_allocations(log_file->fnode, | |
1190 | used_blocks, true, "Log from super"); | |
1191 | if (r < 0) { | |
1192 | return r; | |
1193 | } | |
9f95a23c TL |
1194 | } |
1195 | } | |
1196 | ||
7c673cae | 1197 | while (true) { |
11fdf7f2 | 1198 | ceph_assert((log_reader->buf.pos & ~super.block_mask()) == 0); |
7c673cae FG |
1199 | uint64_t pos = log_reader->buf.pos; |
1200 | uint64_t read_pos = pos; | |
1201 | bufferlist bl; | |
1202 | { | |
f67539c2 | 1203 | int r = _read(log_reader, read_pos, super.block_size, |
7c673cae | 1204 | &bl, NULL); |
f6b5b4d7 | 1205 | if (r != (int)super.block_size && cct->_conf->bluefs_replay_recovery) { |
20effc67 | 1206 | r += _do_replay_recovery_read(log_reader, pos, read_pos + r, super.block_size - r, &bl); |
f6b5b4d7 TL |
1207 | } |
1208 | assert(r == (int)super.block_size); | |
7c673cae FG |
1209 | read_pos += r; |
1210 | } | |
1211 | uint64_t more = 0; | |
1212 | uint64_t seq; | |
1213 | uuid_d uuid; | |
1214 | { | |
11fdf7f2 | 1215 | auto p = bl.cbegin(); |
7c673cae FG |
1216 | __u8 a, b; |
1217 | uint32_t len; | |
11fdf7f2 TL |
1218 | decode(a, p); |
1219 | decode(b, p); | |
1220 | decode(len, p); | |
1221 | decode(uuid, p); | |
1222 | decode(seq, p); | |
7c673cae | 1223 | if (len + 6 > bl.length()) { |
11fdf7f2 | 1224 | more = round_up_to(len + 6 - bl.length(), super.block_size); |
7c673cae FG |
1225 | } |
1226 | } | |
1227 | if (uuid != super.uuid) { | |
9f95a23c TL |
1228 | if (seen_recs) { |
1229 | dout(10) << __func__ << " 0x" << std::hex << pos << std::dec | |
1230 | << ": stop: uuid " << uuid << " != super.uuid " << super.uuid | |
1231 | << dendl; | |
1232 | } else { | |
1233 | derr << __func__ << " 0x" << std::hex << pos << std::dec | |
1234 | << ": stop: uuid " << uuid << " != super.uuid " << super.uuid | |
1235 | << ", block dump: \n"; | |
1236 | bufferlist t; | |
1237 | t.substr_of(bl, 0, super.block_size); | |
1238 | t.hexdump(*_dout); | |
1239 | *_dout << dendl; | |
1240 | } | |
7c673cae FG |
1241 | break; |
1242 | } | |
1243 | if (seq != log_seq + 1) { | |
9f95a23c TL |
1244 | if (seen_recs) { |
1245 | dout(10) << __func__ << " 0x" << std::hex << pos << std::dec | |
1246 | << ": stop: seq " << seq << " != expected " << log_seq + 1 | |
1247 | << dendl;; | |
1248 | } else { | |
1249 | derr << __func__ << " 0x" << std::hex << pos << std::dec | |
1250 | << ": stop: seq " << seq << " != expected " << log_seq + 1 | |
1251 | << dendl;; | |
1252 | } | |
7c673cae FG |
1253 | break; |
1254 | } | |
1255 | if (more) { | |
1256 | dout(20) << __func__ << " need 0x" << std::hex << more << std::dec | |
1257 | << " more bytes" << dendl; | |
1258 | bufferlist t; | |
f67539c2 | 1259 | int r = _read(log_reader, read_pos, more, &t, NULL); |
7c673cae | 1260 | if (r < (int)more) { |
f6b5b4d7 TL |
1261 | dout(10) << __func__ << " 0x" << std::hex << pos |
1262 | << ": stop: len is 0x" << bl.length() + more << std::dec | |
1263 | << ", which is past eof" << dendl; | |
1264 | if (cct->_conf->bluefs_replay_recovery) { | |
1265 | //try to search for more data | |
20effc67 | 1266 | r += _do_replay_recovery_read(log_reader, pos, read_pos + r, more - r, &t); |
f6b5b4d7 TL |
1267 | if (r < (int)more) { |
1268 | //in normal mode we must read r==more, for recovery it is too strict | |
1269 | break; | |
1270 | } | |
1271 | } | |
7c673cae | 1272 | } |
11fdf7f2 | 1273 | ceph_assert(r == (int)more); |
7c673cae FG |
1274 | bl.claim_append(t); |
1275 | read_pos += r; | |
1276 | } | |
1277 | bluefs_transaction_t t; | |
1278 | try { | |
11fdf7f2 TL |
1279 | auto p = bl.cbegin(); |
1280 | decode(t, p); | |
522d829b | 1281 | seen_recs = true; |
7c673cae | 1282 | } |
f67539c2 | 1283 | catch (ceph::buffer::error& e) { |
522d829b TL |
1284 | // Multi-block transactions might be incomplete due to unexpected |
1285 | // power off. Hence let's treat that as a regular stop condition. | |
1286 | if (seen_recs && more) { | |
1287 | dout(10) << __func__ << " 0x" << std::hex << pos << std::dec | |
1288 | << ": stop: failed to decode: " << e.what() | |
1289 | << dendl; | |
1290 | } else { | |
1291 | derr << __func__ << " 0x" << std::hex << pos << std::dec | |
1292 | << ": stop: failed to decode: " << e.what() | |
1293 | << dendl; | |
1294 | delete log_reader; | |
1295 | return -EIO; | |
1296 | } | |
1297 | break; | |
7c673cae | 1298 | } |
11fdf7f2 | 1299 | ceph_assert(seq == t.seq); |
7c673cae FG |
1300 | dout(10) << __func__ << " 0x" << std::hex << pos << std::dec |
1301 | << ": " << t << dendl; | |
11fdf7f2 TL |
1302 | if (unlikely(to_stdout)) { |
1303 | std::cout << " 0x" << std::hex << pos << std::dec | |
1304 | << ": " << t << std::endl; | |
1305 | } | |
7c673cae | 1306 | |
11fdf7f2 | 1307 | auto p = t.op_bl.cbegin(); |
7c673cae FG |
1308 | while (!p.end()) { |
1309 | __u8 op; | |
11fdf7f2 | 1310 | decode(op, p); |
7c673cae FG |
1311 | switch (op) { |
1312 | ||
1313 | case bluefs_transaction_t::OP_INIT: | |
1314 | dout(20) << __func__ << " 0x" << std::hex << pos << std::dec | |
1315 | << ": op_init" << dendl; | |
11fdf7f2 TL |
1316 | if (unlikely(to_stdout)) { |
1317 | std::cout << " 0x" << std::hex << pos << std::dec | |
1318 | << ": op_init" << std::endl; | |
1319 | } | |
1320 | ||
1321 | ceph_assert(t.seq == 1); | |
7c673cae FG |
1322 | break; |
1323 | ||
1324 | case bluefs_transaction_t::OP_JUMP: | |
1325 | { | |
1326 | uint64_t next_seq; | |
1327 | uint64_t offset; | |
11fdf7f2 TL |
1328 | decode(next_seq, p); |
1329 | decode(offset, p); | |
7c673cae FG |
1330 | dout(20) << __func__ << " 0x" << std::hex << pos << std::dec |
1331 | << ": op_jump seq " << next_seq | |
1332 | << " offset 0x" << std::hex << offset << std::dec << dendl; | |
11fdf7f2 TL |
1333 | if (unlikely(to_stdout)) { |
1334 | std::cout << " 0x" << std::hex << pos << std::dec | |
1335 | << ": op_jump seq " << next_seq | |
1336 | << " offset 0x" << std::hex << offset << std::dec | |
1337 | << std::endl; | |
1338 | } | |
1339 | ||
20effc67 | 1340 | ceph_assert(next_seq > log_seq); |
7c673cae FG |
1341 | log_seq = next_seq - 1; // we will increment it below |
1342 | uint64_t skip = offset - read_pos; | |
1343 | if (skip) { | |
1344 | bufferlist junk; | |
f67539c2 | 1345 | int r = _read(log_reader, read_pos, skip, &junk, |
7c673cae FG |
1346 | NULL); |
1347 | if (r != (int)skip) { | |
1348 | dout(10) << __func__ << " 0x" << std::hex << read_pos | |
1349 | << ": stop: failed to skip to " << offset | |
1350 | << std::dec << dendl; | |
11fdf7f2 | 1351 | ceph_abort_msg("problem with op_jump"); |
7c673cae FG |
1352 | } |
1353 | } | |
1354 | } | |
1355 | break; | |
1356 | ||
1357 | case bluefs_transaction_t::OP_JUMP_SEQ: | |
1358 | { | |
1359 | uint64_t next_seq; | |
11fdf7f2 | 1360 | decode(next_seq, p); |
7c673cae FG |
1361 | dout(20) << __func__ << " 0x" << std::hex << pos << std::dec |
1362 | << ": op_jump_seq " << next_seq << dendl; | |
11fdf7f2 TL |
1363 | if (unlikely(to_stdout)) { |
1364 | std::cout << " 0x" << std::hex << pos << std::dec | |
1365 | << ": op_jump_seq " << next_seq << std::endl; | |
1366 | } | |
1367 | ||
20effc67 | 1368 | ceph_assert(next_seq > log_seq); |
7c673cae FG |
1369 | log_seq = next_seq - 1; // we will increment it below |
1370 | } | |
1371 | break; | |
1372 | ||
1373 | case bluefs_transaction_t::OP_ALLOC_ADD: | |
f67539c2 | 1374 | // LEGACY, do nothing but read params |
7c673cae | 1375 | { |
f67539c2 TL |
1376 | __u8 id; |
1377 | uint64_t offset, length; | |
1378 | decode(id, p); | |
1379 | decode(offset, p); | |
1380 | decode(length, p); | |
1381 | } | |
7c673cae FG |
1382 | break; |
1383 | ||
1384 | case bluefs_transaction_t::OP_ALLOC_RM: | |
f67539c2 | 1385 | // LEGACY, do nothing but read params |
7c673cae | 1386 | { |
f67539c2 TL |
1387 | __u8 id; |
1388 | uint64_t offset, length; | |
1389 | decode(id, p); | |
1390 | decode(offset, p); | |
1391 | decode(length, p); | |
1392 | } | |
1393 | break; | |
7c673cae FG |
1394 | |
1395 | case bluefs_transaction_t::OP_DIR_LINK: | |
1396 | { | |
1397 | string dirname, filename; | |
1398 | uint64_t ino; | |
11fdf7f2 TL |
1399 | decode(dirname, p); |
1400 | decode(filename, p); | |
1401 | decode(ino, p); | |
7c673cae FG |
1402 | dout(20) << __func__ << " 0x" << std::hex << pos << std::dec |
1403 | << ": op_dir_link " << " " << dirname << "/" << filename | |
1404 | << " to " << ino | |
1405 | << dendl; | |
11fdf7f2 TL |
1406 | if (unlikely(to_stdout)) { |
1407 | std::cout << " 0x" << std::hex << pos << std::dec | |
1408 | << ": op_dir_link " << " " << dirname << "/" << filename | |
1409 | << " to " << ino | |
1410 | << std::endl; | |
1411 | } | |
1412 | ||
7c673cae FG |
1413 | if (!noop) { |
1414 | FileRef file = _get_file(ino); | |
11fdf7f2 | 1415 | ceph_assert(file->fnode.ino); |
20effc67 TL |
1416 | map<string,DirRef>::iterator q = nodes.dir_map.find(dirname); |
1417 | ceph_assert(q != nodes.dir_map.end()); | |
7c673cae | 1418 | map<string,FileRef>::iterator r = q->second->file_map.find(filename); |
11fdf7f2 | 1419 | ceph_assert(r == q->second->file_map.end()); |
9f95a23c TL |
1420 | |
1421 | vselector->sub_usage(file->vselector_hint, file->fnode); | |
1422 | file->vselector_hint = | |
1423 | vselector->get_hint_by_dir(dirname); | |
1424 | vselector->add_usage(file->vselector_hint, file->fnode); | |
1425 | ||
7c673cae FG |
1426 | q->second->file_map[filename] = file; |
1427 | ++file->refs; | |
1428 | } | |
1429 | } | |
1430 | break; | |
1431 | ||
1432 | case bluefs_transaction_t::OP_DIR_UNLINK: | |
1433 | { | |
1434 | string dirname, filename; | |
11fdf7f2 TL |
1435 | decode(dirname, p); |
1436 | decode(filename, p); | |
7c673cae FG |
1437 | dout(20) << __func__ << " 0x" << std::hex << pos << std::dec |
1438 | << ": op_dir_unlink " << " " << dirname << "/" << filename | |
1439 | << dendl; | |
11fdf7f2 TL |
1440 | if (unlikely(to_stdout)) { |
1441 | std::cout << " 0x" << std::hex << pos << std::dec | |
1442 | << ": op_dir_unlink " << " " << dirname << "/" << filename | |
1443 | << std::endl; | |
1444 | } | |
1445 | ||
7c673cae | 1446 | if (!noop) { |
20effc67 TL |
1447 | map<string,DirRef>::iterator q = nodes.dir_map.find(dirname); |
1448 | ceph_assert(q != nodes.dir_map.end()); | |
7c673cae | 1449 | map<string,FileRef>::iterator r = q->second->file_map.find(filename); |
11fdf7f2 TL |
1450 | ceph_assert(r != q->second->file_map.end()); |
1451 | ceph_assert(r->second->refs > 0); | |
7c673cae FG |
1452 | --r->second->refs; |
1453 | q->second->file_map.erase(r); | |
1454 | } | |
1455 | } | |
1456 | break; | |
1457 | ||
1458 | case bluefs_transaction_t::OP_DIR_CREATE: | |
1459 | { | |
1460 | string dirname; | |
11fdf7f2 | 1461 | decode(dirname, p); |
7c673cae FG |
1462 | dout(20) << __func__ << " 0x" << std::hex << pos << std::dec |
1463 | << ": op_dir_create " << dirname << dendl; | |
11fdf7f2 TL |
1464 | if (unlikely(to_stdout)) { |
1465 | std::cout << " 0x" << std::hex << pos << std::dec | |
1466 | << ": op_dir_create " << dirname << std::endl; | |
1467 | } | |
1468 | ||
7c673cae | 1469 | if (!noop) { |
20effc67 TL |
1470 | map<string,DirRef>::iterator q = nodes.dir_map.find(dirname); |
1471 | ceph_assert(q == nodes.dir_map.end()); | |
1472 | nodes.dir_map[dirname] = ceph::make_ref<Dir>(); | |
7c673cae FG |
1473 | } |
1474 | } | |
1475 | break; | |
1476 | ||
1477 | case bluefs_transaction_t::OP_DIR_REMOVE: | |
1478 | { | |
1479 | string dirname; | |
11fdf7f2 | 1480 | decode(dirname, p); |
7c673cae FG |
1481 | dout(20) << __func__ << " 0x" << std::hex << pos << std::dec |
1482 | << ": op_dir_remove " << dirname << dendl; | |
11fdf7f2 TL |
1483 | if (unlikely(to_stdout)) { |
1484 | std::cout << " 0x" << std::hex << pos << std::dec | |
1485 | << ": op_dir_remove " << dirname << std::endl; | |
1486 | } | |
1487 | ||
7c673cae | 1488 | if (!noop) { |
20effc67 TL |
1489 | map<string,DirRef>::iterator q = nodes.dir_map.find(dirname); |
1490 | ceph_assert(q != nodes.dir_map.end()); | |
11fdf7f2 | 1491 | ceph_assert(q->second->file_map.empty()); |
20effc67 | 1492 | nodes.dir_map.erase(q); |
7c673cae FG |
1493 | } |
1494 | } | |
1495 | break; | |
1496 | ||
1497 | case bluefs_transaction_t::OP_FILE_UPDATE: | |
1498 | { | |
1499 | bluefs_fnode_t fnode; | |
11fdf7f2 | 1500 | decode(fnode, p); |
7c673cae | 1501 | dout(20) << __func__ << " 0x" << std::hex << pos << std::dec |
9f95a23c | 1502 | << ": op_file_update " << " " << fnode << " " << dendl; |
11fdf7f2 TL |
1503 | if (unlikely(to_stdout)) { |
1504 | std::cout << " 0x" << std::hex << pos << std::dec | |
1505 | << ": op_file_update " << " " << fnode << std::endl; | |
1506 | } | |
9f95a23c | 1507 | if (!noop) { |
7c673cae | 1508 | FileRef f = _get_file(fnode.ino); |
20effc67 TL |
1509 | if (cct->_conf->bluefs_log_replay_check_allocations) { |
1510 | int r = _check_allocations(f->fnode, | |
1511 | used_blocks, false, "OP_FILE_UPDATE"); | |
1512 | if (r < 0) { | |
1513 | return r; | |
9f95a23c TL |
1514 | } |
1515 | } | |
9f95a23c TL |
1516 | if (fnode.ino != 1) { |
1517 | vselector->sub_usage(f->vselector_hint, f->fnode); | |
1518 | } | |
1519 | f->fnode = fnode; | |
1520 | if (fnode.ino != 1) { | |
1521 | vselector->add_usage(f->vselector_hint, f->fnode); | |
1522 | } | |
1523 | ||
7c673cae FG |
1524 | if (fnode.ino > ino_last) { |
1525 | ino_last = fnode.ino; | |
1526 | } | |
9f95a23c | 1527 | if (cct->_conf->bluefs_log_replay_check_allocations) { |
20effc67 TL |
1528 | int r = _check_allocations(f->fnode, |
1529 | used_blocks, true, "OP_FILE_UPDATE"); | |
9f95a23c TL |
1530 | if (r < 0) { |
1531 | return r; | |
1532 | } | |
1533 | } | |
522d829b TL |
1534 | } else if (noop && fnode.ino == 1) { |
1535 | FileRef f = _get_file(fnode.ino); | |
1536 | f->fnode = fnode; | |
7c673cae | 1537 | } |
9f95a23c | 1538 | } |
7c673cae | 1539 | break; |
20effc67 TL |
1540 | case bluefs_transaction_t::OP_FILE_UPDATE_INC: |
1541 | { | |
1542 | bluefs_fnode_delta_t delta; | |
1543 | decode(delta, p); | |
1544 | dout(20) << __func__ << " 0x" << std::hex << pos << std::dec | |
1545 | << ": op_file_update_inc " << " " << delta << " " << dendl; | |
1546 | if (unlikely(to_stdout)) { | |
1547 | std::cout << " 0x" << std::hex << pos << std::dec | |
1548 | << ": op_file_update_inc " << " " << delta << std::endl; | |
1549 | } | |
1550 | if (!noop) { | |
1551 | FileRef f = _get_file(delta.ino); | |
1552 | bluefs_fnode_t& fnode = f->fnode; | |
1553 | if (delta.offset != fnode.allocated) { | |
1554 | derr << __func__ << " invalid op_file_update_inc, new extents miss end of file" | |
1555 | << " fnode=" << fnode | |
1556 | << " delta=" << delta | |
1557 | << dendl; | |
1558 | ceph_assert(delta.offset == fnode.allocated); | |
1559 | } | |
1560 | if (cct->_conf->bluefs_log_replay_check_allocations) { | |
1561 | int r = _check_allocations(fnode, | |
1562 | used_blocks, false, "OP_FILE_UPDATE_INC"); | |
1563 | if (r < 0) { | |
1564 | return r; | |
1565 | } | |
1566 | } | |
1567 | ||
1568 | fnode.ino = delta.ino; | |
1569 | fnode.mtime = delta.mtime; | |
1570 | if (fnode.ino != 1) { | |
1571 | vselector->sub_usage(f->vselector_hint, fnode); | |
1572 | } | |
1573 | fnode.size = delta.size; | |
1574 | fnode.claim_extents(delta.extents); | |
1575 | dout(20) << __func__ << " 0x" << std::hex << pos << std::dec | |
1576 | << ": op_file_update_inc produced " << " " << fnode << " " << dendl; | |
1577 | ||
1578 | if (fnode.ino != 1) { | |
1579 | vselector->add_usage(f->vselector_hint, fnode); | |
1580 | } | |
1581 | ||
1582 | if (fnode.ino > ino_last) { | |
1583 | ino_last = fnode.ino; | |
1584 | } | |
1585 | if (cct->_conf->bluefs_log_replay_check_allocations) { | |
1586 | int r = _check_allocations(f->fnode, | |
1587 | used_blocks, true, "OP_FILE_UPDATE_INC"); | |
1588 | if (r < 0) { | |
1589 | return r; | |
1590 | } | |
1591 | } | |
1592 | } else if (noop && delta.ino == 1) { | |
1593 | // we need to track bluefs log, even in noop mode | |
1594 | FileRef f = _get_file(1); | |
1595 | bluefs_fnode_t& fnode = f->fnode; | |
1596 | fnode.ino = delta.ino; | |
1597 | fnode.mtime = delta.mtime; | |
1598 | fnode.size = delta.size; | |
1599 | fnode.claim_extents(delta.extents); | |
1600 | } | |
1601 | } | |
1602 | break; | |
7c673cae FG |
1603 | |
1604 | case bluefs_transaction_t::OP_FILE_REMOVE: | |
1605 | { | |
1606 | uint64_t ino; | |
11fdf7f2 | 1607 | decode(ino, p); |
7c673cae FG |
1608 | dout(20) << __func__ << " 0x" << std::hex << pos << std::dec |
1609 | << ": op_file_remove " << ino << dendl; | |
11fdf7f2 TL |
1610 | if (unlikely(to_stdout)) { |
1611 | std::cout << " 0x" << std::hex << pos << std::dec | |
1612 | << ": op_file_remove " << ino << std::endl; | |
1613 | } | |
1614 | ||
9f95a23c | 1615 | if (!noop) { |
20effc67 TL |
1616 | auto p = nodes.file_map.find(ino); |
1617 | ceph_assert(p != nodes.file_map.end()); | |
9f95a23c TL |
1618 | vselector->sub_usage(p->second->vselector_hint, p->second->fnode); |
1619 | if (cct->_conf->bluefs_log_replay_check_allocations) { | |
20effc67 TL |
1620 | int r = _check_allocations(p->second->fnode, |
1621 | used_blocks, false, "OP_FILE_REMOVE"); | |
1622 | if (r < 0) { | |
1623 | return r; | |
9f95a23c TL |
1624 | } |
1625 | } | |
20effc67 | 1626 | nodes.file_map.erase(p); |
9f95a23c TL |
1627 | } |
1628 | } | |
7c673cae FG |
1629 | break; |
1630 | ||
1631 | default: | |
1632 | derr << __func__ << " 0x" << std::hex << pos << std::dec | |
1633 | << ": stop: unrecognized op " << (int)op << dendl; | |
1634 | delete log_reader; | |
1635 | return -EIO; | |
1636 | } | |
1637 | } | |
11fdf7f2 | 1638 | ceph_assert(p.end()); |
7c673cae FG |
1639 | |
1640 | // we successfully replayed the transaction; bump the seq and log size | |
1641 | ++log_seq; | |
1642 | log_file->fnode.size = log_reader->buf.pos; | |
1643 | } | |
f67539c2 TL |
1644 | if (!noop) { |
1645 | vselector->add_usage(log_file->vselector_hint, log_file->fnode); | |
20effc67 TL |
1646 | log.seq_live = log_seq + 1; |
1647 | dirty.seq_live = log_seq + 1; | |
1648 | log.t.seq = log.seq_live; | |
1649 | dirty.seq_stable = log_seq; | |
9f95a23c | 1650 | } |
7c673cae FG |
1651 | |
1652 | dout(10) << __func__ << " log file size was 0x" | |
1653 | << std::hex << log_file->fnode.size << std::dec << dendl; | |
11fdf7f2 TL |
1654 | if (unlikely(to_stdout)) { |
1655 | std::cout << " log file size was 0x" | |
1656 | << std::hex << log_file->fnode.size << std::dec << std::endl; | |
1657 | } | |
1658 | ||
7c673cae FG |
1659 | delete log_reader; |
1660 | ||
1661 | if (!noop) { | |
1662 | // verify file link counts are all >0 | |
20effc67 | 1663 | for (auto& p : nodes.file_map) { |
7c673cae FG |
1664 | if (p.second->refs == 0 && |
1665 | p.second->fnode.ino > 1) { | |
1666 | derr << __func__ << " file with link count 0: " << p.second->fnode | |
1667 | << dendl; | |
1668 | return -EIO; | |
1669 | } | |
1670 | } | |
1671 | } | |
20effc67 TL |
1672 | // reflect file count in logger |
1673 | logger->set(l_bluefs_num_files, nodes.file_map.size()); | |
7c673cae FG |
1674 | |
1675 | dout(10) << __func__ << " done" << dendl; | |
1676 | return 0; | |
1677 | } | |
1678 | ||
11fdf7f2 TL |
1679 | int BlueFS::log_dump() |
1680 | { | |
1681 | // only dump log file's content | |
20effc67 TL |
1682 | ceph_assert(log.writer == nullptr && "cannot log_dump on mounted BlueFS"); |
1683 | _init_logger(); | |
f67539c2 | 1684 | int r = _open_super(); |
11fdf7f2 | 1685 | if (r < 0) { |
f67539c2 | 1686 | derr << __func__ << " failed to open super: " << cpp_strerror(r) << dendl; |
11fdf7f2 TL |
1687 | return r; |
1688 | } | |
f67539c2 TL |
1689 | r = _replay(true, true); |
1690 | if (r < 0) { | |
1691 | derr << __func__ << " failed to replay log: " << cpp_strerror(r) << dendl; | |
1692 | } | |
1693 | _shutdown_logger(); | |
1694 | super = bluefs_super_t(); | |
1695 | return r; | |
11fdf7f2 TL |
1696 | } |
1697 | ||
1698 | int BlueFS::device_migrate_to_existing( | |
1699 | CephContext *cct, | |
1700 | const set<int>& devs_source, | |
9f95a23c TL |
1701 | int dev_target, |
1702 | const bluefs_layout_t& layout) | |
11fdf7f2 TL |
1703 | { |
1704 | vector<byte> buf; | |
1705 | bool buffered = cct->_conf->bluefs_buffered_io; | |
1706 | ||
eafe8130 TL |
1707 | dout(10) << __func__ << " devs_source " << devs_source |
1708 | << " dev_target " << dev_target << dendl; | |
11fdf7f2 TL |
1709 | assert(dev_target < (int)MAX_BDEV); |
1710 | ||
1711 | int flags = 0; | |
1712 | flags |= devs_source.count(BDEV_DB) ? | |
1713 | (REMOVE_DB | RENAME_SLOW2DB) : 0; | |
1714 | flags |= devs_source.count(BDEV_WAL) ? REMOVE_WAL : 0; | |
1715 | int dev_target_new = dev_target; | |
1716 | ||
1717 | // Slow device without separate DB one is addressed via BDEV_DB | |
1718 | // Hence need renaming. | |
1719 | if ((flags & REMOVE_DB) && dev_target == BDEV_SLOW) { | |
1720 | dev_target_new = BDEV_DB; | |
1721 | dout(0) << __func__ << " super to be written to " << dev_target << dendl; | |
1722 | } | |
1723 | ||
20effc67 | 1724 | for (auto& [ino, file_ref] : nodes.file_map) { |
11fdf7f2 | 1725 | //do not copy log |
9f95a23c | 1726 | if (file_ref->fnode.ino == 1) { |
11fdf7f2 TL |
1727 | continue; |
1728 | } | |
9f95a23c | 1729 | dout(10) << __func__ << " " << ino << " " << file_ref->fnode << dendl; |
eafe8130 | 1730 | |
9f95a23c | 1731 | auto& fnode_extents = file_ref->fnode.extents; |
20effc67 | 1732 | vselector->sub_usage(file_ref->vselector_hint, file_ref->fnode); |
11fdf7f2 | 1733 | |
9f95a23c TL |
1734 | bool rewrite = std::any_of( |
1735 | fnode_extents.begin(), | |
1736 | fnode_extents.end(), | |
1737 | [=](auto& ext) { | |
1738 | return ext.bdev != dev_target && devs_source.count(ext.bdev); | |
1739 | }); | |
eafe8130 TL |
1740 | if (rewrite) { |
1741 | dout(10) << __func__ << " migrating" << dendl; | |
1742 | ||
1743 | // read entire file | |
1744 | bufferlist bl; | |
1745 | for (auto old_ext : fnode_extents) { | |
1746 | buf.resize(old_ext.length); | |
20effc67 | 1747 | int r = _bdev_read_random(old_ext.bdev, |
eafe8130 TL |
1748 | old_ext.offset, |
1749 | old_ext.length, | |
1750 | (char*)&buf.at(0), | |
1751 | buffered); | |
1752 | if (r != 0) { | |
1753 | derr << __func__ << " failed to read 0x" << std::hex | |
1754 | << old_ext.offset << "~" << old_ext.length << std::dec | |
1755 | << " from " << (int)dev_target << dendl; | |
1756 | return -EIO; | |
1757 | } | |
1758 | bl.append((char*)&buf[0], old_ext.length); | |
1759 | } | |
11fdf7f2 | 1760 | |
eafe8130 TL |
1761 | // write entire file |
1762 | PExtentVector extents; | |
1763 | auto l = _allocate_without_fallback(dev_target, bl.length(), &extents); | |
1764 | if (l < 0) { | |
1765 | derr << __func__ << " unable to allocate len 0x" << std::hex | |
1766 | << bl.length() << std::dec << " from " << (int)dev_target | |
1767 | << ": " << cpp_strerror(l) << dendl; | |
1768 | return -ENOSPC; | |
1769 | } | |
11fdf7f2 | 1770 | |
eafe8130 TL |
1771 | uint64_t off = 0; |
1772 | for (auto& i : extents) { | |
1773 | bufferlist cur; | |
1774 | uint64_t cur_len = std::min<uint64_t>(i.length, bl.length() - off); | |
1775 | ceph_assert(cur_len > 0); | |
1776 | cur.substr_of(bl, off, cur_len); | |
1777 | int r = bdev[dev_target]->write(i.offset, cur, buffered); | |
1778 | ceph_assert(r == 0); | |
1779 | off += cur_len; | |
1780 | } | |
1781 | ||
1782 | // release old extents | |
1783 | for (auto old_ext : fnode_extents) { | |
1784 | PExtentVector to_release; | |
1785 | to_release.emplace_back(old_ext.offset, old_ext.length); | |
1786 | alloc[old_ext.bdev]->release(to_release); | |
f67539c2 TL |
1787 | if (is_shared_alloc(old_ext.bdev)) { |
1788 | shared_alloc->bluefs_used -= to_release.size(); | |
1789 | } | |
eafe8130 TL |
1790 | } |
1791 | ||
1792 | // update fnode | |
1793 | fnode_extents.clear(); | |
1794 | for (auto& i : extents) { | |
1795 | fnode_extents.emplace_back(dev_target_new, i.offset, i.length); | |
1796 | } | |
1797 | } else { | |
9f95a23c TL |
1798 | for (auto& ext : fnode_extents) { |
1799 | if (dev_target != dev_target_new && ext.bdev == dev_target) { | |
eafe8130 | 1800 | dout(20) << __func__ << " " << " ... adjusting extent 0x" |
9f95a23c | 1801 | << std::hex << ext.offset << std::dec |
eafe8130 TL |
1802 | << " bdev " << dev_target << " -> " << dev_target_new |
1803 | << dendl; | |
9f95a23c | 1804 | ext.bdev = dev_target_new; |
11fdf7f2 | 1805 | } |
11fdf7f2 TL |
1806 | } |
1807 | } | |
20effc67 | 1808 | vselector->add_usage(file_ref->vselector_hint, file_ref->fnode); |
11fdf7f2 TL |
1809 | } |
1810 | // new logging device in the current naming scheme | |
1811 | int new_log_dev_cur = bdev[BDEV_WAL] ? | |
1812 | BDEV_WAL : | |
1813 | bdev[BDEV_DB] ? BDEV_DB : BDEV_SLOW; | |
1814 | ||
1815 | // new logging device in new naming scheme | |
1816 | int new_log_dev_next = new_log_dev_cur; | |
1817 | ||
1818 | if (devs_source.count(new_log_dev_cur)) { | |
1819 | // SLOW device is addressed via BDEV_DB too hence either WAL or DB | |
1820 | new_log_dev_next = (flags & REMOVE_WAL) || !bdev[BDEV_WAL] ? | |
1821 | BDEV_DB : | |
1822 | BDEV_WAL; | |
1823 | ||
1824 | dout(0) << __func__ << " log moved from " << new_log_dev_cur | |
1825 | << " to " << new_log_dev_next << dendl; | |
1826 | ||
1827 | new_log_dev_cur = | |
1828 | (flags & REMOVE_DB) && new_log_dev_next == BDEV_DB ? | |
1829 | BDEV_SLOW : | |
1830 | new_log_dev_next; | |
1831 | } | |
1832 | ||
20effc67 | 1833 | _rewrite_log_and_layout_sync_LNF_LD( |
11fdf7f2 TL |
1834 | false, |
1835 | (flags & REMOVE_DB) ? BDEV_SLOW : BDEV_DB, | |
1836 | new_log_dev_cur, | |
1837 | new_log_dev_next, | |
9f95a23c TL |
1838 | flags, |
1839 | layout); | |
11fdf7f2 TL |
1840 | return 0; |
1841 | } | |
1842 | ||
1843 | int BlueFS::device_migrate_to_new( | |
1844 | CephContext *cct, | |
1845 | const set<int>& devs_source, | |
9f95a23c TL |
1846 | int dev_target, |
1847 | const bluefs_layout_t& layout) | |
11fdf7f2 TL |
1848 | { |
1849 | vector<byte> buf; | |
1850 | bool buffered = cct->_conf->bluefs_buffered_io; | |
1851 | ||
eafe8130 TL |
1852 | dout(10) << __func__ << " devs_source " << devs_source |
1853 | << " dev_target " << dev_target << dendl; | |
20effc67 | 1854 | assert(dev_target == (int)BDEV_NEWDB || dev_target == (int)BDEV_NEWWAL); |
11fdf7f2 TL |
1855 | |
1856 | int flags = 0; | |
1857 | ||
1858 | flags |= devs_source.count(BDEV_DB) ? | |
1859 | (!bdev[BDEV_SLOW] ? RENAME_DB2SLOW: REMOVE_DB) : | |
1860 | 0; | |
1861 | flags |= devs_source.count(BDEV_WAL) ? REMOVE_WAL : 0; | |
9f95a23c | 1862 | int dev_target_new = dev_target; //FIXME: remove, makes no sense |
11fdf7f2 | 1863 | |
20effc67 | 1864 | for (auto& p : nodes.file_map) { |
11fdf7f2 TL |
1865 | //do not copy log |
1866 | if (p.second->fnode.ino == 1) { | |
1867 | continue; | |
1868 | } | |
eafe8130 TL |
1869 | dout(10) << __func__ << " " << p.first << " " << p.second->fnode << dendl; |
1870 | ||
11fdf7f2 TL |
1871 | auto& fnode_extents = p.second->fnode.extents; |
1872 | ||
eafe8130 | 1873 | bool rewrite = false; |
11fdf7f2 | 1874 | for (auto ext_it = fnode_extents.begin(); |
eafe8130 TL |
1875 | ext_it != p.second->fnode.extents.end(); |
1876 | ++ext_it) { | |
11fdf7f2 | 1877 | if (ext_it->bdev != dev_target && devs_source.count(ext_it->bdev)) { |
eafe8130 TL |
1878 | rewrite = true; |
1879 | break; | |
1880 | } | |
1881 | } | |
1882 | if (rewrite) { | |
1883 | dout(10) << __func__ << " migrating" << dendl; | |
1884 | ||
1885 | // read entire file | |
1886 | bufferlist bl; | |
1887 | for (auto old_ext : fnode_extents) { | |
1888 | buf.resize(old_ext.length); | |
20effc67 | 1889 | int r = _bdev_read_random(old_ext.bdev, |
eafe8130 TL |
1890 | old_ext.offset, |
1891 | old_ext.length, | |
1892 | (char*)&buf.at(0), | |
1893 | buffered); | |
1894 | if (r != 0) { | |
1895 | derr << __func__ << " failed to read 0x" << std::hex | |
1896 | << old_ext.offset << "~" << old_ext.length << std::dec | |
1897 | << " from " << (int)dev_target << dendl; | |
1898 | return -EIO; | |
11fdf7f2 | 1899 | } |
eafe8130 TL |
1900 | bl.append((char*)&buf[0], old_ext.length); |
1901 | } | |
1902 | ||
1903 | // write entire file | |
1904 | PExtentVector extents; | |
1905 | auto l = _allocate_without_fallback(dev_target, bl.length(), &extents); | |
1906 | if (l < 0) { | |
1907 | derr << __func__ << " unable to allocate len 0x" << std::hex | |
1908 | << bl.length() << std::dec << " from " << (int)dev_target | |
1909 | << ": " << cpp_strerror(l) << dendl; | |
1910 | return -ENOSPC; | |
1911 | } | |
1912 | ||
1913 | uint64_t off = 0; | |
1914 | for (auto& i : extents) { | |
1915 | bufferlist cur; | |
1916 | uint64_t cur_len = std::min<uint64_t>(i.length, bl.length() - off); | |
1917 | ceph_assert(cur_len > 0); | |
1918 | cur.substr_of(bl, off, cur_len); | |
1919 | int r = bdev[dev_target]->write(i.offset, cur, buffered); | |
1920 | ceph_assert(r == 0); | |
1921 | off += cur_len; | |
1922 | } | |
1923 | ||
1924 | // release old extents | |
1925 | for (auto old_ext : fnode_extents) { | |
1926 | PExtentVector to_release; | |
1927 | to_release.emplace_back(old_ext.offset, old_ext.length); | |
1928 | alloc[old_ext.bdev]->release(to_release); | |
f67539c2 TL |
1929 | if (is_shared_alloc(old_ext.bdev)) { |
1930 | shared_alloc->bluefs_used -= to_release.size(); | |
1931 | } | |
eafe8130 TL |
1932 | } |
1933 | ||
1934 | // update fnode | |
1935 | fnode_extents.clear(); | |
1936 | for (auto& i : extents) { | |
1937 | fnode_extents.emplace_back(dev_target_new, i.offset, i.length); | |
11fdf7f2 TL |
1938 | } |
1939 | } | |
11fdf7f2 TL |
1940 | } |
1941 | // new logging device in the current naming scheme | |
1942 | int new_log_dev_cur = | |
1943 | bdev[BDEV_NEWWAL] ? | |
1944 | BDEV_NEWWAL : | |
1945 | bdev[BDEV_WAL] && !(flags & REMOVE_WAL) ? | |
1946 | BDEV_WAL : | |
1947 | bdev[BDEV_NEWDB] ? | |
1948 | BDEV_NEWDB : | |
1949 | bdev[BDEV_DB] && !(flags & REMOVE_DB)? | |
1950 | BDEV_DB : | |
1951 | BDEV_SLOW; | |
1952 | ||
1953 | // new logging device in new naming scheme | |
1954 | int new_log_dev_next = | |
1955 | new_log_dev_cur == BDEV_NEWWAL ? | |
1956 | BDEV_WAL : | |
1957 | new_log_dev_cur == BDEV_NEWDB ? | |
1958 | BDEV_DB : | |
1959 | new_log_dev_cur; | |
1960 | ||
1961 | int super_dev = | |
1962 | dev_target == BDEV_NEWDB ? | |
1963 | BDEV_NEWDB : | |
1964 | bdev[BDEV_DB] ? | |
1965 | BDEV_DB : | |
1966 | BDEV_SLOW; | |
1967 | ||
20effc67 | 1968 | _rewrite_log_and_layout_sync_LNF_LD( |
11fdf7f2 TL |
1969 | false, |
1970 | super_dev, | |
1971 | new_log_dev_cur, | |
1972 | new_log_dev_next, | |
9f95a23c TL |
1973 | flags, |
1974 | layout); | |
11fdf7f2 TL |
1975 | return 0; |
1976 | } | |
1977 | ||
7c673cae FG |
1978 | BlueFS::FileRef BlueFS::_get_file(uint64_t ino) |
1979 | { | |
20effc67 TL |
1980 | auto p = nodes.file_map.find(ino); |
1981 | if (p == nodes.file_map.end()) { | |
9f95a23c | 1982 | FileRef f = ceph::make_ref<File>(); |
20effc67 TL |
1983 | nodes.file_map[ino] = f; |
1984 | // track files count in logger | |
1985 | logger->set(l_bluefs_num_files, nodes.file_map.size()); | |
7c673cae FG |
1986 | dout(30) << __func__ << " ino " << ino << " = " << f |
1987 | << " (new)" << dendl; | |
1988 | return f; | |
1989 | } else { | |
1990 | dout(30) << __func__ << " ino " << ino << " = " << p->second << dendl; | |
1991 | return p->second; | |
1992 | } | |
1993 | } | |
1994 | ||
20effc67 TL |
1995 | |
1996 | /** | |
1997 | To modify fnode both FileWriter::lock and File::lock must be obtained. | |
1998 | The special case is when we modify bluefs log (ino 1) or | |
1999 | we are compacting log (ino 0). | |
2000 | ||
2001 | In any case it is enough to hold File::lock to be sure fnode will not be modified. | |
2002 | */ | |
2003 | struct lock_fnode_print { | |
2004 | BlueFS::FileRef file; | |
2005 | lock_fnode_print(BlueFS::FileRef file) : file(file) {}; | |
2006 | }; | |
2007 | std::ostream& operator<<(std::ostream& out, const lock_fnode_print& to_lock) { | |
2008 | std::lock_guard l(to_lock.file->lock); | |
2009 | out << to_lock.file->fnode; | |
2010 | return out; | |
2011 | } | |
2012 | ||
2013 | void BlueFS::_drop_link_D(FileRef file) | |
7c673cae FG |
2014 | { |
2015 | dout(20) << __func__ << " had refs " << file->refs | |
20effc67 | 2016 | << " on " << lock_fnode_print(file) << dendl; |
11fdf7f2 | 2017 | ceph_assert(file->refs > 0); |
20effc67 TL |
2018 | ceph_assert(ceph_mutex_is_locked(log.lock)); |
2019 | ceph_assert(ceph_mutex_is_locked(nodes.lock)); | |
2020 | ||
7c673cae FG |
2021 | --file->refs; |
2022 | if (file->refs == 0) { | |
2023 | dout(20) << __func__ << " destroying " << file->fnode << dendl; | |
11fdf7f2 | 2024 | ceph_assert(file->num_reading.load() == 0); |
9f95a23c | 2025 | vselector->sub_usage(file->vselector_hint, file->fnode); |
20effc67 TL |
2026 | log.t.op_file_remove(file->fnode.ino); |
2027 | nodes.file_map.erase(file->fnode.ino); | |
2028 | logger->set(l_bluefs_num_files, nodes.file_map.size()); | |
7c673cae | 2029 | file->deleted = true; |
94b18763 | 2030 | |
20effc67 TL |
2031 | std::lock_guard dl(dirty.lock); |
2032 | for (auto& r : file->fnode.extents) { | |
2033 | dirty.pending_release[r.bdev].insert(r.offset, r.length); | |
2034 | } | |
2035 | if (file->dirty_seq > dirty.seq_stable) { | |
2036 | // retract request to serialize changes | |
2037 | ceph_assert(dirty.files.count(file->dirty_seq)); | |
2038 | auto it = dirty.files[file->dirty_seq].iterator_to(*file); | |
2039 | dirty.files[file->dirty_seq].erase(it); | |
2040 | file->dirty_seq = dirty.seq_stable; | |
7c673cae FG |
2041 | } |
2042 | } | |
2043 | } | |
2044 | ||
adb31ebb | 2045 | int64_t BlueFS::_read_random( |
7c673cae FG |
2046 | FileReader *h, ///< [in] read from here |
2047 | uint64_t off, ///< [in] offset | |
9f95a23c | 2048 | uint64_t len, ///< [in] this many bytes |
f67539c2 | 2049 | char *out) ///< [out] copy it here |
7c673cae | 2050 | { |
494da23a TL |
2051 | auto* buf = &h->buf; |
2052 | ||
adb31ebb | 2053 | int64_t ret = 0; |
7c673cae FG |
2054 | dout(10) << __func__ << " h " << h |
2055 | << " 0x" << std::hex << off << "~" << len << std::dec | |
20effc67 | 2056 | << " from " << lock_fnode_print(h->file) << dendl; |
7c673cae FG |
2057 | |
2058 | ++h->file->num_reading; | |
2059 | ||
2060 | if (!h->ignore_eof && | |
2061 | off + len > h->file->fnode.size) { | |
2062 | if (off > h->file->fnode.size) | |
2063 | len = 0; | |
2064 | else | |
2065 | len = h->file->fnode.size - off; | |
2066 | dout(20) << __func__ << " reaching (or past) eof, len clipped to 0x" | |
2067 | << std::hex << len << std::dec << dendl; | |
2068 | } | |
494da23a TL |
2069 | logger->inc(l_bluefs_read_random_count, 1); |
2070 | logger->inc(l_bluefs_read_random_bytes, len); | |
7c673cae | 2071 | |
494da23a | 2072 | std::shared_lock s_lock(h->lock); |
f91f0fd5 | 2073 | buf->bl.reassign_to_mempool(mempool::mempool_bluefs_file_reader); |
7c673cae | 2074 | while (len > 0) { |
494da23a TL |
2075 | if (off < buf->bl_off || off >= buf->get_buf_end()) { |
2076 | s_lock.unlock(); | |
2077 | uint64_t x_off = 0; | |
2078 | auto p = h->file->fnode.seek(off, &x_off); | |
f6b5b4d7 | 2079 | ceph_assert(p != h->file->fnode.extents.end()); |
9f95a23c | 2080 | uint64_t l = std::min(p->length - x_off, len); |
adb31ebb TL |
2081 | //hard cap to 1GB |
2082 | l = std::min(l, uint64_t(1) << 30); | |
494da23a TL |
2083 | dout(20) << __func__ << " read random 0x" |
2084 | << std::hex << x_off << "~" << l << std::dec | |
2085 | << " of " << *p << dendl; | |
cd265ab1 TL |
2086 | int r; |
2087 | if (!cct->_conf->bluefs_check_for_zeros) { | |
20effc67 TL |
2088 | r = _bdev_read_random(p->bdev, p->offset + x_off, l, out, |
2089 | cct->_conf->bluefs_buffered_io); | |
cd265ab1 | 2090 | } else { |
20effc67 | 2091 | r = _read_random_and_check(p->bdev, p->offset + x_off, l, out, |
cd265ab1 TL |
2092 | cct->_conf->bluefs_buffered_io); |
2093 | } | |
494da23a TL |
2094 | ceph_assert(r == 0); |
2095 | off += l; | |
2096 | len -= l; | |
2097 | ret += l; | |
2098 | out += l; | |
2099 | ||
2100 | logger->inc(l_bluefs_read_random_disk_count, 1); | |
2101 | logger->inc(l_bluefs_read_random_disk_bytes, l); | |
2102 | if (len > 0) { | |
2103 | s_lock.lock(); | |
2104 | } | |
2105 | } else { | |
2106 | auto left = buf->get_buf_remaining(off); | |
adb31ebb | 2107 | int64_t r = std::min(len, left); |
494da23a TL |
2108 | logger->inc(l_bluefs_read_random_buffer_count, 1); |
2109 | logger->inc(l_bluefs_read_random_buffer_bytes, r); | |
2110 | dout(20) << __func__ << " left 0x" << std::hex << left | |
2111 | << " 0x" << off << "~" << len << std::dec | |
2112 | << dendl; | |
2113 | ||
f67539c2 TL |
2114 | auto p = buf->bl.begin(); |
2115 | p.seek(off - buf->bl_off); | |
2116 | p.copy(r, out); | |
2117 | out += r; | |
7c673cae | 2118 | |
494da23a TL |
2119 | dout(30) << __func__ << " result chunk (0x" |
2120 | << std::hex << r << std::dec << " bytes):\n"; | |
2121 | bufferlist t; | |
2122 | t.substr_of(buf->bl, off - buf->bl_off, r); | |
2123 | t.hexdump(*_dout); | |
2124 | *_dout << dendl; | |
2125 | ||
2126 | off += r; | |
2127 | len -= r; | |
2128 | ret += r; | |
2129 | buf->pos += r; | |
2130 | } | |
2131 | } | |
7c673cae FG |
2132 | dout(20) << __func__ << " got " << ret << dendl; |
2133 | --h->file->num_reading; | |
2134 | return ret; | |
2135 | } | |
2136 | ||
adb31ebb | 2137 | int64_t BlueFS::_read( |
7c673cae | 2138 | FileReader *h, ///< [in] read from here |
7c673cae FG |
2139 | uint64_t off, ///< [in] offset |
2140 | size_t len, ///< [in] this many bytes | |
2141 | bufferlist *outbl, ///< [out] optional: reference the result here | |
2142 | char *out) ///< [out] optional: or copy it here | |
2143 | { | |
f67539c2 TL |
2144 | FileReaderBuffer *buf = &(h->buf); |
2145 | ||
494da23a | 2146 | bool prefetch = !outbl && !out; |
7c673cae FG |
2147 | dout(10) << __func__ << " h " << h |
2148 | << " 0x" << std::hex << off << "~" << len << std::dec | |
20effc67 | 2149 | << " from " << lock_fnode_print(h->file) |
494da23a TL |
2150 | << (prefetch ? " prefetch" : "") |
2151 | << dendl; | |
7c673cae FG |
2152 | |
2153 | ++h->file->num_reading; | |
2154 | ||
2155 | if (!h->ignore_eof && | |
2156 | off + len > h->file->fnode.size) { | |
2157 | if (off > h->file->fnode.size) | |
2158 | len = 0; | |
2159 | else | |
2160 | len = h->file->fnode.size - off; | |
2161 | dout(20) << __func__ << " reaching (or past) eof, len clipped to 0x" | |
2162 | << std::hex << len << std::dec << dendl; | |
2163 | } | |
494da23a TL |
2164 | logger->inc(l_bluefs_read_count, 1); |
2165 | logger->inc(l_bluefs_read_bytes, len); | |
2166 | if (prefetch) { | |
2167 | logger->inc(l_bluefs_read_prefetch_count, 1); | |
2168 | logger->inc(l_bluefs_read_prefetch_bytes, len); | |
2169 | } | |
2170 | ||
7c673cae FG |
2171 | if (outbl) |
2172 | outbl->clear(); | |
2173 | ||
adb31ebb | 2174 | int64_t ret = 0; |
494da23a | 2175 | std::shared_lock s_lock(h->lock); |
7c673cae FG |
2176 | while (len > 0) { |
2177 | size_t left; | |
2178 | if (off < buf->bl_off || off >= buf->get_buf_end()) { | |
494da23a TL |
2179 | s_lock.unlock(); |
2180 | std::unique_lock u_lock(h->lock); | |
f91f0fd5 | 2181 | buf->bl.reassign_to_mempool(mempool::mempool_bluefs_file_reader); |
494da23a TL |
2182 | if (off < buf->bl_off || off >= buf->get_buf_end()) { |
2183 | // if precondition hasn't changed during locking upgrade. | |
2184 | buf->bl.clear(); | |
2185 | buf->bl_off = off & super.block_mask(); | |
2186 | uint64_t x_off = 0; | |
2187 | auto p = h->file->fnode.seek(buf->bl_off, &x_off); | |
f6b5b4d7 TL |
2188 | if (p == h->file->fnode.extents.end()) { |
2189 | dout(5) << __func__ << " reading less then required " | |
2190 | << ret << "<" << ret + len << dendl; | |
2191 | break; | |
2192 | } | |
2193 | ||
494da23a TL |
2194 | uint64_t want = round_up_to(len + (off & ~super.block_mask()), |
2195 | super.block_size); | |
2196 | want = std::max(want, buf->max_prefetch); | |
2197 | uint64_t l = std::min(p->length - x_off, want); | |
adb31ebb TL |
2198 | //hard cap to 1GB |
2199 | l = std::min(l, uint64_t(1) << 30); | |
494da23a TL |
2200 | uint64_t eof_offset = round_up_to(h->file->fnode.size, super.block_size); |
2201 | if (!h->ignore_eof && | |
2202 | buf->bl_off + l > eof_offset) { | |
2203 | l = eof_offset - buf->bl_off; | |
2204 | } | |
2205 | dout(20) << __func__ << " fetching 0x" | |
2206 | << std::hex << x_off << "~" << l << std::dec | |
2207 | << " of " << *p << dendl; | |
cd265ab1 TL |
2208 | int r; |
2209 | if (!cct->_conf->bluefs_check_for_zeros) { | |
20effc67 TL |
2210 | r = _bdev_read(p->bdev, p->offset + x_off, l, &buf->bl, ioc[p->bdev], |
2211 | cct->_conf->bluefs_buffered_io); | |
cd265ab1 | 2212 | } else { |
20effc67 TL |
2213 | r = _read_and_check( |
2214 | p->bdev, p->offset + x_off, l, &buf->bl, ioc[p->bdev], | |
2215 | cct->_conf->bluefs_buffered_io); | |
cd265ab1 | 2216 | } |
20effc67 TL |
2217 | logger->inc(l_bluefs_read_disk_count, 1); |
2218 | logger->inc(l_bluefs_read_disk_bytes, l); | |
2219 | ||
494da23a | 2220 | ceph_assert(r == 0); |
7c673cae | 2221 | } |
494da23a TL |
2222 | u_lock.unlock(); |
2223 | s_lock.lock(); | |
2224 | // we should recheck if buffer is valid after lock downgrade | |
2225 | continue; | |
7c673cae FG |
2226 | } |
2227 | left = buf->get_buf_remaining(off); | |
2228 | dout(20) << __func__ << " left 0x" << std::hex << left | |
2229 | << " len 0x" << len << std::dec << dendl; | |
2230 | ||
adb31ebb | 2231 | int64_t r = std::min(len, left); |
7c673cae FG |
2232 | if (outbl) { |
2233 | bufferlist t; | |
2234 | t.substr_of(buf->bl, off - buf->bl_off, r); | |
2235 | outbl->claim_append(t); | |
2236 | } | |
2237 | if (out) { | |
f67539c2 TL |
2238 | auto p = buf->bl.begin(); |
2239 | p.seek(off - buf->bl_off); | |
2240 | p.copy(r, out); | |
7c673cae FG |
2241 | out += r; |
2242 | } | |
2243 | ||
2244 | dout(30) << __func__ << " result chunk (0x" | |
2245 | << std::hex << r << std::dec << " bytes):\n"; | |
2246 | bufferlist t; | |
2247 | t.substr_of(buf->bl, off - buf->bl_off, r); | |
2248 | t.hexdump(*_dout); | |
2249 | *_dout << dendl; | |
2250 | ||
2251 | off += r; | |
2252 | len -= r; | |
2253 | ret += r; | |
2254 | buf->pos += r; | |
2255 | } | |
f67539c2 | 2256 | |
7c673cae | 2257 | dout(20) << __func__ << " got " << ret << dendl; |
11fdf7f2 | 2258 | ceph_assert(!outbl || (int)outbl->length() == ret); |
7c673cae FG |
2259 | --h->file->num_reading; |
2260 | return ret; | |
2261 | } | |
2262 | ||
20effc67 | 2263 | void BlueFS::invalidate_cache(FileRef f, uint64_t offset, uint64_t length) |
7c673cae | 2264 | { |
20effc67 | 2265 | std::lock_guard l(f->lock); |
7c673cae FG |
2266 | dout(10) << __func__ << " file " << f->fnode |
2267 | << " 0x" << std::hex << offset << "~" << length << std::dec | |
2268 | << dendl; | |
2269 | if (offset & ~super.block_mask()) { | |
2270 | offset &= super.block_mask(); | |
11fdf7f2 | 2271 | length = round_up_to(length, super.block_size); |
7c673cae FG |
2272 | } |
2273 | uint64_t x_off = 0; | |
2274 | auto p = f->fnode.seek(offset, &x_off); | |
2275 | while (length > 0 && p != f->fnode.extents.end()) { | |
11fdf7f2 | 2276 | uint64_t x_len = std::min(p->length - x_off, length); |
7c673cae FG |
2277 | bdev[p->bdev]->invalidate_cache(p->offset + x_off, x_len); |
2278 | dout(20) << __func__ << " 0x" << std::hex << x_off << "~" << x_len | |
2279 | << std:: dec << " of " << *p << dendl; | |
2280 | offset += x_len; | |
2281 | length -= x_len; | |
2282 | } | |
2283 | } | |
2284 | ||
20effc67 | 2285 | uint64_t BlueFS::_estimate_log_size_N() |
7c673cae | 2286 | { |
20effc67 | 2287 | std::lock_guard nl(nodes.lock); |
7c673cae FG |
2288 | int avg_dir_size = 40; // fixme |
2289 | int avg_file_size = 12; | |
2290 | uint64_t size = 4096 * 2; | |
20effc67 TL |
2291 | size += nodes.file_map.size() * (1 + sizeof(bluefs_fnode_t)); |
2292 | size += nodes.dir_map.size() + (1 + avg_dir_size); | |
2293 | size += nodes.file_map.size() * (1 + avg_dir_size + avg_file_size); | |
11fdf7f2 | 2294 | return round_up_to(size, super.block_size); |
7c673cae FG |
2295 | } |
2296 | ||
20effc67 | 2297 | void BlueFS::compact_log()/*_LNF_LD_NF_D*/ |
7c673cae | 2298 | { |
f6b5b4d7 TL |
2299 | if (!cct->_conf->bluefs_replay_recovery_disable_compact) { |
2300 | if (cct->_conf->bluefs_compact_log_sync) { | |
20effc67 | 2301 | _compact_log_sync_LNF_LD(); |
f6b5b4d7 | 2302 | } else { |
20effc67 | 2303 | _compact_log_async_LD_LNF_D(); |
f6b5b4d7 | 2304 | } |
7c673cae FG |
2305 | } |
2306 | } | |
2307 | ||
20effc67 | 2308 | bool BlueFS::_should_start_compact_log_L_N() |
7c673cae | 2309 | { |
20effc67 TL |
2310 | if (log_is_compacting.load() == true) { |
2311 | // compaction is already running | |
2312 | return false; | |
2313 | } | |
2314 | uint64_t current; | |
2315 | { | |
2316 | std::lock_guard ll(log.lock); | |
2317 | current = log.writer->file->fnode.size; | |
2318 | } | |
2319 | uint64_t expected = _estimate_log_size_N(); | |
7c673cae FG |
2320 | float ratio = (float)current / (float)expected; |
2321 | dout(10) << __func__ << " current 0x" << std::hex << current | |
2322 | << " expected " << expected << std::dec | |
2323 | << " ratio " << ratio | |
7c673cae | 2324 | << dendl; |
20effc67 | 2325 | if (current < cct->_conf->bluefs_log_compact_min_size || |
7c673cae FG |
2326 | ratio < cct->_conf->bluefs_log_compact_min_ratio) { |
2327 | return false; | |
2328 | } | |
2329 | return true; | |
2330 | } | |
2331 | ||
20effc67 | 2332 | void BlueFS::_compact_log_dump_metadata_NF(bluefs_transaction_t *t, |
11fdf7f2 | 2333 | int flags) |
7c673cae | 2334 | { |
20effc67 TL |
2335 | std::lock_guard nl(nodes.lock); |
2336 | ||
7c673cae FG |
2337 | t->seq = 1; |
2338 | t->uuid = super.uuid; | |
2339 | dout(20) << __func__ << " op_init" << dendl; | |
2340 | ||
2341 | t->op_init(); | |
20effc67 | 2342 | for (auto& [ino, file_ref] : nodes.file_map) { |
9f95a23c | 2343 | if (ino == 1) |
7c673cae | 2344 | continue; |
9f95a23c | 2345 | ceph_assert(ino > 1); |
20effc67 | 2346 | std::lock_guard fl(file_ref->lock); |
9f95a23c | 2347 | for(auto& e : file_ref->fnode.extents) { |
11fdf7f2 TL |
2348 | auto bdev = e.bdev; |
2349 | auto bdev_new = bdev; | |
2350 | ceph_assert(!((flags & REMOVE_WAL) && bdev == BDEV_WAL)); | |
2351 | if ((flags & RENAME_SLOW2DB) && bdev == BDEV_SLOW) { | |
2352 | bdev_new = BDEV_DB; | |
2353 | } | |
2354 | if ((flags & RENAME_DB2SLOW) && bdev == BDEV_DB) { | |
2355 | bdev_new = BDEV_SLOW; | |
2356 | } | |
2357 | if (bdev == BDEV_NEWDB) { | |
2358 | // REMOVE_DB xor RENAME_DB | |
2359 | ceph_assert(!(flags & REMOVE_DB) != !(flags & RENAME_DB2SLOW)); | |
2360 | ceph_assert(!(flags & RENAME_SLOW2DB)); | |
2361 | bdev_new = BDEV_DB; | |
2362 | } | |
2363 | if (bdev == BDEV_NEWWAL) { | |
2364 | ceph_assert(flags & REMOVE_WAL); | |
2365 | bdev_new = BDEV_WAL; | |
2366 | } | |
2367 | e.bdev = bdev_new; | |
2368 | } | |
9f95a23c TL |
2369 | dout(20) << __func__ << " op_file_update " << file_ref->fnode << dendl; |
2370 | t->op_file_update(file_ref->fnode); | |
7c673cae | 2371 | } |
20effc67 TL |
2372 | for (auto& [path, dir_ref] : nodes.dir_map) { |
2373 | dout(20) << __func__ << " op_dir_create " << path << dendl; | |
2374 | t->op_dir_create(path); | |
2375 | for (auto& [fname, file_ref] : dir_ref->file_map) { | |
2376 | dout(20) << __func__ << " op_dir_link " << path << "/" << fname | |
2377 | << " to " << file_ref->fnode.ino << dendl; | |
2378 | t->op_dir_link(path, fname, file_ref->fnode.ino); | |
2379 | } | |
2380 | } | |
2381 | } | |
2382 | /* Streams to t files modified before *capture_before_seq* and all dirs */ | |
2383 | void BlueFS::_compact_log_async_dump_metadata_NF(bluefs_transaction_t *t, | |
2384 | uint64_t capture_before_seq) | |
2385 | { | |
2386 | std::lock_guard nl(nodes.lock); | |
2387 | ||
2388 | t->seq = 1; | |
2389 | t->uuid = super.uuid; | |
2390 | dout(20) << __func__ << " op_init" << dendl; | |
2391 | ||
2392 | t->op_init(); | |
2393 | for (auto& [ino, file_ref] : nodes.file_map) { | |
2394 | if (ino == 1) | |
2395 | continue; | |
2396 | ceph_assert(ino > 1); | |
2397 | std::lock_guard fl(file_ref->lock); | |
2398 | if (file_ref->dirty_seq < capture_before_seq) { | |
2399 | dout(20) << __func__ << " op_file_update " << file_ref->fnode << dendl; | |
2400 | } else { | |
2401 | dout(20) << __func__ << " op_file_update just modified, dirty_seq=" | |
2402 | << file_ref->dirty_seq << " " << file_ref->fnode << dendl; | |
2403 | } | |
2404 | t->op_file_update(file_ref->fnode); | |
2405 | } | |
2406 | for (auto& [path, dir_ref] : nodes.dir_map) { | |
9f95a23c TL |
2407 | dout(20) << __func__ << " op_dir_create " << path << dendl; |
2408 | t->op_dir_create(path); | |
2409 | for (auto& [fname, file_ref] : dir_ref->file_map) { | |
2410 | dout(20) << __func__ << " op_dir_link " << path << "/" << fname | |
2411 | << " to " << file_ref->fnode.ino << dendl; | |
2412 | t->op_dir_link(path, fname, file_ref->fnode.ino); | |
7c673cae FG |
2413 | } |
2414 | } | |
2415 | } | |
2416 | ||
20effc67 | 2417 | void BlueFS::_compact_log_sync_LNF_LD() |
7c673cae FG |
2418 | { |
2419 | dout(10) << __func__ << dendl; | |
20effc67 TL |
2420 | uint8_t prefer_bdev; |
2421 | { | |
2422 | std::lock_guard ll(log.lock); | |
2423 | prefer_bdev = | |
2424 | vselector->select_prefer_bdev(log.writer->file->vselector_hint); | |
2425 | } | |
2426 | _rewrite_log_and_layout_sync_LNF_LD(true, | |
11fdf7f2 | 2427 | BDEV_DB, |
9f95a23c TL |
2428 | prefer_bdev, |
2429 | prefer_bdev, | |
2430 | 0, | |
2431 | super.memorized_layout); | |
11fdf7f2 TL |
2432 | logger->inc(l_bluefs_log_compactions); |
2433 | } | |
2434 | ||
20effc67 TL |
// Rewrites the whole log from the current in-memory metadata and points the
// superblock at the new log.
// log.lock is taken on entry and held until return; dirty.lock is taken at
// the very end to queue the old log extents for release.
//
//   allocate_with_fallback - true: space for the new log comes from
//                            _allocate(); false: it comes from
//                            _allocate_without_fallback() and every returned
//                            extent is appended as a log_dev extent.
//   super_dev   - device id handed to _write_super().
//   log_dev     - device the new log space is requested from.
//   log_dev_new - when different from log_dev, the bdev id written into the
//                 extents of super.log_fnode (the "rename").
//   flags       - forwarded to _compact_log_dump_metadata_NF().
//   layout      - stored in super.memorized_layout before the super is written.
2435 | void BlueFS::_rewrite_log_and_layout_sync_LNF_LD(bool allocate_with_fallback, | |
2436 | int super_dev, | |
2437 | int log_dev, | |
2438 | int log_dev_new, | |
2439 | int flags, | |
2440 | std::optional<bluefs_layout_t> layout) | |
11fdf7f2 | 2441 | {
20effc67 TL |
2442 | std::lock_guard ll(log.lock); | |
2443 | | |
2444 | File *log_file = log.writer->file.get(); | |
7c673cae | 2445 |
20effc67 TL |
2446 | // log.t.seq is always set to current live seq | |
2447 | ceph_assert(log.t.seq == log.seq_live); | |
2448 | // Capturing entire state. Dump anything that has been stored there. | |
2449 | log.t.clear(); | |
2450 | log.t.seq = log.seq_live; | |
2451 | // From now on, no changes to log.t are permitted until we finish rewriting log. | |
2452 | // Can allow dirty to remain dirty - log.seq_live will not change. | |
7c673cae | 2453 |
11fdf7f2 TL |
2454 | dout(20) << __func__ << " super_dev:" << super_dev | |
2455 | << " log_dev:" << log_dev | |
2456 | << " log_dev_new:" << log_dev_new | |
2457 | << " flags:" << flags | |
2458 | << dendl; | |
// Build the replacement log content: full metadata dump followed by a jump
// to the current live sequence number.
7c673cae | 2459 | bluefs_transaction_t t;
20effc67 | 2460 | _compact_log_dump_metadata_NF(&t, flags);
7c673cae | 2461 |
20effc67 TL |
2462 | dout(20) << __func__ << " op_jump_seq " << log.seq_live << dendl; | |
2463 | t.op_jump_seq(log.seq_live); | |
7c673cae FG |
2464 | | |
2465 | bufferlist bl; | |
11fdf7f2 | 2466 | encode(t, bl);
7c673cae FG |
2467 | _pad_bl(bl); | |
2468 | | |
// Space requested for the new log: the encoded dump plus the configured
// maximum runway.
2469 | uint64_t need = bl.length() + cct->_conf->bluefs_max_log_runway; | |
2470 | dout(20) << __func__ << " need " << need << dendl; | |
2471 | | |
494da23a | 2472 | bluefs_fnode_t old_fnode;
11fdf7f2 | 2473 | int r;
20effc67 | 2474 | vselector->sub_usage(log_file->vselector_hint, log_file->fnode);
// After the swap old_fnode carries the previous log extents (released at the
// end of this function) and log_file->fnode carries old_fnode's extents,
// i.e. none, since old_fnode was just default-constructed.
494da23a | 2475 | log_file->fnode.swap_extents(old_fnode);
11fdf7f2 TL |
2476 | if (allocate_with_fallback) { | |
2477 | r = _allocate(log_dev, need, &log_file->fnode); | |
2478 | ceph_assert(r == 0); | |
2479 | } else { | |
2480 | PExtentVector extents; | |
2481 | r = _allocate_without_fallback(log_dev, | |
2482 | need, | |
2483 | &extents); | |
2484 | ceph_assert(r == 0); | |
2485 | for (auto& p : extents) { | |
2486 | log_file->fnode.append_extent( | |
2487 | bluefs_extent_t(log_dev, p.offset, p.length)); | |
2488 | } | |
7c673cae FG |
2489 | } | |
2490 | | |
// Replace the log writer with a fresh one on the same File and write the
// dump through it.
20effc67 | 2491 | _close_writer(log.writer);
7c673cae | 2492 |
20effc67 TL |
2493 | // we will write it to super | |
2494 | log_file->fnode.reset_delta(); | |
7c673cae | 2495 | log_file->fnode.size = bl.length();
9f95a23c | 2496 |
20effc67 TL |
2497 | log.writer = _create_writer(log_file); | |
2498 | log.writer->append(bl); | |
2499 | _flush_special(log.writer); | |
2500 | vselector->add_usage(log_file->vselector_hint, log_file->fnode); | |
11fdf7f2 TL |
2501 | #ifdef HAVE_LIBAIO | |
2502 | if (!cct->_conf->bluefs_sync_write) { | |
2503 | list<aio_t> completed_ios; | |
20effc67 TL |
2504 | _claim_completed_aios(log.writer, &completed_ios); | |
2505 | _wait_for_aio(log.writer); | |
11fdf7f2 TL |
2506 | completed_ios.clear(); | |
2507 | } | |
2508 | #endif | |
20effc67 | 2509 | _flush_bdev();
224ce89b | 2510 |
// The new log content has been flushed; now publish it through the superblock.
9f95a23c | 2511 | super.memorized_layout = layout;
7c673cae | 2512 | super.log_fnode = log_file->fnode;
11fdf7f2 TL |
2513 | // rename device if needed | |
2514 | if (log_dev != log_dev_new) { | |
2515 | dout(10) << __func__ << " renaming log extents to " << log_dev_new << dendl; | |
2516 | for (auto& p : super.log_fnode.extents) { | |
2517 | p.bdev = log_dev_new; | |
2518 | } | |
2519 | } | |
2520 | dout(10) << __func__ << " writing super, log fnode: " << super.log_fnode << dendl; | |
2521 | | |
7c673cae | 2522 | ++super.version;
11fdf7f2 | 2523 | _write_super(super_dev);
20effc67 | 2524 | _flush_bdev();
7c673cae | 2525 |
// Only after the super has been written and flushed are the old log extents
// queued in dirty.pending_release.
494da23a | 2526 | dout(10) << __func__ << " release old log extents " << old_fnode.extents << dendl;
20effc67 | 2527 | std::lock_guard dl(dirty.lock);
494da23a | 2528 | for (auto& r : old_fnode.extents) {
20effc67 | 2529 | dirty.pending_release[r.bdev].insert(r.offset, r.length);
7c673cae | 2530 | }
7c673cae FG |
2531 | } | |
2532 | ||
2533 | /* | |
2534 | * 1. Allocate a new extent to continue the log, and then log an event | |
2535 | * that jumps the log write position to the new extent. At this point, the | |
2536 | * old extent(s) won't be written to, and reflect everything to compact. | |
2537 | * New events will be written to the new region that we'll keep. | |
2538 | * | |
2539 | * 2. While still holding the lock, encode a bufferlist that dumps all of the | |
2540 | * in-memory fnodes and names. This will become the new beginning of the | |
2541 | * log. The last event will jump to the log continuation extent from #1. | |
2542 | * | |
2543 | * 3. Queue a write to a new extent for the new beginning of the log. | |
2544 | * | |
2545 | * 4. Drop lock and wait | |
2546 | * | |
2547 | * 5. Retake the lock. | |
2548 | * | |
2549 | * 6. Update the log_fnode to splice in the new beginning. | |
2550 | * | |
2551 | * 7. Write the new superblock. | |
2552 | * | |
2553 | * 8. Release the old log space. Clean up. | |
2554 | */ | |
20effc67 TL |
2555 | |
// Asynchronous log compaction.
// At most one compaction runs at a time: log_is_compacting is exchanged to
// true on entry (a concurrent caller logs "ongoing" and returns) and back to
// false on exit.
// log.lock is held while the old log gets its continuation extent, the jump
// is flushed and the metadata is captured; it is released while the new log
// head is written and the superblock is updated; it is re-taken to swap the
// extents into the live log file.
// log_forbidden_to_expand is set for most of the function; while it is set
// _maybe_extend_log() returns -EWOULDBLOCK instead of growing the log, and
// waiters are woken through log_cond once it is cleared.
2556 | void BlueFS::_compact_log_async_LD_LNF_D() //also locks FW for new_writer | |
7c673cae FG |
2557 | { | |
2558 | dout(10) << __func__ << dendl; | |
20effc67 TL |
2559 | // only one compaction allowed at one time | |
2560 | bool old_is_comp = std::atomic_exchange(&log_is_compacting, true); | |
2561 | if (old_is_comp) { | |
2562 | dout(10) << __func__ << " ongoing" <<dendl; | |
2563 | return; | |
2564 | } | |
2565 | | |
2566 | log.lock.lock(); | |
2567 | File *log_file = log.writer->file.get(); | |
2568 | FileWriter *new_log_writer = nullptr; | |
2569 | FileRef new_log = nullptr; | |
2570 | uint64_t new_log_jump_to = 0; | |
2571 | uint64_t old_log_jump_to = 0; | |
7c673cae | 2572 |
9f95a23c | 2573 | new_log = ceph::make_ref<File>();
20effc67 | 2574 | new_log->fnode.ino = 0; // we use _flush_special to avoid log of the fnode
181888fb | 2575 |
20effc67 TL |
2576 | // Part 1. | |
2577 | // Prepare current log for jumping into it. | |
2578 | // 1. Allocate extent | |
2579 | // 2. Update op to log | |
2580 | // 3. Jump op to log | |
2581 | // During that, no one else can write to log, otherwise we risk jumping backwards. | |
2582 | // We need to sync log, because we are injecting discontinuity, and writer is not prepared for that. | |
2583 | | |
2584 | //signal _maybe_extend_log that expansion of log is temporary inacceptable | |
2585 | bool old_forbidden = atomic_exchange(&log_forbidden_to_expand, true); | |
2586 | ceph_assert(old_forbidden == false); | |
3efd9988 | 2587 |
9f95a23c TL |
2588 | vselector->sub_usage(log_file->vselector_hint, log_file->fnode); | |
2589 | | |
20effc67 | 2590 | // 1.1 allocate new log space and jump to it.
// old_log_jump_to is the end of the currently allocated log, i.e. the offset
// at which the extent allocated below will begin.
7c673cae | 2591 | old_log_jump_to = log_file->fnode.get_allocated();
// Space still free in the log before the new extent is added; handed to the
// jump flush below as its available runway.
20effc67 | 2592 | uint64_t runway = log_file->fnode.get_allocated() - log.writer->get_effective_write_pos();
7c673cae | 2593 | dout(10) << __func__ << " old_log_jump_to 0x" << std::hex << old_log_jump_to
11fdf7f2 | 2594 | << " need 0x" << (old_log_jump_to + cct->_conf->bluefs_max_log_runway) << std::dec << dendl;
9f95a23c TL |
2595 | int r = _allocate(vselector->select_prefer_bdev(log_file->vselector_hint), | |
2596 | cct->_conf->bluefs_max_log_runway, | |
2597 | &log_file->fnode); | |
11fdf7f2 | 2598 | ceph_assert(r == 0);
9f95a23c TL |
2599 | //adjust usage as flush below will need it | |
2600 | vselector->add_usage(log_file->vselector_hint, log_file->fnode); | |
7c673cae FG |
2601 | dout(10) << __func__ << " log extents " << log_file->fnode.extents << dendl; | |
2602 | | |
2603 | // update the log file change and log a jump to the offset where we want to | |
2604 | // write the new entries | |
20effc67 TL |
2605 | log.t.op_file_update(log_file->fnode); | |
2606 | // jump to new position should mean next seq | |
2607 | log.t.op_jump(log.seq_live + 1, old_log_jump_to); | |
// seq_now is the live seq before the jump flush below; it is later used both
// for the metadata capture and for the jump op that ends the new log head.
2608 | uint64_t seq_now = log.seq_live; | |
2609 | // we need to flush all bdev because we will be streaming all dirty files to log | |
2610 | // TODO - think - if _flush_and_sync_log_jump will not add dirty files nor release pending allocations | |
2611 | // then flush_bdev() will not be necessary | |
2612 | _flush_bdev(); | |
2613 | _flush_and_sync_log_jump_D(old_log_jump_to, runway); | |
2614 | | |
2615 | // out of jump section | |
7c673cae FG |
2616 | | |
2617 | // 2. prepare compacted log | |
2618 | bluefs_transaction_t t; | |
20effc67 TL |
2619 | _compact_log_async_dump_metadata_NF(&t, seq_now); | |
2620 | | |
2621 | // now state is captured to bufferlist | |
2622 | // log can be used to write to, ops in log will be continuation of captured state | |
2623 | log.lock.unlock(); | |
7c673cae | 2624 |
eafe8130 TL |
2625 | uint64_t max_alloc_size = std::max(alloc_size[BDEV_WAL], | |
2626 | std::max(alloc_size[BDEV_DB], | |
2627 | alloc_size[BDEV_SLOW])); | |
2628 | | |
7c673cae | 2629 | // conservative estimate for final encoded size
11fdf7f2 | 2630 | new_log_jump_to = round_up_to(t.op_bl.length() + super.block_size * 2,
eafe8130 | 2631 | max_alloc_size);
20effc67 TL |
2632 | //newly constructed log head will jump to what we had before | |
2633 | t.op_jump(seq_now, new_log_jump_to); | |
7c673cae | 2634 |
11fdf7f2 | 2635 | // allocate
9f95a23c | 2636 | //FIXME: check if we want DB here?
11fdf7f2 TL |
2637 | r = _allocate(BlueFS::BDEV_DB, new_log_jump_to, | |
2638 | &new_log->fnode); | |
2639 | ceph_assert(r == 0); | |
2640 | | |
7c673cae | 2641 | bufferlist bl;
11fdf7f2 | 2642 | encode(t, bl);
7c673cae FG |
2643 | _pad_bl(bl); | |
2644 | | |
2645 | dout(10) << __func__ << " new_log_jump_to 0x" << std::hex << new_log_jump_to | |
2646 | << std::dec << dendl; | |
2647 | | |
7c673cae | 2648 | new_log_writer = _create_writer(new_log);
7c673cae | 2649 |
20effc67 | 2650 | new_log_writer->append(bl);
7c673cae | 2651 | // 3. flush
20effc67 | 2652 | _flush_special(new_log_writer);
7c673cae FG |
2653 | | |
2654 | // 4. wait | |
20effc67 | 2655 | _flush_bdev(new_log_writer);
11fdf7f2 | 2656 | // 5. update our log fnode
20effc67 TL |
2657 | // we need to append to new_log the extents that were allocated in step 1.1 | |
2658 | // we do it by inverse logic - we drop 'old_log_jump_to' bytes and keep rest | |
2659 | // todo - maybe improve _allocate so we will give clear set of new allocations | |
// 'processed' is the logical offset of extent e within the old log file.
2660 | uint64_t processed = 0; | |
7c673cae | 2661 | mempool::bluefs::vector<bluefs_extent_t> old_extents;
20effc67 TL |
2662 | for (auto& e : log_file->fnode.extents) { | |
2663 | if (processed + e.length <= old_log_jump_to) { | |
2664 | // drop whole extent | |
7c673cae | 2665 | dout(10) << __func__ << " remove old log extent " << e << dendl;
20effc67 | 2666 | old_extents.push_back(e);
7c673cae | 2667 | } else {
20effc67 TL |
2668 | // keep, but how much? | |
2669 | if (processed < old_log_jump_to) { | |
2670 | ceph_assert(processed + e.length > old_log_jump_to); | |
2671 | ceph_assert(old_log_jump_to - processed <= std::numeric_limits<uint32_t>::max()); | |
2672 | uint32_t cut_at = uint32_t(old_log_jump_to - processed); | |
2673 | // need to cut, first half gets dropped | |
2674 | bluefs_extent_t retire(e.bdev, e.offset, cut_at); | |
2675 | old_extents.push_back(retire); | |
2676 | // second half goes to new log | |
2677 | bluefs_extent_t keep(e.bdev, e.offset + cut_at, e.length - cut_at); | |
2678 | new_log->fnode.append_extent(keep); | |
2679 | dout(10) << __func__ << " kept " << keep << " removed " << retire << dendl; | |
2680 | } else { | |
2681 | // take entire extent | |
2682 | ceph_assert(processed >= old_log_jump_to); | |
2683 | new_log->fnode.append_extent(e); | |
2684 | dout(10) << __func__ << " kept " << e << dendl; | |
2685 | } | |
7c673cae | 2686 | }
20effc67 | 2687 | processed += e.length;
94b18763 | 2688 | }
20effc67 TL |
2689 | // we will write it to super | |
2690 | new_log->fnode.reset_delta(); | |
7c673cae | 2691 |
20effc67 TL |
2692 | // 6. write the super block to reflect the changes | |
2693 | dout(10) << __func__ << " writing super" << dendl; | |
2694 | new_log->fnode.ino = log_file->fnode.ino; | |
2695 | new_log->fnode.size = 0; | |
2696 | new_log->fnode.mtime = ceph_clock_now(); | |
2697 | super.log_fnode = new_log->fnode; | |
2698 | ++super.version; | |
2699 | _write_super(BDEV_DB); | |
2700 | _flush_bdev(); | |
2701 | | |
2702 | log.lock.lock(); | |
2703 | // swapping log_file and new_log | |
9f95a23c TL |
2704 | vselector->sub_usage(log_file->vselector_hint, log_file->fnode); | |
2705 | | |
7c673cae | 2706 | // clear the extents from old log file, they are added to new log
94b18763 | 2707 | log_file->fnode.clear_extents();
7c673cae | 2708 | // swap the log files. New log file is the log file now.
94b18763 FG |
2709 | new_log->fnode.swap_extents(log_file->fnode); | |
2710 | | |
// Rebase the live writer: whatever was written past old_log_jump_to in the
// old layout now sits past new_log_jump_to in the new layout.
20effc67 TL |
2711 | log.writer->pos = log.writer->file->fnode.size = | |
2712 | log.writer->pos - old_log_jump_to + new_log_jump_to; | |
7c673cae | 2713 |
9f95a23c TL |
2714 | vselector->add_usage(log_file->vselector_hint, log_file->fnode); | |
2715 | | |
20effc67 | 2716 | log.lock.unlock();
7c673cae | 2717 |
20effc67 TL |
2718 | old_forbidden = atomic_exchange(&log_forbidden_to_expand, false); | |
2719 | ceph_assert(old_forbidden == true); | |
2720 | //to wake up if someone was in need of expanding log | |
2721 | log_cond.notify_all(); | |
7c673cae | 2722 |
11fdf7f2 | 2723 | // 7. release old space
7c673cae | 2724 | dout(10) << __func__ << " release old log extents " << old_extents << dendl;
20effc67 TL |
2725 | { | |
2726 | std::lock_guard dl(dirty.lock); | |
2727 | for (auto& r : old_extents) { | |
2728 | dirty.pending_release[r.bdev].insert(r.offset, r.length); | |
2729 | } | |
7c673cae FG |
2730 | } | |
2731 | | |
2732 | // delete the new log, remove from the dirty files list | |
2733 | _close_writer(new_log_writer); | |
7c673cae FG |
2734 | new_log_writer = nullptr; | |
2735 | new_log = nullptr; | |
// NOTE(review): second notify_all(); waiters were already notified right
// after log_forbidden_to_expand was cleared - confirm whether still needed.
2736 | log_cond.notify_all(); | |
2737 | | |
2738 | dout(10) << __func__ << " log extents " << log_file->fnode.extents << dendl; | |
2739 | logger->inc(l_bluefs_log_compactions); | |
20effc67 TL |
2740 | | |
2741 | old_is_comp = atomic_exchange(&log_is_compacting, false); | |
2742 | ceph_assert(old_is_comp); | |
7c673cae FG |
2743 | } | |
2744 | ||
2745 | void BlueFS::_pad_bl(bufferlist& bl) | |
2746 | { | |
2747 | uint64_t partial = bl.length() % super.block_size; | |
2748 | if (partial) { | |
2749 | dout(10) << __func__ << " padding with 0x" << std::hex | |
2750 | << super.block_size - partial << " zeros" << std::dec << dendl; | |
2751 | bl.append_zero(super.block_size - partial); | |
2752 | } | |
2753 | } | |
2754 | ||
7c673cae | 2755 | |
20effc67 TL |
2756 | // Returns log seq that was live before advance. |
2757 | uint64_t BlueFS::_log_advance_seq() | |
7c673cae | 2758 | { |
20effc67 TL |
2759 | ceph_assert(ceph_mutex_is_locked(dirty.lock)); |
2760 | ceph_assert(ceph_mutex_is_locked(log.lock)); | |
2761 | //acquire new seq | |
2762 | // this will became seq_stable once we write | |
2763 | ceph_assert(dirty.seq_stable < dirty.seq_live); | |
2764 | ceph_assert(log.t.seq == log.seq_live); | |
2765 | uint64_t seq = log.seq_live; | |
2766 | log.t.uuid = super.uuid; | |
2767 | ||
2768 | ++dirty.seq_live; | |
2769 | ++log.seq_live; | |
2770 | ceph_assert(dirty.seq_live == log.seq_live); | |
2771 | return seq; | |
2772 | } | |
7c673cae | 2773 | |
a8e16298 | 2774 | |
20effc67 TL |
2775 | // Adds to log.t file modifications mentioned in `dirty.files`. |
2776 | // Note: some bluefs ops may have already been stored in log.t transaction. | |
2777 | void BlueFS::_consume_dirty(uint64_t seq) | |
2778 | { | |
2779 | ceph_assert(ceph_mutex_is_locked(dirty.lock)); | |
2780 | ceph_assert(ceph_mutex_is_locked(log.lock)); | |
7c673cae FG |
2781 | |
2782 | // log dirty files | |
20effc67 TL |
2783 | // we just incremented log_seq. It is now illegal to add to dirty.files[log_seq] |
2784 | auto lsi = dirty.files.find(seq); | |
2785 | if (lsi != dirty.files.end()) { | |
2786 | dout(20) << __func__ << " " << lsi->second.size() << " dirty.files" << dendl; | |
7c673cae | 2787 | for (auto &f : lsi->second) { |
20effc67 TL |
2788 | // fnode here is protected indirectly |
2789 | // the only path that adds to dirty.files goes from _fsync() | |
2790 | // _fsync() is executed under writer lock, | |
2791 | // and does not exit until syncing log is done | |
2792 | dout(20) << __func__ << " op_file_update_inc " << f.fnode << dendl; | |
2793 | log.t.op_file_update_inc(f.fnode); | |
7c673cae FG |
2794 | } |
2795 | } | |
20effc67 | 2796 | } |
7c673cae | 2797 | |
20effc67 TL |
2798 | // Extends log if its free space is smaller then bluefs_min_log_runway. |
2799 | // Returns space available *BEFORE* adding new space. Signed for additional <0 detection. | |
2800 | int64_t BlueFS::_maybe_extend_log() | |
2801 | { | |
2802 | ceph_assert(ceph_mutex_is_locked(log.lock)); | |
7c673cae | 2803 | // allocate some more space (before we run out)? |
20effc67 TL |
2804 | // BTW: this triggers `flush()` in the `page_aligned_appender` of `log.writer`. |
2805 | int64_t runway = log.writer->file->fnode.get_allocated() - | |
2806 | log.writer->get_effective_write_pos(); | |
7c673cae FG |
2807 | if (runway < (int64_t)cct->_conf->bluefs_min_log_runway) { |
2808 | dout(10) << __func__ << " allocating more log runway (0x" | |
2809 | << std::hex << runway << std::dec << " remaining)" << dendl; | |
20effc67 TL |
2810 | /* |
2811 | * Usually, when we are low on space in log, we just allocate new extent, | |
2812 | * put update op(log) to log and we are fine. | |
2813 | * Problem - it interferes with log compaction: | |
2814 | * New log produced in compaction will include - as last op - jump into some offset (anchor) of current log. | |
2815 | * It is assumed that log region (anchor - end) will contain all changes made by bluefs since | |
2816 | * full state capture into new log. | |
2817 | * Putting log update into (anchor - end) region is illegal, because any update there must be compatible with | |
2818 | * both logs, but old log is different then new log. | |
2819 | * | |
2820 | * Possible solutions: | |
2821 | * - stall extending log until we finish compacting and switch log (CURRENT) | |
2822 | * - re-run compaction with more runway for old log | |
2823 | * - add OP_FILE_ADDEXT that adds extent; will be compatible with both logs | |
2824 | */ | |
2825 | if (log_forbidden_to_expand.load() == true) { | |
2826 | return -EWOULDBLOCK; | |
7c673cae | 2827 | } |
20effc67 | 2828 | vselector->sub_usage(log.writer->file->vselector_hint, log.writer->file->fnode); |
9f95a23c | 2829 | int r = _allocate( |
20effc67 | 2830 | vselector->select_prefer_bdev(log.writer->file->vselector_hint), |
9f95a23c | 2831 | cct->_conf->bluefs_max_log_runway, |
20effc67 | 2832 | &log.writer->file->fnode); |
11fdf7f2 | 2833 | ceph_assert(r == 0); |
20effc67 TL |
2834 | vselector->add_usage(log.writer->file->vselector_hint, log.writer->file->fnode); |
2835 | log.t.op_file_update_inc(log.writer->file->fnode); | |
7c673cae | 2836 | } |
20effc67 TL |
2837 | return runway; |
2838 | } | |
2839 | ||
2840 | void BlueFS::_flush_and_sync_log_core(int64_t runway) | |
2841 | { | |
2842 | ceph_assert(ceph_mutex_is_locked(log.lock)); | |
2843 | dout(10) << __func__ << " " << log.t << dendl; | |
7c673cae FG |
2844 | |
2845 | bufferlist bl; | |
11fdf7f2 | 2846 | bl.reserve(super.block_size); |
20effc67 | 2847 | encode(log.t, bl); |
7c673cae | 2848 | // pad to block boundary |
11fdf7f2 TL |
2849 | size_t realign = super.block_size - (bl.length() % super.block_size); |
2850 | if (realign && realign != super.block_size) | |
2851 | bl.append_zero(realign); | |
2852 | ||
7c673cae FG |
2853 | logger->inc(l_bluefs_logged_bytes, bl.length()); |
2854 | ||
20effc67 | 2855 | if (true) { |
f6b5b4d7 | 2856 | ceph_assert(bl.length() <= runway); // if we write this, we will have an unrecoverable data loss |
20effc67 | 2857 | // transaction will not fit extents before growth -> data loss on _replay |
f6b5b4d7 TL |
2858 | } |
2859 | ||
20effc67 | 2860 | log.writer->append(bl); |
7c673cae | 2861 | |
20effc67 TL |
2862 | // prepare log for new transactions |
2863 | log.t.clear(); | |
2864 | log.t.seq = log.seq_live; | |
7c673cae | 2865 | |
20effc67 TL |
2866 | uint64_t new_data = _flush_special(log.writer); |
2867 | vselector->add_usage(log.writer->file->vselector_hint, new_data); | |
2868 | } | |
7c673cae | 2869 | |
20effc67 TL |
2870 | // Clears dirty.files up to (including) seq_stable. |
2871 | void BlueFS::_clear_dirty_set_stable_D(uint64_t seq) | |
2872 | { | |
2873 | std::lock_guard dl(dirty.lock); | |
7c673cae FG |
2874 | |
2875 | // clean dirty files | |
20effc67 TL |
2876 | if (seq > dirty.seq_stable) { |
2877 | dirty.seq_stable = seq; | |
2878 | dout(20) << __func__ << " seq_stable " << dirty.seq_stable << dendl; | |
2879 | ||
2880 | // undirty all files that were already streamed to log | |
2881 | auto p = dirty.files.begin(); | |
2882 | while (p != dirty.files.end()) { | |
2883 | if (p->first > dirty.seq_stable) { | |
7c673cae FG |
2884 | dout(20) << __func__ << " done cleaning up dirty files" << dendl; |
2885 | break; | |
2886 | } | |
2887 | ||
2888 | auto l = p->second.begin(); | |
2889 | while (l != p->second.end()) { | |
2890 | File *file = &*l; | |
20effc67 TL |
2891 | ceph_assert(file->dirty_seq <= dirty.seq_stable); |
2892 | dout(20) << __func__ << " cleaned file " << file->fnode.ino << dendl; | |
2893 | file->dirty_seq = dirty.seq_stable; | |
7c673cae FG |
2894 | p->second.erase(l++); |
2895 | } | |
2896 | ||
11fdf7f2 | 2897 | ceph_assert(p->second.empty()); |
20effc67 | 2898 | dirty.files.erase(p++); |
7c673cae FG |
2899 | } |
2900 | } else { | |
20effc67 | 2901 | dout(20) << __func__ << " seq_stable " << dirty.seq_stable |
7c673cae FG |
2902 | << " already >= out seq " << seq |
2903 | << ", we lost a race against another log flush, done" << dendl; | |
2904 | } | |
20effc67 | 2905 | } |
a8e16298 | 2906 | |
20effc67 TL |
2907 | void BlueFS::_release_pending_allocations(vector<interval_set<uint64_t>>& to_release) |
2908 | { | |
a8e16298 TL |
2909 | for (unsigned i = 0; i < to_release.size(); ++i) { |
2910 | if (!to_release[i].empty()) { | |
2911 | /* OK, now we have the guarantee alloc[i] won't be null. */ | |
11fdf7f2 TL |
2912 | int r = 0; |
2913 | if (cct->_conf->bdev_enable_discard && cct->_conf->bdev_async_discard) { | |
2914 | r = bdev[i]->queue_discard(to_release[i]); | |
2915 | if (r == 0) | |
2916 | continue; | |
2917 | } else if (cct->_conf->bdev_enable_discard) { | |
2918 | for (auto p = to_release[i].begin(); p != to_release[i].end(); ++p) { | |
2919 | bdev[i]->discard(p.get_start(), p.get_len()); | |
2920 | } | |
2921 | } | |
a8e16298 | 2922 | alloc[i]->release(to_release[i]); |
f67539c2 TL |
2923 | if (is_shared_alloc(i)) { |
2924 | shared_alloc->bluefs_used -= to_release[i].size(); | |
2925 | } | |
a8e16298 TL |
2926 | } |
2927 | } | |
20effc67 TL |
2928 | } |
2929 | ||
// Writes the pending log transaction (log.t) together with the dirty-file
// updates of the current seq to the log, flushes the log device, then marks
// that seq stable and releases pending allocations.
//   want_seq - when non-zero and already <= dirty.seq_stable the function
//              returns immediately; 0 means flush unconditionally.
// Takes log.lock, then dirty.lock; both are released before the dirty set is
// cleaned. Always returns 0.
2930 | int BlueFS::_flush_and_sync_log_LD(uint64_t want_seq) | |
2931 | { | |
2932 | int64_t available_runway; | |
// Loop until the log has runway: each iteration starts by taking log.lock
// and dirty.lock; the loop is left with both locks still held.
2933 | do { | |
2934 | log.lock.lock(); | |
2935 | dirty.lock.lock(); | |
2936 | if (want_seq && want_seq <= dirty.seq_stable) { | |
2937 | dout(10) << __func__ << " want_seq " << want_seq << " <= seq_stable " | |
2938 | << dirty.seq_stable << ", done" << dendl; | |
2939 | dirty.lock.unlock(); | |
2940 | log.lock.unlock(); | |
2941 | return 0; | |
2942 | } | |
2943 | | |
2944 | available_runway = _maybe_extend_log(); | |
2945 | if (available_runway == -EWOULDBLOCK) { | |
2946 | // we are in need of adding runway, but we are during log-switch from compaction | |
2947 | dirty.lock.unlock(); | |
2948 | //instead log.lock.unlock() do move ownership | |
// ll adopts the already-held log.lock so that log_cond can wait on it.
// Its destructor unlocks log.lock at the end of this branch; the next loop
// iteration locks it again.
2949 | std::unique_lock<ceph::mutex> ll(log.lock, std::adopt_lock); | |
2950 | while (log_forbidden_to_expand.load()) { | |
2951 | log_cond.wait(ll); | |
2952 | } | |
2953 | } else { | |
2954 | ceph_assert(available_runway >= 0); | |
2955 | } | |
2956 | } while (available_runway < 0); | |
2957 | | |
2958 | ceph_assert(want_seq == 0 || want_seq <= dirty.seq_live); // illegal to request seq that was not created yet | |
2959 | uint64_t seq =_log_advance_seq(); | |
2960 | _consume_dirty(seq); | |
// Take over the pending releases under dirty.lock; dirty.pending_release is
// left with the same number of (empty) sets.
2961 | vector<interval_set<uint64_t>> to_release(dirty.pending_release.size()); | |
2962 | to_release.swap(dirty.pending_release); | |
2963 | dirty.lock.unlock(); | |
2964 | | |
2965 | _flush_and_sync_log_core(available_runway); | |
2966 | _flush_bdev(log.writer); | |
2967 | logger->set(l_bluefs_log_bytes, log.writer->file->fnode.size); | |
2968 | //now log.lock is no longer needed | |
2969 | log.lock.unlock(); | |
2970 | | |
// Both calls run after the log has been flushed to the device above.
2971 | _clear_dirty_set_stable_D(seq); | |
2972 | _release_pending_allocations(to_release); | |
a8e16298 | 2973 |
7c673cae | 2974 | _update_logger_stats();
20effc67 TL |
2975 | return 0; | |
2976 | } | |
2977 | ||
2978 | // Flushes log and immediately adjusts log_writer pos. | |
2979 | int BlueFS::_flush_and_sync_log_jump_D(uint64_t jump_to, | |
2980 | int64_t available_runway) | |
2981 | { | |
2982 | ceph_assert(ceph_mutex_is_locked(log.lock)); | |
2983 | ||
2984 | ceph_assert(jump_to); | |
2985 | // we synchronize writing to log, by lock to log.lock | |
2986 | ||
2987 | dirty.lock.lock(); | |
2988 | uint64_t seq =_log_advance_seq(); | |
2989 | _consume_dirty(seq); | |
2990 | vector<interval_set<uint64_t>> to_release(dirty.pending_release.size()); | |
2991 | to_release.swap(dirty.pending_release); | |
2992 | dirty.lock.unlock(); | |
2993 | _flush_and_sync_log_core(available_runway); | |
7c673cae | 2994 | |
20effc67 TL |
2995 | dout(10) << __func__ << " jumping log offset from 0x" << std::hex |
2996 | << log.writer->pos << " -> 0x" << jump_to << std::dec << dendl; | |
2997 | log.writer->pos = jump_to; | |
2998 | vselector->sub_usage(log.writer->file->vselector_hint, log.writer->file->fnode.size); | |
2999 | log.writer->file->fnode.size = jump_to; | |
3000 | vselector->add_usage(log.writer->file->vselector_hint, log.writer->file->fnode.size); | |
3001 | ||
3002 | _flush_bdev(log.writer); | |
3003 | ||
3004 | _clear_dirty_set_stable_D(seq); | |
3005 | _release_pending_allocations(to_release); | |
3006 | ||
3007 | logger->set(l_bluefs_log_bytes, log.writer->file->fnode.size); | |
3008 | _update_logger_stats(); | |
7c673cae FG |
3009 | return 0; |
3010 | } | |
3011 | ||
f67539c2 TL |
// Moves pending data out of this writer into a bufferlist that is returned
// to the caller for writing.
//   cct     - context used by the dout logging in this function.
//   partial - when true, the cached tail_block is moved to the front of the
//             result before any data from `buffer`.
//   length  - total number of bytes to hand out, including whatever came
//             from tail_block.
//   super   - supplies block_size / block_mask() for the tail computation.
// If the result does not end on a block boundary it is zero-padded up to the
// next block, and a copy of the unaligned tail bytes is kept in tail_block;
// otherwise tail_block is cleared.
// Requires this->lock to be held unless the file's ino is <= 1 (asserted).
3012 | ceph::bufferlist BlueFS::FileWriter::flush_buffer( | |
3013 | CephContext* const cct, | |
3014 | const bool partial, | |
3015 | const unsigned length, | |
3016 | const bluefs_super_t& super) | |
3017 | { | |
20effc67 | 3018 | ceph_assert(ceph_mutex_is_locked(this->lock) || file->fnode.ino <= 1);
f67539c2 TL |
3019 | ceph::bufferlist bl; | |
3020 | if (partial) { | |
3021 | tail_block.splice(0, tail_block.length(), &bl); | |
3022 | } | |
// NOTE(review): unsigned subtraction - assumes length >= bl.length() here;
// confirm against callers.
3023 | const auto remaining_len = length - bl.length(); | |
3024 | buffer.splice(0, remaining_len, &bl); | |
3025 | if (buffer.length()) { | |
3026 | dout(20) << " leaving 0x" << std::hex << buffer.length() << std::dec | |
3027 | << " unflushed" << dendl; | |
3028 | } | |
// tail = number of bytes past the last block boundary (presumably
// block_mask() is ~(block_size - 1) - TODO confirm).
3029 | if (const unsigned tail = bl.length() & ~super.block_mask(); tail) { | |
3030 | const auto padding_len = super.block_size - tail; | |
3031 | dout(20) << __func__ << " caching tail of 0x" | |
3032 | << std::hex << tail | |
3033 | << " and padding block with 0x" << padding_len | |
3034 | << " buffer.length() " << buffer.length() | |
3035 | << std::dec << dendl; | |
3036 | // We need to go through the `buffer_appender` to get a chance to | |
3037 | // preserve in-memory contiguity and not mess with the alignment. | |
3038 | // Otherwise a costly rebuild could happen in e.g. `KernelDevice`. | |
// The zeros are appended at the end of `buffer` and then moved from there
// to the end of bl.
3039 | buffer_appender.append_zero(padding_len); | |
3040 | buffer.splice(buffer.length() - padding_len, padding_len, &bl); | |
3041 | // Deep copy the tail here. This allows to avoid costlier copy on | |
3042 | // bufferlist rebuild in e.g. `KernelDevice` and minimizes number | |
3043 | // of memory allocations. | |
3044 | // The alternative approach would be to place the entire tail and | |
3045 | // padding on a dedicated, 4 KB long memory chunk. This shouldn't | |
3046 | // trigger the rebuild while still being less expensive. | |
// Same route for the tail: appended to `buffer` through the appender, then
// moved from the end of `buffer` into tail_block.
3047 | buffer_appender.substr_of(bl, bl.length() - padding_len - tail, tail); | |
3048 | buffer.splice(buffer.length() - tail, tail, &tail_block); | |
3049 | } else { | |
3050 | tail_block.clear(); | |
3051 | } | |
3052 | return bl; | |
3053 | } | |
3054 | ||
20effc67 | 3055 | int BlueFS::_signal_dirty_to_log_D(FileWriter *h) |
522d829b | 3056 | { |
20effc67 TL |
3057 | ceph_assert(ceph_mutex_is_locked(h->lock)); |
3058 | std::lock_guard dl(dirty.lock); | |
522d829b TL |
3059 | h->file->fnode.mtime = ceph_clock_now(); |
3060 | ceph_assert(h->file->fnode.ino >= 1); | |
20effc67 TL |
3061 | if (h->file->dirty_seq <= dirty.seq_stable) { |
3062 | h->file->dirty_seq = dirty.seq_live; | |
3063 | dirty.files[h->file->dirty_seq].push_back(*h->file); | |
3064 | dout(20) << __func__ << " dirty_seq = " << dirty.seq_live | |
522d829b TL |
3065 | << " (was clean)" << dendl; |
3066 | } else { | |
20effc67 | 3067 | if (h->file->dirty_seq != dirty.seq_live) { |
522d829b | 3068 | // need re-dirty, erase from list first |
20effc67 TL |
3069 | ceph_assert(dirty.files.count(h->file->dirty_seq)); |
3070 | auto it = dirty.files[h->file->dirty_seq].iterator_to(*h->file); | |
3071 | dirty.files[h->file->dirty_seq].erase(it); | |
3072 | h->file->dirty_seq = dirty.seq_live; | |
3073 | dirty.files[h->file->dirty_seq].push_back(*h->file); | |
3074 | dout(20) << __func__ << " dirty_seq = " << dirty.seq_live | |
522d829b TL |
3075 | << " (was " << h->file->dirty_seq << ")" << dendl; |
3076 | } else { | |
20effc67 | 3077 | dout(20) << __func__ << " dirty_seq = " << dirty.seq_live |
522d829b TL |
3078 | << " (unchanged, do nothing) " << dendl; |
3079 | } | |
3080 | } | |
3081 | return 0; | |
3082 | } | |
3083 | ||
20effc67 | 3084 | void BlueFS::flush_range(FileWriter *h, uint64_t offset, uint64_t length)/*_WF*/ |
7c673cae | 3085 | { |
20effc67 TL |
3086 | _maybe_check_vselector_LNF(); |
3087 | std::unique_lock hl(h->lock); | |
3088 | _flush_range_F(h, offset, length); | |
3089 | } | |
3090 | ||
3091 | int BlueFS::_flush_range_F(FileWriter *h, uint64_t offset, uint64_t length) | |
3092 | { | |
3093 | ceph_assert(ceph_mutex_is_locked(h->lock)); | |
3094 | ceph_assert(h->file->num_readers.load() == 0); | |
3095 | ceph_assert(h->file->fnode.ino > 1); | |
3096 | ||
7c673cae FG |
3097 | dout(10) << __func__ << " " << h << " pos 0x" << std::hex << h->pos |
3098 | << " 0x" << offset << "~" << length << std::dec | |
3099 | << " to " << h->file->fnode << dendl; | |
f67539c2 TL |
3100 | if (h->file->deleted) { |
3101 | dout(10) << __func__ << " deleted, no-op" << dendl; | |
3102 | return 0; | |
3103 | } | |
7c673cae | 3104 | |
20effc67 | 3105 | bool buffered = cct->_conf->bluefs_buffered_io; |
7c673cae FG |
3106 | |
3107 | if (offset + length <= h->pos) | |
3108 | return 0; | |
3109 | if (offset < h->pos) { | |
3110 | length -= h->pos - offset; | |
3111 | offset = h->pos; | |
3112 | dout(10) << " still need 0x" | |
3113 | << std::hex << offset << "~" << length << std::dec | |
3114 | << dendl; | |
3115 | } | |
20effc67 | 3116 | std::lock_guard file_lock(h->file->lock); |
11fdf7f2 | 3117 | ceph_assert(offset <= h->file->fnode.size); |
7c673cae FG |
3118 | |
3119 | uint64_t allocated = h->file->fnode.get_allocated(); | |
9f95a23c | 3120 | vselector->sub_usage(h->file->vselector_hint, h->file->fnode); |
7c673cae FG |
3121 | // do not bother to dirty the file if we are overwriting |
3122 | // previously allocated extents. | |
7c673cae FG |
3123 | if (allocated < offset + length) { |
3124 | // we should never run out of log space here; see the min runway check | |
3125 | // in _flush_and_sync_log. | |
9f95a23c | 3126 | int r = _allocate(vselector->select_prefer_bdev(h->file->vselector_hint), |
7c673cae | 3127 | offset + length - allocated, |
94b18763 | 3128 | &h->file->fnode); |
7c673cae FG |
3129 | if (r < 0) { |
3130 | derr << __func__ << " allocated: 0x" << std::hex << allocated | |
3131 | << " offset: 0x" << offset << " length: 0x" << length << std::dec | |
3132 | << dendl; | |
9f95a23c | 3133 | vselector->add_usage(h->file->vselector_hint, h->file->fnode); // undo |
11fdf7f2 | 3134 | ceph_abort_msg("bluefs enospc"); |
7c673cae FG |
3135 | return r; |
3136 | } | |
522d829b | 3137 | h->file->is_dirty = true; |
7c673cae FG |
3138 | } |
3139 | if (h->file->fnode.size < offset + length) { | |
3140 | h->file->fnode.size = offset + length; | |
20effc67 | 3141 | h->file->is_dirty = true; |
7c673cae | 3142 | } |
20effc67 | 3143 | |
522d829b | 3144 | dout(20) << __func__ << " file now, unflushed " << h->file->fnode << dendl; |
20effc67 TL |
3145 | int res = _flush_data(h, offset, length, buffered); |
3146 | vselector->add_usage(h->file->vselector_hint, h->file->fnode); | |
3147 | return res; | |
3148 | } | |
7c673cae | 3149 | |
20effc67 TL |
3150 | int BlueFS::_flush_data(FileWriter *h, uint64_t offset, uint64_t length, bool buffered) |
3151 | { | |
3152 | if (h->file->fnode.ino > 1) { | |
3153 | ceph_assert(ceph_mutex_is_locked(h->lock)); | |
3154 | ceph_assert(ceph_mutex_is_locked(h->file->lock)); | |
3155 | } | |
7c673cae FG |
3156 | uint64_t x_off = 0; |
3157 | auto p = h->file->fnode.seek(offset, &x_off); | |
11fdf7f2 | 3158 | ceph_assert(p != h->file->fnode.extents.end()); |
7c673cae FG |
3159 | dout(20) << __func__ << " in " << *p << " x_off 0x" |
3160 | << std::hex << x_off << std::dec << dendl; | |
3161 | ||
3162 | unsigned partial = x_off & ~super.block_mask(); | |
7c673cae FG |
3163 | if (partial) { |
3164 | dout(20) << __func__ << " using partial tail 0x" | |
3165 | << std::hex << partial << std::dec << dendl; | |
7c673cae FG |
3166 | x_off -= partial; |
3167 | offset -= partial; | |
3168 | length += partial; | |
3169 | dout(20) << __func__ << " waiting for previous aio to complete" << dendl; | |
3170 | for (auto p : h->iocv) { | |
3171 | if (p) { | |
3172 | p->aio_wait(); | |
3173 | } | |
3174 | } | |
3175 | } | |
7c673cae | 3176 | |
f67539c2 TL |
3177 | auto bl = h->flush_buffer(cct, partial, length, super); |
3178 | ceph_assert(bl.length() >= length); | |
9f95a23c | 3179 | h->pos = offset + length; |
f67539c2 | 3180 | length = bl.length(); |
9f95a23c | 3181 | |
7c673cae FG |
3182 | switch (h->writer_type) { |
3183 | case WRITER_WAL: | |
3184 | logger->inc(l_bluefs_bytes_written_wal, length); | |
3185 | break; | |
3186 | case WRITER_SST: | |
3187 | logger->inc(l_bluefs_bytes_written_sst, length); | |
3188 | break; | |
3189 | } | |
3190 | ||
3191 | dout(30) << "dump:\n"; | |
3192 | bl.hexdump(*_dout); | |
3193 | *_dout << dendl; | |
3194 | ||
7c673cae | 3195 | uint64_t bloff = 0; |
11fdf7f2 | 3196 | uint64_t bytes_written_slow = 0; |
7c673cae | 3197 | while (length > 0) { |
11fdf7f2 | 3198 | uint64_t x_len = std::min(p->length - x_off, length); |
7c673cae FG |
3199 | bufferlist t; |
3200 | t.substr_of(bl, bloff, x_len); | |
7c673cae | 3201 | if (cct->_conf->bluefs_sync_write) { |
11fdf7f2 | 3202 | bdev[p->bdev]->write(p->offset + x_off, t, buffered, h->write_hint); |
7c673cae | 3203 | } else { |
11fdf7f2 TL |
3204 | bdev[p->bdev]->aio_write(p->offset + x_off, t, h->iocv[p->bdev], buffered, h->write_hint); |
3205 | } | |
3206 | h->dirty_devs[p->bdev] = true; | |
3207 | if (p->bdev == BDEV_SLOW) { | |
3208 | bytes_written_slow += t.length(); | |
7c673cae | 3209 | } |
11fdf7f2 | 3210 | |
7c673cae FG |
3211 | bloff += x_len; |
3212 | length -= x_len; | |
3213 | ++p; | |
3214 | x_off = 0; | |
3215 | } | |
f67539c2 TL |
3216 | if (bytes_written_slow) { |
3217 | logger->inc(l_bluefs_bytes_written_slow, bytes_written_slow); | |
3218 | } | |
7c673cae FG |
3219 | for (unsigned i = 0; i < MAX_BDEV; ++i) { |
3220 | if (bdev[i]) { | |
11fdf7f2 | 3221 | if (h->iocv[i] && h->iocv[i]->has_pending_aios()) { |
7c673cae FG |
3222 | bdev[i]->aio_submit(h->iocv[i]); |
3223 | } | |
3224 | } | |
3225 | } | |
3226 | dout(20) << __func__ << " h " << h << " pos now 0x" | |
3227 | << std::hex << h->pos << std::dec << dendl; | |
3228 | return 0; | |
3229 | } | |
3230 | ||
11fdf7f2 | 3231 | #ifdef HAVE_LIBAIO |
7c673cae FG |
3232 | // we need to retire old completed aios so they don't stick around in |
3233 | // memory indefinitely (along with their bufferlist refs). | |
3234 | void BlueFS::_claim_completed_aios(FileWriter *h, list<aio_t> *ls) | |
3235 | { | |
3236 | for (auto p : h->iocv) { | |
3237 | if (p) { | |
3238 | ls->splice(ls->end(), p->running_aios); | |
3239 | } | |
3240 | } | |
3241 | dout(10) << __func__ << " got " << ls->size() << " aios" << dendl; | |
3242 | } | |
3243 | ||
20effc67 | 3244 | void BlueFS::_wait_for_aio(FileWriter *h) |
7c673cae FG |
3245 | { |
3246 | // NOTE: this is safe to call without a lock, as long as our reference is | |
3247 | // stable. | |
f67539c2 TL |
3248 | utime_t start; |
3249 | lgeneric_subdout(cct, bluefs, 10) << __func__; | |
3250 | start = ceph_clock_now(); | |
3251 | *_dout << " " << h << dendl; | |
7c673cae FG |
3252 | for (auto p : h->iocv) { |
3253 | if (p) { | |
3254 | p->aio_wait(); | |
3255 | } | |
3256 | } | |
11fdf7f2 | 3257 | dout(10) << __func__ << " " << h << " done in " << (ceph_clock_now() - start) << dendl; |
7c673cae | 3258 | } |
11fdf7f2 | 3259 | #endif |
7c673cae | 3260 | |
20effc67 TL |
3261 | void BlueFS::append_try_flush(FileWriter *h, const char* buf, size_t len)/*_WF_LNF_NF_LD_D*/ |
3262 | { | |
3263 | bool flushed_sum = false; | |
3264 | { | |
3265 | std::unique_lock hl(h->lock); | |
3266 | size_t max_size = 1ull << 30; // cap to 1GB | |
3267 | while (len > 0) { | |
3268 | bool need_flush = true; | |
3269 | auto l0 = h->get_buffer_length(); | |
3270 | if (l0 < max_size) { | |
3271 | size_t l = std::min(len, max_size - l0); | |
3272 | h->append(buf, l); | |
3273 | buf += l; | |
3274 | len -= l; | |
3275 | need_flush = h->get_buffer_length() >= cct->_conf->bluefs_min_flush_size; | |
3276 | } | |
3277 | if (need_flush) { | |
3278 | bool flushed = false; | |
3279 | int r = _flush_F(h, true, &flushed); | |
3280 | ceph_assert(r == 0); | |
3281 | flushed_sum |= flushed; | |
3282 | // make sure we've made any progress with flush hence the | |
3283 | // loop doesn't iterate forever | |
3284 | ceph_assert(h->get_buffer_length() < max_size); | |
3285 | } | |
3286 | } | |
3287 | } | |
3288 | if (flushed_sum) { | |
3289 | _maybe_compact_log_LNF_NF_LD_D(); | |
3290 | } | |
3291 | } | |
3292 | ||
3293 | void BlueFS::flush(FileWriter *h, bool force)/*_WF_LNF_NF_LD_D*/ | |
f6b5b4d7 TL |
3294 | { |
3295 | bool flushed = false; | |
20effc67 TL |
3296 | int r; |
3297 | { | |
3298 | std::unique_lock hl(h->lock); | |
3299 | r = _flush_F(h, force, &flushed); | |
3300 | ceph_assert(r == 0); | |
3301 | } | |
f6b5b4d7 | 3302 | if (r == 0 && flushed) { |
20effc67 | 3303 | _maybe_compact_log_LNF_NF_LD_D(); |
f6b5b4d7 | 3304 | } |
f6b5b4d7 TL |
3305 | } |
3306 | ||
20effc67 | 3307 | int BlueFS::_flush_F(FileWriter *h, bool force, bool *flushed) |
7c673cae | 3308 | { |
20effc67 | 3309 | ceph_assert(ceph_mutex_is_locked(h->lock)); |
f67539c2 | 3310 | uint64_t length = h->get_buffer_length(); |
7c673cae | 3311 | uint64_t offset = h->pos; |
f6b5b4d7 TL |
3312 | if (flushed) { |
3313 | *flushed = false; | |
3314 | } | |
7c673cae FG |
3315 | if (!force && |
3316 | length < cct->_conf->bluefs_min_flush_size) { | |
3317 | dout(10) << __func__ << " " << h << " ignoring, length " << length | |
3318 | << " < min_flush_size " << cct->_conf->bluefs_min_flush_size | |
3319 | << dendl; | |
3320 | return 0; | |
3321 | } | |
3322 | if (length == 0) { | |
3323 | dout(10) << __func__ << " " << h << " no dirty data on " | |
3324 | << h->file->fnode << dendl; | |
3325 | return 0; | |
3326 | } | |
3327 | dout(10) << __func__ << " " << h << " 0x" | |
3328 | << std::hex << offset << "~" << length << std::dec | |
3329 | << " to " << h->file->fnode << dendl; | |
11fdf7f2 | 3330 | ceph_assert(h->pos <= h->file->fnode.size); |
20effc67 | 3331 | int r = _flush_range_F(h, offset, length); |
f6b5b4d7 TL |
3332 | if (flushed) { |
3333 | *flushed = true; | |
3334 | } | |
3335 | return r; | |
7c673cae FG |
3336 | } |
3337 | ||
20effc67 TL |
3338 | // Flush for bluefs special files. |
3339 | // Does not add extents to h. | |
3340 | // Does not mark h as dirty. | |
3341 | // we do not need to dirty the log file (or it's compacting | |
3342 | // replacement) when the file size changes because replay is | |
3343 | // smart enough to discover it on its own. | |
3344 | uint64_t BlueFS::_flush_special(FileWriter *h) | |
3345 | { | |
3346 | ceph_assert(h->file->fnode.ino <= 1); | |
3347 | uint64_t length = h->get_buffer_length(); | |
3348 | uint64_t offset = h->pos; | |
3349 | uint64_t new_data = 0; | |
3350 | ceph_assert(length + offset <= h->file->fnode.get_allocated()); | |
3351 | if (h->file->fnode.size < offset + length) { | |
3352 | new_data = offset + length - h->file->fnode.size; | |
3353 | h->file->fnode.size = offset + length; | |
3354 | } | |
3355 | _flush_data(h, offset, length, false); | |
3356 | return new_data; | |
3357 | } | |
3358 | ||
3359 | int BlueFS::truncate(FileWriter *h, uint64_t offset)/*_WF_L*/ | |
7c673cae | 3360 | { |
20effc67 | 3361 | std::lock_guard hl(h->lock); |
7c673cae FG |
3362 | dout(10) << __func__ << " 0x" << std::hex << offset << std::dec |
3363 | << " file " << h->file->fnode << dendl; | |
3364 | if (h->file->deleted) { | |
3365 | dout(10) << __func__ << " deleted, no-op" << dendl; | |
3366 | return 0; | |
3367 | } | |
3368 | ||
3369 | // we never truncate internal log files | |
11fdf7f2 | 3370 | ceph_assert(h->file->fnode.ino > 1); |
7c673cae | 3371 | |
7c673cae FG |
3372 | // truncate off unflushed data? |
3373 | if (h->pos < offset && | |
f67539c2 | 3374 | h->pos + h->get_buffer_length() > offset) { |
7c673cae FG |
3375 | dout(20) << __func__ << " tossing out last " << offset - h->pos |
3376 | << " unflushed bytes" << dendl; | |
11fdf7f2 | 3377 | ceph_abort_msg("actually this shouldn't happen"); |
7c673cae | 3378 | } |
f67539c2 | 3379 | if (h->get_buffer_length()) { |
20effc67 | 3380 | int r = _flush_F(h, true); |
7c673cae FG |
3381 | if (r < 0) |
3382 | return r; | |
3383 | } | |
3384 | if (offset == h->file->fnode.size) { | |
3385 | return 0; // no-op! | |
3386 | } | |
3387 | if (offset > h->file->fnode.size) { | |
11fdf7f2 | 3388 | ceph_abort_msg("truncate up not supported"); |
7c673cae | 3389 | } |
11fdf7f2 | 3390 | ceph_assert(h->file->fnode.size >= offset); |
20effc67 TL |
3391 | _flush_bdev(h); |
3392 | ||
3393 | std::lock_guard ll(log.lock); | |
9f95a23c | 3394 | vselector->sub_usage(h->file->vselector_hint, h->file->fnode.size); |
7c673cae | 3395 | h->file->fnode.size = offset; |
9f95a23c | 3396 | vselector->add_usage(h->file->vselector_hint, h->file->fnode.size); |
20effc67 | 3397 | log.t.op_file_update_inc(h->file->fnode); |
7c673cae FG |
3398 | return 0; |
3399 | } | |
3400 | ||
20effc67 | 3401 | int BlueFS::fsync(FileWriter *h)/*_WF_WD_WLD_WLNF_WNF*/ |
7c673cae | 3402 | { |
20effc67 TL |
3403 | _maybe_check_vselector_LNF(); |
3404 | std::unique_lock hl(h->lock); | |
3405 | uint64_t old_dirty_seq = 0; | |
3406 | { | |
3407 | dout(10) << __func__ << " " << h << " " << h->file->fnode << dendl; | |
3408 | int r = _flush_F(h, true); | |
3409 | if (r < 0) | |
3410 | return r; | |
3411 | _flush_bdev(h); | |
3412 | if (h->file->is_dirty) { | |
3413 | _signal_dirty_to_log_D(h); | |
3414 | h->file->is_dirty = false; | |
3415 | } | |
3416 | { | |
3417 | std::lock_guard dl(dirty.lock); | |
3418 | if (dirty.seq_stable < h->file->dirty_seq) { | |
3419 | old_dirty_seq = h->file->dirty_seq; | |
3420 | dout(20) << __func__ << " file metadata was dirty (" << old_dirty_seq | |
3421 | << ") on " << h->file->fnode << ", flushing log" << dendl; | |
3422 | } | |
3423 | } | |
522d829b | 3424 | } |
7c673cae | 3425 | if (old_dirty_seq) { |
20effc67 | 3426 | _flush_and_sync_log_LD(old_dirty_seq); |
7c673cae | 3427 | } |
20effc67 TL |
3428 | _maybe_compact_log_LNF_NF_LD_D(); |
3429 | ||
7c673cae FG |
3430 | return 0; |
3431 | } | |
3432 | ||
20effc67 TL |
3433 | // be careful - either h->file->lock or log.lock must be taken |
3434 | void BlueFS::_flush_bdev(FileWriter *h) | |
7c673cae | 3435 | { |
20effc67 TL |
3436 | if (h->file->fnode.ino > 1) { |
3437 | ceph_assert(ceph_mutex_is_locked(h->lock)); | |
3438 | } else if (h->file->fnode.ino == 1) { | |
3439 | ceph_assert(ceph_mutex_is_locked(log.lock)); | |
3440 | } | |
11fdf7f2 TL |
3441 | std::array<bool, MAX_BDEV> flush_devs = h->dirty_devs; |
3442 | h->dirty_devs.fill(false); | |
3443 | #ifdef HAVE_LIBAIO | |
7c673cae FG |
3444 | if (!cct->_conf->bluefs_sync_write) { |
3445 | list<aio_t> completed_ios; | |
3446 | _claim_completed_aios(h, &completed_ios); | |
20effc67 | 3447 | _wait_for_aio(h); |
7c673cae | 3448 | completed_ios.clear(); |
7c673cae | 3449 | } |
20effc67 TL |
3450 | #endif |
3451 | _flush_bdev(flush_devs); | |
7c673cae FG |
3452 | } |
3453 | ||
20effc67 | 3454 | void BlueFS::_flush_bdev(std::array<bool, MAX_BDEV>& dirty_bdevs) |
11fdf7f2 TL |
3455 | { |
3456 | // NOTE: this is safe to call without a lock. | |
3457 | dout(20) << __func__ << dendl; | |
3458 | for (unsigned i = 0; i < MAX_BDEV; i++) { | |
3459 | if (dirty_bdevs[i]) | |
3460 | bdev[i]->flush(); | |
3461 | } | |
3462 | } | |
3463 | ||
20effc67 | 3464 | void BlueFS::_flush_bdev() |
7c673cae FG |
3465 | { |
3466 | // NOTE: this is safe to call without a lock. | |
3467 | dout(20) << __func__ << dendl; | |
f67539c2 TL |
3468 | for (unsigned i = 0; i < MAX_BDEV; i++) { |
3469 | // alloc space from BDEV_SLOW is unexpected. | |
3470 | // So most cases we don't alloc from BDEV_SLOW and so avoiding flush not-used device. | |
3471 | if (bdev[i] && (i != BDEV_SLOW || _get_used(i))) { | |
3472 | bdev[i]->flush(); | |
3473 | } | |
7c673cae FG |
3474 | } |
3475 | } | |
3476 | ||
eafe8130 TL |
3477 | const char* BlueFS::get_device_name(unsigned id) |
3478 | { | |
3479 | if (id >= MAX_BDEV) return "BDEV_INV"; | |
3480 | const char* names[] = {"BDEV_WAL", "BDEV_DB", "BDEV_SLOW", "BDEV_NEWWAL", "BDEV_NEWDB"}; | |
3481 | return names[id]; | |
3482 | } | |
3483 | ||
11fdf7f2 TL |
3484 | int BlueFS::_allocate_without_fallback(uint8_t id, uint64_t len, |
3485 | PExtentVector* extents) | |
3486 | { | |
3487 | dout(10) << __func__ << " len 0x" << std::hex << len << std::dec | |
3488 | << " from " << (int)id << dendl; | |
3489 | assert(id < alloc.size()); | |
11fdf7f2 TL |
3490 | if (!alloc[id]) { |
3491 | return -ENOENT; | |
3492 | } | |
3493 | extents->reserve(4); // 4 should be (more than) enough for most allocations | |
f67539c2 TL |
3494 | int64_t need = round_up_to(len, alloc_size[id]); |
3495 | int64_t alloc_len = alloc[id]->allocate(need, alloc_size[id], 0, extents); | |
3496 | if (alloc_len < 0 || alloc_len < need) { | |
eafe8130 | 3497 | if (alloc_len > 0) { |
11fdf7f2 TL |
3498 | alloc[id]->release(*extents); |
3499 | } | |
f67539c2 TL |
3500 | derr << __func__ << " unable to allocate 0x" << std::hex << need |
3501 | << " on bdev " << (int)id | |
3502 | << ", allocator name " << alloc[id]->get_name() | |
3503 | << ", allocator type " << alloc[id]->get_type() | |
3504 | << ", capacity 0x" << alloc[id]->get_capacity() | |
3505 | << ", block size 0x" << alloc[id]->get_block_size() | |
20effc67 | 3506 | << ", alloc size 0x" << alloc_size[id] |
f67539c2 TL |
3507 | << ", free 0x" << alloc[id]->get_free() |
3508 | << ", fragmentation " << alloc[id]->get_fragmentation() | |
3509 | << ", allocated 0x" << (alloc_len > 0 ? alloc_len : 0) | |
3510 | << std::dec << dendl; | |
3511 | alloc[id]->dump(); | |
11fdf7f2 TL |
3512 | return -ENOSPC; |
3513 | } | |
f67539c2 TL |
3514 | if (is_shared_alloc(id)) { |
3515 | shared_alloc->bluefs_used += alloc_len; | |
3516 | } | |
11fdf7f2 TL |
3517 | |
3518 | return 0; | |
3519 | } | |
3520 | ||
7c673cae | 3521 | int BlueFS::_allocate(uint8_t id, uint64_t len, |
94b18763 | 3522 | bluefs_fnode_t* node) |
7c673cae FG |
3523 | { |
3524 | dout(10) << __func__ << " len 0x" << std::hex << len << std::dec | |
3525 | << " from " << (int)id << dendl; | |
11fdf7f2 | 3526 | ceph_assert(id < alloc.size()); |
b32b8144 | 3527 | int64_t alloc_len = 0; |
a8e16298 | 3528 | PExtentVector extents; |
11fdf7f2 | 3529 | uint64_t hint = 0; |
f67539c2 | 3530 | int64_t need = len; |
7c673cae | 3531 | if (alloc[id]) { |
f67539c2 | 3532 | need = round_up_to(len, alloc_size[id]); |
94b18763 FG |
3533 | if (!node->extents.empty() && node->extents.back().bdev == id) { |
3534 | hint = node->extents.back().end(); | |
11fdf7f2 | 3535 | } |
b32b8144 | 3536 | extents.reserve(4); // 4 should be (more than) enough for most allocations |
f67539c2 | 3537 | alloc_len = alloc[id]->allocate(need, alloc_size[id], hint, &extents); |
b32b8144 | 3538 | } |
f67539c2 TL |
3539 | if (alloc_len < 0 || alloc_len < need) { |
3540 | if (alloc[id]) { | |
3541 | if (alloc_len > 0) { | |
3542 | alloc[id]->release(extents); | |
3543 | } | |
3544 | dout(1) << __func__ << " unable to allocate 0x" << std::hex << need | |
3545 | << " on bdev " << (int)id | |
3546 | << ", allocator name " << alloc[id]->get_name() | |
3547 | << ", allocator type " << alloc[id]->get_type() | |
3548 | << ", capacity 0x" << alloc[id]->get_capacity() | |
3549 | << ", block size 0x" << alloc[id]->get_block_size() | |
20effc67 | 3550 | << ", alloc size 0x" << alloc_size[id] |
f67539c2 TL |
3551 | << ", free 0x" << alloc[id]->get_free() |
3552 | << ", fragmentation " << alloc[id]->get_fragmentation() | |
3553 | << ", allocated 0x" << (alloc_len > 0 ? alloc_len : 0) | |
3554 | << std::dec << dendl; | |
20effc67 TL |
3555 | } else { |
3556 | dout(20) << __func__ << " alloc-id not set on index="<< (int)id << " unable to allocate 0x" << std::hex << need | |
3557 | << " on bdev " << (int)id << std::dec << dendl; | |
b32b8144 | 3558 | } |
7c673cae | 3559 | if (id != BDEV_SLOW) { |
f67539c2 | 3560 | dout(20) << __func__ << " fallback to bdev " |
20effc67 | 3561 | << (int)id + 1 |
f67539c2 | 3562 | << dendl; |
94b18763 | 3563 | return _allocate(id + 1, len, node); |
11fdf7f2 | 3564 | } else { |
f67539c2 TL |
3565 | derr << __func__ << " allocation failed, needed 0x" << std::hex << need |
3566 | << dendl; | |
11fdf7f2 | 3567 | } |
f67539c2 | 3568 | return -ENOSPC; |
11fdf7f2 | 3569 | } else { |
f67539c2 TL |
3570 | uint64_t used = _get_used(id); |
3571 | if (max_bytes[id] < used) { | |
3572 | logger->set(max_bytes_pcounters[id], used); | |
3573 | max_bytes[id] = used; | |
3574 | } | |
3575 | if (is_shared_alloc(id)) { | |
3576 | shared_alloc->bluefs_used += alloc_len; | |
11fdf7f2 | 3577 | } |
7c673cae FG |
3578 | } |
3579 | ||
3580 | for (auto& p : extents) { | |
94b18763 | 3581 | node->append_extent(bluefs_extent_t(id, p.offset, p.length)); |
7c673cae FG |
3582 | } |
3583 | ||
3584 | return 0; | |
3585 | } | |
3586 | ||
20effc67 | 3587 | int BlueFS::preallocate(FileRef f, uint64_t off, uint64_t len)/*_LF*/ |
7c673cae | 3588 | { |
20effc67 TL |
3589 | std::lock_guard ll(log.lock); |
3590 | std::lock_guard fl(f->lock); | |
7c673cae FG |
3591 | dout(10) << __func__ << " file " << f->fnode << " 0x" |
3592 | << std::hex << off << "~" << len << std::dec << dendl; | |
3593 | if (f->deleted) { | |
3594 | dout(10) << __func__ << " deleted, no-op" << dendl; | |
3595 | return 0; | |
3596 | } | |
11fdf7f2 | 3597 | ceph_assert(f->fnode.ino > 1); |
7c673cae FG |
3598 | uint64_t allocated = f->fnode.get_allocated(); |
3599 | if (off + len > allocated) { | |
3600 | uint64_t want = off + len - allocated; | |
9f95a23c | 3601 | |
20effc67 | 3602 | vselector->sub_usage(f->vselector_hint, f->fnode); |
9f95a23c TL |
3603 | int r = _allocate(vselector->select_prefer_bdev(f->vselector_hint), |
3604 | want, | |
3605 | &f->fnode); | |
3606 | vselector->add_usage(f->vselector_hint, f->fnode); | |
7c673cae FG |
3607 | if (r < 0) |
3608 | return r; | |
20effc67 TL |
3609 | |
3610 | log.t.op_file_update_inc(f->fnode); | |
7c673cae FG |
3611 | } |
3612 | return 0; | |
3613 | } | |
3614 | ||
20effc67 | 3615 | void BlueFS::sync_metadata(bool avoid_compact)/*_LNF_NF_LD_D*/ |
7c673cae | 3616 | { |
20effc67 TL |
3617 | bool can_skip_flush; |
3618 | { | |
3619 | std::lock_guard ll(log.lock); | |
3620 | std::lock_guard dl(dirty.lock); | |
3621 | can_skip_flush = log.t.empty() && dirty.files.empty(); | |
3622 | } | |
3623 | if (can_skip_flush) { | |
7c673cae | 3624 | dout(10) << __func__ << " - no pending log events" << dendl; |
11fdf7f2 | 3625 | } else { |
f67539c2 TL |
3626 | utime_t start; |
3627 | lgeneric_subdout(cct, bluefs, 10) << __func__; | |
3628 | start = ceph_clock_now(); | |
3629 | *_dout << dendl; | |
20effc67 TL |
3630 | _flush_bdev(); // FIXME? |
3631 | _flush_and_sync_log_LD(); | |
11fdf7f2 | 3632 | dout(10) << __func__ << " done in " << (ceph_clock_now() - start) << dendl; |
7c673cae | 3633 | } |
7c673cae | 3634 | |
f6b5b4d7 | 3635 | if (!avoid_compact) { |
20effc67 | 3636 | _maybe_compact_log_LNF_NF_LD_D(); |
f6b5b4d7 TL |
3637 | } |
3638 | } | |
3639 | ||
20effc67 | 3640 | void BlueFS::_maybe_compact_log_LNF_NF_LD_D() |
f6b5b4d7 TL |
3641 | { |
3642 | if (!cct->_conf->bluefs_replay_recovery_disable_compact && | |
20effc67 | 3643 | _should_start_compact_log_L_N()) { |
7c673cae | 3644 | if (cct->_conf->bluefs_compact_log_sync) { |
20effc67 | 3645 | _compact_log_sync_LNF_LD(); |
7c673cae | 3646 | } else { |
20effc67 | 3647 | _compact_log_async_LD_LNF_D(); |
7c673cae FG |
3648 | } |
3649 | } | |
7c673cae FG |
3650 | } |
3651 | ||
3652 | int BlueFS::open_for_write( | |
b3b6e05e TL |
3653 | std::string_view dirname, |
3654 | std::string_view filename, | |
7c673cae | 3655 | FileWriter **h, |
20effc67 | 3656 | bool overwrite)/*_N_LD*/ |
7c673cae | 3657 | { |
20effc67 TL |
3658 | _maybe_check_vselector_LNF(); |
3659 | FileRef file; | |
3660 | bool create = false; | |
3661 | bool truncate = false; | |
3662 | mempool::bluefs::vector<bluefs_extent_t> pending_release_extents; | |
3663 | { | |
3664 | std::unique_lock nl(nodes.lock); | |
7c673cae | 3665 | dout(10) << __func__ << " " << dirname << "/" << filename << dendl; |
20effc67 | 3666 | map<string,DirRef>::iterator p = nodes.dir_map.find(dirname); |
7c673cae | 3667 | DirRef dir; |
20effc67 | 3668 | if (p == nodes.dir_map.end()) { |
7c673cae FG |
3669 | // implicitly create the dir |
3670 | dout(20) << __func__ << " dir " << dirname | |
3671 | << " does not exist" << dendl; | |
3672 | return -ENOENT; | |
3673 | } else { | |
3674 | dir = p->second; | |
3675 | } | |
3676 | ||
7c673cae FG |
3677 | map<string,FileRef>::iterator q = dir->file_map.find(filename); |
3678 | if (q == dir->file_map.end()) { | |
3679 | if (overwrite) { | |
3680 | dout(20) << __func__ << " dir " << dirname << " (" << dir | |
3681 | << ") file " << filename | |
3682 | << " does not exist" << dendl; | |
3683 | return -ENOENT; | |
3684 | } | |
9f95a23c | 3685 | file = ceph::make_ref<File>(); |
7c673cae | 3686 | file->fnode.ino = ++ino_last; |
20effc67 | 3687 | nodes.file_map[ino_last] = file; |
b3b6e05e | 3688 | dir->file_map[string{filename}] = file; |
7c673cae FG |
3689 | ++file->refs; |
3690 | create = true; | |
20effc67 | 3691 | logger->set(l_bluefs_num_files, nodes.file_map.size()); |
7c673cae FG |
3692 | } else { |
3693 | // overwrite existing file? | |
3694 | file = q->second; | |
3695 | if (overwrite) { | |
3696 | dout(20) << __func__ << " dir " << dirname << " (" << dir | |
3697 | << ") file " << filename | |
3698 | << " already exists, overwrite in place" << dendl; | |
3699 | } else { | |
3700 | dout(20) << __func__ << " dir " << dirname << " (" << dir | |
3701 | << ") file " << filename | |
3702 | << " already exists, truncate + overwrite" << dendl; | |
9f95a23c | 3703 | vselector->sub_usage(file->vselector_hint, file->fnode); |
7c673cae | 3704 | file->fnode.size = 0; |
20effc67 | 3705 | pending_release_extents.swap(file->fnode.extents); |
f6b5b4d7 | 3706 | truncate = true; |
94b18763 FG |
3707 | |
3708 | file->fnode.clear_extents(); | |
7c673cae FG |
3709 | } |
3710 | } | |
11fdf7f2 | 3711 | ceph_assert(file->fnode.ino > 1); |
7c673cae FG |
3712 | |
3713 | file->fnode.mtime = ceph_clock_now(); | |
9f95a23c | 3714 | file->vselector_hint = vselector->get_hint_by_dir(dirname); |
f6b5b4d7 TL |
3715 | if (create || truncate) { |
3716 | vselector->add_usage(file->vselector_hint, file->fnode); // update file count | |
3717 | } | |
9f95a23c | 3718 | |
7c673cae | 3719 | dout(20) << __func__ << " mapping " << dirname << "/" << filename |
9f95a23c TL |
3720 | << " vsel_hint " << file->vselector_hint |
3721 | << dendl; | |
20effc67 TL |
3722 | } |
3723 | { | |
3724 | std::lock_guard ll(log.lock); | |
3725 | log.t.op_file_update(file->fnode); | |
3726 | if (create) | |
3727 | log.t.op_dir_link(dirname, filename, file->fnode.ino); | |
3728 | ||
3729 | std::lock_guard dl(dirty.lock); | |
3730 | for (auto& p : pending_release_extents) { | |
3731 | dirty.pending_release[p.bdev].insert(p.offset, p.length); | |
3732 | } | |
3733 | } | |
7c673cae FG |
3734 | *h = _create_writer(file); |
3735 | ||
3736 | if (boost::algorithm::ends_with(filename, ".log")) { | |
3737 | (*h)->writer_type = BlueFS::WRITER_WAL; | |
3738 | if (logger && !overwrite) { | |
3739 | logger->inc(l_bluefs_files_written_wal); | |
3740 | } | |
3741 | } else if (boost::algorithm::ends_with(filename, ".sst")) { | |
3742 | (*h)->writer_type = BlueFS::WRITER_SST; | |
3743 | if (logger) { | |
3744 | logger->inc(l_bluefs_files_written_sst); | |
3745 | } | |
3746 | } | |
3747 | ||
3748 | dout(10) << __func__ << " h " << *h << " on " << file->fnode << dendl; | |
3749 | return 0; | |
3750 | } | |
3751 | ||
3752 | BlueFS::FileWriter *BlueFS::_create_writer(FileRef f) | |
3753 | { | |
3754 | FileWriter *w = new FileWriter(f); | |
3755 | for (unsigned i = 0; i < MAX_BDEV; ++i) { | |
3756 | if (bdev[i]) { | |
3757 | w->iocv[i] = new IOContext(cct, NULL); | |
7c673cae FG |
3758 | } |
3759 | } | |
3760 | return w; | |
3761 | } | |
3762 | ||
20effc67 | 3763 | void BlueFS::_drain_writer(FileWriter *h) |
7c673cae FG |
3764 | { |
3765 | dout(10) << __func__ << " " << h << " type " << h->writer_type << dendl; | |
f67539c2 | 3766 | //h->buffer.reassign_to_mempool(mempool::mempool_bluefs_file_writer); |
7c673cae FG |
3767 | for (unsigned i=0; i<MAX_BDEV; ++i) { |
3768 | if (bdev[i]) { | |
11fdf7f2 TL |
3769 | if (h->iocv[i]) { |
3770 | h->iocv[i]->aio_wait(); | |
20effc67 | 3771 | delete h->iocv[i]; |
11fdf7f2 | 3772 | } |
7c673cae FG |
3773 | } |
3774 | } | |
522d829b TL |
3775 | // sanity |
3776 | if (h->file->fnode.size >= (1ull << 30)) { | |
3777 | dout(10) << __func__ << " file is unexpectedly large:" << h->file->fnode << dendl; | |
3778 | } | |
20effc67 TL |
3779 | } |
3780 | ||
3781 | void BlueFS::_close_writer(FileWriter *h) | |
3782 | { | |
3783 | _drain_writer(h); | |
3784 | delete h; | |
3785 | } | |
3786 | void BlueFS::close_writer(FileWriter *h) | |
3787 | { | |
3788 | { | |
3789 | std::lock_guard l(h->lock); | |
3790 | _drain_writer(h); | |
3791 | } | |
7c673cae FG |
3792 | delete h; |
3793 | } | |
3794 | ||
522d829b TL |
3795 | uint64_t BlueFS::debug_get_dirty_seq(FileWriter *h) |
3796 | { | |
20effc67 | 3797 | std::lock_guard l(h->lock); |
522d829b TL |
3798 | return h->file->dirty_seq; |
3799 | } | |
3800 | ||
3801 | bool BlueFS::debug_get_is_dev_dirty(FileWriter *h, uint8_t dev) | |
3802 | { | |
20effc67 | 3803 | std::lock_guard l(h->lock); |
522d829b TL |
3804 | return h->dirty_devs[dev]; |
3805 | } | |
3806 | ||
7c673cae | 3807 | int BlueFS::open_for_read( |
b3b6e05e TL |
3808 | std::string_view dirname, |
3809 | std::string_view filename, | |
7c673cae | 3810 | FileReader **h, |
20effc67 | 3811 | bool random)/*_N*/ |
7c673cae | 3812 | { |
20effc67 TL |
3813 | _maybe_check_vselector_LNF(); |
3814 | std::lock_guard nl(nodes.lock); | |
7c673cae FG |
3815 | dout(10) << __func__ << " " << dirname << "/" << filename |
3816 | << (random ? " (random)":" (sequential)") << dendl; | |
20effc67 TL |
3817 | map<string,DirRef>::iterator p = nodes.dir_map.find(dirname); |
3818 | if (p == nodes.dir_map.end()) { | |
7c673cae FG |
3819 | dout(20) << __func__ << " dir " << dirname << " not found" << dendl; |
3820 | return -ENOENT; | |
3821 | } | |
3822 | DirRef dir = p->second; | |
3823 | ||
3824 | map<string,FileRef>::iterator q = dir->file_map.find(filename); | |
3825 | if (q == dir->file_map.end()) { | |
3826 | dout(20) << __func__ << " dir " << dirname << " (" << dir | |
3827 | << ") file " << filename | |
3828 | << " not found" << dendl; | |
3829 | return -ENOENT; | |
3830 | } | |
3831 | File *file = q->second.get(); | |
3832 | ||
3833 | *h = new FileReader(file, random ? 4096 : cct->_conf->bluefs_max_prefetch, | |
3834 | random, false); | |
3835 | dout(10) << __func__ << " h " << *h << " on " << file->fnode << dendl; | |
3836 | return 0; | |
3837 | } | |
3838 | ||
3839 | int BlueFS::rename( | |
b3b6e05e | 3840 | std::string_view old_dirname, std::string_view old_filename, |
20effc67 | 3841 | std::string_view new_dirname, std::string_view new_filename)/*_LND*/ |
7c673cae | 3842 | { |
20effc67 TL |
3843 | std::lock_guard ll(log.lock); |
3844 | std::lock_guard nl(nodes.lock); | |
7c673cae FG |
3845 | dout(10) << __func__ << " " << old_dirname << "/" << old_filename |
3846 | << " -> " << new_dirname << "/" << new_filename << dendl; | |
20effc67 TL |
3847 | map<string,DirRef>::iterator p = nodes.dir_map.find(old_dirname); |
3848 | if (p == nodes.dir_map.end()) { | |
7c673cae FG |
3849 | dout(20) << __func__ << " dir " << old_dirname << " not found" << dendl; |
3850 | return -ENOENT; | |
3851 | } | |
3852 | DirRef old_dir = p->second; | |
3853 | map<string,FileRef>::iterator q = old_dir->file_map.find(old_filename); | |
3854 | if (q == old_dir->file_map.end()) { | |
3855 | dout(20) << __func__ << " dir " << old_dirname << " (" << old_dir | |
3856 | << ") file " << old_filename | |
3857 | << " not found" << dendl; | |
3858 | return -ENOENT; | |
3859 | } | |
3860 | FileRef file = q->second; | |
3861 | ||
20effc67 TL |
3862 | p = nodes.dir_map.find(new_dirname); |
3863 | if (p == nodes.dir_map.end()) { | |
7c673cae FG |
3864 | dout(20) << __func__ << " dir " << new_dirname << " not found" << dendl; |
3865 | return -ENOENT; | |
3866 | } | |
3867 | DirRef new_dir = p->second; | |
3868 | q = new_dir->file_map.find(new_filename); | |
3869 | if (q != new_dir->file_map.end()) { | |
3870 | dout(20) << __func__ << " dir " << new_dirname << " (" << old_dir | |
3871 | << ") file " << new_filename | |
3872 | << " already exists, unlinking" << dendl; | |
11fdf7f2 | 3873 | ceph_assert(q->second != file); |
20effc67 TL |
3874 | log.t.op_dir_unlink(new_dirname, new_filename); |
3875 | _drop_link_D(q->second); | |
7c673cae FG |
3876 | } |
3877 | ||
3878 | dout(10) << __func__ << " " << new_dirname << "/" << new_filename << " " | |
3879 | << " " << file->fnode << dendl; | |
3880 | ||
b3b6e05e TL |
3881 | new_dir->file_map[string{new_filename}] = file; |
3882 | old_dir->file_map.erase(string{old_filename}); | |
7c673cae | 3883 | |
20effc67 TL |
3884 | log.t.op_dir_link(new_dirname, new_filename, file->fnode.ino); |
3885 | log.t.op_dir_unlink(old_dirname, old_filename); | |
7c673cae FG |
3886 | return 0; |
3887 | } | |
3888 | ||
20effc67 | 3889 | int BlueFS::mkdir(std::string_view dirname)/*_LN*/ |
7c673cae | 3890 | { |
20effc67 TL |
3891 | std::lock_guard ll(log.lock); |
3892 | std::lock_guard nl(nodes.lock); | |
7c673cae | 3893 | dout(10) << __func__ << " " << dirname << dendl; |
20effc67 TL |
3894 | map<string,DirRef>::iterator p = nodes.dir_map.find(dirname); |
3895 | if (p != nodes.dir_map.end()) { | |
7c673cae FG |
3896 | dout(20) << __func__ << " dir " << dirname << " exists" << dendl; |
3897 | return -EEXIST; | |
3898 | } | |
20effc67 TL |
3899 | nodes.dir_map[string{dirname}] = ceph::make_ref<Dir>(); |
3900 | log.t.op_dir_create(dirname); | |
7c673cae FG |
3901 | return 0; |
3902 | } | |
3903 | ||
20effc67 | 3904 | int BlueFS::rmdir(std::string_view dirname)/*_LN*/ |
7c673cae | 3905 | { |
20effc67 TL |
3906 | std::lock_guard ll(log.lock); |
3907 | std::lock_guard nl(nodes.lock); | |
7c673cae | 3908 | dout(10) << __func__ << " " << dirname << dendl; |
20effc67 TL |
3909 | auto p = nodes.dir_map.find(dirname); |
3910 | if (p == nodes.dir_map.end()) { | |
7c673cae FG |
3911 | dout(20) << __func__ << " dir " << dirname << " does not exist" << dendl; |
3912 | return -ENOENT; | |
3913 | } | |
3914 | DirRef dir = p->second; | |
3915 | if (!dir->file_map.empty()) { | |
3916 | dout(20) << __func__ << " dir " << dirname << " not empty" << dendl; | |
3917 | return -ENOTEMPTY; | |
3918 | } | |
20effc67 TL |
3919 | nodes.dir_map.erase(string{dirname}); |
3920 | log.t.op_dir_remove(dirname); | |
7c673cae FG |
3921 | return 0; |
3922 | } | |
3923 | ||
20effc67 | 3924 | bool BlueFS::dir_exists(std::string_view dirname)/*_N*/ |
7c673cae | 3925 | { |
20effc67 TL |
3926 | std::lock_guard nl(nodes.lock); |
3927 | map<string,DirRef>::iterator p = nodes.dir_map.find(dirname); | |
3928 | bool exists = p != nodes.dir_map.end(); | |
7c673cae FG |
3929 | dout(10) << __func__ << " " << dirname << " = " << (int)exists << dendl; |
3930 | return exists; | |
3931 | } | |
3932 | ||
b3b6e05e | 3933 | int BlueFS::stat(std::string_view dirname, std::string_view filename, |
20effc67 | 3934 | uint64_t *size, utime_t *mtime)/*_N*/ |
7c673cae | 3935 | { |
20effc67 | 3936 | std::lock_guard nl(nodes.lock); |
7c673cae | 3937 | dout(10) << __func__ << " " << dirname << "/" << filename << dendl; |
20effc67 TL |
3938 | map<string,DirRef>::iterator p = nodes.dir_map.find(dirname); |
3939 | if (p == nodes.dir_map.end()) { | |
7c673cae FG |
3940 | dout(20) << __func__ << " dir " << dirname << " not found" << dendl; |
3941 | return -ENOENT; | |
3942 | } | |
3943 | DirRef dir = p->second; | |
3944 | map<string,FileRef>::iterator q = dir->file_map.find(filename); | |
3945 | if (q == dir->file_map.end()) { | |
3946 | dout(20) << __func__ << " dir " << dirname << " (" << dir | |
3947 | << ") file " << filename | |
3948 | << " not found" << dendl; | |
3949 | return -ENOENT; | |
3950 | } | |
3951 | File *file = q->second.get(); | |
3952 | dout(10) << __func__ << " " << dirname << "/" << filename | |
3953 | << " " << file->fnode << dendl; | |
3954 | if (size) | |
3955 | *size = file->fnode.size; | |
3956 | if (mtime) | |
3957 | *mtime = file->fnode.mtime; | |
3958 | return 0; | |
3959 | } | |
3960 | ||
b3b6e05e | 3961 | int BlueFS::lock_file(std::string_view dirname, std::string_view filename, |
20effc67 | 3962 | FileLock **plock)/*_LN*/ |
7c673cae | 3963 | { |
20effc67 TL |
3964 | std::lock_guard ll(log.lock); |
3965 | std::lock_guard nl(nodes.lock); | |
7c673cae | 3966 | dout(10) << __func__ << " " << dirname << "/" << filename << dendl; |
20effc67 TL |
3967 | map<string,DirRef>::iterator p = nodes.dir_map.find(dirname); |
3968 | if (p == nodes.dir_map.end()) { | |
7c673cae FG |
3969 | dout(20) << __func__ << " dir " << dirname << " not found" << dendl; |
3970 | return -ENOENT; | |
3971 | } | |
3972 | DirRef dir = p->second; | |
b3b6e05e | 3973 | auto q = dir->file_map.find(filename); |
9f95a23c | 3974 | FileRef file; |
7c673cae FG |
3975 | if (q == dir->file_map.end()) { |
3976 | dout(20) << __func__ << " dir " << dirname << " (" << dir | |
3977 | << ") file " << filename | |
3978 | << " not found, creating" << dendl; | |
9f95a23c | 3979 | file = ceph::make_ref<File>(); |
7c673cae FG |
3980 | file->fnode.ino = ++ino_last; |
3981 | file->fnode.mtime = ceph_clock_now(); | |
20effc67 | 3982 | nodes.file_map[ino_last] = file; |
b3b6e05e | 3983 | dir->file_map[string{filename}] = file; |
20effc67 | 3984 | logger->set(l_bluefs_num_files, nodes.file_map.size()); |
7c673cae | 3985 | ++file->refs; |
20effc67 TL |
3986 | log.t.op_file_update(file->fnode); |
3987 | log.t.op_dir_link(dirname, filename, file->fnode.ino); | |
7c673cae | 3988 | } else { |
9f95a23c | 3989 | file = q->second; |
7c673cae FG |
3990 | if (file->locked) { |
3991 | dout(10) << __func__ << " already locked" << dendl; | |
11fdf7f2 | 3992 | return -ENOLCK; |
7c673cae FG |
3993 | } |
3994 | } | |
3995 | file->locked = true; | |
3996 | *plock = new FileLock(file); | |
3997 | dout(10) << __func__ << " locked " << file->fnode | |
3998 | << " with " << *plock << dendl; | |
3999 | return 0; | |
4000 | } | |
4001 | ||
20effc67 | 4002 | int BlueFS::unlock_file(FileLock *fl)/*_N*/ |
7c673cae | 4003 | { |
20effc67 | 4004 | std::lock_guard nl(nodes.lock); |
7c673cae | 4005 | dout(10) << __func__ << " " << fl << " on " << fl->file->fnode << dendl; |
11fdf7f2 | 4006 | ceph_assert(fl->file->locked); |
7c673cae FG |
4007 | fl->file->locked = false; |
4008 | delete fl; | |
4009 | return 0; | |
4010 | } | |
4011 | ||
20effc67 | 4012 | int BlueFS::readdir(std::string_view dirname, vector<string> *ls)/*_N*/ |
7c673cae | 4013 | { |
b3b6e05e TL |
4014 | // dirname may contain a trailing / |
4015 | if (!dirname.empty() && dirname.back() == '/') { | |
4016 | dirname.remove_suffix(1); | |
4017 | } | |
20effc67 | 4018 | std::lock_guard nl(nodes.lock); |
7c673cae FG |
4019 | dout(10) << __func__ << " " << dirname << dendl; |
4020 | if (dirname.empty()) { | |
4021 | // list dirs | |
20effc67 TL |
4022 | ls->reserve(nodes.dir_map.size() + 2); |
4023 | for (auto& q : nodes.dir_map) { | |
7c673cae FG |
4024 | ls->push_back(q.first); |
4025 | } | |
4026 | } else { | |
4027 | // list files in dir | |
20effc67 TL |
4028 | map<string,DirRef>::iterator p = nodes.dir_map.find(dirname); |
4029 | if (p == nodes.dir_map.end()) { | |
7c673cae FG |
4030 | dout(20) << __func__ << " dir " << dirname << " not found" << dendl; |
4031 | return -ENOENT; | |
4032 | } | |
4033 | DirRef dir = p->second; | |
4034 | ls->reserve(dir->file_map.size() + 2); | |
4035 | for (auto& q : dir->file_map) { | |
4036 | ls->push_back(q.first); | |
4037 | } | |
4038 | } | |
4039 | ls->push_back("."); | |
4040 | ls->push_back(".."); | |
4041 | return 0; | |
4042 | } | |
4043 | ||
20effc67 | 4044 | int BlueFS::unlink(std::string_view dirname, std::string_view filename)/*_LND*/ |
7c673cae | 4045 | { |
20effc67 TL |
4046 | std::lock_guard ll(log.lock); |
4047 | std::lock_guard nl(nodes.lock); | |
7c673cae | 4048 | dout(10) << __func__ << " " << dirname << "/" << filename << dendl; |
20effc67 TL |
4049 | map<string,DirRef>::iterator p = nodes.dir_map.find(dirname); |
4050 | if (p == nodes.dir_map.end()) { | |
7c673cae FG |
4051 | dout(20) << __func__ << " dir " << dirname << " not found" << dendl; |
4052 | return -ENOENT; | |
4053 | } | |
4054 | DirRef dir = p->second; | |
4055 | map<string,FileRef>::iterator q = dir->file_map.find(filename); | |
4056 | if (q == dir->file_map.end()) { | |
4057 | dout(20) << __func__ << " file " << dirname << "/" << filename | |
4058 | << " not found" << dendl; | |
4059 | return -ENOENT; | |
4060 | } | |
4061 | FileRef file = q->second; | |
4062 | if (file->locked) { | |
4063 | dout(20) << __func__ << " file " << dirname << "/" << filename | |
4064 | << " is locked" << dendl; | |
4065 | return -EBUSY; | |
4066 | } | |
b3b6e05e | 4067 | dir->file_map.erase(string{filename}); |
20effc67 TL |
4068 | log.t.op_dir_unlink(dirname, filename); |
4069 | _drop_link_D(file); | |
7c673cae FG |
4070 | return 0; |
4071 | } | |
d2e6a577 FG |
4072 | |
4073 | bool BlueFS::wal_is_rotational() | |
4074 | { | |
94b18763 FG |
4075 | if (bdev[BDEV_WAL]) { |
4076 | return bdev[BDEV_WAL]->is_rotational(); | |
4077 | } else if (bdev[BDEV_DB]) { | |
4078 | return bdev[BDEV_DB]->is_rotational(); | |
4079 | } | |
4080 | return bdev[BDEV_SLOW]->is_rotational(); | |
d2e6a577 | 4081 | } |
9f95a23c | 4082 | |
f6b5b4d7 TL |
/*
  Algorithm.
  _do_replay_recovery_read is used when the bluefs log ends abruptly, but it seems that more data should be there.
  The idea is to search the disk for the definition of the extents that would have been appended to the bluefs log,
  and to check whether using them produces a healthy bluefs transaction.
  We encode the already known bluefs log extents and search the disk for these bytes.
  When we find them, we decode the bytes that follow as an extent.
  We read that whole extent and then check whether, merged with the existing log part, it gives a proper bluefs transaction.
*/
20effc67 | 4092 | int BlueFS::_do_replay_recovery_read(FileReader *log_reader, |
f6b5b4d7 TL |
4093 | size_t replay_pos, |
4094 | size_t read_offset, | |
4095 | size_t read_len, | |
4096 | bufferlist* bl) { | |
4097 | dout(1) << __func__ << " replay_pos=0x" << std::hex << replay_pos << | |
4098 | " needs 0x" << read_offset << "~" << read_len << std::dec << dendl; | |
4099 | ||
4100 | bluefs_fnode_t& log_fnode = log_reader->file->fnode; | |
4101 | bufferlist bin_extents; | |
f67539c2 | 4102 | ::encode(log_fnode.extents, bin_extents); |
f6b5b4d7 TL |
4103 | dout(2) << __func__ << " log file encoded extents length = " << bin_extents.length() << dendl; |
4104 | ||
4105 | // cannot process if too small to effectively search | |
4106 | ceph_assert(bin_extents.length() >= 32); | |
4107 | bufferlist last_32; | |
4108 | last_32.substr_of(bin_extents, bin_extents.length() - 32, 32); | |
4109 | ||
4110 | //read fixed part from replay_pos to end of bluefs_log extents | |
4111 | bufferlist fixed; | |
4112 | uint64_t e_off = 0; | |
4113 | auto e = log_fnode.seek(replay_pos, &e_off); | |
4114 | ceph_assert(e != log_fnode.extents.end()); | |
20effc67 TL |
4115 | int r = _bdev_read(e->bdev, e->offset + e_off, e->length - e_off, &fixed, ioc[e->bdev], |
4116 | cct->_conf->bluefs_buffered_io); | |
f6b5b4d7 TL |
4117 | ceph_assert(r == 0); |
4118 | //capture dev of last good extent | |
4119 | uint8_t last_e_dev = e->bdev; | |
4120 | uint64_t last_e_off = e->offset; | |
4121 | ++e; | |
4122 | while (e != log_fnode.extents.end()) { | |
20effc67 TL |
4123 | r = _bdev_read(e->bdev, e->offset, e->length, &fixed, ioc[e->bdev], |
4124 | cct->_conf->bluefs_buffered_io); | |
f6b5b4d7 TL |
4125 | ceph_assert(r == 0); |
4126 | last_e_dev = e->bdev; | |
4127 | ++e; | |
4128 | } | |
4129 | ceph_assert(replay_pos + fixed.length() == read_offset); | |
4130 | ||
4131 | dout(2) << __func__ << " valid data in log = " << fixed.length() << dendl; | |
4132 | ||
4133 | struct compare { | |
4134 | bool operator()(const bluefs_extent_t& a, const bluefs_extent_t& b) const { | |
4135 | if (a.bdev < b.bdev) return true; | |
4136 | if (a.offset < b.offset) return true; | |
4137 | return a.length < b.length; | |
4138 | } | |
4139 | }; | |
4140 | std::set<bluefs_extent_t, compare> extents_rejected; | |
4141 | for (int dcnt = 0; dcnt < 3; dcnt++) { | |
4142 | uint8_t dev = (last_e_dev + dcnt) % MAX_BDEV; | |
4143 | if (bdev[dev] == nullptr) continue; | |
4144 | dout(2) << __func__ << " processing " << get_device_name(dev) << dendl; | |
4145 | interval_set<uint64_t> disk_regions; | |
4146 | disk_regions.insert(0, bdev[dev]->get_size()); | |
20effc67 | 4147 | for (auto f : nodes.file_map) { |
f6b5b4d7 TL |
4148 | auto& e = f.second->fnode.extents; |
4149 | for (auto& p : e) { | |
4150 | if (p.bdev == dev) { | |
4151 | disk_regions.erase(p.offset, p.length); | |
4152 | } | |
4153 | } | |
4154 | } | |
4155 | size_t disk_regions_count = disk_regions.num_intervals(); | |
4156 | dout(5) << __func__ << " " << disk_regions_count << " regions to scan on " << get_device_name(dev) << dendl; | |
4157 | ||
4158 | auto reg = disk_regions.lower_bound(last_e_off); | |
4159 | //for all except first, start from beginning | |
4160 | last_e_off = 0; | |
4161 | if (reg == disk_regions.end()) { | |
4162 | reg = disk_regions.begin(); | |
4163 | } | |
4164 | const uint64_t chunk_size = 4 * 1024 * 1024; | |
4165 | const uint64_t page_size = 4096; | |
4166 | const uint64_t max_extent_size = 16; | |
4167 | uint64_t overlay_size = last_32.length() + max_extent_size; | |
4168 | for (size_t i = 0; i < disk_regions_count; reg++, i++) { | |
4169 | if (reg == disk_regions.end()) { | |
4170 | reg = disk_regions.begin(); | |
4171 | } | |
4172 | uint64_t pos = reg.get_start(); | |
4173 | uint64_t len = reg.get_len(); | |
4174 | ||
4175 | std::unique_ptr<char[]> raw_data_p{new char[page_size + chunk_size]}; | |
4176 | char* raw_data = raw_data_p.get(); | |
4177 | memset(raw_data, 0, page_size); | |
4178 | ||
4179 | while (len > last_32.length()) { | |
4180 | uint64_t chunk_len = len > chunk_size ? chunk_size : len; | |
4181 | dout(5) << __func__ << " read " | |
20effc67 TL |
4182 | << get_device_name(dev) << ":0x" << std::hex << pos << "+" << chunk_len |
4183 | << std::dec << dendl; | |
4184 | r = _bdev_read_random(dev, pos, chunk_len, | |
4185 | raw_data + page_size, cct->_conf->bluefs_buffered_io); | |
f6b5b4d7 TL |
4186 | ceph_assert(r == 0); |
4187 | ||
4188 | //search for fixed_last_32 | |
4189 | char* chunk_b = raw_data + page_size; | |
4190 | char* chunk_e = chunk_b + chunk_len; | |
4191 | ||
4192 | char* search_b = chunk_b - overlay_size; | |
4193 | char* search_e = chunk_e; | |
4194 | ||
4195 | for (char* sp = search_b; ; sp += last_32.length()) { | |
4196 | sp = (char*)memmem(sp, search_e - sp, last_32.c_str(), last_32.length()); | |
4197 | if (sp == nullptr) { | |
4198 | break; | |
4199 | } | |
4200 | ||
4201 | char* n = sp + last_32.length(); | |
4202 | dout(5) << __func__ << " checking location 0x" << std::hex << pos + (n - chunk_b) << std::dec << dendl; | |
4203 | bufferlist test; | |
4204 | test.append(n, std::min<size_t>(max_extent_size, chunk_e - n)); | |
4205 | bluefs_extent_t ne; | |
4206 | try { | |
4207 | bufferlist::const_iterator p = test.begin(); | |
f67539c2 | 4208 | ::decode(ne, p); |
f6b5b4d7 TL |
4209 | } catch (buffer::error& e) { |
4210 | continue; | |
4211 | } | |
4212 | if (extents_rejected.count(ne) != 0) { | |
4213 | dout(5) << __func__ << " extent " << ne << " already refected" <<dendl; | |
4214 | continue; | |
4215 | } | |
4216 | //insert as rejected already. if we succeed, it wouldn't make difference. | |
4217 | extents_rejected.insert(ne); | |
4218 | ||
4219 | if (ne.bdev >= MAX_BDEV || | |
4220 | bdev[ne.bdev] == nullptr || | |
4221 | ne.length > 16 * 1024 * 1024 || | |
4222 | (ne.length & 4095) != 0 || | |
4223 | ne.offset + ne.length > bdev[ne.bdev]->get_size() || | |
4224 | (ne.offset & 4095) != 0) { | |
4225 | dout(5) << __func__ << " refusing extent " << ne << dendl; | |
4226 | continue; | |
4227 | } | |
4228 | dout(5) << __func__ << " checking extent " << ne << dendl; | |
4229 | ||
4230 | //read candidate extent - whole | |
4231 | bufferlist candidate; | |
4232 | candidate.append(fixed); | |
20effc67 TL |
4233 | r = _bdev_read(ne.bdev, ne.offset, ne.length, &candidate, ioc[ne.bdev], |
4234 | cct->_conf->bluefs_buffered_io); | |
f6b5b4d7 TL |
4235 | ceph_assert(r == 0); |
4236 | ||
4237 | //check if transaction & crc is ok | |
4238 | bluefs_transaction_t t; | |
4239 | try { | |
f67539c2 TL |
4240 | bufferlist::const_iterator p = candidate.begin(); |
4241 | ::decode(t, p); | |
f6b5b4d7 TL |
4242 | } |
4243 | catch (buffer::error& e) { | |
4244 | dout(5) << __func__ << " failed match" << dendl; | |
4245 | continue; | |
4246 | } | |
4247 | ||
4248 | //success, it seems a probable candidate | |
4249 | uint64_t l = std::min<uint64_t>(ne.length, read_len); | |
4250 | //trim to required size | |
4251 | bufferlist requested_read; | |
4252 | requested_read.substr_of(candidate, fixed.length(), l); | |
4253 | bl->append(requested_read); | |
4254 | dout(5) << __func__ << " successful extension of log " << l << "/" << read_len << dendl; | |
4255 | log_fnode.append_extent(ne); | |
4256 | log_fnode.recalc_allocated(); | |
4257 | log_reader->buf.pos += l; | |
4258 | return l; | |
4259 | } | |
4260 | //save overlay for next search | |
4261 | memcpy(search_b, chunk_e - overlay_size, overlay_size); | |
4262 | pos += chunk_len; | |
4263 | len -= chunk_len; | |
4264 | } | |
4265 | } | |
4266 | } | |
4267 | return 0; | |
4268 | } | |
4269 | ||
20effc67 TL |
4270 | void BlueFS::_check_vselector_LNF() { |
4271 | BlueFSVolumeSelector* vs = vselector->clone_empty(); | |
4272 | if (!vs) { | |
4273 | return; | |
4274 | } | |
4275 | std::lock_guard ll(log.lock); | |
4276 | std::lock_guard nl(nodes.lock); | |
4277 | // Checking vselector is under log, nodes and file(s) locks, | |
4278 | // so any modification of vselector must be under at least one of those locks. | |
4279 | for (auto& f : nodes.file_map) { | |
4280 | f.second->lock.lock(); | |
4281 | vs->add_usage(f.second->vselector_hint, f.second->fnode); | |
4282 | } | |
4283 | bool res = vselector->compare(vs); | |
4284 | if (!res) { | |
4285 | dout(0) << "Current:"; | |
4286 | vselector->dump(*_dout); | |
4287 | *_dout << dendl; | |
4288 | dout(0) << "Expected:"; | |
4289 | vs->dump(*_dout); | |
4290 | *_dout << dendl; | |
4291 | } | |
4292 | ceph_assert(res); | |
4293 | for (auto& f : nodes.file_map) { | |
4294 | f.second->lock.unlock(); | |
4295 | } | |
4296 | delete vs; | |
4297 | } | |
4298 | ||
f67539c2 | 4299 | size_t BlueFS::probe_alloc_avail(int dev, uint64_t alloc_size) |
9f95a23c | 4300 | { |
f67539c2 TL |
4301 | size_t total = 0; |
4302 | auto iterated_allocation = [&](size_t off, size_t len) { | |
4303 | //only count in size that is alloc_size aligned | |
4304 | size_t dist_to_alignment; | |
4305 | size_t offset_in_block = off & (alloc_size - 1); | |
4306 | if (offset_in_block == 0) | |
4307 | dist_to_alignment = 0; | |
4308 | else | |
4309 | dist_to_alignment = alloc_size - offset_in_block; | |
4310 | if (dist_to_alignment >= len) | |
4311 | return; | |
4312 | len -= dist_to_alignment; | |
4313 | total += p2align(len, alloc_size); | |
4314 | }; | |
4315 | if (alloc[dev]) { | |
4316 | alloc[dev]->dump(iterated_allocation); | |
9f95a23c | 4317 | } |
f67539c2 | 4318 | return total; |
9f95a23c | 4319 | } |
9f95a23c TL |
4320 | // =============================================== |
4321 | // OriginalVolumeSelector | |
4322 | ||
f6b5b4d7 TL |
4323 | void* OriginalVolumeSelector::get_hint_for_log() const { |
4324 | return reinterpret_cast<void*>(BlueFS::BDEV_WAL); | |
9f95a23c | 4325 | } |
b3b6e05e | 4326 | void* OriginalVolumeSelector::get_hint_by_dir(std::string_view dirname) const { |
9f95a23c TL |
4327 | uint8_t res = BlueFS::BDEV_DB; |
4328 | if (dirname.length() > 5) { | |
4329 | // the "db.slow" and "db.wal" directory names are hard-coded at | |
4330 | // match up with bluestore. the slow device is always the second | |
4331 | // one (when a dedicated block.db device is present and used at | |
4332 | // bdev 0). the wal device is always last. | |
a4b75251 | 4333 | if (boost::algorithm::ends_with(dirname, ".slow") && slow_total) { |
9f95a23c | 4334 | res = BlueFS::BDEV_SLOW; |
a4b75251 | 4335 | } else if (boost::algorithm::ends_with(dirname, ".wal") && wal_total) { |
9f95a23c TL |
4336 | res = BlueFS::BDEV_WAL; |
4337 | } | |
4338 | } | |
4339 | return reinterpret_cast<void*>(res); | |
4340 | } | |
4341 | ||
4342 | uint8_t OriginalVolumeSelector::select_prefer_bdev(void* hint) | |
4343 | { | |
4344 | return (uint8_t)(reinterpret_cast<uint64_t>(hint)); | |
4345 | } | |
4346 | ||
4347 | void OriginalVolumeSelector::get_paths(const std::string& base, paths& res) const | |
4348 | { | |
4349 | res.emplace_back(base, db_total); | |
522d829b TL |
4350 | res.emplace_back(base + ".slow", |
4351 | slow_total ? slow_total : db_total); // use fake non-zero value if needed to | |
4352 | // avoid RocksDB complains | |
9f95a23c TL |
4353 | } |
4354 | ||
4355 | #undef dout_prefix | |
4356 | #define dout_prefix *_dout << "OriginalVolumeSelector: " | |
4357 | ||
4358 | void OriginalVolumeSelector::dump(ostream& sout) { | |
4359 | sout<< "wal_total:" << wal_total | |
4360 | << ", db_total:" << db_total | |
4361 | << ", slow_total:" << slow_total | |
4362 | << std::endl; | |
4363 | } | |
f67539c2 TL |
4364 | |
4365 | // =============================================== | |
4366 | // FitToFastVolumeSelector | |
4367 | ||
4368 | void FitToFastVolumeSelector::get_paths(const std::string& base, paths& res) const { | |
4369 | res.emplace_back(base, 1); // size of the last db_path has no effect | |
4370 | } |