]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
39ae355f | 3 | #include <chrono> |
7c673cae | 4 | #include "boost/algorithm/string.hpp" |
9f95a23c | 5 | #include "bluestore_common.h" |
7c673cae FG |
6 | #include "BlueFS.h" |
7 | ||
8 | #include "common/debug.h" | |
9 | #include "common/errno.h" | |
10 | #include "common/perf_counters.h" | |
7c673cae | 11 | #include "Allocator.h" |
11fdf7f2 | 12 | #include "include/ceph_assert.h" |
eafe8130 | 13 | #include "common/admin_socket.h" |
7c673cae FG |
14 | |
15 | #define dout_context cct | |
16 | #define dout_subsys ceph_subsys_bluefs | |
17 | #undef dout_prefix | |
18 | #define dout_prefix *_dout << "bluefs " | |
9f95a23c | 19 | using TOPNSPC::common::cmd_getval; |
f67539c2 TL |
20 | |
21 | using std::byte; | |
22 | using std::list; | |
23 | using std::make_pair; | |
24 | using std::map; | |
25 | using std::ostream; | |
26 | using std::pair; | |
27 | using std::set; | |
28 | using std::string; | |
29 | using std::to_string; | |
30 | using std::vector; | |
39ae355f | 31 | using std::chrono::duration; |
39ae355f | 32 | using std::chrono::seconds; |
f67539c2 TL |
33 | |
34 | using ceph::bufferlist; | |
35 | using ceph::decode; | |
36 | using ceph::encode; | |
37 | using ceph::Formatter; | |
38 | ||
39 | ||
7c673cae FG |
40 | MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::File, bluefs_file, bluefs); |
41 | MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::Dir, bluefs_dir, bluefs); | |
f91f0fd5 | 42 | MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileWriter, bluefs_file_writer, bluefs_file_writer); |
7c673cae | 43 | MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileReaderBuffer, |
f91f0fd5 TL |
44 | bluefs_file_reader_buffer, bluefs_file_reader); |
45 | MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileReader, bluefs_file_reader, bluefs_file_reader); | |
7c673cae FG |
46 | MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileLock, bluefs_file_lock, bluefs); |
47 | ||
11fdf7f2 TL |
48 | static void wal_discard_cb(void *priv, void* priv2) { |
49 | BlueFS *bluefs = static_cast<BlueFS*>(priv); | |
50 | interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2); | |
51 | bluefs->handle_discard(BlueFS::BDEV_WAL, *tmp); | |
52 | } | |
53 | ||
54 | static void db_discard_cb(void *priv, void* priv2) { | |
55 | BlueFS *bluefs = static_cast<BlueFS*>(priv); | |
56 | interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2); | |
57 | bluefs->handle_discard(BlueFS::BDEV_DB, *tmp); | |
58 | } | |
59 | ||
60 | static void slow_discard_cb(void *priv, void* priv2) { | |
61 | BlueFS *bluefs = static_cast<BlueFS*>(priv); | |
62 | interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2); | |
63 | bluefs->handle_discard(BlueFS::BDEV_SLOW, *tmp); | |
64 | } | |
7c673cae | 65 | |
eafe8130 TL |
66 | class BlueFS::SocketHook : public AdminSocketHook { |
67 | BlueFS* bluefs; | |
68 | public: | |
69 | static BlueFS::SocketHook* create(BlueFS* bluefs) | |
70 | { | |
71 | BlueFS::SocketHook* hook = nullptr; | |
72 | AdminSocket* admin_socket = bluefs->cct->get_admin_socket(); | |
73 | if (admin_socket) { | |
74 | hook = new BlueFS::SocketHook(bluefs); | |
f67539c2 | 75 | int r = admin_socket->register_command("bluestore bluefs device info " |
eafe8130 TL |
76 | "name=alloc_size,type=CephInt,req=false", |
77 | hook, | |
f67539c2 TL |
78 | "Shows space report for bluefs devices. " |
79 | "This also includes an estimation for space " | |
80 | "available to bluefs at main device. " | |
81 | "alloc_size, if set, specifies the custom bluefs " | |
82 | "allocation unit size for the estimation above."); | |
eafe8130 TL |
83 | if (r != 0) { |
84 | ldout(bluefs->cct, 1) << __func__ << " cannot register SocketHook" << dendl; | |
85 | delete hook; | |
86 | hook = nullptr; | |
9f95a23c | 87 | } else { |
f6b5b4d7 | 88 | r = admin_socket->register_command("bluefs stats", |
9f95a23c TL |
89 | hook, |
90 | "Dump internal statistics for bluefs." | |
91 | ""); | |
92 | ceph_assert(r == 0); | |
f67539c2 TL |
93 | r = admin_socket->register_command("bluefs files list", hook, |
94 | "print files in bluefs"); | |
95 | ceph_assert(r == 0); | |
cd265ab1 TL |
96 | r = admin_socket->register_command("bluefs debug_inject_read_zeros", hook, |
97 | "Injects 8K zeros into next BlueFS read. Debug only."); | |
98 | ceph_assert(r == 0); | |
eafe8130 TL |
99 | } |
100 | } | |
101 | return hook; | |
102 | } | |
103 | ||
104 | ~SocketHook() { | |
105 | AdminSocket* admin_socket = bluefs->cct->get_admin_socket(); | |
9f95a23c | 106 | admin_socket->unregister_commands(this); |
eafe8130 TL |
107 | } |
108 | private: | |
109 | SocketHook(BlueFS* bluefs) : | |
110 | bluefs(bluefs) {} | |
9f95a23c | 111 | int call(std::string_view command, const cmdmap_t& cmdmap, |
39ae355f | 112 | const bufferlist&, |
9f95a23c TL |
113 | Formatter *f, |
114 | std::ostream& errss, | |
115 | bufferlist& out) override { | |
f67539c2 | 116 | if (command == "bluestore bluefs device info") { |
9f95a23c TL |
117 | int64_t alloc_size = 0; |
118 | cmd_getval(cmdmap, "alloc_size", alloc_size); | |
119 | if ((alloc_size & (alloc_size - 1)) != 0) { | |
120 | errss << "Invalid allocation size:'" << alloc_size << std::endl; | |
121 | return -EINVAL; | |
122 | } | |
123 | if (alloc_size == 0) | |
f67539c2 TL |
124 | alloc_size = bluefs->cct->_conf->bluefs_shared_alloc_size; |
125 | f->open_object_section("bluefs_device_info"); | |
9f95a23c TL |
126 | for (unsigned dev = BDEV_WAL; dev <= BDEV_SLOW; dev++) { |
127 | if (bluefs->bdev[dev]) { | |
128 | f->open_object_section("dev"); | |
129 | f->dump_string("device", bluefs->get_device_name(dev)); | |
130 | ceph_assert(bluefs->alloc[dev]); | |
f67539c2 TL |
131 | auto total = bluefs->get_total(dev); |
132 | auto free = bluefs->get_free(dev); | |
133 | auto used = bluefs->get_used(dev); | |
134 | ||
135 | f->dump_int("total", total); | |
136 | f->dump_int("free", free); | |
137 | f->dump_int("bluefs_used", used); | |
138 | if (bluefs->is_shared_alloc(dev)) { | |
139 | size_t avail = bluefs->probe_alloc_avail(dev, alloc_size); | |
140 | f->dump_int("bluefs max available", avail); | |
141 | } | |
142 | f->close_section(); | |
143 | } | |
eafe8130 | 144 | } |
f67539c2 | 145 | |
9f95a23c TL |
146 | f->close_section(); |
147 | } else if (command == "bluefs stats") { | |
148 | std::stringstream ss; | |
149 | bluefs->dump_block_extents(ss); | |
150 | bluefs->dump_volume_selector(ss); | |
eafe8130 | 151 | out.append(ss); |
f67539c2 TL |
152 | } else if (command == "bluefs files list") { |
153 | const char* devnames[3] = {"wal","db","slow"}; | |
20effc67 | 154 | std::lock_guard l(bluefs->nodes.lock); |
f67539c2 | 155 | f->open_array_section("files"); |
20effc67 | 156 | for (auto &d : bluefs->nodes.dir_map) { |
f67539c2 TL |
157 | std::string dir = d.first; |
158 | for (auto &r : d.second->file_map) { | |
159 | f->open_object_section("file"); | |
160 | f->dump_string("name", (dir + "/" + r.first).c_str()); | |
161 | std::vector<size_t> sizes; | |
162 | sizes.resize(bluefs->bdev.size()); | |
163 | for(auto& i : r.second->fnode.extents) { | |
164 | sizes[i.bdev] += i.length; | |
165 | } | |
166 | for (size_t i = 0; i < sizes.size(); i++) { | |
167 | if (sizes[i]>0) { | |
168 | if (i < sizeof(devnames) / sizeof(*devnames)) | |
169 | f->dump_int(devnames[i], sizes[i]); | |
170 | else | |
171 | f->dump_int(("dev-"+to_string(i)).c_str(), sizes[i]); | |
172 | } | |
173 | } | |
174 | f->close_section(); | |
175 | } | |
176 | } | |
177 | f->close_section(); | |
178 | f->flush(out); | |
cd265ab1 TL |
179 | } else if (command == "bluefs debug_inject_read_zeros") { |
180 | bluefs->inject_read_zeros++; | |
9f95a23c TL |
181 | } else { |
182 | errss << "Invalid command" << std::endl; | |
183 | return -ENOSYS; | |
eafe8130 | 184 | } |
9f95a23c TL |
185 | return 0; |
186 | } | |
eafe8130 TL |
187 | }; |
188 | ||
7c673cae FG |
189 | BlueFS::BlueFS(CephContext* cct) |
190 | : cct(cct), | |
191 | bdev(MAX_BDEV), | |
192 | ioc(MAX_BDEV), | |
f67539c2 TL |
193 | block_reserved(MAX_BDEV), |
194 | alloc(MAX_BDEV), | |
20effc67 | 195 | alloc_size(MAX_BDEV, 0) |
7c673cae | 196 | { |
20effc67 | 197 | dirty.pending_release.resize(MAX_BDEV); |
11fdf7f2 TL |
198 | discard_cb[BDEV_WAL] = wal_discard_cb; |
199 | discard_cb[BDEV_DB] = db_discard_cb; | |
200 | discard_cb[BDEV_SLOW] = slow_discard_cb; | |
eafe8130 | 201 | asok_hook = SocketHook::create(this); |
7c673cae FG |
202 | } |
203 | ||
204 | BlueFS::~BlueFS() | |
205 | { | |
eafe8130 | 206 | delete asok_hook; |
7c673cae FG |
207 | for (auto p : ioc) { |
208 | if (p) | |
209 | p->aio_wait(); | |
210 | } | |
211 | for (auto p : bdev) { | |
212 | if (p) { | |
213 | p->close(); | |
214 | delete p; | |
215 | } | |
216 | } | |
217 | for (auto p : ioc) { | |
218 | delete p; | |
219 | } | |
220 | } | |
221 | ||
222 | void BlueFS::_init_logger() | |
223 | { | |
224 | PerfCountersBuilder b(cct, "bluefs", | |
225 | l_bluefs_first, l_bluefs_last); | |
7c673cae FG |
226 | b.add_u64(l_bluefs_db_total_bytes, "db_total_bytes", |
227 | "Total bytes (main db device)", | |
11fdf7f2 | 228 | "b", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); |
7c673cae FG |
229 | b.add_u64(l_bluefs_db_used_bytes, "db_used_bytes", |
230 | "Used bytes (main db device)", | |
11fdf7f2 | 231 | "u", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); |
7c673cae FG |
232 | b.add_u64(l_bluefs_wal_total_bytes, "wal_total_bytes", |
233 | "Total bytes (wal device)", | |
11fdf7f2 | 234 | "walb", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); |
7c673cae FG |
235 | b.add_u64(l_bluefs_wal_used_bytes, "wal_used_bytes", |
236 | "Used bytes (wal device)", | |
11fdf7f2 | 237 | "walu", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); |
7c673cae FG |
238 | b.add_u64(l_bluefs_slow_total_bytes, "slow_total_bytes", |
239 | "Total bytes (slow device)", | |
11fdf7f2 | 240 | "slob", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); |
7c673cae FG |
241 | b.add_u64(l_bluefs_slow_used_bytes, "slow_used_bytes", |
242 | "Used bytes (slow device)", | |
11fdf7f2 | 243 | "slou", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); |
7c673cae FG |
244 | b.add_u64(l_bluefs_num_files, "num_files", "File count", |
245 | "f", PerfCountersBuilder::PRIO_USEFUL); | |
246 | b.add_u64(l_bluefs_log_bytes, "log_bytes", "Size of the metadata log", | |
11fdf7f2 | 247 | "jlen", PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES)); |
7c673cae FG |
248 | b.add_u64_counter(l_bluefs_log_compactions, "log_compactions", |
249 | "Compactions of the metadata log"); | |
1e59de90 TL |
250 | b.add_u64_counter(l_bluefs_log_write_count, "log_write_count", |
251 | "Write op count to the metadata log"); | |
7c673cae | 252 | b.add_u64_counter(l_bluefs_logged_bytes, "logged_bytes", |
20effc67 TL |
253 | "Bytes written to the metadata log", |
254 | "j", | |
11fdf7f2 | 255 | PerfCountersBuilder::PRIO_CRITICAL, unit_t(UNIT_BYTES)); |
7c673cae FG |
256 | b.add_u64_counter(l_bluefs_files_written_wal, "files_written_wal", |
257 | "Files written to WAL"); | |
258 | b.add_u64_counter(l_bluefs_files_written_sst, "files_written_sst", | |
259 | "Files written to SSTs"); | |
1e59de90 TL |
260 | b.add_u64_counter(l_bluefs_write_count_wal, "write_count_wal", |
261 | "Write op count to WAL"); | |
262 | b.add_u64_counter(l_bluefs_write_count_sst, "write_count_sst", | |
263 | "Write op count to SSTs"); | |
7c673cae | 264 | b.add_u64_counter(l_bluefs_bytes_written_wal, "bytes_written_wal", |
20effc67 TL |
265 | "Bytes written to WAL", |
266 | "walb", | |
7c673cae FG |
267 | PerfCountersBuilder::PRIO_CRITICAL); |
268 | b.add_u64_counter(l_bluefs_bytes_written_sst, "bytes_written_sst", | |
20effc67 TL |
269 | "Bytes written to SSTs", |
270 | "sstb", | |
11fdf7f2 TL |
271 | PerfCountersBuilder::PRIO_CRITICAL, unit_t(UNIT_BYTES)); |
272 | b.add_u64_counter(l_bluefs_bytes_written_slow, "bytes_written_slow", | |
20effc67 TL |
273 | "Bytes written to WAL/SSTs at slow device", |
274 | "slwb", | |
275 | PerfCountersBuilder::PRIO_CRITICAL, unit_t(UNIT_BYTES)); | |
11fdf7f2 | 276 | b.add_u64_counter(l_bluefs_max_bytes_wal, "max_bytes_wal", |
20effc67 TL |
277 | "Maximum bytes allocated from WAL", |
278 | "mxwb", | |
279 | PerfCountersBuilder::PRIO_INTERESTING, | |
280 | unit_t(UNIT_BYTES)); | |
11fdf7f2 | 281 | b.add_u64_counter(l_bluefs_max_bytes_db, "max_bytes_db", |
20effc67 TL |
282 | "Maximum bytes allocated from DB", |
283 | "mxdb", | |
284 | PerfCountersBuilder::PRIO_INTERESTING, | |
285 | unit_t(UNIT_BYTES)); | |
11fdf7f2 | 286 | b.add_u64_counter(l_bluefs_max_bytes_slow, "max_bytes_slow", |
20effc67 TL |
287 | "Maximum bytes allocated from SLOW", |
288 | "mxwb", | |
289 | PerfCountersBuilder::PRIO_INTERESTING, | |
290 | unit_t(UNIT_BYTES)); | |
291 | b.add_u64_counter(l_bluefs_main_alloc_unit, "alloc_unit_main", | |
292 | "Allocation unit size (in bytes) for primary/shared device", | |
293 | "aumb", | |
294 | PerfCountersBuilder::PRIO_CRITICAL, | |
295 | unit_t(UNIT_BYTES)); | |
296 | b.add_u64_counter(l_bluefs_db_alloc_unit, "alloc_unit_db", | |
297 | "Allocation unit size (in bytes) for standalone DB device", | |
298 | "audb", | |
299 | PerfCountersBuilder::PRIO_CRITICAL, | |
300 | unit_t(UNIT_BYTES)); | |
301 | b.add_u64_counter(l_bluefs_wal_alloc_unit, "alloc_unit_wal", | |
302 | "Allocation unit size (in bytes) for standalone WAL device", | |
303 | "auwb", | |
304 | PerfCountersBuilder::PRIO_CRITICAL, | |
305 | unit_t(UNIT_BYTES)); | |
494da23a | 306 | b.add_u64_counter(l_bluefs_read_random_count, "read_random_count", |
20effc67 TL |
307 | "random read requests processed", |
308 | NULL, | |
309 | PerfCountersBuilder::PRIO_USEFUL); | |
494da23a | 310 | b.add_u64_counter(l_bluefs_read_random_bytes, "read_random_bytes", |
20effc67 TL |
311 | "Bytes requested in random read mode", |
312 | NULL, | |
494da23a TL |
313 | PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); |
314 | b.add_u64_counter(l_bluefs_read_random_disk_count, "read_random_disk_count", | |
20effc67 TL |
315 | "random reads requests going to disk", |
316 | NULL, | |
317 | PerfCountersBuilder::PRIO_USEFUL); | |
494da23a | 318 | b.add_u64_counter(l_bluefs_read_random_disk_bytes, "read_random_disk_bytes", |
20effc67 TL |
319 | "Bytes read from disk in random read mode", |
320 | "rrb", | |
321 | PerfCountersBuilder::PRIO_INTERESTING, | |
322 | unit_t(UNIT_BYTES)); | |
323 | b.add_u64_counter(l_bluefs_read_random_disk_bytes_wal, "read_random_disk_bytes_wal", | |
324 | "random reads requests going to WAL disk", | |
325 | NULL, | |
326 | PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); | |
327 | b.add_u64_counter(l_bluefs_read_random_disk_bytes_db, "read_random_disk_bytes_db", | |
328 | "random reads requests going to DB disk", | |
329 | NULL, | |
494da23a | 330 | PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); |
20effc67 TL |
331 | b.add_u64_counter(l_bluefs_read_random_disk_bytes_slow, "read_random_disk_bytes_slow", |
332 | "random reads requests going to main disk", | |
333 | "rrsb", | |
334 | PerfCountersBuilder::PRIO_INTERESTING, | |
335 | unit_t(UNIT_BYTES)); | |
494da23a | 336 | b.add_u64_counter(l_bluefs_read_random_buffer_count, "read_random_buffer_count", |
20effc67 TL |
337 | "random read requests processed using prefetch buffer", |
338 | NULL, | |
339 | PerfCountersBuilder::PRIO_USEFUL); | |
494da23a | 340 | b.add_u64_counter(l_bluefs_read_random_buffer_bytes, "read_random_buffer_bytes", |
20effc67 TL |
341 | "Bytes read from prefetch buffer in random read mode", |
342 | NULL, | |
494da23a | 343 | PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); |
494da23a | 344 | b.add_u64_counter(l_bluefs_read_count, "read_count", |
20effc67 TL |
345 | "buffered read requests processed", |
346 | NULL, | |
347 | PerfCountersBuilder::PRIO_USEFUL); | |
494da23a | 348 | b.add_u64_counter(l_bluefs_read_bytes, "read_bytes", |
20effc67 TL |
349 | "Bytes requested in buffered read mode", |
350 | NULL, | |
494da23a | 351 | PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); |
20effc67 TL |
352 | b.add_u64_counter(l_bluefs_read_disk_count, "read_disk_count", |
353 | "buffered reads requests going to disk", | |
354 | NULL, | |
355 | PerfCountersBuilder::PRIO_USEFUL); | |
356 | b.add_u64_counter(l_bluefs_read_disk_bytes, "read_disk_bytes", | |
357 | "Bytes read in buffered mode from disk", | |
358 | "rb", | |
359 | PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES)); | |
360 | b.add_u64_counter(l_bluefs_read_disk_bytes_wal, "read_disk_bytes_wal", | |
361 | "reads requests going to WAL disk", | |
362 | NULL, | |
363 | PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); | |
364 | b.add_u64_counter(l_bluefs_read_disk_bytes_db, "read_disk_bytes_db", | |
365 | "reads requests going to DB disk", | |
366 | NULL, | |
367 | PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); | |
368 | b.add_u64_counter(l_bluefs_read_disk_bytes_slow, "read_disk_bytes_slow", | |
369 | "reads requests going to main disk", | |
370 | "rsb", | |
371 | PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES)); | |
494da23a | 372 | b.add_u64_counter(l_bluefs_read_prefetch_count, "read_prefetch_count", |
20effc67 TL |
373 | "prefetch read requests processed", |
374 | NULL, | |
375 | PerfCountersBuilder::PRIO_USEFUL); | |
494da23a | 376 | b.add_u64_counter(l_bluefs_read_prefetch_bytes, "read_prefetch_bytes", |
20effc67 TL |
377 | "Bytes requested in prefetch read mode", |
378 | NULL, | |
494da23a | 379 | PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); |
1e59de90 TL |
380 | b.add_u64_counter(l_bluefs_write_count, "write_count", |
381 | "Write requests processed"); | |
382 | b.add_u64_counter(l_bluefs_write_disk_count, "write_disk_count", | |
383 | "Write requests sent to disk"); | |
384 | b.add_u64_counter(l_bluefs_write_bytes, "write_bytes", | |
385 | "Bytes written", NULL, | |
386 | PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); | |
39ae355f TL |
387 | b.add_time_avg (l_bluefs_compaction_lat, "compact_lat", |
388 | "Average bluefs log compaction latency", | |
389 | "c__t", | |
390 | PerfCountersBuilder::PRIO_INTERESTING); | |
391 | b.add_time_avg (l_bluefs_compaction_lock_lat, "compact_lock_lat", | |
392 | "Average lock duration while compacting bluefs log", | |
393 | "c_lt", | |
394 | PerfCountersBuilder::PRIO_INTERESTING); | |
395 | b.add_u64_counter(l_bluefs_alloc_shared_dev_fallbacks, "alloc_slow_fallback", | |
396 | "Amount of allocations that required fallback to " | |
397 | " slow/shared device", | |
398 | "asdf", | |
399 | PerfCountersBuilder::PRIO_USEFUL); | |
400 | b.add_u64_counter(l_bluefs_alloc_shared_size_fallbacks, "alloc_slow_size_fallback", | |
401 | "Amount of allocations that required fallback to shared device's " | |
402 | "regular unit size", | |
403 | "assf", | |
404 | PerfCountersBuilder::PRIO_USEFUL); | |
cd265ab1 TL |
405 | b.add_u64(l_bluefs_read_zeros_candidate, "read_zeros_candidate", |
406 | "How many times bluefs read found page with all 0s"); | |
407 | b.add_u64(l_bluefs_read_zeros_errors, "read_zeros_errors", | |
408 | "How many times bluefs read found transient page with all 0s"); | |
494da23a | 409 | |
7c673cae FG |
410 | logger = b.create_perf_counters(); |
411 | cct->get_perfcounters_collection()->add(logger); | |
412 | } | |
413 | ||
414 | void BlueFS::_shutdown_logger() | |
415 | { | |
416 | cct->get_perfcounters_collection()->remove(logger); | |
417 | delete logger; | |
418 | } | |
419 | ||
420 | void BlueFS::_update_logger_stats() | |
421 | { | |
7c673cae | 422 | if (alloc[BDEV_WAL]) { |
f67539c2 TL |
423 | logger->set(l_bluefs_wal_total_bytes, _get_total(BDEV_WAL)); |
424 | logger->set(l_bluefs_wal_used_bytes, _get_used(BDEV_WAL)); | |
7c673cae FG |
425 | } |
426 | if (alloc[BDEV_DB]) { | |
f67539c2 TL |
427 | logger->set(l_bluefs_db_total_bytes, _get_total(BDEV_DB)); |
428 | logger->set(l_bluefs_db_used_bytes, _get_used(BDEV_DB)); | |
7c673cae FG |
429 | } |
430 | if (alloc[BDEV_SLOW]) { | |
f67539c2 TL |
431 | logger->set(l_bluefs_slow_total_bytes, _get_total(BDEV_SLOW)); |
432 | logger->set(l_bluefs_slow_used_bytes, _get_used(BDEV_SLOW)); | |
7c673cae FG |
433 | } |
434 | } | |
435 | ||
11fdf7f2 | 436 | int BlueFS::add_block_device(unsigned id, const string& path, bool trim, |
f67539c2 TL |
437 | uint64_t reserved, |
438 | bluefs_shared_alloc_context_t* _shared_alloc) | |
7c673cae | 439 | { |
f67539c2 TL |
440 | dout(10) << __func__ << " bdev " << id << " path " << path << " " |
441 | << reserved << dendl; | |
11fdf7f2 TL |
442 | ceph_assert(id < bdev.size()); |
443 | ceph_assert(bdev[id] == NULL); | |
444 | BlockDevice *b = BlockDevice::create(cct, path, NULL, NULL, | |
445 | discard_cb[id], static_cast<void*>(this)); | |
f67539c2 TL |
446 | block_reserved[id] = reserved; |
447 | if (_shared_alloc) { | |
11fdf7f2 TL |
448 | b->set_no_exclusive_lock(); |
449 | } | |
7c673cae FG |
450 | int r = b->open(path); |
451 | if (r < 0) { | |
452 | delete b; | |
453 | return r; | |
454 | } | |
11fdf7f2 | 455 | if (trim) { |
1e59de90 TL |
456 | interval_set<uint64_t> whole_device; |
457 | whole_device.insert(0, b->get_size()); | |
458 | b->try_discard(whole_device, false); | |
11fdf7f2 TL |
459 | } |
460 | ||
7c673cae | 461 | dout(1) << __func__ << " bdev " << id << " path " << path |
1adf2230 | 462 | << " size " << byte_u_t(b->get_size()) << dendl; |
7c673cae FG |
463 | bdev[id] = b; |
464 | ioc[id] = new IOContext(cct, NULL); | |
f67539c2 TL |
465 | if (_shared_alloc) { |
466 | ceph_assert(!shared_alloc); | |
467 | shared_alloc = _shared_alloc; | |
468 | alloc[id] = shared_alloc->a; | |
469 | shared_alloc_id = id; | |
470 | } | |
7c673cae FG |
471 | return 0; |
472 | } | |
473 | ||
474 | bool BlueFS::bdev_support_label(unsigned id) | |
475 | { | |
11fdf7f2 TL |
476 | ceph_assert(id < bdev.size()); |
477 | ceph_assert(bdev[id]); | |
7c673cae FG |
478 | return bdev[id]->supported_bdev_label(); |
479 | } | |
480 | ||
f67539c2 | 481 | uint64_t BlueFS::get_block_device_size(unsigned id) const |
7c673cae FG |
482 | { |
483 | if (id < bdev.size() && bdev[id]) | |
484 | return bdev[id]->get_size(); | |
485 | return 0; | |
486 | } | |
487 | ||
f67539c2 | 488 | void BlueFS::handle_discard(unsigned id, interval_set<uint64_t>& to_release) |
7c673cae | 489 | { |
f67539c2 TL |
490 | dout(10) << __func__ << " bdev " << id << dendl; |
491 | ceph_assert(alloc[id]); | |
492 | alloc[id]->release(to_release); | |
493 | if (is_shared_alloc(id)) { | |
494 | shared_alloc->bluefs_used -= to_release.size(); | |
7c673cae | 495 | } |
7c673cae FG |
496 | } |
497 | ||
f67539c2 | 498 | uint64_t BlueFS::get_used() |
7c673cae | 499 | { |
f67539c2 TL |
500 | uint64_t used = 0; |
501 | for (unsigned id = 0; id < MAX_BDEV; ++id) { | |
502 | used += _get_used(id); | |
7c673cae | 503 | } |
f67539c2 TL |
504 | return used; |
505 | } | |
7c673cae | 506 | |
f67539c2 TL |
507 | uint64_t BlueFS::_get_used(unsigned id) const |
508 | { | |
509 | uint64_t used = 0; | |
510 | if (!alloc[id]) | |
511 | return 0; | |
9f95a23c | 512 | |
f67539c2 TL |
513 | if (is_shared_alloc(id)) { |
514 | used = shared_alloc->bluefs_used; | |
515 | } else { | |
516 | used = _get_total(id) - alloc[id]->get_free(); | |
9f95a23c | 517 | } |
f67539c2 | 518 | return used; |
7c673cae FG |
519 | } |
520 | ||
f67539c2 | 521 | uint64_t BlueFS::get_used(unsigned id) |
7c673cae | 522 | { |
f67539c2 | 523 | ceph_assert(id < alloc.size()); |
11fdf7f2 | 524 | ceph_assert(alloc[id]); |
f67539c2 | 525 | return _get_used(id); |
11fdf7f2 TL |
526 | } |
527 | ||
f67539c2 | 528 | uint64_t BlueFS::_get_total(unsigned id) const |
11fdf7f2 | 529 | { |
f67539c2 TL |
530 | ceph_assert(id < bdev.size()); |
531 | ceph_assert(id < block_reserved.size()); | |
532 | return get_block_device_size(id) - block_reserved[id]; | |
7c673cae FG |
533 | } |
534 | ||
535 | uint64_t BlueFS::get_total(unsigned id) | |
536 | { | |
f67539c2 | 537 | return _get_total(id); |
7c673cae FG |
538 | } |
539 | ||
540 | uint64_t BlueFS::get_free(unsigned id) | |
541 | { | |
11fdf7f2 | 542 | ceph_assert(id < alloc.size()); |
7c673cae FG |
543 | return alloc[id]->get_free(); |
544 | } | |
545 | ||
546 | void BlueFS::dump_perf_counters(Formatter *f) | |
547 | { | |
548 | f->open_object_section("bluefs_perf_counters"); | |
1e59de90 | 549 | logger->dump_formatted(f, false, false); |
7c673cae FG |
550 | f->close_section(); |
551 | } | |
552 | ||
3efd9988 FG |
553 | void BlueFS::dump_block_extents(ostream& out) |
554 | { | |
555 | for (unsigned i = 0; i < MAX_BDEV; ++i) { | |
556 | if (!bdev[i]) { | |
557 | continue; | |
558 | } | |
f67539c2 | 559 | auto total = get_total(i); |
11fdf7f2 | 560 | auto free = get_free(i); |
1911f103 | 561 | |
f67539c2 TL |
562 | out << i << " : device size 0x" << std::hex << total |
563 | << " : using 0x" << total - free | |
564 | << std::dec << "(" << byte_u_t(total - free) << ")"; | |
1911f103 | 565 | out << "\n"; |
3efd9988 FG |
566 | } |
567 | } | |
7c673cae | 568 | |
1e59de90 TL |
569 | void BlueFS::foreach_block_extents( |
570 | unsigned id, | |
571 | std::function<void(uint64_t, uint32_t)> fn) | |
7c673cae | 572 | { |
20effc67 | 573 | std::lock_guard nl(nodes.lock); |
7c673cae | 574 | dout(10) << __func__ << " bdev " << id << dendl; |
f67539c2 | 575 | ceph_assert(id < alloc.size()); |
20effc67 | 576 | for (auto& p : nodes.file_map) { |
f67539c2 TL |
577 | for (auto& q : p.second->fnode.extents) { |
578 | if (q.bdev == id) { | |
1e59de90 | 579 | fn(q.offset, q.length); |
f67539c2 TL |
580 | } |
581 | } | |
582 | } | |
7c673cae FG |
583 | } |
584 | ||
9f95a23c | 585 | int BlueFS::mkfs(uuid_d osd_uuid, const bluefs_layout_t& layout) |
7c673cae | 586 | { |
7c673cae FG |
587 | dout(1) << __func__ |
588 | << " osd_uuid " << osd_uuid | |
589 | << dendl; | |
590 | ||
9f95a23c TL |
591 | // set volume selector if not provided before/outside |
592 | if (vselector == nullptr) { | |
593 | vselector.reset( | |
594 | new OriginalVolumeSelector( | |
595 | get_block_device_size(BlueFS::BDEV_WAL) * 95 / 100, | |
596 | get_block_device_size(BlueFS::BDEV_DB) * 95 / 100, | |
597 | get_block_device_size(BlueFS::BDEV_SLOW) * 95 / 100)); | |
598 | } | |
599 | ||
7c673cae | 600 | _init_logger(); |
20effc67 | 601 | _init_alloc(); |
7c673cae | 602 | |
39ae355f | 603 | super.version = 0; |
7c673cae FG |
604 | super.block_size = bdev[BDEV_DB]->get_block_size(); |
605 | super.osd_uuid = osd_uuid; | |
606 | super.uuid.generate_random(); | |
607 | dout(1) << __func__ << " uuid " << super.uuid << dendl; | |
608 | ||
609 | // init log | |
9f95a23c | 610 | FileRef log_file = ceph::make_ref<File>(); |
7c673cae | 611 | log_file->fnode.ino = 1; |
f6b5b4d7 | 612 | log_file->vselector_hint = vselector->get_hint_for_log(); |
7c673cae | 613 | int r = _allocate( |
9f95a23c | 614 | vselector->select_prefer_bdev(log_file->vselector_hint), |
7c673cae | 615 | cct->_conf->bluefs_max_log_runway, |
39ae355f | 616 | 0, |
94b18763 | 617 | &log_file->fnode); |
9f95a23c | 618 | vselector->add_usage(log_file->vselector_hint, log_file->fnode); |
11fdf7f2 | 619 | ceph_assert(r == 0); |
20effc67 | 620 | log.writer = _create_writer(log_file); |
7c673cae FG |
621 | |
622 | // initial txn | |
20effc67 TL |
623 | ceph_assert(log.seq_live == 1); |
624 | log.t.seq = 1; | |
625 | log.t.op_init(); | |
626 | _flush_and_sync_log_LD(); | |
7c673cae FG |
627 | |
628 | // write supers | |
629 | super.log_fnode = log_file->fnode; | |
9f95a23c | 630 | super.memorized_layout = layout; |
11fdf7f2 | 631 | _write_super(BDEV_DB); |
20effc67 | 632 | _flush_bdev(); |
7c673cae FG |
633 | |
634 | // clean up | |
635 | super = bluefs_super_t(); | |
20effc67 TL |
636 | _close_writer(log.writer); |
637 | log.writer = NULL; | |
9f95a23c | 638 | vselector.reset(nullptr); |
7c673cae FG |
639 | _stop_alloc(); |
640 | _shutdown_logger(); | |
f67539c2 TL |
641 | if (shared_alloc) { |
642 | ceph_assert(shared_alloc->need_init); | |
643 | shared_alloc->need_init = false; | |
644 | } | |
7c673cae FG |
645 | |
646 | dout(10) << __func__ << " success" << dendl; | |
647 | return 0; | |
648 | } | |
649 | ||
650 | void BlueFS::_init_alloc() | |
651 | { | |
652 | dout(20) << __func__ << dendl; | |
eafe8130 | 653 | |
20effc67 | 654 | size_t wal_alloc_size = 0; |
eafe8130 | 655 | if (bdev[BDEV_WAL]) { |
20effc67 TL |
656 | wal_alloc_size = cct->_conf->bluefs_alloc_size; |
657 | alloc_size[BDEV_WAL] = wal_alloc_size; | |
eafe8130 | 658 | } |
20effc67 TL |
659 | logger->set(l_bluefs_wal_alloc_unit, wal_alloc_size); |
660 | ||
aee94f69 TL |
661 | |
662 | uint64_t shared_alloc_size = cct->_conf->bluefs_shared_alloc_size; | |
663 | if (shared_alloc && shared_alloc->a) { | |
664 | uint64_t unit = shared_alloc->a->get_block_size(); | |
665 | shared_alloc_size = std::max( | |
666 | unit, | |
667 | shared_alloc_size); | |
668 | ceph_assert(0 == p2phase(shared_alloc_size, unit)); | |
669 | } | |
eafe8130 TL |
670 | if (bdev[BDEV_SLOW]) { |
671 | alloc_size[BDEV_DB] = cct->_conf->bluefs_alloc_size; | |
aee94f69 | 672 | alloc_size[BDEV_SLOW] = shared_alloc_size; |
eafe8130 | 673 | } else { |
aee94f69 TL |
674 | alloc_size[BDEV_DB] = shared_alloc_size; |
675 | alloc_size[BDEV_SLOW] = 0; | |
eafe8130 | 676 | } |
aee94f69 TL |
677 | logger->set(l_bluefs_db_alloc_unit, alloc_size[BDEV_DB]); |
678 | logger->set(l_bluefs_main_alloc_unit, alloc_size[BDEV_SLOW]); | |
eafe8130 TL |
679 | // new wal and db devices are never shared |
680 | if (bdev[BDEV_NEWWAL]) { | |
681 | alloc_size[BDEV_NEWWAL] = cct->_conf->bluefs_alloc_size; | |
682 | } | |
683 | if (bdev[BDEV_NEWDB]) { | |
684 | alloc_size[BDEV_NEWDB] = cct->_conf->bluefs_alloc_size; | |
685 | } | |
686 | ||
7c673cae FG |
687 | for (unsigned id = 0; id < bdev.size(); ++id) { |
688 | if (!bdev[id]) { | |
689 | continue; | |
690 | } | |
11fdf7f2 | 691 | ceph_assert(bdev[id]->get_size()); |
f67539c2 TL |
692 | if (is_shared_alloc(id)) { |
693 | dout(1) << __func__ << " shared, id " << id << std::hex | |
694 | << ", capacity 0x" << bdev[id]->get_size() | |
695 | << ", block size 0x" << alloc_size[id] | |
696 | << std::dec << dendl; | |
697 | } else { | |
aee94f69 | 698 | ceph_assert(alloc_size[id]); |
f67539c2 TL |
699 | std::string name = "bluefs-"; |
700 | const char* devnames[] = { "wal","db","slow" }; | |
701 | if (id <= BDEV_SLOW) | |
702 | name += devnames[id]; | |
703 | else | |
704 | name += to_string(uintptr_t(this)); | |
705 | dout(1) << __func__ << " new, id " << id << std::hex | |
706 | << ", allocator name " << name | |
707 | << ", allocator type " << cct->_conf->bluefs_allocator | |
708 | << ", capacity 0x" << bdev[id]->get_size() | |
709 | << ", block size 0x" << alloc_size[id] | |
710 | << std::dec << dendl; | |
711 | alloc[id] = Allocator::create(cct, cct->_conf->bluefs_allocator, | |
712 | bdev[id]->get_size(), | |
20effc67 TL |
713 | alloc_size[id], |
714 | 0, 0, | |
715 | name); | |
f67539c2 TL |
716 | alloc[id]->init_add_free( |
717 | block_reserved[id], | |
718 | _get_total(id)); | |
7c673cae FG |
719 | } |
720 | } | |
721 | } | |
722 | ||
723 | void BlueFS::_stop_alloc() | |
724 | { | |
725 | dout(20) << __func__ << dendl; | |
11fdf7f2 TL |
726 | for (auto p : bdev) { |
727 | if (p) | |
728 | p->discard_drain(); | |
729 | } | |
730 | ||
f67539c2 TL |
731 | for (size_t i = 0; i < alloc.size(); ++i) { |
732 | if (alloc[i] && !is_shared_alloc(i)) { | |
733 | alloc[i]->shutdown(); | |
734 | delete alloc[i]; | |
735 | alloc[i] = nullptr; | |
7c673cae FG |
736 | } |
737 | } | |
7c673cae FG |
738 | } |
739 | ||
20effc67 TL |
740 | int BlueFS::_read_and_check(uint8_t ndev, uint64_t off, uint64_t len, |
741 | ceph::buffer::list *pbl, IOContext *ioc, bool buffered) | |
cd265ab1 TL |
742 | { |
743 | dout(10) << __func__ << " dev " << int(ndev) | |
744 | << ": 0x" << std::hex << off << "~" << len << std::dec | |
745 | << (buffered ? " buffered" : "") | |
746 | << dendl; | |
747 | int r; | |
748 | bufferlist bl; | |
20effc67 | 749 | r = _bdev_read(ndev, off, len, &bl, ioc, buffered); |
cd265ab1 TL |
750 | if (r != 0) { |
751 | return r; | |
752 | } | |
753 | uint64_t block_size = bdev[ndev]->get_block_size(); | |
754 | if (inject_read_zeros) { | |
755 | if (len >= block_size * 2) { | |
756 | derr << __func__ << " injecting error, zeros at " | |
757 | << int(ndev) << ": 0x" << std::hex << (off + len / 2) | |
758 | << "~" << (block_size * 2) << std::dec << dendl; | |
759 | //use beginning, replace 8K in the middle with zeros, use tail | |
760 | bufferlist temp; | |
761 | bl.splice(0, len / 2 - block_size, &temp); | |
f67539c2 | 762 | temp.append(buffer::create(block_size * 2, 0)); |
cd265ab1 TL |
763 | bl.splice(block_size * 2, len / 2 - block_size, &temp); |
764 | bl = temp; | |
765 | inject_read_zeros--; | |
766 | } | |
767 | } | |
768 | //make a check if there is a block with all 0 | |
769 | uint64_t to_check_len = len; | |
770 | uint64_t skip = p2nphase(off, block_size); | |
771 | if (skip >= to_check_len) { | |
772 | return r; | |
773 | } | |
774 | auto it = bl.begin(skip); | |
775 | to_check_len -= skip; | |
776 | bool all_zeros = false; | |
777 | while (all_zeros == false && to_check_len >= block_size) { | |
778 | // checking 0s step | |
779 | unsigned block_left = block_size; | |
780 | unsigned avail; | |
781 | const char* data; | |
782 | all_zeros = true; | |
783 | while (all_zeros && block_left > 0) { | |
784 | avail = it.get_ptr_and_advance(block_left, &data); | |
785 | block_left -= avail; | |
786 | all_zeros = mem_is_zero(data, avail); | |
787 | } | |
788 | // skipping step | |
789 | while (block_left > 0) { | |
790 | avail = it.get_ptr_and_advance(block_left, &data); | |
791 | block_left -= avail; | |
792 | } | |
793 | to_check_len -= block_size; | |
794 | } | |
795 | if (all_zeros) { | |
796 | logger->inc(l_bluefs_read_zeros_candidate, 1); | |
797 | bufferlist bl_reread; | |
20effc67 | 798 | r = _bdev_read(ndev, off, len, &bl_reread, ioc, buffered); |
cd265ab1 TL |
799 | if (r != 0) { |
800 | return r; | |
801 | } | |
802 | // check if both read gave the same | |
803 | if (!bl.contents_equal(bl_reread)) { | |
804 | // report problems to log, but continue, maybe it will be good now... | |
805 | derr << __func__ << " initial read of " << int(ndev) | |
806 | << ": 0x" << std::hex << off << "~" << len | |
807 | << std::dec << ": different then re-read " << dendl; | |
808 | logger->inc(l_bluefs_read_zeros_errors, 1); | |
809 | } | |
810 | // use second read will be better if is different | |
811 | pbl->append(bl_reread); | |
812 | } else { | |
813 | pbl->append(bl); | |
814 | } | |
815 | return r; | |
816 | } | |
817 | ||
20effc67 TL |
818 | int BlueFS::_read_random_and_check( |
819 | uint8_t ndev, uint64_t off, uint64_t len, char *buf, bool buffered) | |
cd265ab1 TL |
820 | { |
821 | dout(10) << __func__ << " dev " << int(ndev) | |
822 | << ": 0x" << std::hex << off << "~" << len << std::dec | |
823 | << (buffered ? " buffered" : "") | |
824 | << dendl; | |
825 | int r; | |
20effc67 | 826 | r = _bdev_read_random(ndev, off, len, buf, buffered); |
cd265ab1 TL |
827 | if (r != 0) { |
828 | return r; | |
829 | } | |
830 | uint64_t block_size = bdev[ndev]->get_block_size(); | |
831 | if (inject_read_zeros) { | |
832 | if (len >= block_size * 2) { | |
833 | derr << __func__ << " injecting error, zeros at " | |
834 | << int(ndev) << ": 0x" << std::hex << (off + len / 2) | |
835 | << "~" << (block_size * 2) << std::dec << dendl; | |
836 | //zero middle 8K | |
837 | memset(buf + len / 2 - block_size, 0, block_size * 2); | |
838 | inject_read_zeros--; | |
839 | } | |
840 | } | |
841 | //make a check if there is a block with all 0 | |
842 | uint64_t to_check_len = len; | |
843 | const char* data = buf; | |
844 | uint64_t skip = p2nphase(off, block_size); | |
845 | if (skip >= to_check_len) { | |
846 | return r; | |
847 | } | |
848 | to_check_len -= skip; | |
849 | data += skip; | |
850 | ||
851 | bool all_zeros = false; | |
852 | while (all_zeros == false && to_check_len >= block_size) { | |
853 | if (mem_is_zero(data, block_size)) { | |
854 | // at least one block is all zeros | |
855 | all_zeros = true; | |
856 | break; | |
857 | } | |
858 | data += block_size; | |
859 | to_check_len -= block_size; | |
860 | } | |
861 | if (all_zeros) { | |
862 | logger->inc(l_bluefs_read_zeros_candidate, 1); | |
863 | std::unique_ptr<char[]> data_reread(new char[len]); | |
20effc67 | 864 | r = _bdev_read_random(ndev, off, len, &data_reread[0], buffered); |
cd265ab1 TL |
865 | if (r != 0) { |
866 | return r; | |
867 | } | |
868 | // check if both read gave the same | |
869 | if (memcmp(buf, &data_reread[0], len) != 0) { | |
870 | derr << __func__ << " initial read of " << int(ndev) | |
871 | << ": 0x" << std::hex << off << "~" << len | |
872 | << std::dec << ": different then re-read " << dendl; | |
873 | logger->inc(l_bluefs_read_zeros_errors, 1); | |
874 | // second read is probably better | |
875 | memcpy(buf, &data_reread[0], len); | |
876 | } | |
877 | } | |
878 | return r; | |
879 | } | |
880 | ||
20effc67 TL |
881 | int BlueFS::_bdev_read(uint8_t ndev, uint64_t off, uint64_t len, |
882 | ceph::buffer::list* pbl, IOContext* ioc, bool buffered) | |
883 | { | |
884 | int cnt = 0; | |
885 | switch (ndev) { | |
886 | case BDEV_WAL: cnt = l_bluefs_read_disk_bytes_wal; break; | |
887 | case BDEV_DB: cnt = l_bluefs_read_disk_bytes_db; break; | |
888 | case BDEV_SLOW: cnt = l_bluefs_read_disk_bytes_slow; break; | |
889 | ||
890 | } | |
891 | if (cnt) { | |
892 | logger->inc(cnt, len); | |
893 | } | |
894 | return bdev[ndev]->read(off, len, pbl, ioc, buffered); | |
895 | } | |
896 | ||
897 | int BlueFS::_bdev_read_random(uint8_t ndev, uint64_t off, uint64_t len, | |
898 | char* buf, bool buffered) | |
899 | { | |
900 | int cnt = 0; | |
901 | switch (ndev) { | |
902 | case BDEV_WAL: cnt = l_bluefs_read_random_disk_bytes_wal; break; | |
903 | case BDEV_DB: cnt = l_bluefs_read_random_disk_bytes_db; break; | |
904 | case BDEV_SLOW: cnt = l_bluefs_read_random_disk_bytes_slow; break; | |
905 | } | |
906 | if (cnt) { | |
907 | logger->inc(cnt, len); | |
908 | } | |
909 | return bdev[ndev]->read_random(off, len, buf, buffered); | |
910 | } | |
911 | ||
7c673cae FG |
912 | int BlueFS::mount() |
913 | { | |
914 | dout(1) << __func__ << dendl; | |
915 | ||
20effc67 | 916 | _init_logger(); |
7c673cae FG |
917 | int r = _open_super(); |
918 | if (r < 0) { | |
919 | derr << __func__ << " failed to open super: " << cpp_strerror(r) << dendl; | |
920 | goto out; | |
921 | } | |
922 | ||
9f95a23c TL |
923 | // set volume selector if not provided before/outside |
924 | if (vselector == nullptr) { | |
925 | vselector.reset( | |
926 | new OriginalVolumeSelector( | |
927 | get_block_device_size(BlueFS::BDEV_WAL) * 95 / 100, | |
928 | get_block_device_size(BlueFS::BDEV_DB) * 95 / 100, | |
929 | get_block_device_size(BlueFS::BDEV_SLOW) * 95 / 100)); | |
930 | } | |
931 | ||
7c673cae FG |
932 | _init_alloc(); |
933 | ||
11fdf7f2 | 934 | r = _replay(false, false); |
7c673cae FG |
935 | if (r < 0) { |
936 | derr << __func__ << " failed to replay log: " << cpp_strerror(r) << dendl; | |
937 | _stop_alloc(); | |
938 | goto out; | |
939 | } | |
940 | ||
941 | // init freelist | |
20effc67 | 942 | for (auto& p : nodes.file_map) { |
7c673cae FG |
943 | dout(30) << __func__ << " noting alloc for " << p.second->fnode << dendl; |
944 | for (auto& q : p.second->fnode.extents) { | |
f67539c2 TL |
945 | bool is_shared = is_shared_alloc(q.bdev); |
946 | ceph_assert(!is_shared || (is_shared && shared_alloc)); | |
947 | if (is_shared && shared_alloc->need_init && shared_alloc->a) { | |
948 | shared_alloc->bluefs_used += q.length; | |
949 | alloc[q.bdev]->init_rm_free(q.offset, q.length); | |
950 | } else if (!is_shared) { | |
951 | alloc[q.bdev]->init_rm_free(q.offset, q.length); | |
952 | } | |
7c673cae FG |
953 | } |
954 | } | |
f67539c2 TL |
955 | if (shared_alloc) { |
956 | shared_alloc->need_init = false; | |
957 | dout(1) << __func__ << " shared_bdev_used = " | |
958 | << shared_alloc->bluefs_used << dendl; | |
959 | } else { | |
960 | dout(1) << __func__ << " shared bdev not used" | |
961 | << dendl; | |
962 | } | |
7c673cae FG |
963 | |
964 | // set up the log for future writes | |
20effc67 TL |
965 | log.writer = _create_writer(_get_file(1)); |
966 | ceph_assert(log.writer->file->fnode.ino == 1); | |
967 | log.writer->pos = log.writer->file->fnode.size; | |
968 | log.writer->file->fnode.reset_delta(); | |
7c673cae | 969 | dout(10) << __func__ << " log write pos set to 0x" |
20effc67 | 970 | << std::hex << log.writer->pos << std::dec |
7c673cae | 971 | << dendl; |
20effc67 TL |
972 | // update log size |
973 | logger->set(l_bluefs_log_bytes, log.writer->file->fnode.size); | |
7c673cae FG |
974 | return 0; |
975 | ||
976 | out: | |
977 | super = bluefs_super_t(); | |
978 | return r; | |
979 | } | |
980 | ||
9f95a23c TL |
981 | int BlueFS::maybe_verify_layout(const bluefs_layout_t& layout) const |
982 | { | |
983 | if (super.memorized_layout) { | |
984 | if (layout == *super.memorized_layout) { | |
985 | dout(10) << __func__ << " bluefs layout verified positively" << dendl; | |
986 | } else { | |
987 | derr << __func__ << " memorized layout doesn't fit current one" << dendl; | |
988 | return -EIO; | |
989 | } | |
990 | } else { | |
991 | dout(10) << __func__ << " no memorized_layout in bluefs superblock" | |
992 | << dendl; | |
993 | } | |
994 | ||
995 | return 0; | |
996 | } | |
997 | ||
1911f103 | 998 | void BlueFS::umount(bool avoid_compact) |
7c673cae FG |
999 | { |
1000 | dout(1) << __func__ << dendl; | |
1001 | ||
1911f103 | 1002 | sync_metadata(avoid_compact); |
20effc67 TL |
1003 | if (cct->_conf->bluefs_check_volume_selector_on_umount) { |
1004 | _check_vselector_LNF(); | |
1005 | } | |
1006 | _close_writer(log.writer); | |
1007 | log.writer = NULL; | |
1008 | log.t.clear(); | |
7c673cae | 1009 | |
9f95a23c | 1010 | vselector.reset(nullptr); |
7c673cae | 1011 | _stop_alloc(); |
20effc67 TL |
1012 | nodes.file_map.clear(); |
1013 | nodes.dir_map.clear(); | |
7c673cae | 1014 | super = bluefs_super_t(); |
7c673cae FG |
1015 | _shutdown_logger(); |
1016 | } | |
1017 | ||
9f95a23c | 1018 | int BlueFS::prepare_new_device(int id, const bluefs_layout_t& layout) |
7c673cae | 1019 | { |
11fdf7f2 TL |
1020 | dout(1) << __func__ << dendl; |
1021 | ||
1022 | if(id == BDEV_NEWDB) { | |
1023 | int new_log_dev_cur = BDEV_WAL; | |
1024 | int new_log_dev_next = BDEV_WAL; | |
1025 | if (!bdev[BDEV_WAL]) { | |
1026 | new_log_dev_cur = BDEV_NEWDB; | |
1027 | new_log_dev_next = BDEV_DB; | |
1028 | } | |
20effc67 | 1029 | _rewrite_log_and_layout_sync_LNF_LD(false, |
11fdf7f2 TL |
1030 | BDEV_NEWDB, |
1031 | new_log_dev_cur, | |
1032 | new_log_dev_next, | |
9f95a23c TL |
1033 | RENAME_DB2SLOW, |
1034 | layout); | |
11fdf7f2 | 1035 | } else if(id == BDEV_NEWWAL) { |
20effc67 | 1036 | _rewrite_log_and_layout_sync_LNF_LD(false, |
9f95a23c TL |
1037 | BDEV_DB, |
1038 | BDEV_NEWWAL, | |
1039 | BDEV_WAL, | |
1040 | REMOVE_WAL, | |
1041 | layout); | |
11fdf7f2 TL |
1042 | } else { |
1043 | assert(false); | |
1044 | } | |
1045 | return 0; | |
1046 | } | |
1047 | ||
1048 | void BlueFS::collect_metadata(map<string,string> *pm, unsigned skip_bdev_id) | |
1049 | { | |
1050 | if (skip_bdev_id != BDEV_DB && bdev[BDEV_DB]) | |
7c673cae FG |
1051 | bdev[BDEV_DB]->collect_metadata("bluefs_db_", pm); |
1052 | if (bdev[BDEV_WAL]) | |
1053 | bdev[BDEV_WAL]->collect_metadata("bluefs_wal_", pm); | |
11fdf7f2 TL |
1054 | } |
1055 | ||
1056 | void BlueFS::get_devices(set<string> *ls) | |
1057 | { | |
1058 | for (unsigned i = 0; i < MAX_BDEV; ++i) { | |
1059 | if (bdev[i]) { | |
1060 | bdev[i]->get_devices(ls); | |
1061 | } | |
1062 | } | |
7c673cae FG |
1063 | } |
1064 | ||
1065 | int BlueFS::fsck() | |
1066 | { | |
7c673cae FG |
1067 | dout(1) << __func__ << dendl; |
1068 | // hrm, i think we check everything on mount... | |
1069 | return 0; | |
1070 | } | |
1071 | ||
11fdf7f2 | 1072 | int BlueFS::_write_super(int dev) |
7c673cae | 1073 | { |
39ae355f | 1074 | ++super.version; |
7c673cae FG |
1075 | // build superblock |
1076 | bufferlist bl; | |
11fdf7f2 | 1077 | encode(super, bl); |
7c673cae | 1078 | uint32_t crc = bl.crc32c(-1); |
11fdf7f2 | 1079 | encode(crc, bl); |
7c673cae FG |
1080 | dout(10) << __func__ << " super block length(encoded): " << bl.length() << dendl; |
1081 | dout(10) << __func__ << " superblock " << super.version << dendl; | |
1082 | dout(10) << __func__ << " log_fnode " << super.log_fnode << dendl; | |
9f95a23c | 1083 | ceph_assert_always(bl.length() <= get_super_length()); |
7c673cae FG |
1084 | bl.append_zero(get_super_length() - bl.length()); |
1085 | ||
11fdf7f2 | 1086 | bdev[dev]->write(get_super_offset(), bl, false, WRITE_LIFE_SHORT); |
7c673cae FG |
1087 | dout(20) << __func__ << " v " << super.version |
1088 | << " crc 0x" << std::hex << crc | |
1089 | << " offset 0x" << get_super_offset() << std::dec | |
1090 | << dendl; | |
1091 | return 0; | |
1092 | } | |
1093 | ||
1094 | int BlueFS::_open_super() | |
1095 | { | |
1096 | dout(10) << __func__ << dendl; | |
1097 | ||
1098 | bufferlist bl; | |
1099 | uint32_t expected_crc, crc; | |
1100 | int r; | |
1101 | ||
1102 | // always the second block | |
20effc67 TL |
1103 | r = _bdev_read(BDEV_DB, get_super_offset(), get_super_length(), |
1104 | &bl, ioc[BDEV_DB], false); | |
7c673cae FG |
1105 | if (r < 0) |
1106 | return r; | |
1107 | ||
11fdf7f2 TL |
1108 | auto p = bl.cbegin(); |
1109 | decode(super, p); | |
7c673cae FG |
1110 | { |
1111 | bufferlist t; | |
1112 | t.substr_of(bl, 0, p.get_off()); | |
1113 | crc = t.crc32c(-1); | |
1114 | } | |
11fdf7f2 | 1115 | decode(expected_crc, p); |
7c673cae FG |
1116 | if (crc != expected_crc) { |
1117 | derr << __func__ << " bad crc on superblock, expected 0x" | |
1118 | << std::hex << expected_crc << " != actual 0x" << crc << std::dec | |
1119 | << dendl; | |
1120 | return -EIO; | |
1121 | } | |
1122 | dout(10) << __func__ << " superblock " << super.version << dendl; | |
1123 | dout(10) << __func__ << " log_fnode " << super.log_fnode << dendl; | |
1124 | return 0; | |
1125 | } | |
1126 | ||
20effc67 TL |
1127 | int BlueFS::_check_allocations(const bluefs_fnode_t& fnode, |
1128 | boost::dynamic_bitset<uint64_t>* used_blocks, | |
1129 | bool is_alloc, //true when allocating, false when deallocating | |
1130 | const char* op_name) | |
9f95a23c TL |
1131 | { |
1132 | auto& fnode_extents = fnode.extents; | |
1133 | for (auto e : fnode_extents) { | |
1134 | auto id = e.bdev; | |
1135 | bool fail = false; | |
20effc67 | 1136 | ceph_assert(id < MAX_BDEV); |
39ae355f TL |
1137 | ceph_assert(bdev[id]); |
1138 | // let's use minimal allocation unit we can have | |
1139 | auto alloc_unit = bdev[id]->get_block_size(); | |
1140 | ||
20effc67 | 1141 | if (int r = _verify_alloc_granularity(id, e.offset, e.length, |
39ae355f | 1142 | alloc_unit, |
20effc67 TL |
1143 | op_name); r < 0) { |
1144 | return r; | |
1145 | } | |
9f95a23c | 1146 | |
39ae355f | 1147 | apply_for_bitset_range(e.offset, e.length, alloc_unit, used_blocks[id], |
9f95a23c | 1148 | [&](uint64_t pos, boost::dynamic_bitset<uint64_t> &bs) { |
20effc67 TL |
1149 | if (is_alloc == bs.test(pos)) { |
1150 | fail = true; | |
1151 | } else { | |
1152 | bs.flip(pos); | |
1153 | } | |
9f95a23c TL |
1154 | } |
1155 | ); | |
1156 | if (fail) { | |
20effc67 TL |
1157 | derr << __func__ << " " << op_name << " invalid extent " << int(e.bdev) |
1158 | << ": 0x" << std::hex << e.offset << "~" << e.length << std::dec | |
1159 | << (is_alloc == true ? | |
1160 | ": duplicate reference, ino " : ": double free, ino ") | |
1161 | << fnode.ino << dendl; | |
9f95a23c TL |
1162 | return -EFAULT; |
1163 | } | |
1164 | } | |
1165 | return 0; | |
1166 | } | |
1167 | ||
9f95a23c | 1168 | int BlueFS::_verify_alloc_granularity( |
39ae355f | 1169 | __u8 id, uint64_t offset, uint64_t length, uint64_t alloc_unit, const char *op) |
9f95a23c | 1170 | { |
39ae355f TL |
1171 | if ((offset & (alloc_unit - 1)) || |
1172 | (length & (alloc_unit - 1))) { | |
9f95a23c TL |
1173 | derr << __func__ << " " << op << " of " << (int)id |
1174 | << ":0x" << std::hex << offset << "~" << length << std::dec | |
1175 | << " does not align to alloc_size 0x" | |
39ae355f | 1176 | << std::hex << alloc_unit << std::dec << dendl; |
9f95a23c TL |
1177 | return -EFAULT; |
1178 | } | |
1179 | return 0; | |
1180 | } | |
1181 | ||
11fdf7f2 | 1182 | int BlueFS::_replay(bool noop, bool to_stdout) |
7c673cae FG |
1183 | { |
1184 | dout(10) << __func__ << (noop ? " NO-OP" : "") << dendl; | |
1185 | ino_last = 1; // by the log | |
20effc67 | 1186 | uint64_t log_seq = 0; |
7c673cae FG |
1187 | |
1188 | FileRef log_file; | |
11fdf7f2 | 1189 | log_file = _get_file(1); |
9f95a23c | 1190 | |
f67539c2 | 1191 | log_file->fnode = super.log_fnode; |
11fdf7f2 | 1192 | if (!noop) { |
9f95a23c | 1193 | log_file->vselector_hint = |
f6b5b4d7 | 1194 | vselector->get_hint_for_log(); |
7c673cae | 1195 | } |
7c673cae | 1196 | dout(10) << __func__ << " log_fnode " << super.log_fnode << dendl; |
11fdf7f2 TL |
1197 | if (unlikely(to_stdout)) { |
1198 | std::cout << " log_fnode " << super.log_fnode << std::endl; | |
1199 | } | |
7c673cae FG |
1200 | |
1201 | FileReader *log_reader = new FileReader( | |
1202 | log_file, cct->_conf->bluefs_max_prefetch, | |
1203 | false, // !random | |
1204 | true); // ignore eof | |
9f95a23c TL |
1205 | |
1206 | bool seen_recs = false; | |
1207 | ||
1208 | boost::dynamic_bitset<uint64_t> used_blocks[MAX_BDEV]; | |
9f95a23c | 1209 | |
f67539c2 TL |
1210 | if (!noop) { |
1211 | if (cct->_conf->bluefs_log_replay_check_allocations) { | |
1212 | for (size_t i = 0; i < MAX_BDEV; ++i) { | |
39ae355f TL |
1213 | if (bdev[i] != nullptr) { |
1214 | // let's use minimal allocation unit we can have | |
1215 | auto au = bdev[i]->get_block_size(); | |
1216 | //hmm... on 32TB/4K drive this would take 1GB RAM!!! | |
1217 | used_blocks[i].resize(round_up_to(bdev[i]->get_size(), au) / au); | |
f67539c2 | 1218 | } |
9f95a23c | 1219 | } |
20effc67 TL |
1220 | // check initial log layout |
1221 | int r = _check_allocations(log_file->fnode, | |
1222 | used_blocks, true, "Log from super"); | |
1223 | if (r < 0) { | |
1224 | return r; | |
1225 | } | |
9f95a23c TL |
1226 | } |
1227 | } | |
1228 | ||
7c673cae | 1229 | while (true) { |
11fdf7f2 | 1230 | ceph_assert((log_reader->buf.pos & ~super.block_mask()) == 0); |
7c673cae FG |
1231 | uint64_t pos = log_reader->buf.pos; |
1232 | uint64_t read_pos = pos; | |
1233 | bufferlist bl; | |
1234 | { | |
f67539c2 | 1235 | int r = _read(log_reader, read_pos, super.block_size, |
7c673cae | 1236 | &bl, NULL); |
f6b5b4d7 | 1237 | if (r != (int)super.block_size && cct->_conf->bluefs_replay_recovery) { |
20effc67 | 1238 | r += _do_replay_recovery_read(log_reader, pos, read_pos + r, super.block_size - r, &bl); |
f6b5b4d7 TL |
1239 | } |
1240 | assert(r == (int)super.block_size); | |
7c673cae FG |
1241 | read_pos += r; |
1242 | } | |
1243 | uint64_t more = 0; | |
1244 | uint64_t seq; | |
1245 | uuid_d uuid; | |
1246 | { | |
11fdf7f2 | 1247 | auto p = bl.cbegin(); |
7c673cae FG |
1248 | __u8 a, b; |
1249 | uint32_t len; | |
11fdf7f2 TL |
1250 | decode(a, p); |
1251 | decode(b, p); | |
1252 | decode(len, p); | |
1253 | decode(uuid, p); | |
1254 | decode(seq, p); | |
7c673cae | 1255 | if (len + 6 > bl.length()) { |
11fdf7f2 | 1256 | more = round_up_to(len + 6 - bl.length(), super.block_size); |
7c673cae FG |
1257 | } |
1258 | } | |
1259 | if (uuid != super.uuid) { | |
9f95a23c TL |
1260 | if (seen_recs) { |
1261 | dout(10) << __func__ << " 0x" << std::hex << pos << std::dec | |
1262 | << ": stop: uuid " << uuid << " != super.uuid " << super.uuid | |
1263 | << dendl; | |
1264 | } else { | |
1265 | derr << __func__ << " 0x" << std::hex << pos << std::dec | |
1266 | << ": stop: uuid " << uuid << " != super.uuid " << super.uuid | |
1267 | << ", block dump: \n"; | |
1268 | bufferlist t; | |
1269 | t.substr_of(bl, 0, super.block_size); | |
1270 | t.hexdump(*_dout); | |
1271 | *_dout << dendl; | |
1272 | } | |
7c673cae FG |
1273 | break; |
1274 | } | |
1275 | if (seq != log_seq + 1) { | |
9f95a23c TL |
1276 | if (seen_recs) { |
1277 | dout(10) << __func__ << " 0x" << std::hex << pos << std::dec | |
1278 | << ": stop: seq " << seq << " != expected " << log_seq + 1 | |
1279 | << dendl;; | |
1280 | } else { | |
1281 | derr << __func__ << " 0x" << std::hex << pos << std::dec | |
1282 | << ": stop: seq " << seq << " != expected " << log_seq + 1 | |
1283 | << dendl;; | |
1284 | } | |
7c673cae FG |
1285 | break; |
1286 | } | |
1287 | if (more) { | |
1288 | dout(20) << __func__ << " need 0x" << std::hex << more << std::dec | |
1289 | << " more bytes" << dendl; | |
1290 | bufferlist t; | |
f67539c2 | 1291 | int r = _read(log_reader, read_pos, more, &t, NULL); |
7c673cae | 1292 | if (r < (int)more) { |
f6b5b4d7 TL |
1293 | dout(10) << __func__ << " 0x" << std::hex << pos |
1294 | << ": stop: len is 0x" << bl.length() + more << std::dec | |
1295 | << ", which is past eof" << dendl; | |
1296 | if (cct->_conf->bluefs_replay_recovery) { | |
1297 | //try to search for more data | |
20effc67 | 1298 | r += _do_replay_recovery_read(log_reader, pos, read_pos + r, more - r, &t); |
f6b5b4d7 TL |
1299 | if (r < (int)more) { |
1300 | //in normal mode we must read r==more, for recovery it is too strict | |
1301 | break; | |
1302 | } | |
1303 | } | |
7c673cae | 1304 | } |
11fdf7f2 | 1305 | ceph_assert(r == (int)more); |
7c673cae FG |
1306 | bl.claim_append(t); |
1307 | read_pos += r; | |
1308 | } | |
1309 | bluefs_transaction_t t; | |
1310 | try { | |
11fdf7f2 TL |
1311 | auto p = bl.cbegin(); |
1312 | decode(t, p); | |
522d829b | 1313 | seen_recs = true; |
7c673cae | 1314 | } |
f67539c2 | 1315 | catch (ceph::buffer::error& e) { |
522d829b TL |
1316 | // Multi-block transactions might be incomplete due to unexpected |
1317 | // power off. Hence let's treat that as a regular stop condition. | |
1318 | if (seen_recs && more) { | |
1319 | dout(10) << __func__ << " 0x" << std::hex << pos << std::dec | |
1320 | << ": stop: failed to decode: " << e.what() | |
1321 | << dendl; | |
1322 | } else { | |
1323 | derr << __func__ << " 0x" << std::hex << pos << std::dec | |
1324 | << ": stop: failed to decode: " << e.what() | |
1325 | << dendl; | |
1326 | delete log_reader; | |
1327 | return -EIO; | |
1328 | } | |
1329 | break; | |
7c673cae | 1330 | } |
11fdf7f2 | 1331 | ceph_assert(seq == t.seq); |
7c673cae FG |
1332 | dout(10) << __func__ << " 0x" << std::hex << pos << std::dec |
1333 | << ": " << t << dendl; | |
11fdf7f2 TL |
1334 | if (unlikely(to_stdout)) { |
1335 | std::cout << " 0x" << std::hex << pos << std::dec | |
1336 | << ": " << t << std::endl; | |
1337 | } | |
7c673cae | 1338 | |
11fdf7f2 | 1339 | auto p = t.op_bl.cbegin(); |
39ae355f | 1340 | auto pos0 = pos; |
7c673cae | 1341 | while (!p.end()) { |
39ae355f | 1342 | pos = pos0 + p.get_off(); |
7c673cae | 1343 | __u8 op; |
11fdf7f2 | 1344 | decode(op, p); |
7c673cae FG |
1345 | switch (op) { |
1346 | ||
1347 | case bluefs_transaction_t::OP_INIT: | |
1348 | dout(20) << __func__ << " 0x" << std::hex << pos << std::dec | |
1349 | << ": op_init" << dendl; | |
11fdf7f2 TL |
1350 | if (unlikely(to_stdout)) { |
1351 | std::cout << " 0x" << std::hex << pos << std::dec | |
1352 | << ": op_init" << std::endl; | |
1353 | } | |
1354 | ||
1355 | ceph_assert(t.seq == 1); | |
7c673cae FG |
1356 | break; |
1357 | ||
1358 | case bluefs_transaction_t::OP_JUMP: | |
1359 | { | |
1360 | uint64_t next_seq; | |
1361 | uint64_t offset; | |
11fdf7f2 TL |
1362 | decode(next_seq, p); |
1363 | decode(offset, p); | |
7c673cae FG |
1364 | dout(20) << __func__ << " 0x" << std::hex << pos << std::dec |
1365 | << ": op_jump seq " << next_seq | |
1366 | << " offset 0x" << std::hex << offset << std::dec << dendl; | |
11fdf7f2 TL |
1367 | if (unlikely(to_stdout)) { |
1368 | std::cout << " 0x" << std::hex << pos << std::dec | |
1369 | << ": op_jump seq " << next_seq | |
1370 | << " offset 0x" << std::hex << offset << std::dec | |
1371 | << std::endl; | |
1372 | } | |
1373 | ||
20effc67 | 1374 | ceph_assert(next_seq > log_seq); |
7c673cae FG |
1375 | log_seq = next_seq - 1; // we will increment it below |
1376 | uint64_t skip = offset - read_pos; | |
1377 | if (skip) { | |
1378 | bufferlist junk; | |
f67539c2 | 1379 | int r = _read(log_reader, read_pos, skip, &junk, |
7c673cae FG |
1380 | NULL); |
1381 | if (r != (int)skip) { | |
1382 | dout(10) << __func__ << " 0x" << std::hex << read_pos | |
1383 | << ": stop: failed to skip to " << offset | |
1384 | << std::dec << dendl; | |
11fdf7f2 | 1385 | ceph_abort_msg("problem with op_jump"); |
7c673cae FG |
1386 | } |
1387 | } | |
1388 | } | |
1389 | break; | |
1390 | ||
1391 | case bluefs_transaction_t::OP_JUMP_SEQ: | |
1392 | { | |
1393 | uint64_t next_seq; | |
11fdf7f2 | 1394 | decode(next_seq, p); |
7c673cae FG |
1395 | dout(20) << __func__ << " 0x" << std::hex << pos << std::dec |
1396 | << ": op_jump_seq " << next_seq << dendl; | |
11fdf7f2 TL |
1397 | if (unlikely(to_stdout)) { |
1398 | std::cout << " 0x" << std::hex << pos << std::dec | |
1399 | << ": op_jump_seq " << next_seq << std::endl; | |
1400 | } | |
1401 | ||
20effc67 | 1402 | ceph_assert(next_seq > log_seq); |
7c673cae FG |
1403 | log_seq = next_seq - 1; // we will increment it below |
1404 | } | |
1405 | break; | |
1406 | ||
1407 | case bluefs_transaction_t::OP_ALLOC_ADD: | |
f67539c2 | 1408 | // LEGACY, do nothing but read params |
7c673cae | 1409 | { |
f67539c2 TL |
1410 | __u8 id; |
1411 | uint64_t offset, length; | |
1412 | decode(id, p); | |
1413 | decode(offset, p); | |
1414 | decode(length, p); | |
1415 | } | |
7c673cae FG |
1416 | break; |
1417 | ||
1418 | case bluefs_transaction_t::OP_ALLOC_RM: | |
f67539c2 | 1419 | // LEGACY, do nothing but read params |
7c673cae | 1420 | { |
f67539c2 TL |
1421 | __u8 id; |
1422 | uint64_t offset, length; | |
1423 | decode(id, p); | |
1424 | decode(offset, p); | |
1425 | decode(length, p); | |
1426 | } | |
1427 | break; | |
7c673cae FG |
1428 | |
1429 | case bluefs_transaction_t::OP_DIR_LINK: | |
1430 | { | |
1431 | string dirname, filename; | |
1432 | uint64_t ino; | |
11fdf7f2 TL |
1433 | decode(dirname, p); |
1434 | decode(filename, p); | |
1435 | decode(ino, p); | |
7c673cae FG |
1436 | dout(20) << __func__ << " 0x" << std::hex << pos << std::dec |
1437 | << ": op_dir_link " << " " << dirname << "/" << filename | |
1438 | << " to " << ino | |
1439 | << dendl; | |
11fdf7f2 TL |
1440 | if (unlikely(to_stdout)) { |
1441 | std::cout << " 0x" << std::hex << pos << std::dec | |
1442 | << ": op_dir_link " << " " << dirname << "/" << filename | |
1443 | << " to " << ino | |
1444 | << std::endl; | |
1445 | } | |
1446 | ||
7c673cae FG |
1447 | if (!noop) { |
1448 | FileRef file = _get_file(ino); | |
11fdf7f2 | 1449 | ceph_assert(file->fnode.ino); |
20effc67 TL |
1450 | map<string,DirRef>::iterator q = nodes.dir_map.find(dirname); |
1451 | ceph_assert(q != nodes.dir_map.end()); | |
7c673cae | 1452 | map<string,FileRef>::iterator r = q->second->file_map.find(filename); |
11fdf7f2 | 1453 | ceph_assert(r == q->second->file_map.end()); |
9f95a23c TL |
1454 | |
1455 | vselector->sub_usage(file->vselector_hint, file->fnode); | |
1456 | file->vselector_hint = | |
1457 | vselector->get_hint_by_dir(dirname); | |
1458 | vselector->add_usage(file->vselector_hint, file->fnode); | |
1459 | ||
7c673cae FG |
1460 | q->second->file_map[filename] = file; |
1461 | ++file->refs; | |
1462 | } | |
1463 | } | |
1464 | break; | |
1465 | ||
1466 | case bluefs_transaction_t::OP_DIR_UNLINK: | |
1467 | { | |
1468 | string dirname, filename; | |
11fdf7f2 TL |
1469 | decode(dirname, p); |
1470 | decode(filename, p); | |
7c673cae FG |
1471 | dout(20) << __func__ << " 0x" << std::hex << pos << std::dec |
1472 | << ": op_dir_unlink " << " " << dirname << "/" << filename | |
1473 | << dendl; | |
11fdf7f2 TL |
1474 | if (unlikely(to_stdout)) { |
1475 | std::cout << " 0x" << std::hex << pos << std::dec | |
1476 | << ": op_dir_unlink " << " " << dirname << "/" << filename | |
1477 | << std::endl; | |
1478 | } | |
1479 | ||
7c673cae | 1480 | if (!noop) { |
20effc67 TL |
1481 | map<string,DirRef>::iterator q = nodes.dir_map.find(dirname); |
1482 | ceph_assert(q != nodes.dir_map.end()); | |
7c673cae | 1483 | map<string,FileRef>::iterator r = q->second->file_map.find(filename); |
11fdf7f2 TL |
1484 | ceph_assert(r != q->second->file_map.end()); |
1485 | ceph_assert(r->second->refs > 0); | |
7c673cae FG |
1486 | --r->second->refs; |
1487 | q->second->file_map.erase(r); | |
1488 | } | |
1489 | } | |
1490 | break; | |
1491 | ||
1492 | case bluefs_transaction_t::OP_DIR_CREATE: | |
1493 | { | |
1494 | string dirname; | |
11fdf7f2 | 1495 | decode(dirname, p); |
7c673cae FG |
1496 | dout(20) << __func__ << " 0x" << std::hex << pos << std::dec |
1497 | << ": op_dir_create " << dirname << dendl; | |
11fdf7f2 TL |
1498 | if (unlikely(to_stdout)) { |
1499 | std::cout << " 0x" << std::hex << pos << std::dec | |
1500 | << ": op_dir_create " << dirname << std::endl; | |
1501 | } | |
1502 | ||
7c673cae | 1503 | if (!noop) { |
20effc67 TL |
1504 | map<string,DirRef>::iterator q = nodes.dir_map.find(dirname); |
1505 | ceph_assert(q == nodes.dir_map.end()); | |
1506 | nodes.dir_map[dirname] = ceph::make_ref<Dir>(); | |
7c673cae FG |
1507 | } |
1508 | } | |
1509 | break; | |
1510 | ||
1511 | case bluefs_transaction_t::OP_DIR_REMOVE: | |
1512 | { | |
1513 | string dirname; | |
11fdf7f2 | 1514 | decode(dirname, p); |
7c673cae FG |
1515 | dout(20) << __func__ << " 0x" << std::hex << pos << std::dec |
1516 | << ": op_dir_remove " << dirname << dendl; | |
11fdf7f2 TL |
1517 | if (unlikely(to_stdout)) { |
1518 | std::cout << " 0x" << std::hex << pos << std::dec | |
1519 | << ": op_dir_remove " << dirname << std::endl; | |
1520 | } | |
1521 | ||
7c673cae | 1522 | if (!noop) { |
20effc67 TL |
1523 | map<string,DirRef>::iterator q = nodes.dir_map.find(dirname); |
1524 | ceph_assert(q != nodes.dir_map.end()); | |
11fdf7f2 | 1525 | ceph_assert(q->second->file_map.empty()); |
20effc67 | 1526 | nodes.dir_map.erase(q); |
7c673cae FG |
1527 | } |
1528 | } | |
1529 | break; | |
1530 | ||
1531 | case bluefs_transaction_t::OP_FILE_UPDATE: | |
1532 | { | |
1533 | bluefs_fnode_t fnode; | |
11fdf7f2 | 1534 | decode(fnode, p); |
7c673cae | 1535 | dout(20) << __func__ << " 0x" << std::hex << pos << std::dec |
9f95a23c | 1536 | << ": op_file_update " << " " << fnode << " " << dendl; |
11fdf7f2 TL |
1537 | if (unlikely(to_stdout)) { |
1538 | std::cout << " 0x" << std::hex << pos << std::dec | |
1539 | << ": op_file_update " << " " << fnode << std::endl; | |
1540 | } | |
9f95a23c | 1541 | if (!noop) { |
7c673cae | 1542 | FileRef f = _get_file(fnode.ino); |
20effc67 TL |
1543 | if (cct->_conf->bluefs_log_replay_check_allocations) { |
1544 | int r = _check_allocations(f->fnode, | |
1545 | used_blocks, false, "OP_FILE_UPDATE"); | |
1546 | if (r < 0) { | |
1547 | return r; | |
9f95a23c TL |
1548 | } |
1549 | } | |
9f95a23c TL |
1550 | if (fnode.ino != 1) { |
1551 | vselector->sub_usage(f->vselector_hint, f->fnode); | |
1552 | } | |
1553 | f->fnode = fnode; | |
1554 | if (fnode.ino != 1) { | |
1555 | vselector->add_usage(f->vselector_hint, f->fnode); | |
1556 | } | |
1557 | ||
7c673cae FG |
1558 | if (fnode.ino > ino_last) { |
1559 | ino_last = fnode.ino; | |
1560 | } | |
9f95a23c | 1561 | if (cct->_conf->bluefs_log_replay_check_allocations) { |
20effc67 TL |
1562 | int r = _check_allocations(f->fnode, |
1563 | used_blocks, true, "OP_FILE_UPDATE"); | |
9f95a23c TL |
1564 | if (r < 0) { |
1565 | return r; | |
1566 | } | |
1567 | } | |
522d829b TL |
1568 | } else if (noop && fnode.ino == 1) { |
1569 | FileRef f = _get_file(fnode.ino); | |
1570 | f->fnode = fnode; | |
7c673cae | 1571 | } |
9f95a23c | 1572 | } |
7c673cae | 1573 | break; |
20effc67 TL |
1574 | case bluefs_transaction_t::OP_FILE_UPDATE_INC: |
1575 | { | |
1576 | bluefs_fnode_delta_t delta; | |
1577 | decode(delta, p); | |
1578 | dout(20) << __func__ << " 0x" << std::hex << pos << std::dec | |
1579 | << ": op_file_update_inc " << " " << delta << " " << dendl; | |
1580 | if (unlikely(to_stdout)) { | |
1581 | std::cout << " 0x" << std::hex << pos << std::dec | |
1582 | << ": op_file_update_inc " << " " << delta << std::endl; | |
1583 | } | |
1584 | if (!noop) { | |
1585 | FileRef f = _get_file(delta.ino); | |
1586 | bluefs_fnode_t& fnode = f->fnode; | |
1587 | if (delta.offset != fnode.allocated) { | |
1588 | derr << __func__ << " invalid op_file_update_inc, new extents miss end of file" | |
1589 | << " fnode=" << fnode | |
1590 | << " delta=" << delta | |
1591 | << dendl; | |
1592 | ceph_assert(delta.offset == fnode.allocated); | |
1593 | } | |
1594 | if (cct->_conf->bluefs_log_replay_check_allocations) { | |
1595 | int r = _check_allocations(fnode, | |
1596 | used_blocks, false, "OP_FILE_UPDATE_INC"); | |
1597 | if (r < 0) { | |
1598 | return r; | |
1599 | } | |
1600 | } | |
1601 | ||
1602 | fnode.ino = delta.ino; | |
1603 | fnode.mtime = delta.mtime; | |
1604 | if (fnode.ino != 1) { | |
1605 | vselector->sub_usage(f->vselector_hint, fnode); | |
1606 | } | |
1607 | fnode.size = delta.size; | |
1608 | fnode.claim_extents(delta.extents); | |
1609 | dout(20) << __func__ << " 0x" << std::hex << pos << std::dec | |
1610 | << ": op_file_update_inc produced " << " " << fnode << " " << dendl; | |
1611 | ||
1612 | if (fnode.ino != 1) { | |
1613 | vselector->add_usage(f->vselector_hint, fnode); | |
1614 | } | |
1615 | ||
1616 | if (fnode.ino > ino_last) { | |
1617 | ino_last = fnode.ino; | |
1618 | } | |
1619 | if (cct->_conf->bluefs_log_replay_check_allocations) { | |
1620 | int r = _check_allocations(f->fnode, | |
1621 | used_blocks, true, "OP_FILE_UPDATE_INC"); | |
1622 | if (r < 0) { | |
1623 | return r; | |
1624 | } | |
1625 | } | |
1626 | } else if (noop && delta.ino == 1) { | |
1627 | // we need to track bluefs log, even in noop mode | |
1628 | FileRef f = _get_file(1); | |
1629 | bluefs_fnode_t& fnode = f->fnode; | |
1630 | fnode.ino = delta.ino; | |
1631 | fnode.mtime = delta.mtime; | |
1632 | fnode.size = delta.size; | |
1633 | fnode.claim_extents(delta.extents); | |
1634 | } | |
1635 | } | |
1636 | break; | |
7c673cae FG |
1637 | |
1638 | case bluefs_transaction_t::OP_FILE_REMOVE: | |
1639 | { | |
1640 | uint64_t ino; | |
11fdf7f2 | 1641 | decode(ino, p); |
7c673cae FG |
1642 | dout(20) << __func__ << " 0x" << std::hex << pos << std::dec |
1643 | << ": op_file_remove " << ino << dendl; | |
11fdf7f2 TL |
1644 | if (unlikely(to_stdout)) { |
1645 | std::cout << " 0x" << std::hex << pos << std::dec | |
1646 | << ": op_file_remove " << ino << std::endl; | |
1647 | } | |
1648 | ||
9f95a23c | 1649 | if (!noop) { |
20effc67 TL |
1650 | auto p = nodes.file_map.find(ino); |
1651 | ceph_assert(p != nodes.file_map.end()); | |
9f95a23c TL |
1652 | vselector->sub_usage(p->second->vselector_hint, p->second->fnode); |
1653 | if (cct->_conf->bluefs_log_replay_check_allocations) { | |
20effc67 TL |
1654 | int r = _check_allocations(p->second->fnode, |
1655 | used_blocks, false, "OP_FILE_REMOVE"); | |
1656 | if (r < 0) { | |
1657 | return r; | |
9f95a23c TL |
1658 | } |
1659 | } | |
20effc67 | 1660 | nodes.file_map.erase(p); |
9f95a23c TL |
1661 | } |
1662 | } | |
7c673cae FG |
1663 | break; |
1664 | ||
1665 | default: | |
1666 | derr << __func__ << " 0x" << std::hex << pos << std::dec | |
1667 | << ": stop: unrecognized op " << (int)op << dendl; | |
1668 | delete log_reader; | |
1669 | return -EIO; | |
1670 | } | |
1671 | } | |
11fdf7f2 | 1672 | ceph_assert(p.end()); |
7c673cae FG |
1673 | |
1674 | // we successfully replayed the transaction; bump the seq and log size | |
1675 | ++log_seq; | |
1676 | log_file->fnode.size = log_reader->buf.pos; | |
1677 | } | |
f67539c2 TL |
1678 | if (!noop) { |
1679 | vselector->add_usage(log_file->vselector_hint, log_file->fnode); | |
20effc67 TL |
1680 | log.seq_live = log_seq + 1; |
1681 | dirty.seq_live = log_seq + 1; | |
1682 | log.t.seq = log.seq_live; | |
1683 | dirty.seq_stable = log_seq; | |
9f95a23c | 1684 | } |
7c673cae FG |
1685 | |
1686 | dout(10) << __func__ << " log file size was 0x" | |
1687 | << std::hex << log_file->fnode.size << std::dec << dendl; | |
11fdf7f2 TL |
1688 | if (unlikely(to_stdout)) { |
1689 | std::cout << " log file size was 0x" | |
1690 | << std::hex << log_file->fnode.size << std::dec << std::endl; | |
1691 | } | |
1692 | ||
7c673cae FG |
1693 | delete log_reader; |
1694 | ||
1695 | if (!noop) { | |
1696 | // verify file link counts are all >0 | |
20effc67 | 1697 | for (auto& p : nodes.file_map) { |
7c673cae FG |
1698 | if (p.second->refs == 0 && |
1699 | p.second->fnode.ino > 1) { | |
1700 | derr << __func__ << " file with link count 0: " << p.second->fnode | |
1701 | << dendl; | |
1702 | return -EIO; | |
1703 | } | |
1704 | } | |
1705 | } | |
20effc67 TL |
1706 | // reflect file count in logger |
1707 | logger->set(l_bluefs_num_files, nodes.file_map.size()); | |
7c673cae FG |
1708 | |
1709 | dout(10) << __func__ << " done" << dendl; | |
1710 | return 0; | |
1711 | } | |
1712 | ||
11fdf7f2 TL |
1713 | int BlueFS::log_dump() |
1714 | { | |
1715 | // only dump log file's content | |
20effc67 TL |
1716 | ceph_assert(log.writer == nullptr && "cannot log_dump on mounted BlueFS"); |
1717 | _init_logger(); | |
f67539c2 | 1718 | int r = _open_super(); |
11fdf7f2 | 1719 | if (r < 0) { |
f67539c2 | 1720 | derr << __func__ << " failed to open super: " << cpp_strerror(r) << dendl; |
11fdf7f2 TL |
1721 | return r; |
1722 | } | |
f67539c2 TL |
1723 | r = _replay(true, true); |
1724 | if (r < 0) { | |
1725 | derr << __func__ << " failed to replay log: " << cpp_strerror(r) << dendl; | |
1726 | } | |
1727 | _shutdown_logger(); | |
1728 | super = bluefs_super_t(); | |
1729 | return r; | |
11fdf7f2 TL |
1730 | } |
1731 | ||
1732 | int BlueFS::device_migrate_to_existing( | |
1733 | CephContext *cct, | |
1734 | const set<int>& devs_source, | |
9f95a23c TL |
1735 | int dev_target, |
1736 | const bluefs_layout_t& layout) | |
11fdf7f2 TL |
1737 | { |
1738 | vector<byte> buf; | |
1739 | bool buffered = cct->_conf->bluefs_buffered_io; | |
1740 | ||
eafe8130 TL |
1741 | dout(10) << __func__ << " devs_source " << devs_source |
1742 | << " dev_target " << dev_target << dendl; | |
11fdf7f2 TL |
1743 | assert(dev_target < (int)MAX_BDEV); |
1744 | ||
1745 | int flags = 0; | |
1746 | flags |= devs_source.count(BDEV_DB) ? | |
1747 | (REMOVE_DB | RENAME_SLOW2DB) : 0; | |
1748 | flags |= devs_source.count(BDEV_WAL) ? REMOVE_WAL : 0; | |
1749 | int dev_target_new = dev_target; | |
1750 | ||
1751 | // Slow device without separate DB one is addressed via BDEV_DB | |
1752 | // Hence need renaming. | |
1753 | if ((flags & REMOVE_DB) && dev_target == BDEV_SLOW) { | |
1754 | dev_target_new = BDEV_DB; | |
1755 | dout(0) << __func__ << " super to be written to " << dev_target << dendl; | |
1756 | } | |
1757 | ||
20effc67 | 1758 | for (auto& [ino, file_ref] : nodes.file_map) { |
11fdf7f2 | 1759 | //do not copy log |
39ae355f | 1760 | if (ino == 1) { |
11fdf7f2 TL |
1761 | continue; |
1762 | } | |
9f95a23c | 1763 | dout(10) << __func__ << " " << ino << " " << file_ref->fnode << dendl; |
eafe8130 | 1764 | |
20effc67 | 1765 | vselector->sub_usage(file_ref->vselector_hint, file_ref->fnode); |
11fdf7f2 | 1766 | |
9f95a23c | 1767 | bool rewrite = std::any_of( |
39ae355f TL |
1768 | file_ref->fnode.extents.begin(), |
1769 | file_ref->fnode.extents.end(), | |
9f95a23c TL |
1770 | [=](auto& ext) { |
1771 | return ext.bdev != dev_target && devs_source.count(ext.bdev); | |
1772 | }); | |
eafe8130 TL |
1773 | if (rewrite) { |
1774 | dout(10) << __func__ << " migrating" << dendl; | |
39ae355f TL |
1775 | bluefs_fnode_t old_fnode; |
1776 | old_fnode.swap_extents(file_ref->fnode); | |
1777 | auto& old_fnode_extents = old_fnode.extents; | |
eafe8130 TL |
1778 | // read entire file |
1779 | bufferlist bl; | |
39ae355f | 1780 | for (const auto &old_ext : old_fnode_extents) { |
eafe8130 | 1781 | buf.resize(old_ext.length); |
20effc67 | 1782 | int r = _bdev_read_random(old_ext.bdev, |
eafe8130 TL |
1783 | old_ext.offset, |
1784 | old_ext.length, | |
1785 | (char*)&buf.at(0), | |
1786 | buffered); | |
1787 | if (r != 0) { | |
1788 | derr << __func__ << " failed to read 0x" << std::hex | |
1789 | << old_ext.offset << "~" << old_ext.length << std::dec | |
1790 | << " from " << (int)dev_target << dendl; | |
1791 | return -EIO; | |
1792 | } | |
1793 | bl.append((char*)&buf[0], old_ext.length); | |
1794 | } | |
11fdf7f2 | 1795 | |
eafe8130 | 1796 | // write entire file |
39ae355f TL |
1797 | auto l = _allocate(dev_target, bl.length(), 0, |
1798 | &file_ref->fnode, 0, false); | |
eafe8130 TL |
1799 | if (l < 0) { |
1800 | derr << __func__ << " unable to allocate len 0x" << std::hex | |
1801 | << bl.length() << std::dec << " from " << (int)dev_target | |
1802 | << ": " << cpp_strerror(l) << dendl; | |
1803 | return -ENOSPC; | |
1804 | } | |
11fdf7f2 | 1805 | |
eafe8130 | 1806 | uint64_t off = 0; |
39ae355f | 1807 | for (auto& i : file_ref->fnode.extents) { |
eafe8130 TL |
1808 | bufferlist cur; |
1809 | uint64_t cur_len = std::min<uint64_t>(i.length, bl.length() - off); | |
1810 | ceph_assert(cur_len > 0); | |
1811 | cur.substr_of(bl, off, cur_len); | |
1812 | int r = bdev[dev_target]->write(i.offset, cur, buffered); | |
1813 | ceph_assert(r == 0); | |
1814 | off += cur_len; | |
1815 | } | |
1816 | ||
1817 | // release old extents | |
39ae355f | 1818 | for (const auto &old_ext : old_fnode_extents) { |
eafe8130 TL |
1819 | PExtentVector to_release; |
1820 | to_release.emplace_back(old_ext.offset, old_ext.length); | |
1821 | alloc[old_ext.bdev]->release(to_release); | |
f67539c2 TL |
1822 | if (is_shared_alloc(old_ext.bdev)) { |
1823 | shared_alloc->bluefs_used -= to_release.size(); | |
1824 | } | |
eafe8130 TL |
1825 | } |
1826 | ||
1827 | // update fnode | |
39ae355f TL |
1828 | for (auto& i : file_ref->fnode.extents) { |
1829 | i.bdev = dev_target_new; | |
eafe8130 TL |
1830 | } |
1831 | } else { | |
39ae355f | 1832 | for (auto& ext : file_ref->fnode.extents) { |
9f95a23c | 1833 | if (dev_target != dev_target_new && ext.bdev == dev_target) { |
eafe8130 | 1834 | dout(20) << __func__ << " " << " ... adjusting extent 0x" |
9f95a23c | 1835 | << std::hex << ext.offset << std::dec |
eafe8130 TL |
1836 | << " bdev " << dev_target << " -> " << dev_target_new |
1837 | << dendl; | |
9f95a23c | 1838 | ext.bdev = dev_target_new; |
11fdf7f2 | 1839 | } |
11fdf7f2 TL |
1840 | } |
1841 | } | |
20effc67 | 1842 | vselector->add_usage(file_ref->vselector_hint, file_ref->fnode); |
11fdf7f2 TL |
1843 | } |
1844 | // new logging device in the current naming scheme | |
1845 | int new_log_dev_cur = bdev[BDEV_WAL] ? | |
1846 | BDEV_WAL : | |
1847 | bdev[BDEV_DB] ? BDEV_DB : BDEV_SLOW; | |
1848 | ||
1849 | // new logging device in new naming scheme | |
1850 | int new_log_dev_next = new_log_dev_cur; | |
1851 | ||
1852 | if (devs_source.count(new_log_dev_cur)) { | |
1853 | // SLOW device is addressed via BDEV_DB too hence either WAL or DB | |
1854 | new_log_dev_next = (flags & REMOVE_WAL) || !bdev[BDEV_WAL] ? | |
1855 | BDEV_DB : | |
1856 | BDEV_WAL; | |
1857 | ||
1858 | dout(0) << __func__ << " log moved from " << new_log_dev_cur | |
1859 | << " to " << new_log_dev_next << dendl; | |
1860 | ||
1861 | new_log_dev_cur = | |
1862 | (flags & REMOVE_DB) && new_log_dev_next == BDEV_DB ? | |
1863 | BDEV_SLOW : | |
1864 | new_log_dev_next; | |
1865 | } | |
1866 | ||
20effc67 | 1867 | _rewrite_log_and_layout_sync_LNF_LD( |
11fdf7f2 TL |
1868 | false, |
1869 | (flags & REMOVE_DB) ? BDEV_SLOW : BDEV_DB, | |
1870 | new_log_dev_cur, | |
1871 | new_log_dev_next, | |
9f95a23c TL |
1872 | flags, |
1873 | layout); | |
11fdf7f2 TL |
1874 | return 0; |
1875 | } | |
1876 | ||
1877 | int BlueFS::device_migrate_to_new( | |
1878 | CephContext *cct, | |
1879 | const set<int>& devs_source, | |
9f95a23c TL |
1880 | int dev_target, |
1881 | const bluefs_layout_t& layout) | |
11fdf7f2 TL |
1882 | { |
1883 | vector<byte> buf; | |
1884 | bool buffered = cct->_conf->bluefs_buffered_io; | |
1885 | ||
eafe8130 TL |
1886 | dout(10) << __func__ << " devs_source " << devs_source |
1887 | << " dev_target " << dev_target << dendl; | |
20effc67 | 1888 | assert(dev_target == (int)BDEV_NEWDB || dev_target == (int)BDEV_NEWWAL); |
11fdf7f2 TL |
1889 | |
1890 | int flags = 0; | |
1891 | ||
1892 | flags |= devs_source.count(BDEV_DB) ? | |
1893 | (!bdev[BDEV_SLOW] ? RENAME_DB2SLOW: REMOVE_DB) : | |
1894 | 0; | |
1895 | flags |= devs_source.count(BDEV_WAL) ? REMOVE_WAL : 0; | |
9f95a23c | 1896 | int dev_target_new = dev_target; //FIXME: remove, makes no sense |
11fdf7f2 | 1897 | |
39ae355f | 1898 | for (auto& [ino, file_ref] : nodes.file_map) { |
11fdf7f2 | 1899 | //do not copy log |
39ae355f | 1900 | if (ino == 1) { |
11fdf7f2 TL |
1901 | continue; |
1902 | } | |
39ae355f | 1903 | dout(10) << __func__ << " " << ino << " " << file_ref->fnode << dendl; |
eafe8130 | 1904 | |
39ae355f | 1905 | vselector->sub_usage(file_ref->vselector_hint, file_ref->fnode); |
11fdf7f2 | 1906 | |
39ae355f TL |
1907 | bool rewrite = std::any_of( |
1908 | file_ref->fnode.extents.begin(), | |
1909 | file_ref->fnode.extents.end(), | |
1910 | [=](auto& ext) { | |
1911 | return ext.bdev != dev_target && devs_source.count(ext.bdev); | |
1912 | }); | |
eafe8130 TL |
1913 | if (rewrite) { |
1914 | dout(10) << __func__ << " migrating" << dendl; | |
39ae355f TL |
1915 | bluefs_fnode_t old_fnode; |
1916 | old_fnode.swap_extents(file_ref->fnode); | |
1917 | auto& old_fnode_extents = old_fnode.extents; | |
eafe8130 TL |
1918 | // read entire file |
1919 | bufferlist bl; | |
39ae355f | 1920 | for (const auto &old_ext : old_fnode_extents) { |
eafe8130 | 1921 | buf.resize(old_ext.length); |
20effc67 | 1922 | int r = _bdev_read_random(old_ext.bdev, |
eafe8130 TL |
1923 | old_ext.offset, |
1924 | old_ext.length, | |
1925 | (char*)&buf.at(0), | |
1926 | buffered); | |
1927 | if (r != 0) { | |
1928 | derr << __func__ << " failed to read 0x" << std::hex | |
1929 | << old_ext.offset << "~" << old_ext.length << std::dec | |
1930 | << " from " << (int)dev_target << dendl; | |
1931 | return -EIO; | |
11fdf7f2 | 1932 | } |
eafe8130 TL |
1933 | bl.append((char*)&buf[0], old_ext.length); |
1934 | } | |
1935 | ||
1936 | // write entire file | |
39ae355f TL |
1937 | auto l = _allocate(dev_target, bl.length(), 0, |
1938 | &file_ref->fnode, 0, false); | |
eafe8130 TL |
1939 | if (l < 0) { |
1940 | derr << __func__ << " unable to allocate len 0x" << std::hex | |
1941 | << bl.length() << std::dec << " from " << (int)dev_target | |
1942 | << ": " << cpp_strerror(l) << dendl; | |
1943 | return -ENOSPC; | |
1944 | } | |
1945 | ||
1946 | uint64_t off = 0; | |
39ae355f | 1947 | for (auto& i : file_ref->fnode.extents) { |
eafe8130 TL |
1948 | bufferlist cur; |
1949 | uint64_t cur_len = std::min<uint64_t>(i.length, bl.length() - off); | |
1950 | ceph_assert(cur_len > 0); | |
1951 | cur.substr_of(bl, off, cur_len); | |
1952 | int r = bdev[dev_target]->write(i.offset, cur, buffered); | |
1953 | ceph_assert(r == 0); | |
1954 | off += cur_len; | |
1955 | } | |
1956 | ||
1957 | // release old extents | |
39ae355f | 1958 | for (const auto &old_ext : old_fnode_extents) { |
eafe8130 TL |
1959 | PExtentVector to_release; |
1960 | to_release.emplace_back(old_ext.offset, old_ext.length); | |
1961 | alloc[old_ext.bdev]->release(to_release); | |
f67539c2 TL |
1962 | if (is_shared_alloc(old_ext.bdev)) { |
1963 | shared_alloc->bluefs_used -= to_release.size(); | |
1964 | } | |
eafe8130 TL |
1965 | } |
1966 | ||
1967 | // update fnode | |
39ae355f TL |
1968 | for (auto& i : file_ref->fnode.extents) { |
1969 | i.bdev = dev_target_new; | |
11fdf7f2 TL |
1970 | } |
1971 | } | |
11fdf7f2 TL |
1972 | } |
1973 | // new logging device in the current naming scheme | |
1974 | int new_log_dev_cur = | |
1975 | bdev[BDEV_NEWWAL] ? | |
1976 | BDEV_NEWWAL : | |
1977 | bdev[BDEV_WAL] && !(flags & REMOVE_WAL) ? | |
1978 | BDEV_WAL : | |
1979 | bdev[BDEV_NEWDB] ? | |
1980 | BDEV_NEWDB : | |
1981 | bdev[BDEV_DB] && !(flags & REMOVE_DB)? | |
1982 | BDEV_DB : | |
1983 | BDEV_SLOW; | |
1984 | ||
1985 | // new logging device in new naming scheme | |
1986 | int new_log_dev_next = | |
1987 | new_log_dev_cur == BDEV_NEWWAL ? | |
1988 | BDEV_WAL : | |
1989 | new_log_dev_cur == BDEV_NEWDB ? | |
1990 | BDEV_DB : | |
1991 | new_log_dev_cur; | |
1992 | ||
1993 | int super_dev = | |
1994 | dev_target == BDEV_NEWDB ? | |
1995 | BDEV_NEWDB : | |
1996 | bdev[BDEV_DB] ? | |
1997 | BDEV_DB : | |
1998 | BDEV_SLOW; | |
1999 | ||
20effc67 | 2000 | _rewrite_log_and_layout_sync_LNF_LD( |
11fdf7f2 TL |
2001 | false, |
2002 | super_dev, | |
2003 | new_log_dev_cur, | |
2004 | new_log_dev_next, | |
9f95a23c TL |
2005 | flags, |
2006 | layout); | |
11fdf7f2 TL |
2007 | return 0; |
2008 | } | |
2009 | ||
7c673cae FG |
2010 | BlueFS::FileRef BlueFS::_get_file(uint64_t ino) |
2011 | { | |
20effc67 TL |
2012 | auto p = nodes.file_map.find(ino); |
2013 | if (p == nodes.file_map.end()) { | |
9f95a23c | 2014 | FileRef f = ceph::make_ref<File>(); |
20effc67 TL |
2015 | nodes.file_map[ino] = f; |
2016 | // track files count in logger | |
2017 | logger->set(l_bluefs_num_files, nodes.file_map.size()); | |
7c673cae FG |
2018 | dout(30) << __func__ << " ino " << ino << " = " << f |
2019 | << " (new)" << dendl; | |
2020 | return f; | |
2021 | } else { | |
2022 | dout(30) << __func__ << " ino " << ino << " = " << p->second << dendl; | |
2023 | return p->second; | |
2024 | } | |
2025 | } | |
2026 | ||
20effc67 TL |
2027 | |
2028 | /** | |
2029 | To modify fnode both FileWriter::lock and File::lock must be obtained. | |
2030 | The special case is when we modify bluefs log (ino 1) or | |
2031 | we are compacting log (ino 0). | |
2032 | ||
2033 | In any case it is enough to hold File::lock to be sure fnode will not be modified. | |
2034 | */ | |
2035 | struct lock_fnode_print { | |
2036 | BlueFS::FileRef file; | |
2037 | lock_fnode_print(BlueFS::FileRef file) : file(file) {}; | |
2038 | }; | |
2039 | std::ostream& operator<<(std::ostream& out, const lock_fnode_print& to_lock) { | |
2040 | std::lock_guard l(to_lock.file->lock); | |
2041 | out << to_lock.file->fnode; | |
2042 | return out; | |
2043 | } | |
2044 | ||
2045 | void BlueFS::_drop_link_D(FileRef file) | |
7c673cae FG |
2046 | { |
2047 | dout(20) << __func__ << " had refs " << file->refs | |
20effc67 | 2048 | << " on " << lock_fnode_print(file) << dendl; |
11fdf7f2 | 2049 | ceph_assert(file->refs > 0); |
20effc67 TL |
2050 | ceph_assert(ceph_mutex_is_locked(log.lock)); |
2051 | ceph_assert(ceph_mutex_is_locked(nodes.lock)); | |
2052 | ||
7c673cae FG |
2053 | --file->refs; |
2054 | if (file->refs == 0) { | |
2055 | dout(20) << __func__ << " destroying " << file->fnode << dendl; | |
11fdf7f2 | 2056 | ceph_assert(file->num_reading.load() == 0); |
9f95a23c | 2057 | vselector->sub_usage(file->vselector_hint, file->fnode); |
20effc67 TL |
2058 | log.t.op_file_remove(file->fnode.ino); |
2059 | nodes.file_map.erase(file->fnode.ino); | |
2060 | logger->set(l_bluefs_num_files, nodes.file_map.size()); | |
7c673cae | 2061 | file->deleted = true; |
94b18763 | 2062 | |
20effc67 TL |
2063 | std::lock_guard dl(dirty.lock); |
2064 | for (auto& r : file->fnode.extents) { | |
2065 | dirty.pending_release[r.bdev].insert(r.offset, r.length); | |
2066 | } | |
2067 | if (file->dirty_seq > dirty.seq_stable) { | |
2068 | // retract request to serialize changes | |
2069 | ceph_assert(dirty.files.count(file->dirty_seq)); | |
2070 | auto it = dirty.files[file->dirty_seq].iterator_to(*file); | |
2071 | dirty.files[file->dirty_seq].erase(it); | |
2072 | file->dirty_seq = dirty.seq_stable; | |
7c673cae FG |
2073 | } |
2074 | } | |
2075 | } | |
2076 | ||
adb31ebb | 2077 | int64_t BlueFS::_read_random( |
7c673cae FG |
2078 | FileReader *h, ///< [in] read from here |
2079 | uint64_t off, ///< [in] offset | |
9f95a23c | 2080 | uint64_t len, ///< [in] this many bytes |
f67539c2 | 2081 | char *out) ///< [out] copy it here |
7c673cae | 2082 | { |
494da23a TL |
2083 | auto* buf = &h->buf; |
2084 | ||
adb31ebb | 2085 | int64_t ret = 0; |
7c673cae FG |
2086 | dout(10) << __func__ << " h " << h |
2087 | << " 0x" << std::hex << off << "~" << len << std::dec | |
20effc67 | 2088 | << " from " << lock_fnode_print(h->file) << dendl; |
7c673cae FG |
2089 | |
2090 | ++h->file->num_reading; | |
2091 | ||
2092 | if (!h->ignore_eof && | |
2093 | off + len > h->file->fnode.size) { | |
2094 | if (off > h->file->fnode.size) | |
2095 | len = 0; | |
2096 | else | |
2097 | len = h->file->fnode.size - off; | |
2098 | dout(20) << __func__ << " reaching (or past) eof, len clipped to 0x" | |
2099 | << std::hex << len << std::dec << dendl; | |
2100 | } | |
494da23a TL |
2101 | logger->inc(l_bluefs_read_random_count, 1); |
2102 | logger->inc(l_bluefs_read_random_bytes, len); | |
7c673cae | 2103 | |
494da23a | 2104 | std::shared_lock s_lock(h->lock); |
f91f0fd5 | 2105 | buf->bl.reassign_to_mempool(mempool::mempool_bluefs_file_reader); |
7c673cae | 2106 | while (len > 0) { |
494da23a TL |
2107 | if (off < buf->bl_off || off >= buf->get_buf_end()) { |
2108 | s_lock.unlock(); | |
2109 | uint64_t x_off = 0; | |
2110 | auto p = h->file->fnode.seek(off, &x_off); | |
f6b5b4d7 | 2111 | ceph_assert(p != h->file->fnode.extents.end()); |
9f95a23c | 2112 | uint64_t l = std::min(p->length - x_off, len); |
adb31ebb TL |
2113 | //hard cap to 1GB |
2114 | l = std::min(l, uint64_t(1) << 30); | |
494da23a TL |
2115 | dout(20) << __func__ << " read random 0x" |
2116 | << std::hex << x_off << "~" << l << std::dec | |
2117 | << " of " << *p << dendl; | |
cd265ab1 TL |
2118 | int r; |
2119 | if (!cct->_conf->bluefs_check_for_zeros) { | |
20effc67 TL |
2120 | r = _bdev_read_random(p->bdev, p->offset + x_off, l, out, |
2121 | cct->_conf->bluefs_buffered_io); | |
cd265ab1 | 2122 | } else { |
20effc67 | 2123 | r = _read_random_and_check(p->bdev, p->offset + x_off, l, out, |
cd265ab1 TL |
2124 | cct->_conf->bluefs_buffered_io); |
2125 | } | |
494da23a TL |
2126 | ceph_assert(r == 0); |
2127 | off += l; | |
2128 | len -= l; | |
2129 | ret += l; | |
2130 | out += l; | |
2131 | ||
2132 | logger->inc(l_bluefs_read_random_disk_count, 1); | |
2133 | logger->inc(l_bluefs_read_random_disk_bytes, l); | |
2134 | if (len > 0) { | |
2135 | s_lock.lock(); | |
2136 | } | |
2137 | } else { | |
2138 | auto left = buf->get_buf_remaining(off); | |
adb31ebb | 2139 | int64_t r = std::min(len, left); |
494da23a TL |
2140 | logger->inc(l_bluefs_read_random_buffer_count, 1); |
2141 | logger->inc(l_bluefs_read_random_buffer_bytes, r); | |
2142 | dout(20) << __func__ << " left 0x" << std::hex << left | |
2143 | << " 0x" << off << "~" << len << std::dec | |
2144 | << dendl; | |
2145 | ||
f67539c2 TL |
2146 | auto p = buf->bl.begin(); |
2147 | p.seek(off - buf->bl_off); | |
2148 | p.copy(r, out); | |
2149 | out += r; | |
7c673cae | 2150 | |
494da23a TL |
2151 | dout(30) << __func__ << " result chunk (0x" |
2152 | << std::hex << r << std::dec << " bytes):\n"; | |
2153 | bufferlist t; | |
2154 | t.substr_of(buf->bl, off - buf->bl_off, r); | |
2155 | t.hexdump(*_dout); | |
2156 | *_dout << dendl; | |
2157 | ||
2158 | off += r; | |
2159 | len -= r; | |
2160 | ret += r; | |
2161 | buf->pos += r; | |
2162 | } | |
2163 | } | |
39ae355f TL |
2164 | dout(20) << __func__ << std::hex |
2165 | << " got 0x" << ret | |
2166 | << std::dec << dendl; | |
7c673cae FG |
2167 | --h->file->num_reading; |
2168 | return ret; | |
2169 | } | |
2170 | ||
adb31ebb | 2171 | int64_t BlueFS::_read( |
7c673cae | 2172 | FileReader *h, ///< [in] read from here |
7c673cae FG |
2173 | uint64_t off, ///< [in] offset |
2174 | size_t len, ///< [in] this many bytes | |
2175 | bufferlist *outbl, ///< [out] optional: reference the result here | |
2176 | char *out) ///< [out] optional: or copy it here | |
2177 | { | |
f67539c2 TL |
2178 | FileReaderBuffer *buf = &(h->buf); |
2179 | ||
494da23a | 2180 | bool prefetch = !outbl && !out; |
7c673cae FG |
2181 | dout(10) << __func__ << " h " << h |
2182 | << " 0x" << std::hex << off << "~" << len << std::dec | |
20effc67 | 2183 | << " from " << lock_fnode_print(h->file) |
494da23a TL |
2184 | << (prefetch ? " prefetch" : "") |
2185 | << dendl; | |
7c673cae FG |
2186 | |
2187 | ++h->file->num_reading; | |
2188 | ||
2189 | if (!h->ignore_eof && | |
2190 | off + len > h->file->fnode.size) { | |
2191 | if (off > h->file->fnode.size) | |
2192 | len = 0; | |
2193 | else | |
2194 | len = h->file->fnode.size - off; | |
2195 | dout(20) << __func__ << " reaching (or past) eof, len clipped to 0x" | |
2196 | << std::hex << len << std::dec << dendl; | |
2197 | } | |
494da23a TL |
2198 | logger->inc(l_bluefs_read_count, 1); |
2199 | logger->inc(l_bluefs_read_bytes, len); | |
2200 | if (prefetch) { | |
2201 | logger->inc(l_bluefs_read_prefetch_count, 1); | |
2202 | logger->inc(l_bluefs_read_prefetch_bytes, len); | |
2203 | } | |
2204 | ||
7c673cae FG |
2205 | if (outbl) |
2206 | outbl->clear(); | |
2207 | ||
adb31ebb | 2208 | int64_t ret = 0; |
494da23a | 2209 | std::shared_lock s_lock(h->lock); |
7c673cae FG |
2210 | while (len > 0) { |
2211 | size_t left; | |
2212 | if (off < buf->bl_off || off >= buf->get_buf_end()) { | |
494da23a TL |
2213 | s_lock.unlock(); |
2214 | std::unique_lock u_lock(h->lock); | |
f91f0fd5 | 2215 | buf->bl.reassign_to_mempool(mempool::mempool_bluefs_file_reader); |
494da23a TL |
2216 | if (off < buf->bl_off || off >= buf->get_buf_end()) { |
2217 | // if precondition hasn't changed during locking upgrade. | |
2218 | buf->bl.clear(); | |
2219 | buf->bl_off = off & super.block_mask(); | |
2220 | uint64_t x_off = 0; | |
2221 | auto p = h->file->fnode.seek(buf->bl_off, &x_off); | |
f6b5b4d7 TL |
2222 | if (p == h->file->fnode.extents.end()) { |
2223 | dout(5) << __func__ << " reading less then required " | |
2224 | << ret << "<" << ret + len << dendl; | |
2225 | break; | |
2226 | } | |
2227 | ||
494da23a TL |
2228 | uint64_t want = round_up_to(len + (off & ~super.block_mask()), |
2229 | super.block_size); | |
2230 | want = std::max(want, buf->max_prefetch); | |
2231 | uint64_t l = std::min(p->length - x_off, want); | |
adb31ebb TL |
2232 | //hard cap to 1GB |
2233 | l = std::min(l, uint64_t(1) << 30); | |
494da23a TL |
2234 | uint64_t eof_offset = round_up_to(h->file->fnode.size, super.block_size); |
2235 | if (!h->ignore_eof && | |
2236 | buf->bl_off + l > eof_offset) { | |
2237 | l = eof_offset - buf->bl_off; | |
2238 | } | |
2239 | dout(20) << __func__ << " fetching 0x" | |
2240 | << std::hex << x_off << "~" << l << std::dec | |
2241 | << " of " << *p << dendl; | |
cd265ab1 | 2242 | int r; |
39ae355f TL |
2243 | // when reading BlueFS log (only happens on startup) use non-buffered io |
2244 | // it makes it in sync with logic in _flush_range() | |
2245 | bool use_buffered_io = h->file->fnode.ino == 1 ? false : cct->_conf->bluefs_buffered_io; | |
cd265ab1 | 2246 | if (!cct->_conf->bluefs_check_for_zeros) { |
20effc67 | 2247 | r = _bdev_read(p->bdev, p->offset + x_off, l, &buf->bl, ioc[p->bdev], |
39ae355f | 2248 | use_buffered_io); |
cd265ab1 | 2249 | } else { |
20effc67 TL |
2250 | r = _read_and_check( |
2251 | p->bdev, p->offset + x_off, l, &buf->bl, ioc[p->bdev], | |
39ae355f | 2252 | use_buffered_io); |
cd265ab1 | 2253 | } |
20effc67 TL |
2254 | logger->inc(l_bluefs_read_disk_count, 1); |
2255 | logger->inc(l_bluefs_read_disk_bytes, l); | |
2256 | ||
494da23a | 2257 | ceph_assert(r == 0); |
7c673cae | 2258 | } |
494da23a TL |
2259 | u_lock.unlock(); |
2260 | s_lock.lock(); | |
2261 | // we should recheck if buffer is valid after lock downgrade | |
2262 | continue; | |
7c673cae FG |
2263 | } |
2264 | left = buf->get_buf_remaining(off); | |
2265 | dout(20) << __func__ << " left 0x" << std::hex << left | |
2266 | << " len 0x" << len << std::dec << dendl; | |
2267 | ||
adb31ebb | 2268 | int64_t r = std::min(len, left); |
7c673cae FG |
2269 | if (outbl) { |
2270 | bufferlist t; | |
2271 | t.substr_of(buf->bl, off - buf->bl_off, r); | |
2272 | outbl->claim_append(t); | |
2273 | } | |
2274 | if (out) { | |
f67539c2 TL |
2275 | auto p = buf->bl.begin(); |
2276 | p.seek(off - buf->bl_off); | |
2277 | p.copy(r, out); | |
7c673cae FG |
2278 | out += r; |
2279 | } | |
2280 | ||
2281 | dout(30) << __func__ << " result chunk (0x" | |
2282 | << std::hex << r << std::dec << " bytes):\n"; | |
2283 | bufferlist t; | |
2284 | t.substr_of(buf->bl, off - buf->bl_off, r); | |
2285 | t.hexdump(*_dout); | |
2286 | *_dout << dendl; | |
2287 | ||
2288 | off += r; | |
2289 | len -= r; | |
2290 | ret += r; | |
2291 | buf->pos += r; | |
2292 | } | |
f67539c2 | 2293 | |
39ae355f TL |
2294 | dout(20) << __func__ << std::hex |
2295 | << " got 0x" << ret | |
2296 | << std::dec << dendl; | |
11fdf7f2 | 2297 | ceph_assert(!outbl || (int)outbl->length() == ret); |
7c673cae FG |
2298 | --h->file->num_reading; |
2299 | return ret; | |
2300 | } | |
2301 | ||
20effc67 | 2302 | void BlueFS::invalidate_cache(FileRef f, uint64_t offset, uint64_t length) |
7c673cae | 2303 | { |
20effc67 | 2304 | std::lock_guard l(f->lock); |
7c673cae FG |
2305 | dout(10) << __func__ << " file " << f->fnode |
2306 | << " 0x" << std::hex << offset << "~" << length << std::dec | |
2307 | << dendl; | |
2308 | if (offset & ~super.block_mask()) { | |
2309 | offset &= super.block_mask(); | |
11fdf7f2 | 2310 | length = round_up_to(length, super.block_size); |
7c673cae FG |
2311 | } |
2312 | uint64_t x_off = 0; | |
2313 | auto p = f->fnode.seek(offset, &x_off); | |
2314 | while (length > 0 && p != f->fnode.extents.end()) { | |
11fdf7f2 | 2315 | uint64_t x_len = std::min(p->length - x_off, length); |
7c673cae FG |
2316 | bdev[p->bdev]->invalidate_cache(p->offset + x_off, x_len); |
2317 | dout(20) << __func__ << " 0x" << std::hex << x_off << "~" << x_len | |
2318 | << std:: dec << " of " << *p << dendl; | |
2319 | offset += x_len; | |
2320 | length -= x_len; | |
2321 | } | |
2322 | } | |
2323 | ||
39ae355f TL |
2324 | |
2325 | uint64_t BlueFS::_estimate_transaction_size(bluefs_transaction_t* t) | |
2326 | { | |
2327 | uint64_t max_alloc_size = std::max(alloc_size[BDEV_WAL], | |
2328 | std::max(alloc_size[BDEV_DB], | |
2329 | alloc_size[BDEV_SLOW])); | |
2330 | ||
2331 | // conservative estimate for final encoded size | |
2332 | return round_up_to(t->op_bl.length() + super.block_size * 2, max_alloc_size); | |
2333 | } | |
2334 | ||
2335 | uint64_t BlueFS::_make_initial_transaction(uint64_t start_seq, | |
2336 | bluefs_fnode_t& fnode, | |
2337 | uint64_t expected_final_size, | |
2338 | bufferlist* out) | |
2339 | { | |
2340 | bluefs_transaction_t t0; | |
2341 | t0.seq = start_seq; | |
2342 | t0.uuid = super.uuid; | |
2343 | t0.op_init(); | |
2344 | t0.op_file_update_inc(fnode); | |
2345 | t0.op_jump(start_seq, expected_final_size); // this is a fixed size op, | |
2346 | // hence it's valid with fake | |
2347 | // params for overall txc size | |
2348 | // estimation | |
2349 | if (!out) { | |
2350 | return _estimate_transaction_size(&t0); | |
2351 | } | |
2352 | ||
2353 | ceph_assert(expected_final_size > 0); | |
2354 | out->reserve(expected_final_size); | |
2355 | encode(t0, *out); | |
2356 | // make sure we're not wrong aboth the size | |
2357 | ceph_assert(out->length() <= expected_final_size); | |
2358 | _pad_bl(*out, expected_final_size); | |
2359 | return expected_final_size; | |
2360 | } | |
2361 | ||
20effc67 | 2362 | uint64_t BlueFS::_estimate_log_size_N() |
7c673cae | 2363 | { |
20effc67 | 2364 | std::lock_guard nl(nodes.lock); |
7c673cae FG |
2365 | int avg_dir_size = 40; // fixme |
2366 | int avg_file_size = 12; | |
2367 | uint64_t size = 4096 * 2; | |
20effc67 TL |
2368 | size += nodes.file_map.size() * (1 + sizeof(bluefs_fnode_t)); |
2369 | size += nodes.dir_map.size() + (1 + avg_dir_size); | |
2370 | size += nodes.file_map.size() * (1 + avg_dir_size + avg_file_size); | |
11fdf7f2 | 2371 | return round_up_to(size, super.block_size); |
7c673cae FG |
2372 | } |
2373 | ||
20effc67 | 2374 | void BlueFS::compact_log()/*_LNF_LD_NF_D*/ |
7c673cae | 2375 | { |
f6b5b4d7 TL |
2376 | if (!cct->_conf->bluefs_replay_recovery_disable_compact) { |
2377 | if (cct->_conf->bluefs_compact_log_sync) { | |
20effc67 | 2378 | _compact_log_sync_LNF_LD(); |
f6b5b4d7 | 2379 | } else { |
20effc67 | 2380 | _compact_log_async_LD_LNF_D(); |
f6b5b4d7 | 2381 | } |
7c673cae FG |
2382 | } |
2383 | } | |
2384 | ||
20effc67 | 2385 | bool BlueFS::_should_start_compact_log_L_N() |
7c673cae | 2386 | { |
20effc67 TL |
2387 | if (log_is_compacting.load() == true) { |
2388 | // compaction is already running | |
2389 | return false; | |
2390 | } | |
2391 | uint64_t current; | |
2392 | { | |
2393 | std::lock_guard ll(log.lock); | |
2394 | current = log.writer->file->fnode.size; | |
2395 | } | |
2396 | uint64_t expected = _estimate_log_size_N(); | |
7c673cae FG |
2397 | float ratio = (float)current / (float)expected; |
2398 | dout(10) << __func__ << " current 0x" << std::hex << current | |
2399 | << " expected " << expected << std::dec | |
2400 | << " ratio " << ratio | |
7c673cae | 2401 | << dendl; |
20effc67 | 2402 | if (current < cct->_conf->bluefs_log_compact_min_size || |
7c673cae FG |
2403 | ratio < cct->_conf->bluefs_log_compact_min_ratio) { |
2404 | return false; | |
2405 | } | |
2406 | return true; | |
2407 | } | |
2408 | ||
39ae355f TL |
2409 | void BlueFS::_compact_log_dump_metadata_NF(uint64_t start_seq, |
2410 | bluefs_transaction_t *t, | |
2411 | int bdev_update_flags, | |
2412 | uint64_t capture_before_seq) | |
7c673cae | 2413 | { |
39ae355f TL |
2414 | dout(20) << __func__ << dendl; |
2415 | t->seq = start_seq; | |
7c673cae | 2416 | t->uuid = super.uuid; |
7c673cae | 2417 | |
20effc67 TL |
2418 | std::lock_guard nl(nodes.lock); |
2419 | ||
20effc67 TL |
2420 | for (auto& [ino, file_ref] : nodes.file_map) { |
2421 | if (ino == 1) | |
2422 | continue; | |
2423 | ceph_assert(ino > 1); | |
2424 | std::lock_guard fl(file_ref->lock); | |
39ae355f TL |
2425 | if (bdev_update_flags) { |
2426 | for(auto& e : file_ref->fnode.extents) { | |
2427 | auto bdev = e.bdev; | |
2428 | auto bdev_new = bdev; | |
2429 | ceph_assert(!((bdev_update_flags & REMOVE_WAL) && bdev == BDEV_WAL)); | |
2430 | if ((bdev_update_flags & RENAME_SLOW2DB) && bdev == BDEV_SLOW) { | |
2431 | bdev_new = BDEV_DB; | |
2432 | } | |
2433 | if ((bdev_update_flags & RENAME_DB2SLOW) && bdev == BDEV_DB) { | |
2434 | bdev_new = BDEV_SLOW; | |
2435 | } | |
2436 | if (bdev == BDEV_NEWDB) { | |
2437 | // REMOVE_DB xor RENAME_DB | |
2438 | ceph_assert(!(bdev_update_flags & REMOVE_DB) != !(bdev_update_flags & RENAME_DB2SLOW)); | |
2439 | ceph_assert(!(bdev_update_flags & RENAME_SLOW2DB)); | |
2440 | bdev_new = BDEV_DB; | |
2441 | } | |
2442 | if (bdev == BDEV_NEWWAL) { | |
2443 | ceph_assert(bdev_update_flags & REMOVE_WAL); | |
2444 | bdev_new = BDEV_WAL; | |
2445 | } | |
2446 | e.bdev = bdev_new; | |
2447 | } | |
2448 | } | |
2449 | if (capture_before_seq == 0 || file_ref->dirty_seq < capture_before_seq) { | |
20effc67 TL |
2450 | dout(20) << __func__ << " op_file_update " << file_ref->fnode << dendl; |
2451 | } else { | |
2452 | dout(20) << __func__ << " op_file_update just modified, dirty_seq=" | |
39ae355f | 2453 | << file_ref->dirty_seq << " " << file_ref->fnode << dendl; |
20effc67 TL |
2454 | } |
2455 | t->op_file_update(file_ref->fnode); | |
2456 | } | |
2457 | for (auto& [path, dir_ref] : nodes.dir_map) { | |
9f95a23c TL |
2458 | dout(20) << __func__ << " op_dir_create " << path << dendl; |
2459 | t->op_dir_create(path); | |
2460 | for (auto& [fname, file_ref] : dir_ref->file_map) { | |
2461 | dout(20) << __func__ << " op_dir_link " << path << "/" << fname | |
2462 | << " to " << file_ref->fnode.ino << dendl; | |
2463 | t->op_dir_link(path, fname, file_ref->fnode.ino); | |
7c673cae FG |
2464 | } |
2465 | } | |
2466 | } | |
2467 | ||
20effc67 | 2468 | void BlueFS::_compact_log_sync_LNF_LD() |
7c673cae FG |
2469 | { |
2470 | dout(10) << __func__ << dendl; | |
20effc67 TL |
2471 | uint8_t prefer_bdev; |
2472 | { | |
2473 | std::lock_guard ll(log.lock); | |
2474 | prefer_bdev = | |
2475 | vselector->select_prefer_bdev(log.writer->file->vselector_hint); | |
2476 | } | |
2477 | _rewrite_log_and_layout_sync_LNF_LD(true, | |
11fdf7f2 | 2478 | BDEV_DB, |
9f95a23c TL |
2479 | prefer_bdev, |
2480 | prefer_bdev, | |
2481 | 0, | |
2482 | super.memorized_layout); | |
11fdf7f2 TL |
2483 | logger->inc(l_bluefs_log_compactions); |
2484 | } | |
2485 | ||
39ae355f TL |
2486 | /* |
2487 | * SYNC LOG COMPACTION | |
2488 | * | |
2489 | * 0. Lock the log completely through the whole procedure | |
2490 | * | |
2491 | * 1. Build new log. It will include log's starter and compacted metadata | |
2492 | * body. Jump op appended to the starter will link the pieces together. | |
2493 | * | |
2494 | * 2. Write out new log's content | |
2495 | * | |
2496 | * 3. Write out new superblock. This includes relevant device layout update. | |
2497 | * | |
2498 | * 4. Finalization. Old space release. | |
2499 | */ | |
2500 | ||
2501 | void BlueFS::_rewrite_log_and_layout_sync_LNF_LD(bool permit_dev_fallback, | |
20effc67 TL |
2502 | int super_dev, |
2503 | int log_dev, | |
2504 | int log_dev_new, | |
2505 | int flags, | |
2506 | std::optional<bluefs_layout_t> layout) | |
11fdf7f2 | 2507 | { |
39ae355f TL |
2508 | // we substitute log_dev with log_dev_new for new allocations below |
2509 | // and permitting fallback allocations prevents such a substitution | |
2510 | ceph_assert((permit_dev_fallback && log_dev == log_dev_new) || | |
2511 | !permit_dev_fallback); | |
2512 | ||
2513 | dout(10) << __func__ << " super_dev:" << super_dev | |
2514 | << " log_dev:" << log_dev | |
2515 | << " log_dev_new:" << log_dev_new | |
2516 | << " flags:" << flags | |
2517 | << " seq:" << log.seq_live | |
2518 | << dendl; | |
2519 | utime_t mtime = ceph_clock_now(); | |
2520 | uint64_t starter_seq = 1; | |
2521 | ||
2522 | // Part 0. | |
2523 | // Lock the log totally till the end of the procedure | |
20effc67 | 2524 | std::lock_guard ll(log.lock); |
39ae355f | 2525 | auto t0 = mono_clock::now(); |
20effc67 TL |
2526 | |
2527 | File *log_file = log.writer->file.get(); | |
39ae355f | 2528 | bluefs_fnode_t fnode_tail; |
20effc67 TL |
2529 | // log.t.seq is always set to current live seq |
2530 | ceph_assert(log.t.seq == log.seq_live); | |
2531 | // Capturing entire state. Dump anything that has been stored there. | |
2532 | log.t.clear(); | |
2533 | log.t.seq = log.seq_live; | |
2534 | // From now on, no changes to log.t are permitted until we finish rewriting log. | |
2535 | // Can allow dirty to remain dirty - log.seq_live will not change. | |
7c673cae | 2536 | |
39ae355f TL |
2537 | // |
2538 | // Part 1. | |
2539 | // Build new log starter and compacted metadata body | |
2540 | // 1.1. Build full compacted meta transaction. | |
2541 | // Encode a bluefs transaction that dumps all of the in-memory fnodes | |
2542 | // and names. | |
2543 | // This might be pretty large and its allocation map can exceed | |
2544 | // superblock size. Hence instead we'll need log starter part which | |
2545 | // goes to superblock and refers that new meta through op_update_inc. | |
2546 | // 1.2. Allocate space for the above transaction | |
2547 | // using its size estimation. | |
2548 | // 1.3. Allocate the space required for the starter part of the new log. | |
2549 | // It should be small enough to fit into superblock. | |
2550 | // 1.4 Building new log persistent fnode representation which will | |
2551 | // finally land to disk. | |
2552 | // Depending on input parameters we might need to perform device ids | |
2553 | // rename - runtime and persistent replicas should be different when we | |
2554 | // are in the device migration process. | |
2555 | // 1.5 Store starter fnode to run-time superblock, to be written out later. | |
2556 | // It doesn't contain compacted meta to fit relevant alocation map into | |
2557 | // superblock. | |
2558 | // 1.6 Proceed building new log persistent fnode representation. | |
2559 | // Will add log tail with compacted meta extents from 1.1. | |
2560 | // Device rename applied as well | |
2561 | // | |
2562 | // 1.7. Encode new log fnode starter, | |
2563 | // It will include op_init, new log's op_update_inc | |
2564 | // and jump to the compacted meta transaction beginning. | |
2565 | // Superblock will reference this starter part | |
2566 | // | |
2567 | // 1.8. Encode compacted meta transaction, | |
2568 | // extend the transaction with a jump to proper sequence no | |
2569 | // | |
2570 | ||
2571 | ||
2572 | // 1.1 Build full compacted meta transaction | |
2573 | bluefs_transaction_t compacted_meta_t; | |
2574 | _compact_log_dump_metadata_NF(starter_seq + 1, &compacted_meta_t, flags, 0); | |
2575 | ||
2576 | // 1.2 Allocate the space required for the compacted meta transaction | |
2577 | uint64_t compacted_meta_need = | |
2578 | _estimate_transaction_size(&compacted_meta_t) + | |
2579 | cct->_conf->bluefs_max_log_runway; | |
2580 | ||
2581 | dout(20) << __func__ << " compacted_meta_need " << compacted_meta_need << dendl; | |
2582 | ||
2583 | int r = _allocate(log_dev, compacted_meta_need, 0, &fnode_tail, 0, | |
2584 | permit_dev_fallback); | |
2585 | ceph_assert(r == 0); | |
7c673cae | 2586 | |
7c673cae | 2587 | |
39ae355f TL |
2588 | // 1.3 Allocate the space required for the starter part of the new log. |
2589 | // estimate new log fnode size to be referenced from superblock | |
2590 | // hence use dummy fnode and jump parameters | |
2591 | uint64_t starter_need = _make_initial_transaction(starter_seq, fnode_tail, 0, nullptr); | |
7c673cae | 2592 | |
39ae355f TL |
2593 | bluefs_fnode_t fnode_starter(log_file->fnode.ino, 0, mtime); |
2594 | r = _allocate(log_dev, starter_need, 0, &fnode_starter, 0, | |
2595 | permit_dev_fallback); | |
2596 | ceph_assert(r == 0); | |
7c673cae | 2597 | |
39ae355f TL |
2598 | // 1.4 Building starter fnode |
2599 | bluefs_fnode_t fnode_persistent(fnode_starter.ino, 0, mtime); | |
2600 | for (auto p : fnode_starter.extents) { | |
2601 | // rename device if needed - this is possible when fallback allocations | |
2602 | // are prohibited only. Which means every extent is targeted to the same | |
2603 | // device and we can unconditionally update them. | |
2604 | if (log_dev != log_dev_new) { | |
2605 | dout(10) << __func__ << " renaming log extents to " | |
2606 | << log_dev_new << dendl; | |
2607 | p.bdev = log_dev_new; | |
11fdf7f2 | 2608 | } |
39ae355f | 2609 | fnode_persistent.append_extent(p); |
7c673cae FG |
2610 | } |
2611 | ||
39ae355f TL |
2612 | // 1.5 Store starter fnode to run-time superblock, to be written out later |
2613 | super.log_fnode = fnode_persistent; | |
7c673cae | 2614 | |
39ae355f TL |
2615 | // 1.6 Proceed building new log persistent fnode representation |
2616 | // we'll build incremental update starting from this point | |
2617 | fnode_persistent.reset_delta(); | |
2618 | for (auto p : fnode_tail.extents) { | |
2619 | // rename device if needed - this is possible when fallback allocations | |
2620 | // are prohibited only. Which means every extent is targeted to the same | |
2621 | // device and we can unconditionally update them. | |
2622 | if (log_dev != log_dev_new) { | |
2623 | dout(10) << __func__ << " renaming log extents to " | |
2624 | << log_dev_new << dendl; | |
2625 | p.bdev = log_dev_new; | |
2626 | } | |
2627 | fnode_persistent.append_extent(p); | |
2628 | } | |
2629 | ||
2630 | // 1.7 Encode new log fnode | |
2631 | // This will flush incremental part of fnode_persistent only. | |
2632 | bufferlist starter_bl; | |
2633 | _make_initial_transaction(starter_seq, fnode_persistent, starter_need, &starter_bl); | |
9f95a23c | 2634 | |
39ae355f TL |
2635 | // 1.8 Encode compacted meta transaction |
2636 | dout(20) << __func__ << " op_jump_seq " << log.seq_live << dendl; | |
2637 | // hopefully "compact_meta_need" estimation provides enough extra space | |
2638 | // for this op, assert below if not | |
2639 | compacted_meta_t.op_jump_seq(log.seq_live); | |
2640 | ||
2641 | bufferlist compacted_meta_bl; | |
2642 | encode(compacted_meta_t, compacted_meta_bl); | |
2643 | _pad_bl(compacted_meta_bl); | |
2644 | ceph_assert(compacted_meta_bl.length() <= compacted_meta_need); | |
2645 | ||
2646 | // | |
2647 | // Part 2 | |
2648 | // Write out new log's content | |
2649 | // 2.1. Build the full runtime new log's fnode | |
2650 | // | |
2651 | // 2.2. Write out new log's | |
2652 | // | |
2653 | // 2.3. Do flush and wait for completion through flush_bdev() | |
2654 | // | |
2655 | // 2.4. Finalize log update | |
2656 | // Update all sequence numbers | |
2657 | // | |
2658 | ||
2659 | // 2.1 Build the full runtime new log's fnode | |
2660 | bluefs_fnode_t old_log_fnode; | |
2661 | old_log_fnode.swap(fnode_starter); | |
2662 | old_log_fnode.clone_extents(fnode_tail); | |
2663 | old_log_fnode.reset_delta(); | |
2664 | log_file->fnode.swap(old_log_fnode); | |
2665 | ||
2666 | // 2.2 Write out new log's content | |
2667 | // Get rid off old writer | |
2668 | _close_writer(log.writer); | |
2669 | // Make new log writer and stage new log's content writing | |
20effc67 | 2670 | log.writer = _create_writer(log_file); |
39ae355f TL |
2671 | log.writer->append(starter_bl); |
2672 | log.writer->append(compacted_meta_bl); | |
2673 | ||
2674 | // 2.3 Do flush and wait for completion through flush_bdev() | |
20effc67 | 2675 | _flush_special(log.writer); |
11fdf7f2 TL |
2676 | #ifdef HAVE_LIBAIO |
2677 | if (!cct->_conf->bluefs_sync_write) { | |
2678 | list<aio_t> completed_ios; | |
20effc67 TL |
2679 | _claim_completed_aios(log.writer, &completed_ios); |
2680 | _wait_for_aio(log.writer); | |
11fdf7f2 TL |
2681 | completed_ios.clear(); |
2682 | } | |
2683 | #endif | |
20effc67 | 2684 | _flush_bdev(); |
39ae355f TL |
2685 | |
2686 | // 2.4 Finalize log update | |
1d09f67e TL |
2687 | ++log.seq_live; |
2688 | dirty.seq_live = log.seq_live; | |
2689 | log.t.seq = log.seq_live; | |
39ae355f TL |
2690 | vselector->sub_usage(log_file->vselector_hint, old_log_fnode); |
2691 | vselector->add_usage(log_file->vselector_hint, log_file->fnode); | |
224ce89b | 2692 | |
39ae355f TL |
2693 | // Part 3. |
2694 | // Write out new superblock to reflect all the changes. | |
2695 | // | |
11fdf7f2 | 2696 | |
39ae355f | 2697 | super.memorized_layout = layout; |
11fdf7f2 | 2698 | _write_super(super_dev); |
20effc67 | 2699 | _flush_bdev(); |
7c673cae | 2700 | |
39ae355f TL |
2701 | // we're mostly done |
2702 | dout(10) << __func__ << " log extents " << log_file->fnode.extents << dendl; | |
2703 | logger->inc(l_bluefs_log_compactions); | |
2704 | ||
2705 | // Part 4 | |
2706 | // Finalization. Release old space. | |
2707 | // | |
2708 | { | |
2709 | dout(10) << __func__ | |
2710 | << " release old log extents " << old_log_fnode.extents | |
2711 | << dendl; | |
2712 | std::lock_guard dl(dirty.lock); | |
2713 | for (auto& r : old_log_fnode.extents) { | |
2714 | dirty.pending_release[r.bdev].insert(r.offset, r.length); | |
2715 | } | |
7c673cae | 2716 | } |
39ae355f | 2717 | logger->tinc(l_bluefs_compaction_lock_lat, mono_clock::now() - t0); |
7c673cae FG |
2718 | } |
2719 | ||
2720 | /* | |
39ae355f | 2721 | * ASYNC LOG COMPACTION |
7c673cae | 2722 | * |
39ae355f TL |
2723 | * 0. Lock the log and forbid its extension. The former covers just |
2724 | * a part of the below procedure while the latter spans over it | |
2725 | * completely. | |
2726 | * 1. Allocate a new extent to continue the log, and then log an event | |
2727 | * that jumps the log write position to the new extent. At this point, the | |
2728 | * old extent(s) won't be written to, and reflect everything to compact. | |
2729 | * New events will be written to the new region that we'll keep. | |
2730 | * The latter will finally become new log tail on compaction completion. | |
7c673cae | 2731 | * |
39ae355f TL |
 * 2. Build new log. It will include log's starter, compacted metadata
 *    body and the above tail. Jump ops appended to the starter and meta body
 *    will link the pieces together. Log's lock is released in the middle of
 *    the process to permit parallel access to it.
7c673cae | 2736 | * |
39ae355f | 2737 | * 3. Write out new log's content. |
7c673cae | 2738 | * |
39ae355f | 2739 | * 4. Write out new superblock to reflect all the changes. |
7c673cae | 2740 | * |
39ae355f | 2741 | * 5. Apply new log fnode, log is locked for a while. |
7c673cae | 2742 | * |
39ae355f | 2743 | * 6. Finalization. Clean up, old space release and total unlocking. |
7c673cae | 2744 | */ |
20effc67 TL |
2745 | |
2746 | void BlueFS::_compact_log_async_LD_LNF_D() //also locks FW for new_writer | |
7c673cae FG |
2747 | { |
2748 | dout(10) << __func__ << dendl; | |
39ae355f TL |
2749 | utime_t mtime = ceph_clock_now(); |
2750 | uint64_t starter_seq = 1; | |
2751 | uint64_t old_log_jump_to = 0; | |
2752 | ||
2753 | // Part 0. | |
2754 | // Lock the log and forbid its expansion and other compactions | |
2755 | ||
20effc67 TL |
2756 | // only one compaction allowed at one time |
2757 | bool old_is_comp = std::atomic_exchange(&log_is_compacting, true); | |
2758 | if (old_is_comp) { | |
2759 | dout(10) << __func__ << " ongoing" <<dendl; | |
2760 | return; | |
2761 | } | |
39ae355f | 2762 | // lock log's run-time structures for a while |
20effc67 | 2763 | log.lock.lock(); |
39ae355f | 2764 | auto t0 = mono_clock::now(); |
181888fb | 2765 | |
20effc67 TL |
2766 | // Part 1. |
2767 | // Prepare current log for jumping into it. | |
2768 | // 1. Allocate extent | |
2769 | // 2. Update op to log | |
2770 | // 3. Jump op to log | |
2771 | // During that, no one else can write to log, otherwise we risk jumping backwards. | |
2772 | // We need to sync log, because we are injecting discontinuity, and writer is not prepared for that. | |
2773 | ||
2774 | //signal _maybe_extend_log that expansion of log is temporary inacceptable | |
2775 | bool old_forbidden = atomic_exchange(&log_forbidden_to_expand, true); | |
2776 | ceph_assert(old_forbidden == false); | |
3efd9988 | 2777 | |
39ae355f TL |
2778 | // |
2779 | // Part 1. | |
2780 | // Prepare current log for jumping into it. | |
2781 | // 1.1. Allocate extent | |
2782 | // 1.2. Save log's fnode extents and add new extents | |
2783 | // 1.3. Update op to log | |
2784 | // 1.4. Jump op to log | |
2785 | // During that, no one else can write to log, otherwise we risk jumping backwards. | |
2786 | // We need to sync log, because we are injecting discontinuity, and writer is not prepared for that. | |
9f95a23c | 2787 | |
39ae355f TL |
2788 | // 1.1 allocate new log extents and store them at fnode_tail |
2789 | File *log_file = log.writer->file.get(); | |
7c673cae | 2790 | old_log_jump_to = log_file->fnode.get_allocated(); |
39ae355f | 2791 | bluefs_fnode_t fnode_tail; |
20effc67 | 2792 | uint64_t runway = log_file->fnode.get_allocated() - log.writer->get_effective_write_pos(); |
7c673cae | 2793 | dout(10) << __func__ << " old_log_jump_to 0x" << std::hex << old_log_jump_to |
39ae355f | 2794 | << " need 0x" << cct->_conf->bluefs_max_log_runway << std::dec << dendl; |
9f95a23c TL |
2795 | int r = _allocate(vselector->select_prefer_bdev(log_file->vselector_hint), |
2796 | cct->_conf->bluefs_max_log_runway, | |
39ae355f TL |
2797 | 0, |
2798 | &fnode_tail); | |
11fdf7f2 | 2799 | ceph_assert(r == 0); |
39ae355f TL |
2800 | |
2801 | // 1.2 save log's fnode extents and add new extents | |
2802 | bluefs_fnode_t old_log_fnode(log_file->fnode); | |
2803 | log_file->fnode.clone_extents(fnode_tail); | |
9f95a23c | 2804 | //adjust usage as flush below will need it |
39ae355f | 2805 | vselector->sub_usage(log_file->vselector_hint, old_log_fnode); |
9f95a23c | 2806 | vselector->add_usage(log_file->vselector_hint, log_file->fnode); |
7c673cae FG |
2807 | dout(10) << __func__ << " log extents " << log_file->fnode.extents << dendl; |
2808 | ||
39ae355f | 2809 | // 1.3 update the log file change and log a jump to the offset where we want to |
7c673cae | 2810 | // write the new entries |
39ae355f TL |
2811 | log.t.op_file_update_inc(log_file->fnode); |
2812 | ||
2813 | // 1.4 jump to new position should mean next seq | |
20effc67 TL |
2814 | log.t.op_jump(log.seq_live + 1, old_log_jump_to); |
2815 | uint64_t seq_now = log.seq_live; | |
2816 | // we need to flush all bdev because we will be streaming all dirty files to log | |
2817 | // TODO - think - if _flush_and_sync_log_jump will not add dirty files nor release pending allocations | |
2818 | // then flush_bdev() will not be necessary | |
2819 | _flush_bdev(); | |
2820 | _flush_and_sync_log_jump_D(old_log_jump_to, runway); | |
2821 | ||
39ae355f TL |
2822 | // |
2823 | // Part 2. | |
2824 | // Build new log starter and compacted metadata body | |
2825 | // 2.1. Build full compacted meta transaction. | |
2826 | // While still holding the lock, encode a bluefs transaction | |
2827 | // that dumps all of the in-memory fnodes and names. | |
2828 | // This might be pretty large and its allocation map can exceed | |
2829 | // superblock size. Hence instead we'll need log starter part which | |
2830 | // goes to superblock and refers that new meta through op_update_inc. | |
2831 | // 2.2. After releasing the lock allocate space for the above transaction | |
2832 | // using its size estimation. | |
2833 | // Then build tailing list of extents which consists of these | |
2834 | // newly allocated extents followed by ones from Part 1. | |
2835 | // 2.3. Allocate the space required for the starter part of the new log. | |
2836 | // It should be small enough to fit into superblock. | |
2837 | // Effectively we start building new log fnode here. | |
2838 | // 2.4. Store starter fnode to run-time superblock, to be written out later | |
2839 | // 2.5. Finalize new log's fnode building | |
2840 | // This will include log's starter and tailing extents built at 2.2 | |
2841 | // 2.6. Encode new log fnode starter, | |
2842 | // It will include op_init, new log's op_update_inc | |
2843 | // and jump to the compacted meta transaction beginning. | |
2844 | // Superblock will reference this starter part | |
2845 | // 2.7. Encode compacted meta transaction, | |
2846 | // extend the transaction with a jump to the log tail from 1.1 before | |
2847 | // encoding. | |
2848 | // | |
2849 | ||
2850 | // 2.1 Build full compacted meta transaction | |
2851 | bluefs_transaction_t compacted_meta_t; | |
2852 | _compact_log_dump_metadata_NF(starter_seq + 1, &compacted_meta_t, 0, seq_now); | |
2853 | ||
2854 | // now state is captured to compacted_meta_t, | |
2855 | // current log can be used to write to, | |
2856 | //ops in log will be continuation of captured state | |
2857 | logger->tinc(l_bluefs_compaction_lock_lat, mono_clock::now() - t0); | |
20effc67 | 2858 | log.lock.unlock(); |
7c673cae | 2859 | |
39ae355f TL |
2860 | // 2.2 Allocate the space required for the compacted meta transaction |
2861 | uint64_t compacted_meta_need = _estimate_transaction_size(&compacted_meta_t); | |
2862 | dout(20) << __func__ << " compacted_meta_need " << compacted_meta_need | |
2863 | << dendl; | |
2864 | { | |
2865 | bluefs_fnode_t fnode_pre_tail; | |
2866 | // do allocate | |
2867 | r = _allocate(vselector->select_prefer_bdev(log_file->vselector_hint), | |
2868 | compacted_meta_need, | |
2869 | 0, | |
2870 | &fnode_pre_tail); | |
2871 | ceph_assert(r == 0); | |
2872 | // build trailing list of extents in fnode_tail, | |
2873 | // this will include newly allocated extents for compacted meta | |
2874 | // and aux extents allocated at step 1.1 | |
2875 | fnode_pre_tail.claim_extents(fnode_tail.extents); | |
2876 | fnode_tail.swap_extents(fnode_pre_tail); | |
2877 | } | |
eafe8130 | 2878 | |
39ae355f TL |
2879 | // 2.3 Allocate the space required for the starter part of the new log. |
2880 | // Start building New log fnode | |
2881 | FileRef new_log = nullptr; | |
2882 | new_log = ceph::make_ref<File>(); | |
2883 | new_log->fnode.ino = log_file->fnode.ino; | |
2884 | new_log->fnode.mtime = mtime; | |
2885 | // Estimate the required space | |
2886 | uint64_t starter_need = | |
2887 | _make_initial_transaction(starter_seq, fnode_tail, 0, nullptr); | |
2888 | // and now allocate and store at new_log_fnode | |
2889 | r = _allocate(vselector->select_prefer_bdev(log_file->vselector_hint), | |
2890 | starter_need, | |
2891 | 0, | |
2892 | &new_log->fnode); | |
11fdf7f2 TL |
2893 | ceph_assert(r == 0); |
2894 | ||
39ae355f TL |
2895 | // 2.4 Store starter fnode to run-time superblock, to be written out later |
2896 | super.log_fnode = new_log->fnode; | |
7c673cae | 2897 | |
39ae355f TL |
2898 | // 2.5 Finalize new log's fnode building |
2899 | // start collecting new log fnode updates (to make op_update_inc later) | |
2900 | // since this point. This will include compacted meta from 2.2 and aux | |
2901 | // extents from 1.1. | |
2902 | new_log->fnode.reset_delta(); | |
2903 | new_log->fnode.claim_extents(fnode_tail.extents); | |
7c673cae | 2904 | |
39ae355f TL |
2905 | // 2.6 Encode new log fnode |
2906 | bufferlist starter_bl; | |
2907 | _make_initial_transaction(starter_seq, new_log->fnode, starter_need, | |
2908 | &starter_bl); | |
7c673cae | 2909 | |
39ae355f TL |
2910 | // 2.7 Encode compacted meta transaction, |
2911 | dout(20) << __func__ | |
2912 | << " new_log jump seq " << seq_now | |
2913 | << std::hex << " offset 0x" << starter_need + compacted_meta_need | |
2914 | << std::dec << dendl; | |
2915 | // Extent compacted_meta transaction with a just to new log tail. | |
2916 | // Hopefully "compact_meta_need" estimation provides enough extra space | |
2917 | // for this new jump, assert below if not | |
2918 | compacted_meta_t.op_jump(seq_now, starter_need + compacted_meta_need); | |
2919 | // Now do encodeing and padding | |
2920 | bufferlist compacted_meta_bl; | |
2921 | compacted_meta_bl.reserve(compacted_meta_need); | |
2922 | encode(compacted_meta_t, compacted_meta_bl); | |
2923 | ceph_assert(compacted_meta_bl.length() <= compacted_meta_need); | |
2924 | _pad_bl(compacted_meta_bl, compacted_meta_need); | |
2925 | ||
2926 | // | |
2927 | // Part 3. | |
2928 | // Write out new log's content | |
2929 | // 3.1 Stage new log's content writing | |
2930 | // 3.2 Do flush and wait for completion through flush_bdev() | |
2931 | // | |
2932 | ||
2933 | // 3.1 Stage new log's content writing | |
2934 | // Make new log writer and append bufferlists to write out. | |
2935 | FileWriter *new_log_writer = _create_writer(new_log); | |
2936 | // And append all new log's bufferlists to write out. | |
2937 | new_log_writer->append(starter_bl); | |
2938 | new_log_writer->append(compacted_meta_bl); | |
2939 | ||
2940 | // 3.2. flush and wait | |
20effc67 | 2941 | _flush_special(new_log_writer); |
39ae355f | 2942 | _flush_bdev(new_log_writer, false); // do not check log.lock is locked |
7c673cae | 2943 | |
39ae355f TL |
2944 | // Part 4. |
2945 | // Write out new superblock to reflect all the changes. | |
2946 | // | |
7c673cae | 2947 | |
20effc67 TL |
2948 | _write_super(BDEV_DB); |
2949 | _flush_bdev(); | |
2950 | ||
39ae355f TL |
2951 | // Part 5. |
2952 | // Apply new log fnode | |
2953 | // | |
2954 | ||
2955 | // we need to acquire log's lock back at this point | |
20effc67 | 2956 | log.lock.lock(); |
39ae355f | 2957 | // Reconstruct actual log object from the new one. |
9f95a23c | 2958 | vselector->sub_usage(log_file->vselector_hint, log_file->fnode); |
39ae355f TL |
2959 | log_file->fnode.size = |
2960 | log.writer->pos - old_log_jump_to + starter_need + compacted_meta_need; | |
2961 | log_file->fnode.mtime = std::max(mtime, log_file->fnode.mtime); | |
2962 | log_file->fnode.swap_extents(new_log->fnode); | |
2963 | // update log's writer | |
2964 | log.writer->pos = log.writer->file->fnode.size; | |
9f95a23c | 2965 | vselector->add_usage(log_file->vselector_hint, log_file->fnode); |
39ae355f | 2966 | // and unlock |
20effc67 | 2967 | log.lock.unlock(); |
7c673cae | 2968 | |
39ae355f TL |
2969 | // we're mostly done |
2970 | dout(10) << __func__ << " log extents " << log_file->fnode.extents << dendl; | |
2971 | logger->inc(l_bluefs_log_compactions); | |
2972 | ||
2973 | //Part 6. | |
2974 | // Finalization | |
2975 | // 6.1 Permit log's extension, forbidden at step 0. | |
2976 | // | |
2977 | // 6.2 Release the new log writer | |
2978 | // | |
2979 | // 6.3 Release old space | |
2980 | // | |
2981 | // 6.4. Enable other compactions | |
2982 | // | |
2983 | ||
2984 | // 6.1 Permit log's extension, forbidden at step 0. | |
20effc67 TL |
2985 | old_forbidden = atomic_exchange(&log_forbidden_to_expand, false); |
2986 | ceph_assert(old_forbidden == true); | |
2987 | //to wake up if someone was in need of expanding log | |
2988 | log_cond.notify_all(); | |
7c673cae | 2989 | |
39ae355f TL |
2990 | // 6.2 Release the new log writer |
2991 | _close_writer(new_log_writer); | |
2992 | new_log_writer = nullptr; | |
2993 | new_log = nullptr; | |
2994 | ||
2995 | // 6.3 Release old space | |
20effc67 | 2996 | { |
39ae355f TL |
2997 | dout(10) << __func__ |
2998 | << " release old log extents " << old_log_fnode.extents | |
2999 | << dendl; | |
20effc67 | 3000 | std::lock_guard dl(dirty.lock); |
39ae355f | 3001 | for (auto& r : old_log_fnode.extents) { |
20effc67 TL |
3002 | dirty.pending_release[r.bdev].insert(r.offset, r.length); |
3003 | } | |
7c673cae FG |
3004 | } |
3005 | ||
39ae355f | 3006 | // 6.4. Enable other compactions |
20effc67 TL |
3007 | old_is_comp = atomic_exchange(&log_is_compacting, false); |
3008 | ceph_assert(old_is_comp); | |
7c673cae FG |
3009 | } |
3010 | ||
39ae355f | 3011 | void BlueFS::_pad_bl(bufferlist& bl, uint64_t pad_size) |
7c673cae | 3012 | { |
39ae355f TL |
3013 | pad_size = std::max(pad_size, uint64_t(super.block_size)); |
3014 | uint64_t partial = bl.length() % pad_size; | |
7c673cae FG |
3015 | if (partial) { |
3016 | dout(10) << __func__ << " padding with 0x" << std::hex | |
39ae355f TL |
3017 | << pad_size - partial << " zeros" << std::dec << dendl; |
3018 | bl.append_zero(pad_size - partial); | |
7c673cae FG |
3019 | } |
3020 | } | |
3021 | ||
7c673cae | 3022 | |
20effc67 TL |
3023 | // Returns log seq that was live before advance. |
3024 | uint64_t BlueFS::_log_advance_seq() | |
7c673cae | 3025 | { |
20effc67 TL |
3026 | ceph_assert(ceph_mutex_is_locked(dirty.lock)); |
3027 | ceph_assert(ceph_mutex_is_locked(log.lock)); | |
3028 | //acquire new seq | |
3029 | // this will became seq_stable once we write | |
3030 | ceph_assert(dirty.seq_stable < dirty.seq_live); | |
3031 | ceph_assert(log.t.seq == log.seq_live); | |
3032 | uint64_t seq = log.seq_live; | |
3033 | log.t.uuid = super.uuid; | |
3034 | ||
3035 | ++dirty.seq_live; | |
3036 | ++log.seq_live; | |
3037 | ceph_assert(dirty.seq_live == log.seq_live); | |
3038 | return seq; | |
3039 | } | |
7c673cae | 3040 | |
a8e16298 | 3041 | |
20effc67 TL |
3042 | // Adds to log.t file modifications mentioned in `dirty.files`. |
3043 | // Note: some bluefs ops may have already been stored in log.t transaction. | |
3044 | void BlueFS::_consume_dirty(uint64_t seq) | |
3045 | { | |
3046 | ceph_assert(ceph_mutex_is_locked(dirty.lock)); | |
3047 | ceph_assert(ceph_mutex_is_locked(log.lock)); | |
7c673cae FG |
3048 | |
3049 | // log dirty files | |
20effc67 TL |
3050 | // we just incremented log_seq. It is now illegal to add to dirty.files[log_seq] |
3051 | auto lsi = dirty.files.find(seq); | |
3052 | if (lsi != dirty.files.end()) { | |
3053 | dout(20) << __func__ << " " << lsi->second.size() << " dirty.files" << dendl; | |
7c673cae | 3054 | for (auto &f : lsi->second) { |
20effc67 TL |
3055 | // fnode here is protected indirectly |
3056 | // the only path that adds to dirty.files goes from _fsync() | |
3057 | // _fsync() is executed under writer lock, | |
3058 | // and does not exit until syncing log is done | |
3059 | dout(20) << __func__ << " op_file_update_inc " << f.fnode << dendl; | |
3060 | log.t.op_file_update_inc(f.fnode); | |
7c673cae FG |
3061 | } |
3062 | } | |
20effc67 | 3063 | } |
7c673cae | 3064 | |
20effc67 TL |
3065 | // Extends log if its free space is smaller then bluefs_min_log_runway. |
3066 | // Returns space available *BEFORE* adding new space. Signed for additional <0 detection. | |
3067 | int64_t BlueFS::_maybe_extend_log() | |
3068 | { | |
3069 | ceph_assert(ceph_mutex_is_locked(log.lock)); | |
7c673cae | 3070 | // allocate some more space (before we run out)? |
20effc67 TL |
3071 | // BTW: this triggers `flush()` in the `page_aligned_appender` of `log.writer`. |
3072 | int64_t runway = log.writer->file->fnode.get_allocated() - | |
3073 | log.writer->get_effective_write_pos(); | |
7c673cae FG |
3074 | if (runway < (int64_t)cct->_conf->bluefs_min_log_runway) { |
3075 | dout(10) << __func__ << " allocating more log runway (0x" | |
3076 | << std::hex << runway << std::dec << " remaining)" << dendl; | |
20effc67 TL |
3077 | /* |
3078 | * Usually, when we are low on space in log, we just allocate new extent, | |
3079 | * put update op(log) to log and we are fine. | |
3080 | * Problem - it interferes with log compaction: | |
3081 | * New log produced in compaction will include - as last op - jump into some offset (anchor) of current log. | |
3082 | * It is assumed that log region (anchor - end) will contain all changes made by bluefs since | |
3083 | * full state capture into new log. | |
3084 | * Putting log update into (anchor - end) region is illegal, because any update there must be compatible with | |
3085 | * both logs, but old log is different then new log. | |
3086 | * | |
3087 | * Possible solutions: | |
3088 | * - stall extending log until we finish compacting and switch log (CURRENT) | |
3089 | * - re-run compaction with more runway for old log | |
3090 | * - add OP_FILE_ADDEXT that adds extent; will be compatible with both logs | |
3091 | */ | |
3092 | if (log_forbidden_to_expand.load() == true) { | |
3093 | return -EWOULDBLOCK; | |
7c673cae | 3094 | } |
20effc67 | 3095 | vselector->sub_usage(log.writer->file->vselector_hint, log.writer->file->fnode); |
9f95a23c | 3096 | int r = _allocate( |
20effc67 | 3097 | vselector->select_prefer_bdev(log.writer->file->vselector_hint), |
9f95a23c | 3098 | cct->_conf->bluefs_max_log_runway, |
39ae355f | 3099 | 0, |
20effc67 | 3100 | &log.writer->file->fnode); |
11fdf7f2 | 3101 | ceph_assert(r == 0); |
20effc67 TL |
3102 | vselector->add_usage(log.writer->file->vselector_hint, log.writer->file->fnode); |
3103 | log.t.op_file_update_inc(log.writer->file->fnode); | |
7c673cae | 3104 | } |
20effc67 TL |
3105 | return runway; |
3106 | } | |
3107 | ||
3108 | void BlueFS::_flush_and_sync_log_core(int64_t runway) | |
3109 | { | |
3110 | ceph_assert(ceph_mutex_is_locked(log.lock)); | |
3111 | dout(10) << __func__ << " " << log.t << dendl; | |
7c673cae FG |
3112 | |
3113 | bufferlist bl; | |
11fdf7f2 | 3114 | bl.reserve(super.block_size); |
20effc67 | 3115 | encode(log.t, bl); |
7c673cae | 3116 | // pad to block boundary |
11fdf7f2 TL |
3117 | size_t realign = super.block_size - (bl.length() % super.block_size); |
3118 | if (realign && realign != super.block_size) | |
3119 | bl.append_zero(realign); | |
3120 | ||
1e59de90 | 3121 | logger->inc(l_bluefs_log_write_count, 1); |
7c673cae FG |
3122 | logger->inc(l_bluefs_logged_bytes, bl.length()); |
3123 | ||
20effc67 | 3124 | if (true) { |
f6b5b4d7 | 3125 | ceph_assert(bl.length() <= runway); // if we write this, we will have an unrecoverable data loss |
20effc67 | 3126 | // transaction will not fit extents before growth -> data loss on _replay |
f6b5b4d7 TL |
3127 | } |
3128 | ||
20effc67 | 3129 | log.writer->append(bl); |
7c673cae | 3130 | |
20effc67 TL |
3131 | // prepare log for new transactions |
3132 | log.t.clear(); | |
3133 | log.t.seq = log.seq_live; | |
7c673cae | 3134 | |
20effc67 TL |
3135 | uint64_t new_data = _flush_special(log.writer); |
3136 | vselector->add_usage(log.writer->file->vselector_hint, new_data); | |
3137 | } | |
7c673cae | 3138 | |
20effc67 TL |
3139 | // Clears dirty.files up to (including) seq_stable. |
3140 | void BlueFS::_clear_dirty_set_stable_D(uint64_t seq) | |
3141 | { | |
3142 | std::lock_guard dl(dirty.lock); | |
7c673cae FG |
3143 | |
3144 | // clean dirty files | |
20effc67 TL |
3145 | if (seq > dirty.seq_stable) { |
3146 | dirty.seq_stable = seq; | |
3147 | dout(20) << __func__ << " seq_stable " << dirty.seq_stable << dendl; | |
3148 | ||
3149 | // undirty all files that were already streamed to log | |
3150 | auto p = dirty.files.begin(); | |
3151 | while (p != dirty.files.end()) { | |
3152 | if (p->first > dirty.seq_stable) { | |
7c673cae FG |
3153 | dout(20) << __func__ << " done cleaning up dirty files" << dendl; |
3154 | break; | |
3155 | } | |
3156 | ||
3157 | auto l = p->second.begin(); | |
3158 | while (l != p->second.end()) { | |
3159 | File *file = &*l; | |
20effc67 TL |
3160 | ceph_assert(file->dirty_seq <= dirty.seq_stable); |
3161 | dout(20) << __func__ << " cleaned file " << file->fnode.ino << dendl; | |
3162 | file->dirty_seq = dirty.seq_stable; | |
7c673cae FG |
3163 | p->second.erase(l++); |
3164 | } | |
3165 | ||
11fdf7f2 | 3166 | ceph_assert(p->second.empty()); |
20effc67 | 3167 | dirty.files.erase(p++); |
7c673cae FG |
3168 | } |
3169 | } else { | |
20effc67 | 3170 | dout(20) << __func__ << " seq_stable " << dirty.seq_stable |
7c673cae FG |
3171 | << " already >= out seq " << seq |
3172 | << ", we lost a race against another log flush, done" << dendl; | |
3173 | } | |
20effc67 | 3174 | } |
a8e16298 | 3175 | |
20effc67 TL |
3176 | void BlueFS::_release_pending_allocations(vector<interval_set<uint64_t>>& to_release) |
3177 | { | |
a8e16298 | 3178 | for (unsigned i = 0; i < to_release.size(); ++i) { |
1e59de90 TL |
3179 | if (to_release[i].empty()) { |
3180 | continue; | |
3181 | } | |
3182 | /* OK, now we have the guarantee alloc[i] won't be null. */ | |
3183 | ||
3184 | bool discard_queued = bdev[i]->try_discard(to_release[i]); | |
3185 | if (!discard_queued) { | |
a8e16298 | 3186 | alloc[i]->release(to_release[i]); |
f67539c2 TL |
3187 | if (is_shared_alloc(i)) { |
3188 | shared_alloc->bluefs_used -= to_release[i].size(); | |
3189 | } | |
a8e16298 TL |
3190 | } |
3191 | } | |
20effc67 TL |
3192 | } |
3193 | ||
3194 | int BlueFS::_flush_and_sync_log_LD(uint64_t want_seq) | |
3195 | { | |
3196 | int64_t available_runway; | |
3197 | do { | |
3198 | log.lock.lock(); | |
3199 | dirty.lock.lock(); | |
3200 | if (want_seq && want_seq <= dirty.seq_stable) { | |
3201 | dout(10) << __func__ << " want_seq " << want_seq << " <= seq_stable " | |
3202 | << dirty.seq_stable << ", done" << dendl; | |
3203 | dirty.lock.unlock(); | |
3204 | log.lock.unlock(); | |
3205 | return 0; | |
3206 | } | |
3207 | ||
3208 | available_runway = _maybe_extend_log(); | |
3209 | if (available_runway == -EWOULDBLOCK) { | |
3210 | // we are in need of adding runway, but we are during log-switch from compaction | |
3211 | dirty.lock.unlock(); | |
3212 | //instead log.lock.unlock() do move ownership | |
3213 | std::unique_lock<ceph::mutex> ll(log.lock, std::adopt_lock); | |
3214 | while (log_forbidden_to_expand.load()) { | |
3215 | log_cond.wait(ll); | |
3216 | } | |
3217 | } else { | |
3218 | ceph_assert(available_runway >= 0); | |
3219 | } | |
3220 | } while (available_runway < 0); | |
3221 | ||
3222 | ceph_assert(want_seq == 0 || want_seq <= dirty.seq_live); // illegal to request seq that was not created yet | |
3223 | uint64_t seq =_log_advance_seq(); | |
3224 | _consume_dirty(seq); | |
3225 | vector<interval_set<uint64_t>> to_release(dirty.pending_release.size()); | |
3226 | to_release.swap(dirty.pending_release); | |
3227 | dirty.lock.unlock(); | |
3228 | ||
3229 | _flush_and_sync_log_core(available_runway); | |
3230 | _flush_bdev(log.writer); | |
3231 | logger->set(l_bluefs_log_bytes, log.writer->file->fnode.size); | |
3232 | //now log.lock is no longer needed | |
3233 | log.lock.unlock(); | |
3234 | ||
3235 | _clear_dirty_set_stable_D(seq); | |
3236 | _release_pending_allocations(to_release); | |
a8e16298 | 3237 | |
7c673cae | 3238 | _update_logger_stats(); |
20effc67 TL |
3239 | return 0; |
3240 | } | |
3241 | ||
3242 | // Flushes log and immediately adjusts log_writer pos. | |
3243 | int BlueFS::_flush_and_sync_log_jump_D(uint64_t jump_to, | |
3244 | int64_t available_runway) | |
3245 | { | |
3246 | ceph_assert(ceph_mutex_is_locked(log.lock)); | |
3247 | ||
3248 | ceph_assert(jump_to); | |
3249 | // we synchronize writing to log, by lock to log.lock | |
3250 | ||
3251 | dirty.lock.lock(); | |
3252 | uint64_t seq =_log_advance_seq(); | |
3253 | _consume_dirty(seq); | |
3254 | vector<interval_set<uint64_t>> to_release(dirty.pending_release.size()); | |
3255 | to_release.swap(dirty.pending_release); | |
3256 | dirty.lock.unlock(); | |
3257 | _flush_and_sync_log_core(available_runway); | |
7c673cae | 3258 | |
20effc67 TL |
3259 | dout(10) << __func__ << " jumping log offset from 0x" << std::hex |
3260 | << log.writer->pos << " -> 0x" << jump_to << std::dec << dendl; | |
3261 | log.writer->pos = jump_to; | |
3262 | vselector->sub_usage(log.writer->file->vselector_hint, log.writer->file->fnode.size); | |
3263 | log.writer->file->fnode.size = jump_to; | |
3264 | vselector->add_usage(log.writer->file->vselector_hint, log.writer->file->fnode.size); | |
3265 | ||
3266 | _flush_bdev(log.writer); | |
3267 | ||
3268 | _clear_dirty_set_stable_D(seq); | |
3269 | _release_pending_allocations(to_release); | |
3270 | ||
3271 | logger->set(l_bluefs_log_bytes, log.writer->file->fnode.size); | |
3272 | _update_logger_stats(); | |
7c673cae FG |
3273 | return 0; |
3274 | } | |
3275 | ||
f67539c2 TL |
3276 | ceph::bufferlist BlueFS::FileWriter::flush_buffer( |
3277 | CephContext* const cct, | |
3278 | const bool partial, | |
3279 | const unsigned length, | |
3280 | const bluefs_super_t& super) | |
3281 | { | |
20effc67 | 3282 | ceph_assert(ceph_mutex_is_locked(this->lock) || file->fnode.ino <= 1); |
f67539c2 TL |
3283 | ceph::bufferlist bl; |
3284 | if (partial) { | |
3285 | tail_block.splice(0, tail_block.length(), &bl); | |
3286 | } | |
3287 | const auto remaining_len = length - bl.length(); | |
3288 | buffer.splice(0, remaining_len, &bl); | |
3289 | if (buffer.length()) { | |
3290 | dout(20) << " leaving 0x" << std::hex << buffer.length() << std::dec | |
3291 | << " unflushed" << dendl; | |
3292 | } | |
3293 | if (const unsigned tail = bl.length() & ~super.block_mask(); tail) { | |
3294 | const auto padding_len = super.block_size - tail; | |
3295 | dout(20) << __func__ << " caching tail of 0x" | |
3296 | << std::hex << tail | |
3297 | << " and padding block with 0x" << padding_len | |
3298 | << " buffer.length() " << buffer.length() | |
3299 | << std::dec << dendl; | |
3300 | // We need to go through the `buffer_appender` to get a chance to | |
3301 | // preserve in-memory contiguity and not mess with the alignment. | |
3302 | // Otherwise a costly rebuild could happen in e.g. `KernelDevice`. | |
3303 | buffer_appender.append_zero(padding_len); | |
3304 | buffer.splice(buffer.length() - padding_len, padding_len, &bl); | |
3305 | // Deep copy the tail here. This allows to avoid costlier copy on | |
3306 | // bufferlist rebuild in e.g. `KernelDevice` and minimizes number | |
3307 | // of memory allocations. | |
3308 | // The alternative approach would be to place the entire tail and | |
3309 | // padding on a dedicated, 4 KB long memory chunk. This shouldn't | |
3310 | // trigger the rebuild while still being less expensive. | |
3311 | buffer_appender.substr_of(bl, bl.length() - padding_len - tail, tail); | |
3312 | buffer.splice(buffer.length() - tail, tail, &tail_block); | |
3313 | } else { | |
3314 | tail_block.clear(); | |
3315 | } | |
3316 | return bl; | |
3317 | } | |
3318 | ||
20effc67 | 3319 | int BlueFS::_signal_dirty_to_log_D(FileWriter *h) |
522d829b | 3320 | { |
20effc67 TL |
3321 | ceph_assert(ceph_mutex_is_locked(h->lock)); |
3322 | std::lock_guard dl(dirty.lock); | |
1e59de90 TL |
3323 | if (h->file->deleted) { |
3324 | dout(10) << __func__ << " deleted, no-op" << dendl; | |
3325 | return 0; | |
3326 | } | |
3327 | ||
522d829b TL |
3328 | h->file->fnode.mtime = ceph_clock_now(); |
3329 | ceph_assert(h->file->fnode.ino >= 1); | |
20effc67 TL |
3330 | if (h->file->dirty_seq <= dirty.seq_stable) { |
3331 | h->file->dirty_seq = dirty.seq_live; | |
3332 | dirty.files[h->file->dirty_seq].push_back(*h->file); | |
3333 | dout(20) << __func__ << " dirty_seq = " << dirty.seq_live | |
522d829b TL |
3334 | << " (was clean)" << dendl; |
3335 | } else { | |
20effc67 | 3336 | if (h->file->dirty_seq != dirty.seq_live) { |
522d829b | 3337 | // need re-dirty, erase from list first |
20effc67 TL |
3338 | ceph_assert(dirty.files.count(h->file->dirty_seq)); |
3339 | auto it = dirty.files[h->file->dirty_seq].iterator_to(*h->file); | |
3340 | dirty.files[h->file->dirty_seq].erase(it); | |
3341 | h->file->dirty_seq = dirty.seq_live; | |
3342 | dirty.files[h->file->dirty_seq].push_back(*h->file); | |
3343 | dout(20) << __func__ << " dirty_seq = " << dirty.seq_live | |
522d829b TL |
3344 | << " (was " << h->file->dirty_seq << ")" << dendl; |
3345 | } else { | |
20effc67 | 3346 | dout(20) << __func__ << " dirty_seq = " << dirty.seq_live |
522d829b TL |
3347 | << " (unchanged, do nothing) " << dendl; |
3348 | } | |
3349 | } | |
3350 | return 0; | |
3351 | } | |
3352 | ||
20effc67 | 3353 | void BlueFS::flush_range(FileWriter *h, uint64_t offset, uint64_t length)/*_WF*/ |
7c673cae | 3354 | { |
20effc67 TL |
3355 | _maybe_check_vselector_LNF(); |
3356 | std::unique_lock hl(h->lock); | |
3357 | _flush_range_F(h, offset, length); | |
3358 | } | |
3359 | ||
3360 | int BlueFS::_flush_range_F(FileWriter *h, uint64_t offset, uint64_t length) | |
3361 | { | |
3362 | ceph_assert(ceph_mutex_is_locked(h->lock)); | |
3363 | ceph_assert(h->file->num_readers.load() == 0); | |
3364 | ceph_assert(h->file->fnode.ino > 1); | |
3365 | ||
7c673cae FG |
3366 | dout(10) << __func__ << " " << h << " pos 0x" << std::hex << h->pos |
3367 | << " 0x" << offset << "~" << length << std::dec | |
3368 | << " to " << h->file->fnode << dendl; | |
f67539c2 TL |
3369 | if (h->file->deleted) { |
3370 | dout(10) << __func__ << " deleted, no-op" << dendl; | |
3371 | return 0; | |
3372 | } | |
7c673cae | 3373 | |
20effc67 | 3374 | bool buffered = cct->_conf->bluefs_buffered_io; |
7c673cae FG |
3375 | |
3376 | if (offset + length <= h->pos) | |
3377 | return 0; | |
3378 | if (offset < h->pos) { | |
3379 | length -= h->pos - offset; | |
3380 | offset = h->pos; | |
3381 | dout(10) << " still need 0x" | |
3382 | << std::hex << offset << "~" << length << std::dec | |
3383 | << dendl; | |
3384 | } | |
20effc67 | 3385 | std::lock_guard file_lock(h->file->lock); |
11fdf7f2 | 3386 | ceph_assert(offset <= h->file->fnode.size); |
7c673cae FG |
3387 | |
3388 | uint64_t allocated = h->file->fnode.get_allocated(); | |
9f95a23c | 3389 | vselector->sub_usage(h->file->vselector_hint, h->file->fnode); |
7c673cae FG |
3390 | // do not bother to dirty the file if we are overwriting |
3391 | // previously allocated extents. | |
7c673cae FG |
3392 | if (allocated < offset + length) { |
3393 | // we should never run out of log space here; see the min runway check | |
3394 | // in _flush_and_sync_log. | |
9f95a23c | 3395 | int r = _allocate(vselector->select_prefer_bdev(h->file->vselector_hint), |
7c673cae | 3396 | offset + length - allocated, |
39ae355f | 3397 | 0, |
94b18763 | 3398 | &h->file->fnode); |
7c673cae FG |
3399 | if (r < 0) { |
3400 | derr << __func__ << " allocated: 0x" << std::hex << allocated | |
3401 | << " offset: 0x" << offset << " length: 0x" << length << std::dec | |
3402 | << dendl; | |
9f95a23c | 3403 | vselector->add_usage(h->file->vselector_hint, h->file->fnode); // undo |
11fdf7f2 | 3404 | ceph_abort_msg("bluefs enospc"); |
7c673cae FG |
3405 | return r; |
3406 | } | |
522d829b | 3407 | h->file->is_dirty = true; |
7c673cae FG |
3408 | } |
3409 | if (h->file->fnode.size < offset + length) { | |
3410 | h->file->fnode.size = offset + length; | |
20effc67 | 3411 | h->file->is_dirty = true; |
7c673cae | 3412 | } |
20effc67 | 3413 | |
522d829b | 3414 | dout(20) << __func__ << " file now, unflushed " << h->file->fnode << dendl; |
20effc67 TL |
3415 | int res = _flush_data(h, offset, length, buffered); |
3416 | vselector->add_usage(h->file->vselector_hint, h->file->fnode); | |
3417 | return res; | |
3418 | } | |
7c673cae | 3419 | |
20effc67 TL |
3420 | int BlueFS::_flush_data(FileWriter *h, uint64_t offset, uint64_t length, bool buffered) |
3421 | { | |
3422 | if (h->file->fnode.ino > 1) { | |
3423 | ceph_assert(ceph_mutex_is_locked(h->lock)); | |
3424 | ceph_assert(ceph_mutex_is_locked(h->file->lock)); | |
3425 | } | |
7c673cae FG |
3426 | uint64_t x_off = 0; |
3427 | auto p = h->file->fnode.seek(offset, &x_off); | |
11fdf7f2 | 3428 | ceph_assert(p != h->file->fnode.extents.end()); |
7c673cae FG |
3429 | dout(20) << __func__ << " in " << *p << " x_off 0x" |
3430 | << std::hex << x_off << std::dec << dendl; | |
3431 | ||
3432 | unsigned partial = x_off & ~super.block_mask(); | |
7c673cae FG |
3433 | if (partial) { |
3434 | dout(20) << __func__ << " using partial tail 0x" | |
3435 | << std::hex << partial << std::dec << dendl; | |
7c673cae FG |
3436 | x_off -= partial; |
3437 | offset -= partial; | |
3438 | length += partial; | |
3439 | dout(20) << __func__ << " waiting for previous aio to complete" << dendl; | |
3440 | for (auto p : h->iocv) { | |
3441 | if (p) { | |
3442 | p->aio_wait(); | |
3443 | } | |
3444 | } | |
3445 | } | |
7c673cae | 3446 | |
f67539c2 TL |
3447 | auto bl = h->flush_buffer(cct, partial, length, super); |
3448 | ceph_assert(bl.length() >= length); | |
9f95a23c | 3449 | h->pos = offset + length; |
f67539c2 | 3450 | length = bl.length(); |
9f95a23c | 3451 | |
1e59de90 TL |
3452 | logger->inc(l_bluefs_write_count, 1); |
3453 | logger->inc(l_bluefs_write_bytes, length); | |
3454 | ||
7c673cae FG |
3455 | switch (h->writer_type) { |
3456 | case WRITER_WAL: | |
1e59de90 | 3457 | logger->inc(l_bluefs_write_count_wal, 1); |
7c673cae FG |
3458 | logger->inc(l_bluefs_bytes_written_wal, length); |
3459 | break; | |
3460 | case WRITER_SST: | |
1e59de90 | 3461 | logger->inc(l_bluefs_write_count_sst, 1); |
7c673cae FG |
3462 | logger->inc(l_bluefs_bytes_written_sst, length); |
3463 | break; | |
3464 | } | |
3465 | ||
3466 | dout(30) << "dump:\n"; | |
3467 | bl.hexdump(*_dout); | |
3468 | *_dout << dendl; | |
3469 | ||
7c673cae | 3470 | uint64_t bloff = 0; |
11fdf7f2 | 3471 | uint64_t bytes_written_slow = 0; |
7c673cae | 3472 | while (length > 0) { |
1e59de90 TL |
3473 | logger->inc(l_bluefs_write_disk_count, 1); |
3474 | ||
11fdf7f2 | 3475 | uint64_t x_len = std::min(p->length - x_off, length); |
7c673cae FG |
3476 | bufferlist t; |
3477 | t.substr_of(bl, bloff, x_len); | |
7c673cae | 3478 | if (cct->_conf->bluefs_sync_write) { |
11fdf7f2 | 3479 | bdev[p->bdev]->write(p->offset + x_off, t, buffered, h->write_hint); |
7c673cae | 3480 | } else { |
11fdf7f2 TL |
3481 | bdev[p->bdev]->aio_write(p->offset + x_off, t, h->iocv[p->bdev], buffered, h->write_hint); |
3482 | } | |
3483 | h->dirty_devs[p->bdev] = true; | |
3484 | if (p->bdev == BDEV_SLOW) { | |
3485 | bytes_written_slow += t.length(); | |
7c673cae | 3486 | } |
11fdf7f2 | 3487 | |
7c673cae FG |
3488 | bloff += x_len; |
3489 | length -= x_len; | |
3490 | ++p; | |
3491 | x_off = 0; | |
3492 | } | |
f67539c2 TL |
3493 | if (bytes_written_slow) { |
3494 | logger->inc(l_bluefs_bytes_written_slow, bytes_written_slow); | |
3495 | } | |
7c673cae FG |
3496 | for (unsigned i = 0; i < MAX_BDEV; ++i) { |
3497 | if (bdev[i]) { | |
11fdf7f2 | 3498 | if (h->iocv[i] && h->iocv[i]->has_pending_aios()) { |
7c673cae FG |
3499 | bdev[i]->aio_submit(h->iocv[i]); |
3500 | } | |
3501 | } | |
3502 | } | |
3503 | dout(20) << __func__ << " h " << h << " pos now 0x" | |
3504 | << std::hex << h->pos << std::dec << dendl; | |
3505 | return 0; | |
3506 | } | |
3507 | ||
11fdf7f2 | 3508 | #ifdef HAVE_LIBAIO |
7c673cae FG |
3509 | // we need to retire old completed aios so they don't stick around in |
3510 | // memory indefinitely (along with their bufferlist refs). | |
3511 | void BlueFS::_claim_completed_aios(FileWriter *h, list<aio_t> *ls) | |
3512 | { | |
3513 | for (auto p : h->iocv) { | |
3514 | if (p) { | |
3515 | ls->splice(ls->end(), p->running_aios); | |
3516 | } | |
3517 | } | |
3518 | dout(10) << __func__ << " got " << ls->size() << " aios" << dendl; | |
3519 | } | |
3520 | ||
20effc67 | 3521 | void BlueFS::_wait_for_aio(FileWriter *h) |
7c673cae FG |
3522 | { |
3523 | // NOTE: this is safe to call without a lock, as long as our reference is | |
3524 | // stable. | |
f67539c2 TL |
3525 | utime_t start; |
3526 | lgeneric_subdout(cct, bluefs, 10) << __func__; | |
3527 | start = ceph_clock_now(); | |
3528 | *_dout << " " << h << dendl; | |
7c673cae FG |
3529 | for (auto p : h->iocv) { |
3530 | if (p) { | |
3531 | p->aio_wait(); | |
3532 | } | |
3533 | } | |
11fdf7f2 | 3534 | dout(10) << __func__ << " " << h << " done in " << (ceph_clock_now() - start) << dendl; |
7c673cae | 3535 | } |
11fdf7f2 | 3536 | #endif |
7c673cae | 3537 | |
20effc67 TL |
3538 | void BlueFS::append_try_flush(FileWriter *h, const char* buf, size_t len)/*_WF_LNF_NF_LD_D*/ |
3539 | { | |
3540 | bool flushed_sum = false; | |
3541 | { | |
3542 | std::unique_lock hl(h->lock); | |
3543 | size_t max_size = 1ull << 30; // cap to 1GB | |
3544 | while (len > 0) { | |
3545 | bool need_flush = true; | |
3546 | auto l0 = h->get_buffer_length(); | |
3547 | if (l0 < max_size) { | |
3548 | size_t l = std::min(len, max_size - l0); | |
3549 | h->append(buf, l); | |
3550 | buf += l; | |
3551 | len -= l; | |
3552 | need_flush = h->get_buffer_length() >= cct->_conf->bluefs_min_flush_size; | |
3553 | } | |
3554 | if (need_flush) { | |
3555 | bool flushed = false; | |
3556 | int r = _flush_F(h, true, &flushed); | |
3557 | ceph_assert(r == 0); | |
3558 | flushed_sum |= flushed; | |
3559 | // make sure we've made any progress with flush hence the | |
3560 | // loop doesn't iterate forever | |
3561 | ceph_assert(h->get_buffer_length() < max_size); | |
3562 | } | |
3563 | } | |
3564 | } | |
3565 | if (flushed_sum) { | |
3566 | _maybe_compact_log_LNF_NF_LD_D(); | |
3567 | } | |
3568 | } | |
3569 | ||
3570 | void BlueFS::flush(FileWriter *h, bool force)/*_WF_LNF_NF_LD_D*/ | |
f6b5b4d7 TL |
3571 | { |
3572 | bool flushed = false; | |
20effc67 TL |
3573 | int r; |
3574 | { | |
3575 | std::unique_lock hl(h->lock); | |
3576 | r = _flush_F(h, force, &flushed); | |
3577 | ceph_assert(r == 0); | |
3578 | } | |
f6b5b4d7 | 3579 | if (r == 0 && flushed) { |
20effc67 | 3580 | _maybe_compact_log_LNF_NF_LD_D(); |
f6b5b4d7 | 3581 | } |
f6b5b4d7 TL |
3582 | } |
3583 | ||
20effc67 | 3584 | int BlueFS::_flush_F(FileWriter *h, bool force, bool *flushed) |
7c673cae | 3585 | { |
20effc67 | 3586 | ceph_assert(ceph_mutex_is_locked(h->lock)); |
f67539c2 | 3587 | uint64_t length = h->get_buffer_length(); |
7c673cae | 3588 | uint64_t offset = h->pos; |
f6b5b4d7 TL |
3589 | if (flushed) { |
3590 | *flushed = false; | |
3591 | } | |
7c673cae FG |
3592 | if (!force && |
3593 | length < cct->_conf->bluefs_min_flush_size) { | |
3594 | dout(10) << __func__ << " " << h << " ignoring, length " << length | |
3595 | << " < min_flush_size " << cct->_conf->bluefs_min_flush_size | |
3596 | << dendl; | |
3597 | return 0; | |
3598 | } | |
3599 | if (length == 0) { | |
3600 | dout(10) << __func__ << " " << h << " no dirty data on " | |
3601 | << h->file->fnode << dendl; | |
3602 | return 0; | |
3603 | } | |
3604 | dout(10) << __func__ << " " << h << " 0x" | |
3605 | << std::hex << offset << "~" << length << std::dec | |
3606 | << " to " << h->file->fnode << dendl; | |
11fdf7f2 | 3607 | ceph_assert(h->pos <= h->file->fnode.size); |
20effc67 | 3608 | int r = _flush_range_F(h, offset, length); |
f6b5b4d7 TL |
3609 | if (flushed) { |
3610 | *flushed = true; | |
3611 | } | |
3612 | return r; | |
7c673cae FG |
3613 | } |
3614 | ||
20effc67 TL |
3615 | // Flush for bluefs special files. |
3616 | // Does not add extents to h. | |
3617 | // Does not mark h as dirty. | |
3618 | // we do not need to dirty the log file (or it's compacting | |
3619 | // replacement) when the file size changes because replay is | |
3620 | // smart enough to discover it on its own. | |
3621 | uint64_t BlueFS::_flush_special(FileWriter *h) | |
3622 | { | |
3623 | ceph_assert(h->file->fnode.ino <= 1); | |
3624 | uint64_t length = h->get_buffer_length(); | |
3625 | uint64_t offset = h->pos; | |
3626 | uint64_t new_data = 0; | |
3627 | ceph_assert(length + offset <= h->file->fnode.get_allocated()); | |
3628 | if (h->file->fnode.size < offset + length) { | |
3629 | new_data = offset + length - h->file->fnode.size; | |
3630 | h->file->fnode.size = offset + length; | |
3631 | } | |
3632 | _flush_data(h, offset, length, false); | |
3633 | return new_data; | |
3634 | } | |
3635 | ||
3636 | int BlueFS::truncate(FileWriter *h, uint64_t offset)/*_WF_L*/ | |
7c673cae | 3637 | { |
20effc67 | 3638 | std::lock_guard hl(h->lock); |
7c673cae FG |
3639 | dout(10) << __func__ << " 0x" << std::hex << offset << std::dec |
3640 | << " file " << h->file->fnode << dendl; | |
3641 | if (h->file->deleted) { | |
3642 | dout(10) << __func__ << " deleted, no-op" << dendl; | |
3643 | return 0; | |
3644 | } | |
3645 | ||
3646 | // we never truncate internal log files | |
11fdf7f2 | 3647 | ceph_assert(h->file->fnode.ino > 1); |
7c673cae | 3648 | |
7c673cae FG |
3649 | // truncate off unflushed data? |
3650 | if (h->pos < offset && | |
f67539c2 | 3651 | h->pos + h->get_buffer_length() > offset) { |
7c673cae FG |
3652 | dout(20) << __func__ << " tossing out last " << offset - h->pos |
3653 | << " unflushed bytes" << dendl; | |
11fdf7f2 | 3654 | ceph_abort_msg("actually this shouldn't happen"); |
7c673cae | 3655 | } |
f67539c2 | 3656 | if (h->get_buffer_length()) { |
20effc67 | 3657 | int r = _flush_F(h, true); |
7c673cae FG |
3658 | if (r < 0) |
3659 | return r; | |
3660 | } | |
3661 | if (offset == h->file->fnode.size) { | |
3662 | return 0; // no-op! | |
3663 | } | |
3664 | if (offset > h->file->fnode.size) { | |
11fdf7f2 | 3665 | ceph_abort_msg("truncate up not supported"); |
7c673cae | 3666 | } |
11fdf7f2 | 3667 | ceph_assert(h->file->fnode.size >= offset); |
20effc67 TL |
3668 | _flush_bdev(h); |
3669 | ||
3670 | std::lock_guard ll(log.lock); | |
9f95a23c | 3671 | vselector->sub_usage(h->file->vselector_hint, h->file->fnode.size); |
7c673cae | 3672 | h->file->fnode.size = offset; |
1e59de90 | 3673 | h->file->is_dirty = true; |
9f95a23c | 3674 | vselector->add_usage(h->file->vselector_hint, h->file->fnode.size); |
20effc67 | 3675 | log.t.op_file_update_inc(h->file->fnode); |
7c673cae FG |
3676 | return 0; |
3677 | } | |
3678 | ||
20effc67 | 3679 | int BlueFS::fsync(FileWriter *h)/*_WF_WD_WLD_WLNF_WNF*/ |
7c673cae | 3680 | { |
20effc67 TL |
3681 | _maybe_check_vselector_LNF(); |
3682 | std::unique_lock hl(h->lock); | |
3683 | uint64_t old_dirty_seq = 0; | |
3684 | { | |
1e59de90 TL |
3685 | dout(10) << __func__ << " " << h << " " << h->file->fnode |
3686 | << " dirty " << h->file->is_dirty << dendl; | |
20effc67 TL |
3687 | int r = _flush_F(h, true); |
3688 | if (r < 0) | |
3689 | return r; | |
3690 | _flush_bdev(h); | |
3691 | if (h->file->is_dirty) { | |
3692 | _signal_dirty_to_log_D(h); | |
3693 | h->file->is_dirty = false; | |
3694 | } | |
3695 | { | |
3696 | std::lock_guard dl(dirty.lock); | |
3697 | if (dirty.seq_stable < h->file->dirty_seq) { | |
3698 | old_dirty_seq = h->file->dirty_seq; | |
3699 | dout(20) << __func__ << " file metadata was dirty (" << old_dirty_seq | |
3700 | << ") on " << h->file->fnode << ", flushing log" << dendl; | |
3701 | } | |
3702 | } | |
522d829b | 3703 | } |
7c673cae | 3704 | if (old_dirty_seq) { |
20effc67 | 3705 | _flush_and_sync_log_LD(old_dirty_seq); |
7c673cae | 3706 | } |
20effc67 TL |
3707 | _maybe_compact_log_LNF_NF_LD_D(); |
3708 | ||
7c673cae FG |
3709 | return 0; |
3710 | } | |
3711 | ||
20effc67 | 3712 | // be careful - either h->file->lock or log.lock must be taken |
39ae355f | 3713 | void BlueFS::_flush_bdev(FileWriter *h, bool check_mutext_locked) |
7c673cae | 3714 | { |
39ae355f TL |
3715 | if (check_mutext_locked) { |
3716 | if (h->file->fnode.ino > 1) { | |
3717 | ceph_assert(ceph_mutex_is_locked(h->lock)); | |
3718 | } else if (h->file->fnode.ino == 1) { | |
3719 | ceph_assert(ceph_mutex_is_locked(log.lock)); | |
3720 | } | |
20effc67 | 3721 | } |
11fdf7f2 TL |
3722 | std::array<bool, MAX_BDEV> flush_devs = h->dirty_devs; |
3723 | h->dirty_devs.fill(false); | |
3724 | #ifdef HAVE_LIBAIO | |
7c673cae FG |
3725 | if (!cct->_conf->bluefs_sync_write) { |
3726 | list<aio_t> completed_ios; | |
3727 | _claim_completed_aios(h, &completed_ios); | |
20effc67 | 3728 | _wait_for_aio(h); |
7c673cae | 3729 | completed_ios.clear(); |
7c673cae | 3730 | } |
20effc67 TL |
3731 | #endif |
3732 | _flush_bdev(flush_devs); | |
7c673cae FG |
3733 | } |
3734 | ||
20effc67 | 3735 | void BlueFS::_flush_bdev(std::array<bool, MAX_BDEV>& dirty_bdevs) |
11fdf7f2 TL |
3736 | { |
3737 | // NOTE: this is safe to call without a lock. | |
3738 | dout(20) << __func__ << dendl; | |
3739 | for (unsigned i = 0; i < MAX_BDEV; i++) { | |
3740 | if (dirty_bdevs[i]) | |
3741 | bdev[i]->flush(); | |
3742 | } | |
3743 | } | |
3744 | ||
20effc67 | 3745 | void BlueFS::_flush_bdev() |
7c673cae FG |
3746 | { |
3747 | // NOTE: this is safe to call without a lock. | |
3748 | dout(20) << __func__ << dendl; | |
f67539c2 TL |
3749 | for (unsigned i = 0; i < MAX_BDEV; i++) { |
3750 | // alloc space from BDEV_SLOW is unexpected. | |
3751 | // So most cases we don't alloc from BDEV_SLOW and so avoiding flush not-used device. | |
3752 | if (bdev[i] && (i != BDEV_SLOW || _get_used(i))) { | |
3753 | bdev[i]->flush(); | |
3754 | } | |
7c673cae FG |
3755 | } |
3756 | } | |
3757 | ||
eafe8130 TL |
3758 | const char* BlueFS::get_device_name(unsigned id) |
3759 | { | |
3760 | if (id >= MAX_BDEV) return "BDEV_INV"; | |
3761 | const char* names[] = {"BDEV_WAL", "BDEV_DB", "BDEV_SLOW", "BDEV_NEWWAL", "BDEV_NEWDB"}; | |
3762 | return names[id]; | |
3763 | } | |
3764 | ||
7c673cae | 3765 | int BlueFS::_allocate(uint8_t id, uint64_t len, |
39ae355f TL |
3766 | uint64_t alloc_unit, |
3767 | bluefs_fnode_t* node, | |
3768 | size_t alloc_attempts, | |
3769 | bool permit_dev_fallback) | |
3770 | { | |
3771 | dout(10) << __func__ << " len 0x" << std::hex << len | |
3772 | << " au 0x" << alloc_unit | |
3773 | << std::dec << " from " << (int)id | |
3774 | << " cooldown " << cooldown_deadline | |
3775 | << dendl; | |
11fdf7f2 | 3776 | ceph_assert(id < alloc.size()); |
b32b8144 | 3777 | int64_t alloc_len = 0; |
a8e16298 | 3778 | PExtentVector extents; |
11fdf7f2 | 3779 | uint64_t hint = 0; |
f67539c2 | 3780 | int64_t need = len; |
39ae355f TL |
3781 | bool shared = is_shared_alloc(id); |
3782 | auto shared_unit = shared_alloc ? shared_alloc->alloc_unit : 0; | |
3783 | bool was_cooldown = false; | |
7c673cae | 3784 | if (alloc[id]) { |
39ae355f TL |
3785 | if (!alloc_unit) { |
3786 | alloc_unit = alloc_size[id]; | |
3787 | } | |
3788 | // do not attempt shared_allocator with bluefs alloc unit | |
3789 | // when cooling down, fallback to slow dev alloc unit. | |
3790 | if (shared && alloc_unit != shared_unit) { | |
3791 | if (duration_cast<seconds>(real_clock::now().time_since_epoch()).count() < | |
3792 | cooldown_deadline) { | |
3793 | logger->inc(l_bluefs_alloc_shared_size_fallbacks); | |
3794 | alloc_unit = shared_unit; | |
3795 | was_cooldown = true; | |
3796 | } else if (cooldown_deadline.fetch_and(0)) { | |
3797 | // we might get false cooldown_deadline reset at this point | |
3798 | // but that's mostly harmless. | |
3799 | dout(1) << __func__ << " shared allocation cooldown period elapsed" | |
3800 | << dendl; | |
3801 | } | |
3802 | } | |
3803 | need = round_up_to(len, alloc_unit); | |
94b18763 FG |
3804 | if (!node->extents.empty() && node->extents.back().bdev == id) { |
3805 | hint = node->extents.back().end(); | |
11fdf7f2 | 3806 | } |
39ae355f | 3807 | ++alloc_attempts; |
b32b8144 | 3808 | extents.reserve(4); // 4 should be (more than) enough for most allocations |
39ae355f | 3809 | alloc_len = alloc[id]->allocate(need, alloc_unit, hint, &extents); |
b32b8144 | 3810 | } |
f67539c2 TL |
3811 | if (alloc_len < 0 || alloc_len < need) { |
3812 | if (alloc[id]) { | |
3813 | if (alloc_len > 0) { | |
3814 | alloc[id]->release(extents); | |
3815 | } | |
39ae355f TL |
3816 | if (!was_cooldown && shared) { |
3817 | auto delay_s = cct->_conf->bluefs_failed_shared_alloc_cooldown; | |
3818 | cooldown_deadline = delay_s + | |
3819 | duration_cast<seconds>(real_clock::now().time_since_epoch()).count(); | |
3820 | dout(1) << __func__ << " shared allocation cooldown set for " | |
3821 | << delay_s << "s" | |
3822 | << dendl; | |
3823 | } | |
f67539c2 TL |
3824 | dout(1) << __func__ << " unable to allocate 0x" << std::hex << need |
3825 | << " on bdev " << (int)id | |
3826 | << ", allocator name " << alloc[id]->get_name() | |
3827 | << ", allocator type " << alloc[id]->get_type() | |
3828 | << ", capacity 0x" << alloc[id]->get_capacity() | |
3829 | << ", block size 0x" << alloc[id]->get_block_size() | |
39ae355f | 3830 | << ", alloc unit 0x" << alloc_unit |
f67539c2 TL |
3831 | << ", free 0x" << alloc[id]->get_free() |
3832 | << ", fragmentation " << alloc[id]->get_fragmentation() | |
3833 | << ", allocated 0x" << (alloc_len > 0 ? alloc_len : 0) | |
3834 | << std::dec << dendl; | |
20effc67 | 3835 | } else { |
39ae355f TL |
3836 | dout(20) << __func__ << " alloc-id not set on index="<< (int)id |
3837 | << " unable to allocate 0x" << std::hex << need | |
20effc67 | 3838 | << " on bdev " << (int)id << std::dec << dendl; |
b32b8144 | 3839 | } |
39ae355f TL |
3840 | if (alloc[id] && shared && alloc_unit != shared_unit) { |
3841 | alloc_unit = shared_unit; | |
3842 | dout(20) << __func__ << " fallback to bdev " | |
3843 | << (int)id | |
3844 | << " with alloc unit 0x" << std::hex << alloc_unit | |
3845 | << std::dec << dendl; | |
3846 | logger->inc(l_bluefs_alloc_shared_size_fallbacks); | |
3847 | return _allocate(id, | |
3848 | len, | |
3849 | alloc_unit, | |
3850 | node, | |
3851 | alloc_attempts, | |
3852 | permit_dev_fallback); | |
3853 | } else if (permit_dev_fallback && id != BDEV_SLOW && alloc[id + 1]) { | |
f67539c2 | 3854 | dout(20) << __func__ << " fallback to bdev " |
20effc67 | 3855 | << (int)id + 1 |
f67539c2 | 3856 | << dendl; |
39ae355f TL |
3857 | if (alloc_attempts > 0 && is_shared_alloc(id + 1)) { |
3858 | logger->inc(l_bluefs_alloc_shared_dev_fallbacks); | |
3859 | } | |
3860 | return _allocate(id + 1, | |
3861 | len, | |
3862 | 0, // back to default alloc unit | |
3863 | node, | |
3864 | alloc_attempts, | |
3865 | permit_dev_fallback); | |
11fdf7f2 | 3866 | } else { |
f67539c2 TL |
3867 | derr << __func__ << " allocation failed, needed 0x" << std::hex << need |
3868 | << dendl; | |
11fdf7f2 | 3869 | } |
f67539c2 | 3870 | return -ENOSPC; |
11fdf7f2 | 3871 | } else { |
f67539c2 TL |
3872 | uint64_t used = _get_used(id); |
3873 | if (max_bytes[id] < used) { | |
3874 | logger->set(max_bytes_pcounters[id], used); | |
3875 | max_bytes[id] = used; | |
3876 | } | |
39ae355f | 3877 | if (shared) { |
f67539c2 | 3878 | shared_alloc->bluefs_used += alloc_len; |
11fdf7f2 | 3879 | } |
7c673cae FG |
3880 | } |
3881 | ||
3882 | for (auto& p : extents) { | |
94b18763 | 3883 | node->append_extent(bluefs_extent_t(id, p.offset, p.length)); |
7c673cae FG |
3884 | } |
3885 | ||
3886 | return 0; | |
3887 | } | |
3888 | ||
20effc67 | 3889 | int BlueFS::preallocate(FileRef f, uint64_t off, uint64_t len)/*_LF*/ |
7c673cae | 3890 | { |
20effc67 TL |
3891 | std::lock_guard ll(log.lock); |
3892 | std::lock_guard fl(f->lock); | |
7c673cae FG |
3893 | dout(10) << __func__ << " file " << f->fnode << " 0x" |
3894 | << std::hex << off << "~" << len << std::dec << dendl; | |
3895 | if (f->deleted) { | |
3896 | dout(10) << __func__ << " deleted, no-op" << dendl; | |
3897 | return 0; | |
3898 | } | |
11fdf7f2 | 3899 | ceph_assert(f->fnode.ino > 1); |
7c673cae FG |
3900 | uint64_t allocated = f->fnode.get_allocated(); |
3901 | if (off + len > allocated) { | |
3902 | uint64_t want = off + len - allocated; | |
9f95a23c | 3903 | |
20effc67 | 3904 | vselector->sub_usage(f->vselector_hint, f->fnode); |
9f95a23c TL |
3905 | int r = _allocate(vselector->select_prefer_bdev(f->vselector_hint), |
3906 | want, | |
39ae355f | 3907 | 0, |
9f95a23c TL |
3908 | &f->fnode); |
3909 | vselector->add_usage(f->vselector_hint, f->fnode); | |
7c673cae FG |
3910 | if (r < 0) |
3911 | return r; | |
20effc67 TL |
3912 | |
3913 | log.t.op_file_update_inc(f->fnode); | |
7c673cae FG |
3914 | } |
3915 | return 0; | |
3916 | } | |
3917 | ||
20effc67 | 3918 | void BlueFS::sync_metadata(bool avoid_compact)/*_LNF_NF_LD_D*/ |
7c673cae | 3919 | { |
20effc67 TL |
3920 | bool can_skip_flush; |
3921 | { | |
3922 | std::lock_guard ll(log.lock); | |
3923 | std::lock_guard dl(dirty.lock); | |
3924 | can_skip_flush = log.t.empty() && dirty.files.empty(); | |
3925 | } | |
3926 | if (can_skip_flush) { | |
7c673cae | 3927 | dout(10) << __func__ << " - no pending log events" << dendl; |
11fdf7f2 | 3928 | } else { |
f67539c2 TL |
3929 | utime_t start; |
3930 | lgeneric_subdout(cct, bluefs, 10) << __func__; | |
3931 | start = ceph_clock_now(); | |
3932 | *_dout << dendl; | |
20effc67 TL |
3933 | _flush_bdev(); // FIXME? |
3934 | _flush_and_sync_log_LD(); | |
11fdf7f2 | 3935 | dout(10) << __func__ << " done in " << (ceph_clock_now() - start) << dendl; |
7c673cae | 3936 | } |
7c673cae | 3937 | |
f6b5b4d7 | 3938 | if (!avoid_compact) { |
20effc67 | 3939 | _maybe_compact_log_LNF_NF_LD_D(); |
f6b5b4d7 TL |
3940 | } |
3941 | } | |
3942 | ||
20effc67 | 3943 | void BlueFS::_maybe_compact_log_LNF_NF_LD_D() |
f6b5b4d7 TL |
3944 | { |
3945 | if (!cct->_conf->bluefs_replay_recovery_disable_compact && | |
20effc67 | 3946 | _should_start_compact_log_L_N()) { |
39ae355f | 3947 | auto t0 = mono_clock::now(); |
7c673cae | 3948 | if (cct->_conf->bluefs_compact_log_sync) { |
20effc67 | 3949 | _compact_log_sync_LNF_LD(); |
7c673cae | 3950 | } else { |
20effc67 | 3951 | _compact_log_async_LD_LNF_D(); |
7c673cae | 3952 | } |
39ae355f | 3953 | logger->tinc(l_bluefs_compaction_lat, mono_clock::now() - t0); |
7c673cae | 3954 | } |
7c673cae FG |
3955 | } |
3956 | ||
3957 | int BlueFS::open_for_write( | |
b3b6e05e TL |
3958 | std::string_view dirname, |
3959 | std::string_view filename, | |
7c673cae | 3960 | FileWriter **h, |
1e59de90 | 3961 | bool overwrite)/*_LND*/ |
7c673cae | 3962 | { |
20effc67 TL |
3963 | _maybe_check_vselector_LNF(); |
3964 | FileRef file; | |
3965 | bool create = false; | |
3966 | bool truncate = false; | |
3967 | mempool::bluefs::vector<bluefs_extent_t> pending_release_extents; | |
3968 | { | |
1e59de90 TL |
3969 | std::lock_guard ll(log.lock); |
3970 | std::lock_guard nl(nodes.lock); | |
7c673cae | 3971 | dout(10) << __func__ << " " << dirname << "/" << filename << dendl; |
20effc67 | 3972 | map<string,DirRef>::iterator p = nodes.dir_map.find(dirname); |
7c673cae | 3973 | DirRef dir; |
20effc67 | 3974 | if (p == nodes.dir_map.end()) { |
7c673cae FG |
3975 | // implicitly create the dir |
3976 | dout(20) << __func__ << " dir " << dirname | |
3977 | << " does not exist" << dendl; | |
3978 | return -ENOENT; | |
3979 | } else { | |
3980 | dir = p->second; | |
3981 | } | |
3982 | ||
7c673cae FG |
3983 | map<string,FileRef>::iterator q = dir->file_map.find(filename); |
3984 | if (q == dir->file_map.end()) { | |
3985 | if (overwrite) { | |
3986 | dout(20) << __func__ << " dir " << dirname << " (" << dir | |
3987 | << ") file " << filename | |
3988 | << " does not exist" << dendl; | |
3989 | return -ENOENT; | |
3990 | } | |
9f95a23c | 3991 | file = ceph::make_ref<File>(); |
7c673cae | 3992 | file->fnode.ino = ++ino_last; |
20effc67 | 3993 | nodes.file_map[ino_last] = file; |
b3b6e05e | 3994 | dir->file_map[string{filename}] = file; |
7c673cae FG |
3995 | ++file->refs; |
3996 | create = true; | |
20effc67 | 3997 | logger->set(l_bluefs_num_files, nodes.file_map.size()); |
7c673cae FG |
3998 | } else { |
3999 | // overwrite existing file? | |
4000 | file = q->second; | |
4001 | if (overwrite) { | |
4002 | dout(20) << __func__ << " dir " << dirname << " (" << dir | |
4003 | << ") file " << filename | |
4004 | << " already exists, overwrite in place" << dendl; | |
4005 | } else { | |
4006 | dout(20) << __func__ << " dir " << dirname << " (" << dir | |
4007 | << ") file " << filename | |
4008 | << " already exists, truncate + overwrite" << dendl; | |
9f95a23c | 4009 | vselector->sub_usage(file->vselector_hint, file->fnode); |
7c673cae | 4010 | file->fnode.size = 0; |
20effc67 | 4011 | pending_release_extents.swap(file->fnode.extents); |
f6b5b4d7 | 4012 | truncate = true; |
94b18763 FG |
4013 | |
4014 | file->fnode.clear_extents(); | |
7c673cae FG |
4015 | } |
4016 | } | |
11fdf7f2 | 4017 | ceph_assert(file->fnode.ino > 1); |
7c673cae FG |
4018 | |
4019 | file->fnode.mtime = ceph_clock_now(); | |
9f95a23c | 4020 | file->vselector_hint = vselector->get_hint_by_dir(dirname); |
f6b5b4d7 TL |
4021 | if (create || truncate) { |
4022 | vselector->add_usage(file->vselector_hint, file->fnode); // update file count | |
4023 | } | |
9f95a23c | 4024 | |
7c673cae | 4025 | dout(20) << __func__ << " mapping " << dirname << "/" << filename |
9f95a23c TL |
4026 | << " vsel_hint " << file->vselector_hint |
4027 | << dendl; | |
20effc67 | 4028 | |
1e59de90 TL |
4029 | log.t.op_file_update(file->fnode); |
4030 | if (create) | |
4031 | log.t.op_dir_link(dirname, filename, file->fnode.ino); | |
4032 | ||
4033 | std::lock_guard dl(dirty.lock); | |
4034 | for (auto& p : pending_release_extents) { | |
4035 | dirty.pending_release[p.bdev].insert(p.offset, p.length); | |
4036 | } | |
20effc67 | 4037 | } |
7c673cae FG |
4038 | *h = _create_writer(file); |
4039 | ||
4040 | if (boost::algorithm::ends_with(filename, ".log")) { | |
4041 | (*h)->writer_type = BlueFS::WRITER_WAL; | |
4042 | if (logger && !overwrite) { | |
4043 | logger->inc(l_bluefs_files_written_wal); | |
4044 | } | |
4045 | } else if (boost::algorithm::ends_with(filename, ".sst")) { | |
4046 | (*h)->writer_type = BlueFS::WRITER_SST; | |
4047 | if (logger) { | |
4048 | logger->inc(l_bluefs_files_written_sst); | |
4049 | } | |
4050 | } | |
4051 | ||
4052 | dout(10) << __func__ << " h " << *h << " on " << file->fnode << dendl; | |
4053 | return 0; | |
4054 | } | |
4055 | ||
4056 | BlueFS::FileWriter *BlueFS::_create_writer(FileRef f) | |
4057 | { | |
4058 | FileWriter *w = new FileWriter(f); | |
4059 | for (unsigned i = 0; i < MAX_BDEV; ++i) { | |
4060 | if (bdev[i]) { | |
4061 | w->iocv[i] = new IOContext(cct, NULL); | |
7c673cae FG |
4062 | } |
4063 | } | |
4064 | return w; | |
4065 | } | |
4066 | ||
20effc67 | 4067 | void BlueFS::_drain_writer(FileWriter *h) |
7c673cae FG |
4068 | { |
4069 | dout(10) << __func__ << " " << h << " type " << h->writer_type << dendl; | |
f67539c2 | 4070 | //h->buffer.reassign_to_mempool(mempool::mempool_bluefs_file_writer); |
7c673cae FG |
4071 | for (unsigned i=0; i<MAX_BDEV; ++i) { |
4072 | if (bdev[i]) { | |
11fdf7f2 TL |
4073 | if (h->iocv[i]) { |
4074 | h->iocv[i]->aio_wait(); | |
20effc67 | 4075 | delete h->iocv[i]; |
11fdf7f2 | 4076 | } |
7c673cae FG |
4077 | } |
4078 | } | |
522d829b TL |
4079 | // sanity |
4080 | if (h->file->fnode.size >= (1ull << 30)) { | |
4081 | dout(10) << __func__ << " file is unexpectedly large:" << h->file->fnode << dendl; | |
4082 | } | |
20effc67 TL |
4083 | } |
4084 | ||
4085 | void BlueFS::_close_writer(FileWriter *h) | |
4086 | { | |
4087 | _drain_writer(h); | |
4088 | delete h; | |
4089 | } | |
4090 | void BlueFS::close_writer(FileWriter *h) | |
4091 | { | |
4092 | { | |
4093 | std::lock_guard l(h->lock); | |
4094 | _drain_writer(h); | |
4095 | } | |
7c673cae FG |
4096 | delete h; |
4097 | } | |
4098 | ||
522d829b TL |
4099 | uint64_t BlueFS::debug_get_dirty_seq(FileWriter *h) |
4100 | { | |
20effc67 | 4101 | std::lock_guard l(h->lock); |
522d829b TL |
4102 | return h->file->dirty_seq; |
4103 | } | |
4104 | ||
4105 | bool BlueFS::debug_get_is_dev_dirty(FileWriter *h, uint8_t dev) | |
4106 | { | |
20effc67 | 4107 | std::lock_guard l(h->lock); |
522d829b TL |
4108 | return h->dirty_devs[dev]; |
4109 | } | |
4110 | ||
7c673cae | 4111 | int BlueFS::open_for_read( |
b3b6e05e TL |
4112 | std::string_view dirname, |
4113 | std::string_view filename, | |
7c673cae | 4114 | FileReader **h, |
20effc67 | 4115 | bool random)/*_N*/ |
7c673cae | 4116 | { |
20effc67 TL |
4117 | _maybe_check_vselector_LNF(); |
4118 | std::lock_guard nl(nodes.lock); | |
7c673cae FG |
4119 | dout(10) << __func__ << " " << dirname << "/" << filename |
4120 | << (random ? " (random)":" (sequential)") << dendl; | |
20effc67 TL |
4121 | map<string,DirRef>::iterator p = nodes.dir_map.find(dirname); |
4122 | if (p == nodes.dir_map.end()) { | |
7c673cae FG |
4123 | dout(20) << __func__ << " dir " << dirname << " not found" << dendl; |
4124 | return -ENOENT; | |
4125 | } | |
4126 | DirRef dir = p->second; | |
4127 | ||
4128 | map<string,FileRef>::iterator q = dir->file_map.find(filename); | |
4129 | if (q == dir->file_map.end()) { | |
4130 | dout(20) << __func__ << " dir " << dirname << " (" << dir | |
4131 | << ") file " << filename | |
4132 | << " not found" << dendl; | |
4133 | return -ENOENT; | |
4134 | } | |
4135 | File *file = q->second.get(); | |
4136 | ||
4137 | *h = new FileReader(file, random ? 4096 : cct->_conf->bluefs_max_prefetch, | |
4138 | random, false); | |
4139 | dout(10) << __func__ << " h " << *h << " on " << file->fnode << dendl; | |
4140 | return 0; | |
4141 | } | |
4142 | ||
4143 | int BlueFS::rename( | |
b3b6e05e | 4144 | std::string_view old_dirname, std::string_view old_filename, |
20effc67 | 4145 | std::string_view new_dirname, std::string_view new_filename)/*_LND*/ |
7c673cae | 4146 | { |
20effc67 TL |
4147 | std::lock_guard ll(log.lock); |
4148 | std::lock_guard nl(nodes.lock); | |
7c673cae FG |
4149 | dout(10) << __func__ << " " << old_dirname << "/" << old_filename |
4150 | << " -> " << new_dirname << "/" << new_filename << dendl; | |
20effc67 TL |
4151 | map<string,DirRef>::iterator p = nodes.dir_map.find(old_dirname); |
4152 | if (p == nodes.dir_map.end()) { | |
7c673cae FG |
4153 | dout(20) << __func__ << " dir " << old_dirname << " not found" << dendl; |
4154 | return -ENOENT; | |
4155 | } | |
4156 | DirRef old_dir = p->second; | |
4157 | map<string,FileRef>::iterator q = old_dir->file_map.find(old_filename); | |
4158 | if (q == old_dir->file_map.end()) { | |
4159 | dout(20) << __func__ << " dir " << old_dirname << " (" << old_dir | |
4160 | << ") file " << old_filename | |
4161 | << " not found" << dendl; | |
4162 | return -ENOENT; | |
4163 | } | |
4164 | FileRef file = q->second; | |
4165 | ||
20effc67 TL |
4166 | p = nodes.dir_map.find(new_dirname); |
4167 | if (p == nodes.dir_map.end()) { | |
7c673cae FG |
4168 | dout(20) << __func__ << " dir " << new_dirname << " not found" << dendl; |
4169 | return -ENOENT; | |
4170 | } | |
4171 | DirRef new_dir = p->second; | |
4172 | q = new_dir->file_map.find(new_filename); | |
4173 | if (q != new_dir->file_map.end()) { | |
4174 | dout(20) << __func__ << " dir " << new_dirname << " (" << old_dir | |
4175 | << ") file " << new_filename | |
4176 | << " already exists, unlinking" << dendl; | |
11fdf7f2 | 4177 | ceph_assert(q->second != file); |
20effc67 TL |
4178 | log.t.op_dir_unlink(new_dirname, new_filename); |
4179 | _drop_link_D(q->second); | |
7c673cae FG |
4180 | } |
4181 | ||
4182 | dout(10) << __func__ << " " << new_dirname << "/" << new_filename << " " | |
4183 | << " " << file->fnode << dendl; | |
4184 | ||
b3b6e05e TL |
4185 | new_dir->file_map[string{new_filename}] = file; |
4186 | old_dir->file_map.erase(string{old_filename}); | |
7c673cae | 4187 | |
20effc67 TL |
4188 | log.t.op_dir_link(new_dirname, new_filename, file->fnode.ino); |
4189 | log.t.op_dir_unlink(old_dirname, old_filename); | |
7c673cae FG |
4190 | return 0; |
4191 | } | |
4192 | ||
20effc67 | 4193 | int BlueFS::mkdir(std::string_view dirname)/*_LN*/ |
7c673cae | 4194 | { |
20effc67 TL |
4195 | std::lock_guard ll(log.lock); |
4196 | std::lock_guard nl(nodes.lock); | |
7c673cae | 4197 | dout(10) << __func__ << " " << dirname << dendl; |
20effc67 TL |
4198 | map<string,DirRef>::iterator p = nodes.dir_map.find(dirname); |
4199 | if (p != nodes.dir_map.end()) { | |
7c673cae FG |
4200 | dout(20) << __func__ << " dir " << dirname << " exists" << dendl; |
4201 | return -EEXIST; | |
4202 | } | |
20effc67 TL |
4203 | nodes.dir_map[string{dirname}] = ceph::make_ref<Dir>(); |
4204 | log.t.op_dir_create(dirname); | |
7c673cae FG |
4205 | return 0; |
4206 | } | |
4207 | ||
20effc67 | 4208 | int BlueFS::rmdir(std::string_view dirname)/*_LN*/ |
7c673cae | 4209 | { |
20effc67 TL |
4210 | std::lock_guard ll(log.lock); |
4211 | std::lock_guard nl(nodes.lock); | |
7c673cae | 4212 | dout(10) << __func__ << " " << dirname << dendl; |
20effc67 TL |
4213 | auto p = nodes.dir_map.find(dirname); |
4214 | if (p == nodes.dir_map.end()) { | |
7c673cae FG |
4215 | dout(20) << __func__ << " dir " << dirname << " does not exist" << dendl; |
4216 | return -ENOENT; | |
4217 | } | |
4218 | DirRef dir = p->second; | |
4219 | if (!dir->file_map.empty()) { | |
4220 | dout(20) << __func__ << " dir " << dirname << " not empty" << dendl; | |
4221 | return -ENOTEMPTY; | |
4222 | } | |
20effc67 TL |
4223 | nodes.dir_map.erase(string{dirname}); |
4224 | log.t.op_dir_remove(dirname); | |
7c673cae FG |
4225 | return 0; |
4226 | } | |
4227 | ||
20effc67 | 4228 | bool BlueFS::dir_exists(std::string_view dirname)/*_N*/ |
7c673cae | 4229 | { |
20effc67 TL |
4230 | std::lock_guard nl(nodes.lock); |
4231 | map<string,DirRef>::iterator p = nodes.dir_map.find(dirname); | |
4232 | bool exists = p != nodes.dir_map.end(); | |
7c673cae FG |
4233 | dout(10) << __func__ << " " << dirname << " = " << (int)exists << dendl; |
4234 | return exists; | |
4235 | } | |
4236 | ||
b3b6e05e | 4237 | int BlueFS::stat(std::string_view dirname, std::string_view filename, |
20effc67 | 4238 | uint64_t *size, utime_t *mtime)/*_N*/ |
7c673cae | 4239 | { |
20effc67 | 4240 | std::lock_guard nl(nodes.lock); |
7c673cae | 4241 | dout(10) << __func__ << " " << dirname << "/" << filename << dendl; |
20effc67 TL |
4242 | map<string,DirRef>::iterator p = nodes.dir_map.find(dirname); |
4243 | if (p == nodes.dir_map.end()) { | |
7c673cae FG |
4244 | dout(20) << __func__ << " dir " << dirname << " not found" << dendl; |
4245 | return -ENOENT; | |
4246 | } | |
4247 | DirRef dir = p->second; | |
4248 | map<string,FileRef>::iterator q = dir->file_map.find(filename); | |
4249 | if (q == dir->file_map.end()) { | |
4250 | dout(20) << __func__ << " dir " << dirname << " (" << dir | |
4251 | << ") file " << filename | |
4252 | << " not found" << dendl; | |
4253 | return -ENOENT; | |
4254 | } | |
4255 | File *file = q->second.get(); | |
4256 | dout(10) << __func__ << " " << dirname << "/" << filename | |
4257 | << " " << file->fnode << dendl; | |
4258 | if (size) | |
4259 | *size = file->fnode.size; | |
4260 | if (mtime) | |
4261 | *mtime = file->fnode.mtime; | |
4262 | return 0; | |
4263 | } | |
4264 | ||
b3b6e05e | 4265 | int BlueFS::lock_file(std::string_view dirname, std::string_view filename, |
20effc67 | 4266 | FileLock **plock)/*_LN*/ |
7c673cae | 4267 | { |
20effc67 TL |
4268 | std::lock_guard ll(log.lock); |
4269 | std::lock_guard nl(nodes.lock); | |
7c673cae | 4270 | dout(10) << __func__ << " " << dirname << "/" << filename << dendl; |
20effc67 TL |
4271 | map<string,DirRef>::iterator p = nodes.dir_map.find(dirname); |
4272 | if (p == nodes.dir_map.end()) { | |
7c673cae FG |
4273 | dout(20) << __func__ << " dir " << dirname << " not found" << dendl; |
4274 | return -ENOENT; | |
4275 | } | |
4276 | DirRef dir = p->second; | |
b3b6e05e | 4277 | auto q = dir->file_map.find(filename); |
9f95a23c | 4278 | FileRef file; |
7c673cae FG |
4279 | if (q == dir->file_map.end()) { |
4280 | dout(20) << __func__ << " dir " << dirname << " (" << dir | |
4281 | << ") file " << filename | |
4282 | << " not found, creating" << dendl; | |
9f95a23c | 4283 | file = ceph::make_ref<File>(); |
7c673cae FG |
4284 | file->fnode.ino = ++ino_last; |
4285 | file->fnode.mtime = ceph_clock_now(); | |
20effc67 | 4286 | nodes.file_map[ino_last] = file; |
b3b6e05e | 4287 | dir->file_map[string{filename}] = file; |
20effc67 | 4288 | logger->set(l_bluefs_num_files, nodes.file_map.size()); |
7c673cae | 4289 | ++file->refs; |
20effc67 TL |
4290 | log.t.op_file_update(file->fnode); |
4291 | log.t.op_dir_link(dirname, filename, file->fnode.ino); | |
7c673cae | 4292 | } else { |
9f95a23c | 4293 | file = q->second; |
7c673cae FG |
4294 | if (file->locked) { |
4295 | dout(10) << __func__ << " already locked" << dendl; | |
11fdf7f2 | 4296 | return -ENOLCK; |
7c673cae FG |
4297 | } |
4298 | } | |
4299 | file->locked = true; | |
4300 | *plock = new FileLock(file); | |
4301 | dout(10) << __func__ << " locked " << file->fnode | |
4302 | << " with " << *plock << dendl; | |
4303 | return 0; | |
4304 | } | |
4305 | ||
20effc67 | 4306 | int BlueFS::unlock_file(FileLock *fl)/*_N*/ |
7c673cae | 4307 | { |
20effc67 | 4308 | std::lock_guard nl(nodes.lock); |
7c673cae | 4309 | dout(10) << __func__ << " " << fl << " on " << fl->file->fnode << dendl; |
11fdf7f2 | 4310 | ceph_assert(fl->file->locked); |
7c673cae FG |
4311 | fl->file->locked = false; |
4312 | delete fl; | |
4313 | return 0; | |
4314 | } | |
4315 | ||
20effc67 | 4316 | int BlueFS::readdir(std::string_view dirname, vector<string> *ls)/*_N*/ |
7c673cae | 4317 | { |
b3b6e05e TL |
4318 | // dirname may contain a trailing / |
4319 | if (!dirname.empty() && dirname.back() == '/') { | |
4320 | dirname.remove_suffix(1); | |
4321 | } | |
20effc67 | 4322 | std::lock_guard nl(nodes.lock); |
7c673cae FG |
4323 | dout(10) << __func__ << " " << dirname << dendl; |
4324 | if (dirname.empty()) { | |
4325 | // list dirs | |
20effc67 TL |
4326 | ls->reserve(nodes.dir_map.size() + 2); |
4327 | for (auto& q : nodes.dir_map) { | |
7c673cae FG |
4328 | ls->push_back(q.first); |
4329 | } | |
4330 | } else { | |
4331 | // list files in dir | |
20effc67 TL |
4332 | map<string,DirRef>::iterator p = nodes.dir_map.find(dirname); |
4333 | if (p == nodes.dir_map.end()) { | |
7c673cae FG |
4334 | dout(20) << __func__ << " dir " << dirname << " not found" << dendl; |
4335 | return -ENOENT; | |
4336 | } | |
4337 | DirRef dir = p->second; | |
4338 | ls->reserve(dir->file_map.size() + 2); | |
4339 | for (auto& q : dir->file_map) { | |
4340 | ls->push_back(q.first); | |
4341 | } | |
4342 | } | |
4343 | ls->push_back("."); | |
4344 | ls->push_back(".."); | |
4345 | return 0; | |
4346 | } | |
4347 | ||
20effc67 | 4348 | int BlueFS::unlink(std::string_view dirname, std::string_view filename)/*_LND*/ |
7c673cae | 4349 | { |
20effc67 TL |
4350 | std::lock_guard ll(log.lock); |
4351 | std::lock_guard nl(nodes.lock); | |
7c673cae | 4352 | dout(10) << __func__ << " " << dirname << "/" << filename << dendl; |
20effc67 TL |
4353 | map<string,DirRef>::iterator p = nodes.dir_map.find(dirname); |
4354 | if (p == nodes.dir_map.end()) { | |
7c673cae FG |
4355 | dout(20) << __func__ << " dir " << dirname << " not found" << dendl; |
4356 | return -ENOENT; | |
4357 | } | |
4358 | DirRef dir = p->second; | |
4359 | map<string,FileRef>::iterator q = dir->file_map.find(filename); | |
4360 | if (q == dir->file_map.end()) { | |
4361 | dout(20) << __func__ << " file " << dirname << "/" << filename | |
4362 | << " not found" << dendl; | |
4363 | return -ENOENT; | |
4364 | } | |
4365 | FileRef file = q->second; | |
4366 | if (file->locked) { | |
4367 | dout(20) << __func__ << " file " << dirname << "/" << filename | |
4368 | << " is locked" << dendl; | |
4369 | return -EBUSY; | |
4370 | } | |
b3b6e05e | 4371 | dir->file_map.erase(string{filename}); |
20effc67 TL |
4372 | log.t.op_dir_unlink(dirname, filename); |
4373 | _drop_link_D(file); | |
7c673cae FG |
4374 | return 0; |
4375 | } | |
d2e6a577 FG |
4376 | |
4377 | bool BlueFS::wal_is_rotational() | |
4378 | { | |
94b18763 FG |
4379 | if (bdev[BDEV_WAL]) { |
4380 | return bdev[BDEV_WAL]->is_rotational(); | |
4381 | } else if (bdev[BDEV_DB]) { | |
4382 | return bdev[BDEV_DB]->is_rotational(); | |
4383 | } | |
4384 | return bdev[BDEV_SLOW]->is_rotational(); | |
d2e6a577 | 4385 | } |
9f95a23c | 4386 | |
1d09f67e TL |
4387 | bool BlueFS::db_is_rotational() |
4388 | { | |
4389 | if (bdev[BDEV_DB]) { | |
4390 | return bdev[BDEV_DB]->is_rotational(); | |
4391 | } | |
4392 | return bdev[BDEV_SLOW]->is_rotational(); | |
4393 | } | |
4394 | ||
f6b5b4d7 TL |
/*
  Algorithm.
  do_replay_recovery_read is used when the bluefs log ends abruptly, but it seems that more data should be there.
  The idea is to search the disk for the definition of the extents that would have followed in the bluefs log,
  and to check whether using them produces a healthy bluefs transaction.
  We encode the already known bluefs log extents and search the disk for these bytes.
  When we find them, we decode the bytes that follow as an extent.
  We read that whole extent and then check whether, merged with the existing log part, it gives a proper bluefs transaction.
*/
20effc67 | 4404 | int BlueFS::_do_replay_recovery_read(FileReader *log_reader, |
f6b5b4d7 TL |
4405 | size_t replay_pos, |
4406 | size_t read_offset, | |
4407 | size_t read_len, | |
4408 | bufferlist* bl) { | |
4409 | dout(1) << __func__ << " replay_pos=0x" << std::hex << replay_pos << | |
4410 | " needs 0x" << read_offset << "~" << read_len << std::dec << dendl; | |
4411 | ||
4412 | bluefs_fnode_t& log_fnode = log_reader->file->fnode; | |
4413 | bufferlist bin_extents; | |
f67539c2 | 4414 | ::encode(log_fnode.extents, bin_extents); |
f6b5b4d7 TL |
4415 | dout(2) << __func__ << " log file encoded extents length = " << bin_extents.length() << dendl; |
4416 | ||
4417 | // cannot process if too small to effectively search | |
4418 | ceph_assert(bin_extents.length() >= 32); | |
4419 | bufferlist last_32; | |
4420 | last_32.substr_of(bin_extents, bin_extents.length() - 32, 32); | |
4421 | ||
4422 | //read fixed part from replay_pos to end of bluefs_log extents | |
4423 | bufferlist fixed; | |
4424 | uint64_t e_off = 0; | |
4425 | auto e = log_fnode.seek(replay_pos, &e_off); | |
4426 | ceph_assert(e != log_fnode.extents.end()); | |
20effc67 TL |
4427 | int r = _bdev_read(e->bdev, e->offset + e_off, e->length - e_off, &fixed, ioc[e->bdev], |
4428 | cct->_conf->bluefs_buffered_io); | |
f6b5b4d7 TL |
4429 | ceph_assert(r == 0); |
4430 | //capture dev of last good extent | |
4431 | uint8_t last_e_dev = e->bdev; | |
4432 | uint64_t last_e_off = e->offset; | |
4433 | ++e; | |
4434 | while (e != log_fnode.extents.end()) { | |
20effc67 TL |
4435 | r = _bdev_read(e->bdev, e->offset, e->length, &fixed, ioc[e->bdev], |
4436 | cct->_conf->bluefs_buffered_io); | |
f6b5b4d7 TL |
4437 | ceph_assert(r == 0); |
4438 | last_e_dev = e->bdev; | |
4439 | ++e; | |
4440 | } | |
4441 | ceph_assert(replay_pos + fixed.length() == read_offset); | |
4442 | ||
4443 | dout(2) << __func__ << " valid data in log = " << fixed.length() << dendl; | |
4444 | ||
4445 | struct compare { | |
4446 | bool operator()(const bluefs_extent_t& a, const bluefs_extent_t& b) const { | |
4447 | if (a.bdev < b.bdev) return true; | |
4448 | if (a.offset < b.offset) return true; | |
4449 | return a.length < b.length; | |
4450 | } | |
4451 | }; | |
4452 | std::set<bluefs_extent_t, compare> extents_rejected; | |
4453 | for (int dcnt = 0; dcnt < 3; dcnt++) { | |
4454 | uint8_t dev = (last_e_dev + dcnt) % MAX_BDEV; | |
4455 | if (bdev[dev] == nullptr) continue; | |
4456 | dout(2) << __func__ << " processing " << get_device_name(dev) << dendl; | |
4457 | interval_set<uint64_t> disk_regions; | |
4458 | disk_regions.insert(0, bdev[dev]->get_size()); | |
20effc67 | 4459 | for (auto f : nodes.file_map) { |
f6b5b4d7 TL |
4460 | auto& e = f.second->fnode.extents; |
4461 | for (auto& p : e) { | |
4462 | if (p.bdev == dev) { | |
4463 | disk_regions.erase(p.offset, p.length); | |
4464 | } | |
4465 | } | |
4466 | } | |
4467 | size_t disk_regions_count = disk_regions.num_intervals(); | |
4468 | dout(5) << __func__ << " " << disk_regions_count << " regions to scan on " << get_device_name(dev) << dendl; | |
4469 | ||
4470 | auto reg = disk_regions.lower_bound(last_e_off); | |
4471 | //for all except first, start from beginning | |
4472 | last_e_off = 0; | |
4473 | if (reg == disk_regions.end()) { | |
4474 | reg = disk_regions.begin(); | |
4475 | } | |
4476 | const uint64_t chunk_size = 4 * 1024 * 1024; | |
4477 | const uint64_t page_size = 4096; | |
4478 | const uint64_t max_extent_size = 16; | |
4479 | uint64_t overlay_size = last_32.length() + max_extent_size; | |
4480 | for (size_t i = 0; i < disk_regions_count; reg++, i++) { | |
4481 | if (reg == disk_regions.end()) { | |
4482 | reg = disk_regions.begin(); | |
4483 | } | |
4484 | uint64_t pos = reg.get_start(); | |
4485 | uint64_t len = reg.get_len(); | |
4486 | ||
4487 | std::unique_ptr<char[]> raw_data_p{new char[page_size + chunk_size]}; | |
4488 | char* raw_data = raw_data_p.get(); | |
4489 | memset(raw_data, 0, page_size); | |
4490 | ||
4491 | while (len > last_32.length()) { | |
4492 | uint64_t chunk_len = len > chunk_size ? chunk_size : len; | |
4493 | dout(5) << __func__ << " read " | |
20effc67 TL |
4494 | << get_device_name(dev) << ":0x" << std::hex << pos << "+" << chunk_len |
4495 | << std::dec << dendl; | |
4496 | r = _bdev_read_random(dev, pos, chunk_len, | |
4497 | raw_data + page_size, cct->_conf->bluefs_buffered_io); | |
f6b5b4d7 TL |
4498 | ceph_assert(r == 0); |
4499 | ||
4500 | //search for fixed_last_32 | |
4501 | char* chunk_b = raw_data + page_size; | |
4502 | char* chunk_e = chunk_b + chunk_len; | |
4503 | ||
4504 | char* search_b = chunk_b - overlay_size; | |
4505 | char* search_e = chunk_e; | |
4506 | ||
4507 | for (char* sp = search_b; ; sp += last_32.length()) { | |
4508 | sp = (char*)memmem(sp, search_e - sp, last_32.c_str(), last_32.length()); | |
4509 | if (sp == nullptr) { | |
4510 | break; | |
4511 | } | |
4512 | ||
4513 | char* n = sp + last_32.length(); | |
4514 | dout(5) << __func__ << " checking location 0x" << std::hex << pos + (n - chunk_b) << std::dec << dendl; | |
4515 | bufferlist test; | |
4516 | test.append(n, std::min<size_t>(max_extent_size, chunk_e - n)); | |
4517 | bluefs_extent_t ne; | |
4518 | try { | |
4519 | bufferlist::const_iterator p = test.begin(); | |
f67539c2 | 4520 | ::decode(ne, p); |
f6b5b4d7 TL |
4521 | } catch (buffer::error& e) { |
4522 | continue; | |
4523 | } | |
4524 | if (extents_rejected.count(ne) != 0) { | |
4525 | dout(5) << __func__ << " extent " << ne << " already refected" <<dendl; | |
4526 | continue; | |
4527 | } | |
4528 | //insert as rejected already. if we succeed, it wouldn't make difference. | |
4529 | extents_rejected.insert(ne); | |
4530 | ||
4531 | if (ne.bdev >= MAX_BDEV || | |
4532 | bdev[ne.bdev] == nullptr || | |
4533 | ne.length > 16 * 1024 * 1024 || | |
4534 | (ne.length & 4095) != 0 || | |
4535 | ne.offset + ne.length > bdev[ne.bdev]->get_size() || | |
4536 | (ne.offset & 4095) != 0) { | |
4537 | dout(5) << __func__ << " refusing extent " << ne << dendl; | |
4538 | continue; | |
4539 | } | |
4540 | dout(5) << __func__ << " checking extent " << ne << dendl; | |
4541 | ||
4542 | //read candidate extent - whole | |
4543 | bufferlist candidate; | |
4544 | candidate.append(fixed); | |
20effc67 TL |
4545 | r = _bdev_read(ne.bdev, ne.offset, ne.length, &candidate, ioc[ne.bdev], |
4546 | cct->_conf->bluefs_buffered_io); | |
f6b5b4d7 TL |
4547 | ceph_assert(r == 0); |
4548 | ||
4549 | //check if transaction & crc is ok | |
4550 | bluefs_transaction_t t; | |
4551 | try { | |
f67539c2 TL |
4552 | bufferlist::const_iterator p = candidate.begin(); |
4553 | ::decode(t, p); | |
f6b5b4d7 TL |
4554 | } |
4555 | catch (buffer::error& e) { | |
4556 | dout(5) << __func__ << " failed match" << dendl; | |
4557 | continue; | |
4558 | } | |
4559 | ||
4560 | //success, it seems a probable candidate | |
4561 | uint64_t l = std::min<uint64_t>(ne.length, read_len); | |
4562 | //trim to required size | |
4563 | bufferlist requested_read; | |
4564 | requested_read.substr_of(candidate, fixed.length(), l); | |
4565 | bl->append(requested_read); | |
4566 | dout(5) << __func__ << " successful extension of log " << l << "/" << read_len << dendl; | |
4567 | log_fnode.append_extent(ne); | |
4568 | log_fnode.recalc_allocated(); | |
4569 | log_reader->buf.pos += l; | |
4570 | return l; | |
4571 | } | |
4572 | //save overlay for next search | |
4573 | memcpy(search_b, chunk_e - overlay_size, overlay_size); | |
4574 | pos += chunk_len; | |
4575 | len -= chunk_len; | |
4576 | } | |
4577 | } | |
4578 | } | |
4579 | return 0; | |
4580 | } | |
4581 | ||
20effc67 TL |
4582 | void BlueFS::_check_vselector_LNF() { |
4583 | BlueFSVolumeSelector* vs = vselector->clone_empty(); | |
4584 | if (!vs) { | |
4585 | return; | |
4586 | } | |
4587 | std::lock_guard ll(log.lock); | |
4588 | std::lock_guard nl(nodes.lock); | |
4589 | // Checking vselector is under log, nodes and file(s) locks, | |
4590 | // so any modification of vselector must be under at least one of those locks. | |
4591 | for (auto& f : nodes.file_map) { | |
4592 | f.second->lock.lock(); | |
4593 | vs->add_usage(f.second->vselector_hint, f.second->fnode); | |
4594 | } | |
4595 | bool res = vselector->compare(vs); | |
4596 | if (!res) { | |
4597 | dout(0) << "Current:"; | |
4598 | vselector->dump(*_dout); | |
4599 | *_dout << dendl; | |
4600 | dout(0) << "Expected:"; | |
4601 | vs->dump(*_dout); | |
4602 | *_dout << dendl; | |
4603 | } | |
4604 | ceph_assert(res); | |
4605 | for (auto& f : nodes.file_map) { | |
4606 | f.second->lock.unlock(); | |
4607 | } | |
4608 | delete vs; | |
4609 | } | |
4610 | ||
f67539c2 | 4611 | size_t BlueFS::probe_alloc_avail(int dev, uint64_t alloc_size) |
9f95a23c | 4612 | { |
f67539c2 TL |
4613 | size_t total = 0; |
4614 | auto iterated_allocation = [&](size_t off, size_t len) { | |
4615 | //only count in size that is alloc_size aligned | |
4616 | size_t dist_to_alignment; | |
4617 | size_t offset_in_block = off & (alloc_size - 1); | |
4618 | if (offset_in_block == 0) | |
4619 | dist_to_alignment = 0; | |
4620 | else | |
4621 | dist_to_alignment = alloc_size - offset_in_block; | |
4622 | if (dist_to_alignment >= len) | |
4623 | return; | |
4624 | len -= dist_to_alignment; | |
4625 | total += p2align(len, alloc_size); | |
4626 | }; | |
4627 | if (alloc[dev]) { | |
1e59de90 | 4628 | alloc[dev]->foreach(iterated_allocation); |
9f95a23c | 4629 | } |
f67539c2 | 4630 | return total; |
9f95a23c | 4631 | } |
9f95a23c TL |
4632 | // =============================================== |
4633 | // OriginalVolumeSelector | |
4634 | ||
f6b5b4d7 TL |
4635 | void* OriginalVolumeSelector::get_hint_for_log() const { |
4636 | return reinterpret_cast<void*>(BlueFS::BDEV_WAL); | |
9f95a23c | 4637 | } |
b3b6e05e | 4638 | void* OriginalVolumeSelector::get_hint_by_dir(std::string_view dirname) const { |
9f95a23c TL |
4639 | uint8_t res = BlueFS::BDEV_DB; |
4640 | if (dirname.length() > 5) { | |
4641 | // the "db.slow" and "db.wal" directory names are hard-coded at | |
4642 | // match up with bluestore. the slow device is always the second | |
4643 | // one (when a dedicated block.db device is present and used at | |
4644 | // bdev 0). the wal device is always last. | |
a4b75251 | 4645 | if (boost::algorithm::ends_with(dirname, ".slow") && slow_total) { |
9f95a23c | 4646 | res = BlueFS::BDEV_SLOW; |
a4b75251 | 4647 | } else if (boost::algorithm::ends_with(dirname, ".wal") && wal_total) { |
9f95a23c TL |
4648 | res = BlueFS::BDEV_WAL; |
4649 | } | |
4650 | } | |
4651 | return reinterpret_cast<void*>(res); | |
4652 | } | |
4653 | ||
4654 | uint8_t OriginalVolumeSelector::select_prefer_bdev(void* hint) | |
4655 | { | |
4656 | return (uint8_t)(reinterpret_cast<uint64_t>(hint)); | |
4657 | } | |
4658 | ||
4659 | void OriginalVolumeSelector::get_paths(const std::string& base, paths& res) const | |
4660 | { | |
4661 | res.emplace_back(base, db_total); | |
522d829b TL |
4662 | res.emplace_back(base + ".slow", |
4663 | slow_total ? slow_total : db_total); // use fake non-zero value if needed to | |
4664 | // avoid RocksDB complains | |
9f95a23c TL |
4665 | } |
4666 | ||
4667 | #undef dout_prefix | |
4668 | #define dout_prefix *_dout << "OriginalVolumeSelector: " | |
4669 | ||
4670 | void OriginalVolumeSelector::dump(ostream& sout) { | |
4671 | sout<< "wal_total:" << wal_total | |
4672 | << ", db_total:" << db_total | |
4673 | << ", slow_total:" << slow_total | |
4674 | << std::endl; | |
4675 | } | |
f67539c2 TL |
4676 | |
4677 | // =============================================== | |
4678 | // FitToFastVolumeSelector | |
4679 | ||
4680 | void FitToFastVolumeSelector::get_paths(const std::string& base, paths& res) const { | |
4681 | res.emplace_back(base, 1); // size of the last db_path has no effect | |
4682 | } |