]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | ||
4 | #include "boost/algorithm/string.hpp" | |
5 | #include "BlueFS.h" | |
6 | ||
7 | #include "common/debug.h" | |
8 | #include "common/errno.h" | |
9 | #include "common/perf_counters.h" | |
10 | #include "BlockDevice.h" | |
11 | #include "Allocator.h" | |
11fdf7f2 | 12 | #include "include/ceph_assert.h" |
7c673cae FG |
13 | |
14 | #define dout_context cct | |
15 | #define dout_subsys ceph_subsys_bluefs | |
16 | #undef dout_prefix | |
17 | #define dout_prefix *_dout << "bluefs " | |
18 | ||
19 | MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::File, bluefs_file, bluefs); | |
20 | MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::Dir, bluefs_dir, bluefs); | |
21 | MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileWriter, bluefs_file_writer, bluefs); | |
22 | MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileReaderBuffer, | |
23 | bluefs_file_reader_buffer, bluefs); | |
24 | MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileReader, bluefs_file_reader, bluefs); | |
25 | MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileLock, bluefs_file_lock, bluefs); | |
26 | ||
11fdf7f2 TL |
27 | static void wal_discard_cb(void *priv, void* priv2) { |
28 | BlueFS *bluefs = static_cast<BlueFS*>(priv); | |
29 | interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2); | |
30 | bluefs->handle_discard(BlueFS::BDEV_WAL, *tmp); | |
31 | } | |
32 | ||
33 | static void db_discard_cb(void *priv, void* priv2) { | |
34 | BlueFS *bluefs = static_cast<BlueFS*>(priv); | |
35 | interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2); | |
36 | bluefs->handle_discard(BlueFS::BDEV_DB, *tmp); | |
37 | } | |
38 | ||
39 | static void slow_discard_cb(void *priv, void* priv2) { | |
40 | BlueFS *bluefs = static_cast<BlueFS*>(priv); | |
41 | interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2); | |
42 | bluefs->handle_discard(BlueFS::BDEV_SLOW, *tmp); | |
43 | } | |
7c673cae FG |
44 | |
45 | BlueFS::BlueFS(CephContext* cct) | |
46 | : cct(cct), | |
47 | bdev(MAX_BDEV), | |
48 | ioc(MAX_BDEV), | |
11fdf7f2 | 49 | block_all(MAX_BDEV) |
7c673cae | 50 | { |
11fdf7f2 TL |
51 | discard_cb[BDEV_WAL] = wal_discard_cb; |
52 | discard_cb[BDEV_DB] = db_discard_cb; | |
53 | discard_cb[BDEV_SLOW] = slow_discard_cb; | |
7c673cae FG |
54 | } |
55 | ||
56 | BlueFS::~BlueFS() | |
57 | { | |
58 | for (auto p : ioc) { | |
59 | if (p) | |
60 | p->aio_wait(); | |
61 | } | |
62 | for (auto p : bdev) { | |
63 | if (p) { | |
64 | p->close(); | |
65 | delete p; | |
66 | } | |
67 | } | |
68 | for (auto p : ioc) { | |
69 | delete p; | |
70 | } | |
71 | } | |
72 | ||
73 | void BlueFS::_init_logger() | |
74 | { | |
75 | PerfCountersBuilder b(cct, "bluefs", | |
76 | l_bluefs_first, l_bluefs_last); | |
77 | b.add_u64_counter(l_bluefs_gift_bytes, "gift_bytes", | |
11fdf7f2 | 78 | "Bytes gifted from BlueStore", NULL, 0, unit_t(UNIT_BYTES)); |
7c673cae | 79 | b.add_u64_counter(l_bluefs_reclaim_bytes, "reclaim_bytes", |
11fdf7f2 | 80 | "Bytes reclaimed by BlueStore", NULL, 0, unit_t(UNIT_BYTES)); |
7c673cae FG |
81 | b.add_u64(l_bluefs_db_total_bytes, "db_total_bytes", |
82 | "Total bytes (main db device)", | |
11fdf7f2 | 83 | "b", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); |
7c673cae FG |
84 | b.add_u64(l_bluefs_db_used_bytes, "db_used_bytes", |
85 | "Used bytes (main db device)", | |
11fdf7f2 | 86 | "u", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); |
7c673cae FG |
87 | b.add_u64(l_bluefs_wal_total_bytes, "wal_total_bytes", |
88 | "Total bytes (wal device)", | |
11fdf7f2 | 89 | "walb", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); |
7c673cae FG |
90 | b.add_u64(l_bluefs_wal_used_bytes, "wal_used_bytes", |
91 | "Used bytes (wal device)", | |
11fdf7f2 | 92 | "walu", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); |
7c673cae FG |
93 | b.add_u64(l_bluefs_slow_total_bytes, "slow_total_bytes", |
94 | "Total bytes (slow device)", | |
11fdf7f2 | 95 | "slob", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); |
7c673cae FG |
96 | b.add_u64(l_bluefs_slow_used_bytes, "slow_used_bytes", |
97 | "Used bytes (slow device)", | |
11fdf7f2 | 98 | "slou", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); |
7c673cae FG |
99 | b.add_u64(l_bluefs_num_files, "num_files", "File count", |
100 | "f", PerfCountersBuilder::PRIO_USEFUL); | |
101 | b.add_u64(l_bluefs_log_bytes, "log_bytes", "Size of the metadata log", | |
11fdf7f2 | 102 | "jlen", PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES)); |
7c673cae FG |
103 | b.add_u64_counter(l_bluefs_log_compactions, "log_compactions", |
104 | "Compactions of the metadata log"); | |
105 | b.add_u64_counter(l_bluefs_logged_bytes, "logged_bytes", | |
106 | "Bytes written to the metadata log", "j", | |
11fdf7f2 | 107 | PerfCountersBuilder::PRIO_CRITICAL, unit_t(UNIT_BYTES)); |
7c673cae FG |
108 | b.add_u64_counter(l_bluefs_files_written_wal, "files_written_wal", |
109 | "Files written to WAL"); | |
110 | b.add_u64_counter(l_bluefs_files_written_sst, "files_written_sst", | |
111 | "Files written to SSTs"); | |
112 | b.add_u64_counter(l_bluefs_bytes_written_wal, "bytes_written_wal", | |
113 | "Bytes written to WAL", "wal", | |
114 | PerfCountersBuilder::PRIO_CRITICAL); | |
115 | b.add_u64_counter(l_bluefs_bytes_written_sst, "bytes_written_sst", | |
116 | "Bytes written to SSTs", "sst", | |
11fdf7f2 TL |
117 | PerfCountersBuilder::PRIO_CRITICAL, unit_t(UNIT_BYTES)); |
118 | b.add_u64_counter(l_bluefs_bytes_written_slow, "bytes_written_slow", | |
119 | "Bytes written to WAL/SSTs at slow device", NULL, | |
120 | PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); | |
121 | b.add_u64_counter(l_bluefs_max_bytes_wal, "max_bytes_wal", | |
122 | "Maximum bytes allocated from WAL"); | |
123 | b.add_u64_counter(l_bluefs_max_bytes_db, "max_bytes_db", | |
124 | "Maximum bytes allocated from DB"); | |
125 | b.add_u64_counter(l_bluefs_max_bytes_slow, "max_bytes_slow", | |
126 | "Maximum bytes allocated from SLOW"); | |
494da23a TL |
127 | |
128 | b.add_u64_counter(l_bluefs_read_random_count, "read_random_count", | |
129 | "random read requests processed"); | |
130 | b.add_u64_counter(l_bluefs_read_random_bytes, "read_random_bytes", | |
131 | "Bytes requested in random read mode", NULL, | |
132 | PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); | |
133 | b.add_u64_counter(l_bluefs_read_random_disk_count, "read_random_disk_count", | |
134 | "random reads requests going to disk"); | |
135 | b.add_u64_counter(l_bluefs_read_random_disk_bytes, "read_random_disk_bytes", | |
136 | "Bytes read from disk in random read mode", NULL, | |
137 | PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); | |
138 | b.add_u64_counter(l_bluefs_read_random_buffer_count, "read_random_buffer_count", | |
139 | "random read requests processed using prefetch buffer"); | |
140 | b.add_u64_counter(l_bluefs_read_random_buffer_bytes, "read_random_buffer_bytes", | |
141 | "Bytes read from prefetch buffer in random read mode", NULL, | |
142 | PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); | |
143 | ||
144 | b.add_u64_counter(l_bluefs_read_count, "read_count", | |
145 | "buffered read requests processed"); | |
146 | b.add_u64_counter(l_bluefs_read_bytes, "read_bytes", | |
147 | "Bytes requested in buffered read mode", NULL, | |
148 | PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); | |
149 | ||
150 | b.add_u64_counter(l_bluefs_read_prefetch_count, "read_prefetch_count", | |
151 | "prefetch read requests processed"); | |
152 | b.add_u64_counter(l_bluefs_read_prefetch_bytes, "read_prefetch_bytes", | |
153 | "Bytes requested in prefetch read mode", NULL, | |
154 | PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); | |
155 | ||
7c673cae FG |
156 | logger = b.create_perf_counters(); |
157 | cct->get_perfcounters_collection()->add(logger); | |
158 | } | |
159 | ||
160 | void BlueFS::_shutdown_logger() | |
161 | { | |
162 | cct->get_perfcounters_collection()->remove(logger); | |
163 | delete logger; | |
164 | } | |
165 | ||
166 | void BlueFS::_update_logger_stats() | |
167 | { | |
168 | // we must be holding the lock | |
169 | logger->set(l_bluefs_num_files, file_map.size()); | |
170 | logger->set(l_bluefs_log_bytes, log_writer->file->fnode.size); | |
171 | ||
172 | if (alloc[BDEV_WAL]) { | |
11fdf7f2 | 173 | logger->set(l_bluefs_wal_total_bytes, block_all[BDEV_WAL].size()); |
7c673cae | 174 | logger->set(l_bluefs_wal_used_bytes, |
11fdf7f2 | 175 | block_all[BDEV_WAL].size() - alloc[BDEV_WAL]->get_free()); |
7c673cae FG |
176 | } |
177 | if (alloc[BDEV_DB]) { | |
11fdf7f2 | 178 | logger->set(l_bluefs_db_total_bytes, block_all[BDEV_DB].size()); |
7c673cae | 179 | logger->set(l_bluefs_db_used_bytes, |
11fdf7f2 | 180 | block_all[BDEV_DB].size() - alloc[BDEV_DB]->get_free()); |
7c673cae FG |
181 | } |
182 | if (alloc[BDEV_SLOW]) { | |
11fdf7f2 | 183 | logger->set(l_bluefs_slow_total_bytes, block_all[BDEV_SLOW].size()); |
7c673cae | 184 | logger->set(l_bluefs_slow_used_bytes, |
11fdf7f2 | 185 | block_all[BDEV_SLOW].size() - alloc[BDEV_SLOW]->get_free()); |
7c673cae FG |
186 | } |
187 | } | |
188 | ||
11fdf7f2 TL |
189 | int BlueFS::add_block_device(unsigned id, const string& path, bool trim, |
190 | bool shared_with_bluestore) | |
7c673cae FG |
191 | { |
192 | dout(10) << __func__ << " bdev " << id << " path " << path << dendl; | |
11fdf7f2 TL |
193 | ceph_assert(id < bdev.size()); |
194 | ceph_assert(bdev[id] == NULL); | |
195 | BlockDevice *b = BlockDevice::create(cct, path, NULL, NULL, | |
196 | discard_cb[id], static_cast<void*>(this)); | |
197 | if (shared_with_bluestore) { | |
198 | b->set_no_exclusive_lock(); | |
199 | } | |
7c673cae FG |
200 | int r = b->open(path); |
201 | if (r < 0) { | |
202 | delete b; | |
203 | return r; | |
204 | } | |
11fdf7f2 TL |
205 | if (trim) { |
206 | b->discard(0, b->get_size()); | |
207 | } | |
208 | ||
7c673cae | 209 | dout(1) << __func__ << " bdev " << id << " path " << path |
1adf2230 | 210 | << " size " << byte_u_t(b->get_size()) << dendl; |
7c673cae FG |
211 | bdev[id] = b; |
212 | ioc[id] = new IOContext(cct, NULL); | |
213 | return 0; | |
214 | } | |
215 | ||
216 | bool BlueFS::bdev_support_label(unsigned id) | |
217 | { | |
11fdf7f2 TL |
218 | ceph_assert(id < bdev.size()); |
219 | ceph_assert(bdev[id]); | |
7c673cae FG |
220 | return bdev[id]->supported_bdev_label(); |
221 | } | |
222 | ||
223 | uint64_t BlueFS::get_block_device_size(unsigned id) | |
224 | { | |
225 | if (id < bdev.size() && bdev[id]) | |
226 | return bdev[id]->get_size(); | |
227 | return 0; | |
228 | } | |
229 | ||
11fdf7f2 | 230 | void BlueFS::_add_block_extent(unsigned id, uint64_t offset, uint64_t length) |
7c673cae | 231 | { |
7c673cae | 232 | dout(1) << __func__ << " bdev " << id |
11fdf7f2 | 233 | << " 0x" << std::hex << offset << "~" << length << std::dec |
7c673cae | 234 | << dendl; |
11fdf7f2 TL |
235 | |
236 | ceph_assert(id < bdev.size()); | |
237 | ceph_assert(bdev[id]); | |
238 | ceph_assert(bdev[id]->get_size() >= offset + length); | |
7c673cae | 239 | block_all[id].insert(offset, length); |
7c673cae FG |
240 | |
241 | if (id < alloc.size() && alloc[id]) { | |
242 | log_t.op_alloc_add(id, offset, length); | |
7c673cae FG |
243 | alloc[id]->init_add_free(offset, length); |
244 | } | |
245 | ||
246 | if (logger) | |
247 | logger->inc(l_bluefs_gift_bytes, length); | |
248 | dout(10) << __func__ << " done" << dendl; | |
249 | } | |
250 | ||
251 | int BlueFS::reclaim_blocks(unsigned id, uint64_t want, | |
a8e16298 | 252 | PExtentVector *extents) |
7c673cae | 253 | { |
11fdf7f2 | 254 | std::unique_lock l(lock); |
7c673cae FG |
255 | dout(1) << __func__ << " bdev " << id |
256 | << " want 0x" << std::hex << want << std::dec << dendl; | |
11fdf7f2 TL |
257 | ceph_assert(id < alloc.size()); |
258 | ceph_assert(alloc[id]); | |
a8e16298 | 259 | |
7c673cae FG |
260 | int64_t got = alloc[id]->allocate(want, cct->_conf->bluefs_alloc_size, 0, |
261 | extents); | |
11fdf7f2 | 262 | ceph_assert(got != 0); |
a8e16298 | 263 | if (got < 0) { |
7c673cae | 264 | derr << __func__ << " failed to allocate space to return to bluestore" |
a8e16298 | 265 | << dendl; |
7c673cae FG |
266 | alloc[id]->dump(); |
267 | return got; | |
268 | } | |
269 | ||
270 | for (auto& p : *extents) { | |
271 | block_all[id].erase(p.offset, p.length); | |
7c673cae FG |
272 | log_t.op_alloc_rm(id, p.offset, p.length); |
273 | } | |
274 | ||
275 | flush_bdev(); | |
a8e16298 | 276 | int r = _flush_and_sync_log(l); |
11fdf7f2 | 277 | ceph_assert(r == 0); |
7c673cae | 278 | |
11fdf7f2 | 279 | logger->inc(l_bluefs_reclaim_bytes, got); |
7c673cae FG |
280 | dout(1) << __func__ << " bdev " << id << " want 0x" << std::hex << want |
281 | << " got " << *extents << dendl; | |
282 | return 0; | |
283 | } | |
284 | ||
11fdf7f2 | 285 | void BlueFS::handle_discard(unsigned id, interval_set<uint64_t>& to_release) |
7c673cae | 286 | { |
11fdf7f2 TL |
287 | dout(10) << __func__ << " bdev " << id << dendl; |
288 | ceph_assert(alloc[id]); | |
289 | alloc[id]->release(to_release); | |
290 | } | |
291 | ||
292 | uint64_t BlueFS::get_used() | |
293 | { | |
294 | std::lock_guard l(lock); | |
295 | uint64_t used = 0; | |
296 | for (unsigned id = 0; id < MAX_BDEV; ++id) { | |
297 | if (alloc[id]) { | |
298 | used += block_all[id].size() - alloc[id]->get_free(); | |
299 | } | |
7c673cae | 300 | } |
11fdf7f2 | 301 | return used; |
7c673cae FG |
302 | } |
303 | ||
304 | uint64_t BlueFS::get_total(unsigned id) | |
305 | { | |
11fdf7f2 TL |
306 | std::lock_guard l(lock); |
307 | ceph_assert(id < block_all.size()); | |
308 | return block_all[id].size(); | |
7c673cae FG |
309 | } |
310 | ||
311 | uint64_t BlueFS::get_free(unsigned id) | |
312 | { | |
11fdf7f2 TL |
313 | std::lock_guard l(lock); |
314 | ceph_assert(id < alloc.size()); | |
7c673cae FG |
315 | return alloc[id]->get_free(); |
316 | } | |
317 | ||
318 | void BlueFS::dump_perf_counters(Formatter *f) | |
319 | { | |
320 | f->open_object_section("bluefs_perf_counters"); | |
321 | logger->dump_formatted(f,0); | |
322 | f->close_section(); | |
323 | } | |
324 | ||
3efd9988 FG |
325 | void BlueFS::dump_block_extents(ostream& out) |
326 | { | |
327 | for (unsigned i = 0; i < MAX_BDEV; ++i) { | |
328 | if (!bdev[i]) { | |
329 | continue; | |
330 | } | |
11fdf7f2 TL |
331 | auto owned = get_total(i); |
332 | auto free = get_free(i); | |
333 | out << i << " : device size 0x" << std::hex << bdev[i]->get_size() | |
334 | << " : own 0x" << block_all[i] | |
335 | << " = 0x" << owned | |
336 | << " : using 0x" << owned - free | |
494da23a TL |
337 | << std::dec << "(" << byte_u_t(owned - free) << ")" |
338 | << "\n"; | |
3efd9988 FG |
339 | } |
340 | } | |
7c673cae FG |
341 | |
342 | void BlueFS::get_usage(vector<pair<uint64_t,uint64_t>> *usage) | |
343 | { | |
11fdf7f2 | 344 | std::lock_guard l(lock); |
7c673cae FG |
345 | usage->resize(bdev.size()); |
346 | for (unsigned id = 0; id < bdev.size(); ++id) { | |
347 | if (!bdev[id]) { | |
348 | (*usage)[id] = make_pair(0, 0); | |
349 | continue; | |
350 | } | |
351 | (*usage)[id].first = alloc[id]->get_free(); | |
11fdf7f2 | 352 | (*usage)[id].second = block_all[id].size(); |
7c673cae | 353 | uint64_t used = |
11fdf7f2 | 354 | (block_all[id].size() - (*usage)[id].first) * 100 / block_all[id].size(); |
7c673cae FG |
355 | dout(10) << __func__ << " bdev " << id |
356 | << " free " << (*usage)[id].first | |
1adf2230 | 357 | << " (" << byte_u_t((*usage)[id].first) << ")" |
7c673cae | 358 | << " / " << (*usage)[id].second |
1adf2230 | 359 | << " (" << byte_u_t((*usage)[id].second) << ")" |
7c673cae FG |
360 | << ", used " << used << "%" |
361 | << dendl; | |
362 | } | |
363 | } | |
364 | ||
365 | int BlueFS::get_block_extents(unsigned id, interval_set<uint64_t> *extents) | |
366 | { | |
11fdf7f2 | 367 | std::lock_guard l(lock); |
7c673cae FG |
368 | dout(10) << __func__ << " bdev " << id << dendl; |
369 | if (id >= block_all.size()) | |
370 | return -EINVAL; | |
371 | *extents = block_all[id]; | |
372 | return 0; | |
373 | } | |
374 | ||
375 | int BlueFS::mkfs(uuid_d osd_uuid) | |
376 | { | |
11fdf7f2 | 377 | std::unique_lock l(lock); |
7c673cae FG |
378 | dout(1) << __func__ |
379 | << " osd_uuid " << osd_uuid | |
380 | << dendl; | |
381 | ||
382 | _init_alloc(); | |
383 | _init_logger(); | |
384 | ||
385 | super.version = 1; | |
386 | super.block_size = bdev[BDEV_DB]->get_block_size(); | |
387 | super.osd_uuid = osd_uuid; | |
388 | super.uuid.generate_random(); | |
389 | dout(1) << __func__ << " uuid " << super.uuid << dendl; | |
390 | ||
391 | // init log | |
392 | FileRef log_file = new File; | |
393 | log_file->fnode.ino = 1; | |
394 | log_file->fnode.prefer_bdev = BDEV_WAL; | |
395 | int r = _allocate( | |
396 | log_file->fnode.prefer_bdev, | |
397 | cct->_conf->bluefs_max_log_runway, | |
94b18763 | 398 | &log_file->fnode); |
11fdf7f2 | 399 | ceph_assert(r == 0); |
7c673cae FG |
400 | log_writer = _create_writer(log_file); |
401 | ||
402 | // initial txn | |
403 | log_t.op_init(); | |
404 | for (unsigned bdev = 0; bdev < MAX_BDEV; ++bdev) { | |
405 | interval_set<uint64_t>& p = block_all[bdev]; | |
406 | if (p.empty()) | |
407 | continue; | |
408 | for (interval_set<uint64_t>::iterator q = p.begin(); q != p.end(); ++q) { | |
409 | dout(20) << __func__ << " op_alloc_add " << bdev << " 0x" | |
410 | << std::hex << q.get_start() << "~" << q.get_len() << std::dec | |
411 | << dendl; | |
412 | log_t.op_alloc_add(bdev, q.get_start(), q.get_len()); | |
413 | } | |
414 | } | |
415 | _flush_and_sync_log(l); | |
416 | ||
417 | // write supers | |
418 | super.log_fnode = log_file->fnode; | |
11fdf7f2 | 419 | _write_super(BDEV_DB); |
7c673cae FG |
420 | flush_bdev(); |
421 | ||
422 | // clean up | |
423 | super = bluefs_super_t(); | |
424 | _close_writer(log_writer); | |
425 | log_writer = NULL; | |
426 | block_all.clear(); | |
7c673cae FG |
427 | _stop_alloc(); |
428 | _shutdown_logger(); | |
429 | ||
430 | dout(10) << __func__ << " success" << dendl; | |
431 | return 0; | |
432 | } | |
433 | ||
434 | void BlueFS::_init_alloc() | |
435 | { | |
436 | dout(20) << __func__ << dendl; | |
437 | alloc.resize(MAX_BDEV); | |
438 | pending_release.resize(MAX_BDEV); | |
439 | for (unsigned id = 0; id < bdev.size(); ++id) { | |
440 | if (!bdev[id]) { | |
441 | continue; | |
442 | } | |
11fdf7f2 | 443 | ceph_assert(bdev[id]->get_size()); |
7c673cae FG |
444 | alloc[id] = Allocator::create(cct, cct->_conf->bluefs_allocator, |
445 | bdev[id]->get_size(), | |
446 | cct->_conf->bluefs_alloc_size); | |
447 | interval_set<uint64_t>& p = block_all[id]; | |
448 | for (interval_set<uint64_t>::iterator q = p.begin(); q != p.end(); ++q) { | |
449 | alloc[id]->init_add_free(q.get_start(), q.get_len()); | |
450 | } | |
451 | } | |
452 | } | |
453 | ||
454 | void BlueFS::_stop_alloc() | |
455 | { | |
456 | dout(20) << __func__ << dendl; | |
11fdf7f2 TL |
457 | for (auto p : bdev) { |
458 | if (p) | |
459 | p->discard_drain(); | |
460 | } | |
461 | ||
7c673cae FG |
462 | for (auto p : alloc) { |
463 | if (p != nullptr) { | |
464 | p->shutdown(); | |
465 | delete p; | |
466 | } | |
467 | } | |
468 | alloc.clear(); | |
469 | } | |
470 | ||
471 | int BlueFS::mount() | |
472 | { | |
473 | dout(1) << __func__ << dendl; | |
474 | ||
475 | int r = _open_super(); | |
476 | if (r < 0) { | |
477 | derr << __func__ << " failed to open super: " << cpp_strerror(r) << dendl; | |
478 | goto out; | |
479 | } | |
480 | ||
481 | block_all.clear(); | |
482 | block_all.resize(MAX_BDEV); | |
7c673cae | 483 | _init_alloc(); |
494da23a | 484 | _init_logger(); |
7c673cae | 485 | |
11fdf7f2 | 486 | r = _replay(false, false); |
7c673cae FG |
487 | if (r < 0) { |
488 | derr << __func__ << " failed to replay log: " << cpp_strerror(r) << dendl; | |
489 | _stop_alloc(); | |
490 | goto out; | |
491 | } | |
492 | ||
493 | // init freelist | |
494 | for (auto& p : file_map) { | |
495 | dout(30) << __func__ << " noting alloc for " << p.second->fnode << dendl; | |
496 | for (auto& q : p.second->fnode.extents) { | |
497 | alloc[q.bdev]->init_rm_free(q.offset, q.length); | |
498 | } | |
499 | } | |
500 | ||
501 | // set up the log for future writes | |
502 | log_writer = _create_writer(_get_file(1)); | |
11fdf7f2 | 503 | ceph_assert(log_writer->file->fnode.ino == 1); |
7c673cae FG |
504 | log_writer->pos = log_writer->file->fnode.size; |
505 | dout(10) << __func__ << " log write pos set to 0x" | |
506 | << std::hex << log_writer->pos << std::dec | |
507 | << dendl; | |
508 | ||
7c673cae FG |
509 | return 0; |
510 | ||
511 | out: | |
512 | super = bluefs_super_t(); | |
513 | return r; | |
514 | } | |
515 | ||
516 | void BlueFS::umount() | |
517 | { | |
518 | dout(1) << __func__ << dendl; | |
519 | ||
520 | sync_metadata(); | |
521 | ||
522 | _close_writer(log_writer); | |
523 | log_writer = NULL; | |
524 | ||
525 | _stop_alloc(); | |
526 | file_map.clear(); | |
527 | dir_map.clear(); | |
528 | super = bluefs_super_t(); | |
529 | log_t.clear(); | |
530 | _shutdown_logger(); | |
531 | } | |
532 | ||
11fdf7f2 | 533 | int BlueFS::prepare_new_device(int id) |
7c673cae | 534 | { |
11fdf7f2 TL |
535 | dout(1) << __func__ << dendl; |
536 | ||
537 | if(id == BDEV_NEWDB) { | |
538 | int new_log_dev_cur = BDEV_WAL; | |
539 | int new_log_dev_next = BDEV_WAL; | |
540 | if (!bdev[BDEV_WAL]) { | |
541 | new_log_dev_cur = BDEV_NEWDB; | |
542 | new_log_dev_next = BDEV_DB; | |
543 | } | |
544 | _rewrite_log_sync(false, | |
545 | BDEV_NEWDB, | |
546 | new_log_dev_cur, | |
547 | new_log_dev_next, | |
548 | RENAME_DB2SLOW); | |
549 | //} | |
550 | } else if(id == BDEV_NEWWAL) { | |
551 | _rewrite_log_sync(false, BDEV_DB, BDEV_NEWWAL, BDEV_WAL, REMOVE_WAL); | |
552 | } else { | |
553 | assert(false); | |
554 | } | |
555 | return 0; | |
556 | } | |
557 | ||
558 | void BlueFS::collect_metadata(map<string,string> *pm, unsigned skip_bdev_id) | |
559 | { | |
560 | if (skip_bdev_id != BDEV_DB && bdev[BDEV_DB]) | |
7c673cae FG |
561 | bdev[BDEV_DB]->collect_metadata("bluefs_db_", pm); |
562 | if (bdev[BDEV_WAL]) | |
563 | bdev[BDEV_WAL]->collect_metadata("bluefs_wal_", pm); | |
11fdf7f2 TL |
564 | } |
565 | ||
566 | void BlueFS::get_devices(set<string> *ls) | |
567 | { | |
568 | for (unsigned i = 0; i < MAX_BDEV; ++i) { | |
569 | if (bdev[i]) { | |
570 | bdev[i]->get_devices(ls); | |
571 | } | |
572 | } | |
7c673cae FG |
573 | } |
574 | ||
575 | int BlueFS::fsck() | |
576 | { | |
11fdf7f2 | 577 | std::lock_guard l(lock); |
7c673cae FG |
578 | dout(1) << __func__ << dendl; |
579 | // hrm, i think we check everything on mount... | |
580 | return 0; | |
581 | } | |
582 | ||
11fdf7f2 | 583 | int BlueFS::_write_super(int dev) |
7c673cae FG |
584 | { |
585 | // build superblock | |
586 | bufferlist bl; | |
11fdf7f2 | 587 | encode(super, bl); |
7c673cae | 588 | uint32_t crc = bl.crc32c(-1); |
11fdf7f2 | 589 | encode(crc, bl); |
7c673cae FG |
590 | dout(10) << __func__ << " super block length(encoded): " << bl.length() << dendl; |
591 | dout(10) << __func__ << " superblock " << super.version << dendl; | |
592 | dout(10) << __func__ << " log_fnode " << super.log_fnode << dendl; | |
11fdf7f2 | 593 | ceph_assert(bl.length() <= get_super_length()); |
7c673cae FG |
594 | bl.append_zero(get_super_length() - bl.length()); |
595 | ||
11fdf7f2 | 596 | bdev[dev]->write(get_super_offset(), bl, false, WRITE_LIFE_SHORT); |
7c673cae FG |
597 | dout(20) << __func__ << " v " << super.version |
598 | << " crc 0x" << std::hex << crc | |
599 | << " offset 0x" << get_super_offset() << std::dec | |
600 | << dendl; | |
601 | return 0; | |
602 | } | |
603 | ||
604 | int BlueFS::_open_super() | |
605 | { | |
606 | dout(10) << __func__ << dendl; | |
607 | ||
608 | bufferlist bl; | |
609 | uint32_t expected_crc, crc; | |
610 | int r; | |
611 | ||
612 | // always the second block | |
613 | r = bdev[BDEV_DB]->read(get_super_offset(), get_super_length(), | |
614 | &bl, ioc[BDEV_DB], false); | |
615 | if (r < 0) | |
616 | return r; | |
617 | ||
11fdf7f2 TL |
618 | auto p = bl.cbegin(); |
619 | decode(super, p); | |
7c673cae FG |
620 | { |
621 | bufferlist t; | |
622 | t.substr_of(bl, 0, p.get_off()); | |
623 | crc = t.crc32c(-1); | |
624 | } | |
11fdf7f2 | 625 | decode(expected_crc, p); |
7c673cae FG |
626 | if (crc != expected_crc) { |
627 | derr << __func__ << " bad crc on superblock, expected 0x" | |
628 | << std::hex << expected_crc << " != actual 0x" << crc << std::dec | |
629 | << dendl; | |
630 | return -EIO; | |
631 | } | |
632 | dout(10) << __func__ << " superblock " << super.version << dendl; | |
633 | dout(10) << __func__ << " log_fnode " << super.log_fnode << dendl; | |
634 | return 0; | |
635 | } | |
636 | ||
11fdf7f2 | 637 | int BlueFS::_replay(bool noop, bool to_stdout) |
7c673cae FG |
638 | { |
639 | dout(10) << __func__ << (noop ? " NO-OP" : "") << dendl; | |
640 | ino_last = 1; // by the log | |
641 | log_seq = 0; | |
642 | ||
643 | FileRef log_file; | |
11fdf7f2 TL |
644 | log_file = _get_file(1); |
645 | if (!noop) { | |
646 | log_file->fnode = super.log_fnode; | |
7c673cae | 647 | } else { |
11fdf7f2 TL |
648 | // do not use fnode from superblock in 'noop' mode - log_file's one should |
649 | // be fine and up-to-date | |
650 | ceph_assert(log_file->fnode.ino == 1); | |
651 | ceph_assert(log_file->fnode.extents.size() != 0); | |
7c673cae | 652 | } |
7c673cae | 653 | dout(10) << __func__ << " log_fnode " << super.log_fnode << dendl; |
11fdf7f2 TL |
654 | if (unlikely(to_stdout)) { |
655 | std::cout << " log_fnode " << super.log_fnode << std::endl; | |
656 | } | |
7c673cae FG |
657 | |
658 | FileReader *log_reader = new FileReader( | |
659 | log_file, cct->_conf->bluefs_max_prefetch, | |
660 | false, // !random | |
661 | true); // ignore eof | |
662 | while (true) { | |
11fdf7f2 | 663 | ceph_assert((log_reader->buf.pos & ~super.block_mask()) == 0); |
7c673cae FG |
664 | uint64_t pos = log_reader->buf.pos; |
665 | uint64_t read_pos = pos; | |
666 | bufferlist bl; | |
667 | { | |
668 | int r = _read(log_reader, &log_reader->buf, read_pos, super.block_size, | |
669 | &bl, NULL); | |
11fdf7f2 | 670 | ceph_assert(r == (int)super.block_size); |
7c673cae FG |
671 | read_pos += r; |
672 | } | |
673 | uint64_t more = 0; | |
674 | uint64_t seq; | |
675 | uuid_d uuid; | |
676 | { | |
11fdf7f2 | 677 | auto p = bl.cbegin(); |
7c673cae FG |
678 | __u8 a, b; |
679 | uint32_t len; | |
11fdf7f2 TL |
680 | decode(a, p); |
681 | decode(b, p); | |
682 | decode(len, p); | |
683 | decode(uuid, p); | |
684 | decode(seq, p); | |
7c673cae | 685 | if (len + 6 > bl.length()) { |
11fdf7f2 | 686 | more = round_up_to(len + 6 - bl.length(), super.block_size); |
7c673cae FG |
687 | } |
688 | } | |
689 | if (uuid != super.uuid) { | |
690 | dout(10) << __func__ << " 0x" << std::hex << pos << std::dec | |
691 | << ": stop: uuid " << uuid << " != super.uuid " << super.uuid | |
692 | << dendl; | |
693 | break; | |
694 | } | |
695 | if (seq != log_seq + 1) { | |
696 | dout(10) << __func__ << " 0x" << std::hex << pos << std::dec | |
697 | << ": stop: seq " << seq << " != expected " << log_seq + 1 | |
698 | << dendl; | |
699 | break; | |
700 | } | |
701 | if (more) { | |
702 | dout(20) << __func__ << " need 0x" << std::hex << more << std::dec | |
703 | << " more bytes" << dendl; | |
704 | bufferlist t; | |
705 | int r = _read(log_reader, &log_reader->buf, read_pos, more, &t, NULL); | |
706 | if (r < (int)more) { | |
707 | dout(10) << __func__ << " 0x" << std::hex << pos | |
708 | << ": stop: len is 0x" << bl.length() + more << std::dec | |
709 | << ", which is past eof" << dendl; | |
710 | break; | |
711 | } | |
11fdf7f2 | 712 | ceph_assert(r == (int)more); |
7c673cae FG |
713 | bl.claim_append(t); |
714 | read_pos += r; | |
715 | } | |
716 | bluefs_transaction_t t; | |
717 | try { | |
11fdf7f2 TL |
718 | auto p = bl.cbegin(); |
719 | decode(t, p); | |
7c673cae FG |
720 | } |
721 | catch (buffer::error& e) { | |
722 | dout(10) << __func__ << " 0x" << std::hex << pos << std::dec | |
723 | << ": stop: failed to decode: " << e.what() | |
724 | << dendl; | |
725 | delete log_reader; | |
726 | return -EIO; | |
727 | } | |
11fdf7f2 | 728 | ceph_assert(seq == t.seq); |
7c673cae FG |
729 | dout(10) << __func__ << " 0x" << std::hex << pos << std::dec |
730 | << ": " << t << dendl; | |
11fdf7f2 TL |
731 | if (unlikely(to_stdout)) { |
732 | std::cout << " 0x" << std::hex << pos << std::dec | |
733 | << ": " << t << std::endl; | |
734 | } | |
7c673cae | 735 | |
11fdf7f2 | 736 | auto p = t.op_bl.cbegin(); |
7c673cae FG |
737 | while (!p.end()) { |
738 | __u8 op; | |
11fdf7f2 | 739 | decode(op, p); |
7c673cae FG |
740 | switch (op) { |
741 | ||
742 | case bluefs_transaction_t::OP_INIT: | |
743 | dout(20) << __func__ << " 0x" << std::hex << pos << std::dec | |
744 | << ": op_init" << dendl; | |
11fdf7f2 TL |
745 | if (unlikely(to_stdout)) { |
746 | std::cout << " 0x" << std::hex << pos << std::dec | |
747 | << ": op_init" << std::endl; | |
748 | } | |
749 | ||
750 | ceph_assert(t.seq == 1); | |
7c673cae FG |
751 | break; |
752 | ||
753 | case bluefs_transaction_t::OP_JUMP: | |
754 | { | |
755 | uint64_t next_seq; | |
756 | uint64_t offset; | |
11fdf7f2 TL |
757 | decode(next_seq, p); |
758 | decode(offset, p); | |
7c673cae FG |
759 | dout(20) << __func__ << " 0x" << std::hex << pos << std::dec |
760 | << ": op_jump seq " << next_seq | |
761 | << " offset 0x" << std::hex << offset << std::dec << dendl; | |
11fdf7f2 TL |
762 | if (unlikely(to_stdout)) { |
763 | std::cout << " 0x" << std::hex << pos << std::dec | |
764 | << ": op_jump seq " << next_seq | |
765 | << " offset 0x" << std::hex << offset << std::dec | |
766 | << std::endl; | |
767 | } | |
768 | ||
769 | ceph_assert(next_seq >= log_seq); | |
7c673cae FG |
770 | log_seq = next_seq - 1; // we will increment it below |
771 | uint64_t skip = offset - read_pos; | |
772 | if (skip) { | |
773 | bufferlist junk; | |
774 | int r = _read(log_reader, &log_reader->buf, read_pos, skip, &junk, | |
775 | NULL); | |
776 | if (r != (int)skip) { | |
777 | dout(10) << __func__ << " 0x" << std::hex << read_pos | |
778 | << ": stop: failed to skip to " << offset | |
779 | << std::dec << dendl; | |
11fdf7f2 | 780 | ceph_abort_msg("problem with op_jump"); |
7c673cae FG |
781 | } |
782 | } | |
783 | } | |
784 | break; | |
785 | ||
786 | case bluefs_transaction_t::OP_JUMP_SEQ: | |
787 | { | |
788 | uint64_t next_seq; | |
11fdf7f2 | 789 | decode(next_seq, p); |
7c673cae FG |
790 | dout(20) << __func__ << " 0x" << std::hex << pos << std::dec |
791 | << ": op_jump_seq " << next_seq << dendl; | |
11fdf7f2 TL |
792 | if (unlikely(to_stdout)) { |
793 | std::cout << " 0x" << std::hex << pos << std::dec | |
794 | << ": op_jump_seq " << next_seq << std::endl; | |
795 | } | |
796 | ||
797 | ceph_assert(next_seq >= log_seq); | |
7c673cae FG |
798 | log_seq = next_seq - 1; // we will increment it below |
799 | } | |
800 | break; | |
801 | ||
802 | case bluefs_transaction_t::OP_ALLOC_ADD: | |
803 | { | |
804 | __u8 id; | |
805 | uint64_t offset, length; | |
11fdf7f2 TL |
806 | decode(id, p); |
807 | decode(offset, p); | |
808 | decode(length, p); | |
7c673cae FG |
809 | dout(20) << __func__ << " 0x" << std::hex << pos << std::dec |
810 | << ": op_alloc_add " << " " << (int)id | |
811 | << ":0x" << std::hex << offset << "~" << length << std::dec | |
812 | << dendl; | |
11fdf7f2 TL |
813 | if (unlikely(to_stdout)) { |
814 | std::cout << " 0x" << std::hex << pos << std::dec | |
815 | << ": op_alloc_add " << " " << (int)id | |
816 | << ":0x" << std::hex << offset << "~" << length << std::dec | |
817 | << std::endl; | |
818 | } | |
819 | ||
7c673cae FG |
820 | if (!noop) { |
821 | block_all[id].insert(offset, length); | |
7c673cae FG |
822 | alloc[id]->init_add_free(offset, length); |
823 | } | |
824 | } | |
825 | break; | |
826 | ||
827 | case bluefs_transaction_t::OP_ALLOC_RM: | |
828 | { | |
829 | __u8 id; | |
830 | uint64_t offset, length; | |
11fdf7f2 TL |
831 | decode(id, p); |
832 | decode(offset, p); | |
833 | decode(length, p); | |
7c673cae FG |
834 | dout(20) << __func__ << " 0x" << std::hex << pos << std::dec |
835 | << ": op_alloc_rm " << " " << (int)id | |
836 | << ":0x" << std::hex << offset << "~" << length << std::dec | |
837 | << dendl; | |
11fdf7f2 TL |
838 | if (unlikely(to_stdout)) { |
839 | std::cout << " 0x" << std::hex << pos << std::dec | |
840 | << ": op_alloc_rm " << " " << (int)id | |
841 | << ":0x" << std::hex << offset << "~" << length << std::dec | |
842 | << std::endl; | |
843 | } | |
844 | ||
7c673cae FG |
845 | if (!noop) { |
846 | block_all[id].erase(offset, length); | |
7c673cae FG |
847 | alloc[id]->init_rm_free(offset, length); |
848 | } | |
849 | } | |
850 | break; | |
851 | ||
852 | case bluefs_transaction_t::OP_DIR_LINK: | |
853 | { | |
854 | string dirname, filename; | |
855 | uint64_t ino; | |
11fdf7f2 TL |
856 | decode(dirname, p); |
857 | decode(filename, p); | |
858 | decode(ino, p); | |
7c673cae FG |
859 | dout(20) << __func__ << " 0x" << std::hex << pos << std::dec |
860 | << ": op_dir_link " << " " << dirname << "/" << filename | |
861 | << " to " << ino | |
862 | << dendl; | |
11fdf7f2 TL |
863 | if (unlikely(to_stdout)) { |
864 | std::cout << " 0x" << std::hex << pos << std::dec | |
865 | << ": op_dir_link " << " " << dirname << "/" << filename | |
866 | << " to " << ino | |
867 | << std::endl; | |
868 | } | |
869 | ||
7c673cae FG |
870 | if (!noop) { |
871 | FileRef file = _get_file(ino); | |
11fdf7f2 | 872 | ceph_assert(file->fnode.ino); |
7c673cae | 873 | map<string,DirRef>::iterator q = dir_map.find(dirname); |
11fdf7f2 | 874 | ceph_assert(q != dir_map.end()); |
7c673cae | 875 | map<string,FileRef>::iterator r = q->second->file_map.find(filename); |
11fdf7f2 | 876 | ceph_assert(r == q->second->file_map.end()); |
7c673cae FG |
877 | q->second->file_map[filename] = file; |
878 | ++file->refs; | |
879 | } | |
880 | } | |
881 | break; | |
882 | ||
883 | case bluefs_transaction_t::OP_DIR_UNLINK: | |
884 | { | |
885 | string dirname, filename; | |
11fdf7f2 TL |
886 | decode(dirname, p); |
887 | decode(filename, p); | |
7c673cae FG |
888 | dout(20) << __func__ << " 0x" << std::hex << pos << std::dec |
889 | << ": op_dir_unlink " << " " << dirname << "/" << filename | |
890 | << dendl; | |
11fdf7f2 TL |
891 | if (unlikely(to_stdout)) { |
892 | std::cout << " 0x" << std::hex << pos << std::dec | |
893 | << ": op_dir_unlink " << " " << dirname << "/" << filename | |
894 | << std::endl; | |
895 | } | |
896 | ||
7c673cae FG |
897 | if (!noop) { |
898 | map<string,DirRef>::iterator q = dir_map.find(dirname); | |
11fdf7f2 | 899 | ceph_assert(q != dir_map.end()); |
7c673cae | 900 | map<string,FileRef>::iterator r = q->second->file_map.find(filename); |
11fdf7f2 TL |
901 | ceph_assert(r != q->second->file_map.end()); |
902 | ceph_assert(r->second->refs > 0); | |
7c673cae FG |
903 | --r->second->refs; |
904 | q->second->file_map.erase(r); | |
905 | } | |
906 | } | |
907 | break; | |
908 | ||
909 | case bluefs_transaction_t::OP_DIR_CREATE: | |
910 | { | |
911 | string dirname; | |
11fdf7f2 | 912 | decode(dirname, p); |
7c673cae FG |
913 | dout(20) << __func__ << " 0x" << std::hex << pos << std::dec |
914 | << ": op_dir_create " << dirname << dendl; | |
11fdf7f2 TL |
915 | if (unlikely(to_stdout)) { |
916 | std::cout << " 0x" << std::hex << pos << std::dec | |
917 | << ": op_dir_create " << dirname << std::endl; | |
918 | } | |
919 | ||
7c673cae FG |
920 | if (!noop) { |
921 | map<string,DirRef>::iterator q = dir_map.find(dirname); | |
11fdf7f2 | 922 | ceph_assert(q == dir_map.end()); |
7c673cae FG |
923 | dir_map[dirname] = new Dir; |
924 | } | |
925 | } | |
926 | break; | |
927 | ||
928 | case bluefs_transaction_t::OP_DIR_REMOVE: | |
929 | { | |
930 | string dirname; | |
11fdf7f2 | 931 | decode(dirname, p); |
7c673cae FG |
932 | dout(20) << __func__ << " 0x" << std::hex << pos << std::dec |
933 | << ": op_dir_remove " << dirname << dendl; | |
11fdf7f2 TL |
934 | if (unlikely(to_stdout)) { |
935 | std::cout << " 0x" << std::hex << pos << std::dec | |
936 | << ": op_dir_remove " << dirname << std::endl; | |
937 | } | |
938 | ||
7c673cae FG |
939 | if (!noop) { |
940 | map<string,DirRef>::iterator q = dir_map.find(dirname); | |
11fdf7f2 TL |
941 | ceph_assert(q != dir_map.end()); |
942 | ceph_assert(q->second->file_map.empty()); | |
7c673cae FG |
943 | dir_map.erase(q); |
944 | } | |
945 | } | |
946 | break; | |
947 | ||
948 | case bluefs_transaction_t::OP_FILE_UPDATE: | |
949 | { | |
950 | bluefs_fnode_t fnode; | |
11fdf7f2 | 951 | decode(fnode, p); |
7c673cae FG |
952 | dout(20) << __func__ << " 0x" << std::hex << pos << std::dec |
953 | << ": op_file_update " << " " << fnode << dendl; | |
11fdf7f2 TL |
954 | if (unlikely(to_stdout)) { |
955 | std::cout << " 0x" << std::hex << pos << std::dec | |
956 | << ": op_file_update " << " " << fnode << std::endl; | |
957 | } | |
958 | ||
7c673cae FG |
959 | if (!noop) { |
960 | FileRef f = _get_file(fnode.ino); | |
961 | f->fnode = fnode; | |
962 | if (fnode.ino > ino_last) { | |
963 | ino_last = fnode.ino; | |
964 | } | |
965 | } | |
966 | } | |
967 | break; | |
968 | ||
969 | case bluefs_transaction_t::OP_FILE_REMOVE: | |
970 | { | |
971 | uint64_t ino; | |
11fdf7f2 | 972 | decode(ino, p); |
7c673cae FG |
973 | dout(20) << __func__ << " 0x" << std::hex << pos << std::dec |
974 | << ": op_file_remove " << ino << dendl; | |
11fdf7f2 TL |
975 | if (unlikely(to_stdout)) { |
976 | std::cout << " 0x" << std::hex << pos << std::dec | |
977 | << ": op_file_remove " << ino << std::endl; | |
978 | } | |
979 | ||
7c673cae FG |
980 | if (!noop) { |
981 | auto p = file_map.find(ino); | |
11fdf7f2 | 982 | ceph_assert(p != file_map.end()); |
7c673cae FG |
983 | file_map.erase(p); |
984 | } | |
985 | } | |
986 | break; | |
987 | ||
988 | default: | |
989 | derr << __func__ << " 0x" << std::hex << pos << std::dec | |
990 | << ": stop: unrecognized op " << (int)op << dendl; | |
991 | delete log_reader; | |
992 | return -EIO; | |
993 | } | |
994 | } | |
11fdf7f2 | 995 | ceph_assert(p.end()); |
7c673cae FG |
996 | |
997 | // we successfully replayed the transaction; bump the seq and log size | |
998 | ++log_seq; | |
999 | log_file->fnode.size = log_reader->buf.pos; | |
1000 | } | |
1001 | ||
1002 | dout(10) << __func__ << " log file size was 0x" | |
1003 | << std::hex << log_file->fnode.size << std::dec << dendl; | |
11fdf7f2 TL |
1004 | if (unlikely(to_stdout)) { |
1005 | std::cout << " log file size was 0x" | |
1006 | << std::hex << log_file->fnode.size << std::dec << std::endl; | |
1007 | } | |
1008 | ||
7c673cae FG |
1009 | delete log_reader; |
1010 | ||
1011 | if (!noop) { | |
1012 | // verify file link counts are all >0 | |
1013 | for (auto& p : file_map) { | |
1014 | if (p.second->refs == 0 && | |
1015 | p.second->fnode.ino > 1) { | |
1016 | derr << __func__ << " file with link count 0: " << p.second->fnode | |
1017 | << dendl; | |
1018 | return -EIO; | |
1019 | } | |
1020 | } | |
1021 | } | |
1022 | ||
1023 | dout(10) << __func__ << " done" << dendl; | |
1024 | return 0; | |
1025 | } | |
1026 | ||
11fdf7f2 TL |
1027 | int BlueFS::log_dump() |
1028 | { | |
1029 | // only dump log file's content | |
1030 | int r = _replay(true, true); | |
1031 | if (r < 0) { | |
1032 | derr << __func__ << " failed to replay log: " << cpp_strerror(r) << dendl; | |
1033 | return r; | |
1034 | } | |
1035 | ||
1036 | return 0; | |
1037 | } | |
1038 | ||
1039 | int BlueFS::device_migrate_to_existing( | |
1040 | CephContext *cct, | |
1041 | const set<int>& devs_source, | |
1042 | int dev_target) | |
1043 | { | |
1044 | vector<byte> buf; | |
1045 | bool buffered = cct->_conf->bluefs_buffered_io; | |
1046 | ||
1047 | assert(dev_target < (int)MAX_BDEV); | |
1048 | ||
1049 | int flags = 0; | |
1050 | flags |= devs_source.count(BDEV_DB) ? | |
1051 | (REMOVE_DB | RENAME_SLOW2DB) : 0; | |
1052 | flags |= devs_source.count(BDEV_WAL) ? REMOVE_WAL : 0; | |
1053 | int dev_target_new = dev_target; | |
1054 | ||
1055 | // Slow device without separate DB one is addressed via BDEV_DB | |
1056 | // Hence need renaming. | |
1057 | if ((flags & REMOVE_DB) && dev_target == BDEV_SLOW) { | |
1058 | dev_target_new = BDEV_DB; | |
1059 | dout(0) << __func__ << " super to be written to " << dev_target << dendl; | |
1060 | } | |
1061 | ||
1062 | for (auto& p : file_map) { | |
1063 | //do not copy log | |
1064 | if (p.second->fnode.ino == 1) { | |
1065 | continue; | |
1066 | } | |
1067 | auto& fnode_extents = p.second->fnode.extents; | |
1068 | ||
1069 | for (auto ext_it = fnode_extents.begin(); | |
1070 | ext_it != p.second->fnode.extents.end(); | |
1071 | ++ext_it) { | |
1072 | if (ext_it->bdev != dev_target && devs_source.count(ext_it->bdev)) { | |
1073 | bluefs_extent_t old_ext = *ext_it; | |
1074 | PExtentVector extents; | |
1075 | auto l = | |
1076 | _allocate_without_fallback(dev_target, old_ext.length, &extents); | |
1077 | if (l == 0) { | |
1078 | buf.resize(old_ext.length); | |
1079 | int r = bdev[old_ext.bdev]->read_random( | |
1080 | old_ext.offset, | |
1081 | old_ext.length, | |
1082 | (char*)&buf.at(0), | |
1083 | buffered); | |
1084 | if (r != 0) { | |
1085 | derr << __func__ << " failed to read 0x" << std::hex | |
1086 | << old_ext.offset << "~" <<old_ext.length << std::dec | |
1087 | << " from " << (int)dev_target << dendl; | |
1088 | return -EIO; | |
1089 | } | |
1090 | ||
1091 | assert(extents.size() > 0); | |
1092 | uint64_t src_buf_pos = 0; | |
1093 | { | |
1094 | // overwrite existing extent | |
1095 | *ext_it= | |
1096 | bluefs_extent_t(dev_target_new, extents[0].offset, extents[0].length); | |
1097 | bufferlist bl; | |
1098 | bl.append((char*)&buf.at(src_buf_pos), extents[0].length); | |
1099 | int r = bdev[dev_target]->write(extents[0].offset, bl, buffered); | |
1100 | ceph_assert(r == 0); | |
1101 | src_buf_pos += extents[0].length; | |
1102 | } | |
1103 | // then insert more extents if needed | |
1104 | for( size_t i = 1; i < extents.size(); ++i) { | |
1105 | bufferlist bl; | |
1106 | bl.append((char*)&buf.at(src_buf_pos), extents[i].length); | |
1107 | ++ext_it; | |
1108 | ext_it = fnode_extents.emplace(ext_it, dev_target_new, | |
1109 | extents[i].offset, extents[i].length); | |
1110 | int r = bdev[dev_target]->write(extents[i].offset, bl, buffered); | |
1111 | ceph_assert(r == 0); | |
1112 | src_buf_pos += extents[i].length; | |
1113 | } | |
1114 | { | |
1115 | PExtentVector to_release; | |
1116 | to_release.emplace_back(old_ext.offset, old_ext.length); | |
1117 | alloc[old_ext.bdev]->release(to_release); | |
1118 | } | |
1119 | ||
1120 | } else { | |
1121 | derr << __func__ << " unable to allocate len 0x" << std::hex | |
1122 | << old_ext.length << std::dec << " from " << (int)dev_target | |
1123 | << dendl; | |
1124 | return -ENOSPC; | |
1125 | } | |
1126 | } else if (dev_target != dev_target_new && ext_it->bdev == dev_target) { | |
1127 | ext_it->bdev = dev_target_new; | |
1128 | } | |
1129 | } | |
1130 | auto& prefer_bdev = p.second->fnode.prefer_bdev; | |
1131 | if (prefer_bdev != dev_target && devs_source.count(prefer_bdev)) { | |
1132 | prefer_bdev = dev_target_new; | |
1133 | } | |
1134 | } | |
1135 | // new logging device in the current naming scheme | |
1136 | int new_log_dev_cur = bdev[BDEV_WAL] ? | |
1137 | BDEV_WAL : | |
1138 | bdev[BDEV_DB] ? BDEV_DB : BDEV_SLOW; | |
1139 | ||
1140 | // new logging device in new naming scheme | |
1141 | int new_log_dev_next = new_log_dev_cur; | |
1142 | ||
1143 | if (devs_source.count(new_log_dev_cur)) { | |
1144 | // SLOW device is addressed via BDEV_DB too hence either WAL or DB | |
1145 | new_log_dev_next = (flags & REMOVE_WAL) || !bdev[BDEV_WAL] ? | |
1146 | BDEV_DB : | |
1147 | BDEV_WAL; | |
1148 | ||
1149 | dout(0) << __func__ << " log moved from " << new_log_dev_cur | |
1150 | << " to " << new_log_dev_next << dendl; | |
1151 | ||
1152 | new_log_dev_cur = | |
1153 | (flags & REMOVE_DB) && new_log_dev_next == BDEV_DB ? | |
1154 | BDEV_SLOW : | |
1155 | new_log_dev_next; | |
1156 | } | |
1157 | ||
1158 | _rewrite_log_sync( | |
1159 | false, | |
1160 | (flags & REMOVE_DB) ? BDEV_SLOW : BDEV_DB, | |
1161 | new_log_dev_cur, | |
1162 | new_log_dev_next, | |
1163 | flags); | |
1164 | return 0; | |
1165 | } | |
1166 | ||
1167 | int BlueFS::device_migrate_to_new( | |
1168 | CephContext *cct, | |
1169 | const set<int>& devs_source, | |
1170 | int dev_target) | |
1171 | { | |
1172 | vector<byte> buf; | |
1173 | bool buffered = cct->_conf->bluefs_buffered_io; | |
1174 | ||
1175 | assert(dev_target == (int)BDEV_NEWDB || (int)BDEV_NEWWAL); | |
1176 | ||
1177 | int flags = 0; | |
1178 | ||
1179 | flags |= devs_source.count(BDEV_DB) ? | |
1180 | (!bdev[BDEV_SLOW] ? RENAME_DB2SLOW: REMOVE_DB) : | |
1181 | 0; | |
1182 | flags |= devs_source.count(BDEV_WAL) ? REMOVE_WAL : 0; | |
1183 | int dev_target_new = dev_target; | |
1184 | ||
1185 | for (auto& p : file_map) { | |
1186 | //do not copy log | |
1187 | if (p.second->fnode.ino == 1) { | |
1188 | continue; | |
1189 | } | |
1190 | auto& fnode_extents = p.second->fnode.extents; | |
1191 | ||
1192 | for (auto ext_it = fnode_extents.begin(); | |
1193 | ext_it != p.second->fnode.extents.end(); | |
1194 | ++ext_it) { | |
1195 | if (ext_it->bdev != dev_target && devs_source.count(ext_it->bdev)) { | |
1196 | bluefs_extent_t old_ext = *ext_it; | |
1197 | PExtentVector extents; | |
1198 | auto l = | |
1199 | _allocate_without_fallback(dev_target, old_ext.length, &extents); | |
1200 | if (l == 0) { | |
1201 | buf.resize(old_ext.length); | |
1202 | int r = bdev[old_ext.bdev]->read_random( | |
1203 | old_ext.offset, | |
1204 | old_ext.length, | |
1205 | (char*)&buf.at(0), | |
1206 | buffered); | |
1207 | dout(10)<<__func__<<" read = "<<r<<dendl; | |
1208 | if (r != 0) { | |
1209 | derr << __func__ << " failed to read 0x" << std::hex | |
1210 | << old_ext.offset << "~" <<old_ext.length << std::dec | |
1211 | << " from " << (int)dev_target << dendl; | |
1212 | return -EIO; | |
1213 | } | |
1214 | ||
1215 | assert(extents.size() > 0); | |
1216 | uint64_t src_buf_pos = 0; | |
1217 | { | |
1218 | // overwrite existing extent | |
1219 | *ext_it= | |
1220 | bluefs_extent_t(dev_target_new, extents[0].offset, extents[0].length); | |
1221 | bufferlist bl; | |
1222 | bl.append((char*)&buf.at(src_buf_pos), extents[0].length); | |
1223 | int r = bdev[dev_target]->write(extents[0].offset, bl, buffered); | |
1224 | ceph_assert(r == 0); | |
1225 | src_buf_pos += extents[0].length; | |
1226 | } | |
1227 | // then insert more extents if needed | |
1228 | for( size_t i = 1; i < extents.size(); ++i) { | |
1229 | bufferlist bl; | |
1230 | bl.append((char*)&buf.at(src_buf_pos), extents[i].length); | |
1231 | ++ext_it; | |
1232 | ext_it = fnode_extents.emplace(ext_it, dev_target_new, | |
1233 | extents[i].offset, extents[i].length); | |
1234 | int r = bdev[dev_target]->write(extents[i].offset, bl, buffered); | |
1235 | ceph_assert(r == 0); | |
1236 | src_buf_pos += extents[i].length; | |
1237 | } | |
1238 | { | |
1239 | PExtentVector to_release; | |
1240 | to_release.emplace_back(old_ext.offset, old_ext.length); | |
1241 | alloc[old_ext.bdev]->release(to_release); | |
1242 | } | |
1243 | } else { | |
1244 | derr << __func__ << " unable to allocate len 0x" << std::hex | |
1245 | << old_ext.length << std::dec << " from " << (int)dev_target | |
1246 | << dendl; | |
1247 | return -ENOSPC; | |
1248 | } | |
1249 | } else if (dev_target != dev_target_new && ext_it->bdev == dev_target) { | |
1250 | ext_it->bdev = dev_target_new; | |
1251 | } | |
1252 | } | |
1253 | auto& prefer_bdev = p.second->fnode.prefer_bdev; | |
1254 | if (prefer_bdev != dev_target && devs_source.count(prefer_bdev)) { | |
1255 | prefer_bdev = dev_target_new; | |
1256 | } | |
1257 | } | |
1258 | // new logging device in the current naming scheme | |
1259 | int new_log_dev_cur = | |
1260 | bdev[BDEV_NEWWAL] ? | |
1261 | BDEV_NEWWAL : | |
1262 | bdev[BDEV_WAL] && !(flags & REMOVE_WAL) ? | |
1263 | BDEV_WAL : | |
1264 | bdev[BDEV_NEWDB] ? | |
1265 | BDEV_NEWDB : | |
1266 | bdev[BDEV_DB] && !(flags & REMOVE_DB)? | |
1267 | BDEV_DB : | |
1268 | BDEV_SLOW; | |
1269 | ||
1270 | // new logging device in new naming scheme | |
1271 | int new_log_dev_next = | |
1272 | new_log_dev_cur == BDEV_NEWWAL ? | |
1273 | BDEV_WAL : | |
1274 | new_log_dev_cur == BDEV_NEWDB ? | |
1275 | BDEV_DB : | |
1276 | new_log_dev_cur; | |
1277 | ||
1278 | int super_dev = | |
1279 | dev_target == BDEV_NEWDB ? | |
1280 | BDEV_NEWDB : | |
1281 | bdev[BDEV_DB] ? | |
1282 | BDEV_DB : | |
1283 | BDEV_SLOW; | |
1284 | ||
1285 | _rewrite_log_sync( | |
1286 | false, | |
1287 | super_dev, | |
1288 | new_log_dev_cur, | |
1289 | new_log_dev_next, | |
1290 | flags); | |
1291 | return 0; | |
1292 | } | |
1293 | ||
7c673cae FG |
1294 | BlueFS::FileRef BlueFS::_get_file(uint64_t ino) |
1295 | { | |
1296 | auto p = file_map.find(ino); | |
1297 | if (p == file_map.end()) { | |
1298 | FileRef f = new File; | |
1299 | file_map[ino] = f; | |
1300 | dout(30) << __func__ << " ino " << ino << " = " << f | |
1301 | << " (new)" << dendl; | |
1302 | return f; | |
1303 | } else { | |
1304 | dout(30) << __func__ << " ino " << ino << " = " << p->second << dendl; | |
1305 | return p->second; | |
1306 | } | |
1307 | } | |
1308 | ||
1309 | void BlueFS::_drop_link(FileRef file) | |
1310 | { | |
1311 | dout(20) << __func__ << " had refs " << file->refs | |
1312 | << " on " << file->fnode << dendl; | |
11fdf7f2 | 1313 | ceph_assert(file->refs > 0); |
7c673cae FG |
1314 | --file->refs; |
1315 | if (file->refs == 0) { | |
1316 | dout(20) << __func__ << " destroying " << file->fnode << dendl; | |
11fdf7f2 | 1317 | ceph_assert(file->num_reading.load() == 0); |
7c673cae FG |
1318 | log_t.op_file_remove(file->fnode.ino); |
1319 | for (auto& r : file->fnode.extents) { | |
1320 | pending_release[r.bdev].insert(r.offset, r.length); | |
1321 | } | |
1322 | file_map.erase(file->fnode.ino); | |
1323 | file->deleted = true; | |
94b18763 | 1324 | |
7c673cae | 1325 | if (file->dirty_seq) { |
11fdf7f2 TL |
1326 | ceph_assert(file->dirty_seq > log_seq_stable); |
1327 | ceph_assert(dirty_files.count(file->dirty_seq)); | |
7c673cae FG |
1328 | auto it = dirty_files[file->dirty_seq].iterator_to(*file); |
1329 | dirty_files[file->dirty_seq].erase(it); | |
1330 | file->dirty_seq = 0; | |
1331 | } | |
1332 | } | |
1333 | } | |
1334 | ||
1335 | int BlueFS::_read_random( | |
1336 | FileReader *h, ///< [in] read from here | |
1337 | uint64_t off, ///< [in] offset | |
1338 | size_t len, ///< [in] this many bytes | |
1339 | char *out) ///< [out] optional: or copy it here | |
1340 | { | |
494da23a TL |
1341 | auto* buf = &h->buf; |
1342 | ||
1343 | int ret = 0; | |
7c673cae FG |
1344 | dout(10) << __func__ << " h " << h |
1345 | << " 0x" << std::hex << off << "~" << len << std::dec | |
1346 | << " from " << h->file->fnode << dendl; | |
1347 | ||
1348 | ++h->file->num_reading; | |
1349 | ||
1350 | if (!h->ignore_eof && | |
1351 | off + len > h->file->fnode.size) { | |
1352 | if (off > h->file->fnode.size) | |
1353 | len = 0; | |
1354 | else | |
1355 | len = h->file->fnode.size - off; | |
1356 | dout(20) << __func__ << " reaching (or past) eof, len clipped to 0x" | |
1357 | << std::hex << len << std::dec << dendl; | |
1358 | } | |
494da23a TL |
1359 | logger->inc(l_bluefs_read_random_count, 1); |
1360 | logger->inc(l_bluefs_read_random_bytes, len); | |
7c673cae | 1361 | |
494da23a | 1362 | std::shared_lock s_lock(h->lock); |
7c673cae | 1363 | while (len > 0) { |
494da23a TL |
1364 | if (off < buf->bl_off || off >= buf->get_buf_end()) { |
1365 | s_lock.unlock(); | |
1366 | uint64_t x_off = 0; | |
1367 | auto p = h->file->fnode.seek(off, &x_off); | |
1368 | uint64_t l = std::min(p->length - x_off, static_cast<uint64_t>(len)); | |
1369 | dout(20) << __func__ << " read random 0x" | |
1370 | << std::hex << x_off << "~" << l << std::dec | |
1371 | << " of " << *p << dendl; | |
1372 | int r = bdev[p->bdev]->read_random(p->offset + x_off, l, out, | |
1373 | cct->_conf->bluefs_buffered_io); | |
1374 | ceph_assert(r == 0); | |
1375 | off += l; | |
1376 | len -= l; | |
1377 | ret += l; | |
1378 | out += l; | |
1379 | ||
1380 | logger->inc(l_bluefs_read_random_disk_count, 1); | |
1381 | logger->inc(l_bluefs_read_random_disk_bytes, l); | |
1382 | if (len > 0) { | |
1383 | s_lock.lock(); | |
1384 | } | |
1385 | } else { | |
1386 | auto left = buf->get_buf_remaining(off); | |
1387 | int r = std::min(len, left); | |
1388 | logger->inc(l_bluefs_read_random_buffer_count, 1); | |
1389 | logger->inc(l_bluefs_read_random_buffer_bytes, r); | |
1390 | dout(20) << __func__ << " left 0x" << std::hex << left | |
1391 | << " 0x" << off << "~" << len << std::dec | |
1392 | << dendl; | |
1393 | ||
1394 | if (out) { | |
1395 | // NOTE: h->bl is normally a contiguous buffer so c_str() is free. | |
1396 | memcpy(out, buf->bl.c_str() + off - buf->bl_off, r); | |
1397 | out += r; | |
1398 | } | |
7c673cae | 1399 | |
494da23a TL |
1400 | dout(30) << __func__ << " result chunk (0x" |
1401 | << std::hex << r << std::dec << " bytes):\n"; | |
1402 | bufferlist t; | |
1403 | t.substr_of(buf->bl, off - buf->bl_off, r); | |
1404 | t.hexdump(*_dout); | |
1405 | *_dout << dendl; | |
1406 | ||
1407 | off += r; | |
1408 | len -= r; | |
1409 | ret += r; | |
1410 | buf->pos += r; | |
1411 | } | |
1412 | } | |
7c673cae FG |
1413 | dout(20) << __func__ << " got " << ret << dendl; |
1414 | --h->file->num_reading; | |
1415 | return ret; | |
1416 | } | |
1417 | ||
1418 | int BlueFS::_read( | |
1419 | FileReader *h, ///< [in] read from here | |
1420 | FileReaderBuffer *buf, ///< [in] reader state | |
1421 | uint64_t off, ///< [in] offset | |
1422 | size_t len, ///< [in] this many bytes | |
1423 | bufferlist *outbl, ///< [out] optional: reference the result here | |
1424 | char *out) ///< [out] optional: or copy it here | |
1425 | { | |
494da23a | 1426 | bool prefetch = !outbl && !out; |
7c673cae FG |
1427 | dout(10) << __func__ << " h " << h |
1428 | << " 0x" << std::hex << off << "~" << len << std::dec | |
494da23a TL |
1429 | << " from " << h->file->fnode |
1430 | << (prefetch ? " prefetch" : "") | |
1431 | << dendl; | |
7c673cae FG |
1432 | |
1433 | ++h->file->num_reading; | |
1434 | ||
1435 | if (!h->ignore_eof && | |
1436 | off + len > h->file->fnode.size) { | |
1437 | if (off > h->file->fnode.size) | |
1438 | len = 0; | |
1439 | else | |
1440 | len = h->file->fnode.size - off; | |
1441 | dout(20) << __func__ << " reaching (or past) eof, len clipped to 0x" | |
1442 | << std::hex << len << std::dec << dendl; | |
1443 | } | |
494da23a TL |
1444 | logger->inc(l_bluefs_read_count, 1); |
1445 | logger->inc(l_bluefs_read_bytes, len); | |
1446 | if (prefetch) { | |
1447 | logger->inc(l_bluefs_read_prefetch_count, 1); | |
1448 | logger->inc(l_bluefs_read_prefetch_bytes, len); | |
1449 | } | |
1450 | ||
7c673cae FG |
1451 | if (outbl) |
1452 | outbl->clear(); | |
1453 | ||
1454 | int ret = 0; | |
494da23a | 1455 | std::shared_lock s_lock(h->lock); |
7c673cae FG |
1456 | while (len > 0) { |
1457 | size_t left; | |
1458 | if (off < buf->bl_off || off >= buf->get_buf_end()) { | |
494da23a TL |
1459 | s_lock.unlock(); |
1460 | std::unique_lock u_lock(h->lock); | |
1461 | if (off < buf->bl_off || off >= buf->get_buf_end()) { | |
1462 | // if precondition hasn't changed during locking upgrade. | |
1463 | buf->bl.clear(); | |
1464 | buf->bl_off = off & super.block_mask(); | |
1465 | uint64_t x_off = 0; | |
1466 | auto p = h->file->fnode.seek(buf->bl_off, &x_off); | |
1467 | uint64_t want = round_up_to(len + (off & ~super.block_mask()), | |
1468 | super.block_size); | |
1469 | want = std::max(want, buf->max_prefetch); | |
1470 | uint64_t l = std::min(p->length - x_off, want); | |
1471 | uint64_t eof_offset = round_up_to(h->file->fnode.size, super.block_size); | |
1472 | if (!h->ignore_eof && | |
1473 | buf->bl_off + l > eof_offset) { | |
1474 | l = eof_offset - buf->bl_off; | |
1475 | } | |
1476 | dout(20) << __func__ << " fetching 0x" | |
1477 | << std::hex << x_off << "~" << l << std::dec | |
1478 | << " of " << *p << dendl; | |
1479 | int r = bdev[p->bdev]->read(p->offset + x_off, l, &buf->bl, ioc[p->bdev], | |
1480 | cct->_conf->bluefs_buffered_io); | |
1481 | ceph_assert(r == 0); | |
7c673cae | 1482 | } |
494da23a TL |
1483 | u_lock.unlock(); |
1484 | s_lock.lock(); | |
1485 | // we should recheck if buffer is valid after lock downgrade | |
1486 | continue; | |
7c673cae FG |
1487 | } |
1488 | left = buf->get_buf_remaining(off); | |
1489 | dout(20) << __func__ << " left 0x" << std::hex << left | |
1490 | << " len 0x" << len << std::dec << dendl; | |
1491 | ||
11fdf7f2 | 1492 | int r = std::min(len, left); |
7c673cae FG |
1493 | if (outbl) { |
1494 | bufferlist t; | |
1495 | t.substr_of(buf->bl, off - buf->bl_off, r); | |
1496 | outbl->claim_append(t); | |
1497 | } | |
1498 | if (out) { | |
1499 | // NOTE: h->bl is normally a contiguous buffer so c_str() is free. | |
1500 | memcpy(out, buf->bl.c_str() + off - buf->bl_off, r); | |
1501 | out += r; | |
1502 | } | |
1503 | ||
1504 | dout(30) << __func__ << " result chunk (0x" | |
1505 | << std::hex << r << std::dec << " bytes):\n"; | |
1506 | bufferlist t; | |
1507 | t.substr_of(buf->bl, off - buf->bl_off, r); | |
1508 | t.hexdump(*_dout); | |
1509 | *_dout << dendl; | |
1510 | ||
1511 | off += r; | |
1512 | len -= r; | |
1513 | ret += r; | |
1514 | buf->pos += r; | |
1515 | } | |
1516 | ||
1517 | dout(20) << __func__ << " got " << ret << dendl; | |
11fdf7f2 | 1518 | ceph_assert(!outbl || (int)outbl->length() == ret); |
7c673cae FG |
1519 | --h->file->num_reading; |
1520 | return ret; | |
1521 | } | |
1522 | ||
1523 | void BlueFS::_invalidate_cache(FileRef f, uint64_t offset, uint64_t length) | |
1524 | { | |
1525 | dout(10) << __func__ << " file " << f->fnode | |
1526 | << " 0x" << std::hex << offset << "~" << length << std::dec | |
1527 | << dendl; | |
1528 | if (offset & ~super.block_mask()) { | |
1529 | offset &= super.block_mask(); | |
11fdf7f2 | 1530 | length = round_up_to(length, super.block_size); |
7c673cae FG |
1531 | } |
1532 | uint64_t x_off = 0; | |
1533 | auto p = f->fnode.seek(offset, &x_off); | |
1534 | while (length > 0 && p != f->fnode.extents.end()) { | |
11fdf7f2 | 1535 | uint64_t x_len = std::min(p->length - x_off, length); |
7c673cae FG |
1536 | bdev[p->bdev]->invalidate_cache(p->offset + x_off, x_len); |
1537 | dout(20) << __func__ << " 0x" << std::hex << x_off << "~" << x_len | |
1538 | << std:: dec << " of " << *p << dendl; | |
1539 | offset += x_len; | |
1540 | length -= x_len; | |
1541 | } | |
1542 | } | |
1543 | ||
1544 | uint64_t BlueFS::_estimate_log_size() | |
1545 | { | |
1546 | int avg_dir_size = 40; // fixme | |
1547 | int avg_file_size = 12; | |
1548 | uint64_t size = 4096 * 2; | |
1549 | size += file_map.size() * (1 + sizeof(bluefs_fnode_t)); | |
1550 | for (auto& p : block_all) | |
1551 | size += p.num_intervals() * (1 + 1 + sizeof(uint64_t) * 2); | |
1552 | size += dir_map.size() + (1 + avg_dir_size); | |
1553 | size += file_map.size() * (1 + avg_dir_size + avg_file_size); | |
11fdf7f2 | 1554 | return round_up_to(size, super.block_size); |
7c673cae FG |
1555 | } |
1556 | ||
1557 | void BlueFS::compact_log() | |
1558 | { | |
11fdf7f2 | 1559 | std::unique_lock l(lock); |
7c673cae FG |
1560 | if (cct->_conf->bluefs_compact_log_sync) { |
1561 | _compact_log_sync(); | |
1562 | } else { | |
1563 | _compact_log_async(l); | |
1564 | } | |
1565 | } | |
1566 | ||
1567 | bool BlueFS::_should_compact_log() | |
1568 | { | |
1569 | uint64_t current = log_writer->file->fnode.size; | |
1570 | uint64_t expected = _estimate_log_size(); | |
1571 | float ratio = (float)current / (float)expected; | |
1572 | dout(10) << __func__ << " current 0x" << std::hex << current | |
1573 | << " expected " << expected << std::dec | |
1574 | << " ratio " << ratio | |
1575 | << (new_log ? " (async compaction in progress)" : "") | |
1576 | << dendl; | |
1577 | if (new_log || | |
1578 | current < cct->_conf->bluefs_log_compact_min_size || | |
1579 | ratio < cct->_conf->bluefs_log_compact_min_ratio) { | |
1580 | return false; | |
1581 | } | |
1582 | return true; | |
1583 | } | |
1584 | ||
11fdf7f2 TL |
1585 | void BlueFS::_compact_log_dump_metadata(bluefs_transaction_t *t, |
1586 | int flags) | |
7c673cae FG |
1587 | { |
1588 | t->seq = 1; | |
1589 | t->uuid = super.uuid; | |
1590 | dout(20) << __func__ << " op_init" << dendl; | |
1591 | ||
1592 | t->op_init(); | |
1593 | for (unsigned bdev = 0; bdev < MAX_BDEV; ++bdev) { | |
1594 | interval_set<uint64_t>& p = block_all[bdev]; | |
1595 | for (interval_set<uint64_t>::iterator q = p.begin(); q != p.end(); ++q) { | |
11fdf7f2 TL |
1596 | auto bdev_new = bdev; |
1597 | if ((flags & REMOVE_WAL) && bdev == BDEV_WAL) { | |
1598 | continue; | |
1599 | } | |
1600 | if ((flags & REMOVE_DB) && bdev == BDEV_DB) { | |
1601 | continue; | |
1602 | } | |
1603 | if ((flags & RENAME_SLOW2DB) && bdev == BDEV_SLOW) { | |
1604 | bdev_new = BDEV_DB; | |
1605 | } | |
1606 | if ((flags & RENAME_DB2SLOW) && bdev == BDEV_DB) { | |
1607 | bdev_new = BDEV_SLOW; | |
1608 | } | |
1609 | if (bdev == BDEV_NEWDB) { | |
1610 | // REMOVE_DB xor RENAME_DB | |
1611 | ceph_assert(!(flags & REMOVE_DB) != !(flags & RENAME_DB2SLOW)); | |
1612 | ceph_assert(!(flags & RENAME_SLOW2DB)); | |
1613 | bdev_new = BDEV_DB; | |
1614 | } | |
1615 | if (bdev == BDEV_NEWWAL) { | |
1616 | ceph_assert(flags & REMOVE_WAL); | |
1617 | bdev_new = BDEV_WAL; | |
1618 | } | |
1619 | dout(20) << __func__ << " op_alloc_add " << bdev_new << " 0x" | |
7c673cae FG |
1620 | << std::hex << q.get_start() << "~" << q.get_len() << std::dec |
1621 | << dendl; | |
11fdf7f2 | 1622 | t->op_alloc_add(bdev_new, q.get_start(), q.get_len()); |
7c673cae FG |
1623 | } |
1624 | } | |
1625 | for (auto& p : file_map) { | |
1626 | if (p.first == 1) | |
1627 | continue; | |
11fdf7f2 TL |
1628 | ceph_assert(p.first > 1); |
1629 | ||
1630 | for(auto& e : p.second->fnode.extents) { | |
1631 | auto bdev = e.bdev; | |
1632 | auto bdev_new = bdev; | |
1633 | ceph_assert(!((flags & REMOVE_WAL) && bdev == BDEV_WAL)); | |
1634 | if ((flags & RENAME_SLOW2DB) && bdev == BDEV_SLOW) { | |
1635 | bdev_new = BDEV_DB; | |
1636 | } | |
1637 | if ((flags & RENAME_DB2SLOW) && bdev == BDEV_DB) { | |
1638 | bdev_new = BDEV_SLOW; | |
1639 | } | |
1640 | if (bdev == BDEV_NEWDB) { | |
1641 | // REMOVE_DB xor RENAME_DB | |
1642 | ceph_assert(!(flags & REMOVE_DB) != !(flags & RENAME_DB2SLOW)); | |
1643 | ceph_assert(!(flags & RENAME_SLOW2DB)); | |
1644 | bdev_new = BDEV_DB; | |
1645 | } | |
1646 | if (bdev == BDEV_NEWWAL) { | |
1647 | ceph_assert(flags & REMOVE_WAL); | |
1648 | bdev_new = BDEV_WAL; | |
1649 | } | |
1650 | e.bdev = bdev_new; | |
1651 | } | |
7c673cae | 1652 | dout(20) << __func__ << " op_file_update " << p.second->fnode << dendl; |
7c673cae FG |
1653 | t->op_file_update(p.second->fnode); |
1654 | } | |
1655 | for (auto& p : dir_map) { | |
1656 | dout(20) << __func__ << " op_dir_create " << p.first << dendl; | |
1657 | t->op_dir_create(p.first); | |
1658 | for (auto& q : p.second->file_map) { | |
1659 | dout(20) << __func__ << " op_dir_link " << p.first << "/" << q.first | |
1660 | << " to " << q.second->fnode.ino << dendl; | |
1661 | t->op_dir_link(p.first, q.first, q.second->fnode.ino); | |
1662 | } | |
1663 | } | |
1664 | } | |
1665 | ||
1666 | void BlueFS::_compact_log_sync() | |
1667 | { | |
1668 | dout(10) << __func__ << dendl; | |
11fdf7f2 TL |
1669 | _rewrite_log_sync(true, |
1670 | BDEV_DB, | |
1671 | log_writer->file->fnode.prefer_bdev, | |
1672 | log_writer->file->fnode.prefer_bdev, | |
1673 | 0); | |
1674 | logger->inc(l_bluefs_log_compactions); | |
1675 | } | |
1676 | ||
1677 | void BlueFS::_rewrite_log_sync(bool allocate_with_fallback, | |
1678 | int super_dev, | |
1679 | int log_dev, | |
1680 | int log_dev_new, | |
1681 | int flags) | |
1682 | { | |
7c673cae FG |
1683 | File *log_file = log_writer->file.get(); |
1684 | ||
1685 | // clear out log (be careful who calls us!!!) | |
1686 | log_t.clear(); | |
1687 | ||
11fdf7f2 TL |
1688 | dout(20) << __func__ << " super_dev:" << super_dev |
1689 | << " log_dev:" << log_dev | |
1690 | << " log_dev_new:" << log_dev_new | |
1691 | << " flags:" << flags | |
1692 | << dendl; | |
7c673cae | 1693 | bluefs_transaction_t t; |
11fdf7f2 | 1694 | _compact_log_dump_metadata(&t, flags); |
7c673cae FG |
1695 | |
1696 | dout(20) << __func__ << " op_jump_seq " << log_seq << dendl; | |
1697 | t.op_jump_seq(log_seq); | |
1698 | ||
1699 | bufferlist bl; | |
11fdf7f2 | 1700 | encode(t, bl); |
7c673cae FG |
1701 | _pad_bl(bl); |
1702 | ||
1703 | uint64_t need = bl.length() + cct->_conf->bluefs_max_log_runway; | |
1704 | dout(20) << __func__ << " need " << need << dendl; | |
1705 | ||
494da23a | 1706 | bluefs_fnode_t old_fnode; |
11fdf7f2 | 1707 | int r; |
494da23a | 1708 | log_file->fnode.swap_extents(old_fnode); |
11fdf7f2 TL |
1709 | if (allocate_with_fallback) { |
1710 | r = _allocate(log_dev, need, &log_file->fnode); | |
1711 | ceph_assert(r == 0); | |
1712 | } else { | |
1713 | PExtentVector extents; | |
1714 | r = _allocate_without_fallback(log_dev, | |
1715 | need, | |
1716 | &extents); | |
1717 | ceph_assert(r == 0); | |
1718 | for (auto& p : extents) { | |
1719 | log_file->fnode.append_extent( | |
1720 | bluefs_extent_t(log_dev, p.offset, p.length)); | |
1721 | } | |
7c673cae FG |
1722 | } |
1723 | ||
1724 | _close_writer(log_writer); | |
1725 | ||
1726 | log_file->fnode.size = bl.length(); | |
1727 | log_writer = _create_writer(log_file); | |
1728 | log_writer->append(bl); | |
11fdf7f2 TL |
1729 | r = _flush(log_writer, true); |
1730 | ceph_assert(r == 0); | |
1731 | #ifdef HAVE_LIBAIO | |
1732 | if (!cct->_conf->bluefs_sync_write) { | |
1733 | list<aio_t> completed_ios; | |
1734 | _claim_completed_aios(log_writer, &completed_ios); | |
1735 | wait_for_aio(log_writer); | |
1736 | completed_ios.clear(); | |
1737 | } | |
1738 | #endif | |
224ce89b | 1739 | flush_bdev(); |
224ce89b | 1740 | |
7c673cae | 1741 | super.log_fnode = log_file->fnode; |
11fdf7f2 TL |
1742 | // rename device if needed |
1743 | if (log_dev != log_dev_new) { | |
1744 | dout(10) << __func__ << " renaming log extents to " << log_dev_new << dendl; | |
1745 | for (auto& p : super.log_fnode.extents) { | |
1746 | p.bdev = log_dev_new; | |
1747 | } | |
1748 | } | |
1749 | dout(10) << __func__ << " writing super, log fnode: " << super.log_fnode << dendl; | |
1750 | ||
7c673cae | 1751 | ++super.version; |
11fdf7f2 | 1752 | _write_super(super_dev); |
7c673cae FG |
1753 | flush_bdev(); |
1754 | ||
494da23a TL |
1755 | dout(10) << __func__ << " release old log extents " << old_fnode.extents << dendl; |
1756 | for (auto& r : old_fnode.extents) { | |
7c673cae FG |
1757 | pending_release[r.bdev].insert(r.offset, r.length); |
1758 | } | |
7c673cae FG |
1759 | } |
1760 | ||
1761 | /* | |
1762 | * 1. Allocate a new extent to continue the log, and then log an event | |
1763 | * that jumps the log write position to the new extent. At this point, the | |
1764 | * old extent(s) won't be written to, and reflect everything to compact. | |
1765 | * New events will be written to the new region that we'll keep. | |
1766 | * | |
1767 | * 2. While still holding the lock, encode a bufferlist that dumps all of the | |
1768 | * in-memory fnodes and names. This will become the new beginning of the | |
1769 | * log. The last event will jump to the log continuation extent from #1. | |
1770 | * | |
1771 | * 3. Queue a write to a new extent for the new beginnging of the log. | |
1772 | * | |
1773 | * 4. Drop lock and wait | |
1774 | * | |
1775 | * 5. Retake the lock. | |
1776 | * | |
1777 | * 6. Update the log_fnode to splice in the new beginning. | |
1778 | * | |
1779 | * 7. Write the new superblock. | |
1780 | * | |
1781 | * 8. Release the old log space. Clean up. | |
1782 | */ | |
11fdf7f2 | 1783 | void BlueFS::_compact_log_async(std::unique_lock<ceph::mutex>& l) |
7c673cae FG |
1784 | { |
1785 | dout(10) << __func__ << dendl; | |
1786 | File *log_file = log_writer->file.get(); | |
11fdf7f2 TL |
1787 | ceph_assert(!new_log); |
1788 | ceph_assert(!new_log_writer); | |
7c673cae | 1789 | |
181888fb FG |
1790 | // create a new log [writer] so that we know compaction is in progress |
1791 | // (see _should_compact_log) | |
1792 | new_log = new File; | |
1793 | new_log->fnode.ino = 0; // so that _flush_range won't try to log the fnode | |
1794 | ||
3efd9988 FG |
1795 | // 0. wait for any racing flushes to complete. (We do not want to block |
1796 | // in _flush_sync_log with jump_to set or else a racing thread might flush | |
1797 | // our entries and our jump_to update won't be correct.) | |
1798 | while (log_flushing) { | |
1799 | dout(10) << __func__ << " log is currently flushing, waiting" << dendl; | |
1800 | log_cond.wait(l); | |
1801 | } | |
1802 | ||
7c673cae FG |
1803 | // 1. allocate new log space and jump to it. |
1804 | old_log_jump_to = log_file->fnode.get_allocated(); | |
7c673cae | 1805 | dout(10) << __func__ << " old_log_jump_to 0x" << std::hex << old_log_jump_to |
11fdf7f2 TL |
1806 | << " need 0x" << (old_log_jump_to + cct->_conf->bluefs_max_log_runway) << std::dec << dendl; |
1807 | int r = _allocate(log_file->fnode.prefer_bdev, | |
1808 | cct->_conf->bluefs_max_log_runway, &log_file->fnode); | |
1809 | ceph_assert(r == 0); | |
7c673cae FG |
1810 | dout(10) << __func__ << " log extents " << log_file->fnode.extents << dendl; |
1811 | ||
1812 | // update the log file change and log a jump to the offset where we want to | |
1813 | // write the new entries | |
1814 | log_t.op_file_update(log_file->fnode); | |
1815 | log_t.op_jump(log_seq, old_log_jump_to); | |
1816 | ||
1817 | flush_bdev(); // FIXME? | |
1818 | ||
1819 | _flush_and_sync_log(l, 0, old_log_jump_to); | |
1820 | ||
1821 | // 2. prepare compacted log | |
1822 | bluefs_transaction_t t; | |
224ce89b WB |
1823 | //avoid record two times in log_t and _compact_log_dump_metadata. |
1824 | log_t.clear(); | |
11fdf7f2 | 1825 | _compact_log_dump_metadata(&t, 0); |
7c673cae FG |
1826 | |
1827 | // conservative estimate for final encoded size | |
11fdf7f2 | 1828 | new_log_jump_to = round_up_to(t.op_bl.length() + super.block_size * 2, |
7c673cae FG |
1829 | cct->_conf->bluefs_alloc_size); |
1830 | t.op_jump(log_seq, new_log_jump_to); | |
1831 | ||
11fdf7f2 TL |
1832 | // allocate |
1833 | r = _allocate(BlueFS::BDEV_DB, new_log_jump_to, | |
1834 | &new_log->fnode); | |
1835 | ceph_assert(r == 0); | |
1836 | ||
1837 | // we might have some more ops in log_t due to _allocate call | |
1838 | t.claim_ops(log_t); | |
1839 | ||
7c673cae | 1840 | bufferlist bl; |
11fdf7f2 | 1841 | encode(t, bl); |
7c673cae FG |
1842 | _pad_bl(bl); |
1843 | ||
1844 | dout(10) << __func__ << " new_log_jump_to 0x" << std::hex << new_log_jump_to | |
1845 | << std::dec << dendl; | |
1846 | ||
7c673cae FG |
1847 | new_log_writer = _create_writer(new_log); |
1848 | new_log_writer->append(bl); | |
1849 | ||
1850 | // 3. flush | |
1851 | r = _flush(new_log_writer, true); | |
11fdf7f2 | 1852 | ceph_assert(r == 0); |
7c673cae FG |
1853 | |
1854 | // 4. wait | |
11fdf7f2 | 1855 | _flush_bdev_safely(new_log_writer); |
7c673cae | 1856 | |
11fdf7f2 | 1857 | // 5. update our log fnode |
7c673cae FG |
1858 | // discard first old_log_jump_to extents |
1859 | dout(10) << __func__ << " remove 0x" << std::hex << old_log_jump_to << std::dec | |
1860 | << " of " << log_file->fnode.extents << dendl; | |
1861 | uint64_t discarded = 0; | |
1862 | mempool::bluefs::vector<bluefs_extent_t> old_extents; | |
1863 | while (discarded < old_log_jump_to) { | |
11fdf7f2 | 1864 | ceph_assert(!log_file->fnode.extents.empty()); |
7c673cae FG |
1865 | bluefs_extent_t& e = log_file->fnode.extents.front(); |
1866 | bluefs_extent_t temp = e; | |
1867 | if (discarded + e.length <= old_log_jump_to) { | |
1868 | dout(10) << __func__ << " remove old log extent " << e << dendl; | |
1869 | discarded += e.length; | |
94b18763 | 1870 | log_file->fnode.pop_front_extent(); |
7c673cae FG |
1871 | } else { |
1872 | dout(10) << __func__ << " remove front of old log extent " << e << dendl; | |
1873 | uint64_t drop = old_log_jump_to - discarded; | |
1874 | temp.length = drop; | |
1875 | e.offset += drop; | |
1876 | e.length -= drop; | |
1877 | discarded += drop; | |
1878 | dout(10) << __func__ << " kept " << e << " removed " << temp << dendl; | |
1879 | } | |
1880 | old_extents.push_back(temp); | |
1881 | } | |
94b18763 FG |
1882 | auto from = log_file->fnode.extents.begin(); |
1883 | auto to = log_file->fnode.extents.end(); | |
1884 | while (from != to) { | |
1885 | new_log->fnode.append_extent(*from); | |
1886 | ++from; | |
1887 | } | |
7c673cae FG |
1888 | |
1889 | // clear the extents from old log file, they are added to new log | |
94b18763 | 1890 | log_file->fnode.clear_extents(); |
7c673cae | 1891 | // swap the log files. New log file is the log file now. |
94b18763 FG |
1892 | new_log->fnode.swap_extents(log_file->fnode); |
1893 | ||
7c673cae FG |
1894 | log_writer->pos = log_writer->file->fnode.size = |
1895 | log_writer->pos - old_log_jump_to + new_log_jump_to; | |
1896 | ||
11fdf7f2 | 1897 | // 6. write the super block to reflect the changes |
7c673cae FG |
1898 | dout(10) << __func__ << " writing super" << dendl; |
1899 | super.log_fnode = log_file->fnode; | |
1900 | ++super.version; | |
11fdf7f2 | 1901 | _write_super(BDEV_DB); |
7c673cae FG |
1902 | |
1903 | lock.unlock(); | |
1904 | flush_bdev(); | |
1905 | lock.lock(); | |
1906 | ||
11fdf7f2 | 1907 | // 7. release old space |
7c673cae FG |
1908 | dout(10) << __func__ << " release old log extents " << old_extents << dendl; |
1909 | for (auto& r : old_extents) { | |
1910 | pending_release[r.bdev].insert(r.offset, r.length); | |
1911 | } | |
1912 | ||
1913 | // delete the new log, remove from the dirty files list | |
1914 | _close_writer(new_log_writer); | |
1915 | if (new_log->dirty_seq) { | |
11fdf7f2 | 1916 | ceph_assert(dirty_files.count(new_log->dirty_seq)); |
7c673cae FG |
1917 | auto it = dirty_files[new_log->dirty_seq].iterator_to(*new_log); |
1918 | dirty_files[new_log->dirty_seq].erase(it); | |
1919 | } | |
1920 | new_log_writer = nullptr; | |
1921 | new_log = nullptr; | |
1922 | log_cond.notify_all(); | |
1923 | ||
1924 | dout(10) << __func__ << " log extents " << log_file->fnode.extents << dendl; | |
1925 | logger->inc(l_bluefs_log_compactions); | |
1926 | } | |
1927 | ||
1928 | void BlueFS::_pad_bl(bufferlist& bl) | |
1929 | { | |
1930 | uint64_t partial = bl.length() % super.block_size; | |
1931 | if (partial) { | |
1932 | dout(10) << __func__ << " padding with 0x" << std::hex | |
1933 | << super.block_size - partial << " zeros" << std::dec << dendl; | |
1934 | bl.append_zero(super.block_size - partial); | |
1935 | } | |
1936 | } | |
1937 | ||
1938 | void BlueFS::flush_log() | |
1939 | { | |
11fdf7f2 | 1940 | std::unique_lock l(lock); |
7c673cae FG |
1941 | flush_bdev(); |
1942 | _flush_and_sync_log(l); | |
1943 | } | |
1944 | ||
11fdf7f2 | 1945 | int BlueFS::_flush_and_sync_log(std::unique_lock<ceph::mutex>& l, |
7c673cae FG |
1946 | uint64_t want_seq, |
1947 | uint64_t jump_to) | |
1948 | { | |
1949 | while (log_flushing) { | |
1950 | dout(10) << __func__ << " want_seq " << want_seq | |
1951 | << " log is currently flushing, waiting" << dendl; | |
11fdf7f2 | 1952 | ceph_assert(!jump_to); |
7c673cae FG |
1953 | log_cond.wait(l); |
1954 | } | |
1955 | if (want_seq && want_seq <= log_seq_stable) { | |
1956 | dout(10) << __func__ << " want_seq " << want_seq << " <= log_seq_stable " | |
1957 | << log_seq_stable << ", done" << dendl; | |
11fdf7f2 | 1958 | ceph_assert(!jump_to); |
7c673cae FG |
1959 | return 0; |
1960 | } | |
1961 | if (log_t.empty() && dirty_files.empty()) { | |
1962 | dout(10) << __func__ << " want_seq " << want_seq | |
1963 | << " " << log_t << " not dirty, dirty_files empty, no-op" << dendl; | |
11fdf7f2 | 1964 | ceph_assert(!jump_to); |
7c673cae FG |
1965 | return 0; |
1966 | } | |
1967 | ||
a8e16298 TL |
1968 | vector<interval_set<uint64_t>> to_release(pending_release.size()); |
1969 | to_release.swap(pending_release); | |
1970 | ||
7c673cae | 1971 | uint64_t seq = log_t.seq = ++log_seq; |
11fdf7f2 | 1972 | ceph_assert(want_seq == 0 || want_seq <= seq); |
7c673cae FG |
1973 | log_t.uuid = super.uuid; |
1974 | ||
1975 | // log dirty files | |
1976 | auto lsi = dirty_files.find(seq); | |
1977 | if (lsi != dirty_files.end()) { | |
1978 | dout(20) << __func__ << " " << lsi->second.size() << " dirty_files" << dendl; | |
1979 | for (auto &f : lsi->second) { | |
1980 | dout(20) << __func__ << " op_file_update " << f.fnode << dendl; | |
1981 | log_t.op_file_update(f.fnode); | |
1982 | } | |
1983 | } | |
1984 | ||
1985 | dout(10) << __func__ << " " << log_t << dendl; | |
11fdf7f2 | 1986 | ceph_assert(!log_t.empty()); |
7c673cae FG |
1987 | |
1988 | // allocate some more space (before we run out)? | |
1989 | int64_t runway = log_writer->file->fnode.get_allocated() - | |
1990 | log_writer->get_effective_write_pos(); | |
1991 | if (runway < (int64_t)cct->_conf->bluefs_min_log_runway) { | |
1992 | dout(10) << __func__ << " allocating more log runway (0x" | |
1993 | << std::hex << runway << std::dec << " remaining)" << dendl; | |
1994 | while (new_log_writer) { | |
1995 | dout(10) << __func__ << " waiting for async compaction" << dendl; | |
1996 | log_cond.wait(l); | |
1997 | } | |
1998 | int r = _allocate(log_writer->file->fnode.prefer_bdev, | |
1999 | cct->_conf->bluefs_max_log_runway, | |
94b18763 | 2000 | &log_writer->file->fnode); |
11fdf7f2 | 2001 | ceph_assert(r == 0); |
7c673cae FG |
2002 | log_t.op_file_update(log_writer->file->fnode); |
2003 | } | |
2004 | ||
2005 | bufferlist bl; | |
11fdf7f2 TL |
2006 | bl.reserve(super.block_size); |
2007 | encode(log_t, bl); | |
7c673cae | 2008 | // pad to block boundary |
11fdf7f2 TL |
2009 | size_t realign = super.block_size - (bl.length() % super.block_size); |
2010 | if (realign && realign != super.block_size) | |
2011 | bl.append_zero(realign); | |
2012 | ||
7c673cae FG |
2013 | logger->inc(l_bluefs_logged_bytes, bl.length()); |
2014 | ||
2015 | log_writer->append(bl); | |
2016 | ||
2017 | log_t.clear(); | |
2018 | log_t.seq = 0; // just so debug output is less confusing | |
2019 | log_flushing = true; | |
2020 | ||
2021 | int r = _flush(log_writer, true); | |
11fdf7f2 | 2022 | ceph_assert(r == 0); |
7c673cae FG |
2023 | |
2024 | if (jump_to) { | |
2025 | dout(10) << __func__ << " jumping log offset from 0x" << std::hex | |
2026 | << log_writer->pos << " -> 0x" << jump_to << std::dec << dendl; | |
2027 | log_writer->pos = jump_to; | |
2028 | log_writer->file->fnode.size = jump_to; | |
2029 | } | |
2030 | ||
2031 | _flush_bdev_safely(log_writer); | |
2032 | ||
2033 | log_flushing = false; | |
2034 | log_cond.notify_all(); | |
2035 | ||
2036 | // clean dirty files | |
2037 | if (seq > log_seq_stable) { | |
2038 | log_seq_stable = seq; | |
2039 | dout(20) << __func__ << " log_seq_stable " << log_seq_stable << dendl; | |
2040 | ||
2041 | auto p = dirty_files.begin(); | |
2042 | while (p != dirty_files.end()) { | |
2043 | if (p->first > log_seq_stable) { | |
2044 | dout(20) << __func__ << " done cleaning up dirty files" << dendl; | |
2045 | break; | |
2046 | } | |
2047 | ||
2048 | auto l = p->second.begin(); | |
2049 | while (l != p->second.end()) { | |
2050 | File *file = &*l; | |
11fdf7f2 TL |
2051 | ceph_assert(file->dirty_seq > 0); |
2052 | ceph_assert(file->dirty_seq <= log_seq_stable); | |
7c673cae FG |
2053 | dout(20) << __func__ << " cleaned file " << file->fnode << dendl; |
2054 | file->dirty_seq = 0; | |
2055 | p->second.erase(l++); | |
2056 | } | |
2057 | ||
11fdf7f2 | 2058 | ceph_assert(p->second.empty()); |
7c673cae FG |
2059 | dirty_files.erase(p++); |
2060 | } | |
2061 | } else { | |
2062 | dout(20) << __func__ << " log_seq_stable " << log_seq_stable | |
2063 | << " already >= out seq " << seq | |
2064 | << ", we lost a race against another log flush, done" << dendl; | |
2065 | } | |
a8e16298 TL |
2066 | |
2067 | for (unsigned i = 0; i < to_release.size(); ++i) { | |
2068 | if (!to_release[i].empty()) { | |
2069 | /* OK, now we have the guarantee alloc[i] won't be null. */ | |
11fdf7f2 TL |
2070 | int r = 0; |
2071 | if (cct->_conf->bdev_enable_discard && cct->_conf->bdev_async_discard) { | |
2072 | r = bdev[i]->queue_discard(to_release[i]); | |
2073 | if (r == 0) | |
2074 | continue; | |
2075 | } else if (cct->_conf->bdev_enable_discard) { | |
2076 | for (auto p = to_release[i].begin(); p != to_release[i].end(); ++p) { | |
2077 | bdev[i]->discard(p.get_start(), p.get_len()); | |
2078 | } | |
2079 | } | |
a8e16298 TL |
2080 | alloc[i]->release(to_release[i]); |
2081 | } | |
2082 | } | |
2083 | ||
7c673cae FG |
2084 | _update_logger_stats(); |
2085 | ||
2086 | return 0; | |
2087 | } | |
2088 | ||
2089 | int BlueFS::_flush_range(FileWriter *h, uint64_t offset, uint64_t length) | |
2090 | { | |
2091 | dout(10) << __func__ << " " << h << " pos 0x" << std::hex << h->pos | |
2092 | << " 0x" << offset << "~" << length << std::dec | |
2093 | << " to " << h->file->fnode << dendl; | |
11fdf7f2 TL |
2094 | ceph_assert(!h->file->deleted); |
2095 | ceph_assert(h->file->num_readers.load() == 0); | |
7c673cae FG |
2096 | |
2097 | h->buffer_appender.flush(); | |
2098 | ||
2099 | bool buffered; | |
2100 | if (h->file->fnode.ino == 1) | |
2101 | buffered = false; | |
2102 | else | |
2103 | buffered = cct->_conf->bluefs_buffered_io; | |
2104 | ||
2105 | if (offset + length <= h->pos) | |
2106 | return 0; | |
2107 | if (offset < h->pos) { | |
2108 | length -= h->pos - offset; | |
2109 | offset = h->pos; | |
2110 | dout(10) << " still need 0x" | |
2111 | << std::hex << offset << "~" << length << std::dec | |
2112 | << dendl; | |
2113 | } | |
11fdf7f2 | 2114 | ceph_assert(offset <= h->file->fnode.size); |
7c673cae FG |
2115 | |
2116 | uint64_t allocated = h->file->fnode.get_allocated(); | |
2117 | ||
2118 | // do not bother to dirty the file if we are overwriting | |
2119 | // previously allocated extents. | |
2120 | bool must_dirty = false; | |
2121 | if (allocated < offset + length) { | |
2122 | // we should never run out of log space here; see the min runway check | |
2123 | // in _flush_and_sync_log. | |
11fdf7f2 | 2124 | ceph_assert(h->file->fnode.ino != 1); |
7c673cae FG |
2125 | int r = _allocate(h->file->fnode.prefer_bdev, |
2126 | offset + length - allocated, | |
94b18763 | 2127 | &h->file->fnode); |
7c673cae FG |
2128 | if (r < 0) { |
2129 | derr << __func__ << " allocated: 0x" << std::hex << allocated | |
2130 | << " offset: 0x" << offset << " length: 0x" << length << std::dec | |
2131 | << dendl; | |
11fdf7f2 | 2132 | ceph_abort_msg("bluefs enospc"); |
7c673cae FG |
2133 | return r; |
2134 | } | |
7c673cae FG |
2135 | if (cct->_conf->bluefs_preextend_wal_files && |
2136 | h->writer_type == WRITER_WAL) { | |
2137 | // NOTE: this *requires* that rocksdb also has log recycling | |
2138 | // enabled and is therefore doing robust CRCs on the log | |
2139 | // records. otherwise, we will fail to reply the rocksdb log | |
2140 | // properly due to garbage on the device. | |
2141 | h->file->fnode.size = h->file->fnode.get_allocated(); | |
2142 | dout(10) << __func__ << " extending WAL size to 0x" << std::hex | |
2143 | << h->file->fnode.size << std::dec << " to include allocated" | |
2144 | << dendl; | |
2145 | } | |
2146 | must_dirty = true; | |
2147 | } | |
2148 | if (h->file->fnode.size < offset + length) { | |
2149 | h->file->fnode.size = offset + length; | |
2150 | if (h->file->fnode.ino > 1) { | |
2151 | // we do not need to dirty the log file (or it's compacting | |
2152 | // replacement) when the file size changes because replay is | |
2153 | // smart enough to discover it on its own. | |
2154 | must_dirty = true; | |
2155 | } | |
2156 | } | |
2157 | if (must_dirty) { | |
2158 | h->file->fnode.mtime = ceph_clock_now(); | |
11fdf7f2 | 2159 | ceph_assert(h->file->fnode.ino >= 1); |
7c673cae FG |
2160 | if (h->file->dirty_seq == 0) { |
2161 | h->file->dirty_seq = log_seq + 1; | |
2162 | dirty_files[h->file->dirty_seq].push_back(*h->file); | |
2163 | dout(20) << __func__ << " dirty_seq = " << log_seq + 1 | |
2164 | << " (was clean)" << dendl; | |
2165 | } else { | |
2166 | if (h->file->dirty_seq != log_seq + 1) { | |
2167 | // need re-dirty, erase from list first | |
11fdf7f2 | 2168 | ceph_assert(dirty_files.count(h->file->dirty_seq)); |
7c673cae FG |
2169 | auto it = dirty_files[h->file->dirty_seq].iterator_to(*h->file); |
2170 | dirty_files[h->file->dirty_seq].erase(it); | |
2171 | h->file->dirty_seq = log_seq + 1; | |
2172 | dirty_files[h->file->dirty_seq].push_back(*h->file); | |
2173 | dout(20) << __func__ << " dirty_seq = " << log_seq + 1 | |
2174 | << " (was " << h->file->dirty_seq << ")" << dendl; | |
2175 | } else { | |
2176 | dout(20) << __func__ << " dirty_seq = " << log_seq + 1 | |
2177 | << " (unchanged, do nothing) " << dendl; | |
2178 | } | |
2179 | } | |
2180 | } | |
2181 | dout(20) << __func__ << " file now " << h->file->fnode << dendl; | |
2182 | ||
2183 | uint64_t x_off = 0; | |
2184 | auto p = h->file->fnode.seek(offset, &x_off); | |
11fdf7f2 | 2185 | ceph_assert(p != h->file->fnode.extents.end()); |
7c673cae FG |
2186 | dout(20) << __func__ << " in " << *p << " x_off 0x" |
2187 | << std::hex << x_off << std::dec << dendl; | |
2188 | ||
2189 | unsigned partial = x_off & ~super.block_mask(); | |
2190 | bufferlist bl; | |
2191 | if (partial) { | |
2192 | dout(20) << __func__ << " using partial tail 0x" | |
2193 | << std::hex << partial << std::dec << dendl; | |
11fdf7f2 | 2194 | ceph_assert(h->tail_block.length() == partial); |
31f18b77 | 2195 | bl.claim_append_piecewise(h->tail_block); |
7c673cae FG |
2196 | x_off -= partial; |
2197 | offset -= partial; | |
2198 | length += partial; | |
2199 | dout(20) << __func__ << " waiting for previous aio to complete" << dendl; | |
2200 | for (auto p : h->iocv) { | |
2201 | if (p) { | |
2202 | p->aio_wait(); | |
2203 | } | |
2204 | } | |
2205 | } | |
2206 | if (length == partial + h->buffer.length()) { | |
31f18b77 | 2207 | bl.claim_append_piecewise(h->buffer); |
7c673cae FG |
2208 | } else { |
2209 | bufferlist t; | |
31f18b77 FG |
2210 | h->buffer.splice(0, length, &t); |
2211 | bl.claim_append_piecewise(t); | |
7c673cae FG |
2212 | t.substr_of(h->buffer, length, h->buffer.length() - length); |
2213 | h->buffer.swap(t); | |
2214 | dout(20) << " leaving 0x" << std::hex << h->buffer.length() << std::dec | |
2215 | << " unflushed" << dendl; | |
2216 | } | |
11fdf7f2 | 2217 | ceph_assert(bl.length() == length); |
7c673cae FG |
2218 | |
2219 | switch (h->writer_type) { | |
2220 | case WRITER_WAL: | |
2221 | logger->inc(l_bluefs_bytes_written_wal, length); | |
2222 | break; | |
2223 | case WRITER_SST: | |
2224 | logger->inc(l_bluefs_bytes_written_sst, length); | |
2225 | break; | |
2226 | } | |
2227 | ||
2228 | dout(30) << "dump:\n"; | |
2229 | bl.hexdump(*_dout); | |
2230 | *_dout << dendl; | |
2231 | ||
2232 | h->pos = offset + length; | |
2233 | h->tail_block.clear(); | |
2234 | ||
2235 | uint64_t bloff = 0; | |
11fdf7f2 | 2236 | uint64_t bytes_written_slow = 0; |
7c673cae | 2237 | while (length > 0) { |
11fdf7f2 | 2238 | uint64_t x_len = std::min(p->length - x_off, length); |
7c673cae FG |
2239 | bufferlist t; |
2240 | t.substr_of(bl, bloff, x_len); | |
2241 | unsigned tail = x_len & ~super.block_mask(); | |
2242 | if (tail) { | |
2243 | size_t zlen = super.block_size - tail; | |
2244 | dout(20) << __func__ << " caching tail of 0x" | |
2245 | << std::hex << tail | |
2246 | << " and padding block with 0x" << zlen | |
2247 | << std::dec << dendl; | |
2248 | h->tail_block.substr_of(bl, bl.length() - tail, tail); | |
2249 | if (h->file->fnode.ino > 1) { | |
2250 | // we are using the page_aligned_appender, and can safely use | |
2251 | // the tail of the raw buffer. | |
2252 | const bufferptr &last = t.back(); | |
2253 | if (last.unused_tail_length() < zlen) { | |
2254 | derr << " wtf, last is " << last << " from " << t << dendl; | |
11fdf7f2 | 2255 | ceph_assert(last.unused_tail_length() >= zlen); |
7c673cae FG |
2256 | } |
2257 | bufferptr z = last; | |
2258 | z.set_offset(last.offset() + last.length()); | |
2259 | z.set_length(zlen); | |
2260 | z.zero(); | |
2261 | t.append(z, 0, zlen); | |
2262 | } else { | |
2263 | t.append_zero(zlen); | |
2264 | } | |
2265 | } | |
2266 | if (cct->_conf->bluefs_sync_write) { | |
11fdf7f2 | 2267 | bdev[p->bdev]->write(p->offset + x_off, t, buffered, h->write_hint); |
7c673cae | 2268 | } else { |
11fdf7f2 TL |
2269 | bdev[p->bdev]->aio_write(p->offset + x_off, t, h->iocv[p->bdev], buffered, h->write_hint); |
2270 | } | |
2271 | h->dirty_devs[p->bdev] = true; | |
2272 | if (p->bdev == BDEV_SLOW) { | |
2273 | bytes_written_slow += t.length(); | |
7c673cae | 2274 | } |
11fdf7f2 | 2275 | |
7c673cae FG |
2276 | bloff += x_len; |
2277 | length -= x_len; | |
2278 | ++p; | |
2279 | x_off = 0; | |
2280 | } | |
11fdf7f2 | 2281 | logger->inc(l_bluefs_bytes_written_slow, bytes_written_slow); |
7c673cae FG |
2282 | for (unsigned i = 0; i < MAX_BDEV; ++i) { |
2283 | if (bdev[i]) { | |
11fdf7f2 | 2284 | if (h->iocv[i] && h->iocv[i]->has_pending_aios()) { |
7c673cae FG |
2285 | bdev[i]->aio_submit(h->iocv[i]); |
2286 | } | |
2287 | } | |
2288 | } | |
2289 | dout(20) << __func__ << " h " << h << " pos now 0x" | |
2290 | << std::hex << h->pos << std::dec << dendl; | |
2291 | return 0; | |
2292 | } | |
2293 | ||
11fdf7f2 | 2294 | #ifdef HAVE_LIBAIO |
7c673cae FG |
2295 | // we need to retire old completed aios so they don't stick around in |
2296 | // memory indefinitely (along with their bufferlist refs). | |
2297 | void BlueFS::_claim_completed_aios(FileWriter *h, list<aio_t> *ls) | |
2298 | { | |
2299 | for (auto p : h->iocv) { | |
2300 | if (p) { | |
2301 | ls->splice(ls->end(), p->running_aios); | |
2302 | } | |
2303 | } | |
2304 | dout(10) << __func__ << " got " << ls->size() << " aios" << dendl; | |
2305 | } | |
2306 | ||
2307 | void BlueFS::wait_for_aio(FileWriter *h) | |
2308 | { | |
2309 | // NOTE: this is safe to call without a lock, as long as our reference is | |
2310 | // stable. | |
2311 | dout(10) << __func__ << " " << h << dendl; | |
2312 | utime_t start = ceph_clock_now(); | |
2313 | for (auto p : h->iocv) { | |
2314 | if (p) { | |
2315 | p->aio_wait(); | |
2316 | } | |
2317 | } | |
11fdf7f2 | 2318 | dout(10) << __func__ << " " << h << " done in " << (ceph_clock_now() - start) << dendl; |
7c673cae | 2319 | } |
11fdf7f2 | 2320 | #endif |
7c673cae FG |
2321 | |
2322 | int BlueFS::_flush(FileWriter *h, bool force) | |
2323 | { | |
2324 | h->buffer_appender.flush(); | |
2325 | uint64_t length = h->buffer.length(); | |
2326 | uint64_t offset = h->pos; | |
2327 | if (!force && | |
2328 | length < cct->_conf->bluefs_min_flush_size) { | |
2329 | dout(10) << __func__ << " " << h << " ignoring, length " << length | |
2330 | << " < min_flush_size " << cct->_conf->bluefs_min_flush_size | |
2331 | << dendl; | |
2332 | return 0; | |
2333 | } | |
2334 | if (length == 0) { | |
2335 | dout(10) << __func__ << " " << h << " no dirty data on " | |
2336 | << h->file->fnode << dendl; | |
2337 | return 0; | |
2338 | } | |
2339 | dout(10) << __func__ << " " << h << " 0x" | |
2340 | << std::hex << offset << "~" << length << std::dec | |
2341 | << " to " << h->file->fnode << dendl; | |
11fdf7f2 | 2342 | ceph_assert(h->pos <= h->file->fnode.size); |
7c673cae FG |
2343 | return _flush_range(h, offset, length); |
2344 | } | |
2345 | ||
2346 | int BlueFS::_truncate(FileWriter *h, uint64_t offset) | |
2347 | { | |
2348 | dout(10) << __func__ << " 0x" << std::hex << offset << std::dec | |
2349 | << " file " << h->file->fnode << dendl; | |
2350 | if (h->file->deleted) { | |
2351 | dout(10) << __func__ << " deleted, no-op" << dendl; | |
2352 | return 0; | |
2353 | } | |
2354 | ||
2355 | // we never truncate internal log files | |
11fdf7f2 | 2356 | ceph_assert(h->file->fnode.ino > 1); |
7c673cae FG |
2357 | |
2358 | h->buffer_appender.flush(); | |
2359 | ||
2360 | // truncate off unflushed data? | |
2361 | if (h->pos < offset && | |
2362 | h->pos + h->buffer.length() > offset) { | |
2363 | bufferlist t; | |
2364 | dout(20) << __func__ << " tossing out last " << offset - h->pos | |
2365 | << " unflushed bytes" << dendl; | |
2366 | t.substr_of(h->buffer, 0, offset - h->pos); | |
2367 | h->buffer.swap(t); | |
11fdf7f2 | 2368 | ceph_abort_msg("actually this shouldn't happen"); |
7c673cae FG |
2369 | } |
2370 | if (h->buffer.length()) { | |
2371 | int r = _flush(h, true); | |
2372 | if (r < 0) | |
2373 | return r; | |
2374 | } | |
2375 | if (offset == h->file->fnode.size) { | |
2376 | return 0; // no-op! | |
2377 | } | |
2378 | if (offset > h->file->fnode.size) { | |
11fdf7f2 | 2379 | ceph_abort_msg("truncate up not supported"); |
7c673cae | 2380 | } |
11fdf7f2 | 2381 | ceph_assert(h->file->fnode.size >= offset); |
7c673cae FG |
2382 | h->file->fnode.size = offset; |
2383 | log_t.op_file_update(h->file->fnode); | |
2384 | return 0; | |
2385 | } | |
2386 | ||
11fdf7f2 | 2387 | int BlueFS::_fsync(FileWriter *h, std::unique_lock<ceph::mutex>& l) |
7c673cae FG |
2388 | { |
2389 | dout(10) << __func__ << " " << h << " " << h->file->fnode << dendl; | |
2390 | int r = _flush(h, true); | |
2391 | if (r < 0) | |
2392 | return r; | |
2393 | uint64_t old_dirty_seq = h->file->dirty_seq; | |
2394 | ||
2395 | _flush_bdev_safely(h); | |
2396 | ||
2397 | if (old_dirty_seq) { | |
2398 | uint64_t s = log_seq; | |
2399 | dout(20) << __func__ << " file metadata was dirty (" << old_dirty_seq | |
2400 | << ") on " << h->file->fnode << ", flushing log" << dendl; | |
2401 | _flush_and_sync_log(l, old_dirty_seq); | |
11fdf7f2 | 2402 | ceph_assert(h->file->dirty_seq == 0 || // cleaned |
7c673cae FG |
2403 | h->file->dirty_seq > s); // or redirtied by someone else |
2404 | } | |
2405 | return 0; | |
2406 | } | |
2407 | ||
2408 | void BlueFS::_flush_bdev_safely(FileWriter *h) | |
2409 | { | |
11fdf7f2 TL |
2410 | std::array<bool, MAX_BDEV> flush_devs = h->dirty_devs; |
2411 | h->dirty_devs.fill(false); | |
2412 | #ifdef HAVE_LIBAIO | |
7c673cae FG |
2413 | if (!cct->_conf->bluefs_sync_write) { |
2414 | list<aio_t> completed_ios; | |
2415 | _claim_completed_aios(h, &completed_ios); | |
2416 | lock.unlock(); | |
2417 | wait_for_aio(h); | |
2418 | completed_ios.clear(); | |
11fdf7f2 | 2419 | flush_bdev(flush_devs); |
7c673cae | 2420 | lock.lock(); |
11fdf7f2 TL |
2421 | } else |
2422 | #endif | |
2423 | { | |
7c673cae | 2424 | lock.unlock(); |
11fdf7f2 | 2425 | flush_bdev(flush_devs); |
7c673cae FG |
2426 | lock.lock(); |
2427 | } | |
2428 | } | |
2429 | ||
11fdf7f2 TL |
2430 | void BlueFS::flush_bdev(std::array<bool, MAX_BDEV>& dirty_bdevs) |
2431 | { | |
2432 | // NOTE: this is safe to call without a lock. | |
2433 | dout(20) << __func__ << dendl; | |
2434 | for (unsigned i = 0; i < MAX_BDEV; i++) { | |
2435 | if (dirty_bdevs[i]) | |
2436 | bdev[i]->flush(); | |
2437 | } | |
2438 | } | |
2439 | ||
7c673cae FG |
2440 | void BlueFS::flush_bdev() |
2441 | { | |
2442 | // NOTE: this is safe to call without a lock. | |
2443 | dout(20) << __func__ << dendl; | |
2444 | for (auto p : bdev) { | |
2445 | if (p) | |
2446 | p->flush(); | |
2447 | } | |
2448 | } | |
2449 | ||
11fdf7f2 TL |
2450 | int BlueFS::_expand_slow_device(uint64_t need, PExtentVector& extents) |
2451 | { | |
2452 | int r = -ENOSPC; | |
2453 | if (slow_dev_expander) { | |
2454 | auto min_alloc_size = cct->_conf->bluefs_alloc_size; | |
2455 | int id = _get_slow_device_id(); | |
2456 | ceph_assert(id <= (int)alloc.size() && alloc[id]); | |
2457 | auto min_need = round_up_to(need, min_alloc_size); | |
2458 | need = std::max(need, | |
2459 | slow_dev_expander->get_recommended_expansion_delta( | |
2460 | alloc[id]->get_free(), block_all[id].size())); | |
2461 | ||
2462 | need = round_up_to(need, min_alloc_size); | |
2463 | dout(10) << __func__ << " expanding slow device by 0x" | |
2464 | << std::hex << need << std::dec | |
2465 | << dendl; | |
2466 | r = slow_dev_expander->allocate_freespace(min_need, need, extents); | |
2467 | } | |
2468 | return r; | |
2469 | } | |
2470 | ||
2471 | int BlueFS::_allocate_without_fallback(uint8_t id, uint64_t len, | |
2472 | PExtentVector* extents) | |
2473 | { | |
2474 | dout(10) << __func__ << " len 0x" << std::hex << len << std::dec | |
2475 | << " from " << (int)id << dendl; | |
2476 | assert(id < alloc.size()); | |
2477 | uint64_t min_alloc_size = cct->_conf->bluefs_alloc_size; | |
2478 | ||
2479 | uint64_t left = round_up_to(len, min_alloc_size); | |
2480 | ||
2481 | if (!alloc[id]) { | |
2482 | return -ENOENT; | |
2483 | } | |
2484 | extents->reserve(4); // 4 should be (more than) enough for most allocations | |
2485 | int64_t alloc_len = alloc[id]->allocate(left, min_alloc_size, 0, extents); | |
2486 | if (alloc_len < (int64_t)left) { | |
2487 | if (alloc_len != 0) { | |
2488 | alloc[id]->release(*extents); | |
2489 | } | |
2490 | if (bdev[id]) | |
2491 | derr << __func__ << " failed to allocate 0x" << std::hex << left | |
2492 | << " on bdev " << (int)id | |
2493 | << ", free 0x" << alloc[id]->get_free() << std::dec << dendl; | |
2494 | else | |
2495 | derr << __func__ << " failed to allocate 0x" << std::hex << left | |
2496 | << " on bdev " << (int)id << ", dne" << std::dec << dendl; | |
2497 | if (alloc[id]) | |
2498 | alloc[id]->dump(); | |
2499 | return -ENOSPC; | |
2500 | } | |
2501 | ||
2502 | return 0; | |
2503 | } | |
2504 | ||
7c673cae | 2505 | int BlueFS::_allocate(uint8_t id, uint64_t len, |
94b18763 | 2506 | bluefs_fnode_t* node) |
7c673cae FG |
2507 | { |
2508 | dout(10) << __func__ << " len 0x" << std::hex << len << std::dec | |
2509 | << " from " << (int)id << dendl; | |
11fdf7f2 | 2510 | ceph_assert(id < alloc.size()); |
7c673cae FG |
2511 | uint64_t min_alloc_size = cct->_conf->bluefs_alloc_size; |
2512 | ||
11fdf7f2 | 2513 | uint64_t left = round_up_to(len, min_alloc_size); |
b32b8144 | 2514 | int64_t alloc_len = 0; |
a8e16298 | 2515 | PExtentVector extents; |
b32b8144 | 2516 | |
11fdf7f2 | 2517 | uint64_t hint = 0; |
7c673cae | 2518 | if (alloc[id]) { |
94b18763 FG |
2519 | if (!node->extents.empty() && node->extents.back().bdev == id) { |
2520 | hint = node->extents.back().end(); | |
11fdf7f2 | 2521 | } |
b32b8144 FG |
2522 | extents.reserve(4); // 4 should be (more than) enough for most allocations |
2523 | alloc_len = alloc[id]->allocate(left, min_alloc_size, hint, &extents); | |
2524 | } | |
a8e16298 | 2525 | if (alloc_len < (int64_t)left) { |
11fdf7f2 | 2526 | if (alloc_len > 0) { |
a8e16298 | 2527 | alloc[id]->release(extents); |
b32b8144 | 2528 | } |
7c673cae FG |
2529 | if (id != BDEV_SLOW) { |
2530 | if (bdev[id]) { | |
2531 | dout(1) << __func__ << " failed to allocate 0x" << std::hex << left | |
2532 | << " on bdev " << (int)id | |
2533 | << ", free 0x" << alloc[id]->get_free() | |
2534 | << "; fallback to bdev " << (int)id + 1 | |
2535 | << std::dec << dendl; | |
2536 | } | |
94b18763 | 2537 | return _allocate(id + 1, len, node); |
7c673cae | 2538 | } |
11fdf7f2 TL |
2539 | dout(1) << __func__ << " unable to allocate 0x" << std::hex << left |
2540 | << " on bdev " << (int)id << ", free 0x" | |
2541 | << (alloc[id] ? alloc[id]->get_free() : (uint64_t)-1) | |
2542 | << "; fallback to slow device expander " | |
2543 | << std::dec << dendl; | |
2544 | extents.clear(); | |
2545 | if (_expand_slow_device(left, extents) == 0) { | |
2546 | id = _get_slow_device_id(); | |
2547 | for (auto& e : extents) { | |
2548 | _add_block_extent(id, e.offset, e.length); | |
2549 | } | |
2550 | extents.clear(); | |
2551 | auto* last_alloc = alloc[id]; | |
2552 | ceph_assert(last_alloc); | |
2553 | // try again | |
2554 | alloc_len = last_alloc->allocate(left, min_alloc_size, hint, &extents); | |
2555 | if (alloc_len < (int64_t)left) { | |
2556 | if (alloc_len > 0) { | |
2557 | last_alloc->release(extents); | |
2558 | } | |
2559 | derr << __func__ << " failed to allocate 0x" << std::hex << left | |
2560 | << " on bdev " << (int)id | |
2561 | << ", free 0x" << last_alloc->get_free() << std::dec << dendl; | |
2562 | return -ENOSPC; | |
2563 | } | |
2564 | } else { | |
2565 | derr << __func__ << " failed to expand slow device to fit +0x" | |
2566 | << std::hex << left << std::dec | |
2567 | << dendl; | |
2568 | return -ENOSPC; | |
2569 | } | |
2570 | } else { | |
2571 | uint64_t total_allocated = | |
2572 | block_all[id].size() - alloc[id]->get_free(); | |
2573 | if (max_bytes[id] < total_allocated) { | |
2574 | logger->set(max_bytes_pcounters[id], total_allocated); | |
2575 | max_bytes[id] = total_allocated; | |
2576 | } | |
7c673cae FG |
2577 | } |
2578 | ||
2579 | for (auto& p : extents) { | |
94b18763 | 2580 | node->append_extent(bluefs_extent_t(id, p.offset, p.length)); |
7c673cae FG |
2581 | } |
2582 | ||
2583 | return 0; | |
2584 | } | |
2585 | ||
2586 | int BlueFS::_preallocate(FileRef f, uint64_t off, uint64_t len) | |
2587 | { | |
2588 | dout(10) << __func__ << " file " << f->fnode << " 0x" | |
2589 | << std::hex << off << "~" << len << std::dec << dendl; | |
2590 | if (f->deleted) { | |
2591 | dout(10) << __func__ << " deleted, no-op" << dendl; | |
2592 | return 0; | |
2593 | } | |
11fdf7f2 | 2594 | ceph_assert(f->fnode.ino > 1); |
7c673cae FG |
2595 | uint64_t allocated = f->fnode.get_allocated(); |
2596 | if (off + len > allocated) { | |
2597 | uint64_t want = off + len - allocated; | |
94b18763 | 2598 | int r = _allocate(f->fnode.prefer_bdev, want, &f->fnode); |
7c673cae FG |
2599 | if (r < 0) |
2600 | return r; | |
7c673cae FG |
2601 | log_t.op_file_update(f->fnode); |
2602 | } | |
2603 | return 0; | |
2604 | } | |
2605 | ||
2606 | void BlueFS::sync_metadata() | |
2607 | { | |
11fdf7f2 | 2608 | std::unique_lock l(lock); |
7c673cae FG |
2609 | if (log_t.empty()) { |
2610 | dout(10) << __func__ << " - no pending log events" << dendl; | |
11fdf7f2 TL |
2611 | } else { |
2612 | dout(10) << __func__ << dendl; | |
2613 | utime_t start = ceph_clock_now(); | |
2614 | flush_bdev(); // FIXME? | |
2615 | _flush_and_sync_log(l); | |
2616 | dout(10) << __func__ << " done in " << (ceph_clock_now() - start) << dendl; | |
7c673cae | 2617 | } |
7c673cae FG |
2618 | |
2619 | if (_should_compact_log()) { | |
2620 | if (cct->_conf->bluefs_compact_log_sync) { | |
2621 | _compact_log_sync(); | |
2622 | } else { | |
2623 | _compact_log_async(l); | |
2624 | } | |
2625 | } | |
7c673cae FG |
2626 | } |
2627 | ||
2628 | int BlueFS::open_for_write( | |
2629 | const string& dirname, | |
2630 | const string& filename, | |
2631 | FileWriter **h, | |
2632 | bool overwrite) | |
2633 | { | |
11fdf7f2 | 2634 | std::lock_guard l(lock); |
7c673cae FG |
2635 | dout(10) << __func__ << " " << dirname << "/" << filename << dendl; |
2636 | map<string,DirRef>::iterator p = dir_map.find(dirname); | |
2637 | DirRef dir; | |
2638 | if (p == dir_map.end()) { | |
2639 | // implicitly create the dir | |
2640 | dout(20) << __func__ << " dir " << dirname | |
2641 | << " does not exist" << dendl; | |
2642 | return -ENOENT; | |
2643 | } else { | |
2644 | dir = p->second; | |
2645 | } | |
2646 | ||
2647 | FileRef file; | |
2648 | bool create = false; | |
2649 | map<string,FileRef>::iterator q = dir->file_map.find(filename); | |
2650 | if (q == dir->file_map.end()) { | |
2651 | if (overwrite) { | |
2652 | dout(20) << __func__ << " dir " << dirname << " (" << dir | |
2653 | << ") file " << filename | |
2654 | << " does not exist" << dendl; | |
2655 | return -ENOENT; | |
2656 | } | |
2657 | file = new File; | |
2658 | file->fnode.ino = ++ino_last; | |
2659 | file_map[ino_last] = file; | |
2660 | dir->file_map[filename] = file; | |
2661 | ++file->refs; | |
2662 | create = true; | |
2663 | } else { | |
2664 | // overwrite existing file? | |
2665 | file = q->second; | |
2666 | if (overwrite) { | |
2667 | dout(20) << __func__ << " dir " << dirname << " (" << dir | |
2668 | << ") file " << filename | |
2669 | << " already exists, overwrite in place" << dendl; | |
2670 | } else { | |
2671 | dout(20) << __func__ << " dir " << dirname << " (" << dir | |
2672 | << ") file " << filename | |
2673 | << " already exists, truncate + overwrite" << dendl; | |
2674 | file->fnode.size = 0; | |
2675 | for (auto& p : file->fnode.extents) { | |
2676 | pending_release[p.bdev].insert(p.offset, p.length); | |
2677 | } | |
94b18763 FG |
2678 | |
2679 | file->fnode.clear_extents(); | |
7c673cae FG |
2680 | } |
2681 | } | |
11fdf7f2 | 2682 | ceph_assert(file->fnode.ino > 1); |
7c673cae FG |
2683 | |
2684 | file->fnode.mtime = ceph_clock_now(); | |
2685 | file->fnode.prefer_bdev = BlueFS::BDEV_DB; | |
2686 | if (dirname.length() > 5) { | |
2687 | // the "db.slow" and "db.wal" directory names are hard-coded at | |
2688 | // match up with bluestore. the slow device is always the second | |
2689 | // one (when a dedicated block.db device is present and used at | |
2690 | // bdev 0). the wal device is always last. | |
31f18b77 | 2691 | if (boost::algorithm::ends_with(dirname, ".slow")) { |
7c673cae FG |
2692 | file->fnode.prefer_bdev = BlueFS::BDEV_SLOW; |
2693 | } else if (boost::algorithm::ends_with(dirname, ".wal")) { | |
2694 | file->fnode.prefer_bdev = BlueFS::BDEV_WAL; | |
2695 | } | |
2696 | } | |
2697 | dout(20) << __func__ << " mapping " << dirname << "/" << filename | |
2698 | << " to bdev " << (int)file->fnode.prefer_bdev << dendl; | |
2699 | ||
2700 | log_t.op_file_update(file->fnode); | |
2701 | if (create) | |
2702 | log_t.op_dir_link(dirname, filename, file->fnode.ino); | |
2703 | ||
2704 | *h = _create_writer(file); | |
2705 | ||
2706 | if (boost::algorithm::ends_with(filename, ".log")) { | |
2707 | (*h)->writer_type = BlueFS::WRITER_WAL; | |
2708 | if (logger && !overwrite) { | |
2709 | logger->inc(l_bluefs_files_written_wal); | |
2710 | } | |
2711 | } else if (boost::algorithm::ends_with(filename, ".sst")) { | |
2712 | (*h)->writer_type = BlueFS::WRITER_SST; | |
2713 | if (logger) { | |
2714 | logger->inc(l_bluefs_files_written_sst); | |
2715 | } | |
2716 | } | |
2717 | ||
2718 | dout(10) << __func__ << " h " << *h << " on " << file->fnode << dendl; | |
2719 | return 0; | |
2720 | } | |
2721 | ||
2722 | BlueFS::FileWriter *BlueFS::_create_writer(FileRef f) | |
2723 | { | |
2724 | FileWriter *w = new FileWriter(f); | |
2725 | for (unsigned i = 0; i < MAX_BDEV; ++i) { | |
2726 | if (bdev[i]) { | |
2727 | w->iocv[i] = new IOContext(cct, NULL); | |
7c673cae FG |
2728 | } |
2729 | } | |
2730 | return w; | |
2731 | } | |
2732 | ||
2733 | void BlueFS::_close_writer(FileWriter *h) | |
2734 | { | |
2735 | dout(10) << __func__ << " " << h << " type " << h->writer_type << dendl; | |
2736 | for (unsigned i=0; i<MAX_BDEV; ++i) { | |
2737 | if (bdev[i]) { | |
11fdf7f2 TL |
2738 | if (h->iocv[i]) { |
2739 | h->iocv[i]->aio_wait(); | |
2740 | bdev[i]->queue_reap_ioc(h->iocv[i]); | |
2741 | } | |
7c673cae FG |
2742 | } |
2743 | } | |
2744 | delete h; | |
2745 | } | |
2746 | ||
2747 | int BlueFS::open_for_read( | |
2748 | const string& dirname, | |
2749 | const string& filename, | |
2750 | FileReader **h, | |
2751 | bool random) | |
2752 | { | |
11fdf7f2 | 2753 | std::lock_guard l(lock); |
7c673cae FG |
2754 | dout(10) << __func__ << " " << dirname << "/" << filename |
2755 | << (random ? " (random)":" (sequential)") << dendl; | |
2756 | map<string,DirRef>::iterator p = dir_map.find(dirname); | |
2757 | if (p == dir_map.end()) { | |
2758 | dout(20) << __func__ << " dir " << dirname << " not found" << dendl; | |
2759 | return -ENOENT; | |
2760 | } | |
2761 | DirRef dir = p->second; | |
2762 | ||
2763 | map<string,FileRef>::iterator q = dir->file_map.find(filename); | |
2764 | if (q == dir->file_map.end()) { | |
2765 | dout(20) << __func__ << " dir " << dirname << " (" << dir | |
2766 | << ") file " << filename | |
2767 | << " not found" << dendl; | |
2768 | return -ENOENT; | |
2769 | } | |
2770 | File *file = q->second.get(); | |
2771 | ||
2772 | *h = new FileReader(file, random ? 4096 : cct->_conf->bluefs_max_prefetch, | |
2773 | random, false); | |
2774 | dout(10) << __func__ << " h " << *h << " on " << file->fnode << dendl; | |
2775 | return 0; | |
2776 | } | |
2777 | ||
2778 | int BlueFS::rename( | |
2779 | const string& old_dirname, const string& old_filename, | |
2780 | const string& new_dirname, const string& new_filename) | |
2781 | { | |
11fdf7f2 | 2782 | std::lock_guard l(lock); |
7c673cae FG |
2783 | dout(10) << __func__ << " " << old_dirname << "/" << old_filename |
2784 | << " -> " << new_dirname << "/" << new_filename << dendl; | |
2785 | map<string,DirRef>::iterator p = dir_map.find(old_dirname); | |
2786 | if (p == dir_map.end()) { | |
2787 | dout(20) << __func__ << " dir " << old_dirname << " not found" << dendl; | |
2788 | return -ENOENT; | |
2789 | } | |
2790 | DirRef old_dir = p->second; | |
2791 | map<string,FileRef>::iterator q = old_dir->file_map.find(old_filename); | |
2792 | if (q == old_dir->file_map.end()) { | |
2793 | dout(20) << __func__ << " dir " << old_dirname << " (" << old_dir | |
2794 | << ") file " << old_filename | |
2795 | << " not found" << dendl; | |
2796 | return -ENOENT; | |
2797 | } | |
2798 | FileRef file = q->second; | |
2799 | ||
2800 | p = dir_map.find(new_dirname); | |
2801 | if (p == dir_map.end()) { | |
2802 | dout(20) << __func__ << " dir " << new_dirname << " not found" << dendl; | |
2803 | return -ENOENT; | |
2804 | } | |
2805 | DirRef new_dir = p->second; | |
2806 | q = new_dir->file_map.find(new_filename); | |
2807 | if (q != new_dir->file_map.end()) { | |
2808 | dout(20) << __func__ << " dir " << new_dirname << " (" << old_dir | |
2809 | << ") file " << new_filename | |
2810 | << " already exists, unlinking" << dendl; | |
11fdf7f2 | 2811 | ceph_assert(q->second != file); |
7c673cae FG |
2812 | log_t.op_dir_unlink(new_dirname, new_filename); |
2813 | _drop_link(q->second); | |
2814 | } | |
2815 | ||
2816 | dout(10) << __func__ << " " << new_dirname << "/" << new_filename << " " | |
2817 | << " " << file->fnode << dendl; | |
2818 | ||
2819 | new_dir->file_map[new_filename] = file; | |
2820 | old_dir->file_map.erase(old_filename); | |
2821 | ||
2822 | log_t.op_dir_link(new_dirname, new_filename, file->fnode.ino); | |
2823 | log_t.op_dir_unlink(old_dirname, old_filename); | |
2824 | return 0; | |
2825 | } | |
2826 | ||
2827 | int BlueFS::mkdir(const string& dirname) | |
2828 | { | |
11fdf7f2 | 2829 | std::lock_guard l(lock); |
7c673cae FG |
2830 | dout(10) << __func__ << " " << dirname << dendl; |
2831 | map<string,DirRef>::iterator p = dir_map.find(dirname); | |
2832 | if (p != dir_map.end()) { | |
2833 | dout(20) << __func__ << " dir " << dirname << " exists" << dendl; | |
2834 | return -EEXIST; | |
2835 | } | |
2836 | dir_map[dirname] = new Dir; | |
2837 | log_t.op_dir_create(dirname); | |
2838 | return 0; | |
2839 | } | |
2840 | ||
2841 | int BlueFS::rmdir(const string& dirname) | |
2842 | { | |
11fdf7f2 | 2843 | std::lock_guard l(lock); |
7c673cae FG |
2844 | dout(10) << __func__ << " " << dirname << dendl; |
2845 | map<string,DirRef>::iterator p = dir_map.find(dirname); | |
2846 | if (p == dir_map.end()) { | |
2847 | dout(20) << __func__ << " dir " << dirname << " does not exist" << dendl; | |
2848 | return -ENOENT; | |
2849 | } | |
2850 | DirRef dir = p->second; | |
2851 | if (!dir->file_map.empty()) { | |
2852 | dout(20) << __func__ << " dir " << dirname << " not empty" << dendl; | |
2853 | return -ENOTEMPTY; | |
2854 | } | |
2855 | dir_map.erase(dirname); | |
2856 | log_t.op_dir_remove(dirname); | |
2857 | return 0; | |
2858 | } | |
2859 | ||
2860 | bool BlueFS::dir_exists(const string& dirname) | |
2861 | { | |
11fdf7f2 | 2862 | std::lock_guard l(lock); |
7c673cae FG |
2863 | map<string,DirRef>::iterator p = dir_map.find(dirname); |
2864 | bool exists = p != dir_map.end(); | |
2865 | dout(10) << __func__ << " " << dirname << " = " << (int)exists << dendl; | |
2866 | return exists; | |
2867 | } | |
2868 | ||
2869 | int BlueFS::stat(const string& dirname, const string& filename, | |
2870 | uint64_t *size, utime_t *mtime) | |
2871 | { | |
11fdf7f2 | 2872 | std::lock_guard l(lock); |
7c673cae FG |
2873 | dout(10) << __func__ << " " << dirname << "/" << filename << dendl; |
2874 | map<string,DirRef>::iterator p = dir_map.find(dirname); | |
2875 | if (p == dir_map.end()) { | |
2876 | dout(20) << __func__ << " dir " << dirname << " not found" << dendl; | |
2877 | return -ENOENT; | |
2878 | } | |
2879 | DirRef dir = p->second; | |
2880 | map<string,FileRef>::iterator q = dir->file_map.find(filename); | |
2881 | if (q == dir->file_map.end()) { | |
2882 | dout(20) << __func__ << " dir " << dirname << " (" << dir | |
2883 | << ") file " << filename | |
2884 | << " not found" << dendl; | |
2885 | return -ENOENT; | |
2886 | } | |
2887 | File *file = q->second.get(); | |
2888 | dout(10) << __func__ << " " << dirname << "/" << filename | |
2889 | << " " << file->fnode << dendl; | |
2890 | if (size) | |
2891 | *size = file->fnode.size; | |
2892 | if (mtime) | |
2893 | *mtime = file->fnode.mtime; | |
2894 | return 0; | |
2895 | } | |
2896 | ||
2897 | int BlueFS::lock_file(const string& dirname, const string& filename, | |
2898 | FileLock **plock) | |
2899 | { | |
11fdf7f2 | 2900 | std::lock_guard l(lock); |
7c673cae FG |
2901 | dout(10) << __func__ << " " << dirname << "/" << filename << dendl; |
2902 | map<string,DirRef>::iterator p = dir_map.find(dirname); | |
2903 | if (p == dir_map.end()) { | |
2904 | dout(20) << __func__ << " dir " << dirname << " not found" << dendl; | |
2905 | return -ENOENT; | |
2906 | } | |
2907 | DirRef dir = p->second; | |
2908 | map<string,FileRef>::iterator q = dir->file_map.find(filename); | |
2909 | File *file; | |
2910 | if (q == dir->file_map.end()) { | |
2911 | dout(20) << __func__ << " dir " << dirname << " (" << dir | |
2912 | << ") file " << filename | |
2913 | << " not found, creating" << dendl; | |
2914 | file = new File; | |
2915 | file->fnode.ino = ++ino_last; | |
2916 | file->fnode.mtime = ceph_clock_now(); | |
2917 | file_map[ino_last] = file; | |
2918 | dir->file_map[filename] = file; | |
2919 | ++file->refs; | |
2920 | log_t.op_file_update(file->fnode); | |
2921 | log_t.op_dir_link(dirname, filename, file->fnode.ino); | |
2922 | } else { | |
2923 | file = q->second.get(); | |
2924 | if (file->locked) { | |
2925 | dout(10) << __func__ << " already locked" << dendl; | |
11fdf7f2 | 2926 | return -ENOLCK; |
7c673cae FG |
2927 | } |
2928 | } | |
2929 | file->locked = true; | |
2930 | *plock = new FileLock(file); | |
2931 | dout(10) << __func__ << " locked " << file->fnode | |
2932 | << " with " << *plock << dendl; | |
2933 | return 0; | |
2934 | } | |
2935 | ||
2936 | int BlueFS::unlock_file(FileLock *fl) | |
2937 | { | |
11fdf7f2 | 2938 | std::lock_guard l(lock); |
7c673cae | 2939 | dout(10) << __func__ << " " << fl << " on " << fl->file->fnode << dendl; |
11fdf7f2 | 2940 | ceph_assert(fl->file->locked); |
7c673cae FG |
2941 | fl->file->locked = false; |
2942 | delete fl; | |
2943 | return 0; | |
2944 | } | |
2945 | ||
2946 | int BlueFS::readdir(const string& dirname, vector<string> *ls) | |
2947 | { | |
11fdf7f2 | 2948 | std::lock_guard l(lock); |
7c673cae FG |
2949 | dout(10) << __func__ << " " << dirname << dendl; |
2950 | if (dirname.empty()) { | |
2951 | // list dirs | |
2952 | ls->reserve(dir_map.size() + 2); | |
2953 | for (auto& q : dir_map) { | |
2954 | ls->push_back(q.first); | |
2955 | } | |
2956 | } else { | |
2957 | // list files in dir | |
2958 | map<string,DirRef>::iterator p = dir_map.find(dirname); | |
2959 | if (p == dir_map.end()) { | |
2960 | dout(20) << __func__ << " dir " << dirname << " not found" << dendl; | |
2961 | return -ENOENT; | |
2962 | } | |
2963 | DirRef dir = p->second; | |
2964 | ls->reserve(dir->file_map.size() + 2); | |
2965 | for (auto& q : dir->file_map) { | |
2966 | ls->push_back(q.first); | |
2967 | } | |
2968 | } | |
2969 | ls->push_back("."); | |
2970 | ls->push_back(".."); | |
2971 | return 0; | |
2972 | } | |
2973 | ||
2974 | int BlueFS::unlink(const string& dirname, const string& filename) | |
2975 | { | |
11fdf7f2 | 2976 | std::lock_guard l(lock); |
7c673cae FG |
2977 | dout(10) << __func__ << " " << dirname << "/" << filename << dendl; |
2978 | map<string,DirRef>::iterator p = dir_map.find(dirname); | |
2979 | if (p == dir_map.end()) { | |
2980 | dout(20) << __func__ << " dir " << dirname << " not found" << dendl; | |
2981 | return -ENOENT; | |
2982 | } | |
2983 | DirRef dir = p->second; | |
2984 | map<string,FileRef>::iterator q = dir->file_map.find(filename); | |
2985 | if (q == dir->file_map.end()) { | |
2986 | dout(20) << __func__ << " file " << dirname << "/" << filename | |
2987 | << " not found" << dendl; | |
2988 | return -ENOENT; | |
2989 | } | |
2990 | FileRef file = q->second; | |
2991 | if (file->locked) { | |
2992 | dout(20) << __func__ << " file " << dirname << "/" << filename | |
2993 | << " is locked" << dendl; | |
2994 | return -EBUSY; | |
2995 | } | |
2996 | dir->file_map.erase(filename); | |
2997 | log_t.op_dir_unlink(dirname, filename); | |
2998 | _drop_link(file); | |
2999 | return 0; | |
3000 | } | |
d2e6a577 FG |
3001 | |
3002 | bool BlueFS::wal_is_rotational() | |
3003 | { | |
94b18763 FG |
3004 | if (bdev[BDEV_WAL]) { |
3005 | return bdev[BDEV_WAL]->is_rotational(); | |
3006 | } else if (bdev[BDEV_DB]) { | |
3007 | return bdev[BDEV_DB]->is_rotational(); | |
3008 | } | |
3009 | return bdev[BDEV_SLOW]->is_rotational(); | |
d2e6a577 | 3010 | } |