]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | ||
4 | #include "boost/algorithm/string.hpp" | |
5 | #include "BlueFS.h" | |
6 | ||
7 | #include "common/debug.h" | |
8 | #include "common/errno.h" | |
9 | #include "common/perf_counters.h" | |
10 | #include "BlockDevice.h" | |
11 | #include "Allocator.h" | |
12 | #include "include/assert.h" | |
13 | ||
14 | #define dout_context cct | |
15 | #define dout_subsys ceph_subsys_bluefs | |
16 | #undef dout_prefix | |
17 | #define dout_prefix *_dout << "bluefs " | |
18 | ||
19 | MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::File, bluefs_file, bluefs); | |
20 | MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::Dir, bluefs_dir, bluefs); | |
21 | MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileWriter, bluefs_file_writer, bluefs); | |
22 | MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileReaderBuffer, | |
23 | bluefs_file_reader_buffer, bluefs); | |
24 | MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileReader, bluefs_file_reader, bluefs); | |
25 | MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileLock, bluefs_file_lock, bluefs); | |
26 | ||
27 | ||
28 | BlueFS::BlueFS(CephContext* cct) | |
29 | : cct(cct), | |
30 | bdev(MAX_BDEV), | |
31 | ioc(MAX_BDEV), | |
32 | block_all(MAX_BDEV), | |
33 | block_total(MAX_BDEV, 0) | |
34 | { | |
35 | } | |
36 | ||
37 | BlueFS::~BlueFS() | |
38 | { | |
39 | for (auto p : ioc) { | |
40 | if (p) | |
41 | p->aio_wait(); | |
42 | } | |
43 | for (auto p : bdev) { | |
44 | if (p) { | |
45 | p->close(); | |
46 | delete p; | |
47 | } | |
48 | } | |
49 | for (auto p : ioc) { | |
50 | delete p; | |
51 | } | |
52 | } | |
53 | ||
54 | void BlueFS::_init_logger() | |
55 | { | |
56 | PerfCountersBuilder b(cct, "bluefs", | |
57 | l_bluefs_first, l_bluefs_last); | |
58 | b.add_u64_counter(l_bluefs_gift_bytes, "gift_bytes", | |
1adf2230 | 59 | "Bytes gifted from BlueStore", NULL, 0, unit_t(BYTES)); |
7c673cae | 60 | b.add_u64_counter(l_bluefs_reclaim_bytes, "reclaim_bytes", |
1adf2230 | 61 | "Bytes reclaimed by BlueStore", NULL, 0, unit_t(BYTES)); |
7c673cae FG |
62 | b.add_u64(l_bluefs_db_total_bytes, "db_total_bytes", |
63 | "Total bytes (main db device)", | |
1adf2230 | 64 | "b", PerfCountersBuilder::PRIO_USEFUL, unit_t(BYTES)); |
7c673cae FG |
65 | b.add_u64(l_bluefs_db_used_bytes, "db_used_bytes", |
66 | "Used bytes (main db device)", | |
1adf2230 | 67 | "u", PerfCountersBuilder::PRIO_USEFUL, unit_t(BYTES)); |
7c673cae FG |
68 | b.add_u64(l_bluefs_wal_total_bytes, "wal_total_bytes", |
69 | "Total bytes (wal device)", | |
1adf2230 | 70 | "walb", PerfCountersBuilder::PRIO_USEFUL, unit_t(BYTES)); |
7c673cae FG |
71 | b.add_u64(l_bluefs_wal_used_bytes, "wal_used_bytes", |
72 | "Used bytes (wal device)", | |
1adf2230 | 73 | "walu", PerfCountersBuilder::PRIO_USEFUL, unit_t(BYTES)); |
7c673cae FG |
74 | b.add_u64(l_bluefs_slow_total_bytes, "slow_total_bytes", |
75 | "Total bytes (slow device)", | |
1adf2230 | 76 | "slob", PerfCountersBuilder::PRIO_USEFUL, unit_t(BYTES)); |
7c673cae FG |
77 | b.add_u64(l_bluefs_slow_used_bytes, "slow_used_bytes", |
78 | "Used bytes (slow device)", | |
1adf2230 | 79 | "slou", PerfCountersBuilder::PRIO_USEFUL, unit_t(BYTES)); |
7c673cae FG |
80 | b.add_u64(l_bluefs_num_files, "num_files", "File count", |
81 | "f", PerfCountersBuilder::PRIO_USEFUL); | |
82 | b.add_u64(l_bluefs_log_bytes, "log_bytes", "Size of the metadata log", | |
1adf2230 | 83 | "jlen", PerfCountersBuilder::PRIO_INTERESTING, unit_t(BYTES)); |
7c673cae FG |
84 | b.add_u64_counter(l_bluefs_log_compactions, "log_compactions", |
85 | "Compactions of the metadata log"); | |
86 | b.add_u64_counter(l_bluefs_logged_bytes, "logged_bytes", | |
87 | "Bytes written to the metadata log", "j", | |
1adf2230 | 88 | PerfCountersBuilder::PRIO_CRITICAL, unit_t(BYTES)); |
7c673cae FG |
89 | b.add_u64_counter(l_bluefs_files_written_wal, "files_written_wal", |
90 | "Files written to WAL"); | |
91 | b.add_u64_counter(l_bluefs_files_written_sst, "files_written_sst", | |
92 | "Files written to SSTs"); | |
93 | b.add_u64_counter(l_bluefs_bytes_written_wal, "bytes_written_wal", | |
94 | "Bytes written to WAL", "wal", | |
95 | PerfCountersBuilder::PRIO_CRITICAL); | |
96 | b.add_u64_counter(l_bluefs_bytes_written_sst, "bytes_written_sst", | |
97 | "Bytes written to SSTs", "sst", | |
1adf2230 | 98 | PerfCountersBuilder::PRIO_CRITICAL, unit_t(BYTES)); |
7c673cae FG |
99 | logger = b.create_perf_counters(); |
100 | cct->get_perfcounters_collection()->add(logger); | |
101 | } | |
102 | ||
103 | void BlueFS::_shutdown_logger() | |
104 | { | |
105 | cct->get_perfcounters_collection()->remove(logger); | |
106 | delete logger; | |
107 | } | |
108 | ||
109 | void BlueFS::_update_logger_stats() | |
110 | { | |
111 | // we must be holding the lock | |
112 | logger->set(l_bluefs_num_files, file_map.size()); | |
113 | logger->set(l_bluefs_log_bytes, log_writer->file->fnode.size); | |
114 | ||
115 | if (alloc[BDEV_WAL]) { | |
116 | logger->set(l_bluefs_wal_total_bytes, block_total[BDEV_WAL]); | |
117 | logger->set(l_bluefs_wal_used_bytes, | |
118 | block_total[BDEV_WAL] - alloc[BDEV_WAL]->get_free()); | |
119 | } | |
120 | if (alloc[BDEV_DB]) { | |
121 | logger->set(l_bluefs_db_total_bytes, block_total[BDEV_DB]); | |
122 | logger->set(l_bluefs_db_used_bytes, | |
123 | block_total[BDEV_DB] - alloc[BDEV_DB]->get_free()); | |
124 | } | |
125 | if (alloc[BDEV_SLOW]) { | |
126 | logger->set(l_bluefs_slow_total_bytes, block_total[BDEV_SLOW]); | |
127 | logger->set(l_bluefs_slow_used_bytes, | |
128 | block_total[BDEV_SLOW] - alloc[BDEV_SLOW]->get_free()); | |
129 | } | |
130 | } | |
131 | ||
c07f9fc5 | 132 | int BlueFS::add_block_device(unsigned id, const string& path) |
7c673cae FG |
133 | { |
134 | dout(10) << __func__ << " bdev " << id << " path " << path << dendl; | |
135 | assert(id < bdev.size()); | |
136 | assert(bdev[id] == NULL); | |
137 | BlockDevice *b = BlockDevice::create(cct, path, NULL, NULL); | |
138 | int r = b->open(path); | |
139 | if (r < 0) { | |
140 | delete b; | |
141 | return r; | |
142 | } | |
143 | dout(1) << __func__ << " bdev " << id << " path " << path | |
1adf2230 | 144 | << " size " << byte_u_t(b->get_size()) << dendl; |
7c673cae FG |
145 | bdev[id] = b; |
146 | ioc[id] = new IOContext(cct, NULL); | |
147 | return 0; | |
148 | } | |
149 | ||
150 | bool BlueFS::bdev_support_label(unsigned id) | |
151 | { | |
152 | assert(id < bdev.size()); | |
153 | assert(bdev[id]); | |
154 | return bdev[id]->supported_bdev_label(); | |
155 | } | |
156 | ||
157 | uint64_t BlueFS::get_block_device_size(unsigned id) | |
158 | { | |
159 | if (id < bdev.size() && bdev[id]) | |
160 | return bdev[id]->get_size(); | |
161 | return 0; | |
162 | } | |
163 | ||
164 | void BlueFS::add_block_extent(unsigned id, uint64_t offset, uint64_t length) | |
165 | { | |
166 | std::unique_lock<std::mutex> l(lock); | |
167 | dout(1) << __func__ << " bdev " << id | |
168 | << " 0x" << std::hex << offset << "~" << length << std::dec | |
169 | << dendl; | |
170 | assert(id < bdev.size()); | |
171 | assert(bdev[id]); | |
172 | assert(bdev[id]->get_size() >= offset + length); | |
173 | block_all[id].insert(offset, length); | |
174 | block_total[id] += length; | |
175 | ||
176 | if (id < alloc.size() && alloc[id]) { | |
177 | log_t.op_alloc_add(id, offset, length); | |
178 | int r = _flush_and_sync_log(l); | |
179 | assert(r == 0); | |
180 | alloc[id]->init_add_free(offset, length); | |
181 | } | |
182 | ||
183 | if (logger) | |
184 | logger->inc(l_bluefs_gift_bytes, length); | |
185 | dout(10) << __func__ << " done" << dendl; | |
186 | } | |
187 | ||
188 | int BlueFS::reclaim_blocks(unsigned id, uint64_t want, | |
189 | AllocExtentVector *extents) | |
190 | { | |
191 | std::unique_lock<std::mutex> l(lock); | |
192 | dout(1) << __func__ << " bdev " << id | |
193 | << " want 0x" << std::hex << want << std::dec << dendl; | |
194 | assert(id < alloc.size()); | |
195 | assert(alloc[id]); | |
196 | int r = alloc[id]->reserve(want); | |
197 | assert(r == 0); // caller shouldn't ask for more than they can get | |
198 | int64_t got = alloc[id]->allocate(want, cct->_conf->bluefs_alloc_size, 0, | |
199 | extents); | |
200 | if (got < (int64_t)want) { | |
201 | alloc[id]->unreserve(want - MAX(0, got)); | |
202 | } | |
203 | if (got <= 0) { | |
204 | derr << __func__ << " failed to allocate space to return to bluestore" | |
205 | << dendl; | |
206 | alloc[id]->dump(); | |
207 | return got; | |
208 | } | |
209 | ||
210 | for (auto& p : *extents) { | |
211 | block_all[id].erase(p.offset, p.length); | |
212 | block_total[id] -= p.length; | |
213 | log_t.op_alloc_rm(id, p.offset, p.length); | |
214 | } | |
215 | ||
216 | flush_bdev(); | |
217 | r = _flush_and_sync_log(l); | |
218 | assert(r == 0); | |
219 | ||
220 | if (logger) | |
221 | logger->inc(l_bluefs_reclaim_bytes, got); | |
222 | dout(1) << __func__ << " bdev " << id << " want 0x" << std::hex << want | |
223 | << " got " << *extents << dendl; | |
224 | return 0; | |
225 | } | |
226 | ||
227 | uint64_t BlueFS::get_fs_usage() | |
228 | { | |
229 | std::lock_guard<std::mutex> l(lock); | |
230 | uint64_t total_bytes = 0; | |
231 | for (auto& p : file_map) { | |
232 | total_bytes += p.second->fnode.get_allocated(); | |
233 | } | |
234 | return total_bytes; | |
235 | } | |
236 | ||
237 | uint64_t BlueFS::get_total(unsigned id) | |
238 | { | |
239 | std::lock_guard<std::mutex> l(lock); | |
240 | assert(id < block_all.size()); | |
241 | return block_total[id]; | |
242 | } | |
243 | ||
244 | uint64_t BlueFS::get_free(unsigned id) | |
245 | { | |
246 | std::lock_guard<std::mutex> l(lock); | |
247 | assert(id < alloc.size()); | |
248 | return alloc[id]->get_free(); | |
249 | } | |
250 | ||
251 | void BlueFS::dump_perf_counters(Formatter *f) | |
252 | { | |
253 | f->open_object_section("bluefs_perf_counters"); | |
254 | logger->dump_formatted(f,0); | |
255 | f->close_section(); | |
256 | } | |
257 | ||
3efd9988 FG |
258 | void BlueFS::dump_block_extents(ostream& out) |
259 | { | |
260 | for (unsigned i = 0; i < MAX_BDEV; ++i) { | |
261 | if (!bdev[i]) { | |
262 | continue; | |
263 | } | |
264 | out << i << " : size 0x" << std::hex << bdev[i]->get_size() | |
265 | << " : own 0x" << block_all[i] << std::dec << "\n"; | |
266 | } | |
267 | } | |
7c673cae FG |
268 | |
269 | void BlueFS::get_usage(vector<pair<uint64_t,uint64_t>> *usage) | |
270 | { | |
271 | std::lock_guard<std::mutex> l(lock); | |
272 | usage->resize(bdev.size()); | |
273 | for (unsigned id = 0; id < bdev.size(); ++id) { | |
274 | if (!bdev[id]) { | |
275 | (*usage)[id] = make_pair(0, 0); | |
276 | continue; | |
277 | } | |
278 | (*usage)[id].first = alloc[id]->get_free(); | |
279 | (*usage)[id].second = block_total[id]; | |
280 | uint64_t used = | |
281 | (block_total[id] - (*usage)[id].first) * 100 / block_total[id]; | |
282 | dout(10) << __func__ << " bdev " << id | |
283 | << " free " << (*usage)[id].first | |
1adf2230 | 284 | << " (" << byte_u_t((*usage)[id].first) << ")" |
7c673cae | 285 | << " / " << (*usage)[id].second |
1adf2230 | 286 | << " (" << byte_u_t((*usage)[id].second) << ")" |
7c673cae FG |
287 | << ", used " << used << "%" |
288 | << dendl; | |
289 | } | |
290 | } | |
291 | ||
292 | int BlueFS::get_block_extents(unsigned id, interval_set<uint64_t> *extents) | |
293 | { | |
294 | std::lock_guard<std::mutex> l(lock); | |
295 | dout(10) << __func__ << " bdev " << id << dendl; | |
296 | if (id >= block_all.size()) | |
297 | return -EINVAL; | |
298 | *extents = block_all[id]; | |
299 | return 0; | |
300 | } | |
301 | ||
f64942e4 AA |
302 | // returns true if specified device is attached |
303 | bool BlueFS::is_device(unsigned id) | |
304 | { | |
305 | return !(id >= MAX_BDEV || bdev[id] == nullptr); | |
306 | } | |
307 | ||
308 | // returns true if specified device is under full bluefs control | |
309 | // and hence can be expanded | |
310 | bool BlueFS::is_device_expandable(unsigned id) | |
311 | { | |
312 | if (id >= MAX_BDEV || bdev[id] == nullptr) { | |
313 | return false; | |
314 | } | |
315 | switch(id) { | |
316 | case BDEV_WAL: | |
317 | return true; | |
318 | ||
319 | case BDEV_DB: | |
320 | // true if DB volume is non-shared | |
321 | return bdev[BDEV_SLOW] != nullptr; | |
322 | } | |
323 | return false; | |
324 | } | |
325 | ||
7c673cae FG |
326 | int BlueFS::mkfs(uuid_d osd_uuid) |
327 | { | |
328 | std::unique_lock<std::mutex> l(lock); | |
329 | dout(1) << __func__ | |
330 | << " osd_uuid " << osd_uuid | |
331 | << dendl; | |
332 | ||
333 | _init_alloc(); | |
334 | _init_logger(); | |
335 | ||
336 | super.version = 1; | |
337 | super.block_size = bdev[BDEV_DB]->get_block_size(); | |
338 | super.osd_uuid = osd_uuid; | |
339 | super.uuid.generate_random(); | |
340 | dout(1) << __func__ << " uuid " << super.uuid << dendl; | |
341 | ||
342 | // init log | |
343 | FileRef log_file = new File; | |
344 | log_file->fnode.ino = 1; | |
345 | log_file->fnode.prefer_bdev = BDEV_WAL; | |
346 | int r = _allocate( | |
347 | log_file->fnode.prefer_bdev, | |
348 | cct->_conf->bluefs_max_log_runway, | |
94b18763 | 349 | &log_file->fnode); |
7c673cae FG |
350 | assert(r == 0); |
351 | log_writer = _create_writer(log_file); | |
352 | ||
353 | // initial txn | |
354 | log_t.op_init(); | |
355 | for (unsigned bdev = 0; bdev < MAX_BDEV; ++bdev) { | |
356 | interval_set<uint64_t>& p = block_all[bdev]; | |
357 | if (p.empty()) | |
358 | continue; | |
359 | for (interval_set<uint64_t>::iterator q = p.begin(); q != p.end(); ++q) { | |
360 | dout(20) << __func__ << " op_alloc_add " << bdev << " 0x" | |
361 | << std::hex << q.get_start() << "~" << q.get_len() << std::dec | |
362 | << dendl; | |
363 | log_t.op_alloc_add(bdev, q.get_start(), q.get_len()); | |
364 | } | |
365 | } | |
366 | _flush_and_sync_log(l); | |
367 | ||
368 | // write supers | |
369 | super.log_fnode = log_file->fnode; | |
370 | _write_super(); | |
371 | flush_bdev(); | |
372 | ||
373 | // clean up | |
374 | super = bluefs_super_t(); | |
375 | _close_writer(log_writer); | |
376 | log_writer = NULL; | |
377 | block_all.clear(); | |
378 | block_total.clear(); | |
379 | _stop_alloc(); | |
380 | _shutdown_logger(); | |
381 | ||
382 | dout(10) << __func__ << " success" << dendl; | |
383 | return 0; | |
384 | } | |
385 | ||
386 | void BlueFS::_init_alloc() | |
387 | { | |
388 | dout(20) << __func__ << dendl; | |
389 | alloc.resize(MAX_BDEV); | |
390 | pending_release.resize(MAX_BDEV); | |
391 | for (unsigned id = 0; id < bdev.size(); ++id) { | |
392 | if (!bdev[id]) { | |
393 | continue; | |
394 | } | |
395 | assert(bdev[id]->get_size()); | |
396 | alloc[id] = Allocator::create(cct, cct->_conf->bluefs_allocator, | |
397 | bdev[id]->get_size(), | |
398 | cct->_conf->bluefs_alloc_size); | |
399 | interval_set<uint64_t>& p = block_all[id]; | |
400 | for (interval_set<uint64_t>::iterator q = p.begin(); q != p.end(); ++q) { | |
401 | alloc[id]->init_add_free(q.get_start(), q.get_len()); | |
402 | } | |
403 | } | |
404 | } | |
405 | ||
406 | void BlueFS::_stop_alloc() | |
407 | { | |
408 | dout(20) << __func__ << dendl; | |
409 | for (auto p : alloc) { | |
410 | if (p != nullptr) { | |
411 | p->shutdown(); | |
412 | delete p; | |
413 | } | |
414 | } | |
415 | alloc.clear(); | |
416 | } | |
417 | ||
418 | int BlueFS::mount() | |
419 | { | |
420 | dout(1) << __func__ << dendl; | |
421 | ||
422 | int r = _open_super(); | |
423 | if (r < 0) { | |
424 | derr << __func__ << " failed to open super: " << cpp_strerror(r) << dendl; | |
425 | goto out; | |
426 | } | |
427 | ||
428 | block_all.clear(); | |
429 | block_all.resize(MAX_BDEV); | |
430 | block_total.clear(); | |
431 | block_total.resize(MAX_BDEV, 0); | |
432 | _init_alloc(); | |
433 | ||
434 | r = _replay(false); | |
435 | if (r < 0) { | |
436 | derr << __func__ << " failed to replay log: " << cpp_strerror(r) << dendl; | |
437 | _stop_alloc(); | |
438 | goto out; | |
439 | } | |
440 | ||
441 | // init freelist | |
442 | for (auto& p : file_map) { | |
443 | dout(30) << __func__ << " noting alloc for " << p.second->fnode << dendl; | |
444 | for (auto& q : p.second->fnode.extents) { | |
445 | alloc[q.bdev]->init_rm_free(q.offset, q.length); | |
446 | } | |
447 | } | |
448 | ||
449 | // set up the log for future writes | |
450 | log_writer = _create_writer(_get_file(1)); | |
451 | assert(log_writer->file->fnode.ino == 1); | |
452 | log_writer->pos = log_writer->file->fnode.size; | |
453 | dout(10) << __func__ << " log write pos set to 0x" | |
454 | << std::hex << log_writer->pos << std::dec | |
455 | << dendl; | |
456 | ||
457 | _init_logger(); | |
458 | return 0; | |
459 | ||
460 | out: | |
461 | super = bluefs_super_t(); | |
462 | return r; | |
463 | } | |
464 | ||
465 | void BlueFS::umount() | |
466 | { | |
467 | dout(1) << __func__ << dendl; | |
468 | ||
469 | sync_metadata(); | |
470 | ||
471 | _close_writer(log_writer); | |
472 | log_writer = NULL; | |
473 | ||
474 | _stop_alloc(); | |
475 | file_map.clear(); | |
476 | dir_map.clear(); | |
477 | super = bluefs_super_t(); | |
478 | log_t.clear(); | |
479 | _shutdown_logger(); | |
480 | } | |
481 | ||
482 | void BlueFS::collect_metadata(map<string,string> *pm) | |
483 | { | |
484 | if (bdev[BDEV_DB]) | |
485 | bdev[BDEV_DB]->collect_metadata("bluefs_db_", pm); | |
486 | if (bdev[BDEV_WAL]) | |
487 | bdev[BDEV_WAL]->collect_metadata("bluefs_wal_", pm); | |
488 | if (bdev[BDEV_SLOW]) | |
489 | bdev[BDEV_SLOW]->collect_metadata("bluefs_slow_", pm); | |
490 | } | |
491 | ||
492 | int BlueFS::fsck() | |
493 | { | |
494 | std::lock_guard<std::mutex> l(lock); | |
495 | dout(1) << __func__ << dendl; | |
496 | // hrm, i think we check everything on mount... | |
497 | return 0; | |
498 | } | |
499 | ||
500 | int BlueFS::_write_super() | |
501 | { | |
502 | // build superblock | |
503 | bufferlist bl; | |
504 | ::encode(super, bl); | |
505 | uint32_t crc = bl.crc32c(-1); | |
506 | ::encode(crc, bl); | |
507 | dout(10) << __func__ << " super block length(encoded): " << bl.length() << dendl; | |
508 | dout(10) << __func__ << " superblock " << super.version << dendl; | |
509 | dout(10) << __func__ << " log_fnode " << super.log_fnode << dendl; | |
510 | assert(bl.length() <= get_super_length()); | |
511 | bl.append_zero(get_super_length() - bl.length()); | |
512 | ||
513 | bdev[BDEV_DB]->write(get_super_offset(), bl, false); | |
514 | dout(20) << __func__ << " v " << super.version | |
515 | << " crc 0x" << std::hex << crc | |
516 | << " offset 0x" << get_super_offset() << std::dec | |
517 | << dendl; | |
518 | return 0; | |
519 | } | |
520 | ||
521 | int BlueFS::_open_super() | |
522 | { | |
523 | dout(10) << __func__ << dendl; | |
524 | ||
525 | bufferlist bl; | |
526 | uint32_t expected_crc, crc; | |
527 | int r; | |
528 | ||
529 | // always the second block | |
530 | r = bdev[BDEV_DB]->read(get_super_offset(), get_super_length(), | |
531 | &bl, ioc[BDEV_DB], false); | |
532 | if (r < 0) | |
533 | return r; | |
534 | ||
535 | bufferlist::iterator p = bl.begin(); | |
536 | ::decode(super, p); | |
537 | { | |
538 | bufferlist t; | |
539 | t.substr_of(bl, 0, p.get_off()); | |
540 | crc = t.crc32c(-1); | |
541 | } | |
542 | ::decode(expected_crc, p); | |
543 | if (crc != expected_crc) { | |
544 | derr << __func__ << " bad crc on superblock, expected 0x" | |
545 | << std::hex << expected_crc << " != actual 0x" << crc << std::dec | |
546 | << dendl; | |
547 | return -EIO; | |
548 | } | |
549 | dout(10) << __func__ << " superblock " << super.version << dendl; | |
550 | dout(10) << __func__ << " log_fnode " << super.log_fnode << dendl; | |
551 | return 0; | |
552 | } | |
553 | ||
554 | int BlueFS::_replay(bool noop) | |
555 | { | |
556 | dout(10) << __func__ << (noop ? " NO-OP" : "") << dendl; | |
557 | ino_last = 1; // by the log | |
558 | log_seq = 0; | |
559 | ||
560 | FileRef log_file; | |
561 | if (noop) { | |
562 | log_file = new File; | |
563 | } else { | |
564 | log_file = _get_file(1); | |
565 | } | |
566 | log_file->fnode = super.log_fnode; | |
567 | dout(10) << __func__ << " log_fnode " << super.log_fnode << dendl; | |
568 | ||
569 | FileReader *log_reader = new FileReader( | |
570 | log_file, cct->_conf->bluefs_max_prefetch, | |
571 | false, // !random | |
572 | true); // ignore eof | |
573 | while (true) { | |
574 | assert((log_reader->buf.pos & ~super.block_mask()) == 0); | |
575 | uint64_t pos = log_reader->buf.pos; | |
576 | uint64_t read_pos = pos; | |
577 | bufferlist bl; | |
578 | { | |
579 | int r = _read(log_reader, &log_reader->buf, read_pos, super.block_size, | |
580 | &bl, NULL); | |
581 | assert(r == (int)super.block_size); | |
582 | read_pos += r; | |
583 | } | |
584 | uint64_t more = 0; | |
585 | uint64_t seq; | |
586 | uuid_d uuid; | |
587 | { | |
588 | bufferlist::iterator p = bl.begin(); | |
589 | __u8 a, b; | |
590 | uint32_t len; | |
591 | ::decode(a, p); | |
592 | ::decode(b, p); | |
593 | ::decode(len, p); | |
594 | ::decode(uuid, p); | |
595 | ::decode(seq, p); | |
596 | if (len + 6 > bl.length()) { | |
597 | more = ROUND_UP_TO(len + 6 - bl.length(), super.block_size); | |
598 | } | |
599 | } | |
600 | if (uuid != super.uuid) { | |
601 | dout(10) << __func__ << " 0x" << std::hex << pos << std::dec | |
602 | << ": stop: uuid " << uuid << " != super.uuid " << super.uuid | |
603 | << dendl; | |
604 | break; | |
605 | } | |
606 | if (seq != log_seq + 1) { | |
607 | dout(10) << __func__ << " 0x" << std::hex << pos << std::dec | |
608 | << ": stop: seq " << seq << " != expected " << log_seq + 1 | |
609 | << dendl; | |
610 | break; | |
611 | } | |
612 | if (more) { | |
613 | dout(20) << __func__ << " need 0x" << std::hex << more << std::dec | |
614 | << " more bytes" << dendl; | |
615 | bufferlist t; | |
616 | int r = _read(log_reader, &log_reader->buf, read_pos, more, &t, NULL); | |
617 | if (r < (int)more) { | |
618 | dout(10) << __func__ << " 0x" << std::hex << pos | |
619 | << ": stop: len is 0x" << bl.length() + more << std::dec | |
620 | << ", which is past eof" << dendl; | |
621 | break; | |
622 | } | |
623 | assert(r == (int)more); | |
624 | bl.claim_append(t); | |
625 | read_pos += r; | |
626 | } | |
627 | bluefs_transaction_t t; | |
628 | try { | |
629 | bufferlist::iterator p = bl.begin(); | |
630 | ::decode(t, p); | |
631 | } | |
632 | catch (buffer::error& e) { | |
633 | dout(10) << __func__ << " 0x" << std::hex << pos << std::dec | |
634 | << ": stop: failed to decode: " << e.what() | |
635 | << dendl; | |
636 | delete log_reader; | |
637 | return -EIO; | |
638 | } | |
639 | assert(seq == t.seq); | |
640 | dout(10) << __func__ << " 0x" << std::hex << pos << std::dec | |
641 | << ": " << t << dendl; | |
642 | ||
643 | bufferlist::iterator p = t.op_bl.begin(); | |
644 | while (!p.end()) { | |
645 | __u8 op; | |
646 | ::decode(op, p); | |
647 | switch (op) { | |
648 | ||
649 | case bluefs_transaction_t::OP_INIT: | |
650 | dout(20) << __func__ << " 0x" << std::hex << pos << std::dec | |
651 | << ": op_init" << dendl; | |
652 | assert(t.seq == 1); | |
653 | break; | |
654 | ||
655 | case bluefs_transaction_t::OP_JUMP: | |
656 | { | |
657 | uint64_t next_seq; | |
658 | uint64_t offset; | |
659 | ::decode(next_seq, p); | |
660 | ::decode(offset, p); | |
661 | dout(20) << __func__ << " 0x" << std::hex << pos << std::dec | |
662 | << ": op_jump seq " << next_seq | |
663 | << " offset 0x" << std::hex << offset << std::dec << dendl; | |
664 | assert(next_seq >= log_seq); | |
665 | log_seq = next_seq - 1; // we will increment it below | |
666 | uint64_t skip = offset - read_pos; | |
667 | if (skip) { | |
668 | bufferlist junk; | |
669 | int r = _read(log_reader, &log_reader->buf, read_pos, skip, &junk, | |
670 | NULL); | |
671 | if (r != (int)skip) { | |
672 | dout(10) << __func__ << " 0x" << std::hex << read_pos | |
673 | << ": stop: failed to skip to " << offset | |
674 | << std::dec << dendl; | |
675 | assert(0 == "problem with op_jump"); | |
676 | } | |
677 | } | |
678 | } | |
679 | break; | |
680 | ||
681 | case bluefs_transaction_t::OP_JUMP_SEQ: | |
682 | { | |
683 | uint64_t next_seq; | |
684 | ::decode(next_seq, p); | |
685 | dout(20) << __func__ << " 0x" << std::hex << pos << std::dec | |
686 | << ": op_jump_seq " << next_seq << dendl; | |
687 | assert(next_seq >= log_seq); | |
688 | log_seq = next_seq - 1; // we will increment it below | |
689 | } | |
690 | break; | |
691 | ||
692 | case bluefs_transaction_t::OP_ALLOC_ADD: | |
693 | { | |
694 | __u8 id; | |
695 | uint64_t offset, length; | |
696 | ::decode(id, p); | |
697 | ::decode(offset, p); | |
698 | ::decode(length, p); | |
699 | dout(20) << __func__ << " 0x" << std::hex << pos << std::dec | |
700 | << ": op_alloc_add " << " " << (int)id | |
701 | << ":0x" << std::hex << offset << "~" << length << std::dec | |
702 | << dendl; | |
703 | if (!noop) { | |
704 | block_all[id].insert(offset, length); | |
705 | block_total[id] += length; | |
706 | alloc[id]->init_add_free(offset, length); | |
707 | } | |
708 | } | |
709 | break; | |
710 | ||
711 | case bluefs_transaction_t::OP_ALLOC_RM: | |
712 | { | |
713 | __u8 id; | |
714 | uint64_t offset, length; | |
715 | ::decode(id, p); | |
716 | ::decode(offset, p); | |
717 | ::decode(length, p); | |
718 | dout(20) << __func__ << " 0x" << std::hex << pos << std::dec | |
719 | << ": op_alloc_rm " << " " << (int)id | |
720 | << ":0x" << std::hex << offset << "~" << length << std::dec | |
721 | << dendl; | |
722 | if (!noop) { | |
723 | block_all[id].erase(offset, length); | |
724 | block_total[id] -= length; | |
725 | alloc[id]->init_rm_free(offset, length); | |
726 | } | |
727 | } | |
728 | break; | |
729 | ||
730 | case bluefs_transaction_t::OP_DIR_LINK: | |
731 | { | |
732 | string dirname, filename; | |
733 | uint64_t ino; | |
734 | ::decode(dirname, p); | |
735 | ::decode(filename, p); | |
736 | ::decode(ino, p); | |
737 | dout(20) << __func__ << " 0x" << std::hex << pos << std::dec | |
738 | << ": op_dir_link " << " " << dirname << "/" << filename | |
739 | << " to " << ino | |
740 | << dendl; | |
741 | if (!noop) { | |
742 | FileRef file = _get_file(ino); | |
743 | assert(file->fnode.ino); | |
744 | map<string,DirRef>::iterator q = dir_map.find(dirname); | |
745 | assert(q != dir_map.end()); | |
746 | map<string,FileRef>::iterator r = q->second->file_map.find(filename); | |
747 | assert(r == q->second->file_map.end()); | |
748 | q->second->file_map[filename] = file; | |
749 | ++file->refs; | |
750 | } | |
751 | } | |
752 | break; | |
753 | ||
754 | case bluefs_transaction_t::OP_DIR_UNLINK: | |
755 | { | |
756 | string dirname, filename; | |
757 | ::decode(dirname, p); | |
758 | ::decode(filename, p); | |
759 | dout(20) << __func__ << " 0x" << std::hex << pos << std::dec | |
760 | << ": op_dir_unlink " << " " << dirname << "/" << filename | |
761 | << dendl; | |
762 | if (!noop) { | |
763 | map<string,DirRef>::iterator q = dir_map.find(dirname); | |
764 | assert(q != dir_map.end()); | |
765 | map<string,FileRef>::iterator r = q->second->file_map.find(filename); | |
766 | assert(r != q->second->file_map.end()); | |
767 | assert(r->second->refs > 0); | |
768 | --r->second->refs; | |
769 | q->second->file_map.erase(r); | |
770 | } | |
771 | } | |
772 | break; | |
773 | ||
774 | case bluefs_transaction_t::OP_DIR_CREATE: | |
775 | { | |
776 | string dirname; | |
777 | ::decode(dirname, p); | |
778 | dout(20) << __func__ << " 0x" << std::hex << pos << std::dec | |
779 | << ": op_dir_create " << dirname << dendl; | |
780 | if (!noop) { | |
781 | map<string,DirRef>::iterator q = dir_map.find(dirname); | |
782 | assert(q == dir_map.end()); | |
783 | dir_map[dirname] = new Dir; | |
784 | } | |
785 | } | |
786 | break; | |
787 | ||
788 | case bluefs_transaction_t::OP_DIR_REMOVE: | |
789 | { | |
790 | string dirname; | |
791 | ::decode(dirname, p); | |
792 | dout(20) << __func__ << " 0x" << std::hex << pos << std::dec | |
793 | << ": op_dir_remove " << dirname << dendl; | |
794 | if (!noop) { | |
795 | map<string,DirRef>::iterator q = dir_map.find(dirname); | |
796 | assert(q != dir_map.end()); | |
797 | assert(q->second->file_map.empty()); | |
798 | dir_map.erase(q); | |
799 | } | |
800 | } | |
801 | break; | |
802 | ||
803 | case bluefs_transaction_t::OP_FILE_UPDATE: | |
804 | { | |
805 | bluefs_fnode_t fnode; | |
806 | ::decode(fnode, p); | |
807 | dout(20) << __func__ << " 0x" << std::hex << pos << std::dec | |
808 | << ": op_file_update " << " " << fnode << dendl; | |
809 | if (!noop) { | |
810 | FileRef f = _get_file(fnode.ino); | |
811 | f->fnode = fnode; | |
812 | if (fnode.ino > ino_last) { | |
813 | ino_last = fnode.ino; | |
814 | } | |
815 | } | |
816 | } | |
817 | break; | |
818 | ||
819 | case bluefs_transaction_t::OP_FILE_REMOVE: | |
820 | { | |
821 | uint64_t ino; | |
822 | ::decode(ino, p); | |
823 | dout(20) << __func__ << " 0x" << std::hex << pos << std::dec | |
824 | << ": op_file_remove " << ino << dendl; | |
825 | if (!noop) { | |
826 | auto p = file_map.find(ino); | |
827 | assert(p != file_map.end()); | |
828 | file_map.erase(p); | |
829 | } | |
830 | } | |
831 | break; | |
832 | ||
833 | default: | |
834 | derr << __func__ << " 0x" << std::hex << pos << std::dec | |
835 | << ": stop: unrecognized op " << (int)op << dendl; | |
836 | delete log_reader; | |
837 | return -EIO; | |
838 | } | |
839 | } | |
840 | assert(p.end()); | |
841 | ||
842 | // we successfully replayed the transaction; bump the seq and log size | |
843 | ++log_seq; | |
844 | log_file->fnode.size = log_reader->buf.pos; | |
845 | } | |
846 | ||
847 | dout(10) << __func__ << " log file size was 0x" | |
848 | << std::hex << log_file->fnode.size << std::dec << dendl; | |
849 | delete log_reader; | |
850 | ||
851 | if (!noop) { | |
852 | // verify file link counts are all >0 | |
853 | for (auto& p : file_map) { | |
854 | if (p.second->refs == 0 && | |
855 | p.second->fnode.ino > 1) { | |
856 | derr << __func__ << " file with link count 0: " << p.second->fnode | |
857 | << dendl; | |
858 | return -EIO; | |
859 | } | |
860 | } | |
861 | } | |
862 | ||
863 | dout(10) << __func__ << " done" << dendl; | |
864 | return 0; | |
865 | } | |
866 | ||
867 | BlueFS::FileRef BlueFS::_get_file(uint64_t ino) | |
868 | { | |
869 | auto p = file_map.find(ino); | |
870 | if (p == file_map.end()) { | |
871 | FileRef f = new File; | |
872 | file_map[ino] = f; | |
873 | dout(30) << __func__ << " ino " << ino << " = " << f | |
874 | << " (new)" << dendl; | |
875 | return f; | |
876 | } else { | |
877 | dout(30) << __func__ << " ino " << ino << " = " << p->second << dendl; | |
878 | return p->second; | |
879 | } | |
880 | } | |
881 | ||
882 | void BlueFS::_drop_link(FileRef file) | |
883 | { | |
884 | dout(20) << __func__ << " had refs " << file->refs | |
885 | << " on " << file->fnode << dendl; | |
886 | assert(file->refs > 0); | |
887 | --file->refs; | |
888 | if (file->refs == 0) { | |
889 | dout(20) << __func__ << " destroying " << file->fnode << dendl; | |
890 | assert(file->num_reading.load() == 0); | |
891 | log_t.op_file_remove(file->fnode.ino); | |
892 | for (auto& r : file->fnode.extents) { | |
893 | pending_release[r.bdev].insert(r.offset, r.length); | |
894 | } | |
895 | file_map.erase(file->fnode.ino); | |
896 | file->deleted = true; | |
94b18763 | 897 | |
7c673cae FG |
898 | if (file->dirty_seq) { |
899 | assert(file->dirty_seq > log_seq_stable); | |
900 | assert(dirty_files.count(file->dirty_seq)); | |
901 | auto it = dirty_files[file->dirty_seq].iterator_to(*file); | |
902 | dirty_files[file->dirty_seq].erase(it); | |
903 | file->dirty_seq = 0; | |
904 | } | |
905 | } | |
906 | } | |
907 | ||
908 | int BlueFS::_read_random( | |
909 | FileReader *h, ///< [in] read from here | |
910 | uint64_t off, ///< [in] offset | |
911 | size_t len, ///< [in] this many bytes | |
912 | char *out) ///< [out] optional: or copy it here | |
913 | { | |
914 | dout(10) << __func__ << " h " << h | |
915 | << " 0x" << std::hex << off << "~" << len << std::dec | |
916 | << " from " << h->file->fnode << dendl; | |
917 | ||
918 | ++h->file->num_reading; | |
919 | ||
920 | if (!h->ignore_eof && | |
921 | off + len > h->file->fnode.size) { | |
922 | if (off > h->file->fnode.size) | |
923 | len = 0; | |
924 | else | |
925 | len = h->file->fnode.size - off; | |
926 | dout(20) << __func__ << " reaching (or past) eof, len clipped to 0x" | |
927 | << std::hex << len << std::dec << dendl; | |
928 | } | |
929 | ||
930 | int ret = 0; | |
931 | while (len > 0) { | |
932 | uint64_t x_off = 0; | |
933 | auto p = h->file->fnode.seek(off, &x_off); | |
934 | uint64_t l = MIN(p->length - x_off, len); | |
935 | dout(20) << __func__ << " read buffered 0x" | |
936 | << std::hex << x_off << "~" << l << std::dec | |
937 | << " of " << *p << dendl; | |
938 | int r = bdev[p->bdev]->read_random(p->offset + x_off, l, out, | |
939 | cct->_conf->bluefs_buffered_io); | |
940 | assert(r == 0); | |
941 | off += l; | |
942 | len -= l; | |
943 | ret += l; | |
944 | out += l; | |
945 | } | |
946 | ||
947 | dout(20) << __func__ << " got " << ret << dendl; | |
948 | --h->file->num_reading; | |
949 | return ret; | |
950 | } | |
951 | ||
/**
 * Read through the reader-side buffer @p buf.
 *
 * Unless h->ignore_eof is set the request is clipped to the file size.
 * The data is then delivered chunk by chunk out of buf->bl; whenever the
 * current offset is not covered by buf->bl the buffer is dropped and
 * refilled from the block device, starting at the block boundary at or
 * below the offset.  Each chunk is appended to @p outbl and/or copied to
 * @p out, whichever is non-null.
 *
 * @return the number of bytes delivered
 */
int BlueFS::_read(
  FileReader *h,         ///< [in] read from here
  FileReaderBuffer *buf, ///< [in] reader state
  uint64_t off,          ///< [in] offset
  size_t len,            ///< [in] this many bytes
  bufferlist *outbl,     ///< [out] optional: reference the result here
  char *out)             ///< [out] optional: or copy it here
{
  dout(10) << __func__ << " h " << h
           << " 0x" << std::hex << off << "~" << len << std::dec
           << " from " << h->file->fnode << dendl;

  // mark the file as being read for the duration of this call
  ++h->file->num_reading;

  // clip the request at the file size unless the reader opted out
  if (!h->ignore_eof &&
      off + len > h->file->fnode.size) {
    if (off > h->file->fnode.size)
      len = 0;
    else
      len = h->file->fnode.size - off;
    dout(20) << __func__ << " reaching (or past) eof, len clipped to 0x"
             << std::hex << len << std::dec << dendl;
  }
  if (outbl)
    outbl->clear();

  int ret = 0;
  while (len > 0) {
    size_t left;
    // refill the buffer if 'off' is not inside what it currently holds
    if (off < buf->bl_off || off >= buf->get_buf_end()) {
      buf->bl.clear();
      // start the buffer at the block boundary at or below 'off'
      buf->bl_off = off & super.block_mask();
      uint64_t x_off = 0;
      auto p = h->file->fnode.seek(buf->bl_off, &x_off);
      // bytes wanted, counted from the block boundary and rounded up to
      // whole blocks, but never less than the prefetch size
      uint64_t want = ROUND_UP_TO(len + (off & ~super.block_mask()),
                                  super.block_size);
      want = MAX(want, buf->max_prefetch);
      // a single fetch never crosses the end of the current extent
      uint64_t l = MIN(p->length - x_off, want);
      // ... nor the block-rounded end of file (unless eof is ignored)
      uint64_t eof_offset = ROUND_UP_TO(h->file->fnode.size, super.block_size);
      if (!h->ignore_eof &&
          buf->bl_off + l > eof_offset) {
        l = eof_offset - buf->bl_off;
      }
      dout(20) << __func__ << " fetching 0x"
               << std::hex << x_off << "~" << l << std::dec
               << " of " << *p << dendl;
      int r = bdev[p->bdev]->read(p->offset + x_off, l, &buf->bl, ioc[p->bdev],
                                  cct->_conf->bluefs_buffered_io);
      assert(r == 0);
    }
    left = buf->get_buf_remaining(off);
    dout(20) << __func__ << " left 0x" << std::hex << left
             << " len 0x" << len << std::dec << dendl;

    // deliver whatever the buffer can provide for this iteration
    // NOTE(review): if 'left' were ever 0 here, r would be 0 and the loop
    // would not make progress -- confirm the refill above rules that out.
    int r = MIN(len, left);
    if (outbl) {
      bufferlist t;
      t.substr_of(buf->bl, off - buf->bl_off, r);
      outbl->claim_append(t);
    }
    if (out) {
      // NOTE: h->bl is normally a contiguous buffer so c_str() is free.
      memcpy(out, buf->bl.c_str() + off - buf->bl_off, r);
      out += r;
    }

    // NOTE(review): the statements from here through dendl write to *_dout
    // and so appear to rely on the dout(30) macro leaving its logging scope
    // open until dendl -- confirm against the dout definition before
    // restructuring.
    dout(30) << __func__ << " result chunk (0x"
             << std::hex << r << std::dec << " bytes):\n";
    bufferlist t;
    t.substr_of(buf->bl, off - buf->bl_off, r);
    t.hexdump(*_dout);
    *_dout << dendl;

    off += r;
    len -= r;
    ret += r;
    buf->pos += r;
  }

  dout(20) << __func__ << " got " << ret << dendl;
  assert(!outbl || (int)outbl->length() == ret);
  --h->file->num_reading;
  return ret;
}
1036 | ||
1037 | void BlueFS::_invalidate_cache(FileRef f, uint64_t offset, uint64_t length) | |
1038 | { | |
1039 | dout(10) << __func__ << " file " << f->fnode | |
1040 | << " 0x" << std::hex << offset << "~" << length << std::dec | |
1041 | << dendl; | |
1042 | if (offset & ~super.block_mask()) { | |
1043 | offset &= super.block_mask(); | |
1044 | length = ROUND_UP_TO(length, super.block_size); | |
1045 | } | |
1046 | uint64_t x_off = 0; | |
1047 | auto p = f->fnode.seek(offset, &x_off); | |
1048 | while (length > 0 && p != f->fnode.extents.end()) { | |
1049 | uint64_t x_len = MIN(p->length - x_off, length); | |
1050 | bdev[p->bdev]->invalidate_cache(p->offset + x_off, x_len); | |
1051 | dout(20) << __func__ << " 0x" << std::hex << x_off << "~" << x_len | |
1052 | << std:: dec << " of " << *p << dendl; | |
1053 | offset += x_len; | |
1054 | length -= x_len; | |
1055 | } | |
1056 | } | |
1057 | ||
1058 | uint64_t BlueFS::_estimate_log_size() | |
1059 | { | |
1060 | int avg_dir_size = 40; // fixme | |
1061 | int avg_file_size = 12; | |
1062 | uint64_t size = 4096 * 2; | |
1063 | size += file_map.size() * (1 + sizeof(bluefs_fnode_t)); | |
1064 | for (auto& p : block_all) | |
1065 | size += p.num_intervals() * (1 + 1 + sizeof(uint64_t) * 2); | |
1066 | size += dir_map.size() + (1 + avg_dir_size); | |
1067 | size += file_map.size() * (1 + avg_dir_size + avg_file_size); | |
1068 | return ROUND_UP_TO(size, super.block_size); | |
1069 | } | |
1070 | ||
1071 | void BlueFS::compact_log() | |
1072 | { | |
1073 | std::unique_lock<std::mutex> l(lock); | |
1074 | if (cct->_conf->bluefs_compact_log_sync) { | |
1075 | _compact_log_sync(); | |
1076 | } else { | |
1077 | _compact_log_async(l); | |
1078 | } | |
1079 | } | |
1080 | ||
1081 | bool BlueFS::_should_compact_log() | |
1082 | { | |
1083 | uint64_t current = log_writer->file->fnode.size; | |
1084 | uint64_t expected = _estimate_log_size(); | |
1085 | float ratio = (float)current / (float)expected; | |
1086 | dout(10) << __func__ << " current 0x" << std::hex << current | |
1087 | << " expected " << expected << std::dec | |
1088 | << " ratio " << ratio | |
1089 | << (new_log ? " (async compaction in progress)" : "") | |
1090 | << dendl; | |
1091 | if (new_log || | |
1092 | current < cct->_conf->bluefs_log_compact_min_size || | |
1093 | ratio < cct->_conf->bluefs_log_compact_min_ratio) { | |
1094 | return false; | |
1095 | } | |
1096 | return true; | |
1097 | } | |
1098 | ||
1099 | void BlueFS::_compact_log_dump_metadata(bluefs_transaction_t *t) | |
1100 | { | |
1101 | t->seq = 1; | |
1102 | t->uuid = super.uuid; | |
1103 | dout(20) << __func__ << " op_init" << dendl; | |
1104 | ||
1105 | t->op_init(); | |
1106 | for (unsigned bdev = 0; bdev < MAX_BDEV; ++bdev) { | |
1107 | interval_set<uint64_t>& p = block_all[bdev]; | |
1108 | for (interval_set<uint64_t>::iterator q = p.begin(); q != p.end(); ++q) { | |
1109 | dout(20) << __func__ << " op_alloc_add " << bdev << " 0x" | |
1110 | << std::hex << q.get_start() << "~" << q.get_len() << std::dec | |
1111 | << dendl; | |
1112 | t->op_alloc_add(bdev, q.get_start(), q.get_len()); | |
1113 | } | |
1114 | } | |
1115 | for (auto& p : file_map) { | |
1116 | if (p.first == 1) | |
1117 | continue; | |
1118 | dout(20) << __func__ << " op_file_update " << p.second->fnode << dendl; | |
1119 | assert(p.first > 1); | |
1120 | t->op_file_update(p.second->fnode); | |
1121 | } | |
1122 | for (auto& p : dir_map) { | |
1123 | dout(20) << __func__ << " op_dir_create " << p.first << dendl; | |
1124 | t->op_dir_create(p.first); | |
1125 | for (auto& q : p.second->file_map) { | |
1126 | dout(20) << __func__ << " op_dir_link " << p.first << "/" << q.first | |
1127 | << " to " << q.second->fnode.ino << dendl; | |
1128 | t->op_dir_link(p.first, q.first, q.second->fnode.ino); | |
1129 | } | |
1130 | } | |
1131 | } | |
1132 | ||
/**
 * Synchronous log compaction.
 *
 * Encodes a transaction describing the complete in-memory metadata, swaps
 * the log file's extents for freshly allocated ones, writes the encoded
 * transaction there, then writes a new superblock pointing at the new log
 * extents.  The previous extents are queued in pending_release at the end.
 */
void BlueFS::_compact_log_sync()
{
  dout(10) << __func__ << dendl;
  File *log_file = log_writer->file.get();

  // clear out log (be careful who calls us!!!)
  log_t.clear();

  // build one transaction holding the full metadata dump
  bluefs_transaction_t t;
  _compact_log_dump_metadata(&t);

  dout(20) << __func__ << " op_jump_seq " << log_seq << dendl;
  t.op_jump_seq(log_seq);

  bufferlist bl;
  ::encode(t, bl);
  _pad_bl(bl);

  // space for the compacted log plus the configured runway
  uint64_t need = bl.length() + cct->_conf->bluefs_max_log_runway;
  dout(20) << __func__ << " need " << need << dendl;

  // take the current extents out of the log fnode and allocate new ones
  mempool::bluefs::vector<bluefs_extent_t> old_extents;
  uint64_t old_allocated = 0;
  log_file->fnode.swap_extents(old_extents, old_allocated);
  while (log_file->fnode.get_allocated() < need) {
    int r = _allocate(log_file->fnode.prefer_bdev,
                      need - log_file->fnode.get_allocated(),
                      &log_file->fnode);
    assert(r == 0);
  }

  _close_writer(log_writer);

  // write the compacted log through a new writer on the same File
  log_file->fnode.size = bl.length();
  log_writer = _create_writer(log_file);
  log_writer->append(bl);
  int r = _flush(log_writer, true);
  assert(r == 0);
  wait_for_aio(log_writer);

  // retire the completed aios only after the device flush
  list<aio_t> completed_ios;
  _claim_completed_aios(log_writer, &completed_ios);
  flush_bdev();
  completed_ios.clear();

  // publish the new log extents via the superblock
  dout(10) << __func__ << " writing super" << dendl;
  super.log_fnode = log_file->fnode;
  ++super.version;
  _write_super();
  flush_bdev();

  // the old log space is queued for release only after the superblock
  // write and flush above
  dout(10) << __func__ << " release old log extents " << old_extents << dendl;
  for (auto& r : old_extents) {
    pending_release[r.bdev].insert(r.offset, r.length);
  }

  logger->inc(l_bluefs_log_compactions);
}
1191 | ||
/*
 * 1. Allocate a new extent to continue the log, and then log an event
 * that jumps the log write position to the new extent.  At this point, the
 * old extent(s) won't be written to, and reflect everything to compact.
 * New events will be written to the new region that we'll keep.
 *
 * 2. While still holding the lock, encode a bufferlist that dumps all of the
 * in-memory fnodes and names.  This will become the new beginning of the
 * log.  The last event will jump to the log continuation extent from #1.
 *
 * 3. Queue a write to a new extent for the new beginning of the log.
 *
 * 4. Drop lock and wait
 *
 * 5. Retake the lock.
 *
 * 6. Update the log_fnode to splice in the new beginning.
 *
 * 7. Write the new superblock.
 *
 * 8. Release the old log space.  Clean up.
 *
 * @param l  lock object passed in by the caller; it is handed to
 *           log_cond.wait() and _flush_and_sync_log() below.
 */
void BlueFS::_compact_log_async(std::unique_lock<std::mutex>& l)
{
  dout(10) << __func__ << dendl;
  File *log_file = log_writer->file.get();
  assert(!new_log);
  assert(!new_log_writer);

  // create a new log [writer] so that we know compaction is in progress
  // (see _should_compact_log)
  new_log = new File;
  new_log->fnode.ino = 0;   // so that _flush_range won't try to log the fnode

  // 0. wait for any racing flushes to complete.  (We do not want to block
  // in _flush_sync_log with jump_to set or else a racing thread might flush
  // our entries and our jump_to update won't be correct.)
  while (log_flushing) {
    dout(10) << __func__ << " log is currently flushing, waiting" << dendl;
    log_cond.wait(l);
  }

  // 1. allocate new log space and jump to it.
  old_log_jump_to = log_file->fnode.get_allocated();
  uint64_t need = old_log_jump_to + cct->_conf->bluefs_max_log_runway;
  dout(10) << __func__ << " old_log_jump_to 0x" << std::hex << old_log_jump_to
           << " need 0x" << need << std::dec << dendl;
  while (log_file->fnode.get_allocated() < need) {
    int r = _allocate(log_file->fnode.prefer_bdev,
                      cct->_conf->bluefs_max_log_runway,
                      &log_file->fnode);
    assert(r == 0);
  }
  dout(10) << __func__ << " log extents " << log_file->fnode.extents << dendl;

  // update the log file change and log a jump to the offset where we want to
  // write the new entries
  log_t.op_file_update(log_file->fnode);
  log_t.op_jump(log_seq, old_log_jump_to);

  flush_bdev();  // FIXME?

  _flush_and_sync_log(l, 0, old_log_jump_to);

  // 2. prepare compacted log
  bluefs_transaction_t t;
  // avoid recording the same changes twice, once via log_t and once via
  // _compact_log_dump_metadata.
  log_t.clear();
  _compact_log_dump_metadata(&t);

  // conservative estimate for final encoded size
  new_log_jump_to = ROUND_UP_TO(t.op_bl.length() + super.block_size * 2,
                                cct->_conf->bluefs_alloc_size);
  t.op_jump(log_seq, new_log_jump_to);

  bufferlist bl;
  ::encode(t, bl);
  _pad_bl(bl);

  dout(10) << __func__ << " new_log_jump_to 0x" << std::hex << new_log_jump_to
           << std::dec << dendl;

  // allocate
  int r = _allocate(BlueFS::BDEV_DB, new_log_jump_to,
                    &new_log->fnode);
  assert(r == 0);
  new_log_writer = _create_writer(new_log);
  new_log_writer->append(bl);

  // 3. flush
  r = _flush(new_log_writer, true);
  assert(r == 0);
  // NOTE(review): this unlocks the member 'lock' directly rather than going
  // through 'l' (compact_log() constructs 'l' over this same mutex), so 'l'
  // still considers itself the owner while we wait -- confirm this is
  // intended.
  lock.unlock();

  // 4. wait
  dout(10) << __func__ << " waiting for compacted log to sync" << dendl;
  wait_for_aio(new_log_writer);

  list<aio_t> completed_ios;
  _claim_completed_aios(new_log_writer, &completed_ios);
  flush_bdev();
  completed_ios.clear();

  // 5. retake lock
  lock.lock();

  // 6. update our log fnode
  // discard first old_log_jump_to extents
  dout(10) << __func__ << " remove 0x" << std::hex << old_log_jump_to << std::dec
           << " of " << log_file->fnode.extents << dendl;
  uint64_t discarded = 0;
  mempool::bluefs::vector<bluefs_extent_t> old_extents;
  while (discarded < old_log_jump_to) {
    assert(!log_file->fnode.extents.empty());
    bluefs_extent_t& e = log_file->fnode.extents.front();
    // copy of the front extent; ends up describing the part being dropped
    bluefs_extent_t temp = e;
    if (discarded + e.length <= old_log_jump_to) {
      // the whole extent lies before the jump point
      dout(10) << __func__ << " remove old log extent " << e << dendl;
      discarded += e.length;
      log_file->fnode.pop_front_extent();
    } else {
      // the jump point falls inside this extent: drop only its front part
      dout(10) << __func__ << " remove front of old log extent " << e << dendl;
      uint64_t drop = old_log_jump_to - discarded;
      temp.length = drop;
      e.offset += drop;
      e.length -= drop;
      discarded += drop;
      dout(10) << __func__ << " kept " << e << " removed " << temp << dendl;
    }
    old_extents.push_back(temp);
  }
  // append what remains of the old log to the compacted prefix in new_log
  auto from = log_file->fnode.extents.begin();
  auto to = log_file->fnode.extents.end();
  while (from != to) {
    new_log->fnode.append_extent(*from);
    ++from;
  }

  // clear the extents from old log file, they are added to new log
  log_file->fnode.clear_extents();
  // swap the log files. New log file is the log file now.
  new_log->fnode.swap_extents(log_file->fnode);

  // shift the write position by the difference between the two jump offsets
  log_writer->pos = log_writer->file->fnode.size =
    log_writer->pos - old_log_jump_to + new_log_jump_to;

  // 7. write the super block to reflect the changes
  dout(10) << __func__ << " writing super" << dendl;
  super.log_fnode = log_file->fnode;
  ++super.version;
  _write_super();

  // the device flush is done with the lock dropped
  lock.unlock();
  flush_bdev();
  lock.lock();

  // 8. release old space
  dout(10) << __func__ << " release old log extents " << old_extents << dendl;
  for (auto& r : old_extents) {
    pending_release[r.bdev].insert(r.offset, r.length);
  }

  // delete the new log, remove from the dirty files list
  _close_writer(new_log_writer);
  if (new_log->dirty_seq) {
    assert(dirty_files.count(new_log->dirty_seq));
    auto it = dirty_files[new_log->dirty_seq].iterator_to(*new_log);
    dirty_files[new_log->dirty_seq].erase(it);
  }
  new_log_writer = nullptr;
  new_log = nullptr;
  // wake anyone waiting on log_cond (e.g. the "waiting for async compaction"
  // loop in _flush_and_sync_log)
  log_cond.notify_all();

  dout(10) << __func__ << " log extents " << log_file->fnode.extents << dendl;
  logger->inc(l_bluefs_log_compactions);
}
1368 | ||
1369 | void BlueFS::_pad_bl(bufferlist& bl) | |
1370 | { | |
1371 | uint64_t partial = bl.length() % super.block_size; | |
1372 | if (partial) { | |
1373 | dout(10) << __func__ << " padding with 0x" << std::hex | |
1374 | << super.block_size - partial << " zeros" << std::dec << dendl; | |
1375 | bl.append_zero(super.block_size - partial); | |
1376 | } | |
1377 | } | |
1378 | ||
1379 | void BlueFS::flush_log() | |
1380 | { | |
1381 | std::unique_lock<std::mutex> l(lock); | |
1382 | flush_bdev(); | |
1383 | _flush_and_sync_log(l); | |
1384 | } | |
1385 | ||
/**
 * Write the pending log transaction (log_t) plus the fnodes of files
 * dirtied for the new sequence number to the log, and make it stable.
 *
 * @param l         lock object used when waiting on log_cond
 * @param want_seq  if non-zero, return early when log_seq_stable has
 *                  already reached this sequence
 * @param jump_to   if non-zero, move the log writer position (and the log
 *                  file size) to this offset after queueing the flush; the
 *                  early-exit paths assert that it is zero
 * @return 0 (failures of the helpers called here are asserted on)
 */
int BlueFS::_flush_and_sync_log(std::unique_lock<std::mutex>& l,
                                uint64_t want_seq,
                                uint64_t jump_to)
{
  // only one flush at a time; wait for a concurrent one to finish
  while (log_flushing) {
    dout(10) << __func__ << " want_seq " << want_seq
             << " log is currently flushing, waiting" << dendl;
    assert(!jump_to);
    log_cond.wait(l);
  }
  if (want_seq && want_seq <= log_seq_stable) {
    dout(10) << __func__ << " want_seq " << want_seq << " <= log_seq_stable "
             << log_seq_stable << ", done" << dendl;
    assert(!jump_to);
    return 0;
  }
  if (log_t.empty() && dirty_files.empty()) {
    dout(10) << __func__ << " want_seq " << want_seq
             << " " << log_t << " not dirty, dirty_files empty, no-op" << dendl;
    assert(!jump_to);
    return 0;
  }

  // claim the next sequence number for this transaction
  uint64_t seq = log_t.seq = ++log_seq;
  assert(want_seq == 0 || want_seq <= seq);
  log_t.uuid = super.uuid;

  // log dirty files
  auto lsi = dirty_files.find(seq);
  if (lsi != dirty_files.end()) {
    dout(20) << __func__ << " " << lsi->second.size() << " dirty_files" << dendl;
    for (auto &f : lsi->second) {
      dout(20) << __func__ << " op_file_update " << f.fnode << dendl;
      log_t.op_file_update(f.fnode);
    }
  }

  dout(10) << __func__ << " " << log_t << dendl;
  assert(!log_t.empty());

  // allocate some more space (before we run out)?
  int64_t runway = log_writer->file->fnode.get_allocated() -
    log_writer->get_effective_write_pos();
  if (runway < (int64_t)cct->_conf->bluefs_min_log_runway) {
    dout(10) << __func__ << " allocating more log runway (0x"
             << std::hex << runway << std::dec << " remaining)" << dendl;
    // do not extend the log while an async compaction has a writer open
    while (new_log_writer) {
      dout(10) << __func__ << " waiting for async compaction" << dendl;
      log_cond.wait(l);
    }
    int r = _allocate(log_writer->file->fnode.prefer_bdev,
                      cct->_conf->bluefs_max_log_runway,
                      &log_writer->file->fnode);
    assert(r == 0);
    // record the grown log fnode in this same transaction
    log_t.op_file_update(log_writer->file->fnode);
  }

  bufferlist bl;
  ::encode(log_t, bl);

  // pad to block boundary
  _pad_bl(bl);
  logger->inc(l_bluefs_logged_bytes, bl.length());

  log_writer->append(bl);

  log_t.clear();
  log_t.seq = 0;  // just so debug output is less confusing
  log_flushing = true;

  int r = _flush(log_writer, true);
  assert(r == 0);

  if (jump_to) {
    dout(10) << __func__ << " jumping log offset from 0x" << std::hex
             << log_writer->pos << " -> 0x" << jump_to << std::dec << dendl;
    log_writer->pos = jump_to;
    log_writer->file->fnode.size = jump_to;
  }

  _flush_bdev_safely(log_writer);

  // flush finished; let waiters on log_cond proceed
  log_flushing = false;
  log_cond.notify_all();

  // clean dirty files
  if (seq > log_seq_stable) {
    log_seq_stable = seq;
    dout(20) << __func__ << " log_seq_stable " << log_seq_stable << dendl;

    // walk dirty_files from the front, emptying every bucket whose
    // sequence is now stable
    auto p = dirty_files.begin();
    while (p != dirty_files.end()) {
      if (p->first > log_seq_stable) {
        dout(20) << __func__ << " done cleaning up dirty files" << dendl;
        break;
      }

      // note: this local 'l' shadows the lock parameter within the loop
      auto l = p->second.begin();
      while (l != p->second.end()) {
        File *file = &*l;
        assert(file->dirty_seq > 0);
        assert(file->dirty_seq <= log_seq_stable);
        dout(20) << __func__ << " cleaned file " << file->fnode << dendl;
        file->dirty_seq = 0;
        p->second.erase(l++);
      }

      assert(p->second.empty());
      dirty_files.erase(p++);
    }
  } else {
    dout(20) << __func__ << " log_seq_stable " << log_seq_stable
             << " already >= out seq " << seq
             << ", we lost a race against another log flush, done" << dendl;
  }
  _update_logger_stats();

  return 0;
}
1505 | ||
1506 | int BlueFS::_flush_range(FileWriter *h, uint64_t offset, uint64_t length) | |
1507 | { | |
1508 | dout(10) << __func__ << " " << h << " pos 0x" << std::hex << h->pos | |
1509 | << " 0x" << offset << "~" << length << std::dec | |
1510 | << " to " << h->file->fnode << dendl; | |
1511 | assert(!h->file->deleted); | |
1512 | assert(h->file->num_readers.load() == 0); | |
1513 | ||
1514 | h->buffer_appender.flush(); | |
1515 | ||
1516 | bool buffered; | |
1517 | if (h->file->fnode.ino == 1) | |
1518 | buffered = false; | |
1519 | else | |
1520 | buffered = cct->_conf->bluefs_buffered_io; | |
1521 | ||
1522 | if (offset + length <= h->pos) | |
1523 | return 0; | |
1524 | if (offset < h->pos) { | |
1525 | length -= h->pos - offset; | |
1526 | offset = h->pos; | |
1527 | dout(10) << " still need 0x" | |
1528 | << std::hex << offset << "~" << length << std::dec | |
1529 | << dendl; | |
1530 | } | |
1531 | assert(offset <= h->file->fnode.size); | |
1532 | ||
1533 | uint64_t allocated = h->file->fnode.get_allocated(); | |
1534 | ||
1535 | // do not bother to dirty the file if we are overwriting | |
1536 | // previously allocated extents. | |
1537 | bool must_dirty = false; | |
1538 | if (allocated < offset + length) { | |
1539 | // we should never run out of log space here; see the min runway check | |
1540 | // in _flush_and_sync_log. | |
1541 | assert(h->file->fnode.ino != 1); | |
1542 | int r = _allocate(h->file->fnode.prefer_bdev, | |
1543 | offset + length - allocated, | |
94b18763 | 1544 | &h->file->fnode); |
7c673cae FG |
1545 | if (r < 0) { |
1546 | derr << __func__ << " allocated: 0x" << std::hex << allocated | |
1547 | << " offset: 0x" << offset << " length: 0x" << length << std::dec | |
1548 | << dendl; | |
3efd9988 | 1549 | assert(0 == "bluefs enospc"); |
7c673cae FG |
1550 | return r; |
1551 | } | |
7c673cae FG |
1552 | if (cct->_conf->bluefs_preextend_wal_files && |
1553 | h->writer_type == WRITER_WAL) { | |
1554 | // NOTE: this *requires* that rocksdb also has log recycling | |
1555 | // enabled and is therefore doing robust CRCs on the log | |
1556 | // records. otherwise, we will fail to reply the rocksdb log | |
1557 | // properly due to garbage on the device. | |
1558 | h->file->fnode.size = h->file->fnode.get_allocated(); | |
1559 | dout(10) << __func__ << " extending WAL size to 0x" << std::hex | |
1560 | << h->file->fnode.size << std::dec << " to include allocated" | |
1561 | << dendl; | |
1562 | } | |
1563 | must_dirty = true; | |
1564 | } | |
1565 | if (h->file->fnode.size < offset + length) { | |
1566 | h->file->fnode.size = offset + length; | |
1567 | if (h->file->fnode.ino > 1) { | |
1568 | // we do not need to dirty the log file (or it's compacting | |
1569 | // replacement) when the file size changes because replay is | |
1570 | // smart enough to discover it on its own. | |
1571 | must_dirty = true; | |
1572 | } | |
1573 | } | |
1574 | if (must_dirty) { | |
1575 | h->file->fnode.mtime = ceph_clock_now(); | |
1576 | assert(h->file->fnode.ino >= 1); | |
1577 | if (h->file->dirty_seq == 0) { | |
1578 | h->file->dirty_seq = log_seq + 1; | |
1579 | dirty_files[h->file->dirty_seq].push_back(*h->file); | |
1580 | dout(20) << __func__ << " dirty_seq = " << log_seq + 1 | |
1581 | << " (was clean)" << dendl; | |
1582 | } else { | |
1583 | if (h->file->dirty_seq != log_seq + 1) { | |
1584 | // need re-dirty, erase from list first | |
1585 | assert(dirty_files.count(h->file->dirty_seq)); | |
1586 | auto it = dirty_files[h->file->dirty_seq].iterator_to(*h->file); | |
1587 | dirty_files[h->file->dirty_seq].erase(it); | |
1588 | h->file->dirty_seq = log_seq + 1; | |
1589 | dirty_files[h->file->dirty_seq].push_back(*h->file); | |
1590 | dout(20) << __func__ << " dirty_seq = " << log_seq + 1 | |
1591 | << " (was " << h->file->dirty_seq << ")" << dendl; | |
1592 | } else { | |
1593 | dout(20) << __func__ << " dirty_seq = " << log_seq + 1 | |
1594 | << " (unchanged, do nothing) " << dendl; | |
1595 | } | |
1596 | } | |
1597 | } | |
1598 | dout(20) << __func__ << " file now " << h->file->fnode << dendl; | |
1599 | ||
1600 | uint64_t x_off = 0; | |
1601 | auto p = h->file->fnode.seek(offset, &x_off); | |
1602 | assert(p != h->file->fnode.extents.end()); | |
1603 | dout(20) << __func__ << " in " << *p << " x_off 0x" | |
1604 | << std::hex << x_off << std::dec << dendl; | |
1605 | ||
1606 | unsigned partial = x_off & ~super.block_mask(); | |
1607 | bufferlist bl; | |
1608 | if (partial) { | |
1609 | dout(20) << __func__ << " using partial tail 0x" | |
1610 | << std::hex << partial << std::dec << dendl; | |
1611 | assert(h->tail_block.length() == partial); | |
31f18b77 | 1612 | bl.claim_append_piecewise(h->tail_block); |
7c673cae FG |
1613 | x_off -= partial; |
1614 | offset -= partial; | |
1615 | length += partial; | |
1616 | dout(20) << __func__ << " waiting for previous aio to complete" << dendl; | |
1617 | for (auto p : h->iocv) { | |
1618 | if (p) { | |
1619 | p->aio_wait(); | |
1620 | } | |
1621 | } | |
1622 | } | |
1623 | if (length == partial + h->buffer.length()) { | |
31f18b77 | 1624 | bl.claim_append_piecewise(h->buffer); |
7c673cae FG |
1625 | } else { |
1626 | bufferlist t; | |
31f18b77 FG |
1627 | h->buffer.splice(0, length, &t); |
1628 | bl.claim_append_piecewise(t); | |
7c673cae FG |
1629 | t.substr_of(h->buffer, length, h->buffer.length() - length); |
1630 | h->buffer.swap(t); | |
1631 | dout(20) << " leaving 0x" << std::hex << h->buffer.length() << std::dec | |
1632 | << " unflushed" << dendl; | |
1633 | } | |
1634 | assert(bl.length() == length); | |
1635 | ||
1636 | switch (h->writer_type) { | |
1637 | case WRITER_WAL: | |
1638 | logger->inc(l_bluefs_bytes_written_wal, length); | |
1639 | break; | |
1640 | case WRITER_SST: | |
1641 | logger->inc(l_bluefs_bytes_written_sst, length); | |
1642 | break; | |
1643 | } | |
1644 | ||
1645 | dout(30) << "dump:\n"; | |
1646 | bl.hexdump(*_dout); | |
1647 | *_dout << dendl; | |
1648 | ||
1649 | h->pos = offset + length; | |
1650 | h->tail_block.clear(); | |
1651 | ||
1652 | uint64_t bloff = 0; | |
1653 | while (length > 0) { | |
1654 | uint64_t x_len = MIN(p->length - x_off, length); | |
1655 | bufferlist t; | |
1656 | t.substr_of(bl, bloff, x_len); | |
1657 | unsigned tail = x_len & ~super.block_mask(); | |
1658 | if (tail) { | |
1659 | size_t zlen = super.block_size - tail; | |
1660 | dout(20) << __func__ << " caching tail of 0x" | |
1661 | << std::hex << tail | |
1662 | << " and padding block with 0x" << zlen | |
1663 | << std::dec << dendl; | |
1664 | h->tail_block.substr_of(bl, bl.length() - tail, tail); | |
1665 | if (h->file->fnode.ino > 1) { | |
1666 | // we are using the page_aligned_appender, and can safely use | |
1667 | // the tail of the raw buffer. | |
1668 | const bufferptr &last = t.back(); | |
1669 | if (last.unused_tail_length() < zlen) { | |
1670 | derr << " wtf, last is " << last << " from " << t << dendl; | |
1671 | assert(last.unused_tail_length() >= zlen); | |
1672 | } | |
1673 | bufferptr z = last; | |
1674 | z.set_offset(last.offset() + last.length()); | |
1675 | z.set_length(zlen); | |
1676 | z.zero(); | |
1677 | t.append(z, 0, zlen); | |
1678 | } else { | |
1679 | t.append_zero(zlen); | |
1680 | } | |
1681 | } | |
1682 | if (cct->_conf->bluefs_sync_write) { | |
1683 | bdev[p->bdev]->write(p->offset + x_off, t, buffered); | |
1684 | } else { | |
1685 | bdev[p->bdev]->aio_write(p->offset + x_off, t, h->iocv[p->bdev], buffered); | |
1686 | } | |
1687 | bloff += x_len; | |
1688 | length -= x_len; | |
1689 | ++p; | |
1690 | x_off = 0; | |
1691 | } | |
1692 | for (unsigned i = 0; i < MAX_BDEV; ++i) { | |
1693 | if (bdev[i]) { | |
1694 | assert(h->iocv[i]); | |
1695 | if (h->iocv[i]->has_pending_aios()) { | |
1696 | bdev[i]->aio_submit(h->iocv[i]); | |
1697 | } | |
1698 | } | |
1699 | } | |
1700 | dout(20) << __func__ << " h " << h << " pos now 0x" | |
1701 | << std::hex << h->pos << std::dec << dendl; | |
1702 | return 0; | |
1703 | } | |
1704 | ||
1705 | // we need to retire old completed aios so they don't stick around in | |
1706 | // memory indefinitely (along with their bufferlist refs). | |
1707 | void BlueFS::_claim_completed_aios(FileWriter *h, list<aio_t> *ls) | |
1708 | { | |
1709 | for (auto p : h->iocv) { | |
1710 | if (p) { | |
1711 | ls->splice(ls->end(), p->running_aios); | |
1712 | } | |
1713 | } | |
1714 | dout(10) << __func__ << " got " << ls->size() << " aios" << dendl; | |
1715 | } | |
1716 | ||
1717 | void BlueFS::wait_for_aio(FileWriter *h) | |
1718 | { | |
1719 | // NOTE: this is safe to call without a lock, as long as our reference is | |
1720 | // stable. | |
1721 | dout(10) << __func__ << " " << h << dendl; | |
1722 | utime_t start = ceph_clock_now(); | |
1723 | for (auto p : h->iocv) { | |
1724 | if (p) { | |
1725 | p->aio_wait(); | |
1726 | } | |
1727 | } | |
1728 | utime_t end = ceph_clock_now(); | |
1729 | utime_t dur = end - start; | |
1730 | dout(10) << __func__ << " " << h << " done in " << dur << dendl; | |
1731 | } | |
1732 | ||
1733 | int BlueFS::_flush(FileWriter *h, bool force) | |
1734 | { | |
1735 | h->buffer_appender.flush(); | |
1736 | uint64_t length = h->buffer.length(); | |
1737 | uint64_t offset = h->pos; | |
1738 | if (!force && | |
1739 | length < cct->_conf->bluefs_min_flush_size) { | |
1740 | dout(10) << __func__ << " " << h << " ignoring, length " << length | |
1741 | << " < min_flush_size " << cct->_conf->bluefs_min_flush_size | |
1742 | << dendl; | |
1743 | return 0; | |
1744 | } | |
1745 | if (length == 0) { | |
1746 | dout(10) << __func__ << " " << h << " no dirty data on " | |
1747 | << h->file->fnode << dendl; | |
1748 | return 0; | |
1749 | } | |
1750 | dout(10) << __func__ << " " << h << " 0x" | |
1751 | << std::hex << offset << "~" << length << std::dec | |
1752 | << " to " << h->file->fnode << dendl; | |
1753 | assert(h->pos <= h->file->fnode.size); | |
1754 | return _flush_range(h, offset, length); | |
1755 | } | |
1756 | ||
1757 | int BlueFS::_truncate(FileWriter *h, uint64_t offset) | |
1758 | { | |
1759 | dout(10) << __func__ << " 0x" << std::hex << offset << std::dec | |
1760 | << " file " << h->file->fnode << dendl; | |
1761 | if (h->file->deleted) { | |
1762 | dout(10) << __func__ << " deleted, no-op" << dendl; | |
1763 | return 0; | |
1764 | } | |
1765 | ||
1766 | // we never truncate internal log files | |
1767 | assert(h->file->fnode.ino > 1); | |
1768 | ||
1769 | h->buffer_appender.flush(); | |
1770 | ||
1771 | // truncate off unflushed data? | |
1772 | if (h->pos < offset && | |
1773 | h->pos + h->buffer.length() > offset) { | |
1774 | bufferlist t; | |
1775 | dout(20) << __func__ << " tossing out last " << offset - h->pos | |
1776 | << " unflushed bytes" << dendl; | |
1777 | t.substr_of(h->buffer, 0, offset - h->pos); | |
1778 | h->buffer.swap(t); | |
1779 | assert(0 == "actually this shouldn't happen"); | |
1780 | } | |
1781 | if (h->buffer.length()) { | |
1782 | int r = _flush(h, true); | |
1783 | if (r < 0) | |
1784 | return r; | |
1785 | } | |
1786 | if (offset == h->file->fnode.size) { | |
1787 | return 0; // no-op! | |
1788 | } | |
1789 | if (offset > h->file->fnode.size) { | |
1790 | assert(0 == "truncate up not supported"); | |
1791 | } | |
1792 | assert(h->file->fnode.size >= offset); | |
1793 | h->file->fnode.size = offset; | |
1794 | log_t.op_file_update(h->file->fnode); | |
1795 | return 0; | |
1796 | } | |
1797 | ||
1798 | int BlueFS::_fsync(FileWriter *h, std::unique_lock<std::mutex>& l) | |
1799 | { | |
1800 | dout(10) << __func__ << " " << h << " " << h->file->fnode << dendl; | |
1801 | int r = _flush(h, true); | |
1802 | if (r < 0) | |
1803 | return r; | |
1804 | uint64_t old_dirty_seq = h->file->dirty_seq; | |
1805 | ||
1806 | _flush_bdev_safely(h); | |
1807 | ||
1808 | if (old_dirty_seq) { | |
1809 | uint64_t s = log_seq; | |
1810 | dout(20) << __func__ << " file metadata was dirty (" << old_dirty_seq | |
1811 | << ") on " << h->file->fnode << ", flushing log" << dendl; | |
1812 | _flush_and_sync_log(l, old_dirty_seq); | |
1813 | assert(h->file->dirty_seq == 0 || // cleaned | |
1814 | h->file->dirty_seq > s); // or redirtied by someone else | |
1815 | } | |
1816 | return 0; | |
1817 | } | |
1818 | ||
1819 | void BlueFS::_flush_bdev_safely(FileWriter *h) | |
1820 | { | |
1821 | if (!cct->_conf->bluefs_sync_write) { | |
1822 | list<aio_t> completed_ios; | |
1823 | _claim_completed_aios(h, &completed_ios); | |
1824 | lock.unlock(); | |
1825 | wait_for_aio(h); | |
1826 | completed_ios.clear(); | |
1827 | flush_bdev(); | |
1828 | lock.lock(); | |
1829 | } else { | |
1830 | lock.unlock(); | |
1831 | flush_bdev(); | |
1832 | lock.lock(); | |
1833 | } | |
1834 | } | |
1835 | ||
1836 | void BlueFS::flush_bdev() | |
1837 | { | |
1838 | // NOTE: this is safe to call without a lock. | |
1839 | dout(20) << __func__ << dendl; | |
1840 | for (auto p : bdev) { | |
1841 | if (p) | |
1842 | p->flush(); | |
1843 | } | |
1844 | } | |
1845 | ||
1846 | int BlueFS::_allocate(uint8_t id, uint64_t len, | |
94b18763 | 1847 | bluefs_fnode_t* node) |
7c673cae FG |
1848 | { |
1849 | dout(10) << __func__ << " len 0x" << std::hex << len << std::dec | |
1850 | << " from " << (int)id << dendl; | |
1851 | assert(id < alloc.size()); | |
1852 | uint64_t min_alloc_size = cct->_conf->bluefs_alloc_size; | |
1853 | ||
1854 | uint64_t left = ROUND_UP_TO(len, min_alloc_size); | |
1855 | int r = -ENOSPC; | |
b32b8144 FG |
1856 | int64_t alloc_len = 0; |
1857 | AllocExtentVector extents; | |
1858 | ||
7c673cae FG |
1859 | if (alloc[id]) { |
1860 | r = alloc[id]->reserve(left); | |
1861 | } | |
b32b8144 FG |
1862 | |
1863 | if (r == 0) { | |
1864 | uint64_t hint = 0; | |
94b18763 FG |
1865 | if (!node->extents.empty() && node->extents.back().bdev == id) { |
1866 | hint = node->extents.back().end(); | |
1867 | } | |
b32b8144 FG |
1868 | extents.reserve(4); // 4 should be (more than) enough for most allocations |
1869 | alloc_len = alloc[id]->allocate(left, min_alloc_size, hint, &extents); | |
1870 | } | |
1871 | if (r < 0 || (alloc_len < (int64_t)left)) { | |
1872 | if (r == 0) { | |
1873 | alloc[id]->unreserve(left - alloc_len); | |
1874 | for (auto& p : extents) { | |
1875 | alloc[id]->release(p.offset, p.length); | |
1876 | } | |
1877 | } | |
7c673cae FG |
1878 | if (id != BDEV_SLOW) { |
1879 | if (bdev[id]) { | |
1880 | dout(1) << __func__ << " failed to allocate 0x" << std::hex << left | |
1881 | << " on bdev " << (int)id | |
1882 | << ", free 0x" << alloc[id]->get_free() | |
1883 | << "; fallback to bdev " << (int)id + 1 | |
1884 | << std::dec << dendl; | |
1885 | } | |
94b18763 | 1886 | return _allocate(id + 1, len, node); |
7c673cae FG |
1887 | } |
1888 | if (bdev[id]) | |
1889 | derr << __func__ << " failed to allocate 0x" << std::hex << left | |
1890 | << " on bdev " << (int)id | |
1891 | << ", free 0x" << alloc[id]->get_free() << std::dec << dendl; | |
1892 | else | |
1893 | derr << __func__ << " failed to allocate 0x" << std::hex << left | |
1894 | << " on bdev " << (int)id << ", dne" << std::dec << dendl; | |
b32b8144 FG |
1895 | if (alloc[id]) |
1896 | alloc[id]->dump(); | |
7c673cae FG |
1897 | return -ENOSPC; |
1898 | } | |
1899 | ||
1900 | for (auto& p : extents) { | |
94b18763 | 1901 | node->append_extent(bluefs_extent_t(id, p.offset, p.length)); |
7c673cae FG |
1902 | } |
1903 | ||
1904 | return 0; | |
1905 | } | |
1906 | ||
1907 | int BlueFS::_preallocate(FileRef f, uint64_t off, uint64_t len) | |
1908 | { | |
1909 | dout(10) << __func__ << " file " << f->fnode << " 0x" | |
1910 | << std::hex << off << "~" << len << std::dec << dendl; | |
1911 | if (f->deleted) { | |
1912 | dout(10) << __func__ << " deleted, no-op" << dendl; | |
1913 | return 0; | |
1914 | } | |
1915 | assert(f->fnode.ino > 1); | |
1916 | uint64_t allocated = f->fnode.get_allocated(); | |
1917 | if (off + len > allocated) { | |
1918 | uint64_t want = off + len - allocated; | |
94b18763 | 1919 | int r = _allocate(f->fnode.prefer_bdev, want, &f->fnode); |
7c673cae FG |
1920 | if (r < 0) |
1921 | return r; | |
7c673cae FG |
1922 | log_t.op_file_update(f->fnode); |
1923 | } | |
1924 | return 0; | |
1925 | } | |
1926 | ||
1927 | void BlueFS::sync_metadata() | |
1928 | { | |
1929 | std::unique_lock<std::mutex> l(lock); | |
1930 | if (log_t.empty()) { | |
1931 | dout(10) << __func__ << " - no pending log events" << dendl; | |
1932 | return; | |
1933 | } | |
1934 | dout(10) << __func__ << dendl; | |
1935 | utime_t start = ceph_clock_now(); | |
1936 | vector<interval_set<uint64_t>> to_release(pending_release.size()); | |
1937 | to_release.swap(pending_release); | |
1938 | flush_bdev(); // FIXME? | |
1939 | _flush_and_sync_log(l); | |
1940 | for (unsigned i = 0; i < to_release.size(); ++i) { | |
1941 | for (auto p = to_release[i].begin(); p != to_release[i].end(); ++p) { | |
1942 | alloc[i]->release(p.get_start(), p.get_len()); | |
1943 | } | |
1944 | } | |
1945 | ||
1946 | if (_should_compact_log()) { | |
1947 | if (cct->_conf->bluefs_compact_log_sync) { | |
1948 | _compact_log_sync(); | |
1949 | } else { | |
1950 | _compact_log_async(l); | |
1951 | } | |
1952 | } | |
1953 | ||
1954 | utime_t end = ceph_clock_now(); | |
1955 | utime_t dur = end - start; | |
1956 | dout(10) << __func__ << " done in " << dur << dendl; | |
1957 | } | |
1958 | ||
1959 | int BlueFS::open_for_write( | |
1960 | const string& dirname, | |
1961 | const string& filename, | |
1962 | FileWriter **h, | |
1963 | bool overwrite) | |
1964 | { | |
1965 | std::lock_guard<std::mutex> l(lock); | |
1966 | dout(10) << __func__ << " " << dirname << "/" << filename << dendl; | |
1967 | map<string,DirRef>::iterator p = dir_map.find(dirname); | |
1968 | DirRef dir; | |
1969 | if (p == dir_map.end()) { | |
1970 | // implicitly create the dir | |
1971 | dout(20) << __func__ << " dir " << dirname | |
1972 | << " does not exist" << dendl; | |
1973 | return -ENOENT; | |
1974 | } else { | |
1975 | dir = p->second; | |
1976 | } | |
1977 | ||
1978 | FileRef file; | |
1979 | bool create = false; | |
1980 | map<string,FileRef>::iterator q = dir->file_map.find(filename); | |
1981 | if (q == dir->file_map.end()) { | |
1982 | if (overwrite) { | |
1983 | dout(20) << __func__ << " dir " << dirname << " (" << dir | |
1984 | << ") file " << filename | |
1985 | << " does not exist" << dendl; | |
1986 | return -ENOENT; | |
1987 | } | |
1988 | file = new File; | |
1989 | file->fnode.ino = ++ino_last; | |
1990 | file_map[ino_last] = file; | |
1991 | dir->file_map[filename] = file; | |
1992 | ++file->refs; | |
1993 | create = true; | |
1994 | } else { | |
1995 | // overwrite existing file? | |
1996 | file = q->second; | |
1997 | if (overwrite) { | |
1998 | dout(20) << __func__ << " dir " << dirname << " (" << dir | |
1999 | << ") file " << filename | |
2000 | << " already exists, overwrite in place" << dendl; | |
2001 | } else { | |
2002 | dout(20) << __func__ << " dir " << dirname << " (" << dir | |
2003 | << ") file " << filename | |
2004 | << " already exists, truncate + overwrite" << dendl; | |
2005 | file->fnode.size = 0; | |
2006 | for (auto& p : file->fnode.extents) { | |
2007 | pending_release[p.bdev].insert(p.offset, p.length); | |
2008 | } | |
94b18763 FG |
2009 | |
2010 | file->fnode.clear_extents(); | |
7c673cae FG |
2011 | } |
2012 | } | |
2013 | assert(file->fnode.ino > 1); | |
2014 | ||
2015 | file->fnode.mtime = ceph_clock_now(); | |
2016 | file->fnode.prefer_bdev = BlueFS::BDEV_DB; | |
2017 | if (dirname.length() > 5) { | |
2018 | // the "db.slow" and "db.wal" directory names are hard-coded at | |
2019 | // match up with bluestore. the slow device is always the second | |
2020 | // one (when a dedicated block.db device is present and used at | |
2021 | // bdev 0). the wal device is always last. | |
31f18b77 | 2022 | if (boost::algorithm::ends_with(dirname, ".slow")) { |
7c673cae FG |
2023 | file->fnode.prefer_bdev = BlueFS::BDEV_SLOW; |
2024 | } else if (boost::algorithm::ends_with(dirname, ".wal")) { | |
2025 | file->fnode.prefer_bdev = BlueFS::BDEV_WAL; | |
2026 | } | |
2027 | } | |
2028 | dout(20) << __func__ << " mapping " << dirname << "/" << filename | |
2029 | << " to bdev " << (int)file->fnode.prefer_bdev << dendl; | |
2030 | ||
2031 | log_t.op_file_update(file->fnode); | |
2032 | if (create) | |
2033 | log_t.op_dir_link(dirname, filename, file->fnode.ino); | |
2034 | ||
2035 | *h = _create_writer(file); | |
2036 | ||
2037 | if (boost::algorithm::ends_with(filename, ".log")) { | |
2038 | (*h)->writer_type = BlueFS::WRITER_WAL; | |
2039 | if (logger && !overwrite) { | |
2040 | logger->inc(l_bluefs_files_written_wal); | |
2041 | } | |
2042 | } else if (boost::algorithm::ends_with(filename, ".sst")) { | |
2043 | (*h)->writer_type = BlueFS::WRITER_SST; | |
2044 | if (logger) { | |
2045 | logger->inc(l_bluefs_files_written_sst); | |
2046 | } | |
2047 | } | |
2048 | ||
2049 | dout(10) << __func__ << " h " << *h << " on " << file->fnode << dendl; | |
2050 | return 0; | |
2051 | } | |
2052 | ||
2053 | BlueFS::FileWriter *BlueFS::_create_writer(FileRef f) | |
2054 | { | |
2055 | FileWriter *w = new FileWriter(f); | |
2056 | for (unsigned i = 0; i < MAX_BDEV; ++i) { | |
2057 | if (bdev[i]) { | |
2058 | w->iocv[i] = new IOContext(cct, NULL); | |
2059 | } else { | |
2060 | w->iocv[i] = NULL; | |
2061 | } | |
2062 | } | |
2063 | return w; | |
2064 | } | |
2065 | ||
2066 | void BlueFS::_close_writer(FileWriter *h) | |
2067 | { | |
2068 | dout(10) << __func__ << " " << h << " type " << h->writer_type << dendl; | |
2069 | for (unsigned i=0; i<MAX_BDEV; ++i) { | |
2070 | if (bdev[i]) { | |
2071 | assert(h->iocv[i]); | |
2072 | h->iocv[i]->aio_wait(); | |
2073 | bdev[i]->queue_reap_ioc(h->iocv[i]); | |
2074 | } | |
2075 | } | |
2076 | delete h; | |
2077 | } | |
2078 | ||
2079 | int BlueFS::open_for_read( | |
2080 | const string& dirname, | |
2081 | const string& filename, | |
2082 | FileReader **h, | |
2083 | bool random) | |
2084 | { | |
2085 | std::lock_guard<std::mutex> l(lock); | |
2086 | dout(10) << __func__ << " " << dirname << "/" << filename | |
2087 | << (random ? " (random)":" (sequential)") << dendl; | |
2088 | map<string,DirRef>::iterator p = dir_map.find(dirname); | |
2089 | if (p == dir_map.end()) { | |
2090 | dout(20) << __func__ << " dir " << dirname << " not found" << dendl; | |
2091 | return -ENOENT; | |
2092 | } | |
2093 | DirRef dir = p->second; | |
2094 | ||
2095 | map<string,FileRef>::iterator q = dir->file_map.find(filename); | |
2096 | if (q == dir->file_map.end()) { | |
2097 | dout(20) << __func__ << " dir " << dirname << " (" << dir | |
2098 | << ") file " << filename | |
2099 | << " not found" << dendl; | |
2100 | return -ENOENT; | |
2101 | } | |
2102 | File *file = q->second.get(); | |
2103 | ||
2104 | *h = new FileReader(file, random ? 4096 : cct->_conf->bluefs_max_prefetch, | |
2105 | random, false); | |
2106 | dout(10) << __func__ << " h " << *h << " on " << file->fnode << dendl; | |
2107 | return 0; | |
2108 | } | |
2109 | ||
2110 | int BlueFS::rename( | |
2111 | const string& old_dirname, const string& old_filename, | |
2112 | const string& new_dirname, const string& new_filename) | |
2113 | { | |
2114 | std::lock_guard<std::mutex> l(lock); | |
2115 | dout(10) << __func__ << " " << old_dirname << "/" << old_filename | |
2116 | << " -> " << new_dirname << "/" << new_filename << dendl; | |
2117 | map<string,DirRef>::iterator p = dir_map.find(old_dirname); | |
2118 | if (p == dir_map.end()) { | |
2119 | dout(20) << __func__ << " dir " << old_dirname << " not found" << dendl; | |
2120 | return -ENOENT; | |
2121 | } | |
2122 | DirRef old_dir = p->second; | |
2123 | map<string,FileRef>::iterator q = old_dir->file_map.find(old_filename); | |
2124 | if (q == old_dir->file_map.end()) { | |
2125 | dout(20) << __func__ << " dir " << old_dirname << " (" << old_dir | |
2126 | << ") file " << old_filename | |
2127 | << " not found" << dendl; | |
2128 | return -ENOENT; | |
2129 | } | |
2130 | FileRef file = q->second; | |
2131 | ||
2132 | p = dir_map.find(new_dirname); | |
2133 | if (p == dir_map.end()) { | |
2134 | dout(20) << __func__ << " dir " << new_dirname << " not found" << dendl; | |
2135 | return -ENOENT; | |
2136 | } | |
2137 | DirRef new_dir = p->second; | |
2138 | q = new_dir->file_map.find(new_filename); | |
2139 | if (q != new_dir->file_map.end()) { | |
2140 | dout(20) << __func__ << " dir " << new_dirname << " (" << old_dir | |
2141 | << ") file " << new_filename | |
2142 | << " already exists, unlinking" << dendl; | |
2143 | assert(q->second != file); | |
2144 | log_t.op_dir_unlink(new_dirname, new_filename); | |
2145 | _drop_link(q->second); | |
2146 | } | |
2147 | ||
2148 | dout(10) << __func__ << " " << new_dirname << "/" << new_filename << " " | |
2149 | << " " << file->fnode << dendl; | |
2150 | ||
2151 | new_dir->file_map[new_filename] = file; | |
2152 | old_dir->file_map.erase(old_filename); | |
2153 | ||
2154 | log_t.op_dir_link(new_dirname, new_filename, file->fnode.ino); | |
2155 | log_t.op_dir_unlink(old_dirname, old_filename); | |
2156 | return 0; | |
2157 | } | |
2158 | ||
2159 | int BlueFS::mkdir(const string& dirname) | |
2160 | { | |
2161 | std::lock_guard<std::mutex> l(lock); | |
2162 | dout(10) << __func__ << " " << dirname << dendl; | |
2163 | map<string,DirRef>::iterator p = dir_map.find(dirname); | |
2164 | if (p != dir_map.end()) { | |
2165 | dout(20) << __func__ << " dir " << dirname << " exists" << dendl; | |
2166 | return -EEXIST; | |
2167 | } | |
2168 | dir_map[dirname] = new Dir; | |
2169 | log_t.op_dir_create(dirname); | |
2170 | return 0; | |
2171 | } | |
2172 | ||
2173 | int BlueFS::rmdir(const string& dirname) | |
2174 | { | |
2175 | std::lock_guard<std::mutex> l(lock); | |
2176 | dout(10) << __func__ << " " << dirname << dendl; | |
2177 | map<string,DirRef>::iterator p = dir_map.find(dirname); | |
2178 | if (p == dir_map.end()) { | |
2179 | dout(20) << __func__ << " dir " << dirname << " does not exist" << dendl; | |
2180 | return -ENOENT; | |
2181 | } | |
2182 | DirRef dir = p->second; | |
2183 | if (!dir->file_map.empty()) { | |
2184 | dout(20) << __func__ << " dir " << dirname << " not empty" << dendl; | |
2185 | return -ENOTEMPTY; | |
2186 | } | |
2187 | dir_map.erase(dirname); | |
2188 | log_t.op_dir_remove(dirname); | |
2189 | return 0; | |
2190 | } | |
2191 | ||
2192 | bool BlueFS::dir_exists(const string& dirname) | |
2193 | { | |
2194 | std::lock_guard<std::mutex> l(lock); | |
2195 | map<string,DirRef>::iterator p = dir_map.find(dirname); | |
2196 | bool exists = p != dir_map.end(); | |
2197 | dout(10) << __func__ << " " << dirname << " = " << (int)exists << dendl; | |
2198 | return exists; | |
2199 | } | |
2200 | ||
2201 | int BlueFS::stat(const string& dirname, const string& filename, | |
2202 | uint64_t *size, utime_t *mtime) | |
2203 | { | |
2204 | std::lock_guard<std::mutex> l(lock); | |
2205 | dout(10) << __func__ << " " << dirname << "/" << filename << dendl; | |
2206 | map<string,DirRef>::iterator p = dir_map.find(dirname); | |
2207 | if (p == dir_map.end()) { | |
2208 | dout(20) << __func__ << " dir " << dirname << " not found" << dendl; | |
2209 | return -ENOENT; | |
2210 | } | |
2211 | DirRef dir = p->second; | |
2212 | map<string,FileRef>::iterator q = dir->file_map.find(filename); | |
2213 | if (q == dir->file_map.end()) { | |
2214 | dout(20) << __func__ << " dir " << dirname << " (" << dir | |
2215 | << ") file " << filename | |
2216 | << " not found" << dendl; | |
2217 | return -ENOENT; | |
2218 | } | |
2219 | File *file = q->second.get(); | |
2220 | dout(10) << __func__ << " " << dirname << "/" << filename | |
2221 | << " " << file->fnode << dendl; | |
2222 | if (size) | |
2223 | *size = file->fnode.size; | |
2224 | if (mtime) | |
2225 | *mtime = file->fnode.mtime; | |
2226 | return 0; | |
2227 | } | |
2228 | ||
2229 | int BlueFS::lock_file(const string& dirname, const string& filename, | |
2230 | FileLock **plock) | |
2231 | { | |
2232 | std::lock_guard<std::mutex> l(lock); | |
2233 | dout(10) << __func__ << " " << dirname << "/" << filename << dendl; | |
2234 | map<string,DirRef>::iterator p = dir_map.find(dirname); | |
2235 | if (p == dir_map.end()) { | |
2236 | dout(20) << __func__ << " dir " << dirname << " not found" << dendl; | |
2237 | return -ENOENT; | |
2238 | } | |
2239 | DirRef dir = p->second; | |
2240 | map<string,FileRef>::iterator q = dir->file_map.find(filename); | |
2241 | File *file; | |
2242 | if (q == dir->file_map.end()) { | |
2243 | dout(20) << __func__ << " dir " << dirname << " (" << dir | |
2244 | << ") file " << filename | |
2245 | << " not found, creating" << dendl; | |
2246 | file = new File; | |
2247 | file->fnode.ino = ++ino_last; | |
2248 | file->fnode.mtime = ceph_clock_now(); | |
2249 | file_map[ino_last] = file; | |
2250 | dir->file_map[filename] = file; | |
2251 | ++file->refs; | |
2252 | log_t.op_file_update(file->fnode); | |
2253 | log_t.op_dir_link(dirname, filename, file->fnode.ino); | |
2254 | } else { | |
2255 | file = q->second.get(); | |
2256 | if (file->locked) { | |
2257 | dout(10) << __func__ << " already locked" << dendl; | |
2258 | return -EBUSY; | |
2259 | } | |
2260 | } | |
2261 | file->locked = true; | |
2262 | *plock = new FileLock(file); | |
2263 | dout(10) << __func__ << " locked " << file->fnode | |
2264 | << " with " << *plock << dendl; | |
2265 | return 0; | |
2266 | } | |
2267 | ||
2268 | int BlueFS::unlock_file(FileLock *fl) | |
2269 | { | |
2270 | std::lock_guard<std::mutex> l(lock); | |
2271 | dout(10) << __func__ << " " << fl << " on " << fl->file->fnode << dendl; | |
2272 | assert(fl->file->locked); | |
2273 | fl->file->locked = false; | |
2274 | delete fl; | |
2275 | return 0; | |
2276 | } | |
2277 | ||
2278 | int BlueFS::readdir(const string& dirname, vector<string> *ls) | |
2279 | { | |
2280 | std::lock_guard<std::mutex> l(lock); | |
2281 | dout(10) << __func__ << " " << dirname << dendl; | |
2282 | if (dirname.empty()) { | |
2283 | // list dirs | |
2284 | ls->reserve(dir_map.size() + 2); | |
2285 | for (auto& q : dir_map) { | |
2286 | ls->push_back(q.first); | |
2287 | } | |
2288 | } else { | |
2289 | // list files in dir | |
2290 | map<string,DirRef>::iterator p = dir_map.find(dirname); | |
2291 | if (p == dir_map.end()) { | |
2292 | dout(20) << __func__ << " dir " << dirname << " not found" << dendl; | |
2293 | return -ENOENT; | |
2294 | } | |
2295 | DirRef dir = p->second; | |
2296 | ls->reserve(dir->file_map.size() + 2); | |
2297 | for (auto& q : dir->file_map) { | |
2298 | ls->push_back(q.first); | |
2299 | } | |
2300 | } | |
2301 | ls->push_back("."); | |
2302 | ls->push_back(".."); | |
2303 | return 0; | |
2304 | } | |
2305 | ||
2306 | int BlueFS::unlink(const string& dirname, const string& filename) | |
2307 | { | |
2308 | std::lock_guard<std::mutex> l(lock); | |
2309 | dout(10) << __func__ << " " << dirname << "/" << filename << dendl; | |
2310 | map<string,DirRef>::iterator p = dir_map.find(dirname); | |
2311 | if (p == dir_map.end()) { | |
2312 | dout(20) << __func__ << " dir " << dirname << " not found" << dendl; | |
2313 | return -ENOENT; | |
2314 | } | |
2315 | DirRef dir = p->second; | |
2316 | map<string,FileRef>::iterator q = dir->file_map.find(filename); | |
2317 | if (q == dir->file_map.end()) { | |
2318 | dout(20) << __func__ << " file " << dirname << "/" << filename | |
2319 | << " not found" << dendl; | |
2320 | return -ENOENT; | |
2321 | } | |
2322 | FileRef file = q->second; | |
2323 | if (file->locked) { | |
2324 | dout(20) << __func__ << " file " << dirname << "/" << filename | |
2325 | << " is locked" << dendl; | |
2326 | return -EBUSY; | |
2327 | } | |
2328 | dir->file_map.erase(filename); | |
2329 | log_t.op_dir_unlink(dirname, filename); | |
2330 | _drop_link(file); | |
2331 | return 0; | |
2332 | } | |
d2e6a577 FG |
2333 | |
2334 | bool BlueFS::wal_is_rotational() | |
2335 | { | |
94b18763 FG |
2336 | if (bdev[BDEV_WAL]) { |
2337 | return bdev[BDEV_WAL]->is_rotational(); | |
2338 | } else if (bdev[BDEV_DB]) { | |
2339 | return bdev[BDEV_DB]->is_rotational(); | |
2340 | } | |
2341 | return bdev[BDEV_SLOW]->is_rotational(); | |
d2e6a577 | 2342 | } |