]> git.proxmox.com Git - ceph.git/blob - ceph/src/os/bluestore/BlueFS.cc
update sources to v12.2.1
[ceph.git] / ceph / src / os / bluestore / BlueFS.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3
4 #include "boost/algorithm/string.hpp"
5 #include "BlueFS.h"
6
7 #include "common/debug.h"
8 #include "common/errno.h"
9 #include "common/perf_counters.h"
10 #include "BlockDevice.h"
11 #include "Allocator.h"
12 #include "include/assert.h"
13
14 #define dout_context cct
15 #define dout_subsys ceph_subsys_bluefs
16 #undef dout_prefix
17 #define dout_prefix *_dout << "bluefs "
18
// Mempool object-factory definitions, one per BlueFS helper type.
// NOTE(review): the macro is defined outside this file; the third argument
// (`bluefs`) presumably names the mempool the objects are accounted to --
// confirm against the mempool header.
19 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::File, bluefs_file, bluefs);
20 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::Dir, bluefs_dir, bluefs);
21 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileWriter, bluefs_file_writer, bluefs);
22 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileReaderBuffer,
23 bluefs_file_reader_buffer, bluefs);
24 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileReader, bluefs_file_reader, bluefs);
25 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileLock, bluefs_file_lock, bluefs);
26
27
// Construct a BlueFS instance bound to the given CephContext.
// The per-device vectors (bdev, ioc, block_all, block_total) are pre-sized to
// MAX_BDEV slots; block_total starts at 0 for every slot.  No device is
// opened here -- see add_block_device().
28 BlueFS::BlueFS(CephContext* cct)
29 : cct(cct),
30 bdev(MAX_BDEV),
31 ioc(MAX_BDEV),
32 block_all(MAX_BDEV),
33 block_total(MAX_BDEV, 0)
34 {
35 }
36
37 BlueFS::~BlueFS()
38 {
39 for (auto p : ioc) {
40 if (p)
41 p->aio_wait();
42 }
43 for (auto p : bdev) {
44 if (p) {
45 p->close();
46 delete p;
47 }
48 }
49 for (auto p : ioc) {
50 delete p;
51 }
52 }
53
54 void BlueFS::_init_logger()
55 {
56 PerfCountersBuilder b(cct, "bluefs",
57 l_bluefs_first, l_bluefs_last);
58 b.add_u64_counter(l_bluefs_gift_bytes, "gift_bytes",
59 "Bytes gifted from BlueStore");
60 b.add_u64_counter(l_bluefs_reclaim_bytes, "reclaim_bytes",
61 "Bytes reclaimed by BlueStore");
62 b.add_u64(l_bluefs_db_total_bytes, "db_total_bytes",
63 "Total bytes (main db device)",
64 "b", PerfCountersBuilder::PRIO_USEFUL);
65 b.add_u64(l_bluefs_db_used_bytes, "db_used_bytes",
66 "Used bytes (main db device)",
67 "u", PerfCountersBuilder::PRIO_USEFUL);
68 b.add_u64(l_bluefs_wal_total_bytes, "wal_total_bytes",
69 "Total bytes (wal device)",
70 "walb", PerfCountersBuilder::PRIO_USEFUL);
71 b.add_u64(l_bluefs_wal_used_bytes, "wal_used_bytes",
72 "Used bytes (wal device)",
73 "walu", PerfCountersBuilder::PRIO_USEFUL);
74 b.add_u64(l_bluefs_slow_total_bytes, "slow_total_bytes",
75 "Total bytes (slow device)",
76 "slob", PerfCountersBuilder::PRIO_USEFUL);
77 b.add_u64(l_bluefs_slow_used_bytes, "slow_used_bytes",
78 "Used bytes (slow device)",
79 "slou", PerfCountersBuilder::PRIO_USEFUL);
80 b.add_u64(l_bluefs_num_files, "num_files", "File count",
81 "f", PerfCountersBuilder::PRIO_USEFUL);
82 b.add_u64(l_bluefs_log_bytes, "log_bytes", "Size of the metadata log",
83 "jlen", PerfCountersBuilder::PRIO_INTERESTING);
84 b.add_u64_counter(l_bluefs_log_compactions, "log_compactions",
85 "Compactions of the metadata log");
86 b.add_u64_counter(l_bluefs_logged_bytes, "logged_bytes",
87 "Bytes written to the metadata log", "j",
88 PerfCountersBuilder::PRIO_CRITICAL);
89 b.add_u64_counter(l_bluefs_files_written_wal, "files_written_wal",
90 "Files written to WAL");
91 b.add_u64_counter(l_bluefs_files_written_sst, "files_written_sst",
92 "Files written to SSTs");
93 b.add_u64_counter(l_bluefs_bytes_written_wal, "bytes_written_wal",
94 "Bytes written to WAL", "wal",
95 PerfCountersBuilder::PRIO_CRITICAL);
96 b.add_u64_counter(l_bluefs_bytes_written_sst, "bytes_written_sst",
97 "Bytes written to SSTs", "sst",
98 PerfCountersBuilder::PRIO_CRITICAL);
99 logger = b.create_perf_counters();
100 cct->get_perfcounters_collection()->add(logger);
101 }
102
103 void BlueFS::_shutdown_logger()
104 {
105 cct->get_perfcounters_collection()->remove(logger);
106 delete logger;
107 }
108
109 void BlueFS::_update_logger_stats()
110 {
111 // we must be holding the lock
112 logger->set(l_bluefs_num_files, file_map.size());
113 logger->set(l_bluefs_log_bytes, log_writer->file->fnode.size);
114
115 if (alloc[BDEV_WAL]) {
116 logger->set(l_bluefs_wal_total_bytes, block_total[BDEV_WAL]);
117 logger->set(l_bluefs_wal_used_bytes,
118 block_total[BDEV_WAL] - alloc[BDEV_WAL]->get_free());
119 }
120 if (alloc[BDEV_DB]) {
121 logger->set(l_bluefs_db_total_bytes, block_total[BDEV_DB]);
122 logger->set(l_bluefs_db_used_bytes,
123 block_total[BDEV_DB] - alloc[BDEV_DB]->get_free());
124 }
125 if (alloc[BDEV_SLOW]) {
126 logger->set(l_bluefs_slow_total_bytes, block_total[BDEV_SLOW]);
127 logger->set(l_bluefs_slow_used_bytes,
128 block_total[BDEV_SLOW] - alloc[BDEV_SLOW]->get_free());
129 }
130 }
131
132 int BlueFS::add_block_device(unsigned id, const string& path)
133 {
134 dout(10) << __func__ << " bdev " << id << " path " << path << dendl;
135 assert(id < bdev.size());
136 assert(bdev[id] == NULL);
137 BlockDevice *b = BlockDevice::create(cct, path, NULL, NULL);
138 int r = b->open(path);
139 if (r < 0) {
140 delete b;
141 return r;
142 }
143 dout(1) << __func__ << " bdev " << id << " path " << path
144 << " size " << pretty_si_t(b->get_size()) << "B" << dendl;
145 bdev[id] = b;
146 ioc[id] = new IOContext(cct, NULL);
147 return 0;
148 }
149
150 bool BlueFS::bdev_support_label(unsigned id)
151 {
152 assert(id < bdev.size());
153 assert(bdev[id]);
154 return bdev[id]->supported_bdev_label();
155 }
156
157 uint64_t BlueFS::get_block_device_size(unsigned id)
158 {
159 if (id < bdev.size() && bdev[id])
160 return bdev[id]->get_size();
161 return 0;
162 }
163
164 void BlueFS::add_block_extent(unsigned id, uint64_t offset, uint64_t length)
165 {
166 std::unique_lock<std::mutex> l(lock);
167 dout(1) << __func__ << " bdev " << id
168 << " 0x" << std::hex << offset << "~" << length << std::dec
169 << dendl;
170 assert(id < bdev.size());
171 assert(bdev[id]);
172 assert(bdev[id]->get_size() >= offset + length);
173 block_all[id].insert(offset, length);
174 block_total[id] += length;
175
176 if (id < alloc.size() && alloc[id]) {
177 log_t.op_alloc_add(id, offset, length);
178 int r = _flush_and_sync_log(l);
179 assert(r == 0);
180 alloc[id]->init_add_free(offset, length);
181 }
182
183 if (logger)
184 logger->inc(l_bluefs_gift_bytes, length);
185 dout(10) << __func__ << " done" << dendl;
186 }
187
188 int BlueFS::reclaim_blocks(unsigned id, uint64_t want,
189 AllocExtentVector *extents)
190 {
191 std::unique_lock<std::mutex> l(lock);
192 dout(1) << __func__ << " bdev " << id
193 << " want 0x" << std::hex << want << std::dec << dendl;
194 assert(id < alloc.size());
195 assert(alloc[id]);
196 int r = alloc[id]->reserve(want);
197 assert(r == 0); // caller shouldn't ask for more than they can get
198 int64_t got = alloc[id]->allocate(want, cct->_conf->bluefs_alloc_size, 0,
199 extents);
200 if (got < (int64_t)want) {
201 alloc[id]->unreserve(want - MAX(0, got));
202 }
203 if (got <= 0) {
204 derr << __func__ << " failed to allocate space to return to bluestore"
205 << dendl;
206 alloc[id]->dump();
207 return got;
208 }
209
210 for (auto& p : *extents) {
211 block_all[id].erase(p.offset, p.length);
212 block_total[id] -= p.length;
213 log_t.op_alloc_rm(id, p.offset, p.length);
214 }
215
216 flush_bdev();
217 r = _flush_and_sync_log(l);
218 assert(r == 0);
219
220 if (logger)
221 logger->inc(l_bluefs_reclaim_bytes, got);
222 dout(1) << __func__ << " bdev " << id << " want 0x" << std::hex << want
223 << " got " << *extents << dendl;
224 return 0;
225 }
226
227 uint64_t BlueFS::get_fs_usage()
228 {
229 std::lock_guard<std::mutex> l(lock);
230 uint64_t total_bytes = 0;
231 for (auto& p : file_map) {
232 total_bytes += p.second->fnode.get_allocated();
233 }
234 return total_bytes;
235 }
236
237 uint64_t BlueFS::get_total(unsigned id)
238 {
239 std::lock_guard<std::mutex> l(lock);
240 assert(id < block_all.size());
241 return block_total[id];
242 }
243
244 uint64_t BlueFS::get_free(unsigned id)
245 {
246 std::lock_guard<std::mutex> l(lock);
247 assert(id < alloc.size());
248 return alloc[id]->get_free();
249 }
250
251 void BlueFS::dump_perf_counters(Formatter *f)
252 {
253 f->open_object_section("bluefs_perf_counters");
254 logger->dump_formatted(f,0);
255 f->close_section();
256 }
257
258
259 void BlueFS::get_usage(vector<pair<uint64_t,uint64_t>> *usage)
260 {
261 std::lock_guard<std::mutex> l(lock);
262 usage->resize(bdev.size());
263 for (unsigned id = 0; id < bdev.size(); ++id) {
264 if (!bdev[id]) {
265 (*usage)[id] = make_pair(0, 0);
266 continue;
267 }
268 (*usage)[id].first = alloc[id]->get_free();
269 (*usage)[id].second = block_total[id];
270 uint64_t used =
271 (block_total[id] - (*usage)[id].first) * 100 / block_total[id];
272 dout(10) << __func__ << " bdev " << id
273 << " free " << (*usage)[id].first
274 << " (" << pretty_si_t((*usage)[id].first) << "B)"
275 << " / " << (*usage)[id].second
276 << " (" << pretty_si_t((*usage)[id].second) << "B)"
277 << ", used " << used << "%"
278 << dendl;
279 }
280 }
281
282 int BlueFS::get_block_extents(unsigned id, interval_set<uint64_t> *extents)
283 {
284 std::lock_guard<std::mutex> l(lock);
285 dout(10) << __func__ << " bdev " << id << dendl;
286 if (id >= block_all.size())
287 return -EINVAL;
288 *extents = block_all[id];
289 return 0;
290 }
291
292 int BlueFS::mkfs(uuid_d osd_uuid)
293 {
294 std::unique_lock<std::mutex> l(lock);
295 dout(1) << __func__
296 << " osd_uuid " << osd_uuid
297 << dendl;
298
299 _init_alloc();
300 _init_logger();
301
302 super.version = 1;
303 super.block_size = bdev[BDEV_DB]->get_block_size();
304 super.osd_uuid = osd_uuid;
305 super.uuid.generate_random();
306 dout(1) << __func__ << " uuid " << super.uuid << dendl;
307
308 // init log
309 FileRef log_file = new File;
310 log_file->fnode.ino = 1;
311 log_file->fnode.prefer_bdev = BDEV_WAL;
312 int r = _allocate(
313 log_file->fnode.prefer_bdev,
314 cct->_conf->bluefs_max_log_runway,
315 &log_file->fnode.extents);
316 log_file->fnode.recalc_allocated();
317 assert(r == 0);
318 log_writer = _create_writer(log_file);
319
320 // initial txn
321 log_t.op_init();
322 for (unsigned bdev = 0; bdev < MAX_BDEV; ++bdev) {
323 interval_set<uint64_t>& p = block_all[bdev];
324 if (p.empty())
325 continue;
326 for (interval_set<uint64_t>::iterator q = p.begin(); q != p.end(); ++q) {
327 dout(20) << __func__ << " op_alloc_add " << bdev << " 0x"
328 << std::hex << q.get_start() << "~" << q.get_len() << std::dec
329 << dendl;
330 log_t.op_alloc_add(bdev, q.get_start(), q.get_len());
331 }
332 }
333 _flush_and_sync_log(l);
334
335 // write supers
336 super.log_fnode = log_file->fnode;
337 _write_super();
338 flush_bdev();
339
340 // clean up
341 super = bluefs_super_t();
342 _close_writer(log_writer);
343 log_writer = NULL;
344 block_all.clear();
345 block_total.clear();
346 _stop_alloc();
347 _shutdown_logger();
348
349 dout(10) << __func__ << " success" << dendl;
350 return 0;
351 }
352
353 void BlueFS::_init_alloc()
354 {
355 dout(20) << __func__ << dendl;
356 alloc.resize(MAX_BDEV);
357 pending_release.resize(MAX_BDEV);
358 for (unsigned id = 0; id < bdev.size(); ++id) {
359 if (!bdev[id]) {
360 continue;
361 }
362 assert(bdev[id]->get_size());
363 alloc[id] = Allocator::create(cct, cct->_conf->bluefs_allocator,
364 bdev[id]->get_size(),
365 cct->_conf->bluefs_alloc_size);
366 interval_set<uint64_t>& p = block_all[id];
367 for (interval_set<uint64_t>::iterator q = p.begin(); q != p.end(); ++q) {
368 alloc[id]->init_add_free(q.get_start(), q.get_len());
369 }
370 }
371 }
372
373 void BlueFS::_stop_alloc()
374 {
375 dout(20) << __func__ << dendl;
376 for (auto p : alloc) {
377 if (p != nullptr) {
378 p->shutdown();
379 delete p;
380 }
381 }
382 alloc.clear();
383 }
384
385 int BlueFS::mount()
386 {
387 dout(1) << __func__ << dendl;
388
389 int r = _open_super();
390 if (r < 0) {
391 derr << __func__ << " failed to open super: " << cpp_strerror(r) << dendl;
392 goto out;
393 }
394
395 block_all.clear();
396 block_all.resize(MAX_BDEV);
397 block_total.clear();
398 block_total.resize(MAX_BDEV, 0);
399 _init_alloc();
400
401 r = _replay(false);
402 if (r < 0) {
403 derr << __func__ << " failed to replay log: " << cpp_strerror(r) << dendl;
404 _stop_alloc();
405 goto out;
406 }
407
408 // init freelist
409 for (auto& p : file_map) {
410 dout(30) << __func__ << " noting alloc for " << p.second->fnode << dendl;
411 for (auto& q : p.second->fnode.extents) {
412 alloc[q.bdev]->init_rm_free(q.offset, q.length);
413 }
414 }
415
416 // set up the log for future writes
417 log_writer = _create_writer(_get_file(1));
418 assert(log_writer->file->fnode.ino == 1);
419 log_writer->pos = log_writer->file->fnode.size;
420 dout(10) << __func__ << " log write pos set to 0x"
421 << std::hex << log_writer->pos << std::dec
422 << dendl;
423
424 _init_logger();
425 return 0;
426
427 out:
428 super = bluefs_super_t();
429 return r;
430 }
431
432 void BlueFS::umount()
433 {
434 dout(1) << __func__ << dendl;
435
436 sync_metadata();
437
438 _close_writer(log_writer);
439 log_writer = NULL;
440
441 _stop_alloc();
442 file_map.clear();
443 dir_map.clear();
444 super = bluefs_super_t();
445 log_t.clear();
446 _shutdown_logger();
447 }
448
449 void BlueFS::collect_metadata(map<string,string> *pm)
450 {
451 if (bdev[BDEV_DB])
452 bdev[BDEV_DB]->collect_metadata("bluefs_db_", pm);
453 if (bdev[BDEV_WAL])
454 bdev[BDEV_WAL]->collect_metadata("bluefs_wal_", pm);
455 if (bdev[BDEV_SLOW])
456 bdev[BDEV_SLOW]->collect_metadata("bluefs_slow_", pm);
457 }
458
459 int BlueFS::fsck()
460 {
461 std::lock_guard<std::mutex> l(lock);
462 dout(1) << __func__ << dendl;
463 // hrm, i think we check everything on mount...
464 return 0;
465 }
466
467 int BlueFS::_write_super()
468 {
469 // build superblock
470 bufferlist bl;
471 ::encode(super, bl);
472 uint32_t crc = bl.crc32c(-1);
473 ::encode(crc, bl);
474 dout(10) << __func__ << " super block length(encoded): " << bl.length() << dendl;
475 dout(10) << __func__ << " superblock " << super.version << dendl;
476 dout(10) << __func__ << " log_fnode " << super.log_fnode << dendl;
477 assert(bl.length() <= get_super_length());
478 bl.append_zero(get_super_length() - bl.length());
479
480 bdev[BDEV_DB]->write(get_super_offset(), bl, false);
481 dout(20) << __func__ << " v " << super.version
482 << " crc 0x" << std::hex << crc
483 << " offset 0x" << get_super_offset() << std::dec
484 << dendl;
485 return 0;
486 }
487
488 int BlueFS::_open_super()
489 {
490 dout(10) << __func__ << dendl;
491
492 bufferlist bl;
493 uint32_t expected_crc, crc;
494 int r;
495
496 // always the second block
497 r = bdev[BDEV_DB]->read(get_super_offset(), get_super_length(),
498 &bl, ioc[BDEV_DB], false);
499 if (r < 0)
500 return r;
501
502 bufferlist::iterator p = bl.begin();
503 ::decode(super, p);
504 {
505 bufferlist t;
506 t.substr_of(bl, 0, p.get_off());
507 crc = t.crc32c(-1);
508 }
509 ::decode(expected_crc, p);
510 if (crc != expected_crc) {
511 derr << __func__ << " bad crc on superblock, expected 0x"
512 << std::hex << expected_crc << " != actual 0x" << crc << std::dec
513 << dendl;
514 return -EIO;
515 }
516 dout(10) << __func__ << " superblock " << super.version << dendl;
517 dout(10) << __func__ << " log_fnode " << super.log_fnode << dendl;
518 return 0;
519 }
520
// Replay the metadata log described by super.log_fnode.
//
// Transactions are read one at a time, starting at the beginning of the log
// file, and every op in them is decoded.  When `noop` is false the ops are
// applied to the in-memory state (block_all/block_total/alloc, dir_map,
// file_map, ino_last); when `noop` is true they are only decoded and logged,
// and a throwaway File is used for the log instead of file_map's ino 1.
//
// Replay stops at the first transaction whose uuid differs from super.uuid,
// whose seq is not log_seq + 1, or whose declared length runs past what can
// be read.  On exit the log file's fnode.size is the end of the last
// transaction that replayed successfully.
//
// Returns 0 on success, -EIO if a transaction fails to decode, an unknown op
// is seen, or (non-noop only) a file other than the log ends up with a link
// count of 0.
521 int BlueFS::_replay(bool noop)
522 {
523 dout(10) << __func__ << (noop ? " NO-OP" : "") << dendl;
524 ino_last = 1; // by the log
525 log_seq = 0;
526
527 FileRef log_file;
528 if (noop) {
529 log_file = new File;
530 } else {
531 log_file = _get_file(1);
532 }
533 log_file->fnode = super.log_fnode;
534 dout(10) << __func__ << " log_fnode " << super.log_fnode << dendl;
535
// The reader ignores eof: fnode.size is only known once replay has found the
// end of the valid transactions.
536 FileReader *log_reader = new FileReader(
537 log_file, cct->_conf->bluefs_max_prefetch,
538 false, // !random
539 true); // ignore eof
// One iteration per transaction; each transaction starts block-aligned.
540 while (true) {
541 assert((log_reader->buf.pos & ~super.block_mask()) == 0);
542 uint64_t pos = log_reader->buf.pos;
543 uint64_t read_pos = pos;
544 bufferlist bl;
545 {
546 int r = _read(log_reader, &log_reader->buf, read_pos, super.block_size,
547 &bl, NULL);
548 assert(r == (int)super.block_size);
549 read_pos += r;
550 }
// Peek at the transaction header in the first block to learn its length,
// uuid and seq.  NOTE(review): the two leading bytes (a, b) are decoded and
// discarded, and the "+ 6" presumably accounts for those two bytes plus the
// 4-byte length field -- confirm against the bluefs_transaction_t encoding.
551 uint64_t more = 0;
552 uint64_t seq;
553 uuid_d uuid;
554 {
555 bufferlist::iterator p = bl.begin();
556 __u8 a, b;
557 uint32_t len;
558 ::decode(a, p);
559 ::decode(b, p);
560 ::decode(len, p);
561 ::decode(uuid, p);
562 ::decode(seq, p);
563 if (len + 6 > bl.length()) {
564 more = ROUND_UP_TO(len + 6 - bl.length(), super.block_size);
565 }
566 }
// A foreign uuid or an unexpected seq marks the end of the valid log.
567 if (uuid != super.uuid) {
568 dout(10) << __func__ << " 0x" << std::hex << pos << std::dec
569 << ": stop: uuid " << uuid << " != super.uuid " << super.uuid
570 << dendl;
571 break;
572 }
573 if (seq != log_seq + 1) {
574 dout(10) << __func__ << " 0x" << std::hex << pos << std::dec
575 << ": stop: seq " << seq << " != expected " << log_seq + 1
576 << dendl;
577 break;
578 }
// The transaction spans more than one block: read the remainder.
579 if (more) {
580 dout(20) << __func__ << " need 0x" << std::hex << more << std::dec
581 << " more bytes" << dendl;
582 bufferlist t;
583 int r = _read(log_reader, &log_reader->buf, read_pos, more, &t, NULL);
584 if (r < (int)more) {
585 dout(10) << __func__ << " 0x" << std::hex << pos
586 << ": stop: len is 0x" << bl.length() + more << std::dec
587 << ", which is past eof" << dendl;
588 break;
589 }
590 assert(r == (int)more);
591 bl.claim_append(t);
592 read_pos += r;
593 }
// Full decode of the transaction; a decode failure is a hard error.
594 bluefs_transaction_t t;
595 try {
596 bufferlist::iterator p = bl.begin();
597 ::decode(t, p);
598 }
599 catch (buffer::error& e) {
600 dout(10) << __func__ << " 0x" << std::hex << pos << std::dec
601 << ": stop: failed to decode: " << e.what()
602 << dendl;
603 delete log_reader;
604 return -EIO;
605 }
606 assert(seq == t.seq);
607 dout(10) << __func__ << " 0x" << std::hex << pos << std::dec
608 << ": " << t << dendl;
609
// Walk the ops packed into the transaction's op_bl.
610 bufferlist::iterator p = t.op_bl.begin();
611 while (!p.end()) {
612 __u8 op;
613 ::decode(op, p);
614 switch (op) {
615
616 case bluefs_transaction_t::OP_INIT:
617 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
618 << ": op_init" << dendl;
619 assert(t.seq == 1);
620 break;
621
// OP_JUMP: continue with seq `next_seq` at log offset `offset`; the bytes
// between the current read position and `offset` are read and discarded.
622 case bluefs_transaction_t::OP_JUMP:
623 {
624 uint64_t next_seq;
625 uint64_t offset;
626 ::decode(next_seq, p);
627 ::decode(offset, p);
628 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
629 << ": op_jump seq " << next_seq
630 << " offset 0x" << std::hex << offset << std::dec << dendl;
631 assert(next_seq >= log_seq);
632 log_seq = next_seq - 1; // we will increment it below
633 uint64_t skip = offset - read_pos;
634 if (skip) {
635 bufferlist junk;
636 int r = _read(log_reader, &log_reader->buf, read_pos, skip, &junk,
637 NULL);
638 if (r != (int)skip) {
639 dout(10) << __func__ << " 0x" << std::hex << read_pos
640 << ": stop: failed to skip to " << offset
641 << std::dec << dendl;
642 assert(0 == "problem with op_jump");
643 }
644 }
645 }
646 break;
647
// OP_JUMP_SEQ: only the sequence number jumps; the read position is unchanged.
648 case bluefs_transaction_t::OP_JUMP_SEQ:
649 {
650 uint64_t next_seq;
651 ::decode(next_seq, p);
652 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
653 << ": op_jump_seq " << next_seq << dendl;
654 assert(next_seq >= log_seq);
655 log_seq = next_seq - 1; // we will increment it below
656 }
657 break;
658
// OP_ALLOC_ADD: an extent on device `id` was handed to BlueFS.
659 case bluefs_transaction_t::OP_ALLOC_ADD:
660 {
661 __u8 id;
662 uint64_t offset, length;
663 ::decode(id, p);
664 ::decode(offset, p);
665 ::decode(length, p);
666 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
667 << ": op_alloc_add " << " " << (int)id
668 << ":0x" << std::hex << offset << "~" << length << std::dec
669 << dendl;
670 if (!noop) {
671 block_all[id].insert(offset, length);
672 block_total[id] += length;
673 alloc[id]->init_add_free(offset, length);
674 }
675 }
676 break;
677
// OP_ALLOC_RM: an extent on device `id` was taken away from BlueFS.
678 case bluefs_transaction_t::OP_ALLOC_RM:
679 {
680 __u8 id;
681 uint64_t offset, length;
682 ::decode(id, p);
683 ::decode(offset, p);
684 ::decode(length, p);
685 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
686 << ": op_alloc_rm " << " " << (int)id
687 << ":0x" << std::hex << offset << "~" << length << std::dec
688 << dendl;
689 if (!noop) {
690 block_all[id].erase(offset, length);
691 block_total[id] -= length;
692 alloc[id]->init_rm_free(offset, length);
693 }
694 }
695 break;
696
// OP_DIR_LINK: add dirname/filename -> ino; the directory must exist and the
// name must not.  The file's link count is incremented.
697 case bluefs_transaction_t::OP_DIR_LINK:
698 {
699 string dirname, filename;
700 uint64_t ino;
701 ::decode(dirname, p);
702 ::decode(filename, p);
703 ::decode(ino, p);
704 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
705 << ": op_dir_link " << " " << dirname << "/" << filename
706 << " to " << ino
707 << dendl;
708 if (!noop) {
709 FileRef file = _get_file(ino);
710 assert(file->fnode.ino);
711 map<string,DirRef>::iterator q = dir_map.find(dirname);
712 assert(q != dir_map.end());
713 map<string,FileRef>::iterator r = q->second->file_map.find(filename);
714 assert(r == q->second->file_map.end());
715 q->second->file_map[filename] = file;
716 ++file->refs;
717 }
718 }
719 break;
720
// OP_DIR_UNLINK: remove dirname/filename and drop one link from the file.
721 case bluefs_transaction_t::OP_DIR_UNLINK:
722 {
723 string dirname, filename;
724 ::decode(dirname, p);
725 ::decode(filename, p);
726 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
727 << ": op_dir_unlink " << " " << dirname << "/" << filename
728 << dendl;
729 if (!noop) {
730 map<string,DirRef>::iterator q = dir_map.find(dirname);
731 assert(q != dir_map.end());
732 map<string,FileRef>::iterator r = q->second->file_map.find(filename);
733 assert(r != q->second->file_map.end());
734 assert(r->second->refs > 0);
735 --r->second->refs;
736 q->second->file_map.erase(r);
737 }
738 }
739 break;
740
// OP_DIR_CREATE: the directory must not exist yet.
741 case bluefs_transaction_t::OP_DIR_CREATE:
742 {
743 string dirname;
744 ::decode(dirname, p);
745 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
746 << ": op_dir_create " << dirname << dendl;
747 if (!noop) {
748 map<string,DirRef>::iterator q = dir_map.find(dirname);
749 assert(q == dir_map.end());
750 dir_map[dirname] = new Dir;
751 }
752 }
753 break;
754
// OP_DIR_REMOVE: the directory must exist and be empty.
755 case bluefs_transaction_t::OP_DIR_REMOVE:
756 {
757 string dirname;
758 ::decode(dirname, p);
759 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
760 << ": op_dir_remove " << dirname << dendl;
761 if (!noop) {
762 map<string,DirRef>::iterator q = dir_map.find(dirname);
763 assert(q != dir_map.end());
764 assert(q->second->file_map.empty());
765 dir_map.erase(q);
766 }
767 }
768 break;
769
// OP_FILE_UPDATE: replace (or create) the fnode for fnode.ino and track the
// highest ino seen so far.
770 case bluefs_transaction_t::OP_FILE_UPDATE:
771 {
772 bluefs_fnode_t fnode;
773 ::decode(fnode, p);
774 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
775 << ": op_file_update " << " " << fnode << dendl;
776 if (!noop) {
777 FileRef f = _get_file(fnode.ino);
778 f->fnode = fnode;
779 if (fnode.ino > ino_last) {
780 ino_last = fnode.ino;
781 }
782 }
783 }
784 break;
785
// OP_FILE_REMOVE: the file must exist in file_map.  Note that the local `p`
// below shadows the op iterator `p` for the duration of the inner block.
786 case bluefs_transaction_t::OP_FILE_REMOVE:
787 {
788 uint64_t ino;
789 ::decode(ino, p);
790 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
791 << ": op_file_remove " << ino << dendl;
792 if (!noop) {
793 auto p = file_map.find(ino);
794 assert(p != file_map.end());
795 file_map.erase(p);
796 }
797 }
798 break;
799
800 default:
801 derr << __func__ << " 0x" << std::hex << pos << std::dec
802 << ": stop: unrecognized op " << (int)op << dendl;
803 delete log_reader;
804 return -EIO;
805 }
806 }
807 assert(p.end());
808
809 // we successfully replayed the transaction; bump the seq and log size
810 ++log_seq;
811 log_file->fnode.size = log_reader->buf.pos;
812 }
813
814 dout(10) << __func__ << " log file size was 0x"
815 << std::hex << log_file->fnode.size << std::dec << dendl;
816 delete log_reader;
817
818 if (!noop) {
819 // verify file link counts are all >0
// (the log file itself, ino 1, is exempt)
820 for (auto& p : file_map) {
821 if (p.second->refs == 0 &&
822 p.second->fnode.ino > 1) {
823 derr << __func__ << " file with link count 0: " << p.second->fnode
824 << dendl;
825 return -EIO;
826 }
827 }
828 }
829
830 dout(10) << __func__ << " done" << dendl;
831 return 0;
832 }
833
834 BlueFS::FileRef BlueFS::_get_file(uint64_t ino)
835 {
836 auto p = file_map.find(ino);
837 if (p == file_map.end()) {
838 FileRef f = new File;
839 file_map[ino] = f;
840 dout(30) << __func__ << " ino " << ino << " = " << f
841 << " (new)" << dendl;
842 return f;
843 } else {
844 dout(30) << __func__ << " ino " << ino << " = " << p->second << dendl;
845 return p->second;
846 }
847 }
848
849 void BlueFS::_drop_link(FileRef file)
850 {
851 dout(20) << __func__ << " had refs " << file->refs
852 << " on " << file->fnode << dendl;
853 assert(file->refs > 0);
854 --file->refs;
855 if (file->refs == 0) {
856 dout(20) << __func__ << " destroying " << file->fnode << dendl;
857 assert(file->num_reading.load() == 0);
858 log_t.op_file_remove(file->fnode.ino);
859 for (auto& r : file->fnode.extents) {
860 pending_release[r.bdev].insert(r.offset, r.length);
861 }
862 file_map.erase(file->fnode.ino);
863 file->deleted = true;
864 file->fnode.recalc_allocated();
865 if (file->dirty_seq) {
866 assert(file->dirty_seq > log_seq_stable);
867 assert(dirty_files.count(file->dirty_seq));
868 auto it = dirty_files[file->dirty_seq].iterator_to(*file);
869 dirty_files[file->dirty_seq].erase(it);
870 file->dirty_seq = 0;
871 }
872 }
873 }
874
875 int BlueFS::_read_random(
876 FileReader *h, ///< [in] read from here
877 uint64_t off, ///< [in] offset
878 size_t len, ///< [in] this many bytes
879 char *out) ///< [out] optional: or copy it here
880 {
881 dout(10) << __func__ << " h " << h
882 << " 0x" << std::hex << off << "~" << len << std::dec
883 << " from " << h->file->fnode << dendl;
884
885 ++h->file->num_reading;
886
887 if (!h->ignore_eof &&
888 off + len > h->file->fnode.size) {
889 if (off > h->file->fnode.size)
890 len = 0;
891 else
892 len = h->file->fnode.size - off;
893 dout(20) << __func__ << " reaching (or past) eof, len clipped to 0x"
894 << std::hex << len << std::dec << dendl;
895 }
896
897 int ret = 0;
898 while (len > 0) {
899 uint64_t x_off = 0;
900 auto p = h->file->fnode.seek(off, &x_off);
901 uint64_t l = MIN(p->length - x_off, len);
902 dout(20) << __func__ << " read buffered 0x"
903 << std::hex << x_off << "~" << l << std::dec
904 << " of " << *p << dendl;
905 int r = bdev[p->bdev]->read_random(p->offset + x_off, l, out,
906 cct->_conf->bluefs_buffered_io);
907 assert(r == 0);
908 off += l;
909 len -= l;
910 ret += l;
911 out += l;
912 }
913
914 dout(20) << __func__ << " got " << ret << dendl;
915 --h->file->num_reading;
916 return ret;
917 }
918
// Buffered read of `len` bytes at file offset `off`.
//
// Data is served from `buf`; whenever `off` falls outside the buffered range
// the buffer is refilled from the device, starting at the block containing
// `off`.  The result is appended to `outbl` (cleared first) and/or copied to
// `out`; either may be NULL.  Unless the reader ignores eof, the request is
// clipped to the file size.  buf->pos is advanced by the number of bytes
// returned.  Returns the number of bytes read.
919 int BlueFS::_read(
920 FileReader *h, ///< [in] read from here
921 FileReaderBuffer *buf, ///< [in] reader state
922 uint64_t off, ///< [in] offset
923 size_t len, ///< [in] this many bytes
924 bufferlist *outbl, ///< [out] optional: reference the result here
925 char *out) ///< [out] optional: or copy it here
926 {
927 dout(10) << __func__ << " h " << h
928 << " 0x" << std::hex << off << "~" << len << std::dec
929 << " from " << h->file->fnode << dendl;
930
// num_reading is raised for the duration of the read; _drop_link() asserts it
// is 0 before destroying a file.
931 ++h->file->num_reading;
932
933 if (!h->ignore_eof &&
934 off + len > h->file->fnode.size) {
935 if (off > h->file->fnode.size)
936 len = 0;
937 else
938 len = h->file->fnode.size - off;
939 dout(20) << __func__ << " reaching (or past) eof, len clipped to 0x"
940 << std::hex << len << std::dec << dendl;
941 }
942 if (outbl)
943 outbl->clear();
944
945 int ret = 0;
946 while (len > 0) {
947 size_t left;
// Buffer miss: refill starting at the block-aligned offset.  The fetch is at
// least max_prefetch bytes, but never crosses the end of the current extent
// and (unless eof is ignored) never goes past the block-rounded file size.
948 if (off < buf->bl_off || off >= buf->get_buf_end()) {
949 buf->bl.clear();
950 buf->bl_off = off & super.block_mask();
951 uint64_t x_off = 0;
952 auto p = h->file->fnode.seek(buf->bl_off, &x_off);
953 uint64_t want = ROUND_UP_TO(len + (off & ~super.block_mask()),
954 super.block_size);
955 want = MAX(want, buf->max_prefetch);
956 uint64_t l = MIN(p->length - x_off, want);
957 uint64_t eof_offset = ROUND_UP_TO(h->file->fnode.size, super.block_size);
958 if (!h->ignore_eof &&
959 buf->bl_off + l > eof_offset) {
960 l = eof_offset - buf->bl_off;
961 }
962 dout(20) << __func__ << " fetching 0x"
963 << std::hex << x_off << "~" << l << std::dec
964 << " of " << *p << dendl;
965 int r = bdev[p->bdev]->read(p->offset + x_off, l, &buf->bl, ioc[p->bdev],
966 cct->_conf->bluefs_buffered_io);
967 assert(r == 0);
968 }
969 left = buf->get_buf_remaining(off);
970 dout(20) << __func__ << " left 0x" << std::hex << left
971 << " len 0x" << len << std::dec << dendl;
972
// Hand out as much as the buffer holds for this offset, up to len.
973 int r = MIN(len, left);
974 if (outbl) {
975 bufferlist t;
976 t.substr_of(buf->bl, off - buf->bl_off, r);
977 outbl->claim_append(t);
978 }
979 if (out) {
980 // NOTE: h->bl is normally a contiguous buffer so c_str() is free.
981 memcpy(out, buf->bl.c_str() + off - buf->bl_off, r);
982 out += r;
983 }
984
// NOTE(review): the statements between this dout(30) and the closing dendl
// appear to rely on the dout macro opening a scope that dendl closes, so the
// hexdump is only built when this log level is enabled -- confirm against
// common/debug.h before restructuring.
985 dout(30) << __func__ << " result chunk (0x"
986 << std::hex << r << std::dec << " bytes):\n";
987 bufferlist t;
988 t.substr_of(buf->bl, off - buf->bl_off, r);
989 t.hexdump(*_dout);
990 *_dout << dendl;
991
992 off += r;
993 len -= r;
994 ret += r;
995 buf->pos += r;
996 }
997
998 dout(20) << __func__ << " got " << ret << dendl;
999 assert(!outbl || (int)outbl->length() == ret);
1000 --h->file->num_reading;
1001 return ret;
1002 }
1003
1004 void BlueFS::_invalidate_cache(FileRef f, uint64_t offset, uint64_t length)
1005 {
1006 dout(10) << __func__ << " file " << f->fnode
1007 << " 0x" << std::hex << offset << "~" << length << std::dec
1008 << dendl;
1009 if (offset & ~super.block_mask()) {
1010 offset &= super.block_mask();
1011 length = ROUND_UP_TO(length, super.block_size);
1012 }
1013 uint64_t x_off = 0;
1014 auto p = f->fnode.seek(offset, &x_off);
1015 while (length > 0 && p != f->fnode.extents.end()) {
1016 uint64_t x_len = MIN(p->length - x_off, length);
1017 bdev[p->bdev]->invalidate_cache(p->offset + x_off, x_len);
1018 dout(20) << __func__ << " 0x" << std::hex << x_off << "~" << x_len
1019 << std:: dec << " of " << *p << dendl;
1020 offset += x_len;
1021 length -= x_len;
1022 }
1023 }
1024
1025 uint64_t BlueFS::_estimate_log_size()
1026 {
1027 int avg_dir_size = 40; // fixme
1028 int avg_file_size = 12;
1029 uint64_t size = 4096 * 2;
1030 size += file_map.size() * (1 + sizeof(bluefs_fnode_t));
1031 for (auto& p : block_all)
1032 size += p.num_intervals() * (1 + 1 + sizeof(uint64_t) * 2);
1033 size += dir_map.size() + (1 + avg_dir_size);
1034 size += file_map.size() * (1 + avg_dir_size + avg_file_size);
1035 return ROUND_UP_TO(size, super.block_size);
1036 }
1037
1038 void BlueFS::compact_log()
1039 {
1040 std::unique_lock<std::mutex> l(lock);
1041 if (cct->_conf->bluefs_compact_log_sync) {
1042 _compact_log_sync();
1043 } else {
1044 _compact_log_async(l);
1045 }
1046 }
1047
1048 bool BlueFS::_should_compact_log()
1049 {
1050 uint64_t current = log_writer->file->fnode.size;
1051 uint64_t expected = _estimate_log_size();
1052 float ratio = (float)current / (float)expected;
1053 dout(10) << __func__ << " current 0x" << std::hex << current
1054 << " expected " << expected << std::dec
1055 << " ratio " << ratio
1056 << (new_log ? " (async compaction in progress)" : "")
1057 << dendl;
1058 if (new_log ||
1059 current < cct->_conf->bluefs_log_compact_min_size ||
1060 ratio < cct->_conf->bluefs_log_compact_min_ratio) {
1061 return false;
1062 }
1063 return true;
1064 }
1065
1066 void BlueFS::_compact_log_dump_metadata(bluefs_transaction_t *t)
1067 {
1068 t->seq = 1;
1069 t->uuid = super.uuid;
1070 dout(20) << __func__ << " op_init" << dendl;
1071
1072 t->op_init();
1073 for (unsigned bdev = 0; bdev < MAX_BDEV; ++bdev) {
1074 interval_set<uint64_t>& p = block_all[bdev];
1075 for (interval_set<uint64_t>::iterator q = p.begin(); q != p.end(); ++q) {
1076 dout(20) << __func__ << " op_alloc_add " << bdev << " 0x"
1077 << std::hex << q.get_start() << "~" << q.get_len() << std::dec
1078 << dendl;
1079 t->op_alloc_add(bdev, q.get_start(), q.get_len());
1080 }
1081 }
1082 for (auto& p : file_map) {
1083 if (p.first == 1)
1084 continue;
1085 dout(20) << __func__ << " op_file_update " << p.second->fnode << dendl;
1086 assert(p.first > 1);
1087 t->op_file_update(p.second->fnode);
1088 }
1089 for (auto& p : dir_map) {
1090 dout(20) << __func__ << " op_dir_create " << p.first << dendl;
1091 t->op_dir_create(p.first);
1092 for (auto& q : p.second->file_map) {
1093 dout(20) << __func__ << " op_dir_link " << p.first << "/" << q.first
1094 << " to " << q.second->fnode.ino << dendl;
1095 t->op_dir_link(p.first, q.first, q.second->fnode.ino);
1096 }
1097 }
1098 }
1099
1100 void BlueFS::_compact_log_sync()
1101 {
1102 dout(10) << __func__ << dendl;
1103 File *log_file = log_writer->file.get();
1104
1105 // clear out log (be careful who calls us!!!)
1106 log_t.clear();
1107
1108 bluefs_transaction_t t;
1109 _compact_log_dump_metadata(&t);
1110
1111 dout(20) << __func__ << " op_jump_seq " << log_seq << dendl;
1112 t.op_jump_seq(log_seq);
1113
1114 bufferlist bl;
1115 ::encode(t, bl);
1116 _pad_bl(bl);
1117
1118 uint64_t need = bl.length() + cct->_conf->bluefs_max_log_runway;
1119 dout(20) << __func__ << " need " << need << dendl;
1120
1121 mempool::bluefs::vector<bluefs_extent_t> old_extents;
1122 old_extents.swap(log_file->fnode.extents);
1123 log_file->fnode.recalc_allocated();
1124 while (log_file->fnode.get_allocated() < need) {
1125 int r = _allocate(log_file->fnode.prefer_bdev,
1126 need - log_file->fnode.get_allocated(),
1127 &log_file->fnode.extents);
1128 log_file->fnode.recalc_allocated();
1129 assert(r == 0);
1130 }
1131
1132 _close_writer(log_writer);
1133
1134 log_file->fnode.size = bl.length();
1135 log_writer = _create_writer(log_file);
1136 log_writer->append(bl);
1137 int r = _flush(log_writer, true);
1138 assert(r == 0);
1139 wait_for_aio(log_writer);
1140
1141 list<aio_t> completed_ios;
1142 _claim_completed_aios(log_writer, &completed_ios);
1143 flush_bdev();
1144 completed_ios.clear();
1145
1146 dout(10) << __func__ << " writing super" << dendl;
1147 super.log_fnode = log_file->fnode;
1148 ++super.version;
1149 _write_super();
1150 flush_bdev();
1151
1152 dout(10) << __func__ << " release old log extents " << old_extents << dendl;
1153 for (auto& r : old_extents) {
1154 pending_release[r.bdev].insert(r.offset, r.length);
1155 }
1156
1157 logger->inc(l_bluefs_log_compactions);
1158 }
1159
1160 /*
1161 * 1. Allocate a new extent to continue the log, and then log an event
1162 * that jumps the log write position to the new extent. At this point, the
1163 * old extent(s) won't be written to, and reflect everything to compact.
1164 * New events will be written to the new region that we'll keep.
1165 *
1166 * 2. While still holding the lock, encode a bufferlist that dumps all of the
1167 * in-memory fnodes and names. This will become the new beginning of the
1168 * log. The last event will jump to the log continuation extent from #1.
1169 *
1170 * 3. Queue a write to a new extent for the new beginnging of the log.
1171 *
1172 * 4. Drop lock and wait
1173 *
1174 * 5. Retake the lock.
1175 *
1176 * 6. Update the log_fnode to splice in the new beginning.
1177 *
1178 * 7. Write the new superblock.
1179 *
1180 * 8. Release the old log space. Clean up.
1181 */
1182 void BlueFS::_compact_log_async(std::unique_lock<std::mutex>& l)
1183 {
1184 dout(10) << __func__ << dendl;
1185 File *log_file = log_writer->file.get();
1186 assert(!new_log);
1187 assert(!new_log_writer);
1188
1189 // create a new log [writer] so that we know compaction is in progress
1190 // (see _should_compact_log)
1191 new_log = new File;
1192 new_log->fnode.ino = 0; // so that _flush_range won't try to log the fnode
1193
1194 // 1. allocate new log space and jump to it.
1195 old_log_jump_to = log_file->fnode.get_allocated();
1196 uint64_t need = old_log_jump_to + cct->_conf->bluefs_max_log_runway;
1197 dout(10) << __func__ << " old_log_jump_to 0x" << std::hex << old_log_jump_to
1198 << " need 0x" << need << std::dec << dendl;
1199 while (log_file->fnode.get_allocated() < need) {
1200 int r = _allocate(log_file->fnode.prefer_bdev,
1201 cct->_conf->bluefs_max_log_runway,
1202 &log_file->fnode.extents);
1203 assert(r == 0);
1204 log_file->fnode.recalc_allocated();
1205 }
1206 dout(10) << __func__ << " log extents " << log_file->fnode.extents << dendl;
1207
1208 // update the log file change and log a jump to the offset where we want to
1209 // write the new entries
1210 log_t.op_file_update(log_file->fnode);
1211 log_t.op_jump(log_seq, old_log_jump_to);
1212
1213 flush_bdev(); // FIXME?
1214
1215 _flush_and_sync_log(l, 0, old_log_jump_to);
1216
1217 // 2. prepare compacted log
1218 bluefs_transaction_t t;
1219 //avoid record two times in log_t and _compact_log_dump_metadata.
1220 log_t.clear();
1221 _compact_log_dump_metadata(&t);
1222
1223 // conservative estimate for final encoded size
1224 new_log_jump_to = ROUND_UP_TO(t.op_bl.length() + super.block_size * 2,
1225 cct->_conf->bluefs_alloc_size);
1226 t.op_jump(log_seq, new_log_jump_to);
1227
1228 bufferlist bl;
1229 ::encode(t, bl);
1230 _pad_bl(bl);
1231
1232 dout(10) << __func__ << " new_log_jump_to 0x" << std::hex << new_log_jump_to
1233 << std::dec << dendl;
1234
1235 // allocate
1236 int r = _allocate(BlueFS::BDEV_DB, new_log_jump_to,
1237 &new_log->fnode.extents);
1238 assert(r == 0);
1239 new_log->fnode.recalc_allocated();
1240 new_log_writer = _create_writer(new_log);
1241 new_log_writer->append(bl);
1242
1243 // 3. flush
1244 r = _flush(new_log_writer, true);
1245 assert(r == 0);
1246 lock.unlock();
1247
1248 // 4. wait
1249 dout(10) << __func__ << " waiting for compacted log to sync" << dendl;
1250 wait_for_aio(new_log_writer);
1251
1252 list<aio_t> completed_ios;
1253 _claim_completed_aios(new_log_writer, &completed_ios);
1254 flush_bdev();
1255 completed_ios.clear();
1256
1257 // 5. retake lock
1258 lock.lock();
1259
1260 // 6. update our log fnode
1261 // discard first old_log_jump_to extents
1262 dout(10) << __func__ << " remove 0x" << std::hex << old_log_jump_to << std::dec
1263 << " of " << log_file->fnode.extents << dendl;
1264 uint64_t discarded = 0;
1265 mempool::bluefs::vector<bluefs_extent_t> old_extents;
1266 while (discarded < old_log_jump_to) {
1267 assert(!log_file->fnode.extents.empty());
1268 bluefs_extent_t& e = log_file->fnode.extents.front();
1269 bluefs_extent_t temp = e;
1270 if (discarded + e.length <= old_log_jump_to) {
1271 dout(10) << __func__ << " remove old log extent " << e << dendl;
1272 discarded += e.length;
1273 log_file->fnode.extents.erase(log_file->fnode.extents.begin());
1274 } else {
1275 dout(10) << __func__ << " remove front of old log extent " << e << dendl;
1276 uint64_t drop = old_log_jump_to - discarded;
1277 temp.length = drop;
1278 e.offset += drop;
1279 e.length -= drop;
1280 discarded += drop;
1281 dout(10) << __func__ << " kept " << e << " removed " << temp << dendl;
1282 }
1283 old_extents.push_back(temp);
1284 }
1285 new_log->fnode.extents.insert(new_log->fnode.extents.end(),
1286 log_file->fnode.extents.begin(),
1287 log_file->fnode.extents.end());
1288
1289 // clear the extents from old log file, they are added to new log
1290 log_file->fnode.extents.clear();
1291
1292 // swap the log files. New log file is the log file now.
1293 log_file->fnode.extents.swap(new_log->fnode.extents);
1294 log_file->fnode.recalc_allocated();
1295 new_log->fnode.recalc_allocated();
1296 log_writer->pos = log_writer->file->fnode.size =
1297 log_writer->pos - old_log_jump_to + new_log_jump_to;
1298
1299 // 7. write the super block to reflect the changes
1300 dout(10) << __func__ << " writing super" << dendl;
1301 super.log_fnode = log_file->fnode;
1302 ++super.version;
1303 _write_super();
1304
1305 lock.unlock();
1306 flush_bdev();
1307 lock.lock();
1308
1309 // 8. release old space
1310 dout(10) << __func__ << " release old log extents " << old_extents << dendl;
1311 for (auto& r : old_extents) {
1312 pending_release[r.bdev].insert(r.offset, r.length);
1313 }
1314
1315 // delete the new log, remove from the dirty files list
1316 _close_writer(new_log_writer);
1317 if (new_log->dirty_seq) {
1318 assert(dirty_files.count(new_log->dirty_seq));
1319 auto it = dirty_files[new_log->dirty_seq].iterator_to(*new_log);
1320 dirty_files[new_log->dirty_seq].erase(it);
1321 }
1322 new_log_writer = nullptr;
1323 new_log = nullptr;
1324 log_cond.notify_all();
1325
1326 dout(10) << __func__ << " log extents " << log_file->fnode.extents << dendl;
1327 logger->inc(l_bluefs_log_compactions);
1328 }
1329
1330 void BlueFS::_pad_bl(bufferlist& bl)
1331 {
1332 uint64_t partial = bl.length() % super.block_size;
1333 if (partial) {
1334 dout(10) << __func__ << " padding with 0x" << std::hex
1335 << super.block_size - partial << " zeros" << std::dec << dendl;
1336 bl.append_zero(super.block_size - partial);
1337 }
1338 }
1339
1340 void BlueFS::flush_log()
1341 {
1342 std::unique_lock<std::mutex> l(lock);
1343 flush_bdev();
1344 _flush_and_sync_log(l);
1345 }
1346
1347 int BlueFS::_flush_and_sync_log(std::unique_lock<std::mutex>& l,
1348 uint64_t want_seq,
1349 uint64_t jump_to)
1350 {
1351 while (log_flushing) {
1352 dout(10) << __func__ << " want_seq " << want_seq
1353 << " log is currently flushing, waiting" << dendl;
1354 log_cond.wait(l);
1355 }
1356 if (want_seq && want_seq <= log_seq_stable) {
1357 dout(10) << __func__ << " want_seq " << want_seq << " <= log_seq_stable "
1358 << log_seq_stable << ", done" << dendl;
1359 return 0;
1360 }
1361 if (log_t.empty() && dirty_files.empty()) {
1362 dout(10) << __func__ << " want_seq " << want_seq
1363 << " " << log_t << " not dirty, dirty_files empty, no-op" << dendl;
1364 return 0;
1365 }
1366
1367 uint64_t seq = log_t.seq = ++log_seq;
1368 assert(want_seq == 0 || want_seq <= seq);
1369 log_t.uuid = super.uuid;
1370
1371 // log dirty files
1372 auto lsi = dirty_files.find(seq);
1373 if (lsi != dirty_files.end()) {
1374 dout(20) << __func__ << " " << lsi->second.size() << " dirty_files" << dendl;
1375 for (auto &f : lsi->second) {
1376 dout(20) << __func__ << " op_file_update " << f.fnode << dendl;
1377 log_t.op_file_update(f.fnode);
1378 }
1379 }
1380
1381 dout(10) << __func__ << " " << log_t << dendl;
1382 assert(!log_t.empty());
1383
1384 // allocate some more space (before we run out)?
1385 int64_t runway = log_writer->file->fnode.get_allocated() -
1386 log_writer->get_effective_write_pos();
1387 if (runway < (int64_t)cct->_conf->bluefs_min_log_runway) {
1388 dout(10) << __func__ << " allocating more log runway (0x"
1389 << std::hex << runway << std::dec << " remaining)" << dendl;
1390 while (new_log_writer) {
1391 dout(10) << __func__ << " waiting for async compaction" << dendl;
1392 log_cond.wait(l);
1393 }
1394 int r = _allocate(log_writer->file->fnode.prefer_bdev,
1395 cct->_conf->bluefs_max_log_runway,
1396 &log_writer->file->fnode.extents);
1397 assert(r == 0);
1398 log_writer->file->fnode.recalc_allocated();
1399 log_t.op_file_update(log_writer->file->fnode);
1400 }
1401
1402 bufferlist bl;
1403 ::encode(log_t, bl);
1404
1405 // pad to block boundary
1406 _pad_bl(bl);
1407 logger->inc(l_bluefs_logged_bytes, bl.length());
1408
1409 log_writer->append(bl);
1410
1411 log_t.clear();
1412 log_t.seq = 0; // just so debug output is less confusing
1413 log_flushing = true;
1414
1415 int r = _flush(log_writer, true);
1416 assert(r == 0);
1417
1418 if (jump_to) {
1419 dout(10) << __func__ << " jumping log offset from 0x" << std::hex
1420 << log_writer->pos << " -> 0x" << jump_to << std::dec << dendl;
1421 log_writer->pos = jump_to;
1422 log_writer->file->fnode.size = jump_to;
1423 }
1424
1425 _flush_bdev_safely(log_writer);
1426
1427 log_flushing = false;
1428 log_cond.notify_all();
1429
1430 // clean dirty files
1431 if (seq > log_seq_stable) {
1432 log_seq_stable = seq;
1433 dout(20) << __func__ << " log_seq_stable " << log_seq_stable << dendl;
1434
1435 auto p = dirty_files.begin();
1436 while (p != dirty_files.end()) {
1437 if (p->first > log_seq_stable) {
1438 dout(20) << __func__ << " done cleaning up dirty files" << dendl;
1439 break;
1440 }
1441
1442 auto l = p->second.begin();
1443 while (l != p->second.end()) {
1444 File *file = &*l;
1445 assert(file->dirty_seq > 0);
1446 assert(file->dirty_seq <= log_seq_stable);
1447 dout(20) << __func__ << " cleaned file " << file->fnode << dendl;
1448 file->dirty_seq = 0;
1449 p->second.erase(l++);
1450 }
1451
1452 assert(p->second.empty());
1453 dirty_files.erase(p++);
1454 }
1455 } else {
1456 dout(20) << __func__ << " log_seq_stable " << log_seq_stable
1457 << " already >= out seq " << seq
1458 << ", we lost a race against another log flush, done" << dendl;
1459 }
1460 _update_logger_stats();
1461
1462 return 0;
1463 }
1464
1465 int BlueFS::_flush_range(FileWriter *h, uint64_t offset, uint64_t length)
1466 {
1467 dout(10) << __func__ << " " << h << " pos 0x" << std::hex << h->pos
1468 << " 0x" << offset << "~" << length << std::dec
1469 << " to " << h->file->fnode << dendl;
1470 assert(!h->file->deleted);
1471 assert(h->file->num_readers.load() == 0);
1472
1473 h->buffer_appender.flush();
1474
1475 bool buffered;
1476 if (h->file->fnode.ino == 1)
1477 buffered = false;
1478 else
1479 buffered = cct->_conf->bluefs_buffered_io;
1480
1481 if (offset + length <= h->pos)
1482 return 0;
1483 if (offset < h->pos) {
1484 length -= h->pos - offset;
1485 offset = h->pos;
1486 dout(10) << " still need 0x"
1487 << std::hex << offset << "~" << length << std::dec
1488 << dendl;
1489 }
1490 assert(offset <= h->file->fnode.size);
1491
1492 uint64_t allocated = h->file->fnode.get_allocated();
1493
1494 // do not bother to dirty the file if we are overwriting
1495 // previously allocated extents.
1496 bool must_dirty = false;
1497 if (allocated < offset + length) {
1498 // we should never run out of log space here; see the min runway check
1499 // in _flush_and_sync_log.
1500 assert(h->file->fnode.ino != 1);
1501 int r = _allocate(h->file->fnode.prefer_bdev,
1502 offset + length - allocated,
1503 &h->file->fnode.extents);
1504 if (r < 0) {
1505 derr << __func__ << " allocated: 0x" << std::hex << allocated
1506 << " offset: 0x" << offset << " length: 0x" << length << std::dec
1507 << dendl;
1508 return r;
1509 }
1510 h->file->fnode.recalc_allocated();
1511 if (cct->_conf->bluefs_preextend_wal_files &&
1512 h->writer_type == WRITER_WAL) {
1513 // NOTE: this *requires* that rocksdb also has log recycling
1514 // enabled and is therefore doing robust CRCs on the log
1515 // records. otherwise, we will fail to reply the rocksdb log
1516 // properly due to garbage on the device.
1517 h->file->fnode.size = h->file->fnode.get_allocated();
1518 dout(10) << __func__ << " extending WAL size to 0x" << std::hex
1519 << h->file->fnode.size << std::dec << " to include allocated"
1520 << dendl;
1521 }
1522 must_dirty = true;
1523 }
1524 if (h->file->fnode.size < offset + length) {
1525 h->file->fnode.size = offset + length;
1526 if (h->file->fnode.ino > 1) {
1527 // we do not need to dirty the log file (or it's compacting
1528 // replacement) when the file size changes because replay is
1529 // smart enough to discover it on its own.
1530 must_dirty = true;
1531 }
1532 }
1533 if (must_dirty) {
1534 h->file->fnode.mtime = ceph_clock_now();
1535 assert(h->file->fnode.ino >= 1);
1536 if (h->file->dirty_seq == 0) {
1537 h->file->dirty_seq = log_seq + 1;
1538 dirty_files[h->file->dirty_seq].push_back(*h->file);
1539 dout(20) << __func__ << " dirty_seq = " << log_seq + 1
1540 << " (was clean)" << dendl;
1541 } else {
1542 if (h->file->dirty_seq != log_seq + 1) {
1543 // need re-dirty, erase from list first
1544 assert(dirty_files.count(h->file->dirty_seq));
1545 auto it = dirty_files[h->file->dirty_seq].iterator_to(*h->file);
1546 dirty_files[h->file->dirty_seq].erase(it);
1547 h->file->dirty_seq = log_seq + 1;
1548 dirty_files[h->file->dirty_seq].push_back(*h->file);
1549 dout(20) << __func__ << " dirty_seq = " << log_seq + 1
1550 << " (was " << h->file->dirty_seq << ")" << dendl;
1551 } else {
1552 dout(20) << __func__ << " dirty_seq = " << log_seq + 1
1553 << " (unchanged, do nothing) " << dendl;
1554 }
1555 }
1556 }
1557 dout(20) << __func__ << " file now " << h->file->fnode << dendl;
1558
1559 uint64_t x_off = 0;
1560 auto p = h->file->fnode.seek(offset, &x_off);
1561 assert(p != h->file->fnode.extents.end());
1562 dout(20) << __func__ << " in " << *p << " x_off 0x"
1563 << std::hex << x_off << std::dec << dendl;
1564
1565 unsigned partial = x_off & ~super.block_mask();
1566 bufferlist bl;
1567 if (partial) {
1568 dout(20) << __func__ << " using partial tail 0x"
1569 << std::hex << partial << std::dec << dendl;
1570 assert(h->tail_block.length() == partial);
1571 bl.claim_append_piecewise(h->tail_block);
1572 x_off -= partial;
1573 offset -= partial;
1574 length += partial;
1575 dout(20) << __func__ << " waiting for previous aio to complete" << dendl;
1576 for (auto p : h->iocv) {
1577 if (p) {
1578 p->aio_wait();
1579 }
1580 }
1581 }
1582 if (length == partial + h->buffer.length()) {
1583 bl.claim_append_piecewise(h->buffer);
1584 } else {
1585 bufferlist t;
1586 h->buffer.splice(0, length, &t);
1587 bl.claim_append_piecewise(t);
1588 t.substr_of(h->buffer, length, h->buffer.length() - length);
1589 h->buffer.swap(t);
1590 dout(20) << " leaving 0x" << std::hex << h->buffer.length() << std::dec
1591 << " unflushed" << dendl;
1592 }
1593 assert(bl.length() == length);
1594
1595 switch (h->writer_type) {
1596 case WRITER_WAL:
1597 logger->inc(l_bluefs_bytes_written_wal, length);
1598 break;
1599 case WRITER_SST:
1600 logger->inc(l_bluefs_bytes_written_sst, length);
1601 break;
1602 }
1603
1604 dout(30) << "dump:\n";
1605 bl.hexdump(*_dout);
1606 *_dout << dendl;
1607
1608 h->pos = offset + length;
1609 h->tail_block.clear();
1610
1611 uint64_t bloff = 0;
1612 while (length > 0) {
1613 uint64_t x_len = MIN(p->length - x_off, length);
1614 bufferlist t;
1615 t.substr_of(bl, bloff, x_len);
1616 unsigned tail = x_len & ~super.block_mask();
1617 if (tail) {
1618 size_t zlen = super.block_size - tail;
1619 dout(20) << __func__ << " caching tail of 0x"
1620 << std::hex << tail
1621 << " and padding block with 0x" << zlen
1622 << std::dec << dendl;
1623 h->tail_block.substr_of(bl, bl.length() - tail, tail);
1624 if (h->file->fnode.ino > 1) {
1625 // we are using the page_aligned_appender, and can safely use
1626 // the tail of the raw buffer.
1627 const bufferptr &last = t.back();
1628 if (last.unused_tail_length() < zlen) {
1629 derr << " wtf, last is " << last << " from " << t << dendl;
1630 assert(last.unused_tail_length() >= zlen);
1631 }
1632 bufferptr z = last;
1633 z.set_offset(last.offset() + last.length());
1634 z.set_length(zlen);
1635 z.zero();
1636 t.append(z, 0, zlen);
1637 } else {
1638 t.append_zero(zlen);
1639 }
1640 }
1641 if (cct->_conf->bluefs_sync_write) {
1642 bdev[p->bdev]->write(p->offset + x_off, t, buffered);
1643 } else {
1644 bdev[p->bdev]->aio_write(p->offset + x_off, t, h->iocv[p->bdev], buffered);
1645 }
1646 bloff += x_len;
1647 length -= x_len;
1648 ++p;
1649 x_off = 0;
1650 }
1651 for (unsigned i = 0; i < MAX_BDEV; ++i) {
1652 if (bdev[i]) {
1653 assert(h->iocv[i]);
1654 if (h->iocv[i]->has_pending_aios()) {
1655 bdev[i]->aio_submit(h->iocv[i]);
1656 }
1657 }
1658 }
1659 dout(20) << __func__ << " h " << h << " pos now 0x"
1660 << std::hex << h->pos << std::dec << dendl;
1661 return 0;
1662 }
1663
1664 // we need to retire old completed aios so they don't stick around in
1665 // memory indefinitely (along with their bufferlist refs).
1666 void BlueFS::_claim_completed_aios(FileWriter *h, list<aio_t> *ls)
1667 {
1668 for (auto p : h->iocv) {
1669 if (p) {
1670 ls->splice(ls->end(), p->running_aios);
1671 }
1672 }
1673 dout(10) << __func__ << " got " << ls->size() << " aios" << dendl;
1674 }
1675
1676 void BlueFS::wait_for_aio(FileWriter *h)
1677 {
1678 // NOTE: this is safe to call without a lock, as long as our reference is
1679 // stable.
1680 dout(10) << __func__ << " " << h << dendl;
1681 utime_t start = ceph_clock_now();
1682 for (auto p : h->iocv) {
1683 if (p) {
1684 p->aio_wait();
1685 }
1686 }
1687 utime_t end = ceph_clock_now();
1688 utime_t dur = end - start;
1689 dout(10) << __func__ << " " << h << " done in " << dur << dendl;
1690 }
1691
1692 int BlueFS::_flush(FileWriter *h, bool force)
1693 {
1694 h->buffer_appender.flush();
1695 uint64_t length = h->buffer.length();
1696 uint64_t offset = h->pos;
1697 if (!force &&
1698 length < cct->_conf->bluefs_min_flush_size) {
1699 dout(10) << __func__ << " " << h << " ignoring, length " << length
1700 << " < min_flush_size " << cct->_conf->bluefs_min_flush_size
1701 << dendl;
1702 return 0;
1703 }
1704 if (length == 0) {
1705 dout(10) << __func__ << " " << h << " no dirty data on "
1706 << h->file->fnode << dendl;
1707 return 0;
1708 }
1709 dout(10) << __func__ << " " << h << " 0x"
1710 << std::hex << offset << "~" << length << std::dec
1711 << " to " << h->file->fnode << dendl;
1712 assert(h->pos <= h->file->fnode.size);
1713 return _flush_range(h, offset, length);
1714 }
1715
1716 int BlueFS::_truncate(FileWriter *h, uint64_t offset)
1717 {
1718 dout(10) << __func__ << " 0x" << std::hex << offset << std::dec
1719 << " file " << h->file->fnode << dendl;
1720 if (h->file->deleted) {
1721 dout(10) << __func__ << " deleted, no-op" << dendl;
1722 return 0;
1723 }
1724
1725 // we never truncate internal log files
1726 assert(h->file->fnode.ino > 1);
1727
1728 h->buffer_appender.flush();
1729
1730 // truncate off unflushed data?
1731 if (h->pos < offset &&
1732 h->pos + h->buffer.length() > offset) {
1733 bufferlist t;
1734 dout(20) << __func__ << " tossing out last " << offset - h->pos
1735 << " unflushed bytes" << dendl;
1736 t.substr_of(h->buffer, 0, offset - h->pos);
1737 h->buffer.swap(t);
1738 assert(0 == "actually this shouldn't happen");
1739 }
1740 if (h->buffer.length()) {
1741 int r = _flush(h, true);
1742 if (r < 0)
1743 return r;
1744 }
1745 if (offset == h->file->fnode.size) {
1746 return 0; // no-op!
1747 }
1748 if (offset > h->file->fnode.size) {
1749 assert(0 == "truncate up not supported");
1750 }
1751 assert(h->file->fnode.size >= offset);
1752 h->file->fnode.size = offset;
1753 log_t.op_file_update(h->file->fnode);
1754 return 0;
1755 }
1756
1757 int BlueFS::_fsync(FileWriter *h, std::unique_lock<std::mutex>& l)
1758 {
1759 dout(10) << __func__ << " " << h << " " << h->file->fnode << dendl;
1760 int r = _flush(h, true);
1761 if (r < 0)
1762 return r;
1763 uint64_t old_dirty_seq = h->file->dirty_seq;
1764
1765 _flush_bdev_safely(h);
1766
1767 if (old_dirty_seq) {
1768 uint64_t s = log_seq;
1769 dout(20) << __func__ << " file metadata was dirty (" << old_dirty_seq
1770 << ") on " << h->file->fnode << ", flushing log" << dendl;
1771 _flush_and_sync_log(l, old_dirty_seq);
1772 assert(h->file->dirty_seq == 0 || // cleaned
1773 h->file->dirty_seq > s); // or redirtied by someone else
1774 }
1775 return 0;
1776 }
1777
1778 void BlueFS::_flush_bdev_safely(FileWriter *h)
1779 {
1780 if (!cct->_conf->bluefs_sync_write) {
1781 list<aio_t> completed_ios;
1782 _claim_completed_aios(h, &completed_ios);
1783 lock.unlock();
1784 wait_for_aio(h);
1785 completed_ios.clear();
1786 flush_bdev();
1787 lock.lock();
1788 } else {
1789 lock.unlock();
1790 flush_bdev();
1791 lock.lock();
1792 }
1793 }
1794
1795 void BlueFS::flush_bdev()
1796 {
1797 // NOTE: this is safe to call without a lock.
1798 dout(20) << __func__ << dendl;
1799 for (auto p : bdev) {
1800 if (p)
1801 p->flush();
1802 }
1803 }
1804
1805 int BlueFS::_allocate(uint8_t id, uint64_t len,
1806 mempool::bluefs::vector<bluefs_extent_t> *ev)
1807 {
1808 dout(10) << __func__ << " len 0x" << std::hex << len << std::dec
1809 << " from " << (int)id << dendl;
1810 assert(id < alloc.size());
1811 uint64_t min_alloc_size = cct->_conf->bluefs_alloc_size;
1812
1813 uint64_t left = ROUND_UP_TO(len, min_alloc_size);
1814 int r = -ENOSPC;
1815 if (alloc[id]) {
1816 r = alloc[id]->reserve(left);
1817 }
1818 if (r < 0) {
1819 if (id != BDEV_SLOW) {
1820 if (bdev[id]) {
1821 dout(1) << __func__ << " failed to allocate 0x" << std::hex << left
1822 << " on bdev " << (int)id
1823 << ", free 0x" << alloc[id]->get_free()
1824 << "; fallback to bdev " << (int)id + 1
1825 << std::dec << dendl;
1826 }
1827 return _allocate(id + 1, len, ev);
1828 }
1829 if (bdev[id])
1830 derr << __func__ << " failed to allocate 0x" << std::hex << left
1831 << " on bdev " << (int)id
1832 << ", free 0x" << alloc[id]->get_free() << std::dec << dendl;
1833 else
1834 derr << __func__ << " failed to allocate 0x" << std::hex << left
1835 << " on bdev " << (int)id << ", dne" << std::dec << dendl;
1836 return r;
1837 }
1838
1839 uint64_t hint = 0;
1840 if (!ev->empty()) {
1841 hint = ev->back().end();
1842 }
1843
1844 AllocExtentVector extents;
1845 extents.reserve(4); // 4 should be (more than) enough for most allocations
1846 int64_t alloc_len = alloc[id]->allocate(left, min_alloc_size, hint,
1847 &extents);
1848 if (alloc_len < (int64_t)left) {
1849 derr << __func__ << " allocate failed on 0x" << std::hex << left
1850 << " min_alloc_size 0x" << min_alloc_size
1851 << " hint 0x" << hint << std::dec << dendl;
1852 alloc[id]->dump();
1853 assert(0 == "allocate failed... wtf");
1854 return -ENOSPC;
1855 }
1856
1857 for (auto& p : extents) {
1858 bluefs_extent_t e = bluefs_extent_t(id, p.offset, p.length);
1859 if (!ev->empty() &&
1860 ev->back().bdev == e.bdev &&
1861 ev->back().end() == (uint64_t) e.offset) {
1862 ev->back().length += e.length;
1863 } else {
1864 ev->push_back(e);
1865 }
1866 }
1867
1868 return 0;
1869 }
1870
1871 int BlueFS::_preallocate(FileRef f, uint64_t off, uint64_t len)
1872 {
1873 dout(10) << __func__ << " file " << f->fnode << " 0x"
1874 << std::hex << off << "~" << len << std::dec << dendl;
1875 if (f->deleted) {
1876 dout(10) << __func__ << " deleted, no-op" << dendl;
1877 return 0;
1878 }
1879 assert(f->fnode.ino > 1);
1880 uint64_t allocated = f->fnode.get_allocated();
1881 if (off + len > allocated) {
1882 uint64_t want = off + len - allocated;
1883 int r = _allocate(f->fnode.prefer_bdev, want, &f->fnode.extents);
1884 if (r < 0)
1885 return r;
1886 f->fnode.recalc_allocated();
1887 log_t.op_file_update(f->fnode);
1888 }
1889 return 0;
1890 }
1891
1892 void BlueFS::sync_metadata()
1893 {
1894 std::unique_lock<std::mutex> l(lock);
1895 if (log_t.empty()) {
1896 dout(10) << __func__ << " - no pending log events" << dendl;
1897 return;
1898 }
1899 dout(10) << __func__ << dendl;
1900 utime_t start = ceph_clock_now();
1901 vector<interval_set<uint64_t>> to_release(pending_release.size());
1902 to_release.swap(pending_release);
1903 flush_bdev(); // FIXME?
1904 _flush_and_sync_log(l);
1905 for (unsigned i = 0; i < to_release.size(); ++i) {
1906 for (auto p = to_release[i].begin(); p != to_release[i].end(); ++p) {
1907 alloc[i]->release(p.get_start(), p.get_len());
1908 }
1909 }
1910
1911 if (_should_compact_log()) {
1912 if (cct->_conf->bluefs_compact_log_sync) {
1913 _compact_log_sync();
1914 } else {
1915 _compact_log_async(l);
1916 }
1917 }
1918
1919 utime_t end = ceph_clock_now();
1920 utime_t dur = end - start;
1921 dout(10) << __func__ << " done in " << dur << dendl;
1922 }
1923
1924 int BlueFS::open_for_write(
1925 const string& dirname,
1926 const string& filename,
1927 FileWriter **h,
1928 bool overwrite)
1929 {
1930 std::lock_guard<std::mutex> l(lock);
1931 dout(10) << __func__ << " " << dirname << "/" << filename << dendl;
1932 map<string,DirRef>::iterator p = dir_map.find(dirname);
1933 DirRef dir;
1934 if (p == dir_map.end()) {
1935 // implicitly create the dir
1936 dout(20) << __func__ << " dir " << dirname
1937 << " does not exist" << dendl;
1938 return -ENOENT;
1939 } else {
1940 dir = p->second;
1941 }
1942
1943 FileRef file;
1944 bool create = false;
1945 map<string,FileRef>::iterator q = dir->file_map.find(filename);
1946 if (q == dir->file_map.end()) {
1947 if (overwrite) {
1948 dout(20) << __func__ << " dir " << dirname << " (" << dir
1949 << ") file " << filename
1950 << " does not exist" << dendl;
1951 return -ENOENT;
1952 }
1953 file = new File;
1954 file->fnode.ino = ++ino_last;
1955 file_map[ino_last] = file;
1956 dir->file_map[filename] = file;
1957 ++file->refs;
1958 create = true;
1959 } else {
1960 // overwrite existing file?
1961 file = q->second;
1962 if (overwrite) {
1963 dout(20) << __func__ << " dir " << dirname << " (" << dir
1964 << ") file " << filename
1965 << " already exists, overwrite in place" << dendl;
1966 } else {
1967 dout(20) << __func__ << " dir " << dirname << " (" << dir
1968 << ") file " << filename
1969 << " already exists, truncate + overwrite" << dendl;
1970 file->fnode.size = 0;
1971 for (auto& p : file->fnode.extents) {
1972 pending_release[p.bdev].insert(p.offset, p.length);
1973 }
1974 file->fnode.extents.clear();
1975 file->fnode.recalc_allocated();
1976 }
1977 }
1978 assert(file->fnode.ino > 1);
1979
1980 file->fnode.mtime = ceph_clock_now();
1981 file->fnode.prefer_bdev = BlueFS::BDEV_DB;
1982 if (dirname.length() > 5) {
1983 // the "db.slow" and "db.wal" directory names are hard-coded at
1984 // match up with bluestore. the slow device is always the second
1985 // one (when a dedicated block.db device is present and used at
1986 // bdev 0). the wal device is always last.
1987 if (boost::algorithm::ends_with(dirname, ".slow")) {
1988 file->fnode.prefer_bdev = BlueFS::BDEV_SLOW;
1989 } else if (boost::algorithm::ends_with(dirname, ".wal")) {
1990 file->fnode.prefer_bdev = BlueFS::BDEV_WAL;
1991 }
1992 }
1993 dout(20) << __func__ << " mapping " << dirname << "/" << filename
1994 << " to bdev " << (int)file->fnode.prefer_bdev << dendl;
1995
1996 log_t.op_file_update(file->fnode);
1997 if (create)
1998 log_t.op_dir_link(dirname, filename, file->fnode.ino);
1999
2000 *h = _create_writer(file);
2001
2002 if (boost::algorithm::ends_with(filename, ".log")) {
2003 (*h)->writer_type = BlueFS::WRITER_WAL;
2004 if (logger && !overwrite) {
2005 logger->inc(l_bluefs_files_written_wal);
2006 }
2007 } else if (boost::algorithm::ends_with(filename, ".sst")) {
2008 (*h)->writer_type = BlueFS::WRITER_SST;
2009 if (logger) {
2010 logger->inc(l_bluefs_files_written_sst);
2011 }
2012 }
2013
2014 dout(10) << __func__ << " h " << *h << " on " << file->fnode << dendl;
2015 return 0;
2016 }
2017
2018 BlueFS::FileWriter *BlueFS::_create_writer(FileRef f)
2019 {
2020 FileWriter *w = new FileWriter(f);
2021 for (unsigned i = 0; i < MAX_BDEV; ++i) {
2022 if (bdev[i]) {
2023 w->iocv[i] = new IOContext(cct, NULL);
2024 } else {
2025 w->iocv[i] = NULL;
2026 }
2027 }
2028 return w;
2029 }
2030
2031 void BlueFS::_close_writer(FileWriter *h)
2032 {
2033 dout(10) << __func__ << " " << h << " type " << h->writer_type << dendl;
2034 for (unsigned i=0; i<MAX_BDEV; ++i) {
2035 if (bdev[i]) {
2036 assert(h->iocv[i]);
2037 h->iocv[i]->aio_wait();
2038 bdev[i]->queue_reap_ioc(h->iocv[i]);
2039 }
2040 }
2041 delete h;
2042 }
2043
2044 int BlueFS::open_for_read(
2045 const string& dirname,
2046 const string& filename,
2047 FileReader **h,
2048 bool random)
2049 {
2050 std::lock_guard<std::mutex> l(lock);
2051 dout(10) << __func__ << " " << dirname << "/" << filename
2052 << (random ? " (random)":" (sequential)") << dendl;
2053 map<string,DirRef>::iterator p = dir_map.find(dirname);
2054 if (p == dir_map.end()) {
2055 dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
2056 return -ENOENT;
2057 }
2058 DirRef dir = p->second;
2059
2060 map<string,FileRef>::iterator q = dir->file_map.find(filename);
2061 if (q == dir->file_map.end()) {
2062 dout(20) << __func__ << " dir " << dirname << " (" << dir
2063 << ") file " << filename
2064 << " not found" << dendl;
2065 return -ENOENT;
2066 }
2067 File *file = q->second.get();
2068
2069 *h = new FileReader(file, random ? 4096 : cct->_conf->bluefs_max_prefetch,
2070 random, false);
2071 dout(10) << __func__ << " h " << *h << " on " << file->fnode << dendl;
2072 return 0;
2073 }
2074
2075 int BlueFS::rename(
2076 const string& old_dirname, const string& old_filename,
2077 const string& new_dirname, const string& new_filename)
2078 {
2079 std::lock_guard<std::mutex> l(lock);
2080 dout(10) << __func__ << " " << old_dirname << "/" << old_filename
2081 << " -> " << new_dirname << "/" << new_filename << dendl;
2082 map<string,DirRef>::iterator p = dir_map.find(old_dirname);
2083 if (p == dir_map.end()) {
2084 dout(20) << __func__ << " dir " << old_dirname << " not found" << dendl;
2085 return -ENOENT;
2086 }
2087 DirRef old_dir = p->second;
2088 map<string,FileRef>::iterator q = old_dir->file_map.find(old_filename);
2089 if (q == old_dir->file_map.end()) {
2090 dout(20) << __func__ << " dir " << old_dirname << " (" << old_dir
2091 << ") file " << old_filename
2092 << " not found" << dendl;
2093 return -ENOENT;
2094 }
2095 FileRef file = q->second;
2096
2097 p = dir_map.find(new_dirname);
2098 if (p == dir_map.end()) {
2099 dout(20) << __func__ << " dir " << new_dirname << " not found" << dendl;
2100 return -ENOENT;
2101 }
2102 DirRef new_dir = p->second;
2103 q = new_dir->file_map.find(new_filename);
2104 if (q != new_dir->file_map.end()) {
2105 dout(20) << __func__ << " dir " << new_dirname << " (" << old_dir
2106 << ") file " << new_filename
2107 << " already exists, unlinking" << dendl;
2108 assert(q->second != file);
2109 log_t.op_dir_unlink(new_dirname, new_filename);
2110 _drop_link(q->second);
2111 }
2112
2113 dout(10) << __func__ << " " << new_dirname << "/" << new_filename << " "
2114 << " " << file->fnode << dendl;
2115
2116 new_dir->file_map[new_filename] = file;
2117 old_dir->file_map.erase(old_filename);
2118
2119 log_t.op_dir_link(new_dirname, new_filename, file->fnode.ino);
2120 log_t.op_dir_unlink(old_dirname, old_filename);
2121 return 0;
2122 }
2123
2124 int BlueFS::mkdir(const string& dirname)
2125 {
2126 std::lock_guard<std::mutex> l(lock);
2127 dout(10) << __func__ << " " << dirname << dendl;
2128 map<string,DirRef>::iterator p = dir_map.find(dirname);
2129 if (p != dir_map.end()) {
2130 dout(20) << __func__ << " dir " << dirname << " exists" << dendl;
2131 return -EEXIST;
2132 }
2133 dir_map[dirname] = new Dir;
2134 log_t.op_dir_create(dirname);
2135 return 0;
2136 }
2137
2138 int BlueFS::rmdir(const string& dirname)
2139 {
2140 std::lock_guard<std::mutex> l(lock);
2141 dout(10) << __func__ << " " << dirname << dendl;
2142 map<string,DirRef>::iterator p = dir_map.find(dirname);
2143 if (p == dir_map.end()) {
2144 dout(20) << __func__ << " dir " << dirname << " does not exist" << dendl;
2145 return -ENOENT;
2146 }
2147 DirRef dir = p->second;
2148 if (!dir->file_map.empty()) {
2149 dout(20) << __func__ << " dir " << dirname << " not empty" << dendl;
2150 return -ENOTEMPTY;
2151 }
2152 dir_map.erase(dirname);
2153 log_t.op_dir_remove(dirname);
2154 return 0;
2155 }
2156
2157 bool BlueFS::dir_exists(const string& dirname)
2158 {
2159 std::lock_guard<std::mutex> l(lock);
2160 map<string,DirRef>::iterator p = dir_map.find(dirname);
2161 bool exists = p != dir_map.end();
2162 dout(10) << __func__ << " " << dirname << " = " << (int)exists << dendl;
2163 return exists;
2164 }
2165
2166 int BlueFS::stat(const string& dirname, const string& filename,
2167 uint64_t *size, utime_t *mtime)
2168 {
2169 std::lock_guard<std::mutex> l(lock);
2170 dout(10) << __func__ << " " << dirname << "/" << filename << dendl;
2171 map<string,DirRef>::iterator p = dir_map.find(dirname);
2172 if (p == dir_map.end()) {
2173 dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
2174 return -ENOENT;
2175 }
2176 DirRef dir = p->second;
2177 map<string,FileRef>::iterator q = dir->file_map.find(filename);
2178 if (q == dir->file_map.end()) {
2179 dout(20) << __func__ << " dir " << dirname << " (" << dir
2180 << ") file " << filename
2181 << " not found" << dendl;
2182 return -ENOENT;
2183 }
2184 File *file = q->second.get();
2185 dout(10) << __func__ << " " << dirname << "/" << filename
2186 << " " << file->fnode << dendl;
2187 if (size)
2188 *size = file->fnode.size;
2189 if (mtime)
2190 *mtime = file->fnode.mtime;
2191 return 0;
2192 }
2193
2194 int BlueFS::lock_file(const string& dirname, const string& filename,
2195 FileLock **plock)
2196 {
2197 std::lock_guard<std::mutex> l(lock);
2198 dout(10) << __func__ << " " << dirname << "/" << filename << dendl;
2199 map<string,DirRef>::iterator p = dir_map.find(dirname);
2200 if (p == dir_map.end()) {
2201 dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
2202 return -ENOENT;
2203 }
2204 DirRef dir = p->second;
2205 map<string,FileRef>::iterator q = dir->file_map.find(filename);
2206 File *file;
2207 if (q == dir->file_map.end()) {
2208 dout(20) << __func__ << " dir " << dirname << " (" << dir
2209 << ") file " << filename
2210 << " not found, creating" << dendl;
2211 file = new File;
2212 file->fnode.ino = ++ino_last;
2213 file->fnode.mtime = ceph_clock_now();
2214 file_map[ino_last] = file;
2215 dir->file_map[filename] = file;
2216 ++file->refs;
2217 log_t.op_file_update(file->fnode);
2218 log_t.op_dir_link(dirname, filename, file->fnode.ino);
2219 } else {
2220 file = q->second.get();
2221 if (file->locked) {
2222 dout(10) << __func__ << " already locked" << dendl;
2223 return -EBUSY;
2224 }
2225 }
2226 file->locked = true;
2227 *plock = new FileLock(file);
2228 dout(10) << __func__ << " locked " << file->fnode
2229 << " with " << *plock << dendl;
2230 return 0;
2231 }
2232
2233 int BlueFS::unlock_file(FileLock *fl)
2234 {
2235 std::lock_guard<std::mutex> l(lock);
2236 dout(10) << __func__ << " " << fl << " on " << fl->file->fnode << dendl;
2237 assert(fl->file->locked);
2238 fl->file->locked = false;
2239 delete fl;
2240 return 0;
2241 }
2242
2243 int BlueFS::readdir(const string& dirname, vector<string> *ls)
2244 {
2245 std::lock_guard<std::mutex> l(lock);
2246 dout(10) << __func__ << " " << dirname << dendl;
2247 if (dirname.empty()) {
2248 // list dirs
2249 ls->reserve(dir_map.size() + 2);
2250 for (auto& q : dir_map) {
2251 ls->push_back(q.first);
2252 }
2253 } else {
2254 // list files in dir
2255 map<string,DirRef>::iterator p = dir_map.find(dirname);
2256 if (p == dir_map.end()) {
2257 dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
2258 return -ENOENT;
2259 }
2260 DirRef dir = p->second;
2261 ls->reserve(dir->file_map.size() + 2);
2262 for (auto& q : dir->file_map) {
2263 ls->push_back(q.first);
2264 }
2265 }
2266 ls->push_back(".");
2267 ls->push_back("..");
2268 return 0;
2269 }
2270
2271 int BlueFS::unlink(const string& dirname, const string& filename)
2272 {
2273 std::lock_guard<std::mutex> l(lock);
2274 dout(10) << __func__ << " " << dirname << "/" << filename << dendl;
2275 map<string,DirRef>::iterator p = dir_map.find(dirname);
2276 if (p == dir_map.end()) {
2277 dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
2278 return -ENOENT;
2279 }
2280 DirRef dir = p->second;
2281 map<string,FileRef>::iterator q = dir->file_map.find(filename);
2282 if (q == dir->file_map.end()) {
2283 dout(20) << __func__ << " file " << dirname << "/" << filename
2284 << " not found" << dendl;
2285 return -ENOENT;
2286 }
2287 FileRef file = q->second;
2288 if (file->locked) {
2289 dout(20) << __func__ << " file " << dirname << "/" << filename
2290 << " is locked" << dendl;
2291 return -EBUSY;
2292 }
2293 dir->file_map.erase(filename);
2294 log_t.op_dir_unlink(dirname, filename);
2295 _drop_link(file);
2296 return 0;
2297 }
2298
2299 bool BlueFS::wal_is_rotational()
2300 {
2301 if (!bdev[BDEV_WAL] || bdev[BDEV_WAL]->is_rotational())
2302 return true;
2303 return false;
2304 }