]> git.proxmox.com Git - ceph.git/blob - ceph/src/os/bluestore/BlueFS.cc
update source to 12.2.11
[ceph.git] / ceph / src / os / bluestore / BlueFS.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3
4 #include "boost/algorithm/string.hpp"
5 #include "BlueFS.h"
6
7 #include "common/debug.h"
8 #include "common/errno.h"
9 #include "common/perf_counters.h"
10 #include "BlockDevice.h"
11 #include "Allocator.h"
12 #include "include/assert.h"
13
14 #define dout_context cct
15 #define dout_subsys ceph_subsys_bluefs
16 #undef dout_prefix
17 #define dout_prefix *_dout << "bluefs "
18
// Instantiate the mempool object factories for the BlueFS helper types
// (File, Dir, FileWriter, FileReaderBuffer, FileReader, FileLock).
// NOTE(review): the macro arguments look like (type, factory name, pool
// name) -- confirm against the mempool header, which is not visible here.
19 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::File, bluefs_file, bluefs);
20 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::Dir, bluefs_dir, bluefs);
21 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileWriter, bluefs_file_writer, bluefs);
22 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileReaderBuffer,
23 bluefs_file_reader_buffer, bluefs);
24 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileReader, bluefs_file_reader, bluefs);
25 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileLock, bluefs_file_lock, bluefs);
26
27
28 BlueFS::BlueFS(CephContext* cct)
29 : cct(cct),
30 bdev(MAX_BDEV),
31 ioc(MAX_BDEV),
32 block_all(MAX_BDEV),
33 block_total(MAX_BDEV, 0)
34 {
35 }
36
37 BlueFS::~BlueFS()
38 {
39 for (auto p : ioc) {
40 if (p)
41 p->aio_wait();
42 }
43 for (auto p : bdev) {
44 if (p) {
45 p->close();
46 delete p;
47 }
48 }
49 for (auto p : ioc) {
50 delete p;
51 }
52 }
53
54 void BlueFS::_init_logger()
55 {
56 PerfCountersBuilder b(cct, "bluefs",
57 l_bluefs_first, l_bluefs_last);
58 b.add_u64_counter(l_bluefs_gift_bytes, "gift_bytes",
59 "Bytes gifted from BlueStore", NULL, 0, unit_t(BYTES));
60 b.add_u64_counter(l_bluefs_reclaim_bytes, "reclaim_bytes",
61 "Bytes reclaimed by BlueStore", NULL, 0, unit_t(BYTES));
62 b.add_u64(l_bluefs_db_total_bytes, "db_total_bytes",
63 "Total bytes (main db device)",
64 "b", PerfCountersBuilder::PRIO_USEFUL, unit_t(BYTES));
65 b.add_u64(l_bluefs_db_used_bytes, "db_used_bytes",
66 "Used bytes (main db device)",
67 "u", PerfCountersBuilder::PRIO_USEFUL, unit_t(BYTES));
68 b.add_u64(l_bluefs_wal_total_bytes, "wal_total_bytes",
69 "Total bytes (wal device)",
70 "walb", PerfCountersBuilder::PRIO_USEFUL, unit_t(BYTES));
71 b.add_u64(l_bluefs_wal_used_bytes, "wal_used_bytes",
72 "Used bytes (wal device)",
73 "walu", PerfCountersBuilder::PRIO_USEFUL, unit_t(BYTES));
74 b.add_u64(l_bluefs_slow_total_bytes, "slow_total_bytes",
75 "Total bytes (slow device)",
76 "slob", PerfCountersBuilder::PRIO_USEFUL, unit_t(BYTES));
77 b.add_u64(l_bluefs_slow_used_bytes, "slow_used_bytes",
78 "Used bytes (slow device)",
79 "slou", PerfCountersBuilder::PRIO_USEFUL, unit_t(BYTES));
80 b.add_u64(l_bluefs_num_files, "num_files", "File count",
81 "f", PerfCountersBuilder::PRIO_USEFUL);
82 b.add_u64(l_bluefs_log_bytes, "log_bytes", "Size of the metadata log",
83 "jlen", PerfCountersBuilder::PRIO_INTERESTING, unit_t(BYTES));
84 b.add_u64_counter(l_bluefs_log_compactions, "log_compactions",
85 "Compactions of the metadata log");
86 b.add_u64_counter(l_bluefs_logged_bytes, "logged_bytes",
87 "Bytes written to the metadata log", "j",
88 PerfCountersBuilder::PRIO_CRITICAL, unit_t(BYTES));
89 b.add_u64_counter(l_bluefs_files_written_wal, "files_written_wal",
90 "Files written to WAL");
91 b.add_u64_counter(l_bluefs_files_written_sst, "files_written_sst",
92 "Files written to SSTs");
93 b.add_u64_counter(l_bluefs_bytes_written_wal, "bytes_written_wal",
94 "Bytes written to WAL", "wal",
95 PerfCountersBuilder::PRIO_CRITICAL);
96 b.add_u64_counter(l_bluefs_bytes_written_sst, "bytes_written_sst",
97 "Bytes written to SSTs", "sst",
98 PerfCountersBuilder::PRIO_CRITICAL, unit_t(BYTES));
99 logger = b.create_perf_counters();
100 cct->get_perfcounters_collection()->add(logger);
101 }
102
103 void BlueFS::_shutdown_logger()
104 {
105 cct->get_perfcounters_collection()->remove(logger);
106 delete logger;
107 }
108
109 void BlueFS::_update_logger_stats()
110 {
111 // we must be holding the lock
112 logger->set(l_bluefs_num_files, file_map.size());
113 logger->set(l_bluefs_log_bytes, log_writer->file->fnode.size);
114
115 if (alloc[BDEV_WAL]) {
116 logger->set(l_bluefs_wal_total_bytes, block_total[BDEV_WAL]);
117 logger->set(l_bluefs_wal_used_bytes,
118 block_total[BDEV_WAL] - alloc[BDEV_WAL]->get_free());
119 }
120 if (alloc[BDEV_DB]) {
121 logger->set(l_bluefs_db_total_bytes, block_total[BDEV_DB]);
122 logger->set(l_bluefs_db_used_bytes,
123 block_total[BDEV_DB] - alloc[BDEV_DB]->get_free());
124 }
125 if (alloc[BDEV_SLOW]) {
126 logger->set(l_bluefs_slow_total_bytes, block_total[BDEV_SLOW]);
127 logger->set(l_bluefs_slow_used_bytes,
128 block_total[BDEV_SLOW] - alloc[BDEV_SLOW]->get_free());
129 }
130 }
131
132 int BlueFS::add_block_device(unsigned id, const string& path)
133 {
134 dout(10) << __func__ << " bdev " << id << " path " << path << dendl;
135 assert(id < bdev.size());
136 assert(bdev[id] == NULL);
137 BlockDevice *b = BlockDevice::create(cct, path, NULL, NULL);
138 int r = b->open(path);
139 if (r < 0) {
140 delete b;
141 return r;
142 }
143 dout(1) << __func__ << " bdev " << id << " path " << path
144 << " size " << byte_u_t(b->get_size()) << dendl;
145 bdev[id] = b;
146 ioc[id] = new IOContext(cct, NULL);
147 return 0;
148 }
149
150 bool BlueFS::bdev_support_label(unsigned id)
151 {
152 assert(id < bdev.size());
153 assert(bdev[id]);
154 return bdev[id]->supported_bdev_label();
155 }
156
157 uint64_t BlueFS::get_block_device_size(unsigned id)
158 {
159 if (id < bdev.size() && bdev[id])
160 return bdev[id]->get_size();
161 return 0;
162 }
163
164 void BlueFS::add_block_extent(unsigned id, uint64_t offset, uint64_t length)
165 {
166 std::unique_lock<std::mutex> l(lock);
167 dout(1) << __func__ << " bdev " << id
168 << " 0x" << std::hex << offset << "~" << length << std::dec
169 << dendl;
170 assert(id < bdev.size());
171 assert(bdev[id]);
172 assert(bdev[id]->get_size() >= offset + length);
173 block_all[id].insert(offset, length);
174 block_total[id] += length;
175
176 if (id < alloc.size() && alloc[id]) {
177 log_t.op_alloc_add(id, offset, length);
178 int r = _flush_and_sync_log(l);
179 assert(r == 0);
180 alloc[id]->init_add_free(offset, length);
181 }
182
183 if (logger)
184 logger->inc(l_bluefs_gift_bytes, length);
185 dout(10) << __func__ << " done" << dendl;
186 }
187
188 int BlueFS::reclaim_blocks(unsigned id, uint64_t want,
189 AllocExtentVector *extents)
190 {
191 std::unique_lock<std::mutex> l(lock);
192 dout(1) << __func__ << " bdev " << id
193 << " want 0x" << std::hex << want << std::dec << dendl;
194 assert(id < alloc.size());
195 assert(alloc[id]);
196 int r = alloc[id]->reserve(want);
197 assert(r == 0); // caller shouldn't ask for more than they can get
198 int64_t got = alloc[id]->allocate(want, cct->_conf->bluefs_alloc_size, 0,
199 extents);
200 if (got < (int64_t)want) {
201 alloc[id]->unreserve(want - MAX(0, got));
202 }
203 if (got <= 0) {
204 derr << __func__ << " failed to allocate space to return to bluestore"
205 << dendl;
206 alloc[id]->dump();
207 return got;
208 }
209
210 for (auto& p : *extents) {
211 block_all[id].erase(p.offset, p.length);
212 block_total[id] -= p.length;
213 log_t.op_alloc_rm(id, p.offset, p.length);
214 }
215
216 flush_bdev();
217 r = _flush_and_sync_log(l);
218 assert(r == 0);
219
220 if (logger)
221 logger->inc(l_bluefs_reclaim_bytes, got);
222 dout(1) << __func__ << " bdev " << id << " want 0x" << std::hex << want
223 << " got " << *extents << dendl;
224 return 0;
225 }
226
227 uint64_t BlueFS::get_fs_usage()
228 {
229 std::lock_guard<std::mutex> l(lock);
230 uint64_t total_bytes = 0;
231 for (auto& p : file_map) {
232 total_bytes += p.second->fnode.get_allocated();
233 }
234 return total_bytes;
235 }
236
237 uint64_t BlueFS::get_total(unsigned id)
238 {
239 std::lock_guard<std::mutex> l(lock);
240 assert(id < block_all.size());
241 return block_total[id];
242 }
243
244 uint64_t BlueFS::get_free(unsigned id)
245 {
246 std::lock_guard<std::mutex> l(lock);
247 assert(id < alloc.size());
248 return alloc[id]->get_free();
249 }
250
251 void BlueFS::dump_perf_counters(Formatter *f)
252 {
253 f->open_object_section("bluefs_perf_counters");
254 logger->dump_formatted(f,0);
255 f->close_section();
256 }
257
258 void BlueFS::dump_block_extents(ostream& out)
259 {
260 for (unsigned i = 0; i < MAX_BDEV; ++i) {
261 if (!bdev[i]) {
262 continue;
263 }
264 out << i << " : size 0x" << std::hex << bdev[i]->get_size()
265 << " : own 0x" << block_all[i] << std::dec << "\n";
266 }
267 }
268
269 void BlueFS::get_usage(vector<pair<uint64_t,uint64_t>> *usage)
270 {
271 std::lock_guard<std::mutex> l(lock);
272 usage->resize(bdev.size());
273 for (unsigned id = 0; id < bdev.size(); ++id) {
274 if (!bdev[id]) {
275 (*usage)[id] = make_pair(0, 0);
276 continue;
277 }
278 (*usage)[id].first = alloc[id]->get_free();
279 (*usage)[id].second = block_total[id];
280 uint64_t used =
281 (block_total[id] - (*usage)[id].first) * 100 / block_total[id];
282 dout(10) << __func__ << " bdev " << id
283 << " free " << (*usage)[id].first
284 << " (" << byte_u_t((*usage)[id].first) << ")"
285 << " / " << (*usage)[id].second
286 << " (" << byte_u_t((*usage)[id].second) << ")"
287 << ", used " << used << "%"
288 << dendl;
289 }
290 }
291
292 int BlueFS::get_block_extents(unsigned id, interval_set<uint64_t> *extents)
293 {
294 std::lock_guard<std::mutex> l(lock);
295 dout(10) << __func__ << " bdev " << id << dendl;
296 if (id >= block_all.size())
297 return -EINVAL;
298 *extents = block_all[id];
299 return 0;
300 }
301
302 // returns true if specified device is attached
303 bool BlueFS::is_device(unsigned id)
304 {
305 return !(id >= MAX_BDEV || bdev[id] == nullptr);
306 }
307
308 // returns true if specified device is under full bluefs control
309 // and hence can be expanded
310 bool BlueFS::is_device_expandable(unsigned id)
311 {
312 if (id >= MAX_BDEV || bdev[id] == nullptr) {
313 return false;
314 }
315 switch(id) {
316 case BDEV_WAL:
317 return true;
318
319 case BDEV_DB:
320 // true if DB volume is non-shared
321 return bdev[BDEV_SLOW] != nullptr;
322 }
323 return false;
324 }
325
326 int BlueFS::mkfs(uuid_d osd_uuid)
327 {
328 std::unique_lock<std::mutex> l(lock);
329 dout(1) << __func__
330 << " osd_uuid " << osd_uuid
331 << dendl;
332
333 _init_alloc();
334 _init_logger();
335
336 super.version = 1;
337 super.block_size = bdev[BDEV_DB]->get_block_size();
338 super.osd_uuid = osd_uuid;
339 super.uuid.generate_random();
340 dout(1) << __func__ << " uuid " << super.uuid << dendl;
341
342 // init log
343 FileRef log_file = new File;
344 log_file->fnode.ino = 1;
345 log_file->fnode.prefer_bdev = BDEV_WAL;
346 int r = _allocate(
347 log_file->fnode.prefer_bdev,
348 cct->_conf->bluefs_max_log_runway,
349 &log_file->fnode);
350 assert(r == 0);
351 log_writer = _create_writer(log_file);
352
353 // initial txn
354 log_t.op_init();
355 for (unsigned bdev = 0; bdev < MAX_BDEV; ++bdev) {
356 interval_set<uint64_t>& p = block_all[bdev];
357 if (p.empty())
358 continue;
359 for (interval_set<uint64_t>::iterator q = p.begin(); q != p.end(); ++q) {
360 dout(20) << __func__ << " op_alloc_add " << bdev << " 0x"
361 << std::hex << q.get_start() << "~" << q.get_len() << std::dec
362 << dendl;
363 log_t.op_alloc_add(bdev, q.get_start(), q.get_len());
364 }
365 }
366 _flush_and_sync_log(l);
367
368 // write supers
369 super.log_fnode = log_file->fnode;
370 _write_super();
371 flush_bdev();
372
373 // clean up
374 super = bluefs_super_t();
375 _close_writer(log_writer);
376 log_writer = NULL;
377 block_all.clear();
378 block_total.clear();
379 _stop_alloc();
380 _shutdown_logger();
381
382 dout(10) << __func__ << " success" << dendl;
383 return 0;
384 }
385
386 void BlueFS::_init_alloc()
387 {
388 dout(20) << __func__ << dendl;
389 alloc.resize(MAX_BDEV);
390 pending_release.resize(MAX_BDEV);
391 for (unsigned id = 0; id < bdev.size(); ++id) {
392 if (!bdev[id]) {
393 continue;
394 }
395 assert(bdev[id]->get_size());
396 alloc[id] = Allocator::create(cct, cct->_conf->bluefs_allocator,
397 bdev[id]->get_size(),
398 cct->_conf->bluefs_alloc_size);
399 interval_set<uint64_t>& p = block_all[id];
400 for (interval_set<uint64_t>::iterator q = p.begin(); q != p.end(); ++q) {
401 alloc[id]->init_add_free(q.get_start(), q.get_len());
402 }
403 }
404 }
405
406 void BlueFS::_stop_alloc()
407 {
408 dout(20) << __func__ << dendl;
409 for (auto p : alloc) {
410 if (p != nullptr) {
411 p->shutdown();
412 delete p;
413 }
414 }
415 alloc.clear();
416 }
417
418 int BlueFS::mount()
419 {
420 dout(1) << __func__ << dendl;
421
422 int r = _open_super();
423 if (r < 0) {
424 derr << __func__ << " failed to open super: " << cpp_strerror(r) << dendl;
425 goto out;
426 }
427
428 block_all.clear();
429 block_all.resize(MAX_BDEV);
430 block_total.clear();
431 block_total.resize(MAX_BDEV, 0);
432 _init_alloc();
433
434 r = _replay(false);
435 if (r < 0) {
436 derr << __func__ << " failed to replay log: " << cpp_strerror(r) << dendl;
437 _stop_alloc();
438 goto out;
439 }
440
441 // init freelist
442 for (auto& p : file_map) {
443 dout(30) << __func__ << " noting alloc for " << p.second->fnode << dendl;
444 for (auto& q : p.second->fnode.extents) {
445 alloc[q.bdev]->init_rm_free(q.offset, q.length);
446 }
447 }
448
449 // set up the log for future writes
450 log_writer = _create_writer(_get_file(1));
451 assert(log_writer->file->fnode.ino == 1);
452 log_writer->pos = log_writer->file->fnode.size;
453 dout(10) << __func__ << " log write pos set to 0x"
454 << std::hex << log_writer->pos << std::dec
455 << dendl;
456
457 _init_logger();
458 return 0;
459
460 out:
461 super = bluefs_super_t();
462 return r;
463 }
464
465 void BlueFS::umount()
466 {
467 dout(1) << __func__ << dendl;
468
469 sync_metadata();
470
471 _close_writer(log_writer);
472 log_writer = NULL;
473
474 _stop_alloc();
475 file_map.clear();
476 dir_map.clear();
477 super = bluefs_super_t();
478 log_t.clear();
479 _shutdown_logger();
480 }
481
482 void BlueFS::collect_metadata(map<string,string> *pm)
483 {
484 if (bdev[BDEV_DB])
485 bdev[BDEV_DB]->collect_metadata("bluefs_db_", pm);
486 if (bdev[BDEV_WAL])
487 bdev[BDEV_WAL]->collect_metadata("bluefs_wal_", pm);
488 if (bdev[BDEV_SLOW])
489 bdev[BDEV_SLOW]->collect_metadata("bluefs_slow_", pm);
490 }
491
492 int BlueFS::fsck()
493 {
494 std::lock_guard<std::mutex> l(lock);
495 dout(1) << __func__ << dendl;
496 // hrm, i think we check everything on mount...
497 return 0;
498 }
499
500 int BlueFS::_write_super()
501 {
502 // build superblock
503 bufferlist bl;
504 ::encode(super, bl);
505 uint32_t crc = bl.crc32c(-1);
506 ::encode(crc, bl);
507 dout(10) << __func__ << " super block length(encoded): " << bl.length() << dendl;
508 dout(10) << __func__ << " superblock " << super.version << dendl;
509 dout(10) << __func__ << " log_fnode " << super.log_fnode << dendl;
510 assert(bl.length() <= get_super_length());
511 bl.append_zero(get_super_length() - bl.length());
512
513 bdev[BDEV_DB]->write(get_super_offset(), bl, false);
514 dout(20) << __func__ << " v " << super.version
515 << " crc 0x" << std::hex << crc
516 << " offset 0x" << get_super_offset() << std::dec
517 << dendl;
518 return 0;
519 }
520
521 int BlueFS::_open_super()
522 {
523 dout(10) << __func__ << dendl;
524
525 bufferlist bl;
526 uint32_t expected_crc, crc;
527 int r;
528
529 // always the second block
530 r = bdev[BDEV_DB]->read(get_super_offset(), get_super_length(),
531 &bl, ioc[BDEV_DB], false);
532 if (r < 0)
533 return r;
534
535 bufferlist::iterator p = bl.begin();
536 ::decode(super, p);
537 {
538 bufferlist t;
539 t.substr_of(bl, 0, p.get_off());
540 crc = t.crc32c(-1);
541 }
542 ::decode(expected_crc, p);
543 if (crc != expected_crc) {
544 derr << __func__ << " bad crc on superblock, expected 0x"
545 << std::hex << expected_crc << " != actual 0x" << crc << std::dec
546 << dendl;
547 return -EIO;
548 }
549 dout(10) << __func__ << " superblock " << super.version << dendl;
550 dout(10) << __func__ << " log_fnode " << super.log_fnode << dendl;
551 return 0;
552 }
553
// Replay the metadata log described by super.log_fnode.
//
// The log is read one record at a time.  Each record is decoded into a
// bluefs_transaction_t and its ops are walked in order; every op is
// logged and, unless `noop` is set, applied to the in-memory state
// (block_all / block_total / alloc, dir_map, file_map, ino_last).  With
// `noop` a scratch File is used for the log instead of ino 1 from
// file_map.  ino_last and log_seq are reset at entry in both modes.
//
// Replay ends without error at the first record whose uuid or sequence
// number does not match what is expected, or whose declared length
// cannot be fully read.
//
// Returns 0 on success, or -EIO when a transaction fails to decode, an
// unrecognized op is found, or (non-noop only) a file other than ino 1
// is left with a link count of zero.
554 int BlueFS::_replay(bool noop)
555 {
556 dout(10) << __func__ << (noop ? " NO-OP" : "") << dendl;
557 ino_last = 1; // by the log
558 log_seq = 0;
559
560 FileRef log_file;
561 if (noop) {
562 log_file = new File;
563 } else {
564 log_file = _get_file(1);
565 }
566 log_file->fnode = super.log_fnode;
567 dout(10) << __func__ << " log_fnode " << super.log_fnode << dendl;
568
// The reader is owned by this function; every return path below deletes it.
569 FileReader *log_reader = new FileReader(
570 log_file, cct->_conf->bluefs_max_prefetch,
571 false, // !random
572 true); // ignore eof
573 while (true) {
// each record must start on a block boundary
574 assert((log_reader->buf.pos & ~super.block_mask()) == 0);
575 uint64_t pos = log_reader->buf.pos;
576 uint64_t read_pos = pos;
577 bufferlist bl;
578 {
// read the first block of the record
579 int r = _read(log_reader, &log_reader->buf, read_pos, super.block_size,
580 &bl, NULL);
581 assert(r == (int)super.block_size);
582 read_pos += r;
583 }
584 uint64_t more = 0;
585 uint64_t seq;
586 uuid_d uuid;
587 {
// Peek at the record header: two one-byte fields, a 32-bit length, then
// the uuid and sequence number.
// NOTE(review): the "+ 6" below presumably accounts for the two __u8
// fields plus the 4-byte length decoded here, i.e. `len` counts the bytes
// that follow them -- confirm against the transaction encoder.
588 bufferlist::iterator p = bl.begin();
589 __u8 a, b;
590 uint32_t len;
591 ::decode(a, p);
592 ::decode(b, p);
593 ::decode(len, p);
594 ::decode(uuid, p);
595 ::decode(seq, p);
596 if (len + 6 > bl.length()) {
597 more = ROUND_UP_TO(len + 6 - bl.length(), super.block_size);
598 }
599 }
// a foreign uuid or an unexpected sequence number marks the end of the log
600 if (uuid != super.uuid) {
601 dout(10) << __func__ << " 0x" << std::hex << pos << std::dec
602 << ": stop: uuid " << uuid << " != super.uuid " << super.uuid
603 << dendl;
604 break;
605 }
606 if (seq != log_seq + 1) {
607 dout(10) << __func__ << " 0x" << std::hex << pos << std::dec
608 << ": stop: seq " << seq << " != expected " << log_seq + 1
609 << dendl;
610 break;
611 }
// the record spans more than one block: read the remainder
612 if (more) {
613 dout(20) << __func__ << " need 0x" << std::hex << more << std::dec
614 << " more bytes" << dendl;
615 bufferlist t;
616 int r = _read(log_reader, &log_reader->buf, read_pos, more, &t, NULL);
617 if (r < (int)more) {
618 dout(10) << __func__ << " 0x" << std::hex << pos
619 << ": stop: len is 0x" << bl.length() + more << std::dec
620 << ", which is past eof" << dendl;
621 break;
622 }
623 assert(r == (int)more);
624 bl.claim_append(t);
625 read_pos += r;
626 }
// decode the whole transaction; only this decode is guarded by try/catch
627 bluefs_transaction_t t;
628 try {
629 bufferlist::iterator p = bl.begin();
630 ::decode(t, p);
631 }
632 catch (buffer::error& e) {
633 dout(10) << __func__ << " 0x" << std::hex << pos << std::dec
634 << ": stop: failed to decode: " << e.what()
635 << dendl;
636 delete log_reader;
637 return -EIO;
638 }
639 assert(seq == t.seq);
640 dout(10) << __func__ << " 0x" << std::hex << pos << std::dec
641 << ": " << t << dendl;
642
// walk the ops encoded in the transaction
643 bufferlist::iterator p = t.op_bl.begin();
644 while (!p.end()) {
645 __u8 op;
646 ::decode(op, p);
647 switch (op) {
648
649 case bluefs_transaction_t::OP_INIT:
650 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
651 << ": op_init" << dendl;
652 assert(t.seq == 1);
653 break;
654
// OP_JUMP: continue with sequence `next_seq` at log offset `offset`;
// the bytes between the current read position and `offset` are read
// and discarded.
655 case bluefs_transaction_t::OP_JUMP:
656 {
657 uint64_t next_seq;
658 uint64_t offset;
659 ::decode(next_seq, p);
660 ::decode(offset, p);
661 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
662 << ": op_jump seq " << next_seq
663 << " offset 0x" << std::hex << offset << std::dec << dendl;
664 assert(next_seq >= log_seq);
665 log_seq = next_seq - 1; // we will increment it below
666 uint64_t skip = offset - read_pos;
667 if (skip) {
668 bufferlist junk;
669 int r = _read(log_reader, &log_reader->buf, read_pos, skip, &junk,
670 NULL);
671 if (r != (int)skip) {
672 dout(10) << __func__ << " 0x" << std::hex << read_pos
673 << ": stop: failed to skip to " << offset
674 << std::dec << dendl;
675 assert(0 == "problem with op_jump");
676 }
677 }
678 }
679 break;
680
// OP_JUMP_SEQ: only the expected sequence number changes
681 case bluefs_transaction_t::OP_JUMP_SEQ:
682 {
683 uint64_t next_seq;
684 ::decode(next_seq, p);
685 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
686 << ": op_jump_seq " << next_seq << dendl;
687 assert(next_seq >= log_seq);
688 log_seq = next_seq - 1; // we will increment it below
689 }
690 break;
691
// OP_ALLOC_ADD: an extent on device `id` was given to bluefs
692 case bluefs_transaction_t::OP_ALLOC_ADD:
693 {
694 __u8 id;
695 uint64_t offset, length;
696 ::decode(id, p);
697 ::decode(offset, p);
698 ::decode(length, p);
699 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
700 << ": op_alloc_add " << " " << (int)id
701 << ":0x" << std::hex << offset << "~" << length << std::dec
702 << dendl;
703 if (!noop) {
704 block_all[id].insert(offset, length);
705 block_total[id] += length;
706 alloc[id]->init_add_free(offset, length);
707 }
708 }
709 break;
710
// OP_ALLOC_RM: an extent on device `id` was taken away from bluefs
711 case bluefs_transaction_t::OP_ALLOC_RM:
712 {
713 __u8 id;
714 uint64_t offset, length;
715 ::decode(id, p);
716 ::decode(offset, p);
717 ::decode(length, p);
718 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
719 << ": op_alloc_rm " << " " << (int)id
720 << ":0x" << std::hex << offset << "~" << length << std::dec
721 << dendl;
722 if (!noop) {
723 block_all[id].erase(offset, length);
724 block_total[id] -= length;
725 alloc[id]->init_rm_free(offset, length);
726 }
727 }
728 break;
729
// OP_DIR_LINK: add `filename` -> ino to an existing directory; the name
// must not already be present, and the file's link count is bumped
730 case bluefs_transaction_t::OP_DIR_LINK:
731 {
732 string dirname, filename;
733 uint64_t ino;
734 ::decode(dirname, p);
735 ::decode(filename, p);
736 ::decode(ino, p);
737 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
738 << ": op_dir_link " << " " << dirname << "/" << filename
739 << " to " << ino
740 << dendl;
741 if (!noop) {
742 FileRef file = _get_file(ino);
743 assert(file->fnode.ino);
744 map<string,DirRef>::iterator q = dir_map.find(dirname);
745 assert(q != dir_map.end());
746 map<string,FileRef>::iterator r = q->second->file_map.find(filename);
747 assert(r == q->second->file_map.end());
748 q->second->file_map[filename] = file;
749 ++file->refs;
750 }
751 }
752 break;
753
// OP_DIR_UNLINK: remove an existing name from an existing directory and
// drop one link from the file it referred to
754 case bluefs_transaction_t::OP_DIR_UNLINK:
755 {
756 string dirname, filename;
757 ::decode(dirname, p);
758 ::decode(filename, p);
759 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
760 << ": op_dir_unlink " << " " << dirname << "/" << filename
761 << dendl;
762 if (!noop) {
763 map<string,DirRef>::iterator q = dir_map.find(dirname);
764 assert(q != dir_map.end());
765 map<string,FileRef>::iterator r = q->second->file_map.find(filename);
766 assert(r != q->second->file_map.end());
767 assert(r->second->refs > 0);
768 --r->second->refs;
769 q->second->file_map.erase(r);
770 }
771 }
772 break;
773
// OP_DIR_CREATE: the directory must not exist yet
774 case bluefs_transaction_t::OP_DIR_CREATE:
775 {
776 string dirname;
777 ::decode(dirname, p);
778 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
779 << ": op_dir_create " << dirname << dendl;
780 if (!noop) {
781 map<string,DirRef>::iterator q = dir_map.find(dirname);
782 assert(q == dir_map.end());
783 dir_map[dirname] = new Dir;
784 }
785 }
786 break;
787
// OP_DIR_REMOVE: the directory must exist and be empty
788 case bluefs_transaction_t::OP_DIR_REMOVE:
789 {
790 string dirname;
791 ::decode(dirname, p);
792 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
793 << ": op_dir_remove " << dirname << dendl;
794 if (!noop) {
795 map<string,DirRef>::iterator q = dir_map.find(dirname);
796 assert(q != dir_map.end());
797 assert(q->second->file_map.empty());
798 dir_map.erase(q);
799 }
800 }
801 break;
802
// OP_FILE_UPDATE: replace (or create) the fnode for this ino and track
// the highest ino seen
803 case bluefs_transaction_t::OP_FILE_UPDATE:
804 {
805 bluefs_fnode_t fnode;
806 ::decode(fnode, p);
807 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
808 << ": op_file_update " << " " << fnode << dendl;
809 if (!noop) {
810 FileRef f = _get_file(fnode.ino);
811 f->fnode = fnode;
812 if (fnode.ino > ino_last) {
813 ino_last = fnode.ino;
814 }
815 }
816 }
817 break;
818
// OP_FILE_REMOVE: the ino must be present in file_map
819 case bluefs_transaction_t::OP_FILE_REMOVE:
820 {
821 uint64_t ino;
822 ::decode(ino, p);
823 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
824 << ": op_file_remove " << ino << dendl;
825 if (!noop) {
// NOTE: this `p` shadows the op iterator `p` for the rest of this scope
826 auto p = file_map.find(ino);
827 assert(p != file_map.end());
828 file_map.erase(p);
829 }
830 }
831 break;
832
833 default:
834 derr << __func__ << " 0x" << std::hex << pos << std::dec
835 << ": stop: unrecognized op " << (int)op << dendl;
836 delete log_reader;
837 return -EIO;
838 }
839 }
840 assert(p.end());
841
842 // we successfully replayed the transaction; bump the seq and log size
843 ++log_seq;
844 log_file->fnode.size = log_reader->buf.pos;
845 }
846
847 dout(10) << __func__ << " log file size was 0x"
848 << std::hex << log_file->fnode.size << std::dec << dendl;
849 delete log_reader;
850
851 if (!noop) {
852 // verify file link counts are all >0
// (ino 1, the log file itself, is exempt)
853 for (auto& p : file_map) {
854 if (p.second->refs == 0 &&
855 p.second->fnode.ino > 1) {
856 derr << __func__ << " file with link count 0: " << p.second->fnode
857 << dendl;
858 return -EIO;
859 }
860 }
861 }
862
863 dout(10) << __func__ << " done" << dendl;
864 return 0;
865 }
866
867 BlueFS::FileRef BlueFS::_get_file(uint64_t ino)
868 {
869 auto p = file_map.find(ino);
870 if (p == file_map.end()) {
871 FileRef f = new File;
872 file_map[ino] = f;
873 dout(30) << __func__ << " ino " << ino << " = " << f
874 << " (new)" << dendl;
875 return f;
876 } else {
877 dout(30) << __func__ << " ino " << ino << " = " << p->second << dendl;
878 return p->second;
879 }
880 }
881
882 void BlueFS::_drop_link(FileRef file)
883 {
884 dout(20) << __func__ << " had refs " << file->refs
885 << " on " << file->fnode << dendl;
886 assert(file->refs > 0);
887 --file->refs;
888 if (file->refs == 0) {
889 dout(20) << __func__ << " destroying " << file->fnode << dendl;
890 assert(file->num_reading.load() == 0);
891 log_t.op_file_remove(file->fnode.ino);
892 for (auto& r : file->fnode.extents) {
893 pending_release[r.bdev].insert(r.offset, r.length);
894 }
895 file_map.erase(file->fnode.ino);
896 file->deleted = true;
897
898 if (file->dirty_seq) {
899 assert(file->dirty_seq > log_seq_stable);
900 assert(dirty_files.count(file->dirty_seq));
901 auto it = dirty_files[file->dirty_seq].iterator_to(*file);
902 dirty_files[file->dirty_seq].erase(it);
903 file->dirty_seq = 0;
904 }
905 }
906 }
907
908 int BlueFS::_read_random(
909 FileReader *h, ///< [in] read from here
910 uint64_t off, ///< [in] offset
911 size_t len, ///< [in] this many bytes
912 char *out) ///< [out] optional: or copy it here
913 {
914 dout(10) << __func__ << " h " << h
915 << " 0x" << std::hex << off << "~" << len << std::dec
916 << " from " << h->file->fnode << dendl;
917
918 ++h->file->num_reading;
919
920 if (!h->ignore_eof &&
921 off + len > h->file->fnode.size) {
922 if (off > h->file->fnode.size)
923 len = 0;
924 else
925 len = h->file->fnode.size - off;
926 dout(20) << __func__ << " reaching (or past) eof, len clipped to 0x"
927 << std::hex << len << std::dec << dendl;
928 }
929
930 int ret = 0;
931 while (len > 0) {
932 uint64_t x_off = 0;
933 auto p = h->file->fnode.seek(off, &x_off);
934 uint64_t l = MIN(p->length - x_off, len);
935 dout(20) << __func__ << " read buffered 0x"
936 << std::hex << x_off << "~" << l << std::dec
937 << " of " << *p << dendl;
938 int r = bdev[p->bdev]->read_random(p->offset + x_off, l, out,
939 cct->_conf->bluefs_buffered_io);
940 assert(r == 0);
941 off += l;
942 len -= l;
943 ret += l;
944 out += l;
945 }
946
947 dout(20) << __func__ << " got " << ret << dendl;
948 --h->file->num_reading;
949 return ret;
950 }
951
952 int BlueFS::_read(
953 FileReader *h, ///< [in] read from here
954 FileReaderBuffer *buf, ///< [in] reader state
955 uint64_t off, ///< [in] offset
956 size_t len, ///< [in] this many bytes
957 bufferlist *outbl, ///< [out] optional: reference the result here
958 char *out) ///< [out] optional: or copy it here
959 {
960 dout(10) << __func__ << " h " << h
961 << " 0x" << std::hex << off << "~" << len << std::dec
962 << " from " << h->file->fnode << dendl;
963
964 ++h->file->num_reading;
965
966 if (!h->ignore_eof &&
967 off + len > h->file->fnode.size) {
968 if (off > h->file->fnode.size)
969 len = 0;
970 else
971 len = h->file->fnode.size - off;
972 dout(20) << __func__ << " reaching (or past) eof, len clipped to 0x"
973 << std::hex << len << std::dec << dendl;
974 }
975 if (outbl)
976 outbl->clear();
977
978 int ret = 0;
979 while (len > 0) {
980 size_t left;
981 if (off < buf->bl_off || off >= buf->get_buf_end()) {
982 buf->bl.clear();
983 buf->bl_off = off & super.block_mask();
984 uint64_t x_off = 0;
985 auto p = h->file->fnode.seek(buf->bl_off, &x_off);
986 uint64_t want = ROUND_UP_TO(len + (off & ~super.block_mask()),
987 super.block_size);
988 want = MAX(want, buf->max_prefetch);
989 uint64_t l = MIN(p->length - x_off, want);
990 uint64_t eof_offset = ROUND_UP_TO(h->file->fnode.size, super.block_size);
991 if (!h->ignore_eof &&
992 buf->bl_off + l > eof_offset) {
993 l = eof_offset - buf->bl_off;
994 }
995 dout(20) << __func__ << " fetching 0x"
996 << std::hex << x_off << "~" << l << std::dec
997 << " of " << *p << dendl;
998 int r = bdev[p->bdev]->read(p->offset + x_off, l, &buf->bl, ioc[p->bdev],
999 cct->_conf->bluefs_buffered_io);
1000 assert(r == 0);
1001 }
1002 left = buf->get_buf_remaining(off);
1003 dout(20) << __func__ << " left 0x" << std::hex << left
1004 << " len 0x" << len << std::dec << dendl;
1005
1006 int r = MIN(len, left);
1007 if (outbl) {
1008 bufferlist t;
1009 t.substr_of(buf->bl, off - buf->bl_off, r);
1010 outbl->claim_append(t);
1011 }
1012 if (out) {
1013 // NOTE: h->bl is normally a contiguous buffer so c_str() is free.
1014 memcpy(out, buf->bl.c_str() + off - buf->bl_off, r);
1015 out += r;
1016 }
1017
1018 dout(30) << __func__ << " result chunk (0x"
1019 << std::hex << r << std::dec << " bytes):\n";
1020 bufferlist t;
1021 t.substr_of(buf->bl, off - buf->bl_off, r);
1022 t.hexdump(*_dout);
1023 *_dout << dendl;
1024
1025 off += r;
1026 len -= r;
1027 ret += r;
1028 buf->pos += r;
1029 }
1030
1031 dout(20) << __func__ << " got " << ret << dendl;
1032 assert(!outbl || (int)outbl->length() == ret);
1033 --h->file->num_reading;
1034 return ret;
1035 }
1036
1037 void BlueFS::_invalidate_cache(FileRef f, uint64_t offset, uint64_t length)
1038 {
1039 dout(10) << __func__ << " file " << f->fnode
1040 << " 0x" << std::hex << offset << "~" << length << std::dec
1041 << dendl;
1042 if (offset & ~super.block_mask()) {
1043 offset &= super.block_mask();
1044 length = ROUND_UP_TO(length, super.block_size);
1045 }
1046 uint64_t x_off = 0;
1047 auto p = f->fnode.seek(offset, &x_off);
1048 while (length > 0 && p != f->fnode.extents.end()) {
1049 uint64_t x_len = MIN(p->length - x_off, length);
1050 bdev[p->bdev]->invalidate_cache(p->offset + x_off, x_len);
1051 dout(20) << __func__ << " 0x" << std::hex << x_off << "~" << x_len
1052 << std:: dec << " of " << *p << dendl;
1053 offset += x_len;
1054 length -= x_len;
1055 }
1056 }
1057
1058 uint64_t BlueFS::_estimate_log_size()
1059 {
1060 int avg_dir_size = 40; // fixme
1061 int avg_file_size = 12;
1062 uint64_t size = 4096 * 2;
1063 size += file_map.size() * (1 + sizeof(bluefs_fnode_t));
1064 for (auto& p : block_all)
1065 size += p.num_intervals() * (1 + 1 + sizeof(uint64_t) * 2);
1066 size += dir_map.size() + (1 + avg_dir_size);
1067 size += file_map.size() * (1 + avg_dir_size + avg_file_size);
1068 return ROUND_UP_TO(size, super.block_size);
1069 }
1070
1071 void BlueFS::compact_log()
1072 {
1073 std::unique_lock<std::mutex> l(lock);
1074 if (cct->_conf->bluefs_compact_log_sync) {
1075 _compact_log_sync();
1076 } else {
1077 _compact_log_async(l);
1078 }
1079 }
1080
1081 bool BlueFS::_should_compact_log()
1082 {
1083 uint64_t current = log_writer->file->fnode.size;
1084 uint64_t expected = _estimate_log_size();
1085 float ratio = (float)current / (float)expected;
1086 dout(10) << __func__ << " current 0x" << std::hex << current
1087 << " expected " << expected << std::dec
1088 << " ratio " << ratio
1089 << (new_log ? " (async compaction in progress)" : "")
1090 << dendl;
1091 if (new_log ||
1092 current < cct->_conf->bluefs_log_compact_min_size ||
1093 ratio < cct->_conf->bluefs_log_compact_min_ratio) {
1094 return false;
1095 }
1096 return true;
1097 }
1098
1099 void BlueFS::_compact_log_dump_metadata(bluefs_transaction_t *t)
1100 {
1101 t->seq = 1;
1102 t->uuid = super.uuid;
1103 dout(20) << __func__ << " op_init" << dendl;
1104
1105 t->op_init();
1106 for (unsigned bdev = 0; bdev < MAX_BDEV; ++bdev) {
1107 interval_set<uint64_t>& p = block_all[bdev];
1108 for (interval_set<uint64_t>::iterator q = p.begin(); q != p.end(); ++q) {
1109 dout(20) << __func__ << " op_alloc_add " << bdev << " 0x"
1110 << std::hex << q.get_start() << "~" << q.get_len() << std::dec
1111 << dendl;
1112 t->op_alloc_add(bdev, q.get_start(), q.get_len());
1113 }
1114 }
1115 for (auto& p : file_map) {
1116 if (p.first == 1)
1117 continue;
1118 dout(20) << __func__ << " op_file_update " << p.second->fnode << dendl;
1119 assert(p.first > 1);
1120 t->op_file_update(p.second->fnode);
1121 }
1122 for (auto& p : dir_map) {
1123 dout(20) << __func__ << " op_dir_create " << p.first << dendl;
1124 t->op_dir_create(p.first);
1125 for (auto& q : p.second->file_map) {
1126 dout(20) << __func__ << " op_dir_link " << p.first << "/" << q.first
1127 << " to " << q.second->fnode.ino << dendl;
1128 t->op_dir_link(p.first, q.first, q.second->fnode.ino);
1129 }
1130 }
1131 }
1132
1133 void BlueFS::_compact_log_sync()
1134 {
1135 dout(10) << __func__ << dendl;
1136 File *log_file = log_writer->file.get();
1137
1138 // clear out log (be careful who calls us!!!)
1139 log_t.clear();
1140
1141 bluefs_transaction_t t;
1142 _compact_log_dump_metadata(&t);
1143
1144 dout(20) << __func__ << " op_jump_seq " << log_seq << dendl;
1145 t.op_jump_seq(log_seq);
1146
1147 bufferlist bl;
1148 ::encode(t, bl);
1149 _pad_bl(bl);
1150
1151 uint64_t need = bl.length() + cct->_conf->bluefs_max_log_runway;
1152 dout(20) << __func__ << " need " << need << dendl;
1153
1154 mempool::bluefs::vector<bluefs_extent_t> old_extents;
1155 uint64_t old_allocated = 0;
1156 log_file->fnode.swap_extents(old_extents, old_allocated);
1157 while (log_file->fnode.get_allocated() < need) {
1158 int r = _allocate(log_file->fnode.prefer_bdev,
1159 need - log_file->fnode.get_allocated(),
1160 &log_file->fnode);
1161 assert(r == 0);
1162 }
1163
1164 _close_writer(log_writer);
1165
1166 log_file->fnode.size = bl.length();
1167 log_writer = _create_writer(log_file);
1168 log_writer->append(bl);
1169 int r = _flush(log_writer, true);
1170 assert(r == 0);
1171 wait_for_aio(log_writer);
1172
1173 list<aio_t> completed_ios;
1174 _claim_completed_aios(log_writer, &completed_ios);
1175 flush_bdev();
1176 completed_ios.clear();
1177
1178 dout(10) << __func__ << " writing super" << dendl;
1179 super.log_fnode = log_file->fnode;
1180 ++super.version;
1181 _write_super();
1182 flush_bdev();
1183
1184 dout(10) << __func__ << " release old log extents " << old_extents << dendl;
1185 for (auto& r : old_extents) {
1186 pending_release[r.bdev].insert(r.offset, r.length);
1187 }
1188
1189 logger->inc(l_bluefs_log_compactions);
1190 }
1191
/*
 * 1. Allocate a new extent to continue the log, and then log an event
 * that jumps the log write position to the new extent.  At this point, the
 * old extent(s) won't be written to, and reflect everything to compact.
 * New events will be written to the new region that we'll keep.
 *
 * 2. While still holding the lock, encode a bufferlist that dumps all of the
 * in-memory fnodes and names.  This will become the new beginning of the
 * log.  The last event will jump to the log continuation extent from #1.
 *
 * 3. Queue a write to a new extent for the new beginning of the log.
 *
 * 4. Drop lock and wait
 *
 * 5. Retake the lock.
 *
 * 6. Update the log_fnode to splice in the new beginning.
 *
 * 7. Write the new superblock.
 *
 * 8. Release the old log space.  Clean up.
 *
 * The caller passes in `l`, the unique_lock it holds; it is used for the
 * condition-variable wait and handed to _flush_and_sync_log().  The lock is
 * dropped and retaken inside this function (steps 4/5 and around the final
 * flush), so state protected by it may change across those points.
 */
void BlueFS::_compact_log_async(std::unique_lock<std::mutex>& l)
{
  dout(10) << __func__ << dendl;
  File *log_file = log_writer->file.get();
  // only one async compaction may be in flight at a time
  assert(!new_log);
  assert(!new_log_writer);

  // create a new log [writer] so that we know compaction is in progress
  // (see _should_compact_log)
  new_log = new File;
  new_log->fnode.ino = 0;   // so that _flush_range won't try to log the fnode

  // 0. wait for any racing flushes to complete.  (We do not want to block
  // in _flush_and_sync_log with jump_to set or else a racing thread might
  // flush our entries and our jump_to update won't be correct.)
  while (log_flushing) {
    dout(10) << __func__ << " log is currently flushing, waiting" << dendl;
    log_cond.wait(l);
  }

  // 1. allocate new log space and jump to it.
  // Everything currently allocated to the log becomes the "old" region; the
  // jump target is the first byte past it.
  old_log_jump_to = log_file->fnode.get_allocated();
  uint64_t need = old_log_jump_to + cct->_conf->bluefs_max_log_runway;
  dout(10) << __func__ << " old_log_jump_to 0x" << std::hex << old_log_jump_to
           << " need 0x" << need << std::dec << dendl;
  while (log_file->fnode.get_allocated() < need) {
    int r = _allocate(log_file->fnode.prefer_bdev,
                      cct->_conf->bluefs_max_log_runway,
                      &log_file->fnode);
    assert(r == 0);
  }
  dout(10) << __func__ << " log extents " << log_file->fnode.extents << dendl;

  // update the log file change and log a jump to the offset where we want to
  // write the new entries
  log_t.op_file_update(log_file->fnode);
  log_t.op_jump(log_seq, old_log_jump_to);

  flush_bdev();  // FIXME?

  // flush the pending transaction and move the writer position to
  // old_log_jump_to
  _flush_and_sync_log(l, 0, old_log_jump_to);

  // 2. prepare compacted log
  bluefs_transaction_t t;
  //avoid record two times in log_t and _compact_log_dump_metadata.
  log_t.clear();
  _compact_log_dump_metadata(&t);

  // conservative estimate for final encoded size
  new_log_jump_to = ROUND_UP_TO(t.op_bl.length() + super.block_size * 2,
                                cct->_conf->bluefs_alloc_size);
  t.op_jump(log_seq, new_log_jump_to);

  bufferlist bl;
  ::encode(t, bl);
  _pad_bl(bl);

  dout(10) << __func__ << " new_log_jump_to 0x" << std::hex << new_log_jump_to
           << std::dec << dendl;

  // allocate
  // space for the compacted prefix, requested from BDEV_DB
  int r = _allocate(BlueFS::BDEV_DB, new_log_jump_to,
                    &new_log->fnode);
  assert(r == 0);
  new_log_writer = _create_writer(new_log);
  new_log_writer->append(bl);

  // 3. flush
  r = _flush(new_log_writer, true);
  assert(r == 0);
  // NOTE(review): the member mutex is released directly here rather than via
  // `l`; the visible callers construct `l` over `lock`, so both name the same
  // mutex -- confirm for any other caller.
  lock.unlock();

  // 4. wait
  dout(10) << __func__ << " waiting for compacted log to sync" << dendl;
  wait_for_aio(new_log_writer);

  // retire the completed aios and make the compacted prefix durable
  list<aio_t> completed_ios;
  _claim_completed_aios(new_log_writer, &completed_ios);
  flush_bdev();
  completed_ios.clear();

  // 5. retake lock
  lock.lock();

  // 6. update our log fnode
  // discard first old_log_jump_to extents
  dout(10) << __func__ << " remove 0x" << std::hex << old_log_jump_to << std::dec
           << " of " << log_file->fnode.extents << dendl;
  uint64_t discarded = 0;
  mempool::bluefs::vector<bluefs_extent_t> old_extents;
  while (discarded < old_log_jump_to) {
    assert(!log_file->fnode.extents.empty());
    bluefs_extent_t& e = log_file->fnode.extents.front();
    // temp describes the part being removed; it is recorded in old_extents
    bluefs_extent_t temp = e;
    if (discarded + e.length <= old_log_jump_to) {
      // the whole extent lies inside the old region
      dout(10) << __func__ << " remove old log extent " << e << dendl;
      discarded += e.length;
      log_file->fnode.pop_front_extent();
    } else {
      // the extent straddles the boundary: drop only its front part and
      // keep the remainder in place
      dout(10) << __func__ << " remove front of old log extent " << e << dendl;
      uint64_t drop = old_log_jump_to - discarded;
      temp.length = drop;
      e.offset += drop;
      e.length -= drop;
      discarded += drop;
      dout(10) << __func__ << " kept " << e << " removed " << temp << dendl;
    }
    old_extents.push_back(temp);
  }
  // append what is left of the old log (the continuation region) after the
  // compacted prefix held by new_log
  auto from = log_file->fnode.extents.begin();
  auto to = log_file->fnode.extents.end();
  while (from != to) {
    new_log->fnode.append_extent(*from);
    ++from;
  }

  // clear the extents from old log file, they are added to new log
  log_file->fnode.clear_extents();
  // swap the log files. New log file is the log file now.
  new_log->fnode.swap_extents(log_file->fnode);

  // rebase the writer position and file size: subtract the discarded old
  // region and add the size reserved for the compacted prefix
  log_writer->pos = log_writer->file->fnode.size =
    log_writer->pos - old_log_jump_to + new_log_jump_to;

  // 7. write the super block to reflect the changes
  dout(10) << __func__ << " writing super" << dendl;
  super.log_fnode = log_file->fnode;
  ++super.version;
  _write_super();

  // flush the devices without holding the lock
  lock.unlock();
  flush_bdev();
  lock.lock();

  // 8. release old space
  dout(10) << __func__ << " release old log extents " << old_extents << dendl;
  for (auto& r : old_extents) {
    pending_release[r.bdev].insert(r.offset, r.length);
  }

  // delete the new log, remove from the dirty files list
  _close_writer(new_log_writer);
  if (new_log->dirty_seq) {
    assert(dirty_files.count(new_log->dirty_seq));
    auto it = dirty_files[new_log->dirty_seq].iterator_to(*new_log);
    dirty_files[new_log->dirty_seq].erase(it);
  }
  new_log_writer = nullptr;
  new_log = nullptr;
  // wake anyone waiting on log_cond (e.g. _flush_and_sync_log waits on it
  // while new_log_writer is set)
  log_cond.notify_all();

  dout(10) << __func__ << " log extents " << log_file->fnode.extents << dendl;
  logger->inc(l_bluefs_log_compactions);
}
1368
1369 void BlueFS::_pad_bl(bufferlist& bl)
1370 {
1371 uint64_t partial = bl.length() % super.block_size;
1372 if (partial) {
1373 dout(10) << __func__ << " padding with 0x" << std::hex
1374 << super.block_size - partial << " zeros" << std::dec << dendl;
1375 bl.append_zero(super.block_size - partial);
1376 }
1377 }
1378
1379 void BlueFS::flush_log()
1380 {
1381 std::unique_lock<std::mutex> l(lock);
1382 flush_bdev();
1383 _flush_and_sync_log(l);
1384 }
1385
/*
 * Encode the pending log transaction (log_t), append it to the log, flush
 * it to the devices, and mark the resulting sequence number stable.
 *
 * l        - the caller's lock handle; used for condition-variable waits.
 * want_seq - if non-zero and already <= log_seq_stable, return immediately.
 * jump_to  - if non-zero, after queueing the write the log writer's
 *            position and the log file size are set to this offset.  The
 *            early-exit and wait paths assert that jump_to is not set.
 *
 * Always returns 0; internal failures are asserted.
 */
int BlueFS::_flush_and_sync_log(std::unique_lock<std::mutex>& l,
                                uint64_t want_seq,
                                uint64_t jump_to)
{
  // only one log flush at a time
  while (log_flushing) {
    dout(10) << __func__ << " want_seq " << want_seq
             << " log is currently flushing, waiting" << dendl;
    assert(!jump_to);
    log_cond.wait(l);
  }
  // someone else may already have made the requested seq stable
  if (want_seq && want_seq <= log_seq_stable) {
    dout(10) << __func__ << " want_seq " << want_seq << " <= log_seq_stable "
             << log_seq_stable << ", done" << dendl;
    assert(!jump_to);
    return 0;
  }
  // nothing to write at all
  if (log_t.empty() && dirty_files.empty()) {
    dout(10) << __func__ << " want_seq " << want_seq
             << " " << log_t << " not dirty, dirty_files empty, no-op" << dendl;
    assert(!jump_to);
    return 0;
  }

  // claim the next sequence number for this transaction
  uint64_t seq = log_t.seq = ++log_seq;
  assert(want_seq == 0 || want_seq <= seq);
  log_t.uuid = super.uuid;

  // log dirty files
  // (only those files queued under exactly this seq)
  auto lsi = dirty_files.find(seq);
  if (lsi != dirty_files.end()) {
    dout(20) << __func__ << " " << lsi->second.size() << " dirty_files" << dendl;
    for (auto &f : lsi->second) {
      dout(20) << __func__ << " op_file_update " << f.fnode << dendl;
      log_t.op_file_update(f.fnode);
    }
  }

  dout(10) << __func__ << " " << log_t << dendl;
  assert(!log_t.empty());

  // allocate some more space (before we run out)?
  int64_t runway = log_writer->file->fnode.get_allocated() -
    log_writer->get_effective_write_pos();
  if (runway < (int64_t)cct->_conf->bluefs_min_log_runway) {
    dout(10) << __func__ << " allocating more log runway (0x"
             << std::hex << runway << std::dec << " remaining)" << dendl;
    // do not grow the log fnode while an async compaction has a new log
    // writer open; wait for it to finish first
    while (new_log_writer) {
      dout(10) << __func__ << " waiting for async compaction" << dendl;
      log_cond.wait(l);
    }
    int r = _allocate(log_writer->file->fnode.prefer_bdev,
                      cct->_conf->bluefs_max_log_runway,
                      &log_writer->file->fnode);
    assert(r == 0);
    // record the enlarged log fnode in this same transaction
    log_t.op_file_update(log_writer->file->fnode);
  }

  bufferlist bl;
  ::encode(log_t, bl);

  // pad to block boundary
  _pad_bl(bl);
  logger->inc(l_bluefs_logged_bytes, bl.length());

  log_writer->append(bl);

  log_t.clear();
  log_t.seq = 0;  // just so debug output is less confusing
  // from here until cleared below, other callers wait in the loop at the top
  log_flushing = true;

  int r = _flush(log_writer, true);
  assert(r == 0);

  if (jump_to) {
    dout(10) << __func__ << " jumping log offset from 0x" << std::hex
             << log_writer->pos << " -> 0x" << jump_to << std::dec << dendl;
    log_writer->pos = jump_to;
    log_writer->file->fnode.size = jump_to;
  }

  // this drops and retakes the lock while waiting for the io and flushing
  // the devices
  _flush_bdev_safely(log_writer);

  log_flushing = false;
  log_cond.notify_all();

  // clean dirty files
  if (seq > log_seq_stable) {
    log_seq_stable = seq;
    dout(20) << __func__ << " log_seq_stable " << log_seq_stable << dendl;

    // drop every dirty list whose seq is now stable; the loop stops at the
    // first entry with a larger seq
    auto p = dirty_files.begin();
    while (p != dirty_files.end()) {
      if (p->first > log_seq_stable) {
        dout(20) << __func__ << " done cleaning up dirty files" << dendl;
        break;
      }

      // note: this `l` is a list iterator that shadows the lock parameter
      auto l = p->second.begin();
      while (l != p->second.end()) {
        File *file = &*l;
        assert(file->dirty_seq > 0);
        assert(file->dirty_seq <= log_seq_stable);
        dout(20) << __func__ << " cleaned file " << file->fnode << dendl;
        file->dirty_seq = 0;
        p->second.erase(l++);
      }

      assert(p->second.empty());
      dirty_files.erase(p++);
    }
  } else {
    dout(20) << __func__ << " log_seq_stable " << log_seq_stable
             << " already >= out seq " << seq
             << ", we lost a race against another log flush, done" << dendl;
  }
  _update_logger_stats();

  return 0;
}
1505
1506 int BlueFS::_flush_range(FileWriter *h, uint64_t offset, uint64_t length)
1507 {
1508 dout(10) << __func__ << " " << h << " pos 0x" << std::hex << h->pos
1509 << " 0x" << offset << "~" << length << std::dec
1510 << " to " << h->file->fnode << dendl;
1511 assert(!h->file->deleted);
1512 assert(h->file->num_readers.load() == 0);
1513
1514 h->buffer_appender.flush();
1515
1516 bool buffered;
1517 if (h->file->fnode.ino == 1)
1518 buffered = false;
1519 else
1520 buffered = cct->_conf->bluefs_buffered_io;
1521
1522 if (offset + length <= h->pos)
1523 return 0;
1524 if (offset < h->pos) {
1525 length -= h->pos - offset;
1526 offset = h->pos;
1527 dout(10) << " still need 0x"
1528 << std::hex << offset << "~" << length << std::dec
1529 << dendl;
1530 }
1531 assert(offset <= h->file->fnode.size);
1532
1533 uint64_t allocated = h->file->fnode.get_allocated();
1534
1535 // do not bother to dirty the file if we are overwriting
1536 // previously allocated extents.
1537 bool must_dirty = false;
1538 if (allocated < offset + length) {
1539 // we should never run out of log space here; see the min runway check
1540 // in _flush_and_sync_log.
1541 assert(h->file->fnode.ino != 1);
1542 int r = _allocate(h->file->fnode.prefer_bdev,
1543 offset + length - allocated,
1544 &h->file->fnode);
1545 if (r < 0) {
1546 derr << __func__ << " allocated: 0x" << std::hex << allocated
1547 << " offset: 0x" << offset << " length: 0x" << length << std::dec
1548 << dendl;
1549 assert(0 == "bluefs enospc");
1550 return r;
1551 }
1552 if (cct->_conf->bluefs_preextend_wal_files &&
1553 h->writer_type == WRITER_WAL) {
1554 // NOTE: this *requires* that rocksdb also has log recycling
1555 // enabled and is therefore doing robust CRCs on the log
1556 // records. otherwise, we will fail to reply the rocksdb log
1557 // properly due to garbage on the device.
1558 h->file->fnode.size = h->file->fnode.get_allocated();
1559 dout(10) << __func__ << " extending WAL size to 0x" << std::hex
1560 << h->file->fnode.size << std::dec << " to include allocated"
1561 << dendl;
1562 }
1563 must_dirty = true;
1564 }
1565 if (h->file->fnode.size < offset + length) {
1566 h->file->fnode.size = offset + length;
1567 if (h->file->fnode.ino > 1) {
1568 // we do not need to dirty the log file (or it's compacting
1569 // replacement) when the file size changes because replay is
1570 // smart enough to discover it on its own.
1571 must_dirty = true;
1572 }
1573 }
1574 if (must_dirty) {
1575 h->file->fnode.mtime = ceph_clock_now();
1576 assert(h->file->fnode.ino >= 1);
1577 if (h->file->dirty_seq == 0) {
1578 h->file->dirty_seq = log_seq + 1;
1579 dirty_files[h->file->dirty_seq].push_back(*h->file);
1580 dout(20) << __func__ << " dirty_seq = " << log_seq + 1
1581 << " (was clean)" << dendl;
1582 } else {
1583 if (h->file->dirty_seq != log_seq + 1) {
1584 // need re-dirty, erase from list first
1585 assert(dirty_files.count(h->file->dirty_seq));
1586 auto it = dirty_files[h->file->dirty_seq].iterator_to(*h->file);
1587 dirty_files[h->file->dirty_seq].erase(it);
1588 h->file->dirty_seq = log_seq + 1;
1589 dirty_files[h->file->dirty_seq].push_back(*h->file);
1590 dout(20) << __func__ << " dirty_seq = " << log_seq + 1
1591 << " (was " << h->file->dirty_seq << ")" << dendl;
1592 } else {
1593 dout(20) << __func__ << " dirty_seq = " << log_seq + 1
1594 << " (unchanged, do nothing) " << dendl;
1595 }
1596 }
1597 }
1598 dout(20) << __func__ << " file now " << h->file->fnode << dendl;
1599
1600 uint64_t x_off = 0;
1601 auto p = h->file->fnode.seek(offset, &x_off);
1602 assert(p != h->file->fnode.extents.end());
1603 dout(20) << __func__ << " in " << *p << " x_off 0x"
1604 << std::hex << x_off << std::dec << dendl;
1605
1606 unsigned partial = x_off & ~super.block_mask();
1607 bufferlist bl;
1608 if (partial) {
1609 dout(20) << __func__ << " using partial tail 0x"
1610 << std::hex << partial << std::dec << dendl;
1611 assert(h->tail_block.length() == partial);
1612 bl.claim_append_piecewise(h->tail_block);
1613 x_off -= partial;
1614 offset -= partial;
1615 length += partial;
1616 dout(20) << __func__ << " waiting for previous aio to complete" << dendl;
1617 for (auto p : h->iocv) {
1618 if (p) {
1619 p->aio_wait();
1620 }
1621 }
1622 }
1623 if (length == partial + h->buffer.length()) {
1624 bl.claim_append_piecewise(h->buffer);
1625 } else {
1626 bufferlist t;
1627 h->buffer.splice(0, length, &t);
1628 bl.claim_append_piecewise(t);
1629 t.substr_of(h->buffer, length, h->buffer.length() - length);
1630 h->buffer.swap(t);
1631 dout(20) << " leaving 0x" << std::hex << h->buffer.length() << std::dec
1632 << " unflushed" << dendl;
1633 }
1634 assert(bl.length() == length);
1635
1636 switch (h->writer_type) {
1637 case WRITER_WAL:
1638 logger->inc(l_bluefs_bytes_written_wal, length);
1639 break;
1640 case WRITER_SST:
1641 logger->inc(l_bluefs_bytes_written_sst, length);
1642 break;
1643 }
1644
1645 dout(30) << "dump:\n";
1646 bl.hexdump(*_dout);
1647 *_dout << dendl;
1648
1649 h->pos = offset + length;
1650 h->tail_block.clear();
1651
1652 uint64_t bloff = 0;
1653 while (length > 0) {
1654 uint64_t x_len = MIN(p->length - x_off, length);
1655 bufferlist t;
1656 t.substr_of(bl, bloff, x_len);
1657 unsigned tail = x_len & ~super.block_mask();
1658 if (tail) {
1659 size_t zlen = super.block_size - tail;
1660 dout(20) << __func__ << " caching tail of 0x"
1661 << std::hex << tail
1662 << " and padding block with 0x" << zlen
1663 << std::dec << dendl;
1664 h->tail_block.substr_of(bl, bl.length() - tail, tail);
1665 if (h->file->fnode.ino > 1) {
1666 // we are using the page_aligned_appender, and can safely use
1667 // the tail of the raw buffer.
1668 const bufferptr &last = t.back();
1669 if (last.unused_tail_length() < zlen) {
1670 derr << " wtf, last is " << last << " from " << t << dendl;
1671 assert(last.unused_tail_length() >= zlen);
1672 }
1673 bufferptr z = last;
1674 z.set_offset(last.offset() + last.length());
1675 z.set_length(zlen);
1676 z.zero();
1677 t.append(z, 0, zlen);
1678 } else {
1679 t.append_zero(zlen);
1680 }
1681 }
1682 if (cct->_conf->bluefs_sync_write) {
1683 bdev[p->bdev]->write(p->offset + x_off, t, buffered);
1684 } else {
1685 bdev[p->bdev]->aio_write(p->offset + x_off, t, h->iocv[p->bdev], buffered);
1686 }
1687 bloff += x_len;
1688 length -= x_len;
1689 ++p;
1690 x_off = 0;
1691 }
1692 for (unsigned i = 0; i < MAX_BDEV; ++i) {
1693 if (bdev[i]) {
1694 assert(h->iocv[i]);
1695 if (h->iocv[i]->has_pending_aios()) {
1696 bdev[i]->aio_submit(h->iocv[i]);
1697 }
1698 }
1699 }
1700 dout(20) << __func__ << " h " << h << " pos now 0x"
1701 << std::hex << h->pos << std::dec << dendl;
1702 return 0;
1703 }
1704
1705 // we need to retire old completed aios so they don't stick around in
1706 // memory indefinitely (along with their bufferlist refs).
1707 void BlueFS::_claim_completed_aios(FileWriter *h, list<aio_t> *ls)
1708 {
1709 for (auto p : h->iocv) {
1710 if (p) {
1711 ls->splice(ls->end(), p->running_aios);
1712 }
1713 }
1714 dout(10) << __func__ << " got " << ls->size() << " aios" << dendl;
1715 }
1716
1717 void BlueFS::wait_for_aio(FileWriter *h)
1718 {
1719 // NOTE: this is safe to call without a lock, as long as our reference is
1720 // stable.
1721 dout(10) << __func__ << " " << h << dendl;
1722 utime_t start = ceph_clock_now();
1723 for (auto p : h->iocv) {
1724 if (p) {
1725 p->aio_wait();
1726 }
1727 }
1728 utime_t end = ceph_clock_now();
1729 utime_t dur = end - start;
1730 dout(10) << __func__ << " " << h << " done in " << dur << dendl;
1731 }
1732
1733 int BlueFS::_flush(FileWriter *h, bool force)
1734 {
1735 h->buffer_appender.flush();
1736 uint64_t length = h->buffer.length();
1737 uint64_t offset = h->pos;
1738 if (!force &&
1739 length < cct->_conf->bluefs_min_flush_size) {
1740 dout(10) << __func__ << " " << h << " ignoring, length " << length
1741 << " < min_flush_size " << cct->_conf->bluefs_min_flush_size
1742 << dendl;
1743 return 0;
1744 }
1745 if (length == 0) {
1746 dout(10) << __func__ << " " << h << " no dirty data on "
1747 << h->file->fnode << dendl;
1748 return 0;
1749 }
1750 dout(10) << __func__ << " " << h << " 0x"
1751 << std::hex << offset << "~" << length << std::dec
1752 << " to " << h->file->fnode << dendl;
1753 assert(h->pos <= h->file->fnode.size);
1754 return _flush_range(h, offset, length);
1755 }
1756
1757 int BlueFS::_truncate(FileWriter *h, uint64_t offset)
1758 {
1759 dout(10) << __func__ << " 0x" << std::hex << offset << std::dec
1760 << " file " << h->file->fnode << dendl;
1761 if (h->file->deleted) {
1762 dout(10) << __func__ << " deleted, no-op" << dendl;
1763 return 0;
1764 }
1765
1766 // we never truncate internal log files
1767 assert(h->file->fnode.ino > 1);
1768
1769 h->buffer_appender.flush();
1770
1771 // truncate off unflushed data?
1772 if (h->pos < offset &&
1773 h->pos + h->buffer.length() > offset) {
1774 bufferlist t;
1775 dout(20) << __func__ << " tossing out last " << offset - h->pos
1776 << " unflushed bytes" << dendl;
1777 t.substr_of(h->buffer, 0, offset - h->pos);
1778 h->buffer.swap(t);
1779 assert(0 == "actually this shouldn't happen");
1780 }
1781 if (h->buffer.length()) {
1782 int r = _flush(h, true);
1783 if (r < 0)
1784 return r;
1785 }
1786 if (offset == h->file->fnode.size) {
1787 return 0; // no-op!
1788 }
1789 if (offset > h->file->fnode.size) {
1790 assert(0 == "truncate up not supported");
1791 }
1792 assert(h->file->fnode.size >= offset);
1793 h->file->fnode.size = offset;
1794 log_t.op_file_update(h->file->fnode);
1795 return 0;
1796 }
1797
1798 int BlueFS::_fsync(FileWriter *h, std::unique_lock<std::mutex>& l)
1799 {
1800 dout(10) << __func__ << " " << h << " " << h->file->fnode << dendl;
1801 int r = _flush(h, true);
1802 if (r < 0)
1803 return r;
1804 uint64_t old_dirty_seq = h->file->dirty_seq;
1805
1806 _flush_bdev_safely(h);
1807
1808 if (old_dirty_seq) {
1809 uint64_t s = log_seq;
1810 dout(20) << __func__ << " file metadata was dirty (" << old_dirty_seq
1811 << ") on " << h->file->fnode << ", flushing log" << dendl;
1812 _flush_and_sync_log(l, old_dirty_seq);
1813 assert(h->file->dirty_seq == 0 || // cleaned
1814 h->file->dirty_seq > s); // or redirtied by someone else
1815 }
1816 return 0;
1817 }
1818
1819 void BlueFS::_flush_bdev_safely(FileWriter *h)
1820 {
1821 if (!cct->_conf->bluefs_sync_write) {
1822 list<aio_t> completed_ios;
1823 _claim_completed_aios(h, &completed_ios);
1824 lock.unlock();
1825 wait_for_aio(h);
1826 completed_ios.clear();
1827 flush_bdev();
1828 lock.lock();
1829 } else {
1830 lock.unlock();
1831 flush_bdev();
1832 lock.lock();
1833 }
1834 }
1835
1836 void BlueFS::flush_bdev()
1837 {
1838 // NOTE: this is safe to call without a lock.
1839 dout(20) << __func__ << dendl;
1840 for (auto p : bdev) {
1841 if (p)
1842 p->flush();
1843 }
1844 }
1845
1846 int BlueFS::_allocate(uint8_t id, uint64_t len,
1847 bluefs_fnode_t* node)
1848 {
1849 dout(10) << __func__ << " len 0x" << std::hex << len << std::dec
1850 << " from " << (int)id << dendl;
1851 assert(id < alloc.size());
1852 uint64_t min_alloc_size = cct->_conf->bluefs_alloc_size;
1853
1854 uint64_t left = ROUND_UP_TO(len, min_alloc_size);
1855 int r = -ENOSPC;
1856 int64_t alloc_len = 0;
1857 AllocExtentVector extents;
1858
1859 if (alloc[id]) {
1860 r = alloc[id]->reserve(left);
1861 }
1862
1863 if (r == 0) {
1864 uint64_t hint = 0;
1865 if (!node->extents.empty() && node->extents.back().bdev == id) {
1866 hint = node->extents.back().end();
1867 }
1868 extents.reserve(4); // 4 should be (more than) enough for most allocations
1869 alloc_len = alloc[id]->allocate(left, min_alloc_size, hint, &extents);
1870 }
1871 if (r < 0 || (alloc_len < (int64_t)left)) {
1872 if (r == 0) {
1873 alloc[id]->unreserve(left - alloc_len);
1874 for (auto& p : extents) {
1875 alloc[id]->release(p.offset, p.length);
1876 }
1877 }
1878 if (id != BDEV_SLOW) {
1879 if (bdev[id]) {
1880 dout(1) << __func__ << " failed to allocate 0x" << std::hex << left
1881 << " on bdev " << (int)id
1882 << ", free 0x" << alloc[id]->get_free()
1883 << "; fallback to bdev " << (int)id + 1
1884 << std::dec << dendl;
1885 }
1886 return _allocate(id + 1, len, node);
1887 }
1888 if (bdev[id])
1889 derr << __func__ << " failed to allocate 0x" << std::hex << left
1890 << " on bdev " << (int)id
1891 << ", free 0x" << alloc[id]->get_free() << std::dec << dendl;
1892 else
1893 derr << __func__ << " failed to allocate 0x" << std::hex << left
1894 << " on bdev " << (int)id << ", dne" << std::dec << dendl;
1895 if (alloc[id])
1896 alloc[id]->dump();
1897 return -ENOSPC;
1898 }
1899
1900 for (auto& p : extents) {
1901 node->append_extent(bluefs_extent_t(id, p.offset, p.length));
1902 }
1903
1904 return 0;
1905 }
1906
1907 int BlueFS::_preallocate(FileRef f, uint64_t off, uint64_t len)
1908 {
1909 dout(10) << __func__ << " file " << f->fnode << " 0x"
1910 << std::hex << off << "~" << len << std::dec << dendl;
1911 if (f->deleted) {
1912 dout(10) << __func__ << " deleted, no-op" << dendl;
1913 return 0;
1914 }
1915 assert(f->fnode.ino > 1);
1916 uint64_t allocated = f->fnode.get_allocated();
1917 if (off + len > allocated) {
1918 uint64_t want = off + len - allocated;
1919 int r = _allocate(f->fnode.prefer_bdev, want, &f->fnode);
1920 if (r < 0)
1921 return r;
1922 log_t.op_file_update(f->fnode);
1923 }
1924 return 0;
1925 }
1926
1927 void BlueFS::sync_metadata()
1928 {
1929 std::unique_lock<std::mutex> l(lock);
1930 if (log_t.empty()) {
1931 dout(10) << __func__ << " - no pending log events" << dendl;
1932 return;
1933 }
1934 dout(10) << __func__ << dendl;
1935 utime_t start = ceph_clock_now();
1936 vector<interval_set<uint64_t>> to_release(pending_release.size());
1937 to_release.swap(pending_release);
1938 flush_bdev(); // FIXME?
1939 _flush_and_sync_log(l);
1940 for (unsigned i = 0; i < to_release.size(); ++i) {
1941 for (auto p = to_release[i].begin(); p != to_release[i].end(); ++p) {
1942 alloc[i]->release(p.get_start(), p.get_len());
1943 }
1944 }
1945
1946 if (_should_compact_log()) {
1947 if (cct->_conf->bluefs_compact_log_sync) {
1948 _compact_log_sync();
1949 } else {
1950 _compact_log_async(l);
1951 }
1952 }
1953
1954 utime_t end = ceph_clock_now();
1955 utime_t dur = end - start;
1956 dout(10) << __func__ << " done in " << dur << dendl;
1957 }
1958
1959 int BlueFS::open_for_write(
1960 const string& dirname,
1961 const string& filename,
1962 FileWriter **h,
1963 bool overwrite)
1964 {
1965 std::lock_guard<std::mutex> l(lock);
1966 dout(10) << __func__ << " " << dirname << "/" << filename << dendl;
1967 map<string,DirRef>::iterator p = dir_map.find(dirname);
1968 DirRef dir;
1969 if (p == dir_map.end()) {
1970 // implicitly create the dir
1971 dout(20) << __func__ << " dir " << dirname
1972 << " does not exist" << dendl;
1973 return -ENOENT;
1974 } else {
1975 dir = p->second;
1976 }
1977
1978 FileRef file;
1979 bool create = false;
1980 map<string,FileRef>::iterator q = dir->file_map.find(filename);
1981 if (q == dir->file_map.end()) {
1982 if (overwrite) {
1983 dout(20) << __func__ << " dir " << dirname << " (" << dir
1984 << ") file " << filename
1985 << " does not exist" << dendl;
1986 return -ENOENT;
1987 }
1988 file = new File;
1989 file->fnode.ino = ++ino_last;
1990 file_map[ino_last] = file;
1991 dir->file_map[filename] = file;
1992 ++file->refs;
1993 create = true;
1994 } else {
1995 // overwrite existing file?
1996 file = q->second;
1997 if (overwrite) {
1998 dout(20) << __func__ << " dir " << dirname << " (" << dir
1999 << ") file " << filename
2000 << " already exists, overwrite in place" << dendl;
2001 } else {
2002 dout(20) << __func__ << " dir " << dirname << " (" << dir
2003 << ") file " << filename
2004 << " already exists, truncate + overwrite" << dendl;
2005 file->fnode.size = 0;
2006 for (auto& p : file->fnode.extents) {
2007 pending_release[p.bdev].insert(p.offset, p.length);
2008 }
2009
2010 file->fnode.clear_extents();
2011 }
2012 }
2013 assert(file->fnode.ino > 1);
2014
2015 file->fnode.mtime = ceph_clock_now();
2016 file->fnode.prefer_bdev = BlueFS::BDEV_DB;
2017 if (dirname.length() > 5) {
2018 // the "db.slow" and "db.wal" directory names are hard-coded at
2019 // match up with bluestore. the slow device is always the second
2020 // one (when a dedicated block.db device is present and used at
2021 // bdev 0). the wal device is always last.
2022 if (boost::algorithm::ends_with(dirname, ".slow")) {
2023 file->fnode.prefer_bdev = BlueFS::BDEV_SLOW;
2024 } else if (boost::algorithm::ends_with(dirname, ".wal")) {
2025 file->fnode.prefer_bdev = BlueFS::BDEV_WAL;
2026 }
2027 }
2028 dout(20) << __func__ << " mapping " << dirname << "/" << filename
2029 << " to bdev " << (int)file->fnode.prefer_bdev << dendl;
2030
2031 log_t.op_file_update(file->fnode);
2032 if (create)
2033 log_t.op_dir_link(dirname, filename, file->fnode.ino);
2034
2035 *h = _create_writer(file);
2036
2037 if (boost::algorithm::ends_with(filename, ".log")) {
2038 (*h)->writer_type = BlueFS::WRITER_WAL;
2039 if (logger && !overwrite) {
2040 logger->inc(l_bluefs_files_written_wal);
2041 }
2042 } else if (boost::algorithm::ends_with(filename, ".sst")) {
2043 (*h)->writer_type = BlueFS::WRITER_SST;
2044 if (logger) {
2045 logger->inc(l_bluefs_files_written_sst);
2046 }
2047 }
2048
2049 dout(10) << __func__ << " h " << *h << " on " << file->fnode << dendl;
2050 return 0;
2051 }
2052
2053 BlueFS::FileWriter *BlueFS::_create_writer(FileRef f)
2054 {
2055 FileWriter *w = new FileWriter(f);
2056 for (unsigned i = 0; i < MAX_BDEV; ++i) {
2057 if (bdev[i]) {
2058 w->iocv[i] = new IOContext(cct, NULL);
2059 } else {
2060 w->iocv[i] = NULL;
2061 }
2062 }
2063 return w;
2064 }
2065
2066 void BlueFS::_close_writer(FileWriter *h)
2067 {
2068 dout(10) << __func__ << " " << h << " type " << h->writer_type << dendl;
2069 for (unsigned i=0; i<MAX_BDEV; ++i) {
2070 if (bdev[i]) {
2071 assert(h->iocv[i]);
2072 h->iocv[i]->aio_wait();
2073 bdev[i]->queue_reap_ioc(h->iocv[i]);
2074 }
2075 }
2076 delete h;
2077 }
2078
2079 int BlueFS::open_for_read(
2080 const string& dirname,
2081 const string& filename,
2082 FileReader **h,
2083 bool random)
2084 {
2085 std::lock_guard<std::mutex> l(lock);
2086 dout(10) << __func__ << " " << dirname << "/" << filename
2087 << (random ? " (random)":" (sequential)") << dendl;
2088 map<string,DirRef>::iterator p = dir_map.find(dirname);
2089 if (p == dir_map.end()) {
2090 dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
2091 return -ENOENT;
2092 }
2093 DirRef dir = p->second;
2094
2095 map<string,FileRef>::iterator q = dir->file_map.find(filename);
2096 if (q == dir->file_map.end()) {
2097 dout(20) << __func__ << " dir " << dirname << " (" << dir
2098 << ") file " << filename
2099 << " not found" << dendl;
2100 return -ENOENT;
2101 }
2102 File *file = q->second.get();
2103
2104 *h = new FileReader(file, random ? 4096 : cct->_conf->bluefs_max_prefetch,
2105 random, false);
2106 dout(10) << __func__ << " h " << *h << " on " << file->fnode << dendl;
2107 return 0;
2108 }
2109
2110 int BlueFS::rename(
2111 const string& old_dirname, const string& old_filename,
2112 const string& new_dirname, const string& new_filename)
2113 {
2114 std::lock_guard<std::mutex> l(lock);
2115 dout(10) << __func__ << " " << old_dirname << "/" << old_filename
2116 << " -> " << new_dirname << "/" << new_filename << dendl;
2117 map<string,DirRef>::iterator p = dir_map.find(old_dirname);
2118 if (p == dir_map.end()) {
2119 dout(20) << __func__ << " dir " << old_dirname << " not found" << dendl;
2120 return -ENOENT;
2121 }
2122 DirRef old_dir = p->second;
2123 map<string,FileRef>::iterator q = old_dir->file_map.find(old_filename);
2124 if (q == old_dir->file_map.end()) {
2125 dout(20) << __func__ << " dir " << old_dirname << " (" << old_dir
2126 << ") file " << old_filename
2127 << " not found" << dendl;
2128 return -ENOENT;
2129 }
2130 FileRef file = q->second;
2131
2132 p = dir_map.find(new_dirname);
2133 if (p == dir_map.end()) {
2134 dout(20) << __func__ << " dir " << new_dirname << " not found" << dendl;
2135 return -ENOENT;
2136 }
2137 DirRef new_dir = p->second;
2138 q = new_dir->file_map.find(new_filename);
2139 if (q != new_dir->file_map.end()) {
2140 dout(20) << __func__ << " dir " << new_dirname << " (" << old_dir
2141 << ") file " << new_filename
2142 << " already exists, unlinking" << dendl;
2143 assert(q->second != file);
2144 log_t.op_dir_unlink(new_dirname, new_filename);
2145 _drop_link(q->second);
2146 }
2147
2148 dout(10) << __func__ << " " << new_dirname << "/" << new_filename << " "
2149 << " " << file->fnode << dendl;
2150
2151 new_dir->file_map[new_filename] = file;
2152 old_dir->file_map.erase(old_filename);
2153
2154 log_t.op_dir_link(new_dirname, new_filename, file->fnode.ino);
2155 log_t.op_dir_unlink(old_dirname, old_filename);
2156 return 0;
2157 }
2158
2159 int BlueFS::mkdir(const string& dirname)
2160 {
2161 std::lock_guard<std::mutex> l(lock);
2162 dout(10) << __func__ << " " << dirname << dendl;
2163 map<string,DirRef>::iterator p = dir_map.find(dirname);
2164 if (p != dir_map.end()) {
2165 dout(20) << __func__ << " dir " << dirname << " exists" << dendl;
2166 return -EEXIST;
2167 }
2168 dir_map[dirname] = new Dir;
2169 log_t.op_dir_create(dirname);
2170 return 0;
2171 }
2172
2173 int BlueFS::rmdir(const string& dirname)
2174 {
2175 std::lock_guard<std::mutex> l(lock);
2176 dout(10) << __func__ << " " << dirname << dendl;
2177 map<string,DirRef>::iterator p = dir_map.find(dirname);
2178 if (p == dir_map.end()) {
2179 dout(20) << __func__ << " dir " << dirname << " does not exist" << dendl;
2180 return -ENOENT;
2181 }
2182 DirRef dir = p->second;
2183 if (!dir->file_map.empty()) {
2184 dout(20) << __func__ << " dir " << dirname << " not empty" << dendl;
2185 return -ENOTEMPTY;
2186 }
2187 dir_map.erase(dirname);
2188 log_t.op_dir_remove(dirname);
2189 return 0;
2190 }
2191
2192 bool BlueFS::dir_exists(const string& dirname)
2193 {
2194 std::lock_guard<std::mutex> l(lock);
2195 map<string,DirRef>::iterator p = dir_map.find(dirname);
2196 bool exists = p != dir_map.end();
2197 dout(10) << __func__ << " " << dirname << " = " << (int)exists << dendl;
2198 return exists;
2199 }
2200
2201 int BlueFS::stat(const string& dirname, const string& filename,
2202 uint64_t *size, utime_t *mtime)
2203 {
2204 std::lock_guard<std::mutex> l(lock);
2205 dout(10) << __func__ << " " << dirname << "/" << filename << dendl;
2206 map<string,DirRef>::iterator p = dir_map.find(dirname);
2207 if (p == dir_map.end()) {
2208 dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
2209 return -ENOENT;
2210 }
2211 DirRef dir = p->second;
2212 map<string,FileRef>::iterator q = dir->file_map.find(filename);
2213 if (q == dir->file_map.end()) {
2214 dout(20) << __func__ << " dir " << dirname << " (" << dir
2215 << ") file " << filename
2216 << " not found" << dendl;
2217 return -ENOENT;
2218 }
2219 File *file = q->second.get();
2220 dout(10) << __func__ << " " << dirname << "/" << filename
2221 << " " << file->fnode << dendl;
2222 if (size)
2223 *size = file->fnode.size;
2224 if (mtime)
2225 *mtime = file->fnode.mtime;
2226 return 0;
2227 }
2228
2229 int BlueFS::lock_file(const string& dirname, const string& filename,
2230 FileLock **plock)
2231 {
2232 std::lock_guard<std::mutex> l(lock);
2233 dout(10) << __func__ << " " << dirname << "/" << filename << dendl;
2234 map<string,DirRef>::iterator p = dir_map.find(dirname);
2235 if (p == dir_map.end()) {
2236 dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
2237 return -ENOENT;
2238 }
2239 DirRef dir = p->second;
2240 map<string,FileRef>::iterator q = dir->file_map.find(filename);
2241 File *file;
2242 if (q == dir->file_map.end()) {
2243 dout(20) << __func__ << " dir " << dirname << " (" << dir
2244 << ") file " << filename
2245 << " not found, creating" << dendl;
2246 file = new File;
2247 file->fnode.ino = ++ino_last;
2248 file->fnode.mtime = ceph_clock_now();
2249 file_map[ino_last] = file;
2250 dir->file_map[filename] = file;
2251 ++file->refs;
2252 log_t.op_file_update(file->fnode);
2253 log_t.op_dir_link(dirname, filename, file->fnode.ino);
2254 } else {
2255 file = q->second.get();
2256 if (file->locked) {
2257 dout(10) << __func__ << " already locked" << dendl;
2258 return -EBUSY;
2259 }
2260 }
2261 file->locked = true;
2262 *plock = new FileLock(file);
2263 dout(10) << __func__ << " locked " << file->fnode
2264 << " with " << *plock << dendl;
2265 return 0;
2266 }
2267
2268 int BlueFS::unlock_file(FileLock *fl)
2269 {
2270 std::lock_guard<std::mutex> l(lock);
2271 dout(10) << __func__ << " " << fl << " on " << fl->file->fnode << dendl;
2272 assert(fl->file->locked);
2273 fl->file->locked = false;
2274 delete fl;
2275 return 0;
2276 }
2277
2278 int BlueFS::readdir(const string& dirname, vector<string> *ls)
2279 {
2280 std::lock_guard<std::mutex> l(lock);
2281 dout(10) << __func__ << " " << dirname << dendl;
2282 if (dirname.empty()) {
2283 // list dirs
2284 ls->reserve(dir_map.size() + 2);
2285 for (auto& q : dir_map) {
2286 ls->push_back(q.first);
2287 }
2288 } else {
2289 // list files in dir
2290 map<string,DirRef>::iterator p = dir_map.find(dirname);
2291 if (p == dir_map.end()) {
2292 dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
2293 return -ENOENT;
2294 }
2295 DirRef dir = p->second;
2296 ls->reserve(dir->file_map.size() + 2);
2297 for (auto& q : dir->file_map) {
2298 ls->push_back(q.first);
2299 }
2300 }
2301 ls->push_back(".");
2302 ls->push_back("..");
2303 return 0;
2304 }
2305
2306 int BlueFS::unlink(const string& dirname, const string& filename)
2307 {
2308 std::lock_guard<std::mutex> l(lock);
2309 dout(10) << __func__ << " " << dirname << "/" << filename << dendl;
2310 map<string,DirRef>::iterator p = dir_map.find(dirname);
2311 if (p == dir_map.end()) {
2312 dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
2313 return -ENOENT;
2314 }
2315 DirRef dir = p->second;
2316 map<string,FileRef>::iterator q = dir->file_map.find(filename);
2317 if (q == dir->file_map.end()) {
2318 dout(20) << __func__ << " file " << dirname << "/" << filename
2319 << " not found" << dendl;
2320 return -ENOENT;
2321 }
2322 FileRef file = q->second;
2323 if (file->locked) {
2324 dout(20) << __func__ << " file " << dirname << "/" << filename
2325 << " is locked" << dendl;
2326 return -EBUSY;
2327 }
2328 dir->file_map.erase(filename);
2329 log_t.op_dir_unlink(dirname, filename);
2330 _drop_link(file);
2331 return 0;
2332 }
2333
2334 bool BlueFS::wal_is_rotational()
2335 {
2336 if (bdev[BDEV_WAL]) {
2337 return bdev[BDEV_WAL]->is_rotational();
2338 } else if (bdev[BDEV_DB]) {
2339 return bdev[BDEV_DB]->is_rotational();
2340 }
2341 return bdev[BDEV_SLOW]->is_rotational();
2342 }