]> git.proxmox.com Git - ceph.git/blob - ceph/src/os/bluestore/BlueFS.cc
update sources to 12.2.8
[ceph.git] / ceph / src / os / bluestore / BlueFS.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3
4 #include "boost/algorithm/string.hpp"
5 #include "BlueFS.h"
6
7 #include "common/debug.h"
8 #include "common/errno.h"
9 #include "common/perf_counters.h"
10 #include "BlockDevice.h"
11 #include "Allocator.h"
12 #include "include/assert.h"
13
14 #define dout_context cct
15 #define dout_subsys ceph_subsys_bluefs
16 #undef dout_prefix
17 #define dout_prefix *_dout << "bluefs "
18
// Mempool object factories for the BlueFS helper types. Each invocation names
// the type, a factory identifier and (third argument) the bluefs pool.
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::File, bluefs_file, bluefs);
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::Dir, bluefs_dir, bluefs);
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileWriter, bluefs_file_writer, bluefs);
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileReaderBuffer,
			      bluefs_file_reader_buffer, bluefs);
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileReader, bluefs_file_reader, bluefs);
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileLock, bluefs_file_lock, bluefs);
26
27
/// Construct a BlueFS instance with MAX_BDEV empty slots for devices, IO
/// contexts and owned-extent bookkeeping; no device is opened here.
BlueFS::BlueFS(CephContext* cct)
  : cct(cct),
    bdev(MAX_BDEV),
    ioc(MAX_BDEV),
    block_all(MAX_BDEV),
    block_total(MAX_BDEV, 0)
{
}
36
37 BlueFS::~BlueFS()
38 {
39 for (auto p : ioc) {
40 if (p)
41 p->aio_wait();
42 }
43 for (auto p : bdev) {
44 if (p) {
45 p->close();
46 delete p;
47 }
48 }
49 for (auto p : ioc) {
50 delete p;
51 }
52 }
53
54 void BlueFS::_init_logger()
55 {
56 PerfCountersBuilder b(cct, "bluefs",
57 l_bluefs_first, l_bluefs_last);
58 b.add_u64_counter(l_bluefs_gift_bytes, "gift_bytes",
59 "Bytes gifted from BlueStore", NULL, 0, unit_t(BYTES));
60 b.add_u64_counter(l_bluefs_reclaim_bytes, "reclaim_bytes",
61 "Bytes reclaimed by BlueStore", NULL, 0, unit_t(BYTES));
62 b.add_u64(l_bluefs_db_total_bytes, "db_total_bytes",
63 "Total bytes (main db device)",
64 "b", PerfCountersBuilder::PRIO_USEFUL, unit_t(BYTES));
65 b.add_u64(l_bluefs_db_used_bytes, "db_used_bytes",
66 "Used bytes (main db device)",
67 "u", PerfCountersBuilder::PRIO_USEFUL, unit_t(BYTES));
68 b.add_u64(l_bluefs_wal_total_bytes, "wal_total_bytes",
69 "Total bytes (wal device)",
70 "walb", PerfCountersBuilder::PRIO_USEFUL, unit_t(BYTES));
71 b.add_u64(l_bluefs_wal_used_bytes, "wal_used_bytes",
72 "Used bytes (wal device)",
73 "walu", PerfCountersBuilder::PRIO_USEFUL, unit_t(BYTES));
74 b.add_u64(l_bluefs_slow_total_bytes, "slow_total_bytes",
75 "Total bytes (slow device)",
76 "slob", PerfCountersBuilder::PRIO_USEFUL, unit_t(BYTES));
77 b.add_u64(l_bluefs_slow_used_bytes, "slow_used_bytes",
78 "Used bytes (slow device)",
79 "slou", PerfCountersBuilder::PRIO_USEFUL, unit_t(BYTES));
80 b.add_u64(l_bluefs_num_files, "num_files", "File count",
81 "f", PerfCountersBuilder::PRIO_USEFUL);
82 b.add_u64(l_bluefs_log_bytes, "log_bytes", "Size of the metadata log",
83 "jlen", PerfCountersBuilder::PRIO_INTERESTING, unit_t(BYTES));
84 b.add_u64_counter(l_bluefs_log_compactions, "log_compactions",
85 "Compactions of the metadata log");
86 b.add_u64_counter(l_bluefs_logged_bytes, "logged_bytes",
87 "Bytes written to the metadata log", "j",
88 PerfCountersBuilder::PRIO_CRITICAL, unit_t(BYTES));
89 b.add_u64_counter(l_bluefs_files_written_wal, "files_written_wal",
90 "Files written to WAL");
91 b.add_u64_counter(l_bluefs_files_written_sst, "files_written_sst",
92 "Files written to SSTs");
93 b.add_u64_counter(l_bluefs_bytes_written_wal, "bytes_written_wal",
94 "Bytes written to WAL", "wal",
95 PerfCountersBuilder::PRIO_CRITICAL);
96 b.add_u64_counter(l_bluefs_bytes_written_sst, "bytes_written_sst",
97 "Bytes written to SSTs", "sst",
98 PerfCountersBuilder::PRIO_CRITICAL, unit_t(BYTES));
99 logger = b.create_perf_counters();
100 cct->get_perfcounters_collection()->add(logger);
101 }
102
103 void BlueFS::_shutdown_logger()
104 {
105 cct->get_perfcounters_collection()->remove(logger);
106 delete logger;
107 }
108
109 void BlueFS::_update_logger_stats()
110 {
111 // we must be holding the lock
112 logger->set(l_bluefs_num_files, file_map.size());
113 logger->set(l_bluefs_log_bytes, log_writer->file->fnode.size);
114
115 if (alloc[BDEV_WAL]) {
116 logger->set(l_bluefs_wal_total_bytes, block_total[BDEV_WAL]);
117 logger->set(l_bluefs_wal_used_bytes,
118 block_total[BDEV_WAL] - alloc[BDEV_WAL]->get_free());
119 }
120 if (alloc[BDEV_DB]) {
121 logger->set(l_bluefs_db_total_bytes, block_total[BDEV_DB]);
122 logger->set(l_bluefs_db_used_bytes,
123 block_total[BDEV_DB] - alloc[BDEV_DB]->get_free());
124 }
125 if (alloc[BDEV_SLOW]) {
126 logger->set(l_bluefs_slow_total_bytes, block_total[BDEV_SLOW]);
127 logger->set(l_bluefs_slow_used_bytes,
128 block_total[BDEV_SLOW] - alloc[BDEV_SLOW]->get_free());
129 }
130 }
131
132 int BlueFS::add_block_device(unsigned id, const string& path)
133 {
134 dout(10) << __func__ << " bdev " << id << " path " << path << dendl;
135 assert(id < bdev.size());
136 assert(bdev[id] == NULL);
137 BlockDevice *b = BlockDevice::create(cct, path, NULL, NULL);
138 int r = b->open(path);
139 if (r < 0) {
140 delete b;
141 return r;
142 }
143 dout(1) << __func__ << " bdev " << id << " path " << path
144 << " size " << byte_u_t(b->get_size()) << dendl;
145 bdev[id] = b;
146 ioc[id] = new IOContext(cct, NULL);
147 return 0;
148 }
149
150 bool BlueFS::bdev_support_label(unsigned id)
151 {
152 assert(id < bdev.size());
153 assert(bdev[id]);
154 return bdev[id]->supported_bdev_label();
155 }
156
157 uint64_t BlueFS::get_block_device_size(unsigned id)
158 {
159 if (id < bdev.size() && bdev[id])
160 return bdev[id]->get_size();
161 return 0;
162 }
163
164 void BlueFS::add_block_extent(unsigned id, uint64_t offset, uint64_t length)
165 {
166 std::unique_lock<std::mutex> l(lock);
167 dout(1) << __func__ << " bdev " << id
168 << " 0x" << std::hex << offset << "~" << length << std::dec
169 << dendl;
170 assert(id < bdev.size());
171 assert(bdev[id]);
172 assert(bdev[id]->get_size() >= offset + length);
173 block_all[id].insert(offset, length);
174 block_total[id] += length;
175
176 if (id < alloc.size() && alloc[id]) {
177 log_t.op_alloc_add(id, offset, length);
178 int r = _flush_and_sync_log(l);
179 assert(r == 0);
180 alloc[id]->init_add_free(offset, length);
181 }
182
183 if (logger)
184 logger->inc(l_bluefs_gift_bytes, length);
185 dout(10) << __func__ << " done" << dendl;
186 }
187
188 int BlueFS::reclaim_blocks(unsigned id, uint64_t want,
189 AllocExtentVector *extents)
190 {
191 std::unique_lock<std::mutex> l(lock);
192 dout(1) << __func__ << " bdev " << id
193 << " want 0x" << std::hex << want << std::dec << dendl;
194 assert(id < alloc.size());
195 assert(alloc[id]);
196 int r = alloc[id]->reserve(want);
197 assert(r == 0); // caller shouldn't ask for more than they can get
198 int64_t got = alloc[id]->allocate(want, cct->_conf->bluefs_alloc_size, 0,
199 extents);
200 if (got < (int64_t)want) {
201 alloc[id]->unreserve(want - MAX(0, got));
202 }
203 if (got <= 0) {
204 derr << __func__ << " failed to allocate space to return to bluestore"
205 << dendl;
206 alloc[id]->dump();
207 return got;
208 }
209
210 for (auto& p : *extents) {
211 block_all[id].erase(p.offset, p.length);
212 block_total[id] -= p.length;
213 log_t.op_alloc_rm(id, p.offset, p.length);
214 }
215
216 flush_bdev();
217 r = _flush_and_sync_log(l);
218 assert(r == 0);
219
220 if (logger)
221 logger->inc(l_bluefs_reclaim_bytes, got);
222 dout(1) << __func__ << " bdev " << id << " want 0x" << std::hex << want
223 << " got " << *extents << dendl;
224 return 0;
225 }
226
227 uint64_t BlueFS::get_fs_usage()
228 {
229 std::lock_guard<std::mutex> l(lock);
230 uint64_t total_bytes = 0;
231 for (auto& p : file_map) {
232 total_bytes += p.second->fnode.get_allocated();
233 }
234 return total_bytes;
235 }
236
237 uint64_t BlueFS::get_total(unsigned id)
238 {
239 std::lock_guard<std::mutex> l(lock);
240 assert(id < block_all.size());
241 return block_total[id];
242 }
243
244 uint64_t BlueFS::get_free(unsigned id)
245 {
246 std::lock_guard<std::mutex> l(lock);
247 assert(id < alloc.size());
248 return alloc[id]->get_free();
249 }
250
/// Dump the perf counters into @p f inside a "bluefs_perf_counters" object
/// section.
void BlueFS::dump_perf_counters(Formatter *f)
{
  f->open_object_section("bluefs_perf_counters");
  logger->dump_formatted(f,0);
  f->close_section();
}
257
258 void BlueFS::dump_block_extents(ostream& out)
259 {
260 for (unsigned i = 0; i < MAX_BDEV; ++i) {
261 if (!bdev[i]) {
262 continue;
263 }
264 out << i << " : size 0x" << std::hex << bdev[i]->get_size()
265 << " : own 0x" << block_all[i] << std::dec << "\n";
266 }
267 }
268
269 void BlueFS::get_usage(vector<pair<uint64_t,uint64_t>> *usage)
270 {
271 std::lock_guard<std::mutex> l(lock);
272 usage->resize(bdev.size());
273 for (unsigned id = 0; id < bdev.size(); ++id) {
274 if (!bdev[id]) {
275 (*usage)[id] = make_pair(0, 0);
276 continue;
277 }
278 (*usage)[id].first = alloc[id]->get_free();
279 (*usage)[id].second = block_total[id];
280 uint64_t used =
281 (block_total[id] - (*usage)[id].first) * 100 / block_total[id];
282 dout(10) << __func__ << " bdev " << id
283 << " free " << (*usage)[id].first
284 << " (" << byte_u_t((*usage)[id].first) << ")"
285 << " / " << (*usage)[id].second
286 << " (" << byte_u_t((*usage)[id].second) << ")"
287 << ", used " << used << "%"
288 << dendl;
289 }
290 }
291
292 int BlueFS::get_block_extents(unsigned id, interval_set<uint64_t> *extents)
293 {
294 std::lock_guard<std::mutex> l(lock);
295 dout(10) << __func__ << " bdev " << id << dendl;
296 if (id >= block_all.size())
297 return -EINVAL;
298 *extents = block_all[id];
299 return 0;
300 }
301
302 int BlueFS::mkfs(uuid_d osd_uuid)
303 {
304 std::unique_lock<std::mutex> l(lock);
305 dout(1) << __func__
306 << " osd_uuid " << osd_uuid
307 << dendl;
308
309 _init_alloc();
310 _init_logger();
311
312 super.version = 1;
313 super.block_size = bdev[BDEV_DB]->get_block_size();
314 super.osd_uuid = osd_uuid;
315 super.uuid.generate_random();
316 dout(1) << __func__ << " uuid " << super.uuid << dendl;
317
318 // init log
319 FileRef log_file = new File;
320 log_file->fnode.ino = 1;
321 log_file->fnode.prefer_bdev = BDEV_WAL;
322 int r = _allocate(
323 log_file->fnode.prefer_bdev,
324 cct->_conf->bluefs_max_log_runway,
325 &log_file->fnode);
326 assert(r == 0);
327 log_writer = _create_writer(log_file);
328
329 // initial txn
330 log_t.op_init();
331 for (unsigned bdev = 0; bdev < MAX_BDEV; ++bdev) {
332 interval_set<uint64_t>& p = block_all[bdev];
333 if (p.empty())
334 continue;
335 for (interval_set<uint64_t>::iterator q = p.begin(); q != p.end(); ++q) {
336 dout(20) << __func__ << " op_alloc_add " << bdev << " 0x"
337 << std::hex << q.get_start() << "~" << q.get_len() << std::dec
338 << dendl;
339 log_t.op_alloc_add(bdev, q.get_start(), q.get_len());
340 }
341 }
342 _flush_and_sync_log(l);
343
344 // write supers
345 super.log_fnode = log_file->fnode;
346 _write_super();
347 flush_bdev();
348
349 // clean up
350 super = bluefs_super_t();
351 _close_writer(log_writer);
352 log_writer = NULL;
353 block_all.clear();
354 block_total.clear();
355 _stop_alloc();
356 _shutdown_logger();
357
358 dout(10) << __func__ << " success" << dendl;
359 return 0;
360 }
361
362 void BlueFS::_init_alloc()
363 {
364 dout(20) << __func__ << dendl;
365 alloc.resize(MAX_BDEV);
366 pending_release.resize(MAX_BDEV);
367 for (unsigned id = 0; id < bdev.size(); ++id) {
368 if (!bdev[id]) {
369 continue;
370 }
371 assert(bdev[id]->get_size());
372 alloc[id] = Allocator::create(cct, cct->_conf->bluefs_allocator,
373 bdev[id]->get_size(),
374 cct->_conf->bluefs_alloc_size);
375 interval_set<uint64_t>& p = block_all[id];
376 for (interval_set<uint64_t>::iterator q = p.begin(); q != p.end(); ++q) {
377 alloc[id]->init_add_free(q.get_start(), q.get_len());
378 }
379 }
380 }
381
382 void BlueFS::_stop_alloc()
383 {
384 dout(20) << __func__ << dendl;
385 for (auto p : alloc) {
386 if (p != nullptr) {
387 p->shutdown();
388 delete p;
389 }
390 }
391 alloc.clear();
392 }
393
394 int BlueFS::mount()
395 {
396 dout(1) << __func__ << dendl;
397
398 int r = _open_super();
399 if (r < 0) {
400 derr << __func__ << " failed to open super: " << cpp_strerror(r) << dendl;
401 goto out;
402 }
403
404 block_all.clear();
405 block_all.resize(MAX_BDEV);
406 block_total.clear();
407 block_total.resize(MAX_BDEV, 0);
408 _init_alloc();
409
410 r = _replay(false);
411 if (r < 0) {
412 derr << __func__ << " failed to replay log: " << cpp_strerror(r) << dendl;
413 _stop_alloc();
414 goto out;
415 }
416
417 // init freelist
418 for (auto& p : file_map) {
419 dout(30) << __func__ << " noting alloc for " << p.second->fnode << dendl;
420 for (auto& q : p.second->fnode.extents) {
421 alloc[q.bdev]->init_rm_free(q.offset, q.length);
422 }
423 }
424
425 // set up the log for future writes
426 log_writer = _create_writer(_get_file(1));
427 assert(log_writer->file->fnode.ino == 1);
428 log_writer->pos = log_writer->file->fnode.size;
429 dout(10) << __func__ << " log write pos set to 0x"
430 << std::hex << log_writer->pos << std::dec
431 << dendl;
432
433 _init_logger();
434 return 0;
435
436 out:
437 super = bluefs_super_t();
438 return r;
439 }
440
441 void BlueFS::umount()
442 {
443 dout(1) << __func__ << dendl;
444
445 sync_metadata();
446
447 _close_writer(log_writer);
448 log_writer = NULL;
449
450 _stop_alloc();
451 file_map.clear();
452 dir_map.clear();
453 super = bluefs_super_t();
454 log_t.clear();
455 _shutdown_logger();
456 }
457
458 void BlueFS::collect_metadata(map<string,string> *pm)
459 {
460 if (bdev[BDEV_DB])
461 bdev[BDEV_DB]->collect_metadata("bluefs_db_", pm);
462 if (bdev[BDEV_WAL])
463 bdev[BDEV_WAL]->collect_metadata("bluefs_wal_", pm);
464 if (bdev[BDEV_SLOW])
465 bdev[BDEV_SLOW]->collect_metadata("bluefs_slow_", pm);
466 }
467
/// Consistency check entry point. Takes the lock, logs the call and returns
/// 0; no checking is performed here.
int BlueFS::fsck()
{
  std::lock_guard<std::mutex> l(lock);
  dout(1) << __func__ << dendl;
  // hrm, i think we check everything on mount...
  return 0;
}
475
476 int BlueFS::_write_super()
477 {
478 // build superblock
479 bufferlist bl;
480 ::encode(super, bl);
481 uint32_t crc = bl.crc32c(-1);
482 ::encode(crc, bl);
483 dout(10) << __func__ << " super block length(encoded): " << bl.length() << dendl;
484 dout(10) << __func__ << " superblock " << super.version << dendl;
485 dout(10) << __func__ << " log_fnode " << super.log_fnode << dendl;
486 assert(bl.length() <= get_super_length());
487 bl.append_zero(get_super_length() - bl.length());
488
489 bdev[BDEV_DB]->write(get_super_offset(), bl, false);
490 dout(20) << __func__ << " v " << super.version
491 << " crc 0x" << std::hex << crc
492 << " offset 0x" << get_super_offset() << std::dec
493 << dendl;
494 return 0;
495 }
496
497 int BlueFS::_open_super()
498 {
499 dout(10) << __func__ << dendl;
500
501 bufferlist bl;
502 uint32_t expected_crc, crc;
503 int r;
504
505 // always the second block
506 r = bdev[BDEV_DB]->read(get_super_offset(), get_super_length(),
507 &bl, ioc[BDEV_DB], false);
508 if (r < 0)
509 return r;
510
511 bufferlist::iterator p = bl.begin();
512 ::decode(super, p);
513 {
514 bufferlist t;
515 t.substr_of(bl, 0, p.get_off());
516 crc = t.crc32c(-1);
517 }
518 ::decode(expected_crc, p);
519 if (crc != expected_crc) {
520 derr << __func__ << " bad crc on superblock, expected 0x"
521 << std::hex << expected_crc << " != actual 0x" << crc << std::dec
522 << dendl;
523 return -EIO;
524 }
525 dout(10) << __func__ << " superblock " << super.version << dendl;
526 dout(10) << __func__ << " log_fnode " << super.log_fnode << dendl;
527 return 0;
528 }
529
/**
 * Replay the metadata log described by super.log_fnode.
 *
 * Transactions are read one at a time, starting at the reader's current
 * position, until a block whose uuid or sequence number does not match is
 * found or a transaction would extend past what can be read. For every
 * replayed transaction the log file size is advanced to the reader position.
 *
 * @param noop  when true the log is only decoded and traced: a scratch File
 *              is used for the log and none of the ops are applied to
 *              block_all / block_total / alloc / dir_map / file_map.
 * @return 0 on success; -EIO if a transaction fails to decode, an unknown op
 *         is encountered, or (when !noop) a file other than ino 1 ends up
 *         with a link count of 0.
 */
int BlueFS::_replay(bool noop)
{
  dout(10) << __func__ << (noop ? " NO-OP" : "") << dendl;
  ino_last = 1;  // by the log
  log_seq = 0;

  FileRef log_file;
  if (noop) {
    log_file = new File;
  } else {
    log_file = _get_file(1);
  }
  log_file->fnode = super.log_fnode;
  dout(10) << __func__ << " log_fnode " << super.log_fnode << dendl;

  FileReader *log_reader = new FileReader(
    log_file, cct->_conf->bluefs_max_prefetch,
    false,  // !random
    true);  // ignore eof
  while (true) {
    // every transaction starts on a block boundary
    assert((log_reader->buf.pos & ~super.block_mask()) == 0);
    uint64_t pos = log_reader->buf.pos;
    uint64_t read_pos = pos;
    bufferlist bl;
    {
      // read the first block of the transaction
      int r = _read(log_reader, &log_reader->buf, read_pos, super.block_size,
		    &bl, NULL);
      assert(r == (int)super.block_size);
      read_pos += r;
    }
    uint64_t more = 0;
    uint64_t seq;
    uuid_d uuid;
    {
      // peek at the transaction header to learn its length, uuid and seq
      bufferlist::iterator p = bl.begin();
      __u8 a, b;  // NOTE(review): presumably encoding version bytes - confirm
      uint32_t len;
      ::decode(a, p);
      ::decode(b, p);
      ::decode(len, p);
      ::decode(uuid, p);
      ::decode(seq, p);
      // 6 = the two __u8 values plus the uint32_t length decoded above
      if (len + 6 > bl.length()) {
	more = ROUND_UP_TO(len + 6 - bl.length(), super.block_size);
      }
    }
    if (uuid != super.uuid) {
      dout(10) << __func__ << " 0x" << std::hex << pos << std::dec
               << ": stop: uuid " << uuid << " != super.uuid " << super.uuid
               << dendl;
      break;
    }
    if (seq != log_seq + 1) {
      dout(10) << __func__ << " 0x" << std::hex << pos << std::dec
               << ": stop: seq " << seq << " != expected " << log_seq + 1
               << dendl;
      break;
    }
    if (more) {
      // the transaction spans more than one block; read the remainder
      dout(20) << __func__ << " need 0x" << std::hex << more << std::dec
               << " more bytes" << dendl;
      bufferlist t;
      int r = _read(log_reader, &log_reader->buf, read_pos, more, &t, NULL);
      if (r < (int)more) {
	dout(10) << __func__ << " 0x" << std::hex << pos
                 << ": stop: len is 0x" << bl.length() + more << std::dec
                 << ", which is past eof" << dendl;
	break;
      }
      assert(r == (int)more);
      bl.claim_append(t);
      read_pos += r;
    }
    bluefs_transaction_t t;
    try {
      bufferlist::iterator p = bl.begin();
      ::decode(t, p);
    }
    catch (buffer::error& e) {
      dout(10) << __func__ << " 0x" << std::hex << pos << std::dec
               << ": stop: failed to decode: " << e.what()
               << dendl;
      delete log_reader;
      return -EIO;
    }
    assert(seq == t.seq);
    dout(10) << __func__ << " 0x" << std::hex << pos << std::dec
             << ": " << t << dendl;

    // walk the ops encoded inside the transaction
    bufferlist::iterator p = t.op_bl.begin();
    while (!p.end()) {
      __u8 op;
      ::decode(op, p);
      switch (op) {

      case bluefs_transaction_t::OP_INIT:
	dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
                 << ":  op_init" << dendl;
	assert(t.seq == 1);
	break;

      case bluefs_transaction_t::OP_JUMP:
        {
	  uint64_t next_seq;
	  uint64_t offset;
	  ::decode(next_seq, p);
	  ::decode(offset, p);
	  dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
		   << ":  op_jump seq " << next_seq
		   << " offset 0x" << std::hex << offset << std::dec << dendl;
	  assert(next_seq >= log_seq);
	  log_seq = next_seq - 1; // we will increment it below
	  // consume the bytes between the current read position and the
	  // jump target so the reader ends up at `offset`
	  uint64_t skip = offset - read_pos;
	  if (skip) {
	    bufferlist junk;
	    int r = _read(log_reader, &log_reader->buf, read_pos, skip, &junk,
			  NULL);
	    if (r != (int)skip) {
	      dout(10) << __func__ << " 0x" << std::hex << read_pos
		       << ": stop: failed to skip to " << offset
		       << std::dec << dendl;
	      assert(0 == "problem with op_jump");
	    }
	  }
	}
	break;

      case bluefs_transaction_t::OP_JUMP_SEQ:
        {
	  uint64_t next_seq;
	  ::decode(next_seq, p);
	  dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
                   << ":  op_jump_seq " << next_seq << dendl;
	  assert(next_seq >= log_seq);
	  log_seq = next_seq - 1; // we will increment it below
	}
	break;

      case bluefs_transaction_t::OP_ALLOC_ADD:
        {
	  __u8 id;
	  uint64_t offset, length;
	  ::decode(id, p);
	  ::decode(offset, p);
	  ::decode(length, p);
	  dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
                   << ":  op_alloc_add " << " " << (int)id
                   << ":0x" << std::hex << offset << "~" << length << std::dec
                   << dendl;
	  if (!noop) {
	    block_all[id].insert(offset, length);
	    block_total[id] += length;
	    alloc[id]->init_add_free(offset, length);
	  }
	}
	break;

      case bluefs_transaction_t::OP_ALLOC_RM:
        {
	  __u8 id;
	  uint64_t offset, length;
	  ::decode(id, p);
	  ::decode(offset, p);
	  ::decode(length, p);
	  dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
                   << ":  op_alloc_rm " << " " << (int)id
                   << ":0x" << std::hex << offset << "~" << length << std::dec
                   << dendl;
	  if (!noop) {
	    block_all[id].erase(offset, length);
	    block_total[id] -= length;
	    alloc[id]->init_rm_free(offset, length);
	  }
	}
	break;

      case bluefs_transaction_t::OP_DIR_LINK:
        {
	  string dirname, filename;
	  uint64_t ino;
	  ::decode(dirname, p);
	  ::decode(filename, p);
	  ::decode(ino, p);
	  dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
                   << ":  op_dir_link " << " " << dirname << "/" << filename
                   << " to " << ino
		   << dendl;
	  if (!noop) {
	    FileRef file = _get_file(ino);
	    assert(file->fnode.ino);
	    // the directory must exist and must not already hold this name
	    map<string,DirRef>::iterator q = dir_map.find(dirname);
	    assert(q != dir_map.end());
	    map<string,FileRef>::iterator r = q->second->file_map.find(filename);
	    assert(r == q->second->file_map.end());
	    q->second->file_map[filename] = file;
	    ++file->refs;
	  }
	}
	break;

      case bluefs_transaction_t::OP_DIR_UNLINK:
        {
	  string dirname, filename;
	  ::decode(dirname, p);
	  ::decode(filename, p);
	  dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
                   << ":  op_dir_unlink " << " " << dirname << "/" << filename
                   << dendl;
	  if (!noop) {
	    map<string,DirRef>::iterator q = dir_map.find(dirname);
	    assert(q != dir_map.end());
	    map<string,FileRef>::iterator r = q->second->file_map.find(filename);
	    assert(r != q->second->file_map.end());
            assert(r->second->refs > 0);
	    --r->second->refs;
	    q->second->file_map.erase(r);
	  }
	}
	break;

      case bluefs_transaction_t::OP_DIR_CREATE:
        {
	  string dirname;
	  ::decode(dirname, p);
	  dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
                   << ":  op_dir_create " << dirname << dendl;
	  if (!noop) {
	    map<string,DirRef>::iterator q = dir_map.find(dirname);
	    assert(q == dir_map.end());
	    dir_map[dirname] = new Dir;
	  }
	}
	break;

      case bluefs_transaction_t::OP_DIR_REMOVE:
        {
	  string dirname;
	  ::decode(dirname, p);
	  dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
                   << ":  op_dir_remove " << dirname << dendl;
	  if (!noop) {
	    map<string,DirRef>::iterator q = dir_map.find(dirname);
	    assert(q != dir_map.end());
	    assert(q->second->file_map.empty());
	    dir_map.erase(q);
	  }
	}
	break;

      case bluefs_transaction_t::OP_FILE_UPDATE:
        {
	  bluefs_fnode_t fnode;
	  ::decode(fnode, p);
	  dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
                   << ":  op_file_update " << " " << fnode << dendl;
	  if (!noop) {
	    FileRef f = _get_file(fnode.ino);
	    f->fnode = fnode;
	    // track the highest inode number seen in the log
	    if (fnode.ino > ino_last) {
	      ino_last = fnode.ino;
	    }
	  }
	}
	break;

      case bluefs_transaction_t::OP_FILE_REMOVE:
        {
	  uint64_t ino;
	  ::decode(ino, p);
	  dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
                   << ":  op_file_remove " << ino << dendl;
	  if (!noop) {
	    // note: this `p` shadows the op iterator for the rest of the scope
	    auto p = file_map.find(ino);
	    assert(p != file_map.end());
	    file_map.erase(p);
	  }
	}
	break;

      default:
	derr << __func__ << " 0x" << std::hex << pos << std::dec
             << ": stop: unrecognized op " << (int)op << dendl;
	delete log_reader;
        return -EIO;
      }
    }
    assert(p.end());

    // we successfully replayed the transaction; bump the seq and log size
    ++log_seq;
    log_file->fnode.size = log_reader->buf.pos;
  }

  dout(10) << __func__ << " log file size was 0x"
           << std::hex << log_file->fnode.size << std::dec << dendl;
  delete log_reader;

  if (!noop) {
    // verify file link counts are all >0
    for (auto& p : file_map) {
      if (p.second->refs == 0 &&
	  p.second->fnode.ino > 1) {
	derr << __func__ << " file with link count 0: " << p.second->fnode
	     << dendl;
	return -EIO;
      }
    }
  }

  dout(10) << __func__ << " done" << dendl;
  return 0;
}
842
843 BlueFS::FileRef BlueFS::_get_file(uint64_t ino)
844 {
845 auto p = file_map.find(ino);
846 if (p == file_map.end()) {
847 FileRef f = new File;
848 file_map[ino] = f;
849 dout(30) << __func__ << " ino " << ino << " = " << f
850 << " (new)" << dendl;
851 return f;
852 } else {
853 dout(30) << __func__ << " ino " << ino << " = " << p->second << dendl;
854 return p->second;
855 }
856 }
857
858 void BlueFS::_drop_link(FileRef file)
859 {
860 dout(20) << __func__ << " had refs " << file->refs
861 << " on " << file->fnode << dendl;
862 assert(file->refs > 0);
863 --file->refs;
864 if (file->refs == 0) {
865 dout(20) << __func__ << " destroying " << file->fnode << dendl;
866 assert(file->num_reading.load() == 0);
867 log_t.op_file_remove(file->fnode.ino);
868 for (auto& r : file->fnode.extents) {
869 pending_release[r.bdev].insert(r.offset, r.length);
870 }
871 file_map.erase(file->fnode.ino);
872 file->deleted = true;
873
874 if (file->dirty_seq) {
875 assert(file->dirty_seq > log_seq_stable);
876 assert(dirty_files.count(file->dirty_seq));
877 auto it = dirty_files[file->dirty_seq].iterator_to(*file);
878 dirty_files[file->dirty_seq].erase(it);
879 file->dirty_seq = 0;
880 }
881 }
882 }
883
884 int BlueFS::_read_random(
885 FileReader *h, ///< [in] read from here
886 uint64_t off, ///< [in] offset
887 size_t len, ///< [in] this many bytes
888 char *out) ///< [out] optional: or copy it here
889 {
890 dout(10) << __func__ << " h " << h
891 << " 0x" << std::hex << off << "~" << len << std::dec
892 << " from " << h->file->fnode << dendl;
893
894 ++h->file->num_reading;
895
896 if (!h->ignore_eof &&
897 off + len > h->file->fnode.size) {
898 if (off > h->file->fnode.size)
899 len = 0;
900 else
901 len = h->file->fnode.size - off;
902 dout(20) << __func__ << " reaching (or past) eof, len clipped to 0x"
903 << std::hex << len << std::dec << dendl;
904 }
905
906 int ret = 0;
907 while (len > 0) {
908 uint64_t x_off = 0;
909 auto p = h->file->fnode.seek(off, &x_off);
910 uint64_t l = MIN(p->length - x_off, len);
911 dout(20) << __func__ << " read buffered 0x"
912 << std::hex << x_off << "~" << l << std::dec
913 << " of " << *p << dendl;
914 int r = bdev[p->bdev]->read_random(p->offset + x_off, l, out,
915 cct->_conf->bluefs_buffered_io);
916 assert(r == 0);
917 off += l;
918 len -= l;
919 ret += l;
920 out += l;
921 }
922
923 dout(20) << __func__ << " got " << ret << dendl;
924 --h->file->num_reading;
925 return ret;
926 }
927
/**
 * Buffered read of @p len bytes at @p off.
 *
 * Data is served from @p buf; whenever @p off falls outside the buffered
 * range the buffer is refilled from the device with a block-aligned read of
 * at least buf->max_prefetch bytes (limited to the current extent and,
 * unless the reader ignores eof, to the block-rounded file size). Unless the
 * reader ignores eof, @p len is first clipped at the file size.
 *
 * @return number of bytes read; buf->pos is advanced by the same amount.
 */
int BlueFS::_read(
  FileReader *h,         ///< [in] read from here
  FileReaderBuffer *buf, ///< [in] reader state
  uint64_t off,          ///< [in] offset
  size_t len,            ///< [in] this many bytes
  bufferlist *outbl,     ///< [out] optional: reference the result here
  char *out)             ///< [out] optional: or copy it here
{
  dout(10) << __func__ << " h " << h
           << " 0x" << std::hex << off << "~" << len << std::dec
	   << " from " << h->file->fnode << dendl;

  ++h->file->num_reading;

  if (!h->ignore_eof &&
      off + len > h->file->fnode.size) {
    if (off > h->file->fnode.size)
      len = 0;
    else
      len = h->file->fnode.size - off;
    dout(20) << __func__ << " reaching (or past) eof, len clipped to 0x"
	     << std::hex << len << std::dec << dendl;
  }
  if (outbl)
    outbl->clear();

  int ret = 0;
  while (len > 0) {
    size_t left;
    // refill the buffer when `off` is not inside the buffered range
    if (off < buf->bl_off || off >= buf->get_buf_end()) {
      buf->bl.clear();
      buf->bl_off = off & super.block_mask();
      uint64_t x_off = 0;
      auto p = h->file->fnode.seek(buf->bl_off, &x_off);
      // whole blocks covering the request, but at least max_prefetch
      uint64_t want = ROUND_UP_TO(len + (off & ~super.block_mask()),
				  super.block_size);
      want = MAX(want, buf->max_prefetch);
      // a single device read never crosses the end of the extent
      uint64_t l = MIN(p->length - x_off, want);
      uint64_t eof_offset = ROUND_UP_TO(h->file->fnode.size, super.block_size);
      if (!h->ignore_eof &&
	  buf->bl_off + l > eof_offset) {
	l = eof_offset - buf->bl_off;
      }
      dout(20) << __func__ << " fetching 0x"
               << std::hex << x_off << "~" << l << std::dec
               << " of " << *p << dendl;
      int r = bdev[p->bdev]->read(p->offset + x_off, l, &buf->bl, ioc[p->bdev],
				  cct->_conf->bluefs_buffered_io);
      assert(r == 0);
    }
    left = buf->get_buf_remaining(off);
    dout(20) << __func__ << " left 0x" << std::hex << left
             << " len 0x" << len << std::dec << dendl;

    // hand out whatever the buffer can satisfy in this round
    int r = MIN(len, left);
    if (outbl) {
      bufferlist t;
      t.substr_of(buf->bl, off - buf->bl_off, r);
      outbl->claim_append(t);
    }
    if (out) {
      // NOTE: h->bl is normally a contiguous buffer so c_str() is free.
      memcpy(out, buf->bl.c_str() + off - buf->bl_off, r);
      out += r;
    }

    // the statements up to dendl sit between the dout(30) and dendl macros
    dout(30) << __func__ << " result chunk (0x"
             << std::hex << r << std::dec << " bytes):\n";
    bufferlist t;
    t.substr_of(buf->bl, off - buf->bl_off, r);
    t.hexdump(*_dout);
    *_dout << dendl;

    off += r;
    len -= r;
    ret += r;
    buf->pos += r;
  }

  dout(20) << __func__ << " got " << ret << dendl;
  assert(!outbl || (int)outbl->length() == ret);
  --h->file->num_reading;
  return ret;
}
1012
1013 void BlueFS::_invalidate_cache(FileRef f, uint64_t offset, uint64_t length)
1014 {
1015 dout(10) << __func__ << " file " << f->fnode
1016 << " 0x" << std::hex << offset << "~" << length << std::dec
1017 << dendl;
1018 if (offset & ~super.block_mask()) {
1019 offset &= super.block_mask();
1020 length = ROUND_UP_TO(length, super.block_size);
1021 }
1022 uint64_t x_off = 0;
1023 auto p = f->fnode.seek(offset, &x_off);
1024 while (length > 0 && p != f->fnode.extents.end()) {
1025 uint64_t x_len = MIN(p->length - x_off, length);
1026 bdev[p->bdev]->invalidate_cache(p->offset + x_off, x_len);
1027 dout(20) << __func__ << " 0x" << std::hex << x_off << "~" << x_len
1028 << std:: dec << " of " << *p << dendl;
1029 offset += x_len;
1030 length -= x_len;
1031 }
1032 }
1033
1034 uint64_t BlueFS::_estimate_log_size()
1035 {
1036 int avg_dir_size = 40; // fixme
1037 int avg_file_size = 12;
1038 uint64_t size = 4096 * 2;
1039 size += file_map.size() * (1 + sizeof(bluefs_fnode_t));
1040 for (auto& p : block_all)
1041 size += p.num_intervals() * (1 + 1 + sizeof(uint64_t) * 2);
1042 size += dir_map.size() + (1 + avg_dir_size);
1043 size += file_map.size() * (1 + avg_dir_size + avg_file_size);
1044 return ROUND_UP_TO(size, super.block_size);
1045 }
1046
1047 void BlueFS::compact_log()
1048 {
1049 std::unique_lock<std::mutex> l(lock);
1050 if (cct->_conf->bluefs_compact_log_sync) {
1051 _compact_log_sync();
1052 } else {
1053 _compact_log_async(l);
1054 }
1055 }
1056
1057 bool BlueFS::_should_compact_log()
1058 {
1059 uint64_t current = log_writer->file->fnode.size;
1060 uint64_t expected = _estimate_log_size();
1061 float ratio = (float)current / (float)expected;
1062 dout(10) << __func__ << " current 0x" << std::hex << current
1063 << " expected " << expected << std::dec
1064 << " ratio " << ratio
1065 << (new_log ? " (async compaction in progress)" : "")
1066 << dendl;
1067 if (new_log ||
1068 current < cct->_conf->bluefs_log_compact_min_size ||
1069 ratio < cct->_conf->bluefs_log_compact_min_ratio) {
1070 return false;
1071 }
1072 return true;
1073 }
1074
1075 void BlueFS::_compact_log_dump_metadata(bluefs_transaction_t *t)
1076 {
1077 t->seq = 1;
1078 t->uuid = super.uuid;
1079 dout(20) << __func__ << " op_init" << dendl;
1080
1081 t->op_init();
1082 for (unsigned bdev = 0; bdev < MAX_BDEV; ++bdev) {
1083 interval_set<uint64_t>& p = block_all[bdev];
1084 for (interval_set<uint64_t>::iterator q = p.begin(); q != p.end(); ++q) {
1085 dout(20) << __func__ << " op_alloc_add " << bdev << " 0x"
1086 << std::hex << q.get_start() << "~" << q.get_len() << std::dec
1087 << dendl;
1088 t->op_alloc_add(bdev, q.get_start(), q.get_len());
1089 }
1090 }
1091 for (auto& p : file_map) {
1092 if (p.first == 1)
1093 continue;
1094 dout(20) << __func__ << " op_file_update " << p.second->fnode << dendl;
1095 assert(p.first > 1);
1096 t->op_file_update(p.second->fnode);
1097 }
1098 for (auto& p : dir_map) {
1099 dout(20) << __func__ << " op_dir_create " << p.first << dendl;
1100 t->op_dir_create(p.first);
1101 for (auto& q : p.second->file_map) {
1102 dout(20) << __func__ << " op_dir_link " << p.first << "/" << q.first
1103 << " to " << q.second->fnode.ino << dendl;
1104 t->op_dir_link(p.first, q.first, q.second->fnode.ino);
1105 }
1106 }
1107 }
1108
1109 void BlueFS::_compact_log_sync()
1110 {
1111 dout(10) << __func__ << dendl;
1112 File *log_file = log_writer->file.get();
1113
1114 // clear out log (be careful who calls us!!!)
1115 log_t.clear();
1116
1117 bluefs_transaction_t t;
1118 _compact_log_dump_metadata(&t);
1119
1120 dout(20) << __func__ << " op_jump_seq " << log_seq << dendl;
1121 t.op_jump_seq(log_seq);
1122
1123 bufferlist bl;
1124 ::encode(t, bl);
1125 _pad_bl(bl);
1126
1127 uint64_t need = bl.length() + cct->_conf->bluefs_max_log_runway;
1128 dout(20) << __func__ << " need " << need << dendl;
1129
1130 mempool::bluefs::vector<bluefs_extent_t> old_extents;
1131 uint64_t old_allocated = 0;
1132 log_file->fnode.swap_extents(old_extents, old_allocated);
1133 while (log_file->fnode.get_allocated() < need) {
1134 int r = _allocate(log_file->fnode.prefer_bdev,
1135 need - log_file->fnode.get_allocated(),
1136 &log_file->fnode);
1137 assert(r == 0);
1138 }
1139
1140 _close_writer(log_writer);
1141
1142 log_file->fnode.size = bl.length();
1143 log_writer = _create_writer(log_file);
1144 log_writer->append(bl);
1145 int r = _flush(log_writer, true);
1146 assert(r == 0);
1147 wait_for_aio(log_writer);
1148
1149 list<aio_t> completed_ios;
1150 _claim_completed_aios(log_writer, &completed_ios);
1151 flush_bdev();
1152 completed_ios.clear();
1153
1154 dout(10) << __func__ << " writing super" << dendl;
1155 super.log_fnode = log_file->fnode;
1156 ++super.version;
1157 _write_super();
1158 flush_bdev();
1159
1160 dout(10) << __func__ << " release old log extents " << old_extents << dendl;
1161 for (auto& r : old_extents) {
1162 pending_release[r.bdev].insert(r.offset, r.length);
1163 }
1164
1165 logger->inc(l_bluefs_log_compactions);
1166 }
1167
1168 /*
1169 * 1. Allocate a new extent to continue the log, and then log an event
1170 * that jumps the log write position to the new extent. At this point, the
1171 * old extent(s) won't be written to, and reflect everything to compact.
1172 * New events will be written to the new region that we'll keep.
1173 *
1174 * 2. While still holding the lock, encode a bufferlist that dumps all of the
1175 * in-memory fnodes and names. This will become the new beginning of the
1176 * log. The last event will jump to the log continuation extent from #1.
1177 *
1178 * 3. Queue a write to a new extent for the new beginnging of the log.
1179 *
1180 * 4. Drop lock and wait
1181 *
1182 * 5. Retake the lock.
1183 *
1184 * 6. Update the log_fnode to splice in the new beginning.
1185 *
1186 * 7. Write the new superblock.
1187 *
1188 * 8. Release the old log space. Clean up.
1189 */
1190 void BlueFS::_compact_log_async(std::unique_lock<std::mutex>& l)
1191 {
1192 dout(10) << __func__ << dendl;
1193 File *log_file = log_writer->file.get();
1194 assert(!new_log);
1195 assert(!new_log_writer);
1196
1197 // create a new log [writer] so that we know compaction is in progress
1198 // (see _should_compact_log)
1199 new_log = new File;
1200 new_log->fnode.ino = 0; // so that _flush_range won't try to log the fnode
1201
1202 // 0. wait for any racing flushes to complete. (We do not want to block
1203 // in _flush_sync_log with jump_to set or else a racing thread might flush
1204 // our entries and our jump_to update won't be correct.)
1205 while (log_flushing) {
1206 dout(10) << __func__ << " log is currently flushing, waiting" << dendl;
1207 log_cond.wait(l);
1208 }
1209
1210 // 1. allocate new log space and jump to it.
1211 old_log_jump_to = log_file->fnode.get_allocated();
1212 uint64_t need = old_log_jump_to + cct->_conf->bluefs_max_log_runway;
1213 dout(10) << __func__ << " old_log_jump_to 0x" << std::hex << old_log_jump_to
1214 << " need 0x" << need << std::dec << dendl;
1215 while (log_file->fnode.get_allocated() < need) {
1216 int r = _allocate(log_file->fnode.prefer_bdev,
1217 cct->_conf->bluefs_max_log_runway,
1218 &log_file->fnode);
1219 assert(r == 0);
1220 }
1221 dout(10) << __func__ << " log extents " << log_file->fnode.extents << dendl;
1222
1223 // update the log file change and log a jump to the offset where we want to
1224 // write the new entries
1225 log_t.op_file_update(log_file->fnode);
1226 log_t.op_jump(log_seq, old_log_jump_to);
1227
1228 flush_bdev(); // FIXME?
1229
1230 _flush_and_sync_log(l, 0, old_log_jump_to);
1231
1232 // 2. prepare compacted log
1233 bluefs_transaction_t t;
1234 //avoid record two times in log_t and _compact_log_dump_metadata.
1235 log_t.clear();
1236 _compact_log_dump_metadata(&t);
1237
1238 // conservative estimate for final encoded size
1239 new_log_jump_to = ROUND_UP_TO(t.op_bl.length() + super.block_size * 2,
1240 cct->_conf->bluefs_alloc_size);
1241 t.op_jump(log_seq, new_log_jump_to);
1242
1243 bufferlist bl;
1244 ::encode(t, bl);
1245 _pad_bl(bl);
1246
1247 dout(10) << __func__ << " new_log_jump_to 0x" << std::hex << new_log_jump_to
1248 << std::dec << dendl;
1249
1250 // allocate
1251 int r = _allocate(BlueFS::BDEV_DB, new_log_jump_to,
1252 &new_log->fnode);
1253 assert(r == 0);
1254 new_log_writer = _create_writer(new_log);
1255 new_log_writer->append(bl);
1256
1257 // 3. flush
1258 r = _flush(new_log_writer, true);
1259 assert(r == 0);
1260 lock.unlock();
1261
1262 // 4. wait
1263 dout(10) << __func__ << " waiting for compacted log to sync" << dendl;
1264 wait_for_aio(new_log_writer);
1265
1266 list<aio_t> completed_ios;
1267 _claim_completed_aios(new_log_writer, &completed_ios);
1268 flush_bdev();
1269 completed_ios.clear();
1270
1271 // 5. retake lock
1272 lock.lock();
1273
1274 // 6. update our log fnode
1275 // discard first old_log_jump_to extents
1276 dout(10) << __func__ << " remove 0x" << std::hex << old_log_jump_to << std::dec
1277 << " of " << log_file->fnode.extents << dendl;
1278 uint64_t discarded = 0;
1279 mempool::bluefs::vector<bluefs_extent_t> old_extents;
1280 while (discarded < old_log_jump_to) {
1281 assert(!log_file->fnode.extents.empty());
1282 bluefs_extent_t& e = log_file->fnode.extents.front();
1283 bluefs_extent_t temp = e;
1284 if (discarded + e.length <= old_log_jump_to) {
1285 dout(10) << __func__ << " remove old log extent " << e << dendl;
1286 discarded += e.length;
1287 log_file->fnode.pop_front_extent();
1288 } else {
1289 dout(10) << __func__ << " remove front of old log extent " << e << dendl;
1290 uint64_t drop = old_log_jump_to - discarded;
1291 temp.length = drop;
1292 e.offset += drop;
1293 e.length -= drop;
1294 discarded += drop;
1295 dout(10) << __func__ << " kept " << e << " removed " << temp << dendl;
1296 }
1297 old_extents.push_back(temp);
1298 }
1299 auto from = log_file->fnode.extents.begin();
1300 auto to = log_file->fnode.extents.end();
1301 while (from != to) {
1302 new_log->fnode.append_extent(*from);
1303 ++from;
1304 }
1305
1306 // clear the extents from old log file, they are added to new log
1307 log_file->fnode.clear_extents();
1308 // swap the log files. New log file is the log file now.
1309 new_log->fnode.swap_extents(log_file->fnode);
1310
1311 log_writer->pos = log_writer->file->fnode.size =
1312 log_writer->pos - old_log_jump_to + new_log_jump_to;
1313
1314 // 7. write the super block to reflect the changes
1315 dout(10) << __func__ << " writing super" << dendl;
1316 super.log_fnode = log_file->fnode;
1317 ++super.version;
1318 _write_super();
1319
1320 lock.unlock();
1321 flush_bdev();
1322 lock.lock();
1323
1324 // 8. release old space
1325 dout(10) << __func__ << " release old log extents " << old_extents << dendl;
1326 for (auto& r : old_extents) {
1327 pending_release[r.bdev].insert(r.offset, r.length);
1328 }
1329
1330 // delete the new log, remove from the dirty files list
1331 _close_writer(new_log_writer);
1332 if (new_log->dirty_seq) {
1333 assert(dirty_files.count(new_log->dirty_seq));
1334 auto it = dirty_files[new_log->dirty_seq].iterator_to(*new_log);
1335 dirty_files[new_log->dirty_seq].erase(it);
1336 }
1337 new_log_writer = nullptr;
1338 new_log = nullptr;
1339 log_cond.notify_all();
1340
1341 dout(10) << __func__ << " log extents " << log_file->fnode.extents << dendl;
1342 logger->inc(l_bluefs_log_compactions);
1343 }
1344
1345 void BlueFS::_pad_bl(bufferlist& bl)
1346 {
1347 uint64_t partial = bl.length() % super.block_size;
1348 if (partial) {
1349 dout(10) << __func__ << " padding with 0x" << std::hex
1350 << super.block_size - partial << " zeros" << std::dec << dendl;
1351 bl.append_zero(super.block_size - partial);
1352 }
1353 }
1354
1355 void BlueFS::flush_log()
1356 {
1357 std::unique_lock<std::mutex> l(lock);
1358 flush_bdev();
1359 _flush_and_sync_log(l);
1360 }
1361
1362 int BlueFS::_flush_and_sync_log(std::unique_lock<std::mutex>& l,
1363 uint64_t want_seq,
1364 uint64_t jump_to)
1365 {
1366 while (log_flushing) {
1367 dout(10) << __func__ << " want_seq " << want_seq
1368 << " log is currently flushing, waiting" << dendl;
1369 assert(!jump_to);
1370 log_cond.wait(l);
1371 }
1372 if (want_seq && want_seq <= log_seq_stable) {
1373 dout(10) << __func__ << " want_seq " << want_seq << " <= log_seq_stable "
1374 << log_seq_stable << ", done" << dendl;
1375 assert(!jump_to);
1376 return 0;
1377 }
1378 if (log_t.empty() && dirty_files.empty()) {
1379 dout(10) << __func__ << " want_seq " << want_seq
1380 << " " << log_t << " not dirty, dirty_files empty, no-op" << dendl;
1381 assert(!jump_to);
1382 return 0;
1383 }
1384
1385 uint64_t seq = log_t.seq = ++log_seq;
1386 assert(want_seq == 0 || want_seq <= seq);
1387 log_t.uuid = super.uuid;
1388
1389 // log dirty files
1390 auto lsi = dirty_files.find(seq);
1391 if (lsi != dirty_files.end()) {
1392 dout(20) << __func__ << " " << lsi->second.size() << " dirty_files" << dendl;
1393 for (auto &f : lsi->second) {
1394 dout(20) << __func__ << " op_file_update " << f.fnode << dendl;
1395 log_t.op_file_update(f.fnode);
1396 }
1397 }
1398
1399 dout(10) << __func__ << " " << log_t << dendl;
1400 assert(!log_t.empty());
1401
1402 // allocate some more space (before we run out)?
1403 int64_t runway = log_writer->file->fnode.get_allocated() -
1404 log_writer->get_effective_write_pos();
1405 if (runway < (int64_t)cct->_conf->bluefs_min_log_runway) {
1406 dout(10) << __func__ << " allocating more log runway (0x"
1407 << std::hex << runway << std::dec << " remaining)" << dendl;
1408 while (new_log_writer) {
1409 dout(10) << __func__ << " waiting for async compaction" << dendl;
1410 log_cond.wait(l);
1411 }
1412 int r = _allocate(log_writer->file->fnode.prefer_bdev,
1413 cct->_conf->bluefs_max_log_runway,
1414 &log_writer->file->fnode);
1415 assert(r == 0);
1416 log_t.op_file_update(log_writer->file->fnode);
1417 }
1418
1419 bufferlist bl;
1420 ::encode(log_t, bl);
1421
1422 // pad to block boundary
1423 _pad_bl(bl);
1424 logger->inc(l_bluefs_logged_bytes, bl.length());
1425
1426 log_writer->append(bl);
1427
1428 log_t.clear();
1429 log_t.seq = 0; // just so debug output is less confusing
1430 log_flushing = true;
1431
1432 int r = _flush(log_writer, true);
1433 assert(r == 0);
1434
1435 if (jump_to) {
1436 dout(10) << __func__ << " jumping log offset from 0x" << std::hex
1437 << log_writer->pos << " -> 0x" << jump_to << std::dec << dendl;
1438 log_writer->pos = jump_to;
1439 log_writer->file->fnode.size = jump_to;
1440 }
1441
1442 _flush_bdev_safely(log_writer);
1443
1444 log_flushing = false;
1445 log_cond.notify_all();
1446
1447 // clean dirty files
1448 if (seq > log_seq_stable) {
1449 log_seq_stable = seq;
1450 dout(20) << __func__ << " log_seq_stable " << log_seq_stable << dendl;
1451
1452 auto p = dirty_files.begin();
1453 while (p != dirty_files.end()) {
1454 if (p->first > log_seq_stable) {
1455 dout(20) << __func__ << " done cleaning up dirty files" << dendl;
1456 break;
1457 }
1458
1459 auto l = p->second.begin();
1460 while (l != p->second.end()) {
1461 File *file = &*l;
1462 assert(file->dirty_seq > 0);
1463 assert(file->dirty_seq <= log_seq_stable);
1464 dout(20) << __func__ << " cleaned file " << file->fnode << dendl;
1465 file->dirty_seq = 0;
1466 p->second.erase(l++);
1467 }
1468
1469 assert(p->second.empty());
1470 dirty_files.erase(p++);
1471 }
1472 } else {
1473 dout(20) << __func__ << " log_seq_stable " << log_seq_stable
1474 << " already >= out seq " << seq
1475 << ", we lost a race against another log flush, done" << dendl;
1476 }
1477 _update_logger_stats();
1478
1479 return 0;
1480 }
1481
1482 int BlueFS::_flush_range(FileWriter *h, uint64_t offset, uint64_t length)
1483 {
1484 dout(10) << __func__ << " " << h << " pos 0x" << std::hex << h->pos
1485 << " 0x" << offset << "~" << length << std::dec
1486 << " to " << h->file->fnode << dendl;
1487 assert(!h->file->deleted);
1488 assert(h->file->num_readers.load() == 0);
1489
1490 h->buffer_appender.flush();
1491
1492 bool buffered;
1493 if (h->file->fnode.ino == 1)
1494 buffered = false;
1495 else
1496 buffered = cct->_conf->bluefs_buffered_io;
1497
1498 if (offset + length <= h->pos)
1499 return 0;
1500 if (offset < h->pos) {
1501 length -= h->pos - offset;
1502 offset = h->pos;
1503 dout(10) << " still need 0x"
1504 << std::hex << offset << "~" << length << std::dec
1505 << dendl;
1506 }
1507 assert(offset <= h->file->fnode.size);
1508
1509 uint64_t allocated = h->file->fnode.get_allocated();
1510
1511 // do not bother to dirty the file if we are overwriting
1512 // previously allocated extents.
1513 bool must_dirty = false;
1514 if (allocated < offset + length) {
1515 // we should never run out of log space here; see the min runway check
1516 // in _flush_and_sync_log.
1517 assert(h->file->fnode.ino != 1);
1518 int r = _allocate(h->file->fnode.prefer_bdev,
1519 offset + length - allocated,
1520 &h->file->fnode);
1521 if (r < 0) {
1522 derr << __func__ << " allocated: 0x" << std::hex << allocated
1523 << " offset: 0x" << offset << " length: 0x" << length << std::dec
1524 << dendl;
1525 assert(0 == "bluefs enospc");
1526 return r;
1527 }
1528 if (cct->_conf->bluefs_preextend_wal_files &&
1529 h->writer_type == WRITER_WAL) {
1530 // NOTE: this *requires* that rocksdb also has log recycling
1531 // enabled and is therefore doing robust CRCs on the log
1532 // records. otherwise, we will fail to reply the rocksdb log
1533 // properly due to garbage on the device.
1534 h->file->fnode.size = h->file->fnode.get_allocated();
1535 dout(10) << __func__ << " extending WAL size to 0x" << std::hex
1536 << h->file->fnode.size << std::dec << " to include allocated"
1537 << dendl;
1538 }
1539 must_dirty = true;
1540 }
1541 if (h->file->fnode.size < offset + length) {
1542 h->file->fnode.size = offset + length;
1543 if (h->file->fnode.ino > 1) {
1544 // we do not need to dirty the log file (or it's compacting
1545 // replacement) when the file size changes because replay is
1546 // smart enough to discover it on its own.
1547 must_dirty = true;
1548 }
1549 }
1550 if (must_dirty) {
1551 h->file->fnode.mtime = ceph_clock_now();
1552 assert(h->file->fnode.ino >= 1);
1553 if (h->file->dirty_seq == 0) {
1554 h->file->dirty_seq = log_seq + 1;
1555 dirty_files[h->file->dirty_seq].push_back(*h->file);
1556 dout(20) << __func__ << " dirty_seq = " << log_seq + 1
1557 << " (was clean)" << dendl;
1558 } else {
1559 if (h->file->dirty_seq != log_seq + 1) {
1560 // need re-dirty, erase from list first
1561 assert(dirty_files.count(h->file->dirty_seq));
1562 auto it = dirty_files[h->file->dirty_seq].iterator_to(*h->file);
1563 dirty_files[h->file->dirty_seq].erase(it);
1564 h->file->dirty_seq = log_seq + 1;
1565 dirty_files[h->file->dirty_seq].push_back(*h->file);
1566 dout(20) << __func__ << " dirty_seq = " << log_seq + 1
1567 << " (was " << h->file->dirty_seq << ")" << dendl;
1568 } else {
1569 dout(20) << __func__ << " dirty_seq = " << log_seq + 1
1570 << " (unchanged, do nothing) " << dendl;
1571 }
1572 }
1573 }
1574 dout(20) << __func__ << " file now " << h->file->fnode << dendl;
1575
1576 uint64_t x_off = 0;
1577 auto p = h->file->fnode.seek(offset, &x_off);
1578 assert(p != h->file->fnode.extents.end());
1579 dout(20) << __func__ << " in " << *p << " x_off 0x"
1580 << std::hex << x_off << std::dec << dendl;
1581
1582 unsigned partial = x_off & ~super.block_mask();
1583 bufferlist bl;
1584 if (partial) {
1585 dout(20) << __func__ << " using partial tail 0x"
1586 << std::hex << partial << std::dec << dendl;
1587 assert(h->tail_block.length() == partial);
1588 bl.claim_append_piecewise(h->tail_block);
1589 x_off -= partial;
1590 offset -= partial;
1591 length += partial;
1592 dout(20) << __func__ << " waiting for previous aio to complete" << dendl;
1593 for (auto p : h->iocv) {
1594 if (p) {
1595 p->aio_wait();
1596 }
1597 }
1598 }
1599 if (length == partial + h->buffer.length()) {
1600 bl.claim_append_piecewise(h->buffer);
1601 } else {
1602 bufferlist t;
1603 h->buffer.splice(0, length, &t);
1604 bl.claim_append_piecewise(t);
1605 t.substr_of(h->buffer, length, h->buffer.length() - length);
1606 h->buffer.swap(t);
1607 dout(20) << " leaving 0x" << std::hex << h->buffer.length() << std::dec
1608 << " unflushed" << dendl;
1609 }
1610 assert(bl.length() == length);
1611
1612 switch (h->writer_type) {
1613 case WRITER_WAL:
1614 logger->inc(l_bluefs_bytes_written_wal, length);
1615 break;
1616 case WRITER_SST:
1617 logger->inc(l_bluefs_bytes_written_sst, length);
1618 break;
1619 }
1620
1621 dout(30) << "dump:\n";
1622 bl.hexdump(*_dout);
1623 *_dout << dendl;
1624
1625 h->pos = offset + length;
1626 h->tail_block.clear();
1627
1628 uint64_t bloff = 0;
1629 while (length > 0) {
1630 uint64_t x_len = MIN(p->length - x_off, length);
1631 bufferlist t;
1632 t.substr_of(bl, bloff, x_len);
1633 unsigned tail = x_len & ~super.block_mask();
1634 if (tail) {
1635 size_t zlen = super.block_size - tail;
1636 dout(20) << __func__ << " caching tail of 0x"
1637 << std::hex << tail
1638 << " and padding block with 0x" << zlen
1639 << std::dec << dendl;
1640 h->tail_block.substr_of(bl, bl.length() - tail, tail);
1641 if (h->file->fnode.ino > 1) {
1642 // we are using the page_aligned_appender, and can safely use
1643 // the tail of the raw buffer.
1644 const bufferptr &last = t.back();
1645 if (last.unused_tail_length() < zlen) {
1646 derr << " wtf, last is " << last << " from " << t << dendl;
1647 assert(last.unused_tail_length() >= zlen);
1648 }
1649 bufferptr z = last;
1650 z.set_offset(last.offset() + last.length());
1651 z.set_length(zlen);
1652 z.zero();
1653 t.append(z, 0, zlen);
1654 } else {
1655 t.append_zero(zlen);
1656 }
1657 }
1658 if (cct->_conf->bluefs_sync_write) {
1659 bdev[p->bdev]->write(p->offset + x_off, t, buffered);
1660 } else {
1661 bdev[p->bdev]->aio_write(p->offset + x_off, t, h->iocv[p->bdev], buffered);
1662 }
1663 bloff += x_len;
1664 length -= x_len;
1665 ++p;
1666 x_off = 0;
1667 }
1668 for (unsigned i = 0; i < MAX_BDEV; ++i) {
1669 if (bdev[i]) {
1670 assert(h->iocv[i]);
1671 if (h->iocv[i]->has_pending_aios()) {
1672 bdev[i]->aio_submit(h->iocv[i]);
1673 }
1674 }
1675 }
1676 dout(20) << __func__ << " h " << h << " pos now 0x"
1677 << std::hex << h->pos << std::dec << dendl;
1678 return 0;
1679 }
1680
1681 // we need to retire old completed aios so they don't stick around in
1682 // memory indefinitely (along with their bufferlist refs).
1683 void BlueFS::_claim_completed_aios(FileWriter *h, list<aio_t> *ls)
1684 {
1685 for (auto p : h->iocv) {
1686 if (p) {
1687 ls->splice(ls->end(), p->running_aios);
1688 }
1689 }
1690 dout(10) << __func__ << " got " << ls->size() << " aios" << dendl;
1691 }
1692
1693 void BlueFS::wait_for_aio(FileWriter *h)
1694 {
1695 // NOTE: this is safe to call without a lock, as long as our reference is
1696 // stable.
1697 dout(10) << __func__ << " " << h << dendl;
1698 utime_t start = ceph_clock_now();
1699 for (auto p : h->iocv) {
1700 if (p) {
1701 p->aio_wait();
1702 }
1703 }
1704 utime_t end = ceph_clock_now();
1705 utime_t dur = end - start;
1706 dout(10) << __func__ << " " << h << " done in " << dur << dendl;
1707 }
1708
1709 int BlueFS::_flush(FileWriter *h, bool force)
1710 {
1711 h->buffer_appender.flush();
1712 uint64_t length = h->buffer.length();
1713 uint64_t offset = h->pos;
1714 if (!force &&
1715 length < cct->_conf->bluefs_min_flush_size) {
1716 dout(10) << __func__ << " " << h << " ignoring, length " << length
1717 << " < min_flush_size " << cct->_conf->bluefs_min_flush_size
1718 << dendl;
1719 return 0;
1720 }
1721 if (length == 0) {
1722 dout(10) << __func__ << " " << h << " no dirty data on "
1723 << h->file->fnode << dendl;
1724 return 0;
1725 }
1726 dout(10) << __func__ << " " << h << " 0x"
1727 << std::hex << offset << "~" << length << std::dec
1728 << " to " << h->file->fnode << dendl;
1729 assert(h->pos <= h->file->fnode.size);
1730 return _flush_range(h, offset, length);
1731 }
1732
1733 int BlueFS::_truncate(FileWriter *h, uint64_t offset)
1734 {
1735 dout(10) << __func__ << " 0x" << std::hex << offset << std::dec
1736 << " file " << h->file->fnode << dendl;
1737 if (h->file->deleted) {
1738 dout(10) << __func__ << " deleted, no-op" << dendl;
1739 return 0;
1740 }
1741
1742 // we never truncate internal log files
1743 assert(h->file->fnode.ino > 1);
1744
1745 h->buffer_appender.flush();
1746
1747 // truncate off unflushed data?
1748 if (h->pos < offset &&
1749 h->pos + h->buffer.length() > offset) {
1750 bufferlist t;
1751 dout(20) << __func__ << " tossing out last " << offset - h->pos
1752 << " unflushed bytes" << dendl;
1753 t.substr_of(h->buffer, 0, offset - h->pos);
1754 h->buffer.swap(t);
1755 assert(0 == "actually this shouldn't happen");
1756 }
1757 if (h->buffer.length()) {
1758 int r = _flush(h, true);
1759 if (r < 0)
1760 return r;
1761 }
1762 if (offset == h->file->fnode.size) {
1763 return 0; // no-op!
1764 }
1765 if (offset > h->file->fnode.size) {
1766 assert(0 == "truncate up not supported");
1767 }
1768 assert(h->file->fnode.size >= offset);
1769 h->file->fnode.size = offset;
1770 log_t.op_file_update(h->file->fnode);
1771 return 0;
1772 }
1773
1774 int BlueFS::_fsync(FileWriter *h, std::unique_lock<std::mutex>& l)
1775 {
1776 dout(10) << __func__ << " " << h << " " << h->file->fnode << dendl;
1777 int r = _flush(h, true);
1778 if (r < 0)
1779 return r;
1780 uint64_t old_dirty_seq = h->file->dirty_seq;
1781
1782 _flush_bdev_safely(h);
1783
1784 if (old_dirty_seq) {
1785 uint64_t s = log_seq;
1786 dout(20) << __func__ << " file metadata was dirty (" << old_dirty_seq
1787 << ") on " << h->file->fnode << ", flushing log" << dendl;
1788 _flush_and_sync_log(l, old_dirty_seq);
1789 assert(h->file->dirty_seq == 0 || // cleaned
1790 h->file->dirty_seq > s); // or redirtied by someone else
1791 }
1792 return 0;
1793 }
1794
1795 void BlueFS::_flush_bdev_safely(FileWriter *h)
1796 {
1797 if (!cct->_conf->bluefs_sync_write) {
1798 list<aio_t> completed_ios;
1799 _claim_completed_aios(h, &completed_ios);
1800 lock.unlock();
1801 wait_for_aio(h);
1802 completed_ios.clear();
1803 flush_bdev();
1804 lock.lock();
1805 } else {
1806 lock.unlock();
1807 flush_bdev();
1808 lock.lock();
1809 }
1810 }
1811
1812 void BlueFS::flush_bdev()
1813 {
1814 // NOTE: this is safe to call without a lock.
1815 dout(20) << __func__ << dendl;
1816 for (auto p : bdev) {
1817 if (p)
1818 p->flush();
1819 }
1820 }
1821
1822 int BlueFS::_allocate(uint8_t id, uint64_t len,
1823 bluefs_fnode_t* node)
1824 {
1825 dout(10) << __func__ << " len 0x" << std::hex << len << std::dec
1826 << " from " << (int)id << dendl;
1827 assert(id < alloc.size());
1828 uint64_t min_alloc_size = cct->_conf->bluefs_alloc_size;
1829
1830 uint64_t left = ROUND_UP_TO(len, min_alloc_size);
1831 int r = -ENOSPC;
1832 int64_t alloc_len = 0;
1833 AllocExtentVector extents;
1834
1835 if (alloc[id]) {
1836 r = alloc[id]->reserve(left);
1837 }
1838
1839 if (r == 0) {
1840 uint64_t hint = 0;
1841 if (!node->extents.empty() && node->extents.back().bdev == id) {
1842 hint = node->extents.back().end();
1843 }
1844 extents.reserve(4); // 4 should be (more than) enough for most allocations
1845 alloc_len = alloc[id]->allocate(left, min_alloc_size, hint, &extents);
1846 }
1847 if (r < 0 || (alloc_len < (int64_t)left)) {
1848 if (r == 0) {
1849 alloc[id]->unreserve(left - alloc_len);
1850 for (auto& p : extents) {
1851 alloc[id]->release(p.offset, p.length);
1852 }
1853 }
1854 if (id != BDEV_SLOW) {
1855 if (bdev[id]) {
1856 dout(1) << __func__ << " failed to allocate 0x" << std::hex << left
1857 << " on bdev " << (int)id
1858 << ", free 0x" << alloc[id]->get_free()
1859 << "; fallback to bdev " << (int)id + 1
1860 << std::dec << dendl;
1861 }
1862 return _allocate(id + 1, len, node);
1863 }
1864 if (bdev[id])
1865 derr << __func__ << " failed to allocate 0x" << std::hex << left
1866 << " on bdev " << (int)id
1867 << ", free 0x" << alloc[id]->get_free() << std::dec << dendl;
1868 else
1869 derr << __func__ << " failed to allocate 0x" << std::hex << left
1870 << " on bdev " << (int)id << ", dne" << std::dec << dendl;
1871 if (alloc[id])
1872 alloc[id]->dump();
1873 return -ENOSPC;
1874 }
1875
1876 for (auto& p : extents) {
1877 node->append_extent(bluefs_extent_t(id, p.offset, p.length));
1878 }
1879
1880 return 0;
1881 }
1882
1883 int BlueFS::_preallocate(FileRef f, uint64_t off, uint64_t len)
1884 {
1885 dout(10) << __func__ << " file " << f->fnode << " 0x"
1886 << std::hex << off << "~" << len << std::dec << dendl;
1887 if (f->deleted) {
1888 dout(10) << __func__ << " deleted, no-op" << dendl;
1889 return 0;
1890 }
1891 assert(f->fnode.ino > 1);
1892 uint64_t allocated = f->fnode.get_allocated();
1893 if (off + len > allocated) {
1894 uint64_t want = off + len - allocated;
1895 int r = _allocate(f->fnode.prefer_bdev, want, &f->fnode);
1896 if (r < 0)
1897 return r;
1898 log_t.op_file_update(f->fnode);
1899 }
1900 return 0;
1901 }
1902
1903 void BlueFS::sync_metadata()
1904 {
1905 std::unique_lock<std::mutex> l(lock);
1906 if (log_t.empty()) {
1907 dout(10) << __func__ << " - no pending log events" << dendl;
1908 return;
1909 }
1910 dout(10) << __func__ << dendl;
1911 utime_t start = ceph_clock_now();
1912 vector<interval_set<uint64_t>> to_release(pending_release.size());
1913 to_release.swap(pending_release);
1914 flush_bdev(); // FIXME?
1915 _flush_and_sync_log(l);
1916 for (unsigned i = 0; i < to_release.size(); ++i) {
1917 for (auto p = to_release[i].begin(); p != to_release[i].end(); ++p) {
1918 alloc[i]->release(p.get_start(), p.get_len());
1919 }
1920 }
1921
1922 if (_should_compact_log()) {
1923 if (cct->_conf->bluefs_compact_log_sync) {
1924 _compact_log_sync();
1925 } else {
1926 _compact_log_async(l);
1927 }
1928 }
1929
1930 utime_t end = ceph_clock_now();
1931 utime_t dur = end - start;
1932 dout(10) << __func__ << " done in " << dur << dendl;
1933 }
1934
1935 int BlueFS::open_for_write(
1936 const string& dirname,
1937 const string& filename,
1938 FileWriter **h,
1939 bool overwrite)
1940 {
1941 std::lock_guard<std::mutex> l(lock);
1942 dout(10) << __func__ << " " << dirname << "/" << filename << dendl;
1943 map<string,DirRef>::iterator p = dir_map.find(dirname);
1944 DirRef dir;
1945 if (p == dir_map.end()) {
1946 // implicitly create the dir
1947 dout(20) << __func__ << " dir " << dirname
1948 << " does not exist" << dendl;
1949 return -ENOENT;
1950 } else {
1951 dir = p->second;
1952 }
1953
1954 FileRef file;
1955 bool create = false;
1956 map<string,FileRef>::iterator q = dir->file_map.find(filename);
1957 if (q == dir->file_map.end()) {
1958 if (overwrite) {
1959 dout(20) << __func__ << " dir " << dirname << " (" << dir
1960 << ") file " << filename
1961 << " does not exist" << dendl;
1962 return -ENOENT;
1963 }
1964 file = new File;
1965 file->fnode.ino = ++ino_last;
1966 file_map[ino_last] = file;
1967 dir->file_map[filename] = file;
1968 ++file->refs;
1969 create = true;
1970 } else {
1971 // overwrite existing file?
1972 file = q->second;
1973 if (overwrite) {
1974 dout(20) << __func__ << " dir " << dirname << " (" << dir
1975 << ") file " << filename
1976 << " already exists, overwrite in place" << dendl;
1977 } else {
1978 dout(20) << __func__ << " dir " << dirname << " (" << dir
1979 << ") file " << filename
1980 << " already exists, truncate + overwrite" << dendl;
1981 file->fnode.size = 0;
1982 for (auto& p : file->fnode.extents) {
1983 pending_release[p.bdev].insert(p.offset, p.length);
1984 }
1985
1986 file->fnode.clear_extents();
1987 }
1988 }
1989 assert(file->fnode.ino > 1);
1990
1991 file->fnode.mtime = ceph_clock_now();
1992 file->fnode.prefer_bdev = BlueFS::BDEV_DB;
1993 if (dirname.length() > 5) {
1994 // the "db.slow" and "db.wal" directory names are hard-coded at
1995 // match up with bluestore. the slow device is always the second
1996 // one (when a dedicated block.db device is present and used at
1997 // bdev 0). the wal device is always last.
1998 if (boost::algorithm::ends_with(dirname, ".slow")) {
1999 file->fnode.prefer_bdev = BlueFS::BDEV_SLOW;
2000 } else if (boost::algorithm::ends_with(dirname, ".wal")) {
2001 file->fnode.prefer_bdev = BlueFS::BDEV_WAL;
2002 }
2003 }
2004 dout(20) << __func__ << " mapping " << dirname << "/" << filename
2005 << " to bdev " << (int)file->fnode.prefer_bdev << dendl;
2006
2007 log_t.op_file_update(file->fnode);
2008 if (create)
2009 log_t.op_dir_link(dirname, filename, file->fnode.ino);
2010
2011 *h = _create_writer(file);
2012
2013 if (boost::algorithm::ends_with(filename, ".log")) {
2014 (*h)->writer_type = BlueFS::WRITER_WAL;
2015 if (logger && !overwrite) {
2016 logger->inc(l_bluefs_files_written_wal);
2017 }
2018 } else if (boost::algorithm::ends_with(filename, ".sst")) {
2019 (*h)->writer_type = BlueFS::WRITER_SST;
2020 if (logger) {
2021 logger->inc(l_bluefs_files_written_sst);
2022 }
2023 }
2024
2025 dout(10) << __func__ << " h " << *h << " on " << file->fnode << dendl;
2026 return 0;
2027 }
2028
2029 BlueFS::FileWriter *BlueFS::_create_writer(FileRef f)
2030 {
2031 FileWriter *w = new FileWriter(f);
2032 for (unsigned i = 0; i < MAX_BDEV; ++i) {
2033 if (bdev[i]) {
2034 w->iocv[i] = new IOContext(cct, NULL);
2035 } else {
2036 w->iocv[i] = NULL;
2037 }
2038 }
2039 return w;
2040 }
2041
2042 void BlueFS::_close_writer(FileWriter *h)
2043 {
2044 dout(10) << __func__ << " " << h << " type " << h->writer_type << dendl;
2045 for (unsigned i=0; i<MAX_BDEV; ++i) {
2046 if (bdev[i]) {
2047 assert(h->iocv[i]);
2048 h->iocv[i]->aio_wait();
2049 bdev[i]->queue_reap_ioc(h->iocv[i]);
2050 }
2051 }
2052 delete h;
2053 }
2054
2055 int BlueFS::open_for_read(
2056 const string& dirname,
2057 const string& filename,
2058 FileReader **h,
2059 bool random)
2060 {
2061 std::lock_guard<std::mutex> l(lock);
2062 dout(10) << __func__ << " " << dirname << "/" << filename
2063 << (random ? " (random)":" (sequential)") << dendl;
2064 map<string,DirRef>::iterator p = dir_map.find(dirname);
2065 if (p == dir_map.end()) {
2066 dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
2067 return -ENOENT;
2068 }
2069 DirRef dir = p->second;
2070
2071 map<string,FileRef>::iterator q = dir->file_map.find(filename);
2072 if (q == dir->file_map.end()) {
2073 dout(20) << __func__ << " dir " << dirname << " (" << dir
2074 << ") file " << filename
2075 << " not found" << dendl;
2076 return -ENOENT;
2077 }
2078 File *file = q->second.get();
2079
2080 *h = new FileReader(file, random ? 4096 : cct->_conf->bluefs_max_prefetch,
2081 random, false);
2082 dout(10) << __func__ << " h " << *h << " on " << file->fnode << dendl;
2083 return 0;
2084 }
2085
2086 int BlueFS::rename(
2087 const string& old_dirname, const string& old_filename,
2088 const string& new_dirname, const string& new_filename)
2089 {
2090 std::lock_guard<std::mutex> l(lock);
2091 dout(10) << __func__ << " " << old_dirname << "/" << old_filename
2092 << " -> " << new_dirname << "/" << new_filename << dendl;
2093 map<string,DirRef>::iterator p = dir_map.find(old_dirname);
2094 if (p == dir_map.end()) {
2095 dout(20) << __func__ << " dir " << old_dirname << " not found" << dendl;
2096 return -ENOENT;
2097 }
2098 DirRef old_dir = p->second;
2099 map<string,FileRef>::iterator q = old_dir->file_map.find(old_filename);
2100 if (q == old_dir->file_map.end()) {
2101 dout(20) << __func__ << " dir " << old_dirname << " (" << old_dir
2102 << ") file " << old_filename
2103 << " not found" << dendl;
2104 return -ENOENT;
2105 }
2106 FileRef file = q->second;
2107
2108 p = dir_map.find(new_dirname);
2109 if (p == dir_map.end()) {
2110 dout(20) << __func__ << " dir " << new_dirname << " not found" << dendl;
2111 return -ENOENT;
2112 }
2113 DirRef new_dir = p->second;
2114 q = new_dir->file_map.find(new_filename);
2115 if (q != new_dir->file_map.end()) {
2116 dout(20) << __func__ << " dir " << new_dirname << " (" << old_dir
2117 << ") file " << new_filename
2118 << " already exists, unlinking" << dendl;
2119 assert(q->second != file);
2120 log_t.op_dir_unlink(new_dirname, new_filename);
2121 _drop_link(q->second);
2122 }
2123
2124 dout(10) << __func__ << " " << new_dirname << "/" << new_filename << " "
2125 << " " << file->fnode << dendl;
2126
2127 new_dir->file_map[new_filename] = file;
2128 old_dir->file_map.erase(old_filename);
2129
2130 log_t.op_dir_link(new_dirname, new_filename, file->fnode.ino);
2131 log_t.op_dir_unlink(old_dirname, old_filename);
2132 return 0;
2133 }
2134
2135 int BlueFS::mkdir(const string& dirname)
2136 {
2137 std::lock_guard<std::mutex> l(lock);
2138 dout(10) << __func__ << " " << dirname << dendl;
2139 map<string,DirRef>::iterator p = dir_map.find(dirname);
2140 if (p != dir_map.end()) {
2141 dout(20) << __func__ << " dir " << dirname << " exists" << dendl;
2142 return -EEXIST;
2143 }
2144 dir_map[dirname] = new Dir;
2145 log_t.op_dir_create(dirname);
2146 return 0;
2147 }
2148
2149 int BlueFS::rmdir(const string& dirname)
2150 {
2151 std::lock_guard<std::mutex> l(lock);
2152 dout(10) << __func__ << " " << dirname << dendl;
2153 map<string,DirRef>::iterator p = dir_map.find(dirname);
2154 if (p == dir_map.end()) {
2155 dout(20) << __func__ << " dir " << dirname << " does not exist" << dendl;
2156 return -ENOENT;
2157 }
2158 DirRef dir = p->second;
2159 if (!dir->file_map.empty()) {
2160 dout(20) << __func__ << " dir " << dirname << " not empty" << dendl;
2161 return -ENOTEMPTY;
2162 }
2163 dir_map.erase(dirname);
2164 log_t.op_dir_remove(dirname);
2165 return 0;
2166 }
2167
2168 bool BlueFS::dir_exists(const string& dirname)
2169 {
2170 std::lock_guard<std::mutex> l(lock);
2171 map<string,DirRef>::iterator p = dir_map.find(dirname);
2172 bool exists = p != dir_map.end();
2173 dout(10) << __func__ << " " << dirname << " = " << (int)exists << dendl;
2174 return exists;
2175 }
2176
2177 int BlueFS::stat(const string& dirname, const string& filename,
2178 uint64_t *size, utime_t *mtime)
2179 {
2180 std::lock_guard<std::mutex> l(lock);
2181 dout(10) << __func__ << " " << dirname << "/" << filename << dendl;
2182 map<string,DirRef>::iterator p = dir_map.find(dirname);
2183 if (p == dir_map.end()) {
2184 dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
2185 return -ENOENT;
2186 }
2187 DirRef dir = p->second;
2188 map<string,FileRef>::iterator q = dir->file_map.find(filename);
2189 if (q == dir->file_map.end()) {
2190 dout(20) << __func__ << " dir " << dirname << " (" << dir
2191 << ") file " << filename
2192 << " not found" << dendl;
2193 return -ENOENT;
2194 }
2195 File *file = q->second.get();
2196 dout(10) << __func__ << " " << dirname << "/" << filename
2197 << " " << file->fnode << dendl;
2198 if (size)
2199 *size = file->fnode.size;
2200 if (mtime)
2201 *mtime = file->fnode.mtime;
2202 return 0;
2203 }
2204
2205 int BlueFS::lock_file(const string& dirname, const string& filename,
2206 FileLock **plock)
2207 {
2208 std::lock_guard<std::mutex> l(lock);
2209 dout(10) << __func__ << " " << dirname << "/" << filename << dendl;
2210 map<string,DirRef>::iterator p = dir_map.find(dirname);
2211 if (p == dir_map.end()) {
2212 dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
2213 return -ENOENT;
2214 }
2215 DirRef dir = p->second;
2216 map<string,FileRef>::iterator q = dir->file_map.find(filename);
2217 File *file;
2218 if (q == dir->file_map.end()) {
2219 dout(20) << __func__ << " dir " << dirname << " (" << dir
2220 << ") file " << filename
2221 << " not found, creating" << dendl;
2222 file = new File;
2223 file->fnode.ino = ++ino_last;
2224 file->fnode.mtime = ceph_clock_now();
2225 file_map[ino_last] = file;
2226 dir->file_map[filename] = file;
2227 ++file->refs;
2228 log_t.op_file_update(file->fnode);
2229 log_t.op_dir_link(dirname, filename, file->fnode.ino);
2230 } else {
2231 file = q->second.get();
2232 if (file->locked) {
2233 dout(10) << __func__ << " already locked" << dendl;
2234 return -EBUSY;
2235 }
2236 }
2237 file->locked = true;
2238 *plock = new FileLock(file);
2239 dout(10) << __func__ << " locked " << file->fnode
2240 << " with " << *plock << dendl;
2241 return 0;
2242 }
2243
2244 int BlueFS::unlock_file(FileLock *fl)
2245 {
2246 std::lock_guard<std::mutex> l(lock);
2247 dout(10) << __func__ << " " << fl << " on " << fl->file->fnode << dendl;
2248 assert(fl->file->locked);
2249 fl->file->locked = false;
2250 delete fl;
2251 return 0;
2252 }
2253
2254 int BlueFS::readdir(const string& dirname, vector<string> *ls)
2255 {
2256 std::lock_guard<std::mutex> l(lock);
2257 dout(10) << __func__ << " " << dirname << dendl;
2258 if (dirname.empty()) {
2259 // list dirs
2260 ls->reserve(dir_map.size() + 2);
2261 for (auto& q : dir_map) {
2262 ls->push_back(q.first);
2263 }
2264 } else {
2265 // list files in dir
2266 map<string,DirRef>::iterator p = dir_map.find(dirname);
2267 if (p == dir_map.end()) {
2268 dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
2269 return -ENOENT;
2270 }
2271 DirRef dir = p->second;
2272 ls->reserve(dir->file_map.size() + 2);
2273 for (auto& q : dir->file_map) {
2274 ls->push_back(q.first);
2275 }
2276 }
2277 ls->push_back(".");
2278 ls->push_back("..");
2279 return 0;
2280 }
2281
2282 int BlueFS::unlink(const string& dirname, const string& filename)
2283 {
2284 std::lock_guard<std::mutex> l(lock);
2285 dout(10) << __func__ << " " << dirname << "/" << filename << dendl;
2286 map<string,DirRef>::iterator p = dir_map.find(dirname);
2287 if (p == dir_map.end()) {
2288 dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
2289 return -ENOENT;
2290 }
2291 DirRef dir = p->second;
2292 map<string,FileRef>::iterator q = dir->file_map.find(filename);
2293 if (q == dir->file_map.end()) {
2294 dout(20) << __func__ << " file " << dirname << "/" << filename
2295 << " not found" << dendl;
2296 return -ENOENT;
2297 }
2298 FileRef file = q->second;
2299 if (file->locked) {
2300 dout(20) << __func__ << " file " << dirname << "/" << filename
2301 << " is locked" << dendl;
2302 return -EBUSY;
2303 }
2304 dir->file_map.erase(filename);
2305 log_t.op_dir_unlink(dirname, filename);
2306 _drop_link(file);
2307 return 0;
2308 }
2309
2310 bool BlueFS::wal_is_rotational()
2311 {
2312 if (bdev[BDEV_WAL]) {
2313 return bdev[BDEV_WAL]->is_rotational();
2314 } else if (bdev[BDEV_DB]) {
2315 return bdev[BDEV_DB]->is_rotational();
2316 }
2317 return bdev[BDEV_SLOW]->is_rotational();
2318 }