]> git.proxmox.com Git - ceph.git/blob - ceph/src/crimson/os/seastore/async_cleaner.cc
update ceph source to reef 18.2.1
[ceph.git] / ceph / src / crimson / os / seastore / async_cleaner.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3
4 #include <fmt/chrono.h>
5 #include <seastar/core/metrics.hh>
6
7 #include "crimson/os/seastore/logging.h"
8
9 #include "crimson/os/seastore/async_cleaner.h"
10 #include "crimson/os/seastore/backref_manager.h"
11 #include "crimson/os/seastore/transaction_manager.h"
12
13 SET_SUBSYS(seastore_cleaner);
14
namespace {

// Scoring policies that can be used to rank segments for garbage
// collection.  SegmentCleaner::calc_gc_benefit_cost() selects its formula
// from this value with `if constexpr`.
enum class gc_formula_t {
  GREEDY,
  BENEFIT,
  COST_BENEFIT,
};

// The policy compiled into this translation unit.
constexpr gc_formula_t gc_formula{gc_formula_t::COST_BENEFIT};

}
25
26 namespace crimson::os::seastore {
27
28 void segment_info_t::set_open(
29 segment_seq_t _seq, segment_type_t _type,
30 data_category_t _category, rewrite_gen_t _generation)
31 {
32 ceph_assert(_seq != NULL_SEG_SEQ);
33 ceph_assert(_type != segment_type_t::NULL_SEG);
34 ceph_assert(_category != data_category_t::NUM);
35 ceph_assert(is_rewrite_generation(_generation));
36 state = Segment::segment_state_t::OPEN;
37 seq = _seq;
38 type = _type;
39 category = _category;
40 generation = _generation;
41 written_to = 0;
42 }
43
44 void segment_info_t::set_empty()
45 {
46 state = Segment::segment_state_t::EMPTY;
47 seq = NULL_SEG_SEQ;
48 type = segment_type_t::NULL_SEG;
49 category = data_category_t::NUM;
50 generation = NULL_GENERATION;
51 modify_time = NULL_TIME;
52 num_extents = 0;
53 written_to = 0;
54 }
55
56 void segment_info_t::set_closed()
57 {
58 state = Segment::segment_state_t::CLOSED;
59 // the rest of information is unchanged
60 }
61
62 void segment_info_t::init_closed(
63 segment_seq_t _seq, segment_type_t _type,
64 data_category_t _category, rewrite_gen_t _generation,
65 segment_off_t seg_size)
66 {
67 ceph_assert(_seq != NULL_SEG_SEQ);
68 ceph_assert(_type != segment_type_t::NULL_SEG);
69 ceph_assert(_category != data_category_t::NUM);
70 ceph_assert(is_rewrite_generation(_generation));
71 state = Segment::segment_state_t::CLOSED;
72 seq = _seq;
73 type = _type;
74 category = _category;
75 generation = _generation;
76 written_to = seg_size;
77 }
78
79 std::ostream& operator<<(std::ostream &out, const segment_info_t &info)
80 {
81 out << "seg_info_t("
82 << "state=" << info.state
83 << ", " << info.id;
84 if (info.is_empty()) {
85 // pass
86 } else { // open or closed
87 out << " " << info.type
88 << " " << segment_seq_printer_t{info.seq}
89 << " " << info.category
90 << " " << rewrite_gen_printer_t{info.generation}
91 << ", modify_time=" << sea_time_point_printer_t{info.modify_time}
92 << ", num_extents=" << info.num_extents
93 << ", written_to=" << info.written_to;
94 }
95 return out << ")";
96 }
97
98 void segments_info_t::reset()
99 {
100 segments.clear();
101
102 segment_size = 0;
103
104 journal_segment_id = NULL_SEG_ID;
105 num_in_journal_open = 0;
106 num_type_journal = 0;
107 num_type_ool = 0;
108
109 num_open = 0;
110 num_empty = 0;
111 num_closed = 0;
112
113 count_open_journal = 0;
114 count_open_ool = 0;
115 count_release_journal = 0;
116 count_release_ool = 0;
117 count_close_journal = 0;
118 count_close_ool = 0;
119
120 total_bytes = 0;
121 avail_bytes_in_open = 0;
122
123 modify_times.clear();
124 }
125
126 void segments_info_t::add_segment_manager(
127 SegmentManager &segment_manager)
128 {
129 LOG_PREFIX(segments_info_t::add_segment_manager);
130 device_id_t d_id = segment_manager.get_device_id();
131 auto ssize = segment_manager.get_segment_size();
132 auto nsegments = segment_manager.get_num_segments();
133 auto sm_size = segment_manager.get_available_size();
134 INFO("adding segment manager {}, size={}, ssize={}, segments={}",
135 device_id_printer_t{d_id}, sm_size, ssize, nsegments);
136 ceph_assert(ssize > 0);
137 ceph_assert(nsegments > 0);
138 ceph_assert(sm_size > 0);
139
140 // also validate if the device is duplicated
141 segments.add_device(d_id, nsegments, segment_info_t{});
142
143 // assume all the segment managers share the same settings as follows.
144 if (segment_size == 0) {
145 ceph_assert(ssize > 0);
146 segment_size = ssize;
147 } else {
148 ceph_assert(segment_size == ssize);
149 }
150
151 // NOTE: by default the segments are empty
152 num_empty += nsegments;
153
154 total_bytes += sm_size;
155 }
156
157 void segments_info_t::init_closed(
158 segment_id_t segment, segment_seq_t seq, segment_type_t type,
159 data_category_t category, rewrite_gen_t generation)
160 {
161 LOG_PREFIX(segments_info_t::init_closed);
162 auto& segment_info = segments[segment];
163 DEBUG("initiating {} {} {} {} {}, {}, "
164 "num_segments(empty={}, opened={}, closed={})",
165 segment, type, segment_seq_printer_t{seq},
166 category, rewrite_gen_printer_t{generation},
167 segment_info, num_empty, num_open, num_closed);
168 ceph_assert(segment_info.is_empty());
169 ceph_assert(num_empty > 0);
170 --num_empty;
171 ++num_closed;
172 if (type == segment_type_t::JOURNAL) {
173 // init_closed won't initialize journal_segment_id
174 ceph_assert(get_submitted_journal_head() == JOURNAL_SEQ_NULL);
175 ++num_type_journal;
176 } else {
177 ++num_type_ool;
178 }
179 // do not increment count_close_*;
180
181 if (segment_info.modify_time != NULL_TIME) {
182 modify_times.insert(segment_info.modify_time);
183 } else {
184 ceph_assert(segment_info.num_extents == 0);
185 }
186
187 segment_info.init_closed(
188 seq, type, category, generation, get_segment_size());
189 }
190
191 void segments_info_t::mark_open(
192 segment_id_t segment, segment_seq_t seq, segment_type_t type,
193 data_category_t category, rewrite_gen_t generation)
194 {
195 LOG_PREFIX(segments_info_t::mark_open);
196 auto& segment_info = segments[segment];
197 INFO("opening {} {} {} {} {}, {}, "
198 "num_segments(empty={}, opened={}, closed={})",
199 segment, type, segment_seq_printer_t{seq},
200 category, rewrite_gen_printer_t{generation},
201 segment_info, num_empty, num_open, num_closed);
202 ceph_assert(segment_info.is_empty());
203 ceph_assert(num_empty > 0);
204 --num_empty;
205 ++num_open;
206 if (type == segment_type_t::JOURNAL) {
207 if (journal_segment_id != NULL_SEG_ID) {
208 auto& last_journal_segment = segments[journal_segment_id];
209 ceph_assert(last_journal_segment.is_closed());
210 ceph_assert(last_journal_segment.type == segment_type_t::JOURNAL);
211 ceph_assert(last_journal_segment.seq + 1 == seq);
212 }
213 journal_segment_id = segment;
214
215 ++num_in_journal_open;
216 ++num_type_journal;
217 ++count_open_journal;
218 } else {
219 ++num_type_ool;
220 ++count_open_ool;
221 }
222 avail_bytes_in_open += get_segment_size();
223
224 segment_info.set_open(seq, type, category, generation);
225 }
226
227 void segments_info_t::mark_empty(
228 segment_id_t segment)
229 {
230 LOG_PREFIX(segments_info_t::mark_empty);
231 auto& segment_info = segments[segment];
232 INFO("releasing {}, {}, num_segments(empty={}, opened={}, closed={})",
233 segment, segment_info,
234 num_empty, num_open, num_closed);
235 ceph_assert(segment_info.is_closed());
236 auto type = segment_info.type;
237 assert(type != segment_type_t::NULL_SEG);
238 ceph_assert(num_closed > 0);
239 --num_closed;
240 ++num_empty;
241 if (type == segment_type_t::JOURNAL) {
242 ceph_assert(num_type_journal > 0);
243 --num_type_journal;
244 ++count_release_journal;
245 } else {
246 ceph_assert(num_type_ool > 0);
247 --num_type_ool;
248 ++count_release_ool;
249 }
250
251 if (segment_info.modify_time != NULL_TIME) {
252 auto to_erase = modify_times.find(segment_info.modify_time);
253 ceph_assert(to_erase != modify_times.end());
254 modify_times.erase(to_erase);
255 } else {
256 ceph_assert(segment_info.num_extents == 0);
257 }
258
259 segment_info.set_empty();
260 }
261
262 void segments_info_t::mark_closed(
263 segment_id_t segment)
264 {
265 LOG_PREFIX(segments_info_t::mark_closed);
266 auto& segment_info = segments[segment];
267 INFO("closing {}, {}, num_segments(empty={}, opened={}, closed={})",
268 segment, segment_info,
269 num_empty, num_open, num_closed);
270 ceph_assert(segment_info.is_open());
271 ceph_assert(num_open > 0);
272 --num_open;
273 ++num_closed;
274 if (segment_info.type == segment_type_t::JOURNAL) {
275 ceph_assert(num_in_journal_open > 0);
276 --num_in_journal_open;
277 ++count_close_journal;
278 } else {
279 ++count_close_ool;
280 }
281 ceph_assert(get_segment_size() >= segment_info.written_to);
282 auto seg_avail_bytes = get_segment_size() - segment_info.written_to;
283 ceph_assert(avail_bytes_in_open >= (std::size_t)seg_avail_bytes);
284 avail_bytes_in_open -= seg_avail_bytes;
285
286 if (segment_info.modify_time != NULL_TIME) {
287 modify_times.insert(segment_info.modify_time);
288 } else {
289 ceph_assert(segment_info.num_extents == 0);
290 }
291
292 segment_info.set_closed();
293 }
294
295 void segments_info_t::update_written_to(
296 segment_type_t type,
297 paddr_t offset)
298 {
299 LOG_PREFIX(segments_info_t::update_written_to);
300 auto& saddr = offset.as_seg_paddr();
301 auto& segment_info = segments[saddr.get_segment_id()];
302 if (!segment_info.is_open()) {
303 ERROR("segment is not open, not updating, type={}, offset={}, {}",
304 type, offset, segment_info);
305 ceph_abort();
306 }
307
308 auto new_written_to = saddr.get_segment_off();
309 ceph_assert(new_written_to <= get_segment_size());
310 if (segment_info.written_to > new_written_to) {
311 ERROR("written_to should not decrease! type={}, offset={}, {}",
312 type, offset, segment_info);
313 ceph_abort();
314 }
315
316 DEBUG("type={}, offset={}, {}", type, offset, segment_info);
317 ceph_assert(type == segment_info.type);
318 auto avail_deduction = new_written_to - segment_info.written_to;
319 ceph_assert(avail_bytes_in_open >= (std::size_t)avail_deduction);
320 avail_bytes_in_open -= avail_deduction;
321 segment_info.written_to = new_written_to;
322 }
323
324 std::ostream &operator<<(std::ostream &os, const segments_info_t &infos)
325 {
326 return os << "segments("
327 << "empty=" << infos.get_num_empty()
328 << ", open=" << infos.get_num_open()
329 << ", closed=" << infos.get_num_closed()
330 << ", type_journal=" << infos.get_num_type_journal()
331 << ", type_ool=" << infos.get_num_type_ool()
332 << ", total=" << infos.get_total_bytes() << "B"
333 << ", available=" << infos.get_available_bytes() << "B"
334 << ", unavailable=" << infos.get_unavailable_bytes() << "B"
335 << ", available_ratio=" << infos.get_available_ratio()
336 << ", submitted_head=" << infos.get_submitted_journal_head()
337 << ", time_bound=" << sea_time_point_printer_t{infos.get_time_bound()}
338 << ")";
339 }
340
// Sanity-check the trimmer configuration.  Every violated constraint aborts
// through ceph_assert; nothing is returned.
341 void JournalTrimmerImpl::config_t::validate() const
342 {
// the limit is later cast to device_off_t (see get_tail_limit()), so it has
// to fit into that type
343 ceph_assert(max_journal_bytes <= DEVICE_OFF_MAX);
// both trim targets must lie strictly below the hard limit
344 ceph_assert(max_journal_bytes > target_journal_dirty_bytes);
345 ceph_assert(max_journal_bytes > target_journal_alloc_bytes);
// a zero per-cycle budget is rejected for both kinds of trimming
346 ceph_assert(rewrite_dirty_bytes_per_cycle > 0);
347 ceph_assert(rewrite_backref_bytes_per_cycle > 0);
348 }
349
350 JournalTrimmerImpl::config_t
351 JournalTrimmerImpl::config_t::get_default(
352 std::size_t roll_size, journal_type_t type)
353 {
354 assert(roll_size);
355 std::size_t target_dirty_bytes = 0;
356 std::size_t target_alloc_bytes = 0;
357 std::size_t max_journal_bytes = 0;
358 if (type == journal_type_t::SEGMENTED) {
359 target_dirty_bytes = 12 * roll_size;
360 target_alloc_bytes = 2 * roll_size;
361 max_journal_bytes = 16 * roll_size;
362 } else {
363 assert(type == journal_type_t::RANDOM_BLOCK);
364 target_dirty_bytes = roll_size / 4;
365 target_alloc_bytes = roll_size / 4;
366 max_journal_bytes = roll_size / 2;
367 }
368 return config_t{
369 target_dirty_bytes,
370 target_alloc_bytes,
371 max_journal_bytes,
372 1<<17,// rewrite_dirty_bytes_per_cycle
373 1<<24 // rewrite_backref_bytes_per_cycle
374 };
375 }
376
377 JournalTrimmerImpl::config_t
378 JournalTrimmerImpl::config_t::get_test(
379 std::size_t roll_size, journal_type_t type)
380 {
381 assert(roll_size);
382 std::size_t target_dirty_bytes = 0;
383 std::size_t target_alloc_bytes = 0;
384 std::size_t max_journal_bytes = 0;
385 if (type == journal_type_t::SEGMENTED) {
386 target_dirty_bytes = 2 * roll_size;
387 target_alloc_bytes = 2 * roll_size;
388 max_journal_bytes = 4 * roll_size;
389 } else {
390 assert(type == journal_type_t::RANDOM_BLOCK);
391 target_dirty_bytes = roll_size / 4;
392 target_alloc_bytes = roll_size / 4;
393 max_journal_bytes = roll_size / 2;
394 }
395 return config_t{
396 target_dirty_bytes,
397 target_alloc_bytes,
398 max_journal_bytes,
399 1<<17,// rewrite_dirty_bytes_per_cycle
400 1<<24 // rewrite_backref_bytes_per_cycle
401 };
402 }
403
// Construct a journal trimmer.
//
// Stores the backref manager reference, the configuration, the journal type
// and the roll geometry, starts with zero reserved usage, validates the
// arguments and finally registers the trimmer's metrics.
404 JournalTrimmerImpl::JournalTrimmerImpl(
405 BackrefManager &backref_manager,
406 config_t config,
407 journal_type_t type,
408 device_off_t roll_start,
409 device_off_t roll_size)
410 : backref_manager(backref_manager),
411 config(config),
412 journal_type(type),
413 roll_start(roll_start),
414 roll_size(roll_size),
415 reserved_usage(0)
416 {
// inside the body the parameter names shadow the members of the same name,
// so the checks below run against the arguments
417 config.validate();
418 ceph_assert(roll_start >= 0);
419 ceph_assert(roll_size > 0);
420 register_metrics();
421 }
422
423 void JournalTrimmerImpl::set_journal_head(journal_seq_t head)
424 {
425 LOG_PREFIX(JournalTrimmerImpl::set_journal_head);
426
427 ceph_assert(head != JOURNAL_SEQ_NULL);
428 ceph_assert(journal_head == JOURNAL_SEQ_NULL ||
429 head >= journal_head);
430 ceph_assert(journal_alloc_tail == JOURNAL_SEQ_NULL ||
431 head >= journal_alloc_tail);
432 ceph_assert(journal_dirty_tail == JOURNAL_SEQ_NULL ||
433 head >= journal_dirty_tail);
434
435 std::swap(journal_head, head);
436 if (journal_head.segment_seq == head.segment_seq) {
437 DEBUG("journal_head {} => {}, {}",
438 head, journal_head, stat_printer_t{*this, false});
439 } else {
440 INFO("journal_head {} => {}, {}",
441 head, journal_head, stat_printer_t{*this, false});
442 }
443 background_callback->maybe_wake_background();
444 }
445
446 void JournalTrimmerImpl::update_journal_tails(
447 journal_seq_t dirty_tail,
448 journal_seq_t alloc_tail)
449 {
450 LOG_PREFIX(JournalTrimmerImpl::update_journal_tails);
451
452 if (dirty_tail != JOURNAL_SEQ_NULL) {
453 ceph_assert(journal_head == JOURNAL_SEQ_NULL ||
454 journal_head >= dirty_tail);
455 if (journal_dirty_tail != JOURNAL_SEQ_NULL &&
456 journal_dirty_tail > dirty_tail) {
457 ERROR("journal_dirty_tail {} => {} is backwards!",
458 journal_dirty_tail, dirty_tail);
459 ceph_abort();
460 }
461 std::swap(journal_dirty_tail, dirty_tail);
462 if (journal_dirty_tail.segment_seq == dirty_tail.segment_seq) {
463 DEBUG("journal_dirty_tail {} => {}, {}",
464 dirty_tail, journal_dirty_tail, stat_printer_t{*this, false});
465 } else {
466 INFO("journal_dirty_tail {} => {}, {}",
467 dirty_tail, journal_dirty_tail, stat_printer_t{*this, false});
468 }
469 }
470
471 if (alloc_tail != JOURNAL_SEQ_NULL) {
472 ceph_assert(journal_head == JOURNAL_SEQ_NULL ||
473 journal_head >= alloc_tail);
474 if (journal_alloc_tail != JOURNAL_SEQ_NULL &&
475 journal_alloc_tail > alloc_tail) {
476 ERROR("journal_alloc_tail {} => {} is backwards!",
477 journal_alloc_tail, alloc_tail);
478 ceph_abort();
479 }
480 std::swap(journal_alloc_tail, alloc_tail);
481 if (journal_alloc_tail.segment_seq == alloc_tail.segment_seq) {
482 DEBUG("journal_alloc_tail {} => {}, {}",
483 alloc_tail, journal_alloc_tail, stat_printer_t{*this, false});
484 } else {
485 INFO("journal_alloc_tail {} => {}, {}",
486 alloc_tail, journal_alloc_tail, stat_printer_t{*this, false});
487 }
488 }
489
490 background_callback->maybe_wake_background();
491 background_callback->maybe_wake_blocked_io();
492 }
493
494 journal_seq_t JournalTrimmerImpl::get_tail_limit() const
495 {
496 assert(background_callback->is_ready());
497 auto ret = journal_head.add_offset(
498 journal_type,
499 -static_cast<device_off_t>(config.max_journal_bytes),
500 roll_start,
501 roll_size);
502 return ret;
503 }
504
505 journal_seq_t JournalTrimmerImpl::get_dirty_tail_target() const
506 {
507 assert(background_callback->is_ready());
508 auto ret = journal_head.add_offset(
509 journal_type,
510 -static_cast<device_off_t>(config.target_journal_dirty_bytes),
511 roll_start,
512 roll_size);
513 return ret;
514 }
515
516 journal_seq_t JournalTrimmerImpl::get_alloc_tail_target() const
517 {
518 assert(background_callback->is_ready());
519 auto ret = journal_head.add_offset(
520 journal_type,
521 -static_cast<device_off_t>(config.target_journal_alloc_bytes),
522 roll_start,
523 roll_size);
524 return ret;
525 }
526
527 std::size_t JournalTrimmerImpl::get_dirty_journal_size() const
528 {
529 if (!background_callback->is_ready()) {
530 return 0;
531 }
532 auto ret = journal_head.relative_to(
533 journal_type,
534 journal_dirty_tail,
535 roll_start,
536 roll_size);
537 ceph_assert(ret >= 0);
538 return static_cast<std::size_t>(ret);
539 }
540
541 std::size_t JournalTrimmerImpl::get_alloc_journal_size() const
542 {
543 if (!background_callback->is_ready()) {
544 return 0;
545 }
546 auto ret = journal_head.relative_to(
547 journal_type,
548 journal_alloc_tail,
549 roll_start,
550 roll_size);
551 ceph_assert(ret >= 0);
552 return static_cast<std::size_t>(ret);
553 }
554
555 seastar::future<> JournalTrimmerImpl::trim() {
556 return seastar::when_all(
557 [this] {
558 if (should_trim_alloc()) {
559 return trim_alloc(
560 ).handle_error(
561 crimson::ct_error::assert_all{
562 "encountered invalid error in trim_alloc"
563 }
564 );
565 } else {
566 return seastar::now();
567 }
568 },
569 [this] {
570 if (should_trim_dirty()) {
571 return trim_dirty(
572 ).handle_error(
573 crimson::ct_error::assert_all{
574 "encountered invalid error in trim_dirty"
575 }
576 );
577 } else {
578 return seastar::now();
579 }
580 }
581 ).discard_result();
582 }
583
584 JournalTrimmerImpl::trim_ertr::future<>
585 JournalTrimmerImpl::trim_alloc()
586 {
587 LOG_PREFIX(JournalTrimmerImpl::trim_alloc);
588 assert(background_callback->is_ready());
589 return repeat_eagain([this, FNAME] {
590 return extent_callback->with_transaction_intr(
591 Transaction::src_t::TRIM_ALLOC,
592 "trim_alloc",
593 [this, FNAME](auto &t)
594 {
595 auto target = get_alloc_tail_target();
596 DEBUGT("start, alloc_tail={}, target={}",
597 t, journal_alloc_tail, target);
598 return backref_manager.merge_cached_backrefs(
599 t,
600 target,
601 config.rewrite_backref_bytes_per_cycle
602 ).si_then([this, FNAME, &t](auto trim_alloc_to)
603 -> ExtentCallbackInterface::submit_transaction_direct_iertr::future<>
604 {
605 DEBUGT("trim_alloc_to={}", t, trim_alloc_to);
606 if (trim_alloc_to != JOURNAL_SEQ_NULL) {
607 return extent_callback->submit_transaction_direct(
608 t, std::make_optional<journal_seq_t>(trim_alloc_to));
609 }
610 return seastar::now();
611 });
612 });
613 }).safe_then([this, FNAME] {
614 DEBUG("finish, alloc_tail={}", journal_alloc_tail);
615 });
616 }
617
618 JournalTrimmerImpl::trim_ertr::future<>
619 JournalTrimmerImpl::trim_dirty()
620 {
621 LOG_PREFIX(JournalTrimmerImpl::trim_dirty);
622 assert(background_callback->is_ready());
623 return repeat_eagain([this, FNAME] {
624 return extent_callback->with_transaction_intr(
625 Transaction::src_t::TRIM_DIRTY,
626 "trim_dirty",
627 [this, FNAME](auto &t)
628 {
629 auto target = get_dirty_tail_target();
630 DEBUGT("start, dirty_tail={}, target={}",
631 t, journal_dirty_tail, target);
632 return extent_callback->get_next_dirty_extents(
633 t,
634 target,
635 config.rewrite_dirty_bytes_per_cycle
636 ).si_then([this, FNAME, &t](auto dirty_list) {
637 DEBUGT("rewrite {} dirty extents", t, dirty_list.size());
638 return seastar::do_with(
639 std::move(dirty_list),
640 [this, &t](auto &dirty_list)
641 {
642 return trans_intr::do_for_each(
643 dirty_list,
644 [this, &t](auto &e) {
645 return extent_callback->rewrite_extent(
646 t, e, INIT_GENERATION, NULL_TIME);
647 });
648 });
649 }).si_then([this, &t] {
650 return extent_callback->submit_transaction_direct(t);
651 });
652 });
653 }).safe_then([this, FNAME] {
654 DEBUG("finish, dirty_tail={}", journal_dirty_tail);
655 });
656 }
657
658 void JournalTrimmerImpl::register_metrics()
659 {
660 namespace sm = seastar::metrics;
661 metrics.add_group("journal_trimmer", {
662 sm::make_counter("dirty_journal_bytes",
663 [this] { return get_dirty_journal_size(); },
664 sm::description("the size of the journal for dirty extents")),
665 sm::make_counter("alloc_journal_bytes",
666 [this] { return get_alloc_journal_size(); },
667 sm::description("the size of the journal for alloc info"))
668 });
669 }
670
671 std::ostream &operator<<(
672 std::ostream &os, const JournalTrimmerImpl::stat_printer_t &stats)
673 {
674 os << "JournalTrimmer(";
675 if (stats.trimmer.background_callback->is_ready()) {
676 os << "should_block_io_on_trim=" << stats.trimmer.should_block_io_on_trim()
677 << ", should_(trim_dirty=" << stats.trimmer.should_trim_dirty()
678 << ", trim_alloc=" << stats.trimmer.should_trim_alloc() << ")";
679 } else {
680 os << "not-ready";
681 }
682 if (stats.detailed) {
683 os << ", journal_head=" << stats.trimmer.get_journal_head()
684 << ", alloc_tail=" << stats.trimmer.get_alloc_tail()
685 << ", dirty_tail=" << stats.trimmer.get_dirty_tail();
686 if (stats.trimmer.background_callback->is_ready()) {
687 os << ", alloc_tail_target=" << stats.trimmer.get_alloc_tail_target()
688 << ", dirty_tail_target=" << stats.trimmer.get_dirty_tail_target()
689 << ", tail_limit=" << stats.trimmer.get_tail_limit();
690 }
691 }
692 os << ")";
693 return os;
694 }
695
696 bool SpaceTrackerSimple::equals(const SpaceTrackerI &_other) const
697 {
698 LOG_PREFIX(SpaceTrackerSimple::equals);
699 const auto &other = static_cast<const SpaceTrackerSimple&>(_other);
700
701 if (other.live_bytes_by_segment.size() != live_bytes_by_segment.size()) {
702 ERROR("different segment counts, bug in test");
703 assert(0 == "segment counts should match");
704 return false;
705 }
706
707 bool all_match = true;
708 for (auto i = live_bytes_by_segment.begin(), j = other.live_bytes_by_segment.begin();
709 i != live_bytes_by_segment.end(); ++i, ++j) {
710 if (i->second.live_bytes != j->second.live_bytes) {
711 all_match = false;
712 DEBUG("segment_id {} live bytes mismatch *this: {}, other: {}",
713 i->first, i->second.live_bytes, j->second.live_bytes);
714 }
715 }
716 return all_match;
717 }
718
719 int64_t SpaceTrackerDetailed::SegmentMap::allocate(
720 device_segment_id_t segment,
721 segment_off_t offset,
722 extent_len_t len,
723 const extent_len_t block_size)
724 {
725 LOG_PREFIX(SegmentMap::allocate);
726 assert(offset % block_size == 0);
727 assert(len % block_size == 0);
728
729 const auto b = (offset / block_size);
730 const auto e = (offset + len) / block_size;
731
732 bool error = false;
733 for (auto i = b; i < e; ++i) {
734 if (bitmap[i]) {
735 if (!error) {
736 ERROR("found allocated in {}, {} ~ {}", segment, offset, len);
737 error = true;
738 }
739 DEBUG("block {} allocated", i * block_size);
740 }
741 bitmap[i] = true;
742 }
743 return update_usage(len);
744 }
745
746 int64_t SpaceTrackerDetailed::SegmentMap::release(
747 device_segment_id_t segment,
748 segment_off_t offset,
749 extent_len_t len,
750 const extent_len_t block_size)
751 {
752 LOG_PREFIX(SegmentMap::release);
753 assert(offset % block_size == 0);
754 assert(len % block_size == 0);
755
756 const auto b = (offset / block_size);
757 const auto e = (offset + len) / block_size;
758
759 bool error = false;
760 for (auto i = b; i < e; ++i) {
761 if (!bitmap[i]) {
762 if (!error) {
763 ERROR("found unallocated in {}, {} ~ {}", segment, offset, len);
764 error = true;
765 }
766 DEBUG("block {} unallocated", i * block_size);
767 }
768 bitmap[i] = false;
769 }
770 return update_usage(-(int64_t)len);
771 }
772
773 bool SpaceTrackerDetailed::equals(const SpaceTrackerI &_other) const
774 {
775 LOG_PREFIX(SpaceTrackerDetailed::equals);
776 const auto &other = static_cast<const SpaceTrackerDetailed&>(_other);
777
778 if (other.segment_usage.size() != segment_usage.size()) {
779 ERROR("different segment counts, bug in test");
780 assert(0 == "segment counts should match");
781 return false;
782 }
783
784 bool all_match = true;
785 for (auto i = segment_usage.begin(), j = other.segment_usage.begin();
786 i != segment_usage.end(); ++i, ++j) {
787 if (i->second.get_usage() != j->second.get_usage()) {
788 all_match = false;
789 ERROR("segment_id {} live bytes mismatch *this: {}, other: {}",
790 i->first, i->second.get_usage(), j->second.get_usage());
791 }
792 }
793 return all_match;
794 }
795
796 void SpaceTrackerDetailed::SegmentMap::dump_usage(extent_len_t block_size) const
797 {
798 LOG_PREFIX(SegmentMap::dump_usage);
799 INFO("dump start");
800 for (unsigned i = 0; i < bitmap.size(); ++i) {
801 if (bitmap[i]) {
802 LOCAL_LOGGER.info(" {} still live", i * block_size);
803 }
804 }
805 }
806
807 void SpaceTrackerDetailed::dump_usage(segment_id_t id) const
808 {
809 LOG_PREFIX(SpaceTrackerDetailed::dump_usage);
810 INFO("{}", id);
811 segment_usage[id].dump_usage(
812 block_size_by_segment_manager[id.device_id()]);
813 }
814
815 void SpaceTrackerSimple::dump_usage(segment_id_t id) const
816 {
817 LOG_PREFIX(SpaceTrackerSimple::dump_usage);
818 INFO("id: {}, live_bytes: {}",
819 id, live_bytes_by_segment[id].live_bytes);
820 }
821
822 std::ostream &operator<<(
823 std::ostream &os, const AsyncCleaner::stat_printer_t &stats)
824 {
825 stats.cleaner.print(os, stats.detailed);
826 return os;
827 }
828
// Construct a segment cleaner.
//
// Takes ownership of the segment manager group (moved in), keeps references
// to the backref manager and the segment sequence allocator, and stores the
// `detailed` / `is_cold` flags together with the configuration, which is
// validated before the constructor returns.
829 SegmentCleaner::SegmentCleaner(
830 config_t config,
831 SegmentManagerGroupRef&& sm_group,
832 BackrefManager &backref_manager,
833 SegmentSeqAllocator &segment_seq_allocator,
834 bool detailed,
835 bool is_cold)
836 : detailed(detailed),
837 is_cold(is_cold),
838 config(config),
839 sm_group(std::move(sm_group)),
840 backref_manager(backref_manager),
841 ool_segment_seq_allocator(segment_seq_allocator)
842 {
// `config` names the constructor parameter here (it shadows the member)
843 config.validate();
844 }
845
846 void SegmentCleaner::register_metrics()
847 {
848 namespace sm = seastar::metrics;
849 stats.segment_util.buckets.resize(UTIL_BUCKETS);
850 std::size_t i;
851 for (i = 0; i < UTIL_BUCKETS; ++i) {
852 stats.segment_util.buckets[i].upper_bound = ((double)(i + 1)) / 10;
853 stats.segment_util.buckets[i].count = 0;
854 }
855 // NOTE: by default the segments are empty
856 i = get_bucket_index(UTIL_STATE_EMPTY);
857 stats.segment_util.buckets[i].count = segments.get_num_segments();
858
859 std::string prefix;
860 if (is_cold) {
861 prefix.append("cold_");
862 }
863 prefix.append("segment_cleaner");
864
865 metrics.add_group(prefix, {
866 sm::make_counter("segments_number",
867 [this] { return segments.get_num_segments(); },
868 sm::description("the number of segments")),
869 sm::make_counter("segment_size",
870 [this] { return segments.get_segment_size(); },
871 sm::description("the bytes of a segment")),
872 sm::make_counter("segments_in_journal",
873 [this] { return get_segments_in_journal(); },
874 sm::description("the number of segments in journal")),
875 sm::make_counter("segments_type_journal",
876 [this] { return segments.get_num_type_journal(); },
877 sm::description("the number of segments typed journal")),
878 sm::make_counter("segments_type_ool",
879 [this] { return segments.get_num_type_ool(); },
880 sm::description("the number of segments typed out-of-line")),
881 sm::make_counter("segments_open",
882 [this] { return segments.get_num_open(); },
883 sm::description("the number of open segments")),
884 sm::make_counter("segments_empty",
885 [this] { return segments.get_num_empty(); },
886 sm::description("the number of empty segments")),
887 sm::make_counter("segments_closed",
888 [this] { return segments.get_num_closed(); },
889 sm::description("the number of closed segments")),
890
891 sm::make_counter("segments_count_open_journal",
892 [this] { return segments.get_count_open_journal(); },
893 sm::description("the count of open journal segment operations")),
894 sm::make_counter("segments_count_open_ool",
895 [this] { return segments.get_count_open_ool(); },
896 sm::description("the count of open ool segment operations")),
897 sm::make_counter("segments_count_release_journal",
898 [this] { return segments.get_count_release_journal(); },
899 sm::description("the count of release journal segment operations")),
900 sm::make_counter("segments_count_release_ool",
901 [this] { return segments.get_count_release_ool(); },
902 sm::description("the count of release ool segment operations")),
903 sm::make_counter("segments_count_close_journal",
904 [this] { return segments.get_count_close_journal(); },
905 sm::description("the count of close journal segment operations")),
906 sm::make_counter("segments_count_close_ool",
907 [this] { return segments.get_count_close_ool(); },
908 sm::description("the count of close ool segment operations")),
909
910 sm::make_counter("total_bytes",
911 [this] { return segments.get_total_bytes(); },
912 sm::description("the size of the space")),
913 sm::make_counter("available_bytes",
914 [this] { return segments.get_available_bytes(); },
915 sm::description("the size of the space is available")),
916 sm::make_counter("unavailable_unreclaimable_bytes",
917 [this] { return get_unavailable_unreclaimable_bytes(); },
918 sm::description("the size of the space is unavailable and unreclaimable")),
919 sm::make_counter("unavailable_reclaimable_bytes",
920 [this] { return get_unavailable_reclaimable_bytes(); },
921 sm::description("the size of the space is unavailable and reclaimable")),
922 sm::make_counter("used_bytes", stats.used_bytes,
923 sm::description("the size of the space occupied by live extents")),
924 sm::make_counter("unavailable_unused_bytes",
925 [this] { return get_unavailable_unused_bytes(); },
926 sm::description("the size of the space is unavailable and not alive")),
927
928 sm::make_counter("projected_count", stats.projected_count,
929 sm::description("the number of projected usage reservations")),
930 sm::make_counter("projected_used_bytes_sum", stats.projected_used_bytes_sum,
931 sm::description("the sum of the projected usage in bytes")),
932
933 sm::make_counter("reclaimed_bytes", stats.reclaimed_bytes,
934 sm::description("rewritten bytes due to reclaim")),
935 sm::make_counter("reclaimed_segment_bytes", stats.reclaimed_segment_bytes,
936 sm::description("rewritten bytes due to reclaim")),
937 sm::make_counter("closed_journal_used_bytes", stats.closed_journal_used_bytes,
938 sm::description("used bytes when close a journal segment")),
939 sm::make_counter("closed_journal_total_bytes", stats.closed_journal_total_bytes,
940 sm::description("total bytes of closed journal segments")),
941 sm::make_counter("closed_ool_used_bytes", stats.closed_ool_used_bytes,
942 sm::description("used bytes when close a ool segment")),
943 sm::make_counter("closed_ool_total_bytes", stats.closed_ool_total_bytes,
944 sm::description("total bytes of closed ool segments")),
945
946 sm::make_gauge("available_ratio",
947 [this] { return segments.get_available_ratio(); },
948 sm::description("ratio of available space to total space")),
949 sm::make_gauge("reclaim_ratio",
950 [this] { return get_reclaim_ratio(); },
951 sm::description("ratio of reclaimable space to unavailable space")),
952
953 sm::make_histogram("segment_utilization_distribution",
954 [this]() -> seastar::metrics::histogram& {
955 return stats.segment_util;
956 },
957 sm::description("utilization distribution of all segments"))
958 });
959 }
960
961 segment_id_t SegmentCleaner::allocate_segment(
962 segment_seq_t seq,
963 segment_type_t type,
964 data_category_t category,
965 rewrite_gen_t generation)
966 {
967 LOG_PREFIX(SegmentCleaner::allocate_segment);
968 assert(seq != NULL_SEG_SEQ);
969 ceph_assert(type == segment_type_t::OOL ||
970 trimmer != nullptr); // segment_type_t::JOURNAL
971 for (auto it = segments.begin();
972 it != segments.end();
973 ++it) {
974 auto seg_id = it->first;
975 auto& segment_info = it->second;
976 if (segment_info.is_empty()) {
977 auto old_usage = calc_utilization(seg_id);
978 segments.mark_open(seg_id, seq, type, category, generation);
979 background_callback->maybe_wake_background();
980 auto new_usage = calc_utilization(seg_id);
981 adjust_segment_util(old_usage, new_usage);
982 INFO("opened, {}", stat_printer_t{*this, false});
983 return seg_id;
984 }
985 }
986 ERROR("out of space with {} {} {} {}",
987 type, segment_seq_printer_t{seq}, category,
988 rewrite_gen_printer_t{generation});
989 ceph_abort("seastore device size setting is too small");
990 return NULL_SEG_ID;
991 }
992
993 void SegmentCleaner::close_segment(segment_id_t segment)
994 {
995 LOG_PREFIX(SegmentCleaner::close_segment);
996 auto old_usage = calc_utilization(segment);
997 segments.mark_closed(segment);
998 auto &seg_info = segments[segment];
999 if (seg_info.type == segment_type_t::JOURNAL) {
1000 stats.closed_journal_used_bytes += space_tracker->get_usage(segment);
1001 stats.closed_journal_total_bytes += segments.get_segment_size();
1002 } else {
1003 stats.closed_ool_used_bytes += space_tracker->get_usage(segment);
1004 stats.closed_ool_total_bytes += segments.get_segment_size();
1005 }
1006 auto new_usage = calc_utilization(segment);
1007 adjust_segment_util(old_usage, new_usage);
1008 INFO("closed, {} -- {}", stat_printer_t{*this, false}, seg_info);
1009 }
1010
1011 double SegmentCleaner::calc_gc_benefit_cost(
1012 segment_id_t id,
1013 const sea_time_point &now_time,
1014 const sea_time_point &bound_time) const
1015 {
1016 double util = calc_utilization(id);
1017 ceph_assert(util >= 0 && util < 1);
1018 if constexpr (gc_formula == gc_formula_t::GREEDY) {
1019 return 1 - util;
1020 }
1021
1022 if constexpr (gc_formula == gc_formula_t::COST_BENEFIT) {
1023 if (util == 0) {
1024 return std::numeric_limits<double>::max();
1025 }
1026 auto modify_time = segments[id].modify_time;
1027 double age_segment = modify_time.time_since_epoch().count();
1028 double age_now = now_time.time_since_epoch().count();
1029 if (likely(age_now > age_segment)) {
1030 return (1 - util) * (age_now - age_segment) / (2 * util);
1031 } else {
1032 // time is wrong
1033 return (1 - util) / (2 * util);
1034 }
1035 }
1036
1037 assert(gc_formula == gc_formula_t::BENEFIT);
1038 auto modify_time = segments[id].modify_time;
1039 double age_factor = 0.5; // middle value if age is invalid
1040 if (likely(bound_time != NULL_TIME &&
1041 modify_time != NULL_TIME &&
1042 now_time > modify_time)) {
1043 assert(modify_time >= bound_time);
1044 double age_bound = bound_time.time_since_epoch().count();
1045 double age_now = now_time.time_since_epoch().count();
1046 double age_segment = modify_time.time_since_epoch().count();
1047 age_factor = (age_now - age_segment) / (age_now - age_bound);
1048 }
1049 return ((1 - 2 * age_factor) * util * util +
1050 (2 * age_factor - 2) * util + 1);
1051 }
1052
1053 SegmentCleaner::do_reclaim_space_ret
1054 SegmentCleaner::do_reclaim_space(
1055 const std::vector<CachedExtentRef> &backref_extents,
1056 const backref_pin_list_t &pin_list,
1057 std::size_t &reclaimed,
1058 std::size_t &runs)
1059 {
1060 return repeat_eagain([this, &backref_extents,
1061 &pin_list, &reclaimed, &runs] {
1062 reclaimed = 0;
1063 runs++;
1064 auto src = Transaction::src_t::CLEANER_MAIN;
1065 if (is_cold) {
1066 src = Transaction::src_t::CLEANER_COLD;
1067 }
1068 return extent_callback->with_transaction_intr(
1069 src,
1070 "clean_reclaim_space",
1071 [this, &backref_extents, &pin_list, &reclaimed](auto &t)
1072 {
1073 return seastar::do_with(
1074 std::vector<CachedExtentRef>(backref_extents),
1075 [this, &t, &reclaimed, &pin_list](auto &extents)
1076 {
1077 LOG_PREFIX(SegmentCleaner::do_reclaim_space);
1078 // calculate live extents
1079 auto cached_backref_entries =
1080 backref_manager.get_cached_backref_entries_in_range(
1081 reclaim_state->start_pos, reclaim_state->end_pos);
1082 backref_entry_query_set_t backref_entries;
1083 for (auto &pin : pin_list) {
1084 backref_entries.emplace(
1085 pin->get_key(),
1086 pin->get_val(),
1087 pin->get_length(),
1088 pin->get_type(),
1089 JOURNAL_SEQ_NULL);
1090 }
1091 for (auto &cached_backref : cached_backref_entries) {
1092 if (cached_backref.laddr == L_ADDR_NULL) {
1093 auto it = backref_entries.find(cached_backref.paddr);
1094 assert(it->len == cached_backref.len);
1095 backref_entries.erase(it);
1096 } else {
1097 backref_entries.emplace(cached_backref);
1098 }
1099 }
1100 // retrieve live extents
1101 DEBUGT("start, backref_entries={}, backref_extents={}",
1102 t, backref_entries.size(), extents.size());
1103 return seastar::do_with(
1104 std::move(backref_entries),
1105 [this, &extents, &t](auto &backref_entries) {
1106 return trans_intr::parallel_for_each(
1107 backref_entries,
1108 [this, &extents, &t](auto &ent)
1109 {
1110 LOG_PREFIX(SegmentCleaner::do_reclaim_space);
1111 TRACET("getting extent of type {} at {}~{}",
1112 t,
1113 ent.type,
1114 ent.paddr,
1115 ent.len);
1116 return extent_callback->get_extents_if_live(
1117 t, ent.type, ent.paddr, ent.laddr, ent.len
1118 ).si_then([FNAME, &extents, &ent, &t](auto list) {
1119 if (list.empty()) {
1120 TRACET("addr {} dead, skipping", t, ent.paddr);
1121 } else {
1122 for (auto &e : list) {
1123 extents.emplace_back(std::move(e));
1124 }
1125 }
1126 });
1127 });
1128 }).si_then([FNAME, &extents, this, &reclaimed, &t] {
1129 DEBUGT("reclaim {} extents", t, extents.size());
1130 // rewrite live extents
1131 auto modify_time = segments[reclaim_state->get_segment_id()].modify_time;
1132 return trans_intr::do_for_each(
1133 extents,
1134 [this, modify_time, &t, &reclaimed](auto ext)
1135 {
1136 reclaimed += ext->get_length();
1137 return extent_callback->rewrite_extent(
1138 t, ext, reclaim_state->target_generation, modify_time);
1139 });
1140 });
1141 }).si_then([this, &t] {
1142 return extent_callback->submit_transaction_direct(t);
1143 });
1144 });
1145 });
1146 }
1147
1148 SegmentCleaner::clean_space_ret SegmentCleaner::clean_space()
1149 {
1150 LOG_PREFIX(SegmentCleaner::clean_space);
1151 assert(background_callback->is_ready());
1152 ceph_assert(can_clean_space());
1153 if (!reclaim_state) {
1154 segment_id_t seg_id = get_next_reclaim_segment();
1155 auto &segment_info = segments[seg_id];
1156 INFO("reclaim {} {} start, usage={}, time_bound={}",
1157 seg_id, segment_info,
1158 space_tracker->calc_utilization(seg_id),
1159 sea_time_point_printer_t{segments.get_time_bound()});
1160 ceph_assert(segment_info.is_closed());
1161 reclaim_state = reclaim_state_t::create(
1162 seg_id, segment_info.generation, segments.get_segment_size());
1163 }
1164 reclaim_state->advance(config.reclaim_bytes_per_cycle);
1165
1166 DEBUG("reclaiming {} {}~{}",
1167 rewrite_gen_printer_t{reclaim_state->generation},
1168 reclaim_state->start_pos,
1169 reclaim_state->end_pos);
1170 double pavail_ratio = get_projected_available_ratio();
1171 sea_time_point start = seastar::lowres_system_clock::now();
1172
1173 // Backref-tree doesn't support tree-read during tree-updates with parallel
1174 // transactions. So, concurrent transactions between trim and reclaim are
1175 // not allowed right now.
1176 return seastar::do_with(
1177 std::pair<std::vector<CachedExtentRef>, backref_pin_list_t>(),
1178 [this](auto &weak_read_ret) {
1179 return repeat_eagain([this, &weak_read_ret] {
1180 return extent_callback->with_transaction_intr(
1181 Transaction::src_t::READ,
1182 "retrieve_from_backref_tree",
1183 [this, &weak_read_ret](auto &t) {
1184 return backref_manager.get_mappings(
1185 t,
1186 reclaim_state->start_pos,
1187 reclaim_state->end_pos
1188 ).si_then([this, &t, &weak_read_ret](auto pin_list) {
1189 if (!pin_list.empty()) {
1190 auto it = pin_list.begin();
1191 auto &first_pin = *it;
1192 if (first_pin->get_key() < reclaim_state->start_pos) {
1193 // BackrefManager::get_mappings may include a entry before
1194 // reclaim_state->start_pos, which is semantically inconsistent
1195 // with the requirements of the cleaner
1196 pin_list.erase(it);
1197 }
1198 }
1199 return backref_manager.retrieve_backref_extents_in_range(
1200 t,
1201 reclaim_state->start_pos,
1202 reclaim_state->end_pos
1203 ).si_then([pin_list=std::move(pin_list),
1204 &weak_read_ret](auto extents) mutable {
1205 weak_read_ret = std::make_pair(std::move(extents), std::move(pin_list));
1206 });
1207 });
1208 });
1209 }).safe_then([&weak_read_ret] {
1210 return std::move(weak_read_ret);
1211 });
1212 }).safe_then([this, FNAME, pavail_ratio, start](auto weak_read_ret) {
1213 return seastar::do_with(
1214 std::move(weak_read_ret.first),
1215 std::move(weak_read_ret.second),
1216 (size_t)0,
1217 (size_t)0,
1218 [this, FNAME, pavail_ratio, start](
1219 auto &backref_extents, auto &pin_list, auto &reclaimed, auto &runs)
1220 {
1221 return do_reclaim_space(
1222 backref_extents,
1223 pin_list,
1224 reclaimed,
1225 runs
1226 ).safe_then([this, FNAME, pavail_ratio, start, &reclaimed, &runs] {
1227 stats.reclaiming_bytes += reclaimed;
1228 auto d = seastar::lowres_system_clock::now() - start;
1229 DEBUG("duration: {}, pavail_ratio before: {}, repeats: {}",
1230 d, pavail_ratio, runs);
1231 if (reclaim_state->is_complete()) {
1232 auto segment_to_release = reclaim_state->get_segment_id();
1233 INFO("reclaim {} finish, reclaimed alive/total={}",
1234 segment_to_release,
1235 stats.reclaiming_bytes/(double)segments.get_segment_size());
1236 stats.reclaimed_bytes += stats.reclaiming_bytes;
1237 stats.reclaimed_segment_bytes += segments.get_segment_size();
1238 stats.reclaiming_bytes = 0;
1239 reclaim_state.reset();
1240 return sm_group->release_segment(segment_to_release
1241 ).handle_error(
1242 clean_space_ertr::pass_further{},
1243 crimson::ct_error::assert_all{
1244 "SegmentCleaner::clean_space encountered invalid error in release_segment"
1245 }
1246 ).safe_then([this, FNAME, segment_to_release] {
1247 auto old_usage = calc_utilization(segment_to_release);
1248 if(unlikely(old_usage != 0)) {
1249 space_tracker->dump_usage(segment_to_release);
1250 ERROR("segment {} old_usage {} != 0",
1251 segment_to_release, old_usage);
1252 ceph_abort();
1253 }
1254 segments.mark_empty(segment_to_release);
1255 auto new_usage = calc_utilization(segment_to_release);
1256 adjust_segment_util(old_usage, new_usage);
1257 INFO("released {}, {}",
1258 segment_to_release, stat_printer_t{*this, false});
1259 background_callback->maybe_wake_blocked_io();
1260 });
1261 } else {
1262 return clean_space_ertr::now();
1263 }
1264 });
1265 });
1266 });
1267 }
1268
1269 SegmentCleaner::mount_ret SegmentCleaner::mount()
1270 {
1271 LOG_PREFIX(SegmentCleaner::mount);
1272 const auto& sms = sm_group->get_segment_managers();
1273 INFO("{} segment managers", sms.size());
1274
1275 assert(background_callback->get_state() == state_t::MOUNT);
1276
1277 space_tracker.reset(
1278 detailed ?
1279 (SpaceTrackerI*)new SpaceTrackerDetailed(
1280 sms) :
1281 (SpaceTrackerI*)new SpaceTrackerSimple(
1282 sms));
1283
1284 segments.reset();
1285 for (auto sm : sms) {
1286 segments.add_segment_manager(*sm);
1287 }
1288 segments.assign_ids();
1289
1290 stats = {};
1291 metrics.clear();
1292 register_metrics();
1293
1294 INFO("{} segments", segments.get_num_segments());
1295 return crimson::do_for_each(
1296 segments.begin(),
1297 segments.end(),
1298 [this, FNAME](auto& it)
1299 {
1300 auto segment_id = it.first;
1301 return sm_group->read_segment_header(
1302 segment_id
1303 ).safe_then([segment_id, this, FNAME](auto header) {
1304 DEBUG("segment_id={} -- {}", segment_id, header);
1305 auto s_type = header.get_type();
1306 if (s_type == segment_type_t::NULL_SEG) {
1307 ERROR("got null segment, segment_id={} -- {}", segment_id, header);
1308 ceph_abort();
1309 }
1310 return sm_group->read_segment_tail(
1311 segment_id
1312 ).safe_then([this, FNAME, segment_id, header](auto tail)
1313 -> scan_extents_ertr::future<> {
1314 if (tail.segment_nonce != header.segment_nonce) {
1315 return scan_no_tail_segment(header, segment_id);
1316 }
1317 ceph_assert(header.get_type() == tail.get_type());
1318
1319 sea_time_point modify_time = mod_to_timepoint(tail.modify_time);
1320 std::size_t num_extents = tail.num_extents;
1321 if ((modify_time == NULL_TIME && num_extents == 0) ||
1322 (modify_time != NULL_TIME && num_extents != 0)) {
1323 segments.update_modify_time(segment_id, modify_time, num_extents);
1324 } else {
1325 ERROR("illegal modify time {}", tail);
1326 return crimson::ct_error::input_output_error::make();
1327 }
1328
1329 init_mark_segment_closed(
1330 segment_id,
1331 header.segment_seq,
1332 header.type,
1333 header.category,
1334 header.generation);
1335 return seastar::now();
1336 }).handle_error(
1337 crimson::ct_error::enodata::handle(
1338 [this, header, segment_id](auto) {
1339 return scan_no_tail_segment(header, segment_id);
1340 }),
1341 crimson::ct_error::pass_further_all{}
1342 );
1343 }).handle_error(
1344 crimson::ct_error::enoent::handle([](auto) {
1345 return mount_ertr::now();
1346 }),
1347 crimson::ct_error::enodata::handle([](auto) {
1348 return mount_ertr::now();
1349 }),
1350 crimson::ct_error::input_output_error::pass_further{},
1351 crimson::ct_error::assert_all{"unexpected error"}
1352 );
1353 }).safe_then([this, FNAME] {
1354 INFO("done, {}", segments);
1355 });
1356 }
1357
1358 SegmentCleaner::scan_extents_ret SegmentCleaner::scan_no_tail_segment(
1359 const segment_header_t &segment_header,
1360 segment_id_t segment_id)
1361 {
1362 LOG_PREFIX(SegmentCleaner::scan_no_tail_segment);
1363 INFO("scan {} {}", segment_id, segment_header);
1364 return seastar::do_with(
1365 scan_valid_records_cursor({
1366 segments[segment_id].seq,
1367 paddr_t::make_seg_paddr(segment_id, 0)
1368 }),
1369 SegmentManagerGroup::found_record_handler_t(
1370 [this, segment_id, segment_header, FNAME](
1371 record_locator_t locator,
1372 const record_group_header_t &record_group_header,
1373 const bufferlist& mdbuf
1374 ) mutable -> SegmentManagerGroup::scan_valid_records_ertr::future<>
1375 {
1376 DEBUG("{} {}, decoding {} records",
1377 segment_id, segment_header.get_type(), record_group_header.records);
1378
1379 auto maybe_headers = try_decode_record_headers(
1380 record_group_header, mdbuf);
1381 if (!maybe_headers) {
1382 // This should be impossible, we did check the crc on the mdbuf
1383 ERROR("unable to decode record headers for record group {}",
1384 locator.record_block_base);
1385 return crimson::ct_error::input_output_error::make();
1386 }
1387
1388 for (auto &record_header : *maybe_headers) {
1389 auto modify_time = mod_to_timepoint(record_header.modify_time);
1390 if (record_header.extents == 0 || modify_time != NULL_TIME) {
1391 segments.update_modify_time(
1392 segment_id, modify_time, record_header.extents);
1393 } else {
1394 ERROR("illegal modify time {}", record_header);
1395 return crimson::ct_error::input_output_error::make();
1396 }
1397 }
1398 return seastar::now();
1399 }),
1400 [this, segment_header](auto &cursor, auto &handler)
1401 {
1402 return sm_group->scan_valid_records(
1403 cursor,
1404 segment_header.segment_nonce,
1405 segments.get_segment_size(),
1406 handler).discard_result();
1407 }).safe_then([this, segment_id, segment_header] {
1408 init_mark_segment_closed(
1409 segment_id,
1410 segment_header.segment_seq,
1411 segment_header.type,
1412 segment_header.category,
1413 segment_header.generation);
1414 });
1415 }
1416
1417 bool SegmentCleaner::check_usage()
1418 {
1419 SpaceTrackerIRef tracker(space_tracker->make_empty());
1420 extent_callback->with_transaction_weak(
1421 "check_usage",
1422 [this, &tracker](auto &t) {
1423 return backref_manager.scan_mapped_space(
1424 t,
1425 [&tracker](
1426 paddr_t paddr,
1427 paddr_t backref_key,
1428 extent_len_t len,
1429 extent_types_t type,
1430 laddr_t laddr)
1431 {
1432 if (paddr.get_addr_type() == paddr_types_t::SEGMENT) {
1433 if (is_backref_node(type)) {
1434 assert(laddr == L_ADDR_NULL);
1435 assert(backref_key != P_ADDR_NULL);
1436 tracker->allocate(
1437 paddr.as_seg_paddr().get_segment_id(),
1438 paddr.as_seg_paddr().get_segment_off(),
1439 len);
1440 } else if (laddr == L_ADDR_NULL) {
1441 assert(backref_key == P_ADDR_NULL);
1442 tracker->release(
1443 paddr.as_seg_paddr().get_segment_id(),
1444 paddr.as_seg_paddr().get_segment_off(),
1445 len);
1446 } else {
1447 assert(backref_key == P_ADDR_NULL);
1448 tracker->allocate(
1449 paddr.as_seg_paddr().get_segment_id(),
1450 paddr.as_seg_paddr().get_segment_off(),
1451 len);
1452 }
1453 }
1454 });
1455 }).unsafe_get0();
1456 return space_tracker->equals(*tracker);
1457 }
1458
1459 void SegmentCleaner::mark_space_used(
1460 paddr_t addr,
1461 extent_len_t len)
1462 {
1463 LOG_PREFIX(SegmentCleaner::mark_space_used);
1464 assert(background_callback->get_state() >= state_t::SCAN_SPACE);
1465 assert(len);
1466 // TODO: drop
1467 if (addr.get_addr_type() != paddr_types_t::SEGMENT) {
1468 return;
1469 }
1470
1471 auto& seg_addr = addr.as_seg_paddr();
1472 stats.used_bytes += len;
1473 auto old_usage = calc_utilization(seg_addr.get_segment_id());
1474 [[maybe_unused]] auto ret = space_tracker->allocate(
1475 seg_addr.get_segment_id(),
1476 seg_addr.get_segment_off(),
1477 len);
1478 auto new_usage = calc_utilization(seg_addr.get_segment_id());
1479 adjust_segment_util(old_usage, new_usage);
1480
1481 background_callback->maybe_wake_background();
1482 assert(ret > 0);
1483 DEBUG("segment {} new len: {}~{}, live_bytes: {}",
1484 seg_addr.get_segment_id(),
1485 addr,
1486 len,
1487 space_tracker->get_usage(seg_addr.get_segment_id()));
1488 }
1489
1490 void SegmentCleaner::mark_space_free(
1491 paddr_t addr,
1492 extent_len_t len)
1493 {
1494 LOG_PREFIX(SegmentCleaner::mark_space_free);
1495 assert(background_callback->get_state() >= state_t::SCAN_SPACE);
1496 assert(len);
1497 // TODO: drop
1498 if (addr.get_addr_type() != paddr_types_t::SEGMENT) {
1499 return;
1500 }
1501
1502 ceph_assert(stats.used_bytes >= len);
1503 stats.used_bytes -= len;
1504 auto& seg_addr = addr.as_seg_paddr();
1505
1506 DEBUG("segment {} free len: {}~{}",
1507 seg_addr.get_segment_id(), addr, len);
1508 auto old_usage = calc_utilization(seg_addr.get_segment_id());
1509 [[maybe_unused]] auto ret = space_tracker->release(
1510 seg_addr.get_segment_id(),
1511 seg_addr.get_segment_off(),
1512 len);
1513 auto new_usage = calc_utilization(seg_addr.get_segment_id());
1514 adjust_segment_util(old_usage, new_usage);
1515 background_callback->maybe_wake_blocked_io();
1516 assert(ret >= 0);
1517 DEBUG("segment {} free len: {}~{}, live_bytes: {}",
1518 seg_addr.get_segment_id(),
1519 addr,
1520 len,
1521 space_tracker->get_usage(seg_addr.get_segment_id()));
1522 }
1523
1524 segment_id_t SegmentCleaner::get_next_reclaim_segment() const
1525 {
1526 LOG_PREFIX(SegmentCleaner::get_next_reclaim_segment);
1527 segment_id_t id = NULL_SEG_ID;
1528 double max_benefit_cost = 0;
1529 sea_time_point now_time;
1530 if constexpr (gc_formula != gc_formula_t::GREEDY) {
1531 now_time = seastar::lowres_system_clock::now();
1532 } else {
1533 now_time = NULL_TIME;
1534 }
1535 sea_time_point bound_time;
1536 if constexpr (gc_formula == gc_formula_t::BENEFIT) {
1537 bound_time = segments.get_time_bound();
1538 if (bound_time == NULL_TIME) {
1539 WARN("BENEFIT -- bound_time is NULL_TIME");
1540 }
1541 } else {
1542 bound_time = NULL_TIME;
1543 }
1544 for (auto& [_id, segment_info] : segments) {
1545 if (segment_info.is_closed() &&
1546 (trimmer == nullptr ||
1547 !segment_info.is_in_journal(trimmer->get_journal_tail()))) {
1548 double benefit_cost = calc_gc_benefit_cost(_id, now_time, bound_time);
1549 if (benefit_cost > max_benefit_cost) {
1550 id = _id;
1551 max_benefit_cost = benefit_cost;
1552 }
1553 }
1554 }
1555 if (id != NULL_SEG_ID) {
1556 DEBUG("segment {}, benefit_cost {}",
1557 id, max_benefit_cost);
1558 return id;
1559 } else {
1560 ceph_assert(get_segments_reclaimable() == 0);
1561 // see should_clean_space()
1562 ceph_abort("impossible!");
1563 return NULL_SEG_ID;
1564 }
1565 }
1566
1567 bool SegmentCleaner::try_reserve_projected_usage(std::size_t projected_usage)
1568 {
1569 assert(background_callback->is_ready());
1570 stats.projected_used_bytes += projected_usage;
1571 if (should_block_io_on_clean()) {
1572 stats.projected_used_bytes -= projected_usage;
1573 return false;
1574 } else {
1575 ++stats.projected_count;
1576 stats.projected_used_bytes_sum += stats.projected_used_bytes;
1577 return true;
1578 }
1579 }
1580
1581 void SegmentCleaner::release_projected_usage(std::size_t projected_usage)
1582 {
1583 assert(background_callback->is_ready());
1584 ceph_assert(stats.projected_used_bytes >= projected_usage);
1585 stats.projected_used_bytes -= projected_usage;
1586 background_callback->maybe_wake_blocked_io();
1587 }
1588
1589 void SegmentCleaner::print(std::ostream &os, bool is_detailed) const
1590 {
1591 os << "SegmentCleaner(";
1592 if (background_callback->is_ready()) {
1593 os << "should_block_io_on_clean=" << should_block_io_on_clean()
1594 << ", should_clean=" << should_clean_space();
1595 } else {
1596 os << "not-ready";
1597 }
1598 os << ", projected_avail_ratio=" << get_projected_available_ratio()
1599 << ", reclaim_ratio=" << get_reclaim_ratio()
1600 << ", alive_ratio=" << get_alive_ratio();
1601 if (is_detailed) {
1602 os << ", unavailable_unreclaimable="
1603 << get_unavailable_unreclaimable_bytes() << "B"
1604 << ", unavailable_reclaimble="
1605 << get_unavailable_reclaimable_bytes() << "B"
1606 << ", alive=" << stats.used_bytes << "B"
1607 << ", " << segments;
1608 }
1609 os << ")";
1610 }
1611
1612 RBMCleaner::RBMCleaner(
1613 RBMDeviceGroupRef&& rb_group,
1614 BackrefManager &backref_manager,
1615 bool detailed)
1616 : detailed(detailed),
1617 rb_group(std::move(rb_group)),
1618 backref_manager(backref_manager)
1619 {}
1620
1621 void RBMCleaner::print(std::ostream &os, bool is_detailed) const
1622 {
1623 // TODO
1624 return;
1625 }
1626
1627 void RBMCleaner::mark_space_used(
1628 paddr_t addr,
1629 extent_len_t len)
1630 {
1631 LOG_PREFIX(RBMCleaner::mark_space_used);
1632 assert(addr.get_addr_type() == paddr_types_t::RANDOM_BLOCK);
1633 auto rbms = rb_group->get_rb_managers();
1634 for (auto rbm : rbms) {
1635 if (addr.get_device_id() == rbm->get_device_id()) {
1636 if (rbm->get_start() <= addr) {
1637 INFO("allocate addr: {} len: {}", addr, len);
1638 stats.used_bytes += len;
1639 rbm->mark_space_used(addr, len);
1640 }
1641 return;
1642 }
1643 }
1644 }
1645
1646 void RBMCleaner::mark_space_free(
1647 paddr_t addr,
1648 extent_len_t len)
1649 {
1650 LOG_PREFIX(RBMCleaner::mark_space_free);
1651 assert(addr.get_addr_type() == paddr_types_t::RANDOM_BLOCK);
1652 auto rbms = rb_group->get_rb_managers();
1653 for (auto rbm : rbms) {
1654 if (addr.get_device_id() == rbm->get_device_id()) {
1655 if (rbm->get_start() <= addr) {
1656 INFO("free addr: {} len: {}", addr, len);
1657 ceph_assert(stats.used_bytes >= len);
1658 stats.used_bytes -= len;
1659 rbm->mark_space_free(addr, len);
1660 }
1661 return;
1662 }
1663 }
1664 }
1665
1666 void RBMCleaner::commit_space_used(paddr_t addr, extent_len_t len)
1667 {
1668 auto rbms = rb_group->get_rb_managers();
1669 for (auto rbm : rbms) {
1670 if (addr.get_device_id() == rbm->get_device_id()) {
1671 if (rbm->get_start() <= addr) {
1672 rbm->complete_allocation(addr, len);
1673 }
1674 return;
1675 }
1676 }
1677 }
1678
1679 bool RBMCleaner::try_reserve_projected_usage(std::size_t projected_usage)
1680 {
1681 assert(background_callback->is_ready());
1682 stats.projected_used_bytes += projected_usage;
1683 return true;
1684 }
1685
1686 void RBMCleaner::release_projected_usage(std::size_t projected_usage)
1687 {
1688 assert(background_callback->is_ready());
1689 ceph_assert(stats.projected_used_bytes >= projected_usage);
1690 stats.projected_used_bytes -= projected_usage;
1691 background_callback->maybe_wake_blocked_io();
1692 }
1693
1694 RBMCleaner::clean_space_ret RBMCleaner::clean_space()
1695 {
1696 // TODO
1697 return clean_space_ertr::now();
1698 }
1699
1700 RBMCleaner::mount_ret RBMCleaner::mount()
1701 {
1702 stats = {};
1703 register_metrics();
1704 return seastar::do_with(
1705 rb_group->get_rb_managers(),
1706 [](auto &rbs) {
1707 return crimson::do_for_each(
1708 rbs.begin(),
1709 rbs.end(),
1710 [](auto& it) {
1711 return it->open(
1712 ).handle_error(
1713 crimson::ct_error::input_output_error::pass_further(),
1714 crimson::ct_error::assert_all{
1715 "Invalid error when opening RBM"}
1716 );
1717 });
1718 });
1719 }
1720
1721 bool RBMCleaner::check_usage()
1722 {
1723 assert(detailed);
1724 const auto& rbms = rb_group->get_rb_managers();
1725 RBMSpaceTracker tracker(rbms);
1726 extent_callback->with_transaction_weak(
1727 "check_usage",
1728 [this, &tracker, &rbms](auto &t) {
1729 return backref_manager.scan_mapped_space(
1730 t,
1731 [&tracker, &rbms](
1732 paddr_t paddr,
1733 paddr_t backref_key,
1734 extent_len_t len,
1735 extent_types_t type,
1736 laddr_t laddr)
1737 {
1738 for (auto rbm : rbms) {
1739 if (rbm->get_device_id() == paddr.get_device_id()) {
1740 if (is_backref_node(type)) {
1741 assert(laddr == L_ADDR_NULL);
1742 assert(backref_key != P_ADDR_NULL);
1743 tracker.allocate(
1744 paddr,
1745 len);
1746 } else if (laddr == L_ADDR_NULL) {
1747 assert(backref_key == P_ADDR_NULL);
1748 tracker.release(
1749 paddr,
1750 len);
1751 } else {
1752 assert(backref_key == P_ADDR_NULL);
1753 tracker.allocate(
1754 paddr,
1755 len);
1756 }
1757 }
1758 }
1759 });
1760 }).unsafe_get0();
1761 return equals(tracker);
1762 }
1763
1764 bool RBMCleaner::equals(const RBMSpaceTracker &_other) const
1765 {
1766 LOG_PREFIX(RBMSpaceTracker::equals);
1767 const auto &other = static_cast<const RBMSpaceTracker&>(_other);
1768 auto rbs = rb_group->get_rb_managers();
1769 //TODO: multiple rbm allocator
1770 auto rbm = rbs[0];
1771 assert(rbm);
1772
1773 if (rbm->get_device()->get_available_size() / rbm->get_block_size()
1774 != other.block_usage.size()) {
1775 assert(0 == "block counts should match");
1776 return false;
1777 }
1778 bool all_match = true;
1779 for (auto i = other.block_usage.begin();
1780 i != other.block_usage.end(); ++i) {
1781 if (i->first < rbm->get_start().as_blk_paddr().get_device_off()) {
1782 continue;
1783 }
1784 auto addr = i->first;
1785 auto state = rbm->get_extent_state(
1786 convert_abs_addr_to_paddr(addr, rbm->get_device_id()),
1787 rbm->get_block_size());
1788 if ((i->second.used && state == rbm_extent_state_t::ALLOCATED) ||
1789 (!i->second.used && (state == rbm_extent_state_t::FREE ||
1790 state == rbm_extent_state_t::RESERVED))) {
1791 // pass
1792 } else {
1793 all_match = false;
1794 ERROR("block addr {} mismatch other used: {}",
1795 addr, i->second.used);
1796 }
1797 }
1798 return all_match;
1799 }
1800
1801 void RBMCleaner::register_metrics()
1802 {
1803 namespace sm = seastar::metrics;
1804
1805 metrics.add_group("rbm_cleaner", {
1806 sm::make_counter("total_bytes",
1807 [this] { return get_total_bytes(); },
1808 sm::description("the size of the space")),
1809 sm::make_counter("available_bytes",
1810 [this] { return get_total_bytes() - get_journal_bytes() - stats.used_bytes; },
1811 sm::description("the size of the space is available")),
1812 sm::make_counter("used_bytes", stats.used_bytes,
1813 sm::description("the size of the space occupied by live extents")),
1814 });
1815 }
1816
1817 }