1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 #include <fmt/chrono.h>
5 #include <seastar/core/metrics.hh>
7 #include "crimson/os/seastore/logging.h"
9 #include "crimson/os/seastore/async_cleaner.h"
10 #include "crimson/os/seastore/backref_manager.h"
11 #include "crimson/os/seastore/transaction_manager.h"
13 SET_SUBSYS(seastore_cleaner
);
// Formula used by SegmentCleaner to score candidate segments for garbage
// collection (see SegmentCleaner::calc_gc_benefit_cost).
// NOTE(review): the enumerator lines are missing from the garbled source;
// the three enumerators below are restored from their uses in
// calc_gc_benefit_cost — confirm against upstream.
enum class gc_formula_t {
  GREEDY,       // score by free space only
  COST_BENEFIT, // LFS-style cost-benefit: free space weighted by segment age
  BENEFIT,      // age-factor polynomial of utilization
};
// The formula compiled into this build; branches on the other formulas are
// discarded via `if constexpr`.
constexpr auto gc_formula = gc_formula_t::COST_BENEFIT;
26 namespace crimson::os::seastore
{
28 void segment_info_t::set_open(
29 segment_seq_t _seq
, segment_type_t _type
,
30 data_category_t _category
, rewrite_gen_t _generation
)
32 ceph_assert(_seq
!= NULL_SEG_SEQ
);
33 ceph_assert(_type
!= segment_type_t::NULL_SEG
);
34 ceph_assert(_category
!= data_category_t::NUM
);
35 ceph_assert(is_rewrite_generation(_generation
));
36 state
= Segment::segment_state_t::OPEN
;
40 generation
= _generation
;
44 void segment_info_t::set_empty()
46 state
= Segment::segment_state_t::EMPTY
;
48 type
= segment_type_t::NULL_SEG
;
49 category
= data_category_t::NUM
;
50 generation
= NULL_GENERATION
;
51 modify_time
= NULL_TIME
;
56 void segment_info_t::set_closed()
58 state
= Segment::segment_state_t::CLOSED
;
59 // the rest of information is unchanged
62 void segment_info_t::init_closed(
63 segment_seq_t _seq
, segment_type_t _type
,
64 data_category_t _category
, rewrite_gen_t _generation
,
65 segment_off_t seg_size
)
67 ceph_assert(_seq
!= NULL_SEG_SEQ
);
68 ceph_assert(_type
!= segment_type_t::NULL_SEG
);
69 ceph_assert(_category
!= data_category_t::NUM
);
70 ceph_assert(is_rewrite_generation(_generation
));
71 state
= Segment::segment_state_t::CLOSED
;
75 generation
= _generation
;
76 written_to
= seg_size
;
79 std::ostream
& operator<<(std::ostream
&out
, const segment_info_t
&info
)
82 << "state=" << info
.state
84 if (info
.is_empty()) {
86 } else { // open or closed
87 out
<< " " << info
.type
88 << " " << segment_seq_printer_t
{info
.seq
}
89 << " " << info
.category
90 << " " << rewrite_gen_printer_t
{info
.generation
}
91 << ", modify_time=" << sea_time_point_printer_t
{info
.modify_time
}
92 << ", num_extents=" << info
.num_extents
93 << ", written_to=" << info
.written_to
;
98 void segments_info_t::reset()
104 journal_segment_id
= NULL_SEG_ID
;
105 num_in_journal_open
= 0;
106 num_type_journal
= 0;
113 count_open_journal
= 0;
115 count_release_journal
= 0;
116 count_release_ool
= 0;
117 count_close_journal
= 0;
121 avail_bytes_in_open
= 0;
123 modify_times
.clear();
126 void segments_info_t::add_segment_manager(
127 SegmentManager
&segment_manager
)
129 LOG_PREFIX(segments_info_t::add_segment_manager
);
130 device_id_t d_id
= segment_manager
.get_device_id();
131 auto ssize
= segment_manager
.get_segment_size();
132 auto nsegments
= segment_manager
.get_num_segments();
133 auto sm_size
= segment_manager
.get_available_size();
134 INFO("adding segment manager {}, size={}, ssize={}, segments={}",
135 device_id_printer_t
{d_id
}, sm_size
, ssize
, nsegments
);
136 ceph_assert(ssize
> 0);
137 ceph_assert(nsegments
> 0);
138 ceph_assert(sm_size
> 0);
140 // also validate if the device is duplicated
141 segments
.add_device(d_id
, nsegments
, segment_info_t
{});
143 // assume all the segment managers share the same settings as follows.
144 if (segment_size
== 0) {
145 ceph_assert(ssize
> 0);
146 segment_size
= ssize
;
148 ceph_assert(segment_size
== ssize
);
151 // NOTE: by default the segments are empty
152 num_empty
+= nsegments
;
154 total_bytes
+= sm_size
;
157 void segments_info_t::init_closed(
158 segment_id_t segment
, segment_seq_t seq
, segment_type_t type
,
159 data_category_t category
, rewrite_gen_t generation
)
161 LOG_PREFIX(segments_info_t::init_closed
);
162 auto& segment_info
= segments
[segment
];
163 DEBUG("initiating {} {} {} {} {}, {}, "
164 "num_segments(empty={}, opened={}, closed={})",
165 segment
, type
, segment_seq_printer_t
{seq
},
166 category
, rewrite_gen_printer_t
{generation
},
167 segment_info
, num_empty
, num_open
, num_closed
);
168 ceph_assert(segment_info
.is_empty());
169 ceph_assert(num_empty
> 0);
172 if (type
== segment_type_t::JOURNAL
) {
173 // init_closed won't initialize journal_segment_id
174 ceph_assert(get_submitted_journal_head() == JOURNAL_SEQ_NULL
);
179 // do not increment count_close_*;
181 if (segment_info
.modify_time
!= NULL_TIME
) {
182 modify_times
.insert(segment_info
.modify_time
);
184 ceph_assert(segment_info
.num_extents
== 0);
187 segment_info
.init_closed(
188 seq
, type
, category
, generation
, get_segment_size());
191 void segments_info_t::mark_open(
192 segment_id_t segment
, segment_seq_t seq
, segment_type_t type
,
193 data_category_t category
, rewrite_gen_t generation
)
195 LOG_PREFIX(segments_info_t::mark_open
);
196 auto& segment_info
= segments
[segment
];
197 INFO("opening {} {} {} {} {}, {}, "
198 "num_segments(empty={}, opened={}, closed={})",
199 segment
, type
, segment_seq_printer_t
{seq
},
200 category
, rewrite_gen_printer_t
{generation
},
201 segment_info
, num_empty
, num_open
, num_closed
);
202 ceph_assert(segment_info
.is_empty());
203 ceph_assert(num_empty
> 0);
206 if (type
== segment_type_t::JOURNAL
) {
207 if (journal_segment_id
!= NULL_SEG_ID
) {
208 auto& last_journal_segment
= segments
[journal_segment_id
];
209 ceph_assert(last_journal_segment
.is_closed());
210 ceph_assert(last_journal_segment
.type
== segment_type_t::JOURNAL
);
211 ceph_assert(last_journal_segment
.seq
+ 1 == seq
);
213 journal_segment_id
= segment
;
215 ++num_in_journal_open
;
217 ++count_open_journal
;
222 avail_bytes_in_open
+= get_segment_size();
224 segment_info
.set_open(seq
, type
, category
, generation
);
227 void segments_info_t::mark_empty(
228 segment_id_t segment
)
230 LOG_PREFIX(segments_info_t::mark_empty
);
231 auto& segment_info
= segments
[segment
];
232 INFO("releasing {}, {}, num_segments(empty={}, opened={}, closed={})",
233 segment
, segment_info
,
234 num_empty
, num_open
, num_closed
);
235 ceph_assert(segment_info
.is_closed());
236 auto type
= segment_info
.type
;
237 assert(type
!= segment_type_t::NULL_SEG
);
238 ceph_assert(num_closed
> 0);
241 if (type
== segment_type_t::JOURNAL
) {
242 ceph_assert(num_type_journal
> 0);
244 ++count_release_journal
;
246 ceph_assert(num_type_ool
> 0);
251 if (segment_info
.modify_time
!= NULL_TIME
) {
252 auto to_erase
= modify_times
.find(segment_info
.modify_time
);
253 ceph_assert(to_erase
!= modify_times
.end());
254 modify_times
.erase(to_erase
);
256 ceph_assert(segment_info
.num_extents
== 0);
259 segment_info
.set_empty();
262 void segments_info_t::mark_closed(
263 segment_id_t segment
)
265 LOG_PREFIX(segments_info_t::mark_closed
);
266 auto& segment_info
= segments
[segment
];
267 INFO("closing {}, {}, num_segments(empty={}, opened={}, closed={})",
268 segment
, segment_info
,
269 num_empty
, num_open
, num_closed
);
270 ceph_assert(segment_info
.is_open());
271 ceph_assert(num_open
> 0);
274 if (segment_info
.type
== segment_type_t::JOURNAL
) {
275 ceph_assert(num_in_journal_open
> 0);
276 --num_in_journal_open
;
277 ++count_close_journal
;
281 ceph_assert(get_segment_size() >= segment_info
.written_to
);
282 auto seg_avail_bytes
= get_segment_size() - segment_info
.written_to
;
283 ceph_assert(avail_bytes_in_open
>= (std::size_t)seg_avail_bytes
);
284 avail_bytes_in_open
-= seg_avail_bytes
;
286 if (segment_info
.modify_time
!= NULL_TIME
) {
287 modify_times
.insert(segment_info
.modify_time
);
289 ceph_assert(segment_info
.num_extents
== 0);
292 segment_info
.set_closed();
295 void segments_info_t::update_written_to(
299 LOG_PREFIX(segments_info_t::update_written_to
);
300 auto& saddr
= offset
.as_seg_paddr();
301 auto& segment_info
= segments
[saddr
.get_segment_id()];
302 if (!segment_info
.is_open()) {
303 ERROR("segment is not open, not updating, type={}, offset={}, {}",
304 type
, offset
, segment_info
);
308 auto new_written_to
= saddr
.get_segment_off();
309 ceph_assert(new_written_to
<= get_segment_size());
310 if (segment_info
.written_to
> new_written_to
) {
311 ERROR("written_to should not decrease! type={}, offset={}, {}",
312 type
, offset
, segment_info
);
316 DEBUG("type={}, offset={}, {}", type
, offset
, segment_info
);
317 ceph_assert(type
== segment_info
.type
);
318 auto avail_deduction
= new_written_to
- segment_info
.written_to
;
319 ceph_assert(avail_bytes_in_open
>= (std::size_t)avail_deduction
);
320 avail_bytes_in_open
-= avail_deduction
;
321 segment_info
.written_to
= new_written_to
;
324 std::ostream
&operator<<(std::ostream
&os
, const segments_info_t
&infos
)
326 return os
<< "segments("
327 << "empty=" << infos
.get_num_empty()
328 << ", open=" << infos
.get_num_open()
329 << ", closed=" << infos
.get_num_closed()
330 << ", type_journal=" << infos
.get_num_type_journal()
331 << ", type_ool=" << infos
.get_num_type_ool()
332 << ", total=" << infos
.get_total_bytes() << "B"
333 << ", available=" << infos
.get_available_bytes() << "B"
334 << ", unavailable=" << infos
.get_unavailable_bytes() << "B"
335 << ", available_ratio=" << infos
.get_available_ratio()
336 << ", submitted_head=" << infos
.get_submitted_journal_head()
337 << ", time_bound=" << sea_time_point_printer_t
{infos
.get_time_bound()}
341 void JournalTrimmerImpl::config_t::validate() const
343 ceph_assert(max_journal_bytes
<= DEVICE_OFF_MAX
);
344 ceph_assert(max_journal_bytes
> target_journal_dirty_bytes
);
345 ceph_assert(max_journal_bytes
> target_journal_alloc_bytes
);
346 ceph_assert(rewrite_dirty_bytes_per_cycle
> 0);
347 ceph_assert(rewrite_backref_bytes_per_cycle
> 0);
350 JournalTrimmerImpl::config_t
351 JournalTrimmerImpl::config_t::get_default(
352 std::size_t roll_size
, journal_type_t type
)
355 std::size_t target_dirty_bytes
= 0;
356 std::size_t target_alloc_bytes
= 0;
357 std::size_t max_journal_bytes
= 0;
358 if (type
== journal_type_t::SEGMENTED
) {
359 target_dirty_bytes
= 12 * roll_size
;
360 target_alloc_bytes
= 2 * roll_size
;
361 max_journal_bytes
= 16 * roll_size
;
363 assert(type
== journal_type_t::RANDOM_BLOCK
);
364 target_dirty_bytes
= roll_size
/ 4;
365 target_alloc_bytes
= roll_size
/ 4;
366 max_journal_bytes
= roll_size
/ 2;
372 1<<17,// rewrite_dirty_bytes_per_cycle
373 1<<24 // rewrite_backref_bytes_per_cycle
377 JournalTrimmerImpl::config_t
378 JournalTrimmerImpl::config_t::get_test(
379 std::size_t roll_size
, journal_type_t type
)
382 std::size_t target_dirty_bytes
= 0;
383 std::size_t target_alloc_bytes
= 0;
384 std::size_t max_journal_bytes
= 0;
385 if (type
== journal_type_t::SEGMENTED
) {
386 target_dirty_bytes
= 2 * roll_size
;
387 target_alloc_bytes
= 2 * roll_size
;
388 max_journal_bytes
= 4 * roll_size
;
390 assert(type
== journal_type_t::RANDOM_BLOCK
);
391 target_dirty_bytes
= roll_size
/ 4;
392 target_alloc_bytes
= roll_size
/ 4;
393 max_journal_bytes
= roll_size
/ 2;
399 1<<17,// rewrite_dirty_bytes_per_cycle
400 1<<24 // rewrite_backref_bytes_per_cycle
// Constructor for JournalTrimmerImpl.
// NOTE(review): this definition is garbled in the extracted source — several
// parameter and member-initializer lines between the visible fragments are
// missing, so the code below is left byte-identical and only comments are
// added. Recover the full signature from async_cleaner.h before editing.
404 JournalTrimmerImpl::JournalTrimmerImpl(
405 BackrefManager
&backref_manager
,
// (missing source lines: additional constructor parameters)
408 device_off_t roll_start
,
409 device_off_t roll_size
)
// member-initializer list: keep the backref manager reference and the
// journal roll window [roll_start, roll_start + roll_size)
410 : backref_manager(backref_manager
),
// (missing source lines: additional member initializers)
413 roll_start(roll_start
),
414 roll_size(roll_size
),
// body: the roll window must start at a non-negative offset and be non-empty
418 ceph_assert(roll_start
>= 0);
419 ceph_assert(roll_size
> 0);
423 void JournalTrimmerImpl::set_journal_head(journal_seq_t head
)
425 LOG_PREFIX(JournalTrimmerImpl::set_journal_head
);
427 ceph_assert(head
!= JOURNAL_SEQ_NULL
);
428 ceph_assert(journal_head
== JOURNAL_SEQ_NULL
||
429 head
>= journal_head
);
430 ceph_assert(journal_alloc_tail
== JOURNAL_SEQ_NULL
||
431 head
>= journal_alloc_tail
);
432 ceph_assert(journal_dirty_tail
== JOURNAL_SEQ_NULL
||
433 head
>= journal_dirty_tail
);
435 std::swap(journal_head
, head
);
436 if (journal_head
.segment_seq
== head
.segment_seq
) {
437 DEBUG("journal_head {} => {}, {}",
438 head
, journal_head
, stat_printer_t
{*this, false});
440 INFO("journal_head {} => {}, {}",
441 head
, journal_head
, stat_printer_t
{*this, false});
443 background_callback
->maybe_wake_background();
446 void JournalTrimmerImpl::update_journal_tails(
447 journal_seq_t dirty_tail
,
448 journal_seq_t alloc_tail
)
450 LOG_PREFIX(JournalTrimmerImpl::update_journal_tails
);
452 if (dirty_tail
!= JOURNAL_SEQ_NULL
) {
453 ceph_assert(journal_head
== JOURNAL_SEQ_NULL
||
454 journal_head
>= dirty_tail
);
455 if (journal_dirty_tail
!= JOURNAL_SEQ_NULL
&&
456 journal_dirty_tail
> dirty_tail
) {
457 ERROR("journal_dirty_tail {} => {} is backwards!",
458 journal_dirty_tail
, dirty_tail
);
461 std::swap(journal_dirty_tail
, dirty_tail
);
462 if (journal_dirty_tail
.segment_seq
== dirty_tail
.segment_seq
) {
463 DEBUG("journal_dirty_tail {} => {}, {}",
464 dirty_tail
, journal_dirty_tail
, stat_printer_t
{*this, false});
466 INFO("journal_dirty_tail {} => {}, {}",
467 dirty_tail
, journal_dirty_tail
, stat_printer_t
{*this, false});
471 if (alloc_tail
!= JOURNAL_SEQ_NULL
) {
472 ceph_assert(journal_head
== JOURNAL_SEQ_NULL
||
473 journal_head
>= alloc_tail
);
474 if (journal_alloc_tail
!= JOURNAL_SEQ_NULL
&&
475 journal_alloc_tail
> alloc_tail
) {
476 ERROR("journal_alloc_tail {} => {} is backwards!",
477 journal_alloc_tail
, alloc_tail
);
480 std::swap(journal_alloc_tail
, alloc_tail
);
481 if (journal_alloc_tail
.segment_seq
== alloc_tail
.segment_seq
) {
482 DEBUG("journal_alloc_tail {} => {}, {}",
483 alloc_tail
, journal_alloc_tail
, stat_printer_t
{*this, false});
485 INFO("journal_alloc_tail {} => {}, {}",
486 alloc_tail
, journal_alloc_tail
, stat_printer_t
{*this, false});
490 background_callback
->maybe_wake_background();
491 background_callback
->maybe_wake_blocked_io();
494 journal_seq_t
JournalTrimmerImpl::get_tail_limit() const
496 assert(background_callback
->is_ready());
497 auto ret
= journal_head
.add_offset(
499 -static_cast<device_off_t
>(config
.max_journal_bytes
),
505 journal_seq_t
JournalTrimmerImpl::get_dirty_tail_target() const
507 assert(background_callback
->is_ready());
508 auto ret
= journal_head
.add_offset(
510 -static_cast<device_off_t
>(config
.target_journal_dirty_bytes
),
516 journal_seq_t
JournalTrimmerImpl::get_alloc_tail_target() const
518 assert(background_callback
->is_ready());
519 auto ret
= journal_head
.add_offset(
521 -static_cast<device_off_t
>(config
.target_journal_alloc_bytes
),
527 std::size_t JournalTrimmerImpl::get_dirty_journal_size() const
529 if (!background_callback
->is_ready()) {
532 auto ret
= journal_head
.relative_to(
537 ceph_assert(ret
>= 0);
538 return static_cast<std::size_t>(ret
);
541 std::size_t JournalTrimmerImpl::get_alloc_journal_size() const
543 if (!background_callback
->is_ready()) {
546 auto ret
= journal_head
.relative_to(
551 ceph_assert(ret
>= 0);
552 return static_cast<std::size_t>(ret
);
555 seastar::future
<> JournalTrimmerImpl::trim() {
556 return seastar::when_all(
558 if (should_trim_alloc()) {
561 crimson::ct_error::assert_all
{
562 "encountered invalid error in trim_alloc"
566 return seastar::now();
570 if (should_trim_dirty()) {
573 crimson::ct_error::assert_all
{
574 "encountered invalid error in trim_dirty"
578 return seastar::now();
584 JournalTrimmerImpl::trim_ertr::future
<>
585 JournalTrimmerImpl::trim_alloc()
587 LOG_PREFIX(JournalTrimmerImpl::trim_alloc
);
588 assert(background_callback
->is_ready());
589 return repeat_eagain([this, FNAME
] {
590 return extent_callback
->with_transaction_intr(
591 Transaction::src_t::TRIM_ALLOC
,
593 [this, FNAME
](auto &t
)
595 auto target
= get_alloc_tail_target();
596 DEBUGT("start, alloc_tail={}, target={}",
597 t
, journal_alloc_tail
, target
);
598 return backref_manager
.merge_cached_backrefs(
601 config
.rewrite_backref_bytes_per_cycle
602 ).si_then([this, FNAME
, &t
](auto trim_alloc_to
)
603 -> ExtentCallbackInterface::submit_transaction_direct_iertr::future
<>
605 DEBUGT("trim_alloc_to={}", t
, trim_alloc_to
);
606 if (trim_alloc_to
!= JOURNAL_SEQ_NULL
) {
607 return extent_callback
->submit_transaction_direct(
608 t
, std::make_optional
<journal_seq_t
>(trim_alloc_to
));
610 return seastar::now();
613 }).safe_then([this, FNAME
] {
614 DEBUG("finish, alloc_tail={}", journal_alloc_tail
);
618 JournalTrimmerImpl::trim_ertr::future
<>
619 JournalTrimmerImpl::trim_dirty()
621 LOG_PREFIX(JournalTrimmerImpl::trim_dirty
);
622 assert(background_callback
->is_ready());
623 return repeat_eagain([this, FNAME
] {
624 return extent_callback
->with_transaction_intr(
625 Transaction::src_t::TRIM_DIRTY
,
627 [this, FNAME
](auto &t
)
629 auto target
= get_dirty_tail_target();
630 DEBUGT("start, dirty_tail={}, target={}",
631 t
, journal_dirty_tail
, target
);
632 return extent_callback
->get_next_dirty_extents(
635 config
.rewrite_dirty_bytes_per_cycle
636 ).si_then([this, FNAME
, &t
](auto dirty_list
) {
637 DEBUGT("rewrite {} dirty extents", t
, dirty_list
.size());
638 return seastar::do_with(
639 std::move(dirty_list
),
640 [this, &t
](auto &dirty_list
)
642 return trans_intr::do_for_each(
644 [this, &t
](auto &e
) {
645 return extent_callback
->rewrite_extent(
646 t
, e
, INIT_GENERATION
, NULL_TIME
);
649 }).si_then([this, &t
] {
650 return extent_callback
->submit_transaction_direct(t
);
653 }).safe_then([this, FNAME
] {
654 DEBUG("finish, dirty_tail={}", journal_dirty_tail
);
658 void JournalTrimmerImpl::register_metrics()
660 namespace sm
= seastar::metrics
;
661 metrics
.add_group("journal_trimmer", {
662 sm::make_counter("dirty_journal_bytes",
663 [this] { return get_dirty_journal_size(); },
664 sm::description("the size of the journal for dirty extents")),
665 sm::make_counter("alloc_journal_bytes",
666 [this] { return get_alloc_journal_size(); },
667 sm::description("the size of the journal for alloc info"))
671 std::ostream
&operator<<(
672 std::ostream
&os
, const JournalTrimmerImpl::stat_printer_t
&stats
)
674 os
<< "JournalTrimmer(";
675 if (stats
.trimmer
.background_callback
->is_ready()) {
676 os
<< "should_block_io_on_trim=" << stats
.trimmer
.should_block_io_on_trim()
677 << ", should_(trim_dirty=" << stats
.trimmer
.should_trim_dirty()
678 << ", trim_alloc=" << stats
.trimmer
.should_trim_alloc() << ")";
682 if (stats
.detailed
) {
683 os
<< ", journal_head=" << stats
.trimmer
.get_journal_head()
684 << ", alloc_tail=" << stats
.trimmer
.get_alloc_tail()
685 << ", dirty_tail=" << stats
.trimmer
.get_dirty_tail();
686 if (stats
.trimmer
.background_callback
->is_ready()) {
687 os
<< ", alloc_tail_target=" << stats
.trimmer
.get_alloc_tail_target()
688 << ", dirty_tail_target=" << stats
.trimmer
.get_dirty_tail_target()
689 << ", tail_limit=" << stats
.trimmer
.get_tail_limit();
696 bool SpaceTrackerSimple::equals(const SpaceTrackerI
&_other
) const
698 LOG_PREFIX(SpaceTrackerSimple::equals
);
699 const auto &other
= static_cast<const SpaceTrackerSimple
&>(_other
);
701 if (other
.live_bytes_by_segment
.size() != live_bytes_by_segment
.size()) {
702 ERROR("different segment counts, bug in test");
703 assert(0 == "segment counts should match");
707 bool all_match
= true;
708 for (auto i
= live_bytes_by_segment
.begin(), j
= other
.live_bytes_by_segment
.begin();
709 i
!= live_bytes_by_segment
.end(); ++i
, ++j
) {
710 if (i
->second
.live_bytes
!= j
->second
.live_bytes
) {
712 DEBUG("segment_id {} live bytes mismatch *this: {}, other: {}",
713 i
->first
, i
->second
.live_bytes
, j
->second
.live_bytes
);
719 int64_t SpaceTrackerDetailed::SegmentMap::allocate(
720 device_segment_id_t segment
,
721 segment_off_t offset
,
723 const extent_len_t block_size
)
725 LOG_PREFIX(SegmentMap::allocate
);
726 assert(offset
% block_size
== 0);
727 assert(len
% block_size
== 0);
729 const auto b
= (offset
/ block_size
);
730 const auto e
= (offset
+ len
) / block_size
;
733 for (auto i
= b
; i
< e
; ++i
) {
736 ERROR("found allocated in {}, {} ~ {}", segment
, offset
, len
);
739 DEBUG("block {} allocated", i
* block_size
);
743 return update_usage(len
);
746 int64_t SpaceTrackerDetailed::SegmentMap::release(
747 device_segment_id_t segment
,
748 segment_off_t offset
,
750 const extent_len_t block_size
)
752 LOG_PREFIX(SegmentMap::release
);
753 assert(offset
% block_size
== 0);
754 assert(len
% block_size
== 0);
756 const auto b
= (offset
/ block_size
);
757 const auto e
= (offset
+ len
) / block_size
;
760 for (auto i
= b
; i
< e
; ++i
) {
763 ERROR("found unallocated in {}, {} ~ {}", segment
, offset
, len
);
766 DEBUG("block {} unallocated", i
* block_size
);
770 return update_usage(-(int64_t)len
);
773 bool SpaceTrackerDetailed::equals(const SpaceTrackerI
&_other
) const
775 LOG_PREFIX(SpaceTrackerDetailed::equals
);
776 const auto &other
= static_cast<const SpaceTrackerDetailed
&>(_other
);
778 if (other
.segment_usage
.size() != segment_usage
.size()) {
779 ERROR("different segment counts, bug in test");
780 assert(0 == "segment counts should match");
784 bool all_match
= true;
785 for (auto i
= segment_usage
.begin(), j
= other
.segment_usage
.begin();
786 i
!= segment_usage
.end(); ++i
, ++j
) {
787 if (i
->second
.get_usage() != j
->second
.get_usage()) {
789 ERROR("segment_id {} live bytes mismatch *this: {}, other: {}",
790 i
->first
, i
->second
.get_usage(), j
->second
.get_usage());
796 void SpaceTrackerDetailed::SegmentMap::dump_usage(extent_len_t block_size
) const
798 LOG_PREFIX(SegmentMap::dump_usage
);
800 for (unsigned i
= 0; i
< bitmap
.size(); ++i
) {
802 LOCAL_LOGGER
.info(" {} still live", i
* block_size
);
807 void SpaceTrackerDetailed::dump_usage(segment_id_t id
) const
809 LOG_PREFIX(SpaceTrackerDetailed::dump_usage
);
811 segment_usage
[id
].dump_usage(
812 block_size_by_segment_manager
[id
.device_id()]);
815 void SpaceTrackerSimple::dump_usage(segment_id_t id
) const
817 LOG_PREFIX(SpaceTrackerSimple::dump_usage
);
818 INFO("id: {}, live_bytes: {}",
819 id
, live_bytes_by_segment
[id
].live_bytes
);
822 std::ostream
&operator<<(
823 std::ostream
&os
, const AsyncCleaner::stat_printer_t
&stats
)
825 stats
.cleaner
.print(os
, stats
.detailed
);
// Constructor for SegmentCleaner.
// NOTE(review): this definition is garbled in the extracted source — several
// parameter and member-initializer lines between the visible fragments are
// missing, so the code below is left byte-identical and only comments are
// added. Recover the full signature from async_cleaner.h before editing.
829 SegmentCleaner::SegmentCleaner(
// (missing source line: leading constructor parameter(s))
831 SegmentManagerGroupRef
&& sm_group
,
832 BackrefManager
&backref_manager
,
833 SegmentSeqAllocator
&segment_seq_allocator
,
// (missing source lines: trailing constructor parameter(s))
// member-initializer list: record the detailed-tracking flag, take ownership
// of the segment manager group, and keep references to the backref manager
// and the out-of-line segment sequence allocator
836 : detailed(detailed
),
// (missing source lines: additional member initializers)
839 sm_group(std::move(sm_group
)),
840 backref_manager(backref_manager
),
841 ool_segment_seq_allocator(segment_seq_allocator
)
846 void SegmentCleaner::register_metrics()
848 namespace sm
= seastar::metrics
;
849 stats
.segment_util
.buckets
.resize(UTIL_BUCKETS
);
851 for (i
= 0; i
< UTIL_BUCKETS
; ++i
) {
852 stats
.segment_util
.buckets
[i
].upper_bound
= ((double)(i
+ 1)) / 10;
853 stats
.segment_util
.buckets
[i
].count
= 0;
855 // NOTE: by default the segments are empty
856 i
= get_bucket_index(UTIL_STATE_EMPTY
);
857 stats
.segment_util
.buckets
[i
].count
= segments
.get_num_segments();
861 prefix
.append("cold_");
863 prefix
.append("segment_cleaner");
865 metrics
.add_group(prefix
, {
866 sm::make_counter("segments_number",
867 [this] { return segments
.get_num_segments(); },
868 sm::description("the number of segments")),
869 sm::make_counter("segment_size",
870 [this] { return segments
.get_segment_size(); },
871 sm::description("the bytes of a segment")),
872 sm::make_counter("segments_in_journal",
873 [this] { return get_segments_in_journal(); },
874 sm::description("the number of segments in journal")),
875 sm::make_counter("segments_type_journal",
876 [this] { return segments
.get_num_type_journal(); },
877 sm::description("the number of segments typed journal")),
878 sm::make_counter("segments_type_ool",
879 [this] { return segments
.get_num_type_ool(); },
880 sm::description("the number of segments typed out-of-line")),
881 sm::make_counter("segments_open",
882 [this] { return segments
.get_num_open(); },
883 sm::description("the number of open segments")),
884 sm::make_counter("segments_empty",
885 [this] { return segments
.get_num_empty(); },
886 sm::description("the number of empty segments")),
887 sm::make_counter("segments_closed",
888 [this] { return segments
.get_num_closed(); },
889 sm::description("the number of closed segments")),
891 sm::make_counter("segments_count_open_journal",
892 [this] { return segments
.get_count_open_journal(); },
893 sm::description("the count of open journal segment operations")),
894 sm::make_counter("segments_count_open_ool",
895 [this] { return segments
.get_count_open_ool(); },
896 sm::description("the count of open ool segment operations")),
897 sm::make_counter("segments_count_release_journal",
898 [this] { return segments
.get_count_release_journal(); },
899 sm::description("the count of release journal segment operations")),
900 sm::make_counter("segments_count_release_ool",
901 [this] { return segments
.get_count_release_ool(); },
902 sm::description("the count of release ool segment operations")),
903 sm::make_counter("segments_count_close_journal",
904 [this] { return segments
.get_count_close_journal(); },
905 sm::description("the count of close journal segment operations")),
906 sm::make_counter("segments_count_close_ool",
907 [this] { return segments
.get_count_close_ool(); },
908 sm::description("the count of close ool segment operations")),
910 sm::make_counter("total_bytes",
911 [this] { return segments
.get_total_bytes(); },
912 sm::description("the size of the space")),
913 sm::make_counter("available_bytes",
914 [this] { return segments
.get_available_bytes(); },
915 sm::description("the size of the space is available")),
916 sm::make_counter("unavailable_unreclaimable_bytes",
917 [this] { return get_unavailable_unreclaimable_bytes(); },
918 sm::description("the size of the space is unavailable and unreclaimable")),
919 sm::make_counter("unavailable_reclaimable_bytes",
920 [this] { return get_unavailable_reclaimable_bytes(); },
921 sm::description("the size of the space is unavailable and reclaimable")),
922 sm::make_counter("used_bytes", stats
.used_bytes
,
923 sm::description("the size of the space occupied by live extents")),
924 sm::make_counter("unavailable_unused_bytes",
925 [this] { return get_unavailable_unused_bytes(); },
926 sm::description("the size of the space is unavailable and not alive")),
928 sm::make_counter("projected_count", stats
.projected_count
,
929 sm::description("the number of projected usage reservations")),
930 sm::make_counter("projected_used_bytes_sum", stats
.projected_used_bytes_sum
,
931 sm::description("the sum of the projected usage in bytes")),
933 sm::make_counter("reclaimed_bytes", stats
.reclaimed_bytes
,
934 sm::description("rewritten bytes due to reclaim")),
935 sm::make_counter("reclaimed_segment_bytes", stats
.reclaimed_segment_bytes
,
936 sm::description("rewritten bytes due to reclaim")),
937 sm::make_counter("closed_journal_used_bytes", stats
.closed_journal_used_bytes
,
938 sm::description("used bytes when close a journal segment")),
939 sm::make_counter("closed_journal_total_bytes", stats
.closed_journal_total_bytes
,
940 sm::description("total bytes of closed journal segments")),
941 sm::make_counter("closed_ool_used_bytes", stats
.closed_ool_used_bytes
,
942 sm::description("used bytes when close a ool segment")),
943 sm::make_counter("closed_ool_total_bytes", stats
.closed_ool_total_bytes
,
944 sm::description("total bytes of closed ool segments")),
946 sm::make_gauge("available_ratio",
947 [this] { return segments
.get_available_ratio(); },
948 sm::description("ratio of available space to total space")),
949 sm::make_gauge("reclaim_ratio",
950 [this] { return get_reclaim_ratio(); },
951 sm::description("ratio of reclaimable space to unavailable space")),
953 sm::make_histogram("segment_utilization_distribution",
954 [this]() -> seastar::metrics::histogram
& {
955 return stats
.segment_util
;
957 sm::description("utilization distribution of all segments"))
961 segment_id_t
SegmentCleaner::allocate_segment(
964 data_category_t category
,
965 rewrite_gen_t generation
)
967 LOG_PREFIX(SegmentCleaner::allocate_segment
);
968 assert(seq
!= NULL_SEG_SEQ
);
969 ceph_assert(type
== segment_type_t::OOL
||
970 trimmer
!= nullptr); // segment_type_t::JOURNAL
971 for (auto it
= segments
.begin();
972 it
!= segments
.end();
974 auto seg_id
= it
->first
;
975 auto& segment_info
= it
->second
;
976 if (segment_info
.is_empty()) {
977 auto old_usage
= calc_utilization(seg_id
);
978 segments
.mark_open(seg_id
, seq
, type
, category
, generation
);
979 background_callback
->maybe_wake_background();
980 auto new_usage
= calc_utilization(seg_id
);
981 adjust_segment_util(old_usage
, new_usage
);
982 INFO("opened, {}", stat_printer_t
{*this, false});
986 ERROR("out of space with {} {} {} {}",
987 type
, segment_seq_printer_t
{seq
}, category
,
988 rewrite_gen_printer_t
{generation
});
989 ceph_abort("seastore device size setting is too small");
993 void SegmentCleaner::close_segment(segment_id_t segment
)
995 LOG_PREFIX(SegmentCleaner::close_segment
);
996 auto old_usage
= calc_utilization(segment
);
997 segments
.mark_closed(segment
);
998 auto &seg_info
= segments
[segment
];
999 if (seg_info
.type
== segment_type_t::JOURNAL
) {
1000 stats
.closed_journal_used_bytes
+= space_tracker
->get_usage(segment
);
1001 stats
.closed_journal_total_bytes
+= segments
.get_segment_size();
1003 stats
.closed_ool_used_bytes
+= space_tracker
->get_usage(segment
);
1004 stats
.closed_ool_total_bytes
+= segments
.get_segment_size();
1006 auto new_usage
= calc_utilization(segment
);
1007 adjust_segment_util(old_usage
, new_usage
);
1008 INFO("closed, {} -- {}", stat_printer_t
{*this, false}, seg_info
);
1011 double SegmentCleaner::calc_gc_benefit_cost(
1013 const sea_time_point
&now_time
,
1014 const sea_time_point
&bound_time
) const
1016 double util
= calc_utilization(id
);
1017 ceph_assert(util
>= 0 && util
< 1);
1018 if constexpr (gc_formula
== gc_formula_t::GREEDY
) {
1022 if constexpr (gc_formula
== gc_formula_t::COST_BENEFIT
) {
1024 return std::numeric_limits
<double>::max();
1026 auto modify_time
= segments
[id
].modify_time
;
1027 double age_segment
= modify_time
.time_since_epoch().count();
1028 double age_now
= now_time
.time_since_epoch().count();
1029 if (likely(age_now
> age_segment
)) {
1030 return (1 - util
) * (age_now
- age_segment
) / (2 * util
);
1033 return (1 - util
) / (2 * util
);
1037 assert(gc_formula
== gc_formula_t::BENEFIT
);
1038 auto modify_time
= segments
[id
].modify_time
;
1039 double age_factor
= 0.5; // middle value if age is invalid
1040 if (likely(bound_time
!= NULL_TIME
&&
1041 modify_time
!= NULL_TIME
&&
1042 now_time
> modify_time
)) {
1043 assert(modify_time
>= bound_time
);
1044 double age_bound
= bound_time
.time_since_epoch().count();
1045 double age_now
= now_time
.time_since_epoch().count();
1046 double age_segment
= modify_time
.time_since_epoch().count();
1047 age_factor
= (age_now
- age_segment
) / (age_now
- age_bound
);
1049 return ((1 - 2 * age_factor
) * util
* util
+
1050 (2 * age_factor
- 2) * util
+ 1);
// do_reclaim_space: run one reclaim cycle over the current
// reclaim_state range. Merges the caller-supplied backref pins with
// the cached backref entries for [start_pos, end_pos), fetches the
// extents that are still live via get_extents_if_live, and rewrites
// them to reclaim_state->target_generation inside a CLEANER
// transaction that is retried on eagain (repeat_eagain).
// NOTE(review): extraction-mangled text (identifiers split across
// lines, original line numbers embedded, some lines missing); code is
// kept byte-identical, comments only.
1053 SegmentCleaner::do_reclaim_space_ret
1054 SegmentCleaner::do_reclaim_space(
1055 const std::vector
<CachedExtentRef
> &backref_extents
,
1056 const backref_pin_list_t
&pin_list
,
1057 std::size_t &reclaimed
,
// The whole cycle is retried on eagain; captured references must
// outlive all retries (they belong to the caller's do_with scope).
1060 return repeat_eagain([this, &backref_extents
,
1061 &pin_list
, &reclaimed
, &runs
] {
1064 auto src
= Transaction::src_t::CLEANER_MAIN
;
// presumably switched to CLEANER_COLD for a cold-tier cleaner --
// the guarding condition is not visible here; TODO confirm.
1066 src
= Transaction::src_t::CLEANER_COLD
;
1068 return extent_callback
->with_transaction_intr(
1070 "clean_reclaim_space",
1071 [this, &backref_extents
, &pin_list
, &reclaimed
](auto &t
)
1073 return seastar::do_with(
// Work on a copy of the caller's backref extents so retries see a
// fresh list.
1074 std::vector
<CachedExtentRef
>(backref_extents
),
1075 [this, &t
, &reclaimed
, &pin_list
](auto &extents
)
1077 LOG_PREFIX(SegmentCleaner::do_reclaim_space
);
1078 // calculate live extents
1079 auto cached_backref_entries
=
1080 backref_manager
.get_cached_backref_entries_in_range(
1081 reclaim_state
->start_pos
, reclaim_state
->end_pos
);
1082 backref_entry_query_set_t backref_entries
;
1083 for (auto &pin
: pin_list
) {
1084 backref_entries
.emplace(
// A cached entry with laddr == L_ADDR_NULL acts as a tombstone:
// it removes the matching entry from the query set; otherwise the
// cached entry supersedes/extends the set.
1091 for (auto &cached_backref
: cached_backref_entries
) {
1092 if (cached_backref
.laddr
== L_ADDR_NULL
) {
1093 auto it
= backref_entries
.find(cached_backref
.paddr
);
1094 assert(it
->len
== cached_backref
.len
);
1095 backref_entries
.erase(it
);
1097 backref_entries
.emplace(cached_backref
);
1100 // retrieve live extents
1101 DEBUGT("start, backref_entries={}, backref_extents={}",
1102 t
, backref_entries
.size(), extents
.size());
1103 return seastar::do_with(
1104 std::move(backref_entries
),
1105 [this, &extents
, &t
](auto &backref_entries
) {
// Fetch live extents for all entries in parallel; dead addresses
// are skipped, live ones are collected into `extents`.
1106 return trans_intr::parallel_for_each(
1108 [this, &extents
, &t
](auto &ent
)
1110 LOG_PREFIX(SegmentCleaner::do_reclaim_space
);
1111 TRACET("getting extent of type {} at {}~{}",
1116 return extent_callback
->get_extents_if_live(
1117 t
, ent
.type
, ent
.paddr
, ent
.laddr
, ent
.len
1118 ).si_then([FNAME
, &extents
, &ent
, &t
](auto list
) {
1120 TRACET("addr {} dead, skipping", t
, ent
.paddr
);
1122 for (auto &e
: list
) {
1123 extents
.emplace_back(std::move(e
));
1128 }).si_then([FNAME
, &extents
, this, &reclaimed
, &t
] {
1129 DEBUGT("reclaim {} extents", t
, extents
.size());
1130 // rewrite live extents
// Preserve the segment's modify_time on the rewritten extents so
// age-based GC heuristics stay meaningful.
1131 auto modify_time
= segments
[reclaim_state
->get_segment_id()].modify_time
;
1132 return trans_intr::do_for_each(
1134 [this, modify_time
, &t
, &reclaimed
](auto ext
)
1136 reclaimed
+= ext
->get_length();
1137 return extent_callback
->rewrite_extent(
1138 t
, ext
, reclaim_state
->target_generation
, modify_time
);
1141 }).si_then([this, &t
] {
1142 return extent_callback
->submit_transaction_direct(t
);
// clean_space: top-level reclaim driver. Picks the next segment to
// reclaim (if no reclaim is in flight), advances the per-cycle reclaim
// window, reads the live backref mappings/extents for that window in a
// weak READ transaction, then hands them to do_reclaim_space. When the
// whole segment has been processed it releases the segment and marks
// it empty.
// NOTE(review): extraction-mangled text (identifiers split across
// lines, original line numbers embedded, some lines missing); code is
// kept byte-identical, comments only.
1148 SegmentCleaner::clean_space_ret
SegmentCleaner::clean_space()
1150 LOG_PREFIX(SegmentCleaner::clean_space
);
1151 assert(background_callback
->is_ready());
1152 ceph_assert(can_clean_space());
// No reclaim in progress: select the next victim segment and start a
// new reclaim_state for it.
1153 if (!reclaim_state
) {
1154 segment_id_t seg_id
= get_next_reclaim_segment();
1155 auto &segment_info
= segments
[seg_id
];
1156 INFO("reclaim {} {} start, usage={}, time_bound={}",
1157 seg_id
, segment_info
,
1158 space_tracker
->calc_utilization(seg_id
),
1159 sea_time_point_printer_t
{segments
.get_time_bound()});
1160 ceph_assert(segment_info
.is_closed());
1161 reclaim_state
= reclaim_state_t::create(
1162 seg_id
, segment_info
.generation
, segments
.get_segment_size());
// Advance the window by the configured per-cycle budget.
1164 reclaim_state
->advance(config
.reclaim_bytes_per_cycle
);
1166 DEBUG("reclaiming {} {}~{}",
1167 rewrite_gen_printer_t
{reclaim_state
->generation
},
1168 reclaim_state
->start_pos
,
1169 reclaim_state
->end_pos
);
1170 double pavail_ratio
= get_projected_available_ratio();
1171 sea_time_point start
= seastar::lowres_system_clock::now();
1173 // Backref-tree doesn't support tree-read during tree-updates with parallel
1174 // transactions. So, concurrent transactions between trim and reclaim are
1175 // not allowed right now.
1176 return seastar::do_with(
// weak_read_ret holds (backref extents, backref pins) gathered under
// a weak READ transaction, retried on eagain.
1177 std::pair
<std::vector
<CachedExtentRef
>, backref_pin_list_t
>(),
1178 [this](auto &weak_read_ret
) {
1179 return repeat_eagain([this, &weak_read_ret
] {
1180 return extent_callback
->with_transaction_intr(
1181 Transaction::src_t::READ
,
1182 "retrieve_from_backref_tree",
1183 [this, &weak_read_ret
](auto &t
) {
1184 return backref_manager
.get_mappings(
1186 reclaim_state
->start_pos
,
1187 reclaim_state
->end_pos
1188 ).si_then([this, &t
, &weak_read_ret
](auto pin_list
) {
1189 if (!pin_list
.empty()) {
1190 auto it
= pin_list
.begin();
1191 auto &first_pin
= *it
;
1192 if (first_pin
->get_key() < reclaim_state
->start_pos
) {
1193 // BackrefManager::get_mappings may include a entry before
1194 // reclaim_state->start_pos, which is semantically inconsistent
1195 // with the requirements of the cleaner
1199 return backref_manager
.retrieve_backref_extents_in_range(
1201 reclaim_state
->start_pos
,
1202 reclaim_state
->end_pos
1203 ).si_then([pin_list
=std::move(pin_list
),
1204 &weak_read_ret
](auto extents
) mutable {
1205 weak_read_ret
= std::make_pair(std::move(extents
), std::move(pin_list
));
1209 }).safe_then([&weak_read_ret
] {
1210 return std::move(weak_read_ret
);
1212 }).safe_then([this, FNAME
, pavail_ratio
, start
](auto weak_read_ret
) {
1213 return seastar::do_with(
1214 std::move(weak_read_ret
.first
),
1215 std::move(weak_read_ret
.second
),
1218 [this, FNAME
, pavail_ratio
, start
](
1219 auto &backref_extents
, auto &pin_list
, auto &reclaimed
, auto &runs
)
1221 return do_reclaim_space(
1226 ).safe_then([this, FNAME
, pavail_ratio
, start
, &reclaimed
, &runs
] {
1227 stats
.reclaiming_bytes
+= reclaimed
;
1228 auto d
= seastar::lowres_system_clock::now() - start
;
1229 DEBUG("duration: {}, pavail_ratio before: {}, repeats: {}",
1230 d
, pavail_ratio
, runs
);
// Whole segment processed: account the totals, drop reclaim_state
// and release the segment back to the segment manager group.
1231 if (reclaim_state
->is_complete()) {
1232 auto segment_to_release
= reclaim_state
->get_segment_id();
1233 INFO("reclaim {} finish, reclaimed alive/total={}",
1235 stats
.reclaiming_bytes
/(double)segments
.get_segment_size());
1236 stats
.reclaimed_bytes
+= stats
.reclaiming_bytes
;
1237 stats
.reclaimed_segment_bytes
+= segments
.get_segment_size();
1238 stats
.reclaiming_bytes
= 0;
1239 reclaim_state
.reset();
1240 return sm_group
->release_segment(segment_to_release
1242 clean_space_ertr::pass_further
{},
1243 crimson::ct_error::assert_all
{
1244 "SegmentCleaner::clean_space encountered invalid error in release_segment"
1246 ).safe_then([this, FNAME
, segment_to_release
] {
// After release the segment must have no live bytes left; a non-zero
// usage indicates a space-accounting bug.
1247 auto old_usage
= calc_utilization(segment_to_release
);
1248 if(unlikely(old_usage
!= 0)) {
1249 space_tracker
->dump_usage(segment_to_release
);
1250 ERROR("segment {} old_usage {} != 0",
1251 segment_to_release
, old_usage
);
1254 segments
.mark_empty(segment_to_release
);
1255 auto new_usage
= calc_utilization(segment_to_release
);
1256 adjust_segment_util(old_usage
, new_usage
);
1257 INFO("released {}, {}",
1258 segment_to_release
, stat_printer_t
{*this, false});
// Freed space may unblock throttled client IO.
1259 background_callback
->maybe_wake_blocked_io();
1262 return clean_space_ertr::now();
// mount: (re)build the cleaner's view of all segments at startup.
// Registers every segment manager, then for each segment reads the
// header (and tail when present) to recover its seq/type/category/
// generation and modify-time statistics. Segments whose tail is
// missing or whose nonce mismatches are recovered by scanning records
// (scan_no_tail_segment). enoent/enodata on the header are treated as
// an unused segment.
// NOTE(review): extraction-mangled text (identifiers split across
// lines, original line numbers embedded, some lines missing); code is
// kept byte-identical, comments only.
1269 SegmentCleaner::mount_ret
SegmentCleaner::mount()
1271 LOG_PREFIX(SegmentCleaner::mount
);
1272 const auto& sms
= sm_group
->get_segment_managers();
1273 INFO("{} segment managers", sms
.size());
1275 assert(background_callback
->get_state() == state_t::MOUNT
);
// Choose detailed vs simple tracking -- the selecting condition is
// not visible in this chunk; presumably the `detailed` flag.
1277 space_tracker
.reset(
1279 (SpaceTrackerI
*)new SpaceTrackerDetailed(
1281 (SpaceTrackerI
*)new SpaceTrackerSimple(
1285 for (auto sm
: sms
) {
1286 segments
.add_segment_manager(*sm
);
1288 segments
.assign_ids();
1294 INFO("{} segments", segments
.get_num_segments());
1295 return crimson::do_for_each(
1298 [this, FNAME
](auto& it
)
1300 auto segment_id
= it
.first
;
1301 return sm_group
->read_segment_header(
1303 ).safe_then([segment_id
, this, FNAME
](auto header
) {
1304 DEBUG("segment_id={} -- {}", segment_id
, header
);
1305 auto s_type
= header
.get_type();
1306 if (s_type
== segment_type_t::NULL_SEG
) {
1307 ERROR("got null segment, segment_id={} -- {}", segment_id
, header
);
1310 return sm_group
->read_segment_tail(
1312 ).safe_then([this, FNAME
, segment_id
, header
](auto tail
)
1313 -> scan_extents_ertr::future
<> {
// Nonce mismatch means the tail belongs to an older lifetime of
// this segment; fall back to scanning the records.
1314 if (tail
.segment_nonce
!= header
.segment_nonce
) {
1315 return scan_no_tail_segment(header
, segment_id
);
1317 ceph_assert(header
.get_type() == tail
.get_type());
1319 sea_time_point modify_time
= mod_to_timepoint(tail
.modify_time
);
1320 std::size_t num_extents
= tail
.num_extents
;
// modify_time and num_extents must be consistent: both unset or
// both set.
1321 if ((modify_time
== NULL_TIME
&& num_extents
== 0) ||
1322 (modify_time
!= NULL_TIME
&& num_extents
!= 0)) {
1323 segments
.update_modify_time(segment_id
, modify_time
, num_extents
);
1325 ERROR("illegal modify time {}", tail
);
1326 return crimson::ct_error::input_output_error::make();
1329 init_mark_segment_closed(
1335 return seastar::now();
// Missing tail (enodata): recover by scanning the segment records.
1337 crimson::ct_error::enodata::handle(
1338 [this, header
, segment_id
](auto) {
1339 return scan_no_tail_segment(header
, segment_id
);
1341 crimson::ct_error::pass_further_all
{}
// Missing header (enoent/enodata): segment was never written; skip.
1344 crimson::ct_error::enoent::handle([](auto) {
1345 return mount_ertr::now();
1347 crimson::ct_error::enodata::handle([](auto) {
1348 return mount_ertr::now();
1350 crimson::ct_error::input_output_error::pass_further
{},
1351 crimson::ct_error::assert_all
{"unexpected error"}
1353 }).safe_then([this, FNAME
] {
1354 INFO("done, {}", segments
);
// scan_no_tail_segment: recovery path for a segment whose tail is
// missing or stale. Scans the segment's valid record groups, decodes
// each group's record headers, folds their modify_time/extent counts
// into the per-segment stats, and finally marks the segment closed
// using the information from the header.
// @param segment_header  header already read for this segment
// @param segment_id      segment being scanned
// NOTE(review): extraction-mangled text (identifiers split across
// lines, original line numbers embedded, some lines missing); code is
// kept byte-identical, comments only.
1358 SegmentCleaner::scan_extents_ret
SegmentCleaner::scan_no_tail_segment(
1359 const segment_header_t
&segment_header
,
1360 segment_id_t segment_id
)
1362 LOG_PREFIX(SegmentCleaner::scan_no_tail_segment
);
1363 INFO("scan {} {}", segment_id
, segment_header
);
1364 return seastar::do_with(
// Cursor starts at offset 0 of the segment with its recorded seq.
1365 scan_valid_records_cursor({
1366 segments
[segment_id
].seq
,
1367 paddr_t::make_seg_paddr(segment_id
, 0)
1369 SegmentManagerGroup::found_record_handler_t(
1370 [this, segment_id
, segment_header
, FNAME
](
1371 record_locator_t locator
,
1372 const record_group_header_t
&record_group_header
,
1373 const bufferlist
& mdbuf
1374 ) mutable -> SegmentManagerGroup::scan_valid_records_ertr::future
<>
1376 DEBUG("{} {}, decoding {} records",
1377 segment_id
, segment_header
.get_type(), record_group_header
.records
);
1379 auto maybe_headers
= try_decode_record_headers(
1380 record_group_header
, mdbuf
);
1381 if (!maybe_headers
) {
1382 // This should be impossible, we did check the crc on the mdbuf
1383 ERROR("unable to decode record headers for record group {}",
1384 locator
.record_block_base
);
1385 return crimson::ct_error::input_output_error::make();
1388 for (auto &record_header
: *maybe_headers
) {
1389 auto modify_time
= mod_to_timepoint(record_header
.modify_time
);
// A record with extents must carry a valid modify_time; extents==0
// is always acceptable.
1390 if (record_header
.extents
== 0 || modify_time
!= NULL_TIME
) {
1391 segments
.update_modify_time(
1392 segment_id
, modify_time
, record_header
.extents
);
1394 ERROR("illegal modify time {}", record_header
);
1395 return crimson::ct_error::input_output_error::make();
1398 return seastar::now();
1400 [this, segment_header
](auto &cursor
, auto &handler
)
1402 return sm_group
->scan_valid_records(
1404 segment_header
.segment_nonce
,
1405 segments
.get_segment_size(),
1406 handler
).discard_result();
1407 }).safe_then([this, segment_id
, segment_header
] {
// Scanning done: close the segment with the header's metadata.
1408 init_mark_segment_closed(
1410 segment_header
.segment_seq
,
1411 segment_header
.type
,
1412 segment_header
.category
,
1413 segment_header
.generation
);
// check_usage: consistency check. Rebuilds an empty space tracker by
// scanning the whole mapped space via the backref manager (in a weak
// transaction) and compares it against the live space_tracker.
// Returns true when both trackers agree.
// NOTE(review): extraction-mangled text (identifiers split across
// lines, original line numbers embedded, some lines missing); code is
// kept byte-identical, comments only.
1417 bool SegmentCleaner::check_usage()
1419 SpaceTrackerIRef
tracker(space_tracker
->make_empty());
1420 extent_callback
->with_transaction_weak(
1422 [this, &tracker
](auto &t
) {
1423 return backref_manager
.scan_mapped_space(
1427 paddr_t backref_key
,
1429 extent_types_t type
,
// Only segment-backed addresses are accounted by this cleaner.
1432 if (paddr
.get_addr_type() == paddr_types_t::SEGMENT
) {
// Backref tree nodes: physical-only entries keyed by paddr.
1433 if (is_backref_node(type
)) {
1434 assert(laddr
== L_ADDR_NULL
);
1435 assert(backref_key
!= P_ADDR_NULL
);
1437 paddr
.as_seg_paddr().get_segment_id(),
1438 paddr
.as_seg_paddr().get_segment_off(),
// laddr == L_ADDR_NULL for a non-backref-node entry -- presumably a
// physical/retired entry; the allocate/release call on these lines
// is not visible in this chunk.
1440 } else if (laddr
== L_ADDR_NULL
) {
1441 assert(backref_key
== P_ADDR_NULL
);
1443 paddr
.as_seg_paddr().get_segment_id(),
1444 paddr
.as_seg_paddr().get_segment_off(),
// Regular logical extents.
1447 assert(backref_key
== P_ADDR_NULL
);
1449 paddr
.as_seg_paddr().get_segment_id(),
1450 paddr
.as_seg_paddr().get_segment_off(),
1456 return space_tracker
->equals(*tracker
);
// mark_space_used: account a newly allocated extent at addr/len.
// Updates used_bytes, the per-segment space tracker and the
// utilization histogram, then pokes the background cleaner since
// utilization grew. Non-segment addresses are ignored.
// NOTE(review): extraction-mangled text (identifiers split across
// lines, original line numbers embedded, some lines missing); code is
// kept byte-identical, comments only.
1459 void SegmentCleaner::mark_space_used(
1463 LOG_PREFIX(SegmentCleaner::mark_space_used
);
1464 assert(background_callback
->get_state() >= state_t::SCAN_SPACE
);
// Only segment-backed addresses are tracked by this cleaner.
1467 if (addr
.get_addr_type() != paddr_types_t::SEGMENT
) {
1471 auto& seg_addr
= addr
.as_seg_paddr();
1472 stats
.used_bytes
+= len
;
// Capture utilization before/after so the histogram bucket can be
// moved by adjust_segment_util.
1473 auto old_usage
= calc_utilization(seg_addr
.get_segment_id());
1474 [[maybe_unused
]] auto ret
= space_tracker
->allocate(
1475 seg_addr
.get_segment_id(),
1476 seg_addr
.get_segment_off(),
1478 auto new_usage
= calc_utilization(seg_addr
.get_segment_id());
1479 adjust_segment_util(old_usage
, new_usage
);
// More used space may make background cleaning necessary.
1481 background_callback
->maybe_wake_background();
1483 DEBUG("segment {} new len: {}~{}, live_bytes: {}",
1484 seg_addr
.get_segment_id(),
1487 space_tracker
->get_usage(seg_addr
.get_segment_id()));
// mark_space_free: mirror of mark_space_used -- account the release
// of an extent at addr/len. Updates used_bytes, the space tracker and
// the utilization histogram, then wakes IO that may have been blocked
// on space. Non-segment addresses are ignored.
// NOTE(review): extraction-mangled text (identifiers split across
// lines, original line numbers embedded, some lines missing); code is
// kept byte-identical, comments only.
1490 void SegmentCleaner::mark_space_free(
1494 LOG_PREFIX(SegmentCleaner::mark_space_free
);
1495 assert(background_callback
->get_state() >= state_t::SCAN_SPACE
);
// Only segment-backed addresses are tracked by this cleaner.
1498 if (addr
.get_addr_type() != paddr_types_t::SEGMENT
) {
1502 ceph_assert(stats
.used_bytes
>= len
);
1503 stats
.used_bytes
-= len
;
1504 auto& seg_addr
= addr
.as_seg_paddr();
1506 DEBUG("segment {} free len: {}~{}",
1507 seg_addr
.get_segment_id(), addr
, len
);
// Capture utilization before/after so the histogram bucket can be
// moved by adjust_segment_util.
1508 auto old_usage
= calc_utilization(seg_addr
.get_segment_id());
1509 [[maybe_unused
]] auto ret
= space_tracker
->release(
1510 seg_addr
.get_segment_id(),
1511 seg_addr
.get_segment_off(),
1513 auto new_usage
= calc_utilization(seg_addr
.get_segment_id());
1514 adjust_segment_util(old_usage
, new_usage
);
// Freed space may unblock throttled client IO.
1515 background_callback
->maybe_wake_blocked_io();
1517 DEBUG("segment {} free len: {}~{}, live_bytes: {}",
1518 seg_addr
.get_segment_id(),
1521 space_tracker
->get_usage(seg_addr
.get_segment_id()));
// get_next_reclaim_segment: pick the closed, non-journal segment with
// the highest benefit/cost score according to the compile-time
// gc_formula (GREEDY / COST_BENEFIT / BENEFIT). Aborts if no
// reclaimable segment exists -- callers must have checked
// should_clean_space() first.
// NOTE(review): extraction-mangled text (identifiers split across
// lines, original line numbers embedded, some lines missing); code is
// kept byte-identical, comments only.
1524 segment_id_t
SegmentCleaner::get_next_reclaim_segment() const
1526 LOG_PREFIX(SegmentCleaner::get_next_reclaim_segment
);
1527 segment_id_t id
= NULL_SEG_ID
;
1528 double max_benefit_cost
= 0;
// GREEDY ignores age, so the current time is only sampled for the
// age-aware formulas.
1529 sea_time_point now_time
;
1530 if constexpr (gc_formula
!= gc_formula_t::GREEDY
) {
1531 now_time
= seastar::lowres_system_clock::now();
1533 now_time
= NULL_TIME
;
// BENEFIT additionally needs the oldest modify-time bound across
// segments; it may legitimately be NULL_TIME early on.
1535 sea_time_point bound_time
;
1536 if constexpr (gc_formula
== gc_formula_t::BENEFIT
) {
1537 bound_time
= segments
.get_time_bound();
1538 if (bound_time
== NULL_TIME
) {
1539 WARN("BENEFIT -- bound_time is NULL_TIME");
1542 bound_time
= NULL_TIME
;
// Scan all segments; only closed segments outside the journal tail
// are candidates.
1544 for (auto& [_id
, segment_info
] : segments
) {
1545 if (segment_info
.is_closed() &&
1546 (trimmer
== nullptr ||
1547 !segment_info
.is_in_journal(trimmer
->get_journal_tail()))) {
1548 double benefit_cost
= calc_gc_benefit_cost(_id
, now_time
, bound_time
);
1549 if (benefit_cost
> max_benefit_cost
) {
1551 max_benefit_cost
= benefit_cost
;
1555 if (id
!= NULL_SEG_ID
) {
1556 DEBUG("segment {}, benefit_cost {}",
1557 id
, max_benefit_cost
);
// No candidate found: this is only reachable when the caller's
// should_clean_space() precondition was violated.
1560 ceph_assert(get_segments_reclaimable() == 0);
1561 // see should_clean_space()
1562 ceph_abort("impossible!");
// try_reserve_projected_usage: optimistically add projected_usage to
// the projected used-bytes counter; if that would require blocking IO
// on cleaning, roll the reservation back (and presumably return
// false -- the return statements fall on lines missing from this
// chunk; TODO confirm). On success the projection statistics are
// updated.
// NOTE(review): extraction-mangled text (identifiers split across
// lines, original line numbers embedded, some lines missing); code is
// kept byte-identical, comments only.
1567 bool SegmentCleaner::try_reserve_projected_usage(std::size_t projected_usage
)
1569 assert(background_callback
->is_ready());
1570 stats
.projected_used_bytes
+= projected_usage
;
1571 if (should_block_io_on_clean()) {
1572 stats
.projected_used_bytes
-= projected_usage
;
1575 ++stats
.projected_count
;
1576 stats
.projected_used_bytes_sum
+= stats
.projected_used_bytes
;
1581 void SegmentCleaner::release_projected_usage(std::size_t projected_usage
)
1583 assert(background_callback
->is_ready());
1584 ceph_assert(stats
.projected_used_bytes
>= projected_usage
);
1585 stats
.projected_used_bytes
-= projected_usage
;
1586 background_callback
->maybe_wake_blocked_io();
// print: stream a human-readable summary of the cleaner state used by
// operator<< / logging. Ready-state-only fields (blocking decisions,
// ratios) are emitted only when the background machinery is ready.
// NOTE(review): extraction-mangled text (identifiers split across
// lines, original line numbers embedded, some lines missing;
// is_detailed handling is not visible here); code is kept
// byte-identical, comments only.
1589 void SegmentCleaner::print(std::ostream
&os
, bool is_detailed
) const
1591 os
<< "SegmentCleaner(";
1592 if (background_callback
->is_ready()) {
1593 os
<< "should_block_io_on_clean=" << should_block_io_on_clean()
1594 << ", should_clean=" << should_clean_space();
1598 os
<< ", projected_avail_ratio=" << get_projected_available_ratio()
1599 << ", reclaim_ratio=" << get_reclaim_ratio()
1600 << ", alive_ratio=" << get_alive_ratio();
1602 os
<< ", unavailable_unreclaimable="
1603 << get_unavailable_unreclaimable_bytes() << "B"
1604 << ", unavailable_reclaimble="
1605 << get_unavailable_reclaimable_bytes() << "B"
1606 << ", alive=" << stats
.used_bytes
<< "B"
1607 << ", " << segments
;
// RBMCleaner constructor: takes ownership of the RBM device group and
// keeps a reference to the shared backref manager. A `detailed`
// parameter is initialized in the member-init list but its declaration
// line is missing from this chunk -- presumably `bool detailed`; TODO
// confirm.
// NOTE(review): extraction-mangled text; code kept byte-identical,
// comments only.
1612 RBMCleaner::RBMCleaner(
1613 RBMDeviceGroupRef
&& rb_group
,
1614 BackrefManager
&backref_manager
,
1616 : detailed(detailed
),
1617 rb_group(std::move(rb_group
)),
1618 backref_manager(backref_manager
)
// print: summary output for RBMCleaner. The body (original lines
// 1622-1626) is missing from this chunk, so its content cannot be
// documented here.
// NOTE(review): extraction-mangled text; code kept byte-identical,
// comments only.
1621 void RBMCleaner::print(std::ostream
&os
, bool is_detailed
) const
// mark_space_used: account an allocation on the RBM device that owns
// addr. Finds the RBM whose device id matches and whose managed range
// starts at or before addr, bumps used_bytes and forwards to the
// RBM's own mark_space_used.
// NOTE(review): extraction-mangled text (identifiers split across
// lines, original line numbers embedded, some lines missing); code is
// kept byte-identical, comments only.
1627 void RBMCleaner::mark_space_used(
1631 LOG_PREFIX(RBMCleaner::mark_space_used
);
1632 assert(addr
.get_addr_type() == paddr_types_t::RANDOM_BLOCK
);
1633 auto rbms
= rb_group
->get_rb_managers();
1634 for (auto rbm
: rbms
) {
1635 if (addr
.get_device_id() == rbm
->get_device_id()) {
1636 if (rbm
->get_start() <= addr
) {
1637 INFO("allocate addr: {} len: {}", addr
, len
);
1638 stats
.used_bytes
+= len
;
1639 rbm
->mark_space_used(addr
, len
);
// mark_space_free: mirror of mark_space_used -- account a release on
// the RBM device that owns addr, decrementing used_bytes and
// forwarding to the RBM's mark_space_free.
// NOTE(review): extraction-mangled text (identifiers split across
// lines, original line numbers embedded, some lines missing); code is
// kept byte-identical, comments only.
1646 void RBMCleaner::mark_space_free(
1650 LOG_PREFIX(RBMCleaner::mark_space_free
);
1651 assert(addr
.get_addr_type() == paddr_types_t::RANDOM_BLOCK
);
1652 auto rbms
= rb_group
->get_rb_managers();
1653 for (auto rbm
: rbms
) {
1654 if (addr
.get_device_id() == rbm
->get_device_id()) {
1655 if (rbm
->get_start() <= addr
) {
1656 INFO("free addr: {} len: {}", addr
, len
);
1657 ceph_assert(stats
.used_bytes
>= len
);
1658 stats
.used_bytes
-= len
;
1659 rbm
->mark_space_free(addr
, len
);
// commit_space_used: finalize a previously reserved allocation on the
// owning RBM device by calling complete_allocation on the matching
// manager.
// NOTE(review): extraction-mangled text (identifiers split across
// lines, original line numbers embedded, the trailing lines of the
// function are missing); code is kept byte-identical, comments only.
1666 void RBMCleaner::commit_space_used(paddr_t addr
, extent_len_t len
)
1668 auto rbms
= rb_group
->get_rb_managers();
1669 for (auto rbm
: rbms
) {
1670 if (addr
.get_device_id() == rbm
->get_device_id()) {
1671 if (rbm
->get_start() <= addr
) {
1672 rbm
->complete_allocation(addr
, len
);
// try_reserve_projected_usage: add projected_usage to the projected
// used-bytes counter. Unlike the SegmentCleaner variant there is no
// visible rollback path; the return statement falls on a line missing
// from this chunk -- presumably `return true;`, TODO confirm.
// NOTE(review): extraction-mangled text; code kept byte-identical,
// comments only.
1679 bool RBMCleaner::try_reserve_projected_usage(std::size_t projected_usage
)
1681 assert(background_callback
->is_ready());
1682 stats
.projected_used_bytes
+= projected_usage
;
1686 void RBMCleaner::release_projected_usage(std::size_t projected_usage
)
1688 assert(background_callback
->is_ready());
1689 ceph_assert(stats
.projected_used_bytes
>= projected_usage
);
1690 stats
.projected_used_bytes
-= projected_usage
;
1691 background_callback
->maybe_wake_blocked_io();
1694 RBMCleaner::clean_space_ret
RBMCleaner::clean_space()
1697 return clean_space_ertr::now();
// mount: open every RBM manager in the device group at startup,
// iterating them with crimson::do_for_each. IO errors are passed
// further; any other error aborts. Most of the body (original lines
// 1701-1712, 1716-1720) is missing from this chunk.
// NOTE(review): extraction-mangled text; code kept byte-identical,
// comments only.
1700 RBMCleaner::mount_ret
RBMCleaner::mount()
1704 return seastar::do_with(
1705 rb_group
->get_rb_managers(),
1707 return crimson::do_for_each(
1713 crimson::ct_error::input_output_error::pass_further(),
1714 crimson::ct_error::assert_all
{
1715 "Invalid error when opening RBM"}
// check_usage: consistency check for RBM devices. Rebuilds an
// RBMSpaceTracker from a full scan_mapped_space pass (in a weak
// transaction), attributing each paddr to the RBM with a matching
// device id, then compares it against the live tracking via equals().
// NOTE(review): extraction-mangled text (identifiers split across
// lines, original line numbers embedded, some lines missing); code is
// kept byte-identical, comments only.
1721 bool RBMCleaner::check_usage()
1724 const auto& rbms
= rb_group
->get_rb_managers();
1725 RBMSpaceTracker
tracker(rbms
);
1726 extent_callback
->with_transaction_weak(
1728 [this, &tracker
, &rbms
](auto &t
) {
1729 return backref_manager
.scan_mapped_space(
1733 paddr_t backref_key
,
1735 extent_types_t type
,
1738 for (auto rbm
: rbms
) {
1739 if (rbm
->get_device_id() == paddr
.get_device_id()) {
// Backref tree nodes: physical-only entries keyed by paddr.
1740 if (is_backref_node(type
)) {
1741 assert(laddr
== L_ADDR_NULL
);
1742 assert(backref_key
!= P_ADDR_NULL
);
// laddr == L_ADDR_NULL for a non-backref-node entry -- the tracker
// update on the missing lines is not visible in this chunk.
1746 } else if (laddr
== L_ADDR_NULL
) {
1747 assert(backref_key
== P_ADDR_NULL
);
// Regular logical extents.
1752 assert(backref_key
== P_ADDR_NULL
);
1761 return equals(tracker
);
// equals: compare a freshly built RBMSpaceTracker against the actual
// per-block extent states reported by the RBM allocator. Every block
// recorded used must be ALLOCATED on the device and every unused block
// must be FREE or RESERVED; mismatches are logged and flip all_match.
// NOTE(review): extraction-mangled text (identifiers split across
// lines, original line numbers embedded, some lines missing -- e.g.
// the loop over rbms and the final return); code is kept
// byte-identical, comments only.
1764 bool RBMCleaner::equals(const RBMSpaceTracker
&_other
) const
1766 LOG_PREFIX(RBMSpaceTracker::equals
);
1767 const auto &other
= static_cast<const RBMSpaceTracker
&>(_other
);
1768 auto rbs
= rb_group
->get_rb_managers();
1769 //TODO: multiple rbm allocator
// Sanity check: tracker granularity must match the device's block
// count.
1773 if (rbm
->get_device()->get_available_size() / rbm
->get_block_size()
1774 != other
.block_usage
.size()) {
1775 assert(0 == "block counts should match");
1778 bool all_match
= true;
1779 for (auto i
= other
.block_usage
.begin();
1780 i
!= other
.block_usage
.end(); ++i
) {
// Blocks before the RBM's managed start offset are not compared.
1781 if (i
->first
< rbm
->get_start().as_blk_paddr().get_device_off()) {
1784 auto addr
= i
->first
;
1785 auto state
= rbm
->get_extent_state(
1786 convert_abs_addr_to_paddr(addr
, rbm
->get_device_id()),
1787 rbm
->get_block_size());
// Agreement: used <-> ALLOCATED, unused <-> FREE or RESERVED.
1788 if ((i
->second
.used
&& state
== rbm_extent_state_t::ALLOCATED
) ||
1789 (!i
->second
.used
&& (state
== rbm_extent_state_t::FREE
||
1790 state
== rbm_extent_state_t::RESERVED
))) {
1794 ERROR("block addr {} mismatch other used: {}",
1795 addr
, i
->second
.used
);
1801 void RBMCleaner::register_metrics()
1803 namespace sm
= seastar::metrics
;
1805 metrics
.add_group("rbm_cleaner", {
1806 sm::make_counter("total_bytes",
1807 [this] { return get_total_bytes(); },
1808 sm::description("the size of the space")),
1809 sm::make_counter("available_bytes",
1810 [this] { return get_total_bytes() - get_journal_bytes() - stats
.used_bytes
; },
1811 sm::description("the size of the space is available")),
1812 sm::make_counter("used_bytes", stats
.used_bytes
,
1813 sm::description("the size of the space occupied by live extents")),