// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
// vim: ts=8 sw=2 smarttab expandtab

#pragma once

#include "seastar/core/gate.hh"

#include "crimson/os/seastore/async_cleaner.h"
#include "crimson/os/seastore/cached_extent.h"
#include "crimson/os/seastore/journal/segment_allocator.h"
#include "crimson/os/seastore/journal/record_submitter.h"
#include "crimson/os/seastore/transaction.h"
#include "crimson/os/seastore/random_block_manager.h"
#include "crimson/os/seastore/random_block_manager/block_rb_manager.h"
#include "crimson/os/seastore/randomblock_manager_group.h"

class transaction_manager_test_t;

namespace crimson::os::seastore {

/**
 * ExtentOolWriter
 *
 * Writes extents out-of-line and allocates their physical addresses.
 * Different writers write extents to different locations.
 */
class ExtentOolWriter {
  using base_ertr = crimson::errorator<
    crimson::ct_error::input_output_error>;
public:
  virtual ~ExtentOolWriter() {}

  using open_ertr = base_ertr;
  virtual open_ertr::future<> open() = 0;
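
  // Reserve a paddr for an extent of the given length.  Depending on the
  // writer this is either a real device address (see RandomBlockOolWriter)
  // or a temporary delayed address that is resolved when the extent is
  // actually written (see SegmentedOolWriter).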
  virtual paddr_t alloc_paddr(extent_len_t length) = 0;

  using alloc_write_ertr = base_ertr;
  using alloc_write_iertr = trans_iertr<alloc_write_ertr>;
  virtual alloc_write_iertr::future<> alloc_write_ool_extents(
    Transaction &t,
    std::list<LogicalCachedExtentRef> &extents) = 0;

  using close_ertr = base_ertr;
  virtual close_ertr::future<> close() = 0;
};
using ExtentOolWriterRef = std::unique_ptr<ExtentOolWriter>;

/**
 * SegmentedOolWriter
 *
 * Different writers write extents to different out-of-line segments
 * provided by the SegmentProvider.
 */
class SegmentedOolWriter : public ExtentOolWriter {
public:
  SegmentedOolWriter(data_category_t category,
                     rewrite_gen_t gen,
                     SegmentProvider &sp,
                     SegmentSeqAllocator &ssa);

  open_ertr::future<> open() final {
    return record_submitter.open(false).discard_result();
  }

  alloc_write_iertr::future<> alloc_write_ool_extents(
    Transaction &t,
    std::list<LogicalCachedExtentRef> &extents) final;

  close_ertr::future<> close() final {
    return write_guard.close().then([this] {
      return record_submitter.close();
    }).safe_then([this] {
      write_guard = seastar::gate();
    });
  }

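  // Segmented backends defer the final placement decision to write time,
  // so only a temporary delayed paddr is handed out here; the real address
  // is assigned when alloc_write_ool_extents submits the record.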
  paddr_t alloc_paddr(extent_len_t length) final {
    return make_delayed_temp_paddr(0);
  }

private:
  alloc_write_iertr::future<> do_write(
    Transaction& t,
    std::list<LogicalCachedExtentRef> &extent);

  alloc_write_ertr::future<> write_record(
    Transaction& t,
    record_t&& record,
    std::list<LogicalCachedExtentRef> &&extents,
    bool with_atomic_roll_segment=false);

  journal::SegmentAllocator segment_allocator;
  journal::RecordSubmitter record_submitter;
  seastar::gate write_guard;
};


class RandomBlockOolWriter : public ExtentOolWriter {
public:
  RandomBlockOolWriter(RBMCleaner* rb_cleaner) :
    rb_cleaner(rb_cleaner) {}

  using open_ertr = ExtentOolWriter::open_ertr;
  open_ertr::future<> open() final {
    return open_ertr::now();
  }

  alloc_write_iertr::future<> alloc_write_ool_extents(
    Transaction &t,
    std::list<LogicalCachedExtentRef> &extents) final;

  close_ertr::future<> close() final {
    return write_guard.close().then([this] {
      write_guard = seastar::gate();
      return close_ertr::now();
    });
  }

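  // In contrast to the segmented writer, RBM backends allocate the final
  // device address up front from the cleaner's allocators.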
  paddr_t alloc_paddr(extent_len_t length) final {
    assert(rb_cleaner);
    return rb_cleaner->alloc_paddr(length);
  }

private:
  alloc_write_iertr::future<> do_write(
    Transaction& t,
    std::list<LogicalCachedExtentRef> &extent);

  RBMCleaner* rb_cleaner;
  seastar::gate write_guard;
};

struct cleaner_usage_t {
  // The size of all extents written to the main devices, including both
  // inline and out-of-line extents.
  std::size_t main_usage = 0;
  // The size of extents written to the cold devices.
  std::size_t cold_ool_usage = 0;
};

struct reserve_cleaner_result_t {
  bool reserve_main_success = true;
  bool reserve_cold_success = true;

  bool is_successful() const {
    return reserve_main_success &&
      reserve_cold_success;
  }
};

/**
 * io_usage_t
 *
 * io_usage_t describes the space usage consumed by client IO.
 */
struct io_usage_t {
  // The total size of all inlined extents, not including deltas and other
  // metadata produced by Cache::prepare_record.
  std::size_t inline_usage = 0;
  cleaner_usage_t cleaner_usage;
  friend std::ostream &operator<<(std::ostream &out, const io_usage_t &usage) {
    return out << "io_usage_t("
               << "inline_usage=" << usage.inline_usage
               << ", main_cleaner_usage=" << usage.cleaner_usage.main_usage
               << ", cold_cleaner_usage=" << usage.cleaner_usage.cold_ool_usage
               << ")";
  }
};

struct reserve_io_result_t {
  bool reserve_inline_success = true;
  reserve_cleaner_result_t cleaner_result;

  bool is_successful() const {
    return reserve_inline_success &&
      cleaner_result.is_successful();
  }
};

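/**
 * ExtentPlacementManager
 *
 * Routes new extents either inline into the journal or to the
 * per-category, per-generation out-of-line writers, and hosts the
 * background process (journal trimming and space cleaning) that keeps
 * the devices usable.
 */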
class ExtentPlacementManager {
public:
  ExtentPlacementManager()
    : ool_segment_seq_allocator(
      std::make_unique<SegmentSeqAllocator>(segment_type_t::OOL))
  {
    devices_by_id.resize(DEVICE_ID_MAX, nullptr);
  }

  void init(JournalTrimmerImplRef &&, AsyncCleanerRef &&, AsyncCleanerRef &&);

  SegmentSeqAllocator &get_ool_segment_seq_allocator() const {
    return *ool_segment_seq_allocator;
  }

  void set_primary_device(Device *device);

  void set_extent_callback(ExtentCallbackInterface *cb) {
    background_process.set_extent_callback(cb);
  }

  journal_type_t get_journal_type() const {
    return background_process.get_journal_type();
  }

  extent_len_t get_block_size() const {
    assert(primary_device != nullptr);
    // assume all the devices have the same block size
    return primary_device->get_block_size();
  }

  Device& get_primary_device() {
    assert(primary_device != nullptr);
    return *primary_device;
  }

  store_statfs_t get_stat() const {
    return background_process.get_stat();
  }

  using mount_ertr = crimson::errorator<
    crimson::ct_error::input_output_error>;
  using mount_ret = mount_ertr::future<>;
  mount_ret mount() {
    return background_process.mount();
  }

  using open_ertr = ExtentOolWriter::open_ertr;
  open_ertr::future<> open_for_write();

  void start_scan_space() {
    return background_process.start_scan_space();
  }

  void start_background() {
    return background_process.start_background();
  }

  struct alloc_result_t {
    paddr_t paddr;
    bufferptr bp;
    rewrite_gen_t gen;
  };
  alloc_result_t alloc_new_extent(
    Transaction& t,
    extent_types_t type,
    extent_len_t length,
    placement_hint_t hint,
#ifdef UNIT_TESTS_BUILT
    rewrite_gen_t gen,
    std::optional<paddr_t> external_paddr = std::nullopt
#else
    rewrite_gen_t gen
#endif
  ) {
    assert(hint < placement_hint_t::NUM_HINTS);
    assert(is_target_rewrite_generation(gen));
    assert(gen == INIT_GENERATION || hint == placement_hint_t::REWRITE);

    data_category_t category = get_extent_category(type);
    gen = adjust_generation(category, type, hint, gen);

    // XXX: bp might be extended to point to different memory (e.g. PMem)
    // according to the allocator.
    auto bp = ceph::bufferptr(
      buffer::create_page_aligned(length));
    bp.zero();
    paddr_t addr;
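    // Address selection: extents bound for the journal get a temporary
    // record-relative paddr; everything else gets a paddr from the
    // data/metadata writer matching the adjusted generation.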
#ifdef UNIT_TESTS_BUILT
    if (unlikely(external_paddr.has_value())) {
      assert(external_paddr->is_fake());
      addr = *external_paddr;
    } else if (gen == INLINE_GENERATION) {
#else
    if (gen == INLINE_GENERATION) {
#endif
      addr = make_record_relative_paddr(0);
    } else if (category == data_category_t::DATA) {
      assert(data_writers_by_gen[generation_to_writer(gen)]);
      addr = data_writers_by_gen[
        generation_to_writer(gen)]->alloc_paddr(length);
    } else {
      assert(category == data_category_t::METADATA);
      assert(md_writers_by_gen[generation_to_writer(gen)]);
      addr = md_writers_by_gen[
        generation_to_writer(gen)]->alloc_paddr(length);
    }
    return {addr, std::move(bp), gen};
  }

  /**
   * dispatch_result_t
   *
   * Out-of-line extents are placed in alloc_map and passed to
   * EPM::write_delayed_ool_extents; delayed_extents is used to update the
   * LBA mapping; usage is used to reserve projected space.
   */
  using extents_by_writer_t =
    std::map<ExtentOolWriter*, std::list<LogicalCachedExtentRef>>;
  struct dispatch_result_t {
    extents_by_writer_t alloc_map;
    std::list<LogicalCachedExtentRef> delayed_extents;
    io_usage_t usage;
  };

  /**
   * dispatch_delayed_extents
   *
   * Performs delayed allocation
   */
  dispatch_result_t dispatch_delayed_extents(Transaction& t);

  /**
   * write_delayed_ool_extents
   *
   * Do writes for out-of-line extents.
   */
  using alloc_paddr_iertr = ExtentOolWriter::alloc_write_iertr;
  alloc_paddr_iertr::future<> write_delayed_ool_extents(
    Transaction& t,
    extents_by_writer_t& alloc_map);
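  //
  // A rough sketch of the intended delayed-write sequence (illustrative
  // only; the actual call sites live outside this header):
  //
  //   auto result = epm.dispatch_delayed_extents(t);
  //   ... reserve_projected_usage(result.usage) ...
  //   ... write_delayed_ool_extents(t, result.alloc_map) ...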

  /**
   * write_preallocated_ool_extents
   *
   * Performs ool writes for extents with pre-allocated addresses.
   * See Transaction::pre_alloc_list
   */
  alloc_paddr_iertr::future<> write_preallocated_ool_extents(
    Transaction &t,
    std::list<LogicalCachedExtentRef> extents);

  seastar::future<> stop_background() {
    return background_process.stop_background();
  }

  using close_ertr = ExtentOolWriter::close_ertr;
  close_ertr::future<> close();

  using read_ertr = Device::read_ertr;
  read_ertr::future<> read(
    paddr_t addr,
    size_t len,
    ceph::bufferptr &out
  ) {
    assert(devices_by_id[addr.get_device_id()] != nullptr);
    return devices_by_id[addr.get_device_id()]->read(addr, len, out);
  }

  void mark_space_used(paddr_t addr, extent_len_t len) {
    background_process.mark_space_used(addr, len);
  }

  void mark_space_free(paddr_t addr, extent_len_t len) {
    background_process.mark_space_free(addr, len);
  }

  void commit_space_used(paddr_t addr, extent_len_t len) {
    return background_process.commit_space_used(addr, len);
  }

  seastar::future<> reserve_projected_usage(io_usage_t usage) {
    return background_process.reserve_projected_usage(usage);
  }

  void release_projected_usage(const io_usage_t &usage) {
    background_process.release_projected_usage(usage);
  }

  backend_type_t get_main_backend_type() const {
    if (!background_process.is_no_background()) {
      return background_process.get_main_backend_type();
    }
    // for test
    assert(primary_device);
    return primary_device->get_backend_type();
  }

  // Testing interfaces

  void test_init_no_background(Device *test_device) {
    assert(test_device->get_backend_type() == backend_type_t::SEGMENTED);
    add_device(test_device);
    set_primary_device(test_device);
  }

  bool check_usage() {
    return background_process.check_usage();
  }

  seastar::future<> run_background_work_until_halt() {
    return background_process.run_until_halt();
  }

private:
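  // Map the caller's placement hint and initial generation to the concrete
  // target generation: ROOT (and, with SEGMENTED backends, lba/backref
  // nodes) stays inline, COLD hints go to the cold tier when one exists,
  // and the result is clamped to dynamic_max_rewrite_generation.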
  rewrite_gen_t adjust_generation(
    data_category_t category,
    extent_types_t type,
    placement_hint_t hint,
    rewrite_gen_t gen) {
    if (type == extent_types_t::ROOT) {
      gen = INLINE_GENERATION;
    } else if (get_main_backend_type() == backend_type_t::SEGMENTED &&
               is_lba_backref_node(type)) {
      gen = INLINE_GENERATION;
    } else if (hint == placement_hint_t::COLD) {
      assert(gen == INIT_GENERATION);
      if (background_process.has_cold_tier()) {
        gen = MIN_COLD_GENERATION;
      } else {
        gen = MIN_REWRITE_GENERATION;
      }
    } else if (gen == INIT_GENERATION) {
      if (category == data_category_t::METADATA) {
        if (get_main_backend_type() == backend_type_t::SEGMENTED) {
          // with SEGMENTED, default to keeping metadata extents inline to
          // reduce padding overhead.
          // TODO: improve padding so we can default to the ool path.
          gen = INLINE_GENERATION;
        } else {
          // with RBM, all extents must be OOL
          assert(get_main_backend_type() ==
                 backend_type_t::RANDOM_BLOCK);
          gen = OOL_GENERATION;
        }
      } else {
        assert(category == data_category_t::DATA);
        gen = OOL_GENERATION;
      }
    } else if (background_process.has_cold_tier()) {
      gen = background_process.adjust_generation(gen);
    }

    if (gen > dynamic_max_rewrite_generation) {
      gen = dynamic_max_rewrite_generation;
    }

    return gen;
  }

  void add_device(Device *device) {
    auto device_id = device->get_device_id();
    ceph_assert(devices_by_id[device_id] == nullptr);
    devices_by_id[device_id] = device;
    ++num_devices;
  }

  /**
   * dispatch_delayed_extent
   *
   * Decides whether the extent should be written inline or out-of-line;
   * returns true for inline, false for ool.
   */
  bool dispatch_delayed_extent(LogicalCachedExtentRef& extent) {
    // TODO: all delayed extents are ool currently
    boost::ignore_unused(extent);
    return false;
  }

  ExtentOolWriter* get_writer(placement_hint_t hint,
                              data_category_t category,
                              rewrite_gen_t gen) {
    assert(hint < placement_hint_t::NUM_HINTS);
    assert(is_rewrite_generation(gen));
    assert(gen != INLINE_GENERATION);
    assert(gen <= dynamic_max_rewrite_generation);
    if (category == data_category_t::DATA) {
      return data_writers_by_gen[generation_to_writer(gen)];
    } else {
      assert(category == data_category_t::METADATA);
      return md_writers_by_gen[generation_to_writer(gen)];
    }
  }

  /**
   * BackgroundProcess
   *
   * Background process to schedule background transactions.
   *
   * TODO: device tiering
   */
  class BackgroundProcess : public BackgroundListener {
  public:
    BackgroundProcess() = default;

    void init(JournalTrimmerImplRef &&_trimmer,
              AsyncCleanerRef &&_cleaner,
              AsyncCleanerRef &&_cold_cleaner) {
      trimmer = std::move(_trimmer);
      trimmer->set_background_callback(this);
      main_cleaner = std::move(_cleaner);
      main_cleaner->set_background_callback(this);
      if (_cold_cleaner) {
        cold_cleaner = std::move(_cold_cleaner);
        cold_cleaner->set_background_callback(this);

        cleaners_by_device_id.resize(DEVICE_ID_MAX, nullptr);
        for (auto id : main_cleaner->get_device_ids()) {
          cleaners_by_device_id[id] = main_cleaner.get();
        }
        for (auto id : cold_cleaner->get_device_ids()) {
          cleaners_by_device_id[id] = cold_cleaner.get();
        }

        eviction_state.init(
          crimson::common::get_conf<double>(
            "seastore_multiple_tiers_stop_evict_ratio"),
          crimson::common::get_conf<double>(
            "seastore_multiple_tiers_default_evict_ratio"),
          crimson::common::get_conf<double>(
            "seastore_multiple_tiers_fast_evict_ratio"));
      }
    }

    journal_type_t get_journal_type() const {
      return trimmer->get_journal_type();
    }

    bool has_cold_tier() const {
      return cold_cleaner.get() != nullptr;
    }

    void set_extent_callback(ExtentCallbackInterface *cb) {
      trimmer->set_extent_callback(cb);
      main_cleaner->set_extent_callback(cb);
      if (has_cold_tier()) {
        cold_cleaner->set_extent_callback(cb);
      }
    }

    store_statfs_t get_stat() const {
      auto stat = main_cleaner->get_stat();
      if (has_cold_tier()) {
        stat.add(cold_cleaner->get_stat());
      }
      return stat;
    }

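    // Lifecycle: STOP -> MOUNT -> SCAN_SPACE -> RUNNING, driven by mount(),
    // start_scan_space() and start_background(); note that the space
    // accounting calls below are no-ops until SCAN_SPACE is reached.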
    using mount_ret = ExtentPlacementManager::mount_ret;
    mount_ret mount() {
      ceph_assert(state == state_t::STOP);
      state = state_t::MOUNT;
      trimmer->reset();
      stats = {};
      register_metrics();
      return main_cleaner->mount(
      ).safe_then([this] {
        return has_cold_tier() ? cold_cleaner->mount() : mount_ertr::now();
      });
    }

    void start_scan_space() {
      ceph_assert(state == state_t::MOUNT);
      state = state_t::SCAN_SPACE;
      ceph_assert(main_cleaner->check_usage_is_empty());
      ceph_assert(!has_cold_tier() ||
                  cold_cleaner->check_usage_is_empty());
    }

    void start_background();

    void mark_space_used(paddr_t addr, extent_len_t len) {
      if (state < state_t::SCAN_SPACE) {
        return;
      }

      if (!has_cold_tier()) {
        assert(main_cleaner);
        main_cleaner->mark_space_used(addr, len);
      } else {
        auto id = addr.get_device_id();
        assert(id < cleaners_by_device_id.size());
        auto cleaner = cleaners_by_device_id[id];
        assert(cleaner);
        cleaner->mark_space_used(addr, len);
      }
    }

    void mark_space_free(paddr_t addr, extent_len_t len) {
      if (state < state_t::SCAN_SPACE) {
        return;
      }

      if (!has_cold_tier()) {
        assert(main_cleaner);
        main_cleaner->mark_space_free(addr, len);
      } else {
        auto id = addr.get_device_id();
        assert(id < cleaners_by_device_id.size());
        auto cleaner = cleaners_by_device_id[id];
        assert(cleaner);
        cleaner->mark_space_free(addr, len);
      }
    }

    void commit_space_used(paddr_t addr, extent_len_t len) {
      if (state < state_t::SCAN_SPACE) {
        return;
      }

      if (!has_cold_tier()) {
        assert(main_cleaner);
        main_cleaner->commit_space_used(addr, len);
      } else {
        auto id = addr.get_device_id();
        assert(id < cleaners_by_device_id.size());
        auto cleaner = cleaners_by_device_id[id];
        assert(cleaner);
        cleaner->commit_space_used(addr, len);
      }
    }

    rewrite_gen_t adjust_generation(rewrite_gen_t gen) {
      if (has_cold_tier()) {
        return eviction_state.adjust_generation_with_eviction(gen);
      } else {
        return gen;
      }
    }

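    // Reserve projected space for a client transaction with the trimmer
    // (inline usage) and the cleaners (ool usage); see try_reserve_io() and
    // abort_io_usage() below for the underlying helpers.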
    seastar::future<> reserve_projected_usage(io_usage_t usage);

    void release_projected_usage(const io_usage_t &usage) {
      if (is_ready()) {
        trimmer->release_inline_usage(usage.inline_usage);
        main_cleaner->release_projected_usage(usage.cleaner_usage.main_usage);
        if (has_cold_tier()) {
          cold_cleaner->release_projected_usage(usage.cleaner_usage.cold_ool_usage);
        }
      }
    }

    seastar::future<> stop_background();
    backend_type_t get_main_backend_type() const {
      return get_journal_type();
    }

    // Testing interfaces

    bool check_usage() {
      return main_cleaner->check_usage() &&
        (!has_cold_tier() || cold_cleaner->check_usage());
    }

    seastar::future<> run_until_halt();

    bool is_no_background() const {
      return !trimmer || !main_cleaner;
    }

  protected:
    state_t get_state() const final {
      return state;
    }

    void maybe_wake_background() final {
      if (!is_running()) {
        return;
      }
      if (background_should_run()) {
        do_wake_background();
      }
    }

    void maybe_wake_blocked_io() final {
      if (!is_ready()) {
        return;
      }
      if (!should_block_io() && blocking_io) {
        blocking_io->set_value();
        blocking_io = std::nullopt;
      }
    }

  private:
    // reserve helpers
    bool try_reserve_cold(std::size_t usage);
    void abort_cold_usage(std::size_t usage, bool success);

    reserve_cleaner_result_t try_reserve_cleaner(const cleaner_usage_t &usage);
    void abort_cleaner_usage(const cleaner_usage_t &usage,
                             const reserve_cleaner_result_t &result);

    reserve_io_result_t try_reserve_io(const io_usage_t &usage);
    void abort_io_usage(const io_usage_t &usage,
                        const reserve_io_result_t &result);

    bool is_running() const {
      if (state == state_t::RUNNING) {
        assert(process_join);
        return true;
      } else {
        assert(!process_join);
        return false;
      }
    }

    void log_state(const char *caller) const;

    seastar::future<> run();

    void do_wake_background() {
      if (blocking_background) {
        blocking_background->set_value();
        blocking_background = std::nullopt;
      }
    }

    // background_should_run() should be atomic with do_background_cycle()
    // to make sure the condition is consistent.
    bool background_should_run() {
      assert(is_ready());
      maybe_update_eviction_mode();
      return main_cleaner_should_run()
        || cold_cleaner_should_run()
        || trimmer->should_trim();
    }

    bool main_cleaner_should_run() const {
      assert(is_ready());
      return main_cleaner->should_clean_space() ||
        (has_cold_tier() &&
         main_cleaner->can_clean_space() &&
         eviction_state.is_fast_mode());
    }

    bool cold_cleaner_should_run() const {
      assert(is_ready());
      return has_cold_tier() &&
        cold_cleaner->should_clean_space();
    }

    bool should_block_io() const {
      assert(is_ready());
      return trimmer->should_block_io_on_trim() ||
        main_cleaner->should_block_io_on_clean() ||
        (has_cold_tier() &&
         cold_cleaner->should_block_io_on_clean());
    }

    void maybe_update_eviction_mode() {
      if (has_cold_tier()) {
        auto main_alive_ratio = main_cleaner->get_stat().get_used_raw_ratio();
        eviction_state.maybe_update_eviction_mode(main_alive_ratio);
      }
    }

    struct eviction_state_t {
      enum class eviction_mode_t {
        STOP,    // generations greater than or equal to MIN_COLD_GENERATION
                 // are set to MIN_COLD_GENERATION - 1, which means no
                 // extents will be evicted.
        DEFAULT, // generation is incremented with each rewrite; extents
                 // are evicted once their generation reaches
                 // MIN_COLD_GENERATION.
        FAST,    // map all generations located in
                 // [MIN_REWRITE_GENERATION, MIN_COLD_GENERATION) to
                 // MIN_COLD_GENERATION.
      };

      eviction_mode_t eviction_mode;
      double stop_evict_ratio;
      double default_evict_ratio;
      double fast_evict_ratio;

      void init(double stop_ratio,
                double default_ratio,
                double fast_ratio) {
        ceph_assert(0 <= stop_ratio);
        ceph_assert(stop_ratio < default_ratio);
        ceph_assert(default_ratio < fast_ratio);
        ceph_assert(fast_ratio <= 1);
        eviction_mode = eviction_mode_t::STOP;
        stop_evict_ratio = stop_ratio;
        default_evict_ratio = default_ratio;
        fast_evict_ratio = fast_ratio;
      }

      bool is_stop_mode() const {
        return eviction_mode == eviction_mode_t::STOP;
      }

      bool is_default_mode() const {
        return eviction_mode == eviction_mode_t::DEFAULT;
      }

      bool is_fast_mode() const {
        return eviction_mode == eviction_mode_t::FAST;
      }

      rewrite_gen_t adjust_generation_with_eviction(rewrite_gen_t gen) {
        rewrite_gen_t ret = gen;
        switch(eviction_mode) {
        case eviction_mode_t::STOP:
          if (gen == MIN_COLD_GENERATION) {
            ret = MIN_COLD_GENERATION - 1;
          }
          break;
        case eviction_mode_t::DEFAULT:
          break;
        case eviction_mode_t::FAST:
          if (gen >= MIN_REWRITE_GENERATION && gen < MIN_COLD_GENERATION) {
            ret = MIN_COLD_GENERATION;
          }
          break;
        default:
          ceph_abort("impossible");
        }
        return ret;
      }

      // We change the state of eviction_mode according to the alive ratio
      // of the main cleaner.
      //
      // Use A, B, C, D to represent the states of the alive ratio:
      //   A: alive ratio <= stop_evict_ratio
      //   B: alive ratio <= default_evict_ratio
      //   C: alive ratio <= fast_evict_ratio
      //   D: alive ratio >  fast_evict_ratio
      //
      // and use X, Y, Z to shorten the states of eviction_mode_t:
      //   X: STOP
      //   Y: DEFAULT
      //   Z: FAST
      //
      // Then we can use a form like (A && X) to describe the current state
      // of the main cleaner, which indicates the alive ratio is less than
      // or equal to stop_evict_ratio and the current eviction mode is STOP.
      //
      // All valid state transitions are shown below:
      //   (A && X) => (B && X) => (C && Y) => (D && Z) =>
      //   (C && Z) => (B && Y) => (A && X)
      //          `--> (C && Y) => ...
      //
      // When the system restarts, the initial state is (_ && X), and the
      // transitions are:
      //   (_ && X) -> (A && X) => normal transition
      //            -> (B && X) => normal transition
      //            -> (C && X) => (C && Y) => normal transition
      //            -> (D && X) => (D && Z) => normal transition
      void maybe_update_eviction_mode(double main_alive_ratio) {
        if (main_alive_ratio <= stop_evict_ratio) {
          eviction_mode = eviction_mode_t::STOP;
        } else if (main_alive_ratio <= default_evict_ratio) {
          if (eviction_mode > eviction_mode_t::DEFAULT) {
            eviction_mode = eviction_mode_t::DEFAULT;
          }
        } else if (main_alive_ratio <= fast_evict_ratio) {
          if (eviction_mode < eviction_mode_t::DEFAULT) {
            eviction_mode = eviction_mode_t::DEFAULT;
          }
        } else {
          assert(main_alive_ratio > fast_evict_ratio);
          eviction_mode = eviction_mode_t::FAST;
        }
      }
    };

    seastar::future<> do_background_cycle();

    void register_metrics();

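    // IO blocking counters, exported as metrics by register_metrics().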
    struct {
      uint64_t io_blocking_num = 0;
      uint64_t io_count = 0;
      uint64_t io_blocked_count = 0;
      uint64_t io_blocked_count_trim = 0;
      uint64_t io_blocked_count_clean = 0;
      uint64_t io_blocked_sum = 0;
    } stats;
    seastar::metrics::metric_group metrics;

    JournalTrimmerImplRef trimmer;
    AsyncCleanerRef main_cleaner;

    /*
     * cold tier (optional, see has_cold_tier())
     */
    AsyncCleanerRef cold_cleaner;
    std::vector<AsyncCleaner*> cleaners_by_device_id;

    std::optional<seastar::future<>> process_join;
    std::optional<seastar::promise<>> blocking_background;
    std::optional<seastar::promise<>> blocking_io;
    bool is_running_until_halt = false;
    state_t state = state_t::STOP;
    eviction_state_t eviction_state;

    friend class ::transaction_manager_test_t;
  };

  std::vector<ExtentOolWriterRef> writer_refs;
  std::vector<ExtentOolWriter*> data_writers_by_gen;
  // gen 0 METADATA writer is the journal writer
  std::vector<ExtentOolWriter*> md_writers_by_gen;

  std::vector<Device*> devices_by_id;
  Device* primary_device = nullptr;
  std::size_t num_devices = 0;

  rewrite_gen_t dynamic_max_rewrite_generation = REWRITE_GENERATIONS;
  BackgroundProcess background_process;
  // TODO: drop once paddr->journal_seq_t is introduced
  SegmentSeqAllocatorRef ool_segment_seq_allocator;

  friend class ::transaction_manager_test_t;
};

using ExtentPlacementManagerRef = std::unique_ptr<ExtentPlacementManager>;

}

#if FMT_VERSION >= 90000
template <> struct fmt::formatter<crimson::os::seastore::io_usage_t> : fmt::ostream_formatter {};
#endif