]>
Commit | Line | Data |
---|---|---|
20effc67 TL |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- |
2 | // vim: ts=8 sw=2 smarttab expandtab | |
3 | ||
4 | #pragma once | |
5 | ||
6 | #include "seastar/core/gate.hh" | |
7 | ||
1e59de90 | 8 | #include "crimson/os/seastore/async_cleaner.h" |
20effc67 | 9 | #include "crimson/os/seastore/cached_extent.h" |
1e59de90 TL |
10 | #include "crimson/os/seastore/journal/segment_allocator.h" |
11 | #include "crimson/os/seastore/journal/record_submitter.h" | |
12 | #include "crimson/os/seastore/transaction.h" | |
13 | #include "crimson/os/seastore/random_block_manager.h" | |
14 | #include "crimson/os/seastore/random_block_manager/block_rb_manager.h" | |
15 | #include "crimson/os/seastore/randomblock_manager_group.h" | |
16 | ||
17 | class transaction_manager_test_t; | |
20effc67 TL |
18 | |
19 | namespace crimson::os::seastore { | |
20 | ||
/**
 * ExtentOolWriter
 *
 * Write the extents as out-of-line and allocate the physical addresses.
 * Different writers write extents to different locations.
 */
class ExtentOolWriter {
  // All writer operations can only fail with an I/O error.
  using base_ertr = crimson::errorator<
    crimson::ct_error::input_output_error>;
public:
  virtual ~ExtentOolWriter() {}

  // Prepare the writer for accepting writes.
  using open_ertr = base_ertr;
  virtual open_ertr::future<> open() = 0;

  // Reserve/derive a physical address for an extent of the given length.
  // Whether the returned address is final or a placeholder is up to the
  // concrete writer (see SegmentedOolWriter vs RandomBlockOolWriter).
  virtual paddr_t alloc_paddr(extent_len_t length) = 0;

  // Write the given logical extents out-of-line under transaction t.
  // The interruptible variant (iertr) allows the transaction to be
  // invalidated while the writes are in flight.
  using alloc_write_ertr = base_ertr;
  using alloc_write_iertr = trans_iertr<alloc_write_ertr>;
  virtual alloc_write_iertr::future<> alloc_write_ool_extents(
    Transaction &t,
    std::list<LogicalCachedExtentRef> &extents) = 0;

  // Flush and shut down the writer.
  using close_ertr = base_ertr;
  virtual close_ertr::future<> close() = 0;
};
using ExtentOolWriterRef = std::unique_ptr<ExtentOolWriter>;
48 | ||
/**
 * SegmentedOolWriter
 *
 * Different writers write extents to different out-of-line segments provided
 * by the SegmentProvider.
 */
class SegmentedOolWriter : public ExtentOolWriter {
public:
  SegmentedOolWriter(data_category_t category,
                     rewrite_gen_t gen,
                     SegmentProvider &sp,
                     SegmentSeqAllocator &ssa);

  // Open the underlying record submitter; the submitter's open result is
  // not needed here and is discarded.
  // NOTE(review): the bool argument's meaning is defined by
  // RecordSubmitter::open — confirm against record_submitter.h.
  open_ertr::future<> open() final {
    return record_submitter.open(false).discard_result();
  }

  alloc_write_iertr::future<> alloc_write_ool_extents(
    Transaction &t,
    std::list<LogicalCachedExtentRef> &extents) final;

  // Wait for in-flight writes to drain (write_guard), close the record
  // submitter, then re-arm the gate so the writer is reusable after close.
  close_ertr::future<> close() final {
    return write_guard.close().then([this] {
      return record_submitter.close();
    }).safe_then([this] {
      write_guard = seastar::gate();
    });
  }

  // Segment-backed ool extents receive their final address at write time,
  // so hand out a delayed temporary paddr as a placeholder; length is
  // intentionally unused here.
  paddr_t alloc_paddr(extent_len_t length) final {
    return make_delayed_temp_paddr(0);
  }

private:
  // Issue the actual writes for the given extents under transaction t.
  alloc_write_iertr::future<> do_write(
    Transaction& t,
    std::list<LogicalCachedExtentRef> &extent);

  // Submit one record carrying the given extents; with_atomic_roll_segment
  // controls segment-roll behavior (see the .cc for the exact semantics).
  alloc_write_ertr::future<> write_record(
    Transaction& t,
    record_t&& record,
    std::list<LogicalCachedExtentRef> &&extents,
    bool with_atomic_roll_segment=false);

  journal::SegmentAllocator segment_allocator;
  journal::RecordSubmitter record_submitter;
  // Tracks in-flight writes so close() can drain them.
  seastar::gate write_guard;
};
97 | ||
98 | ||
/**
 * RandomBlockOolWriter
 *
 * Writes out-of-line extents to random-block-backed devices; physical
 * addresses are allocated up front via the RBMCleaner.
 */
class RandomBlockOolWriter : public ExtentOolWriter {
public:
  // NOTE(review): single-argument constructor is implicit; rb_cleaner is a
  // non-owning pointer and must outlive this writer.
  RandomBlockOolWriter(RBMCleaner* rb_cleaner) :
    rb_cleaner(rb_cleaner) {}

  // No device/segment state to set up for RBM — opening is a no-op.
  using open_ertr = ExtentOolWriter::open_ertr;
  open_ertr::future<> open() final {
    return open_ertr::now();
  }

  alloc_write_iertr::future<> alloc_write_ool_extents(
    Transaction &t,
    std::list<LogicalCachedExtentRef> &extents) final;

  // Drain in-flight writes, then re-arm the gate so the writer is
  // reusable after close.
  close_ertr::future<> close() final {
    return write_guard.close().then([this] {
      write_guard = seastar::gate();
      return close_ertr::now();
    });
  }

  // RBM addresses are final at allocation time (unlike the segmented
  // writer's delayed placeholder).
  paddr_t alloc_paddr(extent_len_t length) final {
    assert(rb_cleaner);
    return rb_cleaner->alloc_paddr(length);
  }

private:
  alloc_write_iertr::future<> do_write(
    Transaction& t,
    std::list<LogicalCachedExtentRef> &extent);

  // Non-owning; provides paddr allocation for RBM devices.
  RBMCleaner* rb_cleaner;
  // Tracks in-flight writes so close() can drain them.
  seastar::gate write_guard;
};
133 | ||
1e59de90 TL |
/**
 * cleaner_usage_t
 *
 * Projected space consumption to be reserved against the cleaners.
 */
struct cleaner_usage_t {
  // The size of all extents write to the main devices, including inline extents
  // and out-of-line extents.
  std::size_t main_usage = 0;
  // The size of extents write to the cold devices
  std::size_t cold_ool_usage = 0;
};

/**
 * reserve_cleaner_result_t
 *
 * Per-cleaner outcome of a reservation attempt; used to roll back the
 * parts that did succeed when the overall reservation fails.
 */
struct reserve_cleaner_result_t {
  bool reserve_main_success = true;
  bool reserve_cold_success = true;

  // The reservation only holds if both cleaners accepted it.
  bool is_successful() const {
    return reserve_main_success &&
      reserve_cold_success;
  }
};
151 | ||
/**
 * io_usage_t
 *
 * io_usage_t describes the space usage consumed by client IO.
 */
struct io_usage_t {
  // The total size of all inlined extents, not including deltas and other metadata
  // produced by Cache::prepare_record.
  std::size_t inline_usage = 0;
  // Out-of-line usage, split by main/cold cleaner.
  cleaner_usage_t cleaner_usage;
  friend std::ostream &operator<<(std::ostream &out, const io_usage_t &usage) {
    return out << "io_usage_t("
               << "inline_usage=" << usage.inline_usage
               << ", main_cleaner_usage=" << usage.cleaner_usage.main_usage
               << ", cold_cleaner_usage=" << usage.cleaner_usage.cold_ool_usage
               << ")";
  }
};

/**
 * reserve_io_result_t
 *
 * Outcome of reserving io_usage_t: the inline (journal trimmer) part plus
 * the per-cleaner parts.
 */
struct reserve_io_result_t {
  bool reserve_inline_success = true;
  reserve_cleaner_result_t cleaner_result;

  // Successful only if the inline reservation and both cleaner
  // reservations succeeded.
  bool is_successful() const {
    return reserve_inline_success &&
      cleaner_result.is_successful();
  }
};
180 | ||
1e59de90 TL |
/**
 * ExtentPlacementManager
 *
 * Owns the out-of-line writers (one per data category and rewrite
 * generation) and the background process (journal trimming + space
 * cleaning), and routes extent allocation and ool writes to the
 * appropriate writer.
 */
class ExtentPlacementManager {
public:
  ExtentPlacementManager()
    : ool_segment_seq_allocator(
        std::make_unique<SegmentSeqAllocator>(segment_type_t::OOL))
  {
    // Indexed directly by device id; slots stay nullptr until add_device().
    devices_by_id.resize(DEVICE_ID_MAX, nullptr);
  }

  // Wire up the trimmer and cleaners; the second cleaner (cold tier) may
  // be null — see BackgroundProcess::has_cold_tier().
  void init(JournalTrimmerImplRef &&, AsyncCleanerRef &&, AsyncCleanerRef &&);

  SegmentSeqAllocator &get_ool_segment_seq_allocator() const {
    return *ool_segment_seq_allocator;
  }

  void set_primary_device(Device *device);

  void set_extent_callback(ExtentCallbackInterface *cb) {
    background_process.set_extent_callback(cb);
  }

  journal_type_t get_journal_type() const {
    return background_process.get_journal_type();
  }

  extent_len_t get_block_size() const {
    assert(primary_device != nullptr);
    // assume all the devices have the same block size
    return primary_device->get_block_size();
  }

  Device& get_primary_device() {
    assert(primary_device != nullptr);
    return *primary_device;
  }

  store_statfs_t get_stat() const {
    return background_process.get_stat();
  }

  using mount_ertr = crimson::errorator<
    crimson::ct_error::input_output_error>;
  using mount_ret = mount_ertr::future<>;
  mount_ret mount() {
    return background_process.mount();
  }

  using open_ertr = ExtentOolWriter::open_ertr;
  open_ertr::future<> open_for_write();

  void start_scan_space() {
    return background_process.start_scan_space();
  }

  void start_background() {
    return background_process.start_background();
  }

  // Result of allocating a new extent: its (possibly placeholder) paddr,
  // the zeroed backing buffer, and the (possibly adjusted) generation.
  struct alloc_result_t {
    paddr_t paddr;
    bufferptr bp;
    rewrite_gen_t gen;
  };
  alloc_result_t alloc_new_extent(
    Transaction& t,
    extent_types_t type,
    extent_len_t length,
    placement_hint_t hint,
#ifdef UNIT_TESTS_BUILT
    rewrite_gen_t gen,
    std::optional<paddr_t> external_paddr = std::nullopt
#else
    rewrite_gen_t gen
#endif
  ) {
    assert(hint < placement_hint_t::NUM_HINTS);
    assert(is_target_rewrite_generation(gen));
    // Only rewrites may request a non-initial generation.
    assert(gen == INIT_GENERATION || hint == placement_hint_t::REWRITE);

    data_category_t category = get_extent_category(type);
    gen = adjust_generation(category, type, hint, gen);

    // XXX: bp might be extended to point to different memory (e.g. PMem)
    // according to the allocator.
    auto bp = ceph::bufferptr(
      buffer::create_page_aligned(length));
    bp.zero();
    paddr_t addr;
#ifdef UNIT_TESTS_BUILT
    if (unlikely(external_paddr.has_value())) {
      // Tests may inject a fake paddr directly.
      assert(external_paddr->is_fake());
      addr = *external_paddr;
    } else if (gen == INLINE_GENERATION) {
#else
    if (gen == INLINE_GENERATION) {
#endif
      // Inline extents get a record-relative address; the journal assigns
      // the final location.
      addr = make_record_relative_paddr(0);
    } else if (category == data_category_t::DATA) {
      assert(data_writers_by_gen[generation_to_writer(gen)]);
      addr = data_writers_by_gen[
        generation_to_writer(gen)]->alloc_paddr(length);
    } else {
      assert(category == data_category_t::METADATA);
      assert(md_writers_by_gen[generation_to_writer(gen)]);
      addr = md_writers_by_gen[
        generation_to_writer(gen)]->alloc_paddr(length);
    }
    return {addr, std::move(bp), gen};
  }

  /**
   * dispatch_result_t
   *
   * ool extents are placed in alloc_map and passed to
   * EPM::write_delayed_ool_extents,
   * delayed_extents is used to update lba mapping.
   * usage is used to reserve projected space
   */
  using extents_by_writer_t =
    std::map<ExtentOolWriter*, std::list<LogicalCachedExtentRef>>;
  struct dispatch_result_t {
    extents_by_writer_t alloc_map;
    std::list<LogicalCachedExtentRef> delayed_extents;
    io_usage_t usage;
  };

  /**
   * dispatch_delayed_extents
   *
   * Performs delayed allocation
   */
  dispatch_result_t dispatch_delayed_extents(Transaction& t);

  /**
   * write_delayed_ool_extents
   *
   * Do writes for out-of-line extents.
   */
  using alloc_paddr_iertr = ExtentOolWriter::alloc_write_iertr;
  alloc_paddr_iertr::future<> write_delayed_ool_extents(
    Transaction& t,
    extents_by_writer_t& alloc_map);

  /**
   * write_preallocated_ool_extents
   *
   * Performs ool writes for extents with pre-allocated addresses.
   * See Transaction::pre_alloc_list
   */
  alloc_paddr_iertr::future<> write_preallocated_ool_extents(
    Transaction &t,
    std::list<LogicalCachedExtentRef> extents);

  seastar::future<> stop_background() {
    return background_process.stop_background();
  }

  using close_ertr = ExtentOolWriter::close_ertr;
  close_ertr::future<> close();

  // Route a read to the device owning the address.
  using read_ertr = Device::read_ertr;
  read_ertr::future<> read(
    paddr_t addr,
    size_t len,
    ceph::bufferptr &out
  ) {
    assert(devices_by_id[addr.get_device_id()] != nullptr);
    return devices_by_id[addr.get_device_id()]->read(addr, len, out);
  }

  void mark_space_used(paddr_t addr, extent_len_t len) {
    background_process.mark_space_used(addr, len);
  }

  void mark_space_free(paddr_t addr, extent_len_t len) {
    background_process.mark_space_free(addr, len);
  }

  void commit_space_used(paddr_t addr, extent_len_t len) {
    return background_process.commit_space_used(addr, len);
  }

  // May block (the returned future) until the background process frees
  // enough space — see BackgroundProcess::reserve_projected_usage.
  seastar::future<> reserve_projected_usage(io_usage_t usage) {
    return background_process.reserve_projected_usage(usage);
  }

  void release_projected_usage(const io_usage_t &usage) {
    background_process.release_projected_usage(usage);
  }

  backend_type_t get_main_backend_type() const {
    if (!background_process.is_no_background()) {
      return background_process.get_main_backend_type();
    }
    // for test
    assert(primary_device);
    return primary_device->get_backend_type();
  }

  // Testing interfaces

  void test_init_no_background(Device *test_device) {
    assert(test_device->get_backend_type() == backend_type_t::SEGMENTED);
    add_device(test_device);
    set_primary_device(test_device);
  }

  bool check_usage() {
    return background_process.check_usage();
  }

  seastar::future<> run_background_work_until_halt() {
    return background_process.run_until_halt();
  }

private:
  // Map the caller-requested generation to the one actually used, based
  // on extent type, placement hint, backend type and cold-tier eviction
  // state; the result is clamped to dynamic_max_rewrite_generation.
  rewrite_gen_t adjust_generation(
      data_category_t category,
      extent_types_t type,
      placement_hint_t hint,
      rewrite_gen_t gen) {
    if (type == extent_types_t::ROOT) {
      gen = INLINE_GENERATION;
    } else if (get_main_backend_type() == backend_type_t::SEGMENTED &&
               is_lba_backref_node(type)) {
      gen = INLINE_GENERATION;
    } else if (hint == placement_hint_t::COLD) {
      assert(gen == INIT_GENERATION);
      if (background_process.has_cold_tier()) {
        gen = MIN_COLD_GENERATION;
      } else {
        gen = MIN_REWRITE_GENERATION;
      }
    } else if (gen == INIT_GENERATION) {
      if (category == data_category_t::METADATA) {
        if (get_main_backend_type() == backend_type_t::SEGMENTED) {
          // with SEGMENTED, default not to ool metadata extents to reduce
          // padding overhead.
          // TODO: improve padding so we can default to the ool path.
          gen = INLINE_GENERATION;
        } else {
          // with RBM, all extents must be OOL
          assert(get_main_backend_type() ==
                 backend_type_t::RANDOM_BLOCK);
          gen = OOL_GENERATION;
        }
      } else {
        assert(category == data_category_t::DATA);
        gen = OOL_GENERATION;
      }
    } else if (background_process.has_cold_tier()) {
      gen = background_process.adjust_generation(gen);
    }

    if (gen > dynamic_max_rewrite_generation) {
      gen = dynamic_max_rewrite_generation;
    }

    return gen;
  }

  void add_device(Device *device) {
    auto device_id = device->get_device_id();
    ceph_assert(devices_by_id[device_id] == nullptr);
    devices_by_id[device_id] = device;
    ++num_devices;
  }

  /**
   * dispatch_delayed_extent
   *
   * Specify the extent inline or ool
   * return true indicates inline otherwise ool
   */
  bool dispatch_delayed_extent(LogicalCachedExtentRef& extent) {
    // TODO: all delayed extents are ool currently
    boost::ignore_unused(extent);
    return false;
  }

  // Look up the ool writer responsible for the given category/generation.
  ExtentOolWriter* get_writer(placement_hint_t hint,
                              data_category_t category,
                              rewrite_gen_t gen) {
    assert(hint < placement_hint_t::NUM_HINTS);
    assert(is_rewrite_generation(gen));
    assert(gen != INLINE_GENERATION);
    assert(gen <= dynamic_max_rewrite_generation);
    if (category == data_category_t::DATA) {
      return data_writers_by_gen[generation_to_writer(gen)];
    } else {
      assert(category == data_category_t::METADATA);
      return md_writers_by_gen[generation_to_writer(gen)];
    }
  }

  /**
   * BackgroundProcess
   *
   * Background process to schedule background transactions.
   *
   * TODO: device tiering
   */
  class BackgroundProcess : public BackgroundListener {
  public:
    BackgroundProcess() = default;

    void init(JournalTrimmerImplRef &&_trimmer,
              AsyncCleanerRef &&_cleaner,
              AsyncCleanerRef &&_cold_cleaner) {
      trimmer = std::move(_trimmer);
      trimmer->set_background_callback(this);
      main_cleaner = std::move(_cleaner);
      main_cleaner->set_background_callback(this);
      if (_cold_cleaner) {
        cold_cleaner = std::move(_cold_cleaner);
        cold_cleaner->set_background_callback(this);

        // With a cold tier, addresses are routed to the owning cleaner by
        // device id.
        cleaners_by_device_id.resize(DEVICE_ID_MAX, nullptr);
        for (auto id : main_cleaner->get_device_ids()) {
          cleaners_by_device_id[id] = main_cleaner.get();
        }
        for (auto id : cold_cleaner->get_device_ids()) {
          cleaners_by_device_id[id] = cold_cleaner.get();
        }

        eviction_state.init(
          crimson::common::get_conf<double>(
            "seastore_multiple_tiers_stop_evict_ratio"),
          crimson::common::get_conf<double>(
            "seastore_multiple_tiers_default_evict_ratio"),
          crimson::common::get_conf<double>(
            "seastore_multiple_tiers_fast_evict_ratio"));
      }
    }

    journal_type_t get_journal_type() const {
      return trimmer->get_journal_type();
    }

    bool has_cold_tier() const {
      return cold_cleaner.get() != nullptr;
    }

    void set_extent_callback(ExtentCallbackInterface *cb) {
      trimmer->set_extent_callback(cb);
      main_cleaner->set_extent_callback(cb);
      if (has_cold_tier()) {
        cold_cleaner->set_extent_callback(cb);
      }
    }

    store_statfs_t get_stat() const {
      auto stat = main_cleaner->get_stat();
      if (has_cold_tier()) {
        stat.add(cold_cleaner->get_stat());
      }
      return stat;
    }

    using mount_ret = ExtentPlacementManager::mount_ret;
    mount_ret mount() {
      // state machine: STOP -> MOUNT -> SCAN_SPACE -> RUNNING/HALT (see
      // start_scan_space()/start_background()).
      ceph_assert(state == state_t::STOP);
      state = state_t::MOUNT;
      trimmer->reset();
      stats = {};
      register_metrics();
      return main_cleaner->mount(
      ).safe_then([this] {
        return has_cold_tier() ? cold_cleaner->mount() : mount_ertr::now();
      });
    }

    void start_scan_space() {
      ceph_assert(state == state_t::MOUNT);
      state = state_t::SCAN_SPACE;
      ceph_assert(main_cleaner->check_usage_is_empty());
      ceph_assert(!has_cold_tier() ||
                  cold_cleaner->check_usage_is_empty());
    }

    void start_background();

    void mark_space_used(paddr_t addr, extent_len_t len) {
      // Before SCAN_SPACE the cleaners' usage is not tracked yet.
      if (state < state_t::SCAN_SPACE) {
        return;
      }

      if (!has_cold_tier()) {
        assert(main_cleaner);
        main_cleaner->mark_space_used(addr, len);
      } else {
        auto id = addr.get_device_id();
        assert(id < cleaners_by_device_id.size());
        auto cleaner = cleaners_by_device_id[id];
        assert(cleaner);
        cleaner->mark_space_used(addr, len);
      }
    }

    void mark_space_free(paddr_t addr, extent_len_t len) {
      if (state < state_t::SCAN_SPACE) {
        return;
      }

      if (!has_cold_tier()) {
        assert(main_cleaner);
        main_cleaner->mark_space_free(addr, len);
      } else {
        auto id = addr.get_device_id();
        assert(id < cleaners_by_device_id.size());
        auto cleaner = cleaners_by_device_id[id];
        assert(cleaner);
        cleaner->mark_space_free(addr, len);
      }
    }

    void commit_space_used(paddr_t addr, extent_len_t len) {
      if (state < state_t::SCAN_SPACE) {
        return;
      }

      if (!has_cold_tier()) {
        assert(main_cleaner);
        main_cleaner->commit_space_used(addr, len);
      } else {
        auto id = addr.get_device_id();
        assert(id < cleaners_by_device_id.size());
        auto cleaner = cleaners_by_device_id[id];
        assert(cleaner);
        cleaner->commit_space_used(addr, len);
      }
    }

    rewrite_gen_t adjust_generation(rewrite_gen_t gen) {
      if (has_cold_tier()) {
        return eviction_state.adjust_generation_with_eviction(gen);
      } else {
        return gen;
      }
    }

    seastar::future<> reserve_projected_usage(io_usage_t usage);

    void release_projected_usage(const io_usage_t &usage) {
      if (is_ready()) {
        trimmer->release_inline_usage(usage.inline_usage);
        main_cleaner->release_projected_usage(usage.cleaner_usage.main_usage);
        if (has_cold_tier()) {
          cold_cleaner->release_projected_usage(usage.cleaner_usage.cold_ool_usage);
        }
      }
    }

    seastar::future<> stop_background();
    backend_type_t get_main_backend_type() const {
      return get_journal_type();
    }

    // Testing interfaces

    bool check_usage() {
      return main_cleaner->check_usage() &&
        (!has_cold_tier() || cold_cleaner->check_usage());
    }

    seastar::future<> run_until_halt();

    bool is_no_background() const {
      return !trimmer || !main_cleaner;
    }

  protected:
    state_t get_state() const final {
      return state;
    }

    void maybe_wake_background() final {
      if (!is_running()) {
        return;
      }
      if (background_should_run()) {
        do_wake_background();
      }
    }

    void maybe_wake_blocked_io() final {
      if (!is_ready()) {
        return;
      }
      // A single waiter at a time: blocking_io is the promise the blocked
      // reservation is waiting on.
      if (!should_block_io() && blocking_io) {
        blocking_io->set_value();
        blocking_io = std::nullopt;
      }
    }

  private:
    // reserve helpers
    bool try_reserve_cold(std::size_t usage);
    void abort_cold_usage(std::size_t usage, bool success);

    reserve_cleaner_result_t try_reserve_cleaner(const cleaner_usage_t &usage);
    void abort_cleaner_usage(const cleaner_usage_t &usage,
                             const reserve_cleaner_result_t &result);

    reserve_io_result_t try_reserve_io(const io_usage_t &usage);
    void abort_io_usage(const io_usage_t &usage,
                        const reserve_io_result_t &result);

    bool is_running() const {
      // RUNNING implies the background fiber (process_join) exists.
      if (state == state_t::RUNNING) {
        assert(process_join);
        return true;
      } else {
        assert(!process_join);
        return false;
      }
    }

    void log_state(const char *caller) const;

    seastar::future<> run();

    void do_wake_background() {
      if (blocking_background) {
        blocking_background->set_value();
        blocking_background = std::nullopt;
      }
    }

    // background_should_run() should be atomic with do_background_cycle()
    // to make sure the condition is consistent.
    bool background_should_run() {
      assert(is_ready());
      maybe_update_eviction_mode();
      return main_cleaner_should_run()
        || cold_cleaner_should_run()
        || trimmer->should_trim();
    }

    bool main_cleaner_should_run() const {
      assert(is_ready());
      // In fast eviction mode the main cleaner runs whenever it can, to
      // push extents toward the cold tier.
      return main_cleaner->should_clean_space() ||
        (has_cold_tier() &&
         main_cleaner->can_clean_space() &&
         eviction_state.is_fast_mode());
    }

    bool cold_cleaner_should_run() const {
      assert(is_ready());
      return has_cold_tier() &&
        cold_cleaner->should_clean_space();
    }

    bool should_block_io() const {
      assert(is_ready());
      return trimmer->should_block_io_on_trim() ||
        main_cleaner->should_block_io_on_clean() ||
        (has_cold_tier() &&
         cold_cleaner->should_block_io_on_clean());
    }

    void maybe_update_eviction_mode() {
      if (has_cold_tier()) {
        auto main_alive_ratio = main_cleaner->get_stat().get_used_raw_ratio();
        eviction_state.maybe_update_eviction_mode(main_alive_ratio);
      }
    }

    struct eviction_state_t {
      enum class eviction_mode_t {
        STOP,     // generation greater than or equal to MIN_COLD_GENERATION
                  // will be set to MIN_COLD_GENERATION - 1, which means
                  // no extents will be evicted.
        DEFAULT,  // generation incremented with each rewrite. Extents will
                  // be evicted when generation reaches MIN_COLD_GENERATION.
        FAST,     // map all generations located in
                  // [MIN_REWRITE_GENERATION, MIN_COLD_GENERATION) to
                  // MIN_COLD_GENERATION.
      };

      eviction_mode_t eviction_mode;
      double stop_evict_ratio;
      double default_evict_ratio;
      double fast_evict_ratio;

      void init(double stop_ratio,
                double default_ratio,
                double fast_ratio) {
        // Ratios must be strictly ordered: 0 <= stop < default < fast <= 1.
        ceph_assert(0 <= stop_ratio);
        ceph_assert(stop_ratio < default_ratio);
        ceph_assert(default_ratio < fast_ratio);
        ceph_assert(fast_ratio <= 1);
        eviction_mode = eviction_mode_t::STOP;
        stop_evict_ratio = stop_ratio;
        default_evict_ratio = default_ratio;
        fast_evict_ratio = fast_ratio;
      }

      bool is_stop_mode() const {
        return eviction_mode == eviction_mode_t::STOP;
      }

      bool is_default_mode() const {
        return eviction_mode == eviction_mode_t::DEFAULT;
      }

      bool is_fast_mode() const {
        return eviction_mode == eviction_mode_t::FAST;
      }

      rewrite_gen_t adjust_generation_with_eviction(rewrite_gen_t gen) {
        rewrite_gen_t ret = gen;
        switch(eviction_mode) {
        case eviction_mode_t::STOP:
          if (gen == MIN_COLD_GENERATION) {
            ret = MIN_COLD_GENERATION - 1;
          }
          break;
        case eviction_mode_t::DEFAULT:
          break;
        case eviction_mode_t::FAST:
          if (gen >= MIN_REWRITE_GENERATION && gen < MIN_COLD_GENERATION) {
            ret = MIN_COLD_GENERATION;
          }
          break;
        default:
          ceph_abort("impossible");
        }
        return ret;
      }

      // We change the state of eviction_mode according to the alive ratio
      // of the main cleaner.
      //
      // Use A, B, C, D to represent the state of alive ratio:
      //   A: alive ratio <= stop_evict_ratio
      //   B: alive ratio <= default_evict_ratio
      //   C: alive ratio <= fast_evict_ratio
      //   D: alive ratio > fast_evict_ratio
      //
      // and use X, Y, Z to shorten the state of eviction_mode_t:
      //   X: STOP
      //   Y: DEFAULT
      //   Z: FAST
      //
      // Then we can use a form like (A && X) to describe the current state
      // of the main cleaner, which indicates the alive ratio is less than or
      // equal to stop_evict_ratio and current eviction mode is STOP.
      //
      // all valid state transitions show as follow:
      //   (A && X) => (B && X) => (C && Y) => (D && Z) =>
      //   (C && Z) => (B && Y) => (A && X)
      //                `--> (C && Y) => ...
      //
      // when the system restarts, the init state is (_ && X), the
      // transitions should be:
      // (_ && X) -> (A && X) => normal transition
      //          -> (B && X) => normal transition
      //          -> (C && X) => (C && Y) => normal transition
      //          -> (D && X) => (D && Z) => normal transition
      void maybe_update_eviction_mode(double main_alive_ratio) {
        if (main_alive_ratio <= stop_evict_ratio) {
          eviction_mode = eviction_mode_t::STOP;
        } else if (main_alive_ratio <= default_evict_ratio) {
          if (eviction_mode > eviction_mode_t::DEFAULT) {
            eviction_mode = eviction_mode_t::DEFAULT;
          }
        } else if (main_alive_ratio <= fast_evict_ratio) {
          if (eviction_mode < eviction_mode_t::DEFAULT) {
            eviction_mode = eviction_mode_t::DEFAULT;
          }
        } else {
          assert(main_alive_ratio > fast_evict_ratio);
          eviction_mode = eviction_mode_t::FAST;
        }
      }
    };

    seastar::future<> do_background_cycle();

    void register_metrics();

    struct {
      uint64_t io_blocking_num = 0;
      uint64_t io_count = 0;
      uint64_t io_blocked_count = 0;
      uint64_t io_blocked_count_trim = 0;
      uint64_t io_blocked_count_clean = 0;
      uint64_t io_blocked_sum = 0;
    } stats;
    seastar::metrics::metric_group metrics;

    JournalTrimmerImplRef trimmer;
    AsyncCleanerRef main_cleaner;

    /*
     * cold tier (optional, see has_cold_tier())
     */
    AsyncCleanerRef cold_cleaner;
    std::vector<AsyncCleaner*> cleaners_by_device_id;

    std::optional<seastar::future<>> process_join;
    std::optional<seastar::promise<>> blocking_background;
    std::optional<seastar::promise<>> blocking_io;
    bool is_running_until_halt = false;
    state_t state = state_t::STOP;
    eviction_state_t eviction_state;

    friend class ::transaction_manager_test_t;
  };

  std::vector<ExtentOolWriterRef> writer_refs;
  std::vector<ExtentOolWriter*> data_writers_by_gen;
  // gen 0 METADATA writer is the journal writer
  std::vector<ExtentOolWriter*> md_writers_by_gen;

  std::vector<Device*> devices_by_id;
  Device* primary_device = nullptr;
  std::size_t num_devices = 0;

  rewrite_gen_t dynamic_max_rewrite_generation = REWRITE_GENERATIONS;
  BackgroundProcess background_process;
  // TODO: drop once paddr->journal_seq_t is introduced
  SegmentSeqAllocatorRef ool_segment_seq_allocator;

  friend class ::transaction_manager_test_t;
};
1e59de90 | 908 | |
20effc67 TL |
909 | using ExtentPlacementManagerRef = std::unique_ptr<ExtentPlacementManager>; |
910 | ||
911 | } | |
1e59de90 TL |
912 | |
#if FMT_VERSION >= 90000
// fmt v9+ no longer falls back to operator<< implicitly; opt io_usage_t
// into ostream-based formatting explicitly.
template <> struct fmt::formatter<crimson::os::seastore::io_usage_t> : fmt::ostream_formatter {};
#endif