]>
Commit | Line | Data |
---|---|---|
1e59de90 TL |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | ||
4 | #include <fmt/chrono.h> | |
5 | #include <seastar/core/metrics.hh> | |
6 | ||
7 | #include "crimson/os/seastore/logging.h" | |
8 | ||
9 | #include "crimson/os/seastore/async_cleaner.h" | |
10 | #include "crimson/os/seastore/backref_manager.h" | |
11 | #include "crimson/os/seastore/transaction_manager.h" | |
12 | ||
13 | SET_SUBSYS(seastore_cleaner); | |
14 | ||
namespace {

// Selector for a GC formula.  NOTE(review): how each value is consumed is
// outside this view -- confirm against the code that reads gc_formula.
enum class gc_formula_t {
  GREEDY,
  BENEFIT,
  COST_BENEFIT,
};

// Compile-time choice of formula for this translation unit.
constexpr gc_formula_t gc_formula{gc_formula_t::COST_BENEFIT};

}
25 | ||
26 | namespace crimson::os::seastore { | |
27 | ||
28 | void segment_info_t::set_open( | |
29 | segment_seq_t _seq, segment_type_t _type, | |
30 | data_category_t _category, rewrite_gen_t _generation) | |
31 | { | |
32 | ceph_assert(_seq != NULL_SEG_SEQ); | |
33 | ceph_assert(_type != segment_type_t::NULL_SEG); | |
34 | ceph_assert(_category != data_category_t::NUM); | |
35 | ceph_assert(is_rewrite_generation(_generation)); | |
36 | state = Segment::segment_state_t::OPEN; | |
37 | seq = _seq; | |
38 | type = _type; | |
39 | category = _category; | |
40 | generation = _generation; | |
41 | written_to = 0; | |
42 | } | |
43 | ||
44 | void segment_info_t::set_empty() | |
45 | { | |
46 | state = Segment::segment_state_t::EMPTY; | |
47 | seq = NULL_SEG_SEQ; | |
48 | type = segment_type_t::NULL_SEG; | |
49 | category = data_category_t::NUM; | |
50 | generation = NULL_GENERATION; | |
51 | modify_time = NULL_TIME; | |
52 | num_extents = 0; | |
53 | written_to = 0; | |
54 | } | |
55 | ||
56 | void segment_info_t::set_closed() | |
57 | { | |
58 | state = Segment::segment_state_t::CLOSED; | |
59 | // the rest of information is unchanged | |
60 | } | |
61 | ||
62 | void segment_info_t::init_closed( | |
63 | segment_seq_t _seq, segment_type_t _type, | |
64 | data_category_t _category, rewrite_gen_t _generation, | |
65 | segment_off_t seg_size) | |
66 | { | |
67 | ceph_assert(_seq != NULL_SEG_SEQ); | |
68 | ceph_assert(_type != segment_type_t::NULL_SEG); | |
69 | ceph_assert(_category != data_category_t::NUM); | |
70 | ceph_assert(is_rewrite_generation(_generation)); | |
71 | state = Segment::segment_state_t::CLOSED; | |
72 | seq = _seq; | |
73 | type = _type; | |
74 | category = _category; | |
75 | generation = _generation; | |
76 | written_to = seg_size; | |
77 | } | |
78 | ||
79 | std::ostream& operator<<(std::ostream &out, const segment_info_t &info) | |
80 | { | |
81 | out << "seg_info_t(" | |
82 | << "state=" << info.state | |
83 | << ", " << info.id; | |
84 | if (info.is_empty()) { | |
85 | // pass | |
86 | } else { // open or closed | |
87 | out << " " << info.type | |
88 | << " " << segment_seq_printer_t{info.seq} | |
89 | << " " << info.category | |
90 | << " " << rewrite_gen_printer_t{info.generation} | |
91 | << ", modify_time=" << sea_time_point_printer_t{info.modify_time} | |
92 | << ", num_extents=" << info.num_extents | |
93 | << ", written_to=" << info.written_to; | |
94 | } | |
95 | return out << ")"; | |
96 | } | |
97 | ||
98 | void segments_info_t::reset() | |
99 | { | |
100 | segments.clear(); | |
101 | ||
102 | segment_size = 0; | |
103 | ||
104 | journal_segment_id = NULL_SEG_ID; | |
105 | num_in_journal_open = 0; | |
106 | num_type_journal = 0; | |
107 | num_type_ool = 0; | |
108 | ||
109 | num_open = 0; | |
110 | num_empty = 0; | |
111 | num_closed = 0; | |
112 | ||
113 | count_open_journal = 0; | |
114 | count_open_ool = 0; | |
115 | count_release_journal = 0; | |
116 | count_release_ool = 0; | |
117 | count_close_journal = 0; | |
118 | count_close_ool = 0; | |
119 | ||
120 | total_bytes = 0; | |
121 | avail_bytes_in_open = 0; | |
122 | ||
123 | modify_times.clear(); | |
124 | } | |
125 | ||
126 | void segments_info_t::add_segment_manager( | |
127 | SegmentManager &segment_manager) | |
128 | { | |
129 | LOG_PREFIX(segments_info_t::add_segment_manager); | |
130 | device_id_t d_id = segment_manager.get_device_id(); | |
131 | auto ssize = segment_manager.get_segment_size(); | |
132 | auto nsegments = segment_manager.get_num_segments(); | |
133 | auto sm_size = segment_manager.get_available_size(); | |
134 | INFO("adding segment manager {}, size={}, ssize={}, segments={}", | |
135 | device_id_printer_t{d_id}, sm_size, ssize, nsegments); | |
136 | ceph_assert(ssize > 0); | |
137 | ceph_assert(nsegments > 0); | |
138 | ceph_assert(sm_size > 0); | |
139 | ||
140 | // also validate if the device is duplicated | |
141 | segments.add_device(d_id, nsegments, segment_info_t{}); | |
142 | ||
143 | // assume all the segment managers share the same settings as follows. | |
144 | if (segment_size == 0) { | |
145 | ceph_assert(ssize > 0); | |
146 | segment_size = ssize; | |
147 | } else { | |
148 | ceph_assert(segment_size == ssize); | |
149 | } | |
150 | ||
151 | // NOTE: by default the segments are empty | |
152 | num_empty += nsegments; | |
153 | ||
154 | total_bytes += sm_size; | |
155 | } | |
156 | ||
157 | void segments_info_t::init_closed( | |
158 | segment_id_t segment, segment_seq_t seq, segment_type_t type, | |
159 | data_category_t category, rewrite_gen_t generation) | |
160 | { | |
161 | LOG_PREFIX(segments_info_t::init_closed); | |
162 | auto& segment_info = segments[segment]; | |
163 | DEBUG("initiating {} {} {} {} {}, {}, " | |
164 | "num_segments(empty={}, opened={}, closed={})", | |
165 | segment, type, segment_seq_printer_t{seq}, | |
166 | category, rewrite_gen_printer_t{generation}, | |
167 | segment_info, num_empty, num_open, num_closed); | |
168 | ceph_assert(segment_info.is_empty()); | |
169 | ceph_assert(num_empty > 0); | |
170 | --num_empty; | |
171 | ++num_closed; | |
172 | if (type == segment_type_t::JOURNAL) { | |
173 | // init_closed won't initialize journal_segment_id | |
174 | ceph_assert(get_submitted_journal_head() == JOURNAL_SEQ_NULL); | |
175 | ++num_type_journal; | |
176 | } else { | |
177 | ++num_type_ool; | |
178 | } | |
179 | // do not increment count_close_*; | |
180 | ||
181 | if (segment_info.modify_time != NULL_TIME) { | |
182 | modify_times.insert(segment_info.modify_time); | |
183 | } else { | |
184 | ceph_assert(segment_info.num_extents == 0); | |
185 | } | |
186 | ||
187 | segment_info.init_closed( | |
188 | seq, type, category, generation, get_segment_size()); | |
189 | } | |
190 | ||
191 | void segments_info_t::mark_open( | |
192 | segment_id_t segment, segment_seq_t seq, segment_type_t type, | |
193 | data_category_t category, rewrite_gen_t generation) | |
194 | { | |
195 | LOG_PREFIX(segments_info_t::mark_open); | |
196 | auto& segment_info = segments[segment]; | |
197 | INFO("opening {} {} {} {} {}, {}, " | |
198 | "num_segments(empty={}, opened={}, closed={})", | |
199 | segment, type, segment_seq_printer_t{seq}, | |
200 | category, rewrite_gen_printer_t{generation}, | |
201 | segment_info, num_empty, num_open, num_closed); | |
202 | ceph_assert(segment_info.is_empty()); | |
203 | ceph_assert(num_empty > 0); | |
204 | --num_empty; | |
205 | ++num_open; | |
206 | if (type == segment_type_t::JOURNAL) { | |
207 | if (journal_segment_id != NULL_SEG_ID) { | |
208 | auto& last_journal_segment = segments[journal_segment_id]; | |
209 | ceph_assert(last_journal_segment.is_closed()); | |
210 | ceph_assert(last_journal_segment.type == segment_type_t::JOURNAL); | |
211 | ceph_assert(last_journal_segment.seq + 1 == seq); | |
212 | } | |
213 | journal_segment_id = segment; | |
214 | ||
215 | ++num_in_journal_open; | |
216 | ++num_type_journal; | |
217 | ++count_open_journal; | |
218 | } else { | |
219 | ++num_type_ool; | |
220 | ++count_open_ool; | |
221 | } | |
222 | avail_bytes_in_open += get_segment_size(); | |
223 | ||
224 | segment_info.set_open(seq, type, category, generation); | |
225 | } | |
226 | ||
227 | void segments_info_t::mark_empty( | |
228 | segment_id_t segment) | |
229 | { | |
230 | LOG_PREFIX(segments_info_t::mark_empty); | |
231 | auto& segment_info = segments[segment]; | |
232 | INFO("releasing {}, {}, num_segments(empty={}, opened={}, closed={})", | |
233 | segment, segment_info, | |
234 | num_empty, num_open, num_closed); | |
235 | ceph_assert(segment_info.is_closed()); | |
236 | auto type = segment_info.type; | |
237 | assert(type != segment_type_t::NULL_SEG); | |
238 | ceph_assert(num_closed > 0); | |
239 | --num_closed; | |
240 | ++num_empty; | |
241 | if (type == segment_type_t::JOURNAL) { | |
242 | ceph_assert(num_type_journal > 0); | |
243 | --num_type_journal; | |
244 | ++count_release_journal; | |
245 | } else { | |
246 | ceph_assert(num_type_ool > 0); | |
247 | --num_type_ool; | |
248 | ++count_release_ool; | |
249 | } | |
250 | ||
251 | if (segment_info.modify_time != NULL_TIME) { | |
252 | auto to_erase = modify_times.find(segment_info.modify_time); | |
253 | ceph_assert(to_erase != modify_times.end()); | |
254 | modify_times.erase(to_erase); | |
255 | } else { | |
256 | ceph_assert(segment_info.num_extents == 0); | |
257 | } | |
258 | ||
259 | segment_info.set_empty(); | |
260 | } | |
261 | ||
262 | void segments_info_t::mark_closed( | |
263 | segment_id_t segment) | |
264 | { | |
265 | LOG_PREFIX(segments_info_t::mark_closed); | |
266 | auto& segment_info = segments[segment]; | |
267 | INFO("closing {}, {}, num_segments(empty={}, opened={}, closed={})", | |
268 | segment, segment_info, | |
269 | num_empty, num_open, num_closed); | |
270 | ceph_assert(segment_info.is_open()); | |
271 | ceph_assert(num_open > 0); | |
272 | --num_open; | |
273 | ++num_closed; | |
274 | if (segment_info.type == segment_type_t::JOURNAL) { | |
275 | ceph_assert(num_in_journal_open > 0); | |
276 | --num_in_journal_open; | |
277 | ++count_close_journal; | |
278 | } else { | |
279 | ++count_close_ool; | |
280 | } | |
281 | ceph_assert(get_segment_size() >= segment_info.written_to); | |
282 | auto seg_avail_bytes = get_segment_size() - segment_info.written_to; | |
283 | ceph_assert(avail_bytes_in_open >= (std::size_t)seg_avail_bytes); | |
284 | avail_bytes_in_open -= seg_avail_bytes; | |
285 | ||
286 | if (segment_info.modify_time != NULL_TIME) { | |
287 | modify_times.insert(segment_info.modify_time); | |
288 | } else { | |
289 | ceph_assert(segment_info.num_extents == 0); | |
290 | } | |
291 | ||
292 | segment_info.set_closed(); | |
293 | } | |
294 | ||
295 | void segments_info_t::update_written_to( | |
296 | segment_type_t type, | |
297 | paddr_t offset) | |
298 | { | |
299 | LOG_PREFIX(segments_info_t::update_written_to); | |
300 | auto& saddr = offset.as_seg_paddr(); | |
301 | auto& segment_info = segments[saddr.get_segment_id()]; | |
302 | if (!segment_info.is_open()) { | |
303 | ERROR("segment is not open, not updating, type={}, offset={}, {}", | |
304 | type, offset, segment_info); | |
305 | ceph_abort(); | |
306 | } | |
307 | ||
308 | auto new_written_to = saddr.get_segment_off(); | |
309 | ceph_assert(new_written_to <= get_segment_size()); | |
310 | if (segment_info.written_to > new_written_to) { | |
311 | ERROR("written_to should not decrease! type={}, offset={}, {}", | |
312 | type, offset, segment_info); | |
313 | ceph_abort(); | |
314 | } | |
315 | ||
316 | DEBUG("type={}, offset={}, {}", type, offset, segment_info); | |
317 | ceph_assert(type == segment_info.type); | |
318 | auto avail_deduction = new_written_to - segment_info.written_to; | |
319 | ceph_assert(avail_bytes_in_open >= (std::size_t)avail_deduction); | |
320 | avail_bytes_in_open -= avail_deduction; | |
321 | segment_info.written_to = new_written_to; | |
322 | } | |
323 | ||
324 | std::ostream &operator<<(std::ostream &os, const segments_info_t &infos) | |
325 | { | |
326 | return os << "segments(" | |
327 | << "empty=" << infos.get_num_empty() | |
328 | << ", open=" << infos.get_num_open() | |
329 | << ", closed=" << infos.get_num_closed() | |
330 | << ", type_journal=" << infos.get_num_type_journal() | |
331 | << ", type_ool=" << infos.get_num_type_ool() | |
332 | << ", total=" << infos.get_total_bytes() << "B" | |
333 | << ", available=" << infos.get_available_bytes() << "B" | |
334 | << ", unavailable=" << infos.get_unavailable_bytes() << "B" | |
335 | << ", available_ratio=" << infos.get_available_ratio() | |
336 | << ", submitted_head=" << infos.get_submitted_journal_head() | |
337 | << ", time_bound=" << sea_time_point_printer_t{infos.get_time_bound()} | |
338 | << ")"; | |
339 | } | |
340 | ||
341 | void JournalTrimmerImpl::config_t::validate() const | |
342 | { | |
343 | ceph_assert(max_journal_bytes <= DEVICE_OFF_MAX); | |
344 | ceph_assert(max_journal_bytes > target_journal_dirty_bytes); | |
345 | ceph_assert(max_journal_bytes > target_journal_alloc_bytes); | |
346 | ceph_assert(rewrite_dirty_bytes_per_cycle > 0); | |
347 | ceph_assert(rewrite_backref_bytes_per_cycle > 0); | |
348 | } | |
349 | ||
350 | JournalTrimmerImpl::config_t | |
351 | JournalTrimmerImpl::config_t::get_default( | |
352 | std::size_t roll_size, journal_type_t type) | |
353 | { | |
354 | assert(roll_size); | |
355 | std::size_t target_dirty_bytes = 0; | |
356 | std::size_t target_alloc_bytes = 0; | |
357 | std::size_t max_journal_bytes = 0; | |
358 | if (type == journal_type_t::SEGMENTED) { | |
359 | target_dirty_bytes = 12 * roll_size; | |
360 | target_alloc_bytes = 2 * roll_size; | |
361 | max_journal_bytes = 16 * roll_size; | |
362 | } else { | |
363 | assert(type == journal_type_t::RANDOM_BLOCK); | |
364 | target_dirty_bytes = roll_size / 4; | |
365 | target_alloc_bytes = roll_size / 4; | |
366 | max_journal_bytes = roll_size / 2; | |
367 | } | |
368 | return config_t{ | |
369 | target_dirty_bytes, | |
370 | target_alloc_bytes, | |
371 | max_journal_bytes, | |
372 | 1<<17,// rewrite_dirty_bytes_per_cycle | |
373 | 1<<24 // rewrite_backref_bytes_per_cycle | |
374 | }; | |
375 | } | |
376 | ||
377 | JournalTrimmerImpl::config_t | |
378 | JournalTrimmerImpl::config_t::get_test( | |
379 | std::size_t roll_size, journal_type_t type) | |
380 | { | |
381 | assert(roll_size); | |
382 | std::size_t target_dirty_bytes = 0; | |
383 | std::size_t target_alloc_bytes = 0; | |
384 | std::size_t max_journal_bytes = 0; | |
385 | if (type == journal_type_t::SEGMENTED) { | |
386 | target_dirty_bytes = 2 * roll_size; | |
387 | target_alloc_bytes = 2 * roll_size; | |
388 | max_journal_bytes = 4 * roll_size; | |
389 | } else { | |
390 | assert(type == journal_type_t::RANDOM_BLOCK); | |
391 | target_dirty_bytes = roll_size / 4; | |
392 | target_alloc_bytes = roll_size / 4; | |
393 | max_journal_bytes = roll_size / 2; | |
394 | } | |
395 | return config_t{ | |
396 | target_dirty_bytes, | |
397 | target_alloc_bytes, | |
398 | max_journal_bytes, | |
399 | 1<<17,// rewrite_dirty_bytes_per_cycle | |
400 | 1<<24 // rewrite_backref_bytes_per_cycle | |
401 | }; | |
402 | } | |
403 | ||
404 | JournalTrimmerImpl::JournalTrimmerImpl( | |
405 | BackrefManager &backref_manager, | |
406 | config_t config, | |
407 | journal_type_t type, | |
408 | device_off_t roll_start, | |
409 | device_off_t roll_size) | |
410 | : backref_manager(backref_manager), | |
411 | config(config), | |
412 | journal_type(type), | |
413 | roll_start(roll_start), | |
414 | roll_size(roll_size), | |
415 | reserved_usage(0) | |
416 | { | |
417 | config.validate(); | |
418 | ceph_assert(roll_start >= 0); | |
419 | ceph_assert(roll_size > 0); | |
420 | register_metrics(); | |
421 | } | |
422 | ||
423 | void JournalTrimmerImpl::set_journal_head(journal_seq_t head) | |
424 | { | |
425 | LOG_PREFIX(JournalTrimmerImpl::set_journal_head); | |
426 | ||
427 | ceph_assert(head != JOURNAL_SEQ_NULL); | |
428 | ceph_assert(journal_head == JOURNAL_SEQ_NULL || | |
429 | head >= journal_head); | |
430 | ceph_assert(journal_alloc_tail == JOURNAL_SEQ_NULL || | |
431 | head >= journal_alloc_tail); | |
432 | ceph_assert(journal_dirty_tail == JOURNAL_SEQ_NULL || | |
433 | head >= journal_dirty_tail); | |
434 | ||
435 | std::swap(journal_head, head); | |
436 | if (journal_head.segment_seq == head.segment_seq) { | |
437 | DEBUG("journal_head {} => {}, {}", | |
438 | head, journal_head, stat_printer_t{*this, false}); | |
439 | } else { | |
440 | INFO("journal_head {} => {}, {}", | |
441 | head, journal_head, stat_printer_t{*this, false}); | |
442 | } | |
443 | background_callback->maybe_wake_background(); | |
444 | } | |
445 | ||
446 | void JournalTrimmerImpl::update_journal_tails( | |
447 | journal_seq_t dirty_tail, | |
448 | journal_seq_t alloc_tail) | |
449 | { | |
450 | LOG_PREFIX(JournalTrimmerImpl::update_journal_tails); | |
451 | ||
452 | if (dirty_tail != JOURNAL_SEQ_NULL) { | |
453 | ceph_assert(journal_head == JOURNAL_SEQ_NULL || | |
454 | journal_head >= dirty_tail); | |
455 | if (journal_dirty_tail != JOURNAL_SEQ_NULL && | |
456 | journal_dirty_tail > dirty_tail) { | |
457 | ERROR("journal_dirty_tail {} => {} is backwards!", | |
458 | journal_dirty_tail, dirty_tail); | |
459 | ceph_abort(); | |
460 | } | |
461 | std::swap(journal_dirty_tail, dirty_tail); | |
462 | if (journal_dirty_tail.segment_seq == dirty_tail.segment_seq) { | |
463 | DEBUG("journal_dirty_tail {} => {}, {}", | |
464 | dirty_tail, journal_dirty_tail, stat_printer_t{*this, false}); | |
465 | } else { | |
466 | INFO("journal_dirty_tail {} => {}, {}", | |
467 | dirty_tail, journal_dirty_tail, stat_printer_t{*this, false}); | |
468 | } | |
469 | } | |
470 | ||
471 | if (alloc_tail != JOURNAL_SEQ_NULL) { | |
472 | ceph_assert(journal_head == JOURNAL_SEQ_NULL || | |
473 | journal_head >= alloc_tail); | |
474 | if (journal_alloc_tail != JOURNAL_SEQ_NULL && | |
475 | journal_alloc_tail > alloc_tail) { | |
476 | ERROR("journal_alloc_tail {} => {} is backwards!", | |
477 | journal_alloc_tail, alloc_tail); | |
478 | ceph_abort(); | |
479 | } | |
480 | std::swap(journal_alloc_tail, alloc_tail); | |
481 | if (journal_alloc_tail.segment_seq == alloc_tail.segment_seq) { | |
482 | DEBUG("journal_alloc_tail {} => {}, {}", | |
483 | alloc_tail, journal_alloc_tail, stat_printer_t{*this, false}); | |
484 | } else { | |
485 | INFO("journal_alloc_tail {} => {}, {}", | |
486 | alloc_tail, journal_alloc_tail, stat_printer_t{*this, false}); | |
487 | } | |
488 | } | |
489 | ||
490 | background_callback->maybe_wake_background(); | |
491 | background_callback->maybe_wake_blocked_io(); | |
492 | } | |
493 | ||
494 | journal_seq_t JournalTrimmerImpl::get_tail_limit() const | |
495 | { | |
496 | assert(background_callback->is_ready()); | |
497 | auto ret = journal_head.add_offset( | |
498 | journal_type, | |
499 | -static_cast<device_off_t>(config.max_journal_bytes), | |
500 | roll_start, | |
501 | roll_size); | |
502 | return ret; | |
503 | } | |
504 | ||
505 | journal_seq_t JournalTrimmerImpl::get_dirty_tail_target() const | |
506 | { | |
507 | assert(background_callback->is_ready()); | |
508 | auto ret = journal_head.add_offset( | |
509 | journal_type, | |
510 | -static_cast<device_off_t>(config.target_journal_dirty_bytes), | |
511 | roll_start, | |
512 | roll_size); | |
513 | return ret; | |
514 | } | |
515 | ||
516 | journal_seq_t JournalTrimmerImpl::get_alloc_tail_target() const | |
517 | { | |
518 | assert(background_callback->is_ready()); | |
519 | auto ret = journal_head.add_offset( | |
520 | journal_type, | |
521 | -static_cast<device_off_t>(config.target_journal_alloc_bytes), | |
522 | roll_start, | |
523 | roll_size); | |
524 | return ret; | |
525 | } | |
526 | ||
527 | std::size_t JournalTrimmerImpl::get_dirty_journal_size() const | |
528 | { | |
529 | if (!background_callback->is_ready()) { | |
530 | return 0; | |
531 | } | |
532 | auto ret = journal_head.relative_to( | |
533 | journal_type, | |
534 | journal_dirty_tail, | |
535 | roll_start, | |
536 | roll_size); | |
537 | ceph_assert(ret >= 0); | |
538 | return static_cast<std::size_t>(ret); | |
539 | } | |
540 | ||
541 | std::size_t JournalTrimmerImpl::get_alloc_journal_size() const | |
542 | { | |
543 | if (!background_callback->is_ready()) { | |
544 | return 0; | |
545 | } | |
546 | auto ret = journal_head.relative_to( | |
547 | journal_type, | |
548 | journal_alloc_tail, | |
549 | roll_start, | |
550 | roll_size); | |
551 | ceph_assert(ret >= 0); | |
552 | return static_cast<std::size_t>(ret); | |
553 | } | |
554 | ||
555 | seastar::future<> JournalTrimmerImpl::trim() { | |
556 | return seastar::when_all( | |
557 | [this] { | |
558 | if (should_trim_alloc()) { | |
559 | return trim_alloc( | |
560 | ).handle_error( | |
561 | crimson::ct_error::assert_all{ | |
562 | "encountered invalid error in trim_alloc" | |
563 | } | |
564 | ); | |
565 | } else { | |
566 | return seastar::now(); | |
567 | } | |
568 | }, | |
569 | [this] { | |
570 | if (should_trim_dirty()) { | |
571 | return trim_dirty( | |
572 | ).handle_error( | |
573 | crimson::ct_error::assert_all{ | |
574 | "encountered invalid error in trim_dirty" | |
575 | } | |
576 | ); | |
577 | } else { | |
578 | return seastar::now(); | |
579 | } | |
580 | } | |
581 | ).discard_result(); | |
582 | } | |
583 | ||
584 | JournalTrimmerImpl::trim_ertr::future<> | |
585 | JournalTrimmerImpl::trim_alloc() | |
586 | { | |
587 | LOG_PREFIX(JournalTrimmerImpl::trim_alloc); | |
588 | assert(background_callback->is_ready()); | |
589 | return repeat_eagain([this, FNAME] { | |
590 | return extent_callback->with_transaction_intr( | |
591 | Transaction::src_t::TRIM_ALLOC, | |
592 | "trim_alloc", | |
593 | [this, FNAME](auto &t) | |
594 | { | |
595 | auto target = get_alloc_tail_target(); | |
596 | DEBUGT("start, alloc_tail={}, target={}", | |
597 | t, journal_alloc_tail, target); | |
598 | return backref_manager.merge_cached_backrefs( | |
599 | t, | |
600 | target, | |
601 | config.rewrite_backref_bytes_per_cycle | |
602 | ).si_then([this, FNAME, &t](auto trim_alloc_to) | |
603 | -> ExtentCallbackInterface::submit_transaction_direct_iertr::future<> | |
604 | { | |
605 | DEBUGT("trim_alloc_to={}", t, trim_alloc_to); | |
606 | if (trim_alloc_to != JOURNAL_SEQ_NULL) { | |
607 | return extent_callback->submit_transaction_direct( | |
608 | t, std::make_optional<journal_seq_t>(trim_alloc_to)); | |
609 | } | |
610 | return seastar::now(); | |
611 | }); | |
612 | }); | |
613 | }).safe_then([this, FNAME] { | |
614 | DEBUG("finish, alloc_tail={}", journal_alloc_tail); | |
615 | }); | |
616 | } | |
617 | ||
618 | JournalTrimmerImpl::trim_ertr::future<> | |
619 | JournalTrimmerImpl::trim_dirty() | |
620 | { | |
621 | LOG_PREFIX(JournalTrimmerImpl::trim_dirty); | |
622 | assert(background_callback->is_ready()); | |
623 | return repeat_eagain([this, FNAME] { | |
624 | return extent_callback->with_transaction_intr( | |
625 | Transaction::src_t::TRIM_DIRTY, | |
626 | "trim_dirty", | |
627 | [this, FNAME](auto &t) | |
628 | { | |
629 | auto target = get_dirty_tail_target(); | |
630 | DEBUGT("start, dirty_tail={}, target={}", | |
631 | t, journal_dirty_tail, target); | |
632 | return extent_callback->get_next_dirty_extents( | |
633 | t, | |
634 | target, | |
635 | config.rewrite_dirty_bytes_per_cycle | |
636 | ).si_then([this, FNAME, &t](auto dirty_list) { | |
637 | DEBUGT("rewrite {} dirty extents", t, dirty_list.size()); | |
638 | return seastar::do_with( | |
639 | std::move(dirty_list), | |
640 | [this, &t](auto &dirty_list) | |
641 | { | |
642 | return trans_intr::do_for_each( | |
643 | dirty_list, | |
644 | [this, &t](auto &e) { | |
645 | return extent_callback->rewrite_extent( | |
646 | t, e, INIT_GENERATION, NULL_TIME); | |
647 | }); | |
648 | }); | |
649 | }).si_then([this, &t] { | |
650 | return extent_callback->submit_transaction_direct(t); | |
651 | }); | |
652 | }); | |
653 | }).safe_then([this, FNAME] { | |
654 | DEBUG("finish, dirty_tail={}", journal_dirty_tail); | |
655 | }); | |
656 | } | |
657 | ||
658 | void JournalTrimmerImpl::register_metrics() | |
659 | { | |
660 | namespace sm = seastar::metrics; | |
661 | metrics.add_group("journal_trimmer", { | |
662 | sm::make_counter("dirty_journal_bytes", | |
663 | [this] { return get_dirty_journal_size(); }, | |
664 | sm::description("the size of the journal for dirty extents")), | |
665 | sm::make_counter("alloc_journal_bytes", | |
666 | [this] { return get_alloc_journal_size(); }, | |
667 | sm::description("the size of the journal for alloc info")) | |
668 | }); | |
669 | } | |
670 | ||
671 | std::ostream &operator<<( | |
672 | std::ostream &os, const JournalTrimmerImpl::stat_printer_t &stats) | |
673 | { | |
674 | os << "JournalTrimmer("; | |
675 | if (stats.trimmer.background_callback->is_ready()) { | |
676 | os << "should_block_io_on_trim=" << stats.trimmer.should_block_io_on_trim() | |
677 | << ", should_(trim_dirty=" << stats.trimmer.should_trim_dirty() | |
678 | << ", trim_alloc=" << stats.trimmer.should_trim_alloc() << ")"; | |
679 | } else { | |
680 | os << "not-ready"; | |
681 | } | |
682 | if (stats.detailed) { | |
683 | os << ", journal_head=" << stats.trimmer.get_journal_head() | |
684 | << ", alloc_tail=" << stats.trimmer.get_alloc_tail() | |
685 | << ", dirty_tail=" << stats.trimmer.get_dirty_tail(); | |
686 | if (stats.trimmer.background_callback->is_ready()) { | |
687 | os << ", alloc_tail_target=" << stats.trimmer.get_alloc_tail_target() | |
688 | << ", dirty_tail_target=" << stats.trimmer.get_dirty_tail_target() | |
689 | << ", tail_limit=" << stats.trimmer.get_tail_limit(); | |
690 | } | |
691 | } | |
692 | os << ")"; | |
693 | return os; | |
694 | } | |
695 | ||
696 | bool SpaceTrackerSimple::equals(const SpaceTrackerI &_other) const | |
697 | { | |
698 | LOG_PREFIX(SpaceTrackerSimple::equals); | |
699 | const auto &other = static_cast<const SpaceTrackerSimple&>(_other); | |
700 | ||
701 | if (other.live_bytes_by_segment.size() != live_bytes_by_segment.size()) { | |
702 | ERROR("different segment counts, bug in test"); | |
703 | assert(0 == "segment counts should match"); | |
704 | return false; | |
705 | } | |
706 | ||
707 | bool all_match = true; | |
708 | for (auto i = live_bytes_by_segment.begin(), j = other.live_bytes_by_segment.begin(); | |
709 | i != live_bytes_by_segment.end(); ++i, ++j) { | |
710 | if (i->second.live_bytes != j->second.live_bytes) { | |
711 | all_match = false; | |
712 | DEBUG("segment_id {} live bytes mismatch *this: {}, other: {}", | |
713 | i->first, i->second.live_bytes, j->second.live_bytes); | |
714 | } | |
715 | } | |
716 | return all_match; | |
717 | } | |
718 | ||
719 | int64_t SpaceTrackerDetailed::SegmentMap::allocate( | |
720 | device_segment_id_t segment, | |
721 | segment_off_t offset, | |
722 | extent_len_t len, | |
723 | const extent_len_t block_size) | |
724 | { | |
725 | LOG_PREFIX(SegmentMap::allocate); | |
726 | assert(offset % block_size == 0); | |
727 | assert(len % block_size == 0); | |
728 | ||
729 | const auto b = (offset / block_size); | |
730 | const auto e = (offset + len) / block_size; | |
731 | ||
732 | bool error = false; | |
733 | for (auto i = b; i < e; ++i) { | |
734 | if (bitmap[i]) { | |
735 | if (!error) { | |
736 | ERROR("found allocated in {}, {} ~ {}", segment, offset, len); | |
737 | error = true; | |
738 | } | |
739 | DEBUG("block {} allocated", i * block_size); | |
740 | } | |
741 | bitmap[i] = true; | |
742 | } | |
743 | return update_usage(len); | |
744 | } | |
745 | ||
746 | int64_t SpaceTrackerDetailed::SegmentMap::release( | |
747 | device_segment_id_t segment, | |
748 | segment_off_t offset, | |
749 | extent_len_t len, | |
750 | const extent_len_t block_size) | |
751 | { | |
752 | LOG_PREFIX(SegmentMap::release); | |
753 | assert(offset % block_size == 0); | |
754 | assert(len % block_size == 0); | |
755 | ||
756 | const auto b = (offset / block_size); | |
757 | const auto e = (offset + len) / block_size; | |
758 | ||
759 | bool error = false; | |
760 | for (auto i = b; i < e; ++i) { | |
761 | if (!bitmap[i]) { | |
762 | if (!error) { | |
763 | ERROR("found unallocated in {}, {} ~ {}", segment, offset, len); | |
764 | error = true; | |
765 | } | |
766 | DEBUG("block {} unallocated", i * block_size); | |
767 | } | |
768 | bitmap[i] = false; | |
769 | } | |
770 | return update_usage(-(int64_t)len); | |
771 | } | |
772 | ||
773 | bool SpaceTrackerDetailed::equals(const SpaceTrackerI &_other) const | |
774 | { | |
775 | LOG_PREFIX(SpaceTrackerDetailed::equals); | |
776 | const auto &other = static_cast<const SpaceTrackerDetailed&>(_other); | |
777 | ||
778 | if (other.segment_usage.size() != segment_usage.size()) { | |
779 | ERROR("different segment counts, bug in test"); | |
780 | assert(0 == "segment counts should match"); | |
781 | return false; | |
782 | } | |
783 | ||
784 | bool all_match = true; | |
785 | for (auto i = segment_usage.begin(), j = other.segment_usage.begin(); | |
786 | i != segment_usage.end(); ++i, ++j) { | |
787 | if (i->second.get_usage() != j->second.get_usage()) { | |
788 | all_match = false; | |
789 | ERROR("segment_id {} live bytes mismatch *this: {}, other: {}", | |
790 | i->first, i->second.get_usage(), j->second.get_usage()); | |
791 | } | |
792 | } | |
793 | return all_match; | |
794 | } | |
795 | ||
796 | void SpaceTrackerDetailed::SegmentMap::dump_usage(extent_len_t block_size) const | |
797 | { | |
798 | LOG_PREFIX(SegmentMap::dump_usage); | |
799 | INFO("dump start"); | |
800 | for (unsigned i = 0; i < bitmap.size(); ++i) { | |
801 | if (bitmap[i]) { | |
802 | LOCAL_LOGGER.info(" {} still live", i * block_size); | |
803 | } | |
804 | } | |
805 | } | |
806 | ||
807 | void SpaceTrackerDetailed::dump_usage(segment_id_t id) const | |
808 | { | |
809 | LOG_PREFIX(SpaceTrackerDetailed::dump_usage); | |
810 | INFO("{}", id); | |
811 | segment_usage[id].dump_usage( | |
812 | block_size_by_segment_manager[id.device_id()]); | |
813 | } | |
814 | ||
815 | void SpaceTrackerSimple::dump_usage(segment_id_t id) const | |
816 | { | |
817 | LOG_PREFIX(SpaceTrackerSimple::dump_usage); | |
818 | INFO("id: {}, live_bytes: {}", | |
819 | id, live_bytes_by_segment[id].live_bytes); | |
820 | } | |
821 | ||
822 | std::ostream &operator<<( | |
823 | std::ostream &os, const AsyncCleaner::stat_printer_t &stats) | |
824 | { | |
825 | stats.cleaner.print(os, stats.detailed); | |
826 | return os; | |
827 | } | |
828 | ||
829 | SegmentCleaner::SegmentCleaner( | |
830 | config_t config, | |
831 | SegmentManagerGroupRef&& sm_group, | |
832 | BackrefManager &backref_manager, | |
833 | SegmentSeqAllocator &segment_seq_allocator, | |
834 | bool detailed, | |
835 | bool is_cold) | |
836 | : detailed(detailed), | |
837 | is_cold(is_cold), | |
838 | config(config), | |
839 | sm_group(std::move(sm_group)), | |
840 | backref_manager(backref_manager), | |
841 | ool_segment_seq_allocator(segment_seq_allocator) | |
842 | { | |
843 | config.validate(); | |
844 | } | |
845 | ||
846 | void SegmentCleaner::register_metrics() | |
847 | { | |
848 | namespace sm = seastar::metrics; | |
849 | stats.segment_util.buckets.resize(UTIL_BUCKETS); | |
850 | std::size_t i; | |
851 | for (i = 0; i < UTIL_BUCKETS; ++i) { | |
852 | stats.segment_util.buckets[i].upper_bound = ((double)(i + 1)) / 10; | |
853 | stats.segment_util.buckets[i].count = 0; | |
854 | } | |
855 | // NOTE: by default the segments are empty | |
856 | i = get_bucket_index(UTIL_STATE_EMPTY); | |
857 | stats.segment_util.buckets[i].count = segments.get_num_segments(); | |
858 | ||
859 | std::string prefix; | |
860 | if (is_cold) { | |
861 | prefix.append("cold_"); | |
862 | } | |
863 | prefix.append("segment_cleaner"); | |
864 | ||
865 | metrics.add_group(prefix, { | |
866 | sm::make_counter("segments_number", | |
867 | [this] { return segments.get_num_segments(); }, | |
868 | sm::description("the number of segments")), | |
869 | sm::make_counter("segment_size", | |
870 | [this] { return segments.get_segment_size(); }, | |
871 | sm::description("the bytes of a segment")), | |
872 | sm::make_counter("segments_in_journal", | |
873 | [this] { return get_segments_in_journal(); }, | |
874 | sm::description("the number of segments in journal")), | |
875 | sm::make_counter("segments_type_journal", | |
876 | [this] { return segments.get_num_type_journal(); }, | |
877 | sm::description("the number of segments typed journal")), | |
878 | sm::make_counter("segments_type_ool", | |
879 | [this] { return segments.get_num_type_ool(); }, | |
880 | sm::description("the number of segments typed out-of-line")), | |
881 | sm::make_counter("segments_open", | |
882 | [this] { return segments.get_num_open(); }, | |
883 | sm::description("the number of open segments")), | |
884 | sm::make_counter("segments_empty", | |
885 | [this] { return segments.get_num_empty(); }, | |
886 | sm::description("the number of empty segments")), | |
887 | sm::make_counter("segments_closed", | |
888 | [this] { return segments.get_num_closed(); }, | |
889 | sm::description("the number of closed segments")), | |
890 | ||
891 | sm::make_counter("segments_count_open_journal", | |
892 | [this] { return segments.get_count_open_journal(); }, | |
893 | sm::description("the count of open journal segment operations")), | |
894 | sm::make_counter("segments_count_open_ool", | |
895 | [this] { return segments.get_count_open_ool(); }, | |
896 | sm::description("the count of open ool segment operations")), | |
897 | sm::make_counter("segments_count_release_journal", | |
898 | [this] { return segments.get_count_release_journal(); }, | |
899 | sm::description("the count of release journal segment operations")), | |
900 | sm::make_counter("segments_count_release_ool", | |
901 | [this] { return segments.get_count_release_ool(); }, | |
902 | sm::description("the count of release ool segment operations")), | |
903 | sm::make_counter("segments_count_close_journal", | |
904 | [this] { return segments.get_count_close_journal(); }, | |
905 | sm::description("the count of close journal segment operations")), | |
906 | sm::make_counter("segments_count_close_ool", | |
907 | [this] { return segments.get_count_close_ool(); }, | |
908 | sm::description("the count of close ool segment operations")), | |
909 | ||
910 | sm::make_counter("total_bytes", | |
911 | [this] { return segments.get_total_bytes(); }, | |
912 | sm::description("the size of the space")), | |
913 | sm::make_counter("available_bytes", | |
914 | [this] { return segments.get_available_bytes(); }, | |
915 | sm::description("the size of the space is available")), | |
916 | sm::make_counter("unavailable_unreclaimable_bytes", | |
917 | [this] { return get_unavailable_unreclaimable_bytes(); }, | |
918 | sm::description("the size of the space is unavailable and unreclaimable")), | |
919 | sm::make_counter("unavailable_reclaimable_bytes", | |
920 | [this] { return get_unavailable_reclaimable_bytes(); }, | |
921 | sm::description("the size of the space is unavailable and reclaimable")), | |
922 | sm::make_counter("used_bytes", stats.used_bytes, | |
923 | sm::description("the size of the space occupied by live extents")), | |
924 | sm::make_counter("unavailable_unused_bytes", | |
925 | [this] { return get_unavailable_unused_bytes(); }, | |
926 | sm::description("the size of the space is unavailable and not alive")), | |
927 | ||
928 | sm::make_counter("projected_count", stats.projected_count, | |
929 | sm::description("the number of projected usage reservations")), | |
930 | sm::make_counter("projected_used_bytes_sum", stats.projected_used_bytes_sum, | |
931 | sm::description("the sum of the projected usage in bytes")), | |
932 | ||
933 | sm::make_counter("reclaimed_bytes", stats.reclaimed_bytes, | |
934 | sm::description("rewritten bytes due to reclaim")), | |
935 | sm::make_counter("reclaimed_segment_bytes", stats.reclaimed_segment_bytes, | |
936 | sm::description("rewritten bytes due to reclaim")), | |
937 | sm::make_counter("closed_journal_used_bytes", stats.closed_journal_used_bytes, | |
938 | sm::description("used bytes when close a journal segment")), | |
939 | sm::make_counter("closed_journal_total_bytes", stats.closed_journal_total_bytes, | |
940 | sm::description("total bytes of closed journal segments")), | |
941 | sm::make_counter("closed_ool_used_bytes", stats.closed_ool_used_bytes, | |
942 | sm::description("used bytes when close a ool segment")), | |
943 | sm::make_counter("closed_ool_total_bytes", stats.closed_ool_total_bytes, | |
944 | sm::description("total bytes of closed ool segments")), | |
945 | ||
946 | sm::make_gauge("available_ratio", | |
947 | [this] { return segments.get_available_ratio(); }, | |
948 | sm::description("ratio of available space to total space")), | |
949 | sm::make_gauge("reclaim_ratio", | |
950 | [this] { return get_reclaim_ratio(); }, | |
951 | sm::description("ratio of reclaimable space to unavailable space")), | |
952 | ||
953 | sm::make_histogram("segment_utilization_distribution", | |
954 | [this]() -> seastar::metrics::histogram& { | |
955 | return stats.segment_util; | |
956 | }, | |
957 | sm::description("utilization distribution of all segments")) | |
958 | }); | |
959 | } | |
960 | ||
961 | segment_id_t SegmentCleaner::allocate_segment( | |
962 | segment_seq_t seq, | |
963 | segment_type_t type, | |
964 | data_category_t category, | |
965 | rewrite_gen_t generation) | |
966 | { | |
967 | LOG_PREFIX(SegmentCleaner::allocate_segment); | |
968 | assert(seq != NULL_SEG_SEQ); | |
969 | ceph_assert(type == segment_type_t::OOL || | |
970 | trimmer != nullptr); // segment_type_t::JOURNAL | |
971 | for (auto it = segments.begin(); | |
972 | it != segments.end(); | |
973 | ++it) { | |
974 | auto seg_id = it->first; | |
975 | auto& segment_info = it->second; | |
976 | if (segment_info.is_empty()) { | |
977 | auto old_usage = calc_utilization(seg_id); | |
978 | segments.mark_open(seg_id, seq, type, category, generation); | |
979 | background_callback->maybe_wake_background(); | |
980 | auto new_usage = calc_utilization(seg_id); | |
981 | adjust_segment_util(old_usage, new_usage); | |
982 | INFO("opened, {}", stat_printer_t{*this, false}); | |
983 | return seg_id; | |
984 | } | |
985 | } | |
986 | ERROR("out of space with {} {} {} {}", | |
987 | type, segment_seq_printer_t{seq}, category, | |
988 | rewrite_gen_printer_t{generation}); | |
aee94f69 | 989 | ceph_abort("seastore device size setting is too small"); |
1e59de90 TL |
990 | return NULL_SEG_ID; |
991 | } | |
992 | ||
993 | void SegmentCleaner::close_segment(segment_id_t segment) | |
994 | { | |
995 | LOG_PREFIX(SegmentCleaner::close_segment); | |
996 | auto old_usage = calc_utilization(segment); | |
997 | segments.mark_closed(segment); | |
998 | auto &seg_info = segments[segment]; | |
999 | if (seg_info.type == segment_type_t::JOURNAL) { | |
1000 | stats.closed_journal_used_bytes += space_tracker->get_usage(segment); | |
1001 | stats.closed_journal_total_bytes += segments.get_segment_size(); | |
1002 | } else { | |
1003 | stats.closed_ool_used_bytes += space_tracker->get_usage(segment); | |
1004 | stats.closed_ool_total_bytes += segments.get_segment_size(); | |
1005 | } | |
1006 | auto new_usage = calc_utilization(segment); | |
1007 | adjust_segment_util(old_usage, new_usage); | |
1008 | INFO("closed, {} -- {}", stat_printer_t{*this, false}, seg_info); | |
1009 | } | |
1010 | ||
1011 | double SegmentCleaner::calc_gc_benefit_cost( | |
1012 | segment_id_t id, | |
1013 | const sea_time_point &now_time, | |
1014 | const sea_time_point &bound_time) const | |
1015 | { | |
1016 | double util = calc_utilization(id); | |
1017 | ceph_assert(util >= 0 && util < 1); | |
1018 | if constexpr (gc_formula == gc_formula_t::GREEDY) { | |
1019 | return 1 - util; | |
1020 | } | |
1021 | ||
1022 | if constexpr (gc_formula == gc_formula_t::COST_BENEFIT) { | |
1023 | if (util == 0) { | |
1024 | return std::numeric_limits<double>::max(); | |
1025 | } | |
1026 | auto modify_time = segments[id].modify_time; | |
1027 | double age_segment = modify_time.time_since_epoch().count(); | |
1028 | double age_now = now_time.time_since_epoch().count(); | |
1029 | if (likely(age_now > age_segment)) { | |
1030 | return (1 - util) * (age_now - age_segment) / (2 * util); | |
1031 | } else { | |
1032 | // time is wrong | |
1033 | return (1 - util) / (2 * util); | |
1034 | } | |
1035 | } | |
1036 | ||
1037 | assert(gc_formula == gc_formula_t::BENEFIT); | |
1038 | auto modify_time = segments[id].modify_time; | |
1039 | double age_factor = 0.5; // middle value if age is invalid | |
1040 | if (likely(bound_time != NULL_TIME && | |
1041 | modify_time != NULL_TIME && | |
1042 | now_time > modify_time)) { | |
1043 | assert(modify_time >= bound_time); | |
1044 | double age_bound = bound_time.time_since_epoch().count(); | |
1045 | double age_now = now_time.time_since_epoch().count(); | |
1046 | double age_segment = modify_time.time_since_epoch().count(); | |
1047 | age_factor = (age_now - age_segment) / (age_now - age_bound); | |
1048 | } | |
1049 | return ((1 - 2 * age_factor) * util * util + | |
1050 | (2 * age_factor - 2) * util + 1); | |
1051 | } | |
1052 | ||
// Rewrite the live extents located between reclaim_state->start_pos and
// reclaim_state->end_pos inside a single cleaner transaction and submit it.
//
// The whole body is wrapped in repeat_eagain(); each time the body runs,
// `reclaimed` is reset to 0 and `runs` is incremented.
//
// Parameters:
//   backref_extents - extents copied into the working vector of each attempt
//   pin_list        - backref pins converted into backref query entries
//   reclaimed       - out: sum of get_length() of the extents passed to
//                     rewrite_extent() in the current attempt
//   runs            - in/out: number of attempts made so far
1053 | SegmentCleaner::do_reclaim_space_ret | |
1054 | SegmentCleaner::do_reclaim_space( | |
1055 | const std::vector<CachedExtentRef> &backref_extents, | |
1056 | const backref_pin_list_t &pin_list, | |
1057 | std::size_t &reclaimed, | |
1058 | std::size_t &runs) | |
1059 | { | |
1060 | return repeat_eagain([this, &backref_extents, | |
1061 | &pin_list, &reclaimed, &runs] { | |
1062 | reclaimed = 0; | |
1063 | runs++; | |
// the transaction source depends on whether this is the cold cleaner
1064 | auto src = Transaction::src_t::CLEANER_MAIN; | |
1065 | if (is_cold) { | |
1066 | src = Transaction::src_t::CLEANER_COLD; | |
1067 | } | |
1068 | return extent_callback->with_transaction_intr( | |
1069 | src, | |
1070 | "clean_reclaim_space", | |
1071 | [this, &backref_extents, &pin_list, &reclaimed](auto &t) | |
1072 | { | |
// work on a private copy of backref_extents; live extents found below are
// appended to it
1073 | return seastar::do_with( | |
1074 | std::vector<CachedExtentRef>(backref_extents), | |
1075 | [this, &t, &reclaimed, &pin_list](auto &extents) | |
1076 | { | |
1077 | LOG_PREFIX(SegmentCleaner::do_reclaim_space); | |
1078 | // calculate live extents | |
1079 | auto cached_backref_entries = | |
1080 | backref_manager.get_cached_backref_entries_in_range( | |
1081 | reclaim_state->start_pos, reclaim_state->end_pos); | |
1082 | backref_entry_query_set_t backref_entries; | |
1083 | for (auto &pin : pin_list) { | |
1084 | backref_entries.emplace( | |
1085 | pin->get_key(), | |
1086 | pin->get_val(), | |
1087 | pin->get_length(), | |
1088 | pin->get_type(), | |
1089 | JOURNAL_SEQ_NULL); | |
1090 | } | |
// apply the cached entries on top of the pins: an entry whose laddr is
// L_ADDR_NULL removes the entry with the same paddr, any other entry is added
1091 | for (auto &cached_backref : cached_backref_entries) { | |
1092 | if (cached_backref.laddr == L_ADDR_NULL) { | |
// NOTE(review): the result of find() is dereferenced and erased without
// being compared against end(); this presumably relies on the removed
// paddr always being present in backref_entries -- confirm
1093 | auto it = backref_entries.find(cached_backref.paddr); | |
1094 | assert(it->len == cached_backref.len); | |
1095 | backref_entries.erase(it); | |
1096 | } else { | |
1097 | backref_entries.emplace(cached_backref); | |
1098 | } | |
1099 | } | |
1100 | // retrieve live extents | |
1101 | DEBUGT("start, backref_entries={}, backref_extents={}", | |
1102 | t, backref_entries.size(), extents.size()); | |
1103 | return seastar::do_with( | |
1104 | std::move(backref_entries), | |
1105 | [this, &extents, &t](auto &backref_entries) { | |
1106 | return trans_intr::parallel_for_each( | |
1107 | backref_entries, | |
1108 | [this, &extents, &t](auto &ent) | |
1109 | { | |
1110 | LOG_PREFIX(SegmentCleaner::do_reclaim_space); | |
1111 | TRACET("getting extent of type {} at {}~{}", | |
1112 | t, | |
1113 | ent.type, | |
1114 | ent.paddr, | |
1115 | ent.len); | |
1116 | return extent_callback->get_extents_if_live( | |
1117 | t, ent.type, ent.paddr, ent.laddr, ent.len | |
1118 | ).si_then([FNAME, &extents, &ent, &t](auto list) { | |
// an empty list means nothing at this address is alive any more
1119 | if (list.empty()) { | |
1120 | TRACET("addr {} dead, skipping", t, ent.paddr); | |
1121 | } else { | |
1122 | for (auto &e : list) { | |
1123 | extents.emplace_back(std::move(e)); | |
1124 | } | |
1125 | } | |
1126 | }); | |
1127 | }); | |
1128 | }).si_then([FNAME, &extents, this, &reclaimed, &t] { | |
1129 | DEBUGT("reclaim {} extents", t, extents.size()); | |
1130 | // rewrite live extents | |
// every rewritten extent is given the modify time of the segment being reclaimed
1131 | auto modify_time = segments[reclaim_state->get_segment_id()].modify_time; | |
1132 | return trans_intr::do_for_each( | |
1133 | extents, | |
1134 | [this, modify_time, &t, &reclaimed](auto ext) | |
1135 | { | |
1136 | reclaimed += ext->get_length(); | |
1137 | return extent_callback->rewrite_extent( | |
1138 | t, ext, reclaim_state->target_generation, modify_time); | |
1139 | }); | |
1140 | }); | |
1141 | }).si_then([this, &t] { | |
1142 | return extent_callback->submit_transaction_direct(t); | |
1143 | }); | |
1144 | }); | |
1145 | }); | |
1146 | } | |
1147 | ||
// Run one reclaim cycle.
//
// Steps, as implemented below:
//  1. if no reclaim is in progress, pick a segment with
//     get_next_reclaim_segment() (it must be closed) and create reclaim_state;
//  2. advance the reclaim window by config.reclaim_bytes_per_cycle;
//  3. under a READ transaction wrapped in repeat_eagain(), fetch the backref
//     mappings and the backref extents of the window;
//  4. call do_reclaim_space() with them and account the reclaimed bytes;
//  5. once reclaim_state->is_complete(), release the segment through
//     sm_group, mark it empty and wake blocked IO.
//
// Requires background_callback->is_ready() and can_clean_space().
1148 | SegmentCleaner::clean_space_ret SegmentCleaner::clean_space() | |
1149 | { | |
1150 | LOG_PREFIX(SegmentCleaner::clean_space); | |
1151 | assert(background_callback->is_ready()); | |
1152 | ceph_assert(can_clean_space()); | |
1153 | if (!reclaim_state) { | |
1154 | segment_id_t seg_id = get_next_reclaim_segment(); | |
1155 | auto &segment_info = segments[seg_id]; | |
1156 | INFO("reclaim {} {} start, usage={}, time_bound={}", | |
1157 | seg_id, segment_info, | |
1158 | space_tracker->calc_utilization(seg_id), | |
1159 | sea_time_point_printer_t{segments.get_time_bound()}); | |
1160 | ceph_assert(segment_info.is_closed()); | |
1161 | reclaim_state = reclaim_state_t::create( | |
1162 | seg_id, segment_info.generation, segments.get_segment_size()); | |
1163 | } | |
1164 | reclaim_state->advance(config.reclaim_bytes_per_cycle); | |
1165 | ||
1166 | DEBUG("reclaiming {} {}~{}", | |
1167 | rewrite_gen_printer_t{reclaim_state->generation}, | |
1168 | reclaim_state->start_pos, | |
1169 | reclaim_state->end_pos); | |
// both values are only used by the "duration" DEBUG message further down
1170 | double pavail_ratio = get_projected_available_ratio(); | |
1171 | sea_time_point start = seastar::lowres_system_clock::now(); | |
1172 | ||
1173 | // Backref-tree doesn't support tree-read during tree-updates with parallel | |
1174 | // transactions. So, concurrent transactions between trim and reclaim are | |
1175 | // not allowed right now. | |
1176 | return seastar::do_with( | |
1177 | std::pair<std::vector<CachedExtentRef>, backref_pin_list_t>(), | |
1178 | [this](auto &weak_read_ret) { | |
1179 | return repeat_eagain([this, &weak_read_ret] { | |
1180 | return extent_callback->with_transaction_intr( | |
1181 | Transaction::src_t::READ, | |
1182 | "retrieve_from_backref_tree", | |
1183 | [this, &weak_read_ret](auto &t) { | |
1184 | return backref_manager.get_mappings( | |
1185 | t, | |
1186 | reclaim_state->start_pos, | |
1187 | reclaim_state->end_pos | |
1188 | ).si_then([this, &t, &weak_read_ret](auto pin_list) { | |
1189 | if (!pin_list.empty()) { | |
1190 | auto it = pin_list.begin(); | |
1191 | auto &first_pin = *it; | |
1192 | if (first_pin->get_key() < reclaim_state->start_pos) { | |
1193 | // BackrefManager::get_mappings may include an entry before | |
1194 | // reclaim_state->start_pos, which is semantically inconsistent | |
1195 | // with the requirements of the cleaner | |
1196 | pin_list.erase(it); | |
1197 | } | |
1198 | } | |
1199 | return backref_manager.retrieve_backref_extents_in_range( | |
1200 | t, | |
1201 | reclaim_state->start_pos, | |
1202 | reclaim_state->end_pos | |
1203 | ).si_then([pin_list=std::move(pin_list), | |
1204 | &weak_read_ret](auto extents) mutable { | |
// publish (extents, pins) to the state kept alive by the outer do_with
1205 | weak_read_ret = std::make_pair(std::move(extents), std::move(pin_list)); | |
1206 | }); | |
1207 | }); | |
1208 | }); | |
1209 | }).safe_then([&weak_read_ret] { | |
1210 | return std::move(weak_read_ret); | |
1211 | }); | |
1212 | }).safe_then([this, FNAME, pavail_ratio, start](auto weak_read_ret) { | |
// keep the extents, the pins and the two counters (reclaimed, runs) alive
// for the duration of do_reclaim_space()
1213 | return seastar::do_with( | |
1214 | std::move(weak_read_ret.first), | |
1215 | std::move(weak_read_ret.second), | |
1216 | (size_t)0, | |
1217 | (size_t)0, | |
1218 | [this, FNAME, pavail_ratio, start]( | |
1219 | auto &backref_extents, auto &pin_list, auto &reclaimed, auto &runs) | |
1220 | { | |
1221 | return do_reclaim_space( | |
1222 | backref_extents, | |
1223 | pin_list, | |
1224 | reclaimed, | |
1225 | runs | |
1226 | ).safe_then([this, FNAME, pavail_ratio, start, &reclaimed, &runs] { | |
1227 | stats.reclaiming_bytes += reclaimed; | |
1228 | auto d = seastar::lowres_system_clock::now() - start; | |
1229 | DEBUG("duration: {}, pavail_ratio before: {}, repeats: {}", | |
1230 | d, pavail_ratio, runs); | |
1231 | if (reclaim_state->is_complete()) { | |
1232 | auto segment_to_release = reclaim_state->get_segment_id(); | |
1233 | INFO("reclaim {} finish, reclaimed alive/total={}", | |
1234 | segment_to_release, | |
1235 | stats.reclaiming_bytes/(double)segments.get_segment_size()); | |
// fold the per-segment accumulator into the totals and clear the reclaim
// state before the segment is released
1236 | stats.reclaimed_bytes += stats.reclaiming_bytes; | |
1237 | stats.reclaimed_segment_bytes += segments.get_segment_size(); | |
1238 | stats.reclaiming_bytes = 0; | |
1239 | reclaim_state.reset(); | |
1240 | return sm_group->release_segment(segment_to_release | |
1241 | ).handle_error( | |
1242 | clean_space_ertr::pass_further{}, | |
1243 | crimson::ct_error::assert_all{ | |
1244 | "SegmentCleaner::clean_space encountered invalid error in release_segment" | |
1245 | } | |
1246 | ).safe_then([this, FNAME, segment_to_release] { | |
// a fully reclaimed segment must have no usage left; otherwise dump the
// usage and abort
1247 | auto old_usage = calc_utilization(segment_to_release); | |
1248 | if(unlikely(old_usage != 0)) { | |
1249 | space_tracker->dump_usage(segment_to_release); | |
1250 | ERROR("segment {} old_usage {} != 0", | |
1251 | segment_to_release, old_usage); | |
1252 | ceph_abort(); | |
1253 | } | |
1254 | segments.mark_empty(segment_to_release); | |
1255 | auto new_usage = calc_utilization(segment_to_release); | |
1256 | adjust_segment_util(old_usage, new_usage); | |
1257 | INFO("released {}, {}", | |
1258 | segment_to_release, stat_printer_t{*this, false}); | |
1259 | background_callback->maybe_wake_blocked_io(); | |
1260 | }); | |
1261 | } else { | |
// the segment is not finished yet; the next cycle continues from reclaim_state
1262 | return clean_space_ertr::now(); | |
1263 | } | |
1264 | }); | |
1265 | }); | |
1266 | }); | |
1267 | } | |
1268 | ||
// Rebuild the in-memory segment state from the segment managers.
//
// Recreates the space tracker (detailed or simple, depending on `detailed`),
// resets `segments` and repopulates it from every segment manager, clears
// the statistics and re-registers the metrics.  Then, for each segment, the
// header and the tail are read:
//  - matching header/tail nonce: the tail's modify time and extent count are
//    recorded and the segment is initialized as closed;
//  - mismatching nonce, or enodata while reading the tail: the segment is
//    handled by scan_no_tail_segment();
//  - enoent/enodata reaching the outer handler: treated as success for that
//    segment; input_output_error is passed further, anything else asserts.
//
// Requires background_callback->get_state() == state_t::MOUNT.
1269 | SegmentCleaner::mount_ret SegmentCleaner::mount() | |
1270 | { | |
1271 | LOG_PREFIX(SegmentCleaner::mount); | |
1272 | const auto& sms = sm_group->get_segment_managers(); | |
1273 | INFO("{} segment managers", sms.size()); | |
1274 | ||
1275 | assert(background_callback->get_state() == state_t::MOUNT); | |
1276 | ||
1277 | space_tracker.reset( | |
1278 | detailed ? | |
1279 | (SpaceTrackerI*)new SpaceTrackerDetailed( | |
1280 | sms) : | |
1281 | (SpaceTrackerI*)new SpaceTrackerSimple( | |
1282 | sms)); | |
1283 | ||
1284 | segments.reset(); | |
1285 | for (auto sm : sms) { | |
1286 | segments.add_segment_manager(*sm); | |
1287 | } | |
1288 | segments.assign_ids(); | |
1289 | ||
// start from clean statistics; the metrics are dropped and registered again
1290 | stats = {}; | |
1291 | metrics.clear(); | |
1292 | register_metrics(); | |
1293 | ||
1294 | INFO("{} segments", segments.get_num_segments()); | |
1295 | return crimson::do_for_each( | |
1296 | segments.begin(), | |
1297 | segments.end(), | |
1298 | [this, FNAME](auto& it) | |
1299 | { | |
1300 | auto segment_id = it.first; | |
1301 | return sm_group->read_segment_header( | |
1302 | segment_id | |
1303 | ).safe_then([segment_id, this, FNAME](auto header) { | |
1304 | DEBUG("segment_id={} -- {}", segment_id, header); | |
1305 | auto s_type = header.get_type(); | |
1306 | if (s_type == segment_type_t::NULL_SEG) { | |
1307 | ERROR("got null segment, segment_id={} -- {}", segment_id, header); | |
1308 | ceph_abort(); | |
1309 | } | |
1310 | return sm_group->read_segment_tail( | |
1311 | segment_id | |
1312 | ).safe_then([this, FNAME, segment_id, header](auto tail) | |
1313 | -> scan_extents_ertr::future<> { | |
// a tail whose nonce differs from the header's is not used; fall back to
// scanning the segment
1314 | if (tail.segment_nonce != header.segment_nonce) { | |
1315 | return scan_no_tail_segment(header, segment_id); | |
1316 | } | |
1317 | ceph_assert(header.get_type() == tail.get_type()); | |
1318 | ||
1319 | sea_time_point modify_time = mod_to_timepoint(tail.modify_time); | |
1320 | std::size_t num_extents = tail.num_extents; | |
// accepted combinations: NULL_TIME with zero extents, or a real time with
// a non-zero extent count; anything else is reported as an IO error
1321 | if ((modify_time == NULL_TIME && num_extents == 0) || | |
1322 | (modify_time != NULL_TIME && num_extents != 0)) { | |
1323 | segments.update_modify_time(segment_id, modify_time, num_extents); | |
1324 | } else { | |
1325 | ERROR("illegal modify time {}", tail); | |
1326 | return crimson::ct_error::input_output_error::make(); | |
1327 | } | |
1328 | ||
1329 | init_mark_segment_closed( | |
1330 | segment_id, | |
1331 | header.segment_seq, | |
1332 | header.type, | |
1333 | header.category, | |
1334 | header.generation); | |
1335 | return seastar::now(); | |
1336 | }).handle_error( | |
// enodata while reading the tail: fall back to scanning the segment
1337 | crimson::ct_error::enodata::handle( | |
1338 | [this, header, segment_id](auto) { | |
1339 | return scan_no_tail_segment(header, segment_id); | |
1340 | }), | |
1341 | crimson::ct_error::pass_further_all{} | |
1342 | ); | |
1343 | }).handle_error( | |
// enoent / enodata at this level are swallowed and the segment is left untouched
1344 | crimson::ct_error::enoent::handle([](auto) { | |
1345 | return mount_ertr::now(); | |
1346 | }), | |
1347 | crimson::ct_error::enodata::handle([](auto) { | |
1348 | return mount_ertr::now(); | |
1349 | }), | |
1350 | crimson::ct_error::input_output_error::pass_further{}, | |
1351 | crimson::ct_error::assert_all{"unexpected error"} | |
1352 | ); | |
1353 | }).safe_then([this, FNAME] { | |
1354 | INFO("done, {}", segments); | |
1355 | }); | |
1356 | } | |
1357 | ||
// Recover the modify time / extent count of a segment by scanning its
// records, then initialize it as closed.
//
// The scan starts at offset 0 of `segment_id` and uses the nonce from
// `segment_header`.  For every record group found, the record headers are
// decoded and each record's modify time and extent count are fed into
// segments.update_modify_time().  input_output_error is returned when the
// headers cannot be decoded or when a record has extents but a NULL_TIME
// modify time.  After the scan the segment is initialized as closed using
// the sequence, type, category and generation from `segment_header`.
1358 | SegmentCleaner::scan_extents_ret SegmentCleaner::scan_no_tail_segment( | |
1359 | const segment_header_t &segment_header, | |
1360 | segment_id_t segment_id) | |
1361 | { | |
1362 | LOG_PREFIX(SegmentCleaner::scan_no_tail_segment); | |
1363 | INFO("scan {} {}", segment_id, segment_header); | |
1364 | return seastar::do_with( | |
// NOTE(review): the cursor is seeded with segments[segment_id].seq, not
// segment_header.segment_seq, although init_mark_segment_closed() only
// runs after the scan -- confirm this is the intended sequence
1365 | scan_valid_records_cursor({ | |
1366 | segments[segment_id].seq, | |
1367 | paddr_t::make_seg_paddr(segment_id, 0) | |
1368 | }), | |
1369 | SegmentManagerGroup::found_record_handler_t( | |
1370 | [this, segment_id, segment_header, FNAME]( | |
1371 | record_locator_t locator, | |
1372 | const record_group_header_t &record_group_header, | |
1373 | const bufferlist& mdbuf | |
1374 | ) mutable -> SegmentManagerGroup::scan_valid_records_ertr::future<> | |
1375 | { | |
1376 | DEBUG("{} {}, decoding {} records", | |
1377 | segment_id, segment_header.get_type(), record_group_header.records); | |
1378 | ||
1379 | auto maybe_headers = try_decode_record_headers( | |
1380 | record_group_header, mdbuf); | |
1381 | if (!maybe_headers) { | |
1382 | // This should be impossible, we did check the crc on the mdbuf | |
1383 | ERROR("unable to decode record headers for record group {}", | |
1384 | locator.record_block_base); | |
1385 | return crimson::ct_error::input_output_error::make(); | |
1386 | } | |
1387 | ||
1388 | for (auto &record_header : *maybe_headers) { | |
1389 | auto modify_time = mod_to_timepoint(record_header.modify_time); | |
// a record without extents may carry any modify time; a record with
// extents must carry a non-NULL one
1390 | if (record_header.extents == 0 || modify_time != NULL_TIME) { | |
1391 | segments.update_modify_time( | |
1392 | segment_id, modify_time, record_header.extents); | |
1393 | } else { | |
1394 | ERROR("illegal modify time {}", record_header); | |
1395 | return crimson::ct_error::input_output_error::make(); | |
1396 | } | |
1397 | } | |
1398 | return seastar::now(); | |
1399 | }), | |
1400 | [this, segment_header](auto &cursor, auto &handler) | |
1401 | { | |
// scan at most one segment's worth of bytes; the scan result is discarded
1402 | return sm_group->scan_valid_records( | |
1403 | cursor, | |
1404 | segment_header.segment_nonce, | |
1405 | segments.get_segment_size(), | |
1406 | handler).discard_result(); | |
1407 | }).safe_then([this, segment_id, segment_header] { | |
1408 | init_mark_segment_closed( | |
1409 | segment_id, | |
1410 | segment_header.segment_seq, | |
1411 | segment_header.type, | |
1412 | segment_header.category, | |
1413 | segment_header.generation); | |
1414 | }); | |
1415 | } | |
1416 | ||
1417 | bool SegmentCleaner::check_usage() | |
1418 | { | |
1419 | SpaceTrackerIRef tracker(space_tracker->make_empty()); | |
1420 | extent_callback->with_transaction_weak( | |
1421 | "check_usage", | |
1422 | [this, &tracker](auto &t) { | |
1423 | return backref_manager.scan_mapped_space( | |
1424 | t, | |
1425 | [&tracker]( | |
1426 | paddr_t paddr, | |
1427 | paddr_t backref_key, | |
1428 | extent_len_t len, | |
1429 | extent_types_t type, | |
1430 | laddr_t laddr) | |
1431 | { | |
1432 | if (paddr.get_addr_type() == paddr_types_t::SEGMENT) { | |
1433 | if (is_backref_node(type)) { | |
1434 | assert(laddr == L_ADDR_NULL); | |
1435 | assert(backref_key != P_ADDR_NULL); | |
1436 | tracker->allocate( | |
1437 | paddr.as_seg_paddr().get_segment_id(), | |
1438 | paddr.as_seg_paddr().get_segment_off(), | |
1439 | len); | |
1440 | } else if (laddr == L_ADDR_NULL) { | |
1441 | assert(backref_key == P_ADDR_NULL); | |
1442 | tracker->release( | |
1443 | paddr.as_seg_paddr().get_segment_id(), | |
1444 | paddr.as_seg_paddr().get_segment_off(), | |
1445 | len); | |
1446 | } else { | |
1447 | assert(backref_key == P_ADDR_NULL); | |
1448 | tracker->allocate( | |
1449 | paddr.as_seg_paddr().get_segment_id(), | |
1450 | paddr.as_seg_paddr().get_segment_off(), | |
1451 | len); | |
1452 | } | |
1453 | } | |
1454 | }); | |
1455 | }).unsafe_get0(); | |
1456 | return space_tracker->equals(*tracker); | |
1457 | } | |
1458 | ||
1459 | void SegmentCleaner::mark_space_used( | |
1460 | paddr_t addr, | |
1461 | extent_len_t len) | |
1462 | { | |
1463 | LOG_PREFIX(SegmentCleaner::mark_space_used); | |
1464 | assert(background_callback->get_state() >= state_t::SCAN_SPACE); | |
aee94f69 | 1465 | assert(len); |
1e59de90 TL |
1466 | // TODO: drop |
1467 | if (addr.get_addr_type() != paddr_types_t::SEGMENT) { | |
1468 | return; | |
1469 | } | |
1470 | ||
1471 | auto& seg_addr = addr.as_seg_paddr(); | |
1472 | stats.used_bytes += len; | |
1473 | auto old_usage = calc_utilization(seg_addr.get_segment_id()); | |
1474 | [[maybe_unused]] auto ret = space_tracker->allocate( | |
1475 | seg_addr.get_segment_id(), | |
1476 | seg_addr.get_segment_off(), | |
1477 | len); | |
1478 | auto new_usage = calc_utilization(seg_addr.get_segment_id()); | |
1479 | adjust_segment_util(old_usage, new_usage); | |
1480 | ||
1481 | background_callback->maybe_wake_background(); | |
1482 | assert(ret > 0); | |
1483 | DEBUG("segment {} new len: {}~{}, live_bytes: {}", | |
1484 | seg_addr.get_segment_id(), | |
1485 | addr, | |
1486 | len, | |
1487 | space_tracker->get_usage(seg_addr.get_segment_id())); | |
1488 | } | |
1489 | ||
1490 | void SegmentCleaner::mark_space_free( | |
1491 | paddr_t addr, | |
1492 | extent_len_t len) | |
1493 | { | |
1494 | LOG_PREFIX(SegmentCleaner::mark_space_free); | |
1495 | assert(background_callback->get_state() >= state_t::SCAN_SPACE); | |
aee94f69 | 1496 | assert(len); |
1e59de90 TL |
1497 | // TODO: drop |
1498 | if (addr.get_addr_type() != paddr_types_t::SEGMENT) { | |
1499 | return; | |
1500 | } | |
1501 | ||
1502 | ceph_assert(stats.used_bytes >= len); | |
1503 | stats.used_bytes -= len; | |
1504 | auto& seg_addr = addr.as_seg_paddr(); | |
1505 | ||
1506 | DEBUG("segment {} free len: {}~{}", | |
1507 | seg_addr.get_segment_id(), addr, len); | |
1508 | auto old_usage = calc_utilization(seg_addr.get_segment_id()); | |
1509 | [[maybe_unused]] auto ret = space_tracker->release( | |
1510 | seg_addr.get_segment_id(), | |
1511 | seg_addr.get_segment_off(), | |
1512 | len); | |
1513 | auto new_usage = calc_utilization(seg_addr.get_segment_id()); | |
1514 | adjust_segment_util(old_usage, new_usage); | |
1515 | background_callback->maybe_wake_blocked_io(); | |
1516 | assert(ret >= 0); | |
1517 | DEBUG("segment {} free len: {}~{}, live_bytes: {}", | |
1518 | seg_addr.get_segment_id(), | |
1519 | addr, | |
1520 | len, | |
1521 | space_tracker->get_usage(seg_addr.get_segment_id())); | |
1522 | } | |
1523 | ||
1524 | segment_id_t SegmentCleaner::get_next_reclaim_segment() const | |
1525 | { | |
1526 | LOG_PREFIX(SegmentCleaner::get_next_reclaim_segment); | |
1527 | segment_id_t id = NULL_SEG_ID; | |
1528 | double max_benefit_cost = 0; | |
1529 | sea_time_point now_time; | |
1530 | if constexpr (gc_formula != gc_formula_t::GREEDY) { | |
1531 | now_time = seastar::lowres_system_clock::now(); | |
1532 | } else { | |
1533 | now_time = NULL_TIME; | |
1534 | } | |
1535 | sea_time_point bound_time; | |
1536 | if constexpr (gc_formula == gc_formula_t::BENEFIT) { | |
1537 | bound_time = segments.get_time_bound(); | |
1538 | if (bound_time == NULL_TIME) { | |
1539 | WARN("BENEFIT -- bound_time is NULL_TIME"); | |
1540 | } | |
1541 | } else { | |
1542 | bound_time = NULL_TIME; | |
1543 | } | |
1544 | for (auto& [_id, segment_info] : segments) { | |
1545 | if (segment_info.is_closed() && | |
1546 | (trimmer == nullptr || | |
1547 | !segment_info.is_in_journal(trimmer->get_journal_tail()))) { | |
1548 | double benefit_cost = calc_gc_benefit_cost(_id, now_time, bound_time); | |
1549 | if (benefit_cost > max_benefit_cost) { | |
1550 | id = _id; | |
1551 | max_benefit_cost = benefit_cost; | |
1552 | } | |
1553 | } | |
1554 | } | |
1555 | if (id != NULL_SEG_ID) { | |
1556 | DEBUG("segment {}, benefit_cost {}", | |
1557 | id, max_benefit_cost); | |
1558 | return id; | |
1559 | } else { | |
1560 | ceph_assert(get_segments_reclaimable() == 0); | |
1561 | // see should_clean_space() | |
1562 | ceph_abort("impossible!"); | |
1563 | return NULL_SEG_ID; | |
1564 | } | |
1565 | } | |
1566 | ||
1567 | bool SegmentCleaner::try_reserve_projected_usage(std::size_t projected_usage) | |
1568 | { | |
1569 | assert(background_callback->is_ready()); | |
1570 | stats.projected_used_bytes += projected_usage; | |
1571 | if (should_block_io_on_clean()) { | |
1572 | stats.projected_used_bytes -= projected_usage; | |
1573 | return false; | |
1574 | } else { | |
1575 | ++stats.projected_count; | |
1576 | stats.projected_used_bytes_sum += stats.projected_used_bytes; | |
1577 | return true; | |
1578 | } | |
1579 | } | |
1580 | ||
1581 | void SegmentCleaner::release_projected_usage(std::size_t projected_usage) | |
1582 | { | |
1583 | assert(background_callback->is_ready()); | |
1584 | ceph_assert(stats.projected_used_bytes >= projected_usage); | |
1585 | stats.projected_used_bytes -= projected_usage; | |
1586 | background_callback->maybe_wake_blocked_io(); | |
1587 | } | |
1588 | ||
1589 | void SegmentCleaner::print(std::ostream &os, bool is_detailed) const | |
1590 | { | |
1591 | os << "SegmentCleaner("; | |
1592 | if (background_callback->is_ready()) { | |
1593 | os << "should_block_io_on_clean=" << should_block_io_on_clean() | |
1594 | << ", should_clean=" << should_clean_space(); | |
1595 | } else { | |
1596 | os << "not-ready"; | |
1597 | } | |
1598 | os << ", projected_avail_ratio=" << get_projected_available_ratio() | |
1599 | << ", reclaim_ratio=" << get_reclaim_ratio() | |
1600 | << ", alive_ratio=" << get_alive_ratio(); | |
1601 | if (is_detailed) { | |
1602 | os << ", unavailable_unreclaimable=" | |
1603 | << get_unavailable_unreclaimable_bytes() << "B" | |
1604 | << ", unavailable_reclaimble=" | |
1605 | << get_unavailable_reclaimable_bytes() << "B" | |
1606 | << ", alive=" << stats.used_bytes << "B" | |
1607 | << ", " << segments; | |
1608 | } | |
1609 | os << ")"; | |
1610 | } | |
1611 | ||
1612 | RBMCleaner::RBMCleaner( | |
1613 | RBMDeviceGroupRef&& rb_group, | |
1614 | BackrefManager &backref_manager, | |
1615 | bool detailed) | |
1616 | : detailed(detailed), | |
1617 | rb_group(std::move(rb_group)), | |
1618 | backref_manager(backref_manager) | |
1619 | {} | |
1620 | ||
1621 | void RBMCleaner::print(std::ostream &os, bool is_detailed) const | |
1622 | { | |
1623 | // TODO | |
1624 | return; | |
1625 | } | |
1626 | ||
1627 | void RBMCleaner::mark_space_used( | |
1628 | paddr_t addr, | |
1629 | extent_len_t len) | |
1630 | { | |
1631 | LOG_PREFIX(RBMCleaner::mark_space_used); | |
1632 | assert(addr.get_addr_type() == paddr_types_t::RANDOM_BLOCK); | |
1633 | auto rbms = rb_group->get_rb_managers(); | |
1634 | for (auto rbm : rbms) { | |
1635 | if (addr.get_device_id() == rbm->get_device_id()) { | |
1636 | if (rbm->get_start() <= addr) { | |
1637 | INFO("allocate addr: {} len: {}", addr, len); | |
1638 | stats.used_bytes += len; | |
1639 | rbm->mark_space_used(addr, len); | |
1640 | } | |
1641 | return; | |
1642 | } | |
1643 | } | |
1644 | } | |
1645 | ||
1646 | void RBMCleaner::mark_space_free( | |
1647 | paddr_t addr, | |
1648 | extent_len_t len) | |
1649 | { | |
1650 | LOG_PREFIX(RBMCleaner::mark_space_free); | |
1651 | assert(addr.get_addr_type() == paddr_types_t::RANDOM_BLOCK); | |
1652 | auto rbms = rb_group->get_rb_managers(); | |
1653 | for (auto rbm : rbms) { | |
1654 | if (addr.get_device_id() == rbm->get_device_id()) { | |
1655 | if (rbm->get_start() <= addr) { | |
1656 | INFO("free addr: {} len: {}", addr, len); | |
1657 | ceph_assert(stats.used_bytes >= len); | |
1658 | stats.used_bytes -= len; | |
1659 | rbm->mark_space_free(addr, len); | |
1660 | } | |
1661 | return; | |
1662 | } | |
1663 | } | |
1664 | } | |
1665 | ||
1666 | void RBMCleaner::commit_space_used(paddr_t addr, extent_len_t len) | |
1667 | { | |
1668 | auto rbms = rb_group->get_rb_managers(); | |
1669 | for (auto rbm : rbms) { | |
1670 | if (addr.get_device_id() == rbm->get_device_id()) { | |
1671 | if (rbm->get_start() <= addr) { | |
1672 | rbm->complete_allocation(addr, len); | |
1673 | } | |
1674 | return; | |
1675 | } | |
1676 | } | |
1677 | } | |
1678 | ||
1679 | bool RBMCleaner::try_reserve_projected_usage(std::size_t projected_usage) | |
1680 | { | |
1681 | assert(background_callback->is_ready()); | |
1682 | stats.projected_used_bytes += projected_usage; | |
1683 | return true; | |
1684 | } | |
1685 | ||
1686 | void RBMCleaner::release_projected_usage(std::size_t projected_usage) | |
1687 | { | |
1688 | assert(background_callback->is_ready()); | |
1689 | ceph_assert(stats.projected_used_bytes >= projected_usage); | |
1690 | stats.projected_used_bytes -= projected_usage; | |
1691 | background_callback->maybe_wake_blocked_io(); | |
1692 | } | |
1693 | ||
1694 | RBMCleaner::clean_space_ret RBMCleaner::clean_space() | |
1695 | { | |
1696 | // TODO | |
1697 | return clean_space_ertr::now(); | |
1698 | } | |
1699 | ||
1700 | RBMCleaner::mount_ret RBMCleaner::mount() | |
1701 | { | |
1702 | stats = {}; | |
1703 | register_metrics(); | |
1704 | return seastar::do_with( | |
1705 | rb_group->get_rb_managers(), | |
1706 | [](auto &rbs) { | |
1707 | return crimson::do_for_each( | |
1708 | rbs.begin(), | |
1709 | rbs.end(), | |
1710 | [](auto& it) { | |
1711 | return it->open( | |
1712 | ).handle_error( | |
1713 | crimson::ct_error::input_output_error::pass_further(), | |
1714 | crimson::ct_error::assert_all{ | |
1715 | "Invalid error when opening RBM"} | |
1716 | ); | |
1717 | }); | |
1718 | }); | |
1719 | } | |
1720 | ||
1721 | bool RBMCleaner::check_usage() | |
1722 | { | |
1723 | assert(detailed); | |
1724 | const auto& rbms = rb_group->get_rb_managers(); | |
1725 | RBMSpaceTracker tracker(rbms); | |
1726 | extent_callback->with_transaction_weak( | |
1727 | "check_usage", | |
1728 | [this, &tracker, &rbms](auto &t) { | |
1729 | return backref_manager.scan_mapped_space( | |
1730 | t, | |
1731 | [&tracker, &rbms]( | |
1732 | paddr_t paddr, | |
1733 | paddr_t backref_key, | |
1734 | extent_len_t len, | |
1735 | extent_types_t type, | |
1736 | laddr_t laddr) | |
1737 | { | |
1738 | for (auto rbm : rbms) { | |
1739 | if (rbm->get_device_id() == paddr.get_device_id()) { | |
1740 | if (is_backref_node(type)) { | |
1741 | assert(laddr == L_ADDR_NULL); | |
1742 | assert(backref_key != P_ADDR_NULL); | |
1743 | tracker.allocate( | |
1744 | paddr, | |
1745 | len); | |
1746 | } else if (laddr == L_ADDR_NULL) { | |
1747 | assert(backref_key == P_ADDR_NULL); | |
1748 | tracker.release( | |
1749 | paddr, | |
1750 | len); | |
1751 | } else { | |
1752 | assert(backref_key == P_ADDR_NULL); | |
1753 | tracker.allocate( | |
1754 | paddr, | |
1755 | len); | |
1756 | } | |
1757 | } | |
1758 | } | |
1759 | }); | |
1760 | }).unsafe_get0(); | |
1761 | return equals(tracker); | |
1762 | } | |
1763 | ||
1764 | bool RBMCleaner::equals(const RBMSpaceTracker &_other) const | |
1765 | { | |
1766 | LOG_PREFIX(RBMSpaceTracker::equals); | |
1767 | const auto &other = static_cast<const RBMSpaceTracker&>(_other); | |
1768 | auto rbs = rb_group->get_rb_managers(); | |
1769 | //TODO: multiple rbm allocator | |
1770 | auto rbm = rbs[0]; | |
1771 | assert(rbm); | |
1772 | ||
1773 | if (rbm->get_device()->get_available_size() / rbm->get_block_size() | |
1774 | != other.block_usage.size()) { | |
1775 | assert(0 == "block counts should match"); | |
1776 | return false; | |
1777 | } | |
1778 | bool all_match = true; | |
1779 | for (auto i = other.block_usage.begin(); | |
1780 | i != other.block_usage.end(); ++i) { | |
1781 | if (i->first < rbm->get_start().as_blk_paddr().get_device_off()) { | |
1782 | continue; | |
1783 | } | |
1784 | auto addr = i->first; | |
1785 | auto state = rbm->get_extent_state( | |
1786 | convert_abs_addr_to_paddr(addr, rbm->get_device_id()), | |
1787 | rbm->get_block_size()); | |
1788 | if ((i->second.used && state == rbm_extent_state_t::ALLOCATED) || | |
1789 | (!i->second.used && (state == rbm_extent_state_t::FREE || | |
1790 | state == rbm_extent_state_t::RESERVED))) { | |
1791 | // pass | |
1792 | } else { | |
1793 | all_match = false; | |
1794 | ERROR("block addr {} mismatch other used: {}", | |
1795 | addr, i->second.used); | |
1796 | } | |
1797 | } | |
1798 | return all_match; | |
1799 | } | |
1800 | ||
1801 | void RBMCleaner::register_metrics() | |
1802 | { | |
1803 | namespace sm = seastar::metrics; | |
1804 | ||
1805 | metrics.add_group("rbm_cleaner", { | |
1806 | sm::make_counter("total_bytes", | |
1807 | [this] { return get_total_bytes(); }, | |
1808 | sm::description("the size of the space")), | |
1809 | sm::make_counter("available_bytes", | |
1810 | [this] { return get_total_bytes() - get_journal_bytes() - stats.used_bytes; }, | |
1811 | sm::description("the size of the space is available")), | |
1812 | sm::make_counter("used_bytes", stats.used_bytes, | |
1813 | sm::description("the size of the space occupied by live extents")), | |
1814 | }); | |
1815 | } | |
1816 | ||
1817 | } |