]> git.proxmox.com Git - ceph.git/blob - ceph/src/crimson/os/seastore/cache.h
import quincy beta 17.1.0
[ceph.git] / ceph / src / crimson / os / seastore / cache.h
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3
4 #pragma once
5
6 #include <iostream>
7
8 #include "seastar/core/shared_future.hh"
9
10 #include "include/buffer.h"
11
12 #include "crimson/os/seastore/logging.h"
13 #include "crimson/os/seastore/seastore_types.h"
14 #include "crimson/os/seastore/transaction.h"
15 #include "crimson/os/seastore/segment_manager.h"
16 #include "crimson/common/errorator.h"
17 #include "crimson/os/seastore/cached_extent.h"
18 #include "crimson/os/seastore/root_block.h"
19 #include "crimson/os/seastore/segment_cleaner.h"
20 #include "crimson/os/seastore/random_block_manager.h"
21
22 namespace crimson::os::seastore {
23
24 /**
25 * Cache
26 *
27 * This component is responsible for buffer management, including
28 * transaction lifecycle.
29 *
30 * Seastore transactions are expressed as an atomic combination of
31 * 1) newly written blocks
32 * 2) logical mutations to existing physical blocks
33 *
34 * See record_t
35 *
36 * As such, any transaction has 3 components:
37 * 1) read_set: references to extents read during the transaction
38 * See Transaction::read_set
39 * 2) write_set: references to extents to be written as:
40 * a) new physical blocks, see Transaction::fresh_block_list
41 * b) mutations to existing physical blocks,
42 * see Transaction::mutated_block_list
43 * 3) retired_set: extent refs to be retired either due to 2b or
44 * due to releasing the extent generally.
 *
46 * In the case of 2b, the CachedExtent will have been copied into
47 * a fresh CachedExtentRef such that the source extent ref is present
48 * in the read set and the newly allocated extent is present in the
49 * write_set.
50 *
51 * A transaction has 3 phases:
52 * 1) construction: user calls Cache::get_transaction() and populates
53 * the returned transaction by calling Cache methods
54 * 2) submission: user calls Cache::try_start_transaction(). If
 * successful, the user may construct a record and submit the
56 * transaction to the journal.
57 * 3) completion: once the transaction is durable, the user must call
58 * Cache::complete_commit() with the block offset to complete
59 * the transaction.
60 *
61 * Internally, in phase 1, the fields in Transaction are filled in.
62 * - reads may block if the referenced extent is being written
63 * - once a read obtains a particular CachedExtentRef for a paddr_t,
64 * it'll always get the same one until overwritten
65 * - once a paddr_t is overwritten or written, subsequent reads of
66 * that addr will get the new ref
67 *
68 * In phase 2, if all extents in the read set are valid (not expired),
69 * we can commit (otherwise, we fail and the user must retry).
70 * - Expire all extents in the retired_set (they must all be valid)
71 * - Remove all extents in the retired_set from Cache::extents
72 * - Mark all extents in the write_set wait_io(), add promises to
73 * transaction
74 * - Merge Transaction::write_set into Cache::extents
75 *
76 * After phase 2, the user will submit the record to the journal.
77 * Once complete, we perform phase 3:
78 * - For each CachedExtent in block_list, call
79 * CachedExtent::complete_initial_write(paddr_t) with the block's
80 * final offset (inferred from the extent's position in the block_list
81 * and extent lengths).
82 * - For each block in mutation_list, call
83 * CachedExtent::delta_written(paddr_t) with the address of the start
84 * of the record
85 * - Complete all promises with the final record start paddr_t
86 */
87 class Cache {
88 public:
  /// Base errorator for cache operations: only I/O errors escape
  using base_ertr = crimson::errorator<
    crimson::ct_error::input_output_error>;
  /// Interruptible (transaction-aware) variant of base_ertr
  using base_iertr = trans_iertr<base_ertr>;

  Cache(ExtentReader &reader);
  ~Cache();
95
96 /// Creates empty transaction by source
97 TransactionRef create_transaction(
98 Transaction::src_t src,
99 const char* name,
100 bool is_weak) {
101 LOG_PREFIX(Cache::create_transaction);
102
103 ++(get_by_src(stats.trans_created_by_src, src));
104
105 auto ret = std::make_unique<Transaction>(
106 get_dummy_ordering_handle(),
107 is_weak,
108 src,
109 last_commit,
110 [this](Transaction& t) {
111 return on_transaction_destruct(t);
112 }
113 );
114 SUBDEBUGT(seastore_cache, "created name={}, source={}, is_weak={}",
115 *ret, name, src, is_weak);
116 return ret;
117 }
118
  /// Resets t for reuse, keeping its ordering handle intact
  void reset_transaction_preserve_handle(Transaction &t) {
    LOG_PREFIX(Cache::reset_transaction_preserve_handle);
    if (t.did_reset()) {
      // a reset transaction counts as a newly created one for stats
      ++(get_by_src(stats.trans_created_by_src, t.get_src()));
    }
    t.reset_preserve_handle(last_commit);
    SUBDEBUGT(seastore_cache, "reset", t);
  }
128
  /**
   * drop_from_cache
   *
   * Drop extent from cache.  Intended for use when
   * ref refers to a logically dead extent as during
   * replay.
   */
  void drop_from_cache(CachedExtentRef ref) {
    // remove_extent handles dirty-list membership and refcounting
    remove_extent(ref);
  }
139
  /// Declare ref retired in t
  void retire_extent(Transaction &t, CachedExtentRef ref) {
    t.add_to_retired_set(ref);
  }

  /// Declare paddr retired in t (extent looked up by address)
  using retire_extent_iertr = base_iertr;
  using retire_extent_ret = base_iertr::future<>;
  retire_extent_ret retire_extent_addr(
    Transaction &t, paddr_t addr, extent_len_t length);

  /**
   * get_root
   *
   * returns ref to current root or t.root if modified in t
   */
  using get_root_iertr = base_iertr;
  using get_root_ret = get_root_iertr::future<RootBlockRef>;
  get_root_ret get_root(Transaction &t);
159
  /**
   * get_root_fast
   *
   * returns t.root and assume it is already present/read in t
   */
  RootBlockRef get_root_fast(Transaction &t) {
    // precondition: t.root has already been populated (e.g. via get_root)
    assert(t.root);
    return t.root;
  }
169
  /**
   * get_extent
   *
   * returns ref to extent at offset~length of type T either from
   * - extent_set if already in cache
   * - disk
   */
  using src_ext_t = std::pair<Transaction::src_t, extent_types_t>;
  using get_extent_ertr = base_ertr;
  template <typename T>
  using get_extent_ret = get_extent_ertr::future<TCachedExtentRef<T>>;
  template <typename T, typename Func>
  get_extent_ret<T> get_extent(
    paddr_t offset,                ///< [in] starting addr
    segment_off_t length,          ///< [in] length
    const src_ext_t* p_metric_key, ///< [in] cache query metric key
    Func &&extent_init_func        ///< [in] init func for extent
  ) {
    auto cached = query_cache(offset, p_metric_key);
    if (!cached) {
      // extent ABSENT from cache: allocate a buffer, index it as
      // CLEAN_PENDING, and kick off the read
      auto ret = CachedExtent::make_cached_extent_ref<T>(
        alloc_cache_buf(length));
      ret->set_paddr(offset);
      ret->state = CachedExtent::extent_state_t::CLEAN_PENDING;
      add_extent(ret);
      extent_init_func(*ret);
      return read_extent<T>(
        std::move(ret));
    }

    // extent PRESENT in cache
    if (cached->get_type() == extent_types_t::RETIRED_PLACEHOLDER) {
      // a retired placeholder carries no data: swap a real extent into
      // the index and read it in
      auto ret = CachedExtent::make_cached_extent_ref<T>(
        alloc_cache_buf(length));
      ret->set_paddr(offset);
      ret->state = CachedExtent::extent_state_t::CLEAN_PENDING;
      extents.replace(*ret, *cached);

      // replace placeholder in transactions
      while (!cached->transactions.empty()) {
        auto t = cached->transactions.begin()->t;
        t->replace_placeholder(*cached, *ret);
      }

      cached->state = CachedExtent::extent_state_t::INVALID;
      extent_init_func(*ret);
      return read_extent<T>(
        std::move(ret));
    } else {
      // a real extent is cached: wait out any in-flight io, then return it
      auto ret = TCachedExtentRef<T>(static_cast<T*>(cached.get()));
      return ret->wait_io(
      ).then([ret=std::move(ret)]() mutable
             -> get_extent_ret<T> {
        // ret may be invalid, caller must check
        return get_extent_ret<T>(
          get_extent_ertr::ready_future_marker{},
          std::move(ret));
      });
    }
  }
  /// get_extent convenience overload with a no-op extent_init_func
  template <typename T>
  get_extent_ret<T> get_extent(
    paddr_t offset,               ///< [in] starting addr
    segment_off_t length,         ///< [in] length
    const src_ext_t* p_metric_key ///< [in] cache query metric key
  ) {
    return get_extent<T>(
      offset, length, p_metric_key,
      [](T &){});
  }
240
  /**
   * get_extent_if_cached
   *
   * Returns extent at offset if in cache
   */
  using get_extent_if_cached_iertr = base_iertr;
  using get_extent_if_cached_ret =
    get_extent_if_cached_iertr::future<CachedExtentRef>;
  get_extent_if_cached_ret get_extent_if_cached(
    Transaction &t,
    paddr_t offset,
    extent_types_t type) {
    CachedExtentRef ret;
    LOG_PREFIX(Cache::get_extent_if_cached);
    // consult the transaction's own view of the extent first
    auto result = t.get_extent(offset, &ret);
    if (result != Transaction::get_extent_ret::ABSENT) {
      // including get_extent_ret::RETIRED
      SUBDEBUGT(seastore_cache,
          "Found extent at offset {} on transaction: {}",
          t, offset, *ret);
      return get_extent_if_cached_iertr::make_ready_future<
        CachedExtentRef>(ret);
    }

    // get_extent_ret::ABSENT from transaction
    auto metric_key = std::make_pair(t.get_src(), type);
    ret = query_cache(offset, &metric_key);
    if (!ret ||
        // retired_placeholder is not really cached yet
        ret->get_type() == extent_types_t::RETIRED_PLACEHOLDER) {
      SUBDEBUGT(seastore_cache,
          "No extent at offset {}, retired_placeholder: {}",
          t, offset, !!ret);
      return get_extent_if_cached_iertr::make_ready_future<
        CachedExtentRef>();
    }

    // present in cache and is not a retired_placeholder
    SUBDEBUGT(seastore_cache,
        "Found extent at offset {} in cache: {}",
        t, offset, *ret);
    t.add_to_read_set(ret);
    touch_extent(*ret);
    // wait out any in-flight io before handing the extent back
    return ret->wait_io().then([ret] {
      return get_extent_if_cached_iertr::make_ready_future<
        CachedExtentRef>(ret);
    });
  }
289
  /**
   * get_extent
   *
   * returns ref to extent at offset~length of type T either from
   * - t if modified by t
   * - extent_set if already in cache
   * - disk
   *
   * t *must not* have retired offset
   */
  using get_extent_iertr = base_iertr;
  template <typename T, typename Func>
  get_extent_iertr::future<TCachedExtentRef<T>> get_extent(
    Transaction &t,
    paddr_t offset,
    segment_off_t length,
    Func &&extent_init_func) {
    CachedExtentRef ret;
    LOG_PREFIX(Cache::get_extent);
    auto result = t.get_extent(offset, &ret);
    if (result != Transaction::get_extent_ret::ABSENT) {
      // the precondition forbids RETIRED here
      assert(result != Transaction::get_extent_ret::RETIRED);
      SUBDEBUGT(seastore_cache,
          "Found extent at offset {} on transaction: {}",
          t, offset, *ret);
      return seastar::make_ready_future<TCachedExtentRef<T>>(
        ret->cast<T>());
    } else {
      auto metric_key = std::make_pair(t.get_src(), T::TYPE);
      return trans_intr::make_interruptible(
        get_extent<T>(
          offset, length, &metric_key,
          std::forward<Func>(extent_init_func))
      ).si_then([this, FNAME, offset, &t](auto ref) {
        (void)this; // silence incorrect clang warning about capture
        if (!ref->is_valid()) {
          // extent was invalidated while the read resolved: mark t
          // conflicted (caller must retry) and return a null ref
          SUBDEBUGT(seastore_cache, "got invalid extent: {}", t, ref);
          ++(get_by_src(stats.trans_conflicts_by_unknown, t.get_src()));
          mark_transaction_conflicted(t, *ref);
          return get_extent_iertr::make_ready_future<TCachedExtentRef<T>>();
        } else {
          SUBDEBUGT(seastore_cache,
              "Read extent at offset {} in cache: {}",
              t, offset, *ref);
          touch_extent(*ref);
          t.add_to_read_set(ref);
          return get_extent_iertr::make_ready_future<TCachedExtentRef<T>>(
            std::move(ref));
        }
      });
    }
  }
  /// transactional get_extent overload with a no-op extent_init_func
  template <typename T>
  get_extent_iertr::future<TCachedExtentRef<T>> get_extent(
    Transaction &t,
    paddr_t offset,
    segment_off_t length) {
    return get_extent<T>(t, offset, length, [](T &){});
  }
349
350
  /**
   * get_extent_by_type
   *
   * Based on type, instantiate the correct concrete type
   * and read in the extent at location offset~length.
   */
private:
  // This is a workaround std::move_only_function not being available,
  // not really worth generalizing at this time.
  class extent_init_func_t {
    // type-erased interface over the wrapped init functor
    struct callable_i {
      virtual void operator()(CachedExtent &extent) = 0;
      virtual ~callable_i() = default;
    };
    // concrete wrapper holding the (possibly move-only) functor
    template <typename Func>
    struct callable_wrapper final : callable_i {
      Func func;
      callable_wrapper(Func &&func) : func(std::forward<Func>(func)) {}
      void operator()(CachedExtent &extent) final {
        return func(extent);
      }
      ~callable_wrapper() final = default;
    };
  public:
    std::unique_ptr<callable_i> wrapped;
    template <typename Func>
    extent_init_func_t(Func &&func) : wrapped(
      std::make_unique<callable_wrapper<Func>>(std::forward<Func>(func)))
    {}
    void operator()(CachedExtent &extent) {
      return (*wrapped)(extent);
    }
  };
  /// non-transactional backend for get_extent_by_type (defined in .cc)
  get_extent_ertr::future<CachedExtentRef> _get_extent_by_type(
    extent_types_t type,
    paddr_t offset,
    laddr_t laddr,
    segment_off_t length,
    const Transaction::src_t* p_src,
    extent_init_func_t &&extent_init_func
  );
392
  using get_extent_by_type_iertr = get_extent_iertr;
  using get_extent_by_type_ret = get_extent_by_type_iertr::future<
    CachedExtentRef>;
  /**
   * _get_extent_by_type (transactional)
   *
   * Consults t first: returns a null ref if t retired offset, the
   * transaction-local ref if present, otherwise falls through to the
   * cache/disk path.
   */
  get_extent_by_type_ret _get_extent_by_type(
    Transaction &t,
    extent_types_t type,
    paddr_t offset,
    laddr_t laddr,
    segment_off_t length,
    extent_init_func_t &&extent_init_func) {
    CachedExtentRef ret;
    auto status = t.get_extent(offset, &ret);
    if (status == Transaction::get_extent_ret::RETIRED) {
      // retired by t: surface as a null ref, caller must handle
      return seastar::make_ready_future<CachedExtentRef>();
    } else if (status == Transaction::get_extent_ret::PRESENT) {
      return seastar::make_ready_future<CachedExtentRef>(ret);
    } else {
      auto src = t.get_src();
      return trans_intr::make_interruptible(
        _get_extent_by_type(
          type, offset, laddr, length, &src,
          std::move(extent_init_func))
      ).si_then([=, &t](CachedExtentRef ret) {
        if (!ret->is_valid()) {
          // invalidated while the read resolved: conflict t and
          // return a null ref
          LOG_PREFIX(Cache::get_extent_by_type);
          SUBDEBUGT(seastore_cache, "got invalid extent: {}", t, ret);
          ++(get_by_src(stats.trans_conflicts_by_unknown, t.get_src()));
          mark_transaction_conflicted(t, *ret.get());
          return get_extent_ertr::make_ready_future<CachedExtentRef>();
        } else {
          touch_extent(*ret);
          t.add_to_read_set(ret);
          return get_extent_ertr::make_ready_future<CachedExtentRef>(
            std::move(ret));
        }
      });
    }
  }
431
public:
  /// Type-dispatched read: wraps the init func and delegates to the
  /// private transactional _get_extent_by_type
  template <typename Func>
  get_extent_by_type_ret get_extent_by_type(
    Transaction &t,          ///< [in] transaction
    extent_types_t type,     ///< [in] type tag
    paddr_t offset,          ///< [in] starting addr
    laddr_t laddr,           ///< [in] logical address if logical
    segment_off_t length,    ///< [in] length
    Func &&extent_init_func  ///< [in] extent init func
  ) {
    return _get_extent_by_type(
      t,
      type,
      offset,
      laddr,
      length,
      extent_init_func_t(std::forward<Func>(extent_init_func)));
  }
  /// get_extent_by_type overload with a no-op extent_init_func
  get_extent_by_type_ret get_extent_by_type(
    Transaction &t,
    extent_types_t type,
    paddr_t offset,
    laddr_t laddr,
    segment_off_t length
  ) {
    return get_extent_by_type(
      t, type, offset, laddr, length, [](CachedExtent &) {});
  }
460
461
462 /**
463 * alloc_new_extent
464 *
465 * Allocates a fresh extent. if delayed is true, addr will be alloc'd later
466 */
467 template <typename T>
468 TCachedExtentRef<T> alloc_new_extent(
469 Transaction &t, ///< [in, out] current transaction
470 segment_off_t length, ///< [in] length
471 bool delayed = false ///< [in] whether the paddr allocation of extent is delayed
472 ) {
473 auto ret = CachedExtent::make_cached_extent_ref<T>(
474 alloc_cache_buf(length));
475 t.add_fresh_extent(ret, delayed);
476 ret->state = CachedExtent::extent_state_t::INITIAL_WRITE_PENDING;
477 return ret;
478 }
479
  /// Drop all extents currently tracked by the lru
  void clear_lru() {
    lru.clear();
  }

  /// Record in t that a delayed-allocation extent will be written inline
  void mark_delayed_extent_inline(
    Transaction& t,
    LogicalCachedExtentRef& ref) {
    t.mark_delayed_extent_inline(ref);
  }

  /// Record in t that a delayed-allocation extent was placed out-of-line
  /// at final_addr
  void mark_delayed_extent_ool(
    Transaction& t,
    LogicalCachedExtentRef& ref,
    paddr_t final_addr) {
    t.mark_delayed_extent_ool(ref, final_addr);
  }
496
  /**
   * alloc_new_extent_by_type
   *
   * Allocates a fresh extent of the dynamically selected type.  addr
   * will be relative until commit.
   */
  CachedExtentRef alloc_new_extent_by_type(
    Transaction &t,       ///< [in, out] current transaction
    extent_types_t type,  ///< [in] type tag
    segment_off_t length, ///< [in] length
    bool delayed = false  ///< [in] whether delay addr allocation
  );

  /**
   * duplicate_for_write
   *
   * Allocates mutable buffer from extent_set on offset~len
   *
   * TODO: Note, currently all implementations literally copy the
   * buffer.  This needn't be true, CachedExtent implementations could
   * choose to refer to the same buffer unmodified until commit and just
   * buffer the mutations in an ancillary data structure.
   *
   * @param current transaction
   * @param extent to duplicate
   * @return mutable extent
   */
  CachedExtentRef duplicate_for_write(
    Transaction &t,   ///< [in, out] current transaction
    CachedExtentRef i ///< [in] ref to existing extent
  );

  /**
   * prepare_record
   *
   * Construct the record for Journal from transaction.
   */
  record_t prepare_record(
    Transaction &t ///< [in, out] current transaction
  );
534
  /**
   * complete_commit
   *
   * Must be called upon completion of write.  Releases blocks on mutating
   * extents, fills in addresses, and calls relevant callbacks on fresh
   * and mutated extents.
   */
  void complete_commit(
    Transaction &t,            ///< [in, out] current transaction
    paddr_t final_block_start, ///< [in] offset of initial block
    journal_seq_t seq,         ///< [in] journal commit seq
    SegmentCleaner *cleaner=nullptr ///< [out] optional segment stat listener
  );

  /**
   * init
   */
  void init();

  /**
   * mkfs
   *
   * Alloc initial root node and add to t.  The intention is for other
   * components to use t to adjust the resulting root ref prior to commit.
   */
  using mkfs_iertr = base_iertr;
  mkfs_iertr::future<> mkfs(Transaction &t);

  /**
   * close
   *
   * TODO: should flush dirty blocks
   */
  using close_ertr = crimson::errorator<
    crimson::ct_error::input_output_error>;
  close_ertr::future<> close();

  /**
   * replay_delta
   *
   * Intended for use in Journal::delta.  For each delta, should decode delta,
   * read relevant block from disk or cache (using correct type), and call
   * CachedExtent::apply_delta marking the extent dirty.
   */
  using replay_delta_ertr = crimson::errorator<
    crimson::ct_error::input_output_error>;
  using replay_delta_ret = replay_delta_ertr::future<>;
  replay_delta_ret replay_delta(
    journal_seq_t seq,
    paddr_t record_block_base,
    const delta_info_t &delta);
586
  /**
   * init_cached_extents
   *
   * Calls passed lambda for each dirty cached block. Intended for use
   * after replay to allow lba_manager (or w/e) to read in any ancestor
   * blocks.
   */
  using init_cached_extents_iertr = base_iertr;
  using init_cached_extents_ret = init_cached_extents_iertr::future<>;
  template <typename F>
  init_cached_extents_ret init_cached_extents(
    Transaction &t,
    F &&f)
  {
    // journal replay should have been finished at this point,
    // Cache::root should have been inserted to the dirty list
    assert(root->is_dirty());
    // NOTE(review): refs are collected up front, presumably so f may
    // safely touch Cache::extents during iteration -- confirm in callers
    std::vector<CachedExtentRef> dirty;
    for (auto &e : extents) {
      dirty.push_back(CachedExtentRef(&e));
    }
    return seastar::do_with(
      std::forward<F>(f),
      std::move(dirty),
      [&t](auto &f, auto &refs) mutable {
        return trans_intr::do_for_each(
          refs,
          [&t, &f](auto &e) { return f(t, e); });
      }).handle_error_interruptible(
        init_cached_extents_iertr::pass_further{},
        crimson::ct_error::assert_all{
          "Invalid error in Cache::init_cached_extents"
        }
      );
  }
622
623 /**
624 * update_extent_from_transaction
625 *
626 * Updates passed extent based on t. If extent has been retired,
627 * a null result will be returned.
628 */
629 CachedExtentRef update_extent_from_transaction(
630 Transaction &t,
631 CachedExtentRef extent) {
632 if (extent->get_type() == extent_types_t::ROOT) {
633 if (t.root) {
634 return t.root;
635 } else {
636 t.add_to_read_set(extent);
637 t.root = extent->cast<RootBlock>();
638 return extent;
639 }
640 } else {
641 auto result = t.get_extent(extent->get_paddr(), &extent);
642 if (result == Transaction::get_extent_ret::RETIRED) {
643 return CachedExtentRef();
644 } else {
645 if (result == Transaction::get_extent_ret::ABSENT) {
646 t.add_to_read_set(extent);
647 }
648 return extent;
649 }
650 }
651 }
652
  /**
   * print
   *
   * Dump summary of contents (TODO)
   */
  std::ostream &print(
    std::ostream &out) const {
    // currently a no-op placeholder
    return out;
  }
662
  /**
   * get_next_dirty_extents
   *
   * Returns extents with get_dirty_from() < seq and adds to read set of
   * t.
   */
  using get_next_dirty_extents_iertr = base_iertr;
  using get_next_dirty_extents_ret = get_next_dirty_extents_iertr::future<
    std::vector<CachedExtentRef>>;
  get_next_dirty_extents_ret get_next_dirty_extents(
    Transaction &t,
    journal_seq_t seq,
    size_t max_bytes); // NOTE(review): presumably caps the total bytes
                       // returned -- confirm against the definition
676
677 /// returns std::nullopt if no dirty extents or get_dirty_from() for oldest
678 std::optional<journal_seq_t> get_oldest_dirty_from() const {
679 if (dirty.empty()) {
680 return std::nullopt;
681 } else {
682 auto oldest = dirty.begin()->get_dirty_from();
683 if (oldest == journal_seq_t()) {
684 return std::nullopt;
685 } else {
686 return oldest;
687 }
688 }
689 }
690
  /// Dump live extents
  void dump_contents();

private:
  ExtentReader &reader;  ///< ref to extent reader
  RootBlockRef root;     ///< ref to current root
  ExtentIndex extents;   ///< set of live extents

  /// seq of the most recent commit; handed to created/reset transactions
  journal_seq_t last_commit = JOURNAL_SEQ_MIN;

  /**
   * dirty
   *
   * holds refs to dirty extents.  Ordered by CachedExtent::get_dirty_from().
   */
  CachedExtent::list dirty;
707
  /**
   * lru
   *
   * holds references to recently used extents
   */
  class LRU {
    // max size (bytes)
    const size_t capacity = 0;

    // current size (bytes)
    size_t contents = 0;

    // front = least recently used, back = most recently used
    CachedExtent::list lru;

    // evict from the front until under capacity
    void trim_to_capacity() {
      while (contents > capacity) {
        assert(lru.size() > 0);
        remove_from_lru(lru.front());
      }
    }

    // append extent (if not already linked); takes an explicit ref via
    // intrusive_ptr_add_ref since the intrusive list itself does not own
    void add_to_lru(CachedExtent &extent) {
      assert(
        extent.is_clean() &&
        !extent.is_pending() &&
        !extent.is_placeholder());

      if (!extent.primary_ref_list_hook.is_linked()) {
        contents += extent.get_length();
        intrusive_ptr_add_ref(&extent);
        lru.push_back(extent);
      }
      trim_to_capacity();
    }

  public:
    LRU(size_t capacity) : capacity(capacity) {}

    size_t get_current_contents_bytes() const {
      return contents;
    }

    size_t get_current_contents_extents() const {
      return lru.size();
    }

    // unlink extent if linked, dropping the ref taken in add_to_lru
    void remove_from_lru(CachedExtent &extent) {
      assert(extent.is_clean());
      assert(!extent.is_pending());
      assert(!extent.is_placeholder());

      if (extent.primary_ref_list_hook.is_linked()) {
        lru.erase(lru.s_iterator_to(extent));
        assert(contents >= extent.get_length());
        contents -= extent.get_length();
        intrusive_ptr_release(&extent);
      }
    }

    // mark extent most-recently-used: unlink (if linked), then re-append
    void move_to_top(CachedExtent &extent) {
      assert(
        extent.is_clean() &&
        !extent.is_pending() &&
        !extent.is_placeholder());

      if (extent.primary_ref_list_hook.is_linked()) {
        lru.erase(lru.s_iterator_to(extent));
        intrusive_ptr_release(&extent);
        assert(contents >= extent.get_length());
        contents -= extent.get_length();
      }
      add_to_lru(extent);
    }

    void clear() {
      LOG_PREFIX(Cache::LRU::clear);
      // post-increment keeps the iterator valid across the unlink
      for (auto iter = lru.begin(); iter != lru.end();) {
        SUBDEBUG(seastore_cache, "clearing {}", *iter);
        remove_from_lru(*(iter++));
      }
    }

    ~LRU() {
      clear();
    }
  } lru;
794
  /// per-source cache lookup counters (maintained by query_cache)
  struct query_counters_t {
    uint64_t access = 0;
    uint64_t hit = 0;
  };
799
800 /**
801 * effort_t
802 *
803 * Count the number of extents involved in the effort and the total bytes of
804 * them.
805 *
806 * Each effort_t represents the effort of a set of extents involved in the
807 * transaction, classified by read, mutate, retire and allocate behaviors,
808 * see XXX_trans_efforts_t.
809 */
810 struct effort_t {
811 uint64_t extents = 0;
812 uint64_t bytes = 0;
813
814 void increment(uint64_t extent_len) {
815 ++extents;
816 bytes += extent_len;
817 }
818 };
819
  /// fixed-size counter table indexed by extent type
  template <typename CounterT>
  using counter_by_extent_t = std::array<CounterT, EXTENT_TYPES_MAX>;

  /// efforts accumulated for transactions that were invalidated
  struct invalid_trans_efforts_t {
    effort_t read;
    effort_t mutate;
    uint64_t mutate_delta_bytes = 0;
    effort_t retire;
    effort_t fresh;
    effort_t fresh_ool_written;
    counter_by_extent_t<uint64_t> num_trans_invalidated;
    uint64_t num_ool_records = 0;
    uint64_t ool_record_bytes = 0;
  };

  /// efforts accumulated for successfully committed transactions
  struct commit_trans_efforts_t {
    counter_by_extent_t<effort_t> read_by_ext;
    counter_by_extent_t<effort_t> mutate_by_ext;
    counter_by_extent_t<uint64_t> delta_bytes_by_ext;
    counter_by_extent_t<effort_t> retire_by_ext;
    counter_by_extent_t<effort_t> fresh_invalid_by_ext; // inline but is already invalid (retired)
    counter_by_extent_t<effort_t> fresh_inline_by_ext;
    counter_by_extent_t<effort_t> fresh_ool_by_ext;
    uint64_t num_trans = 0; // the number of inline records
    uint64_t num_ool_records = 0;
    uint64_t ool_record_padding_bytes = 0;
    uint64_t ool_record_metadata_bytes = 0;
    uint64_t ool_record_data_bytes = 0;
    uint64_t inline_record_metadata_bytes = 0; // metadata exclude the delta bytes
  };

  /// read effort of successful read-only transactions
  struct success_read_trans_efforts_t {
    effort_t read;
    uint64_t num_trans = 0;
  };
855
  /// accumulated tree insert/erase counts
  struct tree_efforts_t {
    uint64_t num_inserts = 0;
    uint64_t num_erases = 0;

    /// fold in one transaction's incremental tree stats
    void increment(const Transaction::tree_stats_t& incremental) {
      num_inserts += incremental.num_inserts;
      num_erases += incremental.num_erases;
    }
  };
865
  /// fixed-size counter table indexed by transaction source
  template <typename CounterT>
  using counter_by_src_t = std::array<CounterT, Transaction::SRC_MAX>;

  // number of distinct unordered source pairs: size of the upper
  // triangle (including the diagonal) of the SRC_MAX x SRC_MAX matrix
  static constexpr std::size_t NUM_SRC_COMB =
    Transaction::SRC_MAX * (Transaction::SRC_MAX + 1) / 2;

  /// aggregated cache/transaction counters (see register_metrics)
  struct {
    counter_by_src_t<uint64_t> trans_created_by_src;
    counter_by_src_t<commit_trans_efforts_t> committed_efforts_by_src;
    counter_by_src_t<invalid_trans_efforts_t> invalidated_efforts_by_src;
    counter_by_src_t<query_counters_t> cache_query_by_src;
    success_read_trans_efforts_t success_read_efforts;
    uint64_t dirty_bytes = 0;

    uint64_t onode_tree_depth = 0;
    counter_by_src_t<tree_efforts_t> committed_onode_tree_efforts;
    counter_by_src_t<tree_efforts_t> invalidated_onode_tree_efforts;

    uint64_t lba_tree_depth = 0;
    counter_by_src_t<tree_efforts_t> committed_lba_tree_efforts;
    counter_by_src_t<tree_efforts_t> invalidated_lba_tree_efforts;

    // conflicts by unordered source pair, indexed via account_conflict
    std::array<uint64_t, NUM_SRC_COMB> trans_conflicts_by_srcs;
    counter_by_src_t<uint64_t> trans_conflicts_by_unknown;
  } stats;
891
892 template <typename CounterT>
893 CounterT& get_by_src(
894 counter_by_src_t<CounterT>& counters_by_src,
895 Transaction::src_t src) {
896 assert(static_cast<std::size_t>(src) < counters_by_src.size());
897 return counters_by_src[static_cast<std::size_t>(src)];
898 }
899
900 template <typename CounterT>
901 CounterT& get_by_ext(
902 counter_by_extent_t<CounterT>& counters_by_ext,
903 extent_types_t ext) {
904 auto index = static_cast<uint8_t>(ext);
905 assert(index < EXTENT_TYPES_MAX);
906 return counters_by_ext[index];
907 }
908
  /// Record a conflict between two transaction sources in the
  /// triangular trans_conflicts_by_srcs table
  void account_conflict(Transaction::src_t src1, Transaction::src_t src2) {
    assert(src1 < Transaction::src_t::MAX);
    assert(src2 < Transaction::src_t::MAX);
    // the pair is unordered: normalize so src1 <= src2
    if (src1 > src2) {
      std::swap(src1, src2);
    }
    // impossible combinations
    // should be consistent with trans_srcs_invalidated in register_metrics()
    assert(!(src1 == Transaction::src_t::READ &&
             src2 == Transaction::src_t::READ));
    assert(!(src1 == Transaction::src_t::CLEANER_TRIM &&
             src2 == Transaction::src_t::CLEANER_TRIM));
    assert(!(src1 == Transaction::src_t::CLEANER_RECLAIM &&
             src2 == Transaction::src_t::CLEANER_RECLAIM));
    assert(!(src1 == Transaction::src_t::CLEANER_TRIM &&
             src2 == Transaction::src_t::CLEANER_RECLAIM));

    // linear index of (src1, src2) within the upper triangle of the
    // SRC_MAX x SRC_MAX matrix
    auto src1_value = static_cast<std::size_t>(src1);
    auto src2_value = static_cast<std::size_t>(src2);
    auto num_srcs = static_cast<std::size_t>(Transaction::src_t::MAX);
    auto conflict_index = num_srcs * src1_value + src2_value -
      src1_value * (src1_value + 1) / 2;
    assert(conflict_index < NUM_SRC_COMB);
    ++stats.trans_conflicts_by_srcs[conflict_index];
  }

  seastar::metrics::metric_group metrics;
  void register_metrics();
937
938 /// alloc buffer for cached extent
939 bufferptr alloc_cache_buf(size_t size) {
940 // TODO: memory pooling etc
941 auto bp = ceph::bufferptr(
942 buffer::create_page_aligned(size));
943 bp.zero();
944 return bp;
945 }
946
947 /// Update lru for access to ref
948 void touch_extent(CachedExtent &ext) {
949 assert(!ext.is_pending());
950 if (ext.is_clean() && !ext.is_placeholder()) {
951 lru.move_to_top(ext);
952 }
953 }
954
  /// Add extent to extents handling dirty and refcounting
  void add_extent(CachedExtentRef ref);

  /// Mark existing extent ref dirty -- mainly for replay
  void mark_dirty(CachedExtentRef ref);

  /// Add dirty extent to dirty list
  void add_to_dirty(CachedExtentRef ref);

  /// Remove from dirty list
  void remove_from_dirty(CachedExtentRef ref);

  /// Remove extent from extents handling dirty and refcounting
  void remove_extent(CachedExtentRef ref);

  /// Retire extent
  void commit_retire_extent(Transaction& t, CachedExtentRef ref);

  /// Replace prev with next
  void commit_replace_extent(Transaction& t, CachedExtentRef next, CachedExtentRef prev);

  /// Invalidate extent and mark affected transactions
  void invalidate_extent(Transaction& t, CachedExtent& extent);

  /// Mark a valid transaction as conflicted
  void mark_transaction_conflicted(
    Transaction& t, CachedExtent& conflicting_extent);

  /// Introspect transaction when it is being destructed
  void on_transaction_destruct(Transaction& t);
985
  /// Read extent data from disk, transitioning CLEAN_PENDING -> CLEAN
  /// and waking io waiters on completion
  template <typename T>
  get_extent_ret<T> read_extent(
    TCachedExtentRef<T>&& extent
  ) {
    assert(extent->state == CachedExtent::extent_state_t::CLEAN_PENDING);
    // block concurrent readers until complete_io() below
    extent->set_io_wait();
    return reader.read(
      extent->get_paddr(),
      extent->get_length(),
      extent->get_bptr()
    ).safe_then(
      [extent=std::move(extent)]() mutable {
        extent->state = CachedExtent::extent_state_t::CLEAN;
        /* TODO: crc should be checked against LBA manager */
        extent->last_committed_crc = extent->get_crc32c();

        extent->on_clean_read();
        extent->complete_io();
        return get_extent_ertr::make_ready_future<TCachedExtentRef<T>>(
          std::move(extent));
      },
      get_extent_ertr::pass_further{},
      crimson::ct_error::assert_all{
        "Cache::get_extent: invalid error"
      }
    );
  }
1013
1014 // Extents in cache may contain placeholders
1015 CachedExtentRef query_cache(
1016 paddr_t offset,
1017 const src_ext_t* p_metric_key) {
1018 query_counters_t* p_counters = nullptr;
1019 if (p_metric_key) {
1020 p_counters = &get_by_src(stats.cache_query_by_src, p_metric_key->first);
1021 ++p_counters->access;
1022 }
1023 if (auto iter = extents.find_offset(offset);
1024 iter != extents.end()) {
1025 if (p_metric_key &&
1026 // retired_placeholder is not really cached yet
1027 iter->get_type() != extent_types_t::RETIRED_PLACEHOLDER) {
1028 ++p_counters->hit;
1029 }
1030 return CachedExtentRef(&*iter);
1031 } else {
1032 return CachedExtentRef();
1033 }
1034 }
1035
1036 };
1037 using CacheRef = std::unique_ptr<Cache>;
1038
1039 }