1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3
4#pragma once
5
6#include <iostream>
7
8#include "seastar/core/shared_future.hh"
9
10#include "include/buffer.h"
12#include "crimson/common/errorator.h"
13#include "crimson/os/seastore/cached_extent.h"
14#include "crimson/os/seastore/extent_placement_manager.h"
15#include "crimson/os/seastore/logging.h"
16#include "crimson/os/seastore/random_block_manager.h"
17#include "crimson/os/seastore/root_block.h"
18#include "crimson/os/seastore/seastore_types.h"
19#include "crimson/os/seastore/segment_manager.h"
20#include "crimson/os/seastore/transaction.h"
21
22namespace crimson::os::seastore::backref {
23class BtreeBackrefManager;
24}
25
26namespace crimson::os::seastore {
27
28template <
29 typename node_key_t,
30 typename node_val_t,
31 typename internal_node_t,
32 typename leaf_node_t,
33 typename pin_t,
34 size_t node_size,
35 bool leaf_has_children>
36class FixedKVBtree;
37class BackrefManager;
38class SegmentProvider;
39
40struct backref_entry_t {
41 backref_entry_t(
42 const paddr_t paddr,
43 const laddr_t laddr,
44 const extent_len_t len,
45 const extent_types_t type,
46 const journal_seq_t seq)
47 : paddr(paddr),
48 laddr(laddr),
49 len(len),
50 type(type),
51 seq(seq)
52 {}
53 backref_entry_t(alloc_blk_t alloc_blk)
54 : paddr(alloc_blk.paddr),
55 laddr(alloc_blk.laddr),
56 len(alloc_blk.len),
57 type(alloc_blk.type)
58 {}
59 paddr_t paddr = P_ADDR_NULL;
60 laddr_t laddr = L_ADDR_NULL;
61 extent_len_t len = 0;
62 extent_types_t type =
63 extent_types_t::ROOT;
64 journal_seq_t seq;
65 friend bool operator< (
66 const backref_entry_t &l,
67 const backref_entry_t &r) {
68 return l.paddr < r.paddr;
69 }
70 friend bool operator> (
71 const backref_entry_t &l,
72 const backref_entry_t &r) {
73 return l.paddr > r.paddr;
74 }
75 friend bool operator== (
76 const backref_entry_t &l,
77 const backref_entry_t &r) {
78 return l.paddr == r.paddr;
79 }
80
81 using set_hook_t =
82 boost::intrusive::set_member_hook<
83 boost::intrusive::link_mode<
84 boost::intrusive::auto_unlink>>;
85 set_hook_t backref_set_hook;
86 using backref_set_member_options = boost::intrusive::member_hook<
87 backref_entry_t,
88 set_hook_t,
89 &backref_entry_t::backref_set_hook>;
90 using multiset_t = boost::intrusive::multiset<
91 backref_entry_t,
92 backref_set_member_options,
93 boost::intrusive::constant_time_size<false>>;
94
95 struct cmp_t {
96 using is_transparent = paddr_t;
97 bool operator()(
98 const backref_entry_t &l,
99 const backref_entry_t &r) const {
100 return l.paddr < r.paddr;
101 }
102 bool operator()(const paddr_t l, const backref_entry_t &r) const {
103 return l < r.paddr;
104 }
105 bool operator()(const backref_entry_t &l, const paddr_t r) const {
106 return l.paddr < r;
107 }
108 };
109};
110
111std::ostream &operator<<(std::ostream &out, const backref_entry_t &ent);
112
113using backref_entry_ref = std::unique_ptr<backref_entry_t>;
114using backref_entry_mset_t = backref_entry_t::multiset_t;
115using backref_entry_refs_t = std::vector<backref_entry_ref>;
116using backref_entryrefs_by_seq_t = std::map<journal_seq_t, backref_entry_refs_t>;
117using backref_entry_query_set_t = std::set<
118 backref_entry_t, backref_entry_t::cmp_t>;
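// Illustrative sketch (not part of the interface): because backref_entry_t::cmp_t
// is a transparent comparator, the intrusive multiset above can be probed directly
// with a paddr_t key, while per-commit batches are kept under their journal_seq_t.
// All names local to this sketch (mset, by_seq, entry, paddr, laddr, len, seq) are
// hypothetical:
//
//   backref_entry_mset_t mset;
//   backref_entryrefs_by_seq_t by_seq;
//   auto entry = std::make_unique<backref_entry_t>(
//     paddr, laddr, len, extent_types_t::ROOT, seq);
//   mset.insert(*entry);                      // indexed by paddr via cmp_t
//   by_seq[seq].push_back(std::move(entry));  // retained per journal sequence
//   auto it = mset.lower_bound(paddr, backref_entry_t::cmp_t());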
119
120/**
121 * Cache
122 *
123 * This component is responsible for buffer management, including
124 * transaction lifecycle.
125 *
126 * Seastore transactions are expressed as an atomic combination of
127 * 1) newly written blocks
128 * 2) logical mutations to existing physical blocks
129 *
130 * See record_t
131 *
132 * As such, any transaction has 3 components:
133 * 1) read_set: references to extents read during the transaction
134 * See Transaction::read_set
135 * 2) write_set: references to extents to be written as:
136 * a) new physical blocks, see Transaction::fresh_block_list
137 * b) mutations to existing physical blocks,
138 * see Transaction::mutated_block_list
139 * 3) retired_set: extent refs to be retired either due to 2b or
140 * due to releasing the extent generally.
141
142 * In the case of 2b, the CachedExtent will have been copied into
143 * a fresh CachedExtentRef such that the source extent ref is present
144 * in the read set and the newly allocated extent is present in the
145 * write_set.
146 *
147 * A transaction has 3 phases:
148 * 1) construction: user calls Cache::get_transaction() and populates
149 * the returned transaction by calling Cache methods
150 * 2) submission: user calls Cache::try_start_transaction(). If
151 * successful, the user may construct a record and submit the
152 * transaction to the journal.
153 * 3) completion: once the transaction is durable, the user must call
154 * Cache::complete_commit() with the block offset to complete
155 * the transaction.
156 *
157 * Internally, in phase 1, the fields in Transaction are filled in.
158 * - reads may block if the referenced extent is being written
159 * - once a read obtains a particular CachedExtentRef for a paddr_t,
160 * it'll always get the same one until overwritten
161 * - once a paddr_t is overwritten or written, subsequent reads of
162 * that addr will get the new ref
163 *
164 * In phase 2, if all extents in the read set are valid (not expired),
165 * we can commit (otherwise, we fail and the user must retry).
166 * - Expire all extents in the retired_set (they must all be valid)
167 * - Remove all extents in the retired_set from Cache::extents
168 * - Mark all extents in the write_set wait_io(), add promises to
169 * transaction
170 * - Merge Transaction::write_set into Cache::extents
171 *
172 * After phase 2, the user will submit the record to the journal.
173 * Once complete, we perform phase 3:
174 * - For each CachedExtent in block_list, call
175 * CachedExtent::complete_initial_write(paddr_t) with the block's
176 * final offset (inferred from the extent's position in the block_list
177 * and extent lengths).
178 * - For each block in mutation_list, call
179 * CachedExtent::delta_written(paddr_t) with the address of the start
180 * of the record
181 * - Complete all promises with the final record start paddr_t
182 *
183 *
184 * Cache logs
185 *
186 * levels:
187 * - INFO: major initiation, closing operations
188 * - DEBUG: major extent related operations, INFO details
189 * - TRACE: DEBUG details
190 * - seastore_t logs
191 */
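// A rough caller-side sketch of the three phases above (illustrative only;
// cache, journal_head, journal_dirty_tail, record_block_base and seq are
// assumed to come from the surrounding TransactionManager/Journal machinery):
//
//   auto t = cache.create_transaction(
//     Transaction::src_t::MUTATE, "example", false);       // 1) construction
//   /* populate *t via get_extent()/alloc_new_extent()/retire_extent() */
//   auto record = cache.prepare_record(*t, journal_head, journal_dirty_tail);
//   /* submit the record to the journal */                  // 2) submission
//   cache.complete_commit(*t, record_block_base, seq);      // 3) completion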
192class Cache {
193public:
194 using base_ertr = crimson::errorator<
195 crimson::ct_error::input_output_error>;
196 using base_iertr = trans_iertr<base_ertr>;
197
198 Cache(ExtentPlacementManager &epm);
199 ~Cache();
200
201 /// Creates empty transaction by source
202 TransactionRef create_transaction(
203 Transaction::src_t src,
204 const char* name,
205 bool is_weak) {
206 LOG_PREFIX(Cache::create_transaction);
207
208 ++(get_by_src(stats.trans_created_by_src, src));
209
210 auto ret = std::make_unique<Transaction>(
211 get_dummy_ordering_handle(),
212 is_weak,
213 src,
214 last_commit,
215 [this](Transaction& t) {
216 return on_transaction_destruct(t);
217 },
218 ++next_id
219 );
220 SUBDEBUGT(seastore_t, "created name={}, source={}, is_weak={}",
221 *ret, name, src, is_weak);
222 assert(!is_weak || src == Transaction::src_t::READ);
223 return ret;
224 }
225
226 /// Resets the transaction, preserving its ordering handle
227 void reset_transaction_preserve_handle(Transaction &t) {
228 LOG_PREFIX(Cache::reset_transaction_preserve_handle);
229 if (t.did_reset()) {
230 SUBTRACET(seastore_t, "reset", t);
231 ++(get_by_src(stats.trans_created_by_src, t.get_src()));
232 }
233 t.reset_preserve_handle(last_commit);
234 }
235
236 /// Declare ref retired in t
237 void retire_extent(Transaction &t, CachedExtentRef ref) {
238 LOG_PREFIX(Cache::retire_extent);
239 SUBDEBUGT(seastore_cache, "retire extent -- {}", t, *ref);
240 t.add_to_retired_set(ref);
241 }
242
243 /// Declare paddr retired in t
244 using retire_extent_iertr = base_iertr;
245 using retire_extent_ret = base_iertr::future<>;
246 retire_extent_ret retire_extent_addr(
247 Transaction &t, paddr_t addr, extent_len_t length);
248
249 /**
250 * get_root
251 *
252 * returns ref to current root or t.root if modified in t
253 */
254 using get_root_iertr = base_iertr;
255 using get_root_ret = get_root_iertr::future<RootBlockRef>;
256 get_root_ret get_root(Transaction &t);
257
258 /**
259 * get_root_fast
260 *
261 * returns t.root and assumes it is already present/read in t
262 */
263 RootBlockRef get_root_fast(Transaction &t) {
264 LOG_PREFIX(Cache::get_root_fast);
265 SUBTRACET(seastore_cache, "root already on t -- {}", t, *t.root);
266 assert(t.root);
267 return t.root;
268 }
269
270 /**
271 * get_extent
272 *
273 * returns ref to extent at offset~length of type T either from
274 * - extent_set if already in cache
275 * - disk
276 */
277 using src_ext_t = std::pair<Transaction::src_t, extent_types_t>;
278 using get_extent_ertr = base_ertr;
279 template <typename T>
280 using get_extent_ret = get_extent_ertr::future<TCachedExtentRef<T>>;
281 template <typename T, typename Func, typename OnCache>
282 get_extent_ret<T> get_extent(
283 paddr_t offset, ///< [in] starting addr
284 extent_len_t length, ///< [in] length
285 const src_ext_t* p_src_ext, ///< [in] cache query metric key
286 Func &&extent_init_func, ///< [in] init func for extent
287 OnCache &&on_cache
288 ) {
289 LOG_PREFIX(Cache::get_extent);
290 auto cached = query_cache(offset, p_src_ext);
291 if (!cached) {
292 auto ret = CachedExtent::make_cached_extent_ref<T>(
293 alloc_cache_buf(length));
294 ret->init(CachedExtent::extent_state_t::CLEAN_PENDING,
295 offset,
296 PLACEMENT_HINT_NULL,
297 NULL_GENERATION,
298 TRANS_ID_NULL);
299 SUBDEBUG(seastore_cache,
300 "{} {}~{} is absent, add extent and reading ... -- {}",
301 T::TYPE, offset, length, *ret);
302 const auto p_src = p_src_ext ? &p_src_ext->first : nullptr;
303 add_extent(ret, p_src);
304 on_cache(*ret);
305 extent_init_func(*ret);
306 return read_extent<T>(
307 std::move(ret));
308 }
309
310 // extent PRESENT in cache
311 if (cached->get_type() == extent_types_t::RETIRED_PLACEHOLDER) {
312 auto ret = CachedExtent::make_cached_extent_ref<T>(
313 alloc_cache_buf(length));
314 ret->init(CachedExtent::extent_state_t::CLEAN_PENDING,
315 offset,
316 PLACEMENT_HINT_NULL,
317 NULL_GENERATION,
318 TRANS_ID_NULL);
319 SUBDEBUG(seastore_cache,
320 "{} {}~{} is absent(placeholder), reading ... -- {}",
321 T::TYPE, offset, length, *ret);
322 extents.replace(*ret, *cached);
323 on_cache(*ret);
324
325 // replace placeholder in transactions
326 while (!cached->transactions.empty()) {
327 auto t = cached->transactions.begin()->t;
328 t->replace_placeholder(*cached, *ret);
329 }
330
331 cached->state = CachedExtent::extent_state_t::INVALID;
332 extent_init_func(*ret);
333 return read_extent<T>(
334 std::move(ret));
335 } else {
336 SUBTRACE(seastore_cache,
337 "{} {}~{} is present in cache -- {}",
338 T::TYPE, offset, length, *cached);
339 auto ret = TCachedExtentRef<T>(static_cast<T*>(cached.get()));
340 on_cache(*ret);
341 return ret->wait_io(
342 ).then([ret=std::move(ret)]() mutable
343 -> get_extent_ret<T> {
344 // ret may be invalid, caller must check
345 return get_extent_ret<T>(
346 get_extent_ertr::ready_future_marker{},
347 std::move(ret));
348 });
349 }
350 }
351 template <typename T>
352 get_extent_ret<T> get_extent(
353 paddr_t offset, ///< [in] starting addr
354 extent_len_t length, ///< [in] length
355 const src_ext_t* p_metric_key ///< [in] cache query metric key
356 ) {
357 return get_extent<T>(
358 offset, length, p_metric_key,
359 [](T &){}, [](T &) {});
360 }
363 /**
364 * get_extent_if_cached
365 *
366 * Returns extent at offset if in cache
367 */
368 using get_extent_if_cached_iertr = base_iertr;
369 using get_extent_if_cached_ret =
370 get_extent_if_cached_iertr::future<CachedExtentRef>;
371 get_extent_if_cached_ret get_extent_if_cached(
372 Transaction &t,
373 paddr_t offset,
374 extent_types_t type) {
375 CachedExtentRef ret;
376 LOG_PREFIX(Cache::get_extent_if_cached);
377 auto result = t.get_extent(offset, &ret);
378 if (result == Transaction::get_extent_ret::RETIRED) {
379 SUBDEBUGT(seastore_cache, "{} {} is retired on t -- {}",
380 t, type, offset, *ret);
381 return get_extent_if_cached_iertr::make_ready_future<
382 CachedExtentRef>(ret);
383 } else if (result == Transaction::get_extent_ret::PRESENT) {
384 SUBTRACET(seastore_cache, "{} {} is present on t -- {}",
385 t, type, offset, *ret);
386 return ret->wait_io().then([ret] {
387 return get_extent_if_cached_iertr::make_ready_future<
388 CachedExtentRef>(ret);
389 });
390 }
391
392 // get_extent_ret::ABSENT from transaction
393 auto metric_key = std::make_pair(t.get_src(), type);
394 ret = query_cache(offset, &metric_key);
395 if (!ret ||
396 // retired_placeholder is not really cached yet
397 ret->get_type() == extent_types_t::RETIRED_PLACEHOLDER) {
398 SUBDEBUGT(seastore_cache, "{} {} is absent{}",
399 t, type, offset, !!ret ? "(placeholder)" : "");
400 return get_extent_if_cached_iertr::make_ready_future<
401 CachedExtentRef>();
402 }
403
404 // present in cache and is not a retired_placeholder
405 SUBDEBUGT(seastore_cache, "{} {} is present in cache -- {}",
406 t, type, offset, *ret);
407 t.add_to_read_set(ret);
408 touch_extent(*ret);
409 return ret->wait_io().then([ret] {
410 return get_extent_if_cached_iertr::make_ready_future<
411 CachedExtentRef>(ret);
412 });
413 }
414
415 /**
416 * get_extent
417 *
418 * returns ref to extent at offset~length of type T either from
419 * - t if modified by t
420 * - extent_set if already in cache
421 * - disk
422 *
423 * t *must not* have retired offset
424 */
425 using get_extent_iertr = base_iertr;
426 template <typename T, typename Func>
427 get_extent_iertr::future<TCachedExtentRef<T>> get_extent(
428 Transaction &t,
429 paddr_t offset,
1e59de90 430 extent_len_t length,
20effc67 431 Func &&extent_init_func) {
432 CachedExtentRef ret;
433 LOG_PREFIX(Cache::get_extent);
434 auto result = t.get_extent(offset, &ret);
435 if (result != Transaction::get_extent_ret::ABSENT) {
436 SUBTRACET(seastore_cache, "{} {}~{} is {} on t -- {}",
437 t,
438 T::TYPE,
439 offset,
440 length,
441 result == Transaction::get_extent_ret::PRESENT ? "present" : "retired",
442 *ret);
443 assert(result != Transaction::get_extent_ret::RETIRED);
444 return ret->wait_io().then([ret] {
445 return seastar::make_ready_future<TCachedExtentRef<T>>(
446 ret->cast<T>());
447 });
448 }
449
450 SUBTRACET(seastore_cache, "{} {}~{} is absent on t, query cache ...",
451 t, T::TYPE, offset, length);
452 auto f = [&t, this](CachedExtent &ext) {
453 t.add_to_read_set(CachedExtentRef(&ext));
454 touch_extent(ext);
455 };
456 auto metric_key = std::make_pair(t.get_src(), T::TYPE);
457 return trans_intr::make_interruptible(
458 get_extent<T>(
459 offset, length, &metric_key,
460 std::forward<Func>(extent_init_func), std::move(f))
461 );
462 }
463
464 /*
465 * get_absent_extent
466 *
467 * Mostly the same as Cache::get_extent(), with the only difference
468 * that get_absent_extent won't search the transaction's context for
469 * the specific CachedExtent
470 */
471 template <typename T, typename Func>
472 get_extent_iertr::future<TCachedExtentRef<T>> get_absent_extent(
473 Transaction &t,
474 paddr_t offset,
475 extent_len_t length,
476 Func &&extent_init_func) {
477 CachedExtentRef ret;
478 LOG_PREFIX(Cache::get_extent);
479
480#ifndef NDEBUG
481 auto r = t.get_extent(offset, &ret);
482 if (r != Transaction::get_extent_ret::ABSENT) {
483 SUBERRORT(seastore_cache, "unexpected non-absent extent {}", t, *ret);
484 ceph_abort();
485 }
486#endif
487
488 SUBTRACET(seastore_cache, "{} {}~{} is absent on t, query cache ...",
489 t, T::TYPE, offset, length);
490 auto f = [&t, this](CachedExtent &ext) {
491 t.add_to_read_set(CachedExtentRef(&ext));
492 touch_extent(ext);
493 };
494 auto metric_key = std::make_pair(t.get_src(), T::TYPE);
495 return trans_intr::make_interruptible(
496 get_extent<T>(
497 offset, length, &metric_key,
498 std::forward<Func>(extent_init_func), std::move(f))
499 );
500 }
501
502 template <typename T>
503 get_extent_iertr::future<TCachedExtentRef<T>> get_extent(
504 Transaction &t,
505 paddr_t offset,
506 extent_len_t length) {
507 return get_extent<T>(t, offset, length, [](T &){});
508 }
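  // Illustrative read path within a transaction (sketch only; T stands for any
  // concrete extent type and do_something_with() is hypothetical):
  //
  //   return cache.get_extent<T>(t, paddr, length
  //   ).si_then([](TCachedExtentRef<T> extent) {
  //     // extent is now part of t's read_set; if it gets invalidated before
  //     // submission, the whole transaction must be retried.
  //     return do_something_with(std::move(extent));
  //   });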
509
510 /*
511 * get_absent_extent
512 *
513 * Mostly the same as Cache::get_extent(), with the only difference
514 * that get_absent_extent won't search the transaction's context for
515 * the specific CachedExtent
516 */
517 template <typename T>
518 get_extent_iertr::future<TCachedExtentRef<T>> get_absent_extent(
519 Transaction &t,
520 paddr_t offset,
521 extent_len_t length) {
522 return get_absent_extent<T>(t, offset, length, [](T &){});
523 }
524
525 seastar::future<CachedExtentRef> get_extent_viewable_by_trans(
526 Transaction &t,
527 CachedExtentRef extent)
528 {
529 auto p_extent = extent->get_transactional_view(t);
530 if (!p_extent->is_pending_in_trans(t.get_trans_id())) {
531 t.add_to_read_set(p_extent);
532 if (!p_extent->is_mutation_pending()) {
533 touch_extent(*p_extent);
534 }
535 }
536 return p_extent->wait_io(
537 ).then([p_extent] {
538 return CachedExtentRef(p_extent);
539 });
540 }
541
542 template <typename T>
543 seastar::future<TCachedExtentRef<T>> get_extent_viewable_by_trans(
544 Transaction &t,
545 TCachedExtentRef<T> extent)
546 {
547 return get_extent_viewable_by_trans(t, CachedExtentRef(extent.get())
548 ).then([](auto p_extent) {
549 return p_extent->template cast<T>();
550 });
551 }
552
553 extent_len_t get_block_size() const {
554 return epm.get_block_size();
555 }
556
557private:
558 // This is a workaround for std::move_only_function not being available,
559 // not really worth generalizing at this time.
560 class extent_init_func_t {
561 struct callable_i {
562 virtual void operator()(CachedExtent &extent) = 0;
563 virtual ~callable_i() = default;
564 };
565 template <typename Func>
566 struct callable_wrapper final : callable_i {
567 Func func;
568 callable_wrapper(Func &&func) : func(std::forward<Func>(func)) {}
569 void operator()(CachedExtent &extent) final {
570 return func(extent);
571 }
572 ~callable_wrapper() final = default;
573 };
574 public:
575 std::unique_ptr<callable_i> wrapped;
576 template <typename Func>
577 extent_init_func_t(Func &&func) : wrapped(
578 std::make_unique<callable_wrapper<Func>>(std::forward<Func>(func)))
579 {}
580 void operator()(CachedExtent &extent) {
581 return (*wrapped)(extent);
582 }
583 };
584 get_extent_ertr::future<CachedExtentRef> _get_extent_by_type(
585 extent_types_t type,
586 paddr_t offset,
587 laddr_t laddr,
588 extent_len_t length,
589 const Transaction::src_t* p_src,
590 extent_init_func_t &&extent_init_func,
591 extent_init_func_t &&on_cache
592 );
593
594 using get_extent_by_type_iertr = get_extent_iertr;
595 using get_extent_by_type_ret = get_extent_by_type_iertr::future<
596 CachedExtentRef>;
597 get_extent_by_type_ret _get_extent_by_type(
598 Transaction &t,
599 extent_types_t type,
600 paddr_t offset,
601 laddr_t laddr,
602 extent_len_t length,
603 extent_init_func_t &&extent_init_func
604 ) {
605 LOG_PREFIX(Cache::get_extent_by_type);
606 CachedExtentRef ret;
607 auto status = t.get_extent(offset, &ret);
608 if (status == Transaction::get_extent_ret::RETIRED) {
609 SUBDEBUGT(seastore_cache, "{} {}~{} {} is retired on t -- {}",
610 t, type, offset, length, laddr, *ret);
611 return seastar::make_ready_future<CachedExtentRef>();
612 } else if (status == Transaction::get_extent_ret::PRESENT) {
613 SUBTRACET(seastore_cache, "{} {}~{} {} is present on t -- {}",
614 t, type, offset, length, laddr, *ret);
615 return ret->wait_io().then([ret] {
616 return seastar::make_ready_future<CachedExtentRef>(ret);
617 });
618 } else {
619 SUBTRACET(seastore_cache, "{} {}~{} {} is absent on t, query cache ...",
620 t, type, offset, length, laddr);
621 auto f = [&t, this](CachedExtent &ext) {
622 t.add_to_read_set(CachedExtentRef(&ext));
623 touch_extent(ext);
624 };
625 auto src = t.get_src();
626 return trans_intr::make_interruptible(
627 _get_extent_by_type(
628 type, offset, laddr, length, &src,
629 std::move(extent_init_func), std::move(f))
630 );
631 }
632 }
633
634 get_extent_by_type_ret _get_absent_extent_by_type(
635 Transaction &t,
636 extent_types_t type,
637 paddr_t offset,
638 laddr_t laddr,
639 extent_len_t length,
640 extent_init_func_t &&extent_init_func
641 ) {
642 LOG_PREFIX(Cache::_get_absent_extent_by_type);
643
644#ifndef NDEBUG
645 CachedExtentRef ret;
646 auto r = t.get_extent(offset, &ret);
647 if (r != Transaction::get_extent_ret::ABSENT) {
648 SUBERRORT(seastore_cache, "unexpected non-absent extent {}", t, *ret);
649 ceph_abort();
650 }
651#endif
652
653 SUBTRACET(seastore_cache, "{} {}~{} {} is absent on t, query cache ...",
654 t, type, offset, length, laddr);
655 auto f = [&t, this](CachedExtent &ext) {
656 t.add_to_read_set(CachedExtentRef(&ext));
657 touch_extent(ext);
658 };
659 auto src = t.get_src();
660 return trans_intr::make_interruptible(
661 _get_extent_by_type(
662 type, offset, laddr, length, &src,
663 std::move(extent_init_func), std::move(f))
664 );
665 }
666
667 backref_entryrefs_by_seq_t backref_entryrefs_by_seq;
668 backref_entry_mset_t backref_entry_mset;
669
670 using backref_entry_query_mset_t = std::multiset<
671 backref_entry_t, backref_entry_t::cmp_t>;
672 backref_entry_query_mset_t get_backref_entries_in_range(
673 paddr_t start,
674 paddr_t end) {
675 auto start_iter = backref_entry_mset.lower_bound(
676 start,
677 backref_entry_t::cmp_t());
678 auto end_iter = backref_entry_mset.lower_bound(
679 end,
680 backref_entry_t::cmp_t());
681 backref_entry_query_mset_t res;
682 for (auto it = start_iter;
683 it != end_iter;
684 it++) {
685 res.emplace(it->paddr, it->laddr, it->len, it->type, it->seq);
686 }
687 return res;
688 }
689
690 const backref_entry_mset_t& get_backref_entry_mset() {
691 return backref_entry_mset;
692 }
693
694 backref_entryrefs_by_seq_t& get_backref_entryrefs_by_seq() {
695 return backref_entryrefs_by_seq;
696 }
697
698 const segment_info_t* get_segment_info(segment_id_t sid) {
699 auto provider = segment_providers_by_device_id[sid.device_id()];
700 if (provider) {
701 return &provider->get_seg_info(sid);
702 } else {
703 return nullptr;
704 }
705 }
706
707public:
708 /**
709 * get_extent_by_type
710 *
711 * Based on type, instantiate the correct concrete type
712 * and read in the extent at location offset~length.
713 */
714 template <typename Func>
715 get_extent_by_type_ret get_extent_by_type(
716 Transaction &t, ///< [in] transaction
717 extent_types_t type, ///< [in] type tag
718 paddr_t offset, ///< [in] starting addr
719 laddr_t laddr, ///< [in] logical address if logical
720 extent_len_t length, ///< [in] length
721 Func &&extent_init_func ///< [in] extent init func
722 ) {
723 return _get_extent_by_type(
724 t,
725 type,
726 offset,
727 laddr,
728 length,
729 extent_init_func_t(std::forward<Func>(extent_init_func)));
730 }
731
732 /*
733 * get_absent_extent_by_type
734 *
735 * Mostly the same as Cache::get_extent_by_type(), with the only difference
736 * that get_absent_extent_by_type won't search the transaction's context for
737 * the specific CachedExtent
738 */
739 template <typename Func>
740 get_extent_by_type_ret get_absent_extent_by_type(
741 Transaction &t, ///< [in] transaction
742 extent_types_t type, ///< [in] type tag
743 paddr_t offset, ///< [in] starting addr
744 laddr_t laddr, ///< [in] logical address if logical
745 extent_len_t length, ///< [in] length
746 Func &&extent_init_func ///< [in] extent init func
747 ) {
748 return _get_absent_extent_by_type(
749 t,
750 type,
751 offset,
752 laddr,
753 length,
754 extent_init_func_t(std::forward<Func>(extent_init_func)));
755 }
756
757 get_extent_by_type_ret get_extent_by_type(
758 Transaction &t,
759 extent_types_t type,
760 paddr_t offset,
761 laddr_t laddr,
762 extent_len_t length
763 ) {
764 return get_extent_by_type(
765 t, type, offset, laddr, length, [](CachedExtent &) {});
766 }
767
769 /*
770 * get_absent_extent_by_type
771 *
772 * Mostly the same as Cache::get_extent_by_type(), with the only difference
773 * that get_absent_extent_by_type won't search the transaction's context for
774 * the specific CachedExtent
775 */
776 get_extent_by_type_ret get_absent_extent_by_type(
777 Transaction &t,
778 extent_types_t type,
779 paddr_t offset,
780 laddr_t laddr,
781 extent_len_t length
782 ) {
783 return get_absent_extent_by_type(
784 t, type, offset, laddr, length, [](CachedExtent &) {});
785 }
786
787 void trim_backref_bufs(const journal_seq_t &trim_to) {
788 LOG_PREFIX(Cache::trim_backref_bufs);
789 SUBDEBUG(seastore_cache, "trimming to {}", trim_to);
790 if (!backref_entryrefs_by_seq.empty()) {
791 SUBDEBUG(seastore_cache, "backref_entryrefs_by_seq {} ~ {}, size={}",
792 backref_entryrefs_by_seq.rbegin()->first,
793 backref_entryrefs_by_seq.begin()->first,
794 backref_entryrefs_by_seq.size());
795 assert(backref_entryrefs_by_seq.rbegin()->first >= trim_to);
796 auto iter = backref_entryrefs_by_seq.upper_bound(trim_to);
797 backref_entryrefs_by_seq.erase(backref_entryrefs_by_seq.begin(), iter);
798 }
799 if (backref_entryrefs_by_seq.empty()) {
800 SUBDEBUG(seastore_cache, "backref_entryrefs_by_seq all trimmed");
801 }
802 }
803
804 /**
805 * alloc_new_extent
806 *
807 * Allocates a fresh extent. If delayed is true, addr will be alloc'd later.
808 * Note that epaddr can only be fed by the btree lba unittest for now
809 */
810 template <typename T>
811 TCachedExtentRef<T> alloc_new_extent(
812 Transaction &t, ///< [in, out] current transaction
813 extent_len_t length, ///< [in] length
814 placement_hint_t hint, ///< [in] user hint
815#ifdef UNIT_TESTS_BUILT
816 rewrite_gen_t gen, ///< [in] rewrite generation
817 std::optional<paddr_t> epaddr = std::nullopt ///< [in] paddr fed by callers
818#else
819 rewrite_gen_t gen
820#endif
821 ) {
822 LOG_PREFIX(Cache::alloc_new_extent);
823 SUBTRACET(seastore_cache, "allocate {} {}B, hint={}, gen={}",
824 t, T::TYPE, length, hint, rewrite_gen_printer_t{gen});
825#ifdef UNIT_TESTS_BUILT
826 auto result = epm.alloc_new_extent(t, T::TYPE, length, hint, gen, epaddr);
827#else
828 auto result = epm.alloc_new_extent(t, T::TYPE, length, hint, gen);
829#endif
830 auto ret = CachedExtent::make_cached_extent_ref<T>(std::move(result.bp));
831 ret->init(CachedExtent::extent_state_t::INITIAL_WRITE_PENDING,
832 result.paddr,
833 hint,
834 result.gen,
835 t.get_trans_id());
836 t.add_fresh_extent(ret);
837 SUBDEBUGT(seastore_cache,
838 "allocated {} {}B extent at {}, hint={}, gen={} -- {}",
839 t, T::TYPE, length, result.paddr,
840 hint, rewrite_gen_printer_t{result.gen}, *ret);
841 return ret;
842 }
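  // Illustrative allocation within a transaction (sketch only; ObjectDataBlock,
  // placement_hint_t::HOT and INIT_GENERATION are stand-ins for whatever concrete
  // extent type, hint and rewrite generation the caller needs):
  //
  //   auto extent = cache.alloc_new_extent<ObjectDataBlock>(
  //     t, 4096, placement_hint_t::HOT, INIT_GENERATION);
  //   // The extent starts INITIAL_WRITE_PENDING and is already in t's fresh set;
  //   // its paddr may stay relative/temporary until complete_commit().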
843
844 /**
845 * alloc_new_extent_by_type
846 *
847 * Allocates a fresh extent. addr will be relative until commit.
848 */
849 CachedExtentRef alloc_new_extent_by_type(
850 Transaction &t, ///< [in, out] current transaction
851 extent_types_t type, ///< [in] type tag
852 extent_len_t length, ///< [in] length
853 placement_hint_t hint, ///< [in] user hint
854 rewrite_gen_t gen ///< [in] rewrite generation
855 );
856
857 /**
858 * Allocates mutable buffer from extent_set on offset~len
859 *
860 * TODO: Note, currently all implementations literally copy the
861 * buffer. This needn't be true, CachedExtent implementations could
862 * choose to refer to the same buffer unmodified until commit and just
863 * buffer the mutations in an ancillary data structure.
864 *
865 * @param current transaction
866 * @param extent to duplicate
867 * @return mutable extent
868 */
869 CachedExtentRef duplicate_for_write(
870 Transaction &t, ///< [in, out] current transaction
871 CachedExtentRef i ///< [in] ref to existing extent
872 );
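  // Illustrative mutation path (sketch only; `node` is an extent previously
  // obtained through get_extent() on the same transaction):
  //
  //   auto mutable_ref = cache.duplicate_for_write(t, node);
  //   // mutable_ref shares the logical content but is mutation-pending; changes
  //   // to its buffer are emitted as a delta by prepare_record() at submission.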
873
874 /**
875 * set_segment_providers
876 *
877 * Set to provide segment information to help identify out-dated delta.
878 *
879 * FIXME: This is specific to the segmented implementation
880 */
881 void set_segment_providers(std::vector<SegmentProvider*> &&providers) {
882 segment_providers_by_device_id = std::move(providers);
883 }
884
885 /**
886 * prepare_record
887 *
888 * Construct the record for Journal from transaction.
889 */
890 record_t prepare_record(
891 Transaction &t, ///< [in, out] current transaction
892 const journal_seq_t &journal_head,
893 const journal_seq_t &journal_dirty_tail
894 );
895
896 /**
897 * complete_commit
898 *
899 * Must be called upon completion of write. Releases blocks on mutating
900 * extents, fills in addresses, and calls relevant callbacks on fresh
901 * and mutated extents.
902 */
903 void complete_commit(
904 Transaction &t, ///< [in, out] current transaction
905 paddr_t final_block_start, ///< [in] offset of initial block
906 journal_seq_t seq ///< [in] journal commit seq
907 );
908
909 /**
910 * init
911 */
912 void init();
913
914 /**
915 * mkfs
916 *
917 * Alloc initial root node and add to t. The intention is for other
918 * components to use t to adjust the resulting root ref prior to commit.
919 */
920 using mkfs_iertr = base_iertr;
921 mkfs_iertr::future<> mkfs(Transaction &t);
922
923 /**
924 * close
925 *
926 * TODO: should flush dirty blocks
927 */
928 using close_ertr = crimson::errorator<
929 crimson::ct_error::input_output_error>;
930 close_ertr::future<> close();
931
932 /**
933 * replay_delta
934 *
935 * Intended for use in Journal::delta. For each delta, should decode delta,
936 * read relevant block from disk or cache (using correct type), and call
937 * CachedExtent::apply_delta marking the extent dirty.
938 *
939 * Returns whether the delta is applied.
940 */
941 using replay_delta_ertr = crimson::errorator<
942 crimson::ct_error::input_output_error>;
943 using replay_delta_ret = replay_delta_ertr::future<bool>;
944 replay_delta_ret replay_delta(
945 journal_seq_t seq,
946 paddr_t record_block_base,
947 const delta_info_t &delta,
948 const journal_seq_t &dirty_tail,
949 const journal_seq_t &alloc_tail,
950 sea_time_point modify_time);
951
952 /**
953 * init_cached_extents
954 *
955 * Calls passed lambda for each dirty cached block. Intended for use
956 * after replay to allow lba_manager (or w/e) to read in any ancestor
957 * blocks.
958 */
959 using init_cached_extents_iertr = base_iertr;
960 using init_cached_extents_ret = init_cached_extents_iertr::future<>;
961 template <typename F>
962 init_cached_extents_ret init_cached_extents(
963 Transaction &t,
964 F &&f)
965 {
966 LOG_PREFIX(Cache::init_cached_extents);
967 SUBINFOT(seastore_cache,
968 "start with {}({}B) extents, {} dirty, dirty_from={}, alloc_from={}",
969 t,
970 extents.size(),
971 extents.get_bytes(),
972 dirty.size(),
973 get_oldest_dirty_from().value_or(JOURNAL_SEQ_NULL),
974 get_oldest_backref_dirty_from().value_or(JOURNAL_SEQ_NULL));
975
976 // journal replay should have finished at this point,
977 // Cache::root should have been inserted to the dirty list
978 assert(root->is_dirty());
979 std::vector<CachedExtentRef> _dirty;
980 for (auto &e : extents) {
981 _dirty.push_back(CachedExtentRef(&e));
982 }
983 return seastar::do_with(
984 std::forward<F>(f),
985 std::move(_dirty),
986 [this, FNAME, &t](auto &f, auto &refs) mutable
987 {
988 return trans_intr::do_for_each(
989 refs,
990 [this, FNAME, &t, &f](auto &e)
991 {
992 SUBTRACET(seastore_cache, "inspecting extent ... -- {}", t, *e);
993 return f(t, e
994 ).si_then([this, FNAME, &t, e](bool is_alive) {
995 if (!is_alive) {
996 SUBDEBUGT(seastore_cache, "extent is not alive, remove extent -- {}", t, *e);
997 remove_extent(e);
998 e->set_invalid(t);
999 } else {
1000 SUBDEBUGT(seastore_cache, "extent is alive -- {}", t, *e);
1001 }
1002 });
1003 });
1004 }).handle_error_interruptible(
1005 init_cached_extents_iertr::pass_further{},
1006 crimson::ct_error::assert_all{
1007 "Invalid error in Cache::init_cached_extents"
1008 }
1009 ).si_then([this, FNAME, &t] {
1010 SUBINFOT(seastore_cache,
1011 "finish with {}({}B) extents, {} dirty, dirty_from={}, alloc_from={}",
1012 t,
1013 extents.size(),
1014 extents.get_bytes(),
1015 dirty.size(),
1016 get_oldest_dirty_from().value_or(JOURNAL_SEQ_NULL),
1017 get_oldest_backref_dirty_from().value_or(JOURNAL_SEQ_NULL));
1018 });
1019 }
1020
1021 /**
1022 * update_extent_from_transaction
1023 *
1024 * Updates passed extent based on t. If extent has been retired,
1025 * a null result will be returned.
1026 */
1027 CachedExtentRef update_extent_from_transaction(
1028 Transaction &t,
1029 CachedExtentRef extent) {
1030 if (extent->get_type() == extent_types_t::ROOT) {
1031 if (t.root) {
1032 return t.root;
1033 } else {
1034 t.add_to_read_set(extent);
1035 t.root = extent->cast<RootBlock>();
1036 return extent;
1037 }
1038 } else {
1039 auto result = t.get_extent(extent->get_paddr(), &extent);
1040 if (result == Transaction::get_extent_ret::RETIRED) {
1041 return CachedExtentRef();
1042 } else {
1043 if (result == Transaction::get_extent_ret::ABSENT) {
1044 t.add_to_read_set(extent);
1045 }
1046 return extent;
1047 }
1048 }
1049 }
1050
1051 /**
1052 * print
1053 *
1054 * Dump summary of contents (TODO)
1055 */
1056 std::ostream &print(
1057 std::ostream &out) const {
1058 return out;
1059 }
1060
1061 /**
1062 * get_next_dirty_extents
1063 *
1064 * Returns extents with get_dirty_from() < seq and adds to read set of
1065 * t.
1066 */
1067 using get_next_dirty_extents_iertr = base_iertr;
1068 using get_next_dirty_extents_ret = get_next_dirty_extents_iertr::future<
1069 std::vector<CachedExtentRef>>;
1070 get_next_dirty_extents_ret get_next_dirty_extents(
1071 Transaction &t,
1072 journal_seq_t seq,
1073 size_t max_bytes);
1074
1075 /// returns std::nullopt if no pending alloc-infos
1076 std::optional<journal_seq_t> get_oldest_backref_dirty_from() const {
1077 LOG_PREFIX(Cache::get_oldest_backref_dirty_from);
1078 if (backref_entryrefs_by_seq.empty()) {
1079 SUBDEBUG(seastore_cache, "backref_oldest: null");
1080 return std::nullopt;
1081 }
1082 auto oldest = backref_entryrefs_by_seq.begin()->first;
1083 SUBDEBUG(seastore_cache, "backref_oldest: {}", oldest);
1084 ceph_assert(oldest != JOURNAL_SEQ_NULL);
1085 return oldest;
1086 }
1087
1088 /// returns std::nullopt if no dirty extents
1089 /// returns JOURNAL_SEQ_NULL if the oldest dirty extent is still pending
1090 std::optional<journal_seq_t> get_oldest_dirty_from() const {
1091 LOG_PREFIX(Cache::get_oldest_dirty_from);
1092 if (dirty.empty()) {
1093 SUBDEBUG(seastore_cache, "dirty_oldest: null");
1094 return std::nullopt;
1095 } else {
1096 auto oldest = dirty.begin()->get_dirty_from();
1097 if (oldest == JOURNAL_SEQ_NULL) {
1098 SUBDEBUG(seastore_cache, "dirty_oldest: pending");
1099 } else {
1100 SUBDEBUG(seastore_cache, "dirty_oldest: {}", oldest);
1101 }
1102 return oldest;
1103 }
1104 }
1105
1106 /// Dump live extents
1107 void dump_contents();
1109 /**
1110 * backref_extent_entry_t
1111 *
1112 * All the backref extent entries have to be indexed by paddr in memory,
1113 * so they can be retrieved by range during cleaning.
1114 *
1115 * See BtreeBackrefManager::retrieve_backref_extents_in_range()
1116 */
1117 struct backref_extent_entry_t {
1118 backref_extent_entry_t(
1119 paddr_t paddr,
1120 paddr_t key,
1121 extent_types_t type)
1122 : paddr(paddr), key(key), type(type) {}
1123 paddr_t paddr = P_ADDR_NULL;
1124 paddr_t key = P_ADDR_NULL;
1125 extent_types_t type = extent_types_t::ROOT;
1126 struct cmp_t {
1127 using is_transparent = paddr_t;
1128 bool operator()(
1129 const backref_extent_entry_t &l,
1130 const backref_extent_entry_t &r) const {
1131 return l.paddr < r.paddr;
1132 }
1133 bool operator()(
1134 const paddr_t &l,
1135 const backref_extent_entry_t &r) const {
1136 return l < r.paddr;
1137 }
1138 bool operator()(
1139 const backref_extent_entry_t &l,
1140 const paddr_t &r) const {
1141 return l.paddr < r;
1142 }
1143 };
1144 };
1145
1146 void update_tree_extents_num(extent_types_t type, int64_t delta) {
1147 switch (type) {
1148 case extent_types_t::LADDR_INTERNAL:
1149 [[fallthrough]];
1150 case extent_types_t::DINK_LADDR_LEAF:
1151 [[fallthrough]];
1152 case extent_types_t::LADDR_LEAF:
1153 stats.lba_tree_extents_num += delta;
1154 ceph_assert(stats.lba_tree_extents_num >= 0);
1155 return;
1156 case extent_types_t::OMAP_INNER:
1157 [[fallthrough]];
1158 case extent_types_t::OMAP_LEAF:
1159 stats.omap_tree_extents_num += delta;
1160 ceph_assert(stats.omap_tree_extents_num >= 0);
1161 return;
1162 case extent_types_t::ONODE_BLOCK_STAGED:
1163 stats.onode_tree_extents_num += delta;
1164 ceph_assert(stats.onode_tree_extents_num >= 0);
1165 return;
1166 case extent_types_t::BACKREF_INTERNAL:
1167 [[fallthrough]];
1168 case extent_types_t::BACKREF_LEAF:
1169 stats.backref_tree_extents_num += delta;
1170 ceph_assert(stats.backref_tree_extents_num >= 0);
1171 return;
1172 default:
1173 return;
1174 }
1175 }
1176
1177 uint64_t get_omap_tree_depth() {
1178 return stats.omap_tree_depth;
1179 }
1180
1181 /// Update lru for access to ref
1182 void touch_extent(
1183 CachedExtent &ext,
1184 const Transaction::src_t* p_src=nullptr)
1185 {
1186 if (p_src && is_background_transaction(*p_src))
1187 return;
1188 if (ext.is_clean() && !ext.is_placeholder()) {
1189 lru.move_to_top(ext);
1190 }
1191 }
1192
1193private:
1194 ExtentPlacementManager& epm;
1195 RootBlockRef root; ///< ref to current root
1196 ExtentIndex extents; ///< set of live extents
1197
1198 journal_seq_t last_commit = JOURNAL_SEQ_MIN;
1199
1200 // FIXME: This is specific to the segmented implementation
1201 std::vector<SegmentProvider*> segment_providers_by_device_id;
1202
1203 transaction_id_t next_id = 0;
1204
1205 /**
1206 * dirty
1207 *
1208 * holds refs to dirty extents. Ordered by CachedExtent::get_dirty_from().
1209 */
1210 CachedExtent::list dirty;
1211
1212 using backref_extent_entry_query_set_t =
1213 std::set<
1214 backref_extent_entry_t,
1215 backref_extent_entry_t::cmp_t>;
1216 backref_extent_entry_query_set_t backref_extents;
1217
1218 void add_backref_extent(
1219 paddr_t paddr,
1220 paddr_t key,
1221 extent_types_t type) {
1222 assert(!paddr.is_relative());
1223 auto [iter, inserted] = backref_extents.emplace(paddr, key, type);
1224 boost::ignore_unused(inserted);
1225 assert(inserted);
1226 }
1227
1228 void remove_backref_extent(paddr_t paddr) {
1229 auto iter = backref_extents.find(paddr);
1230 if (iter != backref_extents.end())
1231 backref_extents.erase(iter);
1232 }
1233
1234 backref_extent_entry_query_set_t get_backref_extents_in_range(
1235 paddr_t start,
1236 paddr_t end) {
1237 auto start_iter = backref_extents.lower_bound(start);
1238 auto end_iter = backref_extents.upper_bound(end);
1239 backref_extent_entry_query_set_t res;
1240 res.insert(start_iter, end_iter);
1241 return res;
1242 }
1243
1244 friend class crimson::os::seastore::backref::BtreeBackrefManager;
1245 friend class crimson::os::seastore::BackrefManager;
1246 /**
1247 * lru
1248 *
1249 * holds references to recently used extents
1250 */
1251 class LRU {
1252 // max size (bytes)
1253 const size_t capacity = 0;
1254
1255 // current size (bytes)
1256 size_t contents = 0;
1257
1258 CachedExtent::list lru;
1259
1260 void trim_to_capacity() {
1261 while (contents > capacity) {
1262 assert(lru.size() > 0);
1263 remove_from_lru(lru.front());
1264 }
1265 }
1266
1267 void add_to_lru(CachedExtent &extent) {
1268 assert(extent.is_clean() && !extent.is_placeholder());
1269
1270 if (!extent.primary_ref_list_hook.is_linked()) {
1271 contents += extent.get_length();
1272 intrusive_ptr_add_ref(&extent);
1273 lru.push_back(extent);
1274 }
1275 trim_to_capacity();
1276 }
1277
1278 public:
1279 LRU(size_t capacity) : capacity(capacity) {}
1280
1281 size_t get_capacity() const {
1282 return capacity;
1283 }
1284
1285 size_t get_current_contents_bytes() const {
1286 return contents;
1287 }
1288
1289 size_t get_current_contents_extents() const {
1290 return lru.size();
1291 }
1292
1293 void remove_from_lru(CachedExtent &extent) {
1294 assert(extent.is_clean() && !extent.is_placeholder());
1295
1296 if (extent.primary_ref_list_hook.is_linked()) {
1297 lru.erase(lru.s_iterator_to(extent));
1298 assert(contents >= extent.get_length());
1299 contents -= extent.get_length();
1300 intrusive_ptr_release(&extent);
1301 }
1302 }
1303
1304 void move_to_top(CachedExtent &extent) {
1305 assert(extent.is_clean() && !extent.is_placeholder());
1306
1307 if (extent.primary_ref_list_hook.is_linked()) {
1308 lru.erase(lru.s_iterator_to(extent));
1309 intrusive_ptr_release(&extent);
1310 assert(contents >= extent.get_length());
1311 contents -= extent.get_length();
1312 }
1313 add_to_lru(extent);
1314 }
1315
1316 void clear() {
1317 LOG_PREFIX(Cache::LRU::clear);
1318 for (auto iter = lru.begin(); iter != lru.end();) {
1319 SUBDEBUG(seastore_cache, "clearing {}", *iter);
1320 remove_from_lru(*(iter++));
1321 }
1322 }
1323
1324 ~LRU() {
1325 clear();
1326 }
1327 } lru;
1328
1329 struct query_counters_t {
1330 uint64_t access = 0;
1331 uint64_t hit = 0;
1332 };
1333
1334 template <typename CounterT>
1335 using counter_by_extent_t = std::array<CounterT, EXTENT_TYPES_MAX>;
1336
1337 struct invalid_trans_efforts_t {
1338 io_stat_t read;
1339 io_stat_t mutate;
1340 uint64_t mutate_delta_bytes = 0;
1341 io_stat_t retire;
1342 io_stat_t fresh;
1343 io_stat_t fresh_ool_written;
1344 counter_by_extent_t<uint64_t> num_trans_invalidated;
1345 uint64_t num_ool_records = 0;
1346 uint64_t ool_record_bytes = 0;
1347 };
1348
1349 struct commit_trans_efforts_t {
1350 counter_by_extent_t<io_stat_t> read_by_ext;
1351 counter_by_extent_t<io_stat_t> mutate_by_ext;
1352 counter_by_extent_t<uint64_t> delta_bytes_by_ext;
1353 counter_by_extent_t<io_stat_t> retire_by_ext;
1354 counter_by_extent_t<io_stat_t> fresh_invalid_by_ext; // inline but is already invalid (retired)
1355 counter_by_extent_t<io_stat_t> fresh_inline_by_ext;
1356 counter_by_extent_t<io_stat_t> fresh_ool_by_ext;
1357 uint64_t num_trans = 0; // the number of inline records
1358 uint64_t num_ool_records = 0;
1359 uint64_t ool_record_metadata_bytes = 0;
1360 uint64_t ool_record_data_bytes = 0;
1361 uint64_t inline_record_metadata_bytes = 0; // metadata exclude the delta bytes
1362 };
1363
1364 struct success_read_trans_efforts_t {
1365 io_stat_t read;
1366 uint64_t num_trans = 0;
1367 };
1368
1369 struct tree_efforts_t {
1370 uint64_t num_inserts = 0;
1371 uint64_t num_erases = 0;
1372 uint64_t num_updates = 0;
1373
1374 void increment(const Transaction::tree_stats_t& incremental) {
1375 num_inserts += incremental.num_inserts;
1376 num_erases += incremental.num_erases;
1377 num_updates += incremental.num_updates;
1378 }
1379 };
1380
1381 template <typename CounterT>
1382 using counter_by_src_t = std::array<CounterT, TRANSACTION_TYPE_MAX>;
1383
1384 static constexpr std::size_t NUM_SRC_COMB =
1385 TRANSACTION_TYPE_MAX * (TRANSACTION_TYPE_MAX + 1) / 2;
1386
1387 struct {
1388 counter_by_src_t<uint64_t> trans_created_by_src;
1389 counter_by_src_t<commit_trans_efforts_t> committed_efforts_by_src;
1390 counter_by_src_t<invalid_trans_efforts_t> invalidated_efforts_by_src;
1391 counter_by_src_t<query_counters_t> cache_query_by_src;
1392 success_read_trans_efforts_t success_read_efforts;
1393 uint64_t dirty_bytes = 0;
1394
1395 uint64_t onode_tree_depth = 0;
1396 int64_t onode_tree_extents_num = 0;
1397 counter_by_src_t<tree_efforts_t> committed_onode_tree_efforts;
1398 counter_by_src_t<tree_efforts_t> invalidated_onode_tree_efforts;
1399
1400 uint64_t omap_tree_depth = 0;
1401 int64_t omap_tree_extents_num = 0;
1402 counter_by_src_t<tree_efforts_t> committed_omap_tree_efforts;
1403 counter_by_src_t<tree_efforts_t> invalidated_omap_tree_efforts;
1404
1405 uint64_t lba_tree_depth = 0;
1406 int64_t lba_tree_extents_num = 0;
1407 counter_by_src_t<tree_efforts_t> committed_lba_tree_efforts;
1408 counter_by_src_t<tree_efforts_t> invalidated_lba_tree_efforts;
1409
1410 uint64_t backref_tree_depth = 0;
1411 int64_t backref_tree_extents_num = 0;
1412 counter_by_src_t<tree_efforts_t> committed_backref_tree_efforts;
1413 counter_by_src_t<tree_efforts_t> invalidated_backref_tree_efforts;
1414
1415 std::array<uint64_t, NUM_SRC_COMB> trans_conflicts_by_srcs;
1416 counter_by_src_t<uint64_t> trans_conflicts_by_unknown;
1417
1418 version_stat_t committed_dirty_version;
1419 version_stat_t committed_reclaim_version;
1420 } stats;
1421
1422 template <typename CounterT>
1423 CounterT& get_by_src(
1424 counter_by_src_t<CounterT>& counters_by_src,
1425 Transaction::src_t src) {
1426 assert(static_cast<std::size_t>(src) < counters_by_src.size());
1427 return counters_by_src[static_cast<std::size_t>(src)];
1428 }
1429
1430 template <typename CounterT>
1431 CounterT& get_by_ext(
1432 counter_by_extent_t<CounterT>& counters_by_ext,
1433 extent_types_t ext) {
1434 auto index = static_cast<uint8_t>(ext);
1435 assert(index < EXTENT_TYPES_MAX);
1436 return counters_by_ext[index];
1437 }
1438
1439 void account_conflict(Transaction::src_t src1, Transaction::src_t src2) {
1440 assert(src1 < Transaction::src_t::MAX);
1441 assert(src2 < Transaction::src_t::MAX);
1442 if (src1 > src2) {
1443 std::swap(src1, src2);
1444 }
1445 // impossible combinations
1446 // should be consistent with trans_srcs_invalidated in register_metrics()
1447 assert(!(src1 == Transaction::src_t::READ &&
1448 src2 == Transaction::src_t::READ));
1449 assert(!(src1 == Transaction::src_t::TRIM_DIRTY &&
1450 src2 == Transaction::src_t::TRIM_DIRTY));
1451 assert(!(src1 == Transaction::src_t::CLEANER_MAIN &&
1452 src2 == Transaction::src_t::CLEANER_MAIN));
1453 assert(!(src1 == Transaction::src_t::CLEANER_COLD &&
1454 src2 == Transaction::src_t::CLEANER_COLD));
1455 assert(!(src1 == Transaction::src_t::TRIM_ALLOC &&
1456 src2 == Transaction::src_t::TRIM_ALLOC));
1457
1458 auto src1_value = static_cast<std::size_t>(src1);
1459 auto src2_value = static_cast<std::size_t>(src2);
1460 auto num_srcs = static_cast<std::size_t>(Transaction::src_t::MAX);
1461 auto conflict_index = num_srcs * src1_value + src2_value -
1462 src1_value * (src1_value + 1) / 2;
1463 assert(conflict_index < NUM_SRC_COMB);
1464 ++stats.trans_conflicts_by_srcs[conflict_index];
1465 }
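  // Worked example of the triangular index above: if num_srcs were 5, the
  // ordered pair (src1=1, src2=3) maps to 5*1 + 3 - 1*(1+1)/2 = 7, packing all
  // src1 <= src2 combinations row by row into the flat trans_conflicts_by_srcs
  // array of NUM_SRC_COMB entries.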
1466
1467 seastar::metrics::metric_group metrics;
1468 void register_metrics();
1469
1470 /// alloc buffer for cached extent
1471 bufferptr alloc_cache_buf(size_t size) {
1472 // TODO: memory pooling etc
1473 auto bp = ceph::bufferptr(
1474 buffer::create_page_aligned(size));
1475 bp.zero();
1476 return bp;
1477 }
1478
1479 void backref_batch_update(
1480 std::vector<backref_entry_ref> &&,
1481 const journal_seq_t &);
1483 /// Add extent to extents handling dirty and refcounting
1484 void add_extent(CachedExtentRef ref, const Transaction::src_t* t_src);
1485
1486 /// Mark existing extent ref dirty -- mainly for replay
1487 void mark_dirty(CachedExtentRef ref);
1488
1489 /// Add dirty extent to dirty list
1490 void add_to_dirty(CachedExtentRef ref);
1491
1492 /// Remove from dirty list
1493 void remove_from_dirty(CachedExtentRef ref);
1494
1495 /// Remove extent from extents handling dirty and refcounting
1496 void remove_extent(CachedExtentRef ref);
1497
1498 /// Retire extent
1499 void commit_retire_extent(Transaction& t, CachedExtentRef ref);
1500
1501 /// Replace prev with next
1502 void commit_replace_extent(Transaction& t, CachedExtentRef next, CachedExtentRef prev);
1503
1504 /// Invalidate extent and mark affected transactions
1505 void invalidate_extent(Transaction& t, CachedExtent& extent);
1506
1507 /// Mark a valid transaction as conflicted
1508 void mark_transaction_conflicted(
1509 Transaction& t, CachedExtent& conflicting_extent);
1510
1511 /// Introspect transaction when it is being destructed
1512 void on_transaction_destruct(Transaction& t);
1513
1514 template <typename T>
1515 get_extent_ret<T> read_extent(
1516 TCachedExtentRef<T>&& extent
1517 ) {
1518 assert(extent->state == CachedExtent::extent_state_t::CLEAN_PENDING);
1519 extent->set_io_wait();
1520 return epm.read(
1521 extent->get_paddr(),
1522 extent->get_length(),
1523 extent->get_bptr()
1524 ).safe_then(
1525 [extent=std::move(extent)]() mutable {
1526 LOG_PREFIX(Cache::read_extent);
1527 if (likely(extent->state == CachedExtent::extent_state_t::CLEAN_PENDING)) {
1528 extent->state = CachedExtent::extent_state_t::CLEAN;
1529 /* TODO: crc should be checked against LBA manager */
1530 extent->last_committed_crc = extent->get_crc32c();
1532 extent->on_clean_read();
1533 } else {
1534 ceph_assert(!extent->is_valid());
1535 }
1536 extent->complete_io();
1537 SUBDEBUG(seastore_cache, "read extent done -- {}", *extent);
1538 return get_extent_ertr::make_ready_future<TCachedExtentRef<T>>(
1539 std::move(extent));
1540 },
1541 get_extent_ertr::pass_further{},
1542 crimson::ct_error::assert_all{
1543 "Cache::get_extent: invalid error"
1544 }
1545 );
1546 }
1547
1548 // Extents in cache may contain placeholders
1549 CachedExtentRef query_cache(
1550 paddr_t offset,
1551 const src_ext_t* p_metric_key) {
1552 query_counters_t* p_counters = nullptr;
1553 if (p_metric_key) {
1554 p_counters = &get_by_src(stats.cache_query_by_src, p_metric_key->first);
1555 ++p_counters->access;
1556 }
1557 if (auto iter = extents.find_offset(offset);
1558 iter != extents.end()) {
1559 if (p_metric_key &&
1560 // retired_placeholder is not really cached yet
1561 iter->get_type() != extent_types_t::RETIRED_PLACEHOLDER) {
1562 ++p_counters->hit;
1563 }
1564 return CachedExtentRef(&*iter);
1565 } else {
1566 return CachedExtentRef();
1567 }
1568 }
1569
1570 template <
1571 typename node_key_t,
1572 typename node_val_t,
1573 typename internal_node_t,
1574 typename leaf_node_t,
1575 typename pin_t,
1576 size_t node_size,
1577 bool leaf_has_children>
1578 friend class FixedKVBtree;
1579};
1580using CacheRef = std::unique_ptr<Cache>;
1581
1582}