]> git.proxmox.com Git - ceph.git/blame - ceph/src/crimson/os/seastore/transaction.h
update ceph source to reef 18.1.2
[ceph.git] / ceph / src / crimson / os / seastore / transaction.h
CommitLineData
f67539c2
TL
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3
4#pragma once
5
6#include <iostream>
7
20effc67
TL
8#include <boost/intrusive/list.hpp>
9
10#include "crimson/common/log.h"
11#include "crimson/os/seastore/logging.h"
12#include "crimson/os/seastore/ordering_handle.h"
f67539c2
TL
13#include "crimson/os/seastore/seastore_types.h"
14#include "crimson/os/seastore/cached_extent.h"
15#include "crimson/os/seastore/root_block.h"
16
17namespace crimson::os::seastore {
18
20effc67
TL
19class SeaStore;
20class Transaction;
21
1e59de90
TL
/// Pair of counters: how many entries were recorded and their total bytes.
struct io_stat_t {
  uint64_t num = 0;    ///< number of entries accumulated
  uint64_t bytes = 0;  ///< sum of the byte counts accumulated

  /// True while nothing has been accumulated (both counters are zero).
  bool is_clear() const {
    return !(num != 0 || bytes != 0);
  }

  /// Record one entry of _bytes bytes.
  void increment(uint64_t _bytes) {
    num += 1;
    bytes += _bytes;
  }

  /// Add the counters of stat onto this one.
  void increment_stat(const io_stat_t& stat) {
    bytes += stat.bytes;
    num += stat.num;
  }
};
40inline std::ostream& operator<<(std::ostream& out, const io_stat_t& stat) {
41 return out << stat.num << "(" << stat.bytes << "B)";
42}
43
44struct version_stat_t {
45 uint64_t num = 0;
46 uint64_t version = 0;
47
48 bool is_clear() const {
49 return (num == 0 && version == 0);
50 }
51
52 void increment(extent_version_t v) {
53 ++num;
54 version += v;
55 }
56
57 void increment_stat(const version_stat_t& stat) {
58 num += stat.num;
59 version += stat.version;
60 }
61};
62
f67539c2
TL
63/**
64 * Transaction
65 *
66 * Representation of in-progress mutation. Used exclusively through Cache methods.
1e59de90
TL
67 *
68 * Transaction log levels:
69 * seastore_t
70 * - DEBUG: transaction create, conflict, commit events
71 * - TRACE: DEBUG details
72 * - seastore_cache logs
f67539c2
TL
73 */
74class Transaction {
75public:
76 using Ref = std::unique_ptr<Transaction>;
20effc67 77 using on_destruct_func_t = std::function<void(Transaction&)>;
f67539c2
TL
78 enum class get_extent_ret {
79 PRESENT,
80 ABSENT,
81 RETIRED
82 };
83 get_extent_ret get_extent(paddr_t addr, CachedExtentRef *out) {
20effc67 84 LOG_PREFIX(Transaction::get_extent);
1e59de90
TL
85 // it's possible that both write_set and retired_set contain
86 // this addr at the same time when addr is absolute and the
87 // corresponding extent is used to map existing extent on disk.
88 // So search write_set first.
89 if (auto iter = write_set.find_offset(addr);
f67539c2
TL
90 iter != write_set.end()) {
91 if (out)
92 *out = CachedExtentRef(&*iter);
1e59de90
TL
93 SUBTRACET(seastore_cache, "{} is present in write_set -- {}",
94 *this, addr, *iter);
95 assert((*out)->is_valid());
f67539c2 96 return get_extent_ret::PRESENT;
1e59de90
TL
97 } else if (retired_set.count(addr)) {
98 return get_extent_ret::RETIRED;
f67539c2
TL
99 } else if (
100 auto iter = read_set.find(addr);
101 iter != read_set.end()) {
20effc67
TL
102 // placeholder in read-set should be in the retired-set
103 // at the same time.
104 assert(iter->ref->get_type() != extent_types_t::RETIRED_PLACEHOLDER);
f67539c2 105 if (out)
20effc67 106 *out = iter->ref;
1e59de90
TL
107 SUBTRACET(seastore_cache, "{} is present in read_set -- {}",
108 *this, addr, *(iter->ref));
f67539c2
TL
109 return get_extent_ret::PRESENT;
110 } else {
111 return get_extent_ret::ABSENT;
112 }
113 }
114
115 void add_to_retired_set(CachedExtentRef ref) {
116 ceph_assert(!is_weak());
1e59de90
TL
117 if (ref->is_exist_clean() ||
118 ref->is_exist_mutation_pending()) {
119 existing_block_stats.dec(ref);
120 ref->set_invalid(*this);
121 write_set.erase(*ref);
122 } else if (ref->is_initial_pending()) {
123 ref->set_invalid(*this);
20effc67
TL
124 write_set.erase(*ref);
125 } else if (ref->is_mutation_pending()) {
1e59de90 126 ref->set_invalid(*this);
20effc67
TL
127 write_set.erase(*ref);
128 assert(ref->prior_instance);
129 retired_set.insert(ref->prior_instance);
130 assert(read_set.count(ref->prior_instance->get_paddr()));
131 ref->prior_instance.reset();
132 } else {
f67539c2
TL
133 // && retired_set.count(ref->get_paddr()) == 0
134 // If it's already in the set, insert here will be a noop,
135 // which is what we want.
136 retired_set.insert(ref);
f67539c2
TL
137 }
138 }
139
140 void add_to_read_set(CachedExtentRef ref) {
141 if (is_weak()) return;
142
1e59de90
TL
143 assert(ref->is_valid());
144
145 auto it = ref->transactions.lower_bound(
146 this, read_set_item_t<Transaction>::trans_cmp_t());
147 if (it != ref->transactions.end() && it->t == this) return;
148
20effc67
TL
149 auto [iter, inserted] = read_set.emplace(this, ref);
150 ceph_assert(inserted);
1e59de90
TL
151 ref->transactions.insert_before(
152 it, const_cast<read_set_item_t<Transaction>&>(*iter));
f67539c2
TL
153 }
154
20effc67 155 void add_fresh_extent(
1e59de90 156 CachedExtentRef ref) {
f67539c2 157 ceph_assert(!is_weak());
1e59de90
TL
158 if (ref->is_exist_clean()) {
159 existing_block_stats.inc(ref);
160 existing_block_list.push_back(ref);
161 } else if (ref->get_paddr().is_delayed()) {
162 assert(ref->get_paddr() == make_delayed_temp_paddr(0));
20effc67 163 assert(ref->is_logical());
1e59de90 164 ref->set_paddr(make_delayed_temp_paddr(delayed_temp_offset));
20effc67
TL
165 delayed_temp_offset += ref->get_length();
166 delayed_alloc_list.emplace_back(ref->cast<LogicalCachedExtent>());
1e59de90
TL
167 fresh_block_stats.increment(ref->get_length());
168 } else if (ref->get_paddr().is_absolute()) {
169 pre_alloc_list.emplace_back(ref->cast<LogicalCachedExtent>());
170 fresh_block_stats.increment(ref->get_length());
20effc67 171 } else {
1e59de90
TL
172 if (likely(ref->get_paddr() == make_record_relative_paddr(0))) {
173 ref->set_paddr(make_record_relative_paddr(offset));
174 } else {
175 ceph_assert(ref->get_paddr().is_fake());
176 }
20effc67
TL
177 offset += ref->get_length();
178 inline_block_list.push_back(ref);
1e59de90 179 fresh_block_stats.increment(ref->get_length());
20effc67 180 }
20effc67 181 write_set.insert(*ref);
1e59de90
TL
182 if (is_backref_node(ref->get_type()))
183 fresh_backref_extents++;
184 }
185
186 uint64_t get_num_fresh_backref() const {
187 return fresh_backref_extents;
20effc67
TL
188 }
189
190 void mark_delayed_extent_inline(LogicalCachedExtentRef& ref) {
20effc67 191 write_set.erase(*ref);
1e59de90
TL
192 assert(ref->get_paddr().is_delayed());
193 ref->set_paddr(make_record_relative_paddr(offset),
194 /* need_update_mapping: */ true);
f67539c2 195 offset += ref->get_length();
20effc67 196 inline_block_list.push_back(ref);
20effc67
TL
197 write_set.insert(*ref);
198 }
199
1e59de90
TL
200 void mark_delayed_extent_ool(LogicalCachedExtentRef& ref) {
201 written_ool_block_list.push_back(ref);
202 }
203
204 void update_delayed_ool_extent_addr(LogicalCachedExtentRef& ref,
205 paddr_t final_addr) {
20effc67 206 write_set.erase(*ref);
1e59de90
TL
207 assert(ref->get_paddr().is_delayed());
208 ref->set_paddr(final_addr, /* need_update_mapping: */ true);
20effc67
TL
209 assert(!ref->get_paddr().is_null());
210 assert(!ref->is_inline());
f67539c2
TL
211 write_set.insert(*ref);
212 }
213
1e59de90
TL
214 void mark_allocated_extent_ool(LogicalCachedExtentRef& ref) {
215 assert(ref->get_paddr().is_absolute());
216 assert(!ref->is_inline());
217 written_ool_block_list.push_back(ref);
218 }
219
f67539c2
TL
220 void add_mutated_extent(CachedExtentRef ref) {
221 ceph_assert(!is_weak());
1e59de90
TL
222 assert(ref->is_exist_mutation_pending() ||
223 read_set.count(ref->prior_instance->get_paddr()));
f67539c2 224 mutated_block_list.push_back(ref);
1e59de90
TL
225 if (!ref->is_exist_mutation_pending()) {
226 write_set.insert(*ref);
227 } else {
228 assert(write_set.find_offset(ref->get_paddr()) !=
229 write_set.end());
230 }
f67539c2
TL
231 }
232
20effc67
TL
233 void replace_placeholder(CachedExtent& placeholder, CachedExtent& extent) {
234 ceph_assert(!is_weak());
235
236 assert(placeholder.get_type() == extent_types_t::RETIRED_PLACEHOLDER);
237 assert(extent.get_type() != extent_types_t::RETIRED_PLACEHOLDER);
238 assert(extent.get_type() != extent_types_t::ROOT);
239 assert(extent.get_paddr() == placeholder.get_paddr());
240 {
241 auto where = read_set.find(placeholder.get_paddr());
242 assert(where != read_set.end());
243 assert(where->ref.get() == &placeholder);
244 where = read_set.erase(where);
1e59de90
TL
245 auto it = read_set.emplace_hint(where, this, &extent);
246 extent.transactions.insert(const_cast<read_set_item_t<Transaction>&>(*it));
20effc67
TL
247 }
248 {
249 auto where = retired_set.find(&placeholder);
250 assert(where != retired_set.end());
251 assert(where->get() == &placeholder);
252 where = retired_set.erase(where);
253 retired_set.emplace_hint(where, &extent);
254 }
255 }
256
1e59de90
TL
257 auto get_delayed_alloc_list() {
258 std::list<LogicalCachedExtentRef> ret;
259 for (auto& extent : delayed_alloc_list) {
260 // delayed extents may be invalidated
261 if (extent->is_valid()) {
262 ret.push_back(std::move(extent));
263 } else {
264 ++num_delayed_invalid_extents;
265 }
266 }
267 delayed_alloc_list.clear();
268 return ret;
f67539c2
TL
269 }
270
1e59de90
TL
271 auto get_valid_pre_alloc_list() {
272 std::list<LogicalCachedExtentRef> ret;
273 assert(num_allocated_invalid_extents == 0);
274 for (auto& extent : pre_alloc_list) {
275 if (extent->is_valid()) {
276 ret.push_back(extent);
277 } else {
278 ++num_allocated_invalid_extents;
279 }
280 }
281 return ret;
f67539c2
TL
282 }
283
1e59de90
TL
284 const auto &get_inline_block_list() {
285 return inline_block_list;
f67539c2
TL
286 }
287
288 const auto &get_mutated_block_list() {
289 return mutated_block_list;
290 }
291
1e59de90
TL
292 const auto &get_existing_block_list() {
293 return existing_block_list;
294 }
295
f67539c2
TL
296 const auto &get_retired_set() {
297 return retired_set;
298 }
299
1e59de90
TL
300 bool is_retired(paddr_t paddr, extent_len_t len) {
301 if (retired_set.empty()) {
302 return false;
303 }
304 auto iter = retired_set.lower_bound(paddr);
305 if (iter == retired_set.end() ||
306 (*iter)->get_paddr() > paddr) {
307 assert(iter != retired_set.begin());
308 --iter;
309 }
310 auto retired_paddr = (*iter)->get_paddr();
311 auto retired_length = (*iter)->get_length();
312 return retired_paddr <= paddr &&
313 retired_paddr.add_offset(retired_length) >= paddr.add_offset(len);
314 }
315
20effc67
TL
316 template <typename F>
317 auto for_each_fresh_block(F &&f) const {
1e59de90 318 std::for_each(written_ool_block_list.begin(), written_ool_block_list.end(), f);
20effc67
TL
319 std::for_each(inline_block_list.begin(), inline_block_list.end(), f);
320 }
321
20effc67
TL
322 const io_stat_t& get_fresh_block_stats() const {
323 return fresh_block_stats;
324 }
325
1e59de90 326 using src_t = transaction_type_t;
20effc67
TL
327 src_t get_src() const {
328 return src;
329 }
330
f67539c2
TL
331 bool is_weak() const {
332 return weak;
333 }
334
20effc67
TL
335 void test_set_conflict() {
336 conflicted = true;
337 }
338
339 bool is_conflicted() const {
340 return conflicted;
341 }
342
343 auto &get_handle() {
344 return handle;
345 }
346
347 Transaction(
348 OrderingHandle &&handle,
349 bool weak,
350 src_t src,
351 journal_seq_t initiated_after,
1e59de90
TL
352 on_destruct_func_t&& f,
353 transaction_id_t trans_id
20effc67
TL
354 ) : weak(weak),
355 handle(std::move(handle)),
356 on_destruct(std::move(f)),
1e59de90
TL
357 src(src),
358 trans_id(trans_id)
20effc67
TL
359 {}
360
361 void invalidate_clear_write_set() {
362 for (auto &&i: write_set) {
1e59de90 363 i.set_invalid(*this);
20effc67
TL
364 }
365 write_set.clear();
366 }
367
368 ~Transaction() {
369 on_destruct(*this);
370 invalidate_clear_write_set();
371 }
372
373 friend class crimson::os::seastore::SeaStore;
374 friend class TransactionConflictCondition;
375
376 void reset_preserve_handle(journal_seq_t initiated_after) {
377 root.reset();
378 offset = 0;
379 delayed_temp_offset = 0;
380 read_set.clear();
1e59de90 381 fresh_backref_extents = 0;
20effc67
TL
382 invalidate_clear_write_set();
383 mutated_block_list.clear();
384 fresh_block_stats = {};
385 num_delayed_invalid_extents = 0;
1e59de90 386 num_allocated_invalid_extents = 0;
20effc67
TL
387 delayed_alloc_list.clear();
388 inline_block_list.clear();
1e59de90
TL
389 written_ool_block_list.clear();
390 pre_alloc_list.clear();
20effc67 391 retired_set.clear();
1e59de90
TL
392 existing_block_list.clear();
393 existing_block_stats = {};
20effc67 394 onode_tree_stats = {};
1e59de90 395 omap_tree_stats = {};
20effc67 396 lba_tree_stats = {};
1e59de90 397 backref_tree_stats = {};
20effc67 398 ool_write_stats = {};
1e59de90 399 rewrite_version_stats = {};
20effc67
TL
400 conflicted = false;
401 if (!has_reset) {
402 has_reset = true;
403 }
404 }
405
406 bool did_reset() const {
407 return has_reset;
408 }
409
/// Per-tree counters gathered over a transaction.
struct tree_stats_t {
  uint64_t depth = 0;
  uint64_t num_inserts = 0;
  uint64_t num_erases = 0;
  uint64_t num_updates = 0;
  int64_t extents_num_delta = 0;  ///< signed: may be negative

  /// True while every field still has its zero default.
  bool is_clear() const {
    return !(depth != 0 ||
             num_inserts != 0 ||
             num_erases != 0 ||
             num_updates != 0 ||
             extents_num_delta != 0);
  }
};
425 tree_stats_t& get_onode_tree_stats() {
426 return onode_tree_stats;
427 }
1e59de90
TL
428 tree_stats_t& get_omap_tree_stats() {
429 return omap_tree_stats;
430 }
20effc67
TL
431 tree_stats_t& get_lba_tree_stats() {
432 return lba_tree_stats;
433 }
1e59de90
TL
434 tree_stats_t& get_backref_tree_stats() {
435 return backref_tree_stats;
20effc67
TL
436 }
437
438 struct ool_write_stats_t {
439 io_stat_t extents;
1e59de90 440 uint64_t md_bytes = 0;
20effc67
TL
441 uint64_t num_records = 0;
442
1e59de90
TL
443 uint64_t get_data_bytes() const {
444 return extents.bytes;
445 }
446
20effc67
TL
447 bool is_clear() const {
448 return (extents.is_clear() &&
1e59de90 449 md_bytes == 0 &&
20effc67
TL
450 num_records == 0);
451 }
452 };
453 ool_write_stats_t& get_ool_write_stats() {
454 return ool_write_stats;
455 }
1e59de90
TL
456 version_stat_t& get_rewrite_version_stats() {
457 return rewrite_version_stats;
458 }
459
460 struct existing_block_stats_t {
461 uint64_t valid_num = 0;
462 uint64_t clean_num = 0;
463 uint64_t mutated_num = 0;
464 void inc(const CachedExtentRef &ref) {
465 valid_num++;
466 if (ref->is_exist_clean()) {
467 clean_num++;
468 } else {
469 mutated_num++;
470 }
471 }
472 void dec(const CachedExtentRef &ref) {
473 valid_num--;
474 if (ref->is_exist_clean()) {
475 clean_num--;
476 } else {
477 mutated_num--;
478 }
479 }
480 };
481 existing_block_stats_t& get_existing_block_stats() {
482 return existing_block_stats;
483 }
20effc67 484
1e59de90
TL
485 transaction_id_t get_trans_id() const {
486 return trans_id;
20effc67
TL
487 }
488
f67539c2
TL
489private:
490 friend class Cache;
20effc67 491 friend Ref make_test_transaction();
f67539c2
TL
492
/**
 * If set, *this may not be used to perform writes and will not provide
 * consistency, allowing operations using it to avoid maintaining a read_set.
 */
497 const bool weak;
498
499 RootBlockRef root; ///< ref to root if read or written by transaction
500
1e59de90
TL
501 device_off_t offset = 0; ///< relative offset of next block
502 device_off_t delayed_temp_offset = 0;
20effc67
TL
503
504 /**
505 * read_set
506 *
507 * Holds a reference (with a refcount) to every extent read via *this.
508 * Submitting a transaction mutating any contained extent/addr will
509 * invalidate *this.
510 */
511 read_set_t<Transaction> read_set; ///< set of extents read by paddr
512
1e59de90
TL
513 uint64_t fresh_backref_extents = 0; // counter of new backref extents
514
20effc67
TL
515 /**
516 * write_set
517 *
518 * Contains a reference (without a refcount) to every extent mutated
519 * as part of *this. No contained extent may be referenced outside
520 * of *this. Every contained extent will be in one of inline_block_list,
1e59de90
TL
521 * written_ool_block_list or/and pre_alloc_list, mutated_block_list,
522 * or delayed_alloc_list.
20effc67
TL
523 */
524 ExtentIndex write_set;
f67539c2 525
20effc67
TL
526 /**
527 * lists of fresh blocks, holds refcounts, subset of write_set
528 */
529 io_stat_t fresh_block_stats;
530 uint64_t num_delayed_invalid_extents = 0;
1e59de90 531 uint64_t num_allocated_invalid_extents = 0;
20effc67
TL
532 /// blocks that will be committed with journal record inline
533 std::list<CachedExtentRef> inline_block_list;
534 /// blocks that will be committed with out-of-line record
1e59de90 535 std::list<CachedExtentRef> written_ool_block_list;
20effc67
TL
536 /// blocks with delayed allocation, may become inline or ool above
537 std::list<LogicalCachedExtentRef> delayed_alloc_list;
f67539c2 538
1e59de90
TL
539 /// Extents with pre-allocated addresses,
540 /// will be added to written_ool_block_list after write
541 std::list<LogicalCachedExtentRef> pre_alloc_list;
542
20effc67
TL
543 /// list of mutated blocks, holds refcounts, subset of write_set
544 std::list<CachedExtentRef> mutated_block_list;
545
1e59de90
TL
546 /// partial blocks of extents on disk, with data and refcounts
547 std::list<CachedExtentRef> existing_block_list;
548 existing_block_stats_t existing_block_stats;
549
20effc67
TL
/**
 * retired_set
 *
 * Set of extents retired by *this.
 */
555 pextent_set_t retired_set;
f67539c2 556
20effc67
TL
557 /// stats to collect when commit or invalidate
558 tree_stats_t onode_tree_stats;
1e59de90 559 tree_stats_t omap_tree_stats; // exclude omap tree depth
20effc67 560 tree_stats_t lba_tree_stats;
1e59de90 561 tree_stats_t backref_tree_stats;
20effc67 562 ool_write_stats_t ool_write_stats;
1e59de90 563 version_stat_t rewrite_version_stats;
f67539c2 564
20effc67
TL
565 bool conflicted = false;
566
567 bool has_reset = false;
568
569 OrderingHandle handle;
570
571 on_destruct_func_t on_destruct;
572
573 const src_t src;
574
1e59de90 575 transaction_id_t trans_id = TRANS_ID_NULL;
f67539c2
TL
576};
577using TransactionRef = Transaction::Ref;
578
20effc67
TL
579/// Should only be used with dummy staged-fltree node extent manager
580inline TransactionRef make_test_transaction() {
1e59de90 581 static transaction_id_t next_id = 0;
20effc67
TL
582 return std::make_unique<Transaction>(
583 get_dummy_ordering_handle(),
584 false,
585 Transaction::src_t::MUTATE,
1e59de90
TL
586 JOURNAL_SEQ_NULL,
587 [](Transaction&) {},
588 ++next_id
20effc67 589 );
f67539c2
TL
590}
591
20effc67
TL
592struct TransactionConflictCondition {
593 class transaction_conflict final : public std::exception {
594 public:
595 const char* what() const noexcept final {
596 return "transaction conflict detected";
597 }
598 };
599
600public:
601 TransactionConflictCondition(Transaction &t) : t(t) {}
602
603 template <typename Fut>
1e59de90 604 std::optional<Fut> may_interrupt() {
20effc67 605 if (t.conflicted) {
1e59de90
TL
606 return seastar::futurize<Fut>::make_exception_future(
607 transaction_conflict());
20effc67 608 } else {
1e59de90 609 return std::optional<Fut>();
20effc67
TL
610 }
611 }
612
613 template <typename T>
614 static constexpr bool is_interruption_v =
615 std::is_same_v<T, transaction_conflict>;
616
617
618 static bool is_interruption(std::exception_ptr& eptr) {
619 return *eptr.__cxa_exception_type() == typeid(transaction_conflict);
620 }
621
622private:
623 Transaction &t;
624};
625
626using trans_intr = crimson::interruptible::interruptor<
627 TransactionConflictCondition
628 >;
629
630template <typename E>
631using trans_iertr =
632 crimson::interruptible::interruptible_errorator<
633 TransactionConflictCondition,
634 E
635 >;
636
637template <typename F, typename... Args>
638auto with_trans_intr(Transaction &t, F &&f, Args&&... args) {
639 return trans_intr::with_interruption_to_error<crimson::ct_error::eagain>(
640 std::move(f),
641 TransactionConflictCondition(t),
642 t,
643 std::forward<Args>(args)...);
644}
645
646template <typename T>
647using with_trans_ertr = typename T::base_ertr::template extend<crimson::ct_error::eagain>;
648
f67539c2 649}
1e59de90
TL
650
// With fmt 9.0.0 and later, format io_stat_t through fmt::ostream_formatter.
#if FMT_VERSION >= 90000
template <>
struct fmt::formatter<crimson::os::seastore::io_stat_t>
  : fmt::ostream_formatter {};
#endif