1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
12 #include <boost/intrusive_ptr.hpp>
13 #include <boost/iterator/counting_iterator.hpp>
14 #include <boost/smart_ptr/intrusive_ref_counter.hpp>
16 #include <seastar/core/future.hh>
18 #include "include/ceph_assert.h"
19 #include "include/buffer.h"
21 #include "crimson/osd/exceptions.h"
23 #include "crimson/os/seastore/logging.h"
24 #include "crimson/os/seastore/segment_cleaner.h"
25 #include "crimson/os/seastore/seastore_types.h"
26 #include "crimson/os/seastore/cache.h"
27 #include "crimson/os/seastore/segment_manager.h"
28 #include "crimson/os/seastore/lba_manager.h"
29 #include "crimson/os/seastore/journal.h"
30 #include "crimson/os/seastore/extent_placement_manager.h"
32 namespace crimson::os::seastore
{
36 auto repeat_eagain(F
&&f
) {
37 LOG_PREFIX("repeat_eagain");
38 return seastar::do_with(
41 return crimson::repeat(
45 return seastar::stop_iteration::yes
;
47 [FNAME
](const crimson::ct_error::eagain
&e
) {
48 SUBDEBUG(seastore_tm
, "hit eagain, restarting");
49 return seastar::stop_iteration::no
;
51 crimson::ct_error::pass_further_all
{}
60 * Abstraction hiding reading and writing to persistence.
61 * Exposes transaction based interface with read isolation.
63 class TransactionManager
: public SegmentCleaner::ExtentCallbackInterface
{
65 using base_ertr
= Cache::base_ertr
;
66 using base_iertr
= Cache::base_iertr
;
69 SegmentManager
&segment_manager
,
70 SegmentCleanerRef segment_cleaner
,
73 LBAManagerRef lba_manager
,
74 ExtentPlacementManagerRef
&& epm
,
75 ExtentReader
& scanner
);
77 /// Writes initial metadata to disk
78 using mkfs_ertr
= base_ertr
;
79 mkfs_ertr::future
<> mkfs();
81 /// Reads initial metadata from disk
82 using mount_ertr
= base_ertr
;
83 mount_ertr::future
<> mount();
85 /// Closes transaction_manager
86 using close_ertr
= base_ertr
;
87 close_ertr::future
<> close();
89 /// Creates empty transaction
90 TransactionRef
create_transaction(
91 Transaction::src_t src
,
92 const char* name
) final
{
93 return cache
->create_transaction(src
, name
, false);
96 /// Creates empty weak transaction
97 TransactionRef
create_weak_transaction(
98 Transaction::src_t src
,
100 return cache
->create_transaction(src
, name
, true);
103 /// Resets transaction
104 void reset_transaction_preserve_handle(Transaction
&t
) {
105 return cache
->reset_transaction_preserve_handle(t
);
111 * Get the logical pin at offset
113 using get_pin_iertr
= LBAManager::get_mapping_iertr
;
114 using get_pin_ret
= LBAManager::get_mapping_iertr::future
<LBAPinRef
>;
118 return lba_manager
->get_mapping(t
, offset
);
124 * Get logical pins overlapping offset~length
126 using get_pins_iertr
= LBAManager::get_mappings_iertr
;
127 using get_pins_ret
= get_pins_iertr::future
<lba_pin_list_t
>;
128 get_pins_ret
get_pins(
131 extent_len_t length
) {
132 return lba_manager
->get_mappings(
139 * Get extent mapped at pin.
141 using pin_to_extent_iertr
= get_pin_iertr::extend_ertr
<
142 SegmentManager::read_ertr
>;
143 template <typename T
>
144 using pin_to_extent_ret
= pin_to_extent_iertr::future
<
145 TCachedExtentRef
<T
>>;
146 template <typename T
>
147 pin_to_extent_ret
<T
> pin_to_extent(
150 LOG_PREFIX(TransactionManager::pin_to_extent
);
151 using ret
= pin_to_extent_ret
<T
>;
152 SUBDEBUGT(seastore_tm
, "getting extent {}", t
, *pin
);
154 return cache
->get_extent
<T
>(
158 [this, pin
=std::move(pin
)](T
&extent
) mutable {
159 assert(!extent
.has_pin());
160 assert(!extent
.has_been_invalidated());
161 assert(!pin
->has_been_invalidated());
162 extent
.set_pin(std::move(pin
));
163 lba_manager
->add_pin(extent
.get_pin());
165 ).si_then([FNAME
, &t
](auto ref
) mutable -> ret
{
166 SUBDEBUGT(seastore_tm
, "got extent {}", t
, *ref
);
167 return pin_to_extent_ret
<T
>(
168 interruptible::ready_future_marker
{},
176 * Read extent of type T at offset~length
178 using read_extent_iertr
= get_pin_iertr::extend_ertr
<
179 SegmentManager::read_ertr
>;
180 template <typename T
>
181 using read_extent_ret
= read_extent_iertr::future
<
182 TCachedExtentRef
<T
>>;
183 template <typename T
>
184 read_extent_ret
<T
> read_extent(
187 extent_len_t length
) {
188 LOG_PREFIX(TransactionManager::read_extent
);
191 ).si_then([this, FNAME
, &t
, offset
, length
] (auto pin
) {
192 if (length
!= pin
->get_length() || !pin
->get_paddr().is_real()) {
193 SUBERRORT(seastore_tm
,
194 "offset {} len {} got wrong pin {}",
195 t
, offset
, length
, *pin
);
196 ceph_assert(0 == "Should be impossible");
198 return this->pin_to_extent
<T
>(t
, std::move(pin
));
205 * Read extent of type T at offset
207 template <typename T
>
208 read_extent_ret
<T
> read_extent(
211 LOG_PREFIX(TransactionManager::read_extent
);
214 ).si_then([this, FNAME
, &t
, offset
] (auto pin
) {
215 if (!pin
->get_paddr().is_real()) {
216 SUBERRORT(seastore_tm
,
217 "offset {} got wrong pin {}",
219 ceph_assert(0 == "Should be impossible");
221 return this->pin_to_extent
<T
>(t
, std::move(pin
));
225 /// Obtain mutable copy of extent
226 LogicalCachedExtentRef
get_mutable_extent(Transaction
&t
, LogicalCachedExtentRef ref
) {
227 LOG_PREFIX(TransactionManager::get_mutable_extent
);
228 auto ret
= cache
->duplicate_for_write(
230 ref
)->cast
<LogicalCachedExtent
>();
231 stats
.extents_mutated_total
++;
232 stats
.extents_mutated_bytes
+= ret
->get_length();
233 if (!ret
->has_pin()) {
234 SUBDEBUGT(seastore_tm
,
235 "duplicating {} for write: {}",
239 ret
->set_pin(ref
->get_pin().duplicate());
241 SUBDEBUGT(seastore_tm
,
242 "{} already pending",
245 assert(ref
->is_pending());
246 assert(&*ref
== &*ret
);
252 using ref_iertr
= LBAManager::ref_iertr
;
253 using ref_ret
= ref_iertr::future
<unsigned>;
255 /// Add refcount for ref
258 LogicalCachedExtentRef
&ref
);
260 /// Add refcount for offset
265 /// Remove refcount for ref
268 LogicalCachedExtentRef
&ref
);
270 /// Remove refcount for offset
275 /// remove refcount for list of offset
276 using refs_ret
= ref_iertr::future
<std::vector
<unsigned>>;
279 std::vector
<laddr_t
> offsets
);
284 * Allocates a new block of type T with the minimum lba range of size len
285 * greater than laddr_hint.
287 using alloc_extent_iertr
= LBAManager::alloc_extent_iertr
;
288 template <typename T
>
289 using alloc_extent_ret
= alloc_extent_iertr::future
<TCachedExtentRef
<T
>>;
290 template <typename T
>
291 alloc_extent_ret
<T
> alloc_extent(
295 placement_hint_t placement_hint
;
296 if constexpr (T::TYPE
== extent_types_t::OBJECT_DATA_BLOCK
||
297 T::TYPE
== extent_types_t::COLL_BLOCK
) {
298 placement_hint
= placement_hint_t::COLD
;
300 placement_hint
= placement_hint_t::HOT
;
302 auto ext
= epm
->alloc_new_extent
<T
>(
306 return lba_manager
->alloc_extent(
311 ).si_then([ext
=std::move(ext
), len
, laddr_hint
, &t
, this](auto &&ref
) mutable {
312 LOG_PREFIX(TransactionManager::alloc_extent
);
313 ext
->set_pin(std::move(ref
));
314 stats
.extents_allocated_total
++;
315 stats
.extents_allocated_bytes
+= len
;
316 SUBDEBUGT(seastore_tm
, "new extent: {}, laddr_hint: {}", t
, *ext
, laddr_hint
);
317 return alloc_extent_iertr::make_ready_future
<TCachedExtentRef
<T
>>(
322 using reserve_extent_iertr
= alloc_extent_iertr
;
323 using reserve_extent_ret
= reserve_extent_iertr::future
<LBAPinRef
>;
324 reserve_extent_ret
reserve_region(
328 return lba_manager
->alloc_extent(
337 * allocates more than one new blocks of type T.
339 using alloc_extents_iertr
= alloc_extent_iertr
;
341 alloc_extents_iertr::future
<std::vector
<TCachedExtentRef
<T
>>>
347 return seastar::do_with(std::vector
<TCachedExtentRef
<T
>>(),
348 [this, &t
, hint
, len
, num
] (auto &extents
) {
349 return trans_intr::do_for_each(
350 boost::make_counting_iterator(0),
351 boost::make_counting_iterator(num
),
352 [this, &t
, len
, hint
, &extents
] (auto i
) {
353 return alloc_extent
<T
>(t
, hint
, len
).si_then(
354 [&extents
](auto &&node
) {
355 extents
.push_back(node
);
357 }).si_then([&extents
] {
358 return alloc_extents_iertr::make_ready_future
359 <std::vector
<TCachedExtentRef
<T
>>>(std::move(extents
));
367 * Atomically submits transaction to persistence
369 using submit_transaction_iertr
= base_iertr
;
370 submit_transaction_iertr::future
<> submit_transaction(Transaction
&);
372 /// SegmentCleaner::ExtentCallbackInterface
373 using SegmentCleaner::ExtentCallbackInterface::submit_transaction_direct_ret
;
374 submit_transaction_direct_ret
submit_transaction_direct(
375 Transaction
&t
) final
;
377 using SegmentCleaner::ExtentCallbackInterface::get_next_dirty_extents_ret
;
378 get_next_dirty_extents_ret
get_next_dirty_extents(
381 size_t max_bytes
) final
;
383 using SegmentCleaner::ExtentCallbackInterface::rewrite_extent_ret
;
384 rewrite_extent_ret
rewrite_extent(
386 CachedExtentRef extent
) final
;
388 using SegmentCleaner::ExtentCallbackInterface::get_extent_if_live_ret
;
389 get_extent_if_live_ret
get_extent_if_live(
394 segment_off_t len
) final
;
396 using release_segment_ret
=
397 SegmentCleaner::ExtentCallbackInterface::release_segment_ret
;
398 release_segment_ret
release_segment(
399 segment_id_t id
) final
{
400 return segment_manager
.release(id
);
406 * Read root block meta entry for key.
408 using read_root_meta_iertr
= base_iertr
;
409 using read_root_meta_bare
= std::optional
<std::string
>;
410 using read_root_meta_ret
= read_root_meta_iertr::future
<
411 read_root_meta_bare
>;
412 read_root_meta_ret
read_root_meta(
414 const std::string
&key
) {
415 return cache
->get_root(
417 ).si_then([&key
](auto root
) {
418 auto meta
= root
->root
.get_meta();
419 auto iter
= meta
.find(key
);
420 if (iter
== meta
.end()) {
421 return seastar::make_ready_future
<read_root_meta_bare
>(std::nullopt
);
423 return seastar::make_ready_future
<read_root_meta_bare
>(iter
->second
);
431 * Update root block meta entry for key to value.
433 using update_root_meta_iertr
= base_iertr
;
434 using update_root_meta_ret
= update_root_meta_iertr::future
<>;
435 update_root_meta_ret
update_root_meta(
437 const std::string
& key
,
438 const std::string
& value
) {
439 return cache
->get_root(
441 ).si_then([this, &t
, &key
, &value
](RootBlockRef root
) {
442 root
= cache
->duplicate_for_write(t
, root
)->cast
<RootBlock
>();
444 auto meta
= root
->root
.get_meta();
447 root
->root
.set_meta(meta
);
448 return seastar::now();
455 * Get onode-tree root logical address
457 using read_onode_root_iertr
= base_iertr
;
458 using read_onode_root_ret
= read_onode_root_iertr::future
<laddr_t
>;
459 read_onode_root_ret
read_onode_root(Transaction
&t
) {
460 return cache
->get_root(t
).si_then([](auto croot
) {
461 laddr_t ret
= croot
->get_root().onode_root
;
469 * Write onode-tree root logical address, must be called after read.
471 void write_onode_root(Transaction
&t
, laddr_t addr
) {
472 auto croot
= cache
->get_root_fast(t
);
473 croot
= cache
->duplicate_for_write(t
, croot
)->cast
<RootBlock
>();
474 croot
->get_root().onode_root
= addr
;
478 * read_collection_root
480 * Get collection root addr
482 using read_collection_root_iertr
= base_iertr
;
483 using read_collection_root_ret
= read_collection_root_iertr::future
<
485 read_collection_root_ret
read_collection_root(Transaction
&t
) {
486 return cache
->get_root(t
).si_then([](auto croot
) {
487 return croot
->get_root().collection_root
.get();
492 * write_collection_root
494 * Update collection root addr
496 void write_collection_root(Transaction
&t
, coll_root_t cmroot
) {
497 auto croot
= cache
->get_root_fast(t
);
498 croot
= cache
->duplicate_for_write(t
, croot
)->cast
<RootBlock
>();
499 croot
->get_root().collection_root
.update(cmroot
);
502 extent_len_t
get_block_size() const {
503 return segment_manager
.get_block_size();
506 store_statfs_t
store_stat() const {
507 return segment_cleaner
->stat();
510 void add_segment_manager(SegmentManager
* sm
) {
511 LOG_PREFIX(TransactionManager::add_segment_manager
);
512 SUBDEBUG(seastore_tm
, "adding segment manager {}", sm
->get_device_id());
513 scanner
.add_segment_manager(sm
);
515 device_type_t::SEGMENTED
,
516 std::make_unique
<SegmentedAllocator
>(
524 ~TransactionManager();
527 friend class Transaction
;
529 // although there might be multiple devices backing seastore,
530 // only one of them are supposed to hold the journal. This
531 // segment manager is that device
532 SegmentManager
&segment_manager
;
533 SegmentCleanerRef segment_cleaner
;
535 LBAManagerRef lba_manager
;
537 ExtentPlacementManagerRef epm
;
538 ExtentReader
& scanner
;
540 WritePipeline write_pipeline
;
543 uint64_t extents_retired_total
= 0;
544 uint64_t extents_retired_bytes
= 0;
545 uint64_t extents_mutated_total
= 0;
546 uint64_t extents_mutated_bytes
= 0;
547 uint64_t extents_allocated_total
= 0;
548 uint64_t extents_allocated_bytes
= 0;
550 seastar::metrics::metric_group metrics
;
551 void register_metrics();
553 rewrite_extent_ret
rewrite_logical_extent(
555 LogicalCachedExtentRef extent
);
557 // Testing interfaces
558 auto get_segment_cleaner() {
559 return segment_cleaner
.get();
562 auto get_lba_manager() {
563 return lba_manager
.get();
566 using TransactionManagerRef
= std::unique_ptr
<TransactionManager
>;