]>
Commit | Line | Data |
---|---|---|
f67539c2 TL |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | ||
4 | #pragma once | |
5 | ||
6 | #include <iostream> | |
7 | ||
8 | #include "seastar/core/shared_future.hh" | |
9 | ||
10 | #include "include/buffer.h" | |
20effc67 TL |
11 | |
12 | #include "crimson/os/seastore/logging.h" | |
f67539c2 TL |
13 | #include "crimson/os/seastore/seastore_types.h" |
14 | #include "crimson/os/seastore/transaction.h" | |
15 | #include "crimson/os/seastore/segment_manager.h" | |
16 | #include "crimson/common/errorator.h" | |
17 | #include "crimson/os/seastore/cached_extent.h" | |
18 | #include "crimson/os/seastore/root_block.h" | |
19 | #include "crimson/os/seastore/segment_cleaner.h" | |
20effc67 | 20 | #include "crimson/os/seastore/random_block_manager.h" |
f67539c2 TL |
21 | |
22 | namespace crimson::os::seastore { | |
23 | ||
24 | /** | |
25 | * Cache | |
26 | * | |
27 | * This component is responsible for buffer management, including | |
28 | * transaction lifecycle. | |
29 | * | |
30 | * Seastore transactions are expressed as an atomic combination of | |
31 | * 1) newly written blocks | |
32 | * 2) logical mutations to existing physical blocks | |
33 | * | |
34 | * See record_t | |
35 | * | |
36 | * As such, any transaction has 3 components: | |
37 | * 1) read_set: references to extents read during the transaction | |
38 | * See Transaction::read_set | |
39 | * 2) write_set: references to extents to be written as: | |
40 | * a) new physical blocks, see Transaction::fresh_block_list | |
41 | * b) mutations to existing physical blocks, | |
42 | * see Transaction::mutated_block_list | |
43 | * 3) retired_set: extent refs to be retired either due to 2b or | |
44 | * due to releasing the extent generally. | |
45 | ||
46 | * In the case of 2b, the CachedExtent will have been copied into | |
47 | * a fresh CachedExtentRef such that the source extent ref is present | |
48 | * in the read set and the newly allocated extent is present in the | |
49 | * write_set. | |
50 | * | |
51 | * A transaction has 3 phases: | |
52 | * 1) construction: user calls Cache::get_transaction() and populates | |
53 | * the returned transaction by calling Cache methods | |
54 | * 2) submission: user calls Cache::try_start_transaction(). If | |
55 | * successful, the user may construct a record and submit the | |
56 | * transaction to the journal. | |
57 | * 3) completion: once the transaction is durable, the user must call | |
20effc67 | 58 | * Cache::complete_commit() with the block offset to complete |
f67539c2 TL |
59 | * the transaction. |
60 | * | |
61 | * Internally, in phase 1, the fields in Transaction are filled in. | |
62 | * - reads may block if the referenced extent is being written | |
63 | * - once a read obtains a particular CachedExtentRef for a paddr_t, | |
64 | * it'll always get the same one until overwritten | |
65 | * - once a paddr_t is overwritten or written, subsequent reads of | |
66 | * that addr will get the new ref | |
67 | * | |
68 | * In phase 2, if all extents in the read set are valid (not expired), | |
69 | * we can commit (otherwise, we fail and the user must retry). | |
70 | * - Expire all extents in the retired_set (they must all be valid) | |
71 | * - Remove all extents in the retired_set from Cache::extents | |
72 | * - Mark all extents in the write_set wait_io(), add promises to | |
73 | * transaction | |
74 | * - Merge Transaction::write_set into Cache::extents | |
75 | * | |
76 | * After phase 2, the user will submit the record to the journal. | |
77 | * Once complete, we perform phase 3: | |
78 | * - For each CachedExtent in block_list, call | |
79 | * CachedExtent::complete_initial_write(paddr_t) with the block's | |
80 | * final offset (inferred from the extent's position in the block_list | |
81 | * and extent lengths). | |
82 | * - For each block in mutation_list, call | |
83 | * CachedExtent::delta_written(paddr_t) with the address of the start | |
84 | * of the record | |
85 | * - Complete all promises with the final record start paddr_t | |
86 | */ | |
87 | class Cache { | |
88 | public: | |
20effc67 TL |
89 | using base_ertr = crimson::errorator< |
90 | crimson::ct_error::input_output_error>; | |
91 | using base_iertr = trans_iertr<base_ertr>; | |
92 | ||
93 | Cache(ExtentReader &reader); | |
f67539c2 TL |
94 | ~Cache(); |
95 | ||
20effc67 TL |
96 | /// Creates empty transaction by source |
97 | TransactionRef create_transaction( | |
98 | Transaction::src_t src, | |
99 | const char* name, | |
100 | bool is_weak) { | |
101 | LOG_PREFIX(Cache::create_transaction); | |
102 | ||
103 | ++(get_by_src(stats.trans_created_by_src, src)); | |
104 | ||
105 | auto ret = std::make_unique<Transaction>( | |
106 | get_dummy_ordering_handle(), | |
107 | is_weak, | |
108 | src, | |
109 | last_commit, | |
110 | [this](Transaction& t) { | |
111 | return on_transaction_destruct(t); | |
112 | } | |
113 | ); | |
114 | SUBDEBUGT(seastore_cache, "created name={}, source={}, is_weak={}", | |
115 | *ret, name, src, is_weak); | |
116 | return ret; | |
117 | } | |
118 | ||
119 | /// Resets transaction preserving | |
120 | void reset_transaction_preserve_handle(Transaction &t) { | |
121 | LOG_PREFIX(Cache::reset_transaction_preserve_handle); | |
122 | if (t.did_reset()) { | |
123 | ++(get_by_src(stats.trans_created_by_src, t.get_src())); | |
124 | } | |
125 | t.reset_preserve_handle(last_commit); | |
126 | SUBDEBUGT(seastore_cache, "reset", t); | |
127 | } | |
128 | ||
f67539c2 TL |
129 | /** |
130 | * drop_from_cache | |
131 | * | |
132 | * Drop extent from cache. Intended for use when | |
133 | * ref refers to a logically dead extent as during | |
134 | * replay. | |
135 | */ | |
136 | void drop_from_cache(CachedExtentRef ref) { | |
137 | remove_extent(ref); | |
138 | } | |
139 | ||
140 | /// Declare ref retired in t | |
141 | void retire_extent(Transaction &t, CachedExtentRef ref) { | |
142 | t.add_to_retired_set(ref); | |
143 | } | |
144 | ||
20effc67 TL |
145 | /// Declare paddr retired in t |
146 | using retire_extent_iertr = base_iertr; | |
147 | using retire_extent_ret = base_iertr::future<>; | |
148 | retire_extent_ret retire_extent_addr( | |
149 | Transaction &t, paddr_t addr, extent_len_t length); | |
f67539c2 TL |
150 | |
151 | /** | |
152 | * get_root | |
153 | * | |
154 | * returns ref to current root or t.root if modified in t | |
155 | */ | |
20effc67 TL |
156 | using get_root_iertr = base_iertr; |
157 | using get_root_ret = get_root_iertr::future<RootBlockRef>; | |
f67539c2 TL |
158 | get_root_ret get_root(Transaction &t); |
159 | ||
160 | /** | |
161 | * get_root_fast | |
162 | * | |
163 | * returns t.root and assume it is already present/read in t | |
164 | */ | |
165 | RootBlockRef get_root_fast(Transaction &t) { | |
166 | assert(t.root); | |
167 | return t.root; | |
168 | } | |
169 | ||
170 | /** | |
171 | * get_extent | |
172 | * | |
173 | * returns ref to extent at offset~length of type T either from | |
174 | * - extent_set if already in cache | |
175 | * - disk | |
176 | */ | |
20effc67 TL |
177 | using src_ext_t = std::pair<Transaction::src_t, extent_types_t>; |
178 | using get_extent_ertr = base_ertr; | |
f67539c2 | 179 | template <typename T> |
20effc67 TL |
180 | using get_extent_ret = get_extent_ertr::future<TCachedExtentRef<T>>; |
181 | template <typename T, typename Func> | |
182 | get_extent_ret<T> get_extent( | |
183 | paddr_t offset, ///< [in] starting addr | |
184 | segment_off_t length, ///< [in] length | |
185 | const src_ext_t* p_metric_key, ///< [in] cache query metric key | |
186 | Func &&extent_init_func ///< [in] init func for extent | |
f67539c2 | 187 | ) { |
20effc67 TL |
188 | auto cached = query_cache(offset, p_metric_key); |
189 | if (!cached) { | |
190 | auto ret = CachedExtent::make_cached_extent_ref<T>( | |
191 | alloc_cache_buf(length)); | |
192 | ret->set_paddr(offset); | |
193 | ret->state = CachedExtent::extent_state_t::CLEAN_PENDING; | |
194 | add_extent(ret); | |
195 | extent_init_func(*ret); | |
196 | return read_extent<T>( | |
197 | std::move(ret)); | |
198 | } | |
199 | ||
200 | // extent PRESENT in cache | |
201 | if (cached->get_type() == extent_types_t::RETIRED_PLACEHOLDER) { | |
202 | auto ret = CachedExtent::make_cached_extent_ref<T>( | |
203 | alloc_cache_buf(length)); | |
204 | ret->set_paddr(offset); | |
205 | ret->state = CachedExtent::extent_state_t::CLEAN_PENDING; | |
206 | extents.replace(*ret, *cached); | |
207 | ||
208 | // replace placeholder in transactions | |
209 | while (!cached->transactions.empty()) { | |
210 | auto t = cached->transactions.begin()->t; | |
211 | t->replace_placeholder(*cached, *ret); | |
212 | } | |
213 | ||
214 | cached->state = CachedExtent::extent_state_t::INVALID; | |
215 | extent_init_func(*ret); | |
216 | return read_extent<T>( | |
217 | std::move(ret)); | |
f67539c2 | 218 | } else { |
20effc67 TL |
219 | auto ret = TCachedExtentRef<T>(static_cast<T*>(cached.get())); |
220 | return ret->wait_io( | |
221 | ).then([ret=std::move(ret)]() mutable | |
222 | -> get_extent_ret<T> { | |
223 | // ret may be invalid, caller must check | |
224 | return get_extent_ret<T>( | |
225 | get_extent_ertr::ready_future_marker{}, | |
226 | std::move(ret)); | |
227 | }); | |
f67539c2 TL |
228 | } |
229 | } | |
20effc67 TL |
230 | template <typename T> |
231 | get_extent_ret<T> get_extent( | |
232 | paddr_t offset, ///< [in] starting addr | |
233 | segment_off_t length, ///< [in] length | |
234 | const src_ext_t* p_metric_key ///< [in] cache query metric key | |
235 | ) { | |
236 | return get_extent<T>( | |
237 | offset, length, p_metric_key, | |
238 | [](T &){}); | |
239 | } | |
f67539c2 TL |
240 | |
241 | /** | |
242 | * get_extent_if_cached | |
243 | * | |
244 | * Returns extent at offset if in cache | |
245 | */ | |
20effc67 TL |
246 | using get_extent_if_cached_iertr = base_iertr; |
247 | using get_extent_if_cached_ret = | |
248 | get_extent_if_cached_iertr::future<CachedExtentRef>; | |
249 | get_extent_if_cached_ret get_extent_if_cached( | |
f67539c2 TL |
250 | Transaction &t, |
251 | paddr_t offset, | |
20effc67 TL |
252 | extent_types_t type) { |
253 | CachedExtentRef ret; | |
254 | LOG_PREFIX(Cache::get_extent_if_cached); | |
255 | auto result = t.get_extent(offset, &ret); | |
f67539c2 | 256 | if (result != Transaction::get_extent_ret::ABSENT) { |
20effc67 TL |
257 | // including get_extent_ret::RETIRED |
258 | SUBDEBUGT(seastore_cache, | |
259 | "Found extent at offset {} on transaction: {}", | |
260 | t, offset, *ret); | |
261 | return get_extent_if_cached_iertr::make_ready_future< | |
262 | CachedExtentRef>(ret); | |
f67539c2 | 263 | } |
20effc67 TL |
264 | |
265 | // get_extent_ret::ABSENT from transaction | |
266 | auto metric_key = std::make_pair(t.get_src(), type); | |
267 | ret = query_cache(offset, &metric_key); | |
268 | if (!ret || | |
269 | // retired_placeholder is not really cached yet | |
270 | ret->get_type() == extent_types_t::RETIRED_PLACEHOLDER) { | |
271 | SUBDEBUGT(seastore_cache, | |
272 | "No extent at offset {}, retired_placeholder: {}", | |
273 | t, offset, !!ret); | |
274 | return get_extent_if_cached_iertr::make_ready_future< | |
275 | CachedExtentRef>(); | |
276 | } | |
277 | ||
278 | // present in cache and is not a retired_placeholder | |
279 | SUBDEBUGT(seastore_cache, | |
280 | "Found extent at offset {} in cache: {}", | |
281 | t, offset, *ret); | |
282 | t.add_to_read_set(ret); | |
283 | touch_extent(*ret); | |
284 | return ret->wait_io().then([ret] { | |
285 | return get_extent_if_cached_iertr::make_ready_future< | |
286 | CachedExtentRef>(ret); | |
287 | }); | |
f67539c2 TL |
288 | } |
289 | ||
290 | /** | |
291 | * get_extent | |
292 | * | |
293 | * returns ref to extent at offset~length of type T either from | |
294 | * - t if modified by t | |
295 | * - extent_set if already in cache | |
296 | * - disk | |
297 | * | |
298 | * t *must not* have retired offset | |
299 | */ | |
20effc67 TL |
300 | using get_extent_iertr = base_iertr; |
301 | template <typename T, typename Func> | |
302 | get_extent_iertr::future<TCachedExtentRef<T>> get_extent( | |
303 | Transaction &t, | |
304 | paddr_t offset, | |
305 | segment_off_t length, | |
306 | Func &&extent_init_func) { | |
f67539c2 | 307 | CachedExtentRef ret; |
20effc67 | 308 | LOG_PREFIX(Cache::get_extent); |
f67539c2 TL |
309 | auto result = t.get_extent(offset, &ret); |
310 | if (result != Transaction::get_extent_ret::ABSENT) { | |
311 | assert(result != Transaction::get_extent_ret::RETIRED); | |
20effc67 TL |
312 | SUBDEBUGT(seastore_cache, |
313 | "Found extent at offset {} on transaction: {}", | |
314 | t, offset, *ret); | |
315 | return seastar::make_ready_future<TCachedExtentRef<T>>( | |
f67539c2 TL |
316 | ret->cast<T>()); |
317 | } else { | |
20effc67 TL |
318 | auto metric_key = std::make_pair(t.get_src(), T::TYPE); |
319 | return trans_intr::make_interruptible( | |
320 | get_extent<T>( | |
321 | offset, length, &metric_key, | |
322 | std::forward<Func>(extent_init_func)) | |
323 | ).si_then([this, FNAME, offset, &t](auto ref) { | |
324 | (void)this; // silence incorrect clang warning about capture | |
325 | if (!ref->is_valid()) { | |
326 | SUBDEBUGT(seastore_cache, "got invalid extent: {}", t, ref); | |
327 | ++(get_by_src(stats.trans_conflicts_by_unknown, t.get_src())); | |
328 | mark_transaction_conflicted(t, *ref); | |
329 | return get_extent_iertr::make_ready_future<TCachedExtentRef<T>>(); | |
330 | } else { | |
331 | SUBDEBUGT(seastore_cache, | |
332 | "Read extent at offset {} in cache: {}", | |
333 | t, offset, *ref); | |
334 | touch_extent(*ref); | |
f67539c2 | 335 | t.add_to_read_set(ref); |
20effc67 | 336 | return get_extent_iertr::make_ready_future<TCachedExtentRef<T>>( |
f67539c2 | 337 | std::move(ref)); |
20effc67 TL |
338 | } |
339 | }); | |
f67539c2 TL |
340 | } |
341 | } | |
20effc67 TL |
342 | template <typename T> |
343 | get_extent_iertr::future<TCachedExtentRef<T>> get_extent( | |
344 | Transaction &t, | |
345 | paddr_t offset, | |
346 | segment_off_t length) { | |
347 | return get_extent<T>(t, offset, length, [](T &){}); | |
348 | } | |
349 | ||
f67539c2 TL |
350 | |
351 | /** | |
352 | * get_extent_by_type | |
353 | * | |
354 | * Based on type, instantiate the correct concrete type | |
355 | * and read in the extent at location offset~length. | |
356 | */ | |
20effc67 TL |
357 | private: |
358 | // This is a workaround std::move_only_function not being available, | |
359 | // not really worth generalizing at this time. | |
360 | class extent_init_func_t { | |
361 | struct callable_i { | |
362 | virtual void operator()(CachedExtent &extent) = 0; | |
363 | virtual ~callable_i() = default; | |
364 | }; | |
365 | template <typename Func> | |
366 | struct callable_wrapper final : callable_i { | |
367 | Func func; | |
368 | callable_wrapper(Func &&func) : func(std::forward<Func>(func)) {} | |
369 | void operator()(CachedExtent &extent) final { | |
370 | return func(extent); | |
371 | } | |
372 | ~callable_wrapper() final = default; | |
373 | }; | |
374 | public: | |
375 | std::unique_ptr<callable_i> wrapped; | |
376 | template <typename Func> | |
377 | extent_init_func_t(Func &&func) : wrapped( | |
378 | std::make_unique<callable_wrapper<Func>>(std::forward<Func>(func))) | |
379 | {} | |
380 | void operator()(CachedExtent &extent) { | |
381 | return (*wrapped)(extent); | |
382 | } | |
383 | }; | |
384 | get_extent_ertr::future<CachedExtentRef> _get_extent_by_type( | |
385 | extent_types_t type, | |
386 | paddr_t offset, | |
387 | laddr_t laddr, | |
388 | segment_off_t length, | |
389 | const Transaction::src_t* p_src, | |
390 | extent_init_func_t &&extent_init_func | |
f67539c2 TL |
391 | ); |
392 | ||
20effc67 TL |
393 | using get_extent_by_type_iertr = get_extent_iertr; |
394 | using get_extent_by_type_ret = get_extent_by_type_iertr::future< | |
395 | CachedExtentRef>; | |
396 | get_extent_by_type_ret _get_extent_by_type( | |
f67539c2 TL |
397 | Transaction &t, |
398 | extent_types_t type, | |
399 | paddr_t offset, | |
400 | laddr_t laddr, | |
20effc67 TL |
401 | segment_off_t length, |
402 | extent_init_func_t &&extent_init_func) { | |
f67539c2 | 403 | CachedExtentRef ret; |
20effc67 | 404 | auto status = t.get_extent(offset, &ret); |
f67539c2 | 405 | if (status == Transaction::get_extent_ret::RETIRED) { |
20effc67 | 406 | return seastar::make_ready_future<CachedExtentRef>(); |
f67539c2 | 407 | } else if (status == Transaction::get_extent_ret::PRESENT) { |
20effc67 | 408 | return seastar::make_ready_future<CachedExtentRef>(ret); |
f67539c2 | 409 | } else { |
20effc67 TL |
410 | auto src = t.get_src(); |
411 | return trans_intr::make_interruptible( | |
412 | _get_extent_by_type( | |
413 | type, offset, laddr, length, &src, | |
414 | std::move(extent_init_func)) | |
415 | ).si_then([=, &t](CachedExtentRef ret) { | |
416 | if (!ret->is_valid()) { | |
417 | LOG_PREFIX(Cache::get_extent_by_type); | |
418 | SUBDEBUGT(seastore_cache, "got invalid extent: {}", t, ret); | |
419 | ++(get_by_src(stats.trans_conflicts_by_unknown, t.get_src())); | |
420 | mark_transaction_conflicted(t, *ret.get()); | |
421 | return get_extent_ertr::make_ready_future<CachedExtentRef>(); | |
422 | } else { | |
423 | touch_extent(*ret); | |
424 | t.add_to_read_set(ret); | |
425 | return get_extent_ertr::make_ready_future<CachedExtentRef>( | |
426 | std::move(ret)); | |
427 | } | |
f67539c2 TL |
428 | }); |
429 | } | |
430 | } | |
431 | ||
20effc67 TL |
432 | public: |
433 | template <typename Func> | |
434 | get_extent_by_type_ret get_extent_by_type( | |
435 | Transaction &t, ///< [in] transaction | |
436 | extent_types_t type, ///< [in] type tag | |
437 | paddr_t offset, ///< [in] starting addr | |
438 | laddr_t laddr, ///< [in] logical address if logical | |
439 | segment_off_t length, ///< [in] length | |
440 | Func &&extent_init_func ///< [in] extent init func | |
f67539c2 | 441 | ) { |
20effc67 TL |
442 | return _get_extent_by_type( |
443 | t, | |
444 | type, | |
445 | offset, | |
446 | laddr, | |
447 | length, | |
448 | extent_init_func_t(std::forward<Func>(extent_init_func))); | |
449 | } | |
450 | get_extent_by_type_ret get_extent_by_type( | |
451 | Transaction &t, | |
452 | extent_types_t type, | |
453 | paddr_t offset, | |
454 | laddr_t laddr, | |
455 | segment_off_t length | |
456 | ) { | |
457 | return get_extent_by_type( | |
458 | t, type, offset, laddr, length, [](CachedExtent &) {}); | |
f67539c2 TL |
459 | } |
460 | ||
20effc67 | 461 | |
f67539c2 TL |
462 | /** |
463 | * alloc_new_extent | |
464 | * | |
20effc67 | 465 | * Allocates a fresh extent. if delayed is true, addr will be alloc'd later |
f67539c2 TL |
466 | */ |
467 | template <typename T> | |
468 | TCachedExtentRef<T> alloc_new_extent( | |
20effc67 TL |
469 | Transaction &t, ///< [in, out] current transaction |
470 | segment_off_t length, ///< [in] length | |
471 | bool delayed = false ///< [in] whether the paddr allocation of extent is delayed | |
f67539c2 TL |
472 | ) { |
473 | auto ret = CachedExtent::make_cached_extent_ref<T>( | |
474 | alloc_cache_buf(length)); | |
20effc67 | 475 | t.add_fresh_extent(ret, delayed); |
f67539c2 TL |
476 | ret->state = CachedExtent::extent_state_t::INITIAL_WRITE_PENDING; |
477 | return ret; | |
478 | } | |
479 | ||
20effc67 TL |
480 | void clear_lru() { |
481 | lru.clear(); | |
482 | } | |
483 | ||
484 | void mark_delayed_extent_inline( | |
485 | Transaction& t, | |
486 | LogicalCachedExtentRef& ref) { | |
487 | t.mark_delayed_extent_inline(ref); | |
488 | } | |
489 | ||
490 | void mark_delayed_extent_ool( | |
491 | Transaction& t, | |
492 | LogicalCachedExtentRef& ref, | |
493 | paddr_t final_addr) { | |
494 | t.mark_delayed_extent_ool(ref, final_addr); | |
495 | } | |
496 | ||
f67539c2 TL |
497 | /** |
498 | * alloc_new_extent_by_type | |
499 | * | |
500 | * Allocates a fresh extent. addr will be relative until commit. | |
501 | */ | |
502 | CachedExtentRef alloc_new_extent_by_type( | |
503 | Transaction &t, ///< [in, out] current transaction | |
504 | extent_types_t type, ///< [in] type tag | |
20effc67 TL |
505 | segment_off_t length, ///< [in] length |
506 | bool delayed = false ///< [in] whether delay addr allocation | |
f67539c2 TL |
507 | ); |
508 | ||
509 | /** | |
510 | * Allocates mutable buffer from extent_set on offset~len | |
511 | * | |
512 | * TODO: Note, currently all implementations literally copy the | |
513 | * buffer. This needn't be true, CachedExtent implementations could | |
514 | * choose to refer to the same buffer unmodified until commit and just | |
515 | * buffer the mutations in an ancillary data structure. | |
516 | * | |
517 | * @param current transaction | |
518 | * @param extent to duplicate | |
519 | * @return mutable extent | |
520 | */ | |
521 | CachedExtentRef duplicate_for_write( | |
522 | Transaction &t, ///< [in, out] current transaction | |
523 | CachedExtentRef i ///< [in] ref to existing extent | |
524 | ); | |
525 | ||
526 | /** | |
20effc67 | 527 | * prepare_record |
f67539c2 | 528 | * |
20effc67 | 529 | * Construct the record for Journal from transaction. |
f67539c2 | 530 | */ |
20effc67 | 531 | record_t prepare_record( |
f67539c2 TL |
532 | Transaction &t ///< [in, out] current transaction |
533 | ); | |
534 | ||
535 | /** | |
536 | * complete_commit | |
537 | * | |
538 | * Must be called upon completion of write. Releases blocks on mutating | |
539 | * extents, fills in addresses, and calls relevant callbacks on fresh | |
540 | * and mutated extents. | |
541 | */ | |
542 | void complete_commit( | |
543 | Transaction &t, ///< [in, out] current transaction | |
544 | paddr_t final_block_start, ///< [in] offset of initial block | |
545 | journal_seq_t seq, ///< [in] journal commit seq | |
546 | SegmentCleaner *cleaner=nullptr ///< [out] optional segment stat listener | |
547 | ); | |
548 | ||
549 | /** | |
550 | * init | |
551 | */ | |
552 | void init(); | |
553 | ||
554 | /** | |
555 | * mkfs | |
556 | * | |
557 | * Alloc initial root node and add to t. The intention is for other | |
558 | * components to use t to adjust the resulting root ref prior to commit. | |
559 | */ | |
20effc67 TL |
560 | using mkfs_iertr = base_iertr; |
561 | mkfs_iertr::future<> mkfs(Transaction &t); | |
f67539c2 TL |
562 | |
563 | /** | |
564 | * close | |
565 | * | |
566 | * TODO: should flush dirty blocks | |
567 | */ | |
568 | using close_ertr = crimson::errorator< | |
569 | crimson::ct_error::input_output_error>; | |
570 | close_ertr::future<> close(); | |
571 | ||
572 | /** | |
573 | * replay_delta | |
574 | * | |
575 | * Intended for use in Journal::delta. For each delta, should decode delta, | |
576 | * read relevant block from disk or cache (using correct type), and call | |
577 | * CachedExtent::apply_delta marking the extent dirty. | |
578 | */ | |
579 | using replay_delta_ertr = crimson::errorator< | |
580 | crimson::ct_error::input_output_error>; | |
581 | using replay_delta_ret = replay_delta_ertr::future<>; | |
582 | replay_delta_ret replay_delta( | |
583 | journal_seq_t seq, | |
584 | paddr_t record_block_base, | |
585 | const delta_info_t &delta); | |
586 | ||
587 | /** | |
588 | * init_cached_extents | |
589 | * | |
590 | * Calls passed lambda for each dirty cached block. Intended for use | |
591 | * after replay to allow lba_manager (or w/e) to read in any ancestor | |
592 | * blocks. | |
593 | */ | |
20effc67 TL |
594 | using init_cached_extents_iertr = base_iertr; |
595 | using init_cached_extents_ret = init_cached_extents_iertr::future<>; | |
f67539c2 TL |
596 | template <typename F> |
597 | init_cached_extents_ret init_cached_extents( | |
598 | Transaction &t, | |
599 | F &&f) | |
600 | { | |
20effc67 TL |
601 | // journal replay should has been finished at this point, |
602 | // Cache::root should have been inserted to the dirty list | |
603 | assert(root->is_dirty()); | |
f67539c2 TL |
604 | std::vector<CachedExtentRef> dirty; |
605 | for (auto &e : extents) { | |
606 | dirty.push_back(CachedExtentRef(&e)); | |
607 | } | |
608 | return seastar::do_with( | |
609 | std::forward<F>(f), | |
610 | std::move(dirty), | |
611 | [&t](auto &f, auto &refs) mutable { | |
20effc67 | 612 | return trans_intr::do_for_each( |
f67539c2 TL |
613 | refs, |
614 | [&t, &f](auto &e) { return f(t, e); }); | |
20effc67 TL |
615 | }).handle_error_interruptible( |
616 | init_cached_extents_iertr::pass_further{}, | |
617 | crimson::ct_error::assert_all{ | |
618 | "Invalid error in Cache::init_cached_extents" | |
619 | } | |
620 | ); | |
f67539c2 TL |
621 | } |
622 | ||
623 | /** | |
624 | * update_extent_from_transaction | |
625 | * | |
626 | * Updates passed extent based on t. If extent has been retired, | |
627 | * a null result will be returned. | |
628 | */ | |
629 | CachedExtentRef update_extent_from_transaction( | |
630 | Transaction &t, | |
631 | CachedExtentRef extent) { | |
632 | if (extent->get_type() == extent_types_t::ROOT) { | |
633 | if (t.root) { | |
634 | return t.root; | |
635 | } else { | |
20effc67 TL |
636 | t.add_to_read_set(extent); |
637 | t.root = extent->cast<RootBlock>(); | |
f67539c2 TL |
638 | return extent; |
639 | } | |
640 | } else { | |
641 | auto result = t.get_extent(extent->get_paddr(), &extent); | |
642 | if (result == Transaction::get_extent_ret::RETIRED) { | |
643 | return CachedExtentRef(); | |
644 | } else { | |
20effc67 TL |
645 | if (result == Transaction::get_extent_ret::ABSENT) { |
646 | t.add_to_read_set(extent); | |
647 | } | |
f67539c2 TL |
648 | return extent; |
649 | } | |
650 | } | |
651 | } | |
652 | ||
653 | /** | |
654 | ||
655 | * | |
656 | * Dump summary of contents (TODO) | |
657 | */ | |
658 | std::ostream &print( | |
659 | std::ostream &out) const { | |
660 | return out; | |
661 | } | |
662 | ||
20effc67 TL |
663 | /** |
664 | * get_next_dirty_extents | |
665 | * | |
666 | * Returns extents with get_dirty_from() < seq and adds to read set of | |
667 | * t. | |
668 | */ | |
669 | using get_next_dirty_extents_iertr = base_iertr; | |
670 | using get_next_dirty_extents_ret = get_next_dirty_extents_iertr::future< | |
f67539c2 TL |
671 | std::vector<CachedExtentRef>>; |
672 | get_next_dirty_extents_ret get_next_dirty_extents( | |
20effc67 TL |
673 | Transaction &t, |
674 | journal_seq_t seq, | |
675 | size_t max_bytes); | |
676 | ||
677 | /// returns std::nullopt if no dirty extents or get_dirty_from() for oldest | |
678 | std::optional<journal_seq_t> get_oldest_dirty_from() const { | |
679 | if (dirty.empty()) { | |
680 | return std::nullopt; | |
681 | } else { | |
682 | auto oldest = dirty.begin()->get_dirty_from(); | |
683 | if (oldest == journal_seq_t()) { | |
684 | return std::nullopt; | |
685 | } else { | |
686 | return oldest; | |
687 | } | |
688 | } | |
689 | } | |
690 | ||
691 | /// Dump live extents | |
692 | void dump_contents(); | |
f67539c2 TL |
693 | |
694 | private: | |
20effc67 | 695 | ExtentReader &reader; ///< ref to extent reader |
f67539c2 TL |
696 | RootBlockRef root; ///< ref to current root |
697 | ExtentIndex extents; ///< set of live extents | |
698 | ||
20effc67 TL |
699 | journal_seq_t last_commit = JOURNAL_SEQ_MIN; |
700 | ||
f67539c2 TL |
701 | /** |
702 | * dirty | |
703 | * | |
20effc67 | 704 | * holds refs to dirty extents. Ordered by CachedExtent::get_dirty_from(). |
f67539c2 TL |
705 | */ |
706 | CachedExtent::list dirty; | |
707 | ||
20effc67 TL |
708 | /** |
709 | * lru | |
710 | * | |
711 | * holds references to recently used extents | |
712 | */ | |
713 | class LRU { | |
714 | // max size (bytes) | |
715 | const size_t capacity = 0; | |
716 | ||
717 | // current size (bytes) | |
718 | size_t contents = 0; | |
719 | ||
720 | CachedExtent::list lru; | |
721 | ||
722 | void trim_to_capacity() { | |
723 | while (contents > capacity) { | |
724 | assert(lru.size() > 0); | |
725 | remove_from_lru(lru.front()); | |
726 | } | |
727 | } | |
728 | ||
729 | void add_to_lru(CachedExtent &extent) { | |
730 | assert( | |
731 | extent.is_clean() && | |
732 | !extent.is_pending() && | |
733 | !extent.is_placeholder()); | |
734 | ||
735 | if (!extent.primary_ref_list_hook.is_linked()) { | |
736 | contents += extent.get_length(); | |
737 | intrusive_ptr_add_ref(&extent); | |
738 | lru.push_back(extent); | |
739 | } | |
740 | trim_to_capacity(); | |
741 | } | |
742 | ||
743 | public: | |
744 | LRU(size_t capacity) : capacity(capacity) {} | |
745 | ||
746 | size_t get_current_contents_bytes() const { | |
747 | return contents; | |
748 | } | |
749 | ||
750 | size_t get_current_contents_extents() const { | |
751 | return lru.size(); | |
752 | } | |
753 | ||
754 | void remove_from_lru(CachedExtent &extent) { | |
755 | assert(extent.is_clean()); | |
756 | assert(!extent.is_pending()); | |
757 | assert(!extent.is_placeholder()); | |
758 | ||
759 | if (extent.primary_ref_list_hook.is_linked()) { | |
760 | lru.erase(lru.s_iterator_to(extent)); | |
761 | assert(contents >= extent.get_length()); | |
762 | contents -= extent.get_length(); | |
763 | intrusive_ptr_release(&extent); | |
764 | } | |
765 | } | |
766 | ||
767 | void move_to_top(CachedExtent &extent) { | |
768 | assert( | |
769 | extent.is_clean() && | |
770 | !extent.is_pending() && | |
771 | !extent.is_placeholder()); | |
772 | ||
773 | if (extent.primary_ref_list_hook.is_linked()) { | |
774 | lru.erase(lru.s_iterator_to(extent)); | |
775 | intrusive_ptr_release(&extent); | |
776 | assert(contents >= extent.get_length()); | |
777 | contents -= extent.get_length(); | |
778 | } | |
779 | add_to_lru(extent); | |
780 | } | |
781 | ||
782 | void clear() { | |
783 | LOG_PREFIX(Cache::LRU::clear); | |
784 | for (auto iter = lru.begin(); iter != lru.end();) { | |
785 | SUBDEBUG(seastore_cache, "clearing {}", *iter); | |
786 | remove_from_lru(*(iter++)); | |
787 | } | |
788 | } | |
789 | ||
790 | ~LRU() { | |
791 | clear(); | |
792 | } | |
793 | } lru; | |
794 | ||
/// Pair of counters kept per transaction source (see cache_query_by_src);
/// both start at zero.
struct query_counters_t {
  uint64_t access{0};
  uint64_t hit{0};
};
799 | ||
/**
 * effort_t
 *
 * Count the number of extents involved in the effort and the total bytes of
 * them.
 *
 * Each effort_t represents the effort of a set of extents involved in the
 * transaction, classified by read, mutate, retire and allocate behaviors,
 * see XXX_trans_efforts_t.
 */
struct effort_t {
  uint64_t extents{0};
  uint64_t bytes{0};

  /// accounts for one more extent of extent_len bytes
  void increment(uint64_t extent_len) {
    extents += 1;
    bytes += extent_len;
  }
};
819 | ||
820 | template <typename CounterT> | |
821 | using counter_by_extent_t = std::array<CounterT, EXTENT_TYPES_MAX>; | |
822 | ||
823 | struct invalid_trans_efforts_t { | |
824 | effort_t read; | |
825 | effort_t mutate; | |
826 | uint64_t mutate_delta_bytes = 0; | |
827 | effort_t retire; | |
828 | effort_t fresh; | |
829 | effort_t fresh_ool_written; | |
830 | counter_by_extent_t<uint64_t> num_trans_invalidated; | |
831 | uint64_t num_ool_records = 0; | |
832 | uint64_t ool_record_bytes = 0; | |
833 | }; | |
834 | ||
835 | struct commit_trans_efforts_t { | |
836 | counter_by_extent_t<effort_t> read_by_ext; | |
837 | counter_by_extent_t<effort_t> mutate_by_ext; | |
838 | counter_by_extent_t<uint64_t> delta_bytes_by_ext; | |
839 | counter_by_extent_t<effort_t> retire_by_ext; | |
840 | counter_by_extent_t<effort_t> fresh_invalid_by_ext; // inline but is already invalid (retired) | |
841 | counter_by_extent_t<effort_t> fresh_inline_by_ext; | |
842 | counter_by_extent_t<effort_t> fresh_ool_by_ext; | |
843 | uint64_t num_trans = 0; // the number of inline records | |
844 | uint64_t num_ool_records = 0; | |
845 | uint64_t ool_record_padding_bytes = 0; | |
846 | uint64_t ool_record_metadata_bytes = 0; | |
847 | uint64_t ool_record_data_bytes = 0; | |
848 | uint64_t inline_record_metadata_bytes = 0; // metadata exclude the delta bytes | |
849 | }; | |
850 | ||
851 | struct success_read_trans_efforts_t { | |
852 | effort_t read; | |
853 | uint64_t num_trans = 0; | |
854 | }; | |
855 | ||
856 | struct tree_efforts_t { | |
857 | uint64_t num_inserts = 0; | |
858 | uint64_t num_erases = 0; | |
859 | ||
860 | void increment(const Transaction::tree_stats_t& incremental) { | |
861 | num_inserts += incremental.num_inserts; | |
862 | num_erases += incremental.num_erases; | |
863 | } | |
864 | }; | |
865 | ||
866 | template <typename CounterT> | |
867 | using counter_by_src_t = std::array<CounterT, Transaction::SRC_MAX>; | |
868 | ||
869 | static constexpr std::size_t NUM_SRC_COMB = | |
870 | Transaction::SRC_MAX * (Transaction::SRC_MAX + 1) / 2; | |
871 | ||
872 | struct { | |
873 | counter_by_src_t<uint64_t> trans_created_by_src; | |
874 | counter_by_src_t<commit_trans_efforts_t> committed_efforts_by_src; | |
875 | counter_by_src_t<invalid_trans_efforts_t> invalidated_efforts_by_src; | |
876 | counter_by_src_t<query_counters_t> cache_query_by_src; | |
877 | success_read_trans_efforts_t success_read_efforts; | |
878 | uint64_t dirty_bytes = 0; | |
879 | ||
880 | uint64_t onode_tree_depth = 0; | |
881 | counter_by_src_t<tree_efforts_t> committed_onode_tree_efforts; | |
882 | counter_by_src_t<tree_efforts_t> invalidated_onode_tree_efforts; | |
883 | ||
884 | uint64_t lba_tree_depth = 0; | |
885 | counter_by_src_t<tree_efforts_t> committed_lba_tree_efforts; | |
886 | counter_by_src_t<tree_efforts_t> invalidated_lba_tree_efforts; | |
887 | ||
888 | std::array<uint64_t, NUM_SRC_COMB> trans_conflicts_by_srcs; | |
889 | counter_by_src_t<uint64_t> trans_conflicts_by_unknown; | |
890 | } stats; | |
891 | ||
892 | template <typename CounterT> | |
893 | CounterT& get_by_src( | |
894 | counter_by_src_t<CounterT>& counters_by_src, | |
895 | Transaction::src_t src) { | |
896 | assert(static_cast<std::size_t>(src) < counters_by_src.size()); | |
897 | return counters_by_src[static_cast<std::size_t>(src)]; | |
898 | } | |
899 | ||
900 | template <typename CounterT> | |
901 | CounterT& get_by_ext( | |
902 | counter_by_extent_t<CounterT>& counters_by_ext, | |
903 | extent_types_t ext) { | |
904 | auto index = static_cast<uint8_t>(ext); | |
905 | assert(index < EXTENT_TYPES_MAX); | |
906 | return counters_by_ext[index]; | |
907 | } | |
908 | ||
909 | void account_conflict(Transaction::src_t src1, Transaction::src_t src2) { | |
910 | assert(src1 < Transaction::src_t::MAX); | |
911 | assert(src2 < Transaction::src_t::MAX); | |
912 | if (src1 > src2) { | |
913 | std::swap(src1, src2); | |
914 | } | |
915 | // impossible combinations | |
916 | // should be consistent with trans_srcs_invalidated in register_metrics() | |
917 | assert(!(src1 == Transaction::src_t::READ && | |
918 | src2 == Transaction::src_t::READ)); | |
919 | assert(!(src1 == Transaction::src_t::CLEANER_TRIM && | |
920 | src2 == Transaction::src_t::CLEANER_TRIM)); | |
921 | assert(!(src1 == Transaction::src_t::CLEANER_RECLAIM && | |
922 | src2 == Transaction::src_t::CLEANER_RECLAIM)); | |
923 | assert(!(src1 == Transaction::src_t::CLEANER_TRIM && | |
924 | src2 == Transaction::src_t::CLEANER_RECLAIM)); | |
925 | ||
926 | auto src1_value = static_cast<std::size_t>(src1); | |
927 | auto src2_value = static_cast<std::size_t>(src2); | |
928 | auto num_srcs = static_cast<std::size_t>(Transaction::src_t::MAX); | |
929 | auto conflict_index = num_srcs * src1_value + src2_value - | |
930 | src1_value * (src1_value + 1) / 2; | |
931 | assert(conflict_index < NUM_SRC_COMB); | |
932 | ++stats.trans_conflicts_by_srcs[conflict_index]; | |
933 | } | |
934 | ||
935 | seastar::metrics::metric_group metrics; | |
936 | void register_metrics(); | |
937 | ||
f67539c2 TL |
938 | /// alloc buffer for cached extent |
939 | bufferptr alloc_cache_buf(size_t size) { | |
940 | // TODO: memory pooling etc | |
941 | auto bp = ceph::bufferptr( | |
942 | buffer::create_page_aligned(size)); | |
943 | bp.zero(); | |
944 | return bp; | |
945 | } | |
946 | ||
20effc67 TL |
947 | /// Update lru for access to ref |
948 | void touch_extent(CachedExtent &ext) { | |
949 | assert(!ext.is_pending()); | |
950 | if (ext.is_clean() && !ext.is_placeholder()) { | |
951 | lru.move_to_top(ext); | |
952 | } | |
953 | } | |
954 | ||
f67539c2 TL |
955 | /// Add extent to extents handling dirty and refcounting |
956 | void add_extent(CachedExtentRef ref); | |
957 | ||
958 | /// Mark existing extent ref dirty -- mainly for replay |
959 | void mark_dirty(CachedExtentRef ref); | |
960 | ||
961 | /// Add dirty extent to dirty list | |
962 | void add_to_dirty(CachedExtentRef ref); | |
963 | ||
20effc67 TL |
964 | /// Remove from dirty list |
965 | void remove_from_dirty(CachedExtentRef ref); | |
966 | ||
f67539c2 TL |
967 | /// Remove extent from extents handling dirty and refcounting |
968 | void remove_extent(CachedExtentRef ref); | |
969 | ||
20effc67 TL |
970 | /// Retire extent |
971 | void commit_retire_extent(Transaction& t, CachedExtentRef ref); | |
972 | ||
f67539c2 | 973 | /// Replace prev with next |
20effc67 TL |
974 | void commit_replace_extent(Transaction& t, CachedExtentRef next, CachedExtentRef prev); |
975 | ||
976 | /// Invalidate extent and mark affected transactions | |
977 | void invalidate_extent(Transaction& t, CachedExtent& extent); | |
978 | ||
979 | /// Mark a valid transaction as conflicted | |
980 | void mark_transaction_conflicted( | |
981 | Transaction& t, CachedExtent& conflicting_extent); | |
982 | ||
983 | /// Introspect transaction when it is being destructed | |
984 | void on_transaction_destruct(Transaction& t); | |
985 | ||
986 | template <typename T> | |
987 | get_extent_ret<T> read_extent( | |
988 | TCachedExtentRef<T>&& extent | |
989 | ) { | |
990 | assert(extent->state == CachedExtent::extent_state_t::CLEAN_PENDING); | |
991 | extent->set_io_wait(); | |
992 | return reader.read( | |
993 | extent->get_paddr(), | |
994 | extent->get_length(), | |
995 | extent->get_bptr() | |
996 | ).safe_then( | |
997 | [extent=std::move(extent)]() mutable { | |
998 | extent->state = CachedExtent::extent_state_t::CLEAN; | |
999 | /* TODO: crc should be checked against LBA manager */ | |
1000 | extent->last_committed_crc = extent->get_crc32c(); | |
1001 | ||
1002 | extent->on_clean_read(); | |
1003 | extent->complete_io(); | |
1004 | return get_extent_ertr::make_ready_future<TCachedExtentRef<T>>( | |
1005 | std::move(extent)); | |
1006 | }, | |
1007 | get_extent_ertr::pass_further{}, | |
1008 | crimson::ct_error::assert_all{ | |
1009 | "Cache::get_extent: invalid error" | |
1010 | } | |
1011 | ); | |
1012 | } | |
1013 | ||
1014 | // Extents in cache may contain placeholders | |
1015 | CachedExtentRef query_cache( | |
1016 | paddr_t offset, | |
1017 | const src_ext_t* p_metric_key) { | |
1018 | query_counters_t* p_counters = nullptr; | |
1019 | if (p_metric_key) { | |
1020 | p_counters = &get_by_src(stats.cache_query_by_src, p_metric_key->first); | |
1021 | ++p_counters->access; | |
1022 | } | |
1023 | if (auto iter = extents.find_offset(offset); | |
1024 | iter != extents.end()) { | |
1025 | if (p_metric_key && | |
1026 | // retired_placeholder is not really cached yet | |
1027 | iter->get_type() != extent_types_t::RETIRED_PLACEHOLDER) { | |
1028 | ++p_counters->hit; | |
1029 | } | |
1030 | return CachedExtentRef(&*iter); | |
1031 | } else { | |
1032 | return CachedExtentRef(); | |
1033 | } | |
1034 | } | |
1035 | ||
f67539c2 | 1036 | }; |
20effc67 | 1037 | using CacheRef = std::unique_ptr<Cache>; |
f67539c2 TL |
1038 | |
1039 | } |