// ceph/src/crimson/os/seastore/cache.cc
// (source snapshot: Ceph Pacific 16.2.2)
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3
4 #include "crimson/os/seastore/cache.h"
5 #include "crimson/common/log.h"
6
7 // included for get_extent_by_type
8 #include "crimson/os/seastore/extentmap_manager/btree/extentmap_btree_node_impl.h"
9 #include "crimson/os/seastore/lba_manager/btree/lba_btree_node_impl.h"
10 #include "crimson/os/seastore/onode_manager/simple-fltree/onode_block.h"
11 #include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/seastore.h"
12 #include "test/crimson/seastore/test_block.h"
13
namespace {
  // File-local logging helper for this translation unit.
  // NOTE(review): logs under the *filestore* subsystem even though this is
  // seastore code — presumably a placeholder tag; confirm whether a dedicated
  // seastore subsys should be used in this tree before changing it.
  seastar::logger& logger() {
    return crimson::get_logger(ceph_subsys_filestore);
  }
}
19
20 namespace crimson::os::seastore {
21
// Construct a Cache bound to the SegmentManager from which non-resident
// extents are read.
Cache::Cache(SegmentManager &segment_manager) :
  segment_manager(segment_manager) {}
24
25 Cache::~Cache()
26 {
27 for (auto &i: extents) {
28 logger().error("~Cache: extent {} still alive", i);
29 }
30 ceph_assert(extents.empty());
31 }
32
33 Cache::retire_extent_ret Cache::retire_extent_if_cached(
34 Transaction &t, paddr_t addr)
35 {
36 if (auto ext = t.write_set.find_offset(addr); ext != t.write_set.end()) {
37 logger().debug("{}: found {} in t.write_set", __func__, addr);
38 t.add_to_retired_set(CachedExtentRef(&*ext));
39 return retire_extent_ertr::now();
40 } else if (auto iter = extents.find_offset(addr);
41 iter != extents.end()) {
42 auto ret = CachedExtentRef(&*iter);
43 return ret->wait_io().then([&t, ret=std::move(ret)]() mutable {
44 t.add_to_retired_set(ret);
45 return retire_extent_ertr::now();
46 });
47 } else {
48 return retire_extent_ertr::now();
49 }
50 }
51
52 void Cache::add_extent(CachedExtentRef ref)
53 {
54 assert(ref->is_valid());
55 extents.insert(*ref);
56
57 if (ref->is_dirty()) {
58 add_to_dirty(ref);
59 } else {
60 ceph_assert(!ref->primary_ref_list_hook.is_linked());
61 }
62 logger().debug("add_extent: {}", *ref);
63 }
64
65 void Cache::mark_dirty(CachedExtentRef ref)
66 {
67 if (ref->is_dirty()) {
68 assert(ref->primary_ref_list_hook.is_linked());
69 return;
70 }
71
72 add_to_dirty(ref);
73 ref->state = CachedExtent::extent_state_t::DIRTY;
74
75 logger().debug("mark_dirty: {}", *ref);
76 }
77
78 void Cache::add_to_dirty(CachedExtentRef ref)
79 {
80 assert(ref->is_valid());
81 assert(!ref->primary_ref_list_hook.is_linked());
82 intrusive_ptr_add_ref(&*ref);
83 dirty.push_back(*ref);
84 }
85
86 void Cache::remove_extent(CachedExtentRef ref)
87 {
88 logger().debug("remove_extent: {}", *ref);
89 assert(ref->is_valid());
90 extents.erase(*ref);
91
92 if (ref->is_dirty()) {
93 ceph_assert(ref->primary_ref_list_hook.is_linked());
94 dirty.erase(dirty.s_iterator_to(*ref));
95 intrusive_ptr_release(&*ref);
96 } else {
97 ceph_assert(!ref->primary_ref_list_hook.is_linked());
98 }
99 }
100
// Replace prev with next in the extent index and on the dirty list.
// next is the committed successor of prev: same physical address,
// version exactly one greater.
void Cache::replace_extent(CachedExtentRef next, CachedExtentRef prev)
{
  assert(next->get_paddr() == prev->get_paddr());
  assert(next->version == prev->version + 1);
  extents.replace(*next, *prev);

  if (prev->is_dirty()) {
    ceph_assert(prev->primary_ref_list_hook.is_linked());
    // Splice next into prev's exact position so the dirty list keeps its
    // ordering, then transfer the list's reference from prev to next.
    auto prev_it = dirty.iterator_to(*prev);
    dirty.insert(prev_it, *next);
    dirty.erase(prev_it);
    intrusive_ptr_release(&*prev);
    intrusive_ptr_add_ref(&*next);
  } else {
    // prev was clean; next becomes newly dirty at the tail of the list.
    add_to_dirty(next);
  }
}
118
// Dispatch runtime extent-type tags to the typed alloc_new_extent<T>()
// template.  ROOT and NONE are never allocated this way and abort.
// Note: ONODE_BLOCK_STAGED is not handled here (falls to "impossible") --
// presumably staged onode extents are allocated through another path; confirm.
CachedExtentRef Cache::alloc_new_extent_by_type(
  Transaction &t,       ///< [in, out] current transaction
  extent_types_t type,  ///< [in] type tag
  segment_off_t length  ///< [in] length
)
{
  switch (type) {
  case extent_types_t::ROOT:
    assert(0 == "ROOT is never directly alloc'd");
    return CachedExtentRef();
  case extent_types_t::LADDR_INTERNAL:
    return alloc_new_extent<lba_manager::btree::LBAInternalNode>(t, length);
  case extent_types_t::LADDR_LEAF:
    return alloc_new_extent<lba_manager::btree::LBALeafNode>(t, length);
  case extent_types_t::ONODE_BLOCK:
    return alloc_new_extent<OnodeBlock>(t, length);
  case extent_types_t::EXTMAP_INNER:
    return alloc_new_extent<extentmap_manager::ExtMapInnerNode>(t, length);
  case extent_types_t::EXTMAP_LEAF:
    return alloc_new_extent<extentmap_manager::ExtMapLeafNode>(t, length);
  case extent_types_t::TEST_BLOCK:
    return alloc_new_extent<TestBlock>(t, length);
  case extent_types_t::TEST_BLOCK_PHYSICAL:
    return alloc_new_extent<TestBlockPhysical>(t, length);
  case extent_types_t::NONE: {
    ceph_assert(0 == "NONE is an invalid extent type");
    return CachedExtentRef();
  }
  default:
    ceph_assert(0 == "impossible");
    return CachedExtentRef();
  }
}
152
// Return a mutable (MUTATION_PENDING) copy of i registered with t.
// If i is already pending it is returned as-is; otherwise a duplicate
// is created, linked back to i via prior_instance, and its version bumped.
CachedExtentRef Cache::duplicate_for_write(
  Transaction &t,
  CachedExtentRef i) {
  if (i->is_pending())
    return i;

  auto ret = i->duplicate_for_write();
  if (ret->get_type() == extent_types_t::ROOT) {
    // root must be loaded before mutate
    assert(t.root == i);
    t.root = ret->cast<RootBlock>();
  } else {
    // Carry the committed crc forward and remember the prior instance so
    // try_construct_record can swap prev for next at commit time.
    ret->last_committed_crc = i->last_committed_crc;
    ret->prior_instance = i;
    t.add_mutated_extent(ret);
  }

  ret->version++;
  ret->state = CachedExtent::extent_state_t::MUTATION_PENDING;
  logger().debug("Cache::duplicate_for_write: {} -> {}", *i, *ret);
  return ret;
}
175
// Attempt to turn transaction t into a journal record.
// Returns std::nullopt if any extent in t's read set has been invalidated
// by a concurrent commit (the caller is expected to retry the transaction).
// On success, also applies t's in-memory cache effects: mutated extents
// replace their prior instances (with io_wait set until written) and
// retired extents are removed from the cache and marked INVALID.
std::optional<record_t> Cache::try_construct_record(Transaction &t)
{
  // First, validate read set
  for (auto &i: t.read_set) {
    if (i->state == CachedExtent::extent_state_t::INVALID)
      return std::nullopt;
  }

  record_t record;

  // The write set has served its purpose (lookups during the transaction);
  // clear it before committing.
  t.write_set.clear();

  // Add new copy of mutated blocks, set_io_wait to block until written
  record.deltas.reserve(t.mutated_block_list.size());
  for (auto &i: t.mutated_block_list) {
    if (!i->is_valid()) {
      // Extent was retired/invalidated within this same transaction.
      logger().debug("try_construct_record: ignoring invalid {}", *i);
      continue;
    }
    logger().debug("try_construct_record: mutating {}", *i);

    // Swap the pending copy in for its committed predecessor.
    assert(i->prior_instance);
    replace_extent(i, i->prior_instance);

    i->prepare_write();
    i->set_io_wait();

    assert(i->get_version() > 0);
    auto final_crc = i->get_crc32c();
    // Record the delta with prev/final crcs so replay can verify it.
    record.deltas.push_back(
      delta_info_t{
	i->get_type(),
	i->get_paddr(),
	(i->is_logical()
	 ? i->cast<LogicalCachedExtent>()->get_laddr()
	 : L_ADDR_NULL),
	i->last_committed_crc,
	final_crc,
	(segment_off_t)i->get_length(),
	i->get_version() - 1,
	i->get_delta()
      });
    i->last_committed_crc = final_crc;
  }

  // The root, if mutated, is journaled as a delta with placeholder
  // paddr/crc/length fields (it is never written as a block).
  if (t.root) {
    logger().debug(
      "{}: writing out root delta for {}",
      __func__,
      *t.root);
    record.deltas.push_back(
      delta_info_t{
	extent_types_t::ROOT,
	paddr_t{},
	L_ADDR_NULL,
	0,
	0,
	0,
	t.root->get_version() - 1,
	t.root->get_delta()
      });
  }

  // Transaction is now a go, set up in-memory cache state
  // invalidate now invalid blocks
  for (auto &i: t.retired_set) {
    logger().debug("try_construct_record: retiring {}", *i);
    ceph_assert(i->is_valid());
    remove_extent(i);
    i->state = CachedExtent::extent_state_t::INVALID;
  }

  // Serialize fresh blocks into the record payload.
  record.extents.reserve(t.fresh_block_list.size());
  for (auto &i: t.fresh_block_list) {
    logger().debug("try_construct_record: fresh block {}", *i);
    bufferlist bl;
    i->prepare_write();
    bl.append(i->get_bptr());
    if (i->get_type() == extent_types_t::ROOT) {
      assert(0 == "ROOT never gets written as a fresh block");
    }

    assert(bl.length() == i->get_length());
    record.extents.push_back(extent_t{
      i->get_type(),
      i->is_logical()
	? i->cast<LogicalCachedExtent>()->get_laddr()
	: L_ADDR_NULL,
      std::move(bl)
    });
  }

  return std::make_optional<record_t>(std::move(record));
}
270
// Finish committing transaction t after its record has been journaled at
// final_block_start / seq.  Fixes up paddrs of fresh blocks, transitions
// extent states (CLEAN for fresh, DIRTY for mutated), updates the cleaner's
// space accounting, and finally releases the io_wait set in
// try_construct_record.  cleaner may be null (e.g. during tests/replay).
void Cache::complete_commit(
  Transaction &t,
  paddr_t final_block_start,
  journal_seq_t seq,
  SegmentCleaner *cleaner)
{
  if (t.root) {
    // Swap the new root in for the old one and mark it dirty as of seq.
    remove_extent(root);
    root = t.root;
    root->state = CachedExtent::extent_state_t::DIRTY;
    root->on_delta_write(final_block_start);
    root->dirty_from = seq;
    add_extent(root);
    logger().debug("complete_commit: new root {}", *t.root);
  }

  for (auto &i: t.fresh_block_list) {
    // Fresh blocks were written relative to the record; resolve to the
    // final absolute address now that the record's position is known.
    i->set_paddr(final_block_start.add_relative(i->get_paddr()));
    i->last_committed_crc = i->get_crc32c();
    i->on_initial_write();

    if (!i->is_valid()) {
      // Allocated and then retired within the same transaction.
      logger().debug("complete_commit: invalid {}", *i);
      continue;
    }

    i->state = CachedExtent::extent_state_t::CLEAN;
    logger().debug("complete_commit: fresh {}", *i);
    add_extent(i);
    if (cleaner) {
      cleaner->mark_space_used(
	i->get_paddr(),
	i->get_length());
    }
  }

  // Add new copy of mutated blocks, set_io_wait to block until written
  for (auto &i: t.mutated_block_list) {
    logger().debug("complete_commit: mutated {}", *i);
    assert(i->prior_instance);
    i->on_delta_write(final_block_start);
    // Drop the reference to the superseded instance.
    i->prior_instance = CachedExtentRef();
    if (!i->is_valid()) {
      logger().debug("complete_commit: not dirtying invalid {}", *i);
      continue;
    }
    i->state = CachedExtent::extent_state_t::DIRTY;
    if (i->version == 1) {
      // First mutation since clean: this commit defines dirty_from.
      i->dirty_from = seq;
    }
  }

  if (cleaner) {
    for (auto &i: t.retired_set) {
      cleaner->mark_space_free(
	i->get_paddr(),
	i->get_length());
    }
  }

  // Wake anyone blocked in wait_io() on the mutated extents.
  for (auto &i: t.mutated_block_list) {
    i->complete_io();
  }
}
335
336 void Cache::init() {
337 if (root) {
338 // initial creation will do mkfs followed by mount each of which calls init
339 remove_extent(root);
340 root = nullptr;
341 }
342 root = new RootBlock();
343 root->state = CachedExtent::extent_state_t::DIRTY;
344 add_extent(root);
345 }
346
// Prime a new filesystem: fetch the root via transaction t and register a
// mutable copy so the initial commit journals a root delta.
Cache::mkfs_ertr::future<> Cache::mkfs(Transaction &t)
{
  return get_root(t).safe_then([this, &t](auto croot) {
    duplicate_for_write(t, croot);
    return mkfs_ertr::now();
  });
}
354
355 Cache::close_ertr::future<> Cache::close()
356 {
357 root.reset();
358 for (auto i = dirty.begin(); i != dirty.end(); ) {
359 auto ptr = &*i;
360 dirty.erase(i++);
361 intrusive_ptr_release(ptr);
362 }
363 return close_ertr::now();
364 }
365
// Apply a single journaled delta during replay.
// Root deltas apply directly to the cached root.  For other extents:
// pversion == 0 means the extent's initial write must exist on disk, so it
// is read in; pversion > 0 means the delta only matters if an earlier delta
// already pulled the extent into the cache -- otherwise it is obsolete
// (the extent was checkpointed past this delta) and is skipped.
Cache::replay_delta_ret
Cache::replay_delta(
  journal_seq_t journal_seq,
  paddr_t record_base,
  const delta_info_t &delta)
{
  if (delta.type == extent_types_t::ROOT) {
    logger().debug("replay_delta: found root delta");
    root->apply_delta_and_adjust_crc(record_base, delta.bl);
    root->dirty_from = journal_seq;
    return replay_delta_ertr::now();
  } else {
    // Cache-only lookup: returns a null ref when the extent is absent.
    auto get_extent_if_cached = [this](paddr_t addr)
      -> replay_delta_ertr::future<CachedExtentRef> {
      auto retiter = extents.find_offset(addr);
      if (retiter != extents.end()) {
	return replay_delta_ertr::make_ready_future<CachedExtentRef>(&*retiter);
      } else {
	return replay_delta_ertr::make_ready_future<CachedExtentRef>();
      }
    };
    auto extent_fut = delta.pversion == 0 ?
      get_extent_by_type(
	delta.type,
	delta.paddr,
	delta.laddr,
	delta.length) :
      get_extent_if_cached(
	delta.paddr);
    return extent_fut.safe_then([=, &delta](auto extent) {
      if (!extent) {
	assert(delta.pversion > 0);
	logger().debug(
	  "replay_delta: replaying {}, extent not present so delta is obsolete",
	  delta);
	return;
      }

      logger().debug(
	"replay_delta: replaying {} on {}",
	*extent,
	delta);

      // The delta must apply to exactly the version it was generated from.
      assert(extent->version == delta.pversion);

      // Verify crc before and after applying, then advance the version.
      assert(extent->last_committed_crc == delta.prev_crc);
      extent->apply_delta_and_adjust_crc(record_base, delta.bl);
      assert(extent->last_committed_crc == delta.final_crc);

      if (extent->version == 0) {
	// First delta on a clean extent: this journal entry defines dirty_from.
	extent->dirty_from = journal_seq;
      }
      extent->version++;
      mark_dirty(extent);
    });
  }
}
423
424 Cache::get_next_dirty_extents_ret Cache::get_next_dirty_extents(
425 journal_seq_t seq)
426 {
427 std::vector<CachedExtentRef> ret;
428 for (auto i = dirty.begin(); i != dirty.end(); ++i) {
429 CachedExtentRef cand;
430 if (i->dirty_from < seq) {
431 assert(ret.empty() || ret.back()->dirty_from <= i->dirty_from);
432 ret.push_back(&*i);
433 } else {
434 break;
435 }
436 }
437 return seastar::do_with(
438 std::move(ret),
439 [](auto &ret) {
440 return seastar::do_for_each(
441 ret,
442 [](auto &ext) {
443 logger().debug(
444 "get_next_dirty_extents: waiting on {}",
445 *ext);
446 return ext->wait_io();
447 }).then([&ret]() mutable {
448 return seastar::make_ready_future<std::vector<CachedExtentRef>>(
449 std::move(ret));
450 });
451 });
452 }
453
454 Cache::get_root_ret Cache::get_root(Transaction &t)
455 {
456 if (t.root) {
457 return get_root_ret(
458 get_root_ertr::ready_future_marker{},
459 t.root);
460 } else {
461 auto ret = root;
462 return ret->wait_io().then([ret, &t] {
463 t.root = ret;
464 return get_root_ret(
465 get_root_ertr::ready_future_marker{},
466 ret);
467 });
468 }
469 }
470
using StagedOnodeBlock = crimson::os::seastore::onode::SeastoreNodeExtent;

// Read an extent from disk (or cache) given its runtime type tag,
// dispatching to the typed get_extent<T>() template.  For logical extents
// (laddr != L_ADDR_NULL) the laddr is stamped onto the result before return.
// ROOT and NONE are invalid here and abort.
Cache::get_extent_ertr::future<CachedExtentRef> Cache::get_extent_by_type(
  extent_types_t type,
  paddr_t offset,
  laddr_t laddr,
  segment_off_t length)
{
  // Immediately-invoked lambda so every case can return a future of the
  // common CachedExtentRef type; each branch erases the concrete extent
  // type by detaching the typed ref (without an extra refcount bump).
  return [=] {
    switch (type) {
    case extent_types_t::ROOT:
      assert(0 == "ROOT is never directly read");
      return get_extent_ertr::make_ready_future<CachedExtentRef>();
    case extent_types_t::LADDR_INTERNAL:
      return get_extent<lba_manager::btree::LBAInternalNode>(offset, length
      ).safe_then([](auto extent) {
	return CachedExtentRef(extent.detach(), false /* add_ref */);
      });
    case extent_types_t::LADDR_LEAF:
      return get_extent<lba_manager::btree::LBALeafNode>(offset, length
      ).safe_then([](auto extent) {
	return CachedExtentRef(extent.detach(), false /* add_ref */);
      });
    case extent_types_t::EXTMAP_INNER:
      return get_extent<extentmap_manager::ExtMapInnerNode>(offset, length
      ).safe_then([](auto extent) {
	return CachedExtentRef(extent.detach(), false /* add_ref */);
      });
    case extent_types_t::EXTMAP_LEAF:
      return get_extent<extentmap_manager::ExtMapLeafNode>(offset, length
      ).safe_then([](auto extent) {
	return CachedExtentRef(extent.detach(), false /* add_ref */);
      });
    case extent_types_t::ONODE_BLOCK:
      return get_extent<OnodeBlock>(offset, length
      ).safe_then([](auto extent) {
	return CachedExtentRef(extent.detach(), false /* add_ref */);
      });
    case extent_types_t::ONODE_BLOCK_STAGED:
      return get_extent<StagedOnodeBlock>(offset, length
      ).safe_then([](auto extent) {
	return CachedExtentRef(extent.detach(), false /* add_ref */);
      });
    case extent_types_t::TEST_BLOCK:
      return get_extent<TestBlock>(offset, length
      ).safe_then([](auto extent) {
	return CachedExtentRef(extent.detach(), false /* add_ref */);
      });
    case extent_types_t::TEST_BLOCK_PHYSICAL:
      return get_extent<TestBlockPhysical>(offset, length
      ).safe_then([](auto extent) {
	return CachedExtentRef(extent.detach(), false /* add_ref */);
      });
    case extent_types_t::NONE: {
      ceph_assert(0 == "NONE is an invalid extent type");
      return get_extent_ertr::make_ready_future<CachedExtentRef>();
    }
    default:
      ceph_assert(0 == "impossible");
      return get_extent_ertr::make_ready_future<CachedExtentRef>();
    }
  }().safe_then([laddr](CachedExtentRef e) {
    // Logical-ness of the loaded extent must match whether a laddr was given.
    assert(e->is_logical() == (laddr != L_ADDR_NULL));
    if (e->is_logical()) {
      e->cast<LogicalCachedExtent>()->set_laddr(laddr);
    }
    return get_extent_ertr::make_ready_future<CachedExtentRef>(e);
  });
}
540
541 }