1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
6 #include "crimson/common/log.h"
8 #include <boost/intrusive_ptr.hpp>
10 #include <seastar/core/future.hh>
12 #include "include/ceph_assert.h"
13 #include "include/buffer.h"
14 #include "include/denc.h"
16 #include "crimson/osd/exceptions.h"
17 #include "crimson/os/seastore/journal.h"
18 #include "include/uuid.h"
19 #include "crimson/os/seastore/random_block_manager.h"
20 #include "crimson/os/seastore/random_block_manager/rbm_device.h"
22 #include "crimson/os/seastore/journal/record_submitter.h"
23 #include "crimson/os/seastore/journal/circular_journal_space.h"
24 #include "crimson/os/seastore/record_scanner.h"
26 namespace crimson::os::seastore::journal
{
// Alias for random_block_device::RBMDevice so the type can be named
// unqualified within this namespace (e.g. in the constructor signature).
28 using RBMDevice
= random_block_device::RBMDevice
;
31 * CircularBoundedJournal
34 * CircularBoundedJournal (CBJournal) is a journal that works like a circular
35 * queue. With CBJournal, Seastore will append a record to the journal if the
36 * size of the record is small (most likely metadata), at which point the head
37 * (written_to) will be moved. Then, eventually, Seastore applies the records
38 * in CBJournal to RBM (TODO).
41 * After submit_record is done, written_to (an in-memory value) is increased;
42 * written_to represents where the new record will be appended. Note that
43 * applied_to is not changed here.
46 * At replay time, CBJournal begins to replay records in CBJournal by reading
47 * records from dirty_tail. Then, CBJournal examines whether each record is
48 * valid, one by one, at which point written_to is recovered
49 * if a valid record is found. Note that applied_to is stored
50 * permanently when the apply work---applying the records in CBJournal to RBM---
51 * is done by CBJournal (TODO).
53 * TODO: apply records from CircularBoundedJournal to RandomBlockManager
// Default block size constant (4096).
// NOTE(review): presumably expressed in bytes; no use of this constant is
// visible in this part of the file — confirm units and users.
57 constexpr uint64_t DEFAULT_BLOCK_SIZE
= 4096;
59 class CircularBoundedJournal
: public Journal
, RecordScanner
{
// Constructor (declaration only; the definition is not in this file section).
// Takes the JournalTrimmer by reference, the RBMDevice as a raw pointer, and
// the path as a const std::string reference.
// NOTE(review): the class has a `JournalTrimmer &trimmer` member, so the
// trimmer argument presumably has to outlive this journal — confirm against
// the definition. Ownership of `device` is likewise not visible here.
61 CircularBoundedJournal(
62 JournalTrimmer
&trimmer
, RBMDevice
* device
, const std::string
&path
);
63 ~CircularBoundedJournal() {}
65 JournalTrimmer
&get_trimmer() final
{
69 open_for_mkfs_ret
open_for_mkfs() final
;
71 open_for_mount_ret
open_for_mount() final
;
73 close_ertr::future
<> close() final
;
75 journal_type_t
get_type() final
{
76 return journal_type_t::RANDOM_BLOCK
;
79 submit_record_ret
submit_record(
81 OrderingHandle
&handle
84 seastar::future
<> flush(
85 OrderingHandle
&handle
88 return seastar::now();
91 replay_ret
replay(delta_handler_t
&&delta_handler
) final
;
93 rbm_abs_addr
get_rbm_addr(journal_seq_t seq
) const {
94 return convert_paddr_to_abs_addr(seq
.offset
);
99 * CircularBoundedJournal write
101 * NVMe will support large block writes (< 512KB) with the atomic write unit command.
102 * With this command, we expect that most of the incoming data can be stored
103 * with a single write call, which has lower overhead than the existing
104 * approach that uses a combination of system calls such as write() and sync().
108 seastar::future
<> update_journal_tail(
110 journal_seq_t alloc
) {
111 return cjs
.update_journal_tail(dirty
, alloc
);
113 journal_seq_t
get_dirty_tail() const {
114 return cjs
.get_dirty_tail();
116 journal_seq_t
get_alloc_tail() const {
117 return cjs
.get_alloc_tail();
120 void set_write_pipeline(WritePipeline
*_write_pipeline
) final
{
121 write_pipeline
= _write_pipeline
;
124 device_id_t
get_device_id() const {
125 return cjs
.get_device_id();
127 extent_len_t
get_block_size() const {
128 return cjs
.get_block_size();
131 rbm_abs_addr
get_journal_end() const {
132 return cjs
.get_journal_end();
135 void set_written_to(journal_seq_t seq
) {
136 cjs
.set_written_to(seq
);
139 journal_seq_t
get_written_to() {
140 return cjs
.get_written_to();
143 rbm_abs_addr
get_records_start() const {
144 return cjs
.get_records_start();
147 seastar::future
<> finish_commit(transaction_type_t type
) final
;
149 using cbj_delta_handler_t
= std::function
<
150 replay_ertr::future
<bool>(
151 const record_locator_t
&,
153 sea_time_point modify_time
)>;
155 Journal::replay_ret
scan_valid_record_delta(
156 cbj_delta_handler_t
&&delta_handler
,
// Declaration only; the definition is not in this file section.
// Takes the record by rvalue reference and the OrderingHandle by reference,
// and returns submit_record_ret, the same return type as submit_record().
// NOTE(review): presumably the worker that submit_record() delegates to —
// confirm against the implementation.
159 submit_record_ret
do_submit_record(record_t
&&record
, OrderingHandle
&handle
);
161 void try_read_rolled_header(scan_valid_records_cursor
&cursor
) {
162 paddr_t addr
= convert_abs_addr_to_paddr(
165 cursor
.seq
.offset
= addr
;
166 cursor
.seq
.segment_seq
+= 1;
169 void initialize_cursor(scan_valid_records_cursor
& cursor
) final
{
170 cursor
.block_size
= get_block_size();
// Declaration only; the definition is not in this file section.
// Takes the delta handler by lvalue reference (scan_valid_record_delta takes
// its handler by rvalue reference) and the scan cursor by non-const
// reference; returns Journal::replay_ret.
// NOTE(review): the cursor is presumably advanced as records are replayed —
// confirm against the implementation.
173 Journal::replay_ret
replay_segment(
174 cbj_delta_handler_t
&handler
, scan_valid_records_cursor
& cursor
);
176 read_ret
read(paddr_t start
, size_t len
) final
;
178 bool is_record_segment_seq_invalid(scan_valid_records_cursor
&cursor
,
179 record_group_header_t
&h
) final
;
181 int64_t get_segment_end_offset(paddr_t addr
) final
{
182 return get_journal_end();
187 CircularJournalSpace
& get_cjs() {
191 read_validate_record_metadata_ret
test_read_validate_record_metadata(
192 scan_valid_records_cursor
&cursor
,
193 segment_nonce_t nonce
)
195 return read_validate_record_metadata(cursor
, nonce
);
198 void test_initialize_cursor(scan_valid_records_cursor
&cursor
)
200 initialize_cursor(cursor
);
204 JournalTrimmer
&trimmer
;
206 WritePipeline
*write_pipeline
= nullptr;
210 * true after open_device_read_header, set to false in close().
211 * Indicates that device is open and in-memory header is valid.
213 bool initialized
= false;
215 // start address where the newest record will be written;
216 // it should be in the range [get_records_start(), get_journal_end()).
217 // written_to.segment_seq is the circulation sequence number used to track
218 // the order of written records
219 CircularJournalSpace cjs
;
220 RecordSubmitter record_submitter
;