1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
7 #include <fmt/format.h>
9 #include "include/buffer.h"
11 #include "crimson/common/config_proxy.h"
12 #include "crimson/common/errorator-loop.h"
14 #include "crimson/os/seastore/logging.h"
15 #include "crimson/os/seastore/segment_manager/block.h"
17 SET_SUBSYS(seastore_device
);
20 * - D<device-id> S<segment-id> offset=<off>~<len> poffset=<off> information
21 * - D<device-id> poffset=<off>~<len> information
24 * - INFO: major initiation, closing and segment operations
25 * - DEBUG: INFO details, major read and write operations
26 * - TRACE: DEBUG details
29 using segment_state_t
= crimson::os::seastore::Segment::segment_state_t
;
// fmt formatter specialization for segment_state_t. The state is printed as a
// name through the inherited string_view formatter; EMPTY, OPEN and CLOSED
// each have a dedicated case.
31 template <> struct fmt::formatter
<segment_state_t
>: fmt::formatter
<std::string_view
> {
32 // parse is inherited from formatter<string_view>.
33 template <typename FormatContext
>
34 auto format(segment_state_t s
, FormatContext
& ctx
) {
// Initial value of the printed name.
35 std::string_view name
= "unknown";
37 case segment_state_t::EMPTY
:
40 case segment_state_t::OPEN
:
43 case segment_state_t::CLOSED
:
// Hand the chosen name to the string_view base formatter.
47 return formatter
<string_view
>::format(name
, ctx
);
51 namespace crimson::os::seastore::segment_manager::block
{
// Writes bptr to `device` at `offset` with seastar's dma_write.
//
// An exception from dma_write is logged and converted into
// input_output_error. A write whose resulting length is reported as
// inconsistent is logged and converted into input_output_error as well.
// Otherwise the returned future resolves via write_ertr::now().
// device_id is only used to tag the log lines.
53 static write_ertr::future
<> do_write(
54 device_id_t device_id
,
55 seastar::file
&device
,
59 LOG_PREFIX(block_do_write
);
// Captured up front so the continuations below can log and compare it.
60 auto len
= bptr
.length();
61 TRACE("D{} poffset={}~{} ...", device_id
, offset
, len
);
62 return device
.dma_write(
// Exception handler: translate the seastar exception into an errorated
// input_output_error.
67 [FNAME
, device_id
, offset
, len
](auto e
) -> write_ertr::future
<size_t> {
68 ERROR("D{} poffset={}~{} got error -- {}",
69 device_id
, offset
, len
, e
);
70 return crimson::ct_error::input_output_error::make();
71 }).then([FNAME
, device_id
, offset
, len
](auto result
) -> write_ertr::future
<> {
73 ERROR("D{} poffset={}~{} write len={} inconsistent",
74 device_id
, offset
, len
, result
);
75 return crimson::ct_error::input_output_error::make();
77 TRACE("D{} poffset={}~{} done", device_id
, offset
, len
);
78 return write_ertr::now();
// Vectored write of bufferlist `bl` to `device`, starting at `offset`.
//
// The bufferlist is first rebuilt so its buffers are aligned to block_size.
// The entries of `iovs` are then written with dma_write under
// write_ertr::parallel_for_each, each at `offset + p.offset`. For every entry
// a dma_write exception, or a written length different from the expected
// `len`, is logged and converted into input_output_error.
82 static write_ertr::future
<> do_writev(
83 device_id_t device_id
,
84 seastar::file
&device
,
89 LOG_PREFIX(block_do_writev
);
90 TRACE("D{} poffset={}~{}, {} buffers",
91 device_id
, offset
, bl
.length(), bl
.get_num_buffers());
93 // writev requires each buffer to be aligned to the disk's block
94 // size, so we need to rebuild the bufferlist here
95 bl
.rebuild_aligned(block_size
);
// do_with keeps iovs and bl alive until the writes have completed.
97 return seastar::do_with(
100 [&device
, device_id
, offset
, FNAME
](auto& iovs
, auto& bl
)
102 return write_ertr::parallel_for_each(
104 [&device
, device_id
, offset
, FNAME
](auto& p
) mutable
// Device offset of this entry: base offset plus the entry's own offset.
106 auto off
= offset
+ p
.offset
;
109 TRACE("D{} poffset={}~{} dma_write ...",
110 device_id
, off
, len
);
111 return device
.dma_write(off
, std::move(iov
)
113 [FNAME
, device_id
, off
, len
](auto e
) -> write_ertr::future
<size_t>
115 ERROR("D{} poffset={}~{} dma_write got error -- {}",
116 device_id
, off
, len
, e
);
117 return crimson::ct_error::input_output_error::make();
118 }).then([FNAME
, device_id
, off
, len
](size_t written
) -> write_ertr::future
<> {
// A short (or otherwise mismatching) write is treated as an I/O error.
119 if (written
!= len
) {
120 ERROR("D{} poffset={}~{} dma_write len={} inconsistent",
121 device_id
, off
, len
, written
);
122 return crimson::ct_error::input_output_error::make();
124 TRACE("D{} poffset={}~{} dma_write done",
125 device_id
, off
, len
);
126 return write_ertr::now();
// Reads `len` bytes from `device` at `offset` with seastar's dma_read.
//
// Asserts that `len` does not exceed bptr.length(). An exception from
// dma_read is logged and converted into input_output_error. A read whose
// resulting length is reported as inconsistent is logged and converted into
// input_output_error as well. Otherwise the future resolves via
// read_ertr::now(). device_id is only used to tag the log lines.
132 static read_ertr::future
<> do_read(
133 device_id_t device_id
,
134 seastar::file
&device
,
139 LOG_PREFIX(block_do_read
);
140 TRACE("D{} poffset={}~{} ...", device_id
, offset
, len
);
141 assert(len
<= bptr
.length());
142 return device
.dma_read(
147 //FIXME: this is a little bit tricky, since seastar::future<T>::handle_exception
148 // returns seastar::future<T>, to return a crimson::ct_error, we have to create
149 // a seastar::future<T> holding that crimson::ct_error. This is not necessary
150 // once seastar::future<T>::handle_exception() returns seastar::futurize_t<T>
151 [FNAME
, device_id
, offset
, len
](auto e
) -> read_ertr::future
<size_t>
153 ERROR("D{} poffset={}~{} got error -- {}",
154 device_id
, offset
, len
, e
);
155 return crimson::ct_error::input_output_error::make();
156 }).then([FNAME
, device_id
, offset
, len
](auto result
) -> read_ertr::future
<> {
158 ERROR("D{} poffset={}~{} read len={} inconsistent",
159 device_id
, offset
, len
, result
);
160 return crimson::ct_error::input_output_error::make();
162 TRACE("D{} poffset={}~{} done", device_id
, offset
, len
);
163 return read_ertr::now();
// Writes bptr to `device` at `offset`: logs the range (bptr.length() bytes)
// at DEBUG level and delegates the actual I/O to do_write.
168 SegmentStateTracker::write_out(
169 device_id_t device_id
,
170 seastar::file
&device
,
173 LOG_PREFIX(SegmentStateTracker::write_out
);
174 DEBUG("D{} poffset={}~{}", device_id
, offset
, bptr
.length());
175 return do_write(device_id
, device
, offset
, bptr
);
// Logs the device id, offset and bptr.length() at DEBUG level.
// NOTE(review): presumably the read counterpart of write_out, delegating to
// do_read to fill bptr -- confirm.
179 SegmentStateTracker::read_in(
180 device_id_t device_id
,
181 seastar::file
&device
,
184 LOG_PREFIX(SegmentStateTracker::read_in
);
185 DEBUG("D{} poffset={}~{}", device_id
, offset
, bptr
.length());
// Computes the on-disk layout for a device and returns it as a
// block_sm_superblock_t.
//
// Layout derivation:
//  - size: data.size, or the "seastore_device_size" option when data.size is 0
//  - raw_segments: size / "seastore_segment_size"
//  - tracker_size: obtained from SegmentStateTracker::get_raw_size()
//  - tracker_off: data.block_size, i.e. the tracker follows the first block
//  - first_seg_off: tracker_off + tracker_size
//  - segments: (size - first_seg_off) / "seastore_segment_size"
//
// sm_config.secondary_devices is moved into the returned superblock.
195 block_sm_superblock_t
make_superblock(
196 device_id_t device_id
,
197 segment_manager_config_t sm_config
,
198 const seastar::stat_data
&data
)
200 LOG_PREFIX(block_make_superblock
);
201 using crimson::common::get_conf
;
203 auto config_size
= get_conf
<Option::size_t>(
204 "seastore_device_size");
// Fall back to the configured size when stat reports a size of 0.
206 size_t size
= (data
.size
== 0) ? config_size
: data
.size
;
208 auto config_segment_size
= get_conf
<Option::size_t>(
209 "seastore_segment_size");
210 size_t raw_segments
= size
/ config_segment_size
;
211 size_t tracker_size
= SegmentStateTracker::get_raw_size(
214 size_t tracker_off
= data
.block_size
;
215 size_t first_seg_off
= tracker_size
+ tracker_off
;
// Only the space after the tracker is available for segments.
216 size_t segments
= (size
- first_seg_off
) / config_segment_size
;
218 INFO("D{} disk_size={}, segment_size={}, segments={}, block_size={}, "
219 "tracker_off={}, first_seg_off={}",
228 return block_sm_superblock_t
{
240 std::move(sm_config
.secondary_devices
)
// Error set and future type used by check_create_device().
244 using check_create_device_ertr
= BlockSegmentManager::access_ertr
;
245 using check_create_device_ret
= check_create_device_ertr::future
<>;
// Creates the backing file at `path` and allocates `size` bytes for it.
//
// The file is opened with exclusive | rw | create, and space is reserved with
// f.allocate(0, size). Outcome handling:
//  - std::system_error with code EEXIST: logged, then treated as success
//  - any other failure: logged and converted into input_output_error
//  - success: the result is discarded and the future resolves normally
246 static check_create_device_ret
check_create_device(
247 const std::string
&path
,
250 LOG_PREFIX(block_check_create_device
);
251 INFO("path={}, size={}", path
, size
);
252 return seastar::open_file_dma(
254 seastar::open_flags::exclusive
|
255 seastar::open_flags::rw
|
256 seastar::open_flags::create
257 ).then([size
, FNAME
, &path
](auto file
) {
258 return seastar::do_with(
260 [size
, FNAME
, &path
](auto &f
) -> seastar::future
<>
262 DEBUG("path={} created, truncating to {}", path
, size
);
267 return f
.allocate(0, size
);
// then_wrapped receives the future itself, so failures can be inspected here.
272 }).then_wrapped([&path
, FNAME
](auto f
) -> check_create_device_ret
{
276 return seastar::now();
277 } catch (const std::system_error
&e
) {
// An already existing file is not a failure for the caller.
278 if (e
.code().value() == EEXIST
) {
279 ERROR("path={} exists", path
);
280 return seastar::now();
282 ERROR("path={} creation error -- {}", path
, e
);
283 return crimson::ct_error::input_output_error::make();
286 ERROR("path={} creation error", path
);
287 return crimson::ct_error::input_output_error::make();
291 DEBUG("path={} complete", path
);
292 std::ignore
= f
.discard_result();
293 return seastar::now();
// Future type returned by open_device(): a (file, stat_data) pair under
// BlockSegmentManager::access_ertr.
297 using open_device_ret
=
298 BlockSegmentManager::access_ertr::future
<
299 std::pair
<seastar::file
, seastar::stat_data
>
// Stats `path` (following symlinks) and opens it for DMA with rw | dsync.
//
// On success the opened file and the stat data are returned together as a
// pair. Any exception is logged and converted into input_output_error.
302 open_device_ret
open_device(
303 const std::string
&path
)
305 LOG_PREFIX(block_open_device
);
306 return seastar::file_stat(path
, seastar::follow_symlink::yes
307 ).then([&path
, FNAME
](auto stat
) mutable {
308 return seastar::open_file_dma(
310 seastar::open_flags::rw
| seastar::open_flags::dsync
311 ).then([=, &path
](auto file
) {
312 INFO("path={} successful, size={}", path
, stat
.size
);
313 return std::make_pair(file
, stat
);
315 }).handle_exception([FNAME
, &path
](auto e
) -> open_device_ret
{
316 ERROR("path={} got error -- {}", path
, e
);
317 return crimson::ct_error::input_output_error::make();
// Writes superblock `sb` to the start of `device`.
//
// A page-aligned buffer of sb.block_size bytes is allocated, the contents of
// `bl` are copied into it, and the buffer is written at device offset 0 with
// do_write. Asserts that bl.length() is smaller than sb.block_size.
// NOTE(review): `bl` presumably holds the encoded form of `sb` -- confirm.
323 BlockSegmentManager::access_ertr::future
<>
325 device_id_t device_id
,
326 seastar::file
&device
,
327 block_sm_superblock_t sb
)
329 LOG_PREFIX(block_write_superblock
);
330 DEBUG("D{} write {}", device_id
, sb
);
332 assert(ceph::encoded_sizeof
<block_sm_superblock_t
>(sb
) <
// do_with keeps the buffer alive until the write has completed.
334 return seastar::do_with(
335 bufferptr(ceph::buffer::create_page_aligned(sb
.block_size
)),
336 [=, &device
](auto &bp
)
340 auto iter
= bl
.begin();
341 assert(bl
.length() < sb
.block_size
);
342 iter
.copy(bl
.length(), bp
.c_str());
// The superblock lives at offset 0 of the device.
343 return do_write(device_id
, device
, 0, bp
);
// Reads the superblock from `device` and returns it as block_sm_superblock_t.
//
// A page-aligned buffer of sd.block_size bytes is used as the read target.
// DEVICE_ID_NULL stands in for the device id because it is not known yet.
// A decode error is logged and then aborts through
// ceph_assert(0 == "invalid superblock").
348 BlockSegmentManager::access_ertr::future
<block_sm_superblock_t
>
349 read_superblock(seastar::file
&device
, seastar::stat_data sd
)
351 LOG_PREFIX(block_read_superblock
);
352 DEBUG("reading superblock ...");
353 return seastar::do_with(
354 bufferptr(ceph::buffer::create_page_aligned(sd
.block_size
)),
355 [=, &device
](auto &bp
)
358 DEVICE_ID_NULL
, // unknown
363 ).safe_then([=, &bp
] {
366 block_sm_superblock_t ret
;
367 auto bliter
= bl
.cbegin();
// An undecodable superblock is fatal.
371 ERROR("got decode error!");
372 ceph_assert(0 == "invalid superblock");
374 assert(ceph::encoded_sizeof
<block_sm_superblock_t
>(ret
) <
376 return BlockSegmentManager::access_ertr::future
<block_sm_superblock_t
>(
377 BlockSegmentManager::access_ertr::ready_future_marker
{},
383 BlockSegment::BlockSegment(
384 BlockSegmentManager
&manager
, segment_id_t id
)
385 : manager(manager
), id(id
) {}
387 segment_off_t
BlockSegment::get_write_capacity() const
389 return manager
.get_segment_size();
392 Segment::close_ertr::future
<> BlockSegment::close()
394 return manager
.segment_close(id
, write_pointer
);
// Validates and issues a write of `bl` at `offset` within this segment.
//
// Fails with invarg when `offset` lies before the current write_pointer, or
// when `offset` or bl.length() is not a multiple of the superblock's
// block_size. Fails with enospc when offset + bl.length() would exceed the
// superblock's segment_size. Otherwise write_pointer advances to
// offset + bl.length() and the write goes to manager.segment_write().
397 Segment::write_ertr::future
<> BlockSegment::write(
398 segment_off_t offset
, ceph::bufferlist bl
)
400 LOG_PREFIX(BlockSegment::write
);
401 auto paddr
= paddr_t::make_seg_paddr(id
, offset
);
402 DEBUG("D{} S{} offset={}~{} poffset={} ...",
404 id
.device_segment_id(),
407 manager
.get_offset(paddr
));
// Writes may not go backwards and must be block-aligned in offset and length.
409 if (offset
< write_pointer
||
410 offset
% manager
.superblock
.block_size
!= 0 ||
411 bl
.length() % manager
.superblock
.block_size
!= 0) {
412 ERROR("D{} S{} offset={}~{} poffset={} invalid write",
414 id
.device_segment_id(),
417 manager
.get_offset(paddr
));
418 return crimson::ct_error::invarg::make();
// The write must fit inside the segment.
421 if (offset
+ bl
.length() > manager
.superblock
.segment_size
) {
422 ERROR("D{} S{} offset={}~{} poffset={} write out of the range {}",
424 id
.device_segment_id(),
427 manager
.get_offset(paddr
),
428 manager
.superblock
.segment_size
);
429 return crimson::ct_error::enospc::make();
// The write pointer is advanced before the write is issued.
432 write_pointer
= offset
+ bl
.length();
433 return manager
.segment_write(paddr
, bl
);
// Closes segment `id` whose last written position is `write_pointer`.
//
// The segment is marked CLOSED in the tracker and the tracker is persisted
// with write_out() at superblock.tracker_offset. Stats updated:
// closed_segments, closed_segments_unused_bytes (segment size minus
// write_pointer) and metadata_write (by the tracker size).
// Asserts that unused_bytes is non-negative and that `id` belongs to this
// device.
436 Segment::close_ertr::future
<> BlockSegmentManager::segment_close(
437 segment_id_t id
, segment_off_t write_pointer
)
439 LOG_PREFIX(BlockSegmentManager::segment_close
);
440 auto s_id
= id
.device_segment_id();
// Space left unwritten at the end of the segment.
441 int unused_bytes
= get_segment_size() - write_pointer
;
442 INFO("D{} S{} unused_bytes={} ...",
443 get_device_id(), s_id
, unused_bytes
);
445 assert(unused_bytes
>= 0);
446 assert(id
.device_id() == get_device_id());
449 tracker
->set(s_id
, segment_state_t::CLOSED
);
450 ++stats
.closed_segments
;
451 stats
.closed_segments_unused_bytes
+= unused_bytes
;
452 stats
.metadata_write
.increment(tracker
->get_size());
453 return tracker
->write_out(
454 get_device_id(), device
, superblock
.tracker_offset
);
// Write path for segment data. Asserts that `addr` belongs to this device and
// that bl.length() is a multiple of superblock.block_size, then accounts
// bl.length() in stats.data_write.
457 Segment::write_ertr::future
<> BlockSegmentManager::segment_write(
462 assert(addr
.get_device_id() == get_device_id());
463 assert((bl
.length() % superblock
.block_size
) == 0);
464 stats
.data_write
.increment(bl
.length());
470 superblock
.block_size
);
473 BlockSegmentManager::~BlockSegmentManager()
// Mounts the device.
//
// Steps visible in the continuation chain:
//  1. store p.first as `device` and read the superblock from it
//  2. adopt sb.device_id as this manager's device id
//  3. create the SegmentStateTracker and read it in from
//     superblock.tracker_offset
//  4. turn every segment still marked OPEN into CLOSED
//  5. write the tracker back out
// stats.data_read and stats.metadata_write are updated along the way.
477 BlockSegmentManager::mount_ret
BlockSegmentManager::mount()
479 LOG_PREFIX(BlockSegmentManager::mount
);
482 ).safe_then([=](auto p
) {
483 device
= std::move(p
.first
);
485 return read_superblock(device
, sd
);
486 }).safe_then([=](auto sb
) {
487 set_device_id(sb
.device_id
);
488 INFO("D{} read {}", get_device_id(), sb
);
491 stats
.data_read
.increment(
492 ceph::encoded_sizeof
<block_sm_superblock_t
>(superblock
));
493 tracker
= std::make_unique
<SegmentStateTracker
>(
495 superblock
.block_size
);
496 stats
.data_read
.increment(tracker
->get_size());
497 return tracker
->read_in(
500 superblock
.tracker_offset
// No segment stays OPEN across a mount: OPEN entries become CLOSED.
502 for (device_segment_id_t i
= 0; i
< tracker
->get_capacity(); ++i
) {
503 if (tracker
->get(i
) == segment_state_t::OPEN
) {
504 tracker
->set(i
, segment_state_t::CLOSED
);
507 stats
.metadata_write
.increment(tracker
->get_size());
508 return tracker
->write_out(
509 get_device_id(), device
, superblock
.tracker_offset
);
511 }).safe_then([this, FNAME
] {
512 INFO("D{} complete", get_device_id());
// Formats the device described by `sm_config`.
//
// Steps:
//  1. adopt sm_config.device_id as this manager's device id
//  2. if "seastore_block_create" is set, create the file at device_path with
//     "seastore_device_size" bytes via check_create_device()
//  3. open the device and build the superblock with make_superblock()
//  4. write the superblock
//  5. write a new SegmentStateTracker at sb.tracker_offset
//  6. close the device
// stats.metadata_write is incremented for the superblock and the tracker.
// The file, stat data, superblock and tracker used here are locals held by
// do_with, not the manager's own members.
517 BlockSegmentManager::mkfs_ret
BlockSegmentManager::mkfs(
518 segment_manager_config_t sm_config
)
520 LOG_PREFIX(BlockSegmentManager::mkfs
);
521 set_device_id(sm_config
.device_id
);
522 INFO("D{} path={}, {}", get_device_id(), device_path
, sm_config
);
523 return seastar::do_with(
525 seastar::stat_data
{},
526 block_sm_superblock_t
{},
527 std::unique_ptr
<SegmentStateTracker
>(),
528 [=](auto &device
, auto &stat
, auto &sb
, auto &tracker
)
// Starts as an already-resolved future; replaced only if creation is enabled.
530 check_create_device_ret maybe_create
= check_create_device_ertr::now();
531 using crimson::common::get_conf
;
532 if (get_conf
<bool>("seastore_block_create")) {
533 auto size
= get_conf
<Option::size_t>("seastore_device_size");
534 maybe_create
= check_create_device(device_path
, size
);
537 return maybe_create
.safe_then([this] {
538 return open_device(device_path
);
539 }).safe_then([&, sm_config
](auto p
) {
542 sb
= make_superblock(get_device_id(), sm_config
, stat
);
543 stats
.metadata_write
.increment(
544 ceph::encoded_sizeof
<block_sm_superblock_t
>(sb
));
545 return write_superblock(get_device_id(), device
, sb
);
546 }).safe_then([&, FNAME
, this] {
547 DEBUG("D{} superblock written", get_device_id());
548 tracker
.reset(new SegmentStateTracker(sb
.segments
, sb
.block_size
));
549 stats
.metadata_write
.increment(tracker
->get_size());
550 return tracker
->write_out(
551 get_device_id(), device
, sb
.tracker_offset
);
553 return device
.close();
554 }).safe_then([FNAME
, this] {
555 INFO("D{} complete", get_device_id());
556 return mkfs_ertr::now();
// Logs the device id at INFO level and closes the underlying device file.
561 BlockSegmentManager::close_ertr::future
<> BlockSegmentManager::close()
563 LOG_PREFIX(BlockSegmentManager::close
);
564 INFO("D{}", get_device_id());
566 return device
.close();
// Opens segment `id` for writing and returns a SegmentRef for it.
//
// Fails with invarg when the device-local segment id is not below
// get_num_segments(), or when the tracker state of the segment is not EMPTY.
// Otherwise the segment is marked OPEN, the tracker is persisted at
// superblock.tracker_offset, and a new BlockSegment is returned once that
// write succeeds. Asserts that `id` belongs to this device.
569 SegmentManager::open_ertr::future
<SegmentRef
> BlockSegmentManager::open(
572 LOG_PREFIX(BlockSegmentManager::open
);
573 auto s_id
= id
.device_segment_id();
574 INFO("D{} S{} ...", get_device_id(), s_id
);
576 assert(id
.device_id() == get_device_id());
578 if (s_id
>= get_num_segments()) {
579 ERROR("D{} S{} segment-id out of range {}",
580 get_device_id(), s_id
, get_num_segments());
581 return crimson::ct_error::invarg::make();
// Only an EMPTY segment may be opened.
584 if (tracker
->get(s_id
) != segment_state_t::EMPTY
) {
585 ERROR("D{} S{} invalid state {} != EMPTY",
586 get_device_id(), s_id
, tracker
->get(s_id
));
587 return crimson::ct_error::invarg::make();
590 tracker
->set(s_id
, segment_state_t::OPEN
);
591 stats
.metadata_write
.increment(tracker
->get_size());
592 return tracker
->write_out(
593 get_device_id(), device
, superblock
.tracker_offset
// opened_segments is counted only after the tracker write has succeeded.
594 ).safe_then([this, id
, FNAME
] {
595 ++stats
.opened_segments
;
596 DEBUG("D{} S{} done", get_device_id(), id
.device_segment_id());
597 return open_ertr::future
<SegmentRef
>(
598 open_ertr::ready_future_marker
{},
599 SegmentRef(new BlockSegment(*this, id
)));
// Releases segment `id`, returning it to the EMPTY state.
//
// Fails with invarg when the device-local segment id is not below
// get_num_segments(), or when the tracker state of the segment is not
// CLOSED. Otherwise the segment is marked EMPTY, stats.released_segments and
// stats.metadata_write are updated, and the tracker is persisted at
// superblock.tracker_offset. Asserts that `id` belongs to this device.
603 SegmentManager::release_ertr::future
<> BlockSegmentManager::release(
606 LOG_PREFIX(BlockSegmentManager::release
);
607 auto s_id
= id
.device_segment_id();
608 INFO("D{} S{} ...", get_device_id(), s_id
);
610 assert(id
.device_id() == get_device_id());
612 if (s_id
>= get_num_segments()) {
613 ERROR("D{} S{} segment-id out of range {}",
614 get_device_id(), s_id
, get_num_segments());
615 return crimson::ct_error::invarg::make();
// Only a CLOSED segment may be released.
618 if (tracker
->get(s_id
) != segment_state_t::CLOSED
) {
619 ERROR("D{} S{} invalid state {} != CLOSED",
620 get_device_id(), s_id
, tracker
->get(s_id
));
621 return crimson::ct_error::invarg::make();
624 tracker
->set(s_id
, segment_state_t::EMPTY
);
625 ++stats
.released_segments
;
626 stats
.metadata_write
.increment(tracker
->get_size());
627 return tracker
->write_out(
628 get_device_id(), device
, superblock
.tracker_offset
);
// Validates a read of `len` bytes at segment address `addr` into `out`.
//
// Checks, in order:
//  - segment offset and len are multiples of superblock.block_size (invarg)
//  - the device-local segment id is below get_num_segments() (invarg)
//  - segment offset + len does not exceed superblock.segment_size (invarg)
//  - the segment is not EMPTY in the tracker (enoent, logged at DEBUG only)
// After the checks pass, `len` is accounted in stats.data_read.
// Asserts that `addr` belongs to this device.
631 SegmentManager::read_ertr::future
<> BlockSegmentManager::read(
634 ceph::bufferptr
&out
)
636 LOG_PREFIX(BlockSegmentManager::read
);
637 auto& seg_addr
= addr
.as_seg_paddr();
638 auto s_id
= seg_addr
.get_segment_id().device_segment_id();
639 auto s_off
= seg_addr
.get_segment_off();
640 auto p_off
= get_offset(addr
);
641 DEBUG("D{} S{} offset={}~{} poffset={} ...",
642 get_device_id(), s_id
, s_off
, len
, p_off
);
644 assert(addr
.get_device_id() == get_device_id());
646 if (s_off
% superblock
.block_size
!= 0 ||
647 len
% superblock
.block_size
!= 0) {
648 ERROR("D{} S{} offset={}~{} poffset={} invalid read",
649 get_device_id(), s_id
, s_off
, len
, p_off
);
650 return crimson::ct_error::invarg::make();
653 if (s_id
>= get_num_segments()) {
654 ERROR("D{} S{} offset={}~{} poffset={} segment-id out of range {}",
655 get_device_id(), s_id
, s_off
, len
, p_off
,
657 return crimson::ct_error::invarg::make();
660 if (s_off
+ len
> superblock
.segment_size
) {
661 ERROR("D{} S{} offset={}~{} poffset={} read out of range {}",
662 get_device_id(), s_id
, s_off
, len
, p_off
,
663 superblock
.segment_size
);
664 return crimson::ct_error::invarg::make();
667 if (tracker
->get(s_id
) == segment_state_t::EMPTY
) {
668 // XXX: not an error during scanning,
669 // might need refactor to increase the log level
670 DEBUG("D{} S{} offset={}~{} poffset={} invalid state {}",
671 get_device_id(), s_id
, s_off
, len
, p_off
,
673 return crimson::ct_error::enoent::make();
676 stats
.data_read
.increment(len
);
// Sets up the seastar metrics for this device.
//
// A "device_id" label instance is built from get_device_id(). The counters
// held in `stats` are described as metrics: data read/write counts and
// bytes, metadata write counts and bytes, opened/closed/released segment
// totals, and the unused bytes of closed segments.
685 void BlockSegmentManager::register_metrics()
687 LOG_PREFIX(BlockSegmentManager::register_metrics
);
688 DEBUG("D{}", get_device_id());
689 namespace sm
= seastar::metrics
;
690 sm::label
label("device_id");
691 std::vector
<sm::label_instance
> label_instances
;
692 label_instances
.push_back(label(get_device_id()));
700 sm::description("total number of data read"),
705 stats
.data_read
.bytes
,
706 sm::description("total bytes of data read"),
711 stats
.data_write
.num
,
712 sm::description("total number of data write"),
717 stats
.data_write
.bytes
,
718 sm::description("total bytes of data write"),
722 "metadata_write_num",
723 stats
.metadata_write
.num
,
724 sm::description("total number of metadata write"),
728 "metadata_write_bytes",
729 stats
.metadata_write
.bytes
,
730 sm::description("total bytes of metadata write"),
735 stats
.opened_segments
,
736 sm::description("total segments opened"),
741 stats
.closed_segments
,
742 sm::description("total segments closed"),
746 "closed_segments_unused_bytes",
747 stats
.closed_segments_unused_bytes
,
748 sm::description("total unused bytes of closed segments"),
753 stats
.released_segments
,
754 sm::description("total segments released"),