1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
6 #include <linux/blkzoned.h>
8 #include <fmt/format.h>
9 #include "crimson/os/seastore/segment_manager/zbd.h"
10 #include "crimson/common/config_proxy.h"
11 #include "crimson/os/seastore/logging.h"
12 #include "crimson/common/errorator-loop.h"
13 #include "include/buffer.h"
15 SET_SUBSYS(seastore_device
);
18 #define RESERVED_ZONES 1
19 // limit the max padding buf size to 4MB (4194304 bytes)
20 #define MAX_PADDING_SIZE 4194304
// Alias for the zone-operation enum so the fmt specialization below stays short.
22 using z_op
= crimson::os::seastore::segment_manager::zbd::zone_op
;
// fmt formatter for zone_op: renders the op as the name of the corresponding
// Linux block-layer ioctl (e.g. "BLKRESETZONE"), falling back to "Unknown".
// NOTE(review): this paste has dropped lines (original numbering jumps
// 26->33->36->39->42) — the switch over the zone_op values and the closing
// braces are missing here; only the name-assignment arms survive.
23 template <> struct fmt::formatter
<z_op
>: fmt::formatter
<std::string_view
> {
24 template <typename FormatContext
>
25 auto format(z_op s
, FormatContext
& ctx
) {
26 std::string_view name
= "Unknown";
33 name
= "BLKFINISHZONE";
36 name
= "BLKCLOSEZONE";
39 name
= "BLKRESETZONE";
42 return formatter
<string_view
>::format(name
, ctx
);
46 namespace crimson::os::seastore::segment_manager::zbd
{
// Result type for open_device: a (file handle, stat data) pair wrapped in
// the segment manager's access errorator.
48 using open_device_ret
= ZBDSegmentManager::access_ertr::future
<
49 std::pair
<seastar::file
, seastar::stat_data
>>;
// Stat then open the block device at `path` with the given open flags.
// On any failure the error is mapped to input_output_error.
// NOTE(review): dropped lines (53, 60-61, 63-64, 66-67) — the opening brace,
// part of the DEBUG call arguments, and the handle_exception/error plumbing
// are missing from this fragment.
50 static open_device_ret
open_device(
51 const std::string
&path
,
52 seastar::open_flags mode
)
54 LOG_PREFIX(ZBDSegmentManager::open_device
);
55 return seastar::file_stat(
56 path
, seastar::follow_symlink::yes
57 ).then([FNAME
, mode
, &path
](auto stat
) mutable {
58 return seastar::open_file_dma(path
, mode
).then([=](auto file
) {
59 DEBUG("open of device {} successful, size {}",
62 return std::make_pair(file
, stat
);
// error path: any exception from stat/open becomes an I/O error
65 [FNAME
](auto e
) -> open_device_ret
{
68 return crimson::ct_error::input_output_error::make();
// Build the on-disk superblock (zbd_sm_metadata_t) for mkfs: derives zone,
// segment and per-shard sizing from the device's zone geometry, skipping the
// reserved zone(s) and any conventional zones at the front of the device.
// NOTE(review): dropped lines (74-75, 79-81, 83, 87, 99-102, 108-122,
// 131-135, 137 onward) — several parameters, the opening brace, most of the
// INFO argument list and the tail of the aggregate initializer are missing
// from this fragment; compare with upstream before editing.
73 static zbd_sm_metadata_t
make_metadata(
76 const seastar::stat_data
&data
,
77 size_t zone_size_sectors
,
78 size_t zone_capacity_sectors
,
82 LOG_PREFIX(ZBDSegmentManager::make_metadata
);
84 // Using only SWR zones in a SMR drive, for now
// zones unusable for segments: the reserved metadata zone plus any
// conventional zones reported by the device
85 auto skipped_zones
= RESERVED_ZONES
+ nr_cnv_zones
;
86 assert(num_zones
> skipped_zones
);
88 // TODO: support Option::size_t seastore_segment_size
89 // to allow zones_per_segment > 1 with striping.
// sector counts shifted by SECT_SHIFT — presumably converting 512B
// sectors to bytes; TODO confirm SECT_SHIFT's definition
90 size_t zone_size
= zone_size_sectors
<< SECT_SHIFT
;
91 assert(total_size
== num_zones
* zone_size
);
92 size_t zone_capacity
= zone_capacity_sectors
<< SECT_SHIFT
;
// one segment per zone for now (see TODO above)
93 size_t segment_size
= zone_size
;
94 size_t zones_per_segment
= segment_size
/ zone_size
;
95 size_t segments
= (num_zones
- skipped_zones
) / zones_per_segment
;
// usable space is divided evenly across seastar reactor shards
96 size_t per_shard_segments
= segments
/ seastar::smp::count
;
97 size_t available_size
= zone_capacity
* segments
;
98 size_t per_shard_available_size
= zone_capacity
* per_shard_segments
;
101 WARN("Ignoring configuration values for device and segment size");
103 "device size: {}, available size: {}, block size: {}, allocated size: {},"
104 " total zones {}, zone size: {}, zone capacity: {},"
105 " total segments: {}, zones per segment: {}, segment size: {}"
106 " conv zones: {}, swr zones: {}, per shard segments: {}"
107 " per shard available size: {}",
117 zone_capacity
* zones_per_segment
,
119 num_zones
- nr_cnv_zones
,
121 per_shard_available_size
);
// per-shard layout: each shard gets a contiguous run of segments starting
// after the skipped (reserved + conventional) zones
123 std::vector
<zbd_shard_info_t
> shard_infos(seastar::smp::count
);
124 for (unsigned int i
= 0; i
< seastar::smp::count
; i
++) {
125 shard_infos
[i
].size
= per_shard_available_size
;
126 shard_infos
[i
].segments
= per_shard_segments
;
127 shard_infos
[i
].first_segment_offset
= zone_size
* skipped_zones
128 + i
* segment_size
* per_shard_segments
;
129 INFO("First segment offset for shard {} is: {}",
130 i
, shard_infos
[i
].first_segment_offset
);
// assemble the superblock; initializer tail is missing from this paste
133 zbd_sm_metadata_t ret
= zbd_sm_metadata_t
{
136 zone_capacity
* zones_per_segment
,
// RAII-ish holder for a variable-length blk_zone_report buffer: one header
// followed by nr_zones blk_zone entries, allocated with malloc because the
// kernel ioctl expects a flat C layout. Copy is deleted; move transfers hdr.
// NOTE(review): the enclosing `struct ZoneReport {` line (orig 147), the
// destructor/free (orig 152-154) and the move-ctor body tail are missing
// from this paste — presumably the dtor frees hdr; TODO confirm upstream.
148 struct blk_zone_report
*hdr
;
149 ZoneReport(int nr_zones
)
150 : hdr((blk_zone_report
*)malloc(
151 sizeof(struct blk_zone_report
) + nr_zones
* sizeof(struct blk_zone
))){;}
155 ZoneReport(const ZoneReport
&) = delete;
156 ZoneReport(ZoneReport
&&rhs
) : hdr(rhs
.hdr
) {
// Query the block device's total size in bytes via ioctl (the ioctl call
// itself — presumably BLKGETSIZE/BLKGETSIZE64 — is on a dropped line) and
// convert the sector count to bytes with SECT_SHIFT.
// NOTE(review): dropped lines (163, 165, 167-169) — opening brace, do_with
// state and the ioctl invocation are missing from this fragment.
161 static seastar::future
<size_t> get_blk_dev_size(
162 seastar::file
&device
)
164 return seastar::do_with(
166 [&](auto& size_sects
) {
170 ).then([&](int ret
) {
171 ceph_assert(size_sects
);
172 size_t size
= size_sects
<< SECT_SHIFT
;
173 return seastar::make_ready_future
<size_t>(size
);
178 // zone_size should be in 512B sectors
// Reset every zone on the device in one BLKRESETZONE spanning all zones:
// the range covers zone_size_sects * nr_zones sectors starting from the
// beginning of the device.
// NOTE(review): dropped lines (182-183, 185, 187, 189-192) — the nr_zones
// parameter line, opening brace, range.sector initialization and the ioctl
// call itself are missing from this fragment.
179 static seastar::future
<> reset_device(
180 seastar::file
&device
,
181 uint64_t zone_size_sects
,
184 return seastar::do_with(
186 [&, nr_zones
, zone_size_sects
](auto &range
) {
188 range
.nr_sectors
= zone_size_sects
* nr_zones
;
193 return seastar::now();
// Report zones and return the usable capacity (in sectors) of the first
// zone; on ZNS/SMR devices zone capacity can be smaller than zone size.
// NOTE(review): dropped lines (201-202, 205-206, 208-210) — the nr_zones
// parameter, the lambda header and the BLKREPORTZONE ioctl invocation are
// missing from this fragment.
199 static seastar::future
<size_t> get_zone_capacity(
200 seastar::file
&device
,
203 return seastar::do_with(
204 ZoneReport(nr_zones
),
// tell the kernel how many zone entries the report buffer can hold
207 zr
.hdr
->nr_zones
= nr_zones
;
211 ).then([&](int ret
) {
212 return seastar::make_ready_future
<size_t>(zr
.hdr
->zones
[0].capacity
);
218 // get the number of conventional zones of SMR HDD,
219 // they are randomly writable and don't respond to zone operations
// Issue a zone report and count the zones whose type is
// BLK_ZONE_TYPE_CONVENTIONAL; returns that count.
// NOTE(review): dropped lines (222-223, 226-227, 229-231, 236-237) — the
// nr_zones parameter, the report ioctl, and the counter increment inside
// the if are missing from this fragment.
220 static seastar::future
<size_t> get_nr_cnv_zones(
221 seastar::file
&device
,
224 return seastar::do_with(
225 ZoneReport(nr_zones
),
// capacity of the report buffer, as in get_zone_capacity above
228 zr
.hdr
->nr_zones
= nr_zones
;
232 ).then([&, nr_zones
](int ret
) {
233 size_t cnv_zones
= 0;
234 for (uint32_t i
= 0; i
< nr_zones
; i
++) {
235 if (zr
.hdr
->zones
[i
].type
== BLK_ZONE_TYPE_CONVENTIONAL
)
238 return seastar::make_ready_future
<size_t>(cnv_zones
);
// DMA-write a single bufferptr to `device` at a given offset; maps both
// exceptions from dma_write and short writes (result != length) to
// input_output_error.
// NOTE(review): dropped lines (247-249, 252-253, 255-258, 261, 263, 267) —
// the offset/bptr parameters, the dma_write arguments and the
// handle_exception plumbing are missing from this fragment.
245 static write_ertr::future
<> do_write(
246 seastar::file
&device
,
250 LOG_PREFIX(ZBDSegmentManager::do_write
);
251 DEBUG("offset {} len {}",
254 return device
.dma_write(
259 [FNAME
](auto e
) -> write_ertr::future
<size_t> {
260 ERROR("dma_write got error {}",
262 return crimson::ct_error::input_output_error::make();
// short-write check: the device must accept the full buffer
264 ).then([length
= bptr
.length()](auto result
) -> write_ertr::future
<> {
265 if (result
!= length
) {
266 return crimson::ct_error::input_output_error::make();
268 return write_ertr::now();
// Vectored write: rebuild the bufferlist to the device block alignment
// (dma writev requires every iovec to be block-aligned), then issue one
// dma_write per prepared iovec chunk in parallel, treating exceptions and
// short writes as input_output_error.
// NOTE(review): dropped lines (275-278, 285, 287-288, 290, 292, 294,
// 296-297, 300, 302, 304, 313, 316 and the tail) — the offset/bl/block_size
// parameters, the iov preparation, and several closing braces are missing
// from this fragment.
272 static write_ertr::future
<> do_writev(
273 device_id_t device_id
,
274 seastar::file
&device
,
279 LOG_PREFIX(ZBDSegmentManager::do_writev
);
280 DEBUG("{} offset {} len {}",
281 device_id_printer_t
{device_id
}, offset
, bl
.length());
282 // writev requires each buffer to be aligned to the disks' block
283 // size, we need to rebuild here
284 bl
.rebuild_aligned(block_size
);
286 return seastar::do_with(
289 [&device
, device_id
, offset
, FNAME
](auto& iovs
, auto& bl
)
// each element of iovs is written independently; the device offset of a
// chunk is the segment offset plus the chunk's offset within the list
291 return write_ertr::parallel_for_each(
293 [&device
, device_id
, offset
, FNAME
](auto& p
)
295 auto off
= offset
+ p
.offset
;
298 DEBUG("{} poffset={}~{} dma_write ...",
299 device_id_printer_t
{device_id
},
301 return device
.dma_write(off
, std::move(iov
)
// exception path: any dma_write failure becomes an I/O error
303 [FNAME
, device_id
, off
, len
](auto e
) -> write_ertr::future
<size_t>
305 ERROR("{} poffset={}~{} dma_write got error -- {}",
306 device_id_printer_t
{device_id
}, off
, len
, e
);
307 return crimson::ct_error::input_output_error::make();
// short-write check per chunk
308 }).then([FNAME
, device_id
, off
, len
](size_t written
) -> write_ertr::future
<> {
309 if (written
!= len
) {
310 ERROR("{} poffset={}~{} dma_write len={} inconsistent",
311 device_id_printer_t
{device_id
}, off
, len
, written
);
312 return crimson::ct_error::input_output_error::make();
314 DEBUG("{} poffset={}~{} dma_write done",
315 device_id_printer_t
{device_id
},
317 return write_ertr::now();
// Serialize the superblock `sb` into a page-aligned, block-sized buffer and
// write it at device offset 0. The encoded superblock must fit in one block
// (asserted below).
// NOTE(review): dropped lines (325, 327, 333-334, 341-343) — the opening
// brace, the bound being compared against in the assert, the bufferlist
// encode step that produces `bl`, and the closing braces are missing from
// this fragment.
323 static ZBDSegmentManager::access_ertr::future
<>
324 write_metadata(seastar::file
&device
, zbd_sm_metadata_t sb
)
326 assert(ceph::encoded_sizeof_bounded
<zbd_sm_metadata_t
>() <
328 return seastar::do_with(
329 bufferptr(ceph::buffer::create_page_aligned(sb
.block_size
)),
330 [=, &device
](auto &bp
) {
331 LOG_PREFIX(ZBDSegmentManager::write_metadata
);
332 DEBUG("block_size {}", sb
.block_size
);
// copy the encoded superblock bytes into the aligned write buffer
335 auto iter
= bl
.begin();
336 assert(bl
.length() < sb
.block_size
);
337 DEBUG("buffer length {}", bl
.length());
338 iter
.copy(bl
.length(), bp
.c_str());
339 DEBUG("doing writeout");
// superblock always lives at device offset 0
340 return do_write(device
, 0, bp
);
// DMA-read `len` bytes from `device` into `bptr`; maps read exceptions and
// short reads to input_output_error. `len` must not exceed the buffer.
// NOTE(review): dropped lines (346-349, 353-354, 356-359, 362, 364, 366,
// 368) — the offset/len/bptr parameters, the dma_read arguments and the
// short-read condition are missing from this fragment.
344 static read_ertr::future
<> do_read(
345 seastar::file
&device
,
350 LOG_PREFIX(ZBDSegmentManager::do_read
);
351 assert(len
<= bptr
.length());
352 DEBUG("offset {} len {}",
355 return device
.dma_read(
360 [FNAME
](auto e
) -> read_ertr::future
<size_t> {
361 ERROR("dma_read got error {}",
363 return crimson::ct_error::input_output_error::make();
// short-read check (the comparison itself is on a dropped line)
365 ).then([len
](auto result
) -> read_ertr::future
<> {
367 return crimson::ct_error::input_output_error::make();
369 return read_ertr::now();
// Read the superblock from device offset 0 into a page-aligned block-sized
// buffer and decode it into a zbd_sm_metadata_t.
// NOTE(review): dropped lines (376, 378, 382-386, 388-389, 392-393,
// 396 onward) — the opening brace, the do_read call, the decode step and
// the value returned in the ready_future are missing from this fragment.
374 ZBDSegmentManager::access_ertr::future
<zbd_sm_metadata_t
>
375 read_metadata(seastar::file
&device
, seastar::stat_data sd
)
377 assert(ceph::encoded_sizeof_bounded
<zbd_sm_metadata_t
>() <
379 return seastar::do_with(
380 bufferptr(ceph::buffer::create_page_aligned(sd
.block_size
)),
381 [=, &device
](auto &bp
) {
387 ).safe_then([=, &bp
] {
// decode the superblock out of the freshly-read buffer
390 zbd_sm_metadata_t ret
;
391 auto bliter
= bl
.cbegin();
394 return ZBDSegmentManager::access_ertr::future
<zbd_sm_metadata_t
>(
395 ZBDSegmentManager::access_ertr::ready_future_marker
{},
// Mount entry point: fan out shard_mount() to every shard via the sharded
// device set; any error is fatal (assert_all).
// NOTE(review): dropped lines (402, 405, 408 onward) — the opening brace,
// shard_mount()'s argument list/closing, and the handle_error tail are
// missing from this fragment.
401 ZBDSegmentManager::mount_ret
ZBDSegmentManager::mount()
403 return shard_devices
.invoke_on_all([](auto &local_device
) {
404 return local_device
.shard_mount(
406 crimson::ct_error::assert_all
{
407 "Invalid error in ZBDSegmentManager::mount"
// Per-shard mount: open the device read-write, read the superblock, keep
// this shard's slice of the shard_infos table and the whole metadata.
// NOTE(review): dropped lines (413-414, 418, 422, 424 onward) — the opening
// brace, open_device(...) call head, the stat capture, the metadata
// assignment and the closing braces are missing from this fragment.
412 ZBDSegmentManager::mount_ret
ZBDSegmentManager::shard_mount()
415 device_path
, seastar::open_flags::rw
416 ).safe_then([=, this](auto p
) {
// keep the opened file handle; stat data feeds read_metadata
417 device
= std::move(p
.first
);
419 return read_metadata(device
, sd
);
420 }).safe_then([=, this](auto meta
){
// this shard only operates on its own region of the device
421 shard_info
= meta
.shard_infos
[seastar::this_shard_id()];
423 return mount_ertr::now();
// mkfs entry point: run primary_mkfs once on the local shard (writes the
// superblock), then fan out shard_mkfs() to all shards; errors are fatal.
// NOTE(review): dropped lines (429, 431, 434, 437 onward) — the opening
// brace, the .safe_then chaining between primary and shard phases, and the
// handle_error tail are missing from this fragment.
427 ZBDSegmentManager::mkfs_ret
ZBDSegmentManager::mkfs(
428 device_config_t config
)
430 return shard_devices
.local().primary_mkfs(config
432 return shard_devices
.invoke_on_all([](auto &local_device
) {
433 return local_device
.shard_mkfs(
435 crimson::ct_error::assert_all
{
436 "Invalid error in ZBDSegmentManager::mkfs"
// Primary mkfs: probe the device geometry (zone size, zone count, device
// size, conventional-zone count, zone capacity), reset all zones, build the
// superblock via make_metadata, write it, and close the device.
// NOTE(review): this fragment has many dropped lines (444, 448, 450-458,
// 460-461, 463-464, 467-471, 473, 476, 481, 484, 494-502, 513 onward) —
// the do_with state declarations, the BLKGETNRZONES ioctl, the sb =
// make_metadata(...) step and several closing braces are missing; compare
// with upstream before editing.
442 ZBDSegmentManager::mkfs_ret
ZBDSegmentManager::primary_mkfs(
443 device_config_t config
)
445 LOG_PREFIX(ZBDSegmentManager::primary_mkfs
);
446 INFO("starting, device_path {}", device_path
);
447 return seastar::do_with(
449 seastar::stat_data
{},
459 auto &zone_size_sects
,
462 auto &nr_cnv_zones
) {
465 seastar::open_flags::rw
466 ).safe_then([=, this, &device
, &stat
, &sb
, &zone_size_sects
, &nr_zones
, &size
, &nr_cnv_zones
](auto p
) {
// the ioctl on the dropped line presumably fetched the zone count;
// a failing return code is surfaced as an io_error exception
472 ).then([&](int ret
) {
474 return seastar::make_exception_future
<int>(
475 std::system_error(std::make_error_code(std::errc::io_error
)));
// query the zone size in 512B sectors
477 return device
.ioctl(BLKGETZONESZ
, (void *)&zone_size_sects
);
478 }).then([&](int ret
) {
479 ceph_assert(zone_size_sects
);
// wipe all zones so every segment starts with a reset write pointer
480 return reset_device(device
, zone_size_sects
, nr_zones
);
482 return get_blk_dev_size(device
);
483 }).then([&](auto devsize
) {
485 return get_nr_cnv_zones(device
, nr_zones
);
486 }).then([&](auto cnv_zones
) {
487 DEBUG("Found {} conventional zones", cnv_zones
);
488 nr_cnv_zones
= cnv_zones
;
489 return get_zone_capacity(device
, nr_zones
);
490 }).then([&, FNAME
, config
](auto zone_capacity_sects
) {
491 ceph_assert(zone_capacity_sects
);
492 DEBUG("zone_size in sectors {}, zone_capacity in sectors {}",
493 zone_size_sects
, zone_capacity_sects
);
// account the superblock write in the metadata-write stats counter
503 stats
.metadata_write
.increment(
504 ceph::encoded_sizeof_bounded
<zbd_sm_metadata_t
>());
505 DEBUG("Wrote to stats.");
506 return write_metadata(device
, sb
);
507 }).finally([&, FNAME
] {
508 DEBUG("Closing device.");
509 return device
.close();
510 }).safe_then([FNAME
] {
511 DEBUG("Returning from mkfs.");
512 return mkfs_ertr::now();
// Per-shard mkfs: open the device, read back the superblock written by
// primary_mkfs, record this shard's shard_info, then close the device
// (unlike shard_mount, the handle is not kept open).
// NOTE(review): dropped lines (519, 522, 526, 530, 535 onward) — the
// opening brace, the open_device(...) call head, the stat/metadata capture
// and the closing braces are missing from this fragment.
518 ZBDSegmentManager::mkfs_ret
ZBDSegmentManager::shard_mkfs()
520 LOG_PREFIX(ZBDSegmentManager::shard_mkfs
);
521 INFO("starting, device_path {}", device_path
);
523 device_path
, seastar::open_flags::rw
524 ).safe_then([=, this](auto p
) {
525 device
= std::move(p
.first
);
527 return read_metadata(device
, sd
);
528 }).safe_then([=, this](auto meta
){
529 shard_info
= meta
.shard_infos
[seastar::this_shard_id()];
// mkfs only validates the superblock; the device is reopened on mount
531 return device
.close();
532 }).safe_then([FNAME
] {
533 DEBUG("Returning from shard_mkfs.");
534 return mkfs_ertr::now();
538 // Return range of sectors to operate on.
// Translate a segment id into a blk_zone_range {start_sector, nr_sectors}:
// start = segment index * segment size (in sectors) + this shard's first
// segment offset (in sectors); length = one segment in sectors.
// NOTE(review): dropped lines (540-541, 543, 548 onward) — the id and
// segment_size parameters, the opening brace and the closing brace are
// missing from this fragment.
539 struct blk_zone_range
make_range(
542 size_t first_segment_offset
)
544 return blk_zone_range
{
545 (id
.device_segment_id() * (segment_size
>> SECT_SHIFT
)
546 + (first_segment_offset
>> SECT_SHIFT
)),
547 (segment_size
>> SECT_SHIFT
)
// Errorator for zone ioctls: the only surfaced error is I/O.
551 using blk_zone_op_ertr
= crimson::errorator
<
552 crimson::ct_error::input_output_error
>;
553 using blk_zone_op_ret
= blk_zone_op_ertr::future
<>;
// Issue a single zone-management ioctl (open/finish/reset/close) over the
// given sector range, mapping the zone_op to the matching BLK*ZONE ioctl
// number and turning both exceptions and nonzero return codes into
// input_output_error.
// NOTE(review): dropped lines (556, 558, 560-562, 564-565, 567-568,
// 570-571, 573-574, 577-581, 583, 586-588, 590, 593 onward) — the op
// parameter, the switch over `op`, the ioctl invocation and the
// success-return-code check are missing from this fragment.
554 blk_zone_op_ret
blk_zone_op(seastar::file
&device
,
555 blk_zone_range
&range
,
557 LOG_PREFIX(ZBDSegmentManager::blk_zone_op
);
559 unsigned long ioctl_op
= 0;
563 ioctl_op
= BLKOPENZONE
;
566 ioctl_op
= BLKFINISHZONE
;
569 ioctl_op
= BLKRESETZONE
;
572 ioctl_op
= BLKCLOSEZONE
;
575 ERROR("Invalid zone operation {}", op
);
576 ceph_assert(ioctl_op
);
// then_wrapped: inspect the future for both exceptions and return codes
582 ).then_wrapped([=](auto f
) -> blk_zone_op_ret
{
584 ERROR("{} ioctl failed", op
);
585 return crimson::ct_error::input_output_error::make();
589 return seastar::now();
591 ERROR("{} ioctl failed with return code {}", op
, ret
);
592 return crimson::ct_error::input_output_error::make();
// Open a segment: build the zone range for `id` and (on dropped lines)
// issue the zone-open operation, then hand back a new ZBDSegment ref.
// NOTE(review): dropped lines (599-600, 603, 605-606, 609-614, 620 onward)
// — the id parameter, the make_range call head, the blk_zone_op invocation
// and closing braces are missing from this fragment.
598 ZBDSegmentManager::open_ertr::future
<SegmentRef
> ZBDSegmentManager::open(
601 LOG_PREFIX(ZBDSegmentManager::open
);
602 return seastar::do_with(
604 [=, this](auto &range
) {
607 metadata
.segment_size
,
608 shard_info
.first_segment_offset
);
615 ).safe_then([=, this] {
616 DEBUG("segment {}, open successful", id
);
617 return open_ertr::future
<SegmentRef
>(
618 open_ertr::ready_future_marker
{},
619 SegmentRef(new ZBDSegment(*this, id
))
// Release a segment by resetting its zone (the blk_zone_op call with the
// reset op is on dropped lines), making the space reusable.
// NOTE(review): dropped lines (625-626, 630, 632-633, 636-642, 645 onward)
// — the id parameter, the make_range head, the zone-reset invocation and
// closing braces are missing from this fragment.
624 ZBDSegmentManager::release_ertr::future
<> ZBDSegmentManager::release(
627 LOG_PREFIX(ZBDSegmentManager::release
);
628 DEBUG("Resetting zone/segment {}", id
);
629 return seastar::do_with(
631 [=, this](auto &range
) {
634 metadata
.segment_size
,
635 shard_info
.first_segment_offset
);
643 DEBUG("segment release successful");
644 return release_ertr::now();
// Validate a read request (segment id in range, offset+len within segment
// capacity) and (on dropped lines) perform the actual do_read into `out`.
// NOTE(review): dropped lines (649-650, 652, 659-660, 663-664, 666-673) —
// the addr/len parameters, the ERROR argument tail and the do_read call are
// missing from this fragment.
648 SegmentManager::read_ertr::future
<> ZBDSegmentManager::read(
651 ceph::bufferptr
&out
)
653 LOG_PREFIX(ZBDSegmentManager::read
);
654 auto& seg_addr
= addr
.as_seg_paddr();
// reject reads addressed past the last segment
655 if (seg_addr
.get_segment_id().device_segment_id() >= get_num_segments()) {
656 ERROR("invalid segment {}",
657 seg_addr
.get_segment_id().device_segment_id());
658 return crimson::ct_error::invarg::make();
// reject reads that run past the usable (capacity) part of the zone
661 if (seg_addr
.get_segment_off() + len
> metadata
.segment_capacity
) {
662 ERROR("invalid read offset {}, len {}",
665 return crimson::ct_error::invarg::make();
// Close a segment by issuing a zone-finish (the blk_zone_op call is on
// dropped lines) over the segment's zone range.
// NOTE(review): dropped lines (676, 679, 681-682, 685-691, 694 onward) —
// the opening brace, the make_range head, the finish-op invocation and
// closing braces are missing from this fragment. The write_pointer
// parameter's use is not visible here.
674 Segment::close_ertr::future
<> ZBDSegmentManager::segment_close(
675 segment_id_t id
, segment_off_t write_pointer
)
677 LOG_PREFIX(ZBDSegmentManager::segment_close
);
678 return seastar::do_with(
680 [=, this](auto &range
) {
683 metadata
.segment_size
,
684 shard_info
.first_segment_offset
);
692 DEBUG("zone finish successful");
693 return Segment::close_ertr::now();
// Write a block-aligned bufferlist to a segment: asserts the address
// belongs to this device and the length is a multiple of the block size,
// bumps the data-write stats, then (on dropped lines) dispatches to
// do_writev with the device block size.
// NOTE(review): dropped lines (698-701, 709-710, 712-716) — the addr/bl
// parameters, the DEBUG argument tail and the do_writev call head are
// missing from this fragment.
697 Segment::write_ertr::future
<> ZBDSegmentManager::segment_write(
702 LOG_PREFIX(ZBDSegmentManager::segment_write
);
703 assert(addr
.get_device_id() == get_device_id());
704 assert((bl
.length() % metadata
.block_size
) == 0);
705 auto& seg_addr
= addr
.as_seg_paddr();
706 DEBUG("write to segment {} at offset {}, physical offset {}, len {}",
707 seg_addr
.get_segment_id(),
708 seg_addr
.get_segment_off(),
711 stats
.data_write
.increment(bl
.length());
717 metadata
.block_size
);
// Device id comes straight from the on-disk superblock.
// NOTE(review): the braces (orig 721, 723) were dropped by this paste.
720 device_id_t
ZBDSegmentManager::get_device_id() const
722 return metadata
.device_id
;
// Secondary device set is stored in the superblock metadata.
// NOTE(review): the braces (orig 726, 728) were dropped by this paste.
725 secondary_device_set_t
& ZBDSegmentManager::get_secondary_devices()
727 return metadata
.secondary_devices
;
// Magic value comes from the superblock metadata.
// NOTE(review): the braces (orig 731, 733) were dropped by this paste.
730 magic_t
ZBDSegmentManager::get_magic() const
732 return metadata
.magic
;
// A segment's write capacity is the manager's segment size.
// NOTE(review): the braces (orig 736, 738) were dropped by this paste.
735 segment_off_t
ZBDSegment::get_write_capacity() const
737 return manager
.get_segment_size();
// Close the manager's device handle if open, otherwise no-op.
// NOTE(review): dropped lines (741-742, 744, 746) — the condition guarding
// device.close() (presumably a check that the file handle is open) and the
// braces are missing from this fragment.
740 SegmentManager::close_ertr::future
<> ZBDSegmentManager::close()
743 return device
.close();
745 return seastar::now();
// Delegate segment close (zone finish) to the manager, passing the current
// write pointer.
// NOTE(review): the braces (orig 749, 751) were dropped by this paste.
748 Segment::close_ertr::future
<> ZBDSegment::close()
750 return manager
.segment_close(id
, write_pointer
);
// Append-only segment write: the offset must equal the zone write pointer
// and be block-aligned (invarg otherwise), must not exceed the segment
// capacity (enospc otherwise); on success the write pointer is advanced
// before delegating the actual I/O to the manager.
// NOTE(review): dropped lines (755, 762, 765-766, 769 onward) — the opening
// brace and the closing braces of the error branches are missing from this
// fragment.
753 Segment::write_ertr::future
<> ZBDSegment::write(
754 segment_off_t offset
, ceph::bufferlist bl
)
756 LOG_PREFIX(ZBDSegment::write
);
757 if (offset
!= write_pointer
|| offset
% manager
.metadata
.block_size
!= 0) {
758 ERROR("Segment offset and zone write pointer mismatch. "
759 "segment {} segment-offset {} write pointer {}",
760 id
, offset
, write_pointer
);
761 return crimson::ct_error::invarg::make();
763 if (offset
+ bl
.length() > manager
.metadata
.segment_capacity
) {
764 return crimson::ct_error::enospc::make();
// advance the cached write pointer before issuing the write
767 write_pointer
= offset
+ bl
.length();
768 return manager
.segment_write(paddr_t::make_seg_paddr(id
, offset
), bl
);
// Write `padding_bytes` of padding at the current write pointer, in chunks
// of at most MAX_PADDING_SIZE, looping via crimson::repeat until the
// remaining count reaches zero.
// NOTE(review): dropped lines (773, 777, 779, 782, 784-785, 788-790, 794,
// 796 onward) — the opening brace, the bufsize declaration, the else
// keyword, the zero-fill of the buffer and the construction of `padd_bl`
// from `bp` are missing from this fragment.
771 Segment::write_ertr::future
<> ZBDSegment::write_padding_bytes(
772 size_t padding_bytes
)
774 LOG_PREFIX(ZBDSegment::write_padding_bytes
);
775 DEBUG("Writing {} padding bytes to segment {} at wp {}",
776 padding_bytes
, id
, write_pointer
);
// mutable lambda: padding_bytes is decremented across iterations
778 return crimson::repeat([FNAME
, padding_bytes
, this] () mutable {
// cap each chunk at MAX_PADDING_SIZE; the final chunk is the remainder
780 if (padding_bytes
>= MAX_PADDING_SIZE
) {
781 bufsize
= MAX_PADDING_SIZE
;
783 bufsize
= padding_bytes
;
786 padding_bytes
-= bufsize
;
787 bufferptr
bp(ceph::buffer::create_page_aligned(bufsize
));
791 return write(write_pointer
, padd_bl
).safe_then([FNAME
, padding_bytes
, this]() {
// stop iterating once all padding has been written
792 if (padding_bytes
== 0) {
793 return write_ertr::make_ready_future
<seastar::stop_iteration
>(seastar::stop_iteration::yes
);
795 return write_ertr::make_ready_future
<seastar::stop_iteration
>(seastar::stop_iteration::no
);
801 // Advance write pointer, to given offset.
802 Segment::write_ertr::future
<> ZBDSegment::advance_wp(
803 segment_off_t offset
)
805 LOG_PREFIX(ZBDSegment::advance_wp
);
807 DEBUG("Advancing write pointer from {} to {}", write_pointer
, offset
);
808 if (offset
< write_pointer
) {
809 return crimson::ct_error::invarg::make();
812 size_t padding_bytes
= offset
- write_pointer
;
814 if (padding_bytes
== 0) {
815 return write_ertr::now();
818 assert(padding_bytes
% manager
.metadata
.block_size
== 0);
820 return write_padding_bytes(padding_bytes
);