// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab

#include <sys/mman.h>
#include <string.h>
#include <linux/blkzoned.h>

#include <fmt/format.h>
#include "crimson/os/seastore/segment_manager/zbd.h"
#include "crimson/common/config_proxy.h"
#include "crimson/os/seastore/logging.h"
#include "crimson/common/errorator-loop.h"
#include "include/buffer.h"

SET_SUBSYS(seastore_device);

#define SECT_SHIFT 9
#define RESERVED_ZONES 1
// limit the max padding buf size to 4MB
#define MAX_PADDING_SIZE 4194304

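// fmt formatter specialization so zone_op values print as the name of the
// corresponding blkzoned ioctl in log messages.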
using z_op = crimson::os::seastore::segment_manager::zbd::zone_op;
template <> struct fmt::formatter<z_op>: fmt::formatter<std::string_view> {
  template <typename FormatContext>
  auto format(z_op s, FormatContext& ctx) {
    std::string_view name = "Unknown";
    switch (s) {
      using enum z_op;
      case OPEN:
        name = "BLKOPENZONE";
        break;
      case FINISH:
        name = "BLKFINISHZONE";
        break;
      case CLOSE:
        name = "BLKCLOSEZONE";
        break;
      case RESET:
        name = "BLKRESETZONE";
        break;
    }
    return formatter<string_view>::format(name, ctx);
  }
};

namespace crimson::os::seastore::segment_manager::zbd {

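// Stat and then open the device at `path` for DMA access, returning the open
// seastar::file together with its stat_data; any exception is mapped to
// input_output_error.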
using open_device_ret = ZBDSegmentManager::access_ertr::future<
  std::pair<seastar::file, seastar::stat_data>>;
static open_device_ret open_device(
  const std::string &path,
  seastar::open_flags mode)
{
  LOG_PREFIX(ZBDSegmentManager::open_device);
  return seastar::file_stat(
    path, seastar::follow_symlink::yes
  ).then([FNAME, mode, &path](auto stat) mutable {
    return seastar::open_file_dma(path, mode).then([=](auto file) {
      DEBUG("open of device {} successful, size {}",
        path,
        stat.size);
      return std::make_pair(file, stat);
    });
  }).handle_exception(
    [FNAME](auto e) -> open_device_ret {
      ERROR("got error {}",
        e);
      return crimson::ct_error::input_output_error::make();
    }
  );
}

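// Build the on-disk superblock.  Each segment is currently one SWR zone
// (zones_per_segment == 1); the reserved zone and any conventional zones at
// the front of the device are skipped, and the remaining zones are divided
// evenly across seastar shards.  Illustrative example (hypothetical numbers):
// with 4 shards and 104 usable SWR zones of 256 MiB each, every shard gets 26
// zone-sized segments laid out contiguously after the skipped zones.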
static zbd_sm_metadata_t make_metadata(
  uint64_t total_size,
  seastore_meta_t meta,
  const seastar::stat_data &data,
  size_t zone_size_sectors,
  size_t zone_capacity_sectors,
  size_t nr_cnv_zones,
  size_t num_zones)
{
  LOG_PREFIX(ZBDSegmentManager::make_metadata);

  // Using only SWR zones in an SMR drive, for now
  auto skipped_zones = RESERVED_ZONES + nr_cnv_zones;
  assert(num_zones > skipped_zones);

  // TODO: support Option::size_t seastore_segment_size
  // to allow zones_per_segment > 1 with striping.
  size_t zone_size = zone_size_sectors << SECT_SHIFT;
  assert(total_size == num_zones * zone_size);
  size_t zone_capacity = zone_capacity_sectors << SECT_SHIFT;
  size_t segment_size = zone_size;
  size_t zones_per_segment = segment_size / zone_size;
  size_t segments = (num_zones - skipped_zones) / zones_per_segment;
  size_t per_shard_segments = segments / seastar::smp::count;
  size_t available_size = zone_capacity * segments;
  size_t per_shard_available_size = zone_capacity * per_shard_segments;

  WARN("Ignoring configuration values for device and segment size");
  INFO(
    "device size: {}, available size: {}, block size: {}, allocated size: {},"
    " total zones: {}, zone size: {}, zone capacity: {},"
    " total segments: {}, zones per segment: {}, segment size: {},"
    " conv zones: {}, swr zones: {}, per shard segments: {},"
    " per shard available size: {}",
    total_size,
    available_size,
    data.block_size,
    data.allocated_size,
    num_zones,
    zone_size,
    zone_capacity,
    segments,
    zones_per_segment,
    zone_capacity * zones_per_segment,
    nr_cnv_zones,
    num_zones - nr_cnv_zones,
    per_shard_segments,
    per_shard_available_size);

  std::vector<zbd_shard_info_t> shard_infos(seastar::smp::count);
  for (unsigned int i = 0; i < seastar::smp::count; i++) {
    shard_infos[i].size = per_shard_available_size;
    shard_infos[i].segments = per_shard_segments;
    shard_infos[i].first_segment_offset = zone_size * skipped_zones
      + i * segment_size * per_shard_segments;
    INFO("First segment offset for shard {} is: {}",
      i, shard_infos[i].first_segment_offset);
  }

  zbd_sm_metadata_t ret = zbd_sm_metadata_t{
    seastar::smp::count,
    segment_size,
    zone_capacity * zones_per_segment,
    zones_per_segment,
    zone_capacity,
    data.block_size,
    zone_size,
    shard_infos,
    meta};
  ret.validate();
  return ret;
}

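// RAII wrapper for the variable-length blk_zone_report buffer used by the
// BLKREPORTZONE ioctl; copying is disallowed and the allocation is freed on
// destruction.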
struct ZoneReport {
  struct blk_zone_report *hdr;
  ZoneReport(int nr_zones)
    : hdr((blk_zone_report *)malloc(
        sizeof(struct blk_zone_report) + nr_zones * sizeof(struct blk_zone))) {}
  ~ZoneReport() {
    free(hdr);
  }
  ZoneReport(const ZoneReport &) = delete;
  ZoneReport(ZoneReport &&rhs) : hdr(rhs.hdr) {
    rhs.hdr = nullptr;
  }
};

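// Query the device size via BLKGETSIZE, which reports it in 512-byte
// sectors, and convert to bytes.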
static seastar::future<size_t> get_blk_dev_size(
  seastar::file &device)
{
  return seastar::do_with(
    (uint64_t)0,
    [&](auto& size_sects) {
      return device.ioctl(
        BLKGETSIZE,
        (void *)&size_sects
      ).then([&](int ret) {
        ceph_assert(size_sects);
        size_t size = size_sects << SECT_SHIFT;
        return seastar::make_ready_future<size_t>(size);
      });
    });
}

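// Reset every zone on the device with a single BLKRESETZONE ioctl spanning
// the whole sector range, rewinding all write pointers.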
// zone_size should be in 512B sectors
static seastar::future<> reset_device(
  seastar::file &device,
  uint64_t zone_size_sects,
  uint64_t nr_zones)
{
  return seastar::do_with(
    blk_zone_range{},
    [&, nr_zones, zone_size_sects](auto &range) {
      range.sector = 0;
      range.nr_sectors = zone_size_sects * nr_zones;
      return device.ioctl(
        BLKRESETZONE,
        &range
      ).then([&](int ret){
        return seastar::now();
      });
    }
  );
}

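// Report all zones and return the capacity (usable size) of the first one.
// Zone capacity can be smaller than the zone size on some zoned devices, so
// it is tracked separately from zone_size in the metadata.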
static seastar::future<size_t> get_zone_capacity(
  seastar::file &device,
  uint32_t nr_zones)
{
  return seastar::do_with(
    ZoneReport(nr_zones),
    [&](auto &zr) {
      zr.hdr->sector = 0;
      zr.hdr->nr_zones = nr_zones;
      return device.ioctl(
        BLKREPORTZONE,
        zr.hdr
      ).then([&](int ret) {
        return seastar::make_ready_future<size_t>(zr.hdr->zones[0].capacity);
      });
    }
  );
}

// Get the number of conventional zones on an SMR HDD; they are randomly
// writable and don't respond to zone operations.
static seastar::future<size_t> get_nr_cnv_zones(
  seastar::file &device,
  uint32_t nr_zones)
{
  return seastar::do_with(
    ZoneReport(nr_zones),
    [&](auto &zr) {
      zr.hdr->sector = 0;
      zr.hdr->nr_zones = nr_zones;
      return device.ioctl(
        BLKREPORTZONE,
        zr.hdr
      ).then([&, nr_zones](int ret) {
        size_t cnv_zones = 0;
        for (uint32_t i = 0; i < nr_zones; i++) {
          if (zr.hdr->zones[i].type == BLK_ZONE_TYPE_CONVENTIONAL)
            cnv_zones++;
        }
        return seastar::make_ready_future<size_t>(cnv_zones);
      });
    }
  );
}

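// Single contiguous DMA write of `bptr` at `offset`; short writes and
// exceptions both surface as input_output_error.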
static write_ertr::future<> do_write(
  seastar::file &device,
  uint64_t offset,
  bufferptr &bptr)
{
  LOG_PREFIX(ZBDSegmentManager::do_write);
  DEBUG("offset {} len {}",
    offset,
    bptr.length());
  return device.dma_write(
    offset,
    bptr.c_str(),
    bptr.length()
  ).handle_exception(
    [FNAME](auto e) -> write_ertr::future<size_t> {
      ERROR("dma_write got error {}",
        e);
      return crimson::ct_error::input_output_error::make();
    }
  ).then([length = bptr.length()](auto result) -> write_ertr::future<> {
    if (result != length) {
      return crimson::ct_error::input_output_error::make();
    }
    return write_ertr::now();
  });
}

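// Vectored DMA write: the bufferlist is re-aligned to the block size, split
// into iovecs, and the pieces are written in parallel; each piece must
// complete fully or the whole write fails with input_output_error.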
static write_ertr::future<> do_writev(
  device_id_t device_id,
  seastar::file &device,
  uint64_t offset,
  bufferlist&& bl,
  size_t block_size)
{
  LOG_PREFIX(ZBDSegmentManager::do_writev);
  DEBUG("{} offset {} len {}",
    device_id_printer_t{device_id}, offset, bl.length());
  // writev requires each buffer to be aligned to the disk's block
  // size; we need to rebuild here
  bl.rebuild_aligned(block_size);

  return seastar::do_with(
    bl.prepare_iovs(),
    std::move(bl),
    [&device, device_id, offset, FNAME](auto& iovs, auto& bl)
  {
    return write_ertr::parallel_for_each(
      iovs,
      [&device, device_id, offset, FNAME](auto& p)
    {
      auto off = offset + p.offset;
      auto len = p.length;
      auto& iov = p.iov;
      DEBUG("{} poffset={}~{} dma_write ...",
            device_id_printer_t{device_id},
            off, len);
      return device.dma_write(off, std::move(iov)
      ).handle_exception(
        [FNAME, device_id, off, len](auto e) -> write_ertr::future<size_t>
      {
        ERROR("{} poffset={}~{} dma_write got error -- {}",
              device_id_printer_t{device_id}, off, len, e);
        return crimson::ct_error::input_output_error::make();
      }).then([FNAME, device_id, off, len](size_t written) -> write_ertr::future<> {
        if (written != len) {
          ERROR("{} poffset={}~{} dma_write len={} inconsistent",
                device_id_printer_t{device_id}, off, len, written);
          return crimson::ct_error::input_output_error::make();
        }
        DEBUG("{} poffset={}~{} dma_write done",
              device_id_printer_t{device_id},
              off, len);
        return write_ertr::now();
      });
    });
  });
}

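// The superblock is encoded into a single page-aligned block and written at
// offset 0 of the device, within the zones that make_metadata skips for
// data; read_metadata below is the inverse.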
static ZBDSegmentManager::access_ertr::future<>
write_metadata(seastar::file &device, zbd_sm_metadata_t sb)
{
  assert(ceph::encoded_sizeof_bounded<zbd_sm_metadata_t>() <
    sb.block_size);
  return seastar::do_with(
    bufferptr(ceph::buffer::create_page_aligned(sb.block_size)),
    [=, &device](auto &bp) {
      LOG_PREFIX(ZBDSegmentManager::write_metadata);
      DEBUG("block_size {}", sb.block_size);
      bufferlist bl;
      encode(sb, bl);
      auto iter = bl.begin();
      assert(bl.length() < sb.block_size);
      DEBUG("buffer length {}", bl.length());
      iter.copy(bl.length(), bp.c_str());
      DEBUG("doing writeout");
      return do_write(device, 0, bp);
    });
}

static read_ertr::future<> do_read(
  seastar::file &device,
  uint64_t offset,
  size_t len,
  bufferptr &bptr)
{
  LOG_PREFIX(ZBDSegmentManager::do_read);
  assert(len <= bptr.length());
  DEBUG("offset {} len {}",
    offset,
    len);
  return device.dma_read(
    offset,
    bptr.c_str(),
    len
  ).handle_exception(
    [FNAME](auto e) -> read_ertr::future<size_t> {
      ERROR("dma_read got error {}",
        e);
      return crimson::ct_error::input_output_error::make();
    }
  ).then([len](auto result) -> read_ertr::future<> {
    if (result != len) {
      return crimson::ct_error::input_output_error::make();
    }
    return read_ertr::now();
  });
}

static
ZBDSegmentManager::access_ertr::future<zbd_sm_metadata_t>
read_metadata(seastar::file &device, seastar::stat_data sd)
{
  assert(ceph::encoded_sizeof_bounded<zbd_sm_metadata_t>() <
    sd.block_size);
  return seastar::do_with(
    bufferptr(ceph::buffer::create_page_aligned(sd.block_size)),
    [=, &device](auto &bp) {
      return do_read(
        device,
        0,
        bp.length(),
        bp
      ).safe_then([=, &bp] {
        bufferlist bl;
        bl.push_back(bp);
        zbd_sm_metadata_t ret;
        auto bliter = bl.cbegin();
        decode(ret, bliter);
        ret.validate();
        return ZBDSegmentManager::access_ertr::future<zbd_sm_metadata_t>(
          ZBDSegmentManager::access_ertr::ready_future_marker{},
          ret);
      });
    });
}

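// mount() fans out to every shard; each shard opens the device and loads its
// own zbd_shard_info_t slice of the superblock in shard_mount().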
ZBDSegmentManager::mount_ret ZBDSegmentManager::mount()
{
  return shard_devices.invoke_on_all([](auto &local_device) {
    return local_device.shard_mount(
    ).handle_error(
      crimson::ct_error::assert_all{
        "Invalid error in ZBDSegmentManager::mount"
    });
  });
}

ZBDSegmentManager::mount_ret ZBDSegmentManager::shard_mount()
{
  return open_device(
    device_path, seastar::open_flags::rw
  ).safe_then([=, this](auto p) {
    device = std::move(p.first);
    auto sd = p.second;
    return read_metadata(device, sd);
  }).safe_then([=, this](auto meta){
    shard_info = meta.shard_infos[seastar::this_shard_id()];
    metadata = meta;
    return mount_ertr::now();
  });
}

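// mkfs runs in two phases: primary_mkfs() on the local shard probes the
// zoned device (zone count, size, capacity, conventional zones), resets all
// zones and writes the superblock; shard_mkfs() then runs on every shard to
// re-read that superblock and cache its per-shard info.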
ZBDSegmentManager::mkfs_ret ZBDSegmentManager::mkfs(
  device_config_t config)
{
  return shard_devices.local().primary_mkfs(config
  ).safe_then([this] {
    return shard_devices.invoke_on_all([](auto &local_device) {
      return local_device.shard_mkfs(
      ).handle_error(
        crimson::ct_error::assert_all{
          "Invalid error in ZBDSegmentManager::mkfs"
      });
    });
  });
}

ZBDSegmentManager::mkfs_ret ZBDSegmentManager::primary_mkfs(
  device_config_t config)
{
  LOG_PREFIX(ZBDSegmentManager::primary_mkfs);
  INFO("starting, device_path {}", device_path);
  return seastar::do_with(
    seastar::file{},
    seastar::stat_data{},
    zbd_sm_metadata_t{},
    size_t(),
    size_t(),
    size_t(),
    size_t(),
    [=, this]
    (auto &device,
     auto &stat,
     auto &sb,
     auto &zone_size_sects,
     auto &nr_zones,
     auto &size,
     auto &nr_cnv_zones) {
      return open_device(
        device_path,
        seastar::open_flags::rw
      ).safe_then([=, this, &device, &stat, &sb, &zone_size_sects, &nr_zones, &size, &nr_cnv_zones](auto p) {
        device = p.first;
        stat = p.second;
        return device.ioctl(
          BLKGETNRZONES,
          (void *)&nr_zones
        ).then([&](int ret) {
          if (nr_zones == 0) {
            return seastar::make_exception_future<int>(
              std::system_error(std::make_error_code(std::errc::io_error)));
          }
          return device.ioctl(BLKGETZONESZ, (void *)&zone_size_sects);
        }).then([&](int ret) {
          ceph_assert(zone_size_sects);
          return reset_device(device, zone_size_sects, nr_zones);
        }).then([&] {
          return get_blk_dev_size(device);
        }).then([&](auto devsize) {
          size = devsize;
          return get_nr_cnv_zones(device, nr_zones);
        }).then([&](auto cnv_zones) {
          DEBUG("Found {} conventional zones", cnv_zones);
          nr_cnv_zones = cnv_zones;
          return get_zone_capacity(device, nr_zones);
        }).then([&, FNAME, config](auto zone_capacity_sects) {
          ceph_assert(zone_capacity_sects);
          DEBUG("zone_size in sectors {}, zone_capacity in sectors {}",
                zone_size_sects, zone_capacity_sects);
          sb = make_metadata(
            size,
            config.meta,
            stat,
            zone_size_sects,
            zone_capacity_sects,
            nr_cnv_zones,
            nr_zones);
          metadata = sb;
          stats.metadata_write.increment(
            ceph::encoded_sizeof_bounded<zbd_sm_metadata_t>());
          DEBUG("Wrote to stats.");
          return write_metadata(device, sb);
        }).finally([&, FNAME] {
          DEBUG("Closing device.");
          return device.close();
        }).safe_then([FNAME] {
          DEBUG("Returning from mkfs.");
          return mkfs_ertr::now();
        });
      });
    });
}

ZBDSegmentManager::mkfs_ret ZBDSegmentManager::shard_mkfs()
{
  LOG_PREFIX(ZBDSegmentManager::shard_mkfs);
  INFO("starting, device_path {}", device_path);
  return open_device(
    device_path, seastar::open_flags::rw
  ).safe_then([=, this](auto p) {
    device = std::move(p.first);
    auto sd = p.second;
    return read_metadata(device, sd);
  }).safe_then([=, this](auto meta){
    shard_info = meta.shard_infos[seastar::this_shard_id()];
    metadata = meta;
    return device.close();
  }).safe_then([FNAME] {
    DEBUG("Returning from shard_mkfs.");
    return mkfs_ertr::now();
  });
}

// Return the range of sectors (512B units) covering the zone that backs
// segment `id`.
struct blk_zone_range make_range(
  segment_id_t id,
  size_t segment_size,
  size_t first_segment_offset)
{
  return blk_zone_range{
    (id.device_segment_id() * (segment_size >> SECT_SHIFT)
     + (first_segment_offset >> SECT_SHIFT)),
    (segment_size >> SECT_SHIFT)
  };
}

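// Issue a zone management ioctl (BLKOPENZONE/BLKFINISHZONE/BLKRESETZONE/
// BLKCLOSEZONE) for the given sector range, folding failures into
// input_output_error.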
using blk_zone_op_ertr = crimson::errorator<
  crimson::ct_error::input_output_error>;
using blk_zone_op_ret = blk_zone_op_ertr::future<>;
blk_zone_op_ret blk_zone_op(seastar::file &device,
                            blk_zone_range &range,
                            zone_op op) {
  LOG_PREFIX(ZBDSegmentManager::blk_zone_op);

  unsigned long ioctl_op = 0;
  switch (op) {
    using enum zone_op;
    case OPEN:
      ioctl_op = BLKOPENZONE;
      break;
    case FINISH:
      ioctl_op = BLKFINISHZONE;
      break;
    case RESET:
      ioctl_op = BLKRESETZONE;
      break;
    case CLOSE:
      ioctl_op = BLKCLOSEZONE;
      break;
    default:
      ERROR("Invalid zone operation {}", op);
      ceph_assert(ioctl_op);
  }

  return device.ioctl(
    ioctl_op,
    &range
  ).then_wrapped([=](auto f) -> blk_zone_op_ret {
    if (f.failed()) {
      ERROR("{} ioctl failed", op);
      return crimson::ct_error::input_output_error::make();
    } else {
      int ret = f.get();
      if (ret == 0) {
        return seastar::now();
      } else {
        ERROR("{} ioctl failed with return code {}", op, ret);
        return crimson::ct_error::input_output_error::make();
      }
    }
  });
}

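// Opening a segment explicitly opens the backing zone before handing out a
// Segment handle for appends.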
ZBDSegmentManager::open_ertr::future<SegmentRef> ZBDSegmentManager::open(
  segment_id_t id)
{
  LOG_PREFIX(ZBDSegmentManager::open);
  return seastar::do_with(
    blk_zone_range{},
    [=, this](auto &range) {
      range = make_range(
        id,
        metadata.segment_size,
        shard_info.first_segment_offset);
      return blk_zone_op(
        device,
        range,
        zone_op::OPEN
      );
    }
  ).safe_then([=, this] {
    DEBUG("segment {}, open successful", id);
    return open_ertr::future<SegmentRef>(
      open_ertr::ready_future_marker{},
      SegmentRef(new ZBDSegment(*this, id))
    );
  });
}

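// Releasing a segment resets its zone, which discards the zone's data and
// rewinds the write pointer so the segment can be reused.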
ZBDSegmentManager::release_ertr::future<> ZBDSegmentManager::release(
  segment_id_t id)
{
  LOG_PREFIX(ZBDSegmentManager::release);
  DEBUG("Resetting zone/segment {}", id);
  return seastar::do_with(
    blk_zone_range{},
    [=, this](auto &range) {
      range = make_range(
        id,
        metadata.segment_size,
        shard_info.first_segment_offset);
      return blk_zone_op(
        device,
        range,
        zone_op::RESET
      );
    }
  ).safe_then([=] {
    DEBUG("segment release successful");
    return release_ertr::now();
  });
}

SegmentManager::read_ertr::future<> ZBDSegmentManager::read(
  paddr_t addr,
  size_t len,
  ceph::bufferptr &out)
{
  LOG_PREFIX(ZBDSegmentManager::read);
  auto& seg_addr = addr.as_seg_paddr();
  if (seg_addr.get_segment_id().device_segment_id() >= get_num_segments()) {
    ERROR("invalid segment {}",
      seg_addr.get_segment_id().device_segment_id());
    return crimson::ct_error::invarg::make();
  }

  if (seg_addr.get_segment_off() + len > metadata.segment_capacity) {
    ERROR("invalid read offset {}, len {}",
      addr,
      len);
    return crimson::ct_error::invarg::make();
  }
  return do_read(
    device,
    get_offset(addr),
    len,
    out);
}

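// Closing a segment issues a zone FINISH, which moves the write pointer to
// the end of the zone; the zone cannot be written again until it is reset.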
Segment::close_ertr::future<> ZBDSegmentManager::segment_close(
  segment_id_t id, segment_off_t write_pointer)
{
  LOG_PREFIX(ZBDSegmentManager::segment_close);
  return seastar::do_with(
    blk_zone_range{},
    [=, this](auto &range) {
      range = make_range(
        id,
        metadata.segment_size,
        shard_info.first_segment_offset);
      return blk_zone_op(
        device,
        range,
        zone_op::FINISH
      );
    }
  ).safe_then([=] {
    DEBUG("zone finish successful");
    return Segment::close_ertr::now();
  });
}

Segment::write_ertr::future<> ZBDSegmentManager::segment_write(
  paddr_t addr,
  ceph::bufferlist bl,
  bool ignore_check)
{
  LOG_PREFIX(ZBDSegmentManager::segment_write);
  assert(addr.get_device_id() == get_device_id());
  assert((bl.length() % metadata.block_size) == 0);
  auto& seg_addr = addr.as_seg_paddr();
  DEBUG("write to segment {} at offset {}, physical offset {}, len {}",
    seg_addr.get_segment_id(),
    seg_addr.get_segment_off(),
    get_offset(addr),
    bl.length());
  stats.data_write.increment(bl.length());
  return do_writev(
    get_device_id(),
    device,
    get_offset(addr),
    std::move(bl),
    metadata.block_size);
}

device_id_t ZBDSegmentManager::get_device_id() const
{
  return metadata.device_id;
}

secondary_device_set_t& ZBDSegmentManager::get_secondary_devices()
{
  return metadata.secondary_devices;
}

magic_t ZBDSegmentManager::get_magic() const
{
  return metadata.magic;
}

segment_off_t ZBDSegment::get_write_capacity() const
{
  return manager.get_segment_size();
}

SegmentManager::close_ertr::future<> ZBDSegmentManager::close()
{
  if (device) {
    return device.close();
  }
  return seastar::now();
}

Segment::close_ertr::future<> ZBDSegment::close()
{
  return manager.segment_close(id, write_pointer);
}

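// Writes must land exactly at the zone write pointer and be block aligned;
// anything else is rejected with invarg, and writes past segment_capacity
// with enospc.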
Segment::write_ertr::future<> ZBDSegment::write(
  segment_off_t offset, ceph::bufferlist bl)
{
  LOG_PREFIX(ZBDSegment::write);
  if (offset != write_pointer || offset % manager.metadata.block_size != 0) {
    ERROR("Segment offset and zone write pointer mismatch. "
          "segment {} segment-offset {} write pointer {}",
          id, offset, write_pointer);
    return crimson::ct_error::invarg::make();
  }
  if (offset + bl.length() > manager.metadata.segment_capacity) {
    return crimson::ct_error::enospc::make();
  }

  write_pointer = offset + bl.length();
  return manager.segment_write(paddr_t::make_seg_paddr(id, offset), bl);
}

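// Pad forward from the current write pointer by writing zeroed, page-aligned
// buffers of at most MAX_PADDING_SIZE bytes per iteration until the
// requested number of padding bytes has been written.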
Segment::write_ertr::future<> ZBDSegment::write_padding_bytes(
  size_t padding_bytes)
{
  LOG_PREFIX(ZBDSegment::write_padding_bytes);
  DEBUG("Writing {} padding bytes to segment {} at wp {}",
        padding_bytes, id, write_pointer);

  return crimson::repeat([FNAME, padding_bytes, this]() mutable {
    size_t bufsize = 0;
    if (padding_bytes >= MAX_PADDING_SIZE) {
      bufsize = MAX_PADDING_SIZE;
    } else {
      bufsize = padding_bytes;
    }

    padding_bytes -= bufsize;
    bufferptr bp(ceph::buffer::create_page_aligned(bufsize));
    bp.zero();
    bufferlist padd_bl;
    padd_bl.append(bp);
    return write(write_pointer, padd_bl).safe_then([FNAME, padding_bytes, this]() {
      if (padding_bytes == 0) {
        return write_ertr::make_ready_future<seastar::stop_iteration>(seastar::stop_iteration::yes);
      } else {
        return write_ertr::make_ready_future<seastar::stop_iteration>(seastar::stop_iteration::no);
      }
    });
  });
}

// Advance the write pointer to the given offset.
Segment::write_ertr::future<> ZBDSegment::advance_wp(
  segment_off_t offset)
{
  LOG_PREFIX(ZBDSegment::advance_wp);

  DEBUG("Advancing write pointer from {} to {}", write_pointer, offset);
  if (offset < write_pointer) {
    return crimson::ct_error::invarg::make();
  }

  size_t padding_bytes = offset - write_pointer;

  if (padding_bytes == 0) {
    return write_ertr::now();
  }

  assert(padding_bytes % manager.metadata.block_size == 0);

  return write_padding_bytes(padding_bytes);
}

}