]> git.proxmox.com Git - ceph.git/blob - ceph/src/crimson/os/seastore/segment_manager/zns.cc
import quincy beta 17.1.0
[ceph.git] / ceph / src / crimson / os / seastore / segment_manager / zns.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3
#include <sys/mman.h>
#include <string.h>
#include <linux/blkzoned.h>

#include <cstdlib>
#include <new>

#include "crimson/os/seastore/segment_manager/zns.h"
#include "crimson/common/config_proxy.h"
#include "crimson/common/log.h"
#include "include/buffer.h"
12
13 namespace {
14 seastar::logger &logger(){
15 return crimson::get_logger(ceph_subsys_seastore_device);
16 }
17 }
18
19 namespace crimson::os::seastore::segment_manager::zns {
20
21 using open_device_ret = ZNSSegmentManager::access_ertr::future<
22 std::pair<seastar::file, seastar::stat_data>>;
23 static open_device_ret open_device(
24 const std::string &path,
25 seastar::open_flags mode)
26 {
27 return seastar::file_stat(
28 path, seastar::follow_symlink::yes
29 ).then([mode, &path](auto stat) mutable{
30 return seastar::open_file_dma(path, mode).then([=](auto file){
31 logger().error(
32 "open_device: open successful, size {}",
33 stat.size);
34 return std::make_pair(file, stat);
35 });
36 }).handle_exception(
37 [](auto e) -> open_device_ret {
38 logger().error(
39 "open_device: got error {}",
40 e);
41 return crimson::ct_error::input_output_error::make();
42 }
43 );
44 }
45
46 static zns_sm_metadata_t make_metadata(
47 seastore_meta_t meta,
48 const seastar::stat_data &data,
49 size_t zone_size,
50 size_t zone_capacity,
51 size_t num_zones)
52 {
53 using crimson::common::get_conf;
54
55 auto config_size = get_conf<Option::size_t>(
56 "seastore_device_size");
57
58 size_t size = (data.size == 0) ? config_size : data.size;
59
60 auto config_segment_size = get_conf<Option::size_t>(
61 "seastore_segment_size");
62 logger().error("CONFIG SIZE: {}", config_segment_size);
63 size_t zones_per_segment = config_segment_size / zone_capacity;
64
65 size_t segments = (num_zones - 1) * zones_per_segment;
66
67 logger().debug(
68 "{}: size {}, block_size {}, allocated_size {}, configured_size {}, "
69 "segment_size {}",
70 __func__,
71 data.size,
72 data.block_size,
73 data.allocated_size,
74 config_size,
75 config_segment_size);
76
77 zns_sm_metadata_t ret = zns_sm_metadata_t{
78 size,
79 config_segment_size,
80 zone_capacity * zones_per_segment,
81 zones_per_segment,
82 zone_capacity,
83 data.block_size,
84 segments,
85 zone_size,
86 zone_size,
87 meta};
88 return ret;
89 }
90
91 struct ZoneReport {
92 struct blk_zone_report *hdr;
93 ZoneReport(int nr_zones)
94 : hdr((blk_zone_report *)malloc(
95 sizeof(struct blk_zone_report) + nr_zones * sizeof(struct blk_zone))){;}
96 ~ZoneReport(){
97 free(hdr);
98 }
99 ZoneReport(const ZoneReport &) = delete;
100 ZoneReport(ZoneReport &&rhs) : hdr(rhs.hdr) {
101 rhs.hdr = nullptr;
102 }
103 };
104
105 static seastar::future<> reset_device(
106 seastar::file &device,
107 uint32_t zone_size,
108 uint32_t nr_zones)
109 {
110 return seastar::do_with(
111 blk_zone_range{},
112 ZoneReport(nr_zones),
113 [&, nr_zones] (auto &range, auto &zr){
114 range.sector = 0;
115 range.nr_sectors = zone_size * nr_zones;
116 return device.ioctl(
117 BLKRESETZONE,
118 &range
119 ).then([&](int ret){
120 return seastar::now();
121 });
122 }
123 );
124 }
125
126 static seastar::future<size_t> get_zone_capacity(
127 seastar::file &device,
128 uint32_t zone_size,
129 uint32_t nr_zones)
130 {
131 return seastar::do_with(
132 blk_zone_range{},
133 ZoneReport(nr_zones),
134 [&] (auto &first_zone_range, auto &zr){
135 first_zone_range.sector = 0;
136 first_zone_range.nr_sectors = zone_size;
137 return device.ioctl(
138 BLKOPENZONE,
139 &first_zone_range
140 ).then([&](int ret){
141 return device.ioctl(BLKREPORTZONE, zr.hdr);
142 }).then([&] (int ret){
143 return device.ioctl(BLKRESETZONE, &first_zone_range);
144 }).then([&](int ret){
145 return seastar::make_ready_future<size_t>(zr.hdr->zones[0].wp);
146 });
147 }
148 );
149 }
150
151 static write_ertr::future<> do_write(
152 seastar::file &device,
153 uint64_t offset,
154 bufferptr &bptr)
155 {
156 logger().debug(
157 "zns: do_write offset {} len {}",
158 offset,
159 bptr.length());
160 return device.dma_write(
161 offset,
162 bptr.c_str(),
163 bptr.length()
164 ).handle_exception(
165 [](auto e) -> write_ertr::future<size_t> {
166 logger().error(
167 "do_write: dma_write got error {}",
168 e);
169 return crimson::ct_error::input_output_error::make();
170 }
171 ).then([length = bptr.length()](auto result) -> write_ertr::future<> {
172 if (result != length) {
173 return crimson::ct_error::input_output_error::make();
174 }
175 return write_ertr::now();
176 });
177 }
178
179 static write_ertr::future<> do_writev(
180 seastar::file &device,
181 uint64_t offset,
182 bufferlist&& bl,
183 size_t block_size)
184 {
185 logger().error(
186 "block: do_writev offset {} len {}",
187 offset,
188 bl.length());
189 // writev requires each buffer to be aligned to the disks' block
190 // size, we need to rebuild here
191 bl.rebuild_aligned(block_size);
192
193 std::vector<iovec> iov;
194 bl.prepare_iov(&iov);
195 return device.dma_write(
196 offset,
197 std::move(iov)
198 ).handle_exception(
199 [](auto e) -> write_ertr::future<size_t> {
200 logger().error(
201 "do_writev: dma_write got error {}",
202 e);
203 return crimson::ct_error::input_output_error::make();
204 }
205 ).then([bl=std::move(bl)/* hold the buf until the end of io */](size_t written)
206 -> write_ertr::future<> {
207 if (written != bl.length()) {
208 return crimson::ct_error::input_output_error::make();
209 }
210 return write_ertr::now();
211 });
212 }
213
214 static ZNSSegmentManager::access_ertr::future<>
215 write_metadata(seastar::file &device, zns_sm_metadata_t sb)
216 {
217 assert(ceph::encoded_sizeof_bounded<zns_sm_metadata_t>() <
218 sb.block_size);
219 return seastar::do_with(
220 bufferptr(ceph::buffer::create_page_aligned(sb.block_size)),
221 [=, &device](auto &bp){
222 logger().error("BLOCK SIZE: {}", sb.block_size);
223 bufferlist bl;
224 encode(sb, bl);
225 auto iter = bl.begin();
226 assert(bl.length() < sb.block_size);
227 logger().error("{}", bl.length());
228 iter.copy(bl.length(), bp.c_str());
229 logger().debug("write_metadata: doing writeout");
230 return do_write(device, 0, bp);
231 });
232 }
233
234 static read_ertr::future<> do_read(
235 seastar::file &device,
236 uint64_t offset,
237 size_t len,
238 bufferptr &bptr)
239 {
240 assert(len <= bptr.length());
241 logger().debug(
242 "block: do_read offset {} len {}",
243 offset,
244 len);
245 return device.dma_read(
246 offset,
247 bptr.c_str(),
248 len
249 ).handle_exception(
250 [](auto e) -> read_ertr::future<size_t> {
251 logger().error(
252 "do_read: dma_read got error {}",
253 e);
254 return crimson::ct_error::input_output_error::make();
255 }
256 ).then([len](auto result) -> read_ertr::future<> {
257 if (result != len) {
258 return crimson::ct_error::input_output_error::make();
259 }
260 return read_ertr::now();
261 });
262 }
263
264 static
265 ZNSSegmentManager::access_ertr::future<zns_sm_metadata_t>
266 read_metadata(seastar::file &device, seastar::stat_data sd)
267 {
268 assert(ceph::encoded_sizeof_bounded<zns_sm_metadata_t>() <
269 sd.block_size);
270 return seastar::do_with(
271 bufferptr(ceph::buffer::create_page_aligned(sd.block_size)),
272 [=, &device](auto &bp) {
273 return do_read(
274 device,
275 0,
276 bp.length(),
277 bp
278 ).safe_then([=, &bp] {
279 bufferlist bl;
280 bl.push_back(bp);
281 zns_sm_metadata_t ret;
282 auto bliter = bl.cbegin();
283 decode(ret, bliter);
284 return ZNSSegmentManager::access_ertr::future<zns_sm_metadata_t>(
285 ZNSSegmentManager::access_ertr::ready_future_marker{},
286 ret);
287 });
288 });
289 }
290
291 ZNSSegmentManager::mount_ret ZNSSegmentManager::mount()
292 {
293 return open_device(
294 device_path, seastar::open_flags::rw
295 ).safe_then([=](auto p) {
296 device = std::move(p.first);
297 auto sd = p.second;
298 return read_metadata(device, sd);
299 }).safe_then([=](auto meta){
300 metadata = meta;
301 return mount_ertr::now();
302 });
303 }
304
305 ZNSSegmentManager::mkfs_ret ZNSSegmentManager::mkfs(
306 segment_manager_config_t config)
307 {
308 logger().error("ZNSSegmentManager::mkfs: starting");
309 return seastar::do_with(
310 seastar::file{},
311 seastar::stat_data{},
312 zns_sm_metadata_t{},
313 size_t(),
314 size_t(),
315 [=](auto &device, auto &stat, auto &sb, auto &zone_size, auto &nr_zones){
316 logger().error("ZNSSegmentManager::mkfs path {}", device_path);
317 return open_device(
318 device_path,
319 seastar::open_flags::rw
320 ).safe_then([=, &device, &stat, &sb, &zone_size, &nr_zones](auto p){
321 device = p.first;
322 stat = p.second;
323 return device.ioctl(
324 BLKGETNRZONES,
325 (void *)&nr_zones
326 ).then([&](int ret){
327 if (nr_zones == 0) {
328 return seastar::make_exception_future<int>(
329 std::system_error(std::make_error_code(std::errc::io_error)));
330 }
331 return device.ioctl(BLKGETZONESZ, (void *)&zone_size);
332 }).then([&] (int ret){
333 return reset_device(device, zone_size, nr_zones);
334 }).then([&] {
335 return get_zone_capacity(device, zone_size, nr_zones);
336 }).then([&, config] (auto zone_capacity){
337 sb = make_metadata(
338 config.meta,
339 stat,
340 zone_size,
341 zone_capacity,
342 nr_zones);
343 metadata = sb;
344 stats.metadata_write.increment(
345 ceph::encoded_sizeof_bounded<zns_sm_metadata_t>());
346 logger().error("WROTE TO STATS");
347 return write_metadata(device, sb);
348 }).finally([&] {
349 logger().error("CLOSING DEVICE");
350 return device.close();
351 }).safe_then([] {
352 logger().error("RETURNING FROM MKFS");
353 return mkfs_ertr::now();
354 });
355 });
356 });
357 }
358
359 struct blk_zone_range make_range(
360 segment_id_t id,
361 size_t segment_size,
362 size_t block_size,
363 size_t first_segment_offset)
364 {
365 return blk_zone_range{
366 (id.device_segment_id() * segment_size + first_segment_offset),
367 (segment_size)
368 };
369 }
370
371 using blk_open_zone_ertr = crimson::errorator<
372 crimson::ct_error::input_output_error>;
373 using blk_open_zone_ret = blk_open_zone_ertr::future<>;
374 blk_open_zone_ret blk_open_zone(seastar::file &device, blk_zone_range &range){
375 return device.ioctl(
376 BLKOPENZONE,
377 &range
378 ).then_wrapped([=](auto f) -> blk_open_zone_ret{
379 if (f.failed()) {
380 return crimson::ct_error::input_output_error::make();
381 }
382 else {
383 int ret = f.get();
384 if (ret == 0) {
385 return seastar::now();
386 } else {
387 return crimson::ct_error::input_output_error::make();
388 }
389 }
390 });
391 }
392
393 ZNSSegmentManager::open_ertr::future<SegmentRef> ZNSSegmentManager::open(
394 segment_id_t id)
395 {
396 return seastar::do_with(
397 blk_zone_range{},
398 [=] (auto &range){
399 range = make_range(
400 id,
401 metadata.zone_size,
402 metadata.block_size,
403 metadata.first_segment_offset);
404 return blk_open_zone(
405 device,
406 range
407 );
408 }
409 ).safe_then([=] {
410 logger().error("open _segment: open successful");
411 return open_ertr::future<SegmentRef>(
412 open_ertr::ready_future_marker{},
413 SegmentRef(new ZNSSegment(*this, id))
414 );
415 });
416 }
417
418 using blk_close_zone_ertr = crimson::errorator<
419 crimson::ct_error::input_output_error>;
420 using blk_close_zone_ret = blk_close_zone_ertr::future<>;
421 blk_close_zone_ret blk_close_zone(
422 seastar::file &device,
423 blk_zone_range &range)
424 {
425 return device.ioctl(
426 BLKCLOSEZONE,
427 &range
428 ).then_wrapped([=](auto f) -> blk_open_zone_ret{
429 if (f.failed()) {
430 return crimson::ct_error::input_output_error::make();
431 }
432 else {
433 int ret = f.get();
434 if (ret == 0) {
435 return seastar::now();
436 } else {
437 return crimson::ct_error::input_output_error::make();
438 }
439 }
440 });
441 }
442
443 ZNSSegmentManager::release_ertr::future<> ZNSSegmentManager::release(
444 segment_id_t id)
445 {
446 return seastar::do_with(
447 blk_zone_range{},
448 [=] (auto &range){
449 range = make_range(
450 id,
451 metadata.zone_size,
452 metadata.block_size,
453 metadata.first_segment_offset);
454 return blk_close_zone(
455 device,
456 range
457 );
458 }
459 ).safe_then([=] {
460 logger().error("release _segment: release successful");
461 return release_ertr::now();
462 });
463 }
464
465 SegmentManager::read_ertr::future<> ZNSSegmentManager::read(
466 paddr_t addr,
467 size_t len,
468 ceph::bufferptr &out)
469 {
470 auto& seg_addr = addr.as_seg_paddr();
471 if (seg_addr.get_segment_id().device_segment_id() >= get_num_segments()) {
472 logger().error(
473 "ZNSSegmentManager::read: invalid segment {}",
474 addr);
475 return crimson::ct_error::invarg::make();
476 }
477
478 if (seg_addr.get_segment_off() + len > metadata.zone_size) {
479 logger().error(
480 "ZNSSegmentManager::read: invalid offset {}~{}!",
481 addr,
482 len);
483 return crimson::ct_error::invarg::make();
484 }
485 return do_read(
486 device,
487 get_offset(addr),
488 len,
489 out);
490 }
491
492 Segment::close_ertr::future<> ZNSSegmentManager::segment_close(
493 segment_id_t id, segment_off_t write_pointer)
494 {
495 return seastar::do_with(
496 blk_zone_range{},
497 [=] (auto &range){
498 range = make_range(
499 id,
500 metadata.zone_size,
501 metadata.block_size,
502 metadata.first_segment_offset);
503 return blk_close_zone(
504 device,
505 range
506 );
507 }
508 ).safe_then([=] {
509 logger().error("open _segment: open successful");
510 return Segment::close_ertr::now();
511 });
512 }
513
514 Segment::write_ertr::future<> ZNSSegmentManager::segment_write(
515 paddr_t addr,
516 ceph::bufferlist bl,
517 bool ignore_check)
518 {
519 assert(addr.get_device_id() == get_device_id());
520 assert((bl.length() % metadata.block_size) == 0);
521 auto& seg_addr = addr.as_seg_paddr();
522 logger().debug(
523 "BlockSegmentManager::segment_write: "
524 "segment_write to segment {} at offset {}, physical offset {}, len {}",
525 seg_addr.get_segment_id(),
526 seg_addr.get_segment_off(),
527 get_offset(addr),
528 bl.length());
529 stats.data_write.increment(bl.length());
530 return do_writev(
531 device,
532 get_offset(addr),
533 std::move(bl),
534 metadata.block_size);
535 }
536
537 device_id_t ZNSSegmentManager::get_device_id() const
538 {
539 return metadata.device_id;
540 };
541
542 secondary_device_set_t& ZNSSegmentManager::get_secondary_devices()
543 {
544 return metadata.secondary_devices;
545 };
546
547 device_spec_t ZNSSegmentManager::get_device_spec() const
548 {
549 auto spec = device_spec_t();
550 spec.magic = metadata.magic;
551 spec.dtype = metadata.dtype;
552 spec.id = metadata.device_id;
553 return spec;
554 };
555
556 magic_t ZNSSegmentManager::get_magic() const
557 {
558 return metadata.magic;
559 };
560
561 segment_off_t ZNSSegment::get_write_capacity() const
562 {
563 return manager.get_segment_size();
564 }
565
566 SegmentManager::close_ertr::future<> ZNSSegmentManager::close()
567 {
568 if (device) {
569 return device.close();
570 }
571 return seastar::now();
572 }
573
574 Segment::close_ertr::future<> ZNSSegment::close()
575 {
576 return manager.segment_close(id, write_pointer);
577 }
578
579 Segment::write_ertr::future<> ZNSSegment::write(
580 segment_off_t offset, ceph::bufferlist bl)
581 {
582 if (offset < write_pointer || offset % manager.metadata.block_size != 0) {
583 logger().error(
584 "ZNSSegmentManager::ZNSSegment::write: "
585 "invalid segment write on segment {} to offset {}",
586 id,
587 offset);
588 return crimson::ct_error::invarg::make();
589 }
590 if (offset + bl.length() > manager.metadata.segment_size)
591 return crimson::ct_error::enospc::make();
592
593 write_pointer = offset + bl.length();
594 return manager.segment_write(paddr_t::make_seg_paddr(id, offset), bl);
595 }
596
597 }