]> git.proxmox.com Git - ceph.git/blame - ceph/src/crimson/os/seastore/segment_manager/block.cc
update ceph source to reef 18.1.2
[ceph.git] / ceph / src / crimson / os / seastore / segment_manager / block.cc
CommitLineData
f67539c2
TL
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3
4#include <sys/mman.h>
5#include <string.h>
6
2a845540
TL
7#include <fmt/format.h>
8
1e59de90
TL
9#include <seastar/core/metrics.hh>
10
f67539c2 11#include "include/buffer.h"
f67539c2 12
20effc67
TL
13#include "crimson/common/config_proxy.h"
14#include "crimson/common/errorator-loop.h"
15
16#include "crimson/os/seastore/logging.h"
17#include "crimson/os/seastore/segment_manager/block.h"
f67539c2 18
20effc67
TL
19SET_SUBSYS(seastore_device);
20/*
21 * format:
22 * - D<device-id> S<segment-id> offset=<off>~<len> poffset=<off> information
23 * - D<device-id> poffset=<off>~<len> information
24 *
25 * levels:
26 * - INFO: major initiation, closing and segment operations
27 * - DEBUG: INFO details, major read and write operations
28 * - TRACE: DEBUG details
29 */
f67539c2 30
2a845540
TL
31using segment_state_t = crimson::os::seastore::Segment::segment_state_t;
32
33template <> struct fmt::formatter<segment_state_t>: fmt::formatter<std::string_view> {
34 // parse is inherited from formatter<string_view>.
35 template <typename FormatContext>
36 auto format(segment_state_t s, FormatContext& ctx) {
37 std::string_view name = "unknown";
38 switch (s) {
39 case segment_state_t::EMPTY:
40 name = "empty";
41 break;
42 case segment_state_t::OPEN:
43 name = "open";
44 break;
45 case segment_state_t::CLOSED:
46 name = "closed";
47 break;
48 }
49 return formatter<string_view>::format(name, ctx);
50 }
51};
52
f67539c2
TL
53namespace crimson::os::seastore::segment_manager::block {
54
55static write_ertr::future<> do_write(
20effc67 56 device_id_t device_id,
f67539c2
TL
57 seastar::file &device,
58 uint64_t offset,
59 bufferptr &bptr)
60{
20effc67
TL
61 LOG_PREFIX(block_do_write);
62 auto len = bptr.length();
1e59de90
TL
63 TRACE("{} poffset={}~{} ...",
64 device_id_printer_t{device_id}, offset, len);
f67539c2
TL
65 return device.dma_write(
66 offset,
67 bptr.c_str(),
20effc67
TL
68 len
69 ).handle_exception(
70 [FNAME, device_id, offset, len](auto e) -> write_ertr::future<size_t> {
1e59de90
TL
71 ERROR("{} poffset={}~{} got error -- {}",
72 device_id_printer_t{device_id}, offset, len, e);
20effc67
TL
73 return crimson::ct_error::input_output_error::make();
74 }).then([FNAME, device_id, offset, len](auto result) -> write_ertr::future<> {
75 if (result != len) {
1e59de90
TL
76 ERROR("{} poffset={}~{} write len={} inconsistent",
77 device_id_printer_t{device_id}, offset, len, result);
f67539c2
TL
78 return crimson::ct_error::input_output_error::make();
79 }
1e59de90 80 TRACE("{} poffset={}~{} done", device_id_printer_t{device_id}, offset, len);
f67539c2
TL
81 return write_ertr::now();
82 });
83}
84
20effc67
TL
85static write_ertr::future<> do_writev(
86 device_id_t device_id,
87 seastar::file &device,
88 uint64_t offset,
89 bufferlist&& bl,
90 size_t block_size)
91{
92 LOG_PREFIX(block_do_writev);
1e59de90
TL
93 TRACE("{} poffset={}~{}, {} buffers",
94 device_id_printer_t{device_id}, offset, bl.length(), bl.get_num_buffers());
20effc67
TL
95
96 // writev requires each buffer to be aligned to the disks' block
97 // size, we need to rebuild here
98 bl.rebuild_aligned(block_size);
99
100 return seastar::do_with(
101 bl.prepare_iovs(),
102 std::move(bl),
103 [&device, device_id, offset, FNAME](auto& iovs, auto& bl)
104 {
105 return write_ertr::parallel_for_each(
106 iovs,
107 [&device, device_id, offset, FNAME](auto& p) mutable
108 {
109 auto off = offset + p.offset;
110 auto len = p.length;
111 auto& iov = p.iov;
1e59de90
TL
112 TRACE("{} poffset={}~{} dma_write ...",
113 device_id_printer_t{device_id}, off, len);
20effc67
TL
114 return device.dma_write(off, std::move(iov)
115 ).handle_exception(
116 [FNAME, device_id, off, len](auto e) -> write_ertr::future<size_t>
117 {
1e59de90
TL
118 ERROR("{} poffset={}~{} dma_write got error -- {}",
119 device_id_printer_t{device_id}, off, len, e);
20effc67
TL
120 return crimson::ct_error::input_output_error::make();
121 }).then([FNAME, device_id, off, len](size_t written) -> write_ertr::future<> {
122 if (written != len) {
1e59de90
TL
123 ERROR("{} poffset={}~{} dma_write len={} inconsistent",
124 device_id_printer_t{device_id}, off, len, written);
20effc67
TL
125 return crimson::ct_error::input_output_error::make();
126 }
1e59de90
TL
127 TRACE("{} poffset={}~{} dma_write done",
128 device_id_printer_t{device_id}, off, len);
20effc67
TL
129 return write_ertr::now();
130 });
131 });
132 });
133}
134
f67539c2 135static read_ertr::future<> do_read(
20effc67 136 device_id_t device_id,
f67539c2
TL
137 seastar::file &device,
138 uint64_t offset,
20effc67 139 size_t len,
f67539c2
TL
140 bufferptr &bptr)
141{
20effc67 142 LOG_PREFIX(block_do_read);
1e59de90 143 TRACE("{} poffset={}~{} ...", device_id_printer_t{device_id}, offset, len);
20effc67 144 assert(len <= bptr.length());
f67539c2
TL
145 return device.dma_read(
146 offset,
147 bptr.c_str(),
20effc67
TL
148 len
149 ).handle_exception(
150 //FIXME: this is a little bit tricky, since seastar::future<T>::handle_exception
151 // returns seastar::future<T>, to return an crimson::ct_error, we have to create
152 // a seastar::future<T> holding that crimson::ct_error. This is not necessary
153 // once seastar::future<T>::handle_exception() returns seastar::futurize_t<T>
154 [FNAME, device_id, offset, len](auto e) -> read_ertr::future<size_t>
155 {
1e59de90
TL
156 ERROR("{} poffset={}~{} got error -- {}",
157 device_id_printer_t{device_id}, offset, len, e);
f67539c2 158 return crimson::ct_error::input_output_error::make();
20effc67
TL
159 }).then([FNAME, device_id, offset, len](auto result) -> read_ertr::future<> {
160 if (result != len) {
1e59de90
TL
161 ERROR("{} poffset={}~{} read len={} inconsistent",
162 device_id_printer_t{device_id}, offset, len, result);
f67539c2
TL
163 return crimson::ct_error::input_output_error::make();
164 }
1e59de90 165 TRACE("{} poffset={}~{} done", device_id_printer_t{device_id}, offset, len);
f67539c2
TL
166 return read_ertr::now();
167 });
168}
169
170write_ertr::future<>
171SegmentStateTracker::write_out(
20effc67 172 device_id_t device_id,
f67539c2
TL
173 seastar::file &device,
174 uint64_t offset)
175{
20effc67 176 LOG_PREFIX(SegmentStateTracker::write_out);
1e59de90
TL
177 DEBUG("{} poffset={}~{}",
178 device_id_printer_t{device_id}, offset, bptr.length());
20effc67 179 return do_write(device_id, device, offset, bptr);
f67539c2
TL
180}
181
182write_ertr::future<>
183SegmentStateTracker::read_in(
20effc67 184 device_id_t device_id,
f67539c2
TL
185 seastar::file &device,
186 uint64_t offset)
187{
20effc67 188 LOG_PREFIX(SegmentStateTracker::read_in);
1e59de90
TL
189 DEBUG("{} poffset={}~{}",
190 device_id_printer_t{device_id}, offset, bptr.length());
f67539c2 191 return do_read(
20effc67 192 device_id,
f67539c2
TL
193 device,
194 offset,
20effc67 195 bptr.length(),
f67539c2
TL
196 bptr);
197}
1e59de90 198using std::vector;
f67539c2
TL
199static
200block_sm_superblock_t make_superblock(
20effc67 201 device_id_t device_id,
1e59de90 202 device_config_t sm_config,
f67539c2
TL
203 const seastar::stat_data &data)
204{
20effc67
TL
205 LOG_PREFIX(block_make_superblock);
206 using crimson::common::get_conf;
207
208 auto config_size = get_conf<Option::size_t>(
209 "seastore_device_size");
210
211 size_t size = (data.size == 0) ? config_size : data.size;
212
213 auto config_segment_size = get_conf<Option::size_t>(
214 "seastore_segment_size");
215 size_t raw_segments = size / config_segment_size;
1e59de90
TL
216 size_t shard_tracker_size = SegmentStateTracker::get_raw_size(
217 raw_segments / seastar::smp::count,
f67539c2 218 data.block_size);
1e59de90
TL
219 size_t total_tracker_size = shard_tracker_size * seastar::smp::count;
220 size_t tracker_off = data.block_size; //superblock
221 size_t segments = (size - tracker_off - total_tracker_size) / config_segment_size;
222 size_t segments_per_shard = segments / seastar::smp::count;
223
224 vector<block_shard_info_t> shard_infos(seastar::smp::count);
225 for (unsigned int i = 0; i < seastar::smp::count; i++) {
226 shard_infos[i].size = segments_per_shard * config_segment_size;
227 shard_infos[i].segments = segments_per_shard;
228 shard_infos[i].tracker_offset = tracker_off + i * shard_tracker_size;
229 shard_infos[i].first_segment_offset = tracker_off + total_tracker_size
230 + i * segments_per_shard * config_segment_size;
231 }
20effc67 232
1e59de90
TL
233 INFO("{} disk_size={}, segment_size={}, block_size={}",
234 device_id_printer_t{device_id},
20effc67 235 size,
1e59de90
TL
236 uint64_t(config_segment_size),
237 data.block_size);
238 for (unsigned int i = 0; i < seastar::smp::count; i++) {
239 INFO("shard {} infos:", i, shard_infos[i]);
240 }
20effc67 241
f67539c2 242 return block_sm_superblock_t{
1e59de90 243 seastar::smp::count,
20effc67 244 config_segment_size,
f67539c2 245 data.block_size,
1e59de90
TL
246 shard_infos,
247 std::move(sm_config)
f67539c2
TL
248 };
249}
250
20effc67
TL
251using check_create_device_ertr = BlockSegmentManager::access_ertr;
252using check_create_device_ret = check_create_device_ertr::future<>;
253static check_create_device_ret check_create_device(
254 const std::string &path,
255 size_t size)
256{
257 LOG_PREFIX(block_check_create_device);
258 INFO("path={}, size={}", path, size);
259 return seastar::open_file_dma(
260 path,
261 seastar::open_flags::exclusive |
262 seastar::open_flags::rw |
263 seastar::open_flags::create
264 ).then([size, FNAME, &path](auto file) {
265 return seastar::do_with(
266 file,
267 [size, FNAME, &path](auto &f) -> seastar::future<>
268 {
269 DEBUG("path={} created, truncating to {}", path, size);
270 ceph_assert(f);
271 return f.truncate(
272 size
273 ).then([&f, size] {
274 return f.allocate(0, size);
275 }).finally([&f] {
276 return f.close();
277 });
278 });
279 }).then_wrapped([&path, FNAME](auto f) -> check_create_device_ret {
280 if (f.failed()) {
281 try {
282 f.get();
283 return seastar::now();
284 } catch (const std::system_error &e) {
285 if (e.code().value() == EEXIST) {
286 ERROR("path={} exists", path);
287 return seastar::now();
288 } else {
289 ERROR("path={} creation error -- {}", path, e);
290 return crimson::ct_error::input_output_error::make();
291 }
292 } catch (...) {
293 ERROR("path={} creation error", path);
294 return crimson::ct_error::input_output_error::make();
295 }
296 }
297
298 DEBUG("path={} complete", path);
299 std::ignore = f.discard_result();
300 return seastar::now();
301 });
302}
303
f67539c2
TL
304using open_device_ret =
305 BlockSegmentManager::access_ertr::future<
306 std::pair<seastar::file, seastar::stat_data>
307 >;
308static
20effc67
TL
309open_device_ret open_device(
310 const std::string &path)
f67539c2 311{
20effc67
TL
312 LOG_PREFIX(block_open_device);
313 return seastar::file_stat(path, seastar::follow_symlink::yes
314 ).then([&path, FNAME](auto stat) mutable {
315 return seastar::open_file_dma(
316 path,
317 seastar::open_flags::rw | seastar::open_flags::dsync
1e59de90
TL
318 ).then([stat, &path, FNAME](auto file) mutable {
319 return file.size().then([stat, file, &path, FNAME](auto size) mutable {
320 stat.size = size;
321 INFO("path={} successful, size={}, block_size={}",
322 path, stat.size, stat.block_size);
323 return std::make_pair(file, stat);
324 });
f67539c2 325 });
20effc67
TL
326 }).handle_exception([FNAME, &path](auto e) -> open_device_ret {
327 ERROR("path={} got error -- {}", path, e);
328 return crimson::ct_error::input_output_error::make();
329 });
f67539c2
TL
330}
331
20effc67 332
f67539c2
TL
333static
334BlockSegmentManager::access_ertr::future<>
20effc67
TL
335write_superblock(
336 device_id_t device_id,
337 seastar::file &device,
338 block_sm_superblock_t sb)
f67539c2 339{
20effc67 340 LOG_PREFIX(block_write_superblock);
1e59de90 341 DEBUG("{} write {}", device_id_printer_t{device_id}, sb);
20effc67
TL
342 sb.validate();
343 assert(ceph::encoded_sizeof<block_sm_superblock_t>(sb) <
f67539c2
TL
344 sb.block_size);
345 return seastar::do_with(
346 bufferptr(ceph::buffer::create_page_aligned(sb.block_size)),
20effc67
TL
347 [=, &device](auto &bp)
348 {
349 bufferlist bl;
350 encode(sb, bl);
351 auto iter = bl.begin();
352 assert(bl.length() < sb.block_size);
353 iter.copy(bl.length(), bp.c_str());
354 return do_write(device_id, device, 0, bp);
355 });
f67539c2
TL
356}
357
358static
359BlockSegmentManager::access_ertr::future<block_sm_superblock_t>
360read_superblock(seastar::file &device, seastar::stat_data sd)
361{
20effc67
TL
362 LOG_PREFIX(block_read_superblock);
363 DEBUG("reading superblock ...");
f67539c2
TL
364 return seastar::do_with(
365 bufferptr(ceph::buffer::create_page_aligned(sd.block_size)),
20effc67
TL
366 [=, &device](auto &bp)
367 {
368 return do_read(
369 DEVICE_ID_NULL, // unknown
370 device,
371 0,
372 bp.length(),
373 bp
374 ).safe_then([=, &bp] {
375 bufferlist bl;
376 bl.push_back(bp);
377 block_sm_superblock_t ret;
378 auto bliter = bl.cbegin();
379 try {
380 decode(ret, bliter);
381 } catch (...) {
382 ERROR("got decode error!");
383 ceph_assert(0 == "invalid superblock");
384 }
385 assert(ceph::encoded_sizeof<block_sm_superblock_t>(ret) <
386 sd.block_size);
387 return BlockSegmentManager::access_ertr::future<block_sm_superblock_t>(
388 BlockSegmentManager::access_ertr::ready_future_marker{},
389 ret);
f67539c2 390 });
20effc67 391 });
f67539c2
TL
392}
393
394BlockSegment::BlockSegment(
395 BlockSegmentManager &manager, segment_id_t id)
396 : manager(manager), id(id) {}
397
398segment_off_t BlockSegment::get_write_capacity() const
399{
400 return manager.get_segment_size();
401}
402
403Segment::close_ertr::future<> BlockSegment::close()
404{
20effc67 405 return manager.segment_close(id, write_pointer);
f67539c2
TL
406}
407
408Segment::write_ertr::future<> BlockSegment::write(
409 segment_off_t offset, ceph::bufferlist bl)
410{
20effc67
TL
411 LOG_PREFIX(BlockSegment::write);
412 auto paddr = paddr_t::make_seg_paddr(id, offset);
1e59de90
TL
413 DEBUG("{} offset={}~{} poffset={} ...",
414 id, offset, bl.length(), manager.get_offset(paddr));
20effc67
TL
415
416 if (offset < write_pointer ||
417 offset % manager.superblock.block_size != 0 ||
418 bl.length() % manager.superblock.block_size != 0) {
1e59de90
TL
419 ERROR("{} offset={}~{} poffset={} invalid write",
420 id, offset, bl.length(), manager.get_offset(paddr));
f67539c2 421 return crimson::ct_error::invarg::make();
20effc67 422 }
f67539c2 423
20effc67 424 if (offset + bl.length() > manager.superblock.segment_size) {
1e59de90
TL
425 ERROR("{} offset={}~{} poffset={} write out of the range {}",
426 id, offset, bl.length(), manager.get_offset(paddr),
20effc67 427 manager.superblock.segment_size);
f67539c2 428 return crimson::ct_error::enospc::make();
20effc67 429 }
f67539c2
TL
430
431 write_pointer = offset + bl.length();
20effc67 432 return manager.segment_write(paddr, bl);
f67539c2
TL
433}
434
1e59de90
TL
435Segment::write_ertr::future<> BlockSegment::advance_wp(
436 segment_off_t offset) {
437 return write_ertr::now();
438}
439
20effc67
TL
440Segment::close_ertr::future<> BlockSegmentManager::segment_close(
441 segment_id_t id, segment_off_t write_pointer)
f67539c2 442{
20effc67
TL
443 LOG_PREFIX(BlockSegmentManager::segment_close);
444 auto s_id = id.device_segment_id();
445 int unused_bytes = get_segment_size() - write_pointer;
1e59de90 446 INFO("{} unused_bytes={} ...", id, unused_bytes);
20effc67
TL
447
448 assert(unused_bytes >= 0);
449 assert(id.device_id() == get_device_id());
f67539c2 450 assert(tracker);
20effc67
TL
451
452 tracker->set(s_id, segment_state_t::CLOSED);
453 ++stats.closed_segments;
454 stats.closed_segments_unused_bytes += unused_bytes;
455 stats.metadata_write.increment(tracker->get_size());
456 return tracker->write_out(
1e59de90
TL
457 get_device_id(), device,
458 shard_info.tracker_offset);
f67539c2
TL
459}
460
461Segment::write_ertr::future<> BlockSegmentManager::segment_write(
462 paddr_t addr,
463 ceph::bufferlist bl,
464 bool ignore_check)
465{
20effc67 466 assert(addr.get_device_id() == get_device_id());
f67539c2 467 assert((bl.length() % superblock.block_size) == 0);
20effc67
TL
468 stats.data_write.increment(bl.length());
469 return do_writev(
470 get_device_id(),
471 device,
472 get_offset(addr),
473 std::move(bl),
474 superblock.block_size);
f67539c2
TL
475}
476
477BlockSegmentManager::~BlockSegmentManager()
478{
479}
480
20effc67 481BlockSegmentManager::mount_ret BlockSegmentManager::mount()
f67539c2 482{
1e59de90
TL
483 return shard_devices.invoke_on_all([](auto &local_device) {
484 return local_device.shard_mount(
485 ).handle_error(
486 crimson::ct_error::assert_all{
487 "Invalid error in BlockSegmentManager::mount"
488 });
489 });
490}
491
492BlockSegmentManager::mount_ret BlockSegmentManager::shard_mount()
493{
494 LOG_PREFIX(BlockSegmentManager::shard_mount);
f67539c2 495 return open_device(
20effc67 496 device_path
1e59de90 497 ).safe_then([=, this](auto p) {
f67539c2
TL
498 device = std::move(p.first);
499 auto sd = p.second;
500 return read_superblock(device, sd);
1e59de90
TL
501 }).safe_then([=, this](auto sb) {
502 set_device_id(sb.config.spec.id);
503 shard_info = sb.shard_infos[seastar::this_shard_id()];
504 INFO("{} read {}", device_id_printer_t{get_device_id()}, shard_info);
20effc67 505 sb.validate();
f67539c2 506 superblock = sb;
20effc67
TL
507 stats.data_read.increment(
508 ceph::encoded_sizeof<block_sm_superblock_t>(superblock));
f67539c2 509 tracker = std::make_unique<SegmentStateTracker>(
1e59de90 510 shard_info.segments,
f67539c2 511 superblock.block_size);
20effc67 512 stats.data_read.increment(tracker->get_size());
f67539c2 513 return tracker->read_in(
20effc67 514 get_device_id(),
f67539c2 515 device,
1e59de90 516 shard_info.tracker_offset
f67539c2 517 ).safe_then([this] {
20effc67 518 for (device_segment_id_t i = 0; i < tracker->get_capacity(); ++i) {
f67539c2
TL
519 if (tracker->get(i) == segment_state_t::OPEN) {
520 tracker->set(i, segment_state_t::CLOSED);
521 }
522 }
20effc67
TL
523 stats.metadata_write.increment(tracker->get_size());
524 return tracker->write_out(
1e59de90
TL
525 get_device_id(), device,
526 shard_info.tracker_offset);
f67539c2 527 });
20effc67 528 }).safe_then([this, FNAME] {
1e59de90 529 INFO("{} complete", device_id_printer_t{get_device_id()});
20effc67 530 register_metrics();
f67539c2
TL
531 });
532}
533
20effc67 534BlockSegmentManager::mkfs_ret BlockSegmentManager::mkfs(
1e59de90
TL
535 device_config_t sm_config)
536{
537 return shard_devices.local().primary_mkfs(sm_config
538 ).safe_then([this] {
539 return shard_devices.invoke_on_all([](auto &local_device) {
540 return local_device.shard_mkfs(
541 ).handle_error(
542 crimson::ct_error::assert_all{
543 "Invalid error in BlockSegmentManager::mkfs"
544 });
545 });
546 });
547}
548
549BlockSegmentManager::mkfs_ret BlockSegmentManager::primary_mkfs(
550 device_config_t sm_config)
f67539c2 551{
1e59de90
TL
552 LOG_PREFIX(BlockSegmentManager::primary_mkfs);
553 ceph_assert(sm_config.spec.dtype == superblock.config.spec.dtype);
554 set_device_id(sm_config.spec.id);
555 INFO("{} path={}, {}",
556 device_id_printer_t{get_device_id()}, device_path, sm_config);
f67539c2
TL
557 return seastar::do_with(
558 seastar::file{},
559 seastar::stat_data{},
560 block_sm_superblock_t{},
561 std::unique_ptr<SegmentStateTracker>(),
1e59de90 562 [=, this](auto &device, auto &stat, auto &sb, auto &tracker)
20effc67
TL
563 {
564 check_create_device_ret maybe_create = check_create_device_ertr::now();
565 using crimson::common::get_conf;
566 if (get_conf<bool>("seastore_block_create")) {
567 auto size = get_conf<Option::size_t>("seastore_device_size");
568 maybe_create = check_create_device(device_path, size);
569 }
570
571 return maybe_create.safe_then([this] {
572 return open_device(device_path);
573 }).safe_then([&, sm_config](auto p) {
574 device = p.first;
575 stat = p.second;
576 sb = make_superblock(get_device_id(), sm_config, stat);
577 stats.metadata_write.increment(
578 ceph::encoded_sizeof<block_sm_superblock_t>(sb));
579 return write_superblock(get_device_id(), device, sb);
20effc67
TL
580 }).finally([&] {
581 return device.close();
582 }).safe_then([FNAME, this] {
1e59de90 583 INFO("{} complete", device_id_printer_t{get_device_id()});
20effc67 584 return mkfs_ertr::now();
f67539c2 585 });
20effc67 586 });
f67539c2
TL
587}
588
1e59de90
TL
589BlockSegmentManager::mkfs_ret BlockSegmentManager::shard_mkfs()
590{
591 LOG_PREFIX(BlockSegmentManager::shard_mkfs);
592 return open_device(
593 device_path
594 ).safe_then([this](auto p) {
595 device = std::move(p.first);
596 auto sd = p.second;
597 return read_superblock(device, sd);
598 }).safe_then([this, FNAME](auto sb) {
599 set_device_id(sb.config.spec.id);
600 shard_info = sb.shard_infos[seastar::this_shard_id()];
601 INFO("{} read {}", device_id_printer_t{get_device_id()}, shard_info);
602 sb.validate();
603 tracker.reset(new SegmentStateTracker(
604 shard_info.segments, sb.block_size));
605 stats.metadata_write.increment(tracker->get_size());
606 return tracker->write_out(
607 get_device_id(), device,
608 shard_info.tracker_offset);
609 }).finally([this] {
610 return device.close();
611 }).safe_then([FNAME, this] {
612 INFO("{} complete", device_id_printer_t{get_device_id()});
613 return mkfs_ertr::now();
614 });
615}
616
f67539c2
TL
617BlockSegmentManager::close_ertr::future<> BlockSegmentManager::close()
618{
20effc67 619 LOG_PREFIX(BlockSegmentManager::close);
1e59de90 620 INFO("{}", device_id_printer_t{get_device_id()});
20effc67 621 metrics.clear();
f67539c2
TL
622 return device.close();
623}
624
625SegmentManager::open_ertr::future<SegmentRef> BlockSegmentManager::open(
626 segment_id_t id)
627{
20effc67
TL
628 LOG_PREFIX(BlockSegmentManager::open);
629 auto s_id = id.device_segment_id();
1e59de90 630 INFO("{} ...", id);
20effc67
TL
631
632 assert(id.device_id() == get_device_id());
633
634 if (s_id >= get_num_segments()) {
1e59de90 635 ERROR("{} segment-id out of range {}", id, get_num_segments());
f67539c2
TL
636 return crimson::ct_error::invarg::make();
637 }
638
20effc67 639 if (tracker->get(s_id) != segment_state_t::EMPTY) {
1e59de90 640 ERROR("{} invalid state {} != EMPTY", id, tracker->get(s_id));
f67539c2
TL
641 return crimson::ct_error::invarg::make();
642 }
643
20effc67
TL
644 tracker->set(s_id, segment_state_t::OPEN);
645 stats.metadata_write.increment(tracker->get_size());
646 return tracker->write_out(
1e59de90
TL
647 get_device_id(), device,
648 shard_info.tracker_offset
20effc67
TL
649 ).safe_then([this, id, FNAME] {
650 ++stats.opened_segments;
1e59de90 651 DEBUG("{} done", id);
f67539c2
TL
652 return open_ertr::future<SegmentRef>(
653 open_ertr::ready_future_marker{},
654 SegmentRef(new BlockSegment(*this, id)));
655 });
656}
657
658SegmentManager::release_ertr::future<> BlockSegmentManager::release(
659 segment_id_t id)
660{
20effc67
TL
661 LOG_PREFIX(BlockSegmentManager::release);
662 auto s_id = id.device_segment_id();
1e59de90 663 INFO("{} ...", id);
20effc67
TL
664
665 assert(id.device_id() == get_device_id());
f67539c2 666
20effc67 667 if (s_id >= get_num_segments()) {
1e59de90 668 ERROR("{} segment-id out of range {}", id, get_num_segments());
f67539c2
TL
669 return crimson::ct_error::invarg::make();
670 }
671
20effc67 672 if (tracker->get(s_id) != segment_state_t::CLOSED) {
1e59de90 673 ERROR("{} invalid state {} != CLOSED", id, tracker->get(s_id));
f67539c2
TL
674 return crimson::ct_error::invarg::make();
675 }
676
20effc67
TL
677 tracker->set(s_id, segment_state_t::EMPTY);
678 ++stats.released_segments;
679 stats.metadata_write.increment(tracker->get_size());
680 return tracker->write_out(
1e59de90
TL
681 get_device_id(), device,
682 shard_info.tracker_offset);
f67539c2
TL
683}
684
685SegmentManager::read_ertr::future<> BlockSegmentManager::read(
686 paddr_t addr,
687 size_t len,
688 ceph::bufferptr &out)
689{
20effc67
TL
690 LOG_PREFIX(BlockSegmentManager::read);
691 auto& seg_addr = addr.as_seg_paddr();
1e59de90
TL
692 auto id = seg_addr.get_segment_id();
693 auto s_id = id.device_segment_id();
20effc67
TL
694 auto s_off = seg_addr.get_segment_off();
695 auto p_off = get_offset(addr);
1e59de90 696 DEBUG("{} offset={}~{} poffset={} ...", id, s_off, len, p_off);
20effc67
TL
697
698 assert(addr.get_device_id() == get_device_id());
699
700 if (s_off % superblock.block_size != 0 ||
701 len % superblock.block_size != 0) {
1e59de90 702 ERROR("{} offset={}~{} poffset={} invalid read", id, s_off, len, p_off);
20effc67
TL
703 return crimson::ct_error::invarg::make();
704 }
705
706 if (s_id >= get_num_segments()) {
1e59de90
TL
707 ERROR("{} offset={}~{} poffset={} segment-id out of range {}",
708 id, s_off, len, p_off, get_num_segments());
f67539c2
TL
709 return crimson::ct_error::invarg::make();
710 }
711
20effc67 712 if (s_off + len > superblock.segment_size) {
1e59de90
TL
713 ERROR("{} offset={}~{} poffset={} read out of range {}",
714 id, s_off, len, p_off, superblock.segment_size);
f67539c2
TL
715 return crimson::ct_error::invarg::make();
716 }
717
20effc67
TL
718 if (tracker->get(s_id) == segment_state_t::EMPTY) {
719 // XXX: not an error during scanning,
720 // might need refactor to increase the log level
1e59de90
TL
721 DEBUG("{} offset={}~{} poffset={} invalid state {}",
722 id, s_off, len, p_off, tracker->get(s_id));
f67539c2
TL
723 return crimson::ct_error::enoent::make();
724 }
725
20effc67 726 stats.data_read.increment(len);
f67539c2 727 return do_read(
20effc67 728 get_device_id(),
f67539c2 729 device,
20effc67
TL
730 p_off,
731 len,
f67539c2
TL
732 out);
733}
734
20effc67
TL
735void BlockSegmentManager::register_metrics()
736{
737 LOG_PREFIX(BlockSegmentManager::register_metrics);
1e59de90 738 DEBUG("{}", device_id_printer_t{get_device_id()});
20effc67 739 namespace sm = seastar::metrics;
20effc67 740 std::vector<sm::label_instance> label_instances;
1e59de90 741 label_instances.push_back(sm::label_instance("device_id", get_device_id()));
20effc67
TL
742 stats.reset();
743 metrics.add_group(
744 "segment_manager",
745 {
746 sm::make_counter(
747 "data_read_num",
748 stats.data_read.num,
749 sm::description("total number of data read"),
750 label_instances
751 ),
752 sm::make_counter(
753 "data_read_bytes",
754 stats.data_read.bytes,
755 sm::description("total bytes of data read"),
756 label_instances
757 ),
758 sm::make_counter(
759 "data_write_num",
760 stats.data_write.num,
761 sm::description("total number of data write"),
762 label_instances
763 ),
764 sm::make_counter(
765 "data_write_bytes",
766 stats.data_write.bytes,
767 sm::description("total bytes of data write"),
768 label_instances
769 ),
770 sm::make_counter(
771 "metadata_write_num",
772 stats.metadata_write.num,
773 sm::description("total number of metadata write"),
774 label_instances
775 ),
776 sm::make_counter(
777 "metadata_write_bytes",
778 stats.metadata_write.bytes,
779 sm::description("total bytes of metadata write"),
780 label_instances
781 ),
782 sm::make_counter(
783 "opened_segments",
784 stats.opened_segments,
785 sm::description("total segments opened"),
786 label_instances
787 ),
788 sm::make_counter(
789 "closed_segments",
790 stats.closed_segments,
791 sm::description("total segments closed"),
792 label_instances
793 ),
794 sm::make_counter(
795 "closed_segments_unused_bytes",
796 stats.closed_segments_unused_bytes,
797 sm::description("total unused bytes of closed segments"),
798 label_instances
799 ),
800 sm::make_counter(
801 "released_segments",
802 stats.released_segments,
803 sm::description("total segments released"),
804 label_instances
805 ),
806 }
807 );
808}
809
f67539c2 810}