]>
Commit | Line | Data |
---|---|---|
20effc67 TL |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | ||
#include <sys/mman.h>
#include <string.h>
#include <linux/blkzoned.h>

#include <new>

#include <fmt/format.h>
#include "crimson/os/seastore/segment_manager/zbd.h"
#include "crimson/common/config_proxy.h"
#include "crimson/os/seastore/logging.h"
#include "crimson/common/errorator-loop.h"
#include "include/buffer.h"
14 | ||
1e59de90 TL |
15 | SET_SUBSYS(seastore_device); |
16 | ||
17 | #define SECT_SHIFT 9 | |
18 | #define RESERVED_ZONES 1 | |
19 | // limit the max padding buf size to 4MB | |
aee94f69 | 20 | #define MAX_PADDING_SIZE 4194304 |
1e59de90 | 21 | |
aee94f69 | 22 | using z_op = crimson::os::seastore::segment_manager::zbd::zone_op; |
1e59de90 TL |
23 | template <> struct fmt::formatter<z_op>: fmt::formatter<std::string_view> { |
24 | template <typename FormatContext> | |
25 | auto format(z_op s, FormatContext& ctx) { | |
26 | std::string_view name = "Unknown"; | |
27 | switch (s) { | |
28 | using enum z_op; | |
29 | case OPEN: | |
30 | name = "BLKOPENZONE"; | |
31 | break; | |
32 | case FINISH: | |
33 | name = "BLKFINISHZONE"; | |
34 | break; | |
35 | case CLOSE: | |
36 | name = "BLKCLOSEZONE"; | |
37 | break; | |
38 | case RESET: | |
39 | name = "BLKRESETZONE"; | |
40 | break; | |
41 | } | |
42 | return formatter<string_view>::format(name, ctx); | |
43 | } | |
44 | }; | |
20effc67 | 45 | |
aee94f69 | 46 | namespace crimson::os::seastore::segment_manager::zbd { |
20effc67 | 47 | |
aee94f69 | 48 | using open_device_ret = ZBDSegmentManager::access_ertr::future< |
20effc67 TL |
49 | std::pair<seastar::file, seastar::stat_data>>; |
50 | static open_device_ret open_device( | |
51 | const std::string &path, | |
52 | seastar::open_flags mode) | |
53 | { | |
aee94f69 | 54 | LOG_PREFIX(ZBDSegmentManager::open_device); |
20effc67 TL |
55 | return seastar::file_stat( |
56 | path, seastar::follow_symlink::yes | |
1e59de90 TL |
57 | ).then([FNAME, mode, &path](auto stat) mutable { |
58 | return seastar::open_file_dma(path, mode).then([=](auto file) { | |
59 | DEBUG("open of device {} successful, size {}", | |
60 | path, | |
61 | stat.size); | |
20effc67 TL |
62 | return std::make_pair(file, stat); |
63 | }); | |
64 | }).handle_exception( | |
1e59de90 TL |
65 | [FNAME](auto e) -> open_device_ret { |
66 | ERROR("got error {}", | |
20effc67 TL |
67 | e); |
68 | return crimson::ct_error::input_output_error::make(); | |
69 | } | |
70 | ); | |
71 | } | |
72 | ||
aee94f69 | 73 | static zbd_sm_metadata_t make_metadata( |
1e59de90 | 74 | uint64_t total_size, |
20effc67 TL |
75 | seastore_meta_t meta, |
76 | const seastar::stat_data &data, | |
1e59de90 TL |
77 | size_t zone_size_sectors, |
78 | size_t zone_capacity_sectors, | |
aee94f69 | 79 | size_t nr_cnv_zones, |
20effc67 TL |
80 | size_t num_zones) |
81 | { | |
aee94f69 TL |
82 | LOG_PREFIX(ZBDSegmentManager::make_metadata); |
83 | ||
84 | // Using only SWR zones in a SMR drive, for now | |
85 | auto skipped_zones = RESERVED_ZONES + nr_cnv_zones; | |
86 | assert(num_zones > skipped_zones); | |
1e59de90 TL |
87 | |
88 | // TODO: support Option::size_t seastore_segment_size | |
89 | // to allow zones_per_segment > 1 with striping. | |
90 | size_t zone_size = zone_size_sectors << SECT_SHIFT; | |
aee94f69 | 91 | assert(total_size == num_zones * zone_size); |
1e59de90 TL |
92 | size_t zone_capacity = zone_capacity_sectors << SECT_SHIFT; |
93 | size_t segment_size = zone_size; | |
94 | size_t zones_per_segment = segment_size / zone_size; | |
aee94f69 | 95 | size_t segments = (num_zones - skipped_zones) / zones_per_segment; |
1e59de90 TL |
96 | size_t per_shard_segments = segments / seastar::smp::count; |
97 | size_t available_size = zone_capacity * segments; | |
98 | size_t per_shard_available_size = zone_capacity * per_shard_segments; | |
1e59de90 | 99 | |
1e59de90 TL |
100 | |
101 | WARN("Ignoring configuration values for device and segment size"); | |
102 | INFO( | |
aee94f69 TL |
103 | "device size: {}, available size: {}, block size: {}, allocated size: {}," |
104 | " total zones {}, zone size: {}, zone capacity: {}," | |
105 | " total segments: {}, zones per segment: {}, segment size: {}" | |
106 | " conv zones: {}, swr zones: {}, per shard segments: {}" | |
107 | " per shard available size: {}", | |
1e59de90 TL |
108 | total_size, |
109 | available_size, | |
20effc67 TL |
110 | data.block_size, |
111 | data.allocated_size, | |
1e59de90 TL |
112 | num_zones, |
113 | zone_size, | |
114 | zone_capacity, | |
115 | segments, | |
116 | zones_per_segment, | |
aee94f69 TL |
117 | zone_capacity * zones_per_segment, |
118 | nr_cnv_zones, | |
119 | num_zones - nr_cnv_zones, | |
120 | per_shard_segments, | |
121 | per_shard_available_size); | |
122 | ||
123 | std::vector<zbd_shard_info_t> shard_infos(seastar::smp::count); | |
124 | for (unsigned int i = 0; i < seastar::smp::count; i++) { | |
125 | shard_infos[i].size = per_shard_available_size; | |
126 | shard_infos[i].segments = per_shard_segments; | |
127 | shard_infos[i].first_segment_offset = zone_size * skipped_zones | |
128 | + i * segment_size * per_shard_segments; | |
129 | INFO("First segment offset for shard {} is: {}", | |
130 | i, shard_infos[i].first_segment_offset); | |
131 | } | |
1e59de90 | 132 | |
aee94f69 | 133 | zbd_sm_metadata_t ret = zbd_sm_metadata_t{ |
1e59de90 TL |
134 | seastar::smp::count, |
135 | segment_size, | |
20effc67 TL |
136 | zone_capacity * zones_per_segment, |
137 | zones_per_segment, | |
138 | zone_capacity, | |
139 | data.block_size, | |
20effc67 | 140 | zone_size, |
1e59de90 | 141 | shard_infos, |
20effc67 | 142 | meta}; |
1e59de90 | 143 | ret.validate(); |
20effc67 TL |
144 | return ret; |
145 | } | |
146 | ||
/**
 * ZoneReport
 *
 * Move-only owner of a malloc'ed blk_zone_report header followed by room for
 * nr_zones blk_zone entries (the buffer layout used with BLKREPORTZONE in
 * this file). The buffer is released with free() on destruction.
 */
struct ZoneReport {
  struct blk_zone_report *hdr;

  /// Allocates the report buffer; throws std::bad_alloc if malloc fails so
  /// callers never dereference a null hdr.
  explicit ZoneReport(int nr_zones)
    : hdr(static_cast<blk_zone_report *>(malloc(
	sizeof(struct blk_zone_report) + nr_zones * sizeof(struct blk_zone)))) {
    if (hdr == nullptr) {
      throw std::bad_alloc();
    }
  }

  ~ZoneReport() {
    // free(nullptr) is a no-op, which covers moved-from instances.
    free(hdr);
  }

  ZoneReport(const ZoneReport &) = delete;
  ZoneReport &operator=(const ZoneReport &) = delete;

  /// Steals the buffer from rhs and leaves rhs.hdr null.
  ZoneReport(ZoneReport &&rhs) noexcept : hdr(rhs.hdr) {
    rhs.hdr = nullptr;
  }
  ZoneReport &operator=(ZoneReport &&) = delete;
};
160 | ||
1e59de90 TL |
161 | static seastar::future<size_t> get_blk_dev_size( |
162 | seastar::file &device) | |
163 | { | |
164 | return seastar::do_with( | |
165 | (uint64_t)0, | |
166 | [&](auto& size_sects) { | |
167 | return device.ioctl( | |
168 | BLKGETSIZE, | |
169 | (void *)&size_sects | |
170 | ).then([&](int ret) { | |
171 | ceph_assert(size_sects); | |
172 | size_t size = size_sects << SECT_SHIFT; | |
173 | return seastar::make_ready_future<size_t>(size); | |
174 | }); | |
175 | }); | |
176 | } | |
177 | ||
178 | // zone_size should be in 512B sectors | |
20effc67 | 179 | static seastar::future<> reset_device( |
1e59de90 TL |
180 | seastar::file &device, |
181 | uint64_t zone_size_sects, | |
182 | uint64_t nr_zones) | |
20effc67 TL |
183 | { |
184 | return seastar::do_with( | |
185 | blk_zone_range{}, | |
1e59de90 | 186 | [&, nr_zones, zone_size_sects](auto &range) { |
20effc67 | 187 | range.sector = 0; |
1e59de90 | 188 | range.nr_sectors = zone_size_sects * nr_zones; |
20effc67 TL |
189 | return device.ioctl( |
190 | BLKRESETZONE, | |
191 | &range | |
192 | ).then([&](int ret){ | |
193 | return seastar::now(); | |
194 | }); | |
195 | } | |
196 | ); | |
197 | } | |
198 | ||
199 | static seastar::future<size_t> get_zone_capacity( | |
1e59de90 | 200 | seastar::file &device, |
20effc67 TL |
201 | uint32_t nr_zones) |
202 | { | |
203 | return seastar::do_with( | |
20effc67 | 204 | ZoneReport(nr_zones), |
1e59de90 TL |
205 | [&](auto &zr) { |
206 | zr.hdr->sector = 0; | |
207 | zr.hdr->nr_zones = nr_zones; | |
208 | return device.ioctl( | |
209 | BLKREPORTZONE, | |
210 | zr.hdr | |
211 | ).then([&](int ret) { | |
212 | return seastar::make_ready_future<size_t>(zr.hdr->zones[0].capacity); | |
20effc67 TL |
213 | }); |
214 | } | |
215 | ); | |
216 | } | |
217 | ||
aee94f69 TL |
218 | // get the number of conventional zones of SMR HDD, |
219 | // they are randomly writable and don't respond to zone operations | |
220 | static seastar::future<size_t> get_nr_cnv_zones( | |
221 | seastar::file &device, | |
222 | uint32_t nr_zones) | |
223 | { | |
224 | return seastar::do_with( | |
225 | ZoneReport(nr_zones), | |
226 | [&](auto &zr) { | |
227 | zr.hdr->sector = 0; | |
228 | zr.hdr->nr_zones = nr_zones; | |
229 | return device.ioctl( | |
230 | BLKREPORTZONE, | |
231 | zr.hdr | |
232 | ).then([&, nr_zones](int ret) { | |
233 | size_t cnv_zones = 0; | |
234 | for (uint32_t i = 0; i < nr_zones; i++) { | |
235 | if (zr.hdr->zones[i].type == BLK_ZONE_TYPE_CONVENTIONAL) | |
236 | cnv_zones++; | |
237 | } | |
238 | return seastar::make_ready_future<size_t>(cnv_zones); | |
239 | }); | |
240 | } | |
241 | ); | |
242 | } | |
243 | ||
244 | ||
20effc67 TL |
245 | static write_ertr::future<> do_write( |
246 | seastar::file &device, | |
247 | uint64_t offset, | |
248 | bufferptr &bptr) | |
249 | { | |
aee94f69 | 250 | LOG_PREFIX(ZBDSegmentManager::do_write); |
1e59de90 | 251 | DEBUG("offset {} len {}", |
20effc67 TL |
252 | offset, |
253 | bptr.length()); | |
254 | return device.dma_write( | |
255 | offset, | |
256 | bptr.c_str(), | |
257 | bptr.length() | |
258 | ).handle_exception( | |
1e59de90 TL |
259 | [FNAME](auto e) -> write_ertr::future<size_t> { |
260 | ERROR("dma_write got error {}", | |
20effc67 TL |
261 | e); |
262 | return crimson::ct_error::input_output_error::make(); | |
263 | } | |
264 | ).then([length = bptr.length()](auto result) -> write_ertr::future<> { | |
265 | if (result != length) { | |
266 | return crimson::ct_error::input_output_error::make(); | |
267 | } | |
268 | return write_ertr::now(); | |
269 | }); | |
270 | } | |
271 | ||
272 | static write_ertr::future<> do_writev( | |
aee94f69 | 273 | device_id_t device_id, |
20effc67 TL |
274 | seastar::file &device, |
275 | uint64_t offset, | |
276 | bufferlist&& bl, | |
277 | size_t block_size) | |
278 | { | |
aee94f69 TL |
279 | LOG_PREFIX(ZBDSegmentManager::do_writev); |
280 | DEBUG("{} offset {} len {}", | |
281 | device_id_printer_t{device_id}, offset, bl.length()); | |
20effc67 TL |
282 | // writev requires each buffer to be aligned to the disks' block |
283 | // size, we need to rebuild here | |
284 | bl.rebuild_aligned(block_size); | |
285 | ||
aee94f69 TL |
286 | return seastar::do_with( |
287 | bl.prepare_iovs(), | |
288 | std::move(bl), | |
289 | [&device, device_id, offset, FNAME](auto& iovs, auto& bl) | |
290 | { | |
291 | return write_ertr::parallel_for_each( | |
292 | iovs, | |
293 | [&device, device_id, offset, FNAME](auto& p) | |
294 | { | |
295 | auto off = offset + p.offset; | |
296 | auto len = p.length; | |
297 | auto& iov = p.iov; | |
298 | DEBUG("{} poffset={}~{} dma_write ...", | |
299 | device_id_printer_t{device_id}, | |
300 | off, len); | |
301 | return device.dma_write(off, std::move(iov) | |
302 | ).handle_exception( | |
303 | [FNAME, device_id, off, len](auto e) -> write_ertr::future<size_t> | |
304 | { | |
305 | ERROR("{} poffset={}~{} dma_write got error -- {}", | |
306 | device_id_printer_t{device_id}, off, len, e); | |
307 | return crimson::ct_error::input_output_error::make(); | |
308 | }).then([FNAME, device_id, off, len](size_t written) -> write_ertr::future<> { | |
309 | if (written != len) { | |
310 | ERROR("{} poffset={}~{} dma_write len={} inconsistent", | |
311 | device_id_printer_t{device_id}, off, len, written); | |
312 | return crimson::ct_error::input_output_error::make(); | |
313 | } | |
314 | DEBUG("{} poffset={}~{} dma_write done", | |
315 | device_id_printer_t{device_id}, | |
316 | off, len); | |
317 | return write_ertr::now(); | |
318 | }); | |
319 | }); | |
20effc67 TL |
320 | }); |
321 | } | |
322 | ||
aee94f69 TL |
323 | static ZBDSegmentManager::access_ertr::future<> |
324 | write_metadata(seastar::file &device, zbd_sm_metadata_t sb) | |
20effc67 | 325 | { |
aee94f69 | 326 | assert(ceph::encoded_sizeof_bounded<zbd_sm_metadata_t>() < |
20effc67 TL |
327 | sb.block_size); |
328 | return seastar::do_with( | |
329 | bufferptr(ceph::buffer::create_page_aligned(sb.block_size)), | |
1e59de90 | 330 | [=, &device](auto &bp) { |
aee94f69 | 331 | LOG_PREFIX(ZBDSegmentManager::write_metadata); |
1e59de90 | 332 | DEBUG("block_size {}", sb.block_size); |
20effc67 TL |
333 | bufferlist bl; |
334 | encode(sb, bl); | |
335 | auto iter = bl.begin(); | |
336 | assert(bl.length() < sb.block_size); | |
1e59de90 | 337 | DEBUG("buffer length {}", bl.length()); |
20effc67 | 338 | iter.copy(bl.length(), bp.c_str()); |
1e59de90 | 339 | DEBUG("doing writeout"); |
20effc67 TL |
340 | return do_write(device, 0, bp); |
341 | }); | |
342 | } | |
343 | ||
344 | static read_ertr::future<> do_read( | |
345 | seastar::file &device, | |
346 | uint64_t offset, | |
347 | size_t len, | |
348 | bufferptr &bptr) | |
349 | { | |
aee94f69 | 350 | LOG_PREFIX(ZBDSegmentManager::do_read); |
20effc67 | 351 | assert(len <= bptr.length()); |
1e59de90 | 352 | DEBUG("offset {} len {}", |
20effc67 TL |
353 | offset, |
354 | len); | |
355 | return device.dma_read( | |
356 | offset, | |
357 | bptr.c_str(), | |
358 | len | |
359 | ).handle_exception( | |
1e59de90 TL |
360 | [FNAME](auto e) -> read_ertr::future<size_t> { |
361 | ERROR("dma_read got error {}", | |
20effc67 TL |
362 | e); |
363 | return crimson::ct_error::input_output_error::make(); | |
364 | } | |
365 | ).then([len](auto result) -> read_ertr::future<> { | |
366 | if (result != len) { | |
367 | return crimson::ct_error::input_output_error::make(); | |
368 | } | |
369 | return read_ertr::now(); | |
370 | }); | |
371 | } | |
372 | ||
373 | static | |
aee94f69 | 374 | ZBDSegmentManager::access_ertr::future<zbd_sm_metadata_t> |
20effc67 TL |
375 | read_metadata(seastar::file &device, seastar::stat_data sd) |
376 | { | |
aee94f69 | 377 | assert(ceph::encoded_sizeof_bounded<zbd_sm_metadata_t>() < |
20effc67 TL |
378 | sd.block_size); |
379 | return seastar::do_with( | |
380 | bufferptr(ceph::buffer::create_page_aligned(sd.block_size)), | |
381 | [=, &device](auto &bp) { | |
382 | return do_read( | |
383 | device, | |
384 | 0, | |
385 | bp.length(), | |
386 | bp | |
387 | ).safe_then([=, &bp] { | |
388 | bufferlist bl; | |
389 | bl.push_back(bp); | |
aee94f69 | 390 | zbd_sm_metadata_t ret; |
20effc67 TL |
391 | auto bliter = bl.cbegin(); |
392 | decode(ret, bliter); | |
1e59de90 | 393 | ret.validate(); |
aee94f69 TL |
394 | return ZBDSegmentManager::access_ertr::future<zbd_sm_metadata_t>( |
395 | ZBDSegmentManager::access_ertr::ready_future_marker{}, | |
20effc67 TL |
396 | ret); |
397 | }); | |
398 | }); | |
399 | } | |
400 | ||
aee94f69 | 401 | ZBDSegmentManager::mount_ret ZBDSegmentManager::mount() |
1e59de90 TL |
402 | { |
403 | return shard_devices.invoke_on_all([](auto &local_device) { | |
404 | return local_device.shard_mount( | |
405 | ).handle_error( | |
406 | crimson::ct_error::assert_all{ | |
aee94f69 | 407 | "Invalid error in ZBDSegmentManager::mount" |
1e59de90 TL |
408 | }); |
409 | }); | |
410 | } | |
411 | ||
aee94f69 | 412 | ZBDSegmentManager::mount_ret ZBDSegmentManager::shard_mount() |
20effc67 TL |
413 | { |
414 | return open_device( | |
415 | device_path, seastar::open_flags::rw | |
1e59de90 | 416 | ).safe_then([=, this](auto p) { |
20effc67 TL |
417 | device = std::move(p.first); |
418 | auto sd = p.second; | |
419 | return read_metadata(device, sd); | |
1e59de90 TL |
420 | }).safe_then([=, this](auto meta){ |
421 | shard_info = meta.shard_infos[seastar::this_shard_id()]; | |
20effc67 TL |
422 | metadata = meta; |
423 | return mount_ertr::now(); | |
424 | }); | |
425 | } | |
426 | ||
aee94f69 | 427 | ZBDSegmentManager::mkfs_ret ZBDSegmentManager::mkfs( |
1e59de90 | 428 | device_config_t config) |
20effc67 | 429 | { |
1e59de90 TL |
430 | return shard_devices.local().primary_mkfs(config |
431 | ).safe_then([this] { | |
432 | return shard_devices.invoke_on_all([](auto &local_device) { | |
433 | return local_device.shard_mkfs( | |
434 | ).handle_error( | |
435 | crimson::ct_error::assert_all{ | |
aee94f69 | 436 | "Invalid error in ZBDSegmentManager::mkfs" |
1e59de90 TL |
437 | }); |
438 | }); | |
439 | }); | |
440 | } | |
441 | ||
aee94f69 | 442 | ZBDSegmentManager::mkfs_ret ZBDSegmentManager::primary_mkfs( |
1e59de90 TL |
443 | device_config_t config) |
444 | { | |
aee94f69 | 445 | LOG_PREFIX(ZBDSegmentManager::primary_mkfs); |
1e59de90 | 446 | INFO("starting, device_path {}", device_path); |
20effc67 TL |
447 | return seastar::do_with( |
448 | seastar::file{}, | |
449 | seastar::stat_data{}, | |
aee94f69 TL |
450 | zbd_sm_metadata_t{}, |
451 | size_t(), | |
20effc67 TL |
452 | size_t(), |
453 | size_t(), | |
1e59de90 | 454 | size_t(), |
aee94f69 TL |
455 | [=, this] |
456 | (auto &device, | |
457 | auto &stat, | |
458 | auto &sb, | |
459 | auto &zone_size_sects, | |
460 | auto &nr_zones, | |
461 | auto &size, | |
462 | auto &nr_cnv_zones) { | |
20effc67 | 463 | return open_device( |
1e59de90 | 464 | device_path, |
20effc67 | 465 | seastar::open_flags::rw |
aee94f69 | 466 | ).safe_then([=, this, &device, &stat, &sb, &zone_size_sects, &nr_zones, &size, &nr_cnv_zones](auto p) { |
20effc67 TL |
467 | device = p.first; |
468 | stat = p.second; | |
469 | return device.ioctl( | |
1e59de90 | 470 | BLKGETNRZONES, |
20effc67 | 471 | (void *)&nr_zones |
1e59de90 | 472 | ).then([&](int ret) { |
20effc67 TL |
473 | if (nr_zones == 0) { |
474 | return seastar::make_exception_future<int>( | |
475 | std::system_error(std::make_error_code(std::errc::io_error))); | |
476 | } | |
1e59de90 TL |
477 | return device.ioctl(BLKGETZONESZ, (void *)&zone_size_sects); |
478 | }).then([&](int ret) { | |
479 | ceph_assert(zone_size_sects); | |
480 | return reset_device(device, zone_size_sects, nr_zones); | |
481 | }).then([&] { | |
482 | return get_blk_dev_size(device); | |
483 | }).then([&](auto devsize) { | |
484 | size = devsize; | |
aee94f69 TL |
485 | return get_nr_cnv_zones(device, nr_zones); |
486 | }).then([&](auto cnv_zones) { | |
487 | DEBUG("Found {} conventional zones", cnv_zones); | |
488 | nr_cnv_zones = cnv_zones; | |
1e59de90 TL |
489 | return get_zone_capacity(device, nr_zones); |
490 | }).then([&, FNAME, config](auto zone_capacity_sects) { | |
491 | ceph_assert(zone_capacity_sects); | |
492 | DEBUG("zone_size in sectors {}, zone_capacity in sectors {}", | |
493 | zone_size_sects, zone_capacity_sects); | |
20effc67 | 494 | sb = make_metadata( |
1e59de90 TL |
495 | size, |
496 | config.meta, | |
497 | stat, | |
498 | zone_size_sects, | |
499 | zone_capacity_sects, | |
aee94f69 | 500 | nr_cnv_zones, |
20effc67 TL |
501 | nr_zones); |
502 | metadata = sb; | |
503 | stats.metadata_write.increment( | |
aee94f69 | 504 | ceph::encoded_sizeof_bounded<zbd_sm_metadata_t>()); |
1e59de90 | 505 | DEBUG("Wrote to stats."); |
20effc67 | 506 | return write_metadata(device, sb); |
1e59de90 TL |
507 | }).finally([&, FNAME] { |
508 | DEBUG("Closing device."); | |
20effc67 | 509 | return device.close(); |
1e59de90 TL |
510 | }).safe_then([FNAME] { |
511 | DEBUG("Returning from mkfs."); | |
20effc67 TL |
512 | return mkfs_ertr::now(); |
513 | }); | |
514 | }); | |
515 | }); | |
516 | } | |
517 | ||
aee94f69 | 518 | ZBDSegmentManager::mkfs_ret ZBDSegmentManager::shard_mkfs() |
1e59de90 | 519 | { |
aee94f69 | 520 | LOG_PREFIX(ZBDSegmentManager::shard_mkfs); |
1e59de90 TL |
521 | INFO("starting, device_path {}", device_path); |
522 | return open_device( | |
523 | device_path, seastar::open_flags::rw | |
524 | ).safe_then([=, this](auto p) { | |
525 | device = std::move(p.first); | |
526 | auto sd = p.second; | |
527 | return read_metadata(device, sd); | |
528 | }).safe_then([=, this](auto meta){ | |
529 | shard_info = meta.shard_infos[seastar::this_shard_id()]; | |
530 | metadata = meta; | |
531 | return device.close(); | |
532 | }).safe_then([FNAME] { | |
533 | DEBUG("Returning from shard_mkfs."); | |
534 | return mkfs_ertr::now(); | |
535 | }); | |
536 | } | |
537 | ||
538 | // Return range of sectors to operate on. | |
20effc67 | 539 | struct blk_zone_range make_range( |
1e59de90 TL |
540 | segment_id_t id, |
541 | size_t segment_size, | |
20effc67 TL |
542 | size_t first_segment_offset) |
543 | { | |
544 | return blk_zone_range{ | |
1e59de90 TL |
545 | (id.device_segment_id() * (segment_size >> SECT_SHIFT) |
546 | + (first_segment_offset >> SECT_SHIFT)), | |
547 | (segment_size >> SECT_SHIFT) | |
20effc67 TL |
548 | }; |
549 | } | |
550 | ||
1e59de90 | 551 | using blk_zone_op_ertr = crimson::errorator< |
20effc67 | 552 | crimson::ct_error::input_output_error>; |
1e59de90 TL |
553 | using blk_zone_op_ret = blk_zone_op_ertr::future<>; |
554 | blk_zone_op_ret blk_zone_op(seastar::file &device, | |
555 | blk_zone_range &range, | |
556 | zone_op op) { | |
aee94f69 | 557 | LOG_PREFIX(ZBDSegmentManager::blk_zone_op); |
1e59de90 TL |
558 | |
559 | unsigned long ioctl_op = 0; | |
560 | switch (op) { | |
561 | using enum zone_op; | |
562 | case OPEN: | |
563 | ioctl_op = BLKOPENZONE; | |
564 | break; | |
565 | case FINISH: | |
566 | ioctl_op = BLKFINISHZONE; | |
567 | break; | |
568 | case RESET: | |
569 | ioctl_op = BLKRESETZONE; | |
570 | break; | |
571 | case CLOSE: | |
572 | ioctl_op = BLKCLOSEZONE; | |
573 | break; | |
574 | default: | |
575 | ERROR("Invalid zone operation {}", op); | |
576 | ceph_assert(ioctl_op); | |
577 | } | |
578 | ||
20effc67 | 579 | return device.ioctl( |
1e59de90 | 580 | ioctl_op, |
20effc67 | 581 | &range |
1e59de90 | 582 | ).then_wrapped([=](auto f) -> blk_zone_op_ret { |
20effc67 | 583 | if (f.failed()) { |
1e59de90 | 584 | ERROR("{} ioctl failed", op); |
20effc67 | 585 | return crimson::ct_error::input_output_error::make(); |
1e59de90 | 586 | } else { |
20effc67 TL |
587 | int ret = f.get(); |
588 | if (ret == 0) { | |
589 | return seastar::now(); | |
590 | } else { | |
1e59de90 | 591 | ERROR("{} ioctl failed with return code {}", op, ret); |
20effc67 TL |
592 | return crimson::ct_error::input_output_error::make(); |
593 | } | |
594 | } | |
595 | }); | |
596 | } | |
597 | ||
aee94f69 | 598 | ZBDSegmentManager::open_ertr::future<SegmentRef> ZBDSegmentManager::open( |
20effc67 TL |
599 | segment_id_t id) |
600 | { | |
aee94f69 | 601 | LOG_PREFIX(ZBDSegmentManager::open); |
20effc67 TL |
602 | return seastar::do_with( |
603 | blk_zone_range{}, | |
1e59de90 | 604 | [=, this](auto &range) { |
20effc67 | 605 | range = make_range( |
1e59de90 TL |
606 | id, |
607 | metadata.segment_size, | |
608 | shard_info.first_segment_offset); | |
609 | return blk_zone_op( | |
610 | device, | |
611 | range, | |
612 | zone_op::OPEN | |
20effc67 TL |
613 | ); |
614 | } | |
1e59de90 TL |
615 | ).safe_then([=, this] { |
616 | DEBUG("segment {}, open successful", id); | |
20effc67 TL |
617 | return open_ertr::future<SegmentRef>( |
618 | open_ertr::ready_future_marker{}, | |
aee94f69 | 619 | SegmentRef(new ZBDSegment(*this, id)) |
20effc67 TL |
620 | ); |
621 | }); | |
622 | } | |
623 | ||
aee94f69 | 624 | ZBDSegmentManager::release_ertr::future<> ZBDSegmentManager::release( |
20effc67 TL |
625 | segment_id_t id) |
626 | { | |
aee94f69 | 627 | LOG_PREFIX(ZBDSegmentManager::release); |
1e59de90 | 628 | DEBUG("Resetting zone/segment {}", id); |
20effc67 TL |
629 | return seastar::do_with( |
630 | blk_zone_range{}, | |
1e59de90 | 631 | [=, this](auto &range) { |
20effc67 | 632 | range = make_range( |
1e59de90 TL |
633 | id, |
634 | metadata.segment_size, | |
635 | shard_info.first_segment_offset); | |
636 | return blk_zone_op( | |
637 | device, | |
638 | range, | |
639 | zone_op::RESET | |
20effc67 TL |
640 | ); |
641 | } | |
642 | ).safe_then([=] { | |
1e59de90 | 643 | DEBUG("segment release successful"); |
20effc67 TL |
644 | return release_ertr::now(); |
645 | }); | |
646 | } | |
647 | ||
aee94f69 | 648 | SegmentManager::read_ertr::future<> ZBDSegmentManager::read( |
20effc67 TL |
649 | paddr_t addr, |
650 | size_t len, | |
651 | ceph::bufferptr &out) | |
652 | { | |
aee94f69 | 653 | LOG_PREFIX(ZBDSegmentManager::read); |
20effc67 TL |
654 | auto& seg_addr = addr.as_seg_paddr(); |
655 | if (seg_addr.get_segment_id().device_segment_id() >= get_num_segments()) { | |
1e59de90 TL |
656 | ERROR("invalid segment {}", |
657 | seg_addr.get_segment_id().device_segment_id()); | |
20effc67 TL |
658 | return crimson::ct_error::invarg::make(); |
659 | } | |
660 | ||
1e59de90 TL |
661 | if (seg_addr.get_segment_off() + len > metadata.segment_capacity) { |
662 | ERROR("invalid read offset {}, len {}", | |
20effc67 TL |
663 | addr, |
664 | len); | |
665 | return crimson::ct_error::invarg::make(); | |
666 | } | |
667 | return do_read( | |
668 | device, | |
669 | get_offset(addr), | |
670 | len, | |
671 | out); | |
672 | } | |
673 | ||
aee94f69 | 674 | Segment::close_ertr::future<> ZBDSegmentManager::segment_close( |
20effc67 TL |
675 | segment_id_t id, segment_off_t write_pointer) |
676 | { | |
aee94f69 | 677 | LOG_PREFIX(ZBDSegmentManager::segment_close); |
20effc67 TL |
678 | return seastar::do_with( |
679 | blk_zone_range{}, | |
1e59de90 | 680 | [=, this](auto &range) { |
20effc67 | 681 | range = make_range( |
1e59de90 TL |
682 | id, |
683 | metadata.segment_size, | |
684 | shard_info.first_segment_offset); | |
685 | return blk_zone_op( | |
686 | device, | |
687 | range, | |
688 | zone_op::FINISH | |
20effc67 TL |
689 | ); |
690 | } | |
691 | ).safe_then([=] { | |
1e59de90 | 692 | DEBUG("zone finish successful"); |
20effc67 TL |
693 | return Segment::close_ertr::now(); |
694 | }); | |
695 | } | |
696 | ||
aee94f69 | 697 | Segment::write_ertr::future<> ZBDSegmentManager::segment_write( |
20effc67 TL |
698 | paddr_t addr, |
699 | ceph::bufferlist bl, | |
700 | bool ignore_check) | |
701 | { | |
aee94f69 | 702 | LOG_PREFIX(ZBDSegmentManager::segment_write); |
20effc67 TL |
703 | assert(addr.get_device_id() == get_device_id()); |
704 | assert((bl.length() % metadata.block_size) == 0); | |
705 | auto& seg_addr = addr.as_seg_paddr(); | |
1e59de90 | 706 | DEBUG("write to segment {} at offset {}, physical offset {}, len {}", |
20effc67 TL |
707 | seg_addr.get_segment_id(), |
708 | seg_addr.get_segment_off(), | |
709 | get_offset(addr), | |
710 | bl.length()); | |
711 | stats.data_write.increment(bl.length()); | |
712 | return do_writev( | |
aee94f69 | 713 | get_device_id(), |
20effc67 TL |
714 | device, |
715 | get_offset(addr), | |
716 | std::move(bl), | |
717 | metadata.block_size); | |
718 | } | |
719 | ||
aee94f69 | 720 | device_id_t ZBDSegmentManager::get_device_id() const |
20effc67 TL |
721 | { |
722 | return metadata.device_id; | |
723 | }; | |
724 | ||
aee94f69 | 725 | secondary_device_set_t& ZBDSegmentManager::get_secondary_devices() |
20effc67 TL |
726 | { |
727 | return metadata.secondary_devices; | |
728 | }; | |
729 | ||
aee94f69 | 730 | magic_t ZBDSegmentManager::get_magic() const |
20effc67 TL |
731 | { |
732 | return metadata.magic; | |
733 | }; | |
734 | ||
aee94f69 | 735 | segment_off_t ZBDSegment::get_write_capacity() const |
20effc67 TL |
736 | { |
737 | return manager.get_segment_size(); | |
738 | } | |
739 | ||
aee94f69 | 740 | SegmentManager::close_ertr::future<> ZBDSegmentManager::close() |
20effc67 TL |
741 | { |
742 | if (device) { | |
743 | return device.close(); | |
744 | } | |
745 | return seastar::now(); | |
746 | } | |
747 | ||
aee94f69 | 748 | Segment::close_ertr::future<> ZBDSegment::close() |
20effc67 TL |
749 | { |
750 | return manager.segment_close(id, write_pointer); | |
751 | } | |
752 | ||
aee94f69 | 753 | Segment::write_ertr::future<> ZBDSegment::write( |
20effc67 TL |
754 | segment_off_t offset, ceph::bufferlist bl) |
755 | { | |
aee94f69 | 756 | LOG_PREFIX(ZBDSegment::write); |
1e59de90 TL |
757 | if (offset != write_pointer || offset % manager.metadata.block_size != 0) { |
758 | ERROR("Segment offset and zone write pointer mismatch. " | |
759 | "segment {} segment-offset {} write pointer {}", | |
760 | id, offset, write_pointer); | |
20effc67 TL |
761 | return crimson::ct_error::invarg::make(); |
762 | } | |
1e59de90 | 763 | if (offset + bl.length() > manager.metadata.segment_capacity) { |
20effc67 | 764 | return crimson::ct_error::enospc::make(); |
1e59de90 | 765 | } |
20effc67 TL |
766 | |
767 | write_pointer = offset + bl.length(); | |
768 | return manager.segment_write(paddr_t::make_seg_paddr(id, offset), bl); | |
769 | } | |
770 | ||
aee94f69 | 771 | Segment::write_ertr::future<> ZBDSegment::write_padding_bytes( |
1e59de90 TL |
772 | size_t padding_bytes) |
773 | { | |
aee94f69 | 774 | LOG_PREFIX(ZBDSegment::write_padding_bytes); |
1e59de90 TL |
775 | DEBUG("Writing {} padding bytes to segment {} at wp {}", |
776 | padding_bytes, id, write_pointer); | |
777 | ||
778 | return crimson::repeat([FNAME, padding_bytes, this] () mutable { | |
779 | size_t bufsize = 0; | |
780 | if (padding_bytes >= MAX_PADDING_SIZE) { | |
781 | bufsize = MAX_PADDING_SIZE; | |
782 | } else { | |
783 | bufsize = padding_bytes; | |
784 | } | |
785 | ||
786 | padding_bytes -= bufsize; | |
787 | bufferptr bp(ceph::buffer::create_page_aligned(bufsize)); | |
788 | bp.zero(); | |
789 | bufferlist padd_bl; | |
790 | padd_bl.append(bp); | |
791 | return write(write_pointer, padd_bl).safe_then([FNAME, padding_bytes, this]() { | |
792 | if (padding_bytes == 0) { | |
793 | return write_ertr::make_ready_future<seastar::stop_iteration>(seastar::stop_iteration::yes); | |
794 | } else { | |
795 | return write_ertr::make_ready_future<seastar::stop_iteration>(seastar::stop_iteration::no); | |
796 | } | |
797 | }); | |
798 | }); | |
799 | } | |
800 | ||
801 | // Advance write pointer, to given offset. | |
aee94f69 | 802 | Segment::write_ertr::future<> ZBDSegment::advance_wp( |
1e59de90 TL |
803 | segment_off_t offset) |
804 | { | |
aee94f69 | 805 | LOG_PREFIX(ZBDSegment::advance_wp); |
1e59de90 TL |
806 | |
807 | DEBUG("Advancing write pointer from {} to {}", write_pointer, offset); | |
808 | if (offset < write_pointer) { | |
809 | return crimson::ct_error::invarg::make(); | |
810 | } | |
811 | ||
812 | size_t padding_bytes = offset - write_pointer; | |
813 | ||
814 | if (padding_bytes == 0) { | |
815 | return write_ertr::now(); | |
816 | } | |
817 | ||
818 | assert(padding_bytes % manager.metadata.block_size == 0); | |
819 | ||
820 | return write_padding_bytes(padding_bytes); | |
821 | } | |
822 | ||
20effc67 | 823 | } |