]>
Commit | Line | Data |
---|---|---|
f67539c2 TL |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | ||
4 | #include <sys/mman.h> | |
5 | #include <string.h> | |
6 | ||
2a845540 TL |
7 | #include <fmt/format.h> |
8 | ||
1e59de90 TL |
9 | #include <seastar/core/metrics.hh> |
10 | ||
f67539c2 | 11 | #include "include/buffer.h" |
f67539c2 | 12 | |
20effc67 TL |
13 | #include "crimson/common/config_proxy.h" |
14 | #include "crimson/common/errorator-loop.h" | |
15 | ||
16 | #include "crimson/os/seastore/logging.h" | |
17 | #include "crimson/os/seastore/segment_manager/block.h" | |
f67539c2 | 18 | |
20effc67 TL |
19 | SET_SUBSYS(seastore_device); |
20 | /* | |
21 | * format: | |
22 | * - D<device-id> S<segment-id> offset=<off>~<len> poffset=<off> information | |
23 | * - D<device-id> poffset=<off>~<len> information | |
24 | * | |
25 | * levels: | |
26 | * - INFO: major initiation, closing and segment operations | |
27 | * - DEBUG: INFO details, major read and write operations | |
28 | * - TRACE: DEBUG details | |
29 | */ | |
f67539c2 | 30 | |
2a845540 TL |
31 | using segment_state_t = crimson::os::seastore::Segment::segment_state_t; |
32 | ||
33 | template <> struct fmt::formatter<segment_state_t>: fmt::formatter<std::string_view> { | |
34 | // parse is inherited from formatter<string_view>. | |
35 | template <typename FormatContext> | |
36 | auto format(segment_state_t s, FormatContext& ctx) { | |
37 | std::string_view name = "unknown"; | |
38 | switch (s) { | |
39 | case segment_state_t::EMPTY: | |
40 | name = "empty"; | |
41 | break; | |
42 | case segment_state_t::OPEN: | |
43 | name = "open"; | |
44 | break; | |
45 | case segment_state_t::CLOSED: | |
46 | name = "closed"; | |
47 | break; | |
48 | } | |
49 | return formatter<string_view>::format(name, ctx); | |
50 | } | |
51 | }; | |
52 | ||
f67539c2 TL |
53 | namespace crimson::os::seastore::segment_manager::block { |
54 | ||
55 | static write_ertr::future<> do_write( | |
20effc67 | 56 | device_id_t device_id, |
f67539c2 TL |
57 | seastar::file &device, |
58 | uint64_t offset, | |
59 | bufferptr &bptr) | |
60 | { | |
20effc67 TL |
61 | LOG_PREFIX(block_do_write); |
62 | auto len = bptr.length(); | |
1e59de90 TL |
63 | TRACE("{} poffset={}~{} ...", |
64 | device_id_printer_t{device_id}, offset, len); | |
f67539c2 TL |
65 | return device.dma_write( |
66 | offset, | |
67 | bptr.c_str(), | |
20effc67 TL |
68 | len |
69 | ).handle_exception( | |
70 | [FNAME, device_id, offset, len](auto e) -> write_ertr::future<size_t> { | |
1e59de90 TL |
71 | ERROR("{} poffset={}~{} got error -- {}", |
72 | device_id_printer_t{device_id}, offset, len, e); | |
20effc67 TL |
73 | return crimson::ct_error::input_output_error::make(); |
74 | }).then([FNAME, device_id, offset, len](auto result) -> write_ertr::future<> { | |
75 | if (result != len) { | |
1e59de90 TL |
76 | ERROR("{} poffset={}~{} write len={} inconsistent", |
77 | device_id_printer_t{device_id}, offset, len, result); | |
f67539c2 TL |
78 | return crimson::ct_error::input_output_error::make(); |
79 | } | |
1e59de90 | 80 | TRACE("{} poffset={}~{} done", device_id_printer_t{device_id}, offset, len); |
f67539c2 TL |
81 | return write_ertr::now(); |
82 | }); | |
83 | } | |
84 | ||
20effc67 TL |
85 | static write_ertr::future<> do_writev( |
86 | device_id_t device_id, | |
87 | seastar::file &device, | |
88 | uint64_t offset, | |
89 | bufferlist&& bl, | |
90 | size_t block_size) | |
91 | { | |
92 | LOG_PREFIX(block_do_writev); | |
1e59de90 TL |
93 | TRACE("{} poffset={}~{}, {} buffers", |
94 | device_id_printer_t{device_id}, offset, bl.length(), bl.get_num_buffers()); | |
20effc67 TL |
95 | |
96 | // writev requires each buffer to be aligned to the disks' block | |
97 | // size, we need to rebuild here | |
98 | bl.rebuild_aligned(block_size); | |
99 | ||
100 | return seastar::do_with( | |
101 | bl.prepare_iovs(), | |
102 | std::move(bl), | |
103 | [&device, device_id, offset, FNAME](auto& iovs, auto& bl) | |
104 | { | |
105 | return write_ertr::parallel_for_each( | |
106 | iovs, | |
107 | [&device, device_id, offset, FNAME](auto& p) mutable | |
108 | { | |
109 | auto off = offset + p.offset; | |
110 | auto len = p.length; | |
111 | auto& iov = p.iov; | |
1e59de90 TL |
112 | TRACE("{} poffset={}~{} dma_write ...", |
113 | device_id_printer_t{device_id}, off, len); | |
20effc67 TL |
114 | return device.dma_write(off, std::move(iov) |
115 | ).handle_exception( | |
116 | [FNAME, device_id, off, len](auto e) -> write_ertr::future<size_t> | |
117 | { | |
1e59de90 TL |
118 | ERROR("{} poffset={}~{} dma_write got error -- {}", |
119 | device_id_printer_t{device_id}, off, len, e); | |
20effc67 TL |
120 | return crimson::ct_error::input_output_error::make(); |
121 | }).then([FNAME, device_id, off, len](size_t written) -> write_ertr::future<> { | |
122 | if (written != len) { | |
1e59de90 TL |
123 | ERROR("{} poffset={}~{} dma_write len={} inconsistent", |
124 | device_id_printer_t{device_id}, off, len, written); | |
20effc67 TL |
125 | return crimson::ct_error::input_output_error::make(); |
126 | } | |
1e59de90 TL |
127 | TRACE("{} poffset={}~{} dma_write done", |
128 | device_id_printer_t{device_id}, off, len); | |
20effc67 TL |
129 | return write_ertr::now(); |
130 | }); | |
131 | }); | |
132 | }); | |
133 | } | |
134 | ||
f67539c2 | 135 | static read_ertr::future<> do_read( |
20effc67 | 136 | device_id_t device_id, |
f67539c2 TL |
137 | seastar::file &device, |
138 | uint64_t offset, | |
20effc67 | 139 | size_t len, |
f67539c2 TL |
140 | bufferptr &bptr) |
141 | { | |
20effc67 | 142 | LOG_PREFIX(block_do_read); |
1e59de90 | 143 | TRACE("{} poffset={}~{} ...", device_id_printer_t{device_id}, offset, len); |
20effc67 | 144 | assert(len <= bptr.length()); |
f67539c2 TL |
145 | return device.dma_read( |
146 | offset, | |
147 | bptr.c_str(), | |
20effc67 TL |
148 | len |
149 | ).handle_exception( | |
150 | //FIXME: this is a little bit tricky, since seastar::future<T>::handle_exception | |
151 | // returns seastar::future<T>, to return an crimson::ct_error, we have to create | |
152 | // a seastar::future<T> holding that crimson::ct_error. This is not necessary | |
153 | // once seastar::future<T>::handle_exception() returns seastar::futurize_t<T> | |
154 | [FNAME, device_id, offset, len](auto e) -> read_ertr::future<size_t> | |
155 | { | |
1e59de90 TL |
156 | ERROR("{} poffset={}~{} got error -- {}", |
157 | device_id_printer_t{device_id}, offset, len, e); | |
f67539c2 | 158 | return crimson::ct_error::input_output_error::make(); |
20effc67 TL |
159 | }).then([FNAME, device_id, offset, len](auto result) -> read_ertr::future<> { |
160 | if (result != len) { | |
1e59de90 TL |
161 | ERROR("{} poffset={}~{} read len={} inconsistent", |
162 | device_id_printer_t{device_id}, offset, len, result); | |
f67539c2 TL |
163 | return crimson::ct_error::input_output_error::make(); |
164 | } | |
1e59de90 | 165 | TRACE("{} poffset={}~{} done", device_id_printer_t{device_id}, offset, len); |
f67539c2 TL |
166 | return read_ertr::now(); |
167 | }); | |
168 | } | |
169 | ||
170 | write_ertr::future<> | |
171 | SegmentStateTracker::write_out( | |
20effc67 | 172 | device_id_t device_id, |
f67539c2 TL |
173 | seastar::file &device, |
174 | uint64_t offset) | |
175 | { | |
20effc67 | 176 | LOG_PREFIX(SegmentStateTracker::write_out); |
1e59de90 TL |
177 | DEBUG("{} poffset={}~{}", |
178 | device_id_printer_t{device_id}, offset, bptr.length()); | |
20effc67 | 179 | return do_write(device_id, device, offset, bptr); |
f67539c2 TL |
180 | } |
181 | ||
182 | write_ertr::future<> | |
183 | SegmentStateTracker::read_in( | |
20effc67 | 184 | device_id_t device_id, |
f67539c2 TL |
185 | seastar::file &device, |
186 | uint64_t offset) | |
187 | { | |
20effc67 | 188 | LOG_PREFIX(SegmentStateTracker::read_in); |
1e59de90 TL |
189 | DEBUG("{} poffset={}~{}", |
190 | device_id_printer_t{device_id}, offset, bptr.length()); | |
f67539c2 | 191 | return do_read( |
20effc67 | 192 | device_id, |
f67539c2 TL |
193 | device, |
194 | offset, | |
20effc67 | 195 | bptr.length(), |
f67539c2 TL |
196 | bptr); |
197 | } | |
1e59de90 | 198 | using std::vector; |
f67539c2 TL |
199 | static |
200 | block_sm_superblock_t make_superblock( | |
20effc67 | 201 | device_id_t device_id, |
1e59de90 | 202 | device_config_t sm_config, |
f67539c2 TL |
203 | const seastar::stat_data &data) |
204 | { | |
20effc67 TL |
205 | LOG_PREFIX(block_make_superblock); |
206 | using crimson::common::get_conf; | |
207 | ||
208 | auto config_size = get_conf<Option::size_t>( | |
209 | "seastore_device_size"); | |
210 | ||
211 | size_t size = (data.size == 0) ? config_size : data.size; | |
212 | ||
213 | auto config_segment_size = get_conf<Option::size_t>( | |
214 | "seastore_segment_size"); | |
215 | size_t raw_segments = size / config_segment_size; | |
1e59de90 TL |
216 | size_t shard_tracker_size = SegmentStateTracker::get_raw_size( |
217 | raw_segments / seastar::smp::count, | |
f67539c2 | 218 | data.block_size); |
1e59de90 TL |
219 | size_t total_tracker_size = shard_tracker_size * seastar::smp::count; |
220 | size_t tracker_off = data.block_size; //superblock | |
221 | size_t segments = (size - tracker_off - total_tracker_size) / config_segment_size; | |
222 | size_t segments_per_shard = segments / seastar::smp::count; | |
223 | ||
224 | vector<block_shard_info_t> shard_infos(seastar::smp::count); | |
225 | for (unsigned int i = 0; i < seastar::smp::count; i++) { | |
226 | shard_infos[i].size = segments_per_shard * config_segment_size; | |
227 | shard_infos[i].segments = segments_per_shard; | |
228 | shard_infos[i].tracker_offset = tracker_off + i * shard_tracker_size; | |
229 | shard_infos[i].first_segment_offset = tracker_off + total_tracker_size | |
230 | + i * segments_per_shard * config_segment_size; | |
231 | } | |
20effc67 | 232 | |
1e59de90 TL |
233 | INFO("{} disk_size={}, segment_size={}, block_size={}", |
234 | device_id_printer_t{device_id}, | |
20effc67 | 235 | size, |
1e59de90 TL |
236 | uint64_t(config_segment_size), |
237 | data.block_size); | |
238 | for (unsigned int i = 0; i < seastar::smp::count; i++) { | |
239 | INFO("shard {} infos:", i, shard_infos[i]); | |
240 | } | |
20effc67 | 241 | |
f67539c2 | 242 | return block_sm_superblock_t{ |
1e59de90 | 243 | seastar::smp::count, |
20effc67 | 244 | config_segment_size, |
f67539c2 | 245 | data.block_size, |
1e59de90 TL |
246 | shard_infos, |
247 | std::move(sm_config) | |
f67539c2 TL |
248 | }; |
249 | } | |
250 | ||
20effc67 TL |
251 | using check_create_device_ertr = BlockSegmentManager::access_ertr; |
252 | using check_create_device_ret = check_create_device_ertr::future<>; | |
253 | static check_create_device_ret check_create_device( | |
254 | const std::string &path, | |
255 | size_t size) | |
256 | { | |
257 | LOG_PREFIX(block_check_create_device); | |
258 | INFO("path={}, size={}", path, size); | |
259 | return seastar::open_file_dma( | |
260 | path, | |
261 | seastar::open_flags::exclusive | | |
262 | seastar::open_flags::rw | | |
263 | seastar::open_flags::create | |
264 | ).then([size, FNAME, &path](auto file) { | |
265 | return seastar::do_with( | |
266 | file, | |
267 | [size, FNAME, &path](auto &f) -> seastar::future<> | |
268 | { | |
269 | DEBUG("path={} created, truncating to {}", path, size); | |
270 | ceph_assert(f); | |
271 | return f.truncate( | |
272 | size | |
273 | ).then([&f, size] { | |
274 | return f.allocate(0, size); | |
275 | }).finally([&f] { | |
276 | return f.close(); | |
277 | }); | |
278 | }); | |
279 | }).then_wrapped([&path, FNAME](auto f) -> check_create_device_ret { | |
280 | if (f.failed()) { | |
281 | try { | |
282 | f.get(); | |
283 | return seastar::now(); | |
284 | } catch (const std::system_error &e) { | |
285 | if (e.code().value() == EEXIST) { | |
286 | ERROR("path={} exists", path); | |
287 | return seastar::now(); | |
288 | } else { | |
289 | ERROR("path={} creation error -- {}", path, e); | |
290 | return crimson::ct_error::input_output_error::make(); | |
291 | } | |
292 | } catch (...) { | |
293 | ERROR("path={} creation error", path); | |
294 | return crimson::ct_error::input_output_error::make(); | |
295 | } | |
296 | } | |
297 | ||
298 | DEBUG("path={} complete", path); | |
299 | std::ignore = f.discard_result(); | |
300 | return seastar::now(); | |
301 | }); | |
302 | } | |
303 | ||
f67539c2 TL |
304 | using open_device_ret = |
305 | BlockSegmentManager::access_ertr::future< | |
306 | std::pair<seastar::file, seastar::stat_data> | |
307 | >; | |
308 | static | |
20effc67 TL |
309 | open_device_ret open_device( |
310 | const std::string &path) | |
f67539c2 | 311 | { |
20effc67 TL |
312 | LOG_PREFIX(block_open_device); |
313 | return seastar::file_stat(path, seastar::follow_symlink::yes | |
314 | ).then([&path, FNAME](auto stat) mutable { | |
315 | return seastar::open_file_dma( | |
316 | path, | |
317 | seastar::open_flags::rw | seastar::open_flags::dsync | |
1e59de90 TL |
318 | ).then([stat, &path, FNAME](auto file) mutable { |
319 | return file.size().then([stat, file, &path, FNAME](auto size) mutable { | |
320 | stat.size = size; | |
321 | INFO("path={} successful, size={}, block_size={}", | |
322 | path, stat.size, stat.block_size); | |
323 | return std::make_pair(file, stat); | |
324 | }); | |
f67539c2 | 325 | }); |
20effc67 TL |
326 | }).handle_exception([FNAME, &path](auto e) -> open_device_ret { |
327 | ERROR("path={} got error -- {}", path, e); | |
328 | return crimson::ct_error::input_output_error::make(); | |
329 | }); | |
f67539c2 TL |
330 | } |
331 | ||
20effc67 | 332 | |
f67539c2 TL |
333 | static |
334 | BlockSegmentManager::access_ertr::future<> | |
20effc67 TL |
335 | write_superblock( |
336 | device_id_t device_id, | |
337 | seastar::file &device, | |
338 | block_sm_superblock_t sb) | |
f67539c2 | 339 | { |
20effc67 | 340 | LOG_PREFIX(block_write_superblock); |
1e59de90 | 341 | DEBUG("{} write {}", device_id_printer_t{device_id}, sb); |
20effc67 TL |
342 | sb.validate(); |
343 | assert(ceph::encoded_sizeof<block_sm_superblock_t>(sb) < | |
f67539c2 TL |
344 | sb.block_size); |
345 | return seastar::do_with( | |
346 | bufferptr(ceph::buffer::create_page_aligned(sb.block_size)), | |
20effc67 TL |
347 | [=, &device](auto &bp) |
348 | { | |
349 | bufferlist bl; | |
350 | encode(sb, bl); | |
351 | auto iter = bl.begin(); | |
352 | assert(bl.length() < sb.block_size); | |
353 | iter.copy(bl.length(), bp.c_str()); | |
354 | return do_write(device_id, device, 0, bp); | |
355 | }); | |
f67539c2 TL |
356 | } |
357 | ||
358 | static | |
359 | BlockSegmentManager::access_ertr::future<block_sm_superblock_t> | |
360 | read_superblock(seastar::file &device, seastar::stat_data sd) | |
361 | { | |
20effc67 TL |
362 | LOG_PREFIX(block_read_superblock); |
363 | DEBUG("reading superblock ..."); | |
f67539c2 TL |
364 | return seastar::do_with( |
365 | bufferptr(ceph::buffer::create_page_aligned(sd.block_size)), | |
20effc67 TL |
366 | [=, &device](auto &bp) |
367 | { | |
368 | return do_read( | |
369 | DEVICE_ID_NULL, // unknown | |
370 | device, | |
371 | 0, | |
372 | bp.length(), | |
373 | bp | |
374 | ).safe_then([=, &bp] { | |
375 | bufferlist bl; | |
376 | bl.push_back(bp); | |
377 | block_sm_superblock_t ret; | |
378 | auto bliter = bl.cbegin(); | |
379 | try { | |
380 | decode(ret, bliter); | |
381 | } catch (...) { | |
382 | ERROR("got decode error!"); | |
383 | ceph_assert(0 == "invalid superblock"); | |
384 | } | |
385 | assert(ceph::encoded_sizeof<block_sm_superblock_t>(ret) < | |
386 | sd.block_size); | |
387 | return BlockSegmentManager::access_ertr::future<block_sm_superblock_t>( | |
388 | BlockSegmentManager::access_ertr::ready_future_marker{}, | |
389 | ret); | |
f67539c2 | 390 | }); |
20effc67 | 391 | }); |
f67539c2 TL |
392 | } |
393 | ||
394 | BlockSegment::BlockSegment( | |
395 | BlockSegmentManager &manager, segment_id_t id) | |
396 | : manager(manager), id(id) {} | |
397 | ||
398 | segment_off_t BlockSegment::get_write_capacity() const | |
399 | { | |
400 | return manager.get_segment_size(); | |
401 | } | |
402 | ||
403 | Segment::close_ertr::future<> BlockSegment::close() | |
404 | { | |
20effc67 | 405 | return manager.segment_close(id, write_pointer); |
f67539c2 TL |
406 | } |
407 | ||
408 | Segment::write_ertr::future<> BlockSegment::write( | |
409 | segment_off_t offset, ceph::bufferlist bl) | |
410 | { | |
20effc67 TL |
411 | LOG_PREFIX(BlockSegment::write); |
412 | auto paddr = paddr_t::make_seg_paddr(id, offset); | |
1e59de90 TL |
413 | DEBUG("{} offset={}~{} poffset={} ...", |
414 | id, offset, bl.length(), manager.get_offset(paddr)); | |
20effc67 TL |
415 | |
416 | if (offset < write_pointer || | |
417 | offset % manager.superblock.block_size != 0 || | |
418 | bl.length() % manager.superblock.block_size != 0) { | |
1e59de90 TL |
419 | ERROR("{} offset={}~{} poffset={} invalid write", |
420 | id, offset, bl.length(), manager.get_offset(paddr)); | |
f67539c2 | 421 | return crimson::ct_error::invarg::make(); |
20effc67 | 422 | } |
f67539c2 | 423 | |
20effc67 | 424 | if (offset + bl.length() > manager.superblock.segment_size) { |
1e59de90 TL |
425 | ERROR("{} offset={}~{} poffset={} write out of the range {}", |
426 | id, offset, bl.length(), manager.get_offset(paddr), | |
20effc67 | 427 | manager.superblock.segment_size); |
f67539c2 | 428 | return crimson::ct_error::enospc::make(); |
20effc67 | 429 | } |
f67539c2 TL |
430 | |
431 | write_pointer = offset + bl.length(); | |
20effc67 | 432 | return manager.segment_write(paddr, bl); |
f67539c2 TL |
433 | } |
434 | ||
1e59de90 TL |
435 | Segment::write_ertr::future<> BlockSegment::advance_wp( |
436 | segment_off_t offset) { | |
437 | return write_ertr::now(); | |
438 | } | |
439 | ||
20effc67 TL |
440 | Segment::close_ertr::future<> BlockSegmentManager::segment_close( |
441 | segment_id_t id, segment_off_t write_pointer) | |
f67539c2 | 442 | { |
20effc67 TL |
443 | LOG_PREFIX(BlockSegmentManager::segment_close); |
444 | auto s_id = id.device_segment_id(); | |
445 | int unused_bytes = get_segment_size() - write_pointer; | |
1e59de90 | 446 | INFO("{} unused_bytes={} ...", id, unused_bytes); |
20effc67 TL |
447 | |
448 | assert(unused_bytes >= 0); | |
449 | assert(id.device_id() == get_device_id()); | |
f67539c2 | 450 | assert(tracker); |
20effc67 TL |
451 | |
452 | tracker->set(s_id, segment_state_t::CLOSED); | |
453 | ++stats.closed_segments; | |
454 | stats.closed_segments_unused_bytes += unused_bytes; | |
455 | stats.metadata_write.increment(tracker->get_size()); | |
456 | return tracker->write_out( | |
1e59de90 TL |
457 | get_device_id(), device, |
458 | shard_info.tracker_offset); | |
f67539c2 TL |
459 | } |
460 | ||
461 | Segment::write_ertr::future<> BlockSegmentManager::segment_write( | |
462 | paddr_t addr, | |
463 | ceph::bufferlist bl, | |
464 | bool ignore_check) | |
465 | { | |
20effc67 | 466 | assert(addr.get_device_id() == get_device_id()); |
f67539c2 | 467 | assert((bl.length() % superblock.block_size) == 0); |
20effc67 TL |
468 | stats.data_write.increment(bl.length()); |
469 | return do_writev( | |
470 | get_device_id(), | |
471 | device, | |
472 | get_offset(addr), | |
473 | std::move(bl), | |
474 | superblock.block_size); | |
f67539c2 TL |
475 | } |
476 | ||
477 | BlockSegmentManager::~BlockSegmentManager() | |
478 | { | |
479 | } | |
480 | ||
20effc67 | 481 | BlockSegmentManager::mount_ret BlockSegmentManager::mount() |
f67539c2 | 482 | { |
1e59de90 TL |
483 | return shard_devices.invoke_on_all([](auto &local_device) { |
484 | return local_device.shard_mount( | |
485 | ).handle_error( | |
486 | crimson::ct_error::assert_all{ | |
487 | "Invalid error in BlockSegmentManager::mount" | |
488 | }); | |
489 | }); | |
490 | } | |
491 | ||
492 | BlockSegmentManager::mount_ret BlockSegmentManager::shard_mount() | |
493 | { | |
494 | LOG_PREFIX(BlockSegmentManager::shard_mount); | |
f67539c2 | 495 | return open_device( |
20effc67 | 496 | device_path |
1e59de90 | 497 | ).safe_then([=, this](auto p) { |
f67539c2 TL |
498 | device = std::move(p.first); |
499 | auto sd = p.second; | |
500 | return read_superblock(device, sd); | |
1e59de90 TL |
501 | }).safe_then([=, this](auto sb) { |
502 | set_device_id(sb.config.spec.id); | |
503 | shard_info = sb.shard_infos[seastar::this_shard_id()]; | |
504 | INFO("{} read {}", device_id_printer_t{get_device_id()}, shard_info); | |
20effc67 | 505 | sb.validate(); |
f67539c2 | 506 | superblock = sb; |
20effc67 TL |
507 | stats.data_read.increment( |
508 | ceph::encoded_sizeof<block_sm_superblock_t>(superblock)); | |
f67539c2 | 509 | tracker = std::make_unique<SegmentStateTracker>( |
1e59de90 | 510 | shard_info.segments, |
f67539c2 | 511 | superblock.block_size); |
20effc67 | 512 | stats.data_read.increment(tracker->get_size()); |
f67539c2 | 513 | return tracker->read_in( |
20effc67 | 514 | get_device_id(), |
f67539c2 | 515 | device, |
1e59de90 | 516 | shard_info.tracker_offset |
f67539c2 | 517 | ).safe_then([this] { |
20effc67 | 518 | for (device_segment_id_t i = 0; i < tracker->get_capacity(); ++i) { |
f67539c2 TL |
519 | if (tracker->get(i) == segment_state_t::OPEN) { |
520 | tracker->set(i, segment_state_t::CLOSED); | |
521 | } | |
522 | } | |
20effc67 TL |
523 | stats.metadata_write.increment(tracker->get_size()); |
524 | return tracker->write_out( | |
1e59de90 TL |
525 | get_device_id(), device, |
526 | shard_info.tracker_offset); | |
f67539c2 | 527 | }); |
20effc67 | 528 | }).safe_then([this, FNAME] { |
1e59de90 | 529 | INFO("{} complete", device_id_printer_t{get_device_id()}); |
20effc67 | 530 | register_metrics(); |
f67539c2 TL |
531 | }); |
532 | } | |
533 | ||
20effc67 | 534 | BlockSegmentManager::mkfs_ret BlockSegmentManager::mkfs( |
1e59de90 TL |
535 | device_config_t sm_config) |
536 | { | |
537 | return shard_devices.local().primary_mkfs(sm_config | |
538 | ).safe_then([this] { | |
539 | return shard_devices.invoke_on_all([](auto &local_device) { | |
540 | return local_device.shard_mkfs( | |
541 | ).handle_error( | |
542 | crimson::ct_error::assert_all{ | |
543 | "Invalid error in BlockSegmentManager::mkfs" | |
544 | }); | |
545 | }); | |
546 | }); | |
547 | } | |
548 | ||
549 | BlockSegmentManager::mkfs_ret BlockSegmentManager::primary_mkfs( | |
550 | device_config_t sm_config) | |
f67539c2 | 551 | { |
1e59de90 TL |
552 | LOG_PREFIX(BlockSegmentManager::primary_mkfs); |
553 | ceph_assert(sm_config.spec.dtype == superblock.config.spec.dtype); | |
554 | set_device_id(sm_config.spec.id); | |
555 | INFO("{} path={}, {}", | |
556 | device_id_printer_t{get_device_id()}, device_path, sm_config); | |
f67539c2 TL |
557 | return seastar::do_with( |
558 | seastar::file{}, | |
559 | seastar::stat_data{}, | |
560 | block_sm_superblock_t{}, | |
561 | std::unique_ptr<SegmentStateTracker>(), | |
1e59de90 | 562 | [=, this](auto &device, auto &stat, auto &sb, auto &tracker) |
20effc67 TL |
563 | { |
564 | check_create_device_ret maybe_create = check_create_device_ertr::now(); | |
565 | using crimson::common::get_conf; | |
566 | if (get_conf<bool>("seastore_block_create")) { | |
567 | auto size = get_conf<Option::size_t>("seastore_device_size"); | |
568 | maybe_create = check_create_device(device_path, size); | |
569 | } | |
570 | ||
571 | return maybe_create.safe_then([this] { | |
572 | return open_device(device_path); | |
573 | }).safe_then([&, sm_config](auto p) { | |
574 | device = p.first; | |
575 | stat = p.second; | |
576 | sb = make_superblock(get_device_id(), sm_config, stat); | |
577 | stats.metadata_write.increment( | |
578 | ceph::encoded_sizeof<block_sm_superblock_t>(sb)); | |
579 | return write_superblock(get_device_id(), device, sb); | |
20effc67 TL |
580 | }).finally([&] { |
581 | return device.close(); | |
582 | }).safe_then([FNAME, this] { | |
1e59de90 | 583 | INFO("{} complete", device_id_printer_t{get_device_id()}); |
20effc67 | 584 | return mkfs_ertr::now(); |
f67539c2 | 585 | }); |
20effc67 | 586 | }); |
f67539c2 TL |
587 | } |
588 | ||
1e59de90 TL |
589 | BlockSegmentManager::mkfs_ret BlockSegmentManager::shard_mkfs() |
590 | { | |
591 | LOG_PREFIX(BlockSegmentManager::shard_mkfs); | |
592 | return open_device( | |
593 | device_path | |
594 | ).safe_then([this](auto p) { | |
595 | device = std::move(p.first); | |
596 | auto sd = p.second; | |
597 | return read_superblock(device, sd); | |
598 | }).safe_then([this, FNAME](auto sb) { | |
599 | set_device_id(sb.config.spec.id); | |
600 | shard_info = sb.shard_infos[seastar::this_shard_id()]; | |
601 | INFO("{} read {}", device_id_printer_t{get_device_id()}, shard_info); | |
602 | sb.validate(); | |
603 | tracker.reset(new SegmentStateTracker( | |
604 | shard_info.segments, sb.block_size)); | |
605 | stats.metadata_write.increment(tracker->get_size()); | |
606 | return tracker->write_out( | |
607 | get_device_id(), device, | |
608 | shard_info.tracker_offset); | |
609 | }).finally([this] { | |
610 | return device.close(); | |
611 | }).safe_then([FNAME, this] { | |
612 | INFO("{} complete", device_id_printer_t{get_device_id()}); | |
613 | return mkfs_ertr::now(); | |
614 | }); | |
615 | } | |
616 | ||
f67539c2 TL |
617 | BlockSegmentManager::close_ertr::future<> BlockSegmentManager::close() |
618 | { | |
20effc67 | 619 | LOG_PREFIX(BlockSegmentManager::close); |
1e59de90 | 620 | INFO("{}", device_id_printer_t{get_device_id()}); |
20effc67 | 621 | metrics.clear(); |
f67539c2 TL |
622 | return device.close(); |
623 | } | |
624 | ||
625 | SegmentManager::open_ertr::future<SegmentRef> BlockSegmentManager::open( | |
626 | segment_id_t id) | |
627 | { | |
20effc67 TL |
628 | LOG_PREFIX(BlockSegmentManager::open); |
629 | auto s_id = id.device_segment_id(); | |
1e59de90 | 630 | INFO("{} ...", id); |
20effc67 TL |
631 | |
632 | assert(id.device_id() == get_device_id()); | |
633 | ||
634 | if (s_id >= get_num_segments()) { | |
1e59de90 | 635 | ERROR("{} segment-id out of range {}", id, get_num_segments()); |
f67539c2 TL |
636 | return crimson::ct_error::invarg::make(); |
637 | } | |
638 | ||
20effc67 | 639 | if (tracker->get(s_id) != segment_state_t::EMPTY) { |
1e59de90 | 640 | ERROR("{} invalid state {} != EMPTY", id, tracker->get(s_id)); |
f67539c2 TL |
641 | return crimson::ct_error::invarg::make(); |
642 | } | |
643 | ||
20effc67 TL |
644 | tracker->set(s_id, segment_state_t::OPEN); |
645 | stats.metadata_write.increment(tracker->get_size()); | |
646 | return tracker->write_out( | |
1e59de90 TL |
647 | get_device_id(), device, |
648 | shard_info.tracker_offset | |
20effc67 TL |
649 | ).safe_then([this, id, FNAME] { |
650 | ++stats.opened_segments; | |
1e59de90 | 651 | DEBUG("{} done", id); |
f67539c2 TL |
652 | return open_ertr::future<SegmentRef>( |
653 | open_ertr::ready_future_marker{}, | |
654 | SegmentRef(new BlockSegment(*this, id))); | |
655 | }); | |
656 | } | |
657 | ||
658 | SegmentManager::release_ertr::future<> BlockSegmentManager::release( | |
659 | segment_id_t id) | |
660 | { | |
20effc67 TL |
661 | LOG_PREFIX(BlockSegmentManager::release); |
662 | auto s_id = id.device_segment_id(); | |
1e59de90 | 663 | INFO("{} ...", id); |
20effc67 TL |
664 | |
665 | assert(id.device_id() == get_device_id()); | |
f67539c2 | 666 | |
20effc67 | 667 | if (s_id >= get_num_segments()) { |
1e59de90 | 668 | ERROR("{} segment-id out of range {}", id, get_num_segments()); |
f67539c2 TL |
669 | return crimson::ct_error::invarg::make(); |
670 | } | |
671 | ||
20effc67 | 672 | if (tracker->get(s_id) != segment_state_t::CLOSED) { |
1e59de90 | 673 | ERROR("{} invalid state {} != CLOSED", id, tracker->get(s_id)); |
f67539c2 TL |
674 | return crimson::ct_error::invarg::make(); |
675 | } | |
676 | ||
20effc67 TL |
677 | tracker->set(s_id, segment_state_t::EMPTY); |
678 | ++stats.released_segments; | |
679 | stats.metadata_write.increment(tracker->get_size()); | |
680 | return tracker->write_out( | |
1e59de90 TL |
681 | get_device_id(), device, |
682 | shard_info.tracker_offset); | |
f67539c2 TL |
683 | } |
684 | ||
685 | SegmentManager::read_ertr::future<> BlockSegmentManager::read( | |
686 | paddr_t addr, | |
687 | size_t len, | |
688 | ceph::bufferptr &out) | |
689 | { | |
20effc67 TL |
690 | LOG_PREFIX(BlockSegmentManager::read); |
691 | auto& seg_addr = addr.as_seg_paddr(); | |
1e59de90 TL |
692 | auto id = seg_addr.get_segment_id(); |
693 | auto s_id = id.device_segment_id(); | |
20effc67 TL |
694 | auto s_off = seg_addr.get_segment_off(); |
695 | auto p_off = get_offset(addr); | |
1e59de90 | 696 | DEBUG("{} offset={}~{} poffset={} ...", id, s_off, len, p_off); |
20effc67 TL |
697 | |
698 | assert(addr.get_device_id() == get_device_id()); | |
699 | ||
700 | if (s_off % superblock.block_size != 0 || | |
701 | len % superblock.block_size != 0) { | |
1e59de90 | 702 | ERROR("{} offset={}~{} poffset={} invalid read", id, s_off, len, p_off); |
20effc67 TL |
703 | return crimson::ct_error::invarg::make(); |
704 | } | |
705 | ||
706 | if (s_id >= get_num_segments()) { | |
1e59de90 TL |
707 | ERROR("{} offset={}~{} poffset={} segment-id out of range {}", |
708 | id, s_off, len, p_off, get_num_segments()); | |
f67539c2 TL |
709 | return crimson::ct_error::invarg::make(); |
710 | } | |
711 | ||
20effc67 | 712 | if (s_off + len > superblock.segment_size) { |
1e59de90 TL |
713 | ERROR("{} offset={}~{} poffset={} read out of range {}", |
714 | id, s_off, len, p_off, superblock.segment_size); | |
f67539c2 TL |
715 | return crimson::ct_error::invarg::make(); |
716 | } | |
717 | ||
20effc67 TL |
718 | if (tracker->get(s_id) == segment_state_t::EMPTY) { |
719 | // XXX: not an error during scanning, | |
720 | // might need refactor to increase the log level | |
1e59de90 TL |
721 | DEBUG("{} offset={}~{} poffset={} invalid state {}", |
722 | id, s_off, len, p_off, tracker->get(s_id)); | |
f67539c2 TL |
723 | return crimson::ct_error::enoent::make(); |
724 | } | |
725 | ||
20effc67 | 726 | stats.data_read.increment(len); |
f67539c2 | 727 | return do_read( |
20effc67 | 728 | get_device_id(), |
f67539c2 | 729 | device, |
20effc67 TL |
730 | p_off, |
731 | len, | |
f67539c2 TL |
732 | out); |
733 | } | |
734 | ||
20effc67 TL |
735 | void BlockSegmentManager::register_metrics() |
736 | { | |
737 | LOG_PREFIX(BlockSegmentManager::register_metrics); | |
1e59de90 | 738 | DEBUG("{}", device_id_printer_t{get_device_id()}); |
20effc67 | 739 | namespace sm = seastar::metrics; |
20effc67 | 740 | std::vector<sm::label_instance> label_instances; |
1e59de90 | 741 | label_instances.push_back(sm::label_instance("device_id", get_device_id())); |
20effc67 TL |
742 | stats.reset(); |
743 | metrics.add_group( | |
744 | "segment_manager", | |
745 | { | |
746 | sm::make_counter( | |
747 | "data_read_num", | |
748 | stats.data_read.num, | |
749 | sm::description("total number of data read"), | |
750 | label_instances | |
751 | ), | |
752 | sm::make_counter( | |
753 | "data_read_bytes", | |
754 | stats.data_read.bytes, | |
755 | sm::description("total bytes of data read"), | |
756 | label_instances | |
757 | ), | |
758 | sm::make_counter( | |
759 | "data_write_num", | |
760 | stats.data_write.num, | |
761 | sm::description("total number of data write"), | |
762 | label_instances | |
763 | ), | |
764 | sm::make_counter( | |
765 | "data_write_bytes", | |
766 | stats.data_write.bytes, | |
767 | sm::description("total bytes of data write"), | |
768 | label_instances | |
769 | ), | |
770 | sm::make_counter( | |
771 | "metadata_write_num", | |
772 | stats.metadata_write.num, | |
773 | sm::description("total number of metadata write"), | |
774 | label_instances | |
775 | ), | |
776 | sm::make_counter( | |
777 | "metadata_write_bytes", | |
778 | stats.metadata_write.bytes, | |
779 | sm::description("total bytes of metadata write"), | |
780 | label_instances | |
781 | ), | |
782 | sm::make_counter( | |
783 | "opened_segments", | |
784 | stats.opened_segments, | |
785 | sm::description("total segments opened"), | |
786 | label_instances | |
787 | ), | |
788 | sm::make_counter( | |
789 | "closed_segments", | |
790 | stats.closed_segments, | |
791 | sm::description("total segments closed"), | |
792 | label_instances | |
793 | ), | |
794 | sm::make_counter( | |
795 | "closed_segments_unused_bytes", | |
796 | stats.closed_segments_unused_bytes, | |
797 | sm::description("total unused bytes of closed segments"), | |
798 | label_instances | |
799 | ), | |
800 | sm::make_counter( | |
801 | "released_segments", | |
802 | stats.released_segments, | |
803 | sm::description("total segments released"), | |
804 | label_instances | |
805 | ), | |
806 | } | |
807 | ); | |
808 | } | |
809 | ||
f67539c2 | 810 | } |