]> git.proxmox.com Git - ceph.git/blob - ceph/src/crimson/os/seastore/random_block_manager/nvme_block_device.h
4dc4de533aa479251a05b82e67cb9e5abd28872e
[ceph.git] / ceph / src / crimson / os / seastore / random_block_manager / nvme_block_device.h
1 //-*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3
4 #pragma once
5
#include <cstddef>
#include <cstdint>
#include <list>
#include <memory>
#include <string>
#include <utility>
#include <vector>

#include <seastar/core/file.hh>
#include <linux/nvme_ioctl.h>

#include "crimson/osd/exceptions.h"
#include "crimson/common/layout.h"
#include "rbm_device.h"
15
// Forward declaration only: lets this header name ceph::buffer::bufferptr
// without including its definition.
namespace ceph::buffer {
class bufferptr;
} // namespace ceph::buffer
21
22 namespace crimson::os::seastore::random_block_device::nvme {
/*
 * NVMe protocol structures (nvme_XX, identify_XX)
 *
 * All structures related to the NVMe protocol follow NVMe protocol v1.4
 * (the latest version at the time of writing). NVMe is a protocol for fast
 * interfacing between the host and an SSD device. We selectively adopted a
 * subset of the NVMe features to ease implementation. In addition,
 * NVMeBlockDevice provides generic command submission APIs for IO and Admin
 * commands. Please use pass_through_io() and pass_admin() for that.
 *
 * For more information about the NVMe protocol, refer to
 * https://nvmexpress.org/
 */
/*
 * Typed view of an Identify admin command.
 *
 * The first ten dwords are left opaque (they belong to the generic command
 * header); the bitfields that follow occupy exactly one more dword, which
 * carries the Identify-specific selector fields.
 */
struct nvme_identify_command_t {
  uint32_t common_dw[10];   // dwords 0-9: generic command header

  // dword 10
  uint32_t cns : 8;         // which structure to return, see CNS_* below
  uint32_t reserved : 8;
  uint32_t cnt_id : 16;     // controller identifier

  // Values for `cns`. constexpr static members are implicitly inline in
  // C++17, so they may be odr-used (e.g. bound to a const reference) without
  // an out-of-class definition.
  static constexpr uint8_t CNS_NAMESPACE = 0x00;
  static constexpr uint8_t CNS_CONTROLLER = 0x01;
};
// The view is overlaid on the raw passthru command; keep its layout fixed.
static_assert(sizeof(nvme_identify_command_t) == 11 * sizeof(uint32_t),
              "nvme_identify_command_t must be exactly 11 dwords");
45
46 struct nvme_admin_command_t {
47 union {
48 nvme_passthru_cmd common;
49 nvme_identify_command_t identify;
50 };
51
52 static const uint8_t OPCODE_IDENTIFY = 0x06;
53 };
54
// Optional Admin Command Support (OACS)
// Indicates whether optional admin commands are supported by the SSD.
// Only the bit this driver consumes is named.
struct oacs_t {
  uint16_t unused : 5;
  uint16_t support_directives : 1;  // Support multi-stream
  uint16_t unused2 : 10;
};
// OACS is a two-byte field inside the identify-controller buffer; a layout
// change here would silently shift every field that follows it.
static_assert(sizeof(oacs_t) == sizeof(uint16_t),
              "oacs_t must pack into exactly 16 bits");
62
63 struct nvme_identify_controller_data_t {
64 union {
65 struct {
66 uint8_t unused[256]; // [255:0]
67 oacs_t oacs; // [257:256]
68 uint8_t unused2[270]; // [527:258]
69 uint16_t awupf; // [529:528]
70 };
71 uint8_t raw[4096];
72 };
73 };
74
// End-to-end Data Protection Capabilities (DPC)
// Indicates the types of E2E data protection supported by the SSD, and where
// in the metadata the protection information may be placed.
struct dpc_t {
  uint8_t support_type1 : 1;
  uint8_t support_type2 : 1;
  uint8_t support_type3 : 1;
  uint8_t support_first_meta : 1;  // protection info in first metadata bytes
  uint8_t support_last_meta : 1;   // protection info in last metadata bytes
  uint8_t reserved : 3;
};
// DPC is a single byte of the identify-namespace buffer.
static_assert(sizeof(dpc_t) == 1, "dpc_t must pack into exactly one byte");
85
// End-to-end Data Protection Type Settings (DPS)
// Indicates which type of E2E data protection is currently enabled.
struct dps_t {
  uint8_t protection_type : 3;  // 0 means protection is not enabled
  uint8_t protection_info : 1;  // placement of the protection information
  uint8_t reserved : 4;
};
// DPS is a single byte of the identify-namespace buffer.
static_assert(sizeof(dps_t) == 1, "dps_t must pack into exactly one byte");
93
// Namespace Features (NSFEAT)
// Indicates features of the namespace.
struct nsfeat_t {
  uint8_t thinp : 1;
  uint8_t nsabp : 1;
  uint8_t dae : 1;
  uint8_t uid_reuse : 1;
  uint8_t opterf : 1;  // Support NPWG, NPWA
  uint8_t reserved : 3;
};
// NSFEAT is a single byte of the identify-namespace buffer.
static_assert(sizeof(nsfeat_t) == 1,
              "nsfeat_t must pack into exactly one byte");
104
// LBA Format (LBAF)
// Indicates the LBA format (metadata size, data size, performance).
struct lbaf_t {
  uint32_t ms : 16;       // metadata size
  uint32_t lbads : 8;     // LBA data size, expressed as a power of two
  uint32_t rp : 2;        // relative performance
  uint32_t reserved : 6;
};
// Each LBA format descriptor is one dword of the identify-namespace buffer.
static_assert(sizeof(lbaf_t) == sizeof(uint32_t),
              "lbaf_t must pack into exactly one dword");
113
114 struct nvme_identify_namespace_data_t {
115 union {
116 struct {
117 uint8_t unused[24]; // [23:0]
118 nsfeat_t nsfeat; // [24]
119 uint8_t unused2[3]; // [27:25]
120 dpc_t dpc; // [28]
121 dps_t dps; // [29]
122 uint8_t unused3[34]; // [63:30]
123 uint16_t npwg; // [65:64]
124 uint16_t npwa; // [67:66]
125 uint8_t unused4[60]; // [127:68]
126 lbaf_t lbaf0; // [131:128]
127 };
128 uint8_t raw[4096];
129 };
130 };
131
/*
 * Typed view of a read/write IO command.
 *
 * The first ten dwords are left opaque (generic command header); the
 * remaining fields cover the four dwords that follow it.
 */
struct nvme_rw_command_t {
  uint32_t common_dw[10];       // dwords 0-9: generic command header

  uint64_t s_lba;               // dwords 10-11: starting LBA

  // dword 12
  uint32_t nlb : 16;            // 0's based value
  uint32_t reserved : 4;
  uint32_t d_type : 4;          // directive type, see DTYPE_STREAM
  uint32_t reserved2 : 2;
  uint32_t prinfo_prchk : 3;
  uint32_t prinfo_pract : 1;
  uint32_t fua : 1;
  uint32_t lr : 1;

  // dword 13
  uint32_t reserved3 : 16;
  uint32_t dspec : 16;          // directive specific value

  // Value for `d_type`. constexpr (implicitly inline in C++17) so the
  // constant can be odr-used without an out-of-class definition.
  static constexpr uint32_t DTYPE_STREAM = 1;
};
// The view is overlaid on the raw passthru command; keep its layout fixed.
static_assert(offsetof(nvme_rw_command_t, s_lba) == 10 * sizeof(uint32_t),
              "s_lba must start at dword 10");
static_assert(sizeof(nvme_rw_command_t) == 14 * sizeof(uint32_t),
              "nvme_rw_command_t must be exactly 14 dwords");
151
152 struct nvme_io_command_t {
153 union {
154 nvme_passthru_cmd common;
155 nvme_rw_command_t rw;
156 };
157 static const uint8_t OPCODE_WRITE = 0x01;
158 static const uint8_t OPCODE_READ = 0x01;
159 };
160
161 /*
162 * Implementation of NVMeBlockDevice with POSIX APIs
163 *
164 * NVMeBlockDevice provides NVMe SSD interfaces through POSIX APIs which is
165 * generally available at most operating environment.
166 */
167 class NVMeBlockDevice : public RBMDevice {
168 public:
169
170 /*
171 * Service NVMe device relative size
172 *
173 * size : total size of device in byte.
174 *
175 * block_size : IO unit size in byte. Caller should follow every IO command
176 * aligned with block size.
177 *
178 * preffered_write_granularity(PWG), preffered_write_alignment(PWA) : IO unit
179 * size for write in byte. Caller should request every write IO sized multiple
180 * times of PWG and aligned starting address by PWA. Available only if NVMe
181 * Device supports NVMe protocol 1.4 or later versions.
182 * atomic_write_unit : The maximum size of write whose atomicity is guranteed
183 * by SSD even on power failure. The write equal to or smaller than
184 * atomic_write_unit does not require fsync().
185 */
186
187 NVMeBlockDevice(std::string device_path) : device_path(device_path) {}
188 ~NVMeBlockDevice() = default;
189
190 open_ertr::future<> open(
191 const std::string &in_path,
192 seastar::open_flags mode) override;
193
194 write_ertr::future<> write(
195 uint64_t offset,
196 bufferptr &&bptr,
197 uint16_t stream = 0) override;
198
199 using RBMDevice::read;
200 read_ertr::future<> read(
201 uint64_t offset,
202 bufferptr &bptr) final;
203
204 close_ertr::future<> close() override;
205
206 discard_ertr::future<> discard(
207 uint64_t offset,
208 uint64_t len) override;
209
210 mount_ret mount() final;
211
212 mkfs_ret mkfs(device_config_t config) final {
213 using crimson::common::get_conf;
214 super.journal_size = get_conf<Option::size_t>("seastore_cbjournal_size");
215 return do_mkfs(config);
216 }
217
218 write_ertr::future<> writev(
219 uint64_t offset,
220 ceph::bufferlist bl,
221 uint16_t stream = 0) final;
222
223 stat_device_ret stat_device() final {
224 return seastar::file_stat(device_path, seastar::follow_symlink::yes
225 ).handle_exception([](auto e) -> stat_device_ret {
226 return crimson::ct_error::input_output_error::make();
227 }).then([this](auto stat) {
228 return seastar::open_file_dma(
229 device_path,
230 seastar::open_flags::rw | seastar::open_flags::dsync
231 ).then([this, stat](auto file) mutable {
232 return file.size().then([this, stat, file](auto size) mutable {
233 stat.size = size;
234 return identify_namespace(file
235 ).safe_then([stat] (auto id_namespace_data) mutable {
236 // LBA format provides LBA size which is power of 2. LBA is the
237 // minimum size of read and write.
238 stat.block_size = (1 << id_namespace_data.lbaf0.lbads);
239 if (stat.block_size < RBM_SUPERBLOCK_SIZE) {
240 stat.block_size = RBM_SUPERBLOCK_SIZE;
241 }
242 return stat_device_ret(
243 read_ertr::ready_future_marker{},
244 stat
245 );
246 }).handle_error(crimson::ct_error::input_output_error::handle(
247 [stat]{
248 return stat_device_ret(
249 read_ertr::ready_future_marker{},
250 stat
251 );
252 }), crimson::ct_error::pass_further_all{});
253 }).safe_then([file](auto st) mutable {
254 return file.close(
255 ).then([st] {
256 return stat_device_ret(
257 read_ertr::ready_future_marker{},
258 st
259 );
260 });
261 });
262 });
263 });
264 }
265
266 std::string get_device_path() const final {
267 return device_path;
268 }
269
270 uint64_t get_preffered_write_granularity() const { return write_granularity; }
271 uint64_t get_preffered_write_alignment() const { return write_alignment; }
272 uint64_t get_atomic_write_unit() const { return atomic_write_unit; }
273 /*
274 * End-to-End Data Protection
275 *
276 * NVMe device keeps track of data integrity similar with checksum. Client can
277 * offload checksuming to NVMe device to reduce its CPU utilization. If data
278 * protection is enabled, checksum is calculated on every write and used to
279 * verify data on every read.
280 */
281 bool is_data_protection_enabled() const { return data_protection_enabled; }
282
283 /*
284 * Data Health
285 *
286 * Returns list of LBAs which have almost corrupted data. Data of the LBAs
287 * will be corrupted very soon. Caller can overwrite, unmap or refresh data to
288 * protect data
289 */
290 virtual nvme_command_ertr::future<std::list<uint64_t>> get_data_health() {
291 std::list<uint64_t> fragile_lbas;
292 return nvme_command_ertr::future<std::list<uint64_t>>(
293 nvme_command_ertr::ready_future_marker{},
294 fragile_lbas
295 );
296 }
297
298 /*
299 * Recovery Level
300 *
301 * Regulate magnitude of SSD-internal data recovery. Caller can get good read
302 * latency with lower magnitude.
303 */
304 virtual nvme_command_ertr::future<> set_data_recovery_level(
305 uint32_t level) { return nvme_command_ertr::now(); }
306 /*
307 * For passsing through nvme IO or Admin command to SSD
308 * Caller can construct and execute its own nvme command
309 */
310 nvme_command_ertr::future<int> pass_admin(
311 nvme_admin_command_t& admin_cmd, seastar::file f);
312 nvme_command_ertr::future<int> pass_through_io(
313 nvme_io_command_t& io_cmd);
314
315 bool support_multistream = false;
316 uint8_t data_protection_type = 0;
317
318 /*
319 * Predictable Latency
320 *
321 * NVMe device can guarantee IO latency within pre-defined time window. This
322 * functionality will be analyzed soon.
323 */
324
325 private:
326 // identify_controller/namespace are used to get SSD internal information such
327 // as supported features, NPWG and NPWA
328 nvme_command_ertr::future<nvme_identify_controller_data_t>
329 identify_controller(seastar::file f);
330 nvme_command_ertr::future<nvme_identify_namespace_data_t>
331 identify_namespace(seastar::file f);
332 nvme_command_ertr::future<int> get_nsid(seastar::file f);
333 open_ertr::future<> open_for_io(
334 const std::string& in_path,
335 seastar::open_flags mode);
336
337 seastar::file device;
338 std::vector<seastar::file> io_device;
339 uint32_t stream_index_to_open = WRITE_LIFE_NOT_SET;
340 uint32_t stream_id_count = 1; // stream is disabled, defaultly.
341 uint32_t awupf = 0;
342
343 uint64_t write_granularity = 4096;
344 uint64_t write_alignment = 4096;
345 uint32_t atomic_write_unit = 4096;
346
347 bool data_protection_enabled = false;
348 std::string device_path;
349 };
350
351 }