1 //-*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
9 #include <seastar/core/file.hh>
10 #include <linux/nvme_ioctl.h>
12 #include "crimson/osd/exceptions.h"
13 #include "crimson/common/layout.h"
14 #include "rbm_device.h"
22 namespace crimson::os::seastore::random_block_device::nvme
{
24 * NVMe protocol structures (nvme_XX, identify_XX)
26 * All structures related to the NVMe protocol follow NVMe protocol v1.4
27 * (latest). NVMe is a protocol for fast interfacing between user and SSD device.
28 * We selectively adopted features among various NVMe features to ease
29 * implementation. And also, NVMeBlockDevice provides generic command submission
30 * APIs for IO and Admin commands. Please use pass_through_io() and pass_admin()
33 * For more information about the NVMe protocol, refer to https://nvmexpress.org/
35 struct nvme_identify_command_t
{
36 uint32_t common_dw
[10];
39 uint32_t reserved
: 8;
42 static const uint8_t CNS_NAMESPACE
= 0x00;
43 static const uint8_t CNS_CONTROLLER
= 0x01;
46 struct nvme_admin_command_t
{
48 nvme_passthru_cmd common
;
49 nvme_identify_command_t identify
;
52 static const uint8_t OPCODE_IDENTIFY
= 0x06;
55 // Optional Admin Command Support (OACS)
56 // Indicates optional commands are supported by SSD or not
59 uint16_t support_directives
: 1; // Support multi-stream
60 uint16_t unused2
: 10;
63 struct nvme_identify_controller_data_t
{
66 uint8_t unused
[256]; // [255:0]
67 oacs_t oacs
; // [257:256]
68 uint8_t unused2
[270]; // [527:258]
69 uint16_t awupf
; // [529:528]
75 // End-to-end Data Protection Capabilities (DPC)
76 // Indicates type of E2E data protection supported by SSD
78 uint8_t support_type1
: 1;
79 uint8_t support_type2
: 1;
80 uint8_t support_type3
: 1;
81 uint8_t support_first_meta
: 1;
82 uint8_t support_last_meta
: 1;
86 // End-to-end Data Protection Type Settings (DPS)
87 // Indicates enabled type of E2E data protection
89 uint8_t protection_type
: 3;
90 uint8_t protection_info
: 1;
94 // Namespace Features (NSFEAT)
95 // Indicates features of namespace
100 uint8_t uid_reuse
: 1;
101 uint8_t opterf
: 1; // Support NPWG, NPWA
102 uint8_t reserved
: 3;
106 // Indicates LBA format (metadata size, data size, performance)
111 uint32_t reserved
: 6;
114 struct nvme_identify_namespace_data_t
{
117 uint8_t unused
[24]; // [23:0]
118 nsfeat_t nsfeat
; // [24]
119 uint8_t unused2
[3]; // [27:25]
122 uint8_t unused3
[34]; // [63:30]
123 uint16_t npwg
; // [65:64]
124 uint16_t npwa
; // [67:66]
125 uint8_t unused4
[60]; // [127:68]
126 lbaf_t lbaf0
; // [131:128]
132 struct nvme_rw_command_t
{
133 uint32_t common_dw
[10];
137 uint32_t nlb
: 16; // 0's based value
138 uint32_t reserved
: 4;
140 uint32_t reserved2
: 2;
141 uint32_t prinfo_prchk
: 3;
142 uint32_t prinfo_pract
: 1;
146 uint32_t reserved3
: 16;
149 static const uint32_t DTYPE_STREAM
= 1;
152 struct nvme_io_command_t
{
154 nvme_passthru_cmd common
;
155 nvme_rw_command_t rw
;
157 static const uint8_t OPCODE_WRITE
= 0x01;
// NVM command set opcode for Read. It must differ from OPCODE_WRITE (0x01);
// otherwise a pass-through "read" would be submitted to the device as a write.
static const uint8_t OPCODE_READ = 0x02;
162 * Implementation of NVMeBlockDevice with POSIX APIs
164 * NVMeBlockDevice provides NVMe SSD interfaces through POSIX APIs, which are
165 * generally available in most operating environments.
167 class NVMeBlockDevice
: public RBMDevice
{
171 * Service NVMe device relative size
173 * size : total size of device in byte.
175 * block_size : IO unit size in byte. Caller should follow every IO command
176 * aligned with block size.
178 * preffered_write_granularity(PWG), preffered_write_alignment(PWA) : IO unit
179 * size for write in byte. Caller should request every write IO sized multiple
180 * times of PWG and aligned starting address by PWA. Available only if NVMe
181 * Device supports NVMe protocol 1.4 or later versions.
182 * atomic_write_unit : The maximum size of write whose atomicity is guaranteed
183 * by SSD even on power failure. The write equal to or smaller than
184 * atomic_write_unit does not require fsync().
187 NVMeBlockDevice(std::string device_path
) : device_path(device_path
) {}
188 ~NVMeBlockDevice() = default;
190 open_ertr::future
<> open(
191 const std::string
&in_path
,
192 seastar::open_flags mode
) override
;
194 write_ertr::future
<> write(
197 uint16_t stream
= 0) override
;
199 using RBMDevice::read
;
200 read_ertr::future
<> read(
202 bufferptr
&bptr
) final
;
204 close_ertr::future
<> close() override
;
206 discard_ertr::future
<> discard(
208 uint64_t len
) override
;
210 mount_ret
mount() final
;
212 mkfs_ret
mkfs(device_config_t config
) final
{
213 using crimson::common::get_conf
;
214 super
.journal_size
= get_conf
<Option::size_t>("seastore_cbjournal_size");
215 return do_mkfs(config
);
218 write_ertr::future
<> writev(
221 uint16_t stream
= 0) final
;
223 stat_device_ret
stat_device() final
{
224 return seastar::file_stat(device_path
, seastar::follow_symlink::yes
225 ).handle_exception([](auto e
) -> stat_device_ret
{
226 return crimson::ct_error::input_output_error::make();
227 }).then([this](auto stat
) {
228 return seastar::open_file_dma(
230 seastar::open_flags::rw
| seastar::open_flags::dsync
231 ).then([this, stat
](auto file
) mutable {
232 return file
.size().then([this, stat
, file
](auto size
) mutable {
234 return identify_namespace(file
235 ).safe_then([stat
] (auto id_namespace_data
) mutable {
236 // LBA format provides LBA size which is power of 2. LBA is the
237 // minimum size of read and write.
238 stat
.block_size
= (1 << id_namespace_data
.lbaf0
.lbads
);
239 if (stat
.block_size
< RBM_SUPERBLOCK_SIZE
) {
240 stat
.block_size
= RBM_SUPERBLOCK_SIZE
;
242 return stat_device_ret(
243 read_ertr::ready_future_marker
{},
246 }).handle_error(crimson::ct_error::input_output_error::handle(
248 return stat_device_ret(
249 read_ertr::ready_future_marker
{},
252 }), crimson::ct_error::pass_further_all
{});
253 }).safe_then([file
](auto st
) mutable {
256 return stat_device_ret(
257 read_ertr::ready_future_marker
{},
266 std::string
get_device_path() const final
{
270 uint64_t get_preffered_write_granularity() const { return write_granularity
; }
271 uint64_t get_preffered_write_alignment() const { return write_alignment
; }
272 uint64_t get_atomic_write_unit() const { return atomic_write_unit
; }
274 * End-to-End Data Protection
276 * NVMe device keeps track of data integrity similar to a checksum. Client can
277 * offload checksumming to NVMe device to reduce its CPU utilization. If data
278 * protection is enabled, checksum is calculated on every write and used to
279 * verify data on every read.
281 bool is_data_protection_enabled() const { return data_protection_enabled
; }
286 * Returns list of LBAs which have almost corrupted data. Data of the LBAs
287 * will be corrupted very soon. Caller can overwrite, unmap or refresh data to
290 virtual nvme_command_ertr::future
<std::list
<uint64_t>> get_data_health() {
291 std::list
<uint64_t> fragile_lbas
;
292 return nvme_command_ertr::future
<std::list
<uint64_t>>(
293 nvme_command_ertr::ready_future_marker
{},
301 * Regulate magnitude of SSD-internal data recovery. Caller can get good read
302 * latency with lower magnitude.
304 virtual nvme_command_ertr::future
<> set_data_recovery_level(
305 uint32_t level
) { return nvme_command_ertr::now(); }
307 * For passing through nvme IO or Admin command to SSD
308 * Caller can construct and execute its own nvme command
310 nvme_command_ertr::future
<int> pass_admin(
311 nvme_admin_command_t
& admin_cmd
, seastar::file f
);
312 nvme_command_ertr::future
<int> pass_through_io(
313 nvme_io_command_t
& io_cmd
);
315 bool support_multistream
= false;
316 uint8_t data_protection_type
= 0;
319 * Predictable Latency
321 * NVMe device can guarantee IO latency within pre-defined time window. This
322 * functionality will be analyzed soon.
326 // identify_controller/namespace are used to get SSD internal information such
327 // as supported features, NPWG and NPWA
328 nvme_command_ertr::future
<nvme_identify_controller_data_t
>
329 identify_controller(seastar::file f
);
330 nvme_command_ertr::future
<nvme_identify_namespace_data_t
>
331 identify_namespace(seastar::file f
);
332 nvme_command_ertr::future
<int> get_nsid(seastar::file f
);
333 open_ertr::future
<> open_for_io(
334 const std::string
& in_path
,
335 seastar::open_flags mode
);
337 seastar::file device
;
338 std::vector
<seastar::file
> io_device
;
339 uint32_t stream_index_to_open
= WRITE_LIFE_NOT_SET
;
340 uint32_t stream_id_count
= 1; // stream is disabled, defaultly.
343 uint64_t write_granularity
= 4096;
344 uint64_t write_alignment
= 4096;
345 uint32_t atomic_write_unit
= 4096;
347 bool data_protection_enabled
= false;
348 std::string device_path
;