]>
Commit | Line | Data |
---|---|---|
31f18b77 FG |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | /* | |
4 | * Ceph - scalable distributed file system | |
5 | * | |
6 | * Copyright (C) 2015 Intel <jianpeng.ma@intel.com> | |
7 | * | |
8 | * Author: Jianpeng Ma <jianpeng.ma@intel.com> | |
9 | * | |
10 | * This is free software; you can redistribute it and/or | |
11 | * modify it under the terms of the GNU Lesser General Public | |
12 | * License version 2.1, as published by the Free Software | |
13 | * Foundation. See file COPYING. | |
14 | * | |
15 | */ | |
16 | ||
17 | #include <unistd.h> | |
18 | #include <stdlib.h> | |
19 | #include <sys/types.h> | |
20 | #include <sys/stat.h> | |
1e59de90 TL |
21 | #include <sys/sysmacros.h> |
22 | #include <stdio.h> | |
23 | #include <errno.h> | |
24 | #include <fcntl.h> | |
25 | #include <string.h> | |
26 | #include <filesystem> | |
27 | #include <fstream> | |
28 | ||
29 | #include <fmt/format.h> | |
31f18b77 FG |
30 | |
31 | #include "PMEMDevice.h" | |
11fdf7f2 | 32 | #include "libpmem.h" |
31f18b77 FG |
33 | #include "include/types.h" |
34 | #include "include/compat.h" | |
35 | #include "include/stringify.h" | |
36 | #include "common/errno.h" | |
37 | #include "common/debug.h" | |
38 | #include "common/blkdev.h" | |
39 | ||
1e59de90 TL |
40 | #if defined(HAVE_LIBDML) |
41 | #include <dml/dml.hpp> | |
42 | using execution_path = dml::automatic; | |
43 | #endif | |
44 | ||
31f18b77 FG |
45 | #define dout_context cct |
46 | #define dout_subsys ceph_subsys_bdev | |
47 | #undef dout_prefix | |
48 | #define dout_prefix *_dout << "bdev-PMEM(" << path << ") " | |
49 | ||
50 | PMEMDevice::PMEMDevice(CephContext *cct, aio_callback_t cb, void *cbpriv) | |
11fdf7f2 | 51 | : BlockDevice(cct, cb, cbpriv), |
31f18b77 | 52 | fd(-1), addr(0), |
31f18b77 FG |
53 | injecting_crash(0) |
54 | { | |
55 | } | |
56 | ||
57 | int PMEMDevice::_lock() | |
58 | { | |
59 | struct flock l; | |
60 | memset(&l, 0, sizeof(l)); | |
61 | l.l_type = F_WRLCK; | |
62 | l.l_whence = SEEK_SET; | |
63 | l.l_start = 0; | |
64 | l.l_len = 0; | |
65 | int r = ::fcntl(fd, F_SETLK, &l); | |
66 | if (r < 0) | |
67 | return -errno; | |
68 | return 0; | |
69 | } | |
70 | ||
1e59de90 TL |
/**
 * Decide whether @p pmem_file is a device-DAX character device.
 *
 * The file must be a character device, and the sysfs entry
 * /sys/dev/char/<major>:<minor>/subsystem must be a symlink whose final
 * component is "dax".
 *
 * All filesystem queries use the non-throwing (std::error_code) overloads,
 * so a missing or unexpected sysfs layout is reported as -EINVAL instead of
 * escaping as a std::filesystem::filesystem_error.
 *
 * @param fd          descriptor already opened on @p pmem_file; only used
 *                    for fstat() to obtain the device number
 * @param pmem_file   path of the file to inspect
 * @param total_size  optional out-parameter; when non-null it receives the
 *                    number parsed from the sysfs "size" attribute. It is
 *                    written only when the whole check succeeds.
 * @return 0 if the file is a DAX character device (and, when requested, its
 *         size was read), -EINVAL in every other case.
 */
static int pmem_check_file_type(int fd, const char *pmem_file, uint64_t *total_size)
{
  namespace fs = std::filesystem;

  if (pmem_file == nullptr) {
    return -EINVAL;
  }

  std::error_code ec;
  if (!fs::is_character_file(pmem_file, ec)) {
    return -EINVAL;
  }

  struct stat file_stat;
  if (::fstat(fd, &file_stat)) {
    return -EINVAL;
  }

  const fs::path char_dir = "/sys/dev/char/" +
    std::to_string(major(file_stat.st_rdev)) + ":" +
    std::to_string(minor(file_stat.st_rdev));

  // Need to check if it is a DAX device
  const fs::path subsys_target = fs::read_symlink(char_dir / "subsystem", ec);
  if (ec || subsys_target.filename().string() != "dax") {
    return -EINVAL;
  }

  if (total_size == nullptr) {
    return 0;
  }

  // Parse into a local first so the caller's value survives a failed read.
  std::ifstream size_file(char_dir / "size");
  uint64_t parsed_size = 0;
  if (!(size_file >> parsed_size)) {
    return -EINVAL;
  }
  *total_size = parsed_size;
  return 0;
}
99 | ||
20effc67 | 100 | int PMEMDevice::open(const std::string& p) |
31f18b77 FG |
101 | { |
102 | path = p; | |
103 | int r = 0; | |
104 | dout(1) << __func__ << " path " << path << dendl; | |
105 | ||
91327a77 | 106 | fd = ::open(path.c_str(), O_RDWR | O_CLOEXEC); |
31f18b77 FG |
107 | if (fd < 0) { |
108 | r = -errno; | |
109 | derr << __func__ << " open got: " << cpp_strerror(r) << dendl; | |
110 | return r; | |
111 | } | |
112 | ||
1e59de90 TL |
113 | r = pmem_check_file_type(fd, path.c_str(), &size); |
114 | if (!r) { | |
115 | dout(1) << __func__ << " This path " << path << " is a devdax dev " << dendl; | |
116 | devdax_device = true; | |
117 | // If using devdax char device, set it to not rotational device. | |
118 | rotational = false; | |
119 | } | |
120 | ||
31f18b77 FG |
121 | r = _lock(); |
122 | if (r < 0) { | |
123 | derr << __func__ << " failed to lock " << path << ": " << cpp_strerror(r) | |
124 | << dendl; | |
125 | goto out_fail; | |
126 | } | |
127 | ||
128 | struct stat st; | |
129 | r = ::fstat(fd, &st); | |
130 | if (r < 0) { | |
131 | r = -errno; | |
132 | derr << __func__ << " fstat got " << cpp_strerror(r) << dendl; | |
133 | goto out_fail; | |
134 | } | |
31f18b77 FG |
135 | |
136 | size_t map_len; | |
1e59de90 TL |
137 | addr = (char *)pmem_map_file(path.c_str(), 0, |
138 | devdax_device ? 0: PMEM_FILE_EXCL, O_RDWR, | |
139 | &map_len, NULL); | |
31f18b77 | 140 | if (addr == NULL) { |
11fdf7f2 | 141 | derr << __func__ << " pmem_map_file failed: " << pmem_errormsg() << dendl; |
31f18b77 FG |
142 | goto out_fail; |
143 | } | |
144 | size = map_len; | |
145 | ||
146 | // Operate as though the block size is 4 KB. The backing file | |
147 | // blksize doesn't strictly matter except that some file systems may | |
148 | // require a read/modify/write if we write something smaller than | |
149 | // it. | |
11fdf7f2 | 150 | block_size = g_conf()->bdev_block_size; |
31f18b77 FG |
151 | if (block_size != (unsigned)st.st_blksize) { |
152 | dout(1) << __func__ << " backing device/file reports st_blksize " | |
153 | << st.st_blksize << ", using bdev_block_size " | |
154 | << block_size << " anyway" << dendl; | |
155 | } | |
156 | ||
157 | dout(1) << __func__ | |
158 | << " size " << size | |
1adf2230 | 159 | << " (" << byte_u_t(size) << ")" |
31f18b77 | 160 | << " block_size " << block_size |
1adf2230 | 161 | << " (" << byte_u_t(block_size) << ")" |
31f18b77 FG |
162 | << dendl; |
163 | return 0; | |
164 | ||
165 | out_fail: | |
166 | VOID_TEMP_FAILURE_RETRY(::close(fd)); | |
167 | fd = -1; | |
168 | return r; | |
169 | } | |
170 | ||
171 | void PMEMDevice::close() | |
172 | { | |
173 | dout(1) << __func__ << dendl; | |
174 | ||
11fdf7f2 | 175 | ceph_assert(addr != NULL); |
1e59de90 TL |
176 | if (devdax_device) { |
177 | devdax_device = false; | |
178 | } | |
31f18b77 | 179 | pmem_unmap(addr, size); |
1e59de90 | 180 | |
11fdf7f2 | 181 | ceph_assert(fd >= 0); |
31f18b77 FG |
182 | VOID_TEMP_FAILURE_RETRY(::close(fd)); |
183 | fd = -1; | |
184 | ||
185 | path.clear(); | |
186 | } | |
187 | ||
20effc67 | 188 | int PMEMDevice::collect_metadata(const std::string& prefix, std::map<std::string,std::string> *pm) const |
31f18b77 FG |
189 | { |
190 | (*pm)[prefix + "rotational"] = stringify((int)(bool)rotational); | |
191 | (*pm)[prefix + "size"] = stringify(get_size()); | |
192 | (*pm)[prefix + "block_size"] = stringify(get_block_size()); | |
193 | (*pm)[prefix + "driver"] = "PMEMDevice"; | |
194 | (*pm)[prefix + "type"] = "ssd"; | |
195 | ||
196 | struct stat st; | |
197 | int r = ::fstat(fd, &st); | |
198 | if (r < 0) | |
199 | return -errno; | |
200 | if (S_ISBLK(st.st_mode)) { | |
201 | (*pm)[prefix + "access_mode"] = "blk"; | |
11fdf7f2 | 202 | char buffer[1024] = {0}; |
9f95a23c | 203 | BlkDev blkdev(fd); |
11fdf7f2 TL |
204 | |
205 | blkdev.model(buffer, sizeof(buffer)); | |
206 | (*pm)[prefix + "model"] = buffer; | |
207 | ||
208 | buffer[0] = '\0'; | |
209 | blkdev.dev(buffer, sizeof(buffer)); | |
210 | (*pm)[prefix + "dev"] = buffer; | |
211 | ||
212 | // nvme exposes a serial number | |
213 | buffer[0] = '\0'; | |
214 | blkdev.serial(buffer, sizeof(buffer)); | |
215 | (*pm)[prefix + "serial"] = buffer; | |
216 | ||
1e59de90 TL |
217 | } else if (S_ISCHR(st.st_mode)) { |
218 | (*pm)[prefix + "access_mode"] = "chardevice"; | |
219 | (*pm)[prefix + "path"] = path; | |
220 | ||
31f18b77 FG |
221 | } else { |
222 | (*pm)[prefix + "access_mode"] = "file"; | |
223 | (*pm)[prefix + "path"] = path; | |
224 | } | |
225 | return 0; | |
226 | } | |
227 | ||
f67539c2 TL |
228 | bool PMEMDevice::support(const std::string &path) |
229 | { | |
230 | int is_pmem = 0; | |
231 | size_t map_len = 0; | |
1e59de90 TL |
232 | int r = 0; |
233 | int local_fd; | |
234 | ||
235 | local_fd = ::open(path.c_str(), O_RDWR); | |
236 | if (local_fd < 0) { | |
237 | return false; | |
238 | } | |
239 | ||
240 | r = pmem_check_file_type(local_fd, path.c_str(), NULL); | |
241 | VOID_TEMP_FAILURE_RETRY(::close(local_fd)); | |
242 | int flags = PMEM_FILE_EXCL; | |
243 | if (r == 0) { | |
244 | flags = 0; | |
245 | } | |
246 | ||
247 | void *addr = pmem_map_file(path.c_str(), 0, flags, O_RDONLY, &map_len, &is_pmem); | |
f67539c2 | 248 | if (addr != NULL) { |
1e59de90 | 249 | pmem_unmap(addr, map_len); |
f67539c2 TL |
250 | if (is_pmem) { |
251 | return true; | |
252 | } | |
f67539c2 | 253 | } |
1e59de90 | 254 | |
f67539c2 TL |
255 | return false; |
256 | } | |
257 | ||
31f18b77 FG |
258 | int PMEMDevice::flush() |
259 | { | |
260 | //Because all write is persist. So no need | |
261 | return 0; | |
262 | } | |
263 | ||
264 | ||
265 | void PMEMDevice::aio_submit(IOContext *ioc) | |
266 | { | |
11fdf7f2 TL |
267 | if (ioc->priv) { |
268 | ceph_assert(ioc->num_running == 0); | |
269 | aio_callback(aio_callback_priv, ioc->priv); | |
270 | } else { | |
271 | ioc->try_aio_wake(); | |
272 | } | |
31f18b77 FG |
273 | return; |
274 | } | |
275 | ||
9f95a23c | 276 | int PMEMDevice::write(uint64_t off, bufferlist& bl, bool buffered, int write_hint) |
31f18b77 FG |
277 | { |
278 | uint64_t len = bl.length(); | |
279 | dout(20) << __func__ << " " << off << "~" << len << dendl; | |
11fdf7f2 | 280 | ceph_assert(is_valid_io(off, len)); |
31f18b77 | 281 | |
20effc67 | 282 | dout(40) << "data:\n"; |
31f18b77 FG |
283 | bl.hexdump(*_dout); |
284 | *_dout << dendl; | |
285 | ||
11fdf7f2 TL |
286 | if (g_conf()->bdev_inject_crash && |
287 | rand() % g_conf()->bdev_inject_crash == 0) { | |
31f18b77 FG |
288 | derr << __func__ << " bdev_inject_crash: dropping io " << off << "~" << len |
289 | << dendl; | |
290 | ++injecting_crash; | |
291 | return 0; | |
292 | } | |
293 | ||
294 | bufferlist::iterator p = bl.begin(); | |
f67539c2 | 295 | uint64_t off1 = off; |
31f18b77 FG |
296 | while (len) { |
297 | const char *data; | |
298 | uint32_t l = p.get_ptr_and_advance(len, &data); | |
1e59de90 TL |
299 | |
300 | #if defined(HAVE_LIBDML) | |
301 | // Take care of the persistency issue | |
302 | auto result = dml::execute<execution_path>(dml::mem_move, dml::make_view(data, l), dml::make_view(addr + off1, l)); | |
303 | ceph_assert(result.status == dml::status_code::ok); | |
304 | #else | |
31f18b77 | 305 | pmem_memcpy_persist(addr + off1, data, l); |
1e59de90 | 306 | #endif |
31f18b77 FG |
307 | len -= l; |
308 | off1 += l; | |
309 | } | |
31f18b77 FG |
310 | return 0; |
311 | } | |
312 | ||
313 | int PMEMDevice::aio_write( | |
314 | uint64_t off, | |
315 | bufferlist &bl, | |
316 | IOContext *ioc, | |
11fdf7f2 | 317 | bool buffered, |
9f95a23c | 318 | int write_hint) |
31f18b77 FG |
319 | { |
320 | return write(off, bl, buffered); | |
321 | } | |
322 | ||
323 | ||
324 | int PMEMDevice::read(uint64_t off, uint64_t len, bufferlist *pbl, | |
325 | IOContext *ioc, | |
326 | bool buffered) | |
327 | { | |
328 | dout(5) << __func__ << " " << off << "~" << len << dendl; | |
11fdf7f2 | 329 | ceph_assert(is_valid_io(off, len)); |
31f18b77 | 330 | |
11fdf7f2 | 331 | bufferptr p = buffer::create_small_page_aligned(len); |
1e59de90 TL |
332 | |
333 | #if defined(HAVE_LIBDML) | |
334 | auto result = dml::execute<execution_path>(dml::mem_move, dml::make_view(addr + off, len), dml::make_view(p.c_str(), len)); | |
335 | ceph_assert(result.status == dml::status_code::ok); | |
336 | #else | |
31f18b77 | 337 | memcpy(p.c_str(), addr + off, len); |
1e59de90 | 338 | #endif |
31f18b77 FG |
339 | |
340 | pbl->clear(); | |
341 | pbl->push_back(std::move(p)); | |
342 | ||
20effc67 | 343 | dout(40) << "data:\n"; |
31f18b77 FG |
344 | pbl->hexdump(*_dout); |
345 | *_dout << dendl; | |
346 | ||
347 | return 0; | |
348 | } | |
349 | ||
350 | int PMEMDevice::aio_read(uint64_t off, uint64_t len, bufferlist *pbl, | |
351 | IOContext *ioc) | |
352 | { | |
353 | return read(off, len, pbl, ioc, false); | |
354 | } | |
355 | ||
356 | int PMEMDevice::read_random(uint64_t off, uint64_t len, char *buf, bool buffered) | |
357 | { | |
11fdf7f2 TL |
358 | dout(5) << __func__ << " " << off << "~" << len << dendl; |
359 | ceph_assert(is_valid_io(off, len)); | |
31f18b77 | 360 | |
1e59de90 TL |
361 | |
362 | #if defined(HAVE_LIBDML) | |
363 | auto result = dml::execute<execution_path>(dml::mem_move, dml::make_view(addr + off, len), dml::make_view(buf, len)); | |
364 | ceph_assert(result.status == dml::status_code::ok); | |
365 | #else | |
31f18b77 | 366 | memcpy(buf, addr + off, len); |
1e59de90 | 367 | #endif |
31f18b77 FG |
368 | return 0; |
369 | } | |
370 | ||
371 | ||
372 | int PMEMDevice::invalidate_cache(uint64_t off, uint64_t len) | |
373 | { | |
374 | dout(5) << __func__ << " " << off << "~" << len << dendl; | |
375 | return 0; | |
376 | } | |
377 | ||
378 |