]> git.proxmox.com Git - ceph.git/blame - ceph/src/blk/pmem/PMEMDevice.cc
update ceph source to reef 18.1.2
[ceph.git] / ceph / src / blk / pmem / PMEMDevice.cc
CommitLineData
31f18b77
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2015 Intel <jianpeng.ma@intel.com>
7 *
8 * Author: Jianpeng Ma <jianpeng.ma@intel.com>
9 *
10 * This is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License version 2.1, as published by the Free Software
13 * Foundation. See file COPYING.
14 *
15 */
16
17#include <unistd.h>
18#include <stdlib.h>
19#include <sys/types.h>
20#include <sys/stat.h>
1e59de90
TL
21#include <sys/sysmacros.h>
22#include <stdio.h>
23#include <errno.h>
24#include <fcntl.h>
25#include <string.h>
26#include <filesystem>
27#include <fstream>
28
29#include <fmt/format.h>
31f18b77
FG
30
31#include "PMEMDevice.h"
11fdf7f2 32#include "libpmem.h"
31f18b77
FG
33#include "include/types.h"
34#include "include/compat.h"
35#include "include/stringify.h"
36#include "common/errno.h"
37#include "common/debug.h"
38#include "common/blkdev.h"
39
1e59de90
TL
40#if defined(HAVE_LIBDML)
41#include <dml/dml.hpp>
42using execution_path = dml::automatic;
43#endif
44
31f18b77
FG
45#define dout_context cct
46#define dout_subsys ceph_subsys_bdev
47#undef dout_prefix
48#define dout_prefix *_dout << "bdev-PMEM(" << path << ") "
49
50PMEMDevice::PMEMDevice(CephContext *cct, aio_callback_t cb, void *cbpriv)
11fdf7f2 51 : BlockDevice(cct, cb, cbpriv),
31f18b77 52 fd(-1), addr(0),
31f18b77
FG
53 injecting_crash(0)
54{
55}
56
57int PMEMDevice::_lock()
58{
59 struct flock l;
60 memset(&l, 0, sizeof(l));
61 l.l_type = F_WRLCK;
62 l.l_whence = SEEK_SET;
63 l.l_start = 0;
64 l.l_len = 0;
65 int r = ::fcntl(fd, F_SETLK, &l);
66 if (r < 0)
67 return -errno;
68 return 0;
69}
70
1e59de90
TL
// Decide whether pmem_file (already opened as fd) is a devdax character
// device.
//
//   fd          descriptor fstat()ed to obtain the device major/minor numbers
//   pmem_file   path tested for being a character special file
//   total_size  when non-null, receives the number read from the device's
//               sysfs "size" attribute
//
// Returns 0 when the path is a character device whose sysfs "subsystem" link
// points at "dax" (and, if requested, the size was parsed); -EINVAL in every
// other case.  The non-throwing std::filesystem overloads are used so that a
// missing or unreadable sysfs entry is reported as -EINVAL instead of
// escaping as a std::filesystem::filesystem_error.
static int pmem_check_file_type(int fd, const char *pmem_file, uint64_t *total_size)
{
  namespace fs = std::filesystem;
  std::error_code ec;

  // The error_code overload returns false on failure as well.
  if (!fs::is_character_file(pmem_file, ec)) {
    return -EINVAL;
  }

  struct stat file_stat;
  if (::fstat(fd, &file_stat)) {
    return -EINVAL;
  }

  // /sys/dev/char/<major>:<minor>
  const fs::path char_dir = fs::path("/sys/dev/char") /
    (std::to_string(major(file_stat.st_rdev)) + ":" +
     std::to_string(minor(file_stat.st_rdev)));

  // Need to check if it is a DAX device
  const fs::path subsys_target = fs::read_symlink(char_dir / "subsystem", ec);
  if (ec || subsys_target.filename().string() != "dax") {
    return -EINVAL;
  }

  if (total_size == nullptr) {
    return 0;
  }

  std::ifstream size_file(char_dir / "size");
  if (!size_file) {
    return -EINVAL;
  }
  size_file >> *total_size;
  return size_file ? 0 : -EINVAL;
}
99
20effc67 100int PMEMDevice::open(const std::string& p)
31f18b77
FG
101{
102 path = p;
103 int r = 0;
104 dout(1) << __func__ << " path " << path << dendl;
105
91327a77 106 fd = ::open(path.c_str(), O_RDWR | O_CLOEXEC);
31f18b77
FG
107 if (fd < 0) {
108 r = -errno;
109 derr << __func__ << " open got: " << cpp_strerror(r) << dendl;
110 return r;
111 }
112
1e59de90
TL
113 r = pmem_check_file_type(fd, path.c_str(), &size);
114 if (!r) {
115 dout(1) << __func__ << " This path " << path << " is a devdax dev " << dendl;
116 devdax_device = true;
117 // If using devdax char device, set it to not rotational device.
118 rotational = false;
119 }
120
31f18b77
FG
121 r = _lock();
122 if (r < 0) {
123 derr << __func__ << " failed to lock " << path << ": " << cpp_strerror(r)
124 << dendl;
125 goto out_fail;
126 }
127
128 struct stat st;
129 r = ::fstat(fd, &st);
130 if (r < 0) {
131 r = -errno;
132 derr << __func__ << " fstat got " << cpp_strerror(r) << dendl;
133 goto out_fail;
134 }
31f18b77
FG
135
136 size_t map_len;
1e59de90
TL
137 addr = (char *)pmem_map_file(path.c_str(), 0,
138 devdax_device ? 0: PMEM_FILE_EXCL, O_RDWR,
139 &map_len, NULL);
31f18b77 140 if (addr == NULL) {
11fdf7f2 141 derr << __func__ << " pmem_map_file failed: " << pmem_errormsg() << dendl;
31f18b77
FG
142 goto out_fail;
143 }
144 size = map_len;
145
146 // Operate as though the block size is 4 KB. The backing file
147 // blksize doesn't strictly matter except that some file systems may
148 // require a read/modify/write if we write something smaller than
149 // it.
11fdf7f2 150 block_size = g_conf()->bdev_block_size;
31f18b77
FG
151 if (block_size != (unsigned)st.st_blksize) {
152 dout(1) << __func__ << " backing device/file reports st_blksize "
153 << st.st_blksize << ", using bdev_block_size "
154 << block_size << " anyway" << dendl;
155 }
156
157 dout(1) << __func__
158 << " size " << size
1adf2230 159 << " (" << byte_u_t(size) << ")"
31f18b77 160 << " block_size " << block_size
1adf2230 161 << " (" << byte_u_t(block_size) << ")"
31f18b77
FG
162 << dendl;
163 return 0;
164
165 out_fail:
166 VOID_TEMP_FAILURE_RETRY(::close(fd));
167 fd = -1;
168 return r;
169}
170
171void PMEMDevice::close()
172{
173 dout(1) << __func__ << dendl;
174
11fdf7f2 175 ceph_assert(addr != NULL);
1e59de90
TL
176 if (devdax_device) {
177 devdax_device = false;
178 }
31f18b77 179 pmem_unmap(addr, size);
1e59de90 180
11fdf7f2 181 ceph_assert(fd >= 0);
31f18b77
FG
182 VOID_TEMP_FAILURE_RETRY(::close(fd));
183 fd = -1;
184
185 path.clear();
186}
187
20effc67 188int PMEMDevice::collect_metadata(const std::string& prefix, std::map<std::string,std::string> *pm) const
31f18b77
FG
189{
190 (*pm)[prefix + "rotational"] = stringify((int)(bool)rotational);
191 (*pm)[prefix + "size"] = stringify(get_size());
192 (*pm)[prefix + "block_size"] = stringify(get_block_size());
193 (*pm)[prefix + "driver"] = "PMEMDevice";
194 (*pm)[prefix + "type"] = "ssd";
195
196 struct stat st;
197 int r = ::fstat(fd, &st);
198 if (r < 0)
199 return -errno;
200 if (S_ISBLK(st.st_mode)) {
201 (*pm)[prefix + "access_mode"] = "blk";
11fdf7f2 202 char buffer[1024] = {0};
9f95a23c 203 BlkDev blkdev(fd);
11fdf7f2
TL
204
205 blkdev.model(buffer, sizeof(buffer));
206 (*pm)[prefix + "model"] = buffer;
207
208 buffer[0] = '\0';
209 blkdev.dev(buffer, sizeof(buffer));
210 (*pm)[prefix + "dev"] = buffer;
211
212 // nvme exposes a serial number
213 buffer[0] = '\0';
214 blkdev.serial(buffer, sizeof(buffer));
215 (*pm)[prefix + "serial"] = buffer;
216
1e59de90
TL
217 } else if (S_ISCHR(st.st_mode)) {
218 (*pm)[prefix + "access_mode"] = "chardevice";
219 (*pm)[prefix + "path"] = path;
220
31f18b77
FG
221 } else {
222 (*pm)[prefix + "access_mode"] = "file";
223 (*pm)[prefix + "path"] = path;
224 }
225 return 0;
226}
227
f67539c2
TL
228bool PMEMDevice::support(const std::string &path)
229{
230 int is_pmem = 0;
231 size_t map_len = 0;
1e59de90
TL
232 int r = 0;
233 int local_fd;
234
235 local_fd = ::open(path.c_str(), O_RDWR);
236 if (local_fd < 0) {
237 return false;
238 }
239
240 r = pmem_check_file_type(local_fd, path.c_str(), NULL);
241 VOID_TEMP_FAILURE_RETRY(::close(local_fd));
242 int flags = PMEM_FILE_EXCL;
243 if (r == 0) {
244 flags = 0;
245 }
246
247 void *addr = pmem_map_file(path.c_str(), 0, flags, O_RDONLY, &map_len, &is_pmem);
f67539c2 248 if (addr != NULL) {
1e59de90 249 pmem_unmap(addr, map_len);
f67539c2
TL
250 if (is_pmem) {
251 return true;
252 }
f67539c2 253 }
1e59de90 254
f67539c2
TL
255 return false;
256}
257
31f18b77
FG
258int PMEMDevice::flush()
259{
260 //Because all write is persist. So no need
261 return 0;
262}
263
264
265void PMEMDevice::aio_submit(IOContext *ioc)
266{
11fdf7f2
TL
267 if (ioc->priv) {
268 ceph_assert(ioc->num_running == 0);
269 aio_callback(aio_callback_priv, ioc->priv);
270 } else {
271 ioc->try_aio_wake();
272 }
31f18b77
FG
273 return;
274}
275
9f95a23c 276int PMEMDevice::write(uint64_t off, bufferlist& bl, bool buffered, int write_hint)
31f18b77
FG
277{
278 uint64_t len = bl.length();
279 dout(20) << __func__ << " " << off << "~" << len << dendl;
11fdf7f2 280 ceph_assert(is_valid_io(off, len));
31f18b77 281
20effc67 282 dout(40) << "data:\n";
31f18b77
FG
283 bl.hexdump(*_dout);
284 *_dout << dendl;
285
11fdf7f2
TL
286 if (g_conf()->bdev_inject_crash &&
287 rand() % g_conf()->bdev_inject_crash == 0) {
31f18b77
FG
288 derr << __func__ << " bdev_inject_crash: dropping io " << off << "~" << len
289 << dendl;
290 ++injecting_crash;
291 return 0;
292 }
293
294 bufferlist::iterator p = bl.begin();
f67539c2 295 uint64_t off1 = off;
31f18b77
FG
296 while (len) {
297 const char *data;
298 uint32_t l = p.get_ptr_and_advance(len, &data);
1e59de90
TL
299
300#if defined(HAVE_LIBDML)
301 // Take care of the persistency issue
302 auto result = dml::execute<execution_path>(dml::mem_move, dml::make_view(data, l), dml::make_view(addr + off1, l));
303 ceph_assert(result.status == dml::status_code::ok);
304#else
31f18b77 305 pmem_memcpy_persist(addr + off1, data, l);
1e59de90 306#endif
31f18b77
FG
307 len -= l;
308 off1 += l;
309 }
31f18b77
FG
310 return 0;
311}
312
313int PMEMDevice::aio_write(
314 uint64_t off,
315 bufferlist &bl,
316 IOContext *ioc,
11fdf7f2 317 bool buffered,
9f95a23c 318 int write_hint)
31f18b77
FG
319{
320 return write(off, bl, buffered);
321}
322
323
324int PMEMDevice::read(uint64_t off, uint64_t len, bufferlist *pbl,
325 IOContext *ioc,
326 bool buffered)
327{
328 dout(5) << __func__ << " " << off << "~" << len << dendl;
11fdf7f2 329 ceph_assert(is_valid_io(off, len));
31f18b77 330
11fdf7f2 331 bufferptr p = buffer::create_small_page_aligned(len);
1e59de90
TL
332
333#if defined(HAVE_LIBDML)
334 auto result = dml::execute<execution_path>(dml::mem_move, dml::make_view(addr + off, len), dml::make_view(p.c_str(), len));
335 ceph_assert(result.status == dml::status_code::ok);
336#else
31f18b77 337 memcpy(p.c_str(), addr + off, len);
1e59de90 338#endif
31f18b77
FG
339
340 pbl->clear();
341 pbl->push_back(std::move(p));
342
20effc67 343 dout(40) << "data:\n";
31f18b77
FG
344 pbl->hexdump(*_dout);
345 *_dout << dendl;
346
347 return 0;
348}
349
350int PMEMDevice::aio_read(uint64_t off, uint64_t len, bufferlist *pbl,
351 IOContext *ioc)
352{
353 return read(off, len, pbl, ioc, false);
354}
355
356int PMEMDevice::read_random(uint64_t off, uint64_t len, char *buf, bool buffered)
357{
11fdf7f2
TL
358 dout(5) << __func__ << " " << off << "~" << len << dendl;
359 ceph_assert(is_valid_io(off, len));
31f18b77 360
1e59de90
TL
361
362#if defined(HAVE_LIBDML)
363 auto result = dml::execute<execution_path>(dml::mem_move, dml::make_view(addr + off, len), dml::make_view(buf, len));
364 ceph_assert(result.status == dml::status_code::ok);
365#else
31f18b77 366 memcpy(buf, addr + off, len);
1e59de90 367#endif
31f18b77
FG
368 return 0;
369}
370
371
372int PMEMDevice::invalidate_cache(uint64_t off, uint64_t len)
373{
374 dout(5) << __func__ << " " << off << "~" << len << dendl;
375 return 0;
376}
377
378