]> git.proxmox.com Git - ceph.git/blame - ceph/src/os/bluestore/PMEMDevice.cc
update sources to 12.2.8
[ceph.git] / ceph / src / os / bluestore / PMEMDevice.cc
CommitLineData
31f18b77
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2015 Intel <jianpeng.ma@intel.com>
7 *
8 * Author: Jianpeng Ma <jianpeng.ma@intel.com>
9 *
10 * This is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License version 2.1, as published by the Free Software
13 * Foundation. See file COPYING.
14 *
15 */
16
17#include <unistd.h>
18#include <stdlib.h>
19#include <sys/types.h>
20#include <sys/stat.h>
21#include <libpmem.h>
22
23#include "PMEMDevice.h"
24#include "include/types.h"
25#include "include/compat.h"
26#include "include/stringify.h"
27#include "common/errno.h"
28#include "common/debug.h"
29#include "common/blkdev.h"
30
// Logging configuration for the dout()/derr statements in this file:
// the context object, the debug subsystem, and a per-line prefix that
// embeds the device path. NOTE(review): presumably consumed by the macros
// in common/debug.h -- confirm.
31#define dout_context cct
32#define dout_subsys ceph_subsys_bdev
33#undef dout_prefix
34#define dout_prefix *_dout << "bdev-PMEM(" << path << ") "
35
36PMEMDevice::PMEMDevice(CephContext *cct, aio_callback_t cb, void *cbpriv)
37 : BlockDevice(cct),
38 fd(-1), addr(0),
39 size(0), block_size(0),
40 debug_lock("PMEMDevice::debug_lock"),
41 injecting_crash(0)
42{
43}
44
45int PMEMDevice::_lock()
46{
47 struct flock l;
48 memset(&l, 0, sizeof(l));
49 l.l_type = F_WRLCK;
50 l.l_whence = SEEK_SET;
51 l.l_start = 0;
52 l.l_len = 0;
53 int r = ::fcntl(fd, F_SETLK, &l);
54 if (r < 0)
55 return -errno;
56 return 0;
57}
58
59int PMEMDevice::open(const string& p)
60{
61 path = p;
62 int r = 0;
63 dout(1) << __func__ << " path " << path << dendl;
64
65 fd = ::open(path.c_str(), O_RDWR);
66 if (fd < 0) {
67 r = -errno;
68 derr << __func__ << " open got: " << cpp_strerror(r) << dendl;
69 return r;
70 }
71
72 r = _lock();
73 if (r < 0) {
74 derr << __func__ << " failed to lock " << path << ": " << cpp_strerror(r)
75 << dendl;
76 goto out_fail;
77 }
78
79 struct stat st;
80 r = ::fstat(fd, &st);
81 if (r < 0) {
82 r = -errno;
83 derr << __func__ << " fstat got " << cpp_strerror(r) << dendl;
84 goto out_fail;
85 }
86 if (S_ISBLK(st.st_mode)) {
87 int64_t s;
88 r = get_block_device_size(fd, &s);
89 if (r < 0) {
90 goto out_fail;
91 }
92 size = s;
93 } else {
94 size = st.st_size;
95 }
96
97 size_t map_len;
98 addr = (char *)pmem_map_file(path.c_str(), size, PMEM_FILE_EXCL, O_RDWR, &map_len, NULL);
99 if (addr == NULL) {
100 derr << __func__ << " pmem_map_file error" << dendl;
101 goto out_fail;
102 }
103 size = map_len;
104
105 // Operate as though the block size is 4 KB. The backing file
106 // blksize doesn't strictly matter except that some file systems may
107 // require a read/modify/write if we write something smaller than
108 // it.
109 block_size = g_conf->bdev_block_size;
110 if (block_size != (unsigned)st.st_blksize) {
111 dout(1) << __func__ << " backing device/file reports st_blksize "
112 << st.st_blksize << ", using bdev_block_size "
113 << block_size << " anyway" << dendl;
114 }
115
116 dout(1) << __func__
117 << " size " << size
1adf2230 118 << " (" << byte_u_t(size) << ")"
31f18b77 119 << " block_size " << block_size
1adf2230 120 << " (" << byte_u_t(block_size) << ")"
31f18b77
FG
121 << dendl;
122 return 0;
123
124 out_fail:
125 VOID_TEMP_FAILURE_RETRY(::close(fd));
126 fd = -1;
127 return r;
128}
129
130void PMEMDevice::close()
131{
132 dout(1) << __func__ << dendl;
133
134 assert(addr != NULL);
135 pmem_unmap(addr, size);
136 assert(fd >= 0);
137 VOID_TEMP_FAILURE_RETRY(::close(fd));
138 fd = -1;
139
140 path.clear();
141}
142
143static string get_dev_property(const char *dev, const char *property)
144{
145 char val[1024] = {0};
146 get_block_device_string_property(dev, property, val, sizeof(val));
147 return val;
148}
149
150int PMEMDevice::collect_metadata(string prefix, map<string,string> *pm) const
151{
152 (*pm)[prefix + "rotational"] = stringify((int)(bool)rotational);
153 (*pm)[prefix + "size"] = stringify(get_size());
154 (*pm)[prefix + "block_size"] = stringify(get_block_size());
155 (*pm)[prefix + "driver"] = "PMEMDevice";
156 (*pm)[prefix + "type"] = "ssd";
157
158 struct stat st;
159 int r = ::fstat(fd, &st);
160 if (r < 0)
161 return -errno;
162 if (S_ISBLK(st.st_mode)) {
163 (*pm)[prefix + "access_mode"] = "blk";
164 char partition_path[PATH_MAX];
165 char dev_node[PATH_MAX];
166 int rc = get_device_by_fd(fd, partition_path, dev_node, PATH_MAX);
167 switch (rc) {
168 case -EOPNOTSUPP:
169 case -EINVAL:
170 (*pm)[prefix + "partition_path"] = "unknown";
171 (*pm)[prefix + "dev_node"] = "unknown";
172 break;
173 case -ENODEV:
174 (*pm)[prefix + "partition_path"] = string(partition_path);
175 (*pm)[prefix + "dev_node"] = "unknown";
176 break;
177 default:
178 {
179 (*pm)[prefix + "partition_path"] = string(partition_path);
180 (*pm)[prefix + "dev_node"] = string(dev_node);
181 (*pm)[prefix + "model"] = get_dev_property(dev_node, "device/model");
182 (*pm)[prefix + "dev"] = get_dev_property(dev_node, "dev");
183
184 // nvme exposes a serial number
185 string serial = get_dev_property(dev_node, "device/serial");
186 if (serial.length()) {
187 (*pm)[prefix + "serial"] = serial;
188 }
189
190 // nvme has a device/device/* structure; infer from that. there
191 // is probably a better way?
192 string nvme_vendor = get_dev_property(dev_node, "device/device/vendor");
193 if (nvme_vendor.length()) {
194 (*pm)[prefix + "type"] = "nvme";
195 }
196 }
197 }
198 } else {
199 (*pm)[prefix + "access_mode"] = "file";
200 (*pm)[prefix + "path"] = path;
201 }
202 return 0;
203}
204
205int PMEMDevice::flush()
206{
207 //Because all write is persist. So no need
208 return 0;
209}
210
211
212void PMEMDevice::aio_submit(IOContext *ioc)
213{
214 return;
215}
216
217int PMEMDevice::write(uint64_t off, bufferlist& bl, bool buffered)
218{
219 uint64_t len = bl.length();
220 dout(20) << __func__ << " " << off << "~" << len << dendl;
221 assert(len > 0);
222 assert(off < size);
223 assert(off + len <= size);
224
225 dout(40) << "data: ";
226 bl.hexdump(*_dout);
227 *_dout << dendl;
228
229 if (g_conf->bdev_inject_crash &&
230 rand() % g_conf->bdev_inject_crash == 0) {
231 derr << __func__ << " bdev_inject_crash: dropping io " << off << "~" << len
232 << dendl;
233 ++injecting_crash;
234 return 0;
235 }
236
237 bufferlist::iterator p = bl.begin();
238 uint32_t off1 = off;
239 while (len) {
240 const char *data;
241 uint32_t l = p.get_ptr_and_advance(len, &data);
242 pmem_memcpy_persist(addr + off1, data, l);
243 len -= l;
244 off1 += l;
245 }
246
247 return 0;
248}
249
250int PMEMDevice::aio_write(
251 uint64_t off,
252 bufferlist &bl,
253 IOContext *ioc,
254 bool buffered)
255{
256 return write(off, bl, buffered);
257}
258
259
260int PMEMDevice::read(uint64_t off, uint64_t len, bufferlist *pbl,
261 IOContext *ioc,
262 bool buffered)
263{
264 dout(5) << __func__ << " " << off << "~" << len << dendl;
265 assert(len > 0);
266 assert(off < size);
267 assert(off + len <= size);
268
269 bufferptr p = buffer::create_page_aligned(len);
270 memcpy(p.c_str(), addr + off, len);
271
272 pbl->clear();
273 pbl->push_back(std::move(p));
274
275 dout(40) << "data: ";
276 pbl->hexdump(*_dout);
277 *_dout << dendl;
278
279 return 0;
280}
281
282int PMEMDevice::aio_read(uint64_t off, uint64_t len, bufferlist *pbl,
283 IOContext *ioc)
284{
285 return read(off, len, pbl, ioc, false);
286}
287
288int PMEMDevice::read_random(uint64_t off, uint64_t len, char *buf, bool buffered)
289{
290 assert(len > 0);
291 assert(off < size);
292 assert(off + len <= size);
293
294 memcpy(buf, addr + off, len);
295 return 0;
296}
297
298
299int PMEMDevice::invalidate_cache(uint64_t off, uint64_t len)
300{
301 dout(5) << __func__ << " " << off << "~" << len << dendl;
302 return 0;
303}
304
305