]>
Commit | Line | Data |
---|---|---|
31f18b77 FG |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | /* | |
4 | * Ceph - scalable distributed file system | |
5 | * | |
6 | * Copyright (C) 2015 Intel <jianpeng.ma@intel.com> | |
7 | * | |
8 | * Author: Jianpeng Ma <jianpeng.ma@intel.com> | |
9 | * | |
10 | * This is free software; you can redistribute it and/or | |
11 | * modify it under the terms of the GNU Lesser General Public | |
12 | * License version 2.1, as published by the Free Software | |
13 | * Foundation. See file COPYING. | |
14 | * | |
15 | */ | |
16 | ||
17 | #include <unistd.h> | |
18 | #include <stdlib.h> | |
19 | #include <sys/types.h> | |
20 | #include <sys/stat.h> | |
21 | #include <libpmem.h> | |
22 | ||
23 | #include "PMEMDevice.h" | |
24 | #include "include/types.h" | |
25 | #include "include/compat.h" | |
26 | #include "include/stringify.h" | |
27 | #include "common/errno.h" | |
28 | #include "common/debug.h" | |
29 | #include "common/blkdev.h" | |
30 | ||
31 | #define dout_context cct | |
32 | #define dout_subsys ceph_subsys_bdev | |
33 | #undef dout_prefix | |
34 | #define dout_prefix *_dout << "bdev-PMEM(" << path << ") " | |
35 | ||
36 | PMEMDevice::PMEMDevice(CephContext *cct, aio_callback_t cb, void *cbpriv) | |
37 | : BlockDevice(cct), | |
38 | fd(-1), addr(0), | |
39 | size(0), block_size(0), | |
40 | debug_lock("PMEMDevice::debug_lock"), | |
41 | injecting_crash(0) | |
42 | { | |
43 | } | |
44 | ||
45 | int PMEMDevice::_lock() | |
46 | { | |
47 | struct flock l; | |
48 | memset(&l, 0, sizeof(l)); | |
49 | l.l_type = F_WRLCK; | |
50 | l.l_whence = SEEK_SET; | |
51 | l.l_start = 0; | |
52 | l.l_len = 0; | |
53 | int r = ::fcntl(fd, F_SETLK, &l); | |
54 | if (r < 0) | |
55 | return -errno; | |
56 | return 0; | |
57 | } | |
58 | ||
59 | int PMEMDevice::open(const string& p) | |
60 | { | |
61 | path = p; | |
62 | int r = 0; | |
63 | dout(1) << __func__ << " path " << path << dendl; | |
64 | ||
65 | fd = ::open(path.c_str(), O_RDWR); | |
66 | if (fd < 0) { | |
67 | r = -errno; | |
68 | derr << __func__ << " open got: " << cpp_strerror(r) << dendl; | |
69 | return r; | |
70 | } | |
71 | ||
72 | r = _lock(); | |
73 | if (r < 0) { | |
74 | derr << __func__ << " failed to lock " << path << ": " << cpp_strerror(r) | |
75 | << dendl; | |
76 | goto out_fail; | |
77 | } | |
78 | ||
79 | struct stat st; | |
80 | r = ::fstat(fd, &st); | |
81 | if (r < 0) { | |
82 | r = -errno; | |
83 | derr << __func__ << " fstat got " << cpp_strerror(r) << dendl; | |
84 | goto out_fail; | |
85 | } | |
86 | if (S_ISBLK(st.st_mode)) { | |
87 | int64_t s; | |
88 | r = get_block_device_size(fd, &s); | |
89 | if (r < 0) { | |
90 | goto out_fail; | |
91 | } | |
92 | size = s; | |
93 | } else { | |
94 | size = st.st_size; | |
95 | } | |
96 | ||
97 | size_t map_len; | |
98 | addr = (char *)pmem_map_file(path.c_str(), size, PMEM_FILE_EXCL, O_RDWR, &map_len, NULL); | |
99 | if (addr == NULL) { | |
100 | derr << __func__ << " pmem_map_file error" << dendl; | |
101 | goto out_fail; | |
102 | } | |
103 | size = map_len; | |
104 | ||
105 | // Operate as though the block size is 4 KB. The backing file | |
106 | // blksize doesn't strictly matter except that some file systems may | |
107 | // require a read/modify/write if we write something smaller than | |
108 | // it. | |
109 | block_size = g_conf->bdev_block_size; | |
110 | if (block_size != (unsigned)st.st_blksize) { | |
111 | dout(1) << __func__ << " backing device/file reports st_blksize " | |
112 | << st.st_blksize << ", using bdev_block_size " | |
113 | << block_size << " anyway" << dendl; | |
114 | } | |
115 | ||
116 | dout(1) << __func__ | |
117 | << " size " << size | |
1adf2230 | 118 | << " (" << byte_u_t(size) << ")" |
31f18b77 | 119 | << " block_size " << block_size |
1adf2230 | 120 | << " (" << byte_u_t(block_size) << ")" |
31f18b77 FG |
121 | << dendl; |
122 | return 0; | |
123 | ||
124 | out_fail: | |
125 | VOID_TEMP_FAILURE_RETRY(::close(fd)); | |
126 | fd = -1; | |
127 | return r; | |
128 | } | |
129 | ||
130 | void PMEMDevice::close() | |
131 | { | |
132 | dout(1) << __func__ << dendl; | |
133 | ||
134 | assert(addr != NULL); | |
135 | pmem_unmap(addr, size); | |
136 | assert(fd >= 0); | |
137 | VOID_TEMP_FAILURE_RETRY(::close(fd)); | |
138 | fd = -1; | |
139 | ||
140 | path.clear(); | |
141 | } | |
142 | ||
143 | static string get_dev_property(const char *dev, const char *property) | |
144 | { | |
145 | char val[1024] = {0}; | |
146 | get_block_device_string_property(dev, property, val, sizeof(val)); | |
147 | return val; | |
148 | } | |
149 | ||
150 | int PMEMDevice::collect_metadata(string prefix, map<string,string> *pm) const | |
151 | { | |
152 | (*pm)[prefix + "rotational"] = stringify((int)(bool)rotational); | |
153 | (*pm)[prefix + "size"] = stringify(get_size()); | |
154 | (*pm)[prefix + "block_size"] = stringify(get_block_size()); | |
155 | (*pm)[prefix + "driver"] = "PMEMDevice"; | |
156 | (*pm)[prefix + "type"] = "ssd"; | |
157 | ||
158 | struct stat st; | |
159 | int r = ::fstat(fd, &st); | |
160 | if (r < 0) | |
161 | return -errno; | |
162 | if (S_ISBLK(st.st_mode)) { | |
163 | (*pm)[prefix + "access_mode"] = "blk"; | |
164 | char partition_path[PATH_MAX]; | |
165 | char dev_node[PATH_MAX]; | |
166 | int rc = get_device_by_fd(fd, partition_path, dev_node, PATH_MAX); | |
167 | switch (rc) { | |
168 | case -EOPNOTSUPP: | |
169 | case -EINVAL: | |
170 | (*pm)[prefix + "partition_path"] = "unknown"; | |
171 | (*pm)[prefix + "dev_node"] = "unknown"; | |
172 | break; | |
173 | case -ENODEV: | |
174 | (*pm)[prefix + "partition_path"] = string(partition_path); | |
175 | (*pm)[prefix + "dev_node"] = "unknown"; | |
176 | break; | |
177 | default: | |
178 | { | |
179 | (*pm)[prefix + "partition_path"] = string(partition_path); | |
180 | (*pm)[prefix + "dev_node"] = string(dev_node); | |
181 | (*pm)[prefix + "model"] = get_dev_property(dev_node, "device/model"); | |
182 | (*pm)[prefix + "dev"] = get_dev_property(dev_node, "dev"); | |
183 | ||
184 | // nvme exposes a serial number | |
185 | string serial = get_dev_property(dev_node, "device/serial"); | |
186 | if (serial.length()) { | |
187 | (*pm)[prefix + "serial"] = serial; | |
188 | } | |
189 | ||
190 | // nvme has a device/device/* structure; infer from that. there | |
191 | // is probably a better way? | |
192 | string nvme_vendor = get_dev_property(dev_node, "device/device/vendor"); | |
193 | if (nvme_vendor.length()) { | |
194 | (*pm)[prefix + "type"] = "nvme"; | |
195 | } | |
196 | } | |
197 | } | |
198 | } else { | |
199 | (*pm)[prefix + "access_mode"] = "file"; | |
200 | (*pm)[prefix + "path"] = path; | |
201 | } | |
202 | return 0; | |
203 | } | |
204 | ||
205 | int PMEMDevice::flush() | |
206 | { | |
207 | //Because all write is persist. So no need | |
208 | return 0; | |
209 | } | |
210 | ||
211 | ||
212 | void PMEMDevice::aio_submit(IOContext *ioc) | |
213 | { | |
214 | return; | |
215 | } | |
216 | ||
217 | int PMEMDevice::write(uint64_t off, bufferlist& bl, bool buffered) | |
218 | { | |
219 | uint64_t len = bl.length(); | |
220 | dout(20) << __func__ << " " << off << "~" << len << dendl; | |
221 | assert(len > 0); | |
222 | assert(off < size); | |
223 | assert(off + len <= size); | |
224 | ||
225 | dout(40) << "data: "; | |
226 | bl.hexdump(*_dout); | |
227 | *_dout << dendl; | |
228 | ||
229 | if (g_conf->bdev_inject_crash && | |
230 | rand() % g_conf->bdev_inject_crash == 0) { | |
231 | derr << __func__ << " bdev_inject_crash: dropping io " << off << "~" << len | |
232 | << dendl; | |
233 | ++injecting_crash; | |
234 | return 0; | |
235 | } | |
236 | ||
237 | bufferlist::iterator p = bl.begin(); | |
238 | uint32_t off1 = off; | |
239 | while (len) { | |
240 | const char *data; | |
241 | uint32_t l = p.get_ptr_and_advance(len, &data); | |
242 | pmem_memcpy_persist(addr + off1, data, l); | |
243 | len -= l; | |
244 | off1 += l; | |
245 | } | |
246 | ||
247 | return 0; | |
248 | } | |
249 | ||
250 | int PMEMDevice::aio_write( | |
251 | uint64_t off, | |
252 | bufferlist &bl, | |
253 | IOContext *ioc, | |
254 | bool buffered) | |
255 | { | |
256 | return write(off, bl, buffered); | |
257 | } | |
258 | ||
259 | ||
260 | int PMEMDevice::read(uint64_t off, uint64_t len, bufferlist *pbl, | |
261 | IOContext *ioc, | |
262 | bool buffered) | |
263 | { | |
264 | dout(5) << __func__ << " " << off << "~" << len << dendl; | |
265 | assert(len > 0); | |
266 | assert(off < size); | |
267 | assert(off + len <= size); | |
268 | ||
269 | bufferptr p = buffer::create_page_aligned(len); | |
270 | memcpy(p.c_str(), addr + off, len); | |
271 | ||
272 | pbl->clear(); | |
273 | pbl->push_back(std::move(p)); | |
274 | ||
275 | dout(40) << "data: "; | |
276 | pbl->hexdump(*_dout); | |
277 | *_dout << dendl; | |
278 | ||
279 | return 0; | |
280 | } | |
281 | ||
282 | int PMEMDevice::aio_read(uint64_t off, uint64_t len, bufferlist *pbl, | |
283 | IOContext *ioc) | |
284 | { | |
285 | return read(off, len, pbl, ioc, false); | |
286 | } | |
287 | ||
288 | int PMEMDevice::read_random(uint64_t off, uint64_t len, char *buf, bool buffered) | |
289 | { | |
290 | assert(len > 0); | |
291 | assert(off < size); | |
292 | assert(off + len <= size); | |
293 | ||
294 | memcpy(buf, addr + off, len); | |
295 | return 0; | |
296 | } | |
297 | ||
298 | ||
299 | int PMEMDevice::invalidate_cache(uint64_t off, uint64_t len) | |
300 | { | |
301 | dout(5) << __func__ << " " << off << "~" << len << dendl; | |
302 | return 0; | |
303 | } | |
304 | ||
305 |