]>
Commit | Line | Data |
---|---|---|
31f18b77 FG |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | /* | |
4 | * Ceph - scalable distributed file system | |
5 | * | |
6 | * Copyright (C) 2015 Intel <jianpeng.ma@intel.com> | |
7 | * | |
8 | * Author: Jianpeng Ma <jianpeng.ma@intel.com> | |
9 | * | |
10 | * This is free software; you can redistribute it and/or | |
11 | * modify it under the terms of the GNU Lesser General Public | |
12 | * License version 2.1, as published by the Free Software | |
13 | * Foundation. See file COPYING. | |
14 | * | |
15 | */ | |
16 | ||
17 | #include <unistd.h> | |
18 | #include <stdlib.h> | |
19 | #include <sys/types.h> | |
20 | #include <sys/stat.h> | |
31f18b77 FG |
21 | |
22 | #include "PMEMDevice.h" | |
11fdf7f2 | 23 | #include "libpmem.h" |
31f18b77 FG |
24 | #include "include/types.h" |
25 | #include "include/compat.h" | |
26 | #include "include/stringify.h" | |
27 | #include "common/errno.h" | |
28 | #include "common/debug.h" | |
29 | #include "common/blkdev.h" | |
30 | ||
31 | #define dout_context cct | |
32 | #define dout_subsys ceph_subsys_bdev | |
33 | #undef dout_prefix | |
34 | #define dout_prefix *_dout << "bdev-PMEM(" << path << ") " | |
35 | ||
36 | PMEMDevice::PMEMDevice(CephContext *cct, aio_callback_t cb, void *cbpriv) | |
11fdf7f2 | 37 | : BlockDevice(cct, cb, cbpriv), |
31f18b77 | 38 | fd(-1), addr(0), |
31f18b77 FG |
39 | injecting_crash(0) |
40 | { | |
41 | } | |
42 | ||
43 | int PMEMDevice::_lock() | |
44 | { | |
45 | struct flock l; | |
46 | memset(&l, 0, sizeof(l)); | |
47 | l.l_type = F_WRLCK; | |
48 | l.l_whence = SEEK_SET; | |
49 | l.l_start = 0; | |
50 | l.l_len = 0; | |
51 | int r = ::fcntl(fd, F_SETLK, &l); | |
52 | if (r < 0) | |
53 | return -errno; | |
54 | return 0; | |
55 | } | |
56 | ||
20effc67 | 57 | int PMEMDevice::open(const std::string& p) |
31f18b77 FG |
58 | { |
59 | path = p; | |
60 | int r = 0; | |
61 | dout(1) << __func__ << " path " << path << dendl; | |
62 | ||
91327a77 | 63 | fd = ::open(path.c_str(), O_RDWR | O_CLOEXEC); |
31f18b77 FG |
64 | if (fd < 0) { |
65 | r = -errno; | |
66 | derr << __func__ << " open got: " << cpp_strerror(r) << dendl; | |
67 | return r; | |
68 | } | |
69 | ||
70 | r = _lock(); | |
71 | if (r < 0) { | |
72 | derr << __func__ << " failed to lock " << path << ": " << cpp_strerror(r) | |
73 | << dendl; | |
74 | goto out_fail; | |
75 | } | |
76 | ||
77 | struct stat st; | |
78 | r = ::fstat(fd, &st); | |
79 | if (r < 0) { | |
80 | r = -errno; | |
81 | derr << __func__ << " fstat got " << cpp_strerror(r) << dendl; | |
82 | goto out_fail; | |
83 | } | |
31f18b77 FG |
84 | |
85 | size_t map_len; | |
11fdf7f2 | 86 | addr = (char *)pmem_map_file(path.c_str(), 0, PMEM_FILE_EXCL, O_RDWR, &map_len, NULL); |
31f18b77 | 87 | if (addr == NULL) { |
11fdf7f2 | 88 | derr << __func__ << " pmem_map_file failed: " << pmem_errormsg() << dendl; |
31f18b77 FG |
89 | goto out_fail; |
90 | } | |
91 | size = map_len; | |
92 | ||
93 | // Operate as though the block size is 4 KB. The backing file | |
94 | // blksize doesn't strictly matter except that some file systems may | |
95 | // require a read/modify/write if we write something smaller than | |
96 | // it. | |
11fdf7f2 | 97 | block_size = g_conf()->bdev_block_size; |
31f18b77 FG |
98 | if (block_size != (unsigned)st.st_blksize) { |
99 | dout(1) << __func__ << " backing device/file reports st_blksize " | |
100 | << st.st_blksize << ", using bdev_block_size " | |
101 | << block_size << " anyway" << dendl; | |
102 | } | |
103 | ||
104 | dout(1) << __func__ | |
105 | << " size " << size | |
1adf2230 | 106 | << " (" << byte_u_t(size) << ")" |
31f18b77 | 107 | << " block_size " << block_size |
1adf2230 | 108 | << " (" << byte_u_t(block_size) << ")" |
31f18b77 FG |
109 | << dendl; |
110 | return 0; | |
111 | ||
112 | out_fail: | |
113 | VOID_TEMP_FAILURE_RETRY(::close(fd)); | |
114 | fd = -1; | |
115 | return r; | |
116 | } | |
117 | ||
118 | void PMEMDevice::close() | |
119 | { | |
120 | dout(1) << __func__ << dendl; | |
121 | ||
11fdf7f2 | 122 | ceph_assert(addr != NULL); |
31f18b77 | 123 | pmem_unmap(addr, size); |
11fdf7f2 | 124 | ceph_assert(fd >= 0); |
31f18b77 FG |
125 | VOID_TEMP_FAILURE_RETRY(::close(fd)); |
126 | fd = -1; | |
127 | ||
128 | path.clear(); | |
129 | } | |
130 | ||
20effc67 | 131 | int PMEMDevice::collect_metadata(const std::string& prefix, std::map<std::string,std::string> *pm) const |
31f18b77 FG |
132 | { |
133 | (*pm)[prefix + "rotational"] = stringify((int)(bool)rotational); | |
134 | (*pm)[prefix + "size"] = stringify(get_size()); | |
135 | (*pm)[prefix + "block_size"] = stringify(get_block_size()); | |
136 | (*pm)[prefix + "driver"] = "PMEMDevice"; | |
137 | (*pm)[prefix + "type"] = "ssd"; | |
138 | ||
139 | struct stat st; | |
140 | int r = ::fstat(fd, &st); | |
141 | if (r < 0) | |
142 | return -errno; | |
143 | if (S_ISBLK(st.st_mode)) { | |
144 | (*pm)[prefix + "access_mode"] = "blk"; | |
11fdf7f2 | 145 | char buffer[1024] = {0}; |
9f95a23c | 146 | BlkDev blkdev(fd); |
11fdf7f2 TL |
147 | |
148 | blkdev.model(buffer, sizeof(buffer)); | |
149 | (*pm)[prefix + "model"] = buffer; | |
150 | ||
151 | buffer[0] = '\0'; | |
152 | blkdev.dev(buffer, sizeof(buffer)); | |
153 | (*pm)[prefix + "dev"] = buffer; | |
154 | ||
155 | // nvme exposes a serial number | |
156 | buffer[0] = '\0'; | |
157 | blkdev.serial(buffer, sizeof(buffer)); | |
158 | (*pm)[prefix + "serial"] = buffer; | |
159 | ||
31f18b77 FG |
160 | } else { |
161 | (*pm)[prefix + "access_mode"] = "file"; | |
162 | (*pm)[prefix + "path"] = path; | |
163 | } | |
164 | return 0; | |
165 | } | |
166 | ||
f67539c2 TL |
167 | bool PMEMDevice::support(const std::string &path) |
168 | { | |
169 | int is_pmem = 0; | |
170 | size_t map_len = 0; | |
171 | void *addr = pmem_map_file(path.c_str(), 0, PMEM_FILE_EXCL, O_RDONLY, &map_len, &is_pmem); | |
172 | if (addr != NULL) { | |
173 | if (is_pmem) { | |
174 | return true; | |
175 | } | |
176 | pmem_unmap(addr, map_len); | |
177 | } | |
178 | return false; | |
179 | } | |
180 | ||
31f18b77 FG |
181 | int PMEMDevice::flush() |
182 | { | |
183 | //Because all write is persist. So no need | |
184 | return 0; | |
185 | } | |
186 | ||
187 | ||
188 | void PMEMDevice::aio_submit(IOContext *ioc) | |
189 | { | |
11fdf7f2 TL |
190 | if (ioc->priv) { |
191 | ceph_assert(ioc->num_running == 0); | |
192 | aio_callback(aio_callback_priv, ioc->priv); | |
193 | } else { | |
194 | ioc->try_aio_wake(); | |
195 | } | |
31f18b77 FG |
196 | return; |
197 | } | |
198 | ||
9f95a23c | 199 | int PMEMDevice::write(uint64_t off, bufferlist& bl, bool buffered, int write_hint) |
31f18b77 FG |
200 | { |
201 | uint64_t len = bl.length(); | |
202 | dout(20) << __func__ << " " << off << "~" << len << dendl; | |
11fdf7f2 | 203 | ceph_assert(is_valid_io(off, len)); |
31f18b77 | 204 | |
20effc67 | 205 | dout(40) << "data:\n"; |
31f18b77 FG |
206 | bl.hexdump(*_dout); |
207 | *_dout << dendl; | |
208 | ||
11fdf7f2 TL |
209 | if (g_conf()->bdev_inject_crash && |
210 | rand() % g_conf()->bdev_inject_crash == 0) { | |
31f18b77 FG |
211 | derr << __func__ << " bdev_inject_crash: dropping io " << off << "~" << len |
212 | << dendl; | |
213 | ++injecting_crash; | |
214 | return 0; | |
215 | } | |
216 | ||
217 | bufferlist::iterator p = bl.begin(); | |
f67539c2 | 218 | uint64_t off1 = off; |
31f18b77 FG |
219 | while (len) { |
220 | const char *data; | |
221 | uint32_t l = p.get_ptr_and_advance(len, &data); | |
222 | pmem_memcpy_persist(addr + off1, data, l); | |
223 | len -= l; | |
224 | off1 += l; | |
225 | } | |
31f18b77 FG |
226 | return 0; |
227 | } | |
228 | ||
229 | int PMEMDevice::aio_write( | |
230 | uint64_t off, | |
231 | bufferlist &bl, | |
232 | IOContext *ioc, | |
11fdf7f2 | 233 | bool buffered, |
9f95a23c | 234 | int write_hint) |
31f18b77 FG |
235 | { |
236 | return write(off, bl, buffered); | |
237 | } | |
238 | ||
239 | ||
240 | int PMEMDevice::read(uint64_t off, uint64_t len, bufferlist *pbl, | |
241 | IOContext *ioc, | |
242 | bool buffered) | |
243 | { | |
244 | dout(5) << __func__ << " " << off << "~" << len << dendl; | |
11fdf7f2 | 245 | ceph_assert(is_valid_io(off, len)); |
31f18b77 | 246 | |
11fdf7f2 | 247 | bufferptr p = buffer::create_small_page_aligned(len); |
31f18b77 FG |
248 | memcpy(p.c_str(), addr + off, len); |
249 | ||
250 | pbl->clear(); | |
251 | pbl->push_back(std::move(p)); | |
252 | ||
20effc67 | 253 | dout(40) << "data:\n"; |
31f18b77 FG |
254 | pbl->hexdump(*_dout); |
255 | *_dout << dendl; | |
256 | ||
257 | return 0; | |
258 | } | |
259 | ||
260 | int PMEMDevice::aio_read(uint64_t off, uint64_t len, bufferlist *pbl, | |
261 | IOContext *ioc) | |
262 | { | |
263 | return read(off, len, pbl, ioc, false); | |
264 | } | |
265 | ||
266 | int PMEMDevice::read_random(uint64_t off, uint64_t len, char *buf, bool buffered) | |
267 | { | |
11fdf7f2 TL |
268 | dout(5) << __func__ << " " << off << "~" << len << dendl; |
269 | ceph_assert(is_valid_io(off, len)); | |
31f18b77 FG |
270 | |
271 | memcpy(buf, addr + off, len); | |
272 | return 0; | |
273 | } | |
274 | ||
275 | ||
276 | int PMEMDevice::invalidate_cache(uint64_t off, uint64_t len) | |
277 | { | |
278 | dout(5) << __func__ << " " << off << "~" << len << dendl; | |
279 | return 0; | |
280 | } | |
281 | ||
282 |