]>
Commit | Line | Data |
---|---|---|
31f18b77 FG |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | /* | |
4 | * Ceph - scalable distributed file system | |
5 | * | |
6 | * Copyright (C) 2015 Intel <jianpeng.ma@intel.com> | |
7 | * | |
8 | * Author: Jianpeng Ma <jianpeng.ma@intel.com> | |
9 | * | |
10 | * This is free software; you can redistribute it and/or | |
11 | * modify it under the terms of the GNU Lesser General Public | |
12 | * License version 2.1, as published by the Free Software | |
13 | * Foundation. See file COPYING. | |
14 | * | |
15 | */ | |
16 | ||
17 | #include <unistd.h> | |
18 | #include <stdlib.h> | |
19 | #include <sys/types.h> | |
20 | #include <sys/stat.h> | |
31f18b77 FG |
21 | |
22 | #include "PMEMDevice.h" | |
11fdf7f2 | 23 | #include "libpmem.h" |
31f18b77 FG |
24 | #include "include/types.h" |
25 | #include "include/compat.h" | |
26 | #include "include/stringify.h" | |
27 | #include "common/errno.h" | |
28 | #include "common/debug.h" | |
29 | #include "common/blkdev.h" | |
30 | ||
31 | #define dout_context cct | |
32 | #define dout_subsys ceph_subsys_bdev | |
33 | #undef dout_prefix | |
34 | #define dout_prefix *_dout << "bdev-PMEM(" << path << ") " | |
35 | ||
36 | PMEMDevice::PMEMDevice(CephContext *cct, aio_callback_t cb, void *cbpriv) | |
11fdf7f2 | 37 | : BlockDevice(cct, cb, cbpriv), |
31f18b77 | 38 | fd(-1), addr(0), |
31f18b77 FG |
39 | injecting_crash(0) |
40 | { | |
41 | } | |
42 | ||
43 | int PMEMDevice::_lock() | |
44 | { | |
45 | struct flock l; | |
46 | memset(&l, 0, sizeof(l)); | |
47 | l.l_type = F_WRLCK; | |
48 | l.l_whence = SEEK_SET; | |
49 | l.l_start = 0; | |
50 | l.l_len = 0; | |
51 | int r = ::fcntl(fd, F_SETLK, &l); | |
52 | if (r < 0) | |
53 | return -errno; | |
54 | return 0; | |
55 | } | |
56 | ||
57 | int PMEMDevice::open(const string& p) | |
58 | { | |
59 | path = p; | |
60 | int r = 0; | |
61 | dout(1) << __func__ << " path " << path << dendl; | |
62 | ||
91327a77 | 63 | fd = ::open(path.c_str(), O_RDWR | O_CLOEXEC); |
31f18b77 FG |
64 | if (fd < 0) { |
65 | r = -errno; | |
66 | derr << __func__ << " open got: " << cpp_strerror(r) << dendl; | |
67 | return r; | |
68 | } | |
69 | ||
70 | r = _lock(); | |
71 | if (r < 0) { | |
72 | derr << __func__ << " failed to lock " << path << ": " << cpp_strerror(r) | |
73 | << dendl; | |
74 | goto out_fail; | |
75 | } | |
76 | ||
77 | struct stat st; | |
78 | r = ::fstat(fd, &st); | |
79 | if (r < 0) { | |
80 | r = -errno; | |
81 | derr << __func__ << " fstat got " << cpp_strerror(r) << dendl; | |
82 | goto out_fail; | |
83 | } | |
31f18b77 FG |
84 | |
85 | size_t map_len; | |
11fdf7f2 | 86 | addr = (char *)pmem_map_file(path.c_str(), 0, PMEM_FILE_EXCL, O_RDWR, &map_len, NULL); |
31f18b77 | 87 | if (addr == NULL) { |
11fdf7f2 | 88 | derr << __func__ << " pmem_map_file failed: " << pmem_errormsg() << dendl; |
31f18b77 FG |
89 | goto out_fail; |
90 | } | |
91 | size = map_len; | |
92 | ||
93 | // Operate as though the block size is 4 KB. The backing file | |
94 | // blksize doesn't strictly matter except that some file systems may | |
95 | // require a read/modify/write if we write something smaller than | |
96 | // it. | |
11fdf7f2 | 97 | block_size = g_conf()->bdev_block_size; |
31f18b77 FG |
98 | if (block_size != (unsigned)st.st_blksize) { |
99 | dout(1) << __func__ << " backing device/file reports st_blksize " | |
100 | << st.st_blksize << ", using bdev_block_size " | |
101 | << block_size << " anyway" << dendl; | |
102 | } | |
103 | ||
104 | dout(1) << __func__ | |
105 | << " size " << size | |
1adf2230 | 106 | << " (" << byte_u_t(size) << ")" |
31f18b77 | 107 | << " block_size " << block_size |
1adf2230 | 108 | << " (" << byte_u_t(block_size) << ")" |
31f18b77 FG |
109 | << dendl; |
110 | return 0; | |
111 | ||
112 | out_fail: | |
113 | VOID_TEMP_FAILURE_RETRY(::close(fd)); | |
114 | fd = -1; | |
115 | return r; | |
116 | } | |
117 | ||
118 | void PMEMDevice::close() | |
119 | { | |
120 | dout(1) << __func__ << dendl; | |
121 | ||
11fdf7f2 | 122 | ceph_assert(addr != NULL); |
31f18b77 | 123 | pmem_unmap(addr, size); |
11fdf7f2 | 124 | ceph_assert(fd >= 0); |
31f18b77 FG |
125 | VOID_TEMP_FAILURE_RETRY(::close(fd)); |
126 | fd = -1; | |
127 | ||
128 | path.clear(); | |
129 | } | |
130 | ||
11fdf7f2 | 131 | int PMEMDevice::collect_metadata(const string& prefix, map<string,string> *pm) const |
31f18b77 FG |
132 | { |
133 | (*pm)[prefix + "rotational"] = stringify((int)(bool)rotational); | |
134 | (*pm)[prefix + "size"] = stringify(get_size()); | |
135 | (*pm)[prefix + "block_size"] = stringify(get_block_size()); | |
136 | (*pm)[prefix + "driver"] = "PMEMDevice"; | |
137 | (*pm)[prefix + "type"] = "ssd"; | |
138 | ||
139 | struct stat st; | |
140 | int r = ::fstat(fd, &st); | |
141 | if (r < 0) | |
142 | return -errno; | |
143 | if (S_ISBLK(st.st_mode)) { | |
144 | (*pm)[prefix + "access_mode"] = "blk"; | |
11fdf7f2 | 145 | char buffer[1024] = {0}; |
9f95a23c | 146 | BlkDev blkdev(fd); |
11fdf7f2 TL |
147 | |
148 | blkdev.model(buffer, sizeof(buffer)); | |
149 | (*pm)[prefix + "model"] = buffer; | |
150 | ||
151 | buffer[0] = '\0'; | |
152 | blkdev.dev(buffer, sizeof(buffer)); | |
153 | (*pm)[prefix + "dev"] = buffer; | |
154 | ||
155 | // nvme exposes a serial number | |
156 | buffer[0] = '\0'; | |
157 | blkdev.serial(buffer, sizeof(buffer)); | |
158 | (*pm)[prefix + "serial"] = buffer; | |
159 | ||
31f18b77 FG |
160 | } else { |
161 | (*pm)[prefix + "access_mode"] = "file"; | |
162 | (*pm)[prefix + "path"] = path; | |
163 | } | |
164 | return 0; | |
165 | } | |
166 | ||
167 | int PMEMDevice::flush() | |
168 | { | |
169 | //Because all write is persist. So no need | |
170 | return 0; | |
171 | } | |
172 | ||
173 | ||
174 | void PMEMDevice::aio_submit(IOContext *ioc) | |
175 | { | |
11fdf7f2 TL |
176 | if (ioc->priv) { |
177 | ceph_assert(ioc->num_running == 0); | |
178 | aio_callback(aio_callback_priv, ioc->priv); | |
179 | } else { | |
180 | ioc->try_aio_wake(); | |
181 | } | |
31f18b77 FG |
182 | return; |
183 | } | |
184 | ||
9f95a23c | 185 | int PMEMDevice::write(uint64_t off, bufferlist& bl, bool buffered, int write_hint) |
31f18b77 FG |
186 | { |
187 | uint64_t len = bl.length(); | |
188 | dout(20) << __func__ << " " << off << "~" << len << dendl; | |
11fdf7f2 | 189 | ceph_assert(is_valid_io(off, len)); |
31f18b77 FG |
190 | |
191 | dout(40) << "data: "; | |
192 | bl.hexdump(*_dout); | |
193 | *_dout << dendl; | |
194 | ||
11fdf7f2 TL |
195 | if (g_conf()->bdev_inject_crash && |
196 | rand() % g_conf()->bdev_inject_crash == 0) { | |
31f18b77 FG |
197 | derr << __func__ << " bdev_inject_crash: dropping io " << off << "~" << len |
198 | << dendl; | |
199 | ++injecting_crash; | |
200 | return 0; | |
201 | } | |
202 | ||
203 | bufferlist::iterator p = bl.begin(); | |
204 | uint32_t off1 = off; | |
205 | while (len) { | |
206 | const char *data; | |
207 | uint32_t l = p.get_ptr_and_advance(len, &data); | |
208 | pmem_memcpy_persist(addr + off1, data, l); | |
209 | len -= l; | |
210 | off1 += l; | |
211 | } | |
31f18b77 FG |
212 | return 0; |
213 | } | |
214 | ||
215 | int PMEMDevice::aio_write( | |
216 | uint64_t off, | |
217 | bufferlist &bl, | |
218 | IOContext *ioc, | |
11fdf7f2 | 219 | bool buffered, |
9f95a23c | 220 | int write_hint) |
31f18b77 FG |
221 | { |
222 | return write(off, bl, buffered); | |
223 | } | |
224 | ||
225 | ||
226 | int PMEMDevice::read(uint64_t off, uint64_t len, bufferlist *pbl, | |
227 | IOContext *ioc, | |
228 | bool buffered) | |
229 | { | |
230 | dout(5) << __func__ << " " << off << "~" << len << dendl; | |
11fdf7f2 | 231 | ceph_assert(is_valid_io(off, len)); |
31f18b77 | 232 | |
11fdf7f2 | 233 | bufferptr p = buffer::create_small_page_aligned(len); |
31f18b77 FG |
234 | memcpy(p.c_str(), addr + off, len); |
235 | ||
236 | pbl->clear(); | |
237 | pbl->push_back(std::move(p)); | |
238 | ||
239 | dout(40) << "data: "; | |
240 | pbl->hexdump(*_dout); | |
241 | *_dout << dendl; | |
242 | ||
243 | return 0; | |
244 | } | |
245 | ||
246 | int PMEMDevice::aio_read(uint64_t off, uint64_t len, bufferlist *pbl, | |
247 | IOContext *ioc) | |
248 | { | |
249 | return read(off, len, pbl, ioc, false); | |
250 | } | |
251 | ||
252 | int PMEMDevice::read_random(uint64_t off, uint64_t len, char *buf, bool buffered) | |
253 | { | |
11fdf7f2 TL |
254 | dout(5) << __func__ << " " << off << "~" << len << dendl; |
255 | ceph_assert(is_valid_io(off, len)); | |
31f18b77 FG |
256 | |
257 | memcpy(buf, addr + off, len); | |
258 | return 0; | |
259 | } | |
260 | ||
261 | ||
262 | int PMEMDevice::invalidate_cache(uint64_t off, uint64_t len) | |
263 | { | |
264 | dout(5) << __func__ << " " << off << "~" << len << dendl; | |
265 | return 0; | |
266 | } | |
267 | ||
268 |