]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | /* | |
4 | * Ceph - scalable distributed file system | |
5 | * | |
6 | * Copyright (C) 2014 Red Hat | |
7 | * | |
8 | * This is free software; you can redistribute it and/or | |
9 | * modify it under the terms of the GNU Lesser General Public | |
10 | * License version 2.1, as published by the Free Software | |
11 | * Foundation. See file COPYING. | |
12 | * | |
13 | */ | |
14 | ||
f67539c2 | 15 | #include <limits> |
7c673cae FG |
16 | #include <unistd.h> |
17 | #include <stdlib.h> | |
18 | #include <sys/types.h> | |
19 | #include <sys/stat.h> | |
20 | #include <fcntl.h> | |
11fdf7f2 | 21 | #include <sys/file.h> |
7c673cae FG |
22 | |
23 | #include "KernelDevice.h" | |
9f95a23c | 24 | #include "include/intarith.h" |
7c673cae FG |
25 | #include "include/types.h" |
26 | #include "include/compat.h" | |
27 | #include "include/stringify.h" | |
11fdf7f2 | 28 | #include "common/blkdev.h" |
7c673cae | 29 | #include "common/errno.h" |
11fdf7f2 TL |
30 | #if defined(__FreeBSD__) |
31 | #include "bsm/audit_errno.h" | |
32 | #endif | |
7c673cae | 33 | #include "common/debug.h" |
11fdf7f2 TL |
34 | #include "common/numa.h" |
35 | ||
36 | #include "global/global_context.h" | |
f67539c2 | 37 | #include "io_uring.h" |
7c673cae FG |
38 | |
39 | #define dout_context cct | |
40 | #define dout_subsys ceph_subsys_bdev | |
41 | #undef dout_prefix | |
42 | #define dout_prefix *_dout << "bdev(" << this << " " << path << ") " | |
43 | ||
f67539c2 TL |
44 | using std::list; |
45 | using std::map; | |
46 | using std::string; | |
47 | using std::vector; | |
48 | ||
49 | using ceph::bufferlist; | |
50 | using ceph::bufferptr; | |
51 | using ceph::make_timespan; | |
52 | using ceph::mono_clock; | |
53 | using ceph::operator <<; | |
54 | ||
11fdf7f2 TL |
55 | KernelDevice::KernelDevice(CephContext* cct, aio_callback_t cb, void *cbpriv, aio_callback_t d_cb, void *d_cbpriv) |
56 | : BlockDevice(cct, cb, cbpriv), | |
57 | aio(false), dio(false), | |
11fdf7f2 TL |
58 | discard_callback(d_cb), |
59 | discard_callback_priv(d_cbpriv), | |
7c673cae | 60 | aio_stop(false), |
11fdf7f2 TL |
61 | discard_started(false), |
62 | discard_stop(false), | |
7c673cae | 63 | aio_thread(this), |
11fdf7f2 | 64 | discard_thread(this), |
7c673cae FG |
65 | injecting_crash(0) |
66 | { | |
11fdf7f2 TL |
67 | fd_directs.resize(WRITE_LIFE_MAX, -1); |
68 | fd_buffereds.resize(WRITE_LIFE_MAX, -1); | |
9f95a23c | 69 | |
f67539c2 | 70 | bool use_ioring = cct->_conf.get_val<bool>("bdev_ioring"); |
9f95a23c TL |
71 | unsigned int iodepth = cct->_conf->bdev_aio_max_queue_depth; |
72 | ||
73 | if (use_ioring && ioring_queue_t::supported()) { | |
f67539c2 TL |
74 | bool use_ioring_hipri = cct->_conf.get_val<bool>("bdev_ioring_hipri"); |
75 | bool use_ioring_sqthread_poll = cct->_conf.get_val<bool>("bdev_ioring_sqthread_poll"); | |
76 | io_queue = std::make_unique<ioring_queue_t>(iodepth, use_ioring_hipri, use_ioring_sqthread_poll); | |
9f95a23c TL |
77 | } else { |
78 | static bool once; | |
79 | if (use_ioring && !once) { | |
80 | derr << "WARNING: io_uring API is not supported! Fallback to libaio!" | |
81 | << dendl; | |
82 | once = true; | |
83 | } | |
84 | io_queue = std::make_unique<aio_queue_t>(iodepth); | |
85 | } | |
7c673cae FG |
86 | } |
87 | ||
88 | int KernelDevice::_lock() | |
89 | { | |
11fdf7f2 | 90 | dout(10) << __func__ << " " << fd_directs[WRITE_LIFE_NOT_SET] << dendl; |
adb31ebb TL |
91 | // When the block changes, systemd-udevd will open the block, |
92 | // read some information and close it. Then a failure occurs here. | |
93 | // So we need to try again here. | |
f67539c2 TL |
94 | int fd = fd_directs[WRITE_LIFE_NOT_SET]; |
95 | uint64_t nr_tries = 0; | |
96 | for (;;) { | |
97 | struct flock fl = { F_WRLCK, | |
98 | SEEK_SET }; | |
99 | int r = ::fcntl(fd, F_OFD_SETLK, &fl); | |
100 | if (r < 0) { | |
101 | if (errno == EINVAL) { | |
102 | r = ::flock(fd, LOCK_EX | LOCK_NB); | |
103 | } | |
104 | } | |
105 | if (r == 0) { | |
adb31ebb TL |
106 | return 0; |
107 | } | |
f67539c2 TL |
108 | if (errno != EAGAIN) { |
109 | return -errno; | |
110 | } | |
111 | dout(1) << __func__ << " flock busy on " << path << dendl; | |
112 | if (const uint64_t max_retry = | |
113 | cct->_conf.get_val<uint64_t>("bdev_flock_retry"); | |
114 | max_retry > 0 && nr_tries++ == max_retry) { | |
115 | return -EAGAIN; | |
116 | } | |
117 | double retry_interval = | |
118 | cct->_conf.get_val<double>("bdev_flock_retry_interval"); | |
119 | std::this_thread::sleep_for(ceph::make_timespan(retry_interval)); | |
11fdf7f2 | 120 | } |
7c673cae FG |
121 | } |
122 | ||
123 | int KernelDevice::open(const string& p) | |
124 | { | |
125 | path = p; | |
11fdf7f2 | 126 | int r = 0, i = 0; |
7c673cae FG |
127 | dout(1) << __func__ << " path " << path << dendl; |
128 | ||
11fdf7f2 TL |
129 | for (i = 0; i < WRITE_LIFE_MAX; i++) { |
130 | int fd = ::open(path.c_str(), O_RDWR | O_DIRECT); | |
131 | if (fd < 0) { | |
132 | r = -errno; | |
133 | break; | |
134 | } | |
135 | fd_directs[i] = fd; | |
136 | ||
137 | fd = ::open(path.c_str(), O_RDWR | O_CLOEXEC); | |
138 | if (fd < 0) { | |
139 | r = -errno; | |
140 | break; | |
141 | } | |
142 | fd_buffereds[i] = fd; | |
7c673cae | 143 | } |
11fdf7f2 TL |
144 | |
145 | if (i != WRITE_LIFE_MAX) { | |
7c673cae | 146 | derr << __func__ << " open got: " << cpp_strerror(r) << dendl; |
11fdf7f2 TL |
147 | goto out_fail; |
148 | } | |
149 | ||
150 | #if defined(F_SET_FILE_RW_HINT) | |
151 | for (i = WRITE_LIFE_NONE; i < WRITE_LIFE_MAX; i++) { | |
152 | if (fcntl(fd_directs[i], F_SET_FILE_RW_HINT, &i) < 0) { | |
153 | r = -errno; | |
154 | break; | |
155 | } | |
156 | if (fcntl(fd_buffereds[i], F_SET_FILE_RW_HINT, &i) < 0) { | |
157 | r = -errno; | |
158 | break; | |
159 | } | |
7c673cae | 160 | } |
11fdf7f2 TL |
161 | if (i != WRITE_LIFE_MAX) { |
162 | enable_wrt = false; | |
163 | dout(0) << "ioctl(F_SET_FILE_RW_HINT) on " << path << " failed: " << cpp_strerror(r) << dendl; | |
164 | } | |
165 | #endif | |
166 | ||
7c673cae FG |
167 | dio = true; |
168 | aio = cct->_conf->bdev_aio; | |
169 | if (!aio) { | |
11fdf7f2 | 170 | ceph_abort_msg("non-aio not supported"); |
7c673cae FG |
171 | } |
172 | ||
173 | // disable readahead as it will wreak havoc on our mix of | |
174 | // directio/aio and buffered io. | |
11fdf7f2 | 175 | r = posix_fadvise(fd_buffereds[WRITE_LIFE_NOT_SET], 0, 0, POSIX_FADV_RANDOM); |
7c673cae FG |
176 | if (r) { |
177 | r = -r; | |
9f95a23c | 178 | derr << __func__ << " posix_fadvise got: " << cpp_strerror(r) << dendl; |
7c673cae FG |
179 | goto out_fail; |
180 | } | |
181 | ||
11fdf7f2 TL |
182 | if (lock_exclusive) { |
183 | r = _lock(); | |
184 | if (r < 0) { | |
185 | derr << __func__ << " failed to lock " << path << ": " << cpp_strerror(r) | |
186 | << dendl; | |
187 | goto out_fail; | |
188 | } | |
7c673cae FG |
189 | } |
190 | ||
191 | struct stat st; | |
11fdf7f2 | 192 | r = ::fstat(fd_directs[WRITE_LIFE_NOT_SET], &st); |
7c673cae FG |
193 | if (r < 0) { |
194 | r = -errno; | |
195 | derr << __func__ << " fstat got " << cpp_strerror(r) << dendl; | |
196 | goto out_fail; | |
197 | } | |
198 | ||
199 | // Operate as though the block size is 4 KB. The backing file | |
200 | // blksize doesn't strictly matter except that some file systems may | |
201 | // require a read/modify/write if we write something smaller than | |
202 | // it. | |
203 | block_size = cct->_conf->bdev_block_size; | |
204 | if (block_size != (unsigned)st.st_blksize) { | |
205 | dout(1) << __func__ << " backing device/file reports st_blksize " | |
206 | << st.st_blksize << ", using bdev_block_size " | |
207 | << block_size << " anyway" << dendl; | |
208 | } | |
209 | ||
7c673cae FG |
210 | |
211 | { | |
11fdf7f2 TL |
212 | BlkDev blkdev_direct(fd_directs[WRITE_LIFE_NOT_SET]); |
213 | BlkDev blkdev_buffered(fd_buffereds[WRITE_LIFE_NOT_SET]); | |
214 | ||
215 | if (S_ISBLK(st.st_mode)) { | |
216 | int64_t s; | |
217 | r = blkdev_direct.get_size(&s); | |
218 | if (r < 0) { | |
219 | goto out_fail; | |
220 | } | |
221 | size = s; | |
222 | } else { | |
223 | size = st.st_size; | |
224 | } | |
225 | ||
7c673cae | 226 | char partition[PATH_MAX], devname[PATH_MAX]; |
11fdf7f2 TL |
227 | if ((r = blkdev_buffered.partition(partition, PATH_MAX)) || |
228 | (r = blkdev_buffered.wholedisk(devname, PATH_MAX))) { | |
7c673cae | 229 | derr << "unable to get device name for " << path << ": " |
11fdf7f2 | 230 | << cpp_strerror(r) << dendl; |
7c673cae FG |
231 | rotational = true; |
232 | } else { | |
233 | dout(20) << __func__ << " devname " << devname << dendl; | |
11fdf7f2 TL |
234 | rotational = blkdev_buffered.is_rotational(); |
235 | support_discard = blkdev_buffered.support_discard(); | |
236 | this->devname = devname; | |
237 | _detect_vdo(); | |
7c673cae FG |
238 | } |
239 | } | |
240 | ||
31f18b77 FG |
241 | r = _aio_start(); |
242 | if (r < 0) { | |
243 | goto out_fail; | |
244 | } | |
11fdf7f2 | 245 | _discard_start(); |
7c673cae FG |
246 | |
247 | // round size down to an even block | |
248 | size &= ~(block_size - 1); | |
249 | ||
7c673cae FG |
250 | dout(1) << __func__ |
251 | << " size " << size | |
252 | << " (0x" << std::hex << size << std::dec << ", " | |
1adf2230 | 253 | << byte_u_t(size) << ")" |
7c673cae | 254 | << " block_size " << block_size |
1adf2230 | 255 | << " (" << byte_u_t(block_size) << ")" |
7c673cae | 256 | << " " << (rotational ? "rotational" : "non-rotational") |
11fdf7f2 | 257 | << " discard " << (support_discard ? "supported" : "not supported") |
7c673cae FG |
258 | << dendl; |
259 | return 0; | |
260 | ||
11fdf7f2 TL |
261 | out_fail: |
262 | for (i = 0; i < WRITE_LIFE_MAX; i++) { | |
263 | if (fd_directs[i] >= 0) { | |
264 | VOID_TEMP_FAILURE_RETRY(::close(fd_directs[i])); | |
265 | fd_directs[i] = -1; | |
266 | } else { | |
267 | break; | |
268 | } | |
269 | if (fd_buffereds[i] >= 0) { | |
270 | VOID_TEMP_FAILURE_RETRY(::close(fd_buffereds[i])); | |
271 | fd_buffereds[i] = -1; | |
272 | } else { | |
273 | break; | |
274 | } | |
275 | } | |
7c673cae FG |
276 | return r; |
277 | } | |
278 | ||
9f95a23c | 279 | int KernelDevice::get_devices(std::set<std::string> *ls) const |
11fdf7f2 TL |
280 | { |
281 | if (devname.empty()) { | |
282 | return 0; | |
283 | } | |
284 | get_raw_devices(devname, ls); | |
285 | return 0; | |
286 | } | |
287 | ||
7c673cae FG |
288 | void KernelDevice::close() |
289 | { | |
290 | dout(1) << __func__ << dendl; | |
291 | _aio_stop(); | |
11fdf7f2 | 292 | _discard_stop(); |
7c673cae | 293 | |
11fdf7f2 TL |
294 | if (vdo_fd >= 0) { |
295 | VOID_TEMP_FAILURE_RETRY(::close(vdo_fd)); | |
296 | vdo_fd = -1; | |
297 | } | |
7c673cae | 298 | |
11fdf7f2 TL |
299 | for (int i = 0; i < WRITE_LIFE_MAX; i++) { |
300 | assert(fd_directs[i] >= 0); | |
301 | VOID_TEMP_FAILURE_RETRY(::close(fd_directs[i])); | |
302 | fd_directs[i] = -1; | |
7c673cae | 303 | |
11fdf7f2 TL |
304 | assert(fd_buffereds[i] >= 0); |
305 | VOID_TEMP_FAILURE_RETRY(::close(fd_buffereds[i])); | |
306 | fd_buffereds[i] = -1; | |
307 | } | |
7c673cae FG |
308 | path.clear(); |
309 | } | |
310 | ||
11fdf7f2 | 311 | int KernelDevice::collect_metadata(const string& prefix, map<string,string> *pm) const |
7c673cae | 312 | { |
11fdf7f2 | 313 | (*pm)[prefix + "support_discard"] = stringify((int)(bool)support_discard); |
7c673cae FG |
314 | (*pm)[prefix + "rotational"] = stringify((int)(bool)rotational); |
315 | (*pm)[prefix + "size"] = stringify(get_size()); | |
316 | (*pm)[prefix + "block_size"] = stringify(get_block_size()); | |
317 | (*pm)[prefix + "driver"] = "KernelDevice"; | |
318 | if (rotational) { | |
319 | (*pm)[prefix + "type"] = "hdd"; | |
320 | } else { | |
321 | (*pm)[prefix + "type"] = "ssd"; | |
322 | } | |
11fdf7f2 TL |
323 | if (vdo_fd >= 0) { |
324 | (*pm)[prefix + "vdo"] = "true"; | |
325 | uint64_t total, avail; | |
326 | get_vdo_utilization(vdo_fd, &total, &avail); | |
327 | (*pm)[prefix + "vdo_physical_size"] = stringify(total); | |
328 | } | |
7c673cae | 329 | |
9f95a23c TL |
330 | { |
331 | string res_names; | |
332 | std::set<std::string> devnames; | |
333 | if (get_devices(&devnames) == 0) { | |
334 | for (auto& dev : devnames) { | |
335 | if (!res_names.empty()) { | |
336 | res_names += ","; | |
337 | } | |
338 | res_names += dev; | |
339 | } | |
340 | if (res_names.size()) { | |
341 | (*pm)[prefix + "devices"] = res_names; | |
342 | } | |
343 | } | |
344 | } | |
345 | ||
7c673cae | 346 | struct stat st; |
11fdf7f2 | 347 | int r = ::fstat(fd_buffereds[WRITE_LIFE_NOT_SET], &st); |
7c673cae FG |
348 | if (r < 0) |
349 | return -errno; | |
350 | if (S_ISBLK(st.st_mode)) { | |
351 | (*pm)[prefix + "access_mode"] = "blk"; | |
11fdf7f2 TL |
352 | |
353 | char buffer[1024] = {0}; | |
354 | BlkDev blkdev{fd_buffereds[WRITE_LIFE_NOT_SET]}; | |
355 | if (r = blkdev.partition(buffer, sizeof(buffer)); r) { | |
7c673cae | 356 | (*pm)[prefix + "partition_path"] = "unknown"; |
11fdf7f2 TL |
357 | } else { |
358 | (*pm)[prefix + "partition_path"] = buffer; | |
359 | } | |
360 | buffer[0] = '\0'; | |
361 | if (r = blkdev.partition(buffer, sizeof(buffer)); r) { | |
7c673cae | 362 | (*pm)[prefix + "dev_node"] = "unknown"; |
11fdf7f2 TL |
363 | } else { |
364 | (*pm)[prefix + "dev_node"] = buffer; | |
365 | } | |
366 | if (!r) { | |
367 | return 0; | |
368 | } | |
369 | buffer[0] = '\0'; | |
370 | blkdev.model(buffer, sizeof(buffer)); | |
371 | (*pm)[prefix + "model"] = buffer; | |
372 | ||
373 | buffer[0] = '\0'; | |
374 | blkdev.dev(buffer, sizeof(buffer)); | |
375 | (*pm)[prefix + "dev"] = buffer; | |
376 | ||
377 | // nvme exposes a serial number | |
378 | buffer[0] = '\0'; | |
379 | blkdev.serial(buffer, sizeof(buffer)); | |
380 | (*pm)[prefix + "serial"] = buffer; | |
381 | ||
11fdf7f2 TL |
382 | // numa |
383 | int node; | |
384 | r = blkdev.get_numa_node(&node); | |
385 | if (r >= 0) { | |
386 | (*pm)[prefix + "numa_node"] = stringify(node); | |
7c673cae FG |
387 | } |
388 | } else { | |
389 | (*pm)[prefix + "access_mode"] = "file"; | |
390 | (*pm)[prefix + "path"] = path; | |
391 | } | |
392 | return 0; | |
393 | } | |
394 | ||
11fdf7f2 TL |
395 | void KernelDevice::_detect_vdo() |
396 | { | |
397 | vdo_fd = get_vdo_stats_handle(devname.c_str(), &vdo_name); | |
398 | if (vdo_fd >= 0) { | |
399 | dout(1) << __func__ << " VDO volume " << vdo_name | |
400 | << " maps to " << devname << dendl; | |
401 | } else { | |
402 | dout(20) << __func__ << " no VDO volume maps to " << devname << dendl; | |
403 | } | |
404 | return; | |
405 | } | |
406 | ||
407 | bool KernelDevice::get_thin_utilization(uint64_t *total, uint64_t *avail) const | |
408 | { | |
409 | if (vdo_fd < 0) { | |
410 | return false; | |
411 | } | |
412 | return get_vdo_utilization(vdo_fd, total, avail); | |
413 | } | |
414 | ||
415 | int KernelDevice::choose_fd(bool buffered, int write_hint) const | |
416 | { | |
417 | assert(write_hint >= WRITE_LIFE_NOT_SET && write_hint < WRITE_LIFE_MAX); | |
418 | if (!enable_wrt) | |
419 | write_hint = WRITE_LIFE_NOT_SET; | |
420 | return buffered ? fd_buffereds[write_hint] : fd_directs[write_hint]; | |
421 | } | |
422 | ||
7c673cae FG |
423 | int KernelDevice::flush() |
424 | { | |
31f18b77 | 425 | // protect flush with a mutex. note that we are not really protecting |
7c673cae FG |
426 | // data here. instead, we're ensuring that if any flush() caller |
427 | // sees that io_since_flush is true, they block any racing callers | |
428 | // until the flush is observed. that allows racing threads to be | |
429 | // calling flush while still ensuring that *any* of them that got an | |
430 | // aio completion notification will not return before that aio is | |
431 | // stable on disk: whichever thread sees the flag first will block | |
432 | // followers until the aio is stable. | |
11fdf7f2 | 433 | std::lock_guard l(flush_mutex); |
7c673cae FG |
434 | |
435 | bool expect = true; | |
436 | if (!io_since_flush.compare_exchange_strong(expect, false)) { | |
437 | dout(10) << __func__ << " no-op (no ios since last flush), flag is " | |
438 | << (int)io_since_flush.load() << dendl; | |
439 | return 0; | |
440 | } | |
441 | ||
442 | dout(10) << __func__ << " start" << dendl; | |
443 | if (cct->_conf->bdev_inject_crash) { | |
444 | ++injecting_crash; | |
445 | // sleep for a moment to give other threads a chance to submit or | |
446 | // wait on io that races with a flush. | |
447 | derr << __func__ << " injecting crash. first we sleep..." << dendl; | |
448 | sleep(cct->_conf->bdev_inject_crash_flush_delay); | |
449 | derr << __func__ << " and now we die" << dendl; | |
450 | cct->_log->flush(); | |
451 | _exit(1); | |
452 | } | |
453 | utime_t start = ceph_clock_now(); | |
11fdf7f2 | 454 | int r = ::fdatasync(fd_directs[WRITE_LIFE_NOT_SET]); |
7c673cae FG |
455 | utime_t end = ceph_clock_now(); |
456 | utime_t dur = end - start; | |
457 | if (r < 0) { | |
458 | r = -errno; | |
459 | derr << __func__ << " fdatasync got: " << cpp_strerror(r) << dendl; | |
460 | ceph_abort(); | |
461 | } | |
462 | dout(5) << __func__ << " in " << dur << dendl;; | |
463 | return r; | |
464 | } | |
465 | ||
466 | int KernelDevice::_aio_start() | |
467 | { | |
468 | if (aio) { | |
469 | dout(10) << __func__ << dendl; | |
9f95a23c | 470 | int r = io_queue->init(fd_directs); |
7c673cae | 471 | if (r < 0) { |
31f18b77 FG |
472 | if (r == -EAGAIN) { |
473 | derr << __func__ << " io_setup(2) failed with EAGAIN; " | |
474 | << "try increasing /proc/sys/fs/aio-max-nr" << dendl; | |
475 | } else { | |
476 | derr << __func__ << " io_setup(2) failed: " << cpp_strerror(r) << dendl; | |
477 | } | |
7c673cae FG |
478 | return r; |
479 | } | |
480 | aio_thread.create("bstore_aio"); | |
481 | } | |
482 | return 0; | |
483 | } | |
484 | ||
485 | void KernelDevice::_aio_stop() | |
486 | { | |
487 | if (aio) { | |
488 | dout(10) << __func__ << dendl; | |
489 | aio_stop = true; | |
490 | aio_thread.join(); | |
491 | aio_stop = false; | |
9f95a23c | 492 | io_queue->shutdown(); |
7c673cae FG |
493 | } |
494 | } | |
495 | ||
11fdf7f2 TL |
496 | int KernelDevice::_discard_start() |
497 | { | |
498 | discard_thread.create("bstore_discard"); | |
499 | return 0; | |
500 | } | |
501 | ||
502 | void KernelDevice::_discard_stop() | |
503 | { | |
504 | dout(10) << __func__ << dendl; | |
505 | { | |
506 | std::unique_lock l(discard_lock); | |
507 | while (!discard_started) { | |
508 | discard_cond.wait(l); | |
509 | } | |
510 | discard_stop = true; | |
511 | discard_cond.notify_all(); | |
512 | } | |
513 | discard_thread.join(); | |
514 | { | |
515 | std::lock_guard l(discard_lock); | |
516 | discard_stop = false; | |
517 | } | |
518 | dout(10) << __func__ << " stopped" << dendl; | |
519 | } | |
520 | ||
521 | void KernelDevice::discard_drain() | |
522 | { | |
523 | dout(10) << __func__ << dendl; | |
524 | std::unique_lock l(discard_lock); | |
525 | while (!discard_queued.empty() || discard_running) { | |
526 | discard_cond.wait(l); | |
527 | } | |
528 | } | |
529 | ||
28e407b8 AA |
/// Tell whether @p r is one of the negative errno values this file treats
/// as an "expected" I/O error.
///
/// @param r negative errno returned by an aio
/// @return true when @p r is in the list below
static bool is_expected_ioerr(const int r)
{
  // https://lxr.missinglinkelectronics.com/linux+v4.15/block/blk-core.c#L135
  static const int expected[] = {
    -EOPNOTSUPP, -ETIMEDOUT, -ENOSPC,
    -ENOLINK, -EREMOTEIO, -EAGAIN, -EIO,
    -ENODATA, -EILSEQ, -ENOMEM,
#if defined(__linux__)
    -EREMCHG, -EBADE,
#elif defined(__FreeBSD__)
    - BSM_ERRNO_EREMCHG, -BSM_ERRNO_EBADE,
#endif
  };
  for (const int code : expected) {
    if (r == code) {
      return true;
    }
  }
  return false;
}
543 | ||
7c673cae FG |
544 | void KernelDevice::_aio_thread() |
545 | { | |
546 | dout(10) << __func__ << " start" << dendl; | |
547 | int inject_crash_count = 0; | |
548 | while (!aio_stop) { | |
549 | dout(40) << __func__ << " polling" << dendl; | |
224ce89b | 550 | int max = cct->_conf->bdev_aio_reap_max; |
7c673cae | 551 | aio_t *aio[max]; |
9f95a23c | 552 | int r = io_queue->get_next_completed(cct->_conf->bdev_aio_poll_ms, |
7c673cae FG |
553 | aio, max); |
554 | if (r < 0) { | |
555 | derr << __func__ << " got " << cpp_strerror(r) << dendl; | |
11fdf7f2 | 556 | ceph_abort_msg("got unexpected error from io_getevents"); |
7c673cae FG |
557 | } |
558 | if (r > 0) { | |
559 | dout(30) << __func__ << " got " << r << " completed aios" << dendl; | |
560 | for (int i = 0; i < r; ++i) { | |
561 | IOContext *ioc = static_cast<IOContext*>(aio[i]->priv); | |
562 | _aio_log_finish(ioc, aio[i]->offset, aio[i]->length); | |
563 | if (aio[i]->queue_item.is_linked()) { | |
11fdf7f2 | 564 | std::lock_guard l(debug_queue_lock); |
7c673cae FG |
565 | debug_aio_unlink(*aio[i]); |
566 | } | |
567 | ||
568 | // set flag indicating new ios have completed. we do this *before* | |
569 | // any completion or notifications so that any user flush() that | |
570 | // follows the observed io completion will include this io. Note | |
571 | // that an earlier, racing flush() could observe and clear this | |
572 | // flag, but that also ensures that the IO will be stable before the | |
573 | // later flush() occurs. | |
574 | io_since_flush.store(true); | |
575 | ||
94b18763 | 576 | long r = aio[i]->get_return_value(); |
b32b8144 | 577 | if (r < 0) { |
28e407b8 AA |
578 | derr << __func__ << " got r=" << r << " (" << cpp_strerror(r) << ")" |
579 | << dendl; | |
580 | if (ioc->allow_eio && is_expected_ioerr(r)) { | |
581 | derr << __func__ << " translating the error to EIO for upper layer" | |
582 | << dendl; | |
583 | ioc->set_return_value(-EIO); | |
b32b8144 | 584 | } else { |
11fdf7f2 TL |
585 | if (is_expected_ioerr(r)) { |
586 | note_io_error_event( | |
587 | devname.c_str(), | |
588 | path.c_str(), | |
589 | r, | |
81eedcae TL |
590 | #if defined(HAVE_POSIXAIO) |
591 | aio[i]->aio.aiocb.aio_lio_opcode, | |
592 | #else | |
593 | aio[i]->iocb.aio_lio_opcode, | |
594 | #endif | |
11fdf7f2 TL |
595 | aio[i]->offset, |
596 | aio[i]->length); | |
597 | ceph_abort_msg( | |
598 | "Unexpected IO error. " | |
599 | "This may suggest a hardware issue. " | |
600 | "Please check your kernel log!"); | |
601 | } | |
602 | ceph_abort_msg( | |
603 | "Unexpected IO error. " | |
604 | "This may suggest HW issue. Please check your dmesg!"); | |
b32b8144 FG |
605 | } |
606 | } else if (aio[i]->length != (uint64_t)r) { | |
eafe8130 TL |
607 | derr << "aio to 0x" << std::hex << aio[i]->offset |
608 | << "~" << aio[i]->length << std::dec | |
b32b8144 | 609 | << " but returned: " << r << dendl; |
11fdf7f2 | 610 | ceph_abort_msg("unexpected aio return value: does not match length"); |
b32b8144 FG |
611 | } |
612 | ||
613 | dout(10) << __func__ << " finished aio " << aio[i] << " r " << r | |
614 | << " ioc " << ioc | |
615 | << " with " << (ioc->num_running.load() - 1) | |
616 | << " aios left" << dendl; | |
7c673cae FG |
617 | |
618 | // NOTE: once num_running and we either call the callback or | |
619 | // call aio_wake we cannot touch ioc or aio[] as the caller | |
620 | // may free it. | |
621 | if (ioc->priv) { | |
622 | if (--ioc->num_running == 0) { | |
623 | aio_callback(aio_callback_priv, ioc->priv); | |
624 | } | |
625 | } else { | |
31f18b77 | 626 | ioc->try_aio_wake(); |
7c673cae FG |
627 | } |
628 | } | |
629 | } | |
630 | if (cct->_conf->bdev_debug_aio) { | |
631 | utime_t now = ceph_clock_now(); | |
11fdf7f2 | 632 | std::lock_guard l(debug_queue_lock); |
7c673cae FG |
633 | if (debug_oldest) { |
634 | if (debug_stall_since == utime_t()) { | |
635 | debug_stall_since = now; | |
636 | } else { | |
11fdf7f2 TL |
637 | if (cct->_conf->bdev_debug_aio_suicide_timeout) { |
638 | utime_t cutoff = now; | |
639 | cutoff -= cct->_conf->bdev_debug_aio_suicide_timeout; | |
640 | if (debug_stall_since < cutoff) { | |
641 | derr << __func__ << " stalled aio " << debug_oldest | |
642 | << " since " << debug_stall_since << ", timeout is " | |
643 | << cct->_conf->bdev_debug_aio_suicide_timeout | |
644 | << "s, suicide" << dendl; | |
645 | ceph_abort_msg("stalled aio... buggy kernel or bad device?"); | |
646 | } | |
7c673cae FG |
647 | } |
648 | } | |
649 | } | |
650 | } | |
651 | reap_ioc(); | |
652 | if (cct->_conf->bdev_inject_crash) { | |
653 | ++inject_crash_count; | |
654 | if (inject_crash_count * cct->_conf->bdev_aio_poll_ms / 1000 > | |
655 | cct->_conf->bdev_inject_crash + cct->_conf->bdev_inject_crash_flush_delay) { | |
656 | derr << __func__ << " bdev_inject_crash trigger from aio thread" | |
657 | << dendl; | |
658 | cct->_log->flush(); | |
659 | _exit(1); | |
660 | } | |
661 | } | |
662 | } | |
663 | reap_ioc(); | |
664 | dout(10) << __func__ << " end" << dendl; | |
665 | } | |
666 | ||
11fdf7f2 TL |
667 | void KernelDevice::_discard_thread() |
668 | { | |
669 | std::unique_lock l(discard_lock); | |
670 | ceph_assert(!discard_started); | |
671 | discard_started = true; | |
672 | discard_cond.notify_all(); | |
673 | while (true) { | |
674 | ceph_assert(discard_finishing.empty()); | |
675 | if (discard_queued.empty()) { | |
676 | if (discard_stop) | |
677 | break; | |
678 | dout(20) << __func__ << " sleep" << dendl; | |
679 | discard_cond.notify_all(); // for the thread trying to drain... | |
680 | discard_cond.wait(l); | |
681 | dout(20) << __func__ << " wake" << dendl; | |
682 | } else { | |
683 | discard_finishing.swap(discard_queued); | |
684 | discard_running = true; | |
685 | l.unlock(); | |
686 | dout(20) << __func__ << " finishing" << dendl; | |
687 | for (auto p = discard_finishing.begin();p != discard_finishing.end(); ++p) { | |
688 | discard(p.get_start(), p.get_len()); | |
689 | } | |
690 | ||
691 | discard_callback(discard_callback_priv, static_cast<void*>(&discard_finishing)); | |
692 | discard_finishing.clear(); | |
693 | l.lock(); | |
694 | discard_running = false; | |
695 | } | |
696 | } | |
697 | dout(10) << __func__ << " finish" << dendl; | |
698 | discard_started = false; | |
699 | } | |
700 | ||
701 | int KernelDevice::queue_discard(interval_set<uint64_t> &to_release) | |
702 | { | |
703 | if (!support_discard) | |
704 | return -1; | |
705 | ||
706 | if (to_release.empty()) | |
707 | return 0; | |
708 | ||
709 | std::lock_guard l(discard_lock); | |
710 | discard_queued.insert(to_release); | |
711 | discard_cond.notify_all(); | |
712 | return 0; | |
713 | } | |
714 | ||
7c673cae FG |
715 | void KernelDevice::_aio_log_start( |
716 | IOContext *ioc, | |
717 | uint64_t offset, | |
718 | uint64_t length) | |
719 | { | |
720 | dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length | |
721 | << std::dec << dendl; | |
722 | if (cct->_conf->bdev_debug_inflight_ios) { | |
11fdf7f2 | 723 | std::lock_guard l(debug_lock); |
7c673cae FG |
724 | if (debug_inflight.intersects(offset, length)) { |
725 | derr << __func__ << " inflight overlap of 0x" | |
726 | << std::hex | |
727 | << offset << "~" << length << std::dec | |
728 | << " with " << debug_inflight << dendl; | |
729 | ceph_abort(); | |
730 | } | |
731 | debug_inflight.insert(offset, length); | |
732 | } | |
733 | } | |
734 | ||
735 | void KernelDevice::debug_aio_link(aio_t& aio) | |
736 | { | |
737 | if (debug_queue.empty()) { | |
738 | debug_oldest = &aio; | |
739 | } | |
740 | debug_queue.push_back(aio); | |
741 | } | |
742 | ||
743 | void KernelDevice::debug_aio_unlink(aio_t& aio) | |
744 | { | |
745 | if (aio.queue_item.is_linked()) { | |
746 | debug_queue.erase(debug_queue.iterator_to(aio)); | |
747 | if (debug_oldest == &aio) { | |
11fdf7f2 TL |
748 | auto age = cct->_conf->bdev_debug_aio_log_age; |
749 | if (age && debug_stall_since != utime_t()) { | |
750 | utime_t cutoff = ceph_clock_now(); | |
751 | cutoff -= age; | |
752 | if (debug_stall_since < cutoff) { | |
753 | derr << __func__ << " stalled aio " << debug_oldest | |
754 | << " since " << debug_stall_since << ", timeout is " | |
755 | << age | |
756 | << "s" << dendl; | |
757 | } | |
758 | } | |
759 | ||
7c673cae FG |
760 | if (debug_queue.empty()) { |
761 | debug_oldest = nullptr; | |
762 | } else { | |
763 | debug_oldest = &debug_queue.front(); | |
764 | } | |
765 | debug_stall_since = utime_t(); | |
766 | } | |
767 | } | |
768 | } | |
769 | ||
770 | void KernelDevice::_aio_log_finish( | |
771 | IOContext *ioc, | |
772 | uint64_t offset, | |
773 | uint64_t length) | |
774 | { | |
775 | dout(20) << __func__ << " " << aio << " 0x" | |
776 | << std::hex << offset << "~" << length << std::dec << dendl; | |
777 | if (cct->_conf->bdev_debug_inflight_ios) { | |
11fdf7f2 | 778 | std::lock_guard l(debug_lock); |
7c673cae FG |
779 | debug_inflight.erase(offset, length); |
780 | } | |
781 | } | |
782 | ||
783 | void KernelDevice::aio_submit(IOContext *ioc) | |
784 | { | |
785 | dout(20) << __func__ << " ioc " << ioc | |
786 | << " pending " << ioc->num_pending.load() | |
787 | << " running " << ioc->num_running.load() | |
788 | << dendl; | |
224ce89b | 789 | |
7c673cae FG |
790 | if (ioc->num_pending.load() == 0) { |
791 | return; | |
792 | } | |
224ce89b | 793 | |
7c673cae FG |
794 | // move these aside, and get our end iterator position now, as the |
795 | // aios might complete as soon as they are submitted and queue more | |
796 | // wal aio's. | |
797 | list<aio_t>::iterator e = ioc->running_aios.begin(); | |
798 | ioc->running_aios.splice(e, ioc->pending_aios); | |
7c673cae FG |
799 | |
800 | int pending = ioc->num_pending.load(); | |
801 | ioc->num_running += pending; | |
802 | ioc->num_pending -= pending; | |
11fdf7f2 TL |
803 | ceph_assert(ioc->num_pending.load() == 0); // we should be only thread doing this |
804 | ceph_assert(ioc->pending_aios.size() == 0); | |
805 | ||
224ce89b WB |
806 | if (cct->_conf->bdev_debug_aio) { |
807 | list<aio_t>::iterator p = ioc->running_aios.begin(); | |
808 | while (p != e) { | |
11fdf7f2 TL |
809 | dout(30) << __func__ << " " << *p << dendl; |
810 | std::lock_guard l(debug_queue_lock); | |
224ce89b | 811 | debug_aio_link(*p++); |
7c673cae FG |
812 | } |
813 | } | |
224ce89b WB |
814 | |
815 | void *priv = static_cast<void*>(ioc); | |
816 | int r, retries = 0; | |
f67539c2 TL |
817 | // num of pending aios should not overflow when passed to submit_batch() |
818 | assert(pending <= std::numeric_limits<uint16_t>::max()); | |
9f95a23c | 819 | r = io_queue->submit_batch(ioc->running_aios.begin(), e, |
11fdf7f2 TL |
820 | pending, priv, &retries); |
821 | ||
224ce89b WB |
822 | if (retries) |
823 | derr << __func__ << " retries " << retries << dendl; | |
824 | if (r < 0) { | |
825 | derr << " aio submit got " << cpp_strerror(r) << dendl; | |
11fdf7f2 | 826 | ceph_assert(r == 0); |
224ce89b | 827 | } |
7c673cae FG |
828 | } |
829 | ||
11fdf7f2 | 830 | int KernelDevice::_sync_write(uint64_t off, bufferlist &bl, bool buffered, int write_hint) |
7c673cae FG |
831 | { |
832 | uint64_t len = bl.length(); | |
833 | dout(5) << __func__ << " 0x" << std::hex << off << "~" << len | |
11fdf7f2 | 834 | << std::dec << (buffered ? " (buffered)" : " (direct)") << dendl; |
7c673cae FG |
835 | if (cct->_conf->bdev_inject_crash && |
836 | rand() % cct->_conf->bdev_inject_crash == 0) { | |
837 | derr << __func__ << " bdev_inject_crash: dropping io 0x" << std::hex | |
838 | << off << "~" << len << std::dec << dendl; | |
839 | ++injecting_crash; | |
840 | return 0; | |
841 | } | |
842 | vector<iovec> iov; | |
843 | bl.prepare_iov(&iov); | |
7c673cae | 844 | |
e306af50 TL |
845 | auto left = len; |
846 | auto o = off; | |
847 | size_t idx = 0; | |
848 | do { | |
849 | auto r = ::pwritev(choose_fd(buffered, write_hint), | |
850 | &iov[idx], iov.size() - idx, o); | |
851 | ||
852 | if (r < 0) { | |
853 | r = -errno; | |
854 | derr << __func__ << " pwritev error: " << cpp_strerror(r) << dendl; | |
855 | return r; | |
856 | } | |
857 | o += r; | |
858 | left -= r; | |
859 | if (left) { | |
860 | // skip fully processed IOVs | |
861 | while (idx < iov.size() && (size_t)r >= iov[idx].iov_len) { | |
862 | r -= iov[idx++].iov_len; | |
863 | } | |
864 | // update partially processed one if any | |
865 | if (r) { | |
866 | ceph_assert(idx < iov.size()); | |
867 | ceph_assert((size_t)r < iov[idx].iov_len); | |
868 | iov[idx].iov_base = static_cast<char*>(iov[idx].iov_base) + r; | |
869 | iov[idx].iov_len -= r; | |
870 | r = 0; | |
871 | } | |
872 | ceph_assert(r == 0); | |
873 | } | |
874 | } while (left); | |
875 | ||
11fdf7f2 | 876 | #ifdef HAVE_SYNC_FILE_RANGE |
7c673cae | 877 | if (buffered) { |
494da23a | 878 | // initiate IO and wait till it completes |
e306af50 | 879 | auto r = ::sync_file_range(fd_buffereds[WRITE_LIFE_NOT_SET], off, len, SYNC_FILE_RANGE_WRITE|SYNC_FILE_RANGE_WAIT_AFTER|SYNC_FILE_RANGE_WAIT_BEFORE); |
7c673cae FG |
880 | if (r < 0) { |
881 | r = -errno; | |
882 | derr << __func__ << " sync_file_range error: " << cpp_strerror(r) << dendl; | |
883 | return r; | |
884 | } | |
885 | } | |
11fdf7f2 | 886 | #endif |
31f18b77 FG |
887 | |
888 | io_since_flush.store(true); | |
889 | ||
7c673cae FG |
890 | return 0; |
891 | } | |
892 | ||
893 | int KernelDevice::write( | |
894 | uint64_t off, | |
895 | bufferlist &bl, | |
11fdf7f2 TL |
896 | bool buffered, |
897 | int write_hint) | |
7c673cae FG |
898 | { |
899 | uint64_t len = bl.length(); | |
900 | dout(20) << __func__ << " 0x" << std::hex << off << "~" << len << std::dec | |
901 | << (buffered ? " (buffered)" : " (direct)") | |
902 | << dendl; | |
11fdf7f2 | 903 | ceph_assert(is_valid_io(off, len)); |
eafe8130 TL |
904 | if (cct->_conf->objectstore_blackhole) { |
905 | lderr(cct) << __func__ << " objectstore_blackhole=true, throwing out IO" | |
906 | << dendl; | |
907 | return 0; | |
908 | } | |
7c673cae FG |
909 | |
910 | if ((!buffered || bl.get_num_buffers() >= IOV_MAX) && | |
b32b8144 | 911 | bl.rebuild_aligned_size_and_memory(block_size, block_size, IOV_MAX)) { |
7c673cae FG |
912 | dout(20) << __func__ << " rebuilding buffer to be aligned" << dendl; |
913 | } | |
914 | dout(40) << "data: "; | |
915 | bl.hexdump(*_dout); | |
916 | *_dout << dendl; | |
917 | ||
11fdf7f2 | 918 | return _sync_write(off, bl, buffered, write_hint); |
7c673cae FG |
919 | } |
920 | ||
921 | int KernelDevice::aio_write( | |
922 | uint64_t off, | |
923 | bufferlist &bl, | |
924 | IOContext *ioc, | |
11fdf7f2 TL |
925 | bool buffered, |
926 | int write_hint) | |
7c673cae FG |
927 | { |
928 | uint64_t len = bl.length(); | |
929 | dout(20) << __func__ << " 0x" << std::hex << off << "~" << len << std::dec | |
930 | << (buffered ? " (buffered)" : " (direct)") | |
931 | << dendl; | |
11fdf7f2 | 932 | ceph_assert(is_valid_io(off, len)); |
eafe8130 TL |
933 | if (cct->_conf->objectstore_blackhole) { |
934 | lderr(cct) << __func__ << " objectstore_blackhole=true, throwing out IO" | |
935 | << dendl; | |
936 | return 0; | |
937 | } | |
7c673cae FG |
938 | |
939 | if ((!buffered || bl.get_num_buffers() >= IOV_MAX) && | |
b32b8144 | 940 | bl.rebuild_aligned_size_and_memory(block_size, block_size, IOV_MAX)) { |
7c673cae FG |
941 | dout(20) << __func__ << " rebuilding buffer to be aligned" << dendl; |
942 | } | |
943 | dout(40) << "data: "; | |
944 | bl.hexdump(*_dout); | |
945 | *_dout << dendl; | |
946 | ||
947 | _aio_log_start(ioc, off, len); | |
948 | ||
949 | #ifdef HAVE_LIBAIO | |
950 | if (aio && dio && !buffered) { | |
7c673cae FG |
951 | if (cct->_conf->bdev_inject_crash && |
952 | rand() % cct->_conf->bdev_inject_crash == 0) { | |
953 | derr << __func__ << " bdev_inject_crash: dropping io 0x" << std::hex | |
954 | << off << "~" << len << std::dec | |
955 | << dendl; | |
956 | // generate a real io so that aio_wait behaves properly, but make it | |
957 | // a read instead of write, and toss the result. | |
494da23a TL |
958 | ioc->pending_aios.push_back(aio_t(ioc, choose_fd(false, write_hint))); |
959 | ++ioc->num_pending; | |
960 | auto& aio = ioc->pending_aios.back(); | |
f67539c2 | 961 | bufferptr p = ceph::buffer::create_small_page_aligned(len); |
9f95a23c TL |
962 | aio.bl.append(std::move(p)); |
963 | aio.bl.prepare_iov(&aio.iov); | |
964 | aio.preadv(off, len); | |
7c673cae FG |
965 | ++injecting_crash; |
966 | } else { | |
494da23a TL |
967 | if (bl.length() <= RW_IO_MAX) { |
968 | // fast path (non-huge write) | |
969 | ioc->pending_aios.push_back(aio_t(ioc, choose_fd(false, write_hint))); | |
970 | ++ioc->num_pending; | |
971 | auto& aio = ioc->pending_aios.back(); | |
972 | bl.prepare_iov(&aio.iov); | |
973 | aio.bl.claim_append(bl); | |
974 | aio.pwritev(off, len); | |
975 | dout(30) << aio << dendl; | |
976 | dout(5) << __func__ << " 0x" << std::hex << off << "~" << len | |
977 | << std::dec << " aio " << &aio << dendl; | |
978 | } else { | |
979 | // write in RW_IO_MAX-sized chunks | |
980 | uint64_t prev_len = 0; | |
981 | while (prev_len < bl.length()) { | |
982 | bufferlist tmp; | |
983 | if (prev_len + RW_IO_MAX < bl.length()) { | |
984 | tmp.substr_of(bl, prev_len, RW_IO_MAX); | |
985 | } else { | |
986 | tmp.substr_of(bl, prev_len, bl.length() - prev_len); | |
987 | } | |
988 | auto len = tmp.length(); | |
989 | ioc->pending_aios.push_back(aio_t(ioc, choose_fd(false, write_hint))); | |
990 | ++ioc->num_pending; | |
991 | auto& aio = ioc->pending_aios.back(); | |
992 | tmp.prepare_iov(&aio.iov); | |
993 | aio.bl.claim_append(tmp); | |
994 | aio.pwritev(off + prev_len, len); | |
995 | dout(30) << aio << dendl; | |
996 | dout(5) << __func__ << " 0x" << std::hex << off + prev_len | |
997 | << "~" << len | |
998 | << std::dec << " aio " << &aio << " (piece)" << dendl; | |
999 | prev_len += len; | |
1000 | } | |
1001 | } | |
7c673cae | 1002 | } |
7c673cae FG |
1003 | } else |
1004 | #endif | |
1005 | { | |
11fdf7f2 | 1006 | int r = _sync_write(off, bl, buffered, write_hint); |
7c673cae FG |
1007 | _aio_log_finish(ioc, off, len); |
1008 | if (r < 0) | |
1009 | return r; | |
1010 | } | |
1011 | return 0; | |
1012 | } | |
1013 | ||
11fdf7f2 TL |
1014 | int KernelDevice::discard(uint64_t offset, uint64_t len) |
1015 | { | |
1016 | int r = 0; | |
eafe8130 TL |
1017 | if (cct->_conf->objectstore_blackhole) { |
1018 | lderr(cct) << __func__ << " objectstore_blackhole=true, throwing out IO" | |
1019 | << dendl; | |
1020 | return 0; | |
1021 | } | |
11fdf7f2 TL |
1022 | if (support_discard) { |
1023 | dout(10) << __func__ | |
1024 | << " 0x" << std::hex << offset << "~" << len << std::dec | |
1025 | << dendl; | |
1026 | ||
1027 | r = BlkDev{fd_directs[WRITE_LIFE_NOT_SET]}.discard((int64_t)offset, (int64_t)len); | |
1028 | } | |
1029 | return r; | |
1030 | } | |
1031 | ||
7c673cae FG |
1032 | int KernelDevice::read(uint64_t off, uint64_t len, bufferlist *pbl, |
1033 | IOContext *ioc, | |
1034 | bool buffered) | |
1035 | { | |
1036 | dout(5) << __func__ << " 0x" << std::hex << off << "~" << len << std::dec | |
1037 | << (buffered ? " (buffered)" : " (direct)") | |
1038 | << dendl; | |
11fdf7f2 | 1039 | ceph_assert(is_valid_io(off, len)); |
7c673cae FG |
1040 | |
1041 | _aio_log_start(ioc, off, len); | |
1042 | ||
11fdf7f2 TL |
1043 | auto start1 = mono_clock::now(); |
1044 | ||
f67539c2 | 1045 | auto p = ceph::buffer::ptr_node::create(ceph::buffer::create_small_page_aligned(len)); |
11fdf7f2 TL |
1046 | int r = ::pread(buffered ? fd_buffereds[WRITE_LIFE_NOT_SET] : fd_directs[WRITE_LIFE_NOT_SET], |
1047 | p->c_str(), len, off); | |
1048 | auto age = cct->_conf->bdev_debug_aio_log_age; | |
1049 | if (mono_clock::now() - start1 >= make_timespan(age)) { | |
1050 | derr << __func__ << " stalled read " | |
1051 | << " 0x" << std::hex << off << "~" << len << std::dec | |
1052 | << (buffered ? " (buffered)" : " (direct)") | |
1053 | << " since " << start1 << ", timeout is " | |
1054 | << age | |
1055 | << "s" << dendl; | |
1056 | } | |
1057 | ||
7c673cae | 1058 | if (r < 0) { |
a8e16298 TL |
1059 | if (ioc->allow_eio && is_expected_ioerr(r)) { |
1060 | r = -EIO; | |
1061 | } else { | |
1062 | r = -errno; | |
1063 | } | |
7c673cae FG |
1064 | goto out; |
1065 | } | |
11fdf7f2 | 1066 | ceph_assert((uint64_t)r == len); |
7c673cae FG |
1067 | pbl->push_back(std::move(p)); |
1068 | ||
1069 | dout(40) << "data: "; | |
1070 | pbl->hexdump(*_dout); | |
1071 | *_dout << dendl; | |
1072 | ||
1073 | out: | |
1074 | _aio_log_finish(ioc, off, len); | |
1075 | return r < 0 ? r : 0; | |
1076 | } | |
1077 | ||
1078 | int KernelDevice::aio_read( | |
1079 | uint64_t off, | |
1080 | uint64_t len, | |
1081 | bufferlist *pbl, | |
1082 | IOContext *ioc) | |
1083 | { | |
1084 | dout(5) << __func__ << " 0x" << std::hex << off << "~" << len << std::dec | |
1085 | << dendl; | |
1086 | ||
1087 | int r = 0; | |
1088 | #ifdef HAVE_LIBAIO | |
1089 | if (aio && dio) { | |
11fdf7f2 | 1090 | ceph_assert(is_valid_io(off, len)); |
7c673cae | 1091 | _aio_log_start(ioc, off, len); |
11fdf7f2 | 1092 | ioc->pending_aios.push_back(aio_t(ioc, fd_directs[WRITE_LIFE_NOT_SET])); |
7c673cae FG |
1093 | ++ioc->num_pending; |
1094 | aio_t& aio = ioc->pending_aios.back(); | |
f67539c2 | 1095 | bufferptr p = ceph::buffer::create_small_page_aligned(len); |
9f95a23c TL |
1096 | aio.bl.append(std::move(p)); |
1097 | aio.bl.prepare_iov(&aio.iov); | |
1098 | aio.preadv(off, len); | |
11fdf7f2 | 1099 | dout(30) << aio << dendl; |
7c673cae FG |
1100 | pbl->append(aio.bl); |
1101 | dout(5) << __func__ << " 0x" << std::hex << off << "~" << len | |
1102 | << std::dec << " aio " << &aio << dendl; | |
1103 | } else | |
1104 | #endif | |
1105 | { | |
1106 | r = read(off, len, pbl, ioc, false); | |
1107 | } | |
1108 | ||
1109 | return r; | |
1110 | } | |
1111 | ||
1112 | int KernelDevice::direct_read_unaligned(uint64_t off, uint64_t len, char *buf) | |
1113 | { | |
9f95a23c TL |
1114 | uint64_t aligned_off = p2align(off, block_size); |
1115 | uint64_t aligned_len = p2roundup(off+len, block_size) - aligned_off; | |
f67539c2 | 1116 | bufferptr p = ceph::buffer::create_small_page_aligned(aligned_len); |
7c673cae FG |
1117 | int r = 0; |
1118 | ||
11fdf7f2 TL |
1119 | auto start1 = mono_clock::now(); |
1120 | r = ::pread(fd_directs[WRITE_LIFE_NOT_SET], p.c_str(), aligned_len, aligned_off); | |
1121 | auto age = cct->_conf->bdev_debug_aio_log_age; | |
1122 | if (mono_clock::now() - start1 >= make_timespan(age)) { | |
1123 | derr << __func__ << " stalled read " | |
1124 | << " 0x" << std::hex << off << "~" << len << std::dec | |
1125 | << " since " << start1 << ", timeout is " | |
1126 | << age | |
1127 | << "s" << dendl; | |
1128 | } | |
1129 | ||
7c673cae FG |
1130 | if (r < 0) { |
1131 | r = -errno; | |
11fdf7f2 | 1132 | derr << __func__ << " 0x" << std::hex << off << "~" << len << std::dec |
7c673cae FG |
1133 | << " error: " << cpp_strerror(r) << dendl; |
1134 | goto out; | |
1135 | } | |
11fdf7f2 | 1136 | ceph_assert((uint64_t)r == aligned_len); |
7c673cae FG |
1137 | memcpy(buf, p.c_str() + (off - aligned_off), len); |
1138 | ||
1139 | dout(40) << __func__ << " data: "; | |
1140 | bufferlist bl; | |
1141 | bl.append(buf, len); | |
1142 | bl.hexdump(*_dout); | |
1143 | *_dout << dendl; | |
1144 | ||
1145 | out: | |
1146 | return r < 0 ? r : 0; | |
1147 | } | |
1148 | ||
1149 | int KernelDevice::read_random(uint64_t off, uint64_t len, char *buf, | |
1150 | bool buffered) | |
1151 | { | |
1152 | dout(5) << __func__ << " 0x" << std::hex << off << "~" << len << std::dec | |
494da23a | 1153 | << "buffered " << buffered |
7c673cae | 1154 | << dendl; |
11fdf7f2 TL |
1155 | ceph_assert(len > 0); |
1156 | ceph_assert(off < size); | |
1157 | ceph_assert(off + len <= size); | |
7c673cae | 1158 | int r = 0; |
11fdf7f2 | 1159 | auto age = cct->_conf->bdev_debug_aio_log_age; |
7c673cae FG |
1160 | |
1161 | //if it's direct io and unaligned, we have to use a internal buffer | |
1162 | if (!buffered && ((off % block_size != 0) | |
1163 | || (len % block_size != 0) | |
1164 | || (uintptr_t(buf) % CEPH_PAGE_SIZE != 0))) | |
1165 | return direct_read_unaligned(off, len, buf); | |
1166 | ||
11fdf7f2 | 1167 | auto start1 = mono_clock::now(); |
7c673cae FG |
1168 | if (buffered) { |
1169 | //buffered read | |
11fdf7f2 | 1170 | auto off0 = off; |
7c673cae FG |
1171 | char *t = buf; |
1172 | uint64_t left = len; | |
1173 | while (left > 0) { | |
11fdf7f2 | 1174 | r = ::pread(fd_buffereds[WRITE_LIFE_NOT_SET], t, left, off); |
7c673cae FG |
1175 | if (r < 0) { |
1176 | r = -errno; | |
11fdf7f2 | 1177 | derr << __func__ << " 0x" << std::hex << off << "~" << left |
7c673cae FG |
1178 | << std::dec << " error: " << cpp_strerror(r) << dendl; |
1179 | goto out; | |
1180 | } | |
1181 | off += r; | |
1182 | t += r; | |
1183 | left -= r; | |
1184 | } | |
11fdf7f2 TL |
1185 | if (mono_clock::now() - start1 >= make_timespan(age)) { |
1186 | derr << __func__ << " stalled read " | |
1187 | << " 0x" << std::hex << off0 << "~" << len << std::dec | |
1188 | << " (buffered) since " << start1 << ", timeout is " | |
1189 | << age | |
1190 | << "s" << dendl; | |
1191 | } | |
7c673cae FG |
1192 | } else { |
1193 | //direct and aligned read | |
11fdf7f2 TL |
1194 | r = ::pread(fd_directs[WRITE_LIFE_NOT_SET], buf, len, off); |
1195 | if (mono_clock::now() - start1 >= make_timespan(age)) { | |
1196 | derr << __func__ << " stalled read " | |
1197 | << " 0x" << std::hex << off << "~" << len << std::dec | |
1198 | << " (direct) since " << start1 << ", timeout is " | |
1199 | << age | |
1200 | << "s" << dendl; | |
1201 | } | |
7c673cae FG |
1202 | if (r < 0) { |
1203 | r = -errno; | |
11fdf7f2 | 1204 | derr << __func__ << " direct_aligned_read" << " 0x" << std::hex |
f67539c2 | 1205 | << off << "~" << std::left << std::dec << " error: " << cpp_strerror(r) |
7c673cae FG |
1206 | << dendl; |
1207 | goto out; | |
1208 | } | |
11fdf7f2 | 1209 | ceph_assert((uint64_t)r == len); |
7c673cae FG |
1210 | } |
1211 | ||
1212 | dout(40) << __func__ << " data: "; | |
1213 | bufferlist bl; | |
1214 | bl.append(buf, len); | |
1215 | bl.hexdump(*_dout); | |
1216 | *_dout << dendl; | |
1217 | ||
1218 | out: | |
1219 | return r < 0 ? r : 0; | |
1220 | } | |
1221 | ||
1222 | int KernelDevice::invalidate_cache(uint64_t off, uint64_t len) | |
1223 | { | |
1224 | dout(5) << __func__ << " 0x" << std::hex << off << "~" << len << std::dec | |
1225 | << dendl; | |
11fdf7f2 TL |
1226 | ceph_assert(off % block_size == 0); |
1227 | ceph_assert(len % block_size == 0); | |
1228 | int r = posix_fadvise(fd_buffereds[WRITE_LIFE_NOT_SET], off, len, POSIX_FADV_DONTNEED); | |
7c673cae FG |
1229 | if (r) { |
1230 | r = -r; | |
1231 | derr << __func__ << " 0x" << std::hex << off << "~" << len << std::dec | |
1232 | << " error: " << cpp_strerror(r) << dendl; | |
1233 | } | |
1234 | return r; | |
1235 | } |