]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | /* | |
4 | * Ceph - scalable distributed file system | |
5 | * | |
6 | * Copyright (C) 2014 Red Hat | |
7 | * | |
8 | * This is free software; you can redistribute it and/or | |
9 | * modify it under the terms of the GNU Lesser General Public | |
10 | * License version 2.1, as published by the Free Software | |
11 | * Foundation. See file COPYING. | |
12 | * | |
13 | */ | |
14 | ||
15 | #include <unistd.h> | |
16 | #include <stdlib.h> | |
17 | #include <sys/types.h> | |
18 | #include <sys/stat.h> | |
19 | #include <fcntl.h> | |
11fdf7f2 | 20 | #include <sys/file.h> |
7c673cae FG |
21 | |
22 | #include "KernelDevice.h" | |
9f95a23c | 23 | #include "include/intarith.h" |
7c673cae FG |
24 | #include "include/types.h" |
25 | #include "include/compat.h" | |
26 | #include "include/stringify.h" | |
11fdf7f2 | 27 | #include "common/blkdev.h" |
7c673cae | 28 | #include "common/errno.h" |
11fdf7f2 TL |
29 | #if defined(__FreeBSD__) |
30 | #include "bsm/audit_errno.h" | |
31 | #endif | |
7c673cae | 32 | #include "common/debug.h" |
11fdf7f2 TL |
33 | #include "common/numa.h" |
34 | ||
35 | #include "global/global_context.h" | |
9f95a23c | 36 | #include "ceph_io_uring.h" |
7c673cae FG |
37 | |
38 | #define dout_context cct | |
39 | #define dout_subsys ceph_subsys_bdev | |
40 | #undef dout_prefix | |
41 | #define dout_prefix *_dout << "bdev(" << this << " " << path << ") " | |
42 | ||
11fdf7f2 TL |
43 | KernelDevice::KernelDevice(CephContext* cct, aio_callback_t cb, void *cbpriv, aio_callback_t d_cb, void *d_cbpriv) |
44 | : BlockDevice(cct, cb, cbpriv), | |
45 | aio(false), dio(false), | |
11fdf7f2 TL |
46 | discard_callback(d_cb), |
47 | discard_callback_priv(d_cbpriv), | |
7c673cae | 48 | aio_stop(false), |
11fdf7f2 TL |
49 | discard_started(false), |
50 | discard_stop(false), | |
7c673cae | 51 | aio_thread(this), |
11fdf7f2 | 52 | discard_thread(this), |
7c673cae FG |
53 | injecting_crash(0) |
54 | { | |
11fdf7f2 TL |
55 | fd_directs.resize(WRITE_LIFE_MAX, -1); |
56 | fd_buffereds.resize(WRITE_LIFE_MAX, -1); | |
9f95a23c TL |
57 | |
58 | bool use_ioring = g_ceph_context->_conf.get_val<bool>("bluestore_ioring"); | |
59 | unsigned int iodepth = cct->_conf->bdev_aio_max_queue_depth; | |
60 | ||
61 | if (use_ioring && ioring_queue_t::supported()) { | |
62 | io_queue = std::make_unique<ioring_queue_t>(iodepth); | |
63 | } else { | |
64 | static bool once; | |
65 | if (use_ioring && !once) { | |
66 | derr << "WARNING: io_uring API is not supported! Fallback to libaio!" | |
67 | << dendl; | |
68 | once = true; | |
69 | } | |
70 | io_queue = std::make_unique<aio_queue_t>(iodepth); | |
71 | } | |
7c673cae FG |
72 | } |
73 | ||
74 | int KernelDevice::_lock() | |
75 | { | |
11fdf7f2 TL |
76 | dout(10) << __func__ << " " << fd_directs[WRITE_LIFE_NOT_SET] << dendl; |
77 | int r = ::flock(fd_directs[WRITE_LIFE_NOT_SET], LOCK_EX | LOCK_NB); | |
78 | if (r < 0) { | |
79 | derr << __func__ << " flock failed on " << path << dendl; | |
7c673cae | 80 | return -errno; |
11fdf7f2 | 81 | } |
7c673cae FG |
82 | return 0; |
83 | } | |
84 | ||
85 | int KernelDevice::open(const string& p) | |
86 | { | |
87 | path = p; | |
11fdf7f2 | 88 | int r = 0, i = 0; |
7c673cae FG |
89 | dout(1) << __func__ << " path " << path << dendl; |
90 | ||
11fdf7f2 TL |
91 | for (i = 0; i < WRITE_LIFE_MAX; i++) { |
92 | int fd = ::open(path.c_str(), O_RDWR | O_DIRECT); | |
93 | if (fd < 0) { | |
94 | r = -errno; | |
95 | break; | |
96 | } | |
97 | fd_directs[i] = fd; | |
98 | ||
99 | fd = ::open(path.c_str(), O_RDWR | O_CLOEXEC); | |
100 | if (fd < 0) { | |
101 | r = -errno; | |
102 | break; | |
103 | } | |
104 | fd_buffereds[i] = fd; | |
7c673cae | 105 | } |
11fdf7f2 TL |
106 | |
107 | if (i != WRITE_LIFE_MAX) { | |
7c673cae | 108 | derr << __func__ << " open got: " << cpp_strerror(r) << dendl; |
11fdf7f2 TL |
109 | goto out_fail; |
110 | } | |
111 | ||
112 | #if defined(F_SET_FILE_RW_HINT) | |
113 | for (i = WRITE_LIFE_NONE; i < WRITE_LIFE_MAX; i++) { | |
114 | if (fcntl(fd_directs[i], F_SET_FILE_RW_HINT, &i) < 0) { | |
115 | r = -errno; | |
116 | break; | |
117 | } | |
118 | if (fcntl(fd_buffereds[i], F_SET_FILE_RW_HINT, &i) < 0) { | |
119 | r = -errno; | |
120 | break; | |
121 | } | |
7c673cae | 122 | } |
11fdf7f2 TL |
123 | if (i != WRITE_LIFE_MAX) { |
124 | enable_wrt = false; | |
125 | dout(0) << "ioctl(F_SET_FILE_RW_HINT) on " << path << " failed: " << cpp_strerror(r) << dendl; | |
126 | } | |
127 | #endif | |
128 | ||
7c673cae FG |
129 | dio = true; |
130 | aio = cct->_conf->bdev_aio; | |
131 | if (!aio) { | |
11fdf7f2 | 132 | ceph_abort_msg("non-aio not supported"); |
7c673cae FG |
133 | } |
134 | ||
135 | // disable readahead as it will wreak havoc on our mix of | |
136 | // directio/aio and buffered io. | |
11fdf7f2 | 137 | r = posix_fadvise(fd_buffereds[WRITE_LIFE_NOT_SET], 0, 0, POSIX_FADV_RANDOM); |
7c673cae FG |
138 | if (r) { |
139 | r = -r; | |
9f95a23c | 140 | derr << __func__ << " posix_fadvise got: " << cpp_strerror(r) << dendl; |
7c673cae FG |
141 | goto out_fail; |
142 | } | |
143 | ||
11fdf7f2 TL |
144 | if (lock_exclusive) { |
145 | r = _lock(); | |
146 | if (r < 0) { | |
147 | derr << __func__ << " failed to lock " << path << ": " << cpp_strerror(r) | |
148 | << dendl; | |
149 | goto out_fail; | |
150 | } | |
7c673cae FG |
151 | } |
152 | ||
153 | struct stat st; | |
11fdf7f2 | 154 | r = ::fstat(fd_directs[WRITE_LIFE_NOT_SET], &st); |
7c673cae FG |
155 | if (r < 0) { |
156 | r = -errno; | |
157 | derr << __func__ << " fstat got " << cpp_strerror(r) << dendl; | |
158 | goto out_fail; | |
159 | } | |
160 | ||
161 | // Operate as though the block size is 4 KB. The backing file | |
162 | // blksize doesn't strictly matter except that some file systems may | |
163 | // require a read/modify/write if we write something smaller than | |
164 | // it. | |
165 | block_size = cct->_conf->bdev_block_size; | |
166 | if (block_size != (unsigned)st.st_blksize) { | |
167 | dout(1) << __func__ << " backing device/file reports st_blksize " | |
168 | << st.st_blksize << ", using bdev_block_size " | |
169 | << block_size << " anyway" << dendl; | |
170 | } | |
171 | ||
7c673cae FG |
172 | |
173 | { | |
11fdf7f2 TL |
174 | BlkDev blkdev_direct(fd_directs[WRITE_LIFE_NOT_SET]); |
175 | BlkDev blkdev_buffered(fd_buffereds[WRITE_LIFE_NOT_SET]); | |
176 | ||
177 | if (S_ISBLK(st.st_mode)) { | |
178 | int64_t s; | |
179 | r = blkdev_direct.get_size(&s); | |
180 | if (r < 0) { | |
181 | goto out_fail; | |
182 | } | |
183 | size = s; | |
184 | } else { | |
185 | size = st.st_size; | |
186 | } | |
187 | ||
7c673cae | 188 | char partition[PATH_MAX], devname[PATH_MAX]; |
11fdf7f2 TL |
189 | if ((r = blkdev_buffered.partition(partition, PATH_MAX)) || |
190 | (r = blkdev_buffered.wholedisk(devname, PATH_MAX))) { | |
7c673cae | 191 | derr << "unable to get device name for " << path << ": " |
11fdf7f2 | 192 | << cpp_strerror(r) << dendl; |
7c673cae FG |
193 | rotational = true; |
194 | } else { | |
195 | dout(20) << __func__ << " devname " << devname << dendl; | |
11fdf7f2 TL |
196 | rotational = blkdev_buffered.is_rotational(); |
197 | support_discard = blkdev_buffered.support_discard(); | |
198 | this->devname = devname; | |
199 | _detect_vdo(); | |
7c673cae FG |
200 | } |
201 | } | |
202 | ||
31f18b77 FG |
203 | r = _aio_start(); |
204 | if (r < 0) { | |
205 | goto out_fail; | |
206 | } | |
11fdf7f2 | 207 | _discard_start(); |
7c673cae FG |
208 | |
209 | // round size down to an even block | |
210 | size &= ~(block_size - 1); | |
211 | ||
7c673cae FG |
212 | dout(1) << __func__ |
213 | << " size " << size | |
214 | << " (0x" << std::hex << size << std::dec << ", " | |
1adf2230 | 215 | << byte_u_t(size) << ")" |
7c673cae | 216 | << " block_size " << block_size |
1adf2230 | 217 | << " (" << byte_u_t(block_size) << ")" |
7c673cae | 218 | << " " << (rotational ? "rotational" : "non-rotational") |
11fdf7f2 | 219 | << " discard " << (support_discard ? "supported" : "not supported") |
7c673cae FG |
220 | << dendl; |
221 | return 0; | |
222 | ||
11fdf7f2 TL |
223 | out_fail: |
224 | for (i = 0; i < WRITE_LIFE_MAX; i++) { | |
225 | if (fd_directs[i] >= 0) { | |
226 | VOID_TEMP_FAILURE_RETRY(::close(fd_directs[i])); | |
227 | fd_directs[i] = -1; | |
228 | } else { | |
229 | break; | |
230 | } | |
231 | if (fd_buffereds[i] >= 0) { | |
232 | VOID_TEMP_FAILURE_RETRY(::close(fd_buffereds[i])); | |
233 | fd_buffereds[i] = -1; | |
234 | } else { | |
235 | break; | |
236 | } | |
237 | } | |
7c673cae FG |
238 | return r; |
239 | } | |
240 | ||
9f95a23c | 241 | int KernelDevice::get_devices(std::set<std::string> *ls) const |
11fdf7f2 TL |
242 | { |
243 | if (devname.empty()) { | |
244 | return 0; | |
245 | } | |
246 | get_raw_devices(devname, ls); | |
247 | return 0; | |
248 | } | |
249 | ||
7c673cae FG |
250 | void KernelDevice::close() |
251 | { | |
252 | dout(1) << __func__ << dendl; | |
253 | _aio_stop(); | |
11fdf7f2 | 254 | _discard_stop(); |
7c673cae | 255 | |
11fdf7f2 TL |
256 | if (vdo_fd >= 0) { |
257 | VOID_TEMP_FAILURE_RETRY(::close(vdo_fd)); | |
258 | vdo_fd = -1; | |
259 | } | |
7c673cae | 260 | |
11fdf7f2 TL |
261 | for (int i = 0; i < WRITE_LIFE_MAX; i++) { |
262 | assert(fd_directs[i] >= 0); | |
263 | VOID_TEMP_FAILURE_RETRY(::close(fd_directs[i])); | |
264 | fd_directs[i] = -1; | |
7c673cae | 265 | |
11fdf7f2 TL |
266 | assert(fd_buffereds[i] >= 0); |
267 | VOID_TEMP_FAILURE_RETRY(::close(fd_buffereds[i])); | |
268 | fd_buffereds[i] = -1; | |
269 | } | |
7c673cae FG |
270 | path.clear(); |
271 | } | |
272 | ||
11fdf7f2 | 273 | int KernelDevice::collect_metadata(const string& prefix, map<string,string> *pm) const |
7c673cae | 274 | { |
11fdf7f2 | 275 | (*pm)[prefix + "support_discard"] = stringify((int)(bool)support_discard); |
7c673cae FG |
276 | (*pm)[prefix + "rotational"] = stringify((int)(bool)rotational); |
277 | (*pm)[prefix + "size"] = stringify(get_size()); | |
278 | (*pm)[prefix + "block_size"] = stringify(get_block_size()); | |
279 | (*pm)[prefix + "driver"] = "KernelDevice"; | |
280 | if (rotational) { | |
281 | (*pm)[prefix + "type"] = "hdd"; | |
282 | } else { | |
283 | (*pm)[prefix + "type"] = "ssd"; | |
284 | } | |
11fdf7f2 TL |
285 | if (vdo_fd >= 0) { |
286 | (*pm)[prefix + "vdo"] = "true"; | |
287 | uint64_t total, avail; | |
288 | get_vdo_utilization(vdo_fd, &total, &avail); | |
289 | (*pm)[prefix + "vdo_physical_size"] = stringify(total); | |
290 | } | |
7c673cae | 291 | |
9f95a23c TL |
292 | { |
293 | string res_names; | |
294 | std::set<std::string> devnames; | |
295 | if (get_devices(&devnames) == 0) { | |
296 | for (auto& dev : devnames) { | |
297 | if (!res_names.empty()) { | |
298 | res_names += ","; | |
299 | } | |
300 | res_names += dev; | |
301 | } | |
302 | if (res_names.size()) { | |
303 | (*pm)[prefix + "devices"] = res_names; | |
304 | } | |
305 | } | |
306 | } | |
307 | ||
7c673cae | 308 | struct stat st; |
11fdf7f2 | 309 | int r = ::fstat(fd_buffereds[WRITE_LIFE_NOT_SET], &st); |
7c673cae FG |
310 | if (r < 0) |
311 | return -errno; | |
312 | if (S_ISBLK(st.st_mode)) { | |
313 | (*pm)[prefix + "access_mode"] = "blk"; | |
11fdf7f2 TL |
314 | |
315 | char buffer[1024] = {0}; | |
316 | BlkDev blkdev{fd_buffereds[WRITE_LIFE_NOT_SET]}; | |
317 | if (r = blkdev.partition(buffer, sizeof(buffer)); r) { | |
7c673cae | 318 | (*pm)[prefix + "partition_path"] = "unknown"; |
11fdf7f2 TL |
319 | } else { |
320 | (*pm)[prefix + "partition_path"] = buffer; | |
321 | } | |
322 | buffer[0] = '\0'; | |
323 | if (r = blkdev.partition(buffer, sizeof(buffer)); r) { | |
7c673cae | 324 | (*pm)[prefix + "dev_node"] = "unknown"; |
11fdf7f2 TL |
325 | } else { |
326 | (*pm)[prefix + "dev_node"] = buffer; | |
327 | } | |
328 | if (!r) { | |
329 | return 0; | |
330 | } | |
331 | buffer[0] = '\0'; | |
332 | blkdev.model(buffer, sizeof(buffer)); | |
333 | (*pm)[prefix + "model"] = buffer; | |
334 | ||
335 | buffer[0] = '\0'; | |
336 | blkdev.dev(buffer, sizeof(buffer)); | |
337 | (*pm)[prefix + "dev"] = buffer; | |
338 | ||
339 | // nvme exposes a serial number | |
340 | buffer[0] = '\0'; | |
341 | blkdev.serial(buffer, sizeof(buffer)); | |
342 | (*pm)[prefix + "serial"] = buffer; | |
343 | ||
11fdf7f2 TL |
344 | // numa |
345 | int node; | |
346 | r = blkdev.get_numa_node(&node); | |
347 | if (r >= 0) { | |
348 | (*pm)[prefix + "numa_node"] = stringify(node); | |
7c673cae FG |
349 | } |
350 | } else { | |
351 | (*pm)[prefix + "access_mode"] = "file"; | |
352 | (*pm)[prefix + "path"] = path; | |
353 | } | |
354 | return 0; | |
355 | } | |
356 | ||
11fdf7f2 TL |
357 | void KernelDevice::_detect_vdo() |
358 | { | |
359 | vdo_fd = get_vdo_stats_handle(devname.c_str(), &vdo_name); | |
360 | if (vdo_fd >= 0) { | |
361 | dout(1) << __func__ << " VDO volume " << vdo_name | |
362 | << " maps to " << devname << dendl; | |
363 | } else { | |
364 | dout(20) << __func__ << " no VDO volume maps to " << devname << dendl; | |
365 | } | |
366 | return; | |
367 | } | |
368 | ||
369 | bool KernelDevice::get_thin_utilization(uint64_t *total, uint64_t *avail) const | |
370 | { | |
371 | if (vdo_fd < 0) { | |
372 | return false; | |
373 | } | |
374 | return get_vdo_utilization(vdo_fd, total, avail); | |
375 | } | |
376 | ||
377 | int KernelDevice::choose_fd(bool buffered, int write_hint) const | |
378 | { | |
379 | assert(write_hint >= WRITE_LIFE_NOT_SET && write_hint < WRITE_LIFE_MAX); | |
380 | if (!enable_wrt) | |
381 | write_hint = WRITE_LIFE_NOT_SET; | |
382 | return buffered ? fd_buffereds[write_hint] : fd_directs[write_hint]; | |
383 | } | |
384 | ||
7c673cae FG |
385 | int KernelDevice::flush() |
386 | { | |
31f18b77 | 387 | // protect flush with a mutex. note that we are not really protecting |
7c673cae FG |
388 | // data here. instead, we're ensuring that if any flush() caller |
389 | // sees that io_since_flush is true, they block any racing callers | |
390 | // until the flush is observed. that allows racing threads to be | |
391 | // calling flush while still ensuring that *any* of them that got an | |
392 | // aio completion notification will not return before that aio is | |
393 | // stable on disk: whichever thread sees the flag first will block | |
394 | // followers until the aio is stable. | |
11fdf7f2 | 395 | std::lock_guard l(flush_mutex); |
7c673cae FG |
396 | |
397 | bool expect = true; | |
398 | if (!io_since_flush.compare_exchange_strong(expect, false)) { | |
399 | dout(10) << __func__ << " no-op (no ios since last flush), flag is " | |
400 | << (int)io_since_flush.load() << dendl; | |
401 | return 0; | |
402 | } | |
403 | ||
404 | dout(10) << __func__ << " start" << dendl; | |
405 | if (cct->_conf->bdev_inject_crash) { | |
406 | ++injecting_crash; | |
407 | // sleep for a moment to give other threads a chance to submit or | |
408 | // wait on io that races with a flush. | |
409 | derr << __func__ << " injecting crash. first we sleep..." << dendl; | |
410 | sleep(cct->_conf->bdev_inject_crash_flush_delay); | |
411 | derr << __func__ << " and now we die" << dendl; | |
412 | cct->_log->flush(); | |
413 | _exit(1); | |
414 | } | |
415 | utime_t start = ceph_clock_now(); | |
11fdf7f2 | 416 | int r = ::fdatasync(fd_directs[WRITE_LIFE_NOT_SET]); |
7c673cae FG |
417 | utime_t end = ceph_clock_now(); |
418 | utime_t dur = end - start; | |
419 | if (r < 0) { | |
420 | r = -errno; | |
421 | derr << __func__ << " fdatasync got: " << cpp_strerror(r) << dendl; | |
422 | ceph_abort(); | |
423 | } | |
424 | dout(5) << __func__ << " in " << dur << dendl;; | |
425 | return r; | |
426 | } | |
427 | ||
428 | int KernelDevice::_aio_start() | |
429 | { | |
430 | if (aio) { | |
431 | dout(10) << __func__ << dendl; | |
9f95a23c | 432 | int r = io_queue->init(fd_directs); |
7c673cae | 433 | if (r < 0) { |
31f18b77 FG |
434 | if (r == -EAGAIN) { |
435 | derr << __func__ << " io_setup(2) failed with EAGAIN; " | |
436 | << "try increasing /proc/sys/fs/aio-max-nr" << dendl; | |
437 | } else { | |
438 | derr << __func__ << " io_setup(2) failed: " << cpp_strerror(r) << dendl; | |
439 | } | |
7c673cae FG |
440 | return r; |
441 | } | |
442 | aio_thread.create("bstore_aio"); | |
443 | } | |
444 | return 0; | |
445 | } | |
446 | ||
447 | void KernelDevice::_aio_stop() | |
448 | { | |
449 | if (aio) { | |
450 | dout(10) << __func__ << dendl; | |
451 | aio_stop = true; | |
452 | aio_thread.join(); | |
453 | aio_stop = false; | |
9f95a23c | 454 | io_queue->shutdown(); |
7c673cae FG |
455 | } |
456 | } | |
457 | ||
11fdf7f2 TL |
458 | int KernelDevice::_discard_start() |
459 | { | |
460 | discard_thread.create("bstore_discard"); | |
461 | return 0; | |
462 | } | |
463 | ||
464 | void KernelDevice::_discard_stop() | |
465 | { | |
466 | dout(10) << __func__ << dendl; | |
467 | { | |
468 | std::unique_lock l(discard_lock); | |
469 | while (!discard_started) { | |
470 | discard_cond.wait(l); | |
471 | } | |
472 | discard_stop = true; | |
473 | discard_cond.notify_all(); | |
474 | } | |
475 | discard_thread.join(); | |
476 | { | |
477 | std::lock_guard l(discard_lock); | |
478 | discard_stop = false; | |
479 | } | |
480 | dout(10) << __func__ << " stopped" << dendl; | |
481 | } | |
482 | ||
483 | void KernelDevice::discard_drain() | |
484 | { | |
485 | dout(10) << __func__ << dendl; | |
486 | std::unique_lock l(discard_lock); | |
487 | while (!discard_queued.empty() || discard_running) { | |
488 | discard_cond.wait(l); | |
489 | } | |
490 | } | |
491 | ||
28e407b8 AA |
/**
 * Tell whether a negative errno value is one of the errors the block
 * layer is known to hand back for a failed IO.
 *
 * @param r  negated errno value (e.g. -EIO)
 * @return true when @p r is in the known list, false otherwise
 */
static bool is_expected_ioerr(const int r)
{
  // https://lxr.missinglinkelectronics.com/linux+v4.15/block/blk-core.c#L135
  const bool common =
    (r == -EOPNOTSUPP || r == -ETIMEDOUT || r == -ENOSPC ||
     r == -ENOLINK || r == -EREMOTEIO || r == -EAGAIN || r == -EIO ||
     r == -ENODATA || r == -EILSEQ || r == -ENOMEM);
  // The platform-specific codes are appended per branch so that the
  // expression stays well-formed on platforms matching neither.
#if defined(__linux__)
  return common || r == -EREMCHG || r == -EBADE;
#elif defined(__FreeBSD__)
  return common || r == - BSM_ERRNO_EREMCHG || r == -BSM_ERRNO_EBADE;
#else
  return common;
#endif
}
505 | ||
7c673cae FG |
506 | void KernelDevice::_aio_thread() |
507 | { | |
508 | dout(10) << __func__ << " start" << dendl; | |
509 | int inject_crash_count = 0; | |
510 | while (!aio_stop) { | |
511 | dout(40) << __func__ << " polling" << dendl; | |
224ce89b | 512 | int max = cct->_conf->bdev_aio_reap_max; |
7c673cae | 513 | aio_t *aio[max]; |
9f95a23c | 514 | int r = io_queue->get_next_completed(cct->_conf->bdev_aio_poll_ms, |
7c673cae FG |
515 | aio, max); |
516 | if (r < 0) { | |
517 | derr << __func__ << " got " << cpp_strerror(r) << dendl; | |
11fdf7f2 | 518 | ceph_abort_msg("got unexpected error from io_getevents"); |
7c673cae FG |
519 | } |
520 | if (r > 0) { | |
521 | dout(30) << __func__ << " got " << r << " completed aios" << dendl; | |
522 | for (int i = 0; i < r; ++i) { | |
523 | IOContext *ioc = static_cast<IOContext*>(aio[i]->priv); | |
524 | _aio_log_finish(ioc, aio[i]->offset, aio[i]->length); | |
525 | if (aio[i]->queue_item.is_linked()) { | |
11fdf7f2 | 526 | std::lock_guard l(debug_queue_lock); |
7c673cae FG |
527 | debug_aio_unlink(*aio[i]); |
528 | } | |
529 | ||
530 | // set flag indicating new ios have completed. we do this *before* | |
531 | // any completion or notifications so that any user flush() that | |
532 | // follows the observed io completion will include this io. Note | |
533 | // that an earlier, racing flush() could observe and clear this | |
534 | // flag, but that also ensures that the IO will be stable before the | |
535 | // later flush() occurs. | |
536 | io_since_flush.store(true); | |
537 | ||
94b18763 | 538 | long r = aio[i]->get_return_value(); |
b32b8144 | 539 | if (r < 0) { |
28e407b8 AA |
540 | derr << __func__ << " got r=" << r << " (" << cpp_strerror(r) << ")" |
541 | << dendl; | |
542 | if (ioc->allow_eio && is_expected_ioerr(r)) { | |
543 | derr << __func__ << " translating the error to EIO for upper layer" | |
544 | << dendl; | |
545 | ioc->set_return_value(-EIO); | |
b32b8144 | 546 | } else { |
11fdf7f2 TL |
547 | if (is_expected_ioerr(r)) { |
548 | note_io_error_event( | |
549 | devname.c_str(), | |
550 | path.c_str(), | |
551 | r, | |
81eedcae TL |
552 | #if defined(HAVE_POSIXAIO) |
553 | aio[i]->aio.aiocb.aio_lio_opcode, | |
554 | #else | |
555 | aio[i]->iocb.aio_lio_opcode, | |
556 | #endif | |
11fdf7f2 TL |
557 | aio[i]->offset, |
558 | aio[i]->length); | |
559 | ceph_abort_msg( | |
560 | "Unexpected IO error. " | |
561 | "This may suggest a hardware issue. " | |
562 | "Please check your kernel log!"); | |
563 | } | |
564 | ceph_abort_msg( | |
565 | "Unexpected IO error. " | |
566 | "This may suggest HW issue. Please check your dmesg!"); | |
b32b8144 FG |
567 | } |
568 | } else if (aio[i]->length != (uint64_t)r) { | |
eafe8130 TL |
569 | derr << "aio to 0x" << std::hex << aio[i]->offset |
570 | << "~" << aio[i]->length << std::dec | |
b32b8144 | 571 | << " but returned: " << r << dendl; |
11fdf7f2 | 572 | ceph_abort_msg("unexpected aio return value: does not match length"); |
b32b8144 FG |
573 | } |
574 | ||
575 | dout(10) << __func__ << " finished aio " << aio[i] << " r " << r | |
576 | << " ioc " << ioc | |
577 | << " with " << (ioc->num_running.load() - 1) | |
578 | << " aios left" << dendl; | |
7c673cae FG |
579 | |
580 | // NOTE: once num_running and we either call the callback or | |
581 | // call aio_wake we cannot touch ioc or aio[] as the caller | |
582 | // may free it. | |
583 | if (ioc->priv) { | |
584 | if (--ioc->num_running == 0) { | |
585 | aio_callback(aio_callback_priv, ioc->priv); | |
586 | } | |
587 | } else { | |
31f18b77 | 588 | ioc->try_aio_wake(); |
7c673cae FG |
589 | } |
590 | } | |
591 | } | |
592 | if (cct->_conf->bdev_debug_aio) { | |
593 | utime_t now = ceph_clock_now(); | |
11fdf7f2 | 594 | std::lock_guard l(debug_queue_lock); |
7c673cae FG |
595 | if (debug_oldest) { |
596 | if (debug_stall_since == utime_t()) { | |
597 | debug_stall_since = now; | |
598 | } else { | |
11fdf7f2 TL |
599 | if (cct->_conf->bdev_debug_aio_suicide_timeout) { |
600 | utime_t cutoff = now; | |
601 | cutoff -= cct->_conf->bdev_debug_aio_suicide_timeout; | |
602 | if (debug_stall_since < cutoff) { | |
603 | derr << __func__ << " stalled aio " << debug_oldest | |
604 | << " since " << debug_stall_since << ", timeout is " | |
605 | << cct->_conf->bdev_debug_aio_suicide_timeout | |
606 | << "s, suicide" << dendl; | |
607 | ceph_abort_msg("stalled aio... buggy kernel or bad device?"); | |
608 | } | |
7c673cae FG |
609 | } |
610 | } | |
611 | } | |
612 | } | |
613 | reap_ioc(); | |
614 | if (cct->_conf->bdev_inject_crash) { | |
615 | ++inject_crash_count; | |
616 | if (inject_crash_count * cct->_conf->bdev_aio_poll_ms / 1000 > | |
617 | cct->_conf->bdev_inject_crash + cct->_conf->bdev_inject_crash_flush_delay) { | |
618 | derr << __func__ << " bdev_inject_crash trigger from aio thread" | |
619 | << dendl; | |
620 | cct->_log->flush(); | |
621 | _exit(1); | |
622 | } | |
623 | } | |
624 | } | |
625 | reap_ioc(); | |
626 | dout(10) << __func__ << " end" << dendl; | |
627 | } | |
628 | ||
11fdf7f2 TL |
629 | void KernelDevice::_discard_thread() |
630 | { | |
631 | std::unique_lock l(discard_lock); | |
632 | ceph_assert(!discard_started); | |
633 | discard_started = true; | |
634 | discard_cond.notify_all(); | |
635 | while (true) { | |
636 | ceph_assert(discard_finishing.empty()); | |
637 | if (discard_queued.empty()) { | |
638 | if (discard_stop) | |
639 | break; | |
640 | dout(20) << __func__ << " sleep" << dendl; | |
641 | discard_cond.notify_all(); // for the thread trying to drain... | |
642 | discard_cond.wait(l); | |
643 | dout(20) << __func__ << " wake" << dendl; | |
644 | } else { | |
645 | discard_finishing.swap(discard_queued); | |
646 | discard_running = true; | |
647 | l.unlock(); | |
648 | dout(20) << __func__ << " finishing" << dendl; | |
649 | for (auto p = discard_finishing.begin();p != discard_finishing.end(); ++p) { | |
650 | discard(p.get_start(), p.get_len()); | |
651 | } | |
652 | ||
653 | discard_callback(discard_callback_priv, static_cast<void*>(&discard_finishing)); | |
654 | discard_finishing.clear(); | |
655 | l.lock(); | |
656 | discard_running = false; | |
657 | } | |
658 | } | |
659 | dout(10) << __func__ << " finish" << dendl; | |
660 | discard_started = false; | |
661 | } | |
662 | ||
663 | int KernelDevice::queue_discard(interval_set<uint64_t> &to_release) | |
664 | { | |
665 | if (!support_discard) | |
666 | return -1; | |
667 | ||
668 | if (to_release.empty()) | |
669 | return 0; | |
670 | ||
671 | std::lock_guard l(discard_lock); | |
672 | discard_queued.insert(to_release); | |
673 | discard_cond.notify_all(); | |
674 | return 0; | |
675 | } | |
676 | ||
7c673cae FG |
677 | void KernelDevice::_aio_log_start( |
678 | IOContext *ioc, | |
679 | uint64_t offset, | |
680 | uint64_t length) | |
681 | { | |
682 | dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length | |
683 | << std::dec << dendl; | |
684 | if (cct->_conf->bdev_debug_inflight_ios) { | |
11fdf7f2 | 685 | std::lock_guard l(debug_lock); |
7c673cae FG |
686 | if (debug_inflight.intersects(offset, length)) { |
687 | derr << __func__ << " inflight overlap of 0x" | |
688 | << std::hex | |
689 | << offset << "~" << length << std::dec | |
690 | << " with " << debug_inflight << dendl; | |
691 | ceph_abort(); | |
692 | } | |
693 | debug_inflight.insert(offset, length); | |
694 | } | |
695 | } | |
696 | ||
697 | void KernelDevice::debug_aio_link(aio_t& aio) | |
698 | { | |
699 | if (debug_queue.empty()) { | |
700 | debug_oldest = &aio; | |
701 | } | |
702 | debug_queue.push_back(aio); | |
703 | } | |
704 | ||
705 | void KernelDevice::debug_aio_unlink(aio_t& aio) | |
706 | { | |
707 | if (aio.queue_item.is_linked()) { | |
708 | debug_queue.erase(debug_queue.iterator_to(aio)); | |
709 | if (debug_oldest == &aio) { | |
11fdf7f2 TL |
710 | auto age = cct->_conf->bdev_debug_aio_log_age; |
711 | if (age && debug_stall_since != utime_t()) { | |
712 | utime_t cutoff = ceph_clock_now(); | |
713 | cutoff -= age; | |
714 | if (debug_stall_since < cutoff) { | |
715 | derr << __func__ << " stalled aio " << debug_oldest | |
716 | << " since " << debug_stall_since << ", timeout is " | |
717 | << age | |
718 | << "s" << dendl; | |
719 | } | |
720 | } | |
721 | ||
7c673cae FG |
722 | if (debug_queue.empty()) { |
723 | debug_oldest = nullptr; | |
724 | } else { | |
725 | debug_oldest = &debug_queue.front(); | |
726 | } | |
727 | debug_stall_since = utime_t(); | |
728 | } | |
729 | } | |
730 | } | |
731 | ||
732 | void KernelDevice::_aio_log_finish( | |
733 | IOContext *ioc, | |
734 | uint64_t offset, | |
735 | uint64_t length) | |
736 | { | |
737 | dout(20) << __func__ << " " << aio << " 0x" | |
738 | << std::hex << offset << "~" << length << std::dec << dendl; | |
739 | if (cct->_conf->bdev_debug_inflight_ios) { | |
11fdf7f2 | 740 | std::lock_guard l(debug_lock); |
7c673cae FG |
741 | debug_inflight.erase(offset, length); |
742 | } | |
743 | } | |
744 | ||
745 | void KernelDevice::aio_submit(IOContext *ioc) | |
746 | { | |
747 | dout(20) << __func__ << " ioc " << ioc | |
748 | << " pending " << ioc->num_pending.load() | |
749 | << " running " << ioc->num_running.load() | |
750 | << dendl; | |
224ce89b | 751 | |
7c673cae FG |
752 | if (ioc->num_pending.load() == 0) { |
753 | return; | |
754 | } | |
224ce89b | 755 | |
7c673cae FG |
756 | // move these aside, and get our end iterator position now, as the |
757 | // aios might complete as soon as they are submitted and queue more | |
758 | // wal aio's. | |
759 | list<aio_t>::iterator e = ioc->running_aios.begin(); | |
760 | ioc->running_aios.splice(e, ioc->pending_aios); | |
7c673cae FG |
761 | |
762 | int pending = ioc->num_pending.load(); | |
763 | ioc->num_running += pending; | |
764 | ioc->num_pending -= pending; | |
11fdf7f2 TL |
765 | ceph_assert(ioc->num_pending.load() == 0); // we should be only thread doing this |
766 | ceph_assert(ioc->pending_aios.size() == 0); | |
767 | ||
224ce89b WB |
768 | if (cct->_conf->bdev_debug_aio) { |
769 | list<aio_t>::iterator p = ioc->running_aios.begin(); | |
770 | while (p != e) { | |
11fdf7f2 TL |
771 | dout(30) << __func__ << " " << *p << dendl; |
772 | std::lock_guard l(debug_queue_lock); | |
224ce89b | 773 | debug_aio_link(*p++); |
7c673cae FG |
774 | } |
775 | } | |
224ce89b WB |
776 | |
777 | void *priv = static_cast<void*>(ioc); | |
778 | int r, retries = 0; | |
9f95a23c | 779 | r = io_queue->submit_batch(ioc->running_aios.begin(), e, |
11fdf7f2 TL |
780 | pending, priv, &retries); |
781 | ||
224ce89b WB |
782 | if (retries) |
783 | derr << __func__ << " retries " << retries << dendl; | |
784 | if (r < 0) { | |
785 | derr << " aio submit got " << cpp_strerror(r) << dendl; | |
11fdf7f2 | 786 | ceph_assert(r == 0); |
224ce89b | 787 | } |
7c673cae FG |
788 | } |
789 | ||
11fdf7f2 | 790 | int KernelDevice::_sync_write(uint64_t off, bufferlist &bl, bool buffered, int write_hint) |
7c673cae FG |
791 | { |
792 | uint64_t len = bl.length(); | |
793 | dout(5) << __func__ << " 0x" << std::hex << off << "~" << len | |
11fdf7f2 | 794 | << std::dec << (buffered ? " (buffered)" : " (direct)") << dendl; |
7c673cae FG |
795 | if (cct->_conf->bdev_inject_crash && |
796 | rand() % cct->_conf->bdev_inject_crash == 0) { | |
797 | derr << __func__ << " bdev_inject_crash: dropping io 0x" << std::hex | |
798 | << off << "~" << len << std::dec << dendl; | |
799 | ++injecting_crash; | |
800 | return 0; | |
801 | } | |
802 | vector<iovec> iov; | |
803 | bl.prepare_iov(&iov); | |
7c673cae | 804 | |
e306af50 TL |
805 | auto left = len; |
806 | auto o = off; | |
807 | size_t idx = 0; | |
808 | do { | |
809 | auto r = ::pwritev(choose_fd(buffered, write_hint), | |
810 | &iov[idx], iov.size() - idx, o); | |
811 | ||
812 | if (r < 0) { | |
813 | r = -errno; | |
814 | derr << __func__ << " pwritev error: " << cpp_strerror(r) << dendl; | |
815 | return r; | |
816 | } | |
817 | o += r; | |
818 | left -= r; | |
819 | if (left) { | |
820 | // skip fully processed IOVs | |
821 | while (idx < iov.size() && (size_t)r >= iov[idx].iov_len) { | |
822 | r -= iov[idx++].iov_len; | |
823 | } | |
824 | // update partially processed one if any | |
825 | if (r) { | |
826 | ceph_assert(idx < iov.size()); | |
827 | ceph_assert((size_t)r < iov[idx].iov_len); | |
828 | iov[idx].iov_base = static_cast<char*>(iov[idx].iov_base) + r; | |
829 | iov[idx].iov_len -= r; | |
830 | r = 0; | |
831 | } | |
832 | ceph_assert(r == 0); | |
833 | } | |
834 | } while (left); | |
835 | ||
11fdf7f2 | 836 | #ifdef HAVE_SYNC_FILE_RANGE |
7c673cae | 837 | if (buffered) { |
494da23a | 838 | // initiate IO and wait till it completes |
e306af50 | 839 | auto r = ::sync_file_range(fd_buffereds[WRITE_LIFE_NOT_SET], off, len, SYNC_FILE_RANGE_WRITE|SYNC_FILE_RANGE_WAIT_AFTER|SYNC_FILE_RANGE_WAIT_BEFORE); |
7c673cae FG |
840 | if (r < 0) { |
841 | r = -errno; | |
842 | derr << __func__ << " sync_file_range error: " << cpp_strerror(r) << dendl; | |
843 | return r; | |
844 | } | |
845 | } | |
11fdf7f2 | 846 | #endif |
31f18b77 FG |
847 | |
848 | io_since_flush.store(true); | |
849 | ||
7c673cae FG |
850 | return 0; |
851 | } | |
852 | ||
853 | int KernelDevice::write( | |
854 | uint64_t off, | |
855 | bufferlist &bl, | |
11fdf7f2 TL |
856 | bool buffered, |
857 | int write_hint) | |
7c673cae FG |
858 | { |
859 | uint64_t len = bl.length(); | |
860 | dout(20) << __func__ << " 0x" << std::hex << off << "~" << len << std::dec | |
861 | << (buffered ? " (buffered)" : " (direct)") | |
862 | << dendl; | |
11fdf7f2 | 863 | ceph_assert(is_valid_io(off, len)); |
eafe8130 TL |
864 | if (cct->_conf->objectstore_blackhole) { |
865 | lderr(cct) << __func__ << " objectstore_blackhole=true, throwing out IO" | |
866 | << dendl; | |
867 | return 0; | |
868 | } | |
7c673cae FG |
869 | |
870 | if ((!buffered || bl.get_num_buffers() >= IOV_MAX) && | |
b32b8144 | 871 | bl.rebuild_aligned_size_and_memory(block_size, block_size, IOV_MAX)) { |
7c673cae FG |
872 | dout(20) << __func__ << " rebuilding buffer to be aligned" << dendl; |
873 | } | |
874 | dout(40) << "data: "; | |
875 | bl.hexdump(*_dout); | |
876 | *_dout << dendl; | |
877 | ||
11fdf7f2 | 878 | return _sync_write(off, bl, buffered, write_hint); |
7c673cae FG |
879 | } |
880 | ||
881 | int KernelDevice::aio_write( | |
882 | uint64_t off, | |
883 | bufferlist &bl, | |
884 | IOContext *ioc, | |
11fdf7f2 TL |
885 | bool buffered, |
886 | int write_hint) | |
7c673cae FG |
887 | { |
888 | uint64_t len = bl.length(); | |
889 | dout(20) << __func__ << " 0x" << std::hex << off << "~" << len << std::dec | |
890 | << (buffered ? " (buffered)" : " (direct)") | |
891 | << dendl; | |
11fdf7f2 | 892 | ceph_assert(is_valid_io(off, len)); |
eafe8130 TL |
893 | if (cct->_conf->objectstore_blackhole) { |
894 | lderr(cct) << __func__ << " objectstore_blackhole=true, throwing out IO" | |
895 | << dendl; | |
896 | return 0; | |
897 | } | |
7c673cae FG |
898 | |
899 | if ((!buffered || bl.get_num_buffers() >= IOV_MAX) && | |
b32b8144 | 900 | bl.rebuild_aligned_size_and_memory(block_size, block_size, IOV_MAX)) { |
7c673cae FG |
901 | dout(20) << __func__ << " rebuilding buffer to be aligned" << dendl; |
902 | } | |
903 | dout(40) << "data: "; | |
904 | bl.hexdump(*_dout); | |
905 | *_dout << dendl; | |
906 | ||
907 | _aio_log_start(ioc, off, len); | |
908 | ||
909 | #ifdef HAVE_LIBAIO | |
910 | if (aio && dio && !buffered) { | |
7c673cae FG |
911 | if (cct->_conf->bdev_inject_crash && |
912 | rand() % cct->_conf->bdev_inject_crash == 0) { | |
913 | derr << __func__ << " bdev_inject_crash: dropping io 0x" << std::hex | |
914 | << off << "~" << len << std::dec | |
915 | << dendl; | |
916 | // generate a real io so that aio_wait behaves properly, but make it | |
917 | // a read instead of write, and toss the result. | |
494da23a TL |
918 | ioc->pending_aios.push_back(aio_t(ioc, choose_fd(false, write_hint))); |
919 | ++ioc->num_pending; | |
920 | auto& aio = ioc->pending_aios.back(); | |
9f95a23c TL |
921 | bufferptr p = buffer::create_small_page_aligned(len); |
922 | aio.bl.append(std::move(p)); | |
923 | aio.bl.prepare_iov(&aio.iov); | |
924 | aio.preadv(off, len); | |
7c673cae FG |
925 | ++injecting_crash; |
926 | } else { | |
494da23a TL |
927 | if (bl.length() <= RW_IO_MAX) { |
928 | // fast path (non-huge write) | |
929 | ioc->pending_aios.push_back(aio_t(ioc, choose_fd(false, write_hint))); | |
930 | ++ioc->num_pending; | |
931 | auto& aio = ioc->pending_aios.back(); | |
932 | bl.prepare_iov(&aio.iov); | |
933 | aio.bl.claim_append(bl); | |
934 | aio.pwritev(off, len); | |
935 | dout(30) << aio << dendl; | |
936 | dout(5) << __func__ << " 0x" << std::hex << off << "~" << len | |
937 | << std::dec << " aio " << &aio << dendl; | |
938 | } else { | |
939 | // write in RW_IO_MAX-sized chunks | |
940 | uint64_t prev_len = 0; | |
941 | while (prev_len < bl.length()) { | |
942 | bufferlist tmp; | |
943 | if (prev_len + RW_IO_MAX < bl.length()) { | |
944 | tmp.substr_of(bl, prev_len, RW_IO_MAX); | |
945 | } else { | |
946 | tmp.substr_of(bl, prev_len, bl.length() - prev_len); | |
947 | } | |
948 | auto len = tmp.length(); | |
949 | ioc->pending_aios.push_back(aio_t(ioc, choose_fd(false, write_hint))); | |
950 | ++ioc->num_pending; | |
951 | auto& aio = ioc->pending_aios.back(); | |
952 | tmp.prepare_iov(&aio.iov); | |
953 | aio.bl.claim_append(tmp); | |
954 | aio.pwritev(off + prev_len, len); | |
955 | dout(30) << aio << dendl; | |
956 | dout(5) << __func__ << " 0x" << std::hex << off + prev_len | |
957 | << "~" << len | |
958 | << std::dec << " aio " << &aio << " (piece)" << dendl; | |
959 | prev_len += len; | |
960 | } | |
961 | } | |
7c673cae | 962 | } |
7c673cae FG |
963 | } else |
964 | #endif | |
965 | { | |
11fdf7f2 | 966 | int r = _sync_write(off, bl, buffered, write_hint); |
7c673cae FG |
967 | _aio_log_finish(ioc, off, len); |
968 | if (r < 0) | |
969 | return r; | |
970 | } | |
971 | return 0; | |
972 | } | |
973 | ||
11fdf7f2 TL |
974 | int KernelDevice::discard(uint64_t offset, uint64_t len) |
975 | { | |
976 | int r = 0; | |
eafe8130 TL |
977 | if (cct->_conf->objectstore_blackhole) { |
978 | lderr(cct) << __func__ << " objectstore_blackhole=true, throwing out IO" | |
979 | << dendl; | |
980 | return 0; | |
981 | } | |
11fdf7f2 TL |
982 | if (support_discard) { |
983 | dout(10) << __func__ | |
984 | << " 0x" << std::hex << offset << "~" << len << std::dec | |
985 | << dendl; | |
986 | ||
987 | r = BlkDev{fd_directs[WRITE_LIFE_NOT_SET]}.discard((int64_t)offset, (int64_t)len); | |
988 | } | |
989 | return r; | |
990 | } | |
991 | ||
7c673cae FG |
992 | int KernelDevice::read(uint64_t off, uint64_t len, bufferlist *pbl, |
993 | IOContext *ioc, | |
994 | bool buffered) | |
995 | { | |
996 | dout(5) << __func__ << " 0x" << std::hex << off << "~" << len << std::dec | |
997 | << (buffered ? " (buffered)" : " (direct)") | |
998 | << dendl; | |
11fdf7f2 | 999 | ceph_assert(is_valid_io(off, len)); |
7c673cae FG |
1000 | |
1001 | _aio_log_start(ioc, off, len); | |
1002 | ||
11fdf7f2 TL |
1003 | auto start1 = mono_clock::now(); |
1004 | ||
1005 | auto p = buffer::ptr_node::create(buffer::create_small_page_aligned(len)); | |
1006 | int r = ::pread(buffered ? fd_buffereds[WRITE_LIFE_NOT_SET] : fd_directs[WRITE_LIFE_NOT_SET], | |
1007 | p->c_str(), len, off); | |
1008 | auto age = cct->_conf->bdev_debug_aio_log_age; | |
1009 | if (mono_clock::now() - start1 >= make_timespan(age)) { | |
1010 | derr << __func__ << " stalled read " | |
1011 | << " 0x" << std::hex << off << "~" << len << std::dec | |
1012 | << (buffered ? " (buffered)" : " (direct)") | |
1013 | << " since " << start1 << ", timeout is " | |
1014 | << age | |
1015 | << "s" << dendl; | |
1016 | } | |
1017 | ||
7c673cae | 1018 | if (r < 0) { |
a8e16298 TL |
1019 | if (ioc->allow_eio && is_expected_ioerr(r)) { |
1020 | r = -EIO; | |
1021 | } else { | |
1022 | r = -errno; | |
1023 | } | |
7c673cae FG |
1024 | goto out; |
1025 | } | |
11fdf7f2 | 1026 | ceph_assert((uint64_t)r == len); |
7c673cae FG |
1027 | pbl->push_back(std::move(p)); |
1028 | ||
1029 | dout(40) << "data: "; | |
1030 | pbl->hexdump(*_dout); | |
1031 | *_dout << dendl; | |
1032 | ||
1033 | out: | |
1034 | _aio_log_finish(ioc, off, len); | |
1035 | return r < 0 ? r : 0; | |
1036 | } | |
1037 | ||
1038 | int KernelDevice::aio_read( | |
1039 | uint64_t off, | |
1040 | uint64_t len, | |
1041 | bufferlist *pbl, | |
1042 | IOContext *ioc) | |
1043 | { | |
1044 | dout(5) << __func__ << " 0x" << std::hex << off << "~" << len << std::dec | |
1045 | << dendl; | |
1046 | ||
1047 | int r = 0; | |
1048 | #ifdef HAVE_LIBAIO | |
1049 | if (aio && dio) { | |
11fdf7f2 | 1050 | ceph_assert(is_valid_io(off, len)); |
7c673cae | 1051 | _aio_log_start(ioc, off, len); |
11fdf7f2 | 1052 | ioc->pending_aios.push_back(aio_t(ioc, fd_directs[WRITE_LIFE_NOT_SET])); |
7c673cae FG |
1053 | ++ioc->num_pending; |
1054 | aio_t& aio = ioc->pending_aios.back(); | |
9f95a23c TL |
1055 | bufferptr p = buffer::create_small_page_aligned(len); |
1056 | aio.bl.append(std::move(p)); | |
1057 | aio.bl.prepare_iov(&aio.iov); | |
1058 | aio.preadv(off, len); | |
11fdf7f2 | 1059 | dout(30) << aio << dendl; |
7c673cae FG |
1060 | pbl->append(aio.bl); |
1061 | dout(5) << __func__ << " 0x" << std::hex << off << "~" << len | |
1062 | << std::dec << " aio " << &aio << dendl; | |
1063 | } else | |
1064 | #endif | |
1065 | { | |
1066 | r = read(off, len, pbl, ioc, false); | |
1067 | } | |
1068 | ||
1069 | return r; | |
1070 | } | |
1071 | ||
1072 | int KernelDevice::direct_read_unaligned(uint64_t off, uint64_t len, char *buf) | |
1073 | { | |
9f95a23c TL |
1074 | uint64_t aligned_off = p2align(off, block_size); |
1075 | uint64_t aligned_len = p2roundup(off+len, block_size) - aligned_off; | |
11fdf7f2 | 1076 | bufferptr p = buffer::create_small_page_aligned(aligned_len); |
7c673cae FG |
1077 | int r = 0; |
1078 | ||
11fdf7f2 TL |
1079 | auto start1 = mono_clock::now(); |
1080 | r = ::pread(fd_directs[WRITE_LIFE_NOT_SET], p.c_str(), aligned_len, aligned_off); | |
1081 | auto age = cct->_conf->bdev_debug_aio_log_age; | |
1082 | if (mono_clock::now() - start1 >= make_timespan(age)) { | |
1083 | derr << __func__ << " stalled read " | |
1084 | << " 0x" << std::hex << off << "~" << len << std::dec | |
1085 | << " since " << start1 << ", timeout is " | |
1086 | << age | |
1087 | << "s" << dendl; | |
1088 | } | |
1089 | ||
7c673cae FG |
1090 | if (r < 0) { |
1091 | r = -errno; | |
11fdf7f2 | 1092 | derr << __func__ << " 0x" << std::hex << off << "~" << len << std::dec |
7c673cae FG |
1093 | << " error: " << cpp_strerror(r) << dendl; |
1094 | goto out; | |
1095 | } | |
11fdf7f2 | 1096 | ceph_assert((uint64_t)r == aligned_len); |
7c673cae FG |
1097 | memcpy(buf, p.c_str() + (off - aligned_off), len); |
1098 | ||
1099 | dout(40) << __func__ << " data: "; | |
1100 | bufferlist bl; | |
1101 | bl.append(buf, len); | |
1102 | bl.hexdump(*_dout); | |
1103 | *_dout << dendl; | |
1104 | ||
1105 | out: | |
1106 | return r < 0 ? r : 0; | |
1107 | } | |
1108 | ||
1109 | int KernelDevice::read_random(uint64_t off, uint64_t len, char *buf, | |
1110 | bool buffered) | |
1111 | { | |
1112 | dout(5) << __func__ << " 0x" << std::hex << off << "~" << len << std::dec | |
494da23a | 1113 | << "buffered " << buffered |
7c673cae | 1114 | << dendl; |
11fdf7f2 TL |
1115 | ceph_assert(len > 0); |
1116 | ceph_assert(off < size); | |
1117 | ceph_assert(off + len <= size); | |
7c673cae | 1118 | int r = 0; |
11fdf7f2 | 1119 | auto age = cct->_conf->bdev_debug_aio_log_age; |
7c673cae FG |
1120 | |
1121 | //if it's direct io and unaligned, we have to use a internal buffer | |
1122 | if (!buffered && ((off % block_size != 0) | |
1123 | || (len % block_size != 0) | |
1124 | || (uintptr_t(buf) % CEPH_PAGE_SIZE != 0))) | |
1125 | return direct_read_unaligned(off, len, buf); | |
1126 | ||
11fdf7f2 | 1127 | auto start1 = mono_clock::now(); |
7c673cae FG |
1128 | if (buffered) { |
1129 | //buffered read | |
11fdf7f2 | 1130 | auto off0 = off; |
7c673cae FG |
1131 | char *t = buf; |
1132 | uint64_t left = len; | |
1133 | while (left > 0) { | |
11fdf7f2 | 1134 | r = ::pread(fd_buffereds[WRITE_LIFE_NOT_SET], t, left, off); |
7c673cae FG |
1135 | if (r < 0) { |
1136 | r = -errno; | |
11fdf7f2 | 1137 | derr << __func__ << " 0x" << std::hex << off << "~" << left |
7c673cae FG |
1138 | << std::dec << " error: " << cpp_strerror(r) << dendl; |
1139 | goto out; | |
1140 | } | |
1141 | off += r; | |
1142 | t += r; | |
1143 | left -= r; | |
1144 | } | |
11fdf7f2 TL |
1145 | if (mono_clock::now() - start1 >= make_timespan(age)) { |
1146 | derr << __func__ << " stalled read " | |
1147 | << " 0x" << std::hex << off0 << "~" << len << std::dec | |
1148 | << " (buffered) since " << start1 << ", timeout is " | |
1149 | << age | |
1150 | << "s" << dendl; | |
1151 | } | |
7c673cae FG |
1152 | } else { |
1153 | //direct and aligned read | |
11fdf7f2 TL |
1154 | r = ::pread(fd_directs[WRITE_LIFE_NOT_SET], buf, len, off); |
1155 | if (mono_clock::now() - start1 >= make_timespan(age)) { | |
1156 | derr << __func__ << " stalled read " | |
1157 | << " 0x" << std::hex << off << "~" << len << std::dec | |
1158 | << " (direct) since " << start1 << ", timeout is " | |
1159 | << age | |
1160 | << "s" << dendl; | |
1161 | } | |
7c673cae FG |
1162 | if (r < 0) { |
1163 | r = -errno; | |
11fdf7f2 TL |
1164 | derr << __func__ << " direct_aligned_read" << " 0x" << std::hex |
1165 | << off << "~" << left << std::dec << " error: " << cpp_strerror(r) | |
7c673cae FG |
1166 | << dendl; |
1167 | goto out; | |
1168 | } | |
11fdf7f2 | 1169 | ceph_assert((uint64_t)r == len); |
7c673cae FG |
1170 | } |
1171 | ||
1172 | dout(40) << __func__ << " data: "; | |
1173 | bufferlist bl; | |
1174 | bl.append(buf, len); | |
1175 | bl.hexdump(*_dout); | |
1176 | *_dout << dendl; | |
1177 | ||
1178 | out: | |
1179 | return r < 0 ? r : 0; | |
1180 | } | |
1181 | ||
1182 | int KernelDevice::invalidate_cache(uint64_t off, uint64_t len) | |
1183 | { | |
1184 | dout(5) << __func__ << " 0x" << std::hex << off << "~" << len << std::dec | |
1185 | << dendl; | |
11fdf7f2 TL |
1186 | ceph_assert(off % block_size == 0); |
1187 | ceph_assert(len % block_size == 0); | |
1188 | int r = posix_fadvise(fd_buffereds[WRITE_LIFE_NOT_SET], off, len, POSIX_FADV_DONTNEED); | |
7c673cae FG |
1189 | if (r) { |
1190 | r = -r; | |
1191 | derr << __func__ << " 0x" << std::hex << off << "~" << len << std::dec | |
1192 | << " error: " << cpp_strerror(r) << dendl; | |
1193 | } | |
1194 | return r; | |
1195 | } |