// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2014 Red Hat
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */

#include <unistd.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <sys/file.h>

#include "KernelDevice.h"
#include "include/intarith.h"
#include "include/types.h"
#include "include/compat.h"
#include "include/stringify.h"
#include "common/blkdev.h"
#include "common/errno.h"
#if defined(__FreeBSD__)
#include "bsm/audit_errno.h"
#endif
#include "common/debug.h"
#include "common/numa.h"

#include "global/global_context.h"
#include "ceph_io_uring.h"

#define dout_context cct
#define dout_subsys ceph_subsys_bdev
#undef dout_prefix
#define dout_prefix *_dout << "bdev(" << this << " " << path << ") "

KernelDevice::KernelDevice(CephContext* cct, aio_callback_t cb, void *cbpriv, aio_callback_t d_cb, void *d_cbpriv)
  : BlockDevice(cct, cb, cbpriv),
    aio(false), dio(false),
    discard_callback(d_cb),
    discard_callback_priv(d_cbpriv),
    aio_stop(false),
    discard_started(false),
    discard_stop(false),
    aio_thread(this),
    discard_thread(this),
    injecting_crash(0)
{
  fd_directs.resize(WRITE_LIFE_MAX, -1);
  fd_buffereds.resize(WRITE_LIFE_MAX, -1);

  bool use_ioring = g_ceph_context->_conf.get_val<bool>("bluestore_ioring");
  unsigned int iodepth = cct->_conf->bdev_aio_max_queue_depth;

  if (use_ioring && ioring_queue_t::supported()) {
    io_queue = std::make_unique<ioring_queue_t>(iodepth);
  } else {
    static bool once;
    if (use_ioring && !once) {
      derr << "WARNING: io_uring API is not supported! Fallback to libaio!"
           << dendl;
      once = true;
    }
    io_queue = std::make_unique<aio_queue_t>(iodepth);
  }
}
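// Take an exclusive advisory lock (flock LOCK_EX | LOCK_NB) on the O_DIRECT
// fd so two processes cannot drive the same device; the lock is dropped
// automatically when the fds are closed.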
int KernelDevice::_lock()
{
  dout(10) << __func__ << " " << fd_directs[WRITE_LIFE_NOT_SET] << dendl;
  int r = ::flock(fd_directs[WRITE_LIFE_NOT_SET], LOCK_EX | LOCK_NB);
  if (r < 0) {
    derr << __func__ << " flock failed on " << path << dendl;
    return -errno;
  }
  return 0;
}
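// Open the device once per write-life hint, both O_DIRECT and buffered, so
// writes can be steered to the fd whose F_SET_FILE_RW_HINT value matches the
// caller's hint (see choose_fd()).  Any fds already opened are closed again
// under out_fail.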
int KernelDevice::open(const string& p)
{
  path = p;
  int r = 0, i = 0;
  dout(1) << __func__ << " path " << path << dendl;

  for (i = 0; i < WRITE_LIFE_MAX; i++) {
    int fd = ::open(path.c_str(), O_RDWR | O_DIRECT);
    if (fd < 0) {
      r = -errno;
      break;
    }
    fd_directs[i] = fd;

    fd = ::open(path.c_str(), O_RDWR | O_CLOEXEC);
    if (fd < 0) {
      r = -errno;
      break;
    }
    fd_buffereds[i] = fd;
  }

  if (i != WRITE_LIFE_MAX) {
    derr << __func__ << " open got: " << cpp_strerror(r) << dendl;
    goto out_fail;
  }

#if defined(F_SET_FILE_RW_HINT)
  for (i = WRITE_LIFE_NONE; i < WRITE_LIFE_MAX; i++) {
    if (fcntl(fd_directs[i], F_SET_FILE_RW_HINT, &i) < 0) {
      r = -errno;
      break;
    }
    if (fcntl(fd_buffereds[i], F_SET_FILE_RW_HINT, &i) < 0) {
      r = -errno;
      break;
    }
  }
  if (i != WRITE_LIFE_MAX) {
    enable_wrt = false;
    dout(0) << "ioctl(F_SET_FILE_RW_HINT) on " << path << " failed: " << cpp_strerror(r) << dendl;
  }
#endif

  dio = true;
  aio = cct->_conf->bdev_aio;
  if (!aio) {
    ceph_abort_msg("non-aio not supported");
  }

  // disable readahead as it will wreak havoc on our mix of
  // directio/aio and buffered io.
  r = posix_fadvise(fd_buffereds[WRITE_LIFE_NOT_SET], 0, 0, POSIX_FADV_RANDOM);
  if (r) {
    r = -r;
    derr << __func__ << " posix_fadvise got: " << cpp_strerror(r) << dendl;
    goto out_fail;
  }

  if (lock_exclusive) {
    r = _lock();
    if (r < 0) {
      derr << __func__ << " failed to lock " << path << ": " << cpp_strerror(r)
           << dendl;
      goto out_fail;
    }
  }

  struct stat st;
  r = ::fstat(fd_directs[WRITE_LIFE_NOT_SET], &st);
  if (r < 0) {
    r = -errno;
    derr << __func__ << " fstat got " << cpp_strerror(r) << dendl;
    goto out_fail;
  }

  // Operate as though the block size is 4 KB.  The backing file
  // blksize doesn't strictly matter except that some file systems may
  // require a read/modify/write if we write something smaller than
  // it.
  block_size = cct->_conf->bdev_block_size;
  if (block_size != (unsigned)st.st_blksize) {
    dout(1) << __func__ << " backing device/file reports st_blksize "
            << st.st_blksize << ", using bdev_block_size "
            << block_size << " anyway" << dendl;
  }


  {
    BlkDev blkdev_direct(fd_directs[WRITE_LIFE_NOT_SET]);
    BlkDev blkdev_buffered(fd_buffereds[WRITE_LIFE_NOT_SET]);

    if (S_ISBLK(st.st_mode)) {
      int64_t s;
      r = blkdev_direct.get_size(&s);
      if (r < 0) {
        goto out_fail;
      }
      size = s;
    } else {
      size = st.st_size;
    }

    char partition[PATH_MAX], devname[PATH_MAX];
    if ((r = blkdev_buffered.partition(partition, PATH_MAX)) ||
        (r = blkdev_buffered.wholedisk(devname, PATH_MAX))) {
      derr << "unable to get device name for " << path << ": "
           << cpp_strerror(r) << dendl;
      rotational = true;
    } else {
      dout(20) << __func__ << " devname " << devname << dendl;
      rotational = blkdev_buffered.is_rotational();
      support_discard = blkdev_buffered.support_discard();
      this->devname = devname;
      _detect_vdo();
    }
  }

  r = _aio_start();
  if (r < 0) {
    goto out_fail;
  }
  _discard_start();

  // round size down to an even block
  size &= ~(block_size - 1);

  dout(1) << __func__
          << " size " << size
          << " (0x" << std::hex << size << std::dec << ", "
          << byte_u_t(size) << ")"
          << " block_size " << block_size
          << " (" << byte_u_t(block_size) << ")"
          << " " << (rotational ? "rotational" : "non-rotational")
          << " discard " << (support_discard ? "supported" : "not supported")
          << dendl;
  return 0;

out_fail:
  for (i = 0; i < WRITE_LIFE_MAX; i++) {
    if (fd_directs[i] >= 0) {
      VOID_TEMP_FAILURE_RETRY(::close(fd_directs[i]));
      fd_directs[i] = -1;
    } else {
      break;
    }
    if (fd_buffereds[i] >= 0) {
      VOID_TEMP_FAILURE_RETRY(::close(fd_buffereds[i]));
      fd_buffereds[i] = -1;
    } else {
      break;
    }
  }
  return r;
}

int KernelDevice::get_devices(std::set<std::string> *ls) const
{
  if (devname.empty()) {
    return 0;
  }
  get_raw_devices(devname, ls);
  return 0;
}
void KernelDevice::close()
{
  dout(1) << __func__ << dendl;
  _aio_stop();
  _discard_stop();

  if (vdo_fd >= 0) {
    VOID_TEMP_FAILURE_RETRY(::close(vdo_fd));
    vdo_fd = -1;
  }

  for (int i = 0; i < WRITE_LIFE_MAX; i++) {
    assert(fd_directs[i] >= 0);
    VOID_TEMP_FAILURE_RETRY(::close(fd_directs[i]));
    fd_directs[i] = -1;

    assert(fd_buffereds[i] >= 0);
    VOID_TEMP_FAILURE_RETRY(::close(fd_buffereds[i]));
    fd_buffereds[i] = -1;
  }
  path.clear();
}

int KernelDevice::collect_metadata(const string& prefix, map<string,string> *pm) const
{
  (*pm)[prefix + "support_discard"] = stringify((int)(bool)support_discard);
  (*pm)[prefix + "rotational"] = stringify((int)(bool)rotational);
  (*pm)[prefix + "size"] = stringify(get_size());
  (*pm)[prefix + "block_size"] = stringify(get_block_size());
  (*pm)[prefix + "driver"] = "KernelDevice";
  if (rotational) {
    (*pm)[prefix + "type"] = "hdd";
  } else {
    (*pm)[prefix + "type"] = "ssd";
  }
  if (vdo_fd >= 0) {
    (*pm)[prefix + "vdo"] = "true";
    uint64_t total, avail;
    get_vdo_utilization(vdo_fd, &total, &avail);
    (*pm)[prefix + "vdo_physical_size"] = stringify(total);
  }

  {
    string res_names;
    std::set<std::string> devnames;
    if (get_devices(&devnames) == 0) {
      for (auto& dev : devnames) {
        if (!res_names.empty()) {
          res_names += ",";
        }
        res_names += dev;
      }
      if (res_names.size()) {
        (*pm)[prefix + "devices"] = res_names;
      }
    }
  }

  struct stat st;
  int r = ::fstat(fd_buffereds[WRITE_LIFE_NOT_SET], &st);
  if (r < 0)
    return -errno;
  if (S_ISBLK(st.st_mode)) {
    (*pm)[prefix + "access_mode"] = "blk";

    char buffer[1024] = {0};
    BlkDev blkdev{fd_buffereds[WRITE_LIFE_NOT_SET]};
    if (r = blkdev.partition(buffer, sizeof(buffer)); r) {
      (*pm)[prefix + "partition_path"] = "unknown";
    } else {
      (*pm)[prefix + "partition_path"] = buffer;
    }
    buffer[0] = '\0';
    if (r = blkdev.partition(buffer, sizeof(buffer)); r) {
      (*pm)[prefix + "dev_node"] = "unknown";
    } else {
      (*pm)[prefix + "dev_node"] = buffer;
    }
    if (!r) {
      return 0;
    }
    buffer[0] = '\0';
    blkdev.model(buffer, sizeof(buffer));
    (*pm)[prefix + "model"] = buffer;

    buffer[0] = '\0';
    blkdev.dev(buffer, sizeof(buffer));
    (*pm)[prefix + "dev"] = buffer;

    // nvme exposes a serial number
    buffer[0] = '\0';
    blkdev.serial(buffer, sizeof(buffer));
    (*pm)[prefix + "serial"] = buffer;

    // numa
    int node;
    r = blkdev.get_numa_node(&node);
    if (r >= 0) {
      (*pm)[prefix + "numa_node"] = stringify(node);
    }
  } else {
    (*pm)[prefix + "access_mode"] = "file";
    (*pm)[prefix + "path"] = path;
  }
  return 0;
}
void KernelDevice::_detect_vdo()
{
  vdo_fd = get_vdo_stats_handle(devname.c_str(), &vdo_name);
  if (vdo_fd >= 0) {
    dout(1) << __func__ << " VDO volume " << vdo_name
            << " maps to " << devname << dendl;
  } else {
    dout(20) << __func__ << " no VDO volume maps to " << devname << dendl;
  }
  return;
}

bool KernelDevice::get_thin_utilization(uint64_t *total, uint64_t *avail) const
{
  if (vdo_fd < 0) {
    return false;
  }
  return get_vdo_utilization(vdo_fd, total, avail);
}
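// Pick the fd matching the caller's write-life hint; if setting the hints
// failed at open() time (enable_wrt == false), fall back to the
// WRITE_LIFE_NOT_SET fd.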
int KernelDevice::choose_fd(bool buffered, int write_hint) const
{
  assert(write_hint >= WRITE_LIFE_NOT_SET && write_hint < WRITE_LIFE_MAX);
  if (!enable_wrt)
    write_hint = WRITE_LIFE_NOT_SET;
  return buffered ? fd_buffereds[write_hint] : fd_directs[write_hint];
}

int KernelDevice::flush()
{
  // protect flush with a mutex.  note that we are not really protecting
  // data here.  instead, we're ensuring that if any flush() caller
  // sees that io_since_flush is true, they block any racing callers
  // until the flush is observed.  that allows racing threads to be
  // calling flush while still ensuring that *any* of them that got an
  // aio completion notification will not return before that aio is
  // stable on disk: whichever thread sees the flag first will block
  // followers until the aio is stable.
  std::lock_guard l(flush_mutex);

  bool expect = true;
  if (!io_since_flush.compare_exchange_strong(expect, false)) {
    dout(10) << __func__ << " no-op (no ios since last flush), flag is "
             << (int)io_since_flush.load() << dendl;
    return 0;
  }

  dout(10) << __func__ << " start" << dendl;
  if (cct->_conf->bdev_inject_crash) {
    ++injecting_crash;
    // sleep for a moment to give other threads a chance to submit or
    // wait on io that races with a flush.
    derr << __func__ << " injecting crash. first we sleep..." << dendl;
    sleep(cct->_conf->bdev_inject_crash_flush_delay);
    derr << __func__ << " and now we die" << dendl;
    cct->_log->flush();
    _exit(1);
  }
  utime_t start = ceph_clock_now();
  int r = ::fdatasync(fd_directs[WRITE_LIFE_NOT_SET]);
  utime_t end = ceph_clock_now();
  utime_t dur = end - start;
  if (r < 0) {
    r = -errno;
    derr << __func__ << " fdatasync got: " << cpp_strerror(r) << dendl;
    ceph_abort();
  }
  dout(5) << __func__ << " in " << dur << dendl;
  return r;
}

int KernelDevice::_aio_start()
{
  if (aio) {
    dout(10) << __func__ << dendl;
    int r = io_queue->init(fd_directs);
    if (r < 0) {
      if (r == -EAGAIN) {
        derr << __func__ << " io_setup(2) failed with EAGAIN; "
             << "try increasing /proc/sys/fs/aio-max-nr" << dendl;
      } else {
        derr << __func__ << " io_setup(2) failed: " << cpp_strerror(r) << dendl;
      }
      return r;
    }
    aio_thread.create("bstore_aio");
  }
  return 0;
}

void KernelDevice::_aio_stop()
{
  if (aio) {
    dout(10) << __func__ << dendl;
    aio_stop = true;
    aio_thread.join();
    aio_stop = false;
    io_queue->shutdown();
  }
}

int KernelDevice::_discard_start()
{
  discard_thread.create("bstore_discard");
  return 0;
}

void KernelDevice::_discard_stop()
{
  dout(10) << __func__ << dendl;
  {
    std::unique_lock l(discard_lock);
    while (!discard_started) {
      discard_cond.wait(l);
    }
    discard_stop = true;
    discard_cond.notify_all();
  }
  discard_thread.join();
  {
    std::lock_guard l(discard_lock);
    discard_stop = false;
  }
  dout(10) << __func__ << " stopped" << dendl;
}

void KernelDevice::discard_drain()
{
  dout(10) << __func__ << dendl;
  std::unique_lock l(discard_lock);
  while (!discard_queued.empty() || discard_running) {
    discard_cond.wait(l);
  }
}

static bool is_expected_ioerr(const int r)
{
  // https://lxr.missinglinkelectronics.com/linux+v4.15/block/blk-core.c#L135
  return (r == -EOPNOTSUPP || r == -ETIMEDOUT || r == -ENOSPC ||
          r == -ENOLINK || r == -EREMOTEIO || r == -EAGAIN || r == -EIO ||
          r == -ENODATA || r == -EILSEQ || r == -ENOMEM ||
#if defined(__linux__)
          r == -EREMCHG || r == -EBADE
#elif defined(__FreeBSD__)
          r == -BSM_ERRNO_EREMCHG || r == -BSM_ERRNO_EBADE
#endif
          );
}
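// Completion poller: reap up to bdev_aio_reap_max finished aios every
// bdev_aio_poll_ms, set io_since_flush *before* notifying waiters so a
// following flush() covers the io, and abort on unexpected io errors.  With
// bdev_debug_aio set, aios stalled past bdev_debug_aio_suicide_timeout also
// trigger an abort.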
void KernelDevice::_aio_thread()
{
  dout(10) << __func__ << " start" << dendl;
  int inject_crash_count = 0;
  while (!aio_stop) {
    dout(40) << __func__ << " polling" << dendl;
    int max = cct->_conf->bdev_aio_reap_max;
    aio_t *aio[max];
    int r = io_queue->get_next_completed(cct->_conf->bdev_aio_poll_ms,
                                         aio, max);
    if (r < 0) {
      derr << __func__ << " got " << cpp_strerror(r) << dendl;
      ceph_abort_msg("got unexpected error from io_getevents");
    }
    if (r > 0) {
      dout(30) << __func__ << " got " << r << " completed aios" << dendl;
      for (int i = 0; i < r; ++i) {
        IOContext *ioc = static_cast<IOContext*>(aio[i]->priv);
        _aio_log_finish(ioc, aio[i]->offset, aio[i]->length);
        if (aio[i]->queue_item.is_linked()) {
          std::lock_guard l(debug_queue_lock);
          debug_aio_unlink(*aio[i]);
        }

        // set flag indicating new ios have completed.  we do this *before*
        // any completion or notifications so that any user flush() that
        // follows the observed io completion will include this io.  Note
        // that an earlier, racing flush() could observe and clear this
        // flag, but that also ensures that the IO will be stable before the
        // later flush() occurs.
        io_since_flush.store(true);

        long r = aio[i]->get_return_value();
        if (r < 0) {
          derr << __func__ << " got r=" << r << " (" << cpp_strerror(r) << ")"
               << dendl;
          if (ioc->allow_eio && is_expected_ioerr(r)) {
            derr << __func__ << " translating the error to EIO for upper layer"
                 << dendl;
            ioc->set_return_value(-EIO);
          } else {
            if (is_expected_ioerr(r)) {
              note_io_error_event(
                devname.c_str(),
                path.c_str(),
                r,
#if defined(HAVE_POSIXAIO)
                aio[i]->aio.aiocb.aio_lio_opcode,
#else
                aio[i]->iocb.aio_lio_opcode,
#endif
                aio[i]->offset,
                aio[i]->length);
              ceph_abort_msg(
                "Unexpected IO error. "
                "This may suggest a hardware issue. "
                "Please check your kernel log!");
            }
            ceph_abort_msg(
              "Unexpected IO error. "
              "This may suggest HW issue. Please check your dmesg!");
          }
        } else if (aio[i]->length != (uint64_t)r) {
          derr << "aio to 0x" << std::hex << aio[i]->offset
               << "~" << aio[i]->length << std::dec
               << " but returned: " << r << dendl;
          ceph_abort_msg("unexpected aio return value: does not match length");
        }

        dout(10) << __func__ << " finished aio " << aio[i] << " r " << r
                 << " ioc " << ioc
                 << " with " << (ioc->num_running.load() - 1)
                 << " aios left" << dendl;

        // NOTE: once we decrement num_running and either call the callback
        // or call aio_wake, we cannot touch ioc or aio[] as the caller may
        // free them.
        if (ioc->priv) {
          if (--ioc->num_running == 0) {
            aio_callback(aio_callback_priv, ioc->priv);
          }
        } else {
          ioc->try_aio_wake();
        }
      }
    }
    if (cct->_conf->bdev_debug_aio) {
      utime_t now = ceph_clock_now();
      std::lock_guard l(debug_queue_lock);
      if (debug_oldest) {
        if (debug_stall_since == utime_t()) {
          debug_stall_since = now;
        } else {
          if (cct->_conf->bdev_debug_aio_suicide_timeout) {
            utime_t cutoff = now;
            cutoff -= cct->_conf->bdev_debug_aio_suicide_timeout;
            if (debug_stall_since < cutoff) {
              derr << __func__ << " stalled aio " << debug_oldest
                   << " since " << debug_stall_since << ", timeout is "
                   << cct->_conf->bdev_debug_aio_suicide_timeout
                   << "s, suicide" << dendl;
              ceph_abort_msg("stalled aio... buggy kernel or bad device?");
            }
          }
        }
      }
    }
    reap_ioc();
    if (cct->_conf->bdev_inject_crash) {
      ++inject_crash_count;
      if (inject_crash_count * cct->_conf->bdev_aio_poll_ms / 1000 >
          cct->_conf->bdev_inject_crash + cct->_conf->bdev_inject_crash_flush_delay) {
        derr << __func__ << " bdev_inject_crash trigger from aio thread"
             << dendl;
        cct->_log->flush();
        _exit(1);
      }
    }
  }
  reap_ioc();
  dout(10) << __func__ << " end" << dendl;
}
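// Background discard worker: swap the queued extents into discard_finishing,
// drop discard_lock while the discards are issued, then hand the finished
// set back through discard_callback.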
void KernelDevice::_discard_thread()
{
  std::unique_lock l(discard_lock);
  ceph_assert(!discard_started);
  discard_started = true;
  discard_cond.notify_all();
  while (true) {
    ceph_assert(discard_finishing.empty());
    if (discard_queued.empty()) {
      if (discard_stop)
        break;
      dout(20) << __func__ << " sleep" << dendl;
      discard_cond.notify_all(); // for the thread trying to drain...
      discard_cond.wait(l);
      dout(20) << __func__ << " wake" << dendl;
    } else {
      discard_finishing.swap(discard_queued);
      discard_running = true;
      l.unlock();
      dout(20) << __func__ << " finishing" << dendl;
      for (auto p = discard_finishing.begin(); p != discard_finishing.end(); ++p) {
        discard(p.get_start(), p.get_len());
      }

      discard_callback(discard_callback_priv, static_cast<void*>(&discard_finishing));
      discard_finishing.clear();
      l.lock();
      discard_running = false;
    }
  }
  dout(10) << __func__ << " finish" << dendl;
  discard_started = false;
}

int KernelDevice::queue_discard(interval_set<uint64_t> &to_release)
{
  if (!support_discard)
    return -1;

  if (to_release.empty())
    return 0;

  std::lock_guard l(discard_lock);
  discard_queued.insert(to_release);
  discard_cond.notify_all();
  return 0;
}

void KernelDevice::_aio_log_start(
  IOContext *ioc,
  uint64_t offset,
  uint64_t length)
{
  dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
           << std::dec << dendl;
  if (cct->_conf->bdev_debug_inflight_ios) {
    std::lock_guard l(debug_lock);
    if (debug_inflight.intersects(offset, length)) {
      derr << __func__ << " inflight overlap of 0x"
           << std::hex
           << offset << "~" << length << std::dec
           << " with " << debug_inflight << dendl;
      ceph_abort();
    }
    debug_inflight.insert(offset, length);
  }
}

void KernelDevice::debug_aio_link(aio_t& aio)
{
  if (debug_queue.empty()) {
    debug_oldest = &aio;
  }
  debug_queue.push_back(aio);
}

void KernelDevice::debug_aio_unlink(aio_t& aio)
{
  if (aio.queue_item.is_linked()) {
    debug_queue.erase(debug_queue.iterator_to(aio));
    if (debug_oldest == &aio) {
      auto age = cct->_conf->bdev_debug_aio_log_age;
      if (age && debug_stall_since != utime_t()) {
        utime_t cutoff = ceph_clock_now();
        cutoff -= age;
        if (debug_stall_since < cutoff) {
          derr << __func__ << " stalled aio " << debug_oldest
               << " since " << debug_stall_since << ", timeout is "
               << age
               << "s" << dendl;
        }
      }

      if (debug_queue.empty()) {
        debug_oldest = nullptr;
      } else {
        debug_oldest = &debug_queue.front();
      }
      debug_stall_since = utime_t();
    }
  }
}

void KernelDevice::_aio_log_finish(
  IOContext *ioc,
  uint64_t offset,
  uint64_t length)
{
  dout(20) << __func__ << " " << aio << " 0x"
           << std::hex << offset << "~" << length << std::dec << dendl;
  if (cct->_conf->bdev_debug_inflight_ios) {
    std::lock_guard l(debug_lock);
    debug_inflight.erase(offset, length);
  }
}
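// Move everything on pending_aios onto running_aios and submit the whole
// batch with a single io_queue->submit_batch() call.  Only one thread may
// submit a given IOContext at a time, so num_pending must drain to zero here.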
void KernelDevice::aio_submit(IOContext *ioc)
{
  dout(20) << __func__ << " ioc " << ioc
           << " pending " << ioc->num_pending.load()
           << " running " << ioc->num_running.load()
           << dendl;

  if (ioc->num_pending.load() == 0) {
    return;
  }

  // move these aside, and get our end iterator position now, as the
  // aios might complete as soon as they are submitted and queue more
  // wal aio's.
  list<aio_t>::iterator e = ioc->running_aios.begin();
  ioc->running_aios.splice(e, ioc->pending_aios);

  int pending = ioc->num_pending.load();
  ioc->num_running += pending;
  ioc->num_pending -= pending;
  ceph_assert(ioc->num_pending.load() == 0);  // we should be only thread doing this
  ceph_assert(ioc->pending_aios.size() == 0);

  if (cct->_conf->bdev_debug_aio) {
    list<aio_t>::iterator p = ioc->running_aios.begin();
    while (p != e) {
      dout(30) << __func__ << " " << *p << dendl;
      std::lock_guard l(debug_queue_lock);
      debug_aio_link(*p++);
    }
  }

  void *priv = static_cast<void*>(ioc);
  int r, retries = 0;
  r = io_queue->submit_batch(ioc->running_aios.begin(), e,
                             pending, priv, &retries);

  if (retries)
    derr << __func__ << " retries " << retries << dendl;
  if (r < 0) {
    derr << " aio submit got " << cpp_strerror(r) << dendl;
    ceph_assert(r == 0);
  }
}
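// Synchronous write path: pwritev() the iovecs, resuming from the first
// unwritten iovec after any short write, then for buffered io use
// sync_file_range() to initiate and wait for writeback of the dirtied range.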
int KernelDevice::_sync_write(uint64_t off, bufferlist &bl, bool buffered, int write_hint)
{
  uint64_t len = bl.length();
  dout(5) << __func__ << " 0x" << std::hex << off << "~" << len
          << std::dec << (buffered ? " (buffered)" : " (direct)") << dendl;
  if (cct->_conf->bdev_inject_crash &&
      rand() % cct->_conf->bdev_inject_crash == 0) {
    derr << __func__ << " bdev_inject_crash: dropping io 0x" << std::hex
         << off << "~" << len << std::dec << dendl;
    ++injecting_crash;
    return 0;
  }
  vector<iovec> iov;
  bl.prepare_iov(&iov);

  auto left = len;
  auto o = off;
  size_t idx = 0;
  do {
    auto r = ::pwritev(choose_fd(buffered, write_hint),
                       &iov[idx], iov.size() - idx, o);

    if (r < 0) {
      r = -errno;
      derr << __func__ << " pwritev error: " << cpp_strerror(r) << dendl;
      return r;
    }
    o += r;
    left -= r;
    if (left) {
      // skip fully processed IOVs
      while (idx < iov.size() && (size_t)r >= iov[idx].iov_len) {
        r -= iov[idx++].iov_len;
      }
      // update partially processed one if any
      if (r) {
        ceph_assert(idx < iov.size());
        ceph_assert((size_t)r < iov[idx].iov_len);
        iov[idx].iov_base = static_cast<char*>(iov[idx].iov_base) + r;
        iov[idx].iov_len -= r;
        r = 0;
      }
      ceph_assert(r == 0);
    }
  } while (left);

#ifdef HAVE_SYNC_FILE_RANGE
  if (buffered) {
    // initiate IO and wait till it completes
    auto r = ::sync_file_range(fd_buffereds[WRITE_LIFE_NOT_SET], off, len, SYNC_FILE_RANGE_WRITE|SYNC_FILE_RANGE_WAIT_AFTER|SYNC_FILE_RANGE_WAIT_BEFORE);
    if (r < 0) {
      r = -errno;
      derr << __func__ << " sync_file_range error: " << cpp_strerror(r) << dendl;
      return r;
    }
  }
#endif

  io_since_flush.store(true);

  return 0;
}

int KernelDevice::write(
  uint64_t off,
  bufferlist &bl,
  bool buffered,
  int write_hint)
{
  uint64_t len = bl.length();
  dout(20) << __func__ << " 0x" << std::hex << off << "~" << len << std::dec
           << (buffered ? " (buffered)" : " (direct)")
           << dendl;
  ceph_assert(is_valid_io(off, len));
  if (cct->_conf->objectstore_blackhole) {
    lderr(cct) << __func__ << " objectstore_blackhole=true, throwing out IO"
               << dendl;
    return 0;
  }

  if ((!buffered || bl.get_num_buffers() >= IOV_MAX) &&
      bl.rebuild_aligned_size_and_memory(block_size, block_size, IOV_MAX)) {
    dout(20) << __func__ << " rebuilding buffer to be aligned" << dendl;
  }
  dout(40) << "data: ";
  bl.hexdump(*_dout);
  *_dout << dendl;

  return _sync_write(off, bl, buffered, write_hint);
}
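// Queue an asynchronous write on the IOContext.  Writes larger than RW_IO_MAX
// are split into RW_IO_MAX-sized aios; nothing reaches the kernel until
// aio_submit() is called.  Without libaio, or for buffered io, this falls
// back to _sync_write().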
int KernelDevice::aio_write(
  uint64_t off,
  bufferlist &bl,
  IOContext *ioc,
  bool buffered,
  int write_hint)
{
  uint64_t len = bl.length();
  dout(20) << __func__ << " 0x" << std::hex << off << "~" << len << std::dec
           << (buffered ? " (buffered)" : " (direct)")
           << dendl;
  ceph_assert(is_valid_io(off, len));
  if (cct->_conf->objectstore_blackhole) {
    lderr(cct) << __func__ << " objectstore_blackhole=true, throwing out IO"
               << dendl;
    return 0;
  }

  if ((!buffered || bl.get_num_buffers() >= IOV_MAX) &&
      bl.rebuild_aligned_size_and_memory(block_size, block_size, IOV_MAX)) {
    dout(20) << __func__ << " rebuilding buffer to be aligned" << dendl;
  }
  dout(40) << "data: ";
  bl.hexdump(*_dout);
  *_dout << dendl;

  _aio_log_start(ioc, off, len);

#ifdef HAVE_LIBAIO
  if (aio && dio && !buffered) {
    if (cct->_conf->bdev_inject_crash &&
        rand() % cct->_conf->bdev_inject_crash == 0) {
      derr << __func__ << " bdev_inject_crash: dropping io 0x" << std::hex
           << off << "~" << len << std::dec
           << dendl;
      // generate a real io so that aio_wait behaves properly, but make it
      // a read instead of write, and toss the result.
      ioc->pending_aios.push_back(aio_t(ioc, choose_fd(false, write_hint)));
      ++ioc->num_pending;
      auto& aio = ioc->pending_aios.back();
      bufferptr p = buffer::create_small_page_aligned(len);
      aio.bl.append(std::move(p));
      aio.bl.prepare_iov(&aio.iov);
      aio.preadv(off, len);
      ++injecting_crash;
    } else {
      if (bl.length() <= RW_IO_MAX) {
        // fast path (non-huge write)
        ioc->pending_aios.push_back(aio_t(ioc, choose_fd(false, write_hint)));
        ++ioc->num_pending;
        auto& aio = ioc->pending_aios.back();
        bl.prepare_iov(&aio.iov);
        aio.bl.claim_append(bl);
        aio.pwritev(off, len);
        dout(30) << aio << dendl;
        dout(5) << __func__ << " 0x" << std::hex << off << "~" << len
                << std::dec << " aio " << &aio << dendl;
      } else {
        // write in RW_IO_MAX-sized chunks
        uint64_t prev_len = 0;
        while (prev_len < bl.length()) {
          bufferlist tmp;
          if (prev_len + RW_IO_MAX < bl.length()) {
            tmp.substr_of(bl, prev_len, RW_IO_MAX);
          } else {
            tmp.substr_of(bl, prev_len, bl.length() - prev_len);
          }
          auto len = tmp.length();
          ioc->pending_aios.push_back(aio_t(ioc, choose_fd(false, write_hint)));
          ++ioc->num_pending;
          auto& aio = ioc->pending_aios.back();
          tmp.prepare_iov(&aio.iov);
          aio.bl.claim_append(tmp);
          aio.pwritev(off + prev_len, len);
          dout(30) << aio << dendl;
          dout(5) << __func__ << " 0x" << std::hex << off + prev_len
                  << "~" << len
                  << std::dec << " aio " << &aio << " (piece)" << dendl;
          prev_len += len;
        }
      }
    }
  } else
#endif
  {
    int r = _sync_write(off, bl, buffered, write_hint);
    _aio_log_finish(ioc, off, len);
    if (r < 0)
      return r;
  }
  return 0;
}

int KernelDevice::discard(uint64_t offset, uint64_t len)
{
  int r = 0;
  if (cct->_conf->objectstore_blackhole) {
    lderr(cct) << __func__ << " objectstore_blackhole=true, throwing out IO"
               << dendl;
    return 0;
  }
  if (support_discard) {
    dout(10) << __func__
             << " 0x" << std::hex << offset << "~" << len << std::dec
             << dendl;

    r = BlkDev{fd_directs[WRITE_LIFE_NOT_SET]}.discard((int64_t)offset, (int64_t)len);
  }
  return r;
}
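// Synchronous read into a freshly allocated page-aligned buffer, using the
// buffered or direct fd as requested; reads stalled longer than
// bdev_debug_aio_log_age are logged.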
int KernelDevice::read(uint64_t off, uint64_t len, bufferlist *pbl,
                       IOContext *ioc,
                       bool buffered)
{
  dout(5) << __func__ << " 0x" << std::hex << off << "~" << len << std::dec
          << (buffered ? " (buffered)" : " (direct)")
          << dendl;
  ceph_assert(is_valid_io(off, len));

  _aio_log_start(ioc, off, len);

  auto start1 = mono_clock::now();

  auto p = buffer::ptr_node::create(buffer::create_small_page_aligned(len));
  int r = ::pread(buffered ? fd_buffereds[WRITE_LIFE_NOT_SET] : fd_directs[WRITE_LIFE_NOT_SET],
                  p->c_str(), len, off);
  auto age = cct->_conf->bdev_debug_aio_log_age;
  if (mono_clock::now() - start1 >= make_timespan(age)) {
    derr << __func__ << " stalled read "
         << " 0x" << std::hex << off << "~" << len << std::dec
         << (buffered ? " (buffered)" : " (direct)")
         << " since " << start1 << ", timeout is "
         << age
         << "s" << dendl;
  }

  if (r < 0) {
    if (ioc->allow_eio && is_expected_ioerr(r)) {
      r = -EIO;
    } else {
      r = -errno;
    }
    goto out;
  }
  ceph_assert((uint64_t)r == len);
  pbl->push_back(std::move(p));

  dout(40) << "data: ";
  pbl->hexdump(*_dout);
  *_dout << dendl;

 out:
  _aio_log_finish(ioc, off, len);
  return r < 0 ? r : 0;
}

int KernelDevice::aio_read(
  uint64_t off,
  uint64_t len,
  bufferlist *pbl,
  IOContext *ioc)
{
  dout(5) << __func__ << " 0x" << std::hex << off << "~" << len << std::dec
          << dendl;

  int r = 0;
#ifdef HAVE_LIBAIO
  if (aio && dio) {
    ceph_assert(is_valid_io(off, len));
    _aio_log_start(ioc, off, len);
    ioc->pending_aios.push_back(aio_t(ioc, fd_directs[WRITE_LIFE_NOT_SET]));
    ++ioc->num_pending;
    aio_t& aio = ioc->pending_aios.back();
    bufferptr p = buffer::create_small_page_aligned(len);
    aio.bl.append(std::move(p));
    aio.bl.prepare_iov(&aio.iov);
    aio.preadv(off, len);
    dout(30) << aio << dendl;
    pbl->append(aio.bl);
    dout(5) << __func__ << " 0x" << std::hex << off << "~" << len
            << std::dec << " aio " << &aio << dendl;
  } else
#endif
  {
    r = read(off, len, pbl, ioc, false);
  }

  return r;
}
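// O_DIRECT reads must be block-aligned, so read the enclosing aligned extent
// into a bounce buffer and copy out just the requested range.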
int KernelDevice::direct_read_unaligned(uint64_t off, uint64_t len, char *buf)
{
  uint64_t aligned_off = p2align(off, block_size);
  uint64_t aligned_len = p2roundup(off+len, block_size) - aligned_off;
  bufferptr p = buffer::create_small_page_aligned(aligned_len);
  int r = 0;

  auto start1 = mono_clock::now();
  r = ::pread(fd_directs[WRITE_LIFE_NOT_SET], p.c_str(), aligned_len, aligned_off);
  auto age = cct->_conf->bdev_debug_aio_log_age;
  if (mono_clock::now() - start1 >= make_timespan(age)) {
    derr << __func__ << " stalled read "
         << " 0x" << std::hex << off << "~" << len << std::dec
         << " since " << start1 << ", timeout is "
         << age
         << "s" << dendl;
  }

  if (r < 0) {
    r = -errno;
    derr << __func__ << " 0x" << std::hex << off << "~" << len << std::dec
         << " error: " << cpp_strerror(r) << dendl;
    goto out;
  }
  ceph_assert((uint64_t)r == aligned_len);
  memcpy(buf, p.c_str() + (off - aligned_off), len);

  dout(40) << __func__ << " data: ";
  bufferlist bl;
  bl.append(buf, len);
  bl.hexdump(*_dout);
  *_dout << dendl;

 out:
  return r < 0 ? r : 0;
}
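// Read into a caller-supplied buffer.  Unaligned direct reads bounce through
// direct_read_unaligned(); buffered reads loop on pread() until the full
// length is read, since short reads are legal there.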
int KernelDevice::read_random(uint64_t off, uint64_t len, char *buf,
                              bool buffered)
{
  dout(5) << __func__ << " 0x" << std::hex << off << "~" << len << std::dec
          << " buffered " << buffered
          << dendl;
  ceph_assert(len > 0);
  ceph_assert(off < size);
  ceph_assert(off + len <= size);
  int r = 0;
  auto age = cct->_conf->bdev_debug_aio_log_age;

  //if it's direct io and unaligned, we have to use a internal buffer
  if (!buffered && ((off % block_size != 0)
                    || (len % block_size != 0)
                    || (uintptr_t(buf) % CEPH_PAGE_SIZE != 0)))
    return direct_read_unaligned(off, len, buf);

  auto start1 = mono_clock::now();
  if (buffered) {
    //buffered read
    auto off0 = off;
    char *t = buf;
    uint64_t left = len;
    while (left > 0) {
      r = ::pread(fd_buffereds[WRITE_LIFE_NOT_SET], t, left, off);
      if (r < 0) {
        r = -errno;
        derr << __func__ << " 0x" << std::hex << off << "~" << left
             << std::dec << " error: " << cpp_strerror(r) << dendl;
        goto out;
      }
      off += r;
      t += r;
      left -= r;
    }
    if (mono_clock::now() - start1 >= make_timespan(age)) {
      derr << __func__ << " stalled read "
           << " 0x" << std::hex << off0 << "~" << len << std::dec
           << " (buffered) since " << start1 << ", timeout is "
           << age
           << "s" << dendl;
    }
  } else {
    //direct and aligned read
    r = ::pread(fd_directs[WRITE_LIFE_NOT_SET], buf, len, off);
    if (mono_clock::now() - start1 >= make_timespan(age)) {
      derr << __func__ << " stalled read "
           << " 0x" << std::hex << off << "~" << len << std::dec
           << " (direct) since " << start1 << ", timeout is "
           << age
           << "s" << dendl;
    }
    if (r < 0) {
      r = -errno;
      derr << __func__ << " direct_aligned_read" << " 0x" << std::hex
           << off << "~" << len << std::dec << " error: " << cpp_strerror(r)
           << dendl;
      goto out;
    }
    ceph_assert((uint64_t)r == len);
  }

  dout(40) << __func__ << " data: ";
  bufferlist bl;
  bl.append(buf, len);
  bl.hexdump(*_dout);
  *_dout << dendl;

 out:
  return r < 0 ? r : 0;
}
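// Hint the kernel to drop cached pages for this range (POSIX_FADV_DONTNEED).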
int KernelDevice::invalidate_cache(uint64_t off, uint64_t len)
{
  dout(5) << __func__ << " 0x" << std::hex << off << "~" << len << std::dec
          << dendl;
  ceph_assert(off % block_size == 0);
  ceph_assert(len % block_size == 0);
  int r = posix_fadvise(fd_buffereds[WRITE_LIFE_NOT_SET], off, len, POSIX_FADV_DONTNEED);
  if (r) {
    r = -r;
    derr << __func__ << " 0x" << std::hex << off << "~" << len << std::dec
         << " error: " << cpp_strerror(r) << dendl;
  }
  return r;
}