]> git.proxmox.com Git - ceph.git/blame - ceph/src/os/bluestore/KernelDevice.cc
buildsys: auto-determine current version for makefile
[ceph.git] / ceph / src / os / bluestore / KernelDevice.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2014 Red Hat
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15#include <unistd.h>
16#include <stdlib.h>
17#include <sys/types.h>
18#include <sys/stat.h>
19#include <fcntl.h>
11fdf7f2 20#include <sys/file.h>
7c673cae
FG
21
22#include "KernelDevice.h"
23#include "include/types.h"
24#include "include/compat.h"
25#include "include/stringify.h"
11fdf7f2 26#include "common/blkdev.h"
7c673cae 27#include "common/errno.h"
11fdf7f2
TL
28#if defined(__FreeBSD__)
29#include "bsm/audit_errno.h"
30#endif
7c673cae 31#include "common/debug.h"
7c673cae 32#include "common/align.h"
11fdf7f2
TL
33#include "common/numa.h"
34
35#include "global/global_context.h"
7c673cae
FG
36
37#define dout_context cct
38#define dout_subsys ceph_subsys_bdev
39#undef dout_prefix
40#define dout_prefix *_dout << "bdev(" << this << " " << path << ") "
41
11fdf7f2
TL
42KernelDevice::KernelDevice(CephContext* cct, aio_callback_t cb, void *cbpriv, aio_callback_t d_cb, void *d_cbpriv)
43 : BlockDevice(cct, cb, cbpriv),
44 aio(false), dio(false),
7c673cae 45 aio_queue(cct->_conf->bdev_aio_max_queue_depth),
11fdf7f2
TL
46 discard_callback(d_cb),
47 discard_callback_priv(d_cbpriv),
7c673cae 48 aio_stop(false),
11fdf7f2
TL
49 discard_started(false),
50 discard_stop(false),
7c673cae 51 aio_thread(this),
11fdf7f2 52 discard_thread(this),
7c673cae
FG
53 injecting_crash(0)
54{
11fdf7f2
TL
55 fd_directs.resize(WRITE_LIFE_MAX, -1);
56 fd_buffereds.resize(WRITE_LIFE_MAX, -1);
7c673cae
FG
57}
58
59int KernelDevice::_lock()
60{
11fdf7f2
TL
61 dout(10) << __func__ << " " << fd_directs[WRITE_LIFE_NOT_SET] << dendl;
62 int r = ::flock(fd_directs[WRITE_LIFE_NOT_SET], LOCK_EX | LOCK_NB);
63 if (r < 0) {
64 derr << __func__ << " flock failed on " << path << dendl;
7c673cae 65 return -errno;
11fdf7f2 66 }
7c673cae
FG
67 return 0;
68}
69
70int KernelDevice::open(const string& p)
71{
72 path = p;
11fdf7f2 73 int r = 0, i = 0;
7c673cae
FG
74 dout(1) << __func__ << " path " << path << dendl;
75
11fdf7f2
TL
76 for (i = 0; i < WRITE_LIFE_MAX; i++) {
77 int fd = ::open(path.c_str(), O_RDWR | O_DIRECT);
78 if (fd < 0) {
79 r = -errno;
80 break;
81 }
82 fd_directs[i] = fd;
83
84 fd = ::open(path.c_str(), O_RDWR | O_CLOEXEC);
85 if (fd < 0) {
86 r = -errno;
87 break;
88 }
89 fd_buffereds[i] = fd;
7c673cae 90 }
11fdf7f2
TL
91
92 if (i != WRITE_LIFE_MAX) {
7c673cae 93 derr << __func__ << " open got: " << cpp_strerror(r) << dendl;
11fdf7f2
TL
94 goto out_fail;
95 }
96
97#if defined(F_SET_FILE_RW_HINT)
98 for (i = WRITE_LIFE_NONE; i < WRITE_LIFE_MAX; i++) {
99 if (fcntl(fd_directs[i], F_SET_FILE_RW_HINT, &i) < 0) {
100 r = -errno;
101 break;
102 }
103 if (fcntl(fd_buffereds[i], F_SET_FILE_RW_HINT, &i) < 0) {
104 r = -errno;
105 break;
106 }
7c673cae 107 }
11fdf7f2
TL
108 if (i != WRITE_LIFE_MAX) {
109 enable_wrt = false;
110 dout(0) << "ioctl(F_SET_FILE_RW_HINT) on " << path << " failed: " << cpp_strerror(r) << dendl;
111 }
112#endif
113
7c673cae
FG
114 dio = true;
115 aio = cct->_conf->bdev_aio;
116 if (!aio) {
11fdf7f2 117 ceph_abort_msg("non-aio not supported");
7c673cae
FG
118 }
119
120 // disable readahead as it will wreak havoc on our mix of
121 // directio/aio and buffered io.
11fdf7f2 122 r = posix_fadvise(fd_buffereds[WRITE_LIFE_NOT_SET], 0, 0, POSIX_FADV_RANDOM);
7c673cae
FG
123 if (r) {
124 r = -r;
125 derr << __func__ << " open got: " << cpp_strerror(r) << dendl;
126 goto out_fail;
127 }
128
11fdf7f2
TL
129 if (lock_exclusive) {
130 r = _lock();
131 if (r < 0) {
132 derr << __func__ << " failed to lock " << path << ": " << cpp_strerror(r)
133 << dendl;
134 goto out_fail;
135 }
7c673cae
FG
136 }
137
138 struct stat st;
11fdf7f2 139 r = ::fstat(fd_directs[WRITE_LIFE_NOT_SET], &st);
7c673cae
FG
140 if (r < 0) {
141 r = -errno;
142 derr << __func__ << " fstat got " << cpp_strerror(r) << dendl;
143 goto out_fail;
144 }
145
146 // Operate as though the block size is 4 KB. The backing file
147 // blksize doesn't strictly matter except that some file systems may
148 // require a read/modify/write if we write something smaller than
149 // it.
150 block_size = cct->_conf->bdev_block_size;
151 if (block_size != (unsigned)st.st_blksize) {
152 dout(1) << __func__ << " backing device/file reports st_blksize "
153 << st.st_blksize << ", using bdev_block_size "
154 << block_size << " anyway" << dendl;
155 }
156
7c673cae
FG
157
158 {
11fdf7f2
TL
159 BlkDev blkdev_direct(fd_directs[WRITE_LIFE_NOT_SET]);
160 BlkDev blkdev_buffered(fd_buffereds[WRITE_LIFE_NOT_SET]);
161
162 if (S_ISBLK(st.st_mode)) {
163 int64_t s;
164 r = blkdev_direct.get_size(&s);
165 if (r < 0) {
166 goto out_fail;
167 }
168 size = s;
169 } else {
170 size = st.st_size;
171 }
172
7c673cae 173 char partition[PATH_MAX], devname[PATH_MAX];
11fdf7f2
TL
174 if ((r = blkdev_buffered.partition(partition, PATH_MAX)) ||
175 (r = blkdev_buffered.wholedisk(devname, PATH_MAX))) {
7c673cae 176 derr << "unable to get device name for " << path << ": "
11fdf7f2 177 << cpp_strerror(r) << dendl;
7c673cae
FG
178 rotational = true;
179 } else {
180 dout(20) << __func__ << " devname " << devname << dendl;
11fdf7f2
TL
181 rotational = blkdev_buffered.is_rotational();
182 support_discard = blkdev_buffered.support_discard();
183 this->devname = devname;
184 _detect_vdo();
7c673cae
FG
185 }
186 }
187
31f18b77
FG
188 r = _aio_start();
189 if (r < 0) {
190 goto out_fail;
191 }
11fdf7f2 192 _discard_start();
7c673cae
FG
193
194 // round size down to an even block
195 size &= ~(block_size - 1);
196
7c673cae
FG
197 dout(1) << __func__
198 << " size " << size
199 << " (0x" << std::hex << size << std::dec << ", "
1adf2230 200 << byte_u_t(size) << ")"
7c673cae 201 << " block_size " << block_size
1adf2230 202 << " (" << byte_u_t(block_size) << ")"
7c673cae 203 << " " << (rotational ? "rotational" : "non-rotational")
11fdf7f2 204 << " discard " << (support_discard ? "supported" : "not supported")
7c673cae
FG
205 << dendl;
206 return 0;
207
11fdf7f2
TL
208out_fail:
209 for (i = 0; i < WRITE_LIFE_MAX; i++) {
210 if (fd_directs[i] >= 0) {
211 VOID_TEMP_FAILURE_RETRY(::close(fd_directs[i]));
212 fd_directs[i] = -1;
213 } else {
214 break;
215 }
216 if (fd_buffereds[i] >= 0) {
217 VOID_TEMP_FAILURE_RETRY(::close(fd_buffereds[i]));
218 fd_buffereds[i] = -1;
219 } else {
220 break;
221 }
222 }
7c673cae
FG
223 return r;
224}
225
11fdf7f2
TL
226int KernelDevice::get_devices(std::set<std::string> *ls)
227{
228 if (devname.empty()) {
229 return 0;
230 }
231 get_raw_devices(devname, ls);
232 return 0;
233}
234
7c673cae
FG
235void KernelDevice::close()
236{
237 dout(1) << __func__ << dendl;
238 _aio_stop();
11fdf7f2 239 _discard_stop();
7c673cae 240
11fdf7f2
TL
241 if (vdo_fd >= 0) {
242 VOID_TEMP_FAILURE_RETRY(::close(vdo_fd));
243 vdo_fd = -1;
244 }
7c673cae 245
11fdf7f2
TL
246 for (int i = 0; i < WRITE_LIFE_MAX; i++) {
247 assert(fd_directs[i] >= 0);
248 VOID_TEMP_FAILURE_RETRY(::close(fd_directs[i]));
249 fd_directs[i] = -1;
7c673cae 250
11fdf7f2
TL
251 assert(fd_buffereds[i] >= 0);
252 VOID_TEMP_FAILURE_RETRY(::close(fd_buffereds[i]));
253 fd_buffereds[i] = -1;
254 }
7c673cae
FG
255 path.clear();
256}
257
11fdf7f2 258int KernelDevice::collect_metadata(const string& prefix, map<string,string> *pm) const
7c673cae 259{
11fdf7f2 260 (*pm)[prefix + "support_discard"] = stringify((int)(bool)support_discard);
7c673cae
FG
261 (*pm)[prefix + "rotational"] = stringify((int)(bool)rotational);
262 (*pm)[prefix + "size"] = stringify(get_size());
263 (*pm)[prefix + "block_size"] = stringify(get_block_size());
264 (*pm)[prefix + "driver"] = "KernelDevice";
265 if (rotational) {
266 (*pm)[prefix + "type"] = "hdd";
267 } else {
268 (*pm)[prefix + "type"] = "ssd";
269 }
11fdf7f2
TL
270 if (vdo_fd >= 0) {
271 (*pm)[prefix + "vdo"] = "true";
272 uint64_t total, avail;
273 get_vdo_utilization(vdo_fd, &total, &avail);
274 (*pm)[prefix + "vdo_physical_size"] = stringify(total);
275 }
7c673cae
FG
276
277 struct stat st;
11fdf7f2 278 int r = ::fstat(fd_buffereds[WRITE_LIFE_NOT_SET], &st);
7c673cae
FG
279 if (r < 0)
280 return -errno;
281 if (S_ISBLK(st.st_mode)) {
282 (*pm)[prefix + "access_mode"] = "blk";
11fdf7f2
TL
283
284 char buffer[1024] = {0};
285 BlkDev blkdev{fd_buffereds[WRITE_LIFE_NOT_SET]};
286 if (r = blkdev.partition(buffer, sizeof(buffer)); r) {
7c673cae 287 (*pm)[prefix + "partition_path"] = "unknown";
11fdf7f2
TL
288 } else {
289 (*pm)[prefix + "partition_path"] = buffer;
290 }
291 buffer[0] = '\0';
292 if (r = blkdev.partition(buffer, sizeof(buffer)); r) {
7c673cae 293 (*pm)[prefix + "dev_node"] = "unknown";
11fdf7f2
TL
294 } else {
295 (*pm)[prefix + "dev_node"] = buffer;
296 }
297 if (!r) {
298 return 0;
299 }
300 buffer[0] = '\0';
301 blkdev.model(buffer, sizeof(buffer));
302 (*pm)[prefix + "model"] = buffer;
303
304 buffer[0] = '\0';
305 blkdev.dev(buffer, sizeof(buffer));
306 (*pm)[prefix + "dev"] = buffer;
307
308 // nvme exposes a serial number
309 buffer[0] = '\0';
310 blkdev.serial(buffer, sizeof(buffer));
311 (*pm)[prefix + "serial"] = buffer;
312
313 if (blkdev.is_nvme())
314 (*pm)[prefix + "type"] = "nvme";
315
316 // numa
317 int node;
318 r = blkdev.get_numa_node(&node);
319 if (r >= 0) {
320 (*pm)[prefix + "numa_node"] = stringify(node);
7c673cae
FG
321 }
322 } else {
323 (*pm)[prefix + "access_mode"] = "file";
324 (*pm)[prefix + "path"] = path;
325 }
326 return 0;
327}
328
11fdf7f2
TL
329void KernelDevice::_detect_vdo()
330{
331 vdo_fd = get_vdo_stats_handle(devname.c_str(), &vdo_name);
332 if (vdo_fd >= 0) {
333 dout(1) << __func__ << " VDO volume " << vdo_name
334 << " maps to " << devname << dendl;
335 } else {
336 dout(20) << __func__ << " no VDO volume maps to " << devname << dendl;
337 }
338 return;
339}
340
341bool KernelDevice::get_thin_utilization(uint64_t *total, uint64_t *avail) const
342{
343 if (vdo_fd < 0) {
344 return false;
345 }
346 return get_vdo_utilization(vdo_fd, total, avail);
347}
348
349int KernelDevice::choose_fd(bool buffered, int write_hint) const
350{
351 assert(write_hint >= WRITE_LIFE_NOT_SET && write_hint < WRITE_LIFE_MAX);
352 if (!enable_wrt)
353 write_hint = WRITE_LIFE_NOT_SET;
354 return buffered ? fd_buffereds[write_hint] : fd_directs[write_hint];
355}
356
7c673cae
FG
357int KernelDevice::flush()
358{
31f18b77 359 // protect flush with a mutex. note that we are not really protecting
7c673cae
FG
360 // data here. instead, we're ensuring that if any flush() caller
361 // sees that io_since_flush is true, they block any racing callers
362 // until the flush is observed. that allows racing threads to be
363 // calling flush while still ensuring that *any* of them that got an
364 // aio completion notification will not return before that aio is
365 // stable on disk: whichever thread sees the flag first will block
366 // followers until the aio is stable.
11fdf7f2 367 std::lock_guard l(flush_mutex);
7c673cae
FG
368
369 bool expect = true;
370 if (!io_since_flush.compare_exchange_strong(expect, false)) {
371 dout(10) << __func__ << " no-op (no ios since last flush), flag is "
372 << (int)io_since_flush.load() << dendl;
373 return 0;
374 }
375
376 dout(10) << __func__ << " start" << dendl;
377 if (cct->_conf->bdev_inject_crash) {
378 ++injecting_crash;
379 // sleep for a moment to give other threads a chance to submit or
380 // wait on io that races with a flush.
381 derr << __func__ << " injecting crash. first we sleep..." << dendl;
382 sleep(cct->_conf->bdev_inject_crash_flush_delay);
383 derr << __func__ << " and now we die" << dendl;
384 cct->_log->flush();
385 _exit(1);
386 }
387 utime_t start = ceph_clock_now();
11fdf7f2 388 int r = ::fdatasync(fd_directs[WRITE_LIFE_NOT_SET]);
7c673cae
FG
389 utime_t end = ceph_clock_now();
390 utime_t dur = end - start;
391 if (r < 0) {
392 r = -errno;
393 derr << __func__ << " fdatasync got: " << cpp_strerror(r) << dendl;
394 ceph_abort();
395 }
396 dout(5) << __func__ << " in " << dur << dendl;;
397 return r;
398}
399
400int KernelDevice::_aio_start()
401{
402 if (aio) {
403 dout(10) << __func__ << dendl;
404 int r = aio_queue.init();
405 if (r < 0) {
31f18b77
FG
406 if (r == -EAGAIN) {
407 derr << __func__ << " io_setup(2) failed with EAGAIN; "
408 << "try increasing /proc/sys/fs/aio-max-nr" << dendl;
409 } else {
410 derr << __func__ << " io_setup(2) failed: " << cpp_strerror(r) << dendl;
411 }
7c673cae
FG
412 return r;
413 }
414 aio_thread.create("bstore_aio");
415 }
416 return 0;
417}
418
419void KernelDevice::_aio_stop()
420{
421 if (aio) {
422 dout(10) << __func__ << dendl;
423 aio_stop = true;
424 aio_thread.join();
425 aio_stop = false;
426 aio_queue.shutdown();
427 }
428}
429
11fdf7f2
TL
430int KernelDevice::_discard_start()
431{
432 discard_thread.create("bstore_discard");
433 return 0;
434}
435
436void KernelDevice::_discard_stop()
437{
438 dout(10) << __func__ << dendl;
439 {
440 std::unique_lock l(discard_lock);
441 while (!discard_started) {
442 discard_cond.wait(l);
443 }
444 discard_stop = true;
445 discard_cond.notify_all();
446 }
447 discard_thread.join();
448 {
449 std::lock_guard l(discard_lock);
450 discard_stop = false;
451 }
452 dout(10) << __func__ << " stopped" << dendl;
453}
454
455void KernelDevice::discard_drain()
456{
457 dout(10) << __func__ << dendl;
458 std::unique_lock l(discard_lock);
459 while (!discard_queued.empty() || discard_running) {
460 discard_cond.wait(l);
461 }
462}
463
28e407b8
AA
// Return true when r (a negative errno value) is one of the error codes
// listed below, i.e. an I/O error this file treats as "expected".
static bool is_expected_ioerr(const int r)
{
  // https://lxr.missinglinkelectronics.com/linux+v4.15/block/blk-core.c#L135
  static const int expected_codes[] = {
    -EOPNOTSUPP, -ETIMEDOUT, -ENOSPC,
    -ENOLINK, -EREMOTEIO, -EAGAIN, -EIO,
    -ENODATA, -EILSEQ, -ENOMEM,
#if defined(__linux__)
    -EREMCHG, -EBADE,
#elif defined(__FreeBSD__)
    - BSM_ERRNO_EREMCHG, -BSM_ERRNO_EBADE,
#endif
  };
  for (const int code : expected_codes) {
    if (r == code) {
      return true;
    }
  }
  return false;
}
477
7c673cae
FG
478void KernelDevice::_aio_thread()
479{
480 dout(10) << __func__ << " start" << dendl;
481 int inject_crash_count = 0;
482 while (!aio_stop) {
483 dout(40) << __func__ << " polling" << dendl;
224ce89b 484 int max = cct->_conf->bdev_aio_reap_max;
7c673cae
FG
485 aio_t *aio[max];
486 int r = aio_queue.get_next_completed(cct->_conf->bdev_aio_poll_ms,
487 aio, max);
488 if (r < 0) {
489 derr << __func__ << " got " << cpp_strerror(r) << dendl;
11fdf7f2 490 ceph_abort_msg("got unexpected error from io_getevents");
7c673cae
FG
491 }
492 if (r > 0) {
493 dout(30) << __func__ << " got " << r << " completed aios" << dendl;
494 for (int i = 0; i < r; ++i) {
495 IOContext *ioc = static_cast<IOContext*>(aio[i]->priv);
496 _aio_log_finish(ioc, aio[i]->offset, aio[i]->length);
497 if (aio[i]->queue_item.is_linked()) {
11fdf7f2 498 std::lock_guard l(debug_queue_lock);
7c673cae
FG
499 debug_aio_unlink(*aio[i]);
500 }
501
502 // set flag indicating new ios have completed. we do this *before*
503 // any completion or notifications so that any user flush() that
504 // follows the observed io completion will include this io. Note
505 // that an earlier, racing flush() could observe and clear this
506 // flag, but that also ensures that the IO will be stable before the
507 // later flush() occurs.
508 io_since_flush.store(true);
509
94b18763 510 long r = aio[i]->get_return_value();
b32b8144 511 if (r < 0) {
28e407b8
AA
512 derr << __func__ << " got r=" << r << " (" << cpp_strerror(r) << ")"
513 << dendl;
514 if (ioc->allow_eio && is_expected_ioerr(r)) {
515 derr << __func__ << " translating the error to EIO for upper layer"
516 << dendl;
517 ioc->set_return_value(-EIO);
b32b8144 518 } else {
11fdf7f2
TL
519 if (is_expected_ioerr(r)) {
520 note_io_error_event(
521 devname.c_str(),
522 path.c_str(),
523 r,
81eedcae
TL
524#if defined(HAVE_POSIXAIO)
525 aio[i]->aio.aiocb.aio_lio_opcode,
526#else
527 aio[i]->iocb.aio_lio_opcode,
528#endif
11fdf7f2
TL
529 aio[i]->offset,
530 aio[i]->length);
531 ceph_abort_msg(
532 "Unexpected IO error. "
533 "This may suggest a hardware issue. "
534 "Please check your kernel log!");
535 }
536 ceph_abort_msg(
537 "Unexpected IO error. "
538 "This may suggest HW issue. Please check your dmesg!");
b32b8144
FG
539 }
540 } else if (aio[i]->length != (uint64_t)r) {
541 derr << "aio to " << aio[i]->offset << "~" << aio[i]->length
542 << " but returned: " << r << dendl;
11fdf7f2 543 ceph_abort_msg("unexpected aio return value: does not match length");
b32b8144
FG
544 }
545
546 dout(10) << __func__ << " finished aio " << aio[i] << " r " << r
547 << " ioc " << ioc
548 << " with " << (ioc->num_running.load() - 1)
549 << " aios left" << dendl;
7c673cae
FG
550
551 // NOTE: once num_running and we either call the callback or
552 // call aio_wake we cannot touch ioc or aio[] as the caller
553 // may free it.
554 if (ioc->priv) {
555 if (--ioc->num_running == 0) {
556 aio_callback(aio_callback_priv, ioc->priv);
557 }
558 } else {
31f18b77 559 ioc->try_aio_wake();
7c673cae
FG
560 }
561 }
562 }
563 if (cct->_conf->bdev_debug_aio) {
564 utime_t now = ceph_clock_now();
11fdf7f2 565 std::lock_guard l(debug_queue_lock);
7c673cae
FG
566 if (debug_oldest) {
567 if (debug_stall_since == utime_t()) {
568 debug_stall_since = now;
569 } else {
11fdf7f2
TL
570 if (cct->_conf->bdev_debug_aio_suicide_timeout) {
571 utime_t cutoff = now;
572 cutoff -= cct->_conf->bdev_debug_aio_suicide_timeout;
573 if (debug_stall_since < cutoff) {
574 derr << __func__ << " stalled aio " << debug_oldest
575 << " since " << debug_stall_since << ", timeout is "
576 << cct->_conf->bdev_debug_aio_suicide_timeout
577 << "s, suicide" << dendl;
578 ceph_abort_msg("stalled aio... buggy kernel or bad device?");
579 }
7c673cae
FG
580 }
581 }
582 }
583 }
584 reap_ioc();
585 if (cct->_conf->bdev_inject_crash) {
586 ++inject_crash_count;
587 if (inject_crash_count * cct->_conf->bdev_aio_poll_ms / 1000 >
588 cct->_conf->bdev_inject_crash + cct->_conf->bdev_inject_crash_flush_delay) {
589 derr << __func__ << " bdev_inject_crash trigger from aio thread"
590 << dendl;
591 cct->_log->flush();
592 _exit(1);
593 }
594 }
595 }
596 reap_ioc();
597 dout(10) << __func__ << " end" << dendl;
598}
599
11fdf7f2
TL
600void KernelDevice::_discard_thread()
601{
602 std::unique_lock l(discard_lock);
603 ceph_assert(!discard_started);
604 discard_started = true;
605 discard_cond.notify_all();
606 while (true) {
607 ceph_assert(discard_finishing.empty());
608 if (discard_queued.empty()) {
609 if (discard_stop)
610 break;
611 dout(20) << __func__ << " sleep" << dendl;
612 discard_cond.notify_all(); // for the thread trying to drain...
613 discard_cond.wait(l);
614 dout(20) << __func__ << " wake" << dendl;
615 } else {
616 discard_finishing.swap(discard_queued);
617 discard_running = true;
618 l.unlock();
619 dout(20) << __func__ << " finishing" << dendl;
620 for (auto p = discard_finishing.begin();p != discard_finishing.end(); ++p) {
621 discard(p.get_start(), p.get_len());
622 }
623
624 discard_callback(discard_callback_priv, static_cast<void*>(&discard_finishing));
625 discard_finishing.clear();
626 l.lock();
627 discard_running = false;
628 }
629 }
630 dout(10) << __func__ << " finish" << dendl;
631 discard_started = false;
632}
633
634int KernelDevice::queue_discard(interval_set<uint64_t> &to_release)
635{
636 if (!support_discard)
637 return -1;
638
639 if (to_release.empty())
640 return 0;
641
642 std::lock_guard l(discard_lock);
643 discard_queued.insert(to_release);
644 discard_cond.notify_all();
645 return 0;
646}
647
7c673cae
FG
648void KernelDevice::_aio_log_start(
649 IOContext *ioc,
650 uint64_t offset,
651 uint64_t length)
652{
653 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
654 << std::dec << dendl;
655 if (cct->_conf->bdev_debug_inflight_ios) {
11fdf7f2 656 std::lock_guard l(debug_lock);
7c673cae
FG
657 if (debug_inflight.intersects(offset, length)) {
658 derr << __func__ << " inflight overlap of 0x"
659 << std::hex
660 << offset << "~" << length << std::dec
661 << " with " << debug_inflight << dendl;
662 ceph_abort();
663 }
664 debug_inflight.insert(offset, length);
665 }
666}
667
668void KernelDevice::debug_aio_link(aio_t& aio)
669{
670 if (debug_queue.empty()) {
671 debug_oldest = &aio;
672 }
673 debug_queue.push_back(aio);
674}
675
676void KernelDevice::debug_aio_unlink(aio_t& aio)
677{
678 if (aio.queue_item.is_linked()) {
679 debug_queue.erase(debug_queue.iterator_to(aio));
680 if (debug_oldest == &aio) {
11fdf7f2
TL
681 auto age = cct->_conf->bdev_debug_aio_log_age;
682 if (age && debug_stall_since != utime_t()) {
683 utime_t cutoff = ceph_clock_now();
684 cutoff -= age;
685 if (debug_stall_since < cutoff) {
686 derr << __func__ << " stalled aio " << debug_oldest
687 << " since " << debug_stall_since << ", timeout is "
688 << age
689 << "s" << dendl;
690 }
691 }
692
7c673cae
FG
693 if (debug_queue.empty()) {
694 debug_oldest = nullptr;
695 } else {
696 debug_oldest = &debug_queue.front();
697 }
698 debug_stall_since = utime_t();
699 }
700 }
701}
702
703void KernelDevice::_aio_log_finish(
704 IOContext *ioc,
705 uint64_t offset,
706 uint64_t length)
707{
708 dout(20) << __func__ << " " << aio << " 0x"
709 << std::hex << offset << "~" << length << std::dec << dendl;
710 if (cct->_conf->bdev_debug_inflight_ios) {
11fdf7f2 711 std::lock_guard l(debug_lock);
7c673cae
FG
712 debug_inflight.erase(offset, length);
713 }
714}
715
716void KernelDevice::aio_submit(IOContext *ioc)
717{
718 dout(20) << __func__ << " ioc " << ioc
719 << " pending " << ioc->num_pending.load()
720 << " running " << ioc->num_running.load()
721 << dendl;
224ce89b 722
7c673cae
FG
723 if (ioc->num_pending.load() == 0) {
724 return;
725 }
224ce89b 726
7c673cae
FG
727 // move these aside, and get our end iterator position now, as the
728 // aios might complete as soon as they are submitted and queue more
729 // wal aio's.
730 list<aio_t>::iterator e = ioc->running_aios.begin();
731 ioc->running_aios.splice(e, ioc->pending_aios);
7c673cae
FG
732
733 int pending = ioc->num_pending.load();
734 ioc->num_running += pending;
735 ioc->num_pending -= pending;
11fdf7f2
TL
736 ceph_assert(ioc->num_pending.load() == 0); // we should be only thread doing this
737 ceph_assert(ioc->pending_aios.size() == 0);
738
224ce89b
WB
739 if (cct->_conf->bdev_debug_aio) {
740 list<aio_t>::iterator p = ioc->running_aios.begin();
741 while (p != e) {
11fdf7f2
TL
742 dout(30) << __func__ << " " << *p << dendl;
743 std::lock_guard l(debug_queue_lock);
224ce89b 744 debug_aio_link(*p++);
7c673cae
FG
745 }
746 }
224ce89b
WB
747
748 void *priv = static_cast<void*>(ioc);
749 int r, retries = 0;
11fdf7f2
TL
750 r = aio_queue.submit_batch(ioc->running_aios.begin(), e,
751 pending, priv, &retries);
752
224ce89b
WB
753 if (retries)
754 derr << __func__ << " retries " << retries << dendl;
755 if (r < 0) {
756 derr << " aio submit got " << cpp_strerror(r) << dendl;
11fdf7f2 757 ceph_assert(r == 0);
224ce89b 758 }
7c673cae
FG
759}
760
11fdf7f2 761int KernelDevice::_sync_write(uint64_t off, bufferlist &bl, bool buffered, int write_hint)
7c673cae
FG
762{
763 uint64_t len = bl.length();
764 dout(5) << __func__ << " 0x" << std::hex << off << "~" << len
11fdf7f2 765 << std::dec << (buffered ? " (buffered)" : " (direct)") << dendl;
7c673cae
FG
766 if (cct->_conf->bdev_inject_crash &&
767 rand() % cct->_conf->bdev_inject_crash == 0) {
768 derr << __func__ << " bdev_inject_crash: dropping io 0x" << std::hex
769 << off << "~" << len << std::dec << dendl;
770 ++injecting_crash;
771 return 0;
772 }
773 vector<iovec> iov;
774 bl.prepare_iov(&iov);
11fdf7f2 775 int r = ::pwritev(choose_fd(buffered, write_hint),
7c673cae
FG
776 &iov[0], iov.size(), off);
777
778 if (r < 0) {
779 r = -errno;
780 derr << __func__ << " pwritev error: " << cpp_strerror(r) << dendl;
781 return r;
782 }
11fdf7f2 783#ifdef HAVE_SYNC_FILE_RANGE
7c673cae 784 if (buffered) {
494da23a
TL
785 // initiate IO and wait till it completes
786 r = ::sync_file_range(fd_buffereds[WRITE_LIFE_NOT_SET], off, len, SYNC_FILE_RANGE_WRITE|SYNC_FILE_RANGE_WAIT_AFTER|SYNC_FILE_RANGE_WAIT_BEFORE);
7c673cae
FG
787 if (r < 0) {
788 r = -errno;
789 derr << __func__ << " sync_file_range error: " << cpp_strerror(r) << dendl;
790 return r;
791 }
792 }
11fdf7f2 793#endif
31f18b77
FG
794
795 io_since_flush.store(true);
796
7c673cae
FG
797 return 0;
798}
799
800int KernelDevice::write(
801 uint64_t off,
802 bufferlist &bl,
11fdf7f2
TL
803 bool buffered,
804 int write_hint)
7c673cae
FG
805{
806 uint64_t len = bl.length();
807 dout(20) << __func__ << " 0x" << std::hex << off << "~" << len << std::dec
808 << (buffered ? " (buffered)" : " (direct)")
809 << dendl;
11fdf7f2 810 ceph_assert(is_valid_io(off, len));
7c673cae
FG
811
812 if ((!buffered || bl.get_num_buffers() >= IOV_MAX) &&
b32b8144 813 bl.rebuild_aligned_size_and_memory(block_size, block_size, IOV_MAX)) {
7c673cae
FG
814 dout(20) << __func__ << " rebuilding buffer to be aligned" << dendl;
815 }
816 dout(40) << "data: ";
817 bl.hexdump(*_dout);
818 *_dout << dendl;
819
11fdf7f2 820 return _sync_write(off, bl, buffered, write_hint);
7c673cae
FG
821}
822
823int KernelDevice::aio_write(
824 uint64_t off,
825 bufferlist &bl,
826 IOContext *ioc,
11fdf7f2
TL
827 bool buffered,
828 int write_hint)
7c673cae
FG
829{
830 uint64_t len = bl.length();
831 dout(20) << __func__ << " 0x" << std::hex << off << "~" << len << std::dec
832 << (buffered ? " (buffered)" : " (direct)")
833 << dendl;
11fdf7f2 834 ceph_assert(is_valid_io(off, len));
7c673cae
FG
835
836 if ((!buffered || bl.get_num_buffers() >= IOV_MAX) &&
b32b8144 837 bl.rebuild_aligned_size_and_memory(block_size, block_size, IOV_MAX)) {
7c673cae
FG
838 dout(20) << __func__ << " rebuilding buffer to be aligned" << dendl;
839 }
840 dout(40) << "data: ";
841 bl.hexdump(*_dout);
842 *_dout << dendl;
843
844 _aio_log_start(ioc, off, len);
845
846#ifdef HAVE_LIBAIO
847 if (aio && dio && !buffered) {
7c673cae
FG
848 if (cct->_conf->bdev_inject_crash &&
849 rand() % cct->_conf->bdev_inject_crash == 0) {
850 derr << __func__ << " bdev_inject_crash: dropping io 0x" << std::hex
851 << off << "~" << len << std::dec
852 << dendl;
853 // generate a real io so that aio_wait behaves properly, but make it
854 // a read instead of write, and toss the result.
494da23a
TL
855 ioc->pending_aios.push_back(aio_t(ioc, choose_fd(false, write_hint)));
856 ++ioc->num_pending;
857 auto& aio = ioc->pending_aios.back();
7c673cae
FG
858 aio.pread(off, len);
859 ++injecting_crash;
860 } else {
494da23a
TL
861 if (bl.length() <= RW_IO_MAX) {
862 // fast path (non-huge write)
863 ioc->pending_aios.push_back(aio_t(ioc, choose_fd(false, write_hint)));
864 ++ioc->num_pending;
865 auto& aio = ioc->pending_aios.back();
866 bl.prepare_iov(&aio.iov);
867 aio.bl.claim_append(bl);
868 aio.pwritev(off, len);
869 dout(30) << aio << dendl;
870 dout(5) << __func__ << " 0x" << std::hex << off << "~" << len
871 << std::dec << " aio " << &aio << dendl;
872 } else {
873 // write in RW_IO_MAX-sized chunks
874 uint64_t prev_len = 0;
875 while (prev_len < bl.length()) {
876 bufferlist tmp;
877 if (prev_len + RW_IO_MAX < bl.length()) {
878 tmp.substr_of(bl, prev_len, RW_IO_MAX);
879 } else {
880 tmp.substr_of(bl, prev_len, bl.length() - prev_len);
881 }
882 auto len = tmp.length();
883 ioc->pending_aios.push_back(aio_t(ioc, choose_fd(false, write_hint)));
884 ++ioc->num_pending;
885 auto& aio = ioc->pending_aios.back();
886 tmp.prepare_iov(&aio.iov);
887 aio.bl.claim_append(tmp);
888 aio.pwritev(off + prev_len, len);
889 dout(30) << aio << dendl;
890 dout(5) << __func__ << " 0x" << std::hex << off + prev_len
891 << "~" << len
892 << std::dec << " aio " << &aio << " (piece)" << dendl;
893 prev_len += len;
894 }
895 }
7c673cae 896 }
7c673cae
FG
897 } else
898#endif
899 {
11fdf7f2 900 int r = _sync_write(off, bl, buffered, write_hint);
7c673cae
FG
901 _aio_log_finish(ioc, off, len);
902 if (r < 0)
903 return r;
904 }
905 return 0;
906}
907
11fdf7f2
TL
908int KernelDevice::discard(uint64_t offset, uint64_t len)
909{
910 int r = 0;
911 if (support_discard) {
912 dout(10) << __func__
913 << " 0x" << std::hex << offset << "~" << len << std::dec
914 << dendl;
915
916 r = BlkDev{fd_directs[WRITE_LIFE_NOT_SET]}.discard((int64_t)offset, (int64_t)len);
917 }
918 return r;
919}
920
7c673cae
FG
921int KernelDevice::read(uint64_t off, uint64_t len, bufferlist *pbl,
922 IOContext *ioc,
923 bool buffered)
924{
925 dout(5) << __func__ << " 0x" << std::hex << off << "~" << len << std::dec
926 << (buffered ? " (buffered)" : " (direct)")
927 << dendl;
11fdf7f2 928 ceph_assert(is_valid_io(off, len));
7c673cae
FG
929
930 _aio_log_start(ioc, off, len);
931
11fdf7f2
TL
932 auto start1 = mono_clock::now();
933
934 auto p = buffer::ptr_node::create(buffer::create_small_page_aligned(len));
935 int r = ::pread(buffered ? fd_buffereds[WRITE_LIFE_NOT_SET] : fd_directs[WRITE_LIFE_NOT_SET],
936 p->c_str(), len, off);
937 auto age = cct->_conf->bdev_debug_aio_log_age;
938 if (mono_clock::now() - start1 >= make_timespan(age)) {
939 derr << __func__ << " stalled read "
940 << " 0x" << std::hex << off << "~" << len << std::dec
941 << (buffered ? " (buffered)" : " (direct)")
942 << " since " << start1 << ", timeout is "
943 << age
944 << "s" << dendl;
945 }
946
7c673cae 947 if (r < 0) {
a8e16298
TL
948 if (ioc->allow_eio && is_expected_ioerr(r)) {
949 r = -EIO;
950 } else {
951 r = -errno;
952 }
7c673cae
FG
953 goto out;
954 }
11fdf7f2 955 ceph_assert((uint64_t)r == len);
7c673cae
FG
956 pbl->push_back(std::move(p));
957
958 dout(40) << "data: ";
959 pbl->hexdump(*_dout);
960 *_dout << dendl;
961
962 out:
963 _aio_log_finish(ioc, off, len);
964 return r < 0 ? r : 0;
965}
966
967int KernelDevice::aio_read(
968 uint64_t off,
969 uint64_t len,
970 bufferlist *pbl,
971 IOContext *ioc)
972{
973 dout(5) << __func__ << " 0x" << std::hex << off << "~" << len << std::dec
974 << dendl;
975
976 int r = 0;
977#ifdef HAVE_LIBAIO
978 if (aio && dio) {
11fdf7f2 979 ceph_assert(is_valid_io(off, len));
7c673cae 980 _aio_log_start(ioc, off, len);
11fdf7f2 981 ioc->pending_aios.push_back(aio_t(ioc, fd_directs[WRITE_LIFE_NOT_SET]));
7c673cae
FG
982 ++ioc->num_pending;
983 aio_t& aio = ioc->pending_aios.back();
984 aio.pread(off, len);
11fdf7f2 985 dout(30) << aio << dendl;
7c673cae
FG
986 pbl->append(aio.bl);
987 dout(5) << __func__ << " 0x" << std::hex << off << "~" << len
988 << std::dec << " aio " << &aio << dendl;
989 } else
990#endif
991 {
992 r = read(off, len, pbl, ioc, false);
993 }
994
995 return r;
996}
997
998int KernelDevice::direct_read_unaligned(uint64_t off, uint64_t len, char *buf)
999{
1000 uint64_t aligned_off = align_down(off, block_size);
1001 uint64_t aligned_len = align_up(off+len, block_size) - aligned_off;
11fdf7f2 1002 bufferptr p = buffer::create_small_page_aligned(aligned_len);
7c673cae
FG
1003 int r = 0;
1004
11fdf7f2
TL
1005 auto start1 = mono_clock::now();
1006 r = ::pread(fd_directs[WRITE_LIFE_NOT_SET], p.c_str(), aligned_len, aligned_off);
1007 auto age = cct->_conf->bdev_debug_aio_log_age;
1008 if (mono_clock::now() - start1 >= make_timespan(age)) {
1009 derr << __func__ << " stalled read "
1010 << " 0x" << std::hex << off << "~" << len << std::dec
1011 << " since " << start1 << ", timeout is "
1012 << age
1013 << "s" << dendl;
1014 }
1015
7c673cae
FG
1016 if (r < 0) {
1017 r = -errno;
11fdf7f2 1018 derr << __func__ << " 0x" << std::hex << off << "~" << len << std::dec
7c673cae
FG
1019 << " error: " << cpp_strerror(r) << dendl;
1020 goto out;
1021 }
11fdf7f2 1022 ceph_assert((uint64_t)r == aligned_len);
7c673cae
FG
1023 memcpy(buf, p.c_str() + (off - aligned_off), len);
1024
1025 dout(40) << __func__ << " data: ";
1026 bufferlist bl;
1027 bl.append(buf, len);
1028 bl.hexdump(*_dout);
1029 *_dout << dendl;
1030
1031 out:
1032 return r < 0 ? r : 0;
1033}
1034
1035int KernelDevice::read_random(uint64_t off, uint64_t len, char *buf,
1036 bool buffered)
1037{
1038 dout(5) << __func__ << " 0x" << std::hex << off << "~" << len << std::dec
494da23a 1039 << "buffered " << buffered
7c673cae 1040 << dendl;
11fdf7f2
TL
1041 ceph_assert(len > 0);
1042 ceph_assert(off < size);
1043 ceph_assert(off + len <= size);
7c673cae 1044 int r = 0;
11fdf7f2 1045 auto age = cct->_conf->bdev_debug_aio_log_age;
7c673cae
FG
1046
1047 //if it's direct io and unaligned, we have to use a internal buffer
1048 if (!buffered && ((off % block_size != 0)
1049 || (len % block_size != 0)
1050 || (uintptr_t(buf) % CEPH_PAGE_SIZE != 0)))
1051 return direct_read_unaligned(off, len, buf);
1052
11fdf7f2 1053 auto start1 = mono_clock::now();
7c673cae
FG
1054 if (buffered) {
1055 //buffered read
11fdf7f2 1056 auto off0 = off;
7c673cae
FG
1057 char *t = buf;
1058 uint64_t left = len;
1059 while (left > 0) {
11fdf7f2 1060 r = ::pread(fd_buffereds[WRITE_LIFE_NOT_SET], t, left, off);
7c673cae
FG
1061 if (r < 0) {
1062 r = -errno;
11fdf7f2 1063 derr << __func__ << " 0x" << std::hex << off << "~" << left
7c673cae
FG
1064 << std::dec << " error: " << cpp_strerror(r) << dendl;
1065 goto out;
1066 }
1067 off += r;
1068 t += r;
1069 left -= r;
1070 }
11fdf7f2
TL
1071 if (mono_clock::now() - start1 >= make_timespan(age)) {
1072 derr << __func__ << " stalled read "
1073 << " 0x" << std::hex << off0 << "~" << len << std::dec
1074 << " (buffered) since " << start1 << ", timeout is "
1075 << age
1076 << "s" << dendl;
1077 }
7c673cae
FG
1078 } else {
1079 //direct and aligned read
11fdf7f2
TL
1080 r = ::pread(fd_directs[WRITE_LIFE_NOT_SET], buf, len, off);
1081 if (mono_clock::now() - start1 >= make_timespan(age)) {
1082 derr << __func__ << " stalled read "
1083 << " 0x" << std::hex << off << "~" << len << std::dec
1084 << " (direct) since " << start1 << ", timeout is "
1085 << age
1086 << "s" << dendl;
1087 }
7c673cae
FG
1088 if (r < 0) {
1089 r = -errno;
11fdf7f2
TL
1090 derr << __func__ << " direct_aligned_read" << " 0x" << std::hex
1091 << off << "~" << left << std::dec << " error: " << cpp_strerror(r)
7c673cae
FG
1092 << dendl;
1093 goto out;
1094 }
11fdf7f2 1095 ceph_assert((uint64_t)r == len);
7c673cae
FG
1096 }
1097
1098 dout(40) << __func__ << " data: ";
1099 bufferlist bl;
1100 bl.append(buf, len);
1101 bl.hexdump(*_dout);
1102 *_dout << dendl;
1103
1104 out:
1105 return r < 0 ? r : 0;
1106}
1107
1108int KernelDevice::invalidate_cache(uint64_t off, uint64_t len)
1109{
1110 dout(5) << __func__ << " 0x" << std::hex << off << "~" << len << std::dec
1111 << dendl;
11fdf7f2
TL
1112 ceph_assert(off % block_size == 0);
1113 ceph_assert(len % block_size == 0);
1114 int r = posix_fadvise(fd_buffereds[WRITE_LIFE_NOT_SET], off, len, POSIX_FADV_DONTNEED);
7c673cae
FG
1115 if (r) {
1116 r = -r;
1117 derr << __func__ << " 0x" << std::hex << off << "~" << len << std::dec
1118 << " error: " << cpp_strerror(r) << dendl;
1119 }
1120 return r;
1121}