// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2014 Red Hat
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */

#include <limits>
#include <unistd.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <sys/file.h>

#include "KernelDevice.h"
#include "include/intarith.h"
#include "include/types.h"
#include "include/compat.h"
#include "include/stringify.h"
#include "common/blkdev.h"
#include "common/errno.h"
#if defined(__FreeBSD__)
#include "bsm/audit_errno.h"
#endif
#include "common/debug.h"
#include "common/numa.h"

#include "global/global_context.h"
#include "io_uring.h"

#define dout_context cct
#define dout_subsys ceph_subsys_bdev
#undef dout_prefix
#define dout_prefix *_dout << "bdev(" << this << " " << path << ") "

using std::list;
using std::map;
using std::string;
using std::vector;

using ceph::bufferlist;
using ceph::bufferptr;
using ceph::make_timespan;
using ceph::mono_clock;
using ceph::operator <<;

KernelDevice::KernelDevice(CephContext* cct, aio_callback_t cb, void *cbpriv, aio_callback_t d_cb, void *d_cbpriv)
  : BlockDevice(cct, cb, cbpriv),
    aio(false), dio(false),
    discard_callback(d_cb),
    discard_callback_priv(d_cbpriv),
    aio_stop(false),
    discard_started(false),
    discard_stop(false),
    aio_thread(this),
    discard_thread(this),
    injecting_crash(0)
{
  fd_directs.resize(WRITE_LIFE_MAX, -1);
  fd_buffereds.resize(WRITE_LIFE_MAX, -1);

  bool use_ioring = cct->_conf.get_val<bool>("bdev_ioring");
  unsigned int iodepth = cct->_conf->bdev_aio_max_queue_depth;

  if (use_ioring && ioring_queue_t::supported()) {
    bool use_ioring_hipri = cct->_conf.get_val<bool>("bdev_ioring_hipri");
    bool use_ioring_sqthread_poll = cct->_conf.get_val<bool>("bdev_ioring_sqthread_poll");
    io_queue = std::make_unique<ioring_queue_t>(iodepth, use_ioring_hipri, use_ioring_sqthread_poll);
  } else {
    static bool once;
    if (use_ioring && !once) {
      derr << "WARNING: io_uring API is not supported! Fallback to libaio!"
           << dendl;
      once = true;
    }
    io_queue = std::make_unique<aio_queue_t>(iodepth);
  }
}

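// Grab an exclusive advisory lock on the first (WRITE_LIFE_NOT_SET) direct fd.
// An open file description lock (F_OFD_SETLK) is preferred, since it stays
// attached to the fd rather than the process; if the kernel rejects that with
// EINVAL we fall back to flock(2).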
int KernelDevice::_lock()
{
  dout(10) << __func__ << " " << fd_directs[WRITE_LIFE_NOT_SET] << dendl;
  // When the block device changes, systemd-udevd briefly opens it, reads
  // some information, and closes it again; that can race with us and make
  // the lock attempt below fail, so retry a few times.
  int fd = fd_directs[WRITE_LIFE_NOT_SET];
  uint64_t nr_tries = 0;
  for (;;) {
    struct flock fl = { F_WRLCK,
                        SEEK_SET };
    int r = ::fcntl(fd, F_OFD_SETLK, &fl);
    if (r < 0) {
      if (errno == EINVAL) {
        r = ::flock(fd, LOCK_EX | LOCK_NB);
      }
    }
    if (r == 0) {
      return 0;
    }
    if (errno != EAGAIN) {
      return -errno;
    }
    dout(1) << __func__ << " flock busy on " << path << dendl;
    if (const uint64_t max_retry =
          cct->_conf.get_val<uint64_t>("bdev_flock_retry");
        max_retry > 0 && nr_tries++ == max_retry) {
      return -EAGAIN;
    }
    double retry_interval =
      cct->_conf.get_val<double>("bdev_flock_retry_interval");
    std::this_thread::sleep_for(ceph::make_timespan(retry_interval));
  }
}

int KernelDevice::open(const string& p)
{
  path = p;
  int r = 0, i = 0;
  dout(1) << __func__ << " path " << path << dendl;

  for (i = 0; i < WRITE_LIFE_MAX; i++) {
    int fd = ::open(path.c_str(), O_RDWR | O_DIRECT);
    if (fd < 0) {
      r = -errno;
      break;
    }
    fd_directs[i] = fd;

    fd = ::open(path.c_str(), O_RDWR | O_CLOEXEC);
    if (fd < 0) {
      r = -errno;
      break;
    }
    fd_buffereds[i] = fd;
  }

  if (i != WRITE_LIFE_MAX) {
    derr << __func__ << " open got: " << cpp_strerror(r) << dendl;
    goto out_fail;
  }

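  // A direct and a buffered fd were opened above for every write-lifetime
  // hint so that each WRITE_LIFE_* stream can use its own fd (see
  // choose_fd()). Attach the matching hint to each fd where the kernel
  // supports it; otherwise fall back to the WRITE_LIFE_NOT_SET fds only.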
#if defined(F_SET_FILE_RW_HINT)
  for (i = WRITE_LIFE_NONE; i < WRITE_LIFE_MAX; i++) {
    if (fcntl(fd_directs[i], F_SET_FILE_RW_HINT, &i) < 0) {
      r = -errno;
      break;
    }
    if (fcntl(fd_buffereds[i], F_SET_FILE_RW_HINT, &i) < 0) {
      r = -errno;
      break;
    }
  }
  if (i != WRITE_LIFE_MAX) {
    enable_wrt = false;
    dout(0) << "ioctl(F_SET_FILE_RW_HINT) on " << path << " failed: " << cpp_strerror(r) << dendl;
  }
#endif

  dio = true;
  aio = cct->_conf->bdev_aio;
  if (!aio) {
    ceph_abort_msg("non-aio not supported");
  }

  // disable readahead as it will wreak havoc on our mix of
  // directio/aio and buffered io.
  r = posix_fadvise(fd_buffereds[WRITE_LIFE_NOT_SET], 0, 0, POSIX_FADV_RANDOM);
  if (r) {
    r = -r;
    derr << __func__ << " posix_fadvise got: " << cpp_strerror(r) << dendl;
    goto out_fail;
  }

  if (lock_exclusive) {
    r = _lock();
    if (r < 0) {
      derr << __func__ << " failed to lock " << path << ": " << cpp_strerror(r)
           << dendl;
      goto out_fail;
    }
  }

  struct stat st;
  r = ::fstat(fd_directs[WRITE_LIFE_NOT_SET], &st);
  if (r < 0) {
    r = -errno;
    derr << __func__ << " fstat got " << cpp_strerror(r) << dendl;
    goto out_fail;
  }

  // Operate as though the block size is 4 KB. The backing file
  // blksize doesn't strictly matter except that some file systems may
  // require a read/modify/write if we write something smaller than
  // it.
  block_size = cct->_conf->bdev_block_size;
  if (block_size != (unsigned)st.st_blksize) {
    dout(1) << __func__ << " backing device/file reports st_blksize "
            << st.st_blksize << ", using bdev_block_size "
            << block_size << " anyway" << dendl;
  }


  {
    BlkDev blkdev_direct(fd_directs[WRITE_LIFE_NOT_SET]);
    BlkDev blkdev_buffered(fd_buffereds[WRITE_LIFE_NOT_SET]);

    if (S_ISBLK(st.st_mode)) {
      int64_t s;
      r = blkdev_direct.get_size(&s);
      if (r < 0) {
        goto out_fail;
      }
      size = s;
    } else {
      size = st.st_size;
    }

    char partition[PATH_MAX], devname[PATH_MAX];
    if ((r = blkdev_buffered.partition(partition, PATH_MAX)) ||
        (r = blkdev_buffered.wholedisk(devname, PATH_MAX))) {
      derr << "unable to get device name for " << path << ": "
           << cpp_strerror(r) << dendl;
      rotational = true;
    } else {
      dout(20) << __func__ << " devname " << devname << dendl;
      rotational = blkdev_buffered.is_rotational();
      support_discard = blkdev_buffered.support_discard();
      this->devname = devname;
      _detect_vdo();
    }
  }

  r = _aio_start();
  if (r < 0) {
    goto out_fail;
  }
  _discard_start();

  // round size down to an even block
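  // (block_size is expected to be a power of two, e.g. 4096, so this mask
  // simply clears the low bits: 0x12345678 & ~0xfff == 0x12345000)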
  size &= ~(block_size - 1);

  dout(1) << __func__
          << " size " << size
          << " (0x" << std::hex << size << std::dec << ", "
          << byte_u_t(size) << ")"
          << " block_size " << block_size
          << " (" << byte_u_t(block_size) << ")"
          << " " << (rotational ? "rotational" : "non-rotational")
          << " discard " << (support_discard ? "supported" : "not supported")
          << dendl;
  return 0;

out_fail:
  for (i = 0; i < WRITE_LIFE_MAX; i++) {
    if (fd_directs[i] >= 0) {
      VOID_TEMP_FAILURE_RETRY(::close(fd_directs[i]));
      fd_directs[i] = -1;
    } else {
      break;
    }
    if (fd_buffereds[i] >= 0) {
      VOID_TEMP_FAILURE_RETRY(::close(fd_buffereds[i]));
      fd_buffereds[i] = -1;
    } else {
      break;
    }
  }
  return r;
}

int KernelDevice::get_devices(std::set<std::string> *ls) const
{
  if (devname.empty()) {
    return 0;
  }
  get_raw_devices(devname, ls);
  return 0;
}

void KernelDevice::close()
{
  dout(1) << __func__ << dendl;
  _aio_stop();
  _discard_stop();

  if (vdo_fd >= 0) {
    VOID_TEMP_FAILURE_RETRY(::close(vdo_fd));
    vdo_fd = -1;
  }

  for (int i = 0; i < WRITE_LIFE_MAX; i++) {
    assert(fd_directs[i] >= 0);
    VOID_TEMP_FAILURE_RETRY(::close(fd_directs[i]));
    fd_directs[i] = -1;

    assert(fd_buffereds[i] >= 0);
    VOID_TEMP_FAILURE_RETRY(::close(fd_buffereds[i]));
    fd_buffereds[i] = -1;
  }
  path.clear();
}

int KernelDevice::collect_metadata(const string& prefix, map<string,string> *pm) const
{
  (*pm)[prefix + "support_discard"] = stringify((int)(bool)support_discard);
  (*pm)[prefix + "rotational"] = stringify((int)(bool)rotational);
  (*pm)[prefix + "size"] = stringify(get_size());
  (*pm)[prefix + "block_size"] = stringify(get_block_size());
  (*pm)[prefix + "driver"] = "KernelDevice";
  if (rotational) {
    (*pm)[prefix + "type"] = "hdd";
  } else {
    (*pm)[prefix + "type"] = "ssd";
  }
  if (vdo_fd >= 0) {
    (*pm)[prefix + "vdo"] = "true";
    uint64_t total, avail;
    get_vdo_utilization(vdo_fd, &total, &avail);
    (*pm)[prefix + "vdo_physical_size"] = stringify(total);
  }

  {
    string res_names;
    std::set<std::string> devnames;
    if (get_devices(&devnames) == 0) {
      for (auto& dev : devnames) {
        if (!res_names.empty()) {
          res_names += ",";
        }
        res_names += dev;
      }
      if (res_names.size()) {
        (*pm)[prefix + "devices"] = res_names;
      }
    }
  }

  struct stat st;
  int r = ::fstat(fd_buffereds[WRITE_LIFE_NOT_SET], &st);
  if (r < 0)
    return -errno;
  if (S_ISBLK(st.st_mode)) {
    (*pm)[prefix + "access_mode"] = "blk";

    char buffer[1024] = {0};
    BlkDev blkdev{fd_buffereds[WRITE_LIFE_NOT_SET]};
    if (r = blkdev.partition(buffer, sizeof(buffer)); r) {
      (*pm)[prefix + "partition_path"] = "unknown";
    } else {
      (*pm)[prefix + "partition_path"] = buffer;
    }
    buffer[0] = '\0';
    if (r = blkdev.partition(buffer, sizeof(buffer)); r) {
      (*pm)[prefix + "dev_node"] = "unknown";
    } else {
      (*pm)[prefix + "dev_node"] = buffer;
    }
    if (!r) {
      return 0;
    }
    buffer[0] = '\0';
    blkdev.model(buffer, sizeof(buffer));
    (*pm)[prefix + "model"] = buffer;

    buffer[0] = '\0';
    blkdev.dev(buffer, sizeof(buffer));
    (*pm)[prefix + "dev"] = buffer;

    // nvme exposes a serial number
    buffer[0] = '\0';
    blkdev.serial(buffer, sizeof(buffer));
    (*pm)[prefix + "serial"] = buffer;

    // numa
    int node;
    r = blkdev.get_numa_node(&node);
    if (r >= 0) {
      (*pm)[prefix + "numa_node"] = stringify(node);
    }
  } else {
    (*pm)[prefix + "access_mode"] = "file";
    (*pm)[prefix + "path"] = path;
  }
  return 0;
}

void KernelDevice::_detect_vdo()
{
  vdo_fd = get_vdo_stats_handle(devname.c_str(), &vdo_name);
  if (vdo_fd >= 0) {
    dout(1) << __func__ << " VDO volume " << vdo_name
            << " maps to " << devname << dendl;
  } else {
    dout(20) << __func__ << " no VDO volume maps to " << devname << dendl;
  }
  return;
}

bool KernelDevice::get_thin_utilization(uint64_t *total, uint64_t *avail) const
{
  if (vdo_fd < 0) {
    return false;
  }
  return get_vdo_utilization(vdo_fd, total, avail);
}

int KernelDevice::choose_fd(bool buffered, int write_hint) const
{
  assert(write_hint >= WRITE_LIFE_NOT_SET && write_hint < WRITE_LIFE_MAX);
  if (!enable_wrt)
    write_hint = WRITE_LIFE_NOT_SET;
  return buffered ? fd_buffereds[write_hint] : fd_directs[write_hint];
}

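// Callers invoke flush() after observing aio completions in order to make
// those writes durable. The io_since_flush flag below lets concurrent
// callers share a single fdatasync() instead of each issuing their own.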
int KernelDevice::flush()
{
  // protect flush with a mutex. note that we are not really protecting
  // data here. instead, we're ensuring that if any flush() caller
  // sees that io_since_flush is true, they block any racing callers
  // until the flush is observed. that allows racing threads to be
  // calling flush while still ensuring that *any* of them that got an
  // aio completion notification will not return before that aio is
  // stable on disk: whichever thread sees the flag first will block
  // followers until the aio is stable.
  std::lock_guard l(flush_mutex);

  bool expect = true;
  if (!io_since_flush.compare_exchange_strong(expect, false)) {
    dout(10) << __func__ << " no-op (no ios since last flush), flag is "
             << (int)io_since_flush.load() << dendl;
    return 0;
  }

  dout(10) << __func__ << " start" << dendl;
  if (cct->_conf->bdev_inject_crash) {
    ++injecting_crash;
    // sleep for a moment to give other threads a chance to submit or
    // wait on io that races with a flush.
    derr << __func__ << " injecting crash. first we sleep..." << dendl;
    sleep(cct->_conf->bdev_inject_crash_flush_delay);
    derr << __func__ << " and now we die" << dendl;
    cct->_log->flush();
    _exit(1);
  }
  utime_t start = ceph_clock_now();
  int r = ::fdatasync(fd_directs[WRITE_LIFE_NOT_SET]);
  utime_t end = ceph_clock_now();
  utime_t dur = end - start;
  if (r < 0) {
    r = -errno;
    derr << __func__ << " fdatasync got: " << cpp_strerror(r) << dendl;
    ceph_abort();
  }
  dout(5) << __func__ << " in " << dur << dendl;
  return r;
}

int KernelDevice::_aio_start()
{
  if (aio) {
    dout(10) << __func__ << dendl;
    int r = io_queue->init(fd_directs);
    if (r < 0) {
      if (r == -EAGAIN) {
        derr << __func__ << " io_setup(2) failed with EAGAIN; "
             << "try increasing /proc/sys/fs/aio-max-nr" << dendl;
      } else {
        derr << __func__ << " io_setup(2) failed: " << cpp_strerror(r) << dendl;
      }
      return r;
    }
    aio_thread.create("bstore_aio");
  }
  return 0;
}

void KernelDevice::_aio_stop()
{
  if (aio) {
    dout(10) << __func__ << dendl;
    aio_stop = true;
    aio_thread.join();
    aio_stop = false;
    io_queue->shutdown();
  }
}

int KernelDevice::_discard_start()
{
  discard_thread.create("bstore_discard");
  return 0;
}

void KernelDevice::_discard_stop()
{
  dout(10) << __func__ << dendl;
  {
    std::unique_lock l(discard_lock);
    while (!discard_started) {
      discard_cond.wait(l);
    }
    discard_stop = true;
    discard_cond.notify_all();
  }
  discard_thread.join();
  {
    std::lock_guard l(discard_lock);
    discard_stop = false;
  }
  dout(10) << __func__ << " stopped" << dendl;
}

void KernelDevice::discard_drain()
{
  dout(10) << __func__ << dendl;
  std::unique_lock l(discard_lock);
  while (!discard_queued.empty() || discard_running) {
    discard_cond.wait(l);
  }
}

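// Errors that the kernel block layer may legitimately report for an aio (see
// the blk-core.c reference below). _aio_thread() turns these into a plain
// -EIO when the submitter opted in via IOContext::allow_eio; otherwise an
// aio failure aborts the process.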
static bool is_expected_ioerr(const int r)
{
  // https://lxr.missinglinkelectronics.com/linux+v4.15/block/blk-core.c#L135
  return (r == -EOPNOTSUPP || r == -ETIMEDOUT || r == -ENOSPC ||
          r == -ENOLINK || r == -EREMOTEIO || r == -EAGAIN || r == -EIO ||
          r == -ENODATA || r == -EILSEQ || r == -ENOMEM ||
#if defined(__linux__)
          r == -EREMCHG || r == -EBADE
#elif defined(__FreeBSD__)
          r == - BSM_ERRNO_EREMCHG || r == -BSM_ERRNO_EBADE
#endif
          );
}

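// Completion thread: polls io_queue for finished aios, performs the
// per-IOContext accounting, wakes or calls back the submitter, and handles
// the bdev_debug_aio stall checks and bdev_inject_crash hooks.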
void KernelDevice::_aio_thread()
{
  dout(10) << __func__ << " start" << dendl;
  int inject_crash_count = 0;
  while (!aio_stop) {
    dout(40) << __func__ << " polling" << dendl;
    int max = cct->_conf->bdev_aio_reap_max;
    aio_t *aio[max];
    int r = io_queue->get_next_completed(cct->_conf->bdev_aio_poll_ms,
                                         aio, max);
    if (r < 0) {
      derr << __func__ << " got " << cpp_strerror(r) << dendl;
      ceph_abort_msg("got unexpected error from io_getevents");
    }
    if (r > 0) {
      dout(30) << __func__ << " got " << r << " completed aios" << dendl;
      for (int i = 0; i < r; ++i) {
        IOContext *ioc = static_cast<IOContext*>(aio[i]->priv);
        _aio_log_finish(ioc, aio[i]->offset, aio[i]->length);
        if (aio[i]->queue_item.is_linked()) {
          std::lock_guard l(debug_queue_lock);
          debug_aio_unlink(*aio[i]);
        }

        // set flag indicating new ios have completed. we do this *before*
        // any completion or notifications so that any user flush() that
        // follows the observed io completion will include this io. Note
        // that an earlier, racing flush() could observe and clear this
        // flag, but that also ensures that the IO will be stable before the
        // later flush() occurs.
        io_since_flush.store(true);

        long r = aio[i]->get_return_value();
        if (r < 0) {
          derr << __func__ << " got r=" << r << " (" << cpp_strerror(r) << ")"
               << dendl;
          if (ioc->allow_eio && is_expected_ioerr(r)) {
            derr << __func__ << " translating the error to EIO for upper layer"
                 << dendl;
            ioc->set_return_value(-EIO);
          } else {
            if (is_expected_ioerr(r)) {
              note_io_error_event(
                devname.c_str(),
                path.c_str(),
                r,
#if defined(HAVE_POSIXAIO)
                aio[i]->aio.aiocb.aio_lio_opcode,
#else
                aio[i]->iocb.aio_lio_opcode,
#endif
                aio[i]->offset,
                aio[i]->length);
              ceph_abort_msg(
                "Unexpected IO error. "
                "This may suggest a hardware issue. "
                "Please check your kernel log!");
            }
            ceph_abort_msg(
              "Unexpected IO error. "
              "This may suggest HW issue. Please check your dmesg!");
          }
        } else if (aio[i]->length != (uint64_t)r) {
          derr << "aio to 0x" << std::hex << aio[i]->offset
               << "~" << aio[i]->length << std::dec
               << " but returned: " << r << dendl;
          ceph_abort_msg("unexpected aio return value: does not match length");
        }

        dout(10) << __func__ << " finished aio " << aio[i] << " r " << r
                 << " ioc " << ioc
                 << " with " << (ioc->num_running.load() - 1)
                 << " aios left" << dendl;

        // NOTE: once we decrement num_running and either invoke the callback
        // or wake the waiter, we cannot touch ioc or aio[] as the caller may
        // free them.
        if (ioc->priv) {
          if (--ioc->num_running == 0) {
            aio_callback(aio_callback_priv, ioc->priv);
          }
        } else {
          ioc->try_aio_wake();
        }
      }
    }
    if (cct->_conf->bdev_debug_aio) {
      utime_t now = ceph_clock_now();
      std::lock_guard l(debug_queue_lock);
      if (debug_oldest) {
        if (debug_stall_since == utime_t()) {
          debug_stall_since = now;
        } else {
          if (cct->_conf->bdev_debug_aio_suicide_timeout) {
            utime_t cutoff = now;
            cutoff -= cct->_conf->bdev_debug_aio_suicide_timeout;
            if (debug_stall_since < cutoff) {
              derr << __func__ << " stalled aio " << debug_oldest
                   << " since " << debug_stall_since << ", timeout is "
                   << cct->_conf->bdev_debug_aio_suicide_timeout
                   << "s, suicide" << dendl;
              ceph_abort_msg("stalled aio... buggy kernel or bad device?");
            }
          }
        }
      }
    }
    reap_ioc();
    if (cct->_conf->bdev_inject_crash) {
      ++inject_crash_count;
      if (inject_crash_count * cct->_conf->bdev_aio_poll_ms / 1000 >
          cct->_conf->bdev_inject_crash + cct->_conf->bdev_inject_crash_flush_delay) {
        derr << __func__ << " bdev_inject_crash trigger from aio thread"
             << dendl;
        cct->_log->flush();
        _exit(1);
      }
    }
  }
  reap_ioc();
  dout(10) << __func__ << " end" << dendl;
}

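// Discard pipeline: queue_discard() accumulates extents in discard_queued and
// wakes this thread, which swaps them into discard_finishing, issues a
// discard() per extent, and then reports the finished set via
// discard_callback. discard_drain() blocks until the queue is empty again.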
void KernelDevice::_discard_thread()
{
  std::unique_lock l(discard_lock);
  ceph_assert(!discard_started);
  discard_started = true;
  discard_cond.notify_all();
  while (true) {
    ceph_assert(discard_finishing.empty());
    if (discard_queued.empty()) {
      if (discard_stop)
        break;
      dout(20) << __func__ << " sleep" << dendl;
      discard_cond.notify_all(); // for the thread trying to drain...
      discard_cond.wait(l);
      dout(20) << __func__ << " wake" << dendl;
    } else {
      discard_finishing.swap(discard_queued);
      discard_running = true;
      l.unlock();
      dout(20) << __func__ << " finishing" << dendl;
      for (auto p = discard_finishing.begin(); p != discard_finishing.end(); ++p) {
        discard(p.get_start(), p.get_len());
      }

      discard_callback(discard_callback_priv, static_cast<void*>(&discard_finishing));
      discard_finishing.clear();
      l.lock();
      discard_running = false;
    }
  }
  dout(10) << __func__ << " finish" << dendl;
  discard_started = false;
}

int KernelDevice::queue_discard(interval_set<uint64_t> &to_release)
{
  if (!support_discard)
    return -1;

  if (to_release.empty())
    return 0;

  std::lock_guard l(discard_lock);
  discard_queued.insert(to_release);
  discard_cond.notify_all();
  return 0;
}

void KernelDevice::_aio_log_start(
  IOContext *ioc,
  uint64_t offset,
  uint64_t length)
{
  dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
           << std::dec << dendl;
  if (cct->_conf->bdev_debug_inflight_ios) {
    std::lock_guard l(debug_lock);
    if (debug_inflight.intersects(offset, length)) {
      derr << __func__ << " inflight overlap of 0x"
           << std::hex
           << offset << "~" << length << std::dec
           << " with " << debug_inflight << dendl;
      ceph_abort();
    }
    debug_inflight.insert(offset, length);
  }
}

void KernelDevice::debug_aio_link(aio_t& aio)
{
  if (debug_queue.empty()) {
    debug_oldest = &aio;
  }
  debug_queue.push_back(aio);
}

void KernelDevice::debug_aio_unlink(aio_t& aio)
{
  if (aio.queue_item.is_linked()) {
    debug_queue.erase(debug_queue.iterator_to(aio));
    if (debug_oldest == &aio) {
      auto age = cct->_conf->bdev_debug_aio_log_age;
      if (age && debug_stall_since != utime_t()) {
        utime_t cutoff = ceph_clock_now();
        cutoff -= age;
        if (debug_stall_since < cutoff) {
          derr << __func__ << " stalled aio " << debug_oldest
               << " since " << debug_stall_since << ", timeout is "
               << age
               << "s" << dendl;
        }
      }

      if (debug_queue.empty()) {
        debug_oldest = nullptr;
      } else {
        debug_oldest = &debug_queue.front();
      }
      debug_stall_since = utime_t();
    }
  }
}

void KernelDevice::_aio_log_finish(
  IOContext *ioc,
  uint64_t offset,
  uint64_t length)
{
  dout(20) << __func__ << " " << aio << " 0x"
           << std::hex << offset << "~" << length << std::dec << dendl;
  if (cct->_conf->bdev_debug_inflight_ios) {
    std::lock_guard l(debug_lock);
    debug_inflight.erase(offset, length);
  }
}

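// Submit everything queued on ioc->pending_aios as one batch. aio_read() and
// aio_write() only queue aios; nothing is issued to the kernel until the
// caller invokes aio_submit(), and only one thread may submit a given
// IOContext at a time (see the num_pending assertions below).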
void KernelDevice::aio_submit(IOContext *ioc)
{
  dout(20) << __func__ << " ioc " << ioc
           << " pending " << ioc->num_pending.load()
           << " running " << ioc->num_running.load()
           << dendl;

  if (ioc->num_pending.load() == 0) {
    return;
  }

  // move these aside, and get our end iterator position now, as the
  // aios might complete as soon as they are submitted and queue more
  // wal aio's.
  list<aio_t>::iterator e = ioc->running_aios.begin();
  ioc->running_aios.splice(e, ioc->pending_aios);

  int pending = ioc->num_pending.load();
  ioc->num_running += pending;
  ioc->num_pending -= pending;
  ceph_assert(ioc->num_pending.load() == 0);  // we should be only thread doing this
  ceph_assert(ioc->pending_aios.size() == 0);

  if (cct->_conf->bdev_debug_aio) {
    list<aio_t>::iterator p = ioc->running_aios.begin();
    while (p != e) {
      dout(30) << __func__ << " " << *p << dendl;
      std::lock_guard l(debug_queue_lock);
      debug_aio_link(*p++);
    }
  }

  void *priv = static_cast<void*>(ioc);
  int r, retries = 0;
  // num of pending aios should not overflow when passed to submit_batch()
  assert(pending <= std::numeric_limits<uint16_t>::max());
  r = io_queue->submit_batch(ioc->running_aios.begin(), e,
                             pending, priv, &retries);

  if (retries)
    derr << __func__ << " retries " << retries << dendl;
  if (r < 0) {
    derr << " aio submit got " << cpp_strerror(r) << dendl;
    ceph_assert(r == 0);
  }
}

int KernelDevice::_sync_write(uint64_t off, bufferlist &bl, bool buffered, int write_hint)
{
  uint64_t len = bl.length();
  dout(5) << __func__ << " 0x" << std::hex << off << "~" << len
          << std::dec << (buffered ? " (buffered)" : " (direct)") << dendl;
  if (cct->_conf->bdev_inject_crash &&
      rand() % cct->_conf->bdev_inject_crash == 0) {
    derr << __func__ << " bdev_inject_crash: dropping io 0x" << std::hex
         << off << "~" << len << std::dec << dendl;
    ++injecting_crash;
    return 0;
  }
  vector<iovec> iov;
  bl.prepare_iov(&iov);

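  // pwritev() may write fewer bytes than requested, so keep retrying:
  // advance past fully written iovecs and trim a partially written one
  // until the whole buffer has been handed to the fd.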
  auto left = len;
  auto o = off;
  size_t idx = 0;
  do {
    auto r = ::pwritev(choose_fd(buffered, write_hint),
                       &iov[idx], iov.size() - idx, o);

    if (r < 0) {
      r = -errno;
      derr << __func__ << " pwritev error: " << cpp_strerror(r) << dendl;
      return r;
    }
    o += r;
    left -= r;
    if (left) {
      // skip fully processed IOVs
      while (idx < iov.size() && (size_t)r >= iov[idx].iov_len) {
        r -= iov[idx++].iov_len;
      }
      // update partially processed one if any
      if (r) {
        ceph_assert(idx < iov.size());
        ceph_assert((size_t)r < iov[idx].iov_len);
        iov[idx].iov_base = static_cast<char*>(iov[idx].iov_base) + r;
        iov[idx].iov_len -= r;
        r = 0;
      }
      ceph_assert(r == 0);
    }
  } while (left);

#ifdef HAVE_SYNC_FILE_RANGE
  if (buffered) {
    // initiate IO and wait till it completes
    auto r = ::sync_file_range(fd_buffereds[WRITE_LIFE_NOT_SET], off, len, SYNC_FILE_RANGE_WRITE|SYNC_FILE_RANGE_WAIT_AFTER|SYNC_FILE_RANGE_WAIT_BEFORE);
    if (r < 0) {
      r = -errno;
      derr << __func__ << " sync_file_range error: " << cpp_strerror(r) << dendl;
      return r;
    }
  }
#endif

  io_since_flush.store(true);

  return 0;
}

int KernelDevice::write(
  uint64_t off,
  bufferlist &bl,
  bool buffered,
  int write_hint)
{
  uint64_t len = bl.length();
  dout(20) << __func__ << " 0x" << std::hex << off << "~" << len << std::dec
           << (buffered ? " (buffered)" : " (direct)")
           << dendl;
  ceph_assert(is_valid_io(off, len));
  if (cct->_conf->objectstore_blackhole) {
    lderr(cct) << __func__ << " objectstore_blackhole=true, throwing out IO"
               << dendl;
    return 0;
  }

  if ((!buffered || bl.get_num_buffers() >= IOV_MAX) &&
      bl.rebuild_aligned_size_and_memory(block_size, block_size, IOV_MAX)) {
    dout(20) << __func__ << " rebuilding buffer to be aligned" << dendl;
  }
  dout(40) << "data: ";
  bl.hexdump(*_dout);
  *_dout << dendl;

  return _sync_write(off, bl, buffered, write_hint);
}

int KernelDevice::aio_write(
  uint64_t off,
  bufferlist &bl,
  IOContext *ioc,
  bool buffered,
  int write_hint)
{
  uint64_t len = bl.length();
  dout(20) << __func__ << " 0x" << std::hex << off << "~" << len << std::dec
           << (buffered ? " (buffered)" : " (direct)")
           << dendl;
  ceph_assert(is_valid_io(off, len));
  if (cct->_conf->objectstore_blackhole) {
    lderr(cct) << __func__ << " objectstore_blackhole=true, throwing out IO"
               << dendl;
    return 0;
  }

  if ((!buffered || bl.get_num_buffers() >= IOV_MAX) &&
      bl.rebuild_aligned_size_and_memory(block_size, block_size, IOV_MAX)) {
    dout(20) << __func__ << " rebuilding buffer to be aligned" << dendl;
  }
  dout(40) << "data: ";
  bl.hexdump(*_dout);
  *_dout << dendl;

  _aio_log_start(ioc, off, len);

#ifdef HAVE_LIBAIO
  if (aio && dio && !buffered) {
    if (cct->_conf->bdev_inject_crash &&
        rand() % cct->_conf->bdev_inject_crash == 0) {
      derr << __func__ << " bdev_inject_crash: dropping io 0x" << std::hex
           << off << "~" << len << std::dec
           << dendl;
      // generate a real io so that aio_wait behaves properly, but make it
      // a read instead of write, and toss the result.
      ioc->pending_aios.push_back(aio_t(ioc, choose_fd(false, write_hint)));
      ++ioc->num_pending;
      auto& aio = ioc->pending_aios.back();
      bufferptr p = ceph::buffer::create_small_page_aligned(len);
      aio.bl.append(std::move(p));
      aio.bl.prepare_iov(&aio.iov);
      aio.preadv(off, len);
      ++injecting_crash;
    } else {
      if (bl.length() <= RW_IO_MAX) {
        // fast path (non-huge write)
        ioc->pending_aios.push_back(aio_t(ioc, choose_fd(false, write_hint)));
        ++ioc->num_pending;
        auto& aio = ioc->pending_aios.back();
        bl.prepare_iov(&aio.iov);
        aio.bl.claim_append(bl);
        aio.pwritev(off, len);
        dout(30) << aio << dendl;
        dout(5) << __func__ << " 0x" << std::hex << off << "~" << len
                << std::dec << " aio " << &aio << dendl;
      } else {
        // write in RW_IO_MAX-sized chunks
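        // (hypothetical example: if RW_IO_MAX were 2 GiB, a 5 GiB buffer
        // would become three aios of 2 GiB, 2 GiB and 1 GiB)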
        uint64_t prev_len = 0;
        while (prev_len < bl.length()) {
          bufferlist tmp;
          if (prev_len + RW_IO_MAX < bl.length()) {
            tmp.substr_of(bl, prev_len, RW_IO_MAX);
          } else {
            tmp.substr_of(bl, prev_len, bl.length() - prev_len);
          }
          auto len = tmp.length();
          ioc->pending_aios.push_back(aio_t(ioc, choose_fd(false, write_hint)));
          ++ioc->num_pending;
          auto& aio = ioc->pending_aios.back();
          tmp.prepare_iov(&aio.iov);
          aio.bl.claim_append(tmp);
          aio.pwritev(off + prev_len, len);
          dout(30) << aio << dendl;
          dout(5) << __func__ << " 0x" << std::hex << off + prev_len
                  << "~" << len
                  << std::dec << " aio " << &aio << " (piece)" << dendl;
          prev_len += len;
        }
      }
    }
  } else
#endif
  {
    int r = _sync_write(off, bl, buffered, write_hint);
    _aio_log_finish(ioc, off, len);
    if (r < 0)
      return r;
  }
  return 0;
}

int KernelDevice::discard(uint64_t offset, uint64_t len)
{
  int r = 0;
  if (cct->_conf->objectstore_blackhole) {
    lderr(cct) << __func__ << " objectstore_blackhole=true, throwing out IO"
               << dendl;
    return 0;
  }
  if (support_discard) {
    dout(10) << __func__
             << " 0x" << std::hex << offset << "~" << len << std::dec
             << dendl;

    r = BlkDev{fd_directs[WRITE_LIFE_NOT_SET]}.discard((int64_t)offset, (int64_t)len);
  }
  return r;
}

int KernelDevice::read(uint64_t off, uint64_t len, bufferlist *pbl,
                       IOContext *ioc,
                       bool buffered)
{
  dout(5) << __func__ << " 0x" << std::hex << off << "~" << len << std::dec
          << (buffered ? " (buffered)" : " (direct)")
          << dendl;
  ceph_assert(is_valid_io(off, len));

  _aio_log_start(ioc, off, len);

  auto start1 = mono_clock::now();

  auto p = ceph::buffer::ptr_node::create(ceph::buffer::create_small_page_aligned(len));
  int r = ::pread(buffered ? fd_buffereds[WRITE_LIFE_NOT_SET] : fd_directs[WRITE_LIFE_NOT_SET],
                  p->c_str(), len, off);
  auto age = cct->_conf->bdev_debug_aio_log_age;
  if (mono_clock::now() - start1 >= make_timespan(age)) {
    derr << __func__ << " stalled read "
         << " 0x" << std::hex << off << "~" << len << std::dec
         << (buffered ? " (buffered)" : " (direct)")
         << " since " << start1 << ", timeout is "
         << age
         << "s" << dendl;
  }

  if (r < 0) {
    if (ioc->allow_eio && is_expected_ioerr(r)) {
      r = -EIO;
    } else {
      r = -errno;
    }
    goto out;
  }
  ceph_assert((uint64_t)r == len);
  pbl->push_back(std::move(p));

  dout(40) << "data: ";
  pbl->hexdump(*_dout);
  *_dout << dendl;

 out:
  _aio_log_finish(ioc, off, len);
  return r < 0 ? r : 0;
}

int KernelDevice::aio_read(
  uint64_t off,
  uint64_t len,
  bufferlist *pbl,
  IOContext *ioc)
{
  dout(5) << __func__ << " 0x" << std::hex << off << "~" << len << std::dec
          << dendl;

  int r = 0;
#ifdef HAVE_LIBAIO
  if (aio && dio) {
    ceph_assert(is_valid_io(off, len));
    _aio_log_start(ioc, off, len);
    ioc->pending_aios.push_back(aio_t(ioc, fd_directs[WRITE_LIFE_NOT_SET]));
    ++ioc->num_pending;
    aio_t& aio = ioc->pending_aios.back();
    bufferptr p = ceph::buffer::create_small_page_aligned(len);
    aio.bl.append(std::move(p));
    aio.bl.prepare_iov(&aio.iov);
    aio.preadv(off, len);
    dout(30) << aio << dendl;
    pbl->append(aio.bl);
    dout(5) << __func__ << " 0x" << std::hex << off << "~" << len
            << std::dec << " aio " << &aio << dendl;
  } else
#endif
  {
    r = read(off, len, pbl, ioc, false);
  }

  return r;
}

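// Unaligned direct reads: widen the range to block_size boundaries, read the
// enclosing aligned extent into a bounce buffer, and copy the requested bytes
// out. For example, with a 0x1000-byte block size a request for off=0x1234,
// len=0x100 reads 0x1000~0x1000 and then copies out the 0x100 bytes at
// buffer offset 0x234.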
int KernelDevice::direct_read_unaligned(uint64_t off, uint64_t len, char *buf)
{
  uint64_t aligned_off = p2align(off, block_size);
  uint64_t aligned_len = p2roundup(off+len, block_size) - aligned_off;
  bufferptr p = ceph::buffer::create_small_page_aligned(aligned_len);
  int r = 0;

  auto start1 = mono_clock::now();
  r = ::pread(fd_directs[WRITE_LIFE_NOT_SET], p.c_str(), aligned_len, aligned_off);
  auto age = cct->_conf->bdev_debug_aio_log_age;
  if (mono_clock::now() - start1 >= make_timespan(age)) {
    derr << __func__ << " stalled read "
         << " 0x" << std::hex << off << "~" << len << std::dec
         << " since " << start1 << ", timeout is "
         << age
         << "s" << dendl;
  }

  if (r < 0) {
    r = -errno;
    derr << __func__ << " 0x" << std::hex << off << "~" << len << std::dec
         << " error: " << cpp_strerror(r) << dendl;
    goto out;
  }
  ceph_assert((uint64_t)r == aligned_len);
  memcpy(buf, p.c_str() + (off - aligned_off), len);

  dout(40) << __func__ << " data: ";
  bufferlist bl;
  bl.append(buf, len);
  bl.hexdump(*_dout);
  *_dout << dendl;

 out:
  return r < 0 ? r : 0;
}

int KernelDevice::read_random(uint64_t off, uint64_t len, char *buf,
                              bool buffered)
{
  dout(5) << __func__ << " 0x" << std::hex << off << "~" << len << std::dec
          << "buffered " << buffered
          << dendl;
  ceph_assert(len > 0);
  ceph_assert(off < size);
  ceph_assert(off + len <= size);
  int r = 0;
  auto age = cct->_conf->bdev_debug_aio_log_age;

  // if it's direct io and unaligned, we have to use an internal buffer
  if (!buffered && ((off % block_size != 0)
                    || (len % block_size != 0)
                    || (uintptr_t(buf) % CEPH_PAGE_SIZE != 0)))
    return direct_read_unaligned(off, len, buf);

  auto start1 = mono_clock::now();
  if (buffered) {
    // buffered read
    auto off0 = off;
    char *t = buf;
    uint64_t left = len;
    while (left > 0) {
      r = ::pread(fd_buffereds[WRITE_LIFE_NOT_SET], t, left, off);
      if (r < 0) {
        r = -errno;
        derr << __func__ << " 0x" << std::hex << off << "~" << left
             << std::dec << " error: " << cpp_strerror(r) << dendl;
        goto out;
      }
      off += r;
      t += r;
      left -= r;
    }
    if (mono_clock::now() - start1 >= make_timespan(age)) {
      derr << __func__ << " stalled read "
           << " 0x" << std::hex << off0 << "~" << len << std::dec
           << " (buffered) since " << start1 << ", timeout is "
           << age
           << "s" << dendl;
    }
  } else {
    // direct and aligned read
    r = ::pread(fd_directs[WRITE_LIFE_NOT_SET], buf, len, off);
    if (mono_clock::now() - start1 >= make_timespan(age)) {
      derr << __func__ << " stalled read "
           << " 0x" << std::hex << off << "~" << len << std::dec
           << " (direct) since " << start1 << ", timeout is "
           << age
           << "s" << dendl;
    }
    if (r < 0) {
      r = -errno;
      derr << __func__ << " direct_aligned_read" << " 0x" << std::hex
           << off << "~" << std::left << std::dec << " error: " << cpp_strerror(r)
           << dendl;
      goto out;
    }
    ceph_assert((uint64_t)r == len);
  }

  dout(40) << __func__ << " data: ";
  bufferlist bl;
  bl.append(buf, len);
  bl.hexdump(*_dout);
  *_dout << dendl;

 out:
  return r < 0 ? r : 0;
}

int KernelDevice::invalidate_cache(uint64_t off, uint64_t len)
{
  dout(5) << __func__ << " 0x" << std::hex << off << "~" << len << std::dec
          << dendl;
  ceph_assert(off % block_size == 0);
  ceph_assert(len % block_size == 0);
  int r = posix_fadvise(fd_buffereds[WRITE_LIFE_NOT_SET], off, len, POSIX_FADV_DONTNEED);
  if (r) {
    r = -r;
    derr << __func__ << " 0x" << std::hex << off << "~" << len << std::dec
         << " error: " << cpp_strerror(r) << dendl;
  }
  return r;
}