]> git.proxmox.com Git - ceph.git/blame - ceph/src/os/bluestore/KernelDevice.cc
update sources to 12.2.7
[ceph.git] / ceph / src / os / bluestore / KernelDevice.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2014 Red Hat
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15#include <unistd.h>
16#include <stdlib.h>
17#include <sys/types.h>
18#include <sys/stat.h>
19#include <fcntl.h>
20
21#include "KernelDevice.h"
22#include "include/types.h"
23#include "include/compat.h"
24#include "include/stringify.h"
25#include "common/errno.h"
26#include "common/debug.h"
27#include "common/blkdev.h"
28#include "common/align.h"
29#include "common/blkdev.h"
30
31#define dout_context cct
32#define dout_subsys ceph_subsys_bdev
33#undef dout_prefix
34#define dout_prefix *_dout << "bdev(" << this << " " << path << ") "
35
// Construct a kernel block-device wrapper.  No file descriptors are
// opened here; open() must succeed before the device is usable.
//   cb / cbpriv: aio completion callback (and its opaque argument),
//   invoked from the aio thread when submitted ios finish.
KernelDevice::KernelDevice(CephContext* cct, aio_callback_t cb, void *cbpriv)
  : BlockDevice(cct),
    fd_direct(-1),
    fd_buffered(-1),
    size(0), block_size(0),
    fs(NULL), aio(false), dio(false),
    debug_lock("KernelDevice::debug_lock"),
    aio_queue(cct->_conf->bdev_aio_max_queue_depth),
    aio_callback(cb),
    aio_callback_priv(cbpriv),
    aio_stop(false),
    aio_thread(this),
    injecting_crash(0)
{
}
51
52int KernelDevice::_lock()
53{
54 struct flock l;
55 memset(&l, 0, sizeof(l));
56 l.l_type = F_WRLCK;
57 l.l_whence = SEEK_SET;
58 int r = ::fcntl(fd_direct, F_SETLK, &l);
59 if (r < 0)
60 return -errno;
61 return 0;
62}
63
// Open the device (or plain file) at path p: set up one O_DIRECT fd for
// dio/aio plus one buffered fd, lock the device, probe size and
// rotational status, and start the aio completion thread.
// Returns 0 or a negative errno; on failure both fds are closed again.
int KernelDevice::open(const string& p)
{
  path = p;
  int r = 0;
  dout(1) << __func__ << " path " << path << dendl;

  fd_direct = ::open(path.c_str(), O_RDWR | O_DIRECT);
  if (fd_direct < 0) {
    r = -errno;
    derr << __func__ << " open got: " << cpp_strerror(r) << dendl;
    return r;
  }
  fd_buffered = ::open(path.c_str(), O_RDWR);
  if (fd_buffered < 0) {
    r = -errno;
    derr << __func__ << " open got: " << cpp_strerror(r) << dendl;
    goto out_direct;
  }
  dio = true;
  aio = cct->_conf->bdev_aio;
  if (!aio) {
    assert(0 == "non-aio not supported");
  }

  // disable readahead as it will wreak havoc on our mix of
  // directio/aio and buffered io.
  r = posix_fadvise(fd_buffered, 0, 0, POSIX_FADV_RANDOM);
  if (r) {
    // posix_fadvise returns the error number directly, not via errno
    r = -r;
    derr << __func__ << " open got: " << cpp_strerror(r) << dendl;
    goto out_fail;
  }

  r = _lock();
  if (r < 0) {
    derr << __func__ << " failed to lock " << path << ": " << cpp_strerror(r)
	 << dendl;
    goto out_fail;
  }

  struct stat st;
  r = ::fstat(fd_direct, &st);
  if (r < 0) {
    r = -errno;
    derr << __func__ << " fstat got " << cpp_strerror(r) << dendl;
    goto out_fail;
  }

  // Operate as though the block size is 4 KB.  The backing file
  // blksize doesn't strictly matter except that some file systems may
  // require a read/modify/write if we write something smaller than
  // it.
  block_size = cct->_conf->bdev_block_size;
  if (block_size != (unsigned)st.st_blksize) {
    dout(1) << __func__ << " backing device/file reports st_blksize "
	    << st.st_blksize << ", using bdev_block_size "
	    << block_size << " anyway" << dendl;
  }

  // block devices need an ioctl to learn their size; files use st_size
  if (S_ISBLK(st.st_mode)) {
    int64_t s;
    r = get_block_device_size(fd_direct, &s);
    if (r < 0) {
      goto out_fail;
    }
    size = s;
  } else {
    size = st.st_size;
  }
  if (cct->_conf->get_val<bool>("bdev_inject_bad_size")) {
    derr << "injecting bad size; actual 0x" << std::hex << size
	 << " but using 0x" << (size & ~block_size) << std::dec << dendl;
    // NOTE: clears only the block_size bit (not block_size-1); this looks
    // intentional for the error-injection test, not a size rounding.
    size &= ~(block_size);
  }

  {
    char partition[PATH_MAX], devname[PATH_MAX];
    r = get_device_by_fd(fd_buffered, partition, devname, sizeof(devname));
    if (r < 0) {
      // conservative fallback: treat unknown devices as spinning disks
      derr << "unable to get device name for " << path << ": "
	   << cpp_strerror(r) << dendl;
      rotational = true;
    } else {
      dout(20) << __func__ << " devname " << devname << dendl;
      rotational = block_device_is_rotational(devname);
    }
  }

  r = _aio_start();
  if (r < 0) {
    goto out_fail;
  }

  fs = FS::create_by_fd(fd_direct);
  assert(fs);

  // round size down to an even block
  size &= ~(block_size - 1);

  dout(1) << __func__
	  << " size " << size
	  << " (0x" << std::hex << size << std::dec << ", "
	  << pretty_si_t(size) << "B)"
	  << " block_size " << block_size
	  << " (" << pretty_si_t(block_size) << "B)"
	  << " " << (rotational ? "rotational" : "non-rotational")
	  << dendl;
  return 0;

 out_fail:
  VOID_TEMP_FAILURE_RETRY(::close(fd_buffered));
  fd_buffered = -1;
 out_direct:
  VOID_TEMP_FAILURE_RETRY(::close(fd_direct));
  fd_direct = -1;
  return r;
}
181
182void KernelDevice::close()
183{
184 dout(1) << __func__ << dendl;
185 _aio_stop();
186
187 assert(fs);
188 delete fs;
189 fs = NULL;
190
191 assert(fd_direct >= 0);
192 VOID_TEMP_FAILURE_RETRY(::close(fd_direct));
193 fd_direct = -1;
194
195 assert(fd_buffered >= 0);
196 VOID_TEMP_FAILURE_RETRY(::close(fd_buffered));
197 fd_buffered = -1;
198
199 path.clear();
200}
201
202static string get_dev_property(const char *dev, const char *property)
203{
204 char val[1024] = {0};
205 get_block_device_string_property(dev, property, val, sizeof(val));
206 return val;
207}
208
209int KernelDevice::collect_metadata(string prefix, map<string,string> *pm) const
210{
211 (*pm)[prefix + "rotational"] = stringify((int)(bool)rotational);
212 (*pm)[prefix + "size"] = stringify(get_size());
213 (*pm)[prefix + "block_size"] = stringify(get_block_size());
214 (*pm)[prefix + "driver"] = "KernelDevice";
215 if (rotational) {
216 (*pm)[prefix + "type"] = "hdd";
217 } else {
218 (*pm)[prefix + "type"] = "ssd";
219 }
220
221 struct stat st;
222 int r = ::fstat(fd_buffered, &st);
223 if (r < 0)
224 return -errno;
225 if (S_ISBLK(st.st_mode)) {
226 (*pm)[prefix + "access_mode"] = "blk";
227 char partition_path[PATH_MAX];
228 char dev_node[PATH_MAX];
229 int rc = get_device_by_fd(fd_buffered, partition_path, dev_node, PATH_MAX);
230 switch (rc) {
231 case -EOPNOTSUPP:
232 case -EINVAL:
233 (*pm)[prefix + "partition_path"] = "unknown";
234 (*pm)[prefix + "dev_node"] = "unknown";
235 break;
236 case -ENODEV:
237 (*pm)[prefix + "partition_path"] = string(partition_path);
238 (*pm)[prefix + "dev_node"] = "unknown";
239 break;
240 default:
241 {
242 (*pm)[prefix + "partition_path"] = string(partition_path);
243 (*pm)[prefix + "dev_node"] = string(dev_node);
244 (*pm)[prefix + "model"] = get_dev_property(dev_node, "device/model");
245 (*pm)[prefix + "dev"] = get_dev_property(dev_node, "dev");
246
247 // nvme exposes a serial number
248 string serial = get_dev_property(dev_node, "device/serial");
249 if (serial.length()) {
250 (*pm)[prefix + "serial"] = serial;
251 }
252
253 // nvme has a device/device/* structure; infer from that. there
254 // is probably a better way?
255 string nvme_vendor = get_dev_property(dev_node, "device/device/vendor");
256 if (nvme_vendor.length()) {
257 (*pm)[prefix + "type"] = "nvme";
258 }
259 }
260 }
261 } else {
262 (*pm)[prefix + "access_mode"] = "file";
263 (*pm)[prefix + "path"] = path;
264 }
265 return 0;
266}
267
268int KernelDevice::flush()
269{
31f18b77 270 // protect flush with a mutex. note that we are not really protecting
7c673cae
FG
271 // data here. instead, we're ensuring that if any flush() caller
272 // sees that io_since_flush is true, they block any racing callers
273 // until the flush is observed. that allows racing threads to be
274 // calling flush while still ensuring that *any* of them that got an
275 // aio completion notification will not return before that aio is
276 // stable on disk: whichever thread sees the flag first will block
277 // followers until the aio is stable.
278 std::lock_guard<std::mutex> l(flush_mutex);
279
280 bool expect = true;
281 if (!io_since_flush.compare_exchange_strong(expect, false)) {
282 dout(10) << __func__ << " no-op (no ios since last flush), flag is "
283 << (int)io_since_flush.load() << dendl;
284 return 0;
285 }
286
287 dout(10) << __func__ << " start" << dendl;
288 if (cct->_conf->bdev_inject_crash) {
289 ++injecting_crash;
290 // sleep for a moment to give other threads a chance to submit or
291 // wait on io that races with a flush.
292 derr << __func__ << " injecting crash. first we sleep..." << dendl;
293 sleep(cct->_conf->bdev_inject_crash_flush_delay);
294 derr << __func__ << " and now we die" << dendl;
295 cct->_log->flush();
296 _exit(1);
297 }
298 utime_t start = ceph_clock_now();
299 int r = ::fdatasync(fd_direct);
300 utime_t end = ceph_clock_now();
301 utime_t dur = end - start;
302 if (r < 0) {
303 r = -errno;
304 derr << __func__ << " fdatasync got: " << cpp_strerror(r) << dendl;
305 ceph_abort();
306 }
307 dout(5) << __func__ << " in " << dur << dendl;;
308 return r;
309}
310
311int KernelDevice::_aio_start()
312{
313 if (aio) {
314 dout(10) << __func__ << dendl;
315 int r = aio_queue.init();
316 if (r < 0) {
31f18b77
FG
317 if (r == -EAGAIN) {
318 derr << __func__ << " io_setup(2) failed with EAGAIN; "
319 << "try increasing /proc/sys/fs/aio-max-nr" << dendl;
320 } else {
321 derr << __func__ << " io_setup(2) failed: " << cpp_strerror(r) << dendl;
322 }
7c673cae
FG
323 return r;
324 }
325 aio_thread.create("bstore_aio");
326 }
327 return 0;
328}
329
330void KernelDevice::_aio_stop()
331{
332 if (aio) {
333 dout(10) << __func__ << dendl;
334 aio_stop = true;
335 aio_thread.join();
336 aio_stop = false;
337 aio_queue.shutdown();
338 }
339}
340
28e407b8
AA
// Errors the kernel block layer can legitimately hand back for a failed
// bio; anything else suggests a software/hardware fault.
// https://lxr.missinglinkelectronics.com/linux+v4.15/block/blk-core.c#L135
static bool is_expected_ioerr(const int r)
{
  switch (r) {
  case -EOPNOTSUPP:
  case -ETIMEDOUT:
  case -ENOSPC:
  case -ENOLINK:
  case -EREMOTEIO:
  case -EBADE:
  case -ENODATA:
  case -EILSEQ:
  case -ENOMEM:
  case -EAGAIN:
  case -EREMCHG:
  case -EIO:
    return true;
  default:
    return false;
  }
}
349
7c673cae
FG
// Completion-reaping thread: polls the aio queue for finished ios,
// records their completion, translates allowed device errors to EIO,
// and wakes (or calls back) the owning IOContext.  Also hosts the
// debug stall detector and the crash-injection trigger.
void KernelDevice::_aio_thread()
{
  dout(10) << __func__ << " start" << dendl;
  int inject_crash_count = 0;
  while (!aio_stop) {
    dout(40) << __func__ << " polling" << dendl;
    int max = cct->_conf->bdev_aio_reap_max;
    aio_t *aio[max];  // NOTE: variable-length array is a GNU extension
    int r = aio_queue.get_next_completed(cct->_conf->bdev_aio_poll_ms,
					 aio, max);
    if (r < 0) {
      derr << __func__ << " got " << cpp_strerror(r) << dendl;
      assert(0 == "got unexpected error from io_getevents");
    }
    if (r > 0) {
      dout(30) << __func__ << " got " << r << " completed aios" << dendl;
      for (int i = 0; i < r; ++i) {
	IOContext *ioc = static_cast<IOContext*>(aio[i]->priv);
	_aio_log_finish(ioc, aio[i]->offset, aio[i]->length);
	// drop this aio from the debug stall-tracking queue, if present
	if (aio[i]->queue_item.is_linked()) {
	  std::lock_guard<std::mutex> l(debug_queue_lock);
	  debug_aio_unlink(*aio[i]);
	}

	// set flag indicating new ios have completed.  we do this *before*
	// any completion or notifications so that any user flush() that
	// follows the observed io completion will include this io.  Note
	// that an earlier, racing flush() could observe and clear this
	// flag, but that also ensures that the IO will be stable before the
	// later flush() occurs.
	io_since_flush.store(true);

	// NOTE: this inner 'r' intentionally shadows the outer reap count
	long r = aio[i]->get_return_value();
	if (r < 0) {
	  derr << __func__ << " got r=" << r << " (" << cpp_strerror(r) << ")"
	       << dendl;
	  if (ioc->allow_eio && is_expected_ioerr(r)) {
	    // caller opted in to seeing io errors; normalize to EIO
	    derr << __func__ << " translating the error to EIO for upper layer"
		 << dendl;
	    ioc->set_return_value(-EIO);
	  } else {
	    assert(0 == "got unexpected error from aio_t::get_return_value. "
		   "This may suggest HW issue. Please check your dmesg!");
	  }
	} else if (aio[i]->length != (uint64_t)r) {
	  // a short aio write/read is never acceptable on a block device
	  derr << "aio to " << aio[i]->offset << "~" << aio[i]->length
	       << " but returned: " << r << dendl;
	  assert(0 == "unexpected aio error");
	}

	dout(10) << __func__ << " finished aio " << aio[i] << " r " << r
		 << " ioc " << ioc
		 << " with " << (ioc->num_running.load() - 1)
		 << " aios left" << dendl;

	// NOTE: once num_running and we either call the callback or
	// call aio_wake we cannot touch ioc or aio[] as the caller
	// may free it.
	if (ioc->priv) {
	  if (--ioc->num_running == 0) {
	    aio_callback(aio_callback_priv, ioc->priv);
	  }
	} else {
	  ioc->try_aio_wake();
	}
      }
    }
    // debug stall detector: abort if the oldest outstanding aio has been
    // pending longer than the suicide timeout
    if (cct->_conf->bdev_debug_aio) {
      utime_t now = ceph_clock_now();
      std::lock_guard<std::mutex> l(debug_queue_lock);
      if (debug_oldest) {
	if (debug_stall_since == utime_t()) {
	  debug_stall_since = now;
	} else {
	  utime_t cutoff = now;
	  cutoff -= cct->_conf->bdev_debug_aio_suicide_timeout;
	  if (debug_stall_since < cutoff) {
	    derr << __func__ << " stalled aio " << debug_oldest
		 << " since " << debug_stall_since << ", timeout is "
		 << cct->_conf->bdev_debug_aio_suicide_timeout
		 << "s, suicide" << dendl;
	    assert(0 == "stalled aio... buggy kernel or bad device?");
	  }
	}
      }
    }
    reap_ioc();
    // crash injection: die after roughly bdev_inject_crash (+ flush delay)
    // seconds of polling
    if (cct->_conf->bdev_inject_crash) {
      ++inject_crash_count;
      if (inject_crash_count * cct->_conf->bdev_aio_poll_ms / 1000 >
	  cct->_conf->bdev_inject_crash + cct->_conf->bdev_inject_crash_flush_delay) {
	derr << __func__ << " bdev_inject_crash trigger from aio thread"
	     << dendl;
	cct->_log->flush();
	_exit(1);
      }
    }
  }
  reap_ioc();
  dout(10) << __func__ << " end" << dendl;
}
451
452void KernelDevice::_aio_log_start(
453 IOContext *ioc,
454 uint64_t offset,
455 uint64_t length)
456{
457 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
458 << std::dec << dendl;
459 if (cct->_conf->bdev_debug_inflight_ios) {
460 Mutex::Locker l(debug_lock);
461 if (debug_inflight.intersects(offset, length)) {
462 derr << __func__ << " inflight overlap of 0x"
463 << std::hex
464 << offset << "~" << length << std::dec
465 << " with " << debug_inflight << dendl;
466 ceph_abort();
467 }
468 debug_inflight.insert(offset, length);
469 }
470}
471
472void KernelDevice::debug_aio_link(aio_t& aio)
473{
474 if (debug_queue.empty()) {
475 debug_oldest = &aio;
476 }
477 debug_queue.push_back(aio);
478}
479
480void KernelDevice::debug_aio_unlink(aio_t& aio)
481{
482 if (aio.queue_item.is_linked()) {
483 debug_queue.erase(debug_queue.iterator_to(aio));
484 if (debug_oldest == &aio) {
485 if (debug_queue.empty()) {
486 debug_oldest = nullptr;
487 } else {
488 debug_oldest = &debug_queue.front();
489 }
490 debug_stall_since = utime_t();
491 }
492 }
493}
494
495void KernelDevice::_aio_log_finish(
496 IOContext *ioc,
497 uint64_t offset,
498 uint64_t length)
499{
500 dout(20) << __func__ << " " << aio << " 0x"
501 << std::hex << offset << "~" << length << std::dec << dendl;
502 if (cct->_conf->bdev_debug_inflight_ios) {
503 Mutex::Locker l(debug_lock);
504 debug_inflight.erase(offset, length);
505 }
506}
507
// Submit all of an IOContext's pending aios to the kernel queue in one
// batch.  Must be called by a single thread per IOContext at a time.
void KernelDevice::aio_submit(IOContext *ioc)
{
  dout(20) << __func__ << " ioc " << ioc
	   << " pending " << ioc->num_pending.load()
	   << " running " << ioc->num_running.load()
	   << dendl;

  if (ioc->num_pending.load() == 0) {
    return;
  }

  // move these aside, and get our end iterator position now, as the
  // aios might complete as soon as they are submitted and queue more
  // wal aio's.
  list<aio_t>::iterator e = ioc->running_aios.begin();
  ioc->running_aios.splice(e, ioc->pending_aios);

  int pending = ioc->num_pending.load();
  ioc->num_running += pending;
  ioc->num_pending -= pending;
  assert(ioc->num_pending.load() == 0);  // we should be only thread doing this
  assert(ioc->pending_aios.size() == 0);

  // optionally register each aio with the stall detector before submit
  if (cct->_conf->bdev_debug_aio) {
    list<aio_t>::iterator p = ioc->running_aios.begin();
    while (p != e) {
      for (auto& io : p->iov)
	dout(30) << __func__ << " iov " << (void*)io.iov_base
		 << " len " << io.iov_len << dendl;

      std::lock_guard<std::mutex> l(debug_queue_lock);
      debug_aio_link(*p++);
    }
  }

  void *priv = static_cast<void*>(ioc);
  int r, retries = 0;
  r = aio_queue.submit_batch(ioc->running_aios.begin(), e,
			     ioc->num_running.load(), priv, &retries);

  if (retries)
    derr << __func__ << " retries " << retries << dendl;
  if (r < 0) {
    derr << " aio submit got " << cpp_strerror(r) << dendl;
    assert(r == 0);
  }
}
555
// Synchronously write bl at offset off via pwritev.  When buffered,
// writes go through fd_buffered and are kicked toward disk (without
// waiting) via sync_file_range; callers still need flush() for
// durability.  Returns 0 or a negative errno.
int KernelDevice::_sync_write(uint64_t off, bufferlist &bl, bool buffered)
{
  uint64_t len = bl.length();
  dout(5) << __func__ << " 0x" << std::hex << off << "~" << len
	  << std::dec << " buffered" << dendl;
  // crash injection: silently drop the io so a later simulated crash
  // "loses" it
  if (cct->_conf->bdev_inject_crash &&
      rand() % cct->_conf->bdev_inject_crash == 0) {
    derr << __func__ << " bdev_inject_crash: dropping io 0x" << std::hex
	 << off << "~" << len << std::dec << dendl;
    ++injecting_crash;
    return 0;
  }
  vector<iovec> iov;
  bl.prepare_iov(&iov);
  // NOTE(review): pwritev returns ssize_t but is stored in an int, and a
  // short write (r < len) is not detected here -- confirm callers never
  // issue writes large enough for either to matter.
  int r = ::pwritev(buffered ? fd_buffered : fd_direct,
		    &iov[0], iov.size(), off);

  if (r < 0) {
    r = -errno;
    derr << __func__ << " pwritev error: " << cpp_strerror(r) << dendl;
    return r;
  }
  if (buffered) {
    // initiate IO (but do not wait)
    r = ::sync_file_range(fd_buffered, off, len, SYNC_FILE_RANGE_WRITE);
    if (r < 0) {
      r = -errno;
      derr << __func__ << " sync_file_range error: " << cpp_strerror(r) << dendl;
      return r;
    }
  }

  // make the next flush() actually fdatasync
  io_since_flush.store(true);

  return 0;
}
592
593int KernelDevice::write(
594 uint64_t off,
595 bufferlist &bl,
596 bool buffered)
597{
598 uint64_t len = bl.length();
599 dout(20) << __func__ << " 0x" << std::hex << off << "~" << len << std::dec
600 << (buffered ? " (buffered)" : " (direct)")
601 << dendl;
602 assert(off % block_size == 0);
603 assert(len % block_size == 0);
604 assert(len > 0);
605 assert(off < size);
606 assert(off + len <= size);
607
608 if ((!buffered || bl.get_num_buffers() >= IOV_MAX) &&
b32b8144 609 bl.rebuild_aligned_size_and_memory(block_size, block_size, IOV_MAX)) {
7c673cae
FG
610 dout(20) << __func__ << " rebuilding buffer to be aligned" << dendl;
611 }
612 dout(40) << "data: ";
613 bl.hexdump(*_dout);
614 *_dout << dendl;
615
616 return _sync_write(off, bl, buffered);
617}
618
// Queue an asynchronous, block-aligned write of bl on ioc.  The io is
// only *queued* here; aio_submit(ioc) actually issues it, and
// _aio_log_finish happens from the aio thread on completion.  Falls back
// to a synchronous write when aio/dio is unavailable or the write is
// buffered.  Returns 0 or a negative errno (sync path only).
int KernelDevice::aio_write(
  uint64_t off,
  bufferlist &bl,
  IOContext *ioc,
  bool buffered)
{
  uint64_t len = bl.length();
  dout(20) << __func__ << " 0x" << std::hex << off << "~" << len << std::dec
	   << (buffered ? " (buffered)" : " (direct)")
	   << dendl;
  assert(off % block_size == 0);
  assert(len % block_size == 0);
  assert(len > 0);
  assert(off < size);
  assert(off + len <= size);

  // rebuild into aligned memory for direct io, or when the iovec count
  // would exceed IOV_MAX
  if ((!buffered || bl.get_num_buffers() >= IOV_MAX) &&
      bl.rebuild_aligned_size_and_memory(block_size, block_size, IOV_MAX)) {
    dout(20) << __func__ << " rebuilding buffer to be aligned" << dendl;
  }
  dout(40) << "data: ";
  bl.hexdump(*_dout);
  *_dout << dendl;

  _aio_log_start(ioc, off, len);

#ifdef HAVE_LIBAIO
  if (aio && dio && !buffered) {
    ioc->pending_aios.push_back(aio_t(ioc, fd_direct));
    ++ioc->num_pending;
    aio_t& aio = ioc->pending_aios.back();  // shadows the bool member
    if (cct->_conf->bdev_inject_crash &&
	rand() % cct->_conf->bdev_inject_crash == 0) {
      derr << __func__ << " bdev_inject_crash: dropping io 0x" << std::hex
	   << off << "~" << len << std::dec
	   << dendl;
      // generate a real io so that aio_wait behaves properly, but make it
      // a read instead of write, and toss the result.
      aio.pread(off, len);
      ++injecting_crash;
    } else {
      bl.prepare_iov(&aio.iov);
      for (unsigned i=0; i<aio.iov.size(); ++i) {
	dout(30) << "aio " << i << " " << aio.iov[i].iov_base
		 << " " << aio.iov[i].iov_len << dendl;
      }
      // keep the data alive in the aio until it completes
      aio.bl.claim_append(bl);
      aio.pwritev(off, len);
    }
    dout(5) << __func__ << " 0x" << std::hex << off << "~" << len
	    << std::dec << " aio " << &aio << dendl;
  } else
#endif
  {
    int r = _sync_write(off, bl, buffered);
    _aio_log_finish(ioc, off, len);
    if (r < 0)
      return r;
  }
  return 0;
}
680
// Synchronous, block-aligned read of len bytes at off into *pbl.
// Returns 0 or a negative errno.
// NOTE(review): a short pread (r < len) trips the assert rather than
// being retried -- presumably fine for a block device; confirm for the
// file-backed case.
int KernelDevice::read(uint64_t off, uint64_t len, bufferlist *pbl,
		      IOContext *ioc,
		      bool buffered)
{
  dout(5) << __func__ << " 0x" << std::hex << off << "~" << len << std::dec
	  << (buffered ? " (buffered)" : " (direct)")
	  << dendl;
  assert(off % block_size == 0);
  assert(len % block_size == 0);
  assert(len > 0);
  assert(off < size);
  assert(off + len <= size);

  _aio_log_start(ioc, off, len);

  // direct io requires page-aligned memory
  bufferptr p = buffer::create_page_aligned(len);
  int r = ::pread(buffered ? fd_buffered : fd_direct,
		  p.c_str(), len, off);
  if (r < 0) {
    r = -errno;
    goto out;
  }
  assert((uint64_t)r == len);
  pbl->push_back(std::move(p));

  dout(40) << "data: ";
  pbl->hexdump(*_dout);
  *_dout << dendl;

 out:
  _aio_log_finish(ioc, off, len);
  return r < 0 ? r : 0;
}
714
715int KernelDevice::aio_read(
716 uint64_t off,
717 uint64_t len,
718 bufferlist *pbl,
719 IOContext *ioc)
720{
721 dout(5) << __func__ << " 0x" << std::hex << off << "~" << len << std::dec
722 << dendl;
723
724 int r = 0;
725#ifdef HAVE_LIBAIO
726 if (aio && dio) {
727 _aio_log_start(ioc, off, len);
728 ioc->pending_aios.push_back(aio_t(ioc, fd_direct));
729 ++ioc->num_pending;
730 aio_t& aio = ioc->pending_aios.back();
731 aio.pread(off, len);
732 for (unsigned i=0; i<aio.iov.size(); ++i) {
733 dout(30) << "aio " << i << " " << aio.iov[i].iov_base
734 << " " << aio.iov[i].iov_len << dendl;
735 }
736 pbl->append(aio.bl);
737 dout(5) << __func__ << " 0x" << std::hex << off << "~" << len
738 << std::dec << " aio " << &aio << dendl;
739 } else
740#endif
741 {
742 r = read(off, len, pbl, ioc, false);
743 }
744
745 return r;
746}
747
748int KernelDevice::direct_read_unaligned(uint64_t off, uint64_t len, char *buf)
749{
750 uint64_t aligned_off = align_down(off, block_size);
751 uint64_t aligned_len = align_up(off+len, block_size) - aligned_off;
752 bufferptr p = buffer::create_page_aligned(aligned_len);
753 int r = 0;
754
755 r = ::pread(fd_direct, p.c_str(), aligned_len, aligned_off);
756 if (r < 0) {
757 r = -errno;
758 derr << __func__ << " 0x" << std::hex << off << "~" << len << std::dec
759 << " error: " << cpp_strerror(r) << dendl;
760 goto out;
761 }
762 assert((uint64_t)r == aligned_len);
763 memcpy(buf, p.c_str() + (off - aligned_off), len);
764
765 dout(40) << __func__ << " data: ";
766 bufferlist bl;
767 bl.append(buf, len);
768 bl.hexdump(*_dout);
769 *_dout << dendl;
770
771 out:
772 return r < 0 ? r : 0;
773}
774
775int KernelDevice::read_random(uint64_t off, uint64_t len, char *buf,
776 bool buffered)
777{
778 dout(5) << __func__ << " 0x" << std::hex << off << "~" << len << std::dec
779 << dendl;
780 assert(len > 0);
781 assert(off < size);
782 assert(off + len <= size);
783 int r = 0;
784
785 //if it's direct io and unaligned, we have to use a internal buffer
786 if (!buffered && ((off % block_size != 0)
787 || (len % block_size != 0)
788 || (uintptr_t(buf) % CEPH_PAGE_SIZE != 0)))
789 return direct_read_unaligned(off, len, buf);
790
791 if (buffered) {
792 //buffered read
793 char *t = buf;
794 uint64_t left = len;
795 while (left > 0) {
796 r = ::pread(fd_buffered, t, left, off);
797 if (r < 0) {
798 r = -errno;
799 derr << __func__ << " 0x" << std::hex << off << "~" << left
800 << std::dec << " error: " << cpp_strerror(r) << dendl;
801 goto out;
802 }
803 off += r;
804 t += r;
805 left -= r;
806 }
807 } else {
808 //direct and aligned read
809 r = ::pread(fd_direct, buf, len, off);
810 if (r < 0) {
811 r = -errno;
812 derr << __func__ << " direct_aligned_read" << " 0x" << std::hex
813 << off << "~" << left << std::dec << " error: " << cpp_strerror(r)
814 << dendl;
815 goto out;
816 }
817 assert((uint64_t)r == len);
818 }
819
820 dout(40) << __func__ << " data: ";
821 bufferlist bl;
822 bl.append(buf, len);
823 bl.hexdump(*_dout);
824 *_dout << dendl;
825
826 out:
827 return r < 0 ? r : 0;
828}
829
830int KernelDevice::invalidate_cache(uint64_t off, uint64_t len)
831{
832 dout(5) << __func__ << " 0x" << std::hex << off << "~" << len << std::dec
833 << dendl;
834 assert(off % block_size == 0);
835 assert(len % block_size == 0);
836 int r = posix_fadvise(fd_buffered, off, len, POSIX_FADV_DONTNEED);
837 if (r) {
838 r = -r;
839 derr << __func__ << " 0x" << std::hex << off << "~" << len << std::dec
840 << " error: " << cpp_strerror(r) << dendl;
841 }
842 return r;
843}
844