]> git.proxmox.com Git - ceph.git/blame - ceph/src/os/bluestore/BlockDevice.h
Import ceph 15.2.8
[ceph.git] / ceph / src / os / bluestore / BlockDevice.h
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2015 XSky <haomai@xsky.com>
7 *
8 * Author: Haomai Wang <haomaiwang@gmail.com>
9 *
10 * This is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License version 2.1, as published by the Free Software
13 * Foundation. See file COPYING.
14 *
15 */
16
17#ifndef CEPH_OS_BLUESTORE_BLOCKDEVICE_H
18#define CEPH_OS_BLUESTORE_BLOCKDEVICE_H
19
20#include <atomic>
21#include <condition_variable>
7c673cae 22#include <list>
11fdf7f2
TL
23#include <map>
24#include <mutex>
25#include <set>
26#include <string>
27#include <vector>
7c673cae
FG
28
29#include "acconfig.h"
11fdf7f2 30#include "common/ceph_mutex.h"
9f95a23c 31#include "include/common_fwd.h"
7c673cae 32
11fdf7f2
TL
33#if defined(HAVE_LIBAIO) || defined(HAVE_POSIXAIO)
34#include "ceph_aio.h"
35#endif
36#include "include/ceph_assert.h"
37#include "include/buffer.h"
38#include "include/interval_set.h"
7c673cae
FG
// String prefix "spdk:"; presumably used to recognise SPDK-managed device
// paths (the code that consumes it is not in this header).
#define SPDK_PREFIX "spdk:"

#ifdef __linux__
// Supply the fcntl command number ourselves when the system headers do not
// define it.
#ifndef F_SET_FILE_RW_HINT
#define F_LINUX_SPECIFIC_BASE 1024
#define F_SET_FILE_RW_HINT (F_LINUX_SPECIFIC_BASE + 14)
#endif
// Write life-time hints.  These values match the Linux definition:
// https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/include/uapi/linux/fcntl.h#n56
#define WRITE_LIFE_NOT_SET 0 // No hint information set
#define WRITE_LIFE_NONE    1 // No hints about write life time
#define WRITE_LIFE_SHORT   2 // Data written has a short life time
#define WRITE_LIFE_MEDIUM  3 // Data written has a medium life time
#define WRITE_LIFE_LONG    4 // Data written has a long life time
#define WRITE_LIFE_EXTREME 5 // Data written has an extremely long life time
#define WRITE_LIFE_MAX     6 // One past the largest hint value
#else
// Systems without WRITE_LIFE_* support use a single FD and treat all files
// alike, so every hint collapses to 0 and there is exactly one hint slot.
#define WRITE_LIFE_NOT_SET 0 // No hint information set
#define WRITE_LIFE_NONE    0 // No hints about write life time
#define WRITE_LIFE_SHORT   0 // Data written has a short life time
#define WRITE_LIFE_MEDIUM  0 // Data written has a medium life time
#define WRITE_LIFE_LONG    0 // Data written has a long life time
#define WRITE_LIFE_EXTREME 0 // Data written has an extremely long life time
#define WRITE_LIFE_MAX     1 // One past the largest hint value
#endif
66
11fdf7f2 67
7c673cae
FG
68/// track in-flight io
69struct IOContext {
31f18b77 70private:
11fdf7f2
TL
71 ceph::mutex lock = ceph::make_mutex("IOContext::lock");
72 ceph::condition_variable cond;
b32b8144 73 int r = 0;
31f18b77
FG
74
75public:
7c673cae
FG
76 CephContext* cct;
77 void *priv;
78#ifdef HAVE_SPDK
79 void *nvme_task_first = nullptr;
80 void *nvme_task_last = nullptr;
11fdf7f2 81 std::atomic_int total_nseg = {0};
7c673cae
FG
82#endif
83
11fdf7f2 84#if defined(HAVE_LIBAIO) || defined(HAVE_POSIXAIO)
7c673cae
FG
85 std::list<aio_t> pending_aios; ///< not yet submitted
86 std::list<aio_t> running_aios; ///< submitting or submitted
11fdf7f2 87#endif
7c673cae
FG
88 std::atomic_int num_pending = {0};
89 std::atomic_int num_running = {0};
b32b8144 90 bool allow_eio;
7c673cae 91
b32b8144
FG
92 explicit IOContext(CephContext* cct, void *p, bool allow_eio = false)
93 : cct(cct), priv(p), allow_eio(allow_eio)
7c673cae
FG
94 {}
95
96 // no copying
97 IOContext(const IOContext& other) = delete;
98 IOContext &operator=(const IOContext& other) = delete;
99
100 bool has_pending_aios() {
101 return num_pending.load();
102 }
11fdf7f2 103 void release_running_aios();
7c673cae 104 void aio_wait();
11fdf7f2 105 uint64_t get_num_ios() const;
7c673cae 106
31f18b77 107 void try_aio_wake() {
11fdf7f2
TL
108 assert(num_running >= 1);
109
110 std::lock_guard l(lock);
111 if (num_running.fetch_sub(1) == 1) {
31f18b77
FG
112
113 // we might have some pending IOs submitted after the check
114 // as there is no lock protection for aio_submit.
115 // Hence we might have false conditional trigger.
116 // aio_wait has to handle that hence do not care here.
31f18b77 117 cond.notify_all();
31f18b77 118 }
7c673cae 119 }
b32b8144
FG
120
121 void set_return_value(int _r) {
122 r = _r;
123 }
124
125 int get_return_value() const {
126 return r;
127 }
7c673cae
FG
128};
129
130
131class BlockDevice {
132public:
133 CephContext* cct;
11fdf7f2 134 typedef void (*aio_callback_t)(void *handle, void *aio);
7c673cae 135private:
11fdf7f2 136 ceph::mutex ioc_reap_lock = ceph::make_mutex("BlockDevice::ioc_reap_lock");
7c673cae
FG
137 std::vector<IOContext*> ioc_reap_queue;
138 std::atomic_int ioc_reap_count = {0};
139
140protected:
9f95a23c
TL
141 uint64_t size = 0;
142 uint64_t block_size = 0;
11fdf7f2 143 bool support_discard = false;
7c673cae 144 bool rotational = true;
11fdf7f2 145 bool lock_exclusive = true;
7c673cae
FG
146
147public:
11fdf7f2
TL
148 aio_callback_t aio_callback;
149 void *aio_callback_priv;
150 BlockDevice(CephContext* cct, aio_callback_t cb, void *cbpriv)
151 : cct(cct),
11fdf7f2
TL
152 aio_callback(cb),
153 aio_callback_priv(cbpriv)
154 {}
7c673cae 155 virtual ~BlockDevice() = default;
7c673cae
FG
156
157 static BlockDevice *create(
11fdf7f2 158 CephContext* cct, const std::string& path, aio_callback_t cb, void *cbpriv, aio_callback_t d_cb, void *d_cbpriv);
7c673cae
FG
159 virtual bool supported_bdev_label() { return true; }
160 virtual bool is_rotational() { return rotational; }
161
162 virtual void aio_submit(IOContext *ioc) = 0;
163
11fdf7f2
TL
164 void set_no_exclusive_lock() {
165 lock_exclusive = false;
166 }
167
168 uint64_t get_size() const { return size; }
169 uint64_t get_block_size() const { return block_size; }
170
171 /// hook to provide utilization of thinly-provisioned device
172 virtual bool get_thin_utilization(uint64_t *total, uint64_t *avail) const {
173 return false;
174 }
7c673cae 175
11fdf7f2
TL
176 virtual int collect_metadata(const std::string& prefix, std::map<std::string,std::string> *pm) const = 0;
177
9f95a23c 178 virtual int get_devname(std::string *out) const {
11fdf7f2
TL
179 return -ENOENT;
180 }
9f95a23c 181 virtual int get_devices(std::set<std::string> *ls) const {
11fdf7f2
TL
182 std::string s;
183 if (get_devname(&s) == 0) {
184 ls->insert(s);
185 }
186 return 0;
187 }
188 virtual int get_numa_node(int *node) const {
189 return -EOPNOTSUPP;
190 }
7c673cae
FG
191
192 virtual int read(
193 uint64_t off,
194 uint64_t len,
195 bufferlist *pbl,
196 IOContext *ioc,
197 bool buffered) = 0;
198 virtual int read_random(
199 uint64_t off,
200 uint64_t len,
201 char *buf,
202 bool buffered) = 0;
203 virtual int write(
204 uint64_t off,
205 bufferlist& bl,
11fdf7f2
TL
206 bool buffered,
207 int write_hint = WRITE_LIFE_NOT_SET) = 0;
7c673cae
FG
208
209 virtual int aio_read(
210 uint64_t off,
211 uint64_t len,
212 bufferlist *pbl,
213 IOContext *ioc) = 0;
214 virtual int aio_write(
215 uint64_t off,
216 bufferlist& bl,
217 IOContext *ioc,
11fdf7f2
TL
218 bool buffered,
219 int write_hint = WRITE_LIFE_NOT_SET) = 0;
7c673cae 220 virtual int flush() = 0;
11fdf7f2
TL
221 virtual int discard(uint64_t offset, uint64_t len) { return 0; }
222 virtual int queue_discard(interval_set<uint64_t> &to_release) { return -1; }
223 virtual void discard_drain() { return; }
7c673cae
FG
224
225 void queue_reap_ioc(IOContext *ioc);
226 void reap_ioc();
227
228 // for managing buffered readers/writers
229 virtual int invalidate_cache(uint64_t off, uint64_t len) = 0;
230 virtual int open(const std::string& path) = 0;
231 virtual void close() = 0;
11fdf7f2
TL
232
233protected:
234 bool is_valid_io(uint64_t off, uint64_t len) const {
235 return (off % block_size == 0 &&
236 len % block_size == 0 &&
237 len > 0 &&
238 off < size &&
239 off + len <= size);
240 }
7c673cae
FG
241};
242
243#endif //CEPH_OS_BLUESTORE_BLOCKDEVICE_H