]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | /* | |
4 | * Ceph - scalable distributed file system | |
5 | * | |
6 | * Copyright (C) 2015 XSky <haomai@xsky.com> | |
7 | * | |
8 | * Author: Haomai Wang <haomaiwang@gmail.com> | |
9 | * | |
10 | * This is free software; you can redistribute it and/or | |
11 | * modify it under the terms of the GNU Lesser General Public | |
12 | * License version 2.1, as published by the Free Software | |
13 | * Foundation. See file COPYING. | |
14 | * | |
15 | */ | |
16 | ||
f67539c2 TL |
17 | #ifndef CEPH_BLK_BLOCKDEVICE_H |
18 | #define CEPH_BLK_BLOCKDEVICE_H | |
7c673cae FG |
19 | |
20 | #include <atomic> | |
21 | #include <condition_variable> | |
7c673cae | 22 | #include <list> |
11fdf7f2 TL |
23 | #include <map> |
24 | #include <mutex> | |
25 | #include <set> | |
26 | #include <string> | |
27 | #include <vector> | |
7c673cae FG |
28 | |
29 | #include "acconfig.h" | |
11fdf7f2 | 30 | #include "common/ceph_mutex.h" |
9f95a23c | 31 | #include "include/common_fwd.h" |
7c673cae | 32 | |
11fdf7f2 | 33 | #if defined(HAVE_LIBAIO) || defined(HAVE_POSIXAIO) |
f67539c2 | 34 | #include "aio/aio.h" |
11fdf7f2 TL |
35 | #endif |
36 | #include "include/ceph_assert.h" | |
37 | #include "include/buffer.h" | |
38 | #include "include/interval_set.h" | |
7c673cae FG |
39 | #define SPDK_PREFIX "spdk:" |
40 | ||
11fdf7f2 TL |
// Write life-time hint values. WRITE_LIFE_NOT_SET is the default value of the
// `write_hint` argument of BlockDevice::write() and BlockDevice::aio_write().
#if defined(__linux__)
// Fallback definition, used only when the headers included so far do not
// already provide F_SET_FILE_RW_HINT.
#if !defined(F_SET_FILE_RW_HINT)
// Guarded separately so that a header which defines the base constant (but
// not the hint command) does not trigger a macro redefinition.
#if !defined(F_LINUX_SPECIFIC_BASE)
#define F_LINUX_SPECIFIC_BASE 1024
#endif
#define F_SET_FILE_RW_HINT         (F_LINUX_SPECIFIC_BASE + 14)
#endif
// These values match the Linux definition
// https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/include/uapi/linux/fcntl.h#n56
#define WRITE_LIFE_NOT_SET  	0 	// No hint information set
#define WRITE_LIFE_NONE  	1       // No hints about write life time
#define WRITE_LIFE_SHORT  	2       // Data written has a short life time
#define WRITE_LIFE_MEDIUM  	3    	// Data written has a medium life time
#define WRITE_LIFE_LONG  	4       // Data written has a long life time
#define WRITE_LIFE_EXTREME  	5     	// Data written has an extremely long life time
#define WRITE_LIFE_MAX  	6       // Number of distinct hint values above
#else
// On systems that don't have WRITE_LIFE_* hints, only one FD is used
// and all files are created equal: every hint collapses to 0.
#define WRITE_LIFE_NOT_SET  	0 	// No hint information set
#define WRITE_LIFE_NONE  	0       // No hints about write life time
#define WRITE_LIFE_SHORT  	0       // Data written has a short life time
#define WRITE_LIFE_MEDIUM  	0    	// Data written has a medium life time
#define WRITE_LIFE_LONG  	0       // Data written has a long life time
#define WRITE_LIFE_EXTREME  	0    	// Data written has an extremely long life time
#define WRITE_LIFE_MAX  	1       // Number of distinct hint values above
#endif
66 | ||
11fdf7f2 | 67 | |
7c673cae FG |
68 | /// track in-flight io |
69 | struct IOContext { | |
31f18b77 | 70 | private: |
11fdf7f2 TL |
71 | ceph::mutex lock = ceph::make_mutex("IOContext::lock"); |
72 | ceph::condition_variable cond; | |
b32b8144 | 73 | int r = 0; |
31f18b77 FG |
74 | |
75 | public: | |
7c673cae FG |
76 | CephContext* cct; |
77 | void *priv; | |
78 | #ifdef HAVE_SPDK | |
79 | void *nvme_task_first = nullptr; | |
80 | void *nvme_task_last = nullptr; | |
11fdf7f2 | 81 | std::atomic_int total_nseg = {0}; |
7c673cae FG |
82 | #endif |
83 | ||
11fdf7f2 | 84 | #if defined(HAVE_LIBAIO) || defined(HAVE_POSIXAIO) |
7c673cae FG |
85 | std::list<aio_t> pending_aios; ///< not yet submitted |
86 | std::list<aio_t> running_aios; ///< submitting or submitted | |
11fdf7f2 | 87 | #endif |
7c673cae FG |
88 | std::atomic_int num_pending = {0}; |
89 | std::atomic_int num_running = {0}; | |
b32b8144 | 90 | bool allow_eio; |
7c673cae | 91 | |
b32b8144 FG |
92 | explicit IOContext(CephContext* cct, void *p, bool allow_eio = false) |
93 | : cct(cct), priv(p), allow_eio(allow_eio) | |
7c673cae FG |
94 | {} |
95 | ||
96 | // no copying | |
97 | IOContext(const IOContext& other) = delete; | |
98 | IOContext &operator=(const IOContext& other) = delete; | |
99 | ||
100 | bool has_pending_aios() { | |
101 | return num_pending.load(); | |
102 | } | |
11fdf7f2 | 103 | void release_running_aios(); |
7c673cae | 104 | void aio_wait(); |
11fdf7f2 | 105 | uint64_t get_num_ios() const; |
7c673cae | 106 | |
31f18b77 | 107 | void try_aio_wake() { |
11fdf7f2 TL |
108 | assert(num_running >= 1); |
109 | ||
110 | std::lock_guard l(lock); | |
111 | if (num_running.fetch_sub(1) == 1) { | |
31f18b77 FG |
112 | |
113 | // we might have some pending IOs submitted after the check | |
114 | // as there is no lock protection for aio_submit. | |
115 | // Hence we might have false conditional trigger. | |
116 | // aio_wait has to handle that hence do not care here. | |
31f18b77 | 117 | cond.notify_all(); |
31f18b77 | 118 | } |
7c673cae | 119 | } |
b32b8144 FG |
120 | |
121 | void set_return_value(int _r) { | |
122 | r = _r; | |
123 | } | |
124 | ||
125 | int get_return_value() const { | |
126 | return r; | |
127 | } | |
7c673cae FG |
128 | }; |
129 | ||
130 | ||
131 | class BlockDevice { | |
132 | public: | |
133 | CephContext* cct; | |
11fdf7f2 | 134 | typedef void (*aio_callback_t)(void *handle, void *aio); |
7c673cae | 135 | private: |
11fdf7f2 | 136 | ceph::mutex ioc_reap_lock = ceph::make_mutex("BlockDevice::ioc_reap_lock"); |
7c673cae FG |
137 | std::vector<IOContext*> ioc_reap_queue; |
138 | std::atomic_int ioc_reap_count = {0}; | |
f67539c2 TL |
139 | enum class block_device_t { |
140 | unknown, | |
141 | #if defined(HAVE_LIBAIO) || defined(HAVE_POSIXAIO) | |
142 | aio, | |
143 | #if defined(HAVE_LIBZBD) | |
144 | hm_smr, | |
145 | #endif | |
146 | #endif | |
147 | #if defined(HAVE_SPDK) | |
148 | spdk, | |
149 | #endif | |
150 | #if defined(HAVE_BLUESTORE_PMEM) | |
151 | pmem, | |
152 | #endif | |
153 | }; | |
154 | static block_device_t detect_device_type(const std::string& path); | |
155 | static block_device_t device_type_from_name(const std::string& blk_dev_name); | |
156 | static BlockDevice *create_with_type(block_device_t device_type, | |
157 | CephContext* cct, const std::string& path, aio_callback_t cb, | |
158 | void *cbpriv, aio_callback_t d_cb, void *d_cbpriv); | |
7c673cae FG |
159 | |
160 | protected: | |
9f95a23c TL |
161 | uint64_t size = 0; |
162 | uint64_t block_size = 0; | |
11fdf7f2 | 163 | bool support_discard = false; |
7c673cae | 164 | bool rotational = true; |
11fdf7f2 | 165 | bool lock_exclusive = true; |
7c673cae | 166 | |
f67539c2 TL |
167 | // HM-SMR specific properties. In HM-SMR drives the LBA space is divided into |
168 | // fixed-size zones. Typically, the first few zones are randomly writable; | |
169 | // they form a conventional region of the drive. The remaining zones must be | |
170 | // written sequentially and they must be reset before rewritten. For example, | |
171 | // a 14 TB HGST HSH721414AL drive has 52156 zones each of size is 256 MiB. | |
172 | // The zones 0-523 are randomly writable and they form the conventional region | |
173 | // of the drive. The zones 524-52155 are sequential zones. | |
174 | uint64_t conventional_region_size = 0; | |
175 | uint64_t zone_size = 0; | |
176 | ||
7c673cae | 177 | public: |
11fdf7f2 TL |
178 | aio_callback_t aio_callback; |
179 | void *aio_callback_priv; | |
180 | BlockDevice(CephContext* cct, aio_callback_t cb, void *cbpriv) | |
181 | : cct(cct), | |
11fdf7f2 TL |
182 | aio_callback(cb), |
183 | aio_callback_priv(cbpriv) | |
184 | {} | |
7c673cae | 185 | virtual ~BlockDevice() = default; |
7c673cae FG |
186 | |
187 | static BlockDevice *create( | |
11fdf7f2 | 188 | CephContext* cct, const std::string& path, aio_callback_t cb, void *cbpriv, aio_callback_t d_cb, void *d_cbpriv); |
7c673cae FG |
189 | virtual bool supported_bdev_label() { return true; } |
190 | virtual bool is_rotational() { return rotational; } | |
191 | ||
f67539c2 TL |
192 | // HM-SMR-specific calls |
193 | virtual bool is_smr() const { return false; } | |
194 | virtual uint64_t get_zone_size() const { | |
195 | ceph_assert(is_smr()); | |
196 | return zone_size; | |
197 | } | |
198 | virtual uint64_t get_conventional_region_size() const { | |
199 | ceph_assert(is_smr()); | |
200 | return conventional_region_size; | |
201 | } | |
202 | ||
7c673cae FG |
203 | virtual void aio_submit(IOContext *ioc) = 0; |
204 | ||
11fdf7f2 TL |
205 | void set_no_exclusive_lock() { |
206 | lock_exclusive = false; | |
207 | } | |
208 | ||
209 | uint64_t get_size() const { return size; } | |
210 | uint64_t get_block_size() const { return block_size; } | |
211 | ||
212 | /// hook to provide utilization of thinly-provisioned device | |
213 | virtual bool get_thin_utilization(uint64_t *total, uint64_t *avail) const { | |
214 | return false; | |
215 | } | |
7c673cae | 216 | |
11fdf7f2 TL |
217 | virtual int collect_metadata(const std::string& prefix, std::map<std::string,std::string> *pm) const = 0; |
218 | ||
9f95a23c | 219 | virtual int get_devname(std::string *out) const { |
11fdf7f2 TL |
220 | return -ENOENT; |
221 | } | |
9f95a23c | 222 | virtual int get_devices(std::set<std::string> *ls) const { |
11fdf7f2 TL |
223 | std::string s; |
224 | if (get_devname(&s) == 0) { | |
225 | ls->insert(s); | |
226 | } | |
227 | return 0; | |
228 | } | |
229 | virtual int get_numa_node(int *node) const { | |
230 | return -EOPNOTSUPP; | |
231 | } | |
7c673cae FG |
232 | |
233 | virtual int read( | |
234 | uint64_t off, | |
235 | uint64_t len, | |
f67539c2 | 236 | ceph::buffer::list *pbl, |
7c673cae FG |
237 | IOContext *ioc, |
238 | bool buffered) = 0; | |
239 | virtual int read_random( | |
240 | uint64_t off, | |
241 | uint64_t len, | |
242 | char *buf, | |
243 | bool buffered) = 0; | |
244 | virtual int write( | |
245 | uint64_t off, | |
f67539c2 | 246 | ceph::buffer::list& bl, |
11fdf7f2 TL |
247 | bool buffered, |
248 | int write_hint = WRITE_LIFE_NOT_SET) = 0; | |
7c673cae FG |
249 | |
250 | virtual int aio_read( | |
251 | uint64_t off, | |
252 | uint64_t len, | |
f67539c2 | 253 | ceph::buffer::list *pbl, |
7c673cae FG |
254 | IOContext *ioc) = 0; |
255 | virtual int aio_write( | |
256 | uint64_t off, | |
f67539c2 | 257 | ceph::buffer::list& bl, |
7c673cae | 258 | IOContext *ioc, |
11fdf7f2 TL |
259 | bool buffered, |
260 | int write_hint = WRITE_LIFE_NOT_SET) = 0; | |
7c673cae | 261 | virtual int flush() = 0; |
11fdf7f2 TL |
262 | virtual int discard(uint64_t offset, uint64_t len) { return 0; } |
263 | virtual int queue_discard(interval_set<uint64_t> &to_release) { return -1; } | |
264 | virtual void discard_drain() { return; } | |
7c673cae FG |
265 | |
266 | void queue_reap_ioc(IOContext *ioc); | |
267 | void reap_ioc(); | |
268 | ||
269 | // for managing buffered readers/writers | |
270 | virtual int invalidate_cache(uint64_t off, uint64_t len) = 0; | |
271 | virtual int open(const std::string& path) = 0; | |
272 | virtual void close() = 0; | |
11fdf7f2 TL |
273 | |
274 | protected: | |
f67539c2 | 275 | bool is_valid_io(uint64_t off, uint64_t len) const; |
7c673cae FG |
276 | }; |
277 | ||
f67539c2 | 278 | #endif //CEPH_BLK_BLOCKDEVICE_H |