]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | /* | |
4 | * Ceph - scalable distributed file system | |
5 | * | |
6 | * Copyright (C) 2015 XSky <haomai@xsky.com> | |
7 | * | |
8 | * Author: Haomai Wang <haomaiwang@gmail.com> | |
9 | * | |
10 | * This is free software; you can redistribute it and/or | |
11 | * modify it under the terms of the GNU Lesser General Public | |
12 | * License version 2.1, as published by the Free Software | |
13 | * Foundation. See file COPYING. | |
14 | * | |
15 | */ | |
16 | ||
f67539c2 TL |
17 | #ifndef CEPH_BLK_BLOCKDEVICE_H |
18 | #define CEPH_BLK_BLOCKDEVICE_H | |
7c673cae FG |
19 | |
20 | #include <atomic> | |
21 | #include <condition_variable> | |
7c673cae | 22 | #include <list> |
11fdf7f2 TL |
23 | #include <map> |
24 | #include <mutex> | |
25 | #include <set> | |
26 | #include <string> | |
27 | #include <vector> | |
7c673cae FG |
28 | |
29 | #include "acconfig.h" | |
11fdf7f2 | 30 | #include "common/ceph_mutex.h" |
9f95a23c | 31 | #include "include/common_fwd.h" |
1e59de90 | 32 | #include "extblkdev/ExtBlkDevInterface.h" |
7c673cae | 33 | |
11fdf7f2 | 34 | #if defined(HAVE_LIBAIO) || defined(HAVE_POSIXAIO) |
f67539c2 | 35 | #include "aio/aio.h" |
11fdf7f2 TL |
36 | #endif |
37 | #include "include/ceph_assert.h" | |
38 | #include "include/buffer.h" | |
39 | #include "include/interval_set.h" | |
7c673cae FG |
// NOTE(review): presumably the path prefix that marks a device as
// SPDK-managed -- confirm against the users of this macro.
#define SPDK_PREFIX "spdk:"

#ifdef __linux__
// Supply the file read/write-hint fcntl command ourselves when the system
// headers included so far have not defined it.
#ifndef F_SET_FILE_RW_HINT
#define F_LINUX_SPECIFIC_BASE 1024
#define F_SET_FILE_RW_HINT (F_LINUX_SPECIFIC_BASE + 14)
#endif
// Write-lifetime hint values; they mirror the Linux definitions at
// https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/include/uapi/linux/fcntl.h#n56
#define WRITE_LIFE_NOT_SET	0	// No hint information set
#define WRITE_LIFE_NONE		1	// No hints about write life time
#define WRITE_LIFE_SHORT	2	// Data written has a short life time
#define WRITE_LIFE_MEDIUM	3	// Data written has a medium life time
#define WRITE_LIFE_LONG		4	// Data written has a long life time
#define WRITE_LIFE_EXTREME	5	// Data written has an extremely long life time
#define WRITE_LIFE_MAX		6
#else
// Systems that do not have WRITE_LIFE_* use only one FD, so every hint
// collapses to the same value: all files are created equal.
#define WRITE_LIFE_NOT_SET	0	// No hint information set
#define WRITE_LIFE_NONE		0	// No hints about write life time
#define WRITE_LIFE_SHORT	0	// Data written has a short life time
#define WRITE_LIFE_MEDIUM	0	// Data written has a medium life time
#define WRITE_LIFE_LONG		0	// Data written has a long life time
#define WRITE_LIFE_EXTREME	0	// Data written has an extremely long life time
#define WRITE_LIFE_MAX		1
#endif
67 | ||
20effc67 TL |
/// Access mode selector for block-device I/O.
/// NOTE(review): the enumerator names suggest direct vs. buffered I/O;
/// the exact semantics live with the code that consumes this enum.
enum class blk_access_mode_t {
  DIRECT,
  BUFFERED
};

/// Converts a boolean "buffered" flag into a blk_access_mode_t.
/// Only declared here; the mapping itself is defined elsewhere.
blk_access_mode_t buffermode(bool buffered);

/// Stream-insertion operator for blk_access_mode_t (defined elsewhere).
std::ostream& operator<<(std::ostream& os, blk_access_mode_t buffered);
11fdf7f2 | 74 | |
7c673cae FG |
75 | /// track in-flight io |
76 | struct IOContext { | |
20effc67 TL |
77 | enum { |
78 | FLAG_DONT_CACHE = 1 | |
79 | }; | |
80 | ||
31f18b77 | 81 | private: |
11fdf7f2 TL |
82 | ceph::mutex lock = ceph::make_mutex("IOContext::lock"); |
83 | ceph::condition_variable cond; | |
b32b8144 | 84 | int r = 0; |
31f18b77 FG |
85 | |
86 | public: | |
7c673cae FG |
87 | CephContext* cct; |
88 | void *priv; | |
89 | #ifdef HAVE_SPDK | |
90 | void *nvme_task_first = nullptr; | |
91 | void *nvme_task_last = nullptr; | |
11fdf7f2 | 92 | std::atomic_int total_nseg = {0}; |
7c673cae FG |
93 | #endif |
94 | ||
11fdf7f2 | 95 | #if defined(HAVE_LIBAIO) || defined(HAVE_POSIXAIO) |
7c673cae FG |
96 | std::list<aio_t> pending_aios; ///< not yet submitted |
97 | std::list<aio_t> running_aios; ///< submitting or submitted | |
11fdf7f2 | 98 | #endif |
7c673cae FG |
99 | std::atomic_int num_pending = {0}; |
100 | std::atomic_int num_running = {0}; | |
b32b8144 | 101 | bool allow_eio; |
20effc67 | 102 | uint32_t flags = 0; // FLAG_* |
7c673cae | 103 | |
b32b8144 FG |
104 | explicit IOContext(CephContext* cct, void *p, bool allow_eio = false) |
105 | : cct(cct), priv(p), allow_eio(allow_eio) | |
7c673cae FG |
106 | {} |
107 | ||
108 | // no copying | |
109 | IOContext(const IOContext& other) = delete; | |
110 | IOContext &operator=(const IOContext& other) = delete; | |
111 | ||
112 | bool has_pending_aios() { | |
113 | return num_pending.load(); | |
114 | } | |
11fdf7f2 | 115 | void release_running_aios(); |
7c673cae | 116 | void aio_wait(); |
11fdf7f2 | 117 | uint64_t get_num_ios() const; |
7c673cae | 118 | |
31f18b77 | 119 | void try_aio_wake() { |
11fdf7f2 TL |
120 | assert(num_running >= 1); |
121 | ||
122 | std::lock_guard l(lock); | |
123 | if (num_running.fetch_sub(1) == 1) { | |
31f18b77 FG |
124 | |
125 | // we might have some pending IOs submitted after the check | |
126 | // as there is no lock protection for aio_submit. | |
127 | // Hence we might have false conditional trigger. | |
128 | // aio_wait has to handle that hence do not care here. | |
31f18b77 | 129 | cond.notify_all(); |
31f18b77 | 130 | } |
7c673cae | 131 | } |
b32b8144 FG |
132 | |
133 | void set_return_value(int _r) { | |
134 | r = _r; | |
135 | } | |
136 | ||
137 | int get_return_value() const { | |
138 | return r; | |
139 | } | |
20effc67 TL |
140 | |
141 | bool skip_cache() const { | |
142 | return flags & FLAG_DONT_CACHE; | |
143 | } | |
7c673cae FG |
144 | }; |
145 | ||
146 | ||
147 | class BlockDevice { | |
148 | public: | |
149 | CephContext* cct; | |
11fdf7f2 | 150 | typedef void (*aio_callback_t)(void *handle, void *aio); |
7c673cae | 151 | private: |
11fdf7f2 | 152 | ceph::mutex ioc_reap_lock = ceph::make_mutex("BlockDevice::ioc_reap_lock"); |
7c673cae FG |
153 | std::vector<IOContext*> ioc_reap_queue; |
154 | std::atomic_int ioc_reap_count = {0}; | |
f67539c2 TL |
155 | enum class block_device_t { |
156 | unknown, | |
157 | #if defined(HAVE_LIBAIO) || defined(HAVE_POSIXAIO) | |
158 | aio, | |
159 | #if defined(HAVE_LIBZBD) | |
160 | hm_smr, | |
161 | #endif | |
162 | #endif | |
163 | #if defined(HAVE_SPDK) | |
164 | spdk, | |
165 | #endif | |
166 | #if defined(HAVE_BLUESTORE_PMEM) | |
167 | pmem, | |
168 | #endif | |
169 | }; | |
170 | static block_device_t detect_device_type(const std::string& path); | |
171 | static block_device_t device_type_from_name(const std::string& blk_dev_name); | |
172 | static BlockDevice *create_with_type(block_device_t device_type, | |
173 | CephContext* cct, const std::string& path, aio_callback_t cb, | |
174 | void *cbpriv, aio_callback_t d_cb, void *d_cbpriv); | |
7c673cae FG |
175 | |
176 | protected: | |
9f95a23c TL |
177 | uint64_t size = 0; |
178 | uint64_t block_size = 0; | |
20effc67 | 179 | uint64_t optimal_io_size = 0; |
11fdf7f2 | 180 | bool support_discard = false; |
7c673cae | 181 | bool rotational = true; |
11fdf7f2 | 182 | bool lock_exclusive = true; |
7c673cae | 183 | |
f67539c2 TL |
184 | // HM-SMR specific properties. In HM-SMR drives the LBA space is divided into |
185 | // fixed-size zones. Typically, the first few zones are randomly writable; | |
186 | // they form a conventional region of the drive. The remaining zones must be | |
187 | // written sequentially and they must be reset before rewritten. For example, | |
188 | // a 14 TB HGST HSH721414AL drive has 52156 zones each of size is 256 MiB. | |
189 | // The zones 0-523 are randomly writable and they form the conventional region | |
190 | // of the drive. The zones 524-52155 are sequential zones. | |
191 | uint64_t conventional_region_size = 0; | |
192 | uint64_t zone_size = 0; | |
193 | ||
7c673cae | 194 | public: |
11fdf7f2 TL |
195 | aio_callback_t aio_callback; |
196 | void *aio_callback_priv; | |
197 | BlockDevice(CephContext* cct, aio_callback_t cb, void *cbpriv) | |
198 | : cct(cct), | |
11fdf7f2 TL |
199 | aio_callback(cb), |
200 | aio_callback_priv(cbpriv) | |
201 | {} | |
7c673cae | 202 | virtual ~BlockDevice() = default; |
7c673cae FG |
203 | |
204 | static BlockDevice *create( | |
11fdf7f2 | 205 | CephContext* cct, const std::string& path, aio_callback_t cb, void *cbpriv, aio_callback_t d_cb, void *d_cbpriv); |
7c673cae FG |
206 | virtual bool supported_bdev_label() { return true; } |
207 | virtual bool is_rotational() { return rotational; } | |
208 | ||
f67539c2 TL |
209 | // HM-SMR-specific calls |
210 | virtual bool is_smr() const { return false; } | |
211 | virtual uint64_t get_zone_size() const { | |
212 | ceph_assert(is_smr()); | |
213 | return zone_size; | |
214 | } | |
215 | virtual uint64_t get_conventional_region_size() const { | |
216 | ceph_assert(is_smr()); | |
217 | return conventional_region_size; | |
218 | } | |
20effc67 TL |
219 | virtual void reset_all_zones() { |
220 | ceph_assert(is_smr()); | |
221 | } | |
222 | virtual void reset_zone(uint64_t zone) { | |
223 | ceph_assert(is_smr()); | |
224 | } | |
225 | virtual std::vector<uint64_t> get_zones() { | |
226 | ceph_assert(is_smr()); | |
227 | return std::vector<uint64_t>(); | |
228 | } | |
f67539c2 | 229 | |
7c673cae FG |
230 | virtual void aio_submit(IOContext *ioc) = 0; |
231 | ||
11fdf7f2 TL |
232 | void set_no_exclusive_lock() { |
233 | lock_exclusive = false; | |
234 | } | |
235 | ||
236 | uint64_t get_size() const { return size; } | |
237 | uint64_t get_block_size() const { return block_size; } | |
20effc67 | 238 | uint64_t get_optimal_io_size() const { return optimal_io_size; } |
11fdf7f2 TL |
239 | |
240 | /// hook to provide utilization of thinly-provisioned device | |
1e59de90 TL |
241 | virtual int get_ebd_state(ExtBlkDevState &state) const { |
242 | return -ENOENT; | |
11fdf7f2 | 243 | } |
7c673cae | 244 | |
11fdf7f2 TL |
245 | virtual int collect_metadata(const std::string& prefix, std::map<std::string,std::string> *pm) const = 0; |
246 | ||
9f95a23c | 247 | virtual int get_devname(std::string *out) const { |
11fdf7f2 TL |
248 | return -ENOENT; |
249 | } | |
9f95a23c | 250 | virtual int get_devices(std::set<std::string> *ls) const { |
11fdf7f2 TL |
251 | std::string s; |
252 | if (get_devname(&s) == 0) { | |
253 | ls->insert(s); | |
254 | } | |
255 | return 0; | |
256 | } | |
257 | virtual int get_numa_node(int *node) const { | |
258 | return -EOPNOTSUPP; | |
259 | } | |
7c673cae FG |
260 | |
261 | virtual int read( | |
262 | uint64_t off, | |
263 | uint64_t len, | |
f67539c2 | 264 | ceph::buffer::list *pbl, |
7c673cae FG |
265 | IOContext *ioc, |
266 | bool buffered) = 0; | |
267 | virtual int read_random( | |
268 | uint64_t off, | |
269 | uint64_t len, | |
270 | char *buf, | |
271 | bool buffered) = 0; | |
272 | virtual int write( | |
273 | uint64_t off, | |
f67539c2 | 274 | ceph::buffer::list& bl, |
11fdf7f2 TL |
275 | bool buffered, |
276 | int write_hint = WRITE_LIFE_NOT_SET) = 0; | |
7c673cae FG |
277 | |
278 | virtual int aio_read( | |
279 | uint64_t off, | |
280 | uint64_t len, | |
f67539c2 | 281 | ceph::buffer::list *pbl, |
7c673cae FG |
282 | IOContext *ioc) = 0; |
283 | virtual int aio_write( | |
284 | uint64_t off, | |
f67539c2 | 285 | ceph::buffer::list& bl, |
7c673cae | 286 | IOContext *ioc, |
11fdf7f2 TL |
287 | bool buffered, |
288 | int write_hint = WRITE_LIFE_NOT_SET) = 0; | |
7c673cae | 289 | virtual int flush() = 0; |
1e59de90 | 290 | virtual bool try_discard(interval_set<uint64_t> &to_release, bool async=true) { return false; } |
11fdf7f2 | 291 | virtual void discard_drain() { return; } |
7c673cae | 292 | |
7c673cae FG |
293 | // for managing buffered readers/writers |
294 | virtual int invalidate_cache(uint64_t off, uint64_t len) = 0; | |
295 | virtual int open(const std::string& path) = 0; | |
296 | virtual void close() = 0; | |
11fdf7f2 | 297 | |
20effc67 TL |
298 | struct hugepaged_raw_marker_t {}; |
299 | ||
11fdf7f2 | 300 | protected: |
f67539c2 | 301 | bool is_valid_io(uint64_t off, uint64_t len) const; |
7c673cae FG |
302 | }; |
303 | ||
f67539c2 | 304 | #endif //CEPH_BLK_BLOCKDEVICE_H |