]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | /* | |
4 | * Ceph - scalable distributed file system | |
5 | * | |
6 | * Copyright (C) 2015 XSky <haomai@xsky.com> | |
7 | * | |
8 | * Author: Haomai Wang <haomaiwang@gmail.com> | |
9 | * | |
10 | * This is free software; you can redistribute it and/or | |
11 | * modify it under the terms of the GNU Lesser General Public | |
12 | * License version 2.1, as published by the Free Software | |
13 | * Foundation. See file COPYING. | |
14 | * | |
15 | */ | |
16 | ||
17 | #ifndef CEPH_OS_BLUESTORE_BLOCKDEVICE_H | |
18 | #define CEPH_OS_BLUESTORE_BLOCKDEVICE_H | |
19 | ||
20 | #include <atomic> | |
21 | #include <condition_variable> | |
7c673cae | 22 | #include <list> |
11fdf7f2 TL |
23 | #include <map> |
24 | #include <mutex> | |
25 | #include <set> | |
26 | #include <string> | |
27 | #include <vector> | |
7c673cae FG |
28 | |
29 | #include "acconfig.h" | |
11fdf7f2 | 30 | #include "common/ceph_mutex.h" |
9f95a23c | 31 | #include "include/common_fwd.h" |
7c673cae | 32 | |
11fdf7f2 TL |
33 | #if defined(HAVE_LIBAIO) || defined(HAVE_POSIXAIO) |
34 | #include "ceph_aio.h" | |
35 | #endif | |
36 | #include "include/ceph_assert.h" | |
37 | #include "include/buffer.h" | |
38 | #include "include/interval_set.h" | |
7c673cae FG |
// String constant "spdk:".
// NOTE(review): presumably the path prefix that selects an SPDK-backed
// device -- its users are not visible in this header; confirm at call sites.
#define SPDK_PREFIX "spdk:"

#ifdef __linux__
// Supply the fcntl command number for per-file write-lifetime hints when
// the headers seen so far have not already defined it.
#ifndef F_SET_FILE_RW_HINT
#define F_LINUX_SPECIFIC_BASE 1024
#define F_SET_FILE_RW_HINT (F_LINUX_SPECIFIC_BASE + 14)
#endif
// Write-lifetime hint values. They mirror the Linux definitions, see
// https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/include/uapi/linux/fcntl.h#n56
#define WRITE_LIFE_NOT_SET 0  // no hint information has been set
#define WRITE_LIFE_NONE 1     // no hint about the write lifetime
#define WRITE_LIFE_SHORT 2    // written data is short-lived
#define WRITE_LIFE_MEDIUM 3   // written data has a medium lifetime
#define WRITE_LIFE_LONG 4     // written data is long-lived
#define WRITE_LIFE_EXTREME 5  // written data is extremely long-lived
#define WRITE_LIFE_MAX 6      // number of distinct hint values
#else
// Systems that do not have WRITE_LIFE_* use only one FD, so every hint
// collapses to the same value and all files are treated equally.
#define WRITE_LIFE_NOT_SET 0  // no hint information has been set
#define WRITE_LIFE_NONE 0     // no hint about the write lifetime
#define WRITE_LIFE_SHORT 0    // written data is short-lived
#define WRITE_LIFE_MEDIUM 0   // written data has a medium lifetime
#define WRITE_LIFE_LONG 0     // written data is long-lived
#define WRITE_LIFE_EXTREME 0  // written data is extremely long-lived
#define WRITE_LIFE_MAX 1      // number of distinct hint values
#endif
66 | ||
11fdf7f2 | 67 | |
7c673cae FG |
68 | /// track in-flight io |
69 | struct IOContext { | |
31f18b77 | 70 | private: |
11fdf7f2 TL |
71 | ceph::mutex lock = ceph::make_mutex("IOContext::lock"); |
72 | ceph::condition_variable cond; | |
b32b8144 | 73 | int r = 0; |
31f18b77 FG |
74 | |
75 | public: | |
7c673cae FG |
76 | CephContext* cct; |
77 | void *priv; | |
78 | #ifdef HAVE_SPDK | |
79 | void *nvme_task_first = nullptr; | |
80 | void *nvme_task_last = nullptr; | |
11fdf7f2 | 81 | std::atomic_int total_nseg = {0}; |
7c673cae FG |
82 | #endif |
83 | ||
11fdf7f2 | 84 | #if defined(HAVE_LIBAIO) || defined(HAVE_POSIXAIO) |
7c673cae FG |
85 | std::list<aio_t> pending_aios; ///< not yet submitted |
86 | std::list<aio_t> running_aios; ///< submitting or submitted | |
11fdf7f2 | 87 | #endif |
7c673cae FG |
88 | std::atomic_int num_pending = {0}; |
89 | std::atomic_int num_running = {0}; | |
b32b8144 | 90 | bool allow_eio; |
7c673cae | 91 | |
b32b8144 FG |
92 | explicit IOContext(CephContext* cct, void *p, bool allow_eio = false) |
93 | : cct(cct), priv(p), allow_eio(allow_eio) | |
7c673cae FG |
94 | {} |
95 | ||
96 | // no copying | |
97 | IOContext(const IOContext& other) = delete; | |
98 | IOContext &operator=(const IOContext& other) = delete; | |
99 | ||
100 | bool has_pending_aios() { | |
101 | return num_pending.load(); | |
102 | } | |
11fdf7f2 | 103 | void release_running_aios(); |
7c673cae | 104 | void aio_wait(); |
11fdf7f2 | 105 | uint64_t get_num_ios() const; |
7c673cae | 106 | |
31f18b77 | 107 | void try_aio_wake() { |
11fdf7f2 TL |
108 | assert(num_running >= 1); |
109 | ||
110 | std::lock_guard l(lock); | |
111 | if (num_running.fetch_sub(1) == 1) { | |
31f18b77 FG |
112 | |
113 | // we might have some pending IOs submitted after the check | |
114 | // as there is no lock protection for aio_submit. | |
115 | // Hence we might have false conditional trigger. | |
116 | // aio_wait has to handle that hence do not care here. | |
31f18b77 | 117 | cond.notify_all(); |
31f18b77 | 118 | } |
7c673cae | 119 | } |
b32b8144 FG |
120 | |
121 | void set_return_value(int _r) { | |
122 | r = _r; | |
123 | } | |
124 | ||
125 | int get_return_value() const { | |
126 | return r; | |
127 | } | |
7c673cae FG |
128 | }; |
129 | ||
130 | ||
131 | class BlockDevice { | |
132 | public: | |
133 | CephContext* cct; | |
11fdf7f2 | 134 | typedef void (*aio_callback_t)(void *handle, void *aio); |
7c673cae | 135 | private: |
11fdf7f2 | 136 | ceph::mutex ioc_reap_lock = ceph::make_mutex("BlockDevice::ioc_reap_lock"); |
7c673cae FG |
137 | std::vector<IOContext*> ioc_reap_queue; |
138 | std::atomic_int ioc_reap_count = {0}; | |
139 | ||
140 | protected: | |
9f95a23c TL |
141 | uint64_t size = 0; |
142 | uint64_t block_size = 0; | |
11fdf7f2 | 143 | bool support_discard = false; |
7c673cae | 144 | bool rotational = true; |
11fdf7f2 | 145 | bool lock_exclusive = true; |
7c673cae FG |
146 | |
147 | public: | |
11fdf7f2 TL |
148 | aio_callback_t aio_callback; |
149 | void *aio_callback_priv; | |
150 | BlockDevice(CephContext* cct, aio_callback_t cb, void *cbpriv) | |
151 | : cct(cct), | |
11fdf7f2 TL |
152 | aio_callback(cb), |
153 | aio_callback_priv(cbpriv) | |
154 | {} | |
7c673cae | 155 | virtual ~BlockDevice() = default; |
7c673cae FG |
156 | |
157 | static BlockDevice *create( | |
11fdf7f2 | 158 | CephContext* cct, const std::string& path, aio_callback_t cb, void *cbpriv, aio_callback_t d_cb, void *d_cbpriv); |
7c673cae FG |
159 | virtual bool supported_bdev_label() { return true; } |
160 | virtual bool is_rotational() { return rotational; } | |
161 | ||
162 | virtual void aio_submit(IOContext *ioc) = 0; | |
163 | ||
11fdf7f2 TL |
164 | void set_no_exclusive_lock() { |
165 | lock_exclusive = false; | |
166 | } | |
167 | ||
168 | uint64_t get_size() const { return size; } | |
169 | uint64_t get_block_size() const { return block_size; } | |
170 | ||
171 | /// hook to provide utilization of thinly-provisioned device | |
172 | virtual bool get_thin_utilization(uint64_t *total, uint64_t *avail) const { | |
173 | return false; | |
174 | } | |
7c673cae | 175 | |
11fdf7f2 TL |
176 | virtual int collect_metadata(const std::string& prefix, std::map<std::string,std::string> *pm) const = 0; |
177 | ||
9f95a23c | 178 | virtual int get_devname(std::string *out) const { |
11fdf7f2 TL |
179 | return -ENOENT; |
180 | } | |
9f95a23c | 181 | virtual int get_devices(std::set<std::string> *ls) const { |
11fdf7f2 TL |
182 | std::string s; |
183 | if (get_devname(&s) == 0) { | |
184 | ls->insert(s); | |
185 | } | |
186 | return 0; | |
187 | } | |
188 | virtual int get_numa_node(int *node) const { | |
189 | return -EOPNOTSUPP; | |
190 | } | |
7c673cae FG |
191 | |
192 | virtual int read( | |
193 | uint64_t off, | |
194 | uint64_t len, | |
195 | bufferlist *pbl, | |
196 | IOContext *ioc, | |
197 | bool buffered) = 0; | |
198 | virtual int read_random( | |
199 | uint64_t off, | |
200 | uint64_t len, | |
201 | char *buf, | |
202 | bool buffered) = 0; | |
203 | virtual int write( | |
204 | uint64_t off, | |
205 | bufferlist& bl, | |
11fdf7f2 TL |
206 | bool buffered, |
207 | int write_hint = WRITE_LIFE_NOT_SET) = 0; | |
7c673cae FG |
208 | |
209 | virtual int aio_read( | |
210 | uint64_t off, | |
211 | uint64_t len, | |
212 | bufferlist *pbl, | |
213 | IOContext *ioc) = 0; | |
214 | virtual int aio_write( | |
215 | uint64_t off, | |
216 | bufferlist& bl, | |
217 | IOContext *ioc, | |
11fdf7f2 TL |
218 | bool buffered, |
219 | int write_hint = WRITE_LIFE_NOT_SET) = 0; | |
7c673cae | 220 | virtual int flush() = 0; |
11fdf7f2 TL |
221 | virtual int discard(uint64_t offset, uint64_t len) { return 0; } |
222 | virtual int queue_discard(interval_set<uint64_t> &to_release) { return -1; } | |
223 | virtual void discard_drain() { return; } | |
7c673cae FG |
224 | |
225 | void queue_reap_ioc(IOContext *ioc); | |
226 | void reap_ioc(); | |
227 | ||
228 | // for managing buffered readers/writers | |
229 | virtual int invalidate_cache(uint64_t off, uint64_t len) = 0; | |
230 | virtual int open(const std::string& path) = 0; | |
231 | virtual void close() = 0; | |
11fdf7f2 TL |
232 | |
233 | protected: | |
234 | bool is_valid_io(uint64_t off, uint64_t len) const { | |
235 | return (off % block_size == 0 && | |
236 | len % block_size == 0 && | |
237 | len > 0 && | |
238 | off < size && | |
239 | off + len <= size); | |
240 | } | |
7c673cae FG |
241 | }; |
242 | ||
243 | #endif //CEPH_OS_BLUESTORE_BLOCKDEVICE_H |