]> git.proxmox.com Git - ceph.git/blame - ceph/src/blk/BlockDevice.h
update ceph source to reef 18.1.2
[ceph.git] / ceph / src / blk / BlockDevice.h
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2015 XSky <haomai@xsky.com>
7 *
8 * Author: Haomai Wang <haomaiwang@gmail.com>
9 *
10 * This is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License version 2.1, as published by the Free Software
13 * Foundation. See file COPYING.
14 *
15 */
16
f67539c2
TL
17#ifndef CEPH_BLK_BLOCKDEVICE_H
18#define CEPH_BLK_BLOCKDEVICE_H
7c673cae
FG
19
20#include <atomic>
21#include <condition_variable>
7c673cae 22#include <list>
11fdf7f2
TL
23#include <map>
24#include <mutex>
25#include <set>
26#include <string>
27#include <vector>
7c673cae
FG
28
29#include "acconfig.h"
11fdf7f2 30#include "common/ceph_mutex.h"
9f95a23c 31#include "include/common_fwd.h"
1e59de90 32#include "extblkdev/ExtBlkDevInterface.h"
7c673cae 33
11fdf7f2 34#if defined(HAVE_LIBAIO) || defined(HAVE_POSIXAIO)
f67539c2 35#include "aio/aio.h"
11fdf7f2
TL
36#endif
37#include "include/ceph_assert.h"
38#include "include/buffer.h"
39#include "include/interval_set.h"
7c673cae
FG
// String prefix "spdk:". NOTE(review): presumably matched against a device
// path to select the SPDK backend -- confirm against the users of this macro.
#define SPDK_PREFIX "spdk:"
41
11fdf7f2
TL
#if defined(__linux__)
// Fallback used only when the headers included so far do not already define
// F_SET_FILE_RW_HINT. F_LINUX_SPECIFIC_BASE gets its own guard so that a
// header which already provides it is not redefined here.
#if !defined(F_SET_FILE_RW_HINT)
#if !defined(F_LINUX_SPECIFIC_BASE)
#define F_LINUX_SPECIFIC_BASE 1024
#endif
#define F_SET_FILE_RW_HINT (F_LINUX_SPECIFIC_BASE + 14)
#endif
// These values match the Linux definitions:
// https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/include/uapi/linux/fcntl.h#n56
#define WRITE_LIFE_NOT_SET	0	// No hint information set
#define WRITE_LIFE_NONE		1	// No hints about write life time
#define WRITE_LIFE_SHORT	2	// Data written has a short life time
#define WRITE_LIFE_MEDIUM	3	// Data written has a medium life time
#define WRITE_LIFE_LONG		4	// Data written has a long life time
#define WRITE_LIFE_EXTREME	5	// Data written has an extremely long life time
#define WRITE_LIFE_MAX		6	// One past the largest hint value
#else
// Systems that do not have the WRITE_LIFE_* hints use only one FD, and all
// files are created equal: every hint collapses to 0 and WRITE_LIFE_MAX to 1.
#define WRITE_LIFE_NOT_SET	0	// No hint information set
#define WRITE_LIFE_NONE		0	// No hints about write life time
#define WRITE_LIFE_SHORT	0	// Data written has a short life time
#define WRITE_LIFE_MEDIUM	0	// Data written has a medium life time
#define WRITE_LIFE_LONG		0	// Data written has a long life time
#define WRITE_LIFE_EXTREME	0	// Data written has an extremely long life time
#define WRITE_LIFE_MAX		1	// One past the largest hint value
#endif
67
20effc67
TL
/// How a block device is accessed: directly or through a buffer/cache.
enum struct blk_access_mode_t {
  DIRECT,
  BUFFERED
};

/// Maps a boolean "buffered" flag to a blk_access_mode_t.
/// Defined outside this header. NOTE(review): presumably true -> BUFFERED and
/// false -> DIRECT; confirm against the definition.
blk_access_mode_t buffermode(bool buffered);

/// Stream-insertion operator for blk_access_mode_t (defined outside this
/// header; the exact text written is not visible here).
std::ostream& operator<<(std::ostream& os, const blk_access_mode_t buffered);
11fdf7f2 74
7c673cae
FG
75/// track in-flight io
76struct IOContext {
20effc67
TL
77 enum {
78 FLAG_DONT_CACHE = 1
79 };
80
31f18b77 81private:
11fdf7f2
TL
82 ceph::mutex lock = ceph::make_mutex("IOContext::lock");
83 ceph::condition_variable cond;
b32b8144 84 int r = 0;
31f18b77
FG
85
86public:
7c673cae
FG
87 CephContext* cct;
88 void *priv;
89#ifdef HAVE_SPDK
90 void *nvme_task_first = nullptr;
91 void *nvme_task_last = nullptr;
11fdf7f2 92 std::atomic_int total_nseg = {0};
7c673cae
FG
93#endif
94
11fdf7f2 95#if defined(HAVE_LIBAIO) || defined(HAVE_POSIXAIO)
7c673cae
FG
96 std::list<aio_t> pending_aios; ///< not yet submitted
97 std::list<aio_t> running_aios; ///< submitting or submitted
11fdf7f2 98#endif
7c673cae
FG
99 std::atomic_int num_pending = {0};
100 std::atomic_int num_running = {0};
b32b8144 101 bool allow_eio;
20effc67 102 uint32_t flags = 0; // FLAG_*
7c673cae 103
b32b8144
FG
104 explicit IOContext(CephContext* cct, void *p, bool allow_eio = false)
105 : cct(cct), priv(p), allow_eio(allow_eio)
7c673cae
FG
106 {}
107
108 // no copying
109 IOContext(const IOContext& other) = delete;
110 IOContext &operator=(const IOContext& other) = delete;
111
112 bool has_pending_aios() {
113 return num_pending.load();
114 }
11fdf7f2 115 void release_running_aios();
7c673cae 116 void aio_wait();
11fdf7f2 117 uint64_t get_num_ios() const;
7c673cae 118
31f18b77 119 void try_aio_wake() {
11fdf7f2
TL
120 assert(num_running >= 1);
121
122 std::lock_guard l(lock);
123 if (num_running.fetch_sub(1) == 1) {
31f18b77
FG
124
125 // we might have some pending IOs submitted after the check
126 // as there is no lock protection for aio_submit.
127 // Hence we might have false conditional trigger.
128 // aio_wait has to handle that hence do not care here.
31f18b77 129 cond.notify_all();
31f18b77 130 }
7c673cae 131 }
b32b8144
FG
132
133 void set_return_value(int _r) {
134 r = _r;
135 }
136
137 int get_return_value() const {
138 return r;
139 }
20effc67
TL
140
141 bool skip_cache() const {
142 return flags & FLAG_DONT_CACHE;
143 }
7c673cae
FG
144};
145
146
147class BlockDevice {
148public:
149 CephContext* cct;
11fdf7f2 150 typedef void (*aio_callback_t)(void *handle, void *aio);
7c673cae 151private:
11fdf7f2 152 ceph::mutex ioc_reap_lock = ceph::make_mutex("BlockDevice::ioc_reap_lock");
7c673cae
FG
153 std::vector<IOContext*> ioc_reap_queue;
154 std::atomic_int ioc_reap_count = {0};
f67539c2
TL
155 enum class block_device_t {
156 unknown,
157#if defined(HAVE_LIBAIO) || defined(HAVE_POSIXAIO)
158 aio,
159#if defined(HAVE_LIBZBD)
160 hm_smr,
161#endif
162#endif
163#if defined(HAVE_SPDK)
164 spdk,
165#endif
166#if defined(HAVE_BLUESTORE_PMEM)
167 pmem,
168#endif
169 };
170 static block_device_t detect_device_type(const std::string& path);
171 static block_device_t device_type_from_name(const std::string& blk_dev_name);
172 static BlockDevice *create_with_type(block_device_t device_type,
173 CephContext* cct, const std::string& path, aio_callback_t cb,
174 void *cbpriv, aio_callback_t d_cb, void *d_cbpriv);
7c673cae
FG
175
176protected:
9f95a23c
TL
177 uint64_t size = 0;
178 uint64_t block_size = 0;
20effc67 179 uint64_t optimal_io_size = 0;
11fdf7f2 180 bool support_discard = false;
7c673cae 181 bool rotational = true;
11fdf7f2 182 bool lock_exclusive = true;
7c673cae 183
f67539c2
TL
184 // HM-SMR specific properties. In HM-SMR drives the LBA space is divided into
185 // fixed-size zones. Typically, the first few zones are randomly writable;
186 // they form a conventional region of the drive. The remaining zones must be
187 // written sequentially and they must be reset before rewritten. For example,
188 // a 14 TB HGST HSH721414AL drive has 52156 zones each of size is 256 MiB.
189 // The zones 0-523 are randomly writable and they form the conventional region
190 // of the drive. The zones 524-52155 are sequential zones.
191 uint64_t conventional_region_size = 0;
192 uint64_t zone_size = 0;
193
7c673cae 194public:
11fdf7f2
TL
195 aio_callback_t aio_callback;
196 void *aio_callback_priv;
197 BlockDevice(CephContext* cct, aio_callback_t cb, void *cbpriv)
198 : cct(cct),
11fdf7f2
TL
199 aio_callback(cb),
200 aio_callback_priv(cbpriv)
201 {}
7c673cae 202 virtual ~BlockDevice() = default;
7c673cae
FG
203
204 static BlockDevice *create(
11fdf7f2 205 CephContext* cct, const std::string& path, aio_callback_t cb, void *cbpriv, aio_callback_t d_cb, void *d_cbpriv);
7c673cae
FG
206 virtual bool supported_bdev_label() { return true; }
207 virtual bool is_rotational() { return rotational; }
208
f67539c2
TL
209 // HM-SMR-specific calls
210 virtual bool is_smr() const { return false; }
211 virtual uint64_t get_zone_size() const {
212 ceph_assert(is_smr());
213 return zone_size;
214 }
215 virtual uint64_t get_conventional_region_size() const {
216 ceph_assert(is_smr());
217 return conventional_region_size;
218 }
20effc67
TL
219 virtual void reset_all_zones() {
220 ceph_assert(is_smr());
221 }
222 virtual void reset_zone(uint64_t zone) {
223 ceph_assert(is_smr());
224 }
225 virtual std::vector<uint64_t> get_zones() {
226 ceph_assert(is_smr());
227 return std::vector<uint64_t>();
228 }
f67539c2 229
7c673cae
FG
230 virtual void aio_submit(IOContext *ioc) = 0;
231
11fdf7f2
TL
232 void set_no_exclusive_lock() {
233 lock_exclusive = false;
234 }
235
236 uint64_t get_size() const { return size; }
237 uint64_t get_block_size() const { return block_size; }
20effc67 238 uint64_t get_optimal_io_size() const { return optimal_io_size; }
11fdf7f2
TL
239
240 /// hook to provide utilization of thinly-provisioned device
1e59de90
TL
241 virtual int get_ebd_state(ExtBlkDevState &state) const {
242 return -ENOENT;
11fdf7f2 243 }
7c673cae 244
11fdf7f2
TL
245 virtual int collect_metadata(const std::string& prefix, std::map<std::string,std::string> *pm) const = 0;
246
9f95a23c 247 virtual int get_devname(std::string *out) const {
11fdf7f2
TL
248 return -ENOENT;
249 }
9f95a23c 250 virtual int get_devices(std::set<std::string> *ls) const {
11fdf7f2
TL
251 std::string s;
252 if (get_devname(&s) == 0) {
253 ls->insert(s);
254 }
255 return 0;
256 }
257 virtual int get_numa_node(int *node) const {
258 return -EOPNOTSUPP;
259 }
7c673cae
FG
260
261 virtual int read(
262 uint64_t off,
263 uint64_t len,
f67539c2 264 ceph::buffer::list *pbl,
7c673cae
FG
265 IOContext *ioc,
266 bool buffered) = 0;
267 virtual int read_random(
268 uint64_t off,
269 uint64_t len,
270 char *buf,
271 bool buffered) = 0;
272 virtual int write(
273 uint64_t off,
f67539c2 274 ceph::buffer::list& bl,
11fdf7f2
TL
275 bool buffered,
276 int write_hint = WRITE_LIFE_NOT_SET) = 0;
7c673cae
FG
277
278 virtual int aio_read(
279 uint64_t off,
280 uint64_t len,
f67539c2 281 ceph::buffer::list *pbl,
7c673cae
FG
282 IOContext *ioc) = 0;
283 virtual int aio_write(
284 uint64_t off,
f67539c2 285 ceph::buffer::list& bl,
7c673cae 286 IOContext *ioc,
11fdf7f2
TL
287 bool buffered,
288 int write_hint = WRITE_LIFE_NOT_SET) = 0;
7c673cae 289 virtual int flush() = 0;
1e59de90 290 virtual bool try_discard(interval_set<uint64_t> &to_release, bool async=true) { return false; }
11fdf7f2 291 virtual void discard_drain() { return; }
7c673cae 292
7c673cae
FG
293 // for managing buffered readers/writers
294 virtual int invalidate_cache(uint64_t off, uint64_t len) = 0;
295 virtual int open(const std::string& path) = 0;
296 virtual void close() = 0;
11fdf7f2 297
20effc67
TL
298 struct hugepaged_raw_marker_t {};
299
11fdf7f2 300protected:
f67539c2 301 bool is_valid_io(uint64_t off, uint64_t len) const;
7c673cae
FG
302};
303
f67539c2 304#endif //CEPH_BLK_BLOCKDEVICE_H