]> git.proxmox.com Git - ceph.git/blame - ceph/src/os/bluestore/BlockDevice.h
update sources to ceph Nautilus 14.2.1
[ceph.git] / ceph / src / os / bluestore / BlockDevice.h
CommitLineData
7c673cae
FG
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2015 XSky <haomai@xsky.com>
 *
 * Author: Haomai Wang <haomaiwang@gmail.com>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */
16
17#ifndef CEPH_OS_BLUESTORE_BLOCKDEVICE_H
18#define CEPH_OS_BLUESTORE_BLOCKDEVICE_H
19
20#include <atomic>
21#include <condition_variable>
7c673cae 22#include <list>
11fdf7f2
TL
23#include <map>
24#include <mutex>
25#include <set>
26#include <string>
27#include <vector>
7c673cae
FG
28
29#include "acconfig.h"
11fdf7f2 30#include "common/ceph_mutex.h"
7c673cae 31
11fdf7f2
TL
32#if defined(HAVE_LIBAIO) || defined(HAVE_POSIXAIO)
33#include "ceph_aio.h"
34#endif
35#include "include/ceph_assert.h"
36#include "include/buffer.h"
37#include "include/interval_set.h"
7c673cae
FG
38#define SPDK_PREFIX "spdk:"
39
11fdf7f2
TL
// Write-lifetime hints passed as the `write_hint` argument of
// BlockDevice::write() / BlockDevice::aio_write().
#if defined(__linux__)
// Older userspace headers may not provide the fcntl command for per-file
// write hints; supply it ourselves in that case.
#if !defined(F_SET_FILE_RW_HINT)
#if !defined(F_LINUX_SPECIFIC_BASE)
#define F_LINUX_SPECIFIC_BASE 1024
#endif
#define F_SET_FILE_RW_HINT (F_LINUX_SPECIFIC_BASE + 14)
#endif
// These values match the Linux definition:
// https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/include/uapi/linux/fcntl.h#n56
#define WRITE_LIFE_NOT_SET  0   // No hint information set
#define WRITE_LIFE_NONE     1   // No hints about write life time
#define WRITE_LIFE_SHORT    2   // Data written has a short life time
#define WRITE_LIFE_MEDIUM   3   // Data written has a medium life time
#define WRITE_LIFE_LONG     4   // Data written has a long life time
#define WRITE_LIFE_EXTREME  5   // Data written has an extremely long life time
#define WRITE_LIFE_MAX      6   // Number of distinct hint values
#else
// Systems that do not have WRITE_LIFE_* hints use only one FD, and all
// files are created equal: every hint collapses to the same value.
#define WRITE_LIFE_NOT_SET  0   // No hint information set
#define WRITE_LIFE_NONE     0   // No hints about write life time
#define WRITE_LIFE_SHORT    0   // Data written has a short life time
#define WRITE_LIFE_MEDIUM   0   // Data written has a medium life time
#define WRITE_LIFE_LONG     0   // Data written has a long life time
#define WRITE_LIFE_EXTREME  0   // Data written has an extremely long life time
#define WRITE_LIFE_MAX      1   // Number of distinct hint values
#endif
65
66class CephContext;
67
7c673cae
FG
68/// track in-flight io
69struct IOContext {
31f18b77 70private:
11fdf7f2
TL
71 ceph::mutex lock = ceph::make_mutex("IOContext::lock");
72 ceph::condition_variable cond;
b32b8144 73 int r = 0;
31f18b77
FG
74
75public:
7c673cae
FG
76 CephContext* cct;
77 void *priv;
78#ifdef HAVE_SPDK
79 void *nvme_task_first = nullptr;
80 void *nvme_task_last = nullptr;
11fdf7f2 81 std::atomic_int total_nseg = {0};
7c673cae
FG
82#endif
83
11fdf7f2 84#if defined(HAVE_LIBAIO) || defined(HAVE_POSIXAIO)
7c673cae
FG
85 std::list<aio_t> pending_aios; ///< not yet submitted
86 std::list<aio_t> running_aios; ///< submitting or submitted
11fdf7f2 87#endif
7c673cae
FG
88 std::atomic_int num_pending = {0};
89 std::atomic_int num_running = {0};
b32b8144 90 bool allow_eio;
7c673cae 91
b32b8144
FG
92 explicit IOContext(CephContext* cct, void *p, bool allow_eio = false)
93 : cct(cct), priv(p), allow_eio(allow_eio)
7c673cae
FG
94 {}
95
96 // no copying
97 IOContext(const IOContext& other) = delete;
98 IOContext &operator=(const IOContext& other) = delete;
99
100 bool has_pending_aios() {
101 return num_pending.load();
102 }
11fdf7f2 103 void release_running_aios();
7c673cae 104 void aio_wait();
11fdf7f2 105 uint64_t get_num_ios() const;
7c673cae 106
31f18b77 107 void try_aio_wake() {
11fdf7f2
TL
108 assert(num_running >= 1);
109
110 std::lock_guard l(lock);
111 if (num_running.fetch_sub(1) == 1) {
31f18b77
FG
112
113 // we might have some pending IOs submitted after the check
114 // as there is no lock protection for aio_submit.
115 // Hence we might have false conditional trigger.
116 // aio_wait has to handle that hence do not care here.
31f18b77 117 cond.notify_all();
31f18b77 118 }
7c673cae 119 }
b32b8144
FG
120
121 void set_return_value(int _r) {
122 r = _r;
123 }
124
125 int get_return_value() const {
126 return r;
127 }
7c673cae
FG
128};
129
130
131class BlockDevice {
132public:
133 CephContext* cct;
11fdf7f2 134 typedef void (*aio_callback_t)(void *handle, void *aio);
7c673cae 135private:
11fdf7f2 136 ceph::mutex ioc_reap_lock = ceph::make_mutex("BlockDevice::ioc_reap_lock");
7c673cae
FG
137 std::vector<IOContext*> ioc_reap_queue;
138 std::atomic_int ioc_reap_count = {0};
139
140protected:
11fdf7f2
TL
141 uint64_t size;
142 uint64_t block_size;
143 bool support_discard = false;
7c673cae 144 bool rotational = true;
11fdf7f2 145 bool lock_exclusive = true;
7c673cae
FG
146
147public:
11fdf7f2
TL
148 aio_callback_t aio_callback;
149 void *aio_callback_priv;
150 BlockDevice(CephContext* cct, aio_callback_t cb, void *cbpriv)
151 : cct(cct),
152 size(0),
153 block_size(0),
154 aio_callback(cb),
155 aio_callback_priv(cbpriv)
156 {}
7c673cae 157 virtual ~BlockDevice() = default;
7c673cae
FG
158
159 static BlockDevice *create(
11fdf7f2 160 CephContext* cct, const std::string& path, aio_callback_t cb, void *cbpriv, aio_callback_t d_cb, void *d_cbpriv);
7c673cae
FG
161 virtual bool supported_bdev_label() { return true; }
162 virtual bool is_rotational() { return rotational; }
163
164 virtual void aio_submit(IOContext *ioc) = 0;
165
11fdf7f2
TL
166 void set_no_exclusive_lock() {
167 lock_exclusive = false;
168 }
169
170 uint64_t get_size() const { return size; }
171 uint64_t get_block_size() const { return block_size; }
172
173 /// hook to provide utilization of thinly-provisioned device
174 virtual bool get_thin_utilization(uint64_t *total, uint64_t *avail) const {
175 return false;
176 }
7c673cae 177
11fdf7f2
TL
178 virtual int collect_metadata(const std::string& prefix, std::map<std::string,std::string> *pm) const = 0;
179
180 virtual int get_devname(std::string *out) {
181 return -ENOENT;
182 }
183 virtual int get_devices(std::set<std::string> *ls) {
184 std::string s;
185 if (get_devname(&s) == 0) {
186 ls->insert(s);
187 }
188 return 0;
189 }
190 virtual int get_numa_node(int *node) const {
191 return -EOPNOTSUPP;
192 }
7c673cae
FG
193
194 virtual int read(
195 uint64_t off,
196 uint64_t len,
197 bufferlist *pbl,
198 IOContext *ioc,
199 bool buffered) = 0;
200 virtual int read_random(
201 uint64_t off,
202 uint64_t len,
203 char *buf,
204 bool buffered) = 0;
205 virtual int write(
206 uint64_t off,
207 bufferlist& bl,
11fdf7f2
TL
208 bool buffered,
209 int write_hint = WRITE_LIFE_NOT_SET) = 0;
7c673cae
FG
210
211 virtual int aio_read(
212 uint64_t off,
213 uint64_t len,
214 bufferlist *pbl,
215 IOContext *ioc) = 0;
216 virtual int aio_write(
217 uint64_t off,
218 bufferlist& bl,
219 IOContext *ioc,
11fdf7f2
TL
220 bool buffered,
221 int write_hint = WRITE_LIFE_NOT_SET) = 0;
7c673cae 222 virtual int flush() = 0;
11fdf7f2
TL
223 virtual int discard(uint64_t offset, uint64_t len) { return 0; }
224 virtual int queue_discard(interval_set<uint64_t> &to_release) { return -1; }
225 virtual void discard_drain() { return; }
7c673cae
FG
226
227 void queue_reap_ioc(IOContext *ioc);
228 void reap_ioc();
229
230 // for managing buffered readers/writers
231 virtual int invalidate_cache(uint64_t off, uint64_t len) = 0;
232 virtual int open(const std::string& path) = 0;
233 virtual void close() = 0;
11fdf7f2
TL
234
235protected:
236 bool is_valid_io(uint64_t off, uint64_t len) const {
237 return (off % block_size == 0 &&
238 len % block_size == 0 &&
239 len > 0 &&
240 off < size &&
241 off + len <= size);
242 }
7c673cae
FG
243};
244
245#endif //CEPH_OS_BLUESTORE_BLOCKDEVICE_H