// Source: git.proxmox.com Git - ceph.git/blob - ceph/src/crimson/os/seastore/seastore.h
// Revision: import quincy beta 17.1.0
// Path: [ceph.git] / ceph / src / crimson / os / seastore / seastore.h
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3
4 #pragma once
5
6 #include <string>
7 #include <unordered_map>
8 #include <map>
9 #include <typeinfo>
10 #include <vector>
11
12 #include <optional>
13 #include <seastar/core/future.hh>
14
15 #include "include/uuid.h"
16
17 #include "os/Transaction.h"
18 #include "crimson/os/futurized_collection.h"
19 #include "crimson/os/futurized_store.h"
20
21 #include "crimson/os/seastore/transaction.h"
22 #include "crimson/os/seastore/onode_manager.h"
23 #include "crimson/os/seastore/omap_manager.h"
24 #include "crimson/os/seastore/collection_manager.h"
25
26 namespace crimson::os::seastore {
27
28 class Onode;
29 using OnodeRef = boost::intrusive_ptr<Onode>;
30 class TransactionManager;
31
32 class SeastoreCollection final : public FuturizedCollection {
33 public:
34 template <typename... T>
35 SeastoreCollection(T&&... args) :
36 FuturizedCollection(std::forward<T>(args)...) {}
37
38 seastar::shared_mutex ordering_lock;
39 };
40
41 class SeaStore final : public FuturizedStore {
42 public:
43 class MDStore {
44 public:
45 using base_iertr = crimson::errorator<
46 crimson::ct_error::input_output_error
47 >;
48
49 using write_meta_ertr = base_iertr;
50 using write_meta_ret = write_meta_ertr::future<>;
51 virtual write_meta_ret write_meta(
52 const std::string &key,
53 const std::string &val
54 ) = 0;
55
56 using read_meta_ertr = base_iertr;
57 using read_meta_ret = write_meta_ertr::future<std::optional<std::string>>;
58 virtual read_meta_ret read_meta(const std::string &key) = 0;
59
60 virtual ~MDStore() {}
61 };
62 using MDStoreRef = std::unique_ptr<MDStore>;
63
64 SeaStore(
65 const std::string& root,
66 MDStoreRef mdstore,
67 SegmentManagerRef sm,
68 TransactionManagerRef tm,
69 CollectionManagerRef cm,
70 OnodeManagerRef om);
71 SeaStore(
72 const std::string& root,
73 SegmentManagerRef sm,
74 TransactionManagerRef tm,
75 CollectionManagerRef cm,
76 OnodeManagerRef om);
77 ~SeaStore();
78
79 seastar::future<> stop() final;
80 mount_ertr::future<> mount() final;
81 seastar::future<> umount() final;
82
83 mkfs_ertr::future<> mkfs(uuid_d new_osd_fsid) final;
84 seastar::future<store_statfs_t> stat() const final;
85
86 read_errorator::future<ceph::bufferlist> read(
87 CollectionRef c,
88 const ghobject_t& oid,
89 uint64_t offset,
90 size_t len,
91 uint32_t op_flags = 0) final;
92 read_errorator::future<ceph::bufferlist> readv(
93 CollectionRef c,
94 const ghobject_t& oid,
95 interval_set<uint64_t>& m,
96 uint32_t op_flags = 0) final;
97 get_attr_errorator::future<ceph::bufferlist> get_attr(
98 CollectionRef c,
99 const ghobject_t& oid,
100 std::string_view name) const final;
101 get_attrs_ertr::future<attrs_t> get_attrs(
102 CollectionRef c,
103 const ghobject_t& oid) final;
104
105 seastar::future<struct stat> stat(
106 CollectionRef c,
107 const ghobject_t& oid) final;
108
109 read_errorator::future<omap_values_t> omap_get_values(
110 CollectionRef c,
111 const ghobject_t& oid,
112 const omap_keys_t& keys) final;
113
114 /// Retrieves paged set of values > start (if present)
115 using omap_get_values_ret_bare_t = std::tuple<bool, omap_values_t>;
116 using omap_get_values_ret_t = read_errorator::future<
117 omap_get_values_ret_bare_t>;
118 omap_get_values_ret_t omap_get_values(
119 CollectionRef c, ///< [in] collection
120 const ghobject_t &oid, ///< [in] oid
121 const std::optional<std::string> &start ///< [in] start, empty for begin
122 ) final; ///< @return <done, values> values.empty() iff done
123
124 read_errorator::future<bufferlist> omap_get_header(
125 CollectionRef c,
126 const ghobject_t& oid) final;
127
128 seastar::future<std::tuple<std::vector<ghobject_t>, ghobject_t>> list_objects(
129 CollectionRef c,
130 const ghobject_t& start,
131 const ghobject_t& end,
132 uint64_t limit) const final;
133
134 seastar::future<CollectionRef> create_new_collection(const coll_t& cid) final;
135 seastar::future<CollectionRef> open_collection(const coll_t& cid) final;
136 seastar::future<std::vector<coll_t>> list_collections() final;
137
138 seastar::future<> do_transaction(
139 CollectionRef ch,
140 ceph::os::Transaction&& txn) final;
141
142 seastar::future<OmapIteratorRef> get_omap_iterator(
143 CollectionRef ch,
144 const ghobject_t& oid) final;
145 seastar::future<std::map<uint64_t, uint64_t>> fiemap(
146 CollectionRef ch,
147 const ghobject_t& oid,
148 uint64_t off,
149 uint64_t len) final;
150
151 seastar::future<> write_meta(const std::string& key,
152 const std::string& value) final;
153 seastar::future<std::tuple<int, std::string>> read_meta(const std::string& key) final;
154 uuid_d get_fsid() const final;
155
156 unsigned get_max_attr_name_length() const final {
157 return 256;
158 }
/// Operation kinds for which a latency histogram slot exists.
/// The numeric values double as indices into the latency array, so they must
/// stay dense and start at 0.
enum class op_type_t : uint8_t {
  TRANSACTION     = 0,
  READ            = 1,
  WRITE           = 2,
  GET_ATTR        = 3,
  GET_ATTRS       = 4,
  STAT            = 5,
  OMAP_GET_VALUES = 6,
  OMAP_LIST       = 7,
  MAX             = 8   ///< count of the kinds above; not an operation itself
};
170
171 private:
172 struct internal_context_t {
173 CollectionRef ch;
174 ceph::os::Transaction ext_transaction;
175
176 internal_context_t(
177 CollectionRef ch,
178 ceph::os::Transaction &&_ext_transaction,
179 TransactionRef &&transaction)
180 : ch(ch), ext_transaction(std::move(_ext_transaction)),
181 transaction(std::move(transaction)),
182 iter(ext_transaction.begin()) {}
183
184 TransactionRef transaction;
185
186 ceph::os::Transaction::iterator iter;
187 std::chrono::steady_clock::time_point begin_timestamp = std::chrono::steady_clock::now();
188
189 void reset_preserve_handle(TransactionManager &tm) {
190 tm.reset_transaction_preserve_handle(*transaction);
191 iter = ext_transaction.begin();
192 }
193 };
194
195 static void on_error(ceph::os::Transaction &t);
196
197 template <typename F>
198 auto repeat_with_internal_context(
199 CollectionRef ch,
200 ceph::os::Transaction &&t,
201 Transaction::src_t src,
202 const char* tname,
203 op_type_t op_type,
204 F &&f) {
205 return seastar::do_with(
206 internal_context_t(
207 ch, std::move(t),
208 transaction_manager->create_transaction(src, tname)),
209 std::forward<F>(f),
210 [this, op_type](auto &ctx, auto &f) {
211 return ctx.transaction->get_handle().take_collection_lock(
212 static_cast<SeastoreCollection&>(*(ctx.ch)).ordering_lock
213 ).then([&, this] {
214 return repeat_eagain([&, this] {
215 ctx.reset_preserve_handle(*transaction_manager);
216 return std::invoke(f, ctx);
217 }).handle_error(
218 crimson::ct_error::eagain::pass_further{},
219 crimson::ct_error::all_same_way([&ctx](auto e) {
220 on_error(ctx.ext_transaction);
221 })
222 );
223 }).then([this, op_type, &ctx] {
224 add_latency_sample(op_type,
225 std::chrono::steady_clock::now() - ctx.begin_timestamp);
226 });
227 }
228 );
229 }
230
231 template <typename Ret, typename F>
232 auto repeat_with_onode(
233 CollectionRef ch,
234 const ghobject_t &oid,
235 Transaction::src_t src,
236 const char* tname,
237 op_type_t op_type,
238 F &&f) const {
239 auto begin_time = std::chrono::steady_clock::now();
240 return seastar::do_with(
241 oid, Ret{}, std::forward<F>(f),
242 [this, src, op_type, begin_time, tname
243 ](auto &oid, auto &ret, auto &f)
244 {
245 return repeat_eagain([&, this, src, tname] {
246 return transaction_manager->with_transaction_intr(
247 src,
248 tname,
249 [&, this](auto& t)
250 {
251 return onode_manager->get_onode(t, oid
252 ).si_then([&](auto onode) {
253 return seastar::do_with(std::move(onode), [&](auto& onode) {
254 return f(t, *onode);
255 });
256 }).si_then([&ret](auto _ret) {
257 ret = _ret;
258 });
259 });
260 }).safe_then([&ret, op_type, begin_time, this] {
261 const_cast<SeaStore*>(this)->add_latency_sample(op_type,
262 std::chrono::steady_clock::now() - begin_time);
263 return seastar::make_ready_future<Ret>(ret);
264 });
265 });
266 }
267
268 using _omap_get_value_iertr = OMapManager::base_iertr::extend<
269 crimson::ct_error::enodata
270 >;
271 using _omap_get_value_ret = _omap_get_value_iertr::future<ceph::bufferlist>;
272 _omap_get_value_ret _omap_get_value(
273 Transaction &t,
274 omap_root_t &&root,
275 std::string_view key) const;
276
277 using _omap_get_values_iertr = OMapManager::base_iertr;
278 using _omap_get_values_ret = _omap_get_values_iertr::future<omap_values_t>;
279 _omap_get_values_ret _omap_get_values(
280 Transaction &t,
281 omap_root_t &&root,
282 const omap_keys_t &keys) const;
283
284 using _omap_list_bare_ret = OMapManager::omap_list_bare_ret;
285 using _omap_list_ret = OMapManager::omap_list_ret;
286 _omap_list_ret _omap_list(
287 Onode &onode,
288 const omap_root_le_t& omap_root,
289 Transaction& t,
290 const std::optional<std::string>& start,
291 OMapManager::omap_list_config_t config) const;
292
293 friend class SeaStoreOmapIterator;
294 omap_get_values_ret_t omap_list(
295 CollectionRef ch,
296 const ghobject_t &oid,
297 const std::optional<std::string> &_start,
298 OMapManager::omap_list_config_t config);
299
300 std::string root;
301 MDStoreRef mdstore;
302 SegmentManagerRef segment_manager;
303 std::vector<SegmentManagerRef> secondaries;
304 TransactionManagerRef transaction_manager;
305 CollectionManagerRef collection_manager;
306 OnodeManagerRef onode_manager;
307 const uint32_t max_object_size = 0;
308
309 using tm_iertr = TransactionManager::base_iertr;
310 using tm_ret = tm_iertr::future<>;
311 tm_ret _do_transaction_step(
312 internal_context_t &ctx,
313 CollectionRef &col,
314 std::vector<OnodeRef> &onodes,
315 ceph::os::Transaction::iterator &i);
316
317 tm_ret _remove(
318 internal_context_t &ctx,
319 OnodeRef &onode);
320 tm_ret _touch(
321 internal_context_t &ctx,
322 OnodeRef &onode);
323 tm_ret _write(
324 internal_context_t &ctx,
325 OnodeRef &onode,
326 uint64_t offset, size_t len,
327 ceph::bufferlist &&bl,
328 uint32_t fadvise_flags);
329 tm_ret _omap_set_values(
330 internal_context_t &ctx,
331 OnodeRef &onode,
332 std::map<std::string, ceph::bufferlist> &&aset);
333 tm_ret _omap_set_header(
334 internal_context_t &ctx,
335 OnodeRef &onode,
336 ceph::bufferlist &&header);
337 tm_ret _omap_rmkeys(
338 internal_context_t &ctx,
339 OnodeRef &onode,
340 omap_keys_t &&aset);
341 tm_ret _omap_rmkeyrange(
342 internal_context_t &ctx,
343 OnodeRef &onode,
344 std::string first,
345 std::string last);
346 tm_ret _truncate(
347 internal_context_t &ctx,
348 OnodeRef &onode, uint64_t size);
349 tm_ret _setattrs(
350 internal_context_t &ctx,
351 OnodeRef &onode,
352 std::map<std::string,bufferlist>&& aset);
353 tm_ret _create_collection(
354 internal_context_t &ctx,
355 const coll_t& cid, int bits);
356 tm_ret _remove_collection(
357 internal_context_t &ctx,
358 const coll_t& cid);
359 using omap_set_kvs_ret = tm_iertr::future<>;
360 omap_set_kvs_ret _omap_set_kvs(
361 OnodeRef &onode,
362 const omap_root_le_t& omap_root,
363 Transaction& t,
364 omap_root_le_t& mutable_omap_root,
365 std::map<std::string, ceph::bufferlist>&& kvs);
366
367 boost::intrusive_ptr<SeastoreCollection> _get_collection(const coll_t& cid);
368
369 static constexpr auto LAT_MAX = static_cast<std::size_t>(op_type_t::MAX);
370 struct {
371 std::array<seastar::metrics::histogram, LAT_MAX> op_lat;
372 } stats;
373
374 seastar::metrics::histogram& get_latency(
375 op_type_t op_type) {
376 assert(static_cast<std::size_t>(op_type) < stats.op_lat.size());
377 return stats.op_lat[static_cast<std::size_t>(op_type)];
378 }
379
380 void add_latency_sample(op_type_t op_type,
381 std::chrono::steady_clock::duration dur) {
382 seastar::metrics::histogram& lat = get_latency(op_type);
383 lat.sample_count++;
384 lat.sample_sum += std::chrono::duration_cast<std::chrono::milliseconds>(dur).count();
385 }
386 seastar::metrics::metric_group metrics;
387 void register_metrics();
388 seastar::future<> write_fsid(uuid_d new_osd_fsid);
389 };
390
391 seastar::future<std::unique_ptr<SeaStore>> make_seastore(
392 const std::string &device,
393 const ConfigValues &config);
394 }