]> git.proxmox.com Git - ceph.git/blame - ceph/src/crimson/os/seastore/seastore.h
update ceph source to reef 18.2.1
[ceph.git] / ceph / src / crimson / os / seastore / seastore.h
CommitLineData
f67539c2
TL
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3
4#pragma once
5
6#include <string>
7#include <unordered_map>
8#include <map>
9#include <typeinfo>
10#include <vector>
11
12#include <optional>
13#include <seastar/core/future.hh>
1e59de90 14#include <seastar/core/metrics_types.hh>
f67539c2 15
f67539c2
TL
16#include "include/uuid.h"
17
18#include "os/Transaction.h"
1e59de90 19#include "crimson/common/throttle.h"
20effc67 20#include "crimson/os/futurized_collection.h"
f67539c2 21#include "crimson/os/futurized_store.h"
20effc67 22
1e59de90 23#include "crimson/os/seastore/device.h"
20effc67
TL
24#include "crimson/os/seastore/transaction.h"
25#include "crimson/os/seastore/onode_manager.h"
26#include "crimson/os/seastore/omap_manager.h"
27#include "crimson/os/seastore/collection_manager.h"
1e59de90 28#include "crimson/os/seastore/object_data_handler.h"
f67539c2
TL
29
30namespace crimson::os::seastore {
31
f67539c2
TL
32class Onode;
33using OnodeRef = boost::intrusive_ptr<Onode>;
f67539c2 34class TransactionManager;
f67539c2 35
1e59de90
TL
/**
 * op_type_t
 *
 * Tags a latency sample with the kind of operation it was measured for.
 * The numeric values are used as array indices, and MAX is a sentinel
 * equal to the number of real operation kinds, so it must stay last.
 */
enum class op_type_t : uint8_t {
  TRANSACTION     = 0,
  READ            = 1,
  WRITE           = 2,
  GET_ATTR        = 3,
  GET_ATTRS       = 4,
  STAT            = 5,
  OMAP_GET_VALUES = 6,
  OMAP_LIST       = 7,
  MAX             = 8
};
47
20effc67
TL
48class SeastoreCollection final : public FuturizedCollection {
49public:
50 template <typename... T>
51 SeastoreCollection(T&&... args) :
52 FuturizedCollection(std::forward<T>(args)...) {}
53
54 seastar::shared_mutex ordering_lock;
55};
f67539c2 56
1e59de90
TL
57/**
58 * col_obj_ranges_t
59 *
60 * Represents the two ghobject_t ranges spanned by a PG collection.
61 * Temp objects will be within [temp_begin, temp_end) and normal objects
62 * will be in [obj_begin, obj_end).
63 */
64struct col_obj_ranges_t {
65 ghobject_t temp_begin;
66 ghobject_t temp_end;
67 ghobject_t obj_begin;
68 ghobject_t obj_end;
69};
70
20effc67 71class SeaStore final : public FuturizedStore {
f67539c2 72public:
20effc67
TL
73 class MDStore {
74 public:
75 using base_iertr = crimson::errorator<
76 crimson::ct_error::input_output_error
77 >;
78
79 using write_meta_ertr = base_iertr;
80 using write_meta_ret = write_meta_ertr::future<>;
81 virtual write_meta_ret write_meta(
82 const std::string &key,
83 const std::string &val
84 ) = 0;
85
86 using read_meta_ertr = base_iertr;
87 using read_meta_ret = write_meta_ertr::future<std::optional<std::string>>;
88 virtual read_meta_ret read_meta(const std::string &key) = 0;
f67539c2 89
20effc67
TL
90 virtual ~MDStore() {}
91 };
92 using MDStoreRef = std::unique_ptr<MDStore>;
f67539c2 93
1e59de90
TL
94 class Shard : public FuturizedStore::Shard {
95 public:
96 Shard(
97 std::string root,
98 Device* device,
99 bool is_test);
100 ~Shard() = default;
101
102 seastar::future<struct stat> stat(
103 CollectionRef c,
104 const ghobject_t& oid) final;
105
106 read_errorator::future<ceph::bufferlist> read(
107 CollectionRef c,
108 const ghobject_t& oid,
109 uint64_t offset,
110 size_t len,
111 uint32_t op_flags = 0) final;
112
113 read_errorator::future<ceph::bufferlist> readv(
114 CollectionRef c,
115 const ghobject_t& oid,
116 interval_set<uint64_t>& m,
117 uint32_t op_flags = 0) final;
118
119 get_attr_errorator::future<ceph::bufferlist> get_attr(
120 CollectionRef c,
121 const ghobject_t& oid,
122 std::string_view name) const final;
123
124 get_attrs_ertr::future<attrs_t> get_attrs(
125 CollectionRef c,
126 const ghobject_t& oid) final;
127
128 read_errorator::future<omap_values_t> omap_get_values(
129 CollectionRef c,
130 const ghobject_t& oid,
131 const omap_keys_t& keys) final;
132
133 /// Retrieves paged set of values > start (if present)
134 using omap_get_values_ret_bare_t = std::tuple<bool, omap_values_t>;
135 using omap_get_values_ret_t = read_errorator::future<
136 omap_get_values_ret_bare_t>;
137 omap_get_values_ret_t omap_get_values(
138 CollectionRef c, ///< [in] collection
139 const ghobject_t &oid, ///< [in] oid
140 const std::optional<std::string> &start ///< [in] start, empty for begin
141 ) final; ///< @return <done, values> values.empty() iff done
142
143 get_attr_errorator::future<bufferlist> omap_get_header(
144 CollectionRef c,
145 const ghobject_t& oid) final;
146
147 seastar::future<std::tuple<std::vector<ghobject_t>, ghobject_t>> list_objects(
148 CollectionRef c,
149 const ghobject_t& start,
150 const ghobject_t& end,
151 uint64_t limit) const final;
152
153 seastar::future<CollectionRef> create_new_collection(const coll_t& cid) final;
154 seastar::future<CollectionRef> open_collection(const coll_t& cid) final;
155
156 seastar::future<> do_transaction_no_callbacks(
157 CollectionRef ch,
158 ceph::os::Transaction&& txn) final;
f67539c2 159
1e59de90
TL
160 /* Note, flush() machinery must go through the same pipeline
161 * stages and locks as do_transaction. */
162 seastar::future<> flush(CollectionRef ch) final;
f67539c2 163
1e59de90
TL
164 read_errorator::future<std::map<uint64_t, uint64_t>> fiemap(
165 CollectionRef ch,
166 const ghobject_t& oid,
167 uint64_t off,
168 uint64_t len) final;
f67539c2 169
1e59de90
TL
170 unsigned get_max_attr_name_length() const final {
171 return 256;
172 }
f67539c2 173
1e59de90
TL
174 // only exposed to SeaStore
175 public:
176 seastar::future<> umount();
177 // init managers and mount transaction_manager
178 seastar::future<> mount_managers();
20effc67 179
1e59de90
TL
180 void set_secondaries(Device& sec_dev) {
181 secondaries.emplace_back(&sec_dev);
182 }
20effc67 183
1e59de90
TL
184 using coll_core_t = FuturizedStore::coll_core_t;
185 seastar::future<std::vector<coll_core_t>> list_collections();
20effc67 186
1e59de90
TL
187 seastar::future<> write_meta(const std::string& key,
188 const std::string& value);
20effc67 189
1e59de90
TL
190 store_statfs_t stat() const;
191
192 uuid_d get_fsid() const;
193
194 seastar::future<> mkfs_managers();
195
196 void init_managers();
197
198 private:
199 struct internal_context_t {
200 CollectionRef ch;
201 ceph::os::Transaction ext_transaction;
20effc67 202
20effc67 203 internal_context_t(
1e59de90
TL
204 CollectionRef ch,
205 ceph::os::Transaction &&_ext_transaction,
206 TransactionRef &&transaction)
207 : ch(ch), ext_transaction(std::move(_ext_transaction)),
208 transaction(std::move(transaction)),
209 iter(ext_transaction.begin()) {}
210
211 TransactionRef transaction;
212
213 ceph::os::Transaction::iterator iter;
214 std::chrono::steady_clock::time_point begin_timestamp = std::chrono::steady_clock::now();
215
216 void reset_preserve_handle(TransactionManager &tm) {
217 tm.reset_transaction_preserve_handle(*transaction);
218 iter = ext_transaction.begin();
219 }
220 };
221
222 TransactionManager::read_extent_iertr::future<std::optional<unsigned>>
223 get_coll_bits(CollectionRef ch, Transaction &t) const;
224
225 static void on_error(ceph::os::Transaction &t);
226
227 template <typename F>
228 auto repeat_with_internal_context(
229 CollectionRef ch,
230 ceph::os::Transaction &&t,
231 Transaction::src_t src,
232 const char* tname,
233 op_type_t op_type,
234 F &&f) {
235 return seastar::do_with(
236 internal_context_t(
237 ch, std::move(t),
238 transaction_manager->create_transaction(src, tname)),
239 std::forward<F>(f),
240 [this, op_type](auto &ctx, auto &f) {
20effc67
TL
241 return ctx.transaction->get_handle().take_collection_lock(
242 static_cast<SeastoreCollection&>(*(ctx.ch)).ordering_lock
1e59de90
TL
243 ).then([this] {
244 return throttler.get(1);
245 }).then([&, this] {
20effc67
TL
246 return repeat_eagain([&, this] {
247 ctx.reset_preserve_handle(*transaction_manager);
248 return std::invoke(f, ctx);
249 }).handle_error(
250 crimson::ct_error::eagain::pass_further{},
251 crimson::ct_error::all_same_way([&ctx](auto e) {
252 on_error(ctx.ext_transaction);
253 })
254 );
255 }).then([this, op_type, &ctx] {
256 add_latency_sample(op_type,
257 std::chrono::steady_clock::now() - ctx.begin_timestamp);
1e59de90
TL
258 }).finally([this] {
259 throttler.put();
20effc67 260 });
1e59de90
TL
261 });
262 }
20effc67 263
1e59de90
TL
264 template <typename Ret, typename F>
265 auto repeat_with_onode(
266 CollectionRef ch,
267 const ghobject_t &oid,
268 Transaction::src_t src,
269 const char* tname,
270 op_type_t op_type,
271 F &&f) const {
272 auto begin_time = std::chrono::steady_clock::now();
273 return seastar::do_with(
274 oid, Ret{}, std::forward<F>(f),
275 [this, src, op_type, begin_time, tname
276 ](auto &oid, auto &ret, auto &f)
277 {
278 return repeat_eagain([&, this, src, tname] {
279 return transaction_manager->with_transaction_intr(
280 src,
281 tname,
282 [&, this](auto& t)
283 {
284 return onode_manager->get_onode(t, oid
285 ).si_then([&](auto onode) {
286 return seastar::do_with(std::move(onode), [&](auto& onode) {
287 return f(t, *onode);
288 });
289 }).si_then([&ret](auto _ret) {
290 ret = _ret;
20effc67 291 });
20effc67 292 });
1e59de90
TL
293 }).safe_then([&ret, op_type, begin_time, this] {
294 const_cast<Shard*>(this)->add_latency_sample(op_type,
295 std::chrono::steady_clock::now() - begin_time);
296 return seastar::make_ready_future<Ret>(ret);
20effc67 297 });
20effc67 298 });
1e59de90
TL
299 }
300
301 using _fiemap_ret = ObjectDataHandler::fiemap_ret;
302 _fiemap_ret _fiemap(
303 Transaction &t,
304 Onode &onode,
305 uint64_t off,
306 uint64_t len) const;
307
308 using _omap_get_value_iertr = OMapManager::base_iertr::extend<
309 crimson::ct_error::enodata
310 >;
311 using _omap_get_value_ret = _omap_get_value_iertr::future<ceph::bufferlist>;
312 _omap_get_value_ret _omap_get_value(
313 Transaction &t,
314 omap_root_t &&root,
315 std::string_view key) const;
316
317 using _omap_get_values_iertr = OMapManager::base_iertr;
318 using _omap_get_values_ret = _omap_get_values_iertr::future<omap_values_t>;
319 _omap_get_values_ret _omap_get_values(
320 Transaction &t,
321 omap_root_t &&root,
322 const omap_keys_t &keys) const;
323
324 friend class SeaStoreOmapIterator;
325
326 using omap_list_bare_ret = OMapManager::omap_list_bare_ret;
327 using omap_list_ret = OMapManager::omap_list_ret;
328 omap_list_ret omap_list(
329 Onode &onode,
330 const omap_root_le_t& omap_root,
331 Transaction& t,
332 const std::optional<std::string>& start,
333 OMapManager::omap_list_config_t config) const;
334
335 using tm_iertr = TransactionManager::base_iertr;
336 using tm_ret = tm_iertr::future<>;
337 tm_ret _do_transaction_step(
338 internal_context_t &ctx,
339 CollectionRef &col,
340 std::vector<OnodeRef> &onodes,
341 std::vector<OnodeRef> &d_onodes,
342 ceph::os::Transaction::iterator &i);
343
344 tm_ret _remove(
345 internal_context_t &ctx,
346 OnodeRef &onode);
347 tm_ret _touch(
348 internal_context_t &ctx,
349 OnodeRef &onode);
350 tm_ret _write(
351 internal_context_t &ctx,
352 OnodeRef &onode,
353 uint64_t offset, size_t len,
354 ceph::bufferlist &&bl,
355 uint32_t fadvise_flags);
aee94f69
TL
356 tm_ret _clone(
357 internal_context_t &ctx,
358 OnodeRef &onode,
359 OnodeRef &d_onode);
1e59de90
TL
360 tm_ret _zero(
361 internal_context_t &ctx,
362 OnodeRef &onode,
363 objaddr_t offset, extent_len_t len);
364 tm_ret _omap_set_values(
365 internal_context_t &ctx,
366 OnodeRef &onode,
367 std::map<std::string, ceph::bufferlist> &&aset);
368 tm_ret _omap_set_header(
369 internal_context_t &ctx,
370 OnodeRef &onode,
371 ceph::bufferlist &&header);
372 tm_ret _omap_clear(
373 internal_context_t &ctx,
374 OnodeRef &onode);
375 tm_ret _omap_rmkeys(
376 internal_context_t &ctx,
377 OnodeRef &onode,
378 omap_keys_t &&aset);
379 tm_ret _omap_rmkeyrange(
380 internal_context_t &ctx,
381 OnodeRef &onode,
382 std::string first,
383 std::string last);
384 tm_ret _truncate(
385 internal_context_t &ctx,
386 OnodeRef &onode, uint64_t size);
387 tm_ret _setattrs(
388 internal_context_t &ctx,
389 OnodeRef &onode,
390 std::map<std::string,bufferlist>&& aset);
391 tm_ret _rmattr(
392 internal_context_t &ctx,
393 OnodeRef &onode,
394 std::string name);
395 tm_ret _rmattrs(
396 internal_context_t &ctx,
397 OnodeRef &onode);
398 tm_ret _xattr_rmattr(
399 internal_context_t &ctx,
400 OnodeRef &onode,
401 std::string &&name);
402 tm_ret _xattr_clear(
403 internal_context_t &ctx,
404 OnodeRef &onode);
405 tm_ret _create_collection(
406 internal_context_t &ctx,
407 const coll_t& cid, int bits);
408 tm_ret _remove_collection(
409 internal_context_t &ctx,
410 const coll_t& cid);
411 using omap_set_kvs_ret = tm_iertr::future<>;
412 omap_set_kvs_ret _omap_set_kvs(
413 OnodeRef &onode,
414 const omap_root_le_t& omap_root,
415 Transaction& t,
416 omap_root_le_t& mutable_omap_root,
417 std::map<std::string, ceph::bufferlist>&& kvs);
418
419 boost::intrusive_ptr<SeastoreCollection> _get_collection(const coll_t& cid);
420
421 static constexpr auto LAT_MAX = static_cast<std::size_t>(op_type_t::MAX);
422
423 struct {
424 std::array<seastar::metrics::histogram, LAT_MAX> op_lat;
425 } stats;
426
427 seastar::metrics::histogram& get_latency(
428 op_type_t op_type) {
429 assert(static_cast<std::size_t>(op_type) < stats.op_lat.size());
430 return stats.op_lat[static_cast<std::size_t>(op_type)];
431 }
432
433 void add_latency_sample(op_type_t op_type,
434 std::chrono::steady_clock::duration dur) {
435 seastar::metrics::histogram& lat = get_latency(op_type);
436 lat.sample_count++;
437 lat.sample_sum += std::chrono::duration_cast<std::chrono::milliseconds>(dur).count();
438 }
439
440 private:
441 std::string root;
442 Device* device;
443 const uint32_t max_object_size;
444 bool is_test;
445
446 std::vector<Device*> secondaries;
447 TransactionManagerRef transaction_manager;
448 CollectionManagerRef collection_manager;
449 OnodeManagerRef onode_manager;
450
451 common::Throttle throttler;
452
453 seastar::metrics::metric_group metrics;
454 void register_metrics();
455 };
456
457public:
458 SeaStore(
459 const std::string& root,
460 MDStoreRef mdstore);
461 ~SeaStore();
462
463 seastar::future<> start() final;
464 seastar::future<> stop() final;
465
466 mount_ertr::future<> mount() final;
467 seastar::future<> umount() final;
468
469 mkfs_ertr::future<> mkfs(uuid_d new_osd_fsid) final;
470 seastar::future<store_statfs_t> stat() const final;
471
472 uuid_d get_fsid() const final {
473 ceph_assert(seastar::this_shard_id() == primary_core);
474 return shard_stores.local().get_fsid();
20effc67
TL
475 }
476
1e59de90
TL
477 seastar::future<> write_meta(
478 const std::string& key,
479 const std::string& value) final {
480 ceph_assert(seastar::this_shard_id() == primary_core);
481 return shard_stores.local().write_meta(
482 key, value).then([this, key, value] {
483 return mdstore->write_meta(key, value);
484 }).handle_error(
485 crimson::ct_error::assert_all{"Invalid error in SeaStore::write_meta"}
486 );
487 }
20effc67 488
1e59de90
TL
489 seastar::future<std::tuple<int, std::string>> read_meta(const std::string& key) final;
490
491 seastar::future<std::vector<coll_core_t>> list_collections() final;
492
493 FuturizedStore::Shard& get_sharded_store() final {
494 return shard_stores.local();
20effc67
TL
495 }
496
1e59de90
TL
497 static col_obj_ranges_t
498 get_objs_range(CollectionRef ch, unsigned bits);
499
500// for test
501public:
502 mount_ertr::future<> test_mount();
503 mkfs_ertr::future<> test_mkfs(uuid_d new_osd_fsid);
504
505 DeviceRef get_primary_device_ref() {
506 return std::move(device);
20effc67 507 }
1e59de90
TL
508
509 seastar::future<> test_start(DeviceRef dev);
510
511private:
20effc67 512 seastar::future<> write_fsid(uuid_d new_osd_fsid);
1e59de90
TL
513
514 seastar::future<> prepare_meta(uuid_d new_osd_fsid);
515
516 seastar::future<> set_secondaries();
517
518private:
519 std::string root;
520 MDStoreRef mdstore;
521 DeviceRef device;
522 std::vector<DeviceRef> secondaries;
523 seastar::sharded<SeaStore::Shard> shard_stores;
f67539c2
TL
524};
525
1e59de90
TL
526std::unique_ptr<SeaStore> make_seastore(
527 const std::string &device);
528
529std::unique_ptr<SeaStore> make_test_seastore(
530 SeaStore::MDStoreRef mdstore);
f67539c2 531}