]>
Commit | Line | Data |
---|---|---|
f67539c2 TL |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | ||
4 | #pragma once | |
5 | ||
6 | #include <string> | |
7 | #include <unordered_map> | |
8 | #include <map> | |
9 | #include <typeinfo> | |
10 | #include <vector> | |
11 | ||
12 | #include <optional> | |
13 | #include <seastar/core/future.hh> | |
1e59de90 | 14 | #include <seastar/core/metrics_types.hh> |
f67539c2 | 15 | |
f67539c2 TL |
16 | #include "include/uuid.h" |
17 | ||
18 | #include "os/Transaction.h" | |
1e59de90 | 19 | #include "crimson/common/throttle.h" |
20effc67 | 20 | #include "crimson/os/futurized_collection.h" |
f67539c2 | 21 | #include "crimson/os/futurized_store.h" |
20effc67 | 22 | |
1e59de90 | 23 | #include "crimson/os/seastore/device.h" |
20effc67 TL |
24 | #include "crimson/os/seastore/transaction.h" |
25 | #include "crimson/os/seastore/onode_manager.h" | |
26 | #include "crimson/os/seastore/omap_manager.h" | |
27 | #include "crimson/os/seastore/collection_manager.h" | |
1e59de90 | 28 | #include "crimson/os/seastore/object_data_handler.h" |
f67539c2 TL |
29 | |
30 | namespace crimson::os::seastore { | |
31 | ||
f67539c2 TL |
32 | class Onode; |
33 | using OnodeRef = boost::intrusive_ptr<Onode>; | |
f67539c2 | 34 | class TransactionManager; |
f67539c2 | 35 | |
1e59de90 TL |
/**
 * op_type_t
 *
 * Tags the kind of store operation whose latency is being sampled.
 * The values are dense and start at zero because they are used as
 * indexes into the per-operation latency histogram array.
 */
enum class op_type_t : uint8_t {
  TRANSACTION = 0,
  READ,
  WRITE,
  GET_ATTR,
  GET_ATTRS,
  STAT,
  OMAP_GET_VALUES,
  OMAP_LIST,
  MAX  // not an operation: number of real enumerators, must stay last
};
47 | ||
20effc67 TL |
48 | class SeastoreCollection final : public FuturizedCollection { |
49 | public: | |
50 | template <typename... T> | |
51 | SeastoreCollection(T&&... args) : | |
52 | FuturizedCollection(std::forward<T>(args)...) {} | |
53 | ||
54 | seastar::shared_mutex ordering_lock; | |
55 | }; | |
f67539c2 | 56 | |
1e59de90 TL |
57 | /** |
58 | * col_obj_ranges_t | |
59 | * | |
60 | * Represents the two ghobject_t ranges spanned by a PG collection. | |
61 | * Temp objects will be within [temp_begin, temp_end) and normal objects | |
62 | * will be in [obj_begin, obj_end). | |
63 | */ | |
64 | struct col_obj_ranges_t { | |
65 | ghobject_t temp_begin; | |
66 | ghobject_t temp_end; | |
67 | ghobject_t obj_begin; | |
68 | ghobject_t obj_end; | |
69 | }; | |
70 | ||
20effc67 | 71 | class SeaStore final : public FuturizedStore { |
f67539c2 | 72 | public: |
20effc67 TL |
73 | class MDStore { |
74 | public: | |
75 | using base_iertr = crimson::errorator< | |
76 | crimson::ct_error::input_output_error | |
77 | >; | |
78 | ||
79 | using write_meta_ertr = base_iertr; | |
80 | using write_meta_ret = write_meta_ertr::future<>; | |
81 | virtual write_meta_ret write_meta( | |
82 | const std::string &key, | |
83 | const std::string &val | |
84 | ) = 0; | |
85 | ||
86 | using read_meta_ertr = base_iertr; | |
87 | using read_meta_ret = write_meta_ertr::future<std::optional<std::string>>; | |
88 | virtual read_meta_ret read_meta(const std::string &key) = 0; | |
f67539c2 | 89 | |
20effc67 TL |
90 | virtual ~MDStore() {} |
91 | }; | |
92 | using MDStoreRef = std::unique_ptr<MDStore>; | |
f67539c2 | 93 | |
1e59de90 TL |
94 | class Shard : public FuturizedStore::Shard { |
95 | public: | |
96 | Shard( | |
97 | std::string root, | |
98 | Device* device, | |
99 | bool is_test); | |
100 | ~Shard() = default; | |
101 | ||
102 | seastar::future<struct stat> stat( | |
103 | CollectionRef c, | |
104 | const ghobject_t& oid) final; | |
105 | ||
106 | read_errorator::future<ceph::bufferlist> read( | |
107 | CollectionRef c, | |
108 | const ghobject_t& oid, | |
109 | uint64_t offset, | |
110 | size_t len, | |
111 | uint32_t op_flags = 0) final; | |
112 | ||
113 | read_errorator::future<ceph::bufferlist> readv( | |
114 | CollectionRef c, | |
115 | const ghobject_t& oid, | |
116 | interval_set<uint64_t>& m, | |
117 | uint32_t op_flags = 0) final; | |
118 | ||
119 | get_attr_errorator::future<ceph::bufferlist> get_attr( | |
120 | CollectionRef c, | |
121 | const ghobject_t& oid, | |
122 | std::string_view name) const final; | |
123 | ||
124 | get_attrs_ertr::future<attrs_t> get_attrs( | |
125 | CollectionRef c, | |
126 | const ghobject_t& oid) final; | |
127 | ||
128 | read_errorator::future<omap_values_t> omap_get_values( | |
129 | CollectionRef c, | |
130 | const ghobject_t& oid, | |
131 | const omap_keys_t& keys) final; | |
132 | ||
133 | /// Retrieves paged set of values > start (if present) | |
134 | using omap_get_values_ret_bare_t = std::tuple<bool, omap_values_t>; | |
135 | using omap_get_values_ret_t = read_errorator::future< | |
136 | omap_get_values_ret_bare_t>; | |
137 | omap_get_values_ret_t omap_get_values( | |
138 | CollectionRef c, ///< [in] collection | |
139 | const ghobject_t &oid, ///< [in] oid | |
140 | const std::optional<std::string> &start ///< [in] start, empty for begin | |
141 | ) final; ///< @return <done, values> values.empty() iff done | |
142 | ||
143 | get_attr_errorator::future<bufferlist> omap_get_header( | |
144 | CollectionRef c, | |
145 | const ghobject_t& oid) final; | |
146 | ||
147 | seastar::future<std::tuple<std::vector<ghobject_t>, ghobject_t>> list_objects( | |
148 | CollectionRef c, | |
149 | const ghobject_t& start, | |
150 | const ghobject_t& end, | |
151 | uint64_t limit) const final; | |
152 | ||
153 | seastar::future<CollectionRef> create_new_collection(const coll_t& cid) final; | |
154 | seastar::future<CollectionRef> open_collection(const coll_t& cid) final; | |
155 | ||
156 | seastar::future<> do_transaction_no_callbacks( | |
157 | CollectionRef ch, | |
158 | ceph::os::Transaction&& txn) final; | |
f67539c2 | 159 | |
1e59de90 TL |
160 | /* Note, flush() machinery must go through the same pipeline |
161 | * stages and locks as do_transaction. */ | |
162 | seastar::future<> flush(CollectionRef ch) final; | |
f67539c2 | 163 | |
1e59de90 TL |
164 | read_errorator::future<std::map<uint64_t, uint64_t>> fiemap( |
165 | CollectionRef ch, | |
166 | const ghobject_t& oid, | |
167 | uint64_t off, | |
168 | uint64_t len) final; | |
f67539c2 | 169 | |
1e59de90 TL |
170 | unsigned get_max_attr_name_length() const final { |
171 | return 256; | |
172 | } | |
f67539c2 | 173 | |
1e59de90 TL |
174 | // only exposed to SeaStore |
175 | public: | |
176 | seastar::future<> umount(); | |
177 | // init managers and mount transaction_manager | |
178 | seastar::future<> mount_managers(); | |
20effc67 | 179 | |
1e59de90 TL |
180 | void set_secondaries(Device& sec_dev) { |
181 | secondaries.emplace_back(&sec_dev); | |
182 | } | |
20effc67 | 183 | |
1e59de90 TL |
184 | using coll_core_t = FuturizedStore::coll_core_t; |
185 | seastar::future<std::vector<coll_core_t>> list_collections(); | |
20effc67 | 186 | |
1e59de90 TL |
187 | seastar::future<> write_meta(const std::string& key, |
188 | const std::string& value); | |
20effc67 | 189 | |
1e59de90 TL |
190 | store_statfs_t stat() const; |
191 | ||
192 | uuid_d get_fsid() const; | |
193 | ||
194 | seastar::future<> mkfs_managers(); | |
195 | ||
196 | void init_managers(); | |
197 | ||
198 | private: | |
199 | struct internal_context_t { | |
200 | CollectionRef ch; | |
201 | ceph::os::Transaction ext_transaction; | |
20effc67 | 202 | |
20effc67 | 203 | internal_context_t( |
1e59de90 TL |
204 | CollectionRef ch, |
205 | ceph::os::Transaction &&_ext_transaction, | |
206 | TransactionRef &&transaction) | |
207 | : ch(ch), ext_transaction(std::move(_ext_transaction)), | |
208 | transaction(std::move(transaction)), | |
209 | iter(ext_transaction.begin()) {} | |
210 | ||
211 | TransactionRef transaction; | |
212 | ||
213 | ceph::os::Transaction::iterator iter; | |
214 | std::chrono::steady_clock::time_point begin_timestamp = std::chrono::steady_clock::now(); | |
215 | ||
216 | void reset_preserve_handle(TransactionManager &tm) { | |
217 | tm.reset_transaction_preserve_handle(*transaction); | |
218 | iter = ext_transaction.begin(); | |
219 | } | |
220 | }; | |
221 | ||
222 | TransactionManager::read_extent_iertr::future<std::optional<unsigned>> | |
223 | get_coll_bits(CollectionRef ch, Transaction &t) const; | |
224 | ||
225 | static void on_error(ceph::os::Transaction &t); | |
226 | ||
227 | template <typename F> | |
228 | auto repeat_with_internal_context( | |
229 | CollectionRef ch, | |
230 | ceph::os::Transaction &&t, | |
231 | Transaction::src_t src, | |
232 | const char* tname, | |
233 | op_type_t op_type, | |
234 | F &&f) { | |
235 | return seastar::do_with( | |
236 | internal_context_t( | |
237 | ch, std::move(t), | |
238 | transaction_manager->create_transaction(src, tname)), | |
239 | std::forward<F>(f), | |
240 | [this, op_type](auto &ctx, auto &f) { | |
20effc67 TL |
241 | return ctx.transaction->get_handle().take_collection_lock( |
242 | static_cast<SeastoreCollection&>(*(ctx.ch)).ordering_lock | |
1e59de90 TL |
243 | ).then([this] { |
244 | return throttler.get(1); | |
245 | }).then([&, this] { | |
20effc67 TL |
246 | return repeat_eagain([&, this] { |
247 | ctx.reset_preserve_handle(*transaction_manager); | |
248 | return std::invoke(f, ctx); | |
249 | }).handle_error( | |
250 | crimson::ct_error::eagain::pass_further{}, | |
251 | crimson::ct_error::all_same_way([&ctx](auto e) { | |
252 | on_error(ctx.ext_transaction); | |
253 | }) | |
254 | ); | |
255 | }).then([this, op_type, &ctx] { | |
256 | add_latency_sample(op_type, | |
257 | std::chrono::steady_clock::now() - ctx.begin_timestamp); | |
1e59de90 TL |
258 | }).finally([this] { |
259 | throttler.put(); | |
20effc67 | 260 | }); |
1e59de90 TL |
261 | }); |
262 | } | |
20effc67 | 263 | |
1e59de90 TL |
264 | template <typename Ret, typename F> |
265 | auto repeat_with_onode( | |
266 | CollectionRef ch, | |
267 | const ghobject_t &oid, | |
268 | Transaction::src_t src, | |
269 | const char* tname, | |
270 | op_type_t op_type, | |
271 | F &&f) const { | |
272 | auto begin_time = std::chrono::steady_clock::now(); | |
273 | return seastar::do_with( | |
274 | oid, Ret{}, std::forward<F>(f), | |
275 | [this, src, op_type, begin_time, tname | |
276 | ](auto &oid, auto &ret, auto &f) | |
277 | { | |
278 | return repeat_eagain([&, this, src, tname] { | |
279 | return transaction_manager->with_transaction_intr( | |
280 | src, | |
281 | tname, | |
282 | [&, this](auto& t) | |
283 | { | |
284 | return onode_manager->get_onode(t, oid | |
285 | ).si_then([&](auto onode) { | |
286 | return seastar::do_with(std::move(onode), [&](auto& onode) { | |
287 | return f(t, *onode); | |
288 | }); | |
289 | }).si_then([&ret](auto _ret) { | |
290 | ret = _ret; | |
20effc67 | 291 | }); |
20effc67 | 292 | }); |
1e59de90 TL |
293 | }).safe_then([&ret, op_type, begin_time, this] { |
294 | const_cast<Shard*>(this)->add_latency_sample(op_type, | |
295 | std::chrono::steady_clock::now() - begin_time); | |
296 | return seastar::make_ready_future<Ret>(ret); | |
20effc67 | 297 | }); |
20effc67 | 298 | }); |
1e59de90 TL |
299 | } |
300 | ||
301 | using _fiemap_ret = ObjectDataHandler::fiemap_ret; | |
302 | _fiemap_ret _fiemap( | |
303 | Transaction &t, | |
304 | Onode &onode, | |
305 | uint64_t off, | |
306 | uint64_t len) const; | |
307 | ||
308 | using _omap_get_value_iertr = OMapManager::base_iertr::extend< | |
309 | crimson::ct_error::enodata | |
310 | >; | |
311 | using _omap_get_value_ret = _omap_get_value_iertr::future<ceph::bufferlist>; | |
312 | _omap_get_value_ret _omap_get_value( | |
313 | Transaction &t, | |
314 | omap_root_t &&root, | |
315 | std::string_view key) const; | |
316 | ||
317 | using _omap_get_values_iertr = OMapManager::base_iertr; | |
318 | using _omap_get_values_ret = _omap_get_values_iertr::future<omap_values_t>; | |
319 | _omap_get_values_ret _omap_get_values( | |
320 | Transaction &t, | |
321 | omap_root_t &&root, | |
322 | const omap_keys_t &keys) const; | |
323 | ||
324 | friend class SeaStoreOmapIterator; | |
325 | ||
326 | using omap_list_bare_ret = OMapManager::omap_list_bare_ret; | |
327 | using omap_list_ret = OMapManager::omap_list_ret; | |
328 | omap_list_ret omap_list( | |
329 | Onode &onode, | |
330 | const omap_root_le_t& omap_root, | |
331 | Transaction& t, | |
332 | const std::optional<std::string>& start, | |
333 | OMapManager::omap_list_config_t config) const; | |
334 | ||
335 | using tm_iertr = TransactionManager::base_iertr; | |
336 | using tm_ret = tm_iertr::future<>; | |
337 | tm_ret _do_transaction_step( | |
338 | internal_context_t &ctx, | |
339 | CollectionRef &col, | |
340 | std::vector<OnodeRef> &onodes, | |
341 | std::vector<OnodeRef> &d_onodes, | |
342 | ceph::os::Transaction::iterator &i); | |
343 | ||
344 | tm_ret _remove( | |
345 | internal_context_t &ctx, | |
346 | OnodeRef &onode); | |
347 | tm_ret _touch( | |
348 | internal_context_t &ctx, | |
349 | OnodeRef &onode); | |
350 | tm_ret _write( | |
351 | internal_context_t &ctx, | |
352 | OnodeRef &onode, | |
353 | uint64_t offset, size_t len, | |
354 | ceph::bufferlist &&bl, | |
355 | uint32_t fadvise_flags); | |
aee94f69 TL |
356 | tm_ret _clone( |
357 | internal_context_t &ctx, | |
358 | OnodeRef &onode, | |
359 | OnodeRef &d_onode); | |
1e59de90 TL |
360 | tm_ret _zero( |
361 | internal_context_t &ctx, | |
362 | OnodeRef &onode, | |
363 | objaddr_t offset, extent_len_t len); | |
364 | tm_ret _omap_set_values( | |
365 | internal_context_t &ctx, | |
366 | OnodeRef &onode, | |
367 | std::map<std::string, ceph::bufferlist> &&aset); | |
368 | tm_ret _omap_set_header( | |
369 | internal_context_t &ctx, | |
370 | OnodeRef &onode, | |
371 | ceph::bufferlist &&header); | |
372 | tm_ret _omap_clear( | |
373 | internal_context_t &ctx, | |
374 | OnodeRef &onode); | |
375 | tm_ret _omap_rmkeys( | |
376 | internal_context_t &ctx, | |
377 | OnodeRef &onode, | |
378 | omap_keys_t &&aset); | |
379 | tm_ret _omap_rmkeyrange( | |
380 | internal_context_t &ctx, | |
381 | OnodeRef &onode, | |
382 | std::string first, | |
383 | std::string last); | |
384 | tm_ret _truncate( | |
385 | internal_context_t &ctx, | |
386 | OnodeRef &onode, uint64_t size); | |
387 | tm_ret _setattrs( | |
388 | internal_context_t &ctx, | |
389 | OnodeRef &onode, | |
390 | std::map<std::string,bufferlist>&& aset); | |
391 | tm_ret _rmattr( | |
392 | internal_context_t &ctx, | |
393 | OnodeRef &onode, | |
394 | std::string name); | |
395 | tm_ret _rmattrs( | |
396 | internal_context_t &ctx, | |
397 | OnodeRef &onode); | |
398 | tm_ret _xattr_rmattr( | |
399 | internal_context_t &ctx, | |
400 | OnodeRef &onode, | |
401 | std::string &&name); | |
402 | tm_ret _xattr_clear( | |
403 | internal_context_t &ctx, | |
404 | OnodeRef &onode); | |
405 | tm_ret _create_collection( | |
406 | internal_context_t &ctx, | |
407 | const coll_t& cid, int bits); | |
408 | tm_ret _remove_collection( | |
409 | internal_context_t &ctx, | |
410 | const coll_t& cid); | |
411 | using omap_set_kvs_ret = tm_iertr::future<>; | |
412 | omap_set_kvs_ret _omap_set_kvs( | |
413 | OnodeRef &onode, | |
414 | const omap_root_le_t& omap_root, | |
415 | Transaction& t, | |
416 | omap_root_le_t& mutable_omap_root, | |
417 | std::map<std::string, ceph::bufferlist>&& kvs); | |
418 | ||
419 | boost::intrusive_ptr<SeastoreCollection> _get_collection(const coll_t& cid); | |
420 | ||
421 | static constexpr auto LAT_MAX = static_cast<std::size_t>(op_type_t::MAX); | |
422 | ||
423 | struct { | |
424 | std::array<seastar::metrics::histogram, LAT_MAX> op_lat; | |
425 | } stats; | |
426 | ||
427 | seastar::metrics::histogram& get_latency( | |
428 | op_type_t op_type) { | |
429 | assert(static_cast<std::size_t>(op_type) < stats.op_lat.size()); | |
430 | return stats.op_lat[static_cast<std::size_t>(op_type)]; | |
431 | } | |
432 | ||
433 | void add_latency_sample(op_type_t op_type, | |
434 | std::chrono::steady_clock::duration dur) { | |
435 | seastar::metrics::histogram& lat = get_latency(op_type); | |
436 | lat.sample_count++; | |
437 | lat.sample_sum += std::chrono::duration_cast<std::chrono::milliseconds>(dur).count(); | |
438 | } | |
439 | ||
440 | private: | |
441 | std::string root; | |
442 | Device* device; | |
443 | const uint32_t max_object_size; | |
444 | bool is_test; | |
445 | ||
446 | std::vector<Device*> secondaries; | |
447 | TransactionManagerRef transaction_manager; | |
448 | CollectionManagerRef collection_manager; | |
449 | OnodeManagerRef onode_manager; | |
450 | ||
451 | common::Throttle throttler; | |
452 | ||
453 | seastar::metrics::metric_group metrics; | |
454 | void register_metrics(); | |
455 | }; | |
456 | ||
457 | public: | |
458 | SeaStore( | |
459 | const std::string& root, | |
460 | MDStoreRef mdstore); | |
461 | ~SeaStore(); | |
462 | ||
463 | seastar::future<> start() final; | |
464 | seastar::future<> stop() final; | |
465 | ||
466 | mount_ertr::future<> mount() final; | |
467 | seastar::future<> umount() final; | |
468 | ||
469 | mkfs_ertr::future<> mkfs(uuid_d new_osd_fsid) final; | |
470 | seastar::future<store_statfs_t> stat() const final; | |
471 | ||
472 | uuid_d get_fsid() const final { | |
473 | ceph_assert(seastar::this_shard_id() == primary_core); | |
474 | return shard_stores.local().get_fsid(); | |
20effc67 TL |
475 | } |
476 | ||
1e59de90 TL |
477 | seastar::future<> write_meta( |
478 | const std::string& key, | |
479 | const std::string& value) final { | |
480 | ceph_assert(seastar::this_shard_id() == primary_core); | |
481 | return shard_stores.local().write_meta( | |
482 | key, value).then([this, key, value] { | |
483 | return mdstore->write_meta(key, value); | |
484 | }).handle_error( | |
485 | crimson::ct_error::assert_all{"Invalid error in SeaStore::write_meta"} | |
486 | ); | |
487 | } | |
20effc67 | 488 | |
1e59de90 TL |
489 | seastar::future<std::tuple<int, std::string>> read_meta(const std::string& key) final; |
490 | ||
491 | seastar::future<std::vector<coll_core_t>> list_collections() final; | |
492 | ||
493 | FuturizedStore::Shard& get_sharded_store() final { | |
494 | return shard_stores.local(); | |
20effc67 TL |
495 | } |
496 | ||
1e59de90 TL |
497 | static col_obj_ranges_t |
498 | get_objs_range(CollectionRef ch, unsigned bits); | |
499 | ||
500 | // for test | |
501 | public: | |
502 | mount_ertr::future<> test_mount(); | |
503 | mkfs_ertr::future<> test_mkfs(uuid_d new_osd_fsid); | |
504 | ||
505 | DeviceRef get_primary_device_ref() { | |
506 | return std::move(device); | |
20effc67 | 507 | } |
1e59de90 TL |
508 | |
509 | seastar::future<> test_start(DeviceRef dev); | |
510 | ||
511 | private: | |
20effc67 | 512 | seastar::future<> write_fsid(uuid_d new_osd_fsid); |
1e59de90 TL |
513 | |
514 | seastar::future<> prepare_meta(uuid_d new_osd_fsid); | |
515 | ||
516 | seastar::future<> set_secondaries(); | |
517 | ||
518 | private: | |
519 | std::string root; | |
520 | MDStoreRef mdstore; | |
521 | DeviceRef device; | |
522 | std::vector<DeviceRef> secondaries; | |
523 | seastar::sharded<SeaStore::Shard> shard_stores; | |
f67539c2 TL |
524 | }; |
525 | ||
1e59de90 TL |
526 | std::unique_ptr<SeaStore> make_seastore( |
527 | const std::string &device); | |
528 | ||
529 | std::unique_ptr<SeaStore> make_test_seastore( | |
530 | SeaStore::MDStoreRef mdstore); | |
f67539c2 | 531 | } |