]>
Commit | Line | Data |
---|---|---|
f67539c2 TL |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | ||
4 | #pragma once | |
5 | ||
6 | #include <string> | |
7 | #include <unordered_map> | |
8 | #include <map> | |
9 | #include <typeinfo> | |
10 | #include <vector> | |
11 | ||
12 | #include <optional> | |
13 | #include <seastar/core/future.hh> | |
14 | ||
f67539c2 TL |
15 | #include "include/uuid.h" |
16 | ||
17 | #include "os/Transaction.h" | |
20effc67 | 18 | #include "crimson/os/futurized_collection.h" |
f67539c2 | 19 | #include "crimson/os/futurized_store.h" |
20effc67 TL |
20 | |
21 | #include "crimson/os/seastore/transaction.h" | |
22 | #include "crimson/os/seastore/onode_manager.h" | |
23 | #include "crimson/os/seastore/omap_manager.h" | |
24 | #include "crimson/os/seastore/collection_manager.h" | |
f67539c2 TL |
25 | |
26 | namespace crimson::os::seastore { | |
27 | ||
f67539c2 TL |
28 | class Onode; |
29 | using OnodeRef = boost::intrusive_ptr<Onode>; | |
f67539c2 | 30 | class TransactionManager; |
f67539c2 | 31 | |
20effc67 TL |
32 | class SeastoreCollection final : public FuturizedCollection { |
33 | public: | |
34 | template <typename... T> | |
35 | SeastoreCollection(T&&... args) : | |
36 | FuturizedCollection(std::forward<T>(args)...) {} | |
37 | ||
38 | seastar::shared_mutex ordering_lock; | |
39 | }; | |
f67539c2 | 40 | |
20effc67 | 41 | class SeaStore final : public FuturizedStore { |
f67539c2 | 42 | public: |
20effc67 TL |
43 | class MDStore { |
44 | public: | |
45 | using base_iertr = crimson::errorator< | |
46 | crimson::ct_error::input_output_error | |
47 | >; | |
48 | ||
49 | using write_meta_ertr = base_iertr; | |
50 | using write_meta_ret = write_meta_ertr::future<>; | |
51 | virtual write_meta_ret write_meta( | |
52 | const std::string &key, | |
53 | const std::string &val | |
54 | ) = 0; | |
55 | ||
56 | using read_meta_ertr = base_iertr; | |
57 | using read_meta_ret = write_meta_ertr::future<std::optional<std::string>>; | |
58 | virtual read_meta_ret read_meta(const std::string &key) = 0; | |
f67539c2 | 59 | |
20effc67 TL |
60 | virtual ~MDStore() {} |
61 | }; | |
62 | using MDStoreRef = std::unique_ptr<MDStore>; | |
f67539c2 | 63 | |
20effc67 TL |
64 | SeaStore( |
65 | const std::string& root, | |
66 | MDStoreRef mdstore, | |
67 | SegmentManagerRef sm, | |
68 | TransactionManagerRef tm, | |
69 | CollectionManagerRef cm, | |
70 | OnodeManagerRef om); | |
71 | SeaStore( | |
72 | const std::string& root, | |
73 | SegmentManagerRef sm, | |
74 | TransactionManagerRef tm, | |
75 | CollectionManagerRef cm, | |
76 | OnodeManagerRef om); | |
77 | ~SeaStore(); | |
78 | ||
f67539c2 | 79 | seastar::future<> stop() final; |
20effc67 | 80 | mount_ertr::future<> mount() final; |
f67539c2 TL |
81 | seastar::future<> umount() final; |
82 | ||
20effc67 | 83 | mkfs_ertr::future<> mkfs(uuid_d new_osd_fsid) final; |
f67539c2 TL |
84 | seastar::future<store_statfs_t> stat() const final; |
85 | ||
86 | read_errorator::future<ceph::bufferlist> read( | |
87 | CollectionRef c, | |
88 | const ghobject_t& oid, | |
89 | uint64_t offset, | |
90 | size_t len, | |
91 | uint32_t op_flags = 0) final; | |
92 | read_errorator::future<ceph::bufferlist> readv( | |
93 | CollectionRef c, | |
94 | const ghobject_t& oid, | |
95 | interval_set<uint64_t>& m, | |
96 | uint32_t op_flags = 0) final; | |
20effc67 | 97 | get_attr_errorator::future<ceph::bufferlist> get_attr( |
f67539c2 TL |
98 | CollectionRef c, |
99 | const ghobject_t& oid, | |
100 | std::string_view name) const final; | |
101 | get_attrs_ertr::future<attrs_t> get_attrs( | |
102 | CollectionRef c, | |
103 | const ghobject_t& oid) final; | |
104 | ||
105 | seastar::future<struct stat> stat( | |
106 | CollectionRef c, | |
107 | const ghobject_t& oid) final; | |
108 | ||
109 | read_errorator::future<omap_values_t> omap_get_values( | |
110 | CollectionRef c, | |
111 | const ghobject_t& oid, | |
112 | const omap_keys_t& keys) final; | |
113 | ||
114 | /// Retrieves paged set of values > start (if present) | |
20effc67 TL |
115 | using omap_get_values_ret_bare_t = std::tuple<bool, omap_values_t>; |
116 | using omap_get_values_ret_t = read_errorator::future< | |
117 | omap_get_values_ret_bare_t>; | |
118 | omap_get_values_ret_t omap_get_values( | |
f67539c2 TL |
119 | CollectionRef c, ///< [in] collection |
120 | const ghobject_t &oid, ///< [in] oid | |
121 | const std::optional<std::string> &start ///< [in] start, empty for begin | |
122 | ) final; ///< @return <done, values> values.empty() iff done | |
123 | ||
124 | read_errorator::future<bufferlist> omap_get_header( | |
125 | CollectionRef c, | |
126 | const ghobject_t& oid) final; | |
127 | ||
128 | seastar::future<std::tuple<std::vector<ghobject_t>, ghobject_t>> list_objects( | |
129 | CollectionRef c, | |
130 | const ghobject_t& start, | |
131 | const ghobject_t& end, | |
132 | uint64_t limit) const final; | |
133 | ||
134 | seastar::future<CollectionRef> create_new_collection(const coll_t& cid) final; | |
135 | seastar::future<CollectionRef> open_collection(const coll_t& cid) final; | |
136 | seastar::future<std::vector<coll_t>> list_collections() final; | |
137 | ||
138 | seastar::future<> do_transaction( | |
139 | CollectionRef ch, | |
140 | ceph::os::Transaction&& txn) final; | |
141 | ||
142 | seastar::future<OmapIteratorRef> get_omap_iterator( | |
143 | CollectionRef ch, | |
144 | const ghobject_t& oid) final; | |
145 | seastar::future<std::map<uint64_t, uint64_t>> fiemap( | |
146 | CollectionRef ch, | |
147 | const ghobject_t& oid, | |
148 | uint64_t off, | |
149 | uint64_t len) final; | |
150 | ||
151 | seastar::future<> write_meta(const std::string& key, | |
152 | const std::string& value) final; | |
153 | seastar::future<std::tuple<int, std::string>> read_meta(const std::string& key) final; | |
154 | uuid_d get_fsid() const final; | |
155 | ||
156 | unsigned get_max_attr_name_length() const final { | |
157 | return 256; | |
158 | } | |
20effc67 TL |
/**
 * op_type_t
 *
 * Operation categories for latency accounting. The numeric value of each
 * enumerator is used as an index into the per-operation histogram array
 * (see get_latency()), so the values must stay dense and start at 0.
 */
enum class op_type_t : uint8_t {
  TRANSACTION     = 0,
  READ            = 1,
  WRITE           = 2,
  GET_ATTR        = 3,
  GET_ATTRS       = 4,
  STAT            = 5,
  OMAP_GET_VALUES = 6,
  OMAP_LIST       = 7,
  MAX             = 8   ///< number of categories, not an operation
};
f67539c2 TL |
170 | |
171 | private: | |
20effc67 TL |
172 | struct internal_context_t { |
173 | CollectionRef ch; | |
174 | ceph::os::Transaction ext_transaction; | |
175 | ||
176 | internal_context_t( | |
177 | CollectionRef ch, | |
178 | ceph::os::Transaction &&_ext_transaction, | |
179 | TransactionRef &&transaction) | |
180 | : ch(ch), ext_transaction(std::move(_ext_transaction)), | |
181 | transaction(std::move(transaction)), | |
182 | iter(ext_transaction.begin()) {} | |
183 | ||
184 | TransactionRef transaction; | |
185 | ||
186 | ceph::os::Transaction::iterator iter; | |
187 | std::chrono::steady_clock::time_point begin_timestamp = std::chrono::steady_clock::now(); | |
188 | ||
189 | void reset_preserve_handle(TransactionManager &tm) { | |
190 | tm.reset_transaction_preserve_handle(*transaction); | |
191 | iter = ext_transaction.begin(); | |
192 | } | |
193 | }; | |
194 | ||
195 | static void on_error(ceph::os::Transaction &t); | |
196 | ||
197 | template <typename F> | |
198 | auto repeat_with_internal_context( | |
199 | CollectionRef ch, | |
200 | ceph::os::Transaction &&t, | |
201 | Transaction::src_t src, | |
202 | const char* tname, | |
203 | op_type_t op_type, | |
204 | F &&f) { | |
205 | return seastar::do_with( | |
206 | internal_context_t( | |
207 | ch, std::move(t), | |
208 | transaction_manager->create_transaction(src, tname)), | |
209 | std::forward<F>(f), | |
210 | [this, op_type](auto &ctx, auto &f) { | |
211 | return ctx.transaction->get_handle().take_collection_lock( | |
212 | static_cast<SeastoreCollection&>(*(ctx.ch)).ordering_lock | |
213 | ).then([&, this] { | |
214 | return repeat_eagain([&, this] { | |
215 | ctx.reset_preserve_handle(*transaction_manager); | |
216 | return std::invoke(f, ctx); | |
217 | }).handle_error( | |
218 | crimson::ct_error::eagain::pass_further{}, | |
219 | crimson::ct_error::all_same_way([&ctx](auto e) { | |
220 | on_error(ctx.ext_transaction); | |
221 | }) | |
222 | ); | |
223 | }).then([this, op_type, &ctx] { | |
224 | add_latency_sample(op_type, | |
225 | std::chrono::steady_clock::now() - ctx.begin_timestamp); | |
226 | }); | |
227 | } | |
228 | ); | |
229 | } | |
230 | ||
231 | template <typename Ret, typename F> | |
232 | auto repeat_with_onode( | |
233 | CollectionRef ch, | |
234 | const ghobject_t &oid, | |
235 | Transaction::src_t src, | |
236 | const char* tname, | |
237 | op_type_t op_type, | |
238 | F &&f) const { | |
239 | auto begin_time = std::chrono::steady_clock::now(); | |
240 | return seastar::do_with( | |
241 | oid, Ret{}, std::forward<F>(f), | |
242 | [this, src, op_type, begin_time, tname | |
243 | ](auto &oid, auto &ret, auto &f) | |
244 | { | |
245 | return repeat_eagain([&, this, src, tname] { | |
246 | return transaction_manager->with_transaction_intr( | |
247 | src, | |
248 | tname, | |
249 | [&, this](auto& t) | |
250 | { | |
251 | return onode_manager->get_onode(t, oid | |
252 | ).si_then([&](auto onode) { | |
253 | return seastar::do_with(std::move(onode), [&](auto& onode) { | |
254 | return f(t, *onode); | |
255 | }); | |
256 | }).si_then([&ret](auto _ret) { | |
257 | ret = _ret; | |
258 | }); | |
259 | }); | |
260 | }).safe_then([&ret, op_type, begin_time, this] { | |
261 | const_cast<SeaStore*>(this)->add_latency_sample(op_type, | |
262 | std::chrono::steady_clock::now() - begin_time); | |
263 | return seastar::make_ready_future<Ret>(ret); | |
264 | }); | |
265 | }); | |
266 | } | |
267 | ||
268 | using _omap_get_value_iertr = OMapManager::base_iertr::extend< | |
269 | crimson::ct_error::enodata | |
270 | >; | |
271 | using _omap_get_value_ret = _omap_get_value_iertr::future<ceph::bufferlist>; | |
272 | _omap_get_value_ret _omap_get_value( | |
273 | Transaction &t, | |
274 | omap_root_t &&root, | |
275 | std::string_view key) const; | |
276 | ||
277 | using _omap_get_values_iertr = OMapManager::base_iertr; | |
278 | using _omap_get_values_ret = _omap_get_values_iertr::future<omap_values_t>; | |
279 | _omap_get_values_ret _omap_get_values( | |
280 | Transaction &t, | |
281 | omap_root_t &&root, | |
282 | const omap_keys_t &keys) const; | |
283 | ||
284 | using _omap_list_bare_ret = OMapManager::omap_list_bare_ret; | |
285 | using _omap_list_ret = OMapManager::omap_list_ret; | |
286 | _omap_list_ret _omap_list( | |
287 | Onode &onode, | |
288 | const omap_root_le_t& omap_root, | |
289 | Transaction& t, | |
290 | const std::optional<std::string>& start, | |
291 | OMapManager::omap_list_config_t config) const; | |
292 | ||
293 | friend class SeaStoreOmapIterator; | |
294 | omap_get_values_ret_t omap_list( | |
295 | CollectionRef ch, | |
296 | const ghobject_t &oid, | |
297 | const std::optional<std::string> &_start, | |
298 | OMapManager::omap_list_config_t config); | |
299 | ||
300 | std::string root; | |
301 | MDStoreRef mdstore; | |
302 | SegmentManagerRef segment_manager; | |
303 | std::vector<SegmentManagerRef> secondaries; | |
304 | TransactionManagerRef transaction_manager; | |
305 | CollectionManagerRef collection_manager; | |
306 | OnodeManagerRef onode_manager; | |
307 | const uint32_t max_object_size = 0; | |
308 | ||
309 | using tm_iertr = TransactionManager::base_iertr; | |
310 | using tm_ret = tm_iertr::future<>; | |
311 | tm_ret _do_transaction_step( | |
312 | internal_context_t &ctx, | |
f67539c2 TL |
313 | CollectionRef &col, |
314 | std::vector<OnodeRef> &onodes, | |
315 | ceph::os::Transaction::iterator &i); | |
316 | ||
20effc67 TL |
317 | tm_ret _remove( |
318 | internal_context_t &ctx, | |
f67539c2 | 319 | OnodeRef &onode); |
20effc67 TL |
320 | tm_ret _touch( |
321 | internal_context_t &ctx, | |
f67539c2 | 322 | OnodeRef &onode); |
20effc67 TL |
323 | tm_ret _write( |
324 | internal_context_t &ctx, | |
f67539c2 | 325 | OnodeRef &onode, |
20effc67 TL |
326 | uint64_t offset, size_t len, |
327 | ceph::bufferlist &&bl, | |
f67539c2 | 328 | uint32_t fadvise_flags); |
20effc67 TL |
329 | tm_ret _omap_set_values( |
330 | internal_context_t &ctx, | |
f67539c2 TL |
331 | OnodeRef &onode, |
332 | std::map<std::string, ceph::bufferlist> &&aset); | |
20effc67 TL |
333 | tm_ret _omap_set_header( |
334 | internal_context_t &ctx, | |
f67539c2 | 335 | OnodeRef &onode, |
20effc67 TL |
336 | ceph::bufferlist &&header); |
337 | tm_ret _omap_rmkeys( | |
338 | internal_context_t &ctx, | |
f67539c2 | 339 | OnodeRef &onode, |
20effc67 TL |
340 | omap_keys_t &&aset); |
341 | tm_ret _omap_rmkeyrange( | |
342 | internal_context_t &ctx, | |
f67539c2 | 343 | OnodeRef &onode, |
20effc67 TL |
344 | std::string first, |
345 | std::string last); | |
346 | tm_ret _truncate( | |
347 | internal_context_t &ctx, | |
f67539c2 | 348 | OnodeRef &onode, uint64_t size); |
20effc67 TL |
349 | tm_ret _setattrs( |
350 | internal_context_t &ctx, | |
f67539c2 | 351 | OnodeRef &onode, |
20effc67 TL |
352 | std::map<std::string,bufferlist>&& aset); |
353 | tm_ret _create_collection( | |
354 | internal_context_t &ctx, | |
f67539c2 | 355 | const coll_t& cid, int bits); |
20effc67 TL |
356 | tm_ret _remove_collection( |
357 | internal_context_t &ctx, | |
358 | const coll_t& cid); | |
359 | using omap_set_kvs_ret = tm_iertr::future<>; | |
360 | omap_set_kvs_ret _omap_set_kvs( | |
361 | OnodeRef &onode, | |
362 | const omap_root_le_t& omap_root, | |
363 | Transaction& t, | |
364 | omap_root_le_t& mutable_omap_root, | |
365 | std::map<std::string, ceph::bufferlist>&& kvs); | |
f67539c2 TL |
366 | |
367 | boost::intrusive_ptr<SeastoreCollection> _get_collection(const coll_t& cid); | |
20effc67 TL |
368 | |
369 | static constexpr auto LAT_MAX = static_cast<std::size_t>(op_type_t::MAX); | |
370 | struct { | |
371 | std::array<seastar::metrics::histogram, LAT_MAX> op_lat; | |
372 | } stats; | |
373 | ||
374 | seastar::metrics::histogram& get_latency( | |
375 | op_type_t op_type) { | |
376 | assert(static_cast<std::size_t>(op_type) < stats.op_lat.size()); | |
377 | return stats.op_lat[static_cast<std::size_t>(op_type)]; | |
378 | } | |
379 | ||
380 | void add_latency_sample(op_type_t op_type, | |
381 | std::chrono::steady_clock::duration dur) { | |
382 | seastar::metrics::histogram& lat = get_latency(op_type); | |
383 | lat.sample_count++; | |
384 | lat.sample_sum += std::chrono::duration_cast<std::chrono::milliseconds>(dur).count(); | |
385 | } | |
386 | seastar::metrics::metric_group metrics; | |
387 | void register_metrics(); | |
388 | seastar::future<> write_fsid(uuid_d new_osd_fsid); | |
f67539c2 TL |
389 | }; |
390 | ||
20effc67 TL |
391 | seastar::future<std::unique_ptr<SeaStore>> make_seastore( |
392 | const std::string &device, | |
393 | const ConfigValues &config); | |
f67539c2 | 394 | } |