// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2013 Inktank Storage, Inc.
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */

#ifndef REPBACKEND_H
#define REPBACKEND_H

#include "PGBackend.h"

struct C_ReplicatedBackend_OnPullComplete;
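/**
 * PGBackend implementation for replicated pools: client writes are applied
 * locally and forwarded to every replica as sub-ops, and recovery moves
 * object data between peers with push/pull operations.
 */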
class ReplicatedBackend : public PGBackend {
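  // Recovery handle for replicated pools: push and pull ops queued under the
  // handle are batched per peer and sent together by run_recovery_op().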
  struct RPGHandle : public PGBackend::RecoveryHandle {
    map<pg_shard_t, vector<PushOp> > pushes;
    map<pg_shard_t, vector<PullOp> > pulls;
  };
  friend struct C_ReplicatedBackend_OnPullComplete;
public:
  ReplicatedBackend(
    PGBackend::Listener *pg,
    const coll_t &coll,
    ObjectStore::CollectionHandle &ch,
    ObjectStore *store,
    CephContext *cct);

  /// @see PGBackend::open_recovery_op
  RPGHandle *_open_recovery_op() {
    return new RPGHandle();
  }
  PGBackend::RecoveryHandle *open_recovery_op() override {
    return _open_recovery_op();
  }

  /// @see PGBackend::run_recovery_op
  void run_recovery_op(
    PGBackend::RecoveryHandle *h,
    int priority) override;

  /// @see PGBackend::recover_object
  int recover_object(
    const hobject_t &hoid,
    eversion_t v,
    ObjectContextRef head,
    ObjectContextRef obc,
    RecoveryHandle *h
    ) override;

  void check_recovery_sources(const OSDMapRef& osdmap) override;

  bool can_handle_while_inactive(OpRequestRef op) override;

  /// @see PGBackend::handle_message
  bool _handle_message(
    OpRequestRef op
    ) override;

  void on_change() override;
  void clear_recovery_state() override;

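  // A replicated object is recoverable as long as at least one shard still
  // has a copy of it.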
  class RPCRecPred : public IsPGRecoverablePredicate {
  public:
    bool operator()(const set<pg_shard_t> &have) const override {
      return !have.empty();
    }
  };
  IsPGRecoverablePredicate *get_is_recoverable_predicate() const override {
    return new RPCRecPred;
  }

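  // Readable means this OSD's own shard has the object; replicated reads are
  // served from the local store rather than proxied to peers.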
  class RPCReadPred : public IsPGReadablePredicate {
    pg_shard_t whoami;
  public:
    explicit RPCReadPred(pg_shard_t whoami) : whoami(whoami) {}
    bool operator()(const set<pg_shard_t> &have) const override {
      return have.count(whoami);
    }
  };
  IsPGReadablePredicate *get_is_readable_predicate() const override {
    return new RPCReadPred(get_parent()->whoami_shard());
  }

  void dump_recovery_info(Formatter *f) const override {
    {
      f->open_array_section("pull_from_peer");
      for (map<pg_shard_t, set<hobject_t> >::const_iterator i = pull_from_peer.begin();
           i != pull_from_peer.end();
           ++i) {
        f->open_object_section("pulling_from");
        f->dump_stream("pull_from") << i->first;
        {
          f->open_array_section("pulls");
          for (set<hobject_t>::const_iterator j = i->second.begin();
               j != i->second.end();
               ++j) {
            f->open_object_section("pull_info");
            ceph_assert(pulling.count(*j));
            pulling.find(*j)->second.dump(f);
            f->close_section();
          }
          f->close_section();
        }
        f->close_section();
      }
      f->close_section();
    }
    {
      f->open_array_section("pushing");
      for (map<hobject_t, map<pg_shard_t, PushInfo>>::const_iterator i =
             pushing.begin();
           i != pushing.end();
           ++i) {
        f->open_object_section("object");
        f->dump_stream("pushing") << i->first;
        {
          f->open_array_section("pushing_to");
          for (map<pg_shard_t, PushInfo>::const_iterator j = i->second.begin();
               j != i->second.end();
               ++j) {
            f->open_object_section("push_progress");
            f->dump_stream("pushing_to") << j->first;
            {
              f->open_object_section("push_info");
              j->second.dump(f);
              f->close_section();
            }
            f->close_section();
          }
          f->close_section();
        }
        f->close_section();
      }
      f->close_section();
    }
  }

  int objects_read_sync(
    const hobject_t &hoid,
    uint64_t off,
    uint64_t len,
    uint32_t op_flags,
    bufferlist *bl) override;

  int objects_readv_sync(
    const hobject_t &hoid,
    map<uint64_t, uint64_t>&& m,
    uint32_t op_flags,
    bufferlist *bl) override;

  void objects_read_async(
    const hobject_t &hoid,
    const list<pair<boost::tuple<uint64_t, uint64_t, uint32_t>,
                    pair<bufferlist*, Context*> > > &to_read,
    Context *on_complete,
    bool fast_read = false) override;

private:
  // push
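  // Progress of one object being pushed to one peer; the primary keeps an
  // entry per (object, replica) in `pushing` below.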
  struct PushInfo {
    ObjectRecoveryProgress recovery_progress;
    ObjectRecoveryInfo recovery_info;
    ObjectContextRef obc;
    object_stat_sum_t stat;
    ObcLockManager lock_manager;

    void dump(Formatter *f) const {
      {
        f->open_object_section("recovery_progress");
        recovery_progress.dump(f);
        f->close_section();
      }
      {
        f->open_object_section("recovery_info");
        recovery_info.dump(f);
        f->close_section();
      }
    }
  };
  map<hobject_t, map<pg_shard_t, PushInfo>> pushing;

  // pull
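  // Progress of one object this OSD is pulling from a peer; `pulling` is
  // indexed by object, with pull_from_peer as the reverse index.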
  struct PullInfo {
    pg_shard_t from;
    hobject_t soid;
    ObjectRecoveryProgress recovery_progress;
    ObjectRecoveryInfo recovery_info;
    ObjectContextRef head_ctx;
    ObjectContextRef obc;
    object_stat_sum_t stat;
    bool cache_dont_need;
    ObcLockManager lock_manager;

    void dump(Formatter *f) const {
      {
        f->open_object_section("recovery_progress");
        recovery_progress.dump(f);
        f->close_section();
      }
      {
        f->open_object_section("recovery_info");
        recovery_info.dump(f);
        f->close_section();
      }
    }

    bool is_complete() const {
      return recovery_progress.is_complete(recovery_info);
    }
  };

  map<hobject_t, PullInfo> pulling;

  // Reverse mapping from osd peer to objects being pulled from that peer
  map<pg_shard_t, set<hobject_t> > pull_from_peer;
  void clear_pull(
    map<hobject_t, PullInfo>::iterator piter,
    bool clear_pull_from_peer = true);
  void clear_pull_from(
    map<hobject_t, PullInfo>::iterator piter);

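  // An incoming push message is interpreted by role: on the primary it
  // carries data we previously asked for (a pull response), on a replica it
  // is an object being pushed to us by the primary.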
  void _do_push(OpRequestRef op);
  void _do_pull_response(OpRequestRef op);
  void do_push(OpRequestRef op) {
    if (is_primary()) {
      _do_pull_response(op);
    } else {
      _do_push(op);
    }
  }
  void do_pull(OpRequestRef op);
  void do_push_reply(OpRequestRef op);

  bool handle_push_reply(pg_shard_t peer, const PushReplyOp &op, PushOp *reply);
  void handle_pull(pg_shard_t peer, PullOp &op, PushOp *reply);

  struct pull_complete_info {
    hobject_t hoid;
    object_stat_sum_t stat;
  };
  bool handle_pull_response(
    pg_shard_t from, const PushOp &op, PullOp *response,
    list<pull_complete_info> *to_continue,
    ObjectStore::Transaction *t);
  void handle_push(pg_shard_t from, const PushOp &op, PushReplyOp *response,
                   ObjectStore::Transaction *t, bool is_repair);

  static void trim_pushed_data(const interval_set<uint64_t> &copy_subset,
                               const interval_set<uint64_t> &intervals_received,
                               bufferlist data_received,
                               interval_set<uint64_t> *intervals_usable,
                               bufferlist *data_usable);
  void _failed_pull(pg_shard_t from, const hobject_t &soid);

  void send_pushes(int prio, map<pg_shard_t, vector<PushOp> > &pushes);
  void prep_push_op_blank(const hobject_t& soid, PushOp *op);
  void send_pulls(
    int priority,
    map<pg_shard_t, vector<PullOp> > &pulls);

  int build_push_op(const ObjectRecoveryInfo &recovery_info,
                    const ObjectRecoveryProgress &progress,
                    ObjectRecoveryProgress *out_progress,
                    PushOp *out_op,
                    object_stat_sum_t *stat = 0,
                    bool cache_dont_need = true);
  void submit_push_data(const ObjectRecoveryInfo &recovery_info,
                        bool first,
                        bool complete,
                        bool clear_omap,
                        bool cache_dont_need,
                        interval_set<uint64_t> &data_zeros,
                        const interval_set<uint64_t> &intervals_included,
                        bufferlist data_included,
                        bufferlist omap_header,
                        const map<string, bufferlist> &attrs,
                        const map<string, bufferlist> &omap_entries,
                        ObjectStore::Transaction *t);
  void submit_push_complete(const ObjectRecoveryInfo &recovery_info,
                            ObjectStore::Transaction *t);

  void calc_clone_subsets(
    SnapSet& snapset, const hobject_t& poid, const pg_missing_t& missing,
    const hobject_t &last_backfill,
    interval_set<uint64_t>& data_subset,
    map<hobject_t, interval_set<uint64_t>>& clone_subsets,
    ObcLockManager &lock_manager);
  void prepare_pull(
    eversion_t v,
    const hobject_t& soid,
    ObjectContextRef headctx,
    RPGHandle *h);
  int start_pushes(
    const hobject_t &soid,
    ObjectContextRef obj,
    RPGHandle *h);
  int prep_push_to_replica(
    ObjectContextRef obc, const hobject_t& soid, pg_shard_t peer,
    PushOp *pop, bool cache_dont_need = true);
  int prep_push(
    ObjectContextRef obc,
    const hobject_t& oid, pg_shard_t dest,
    PushOp *op,
    bool cache_dont_need);
  int prep_push(
    ObjectContextRef obc,
    const hobject_t& soid, pg_shard_t peer,
    eversion_t version,
    interval_set<uint64_t> &data_subset,
    map<hobject_t, interval_set<uint64_t>>& clone_subsets,
    PushOp *op,
    bool cache,
    ObcLockManager &&lock_manager);
  void calc_head_subsets(
    ObjectContextRef obc, SnapSet& snapset, const hobject_t& head,
    const pg_missing_t& missing,
    const hobject_t &last_backfill,
    interval_set<uint64_t>& data_subset,
    map<hobject_t, interval_set<uint64_t>>& clone_subsets,
    ObcLockManager &lock_manager);
  ObjectRecoveryInfo recalc_subsets(
    const ObjectRecoveryInfo& recovery_info,
    SnapSetContext *ssc,
    ObcLockManager &lock_manager);

  /**
   * Client IO
   */
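  // Tracks one in-flight replicated write from submit_transaction until every
  // shard (local and remote) has committed; on_commit fires once done().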
  struct InProgressOp : public RefCountedObject {
    ceph_tid_t tid;
    set<pg_shard_t> waiting_for_commit;
    Context *on_commit;
    OpRequestRef op;
    eversion_t v;
    bool done() const {
      return waiting_for_commit.empty();
    }
  private:
    FRIEND_MAKE_REF(InProgressOp);
    InProgressOp(ceph_tid_t tid, Context *on_commit, OpRequestRef op, eversion_t v)
      :
      tid(tid), on_commit(on_commit),
      op(op), v(v) {}
  };
  map<ceph_tid_t, ceph::ref_t<InProgressOp>> in_progress_ops;
public:
  friend class C_OSD_OnOpCommit;

  void call_write_ordered(std::function<void(void)> &&cb) override {
    // ReplicatedBackend submits writes inline in submit_transaction, so
    // we can just call the callback.
    cb();
  }

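  // Queue a client write: the transaction is applied to the local store and a
  // sub-op is sent to each replica; on_all_commit runs once every shard,
  // including this one, reports commit.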
  void submit_transaction(
    const hobject_t &hoid,
    const object_stat_sum_t &delta_stats,
    const eversion_t &at_version,
    PGTransactionUPtr &&t,
    const eversion_t &trim_to,
    const eversion_t &min_last_complete_ondisk,
    const vector<pg_log_entry_t> &log_entries,
    std::optional<pg_hit_set_history_t> &hset_history,
    Context *on_all_commit,
    ceph_tid_t tid,
    osd_reqid_t reqid,
    OpRequestRef op
    ) override;

private:
  Message * generate_subop(
    const hobject_t &soid,
    const eversion_t &at_version,
    ceph_tid_t tid,
    osd_reqid_t reqid,
    eversion_t pg_trim_to,
    eversion_t min_last_complete_ondisk,
    hobject_t new_temp_oid,
    hobject_t discard_temp_oid,
    const bufferlist &log_entries,
    std::optional<pg_hit_set_history_t> &hset_history,
    ObjectStore::Transaction &op_t,
    pg_shard_t peer,
    const pg_info_t &pinfo);
  void issue_op(
    const hobject_t &soid,
    const eversion_t &at_version,
    ceph_tid_t tid,
    osd_reqid_t reqid,
    eversion_t pg_trim_to,
    eversion_t min_last_complete_ondisk,
    hobject_t new_temp_oid,
    hobject_t discard_temp_oid,
    const vector<pg_log_entry_t> &log_entries,
    std::optional<pg_hit_set_history_t> &hset_history,
    InProgressOp *op,
    ObjectStore::Transaction &op_t);
  void op_commit(const ceph::ref_t<InProgressOp>& op);
  void do_repop_reply(OpRequestRef op);
  void do_repop(OpRequestRef op);

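  // Replica-side state for a sub-op received from the primary, kept until the
  // local transaction commits and the reply has been sent back.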
  struct RepModify {
    OpRequestRef op;
    bool committed;
    int ackerosd;
    eversion_t last_complete;
    epoch_t epoch_started;

    ObjectStore::Transaction opt, localt;

    RepModify() : committed(false), ackerosd(-1),
                  epoch_started(0) {}
  };
  typedef std::shared_ptr<RepModify> RepModifyRef;

  struct C_OSD_RepModifyCommit;

  void repop_commit(RepModifyRef rm);
  bool auto_repair_supported() const override { return store->has_builtin_csum(); }


  int be_deep_scrub(
    const hobject_t &poid,
    ScrubMap &map,
    ScrubMapBuilder &pos,
    ScrubMap::object &o) override;
  uint64_t be_get_ondisk_size(uint64_t logical_size) override { return logical_size; }
};

#endif