]> git.proxmox.com Git - ceph.git/blame - ceph/src/osd/PGBackend.h
update sources to 12.2.7
[ceph.git] / ceph / src / osd / PGBackend.h
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2013,2014 Inktank Storage, Inc.
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 *
9 * Author: Loic Dachary <loic@dachary.org>
10 *
11 * This is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License version 2.1, as published by the Free Software
14 * Foundation. See file COPYING.
15 *
16 */
17
18#ifndef PGBACKEND_H
19#define PGBACKEND_H
20
21#include "osd_types.h"
22#include "common/WorkQueue.h"
23#include "include/Context.h"
24#include "os/ObjectStore.h"
25#include "common/LogClient.h"
26#include <string>
27#include "PGTransaction.h"
28
29namespace Scrub {
30 class Store;
31}
32struct shard_info_wrapper;
33struct inconsistent_obj_wrapper;
34
35//forward declaration
36class OSDMap;
37class PGLog;
38typedef ceph::shared_ptr<const OSDMap> OSDMapRef;
39
40 /**
41 * PGBackend
42 *
43 * PGBackend defines an interface for logic handling IO and
44 * replication on RADOS objects. The PGBackend implementation
45 * is responsible for:
46 *
47 * 1) Handling client operations
48 * 2) Handling object recovery
49 * 3) Handling object access
50 * 4) Handling scrub, deep-scrub, repair
51 */
52 class PGBackend {
53 public:
54 CephContext* cct;
55 protected:
56 ObjectStore *store;
57 const coll_t coll;
58 ObjectStore::CollectionHandle &ch;
59 public:
60 /**
61 * Provides interfaces for PGBackend callbacks
62 *
63 * The intention is that the parent calls into the PGBackend
64 * implementation holding a lock and that the callbacks are
65 * called under the same locks.
66 */
67 class Listener {
68 public:
69 /// Debugging
70 virtual DoutPrefixProvider *get_dpp() = 0;
71
72 /// Recovery
73
74 /**
75 * Called with the transaction recovering oid
76 */
77 virtual void on_local_recover(
78 const hobject_t &oid,
79 const ObjectRecoveryInfo &recovery_info,
80 ObjectContextRef obc,
c07f9fc5 81 bool is_delete,
7c673cae
FG
82 ObjectStore::Transaction *t
83 ) = 0;
84
85 /**
86 * Called when transaction recovering oid is durable and
87 * applied on all replicas
88 */
89 virtual void on_global_recover(
90 const hobject_t &oid,
c07f9fc5
FG
91 const object_stat_sum_t &stat_diff,
92 bool is_delete
7c673cae
FG
93 ) = 0;
94
95 /**
96 * Called when peer is recovered
97 */
98 virtual void on_peer_recover(
99 pg_shard_t peer,
100 const hobject_t &oid,
101 const ObjectRecoveryInfo &recovery_info
102 ) = 0;
103
104 virtual void begin_peer_recover(
105 pg_shard_t peer,
106 const hobject_t oid) = 0;
107
108 virtual void failed_push(const list<pg_shard_t> &from, const hobject_t &soid) = 0;
b32b8144 109 virtual void finish_degraded_object(const hobject_t& oid) = 0;
224ce89b
WB
110 virtual void primary_failed(const hobject_t &soid) = 0;
111 virtual bool primary_error(const hobject_t& soid, eversion_t v) = 0;
7c673cae
FG
112 virtual void cancel_pull(const hobject_t &soid) = 0;
113
114 virtual void apply_stats(
115 const hobject_t &soid,
116 const object_stat_sum_t &delta_stats) = 0;
117
224ce89b
WB
118 /**
119 * Called when a read on the primary fails when pushing
120 */
121 virtual void on_primary_error(
122 const hobject_t &oid,
123 eversion_t v
124 ) = 0;
125
b32b8144
FG
126 virtual void backfill_add_missing(
127 const hobject_t &oid,
128 eversion_t v
129 ) = 0;
130
c07f9fc5
FG
131 virtual void remove_missing_object(const hobject_t &oid,
132 eversion_t v,
133 Context *on_complete) = 0;
7c673cae 134
28e407b8 135
7c673cae
FG
136 /**
137 * Bless a context
138 *
139 * Wraps a context in whatever outer layers the parent usually
140 * uses to call into the PGBackend
141 */
142 virtual Context *bless_context(Context *c) = 0;
143 virtual GenContext<ThreadPool::TPHandle&> *bless_gencontext(
144 GenContext<ThreadPool::TPHandle&> *c) = 0;
145
146 virtual void send_message(int to_osd, Message *m) = 0;
147 virtual void queue_transaction(
148 ObjectStore::Transaction&& t,
149 OpRequestRef op = OpRequestRef()
150 ) = 0;
151 virtual void queue_transactions(
152 vector<ObjectStore::Transaction>& tls,
153 OpRequestRef op = OpRequestRef()
154 ) = 0;
155 virtual epoch_t get_epoch() const = 0;
156 virtual epoch_t get_interval_start_epoch() const = 0;
157 virtual epoch_t get_last_peering_reset_epoch() const = 0;
158
159 virtual const set<pg_shard_t> &get_actingbackfill_shards() const = 0;
160 virtual const set<pg_shard_t> &get_acting_shards() const = 0;
161 virtual const set<pg_shard_t> &get_backfill_shards() const = 0;
162
163 virtual std::string gen_dbg_prefix() const = 0;
164
165 virtual const map<hobject_t, set<pg_shard_t>> &get_missing_loc_shards()
166 const = 0;
167
168 virtual const pg_missing_tracker_t &get_local_missing() const = 0;
169 virtual const map<pg_shard_t, pg_missing_t> &get_shard_missing()
170 const = 0;
171 virtual boost::optional<const pg_missing_const_i &> maybe_get_shard_missing(
172 pg_shard_t peer) const {
173 if (peer == primary_shard()) {
174 return get_local_missing();
175 } else {
176 map<pg_shard_t, pg_missing_t>::const_iterator i =
177 get_shard_missing().find(peer);
178 if (i == get_shard_missing().end()) {
179 return boost::optional<const pg_missing_const_i &>();
180 } else {
181 return i->second;
182 }
183 }
184 }
185 virtual const pg_missing_const_i &get_shard_missing(pg_shard_t peer) const {
186 auto m = maybe_get_shard_missing(peer);
187 assert(m);
188 return *m;
189 }
190
191 virtual const map<pg_shard_t, pg_info_t> &get_shard_info() const = 0;
192 virtual const pg_info_t &get_shard_info(pg_shard_t peer) const {
193 if (peer == primary_shard()) {
194 return get_info();
195 } else {
196 map<pg_shard_t, pg_info_t>::const_iterator i =
197 get_shard_info().find(peer);
198 assert(i != get_shard_info().end());
199 return i->second;
200 }
201 }
202
203 virtual const PGLog &get_log() const = 0;
204 virtual bool pgb_is_primary() const = 0;
205 virtual OSDMapRef pgb_get_osdmap() const = 0;
206 virtual const pg_info_t &get_info() const = 0;
207 virtual const pg_pool_t &get_pool() const = 0;
208
209 virtual ObjectContextRef get_obc(
210 const hobject_t &hoid,
211 const map<string, bufferlist> &attrs) = 0;
212
213 virtual bool try_lock_for_read(
214 const hobject_t &hoid,
215 ObcLockManager &manager) = 0;
216
217 virtual void release_locks(ObcLockManager &manager) = 0;
218
219 virtual void op_applied(
220 const eversion_t &applied_version) = 0;
221
222 virtual bool should_send_op(
223 pg_shard_t peer,
224 const hobject_t &hoid) = 0;
225
226 virtual void log_operation(
227 const vector<pg_log_entry_t> &logv,
228 const boost::optional<pg_hit_set_history_t> &hset_history,
229 const eversion_t &trim_to,
230 const eversion_t &roll_forward_to,
231 bool transaction_applied,
232 ObjectStore::Transaction &t) = 0;
233
234 virtual void pgb_set_object_snap_mapping(
235 const hobject_t &soid,
236 const set<snapid_t> &snaps,
237 ObjectStore::Transaction *t) = 0;
238
239 virtual void pgb_clear_object_snap_mapping(
240 const hobject_t &soid,
241 ObjectStore::Transaction *t) = 0;
242
243 virtual void update_peer_last_complete_ondisk(
244 pg_shard_t fromosd,
245 eversion_t lcod) = 0;
246
247 virtual void update_last_complete_ondisk(
248 eversion_t lcod) = 0;
249
250 virtual void update_stats(
251 const pg_stat_t &stat) = 0;
252
253 virtual void schedule_recovery_work(
254 GenContext<ThreadPool::TPHandle&> *c) = 0;
255
256 virtual pg_shard_t whoami_shard() const = 0;
257 int whoami() const {
258 return whoami_shard().osd;
259 }
260 spg_t whoami_spg_t() const {
261 return get_info().pgid;
262 }
263
264 virtual spg_t primary_spg_t() const = 0;
265 virtual pg_shard_t primary_shard() const = 0;
266
267 virtual uint64_t min_peer_features() const = 0;
268
269 virtual hobject_t get_temp_recovery_object(const hobject_t& target,
270 eversion_t version) = 0;
271
272 virtual void send_message_osd_cluster(
273 int peer, Message *m, epoch_t from_epoch) = 0;
274 virtual void send_message_osd_cluster(
275 Message *m, Connection *con) = 0;
276 virtual void send_message_osd_cluster(
277 Message *m, const ConnectionRef& con) = 0;
278 virtual ConnectionRef get_con_osd_cluster(int peer, epoch_t from_epoch) = 0;
279 virtual entity_name_t get_cluster_msgr_name() = 0;
280
281 virtual PerfCounters *get_logger() = 0;
282
283 virtual ceph_tid_t get_tid() = 0;
284
285 virtual LogClientTemp clog_error() = 0;
c07f9fc5 286 virtual LogClientTemp clog_warn() = 0;
7c673cae
FG
287
288 virtual bool check_failsafe_full(ostream &ss) = 0;
289
290 virtual bool check_osdmap_full(const set<pg_shard_t> &missing_on) = 0;
291
28e407b8 292 virtual bool maybe_preempt_replica_scrub(const hobject_t& oid) = 0;
7c673cae
FG
293 virtual ~Listener() {}
294 };
295 Listener *parent;
296 Listener *get_parent() const { return parent; }
297 PGBackend(CephContext* cct, Listener *l, ObjectStore *store, coll_t coll,
298 ObjectStore::CollectionHandle &ch) :
299 cct(cct),
300 store(store),
301 coll(coll),
302 ch(ch),
303 parent(l) {}
304 bool is_primary() const { return get_parent()->pgb_is_primary(); }
305 OSDMapRef get_osdmap() const { return get_parent()->pgb_get_osdmap(); }
306 const pg_info_t &get_info() { return get_parent()->get_info(); }
307
308 std::string gen_prefix() const {
309 return parent->gen_dbg_prefix();
310 }
311
312 /**
313 * RecoveryHandle
314 *
315 * We may want to recover multiple objects in the same set of
316 * messages. RecoveryHandle is an interface for the opaque
317 * object used by the implementation to store the details of
318 * the pending recovery operations.
319 */
320 struct RecoveryHandle {
321 bool cache_dont_need;
c07f9fc5 322 map<pg_shard_t, vector<pair<hobject_t, eversion_t> > > deletes;
7c673cae
FG
323
324 RecoveryHandle(): cache_dont_need(false) {}
325 virtual ~RecoveryHandle() {}
326 };
327
328 /// Get a fresh recovery operation
329 virtual RecoveryHandle *open_recovery_op() = 0;
330
331 /// run_recovery_op: finish the operation represented by h
332 virtual void run_recovery_op(
333 RecoveryHandle *h, ///< [in] op to finish
334 int priority ///< [in] msg priority
335 ) = 0;
336
c07f9fc5
FG
337 void recover_delete_object(const hobject_t &oid, eversion_t v,
338 RecoveryHandle *h);
339 void send_recovery_deletes(int prio,
340 const map<pg_shard_t, vector<pair<hobject_t, eversion_t> > > &deletes);
341
7c673cae
FG
342 /**
343 * recover_object
344 *
345 * Triggers a recovery operation on the specified hobject_t
346 * onreadable must be called before onwriteable
347 *
348 * On each replica (primary included), get_parent()->on_not_missing()
349 * must be called when the transaction finalizing the recovery
350 * is queued. Similarly, get_parent()->on_readable() must be called
351 * when the transaction is applied in the backing store.
352 *
353 * get_parent()->on_not_degraded() should be called on the primary
354 * when writes can resume on the object.
355 *
356 * obc may be NULL if the primary lacks the object.
357 *
358 * head may be NULL only if the head/snapdir is missing
359 *
360 * @param missing [in] set of info, missing pairs for queried nodes
361 * @param overlaps [in] mapping of object to file offset overlaps
362 */
224ce89b 363 virtual int recover_object(
7c673cae
FG
364 const hobject_t &hoid, ///< [in] object to recover
365 eversion_t v, ///< [in] version to recover
366 ObjectContextRef head, ///< [in] context of the head/snapdir object
367 ObjectContextRef obc, ///< [in] context of the object
368 RecoveryHandle *h ///< [in,out] handle to attach recovery op to
369 ) = 0;
370
371 /**
372 * true if PGBackend can handle this message while inactive
373 *
374 * If it returns true, handle_message *must* also return true
375 */
376 virtual bool can_handle_while_inactive(OpRequestRef op) = 0;
377
378 /// gives PGBackend a crack at an incoming message
c07f9fc5 379 bool handle_message(
7c673cae 380 OpRequestRef op ///< [in] message received
c07f9fc5
FG
381 ); ///< @return true if the message was handled
382
383 /// the variant of handle_message that is overridden by child classes
384 virtual bool _handle_message(OpRequestRef op) = 0;
7c673cae
FG
385
386 virtual void check_recovery_sources(const OSDMapRef& osdmap) = 0;
387
388
389 /**
390 * clean up any temporary on-disk state due to a pg interval change
391 */
392 void on_change_cleanup(ObjectStore::Transaction *t);
393 /**
394 * implementation should clear itself, contexts blessed prior to on_change
395 * won't be called after on_change()
396 */
397 virtual void on_change() = 0;
398 virtual void clear_recovery_state() = 0;
399
400 virtual void on_flushed() = 0;
401
402 virtual IsPGRecoverablePredicate *get_is_recoverable_predicate() = 0;
403 virtual IsPGReadablePredicate *get_is_readable_predicate() = 0;
404
405 virtual void dump_recovery_info(Formatter *f) const = 0;
406
407 private:
408 set<hobject_t> temp_contents;
409 public:
410 // Track contents of temp collection, clear on reset
411 void add_temp_obj(const hobject_t &oid) {
412 temp_contents.insert(oid);
413 }
414 void add_temp_objs(const set<hobject_t> &oids) {
415 temp_contents.insert(oids.begin(), oids.end());
416 }
417 void clear_temp_obj(const hobject_t &oid) {
418 temp_contents.erase(oid);
419 }
420 void clear_temp_objs(const set<hobject_t> &oids) {
421 for (set<hobject_t>::const_iterator i = oids.begin();
422 i != oids.end();
423 ++i) {
424 temp_contents.erase(*i);
425 }
426 }
427
428 virtual ~PGBackend() {}
429
430 /// execute implementation specific transaction
431 virtual void submit_transaction(
432 const hobject_t &hoid, ///< [in] object
433 const object_stat_sum_t &delta_stats,///< [in] stat change
434 const eversion_t &at_version, ///< [in] version
435 PGTransactionUPtr &&t, ///< [in] trans to execute (move)
436 const eversion_t &trim_to, ///< [in] trim log to here
437 const eversion_t &roll_forward_to, ///< [in] trim rollback info to here
438 const vector<pg_log_entry_t> &log_entries, ///< [in] log entries for t
439 /// [in] hitset history (if updated with this transaction)
440 boost::optional<pg_hit_set_history_t> &hset_history,
441 Context *on_local_applied_sync, ///< [in] called when applied locally
442 Context *on_all_applied, ///< [in] called when all acked
443 Context *on_all_commit, ///< [in] called when all commit
444 ceph_tid_t tid, ///< [in] tid
445 osd_reqid_t reqid, ///< [in] reqid
446 OpRequestRef op ///< [in] op
447 ) = 0;
448
449 /// submit callback to be called in order with pending writes
450 virtual void call_write_ordered(std::function<void(void)> &&cb) = 0;
451
452 void try_stash(
453 const hobject_t &hoid,
454 version_t v,
455 ObjectStore::Transaction *t);
456
457 void rollback(
458 const pg_log_entry_t &entry,
459 ObjectStore::Transaction *t);
460
461 friend class LRBTrimmer;
462 void rollforward(
463 const pg_log_entry_t &entry,
464 ObjectStore::Transaction *t);
465
466 void trim(
467 const pg_log_entry_t &entry,
468 ObjectStore::Transaction *t);
469
470 void remove(
471 const hobject_t &hoid,
472 ObjectStore::Transaction *t);
473
474 protected:
c07f9fc5
FG
475
476 void handle_recovery_delete(OpRequestRef op);
477 void handle_recovery_delete_reply(OpRequestRef op);
478
7c673cae
FG
479 /// Reapply old attributes
480 void rollback_setattrs(
481 const hobject_t &hoid,
482 map<string, boost::optional<bufferlist> > &old_attrs,
483 ObjectStore::Transaction *t);
484
485 /// Truncate object to rollback append
486 virtual void rollback_append(
487 const hobject_t &hoid,
488 uint64_t old_size,
489 ObjectStore::Transaction *t);
490
491 /// Unstash object to rollback stash
492 void rollback_stash(
493 const hobject_t &hoid,
494 version_t old_version,
495 ObjectStore::Transaction *t);
496
497 /// Unstash object to rollback stash
498 void rollback_try_stash(
499 const hobject_t &hoid,
500 version_t old_version,
501 ObjectStore::Transaction *t);
502
503 /// Delete object to rollback create
504 void rollback_create(
505 const hobject_t &hoid,
506 ObjectStore::Transaction *t) {
507 remove(hoid, t);
508 }
509
510 /// Clone the extents back into place
511 void rollback_extents(
512 version_t gen,
513 const vector<pair<uint64_t, uint64_t> > &extents,
514 const hobject_t &hoid,
515 ObjectStore::Transaction *t);
516 public:
517
518 /// Trim object stashed at version
519 void trim_rollback_object(
520 const hobject_t &hoid,
521 version_t gen,
522 ObjectStore::Transaction *t);
523
524 /// List objects in collection
525 int objects_list_partial(
526 const hobject_t &begin,
527 int min,
528 int max,
529 vector<hobject_t> *ls,
530 hobject_t *next);
531
532 int objects_list_range(
533 const hobject_t &start,
534 const hobject_t &end,
535 snapid_t seq,
536 vector<hobject_t> *ls,
537 vector<ghobject_t> *gen_obs=0);
538
539 int objects_get_attr(
540 const hobject_t &hoid,
541 const string &attr,
542 bufferlist *out);
543
544 virtual int objects_get_attrs(
545 const hobject_t &hoid,
546 map<string, bufferlist> *out);
547
548 virtual int objects_read_sync(
549 const hobject_t &hoid,
550 uint64_t off,
551 uint64_t len,
552 uint32_t op_flags,
553 bufferlist *bl) = 0;
554
555 virtual void objects_read_async(
556 const hobject_t &hoid,
557 const list<pair<boost::tuple<uint64_t, uint64_t, uint32_t>,
558 pair<bufferlist*, Context*> > > &to_read,
559 Context *on_complete, bool fast_read = false) = 0;
560
561 virtual bool scrub_supported() = 0;
562 virtual bool auto_repair_supported() const = 0;
28e407b8
AA
563 int be_scan_list(
564 ScrubMap &map,
565 ScrubMapBuilder &pos);
7c673cae
FG
566 bool be_compare_scrub_objects(
567 pg_shard_t auth_shard,
568 const ScrubMap::object &auth,
569 const object_info_t& auth_oi,
570 const ScrubMap::object &candidate,
571 shard_info_wrapper& shard_error,
572 inconsistent_obj_wrapper &result,
573 ostream &errorstream);
574 map<pg_shard_t, ScrubMap *>::const_iterator be_select_auth_object(
575 const hobject_t &obj,
576 const map<pg_shard_t,ScrubMap*> &maps,
577 object_info_t *auth_oi,
578 map<pg_shard_t, shard_info_wrapper> &shard_map,
579 inconsistent_obj_wrapper &object_error);
580 void be_compare_scrubmaps(
581 const map<pg_shard_t,ScrubMap*> &maps,
28e407b8 582 const set<hobject_t> &master_set,
7c673cae
FG
583 bool repair,
584 map<hobject_t, set<pg_shard_t>> &missing,
585 map<hobject_t, set<pg_shard_t>> &inconsistent,
586 map<hobject_t, list<pg_shard_t>> &authoritative,
28e407b8
AA
587 map<hobject_t, pair<boost::optional<uint32_t>,
588 boost::optional<uint32_t>>> &missing_digest,
7c673cae
FG
589 int &shallow_errors, int &deep_errors,
590 Scrub::Store *store,
591 const spg_t& pgid,
592 const vector<int> &acting,
593 ostream &errorstream);
594 virtual uint64_t be_get_ondisk_size(
595 uint64_t logical_size) = 0;
28e407b8
AA
596 virtual int be_deep_scrub(
597 const hobject_t &oid,
598 ScrubMap &map,
599 ScrubMapBuilder &pos,
600 ScrubMap::object &o) = 0;
601 void be_large_omap_check(
602 const map<pg_shard_t,ScrubMap*> &maps,
603 const set<hobject_t> &master_set,
604 int& large_omap_objects,
605 ostream &warnstream) const;
7c673cae
FG
606
607 static PGBackend *build_pg_backend(
608 const pg_pool_t &pool,
609 const OSDMapRef curmap,
610 Listener *l,
611 coll_t coll,
612 ObjectStore::CollectionHandle &ch,
613 ObjectStore *store,
614 CephContext *cct);
615};
616
617#endif