]> git.proxmox.com Git - ceph.git/blob - ceph/src/osd/PGBackend.h
b50f0d8c78cbdab2db7bbaa835b13fd13571b6cf
[ceph.git] / ceph / src / osd / PGBackend.h
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2013,2014 Inktank Storage, Inc.
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 *
9 * Author: Loic Dachary <loic@dachary.org>
10 *
11 * This is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License version 2.1, as published by the Free Software
14 * Foundation. See file COPYING.
15 *
16 */
17
18 #ifndef PGBACKEND_H
19 #define PGBACKEND_H
20
21 #include "osd_types.h"
22 #include "common/WorkQueue.h"
23 #include "include/Context.h"
24 #include "os/ObjectStore.h"
25 #include "common/LogClient.h"
26 #include <string>
27 #include "PGTransaction.h"
28
29 namespace Scrub {
30 class Store;
31 }
32 struct shard_info_wrapper;
33 struct inconsistent_obj_wrapper;
34
35 //forward declaration
36 class OSDMap;
37 class PGLog;
38 typedef ceph::shared_ptr<const OSDMap> OSDMapRef;
39
40 /**
41 * PGBackend
42 *
43 * PGBackend defines an interface for logic handling IO and
44 * replication on RADOS objects. The PGBackend implementation
45 * is responsible for:
46 *
47 * 1) Handling client operations
48 * 2) Handling object recovery
49 * 3) Handling object access
50 * 4) Handling scrub, deep-scrub, repair
51 */
52 class PGBackend {
53 public:
54 CephContext* cct;
55 protected:
56 ObjectStore *store;
57 const coll_t coll;
58 ObjectStore::CollectionHandle &ch;
59 public:
60 /**
61 * Provides interfaces for PGBackend callbacks
62 *
63 * The intention is that the parent calls into the PGBackend
64 * implementation holding a lock and that the callbacks are
65 * called under the same locks.
66 */
67 class Listener {
68 public:
69 /// Debugging
70 virtual DoutPrefixProvider *get_dpp() = 0;
71
72 /// Recovery
73
74 /**
75 * Called with the transaction recovering oid
76 */
77 virtual void on_local_recover(
78 const hobject_t &oid,
79 const ObjectRecoveryInfo &recovery_info,
80 ObjectContextRef obc,
81 ObjectStore::Transaction *t
82 ) = 0;
83
84 /**
85 * Called when transaction recovering oid is durable and
86 * applied on all replicas
87 */
88 virtual void on_global_recover(
89 const hobject_t &oid,
90 const object_stat_sum_t &stat_diff
91 ) = 0;
92
93 /**
94 * Called when peer is recovered
95 */
96 virtual void on_peer_recover(
97 pg_shard_t peer,
98 const hobject_t &oid,
99 const ObjectRecoveryInfo &recovery_info
100 ) = 0;
101
102 virtual void begin_peer_recover(
103 pg_shard_t peer,
104 const hobject_t oid) = 0;
105
106 virtual void failed_push(const list<pg_shard_t> &from, const hobject_t &soid) = 0;
107
108 virtual void cancel_pull(const hobject_t &soid) = 0;
109
110 virtual void apply_stats(
111 const hobject_t &soid,
112 const object_stat_sum_t &delta_stats) = 0;
113
114
115 /**
116 * Bless a context
117 *
118 * Wraps a context in whatever outer layers the parent usually
119 * uses to call into the PGBackend
120 */
121 virtual Context *bless_context(Context *c) = 0;
122 virtual GenContext<ThreadPool::TPHandle&> *bless_gencontext(
123 GenContext<ThreadPool::TPHandle&> *c) = 0;
124
125 virtual void send_message(int to_osd, Message *m) = 0;
126 virtual void queue_transaction(
127 ObjectStore::Transaction&& t,
128 OpRequestRef op = OpRequestRef()
129 ) = 0;
130 virtual void queue_transactions(
131 vector<ObjectStore::Transaction>& tls,
132 OpRequestRef op = OpRequestRef()
133 ) = 0;
134 virtual epoch_t get_epoch() const = 0;
135 virtual epoch_t get_interval_start_epoch() const = 0;
136 virtual epoch_t get_last_peering_reset_epoch() const = 0;
137
138 virtual const set<pg_shard_t> &get_actingbackfill_shards() const = 0;
139 virtual const set<pg_shard_t> &get_acting_shards() const = 0;
140 virtual const set<pg_shard_t> &get_backfill_shards() const = 0;
141
142 virtual std::string gen_dbg_prefix() const = 0;
143
144 virtual const map<hobject_t, set<pg_shard_t>> &get_missing_loc_shards()
145 const = 0;
146
147 virtual const pg_missing_tracker_t &get_local_missing() const = 0;
148 virtual const map<pg_shard_t, pg_missing_t> &get_shard_missing()
149 const = 0;
150 virtual boost::optional<const pg_missing_const_i &> maybe_get_shard_missing(
151 pg_shard_t peer) const {
152 if (peer == primary_shard()) {
153 return get_local_missing();
154 } else {
155 map<pg_shard_t, pg_missing_t>::const_iterator i =
156 get_shard_missing().find(peer);
157 if (i == get_shard_missing().end()) {
158 return boost::optional<const pg_missing_const_i &>();
159 } else {
160 return i->second;
161 }
162 }
163 }
164 virtual const pg_missing_const_i &get_shard_missing(pg_shard_t peer) const {
165 auto m = maybe_get_shard_missing(peer);
166 assert(m);
167 return *m;
168 }
169
170 virtual const map<pg_shard_t, pg_info_t> &get_shard_info() const = 0;
171 virtual const pg_info_t &get_shard_info(pg_shard_t peer) const {
172 if (peer == primary_shard()) {
173 return get_info();
174 } else {
175 map<pg_shard_t, pg_info_t>::const_iterator i =
176 get_shard_info().find(peer);
177 assert(i != get_shard_info().end());
178 return i->second;
179 }
180 }
181
182 virtual const PGLog &get_log() const = 0;
183 virtual bool pgb_is_primary() const = 0;
184 virtual OSDMapRef pgb_get_osdmap() const = 0;
185 virtual const pg_info_t &get_info() const = 0;
186 virtual const pg_pool_t &get_pool() const = 0;
187
188 virtual ObjectContextRef get_obc(
189 const hobject_t &hoid,
190 const map<string, bufferlist> &attrs) = 0;
191
192 virtual bool try_lock_for_read(
193 const hobject_t &hoid,
194 ObcLockManager &manager) = 0;
195
196 virtual void release_locks(ObcLockManager &manager) = 0;
197
198 virtual void op_applied(
199 const eversion_t &applied_version) = 0;
200
201 virtual bool should_send_op(
202 pg_shard_t peer,
203 const hobject_t &hoid) = 0;
204
205 virtual void log_operation(
206 const vector<pg_log_entry_t> &logv,
207 const boost::optional<pg_hit_set_history_t> &hset_history,
208 const eversion_t &trim_to,
209 const eversion_t &roll_forward_to,
210 bool transaction_applied,
211 ObjectStore::Transaction &t) = 0;
212
213 virtual void pgb_set_object_snap_mapping(
214 const hobject_t &soid,
215 const set<snapid_t> &snaps,
216 ObjectStore::Transaction *t) = 0;
217
218 virtual void pgb_clear_object_snap_mapping(
219 const hobject_t &soid,
220 ObjectStore::Transaction *t) = 0;
221
222 virtual void update_peer_last_complete_ondisk(
223 pg_shard_t fromosd,
224 eversion_t lcod) = 0;
225
226 virtual void update_last_complete_ondisk(
227 eversion_t lcod) = 0;
228
229 virtual void update_stats(
230 const pg_stat_t &stat) = 0;
231
232 virtual void schedule_recovery_work(
233 GenContext<ThreadPool::TPHandle&> *c) = 0;
234
235 virtual pg_shard_t whoami_shard() const = 0;
236 int whoami() const {
237 return whoami_shard().osd;
238 }
239 spg_t whoami_spg_t() const {
240 return get_info().pgid;
241 }
242
243 virtual spg_t primary_spg_t() const = 0;
244 virtual pg_shard_t primary_shard() const = 0;
245
246 virtual uint64_t min_peer_features() const = 0;
247
248 virtual hobject_t get_temp_recovery_object(const hobject_t& target,
249 eversion_t version) = 0;
250
251 virtual void send_message_osd_cluster(
252 int peer, Message *m, epoch_t from_epoch) = 0;
253 virtual void send_message_osd_cluster(
254 Message *m, Connection *con) = 0;
255 virtual void send_message_osd_cluster(
256 Message *m, const ConnectionRef& con) = 0;
257 virtual ConnectionRef get_con_osd_cluster(int peer, epoch_t from_epoch) = 0;
258 virtual entity_name_t get_cluster_msgr_name() = 0;
259
260 virtual PerfCounters *get_logger() = 0;
261
262 virtual ceph_tid_t get_tid() = 0;
263
264 virtual LogClientTemp clog_error() = 0;
265
266 virtual bool check_failsafe_full(ostream &ss) = 0;
267
268 virtual bool check_osdmap_full(const set<pg_shard_t> &missing_on) = 0;
269
270 virtual ~Listener() {}
271 };
272 Listener *parent;
273 Listener *get_parent() const { return parent; }
274 PGBackend(CephContext* cct, Listener *l, ObjectStore *store, coll_t coll,
275 ObjectStore::CollectionHandle &ch) :
276 cct(cct),
277 store(store),
278 coll(coll),
279 ch(ch),
280 parent(l) {}
281 bool is_primary() const { return get_parent()->pgb_is_primary(); }
282 OSDMapRef get_osdmap() const { return get_parent()->pgb_get_osdmap(); }
283 const pg_info_t &get_info() { return get_parent()->get_info(); }
284
285 std::string gen_prefix() const {
286 return parent->gen_dbg_prefix();
287 }
288
289 /**
290 * RecoveryHandle
291 *
292 * We may want to recover multiple objects in the same set of
293 * messages. RecoveryHandle is an interface for the opaque
294 * object used by the implementation to store the details of
295 * the pending recovery operations.
296 */
297 struct RecoveryHandle {
298 bool cache_dont_need;
299
300 RecoveryHandle(): cache_dont_need(false) {}
301 virtual ~RecoveryHandle() {}
302 };
303
304 /// Get a fresh recovery operation
305 virtual RecoveryHandle *open_recovery_op() = 0;
306
307 /// run_recovery_op: finish the operation represented by h
308 virtual void run_recovery_op(
309 RecoveryHandle *h, ///< [in] op to finish
310 int priority ///< [in] msg priority
311 ) = 0;
312
313 /**
314 * recover_object
315 *
316 * Triggers a recovery operation on the specified hobject_t
317 * onreadable must be called before onwriteable
318 *
319 * On each replica (primary included), get_parent()->on_not_missing()
320 * must be called when the transaction finalizing the recovery
321 * is queued. Similarly, get_parent()->on_readable() must be called
322 * when the transaction is applied in the backing store.
323 *
324 * get_parent()->on_not_degraded() should be called on the primary
325 * when writes can resume on the object.
326 *
327 * obc may be NULL if the primary lacks the object.
328 *
329 * head may be NULL only if the head/snapdir is missing
330 *
331 * @param missing [in] set of info, missing pairs for queried nodes
332 * @param overlaps [in] mapping of object to file offset overlaps
333 */
334 virtual void recover_object(
335 const hobject_t &hoid, ///< [in] object to recover
336 eversion_t v, ///< [in] version to recover
337 ObjectContextRef head, ///< [in] context of the head/snapdir object
338 ObjectContextRef obc, ///< [in] context of the object
339 RecoveryHandle *h ///< [in,out] handle to attach recovery op to
340 ) = 0;
341
342 /**
343 * true if PGBackend can handle this message while inactive
344 *
345 * If it returns true, handle_message *must* also return true
346 */
347 virtual bool can_handle_while_inactive(OpRequestRef op) = 0;
348
349 /// gives PGBackend a crack at an incoming message
350 virtual bool handle_message(
351 OpRequestRef op ///< [in] message received
352 ) = 0; ///< @return true if the message was handled
353
354 virtual void check_recovery_sources(const OSDMapRef& osdmap) = 0;
355
356
357 /**
358 * clean up any temporary on-disk state due to a pg interval change
359 */
360 void on_change_cleanup(ObjectStore::Transaction *t);
361 /**
362 * implementation should clear itself, contexts blessed prior to on_change
363 * won't be called after on_change()
364 */
365 virtual void on_change() = 0;
366 virtual void clear_recovery_state() = 0;
367
368 virtual void on_flushed() = 0;
369
370 virtual IsPGRecoverablePredicate *get_is_recoverable_predicate() = 0;
371 virtual IsPGReadablePredicate *get_is_readable_predicate() = 0;
372
373 virtual void dump_recovery_info(Formatter *f) const = 0;
374
375 private:
376 set<hobject_t> temp_contents;
377 public:
378 // Track contents of temp collection, clear on reset
379 void add_temp_obj(const hobject_t &oid) {
380 temp_contents.insert(oid);
381 }
382 void add_temp_objs(const set<hobject_t> &oids) {
383 temp_contents.insert(oids.begin(), oids.end());
384 }
385 void clear_temp_obj(const hobject_t &oid) {
386 temp_contents.erase(oid);
387 }
388 void clear_temp_objs(const set<hobject_t> &oids) {
389 for (set<hobject_t>::const_iterator i = oids.begin();
390 i != oids.end();
391 ++i) {
392 temp_contents.erase(*i);
393 }
394 }
395
396 virtual ~PGBackend() {}
397
398 /// execute implementation specific transaction
399 virtual void submit_transaction(
400 const hobject_t &hoid, ///< [in] object
401 const object_stat_sum_t &delta_stats,///< [in] stat change
402 const eversion_t &at_version, ///< [in] version
403 PGTransactionUPtr &&t, ///< [in] trans to execute (move)
404 const eversion_t &trim_to, ///< [in] trim log to here
405 const eversion_t &roll_forward_to, ///< [in] trim rollback info to here
406 const vector<pg_log_entry_t> &log_entries, ///< [in] log entries for t
407 /// [in] hitset history (if updated with this transaction)
408 boost::optional<pg_hit_set_history_t> &hset_history,
409 Context *on_local_applied_sync, ///< [in] called when applied locally
410 Context *on_all_applied, ///< [in] called when all acked
411 Context *on_all_commit, ///< [in] called when all commit
412 ceph_tid_t tid, ///< [in] tid
413 osd_reqid_t reqid, ///< [in] reqid
414 OpRequestRef op ///< [in] op
415 ) = 0;
416
417 /// submit callback to be called in order with pending writes
418 virtual void call_write_ordered(std::function<void(void)> &&cb) = 0;
419
420 void try_stash(
421 const hobject_t &hoid,
422 version_t v,
423 ObjectStore::Transaction *t);
424
425 void rollback(
426 const pg_log_entry_t &entry,
427 ObjectStore::Transaction *t);
428
429 friend class LRBTrimmer;
430 void rollforward(
431 const pg_log_entry_t &entry,
432 ObjectStore::Transaction *t);
433
434 void trim(
435 const pg_log_entry_t &entry,
436 ObjectStore::Transaction *t);
437
438 void remove(
439 const hobject_t &hoid,
440 ObjectStore::Transaction *t);
441
442 protected:
443 /// Reapply old attributes
444 void rollback_setattrs(
445 const hobject_t &hoid,
446 map<string, boost::optional<bufferlist> > &old_attrs,
447 ObjectStore::Transaction *t);
448
449 /// Truncate object to rollback append
450 virtual void rollback_append(
451 const hobject_t &hoid,
452 uint64_t old_size,
453 ObjectStore::Transaction *t);
454
455 /// Unstash object to rollback stash
456 void rollback_stash(
457 const hobject_t &hoid,
458 version_t old_version,
459 ObjectStore::Transaction *t);
460
461 /// Unstash object to rollback stash
462 void rollback_try_stash(
463 const hobject_t &hoid,
464 version_t old_version,
465 ObjectStore::Transaction *t);
466
467 /// Delete object to rollback create
468 void rollback_create(
469 const hobject_t &hoid,
470 ObjectStore::Transaction *t) {
471 remove(hoid, t);
472 }
473
474 /// Clone the extents back into place
475 void rollback_extents(
476 version_t gen,
477 const vector<pair<uint64_t, uint64_t> > &extents,
478 const hobject_t &hoid,
479 ObjectStore::Transaction *t);
480 public:
481
482 /// Trim object stashed at version
483 void trim_rollback_object(
484 const hobject_t &hoid,
485 version_t gen,
486 ObjectStore::Transaction *t);
487
488 /// List objects in collection
489 int objects_list_partial(
490 const hobject_t &begin,
491 int min,
492 int max,
493 vector<hobject_t> *ls,
494 hobject_t *next);
495
496 int objects_list_range(
497 const hobject_t &start,
498 const hobject_t &end,
499 snapid_t seq,
500 vector<hobject_t> *ls,
501 vector<ghobject_t> *gen_obs=0);
502
503 int objects_get_attr(
504 const hobject_t &hoid,
505 const string &attr,
506 bufferlist *out);
507
508 virtual int objects_get_attrs(
509 const hobject_t &hoid,
510 map<string, bufferlist> *out);
511
512 virtual int objects_read_sync(
513 const hobject_t &hoid,
514 uint64_t off,
515 uint64_t len,
516 uint32_t op_flags,
517 bufferlist *bl) = 0;
518
519 virtual void objects_read_async(
520 const hobject_t &hoid,
521 const list<pair<boost::tuple<uint64_t, uint64_t, uint32_t>,
522 pair<bufferlist*, Context*> > > &to_read,
523 Context *on_complete, bool fast_read = false) = 0;
524
525 virtual bool scrub_supported() = 0;
526 virtual bool auto_repair_supported() const = 0;
527 void be_scan_list(
528 ScrubMap &map, const vector<hobject_t> &ls, bool deep, uint32_t seed,
529 ThreadPool::TPHandle &handle);
530 bool be_compare_scrub_objects(
531 pg_shard_t auth_shard,
532 const ScrubMap::object &auth,
533 const object_info_t& auth_oi,
534 const ScrubMap::object &candidate,
535 shard_info_wrapper& shard_error,
536 inconsistent_obj_wrapper &result,
537 ostream &errorstream);
538 map<pg_shard_t, ScrubMap *>::const_iterator be_select_auth_object(
539 const hobject_t &obj,
540 const map<pg_shard_t,ScrubMap*> &maps,
541 object_info_t *auth_oi,
542 map<pg_shard_t, shard_info_wrapper> &shard_map,
543 inconsistent_obj_wrapper &object_error);
544 void be_compare_scrubmaps(
545 const map<pg_shard_t,ScrubMap*> &maps,
546 bool repair,
547 map<hobject_t, set<pg_shard_t>> &missing,
548 map<hobject_t, set<pg_shard_t>> &inconsistent,
549 map<hobject_t, list<pg_shard_t>> &authoritative,
550 map<hobject_t, pair<uint32_t,uint32_t>> &missing_digest,
551 int &shallow_errors, int &deep_errors,
552 Scrub::Store *store,
553 const spg_t& pgid,
554 const vector<int> &acting,
555 ostream &errorstream);
556 virtual uint64_t be_get_ondisk_size(
557 uint64_t logical_size) = 0;
558 virtual void be_deep_scrub(
559 const hobject_t &poid,
560 uint32_t seed,
561 ScrubMap::object &o,
562 ThreadPool::TPHandle &handle) = 0;
563
564 static PGBackend *build_pg_backend(
565 const pg_pool_t &pool,
566 const OSDMapRef curmap,
567 Listener *l,
568 coll_t coll,
569 ObjectStore::CollectionHandle &ch,
570 ObjectStore *store,
571 CephContext *cct);
572 };
573
574 #endif