// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 * Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com>
 *
 * Author: Loic Dachary <loic@dachary.org>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */

#ifndef CEPH_REPLICATEDPG_H
#define CEPH_REPLICATEDPG_H

#include <boost/tuple/tuple.hpp>
#include "include/ceph_assert.h"
#include "DynamicPerfStats.h"
#include "OSD.h"
#include "PG.h"
#include "Watch.h"
#include "TierAgentState.h"
#include "messages/MOSDOpReply.h"
#include "common/Checksummer.h"
#include "common/sharedptr_registry.hpp"
#include "common/shared_cache.hpp"
#include "ReplicatedBackend.h"
#include "PGTransaction.h"
#include "cls/cas/cls_cas_ops.h"

class CopyFromCallback;
class PromoteCallback;

class PrimaryLogPG;
class PGLSFilter;
class HitSet;
struct TierAgentState;
class MOSDOp;
class MOSDOpReply;
class OSDService;

void intrusive_ptr_add_ref(PrimaryLogPG *pg);
void intrusive_ptr_release(PrimaryLogPG *pg);
uint64_t get_with_id(PrimaryLogPG *pg);
void put_with_id(PrimaryLogPG *pg, uint64_t id);

#ifdef PG_DEBUG_REFS
  typedef TrackedIntPtr<PrimaryLogPG> PrimaryLogPGRef;
#else
  typedef boost::intrusive_ptr<PrimaryLogPG> PrimaryLogPGRef;
#endif

struct inconsistent_snapset_wrapper;

class PrimaryLogPG : public PG, public PGBackend::Listener {
  friend class OSD;
  friend class Watch;

public:
  MEMPOOL_CLASS_HELPERS();

  /*
   * state associated with a copy operation
   */
  struct OpContext;
  class CopyCallback;

  /**
   * CopyResults stores the object metadata of interest to a copy initiator.
   */
  struct CopyResults {
    ceph::real_time mtime; ///< the copy source's mtime
    uint64_t object_size; ///< the copied object's size
    bool started_temp_obj; ///< true if the callback needs to delete temp object
    hobject_t temp_oid;    ///< temp object (if any)

    /**
     * Function to fill in transaction; if non-empty the callback
     * must execute it before any other accesses to the object
     * (in order to complete the copy).
     */
    std::function<void(PGTransaction *)> fill_in_final_tx;

    version_t user_version; ///< The copy source's user version
    bool should_requeue;    ///< op should be requeued on cancel
    vector<snapid_t> snaps; ///< src's snaps (if clone)
    snapid_t snap_seq;      ///< src's snap_seq (if head)
    librados::snap_set_t snapset; ///< src snapset (if head)
    bool mirror_snapset;
    bool has_omap;
    uint32_t flags;    // object_copy_data_t::FLAG_*
    uint32_t source_data_digest, source_omap_digest;
    uint32_t data_digest, omap_digest;
    mempool::osd_pglog::vector<pair<osd_reqid_t, version_t> > reqids; // [(reqid, user_version)]
    mempool::osd_pglog::map<uint32_t, int> reqid_return_codes; // map reqids by index to error code
    map<string, bufferlist> attrs; // xattrs
    uint64_t truncate_seq;
    uint64_t truncate_size;
    bool is_data_digest() {
      return flags & object_copy_data_t::FLAG_DATA_DIGEST;
    }
    bool is_omap_digest() {
      return flags & object_copy_data_t::FLAG_OMAP_DIGEST;
    }
    CopyResults()
      : object_size(0), started_temp_obj(false),
        user_version(0),
        should_requeue(false), mirror_snapset(false),
        has_omap(false),
        flags(0),
        source_data_digest(-1), source_omap_digest(-1),
        data_digest(-1), omap_digest(-1),
        truncate_seq(0), truncate_size(0)
    {}
  };
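
  // Hedged sketch of the contract documented above: a copy initiator's
  // completion (hypothetical name) must apply fill_in_final_tx, when set,
  // before touching the object:
  //
  //   void on_copy_complete(CopyResults *res, PGTransaction *t) {
  //     if (res->fill_in_final_tx)
  //       res->fill_in_final_tx(t);   // complete the copy first
  //     if (res->started_temp_obj) {
  //       // caller is responsible for cleaning up res->temp_oid
  //     }
  //   }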

  struct CopyOp;
  typedef std::shared_ptr<CopyOp> CopyOpRef;

  struct CopyOp {
    CopyCallback *cb;
    ObjectContextRef obc;
    hobject_t src;
    object_locator_t oloc;
    unsigned flags;
    bool mirror_snapset;

    CopyResults results;

    ceph_tid_t objecter_tid;
    ceph_tid_t objecter_tid2;

    object_copy_cursor_t cursor;
    map<string,bufferlist> attrs;
    bufferlist data;
    bufferlist omap_header;
    bufferlist omap_data;
    int rval;

    object_copy_cursor_t temp_cursor;

    /*
     * For CopyOp the process is:
     * step 1: read the data (attrs/omap/data) from the source object
     * step 2: apply that data (i.e. create the new object from it)
     * src_obj_fadvise_flags is used in step 1;
     * dest_obj_fadvise_flags is used in step 2.
     */
    unsigned src_obj_fadvise_flags;
    unsigned dest_obj_fadvise_flags;

    map<uint64_t, CopyOpRef> chunk_cops;
    int num_chunk;
    bool failed;
    uint64_t start_offset = 0;
    uint64_t last_offset = 0;
    vector<OSDOp> chunk_ops;

    CopyOp(CopyCallback *cb_, ObjectContextRef _obc, hobject_t s,
	   object_locator_t l,
	   version_t v,
	   unsigned f,
	   bool ms,
	   unsigned src_obj_fadvise_flags,
	   unsigned dest_obj_fadvise_flags)
      : cb(cb_), obc(_obc), src(s), oloc(l), flags(f),
	mirror_snapset(ms),
	objecter_tid(0),
	objecter_tid2(0),
	rval(-1),
	src_obj_fadvise_flags(src_obj_fadvise_flags),
	dest_obj_fadvise_flags(dest_obj_fadvise_flags),
	num_chunk(0),
	failed(false)
    {
      results.user_version = v;
      results.mirror_snapset = mirror_snapset;
    }
  };
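
  // A hedged note on the flag split above: src_obj_fadvise_flags hints the
  // step-1 read of the source (e.g. CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL),
  // while dest_obj_fadvise_flags hints the step-2 write of the destination
  // (e.g. CEPH_OSD_OP_FLAG_FADVISE_DONTNEED to drop cache after writing).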

  /**
   * The CopyCallback class defines an interface for completions to the
   * start_copy code.  Users of the copy infrastructure must implement
   * one and give an instance of the class to start_copy.
   *
   * The implementer is responsible for making sure that the CopyCallback
   * can associate itself with the correct copy operation.
   */
  typedef boost::tuple<int, CopyResults*> CopyCallbackResults;
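
  // Hypothetical implementer sketch (the concrete CopyCallback base is
  // defined outside this header; names here are illustrative only). A
  // callback is completed with the (result code, CopyResults*) tuple:
  //
  //   class MyCopyCallback : public CopyCallback {
  //     void finish(CopyCallbackResults results) {
  //       int r = results.get<0>();
  //       CopyResults *cr = results.get<1>();
  //       // look up the matching copy operation, then act on r/cr
  //     }
  //   };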

  friend class CopyFromCallback;
  friend class CopyFromFinisher;
  friend class PromoteCallback;
  friend class PromoteFinisher;

  struct ProxyReadOp {
    OpRequestRef op;
    hobject_t soid;
    ceph_tid_t objecter_tid;
    vector<OSDOp> &ops;
    version_t user_version;
    int data_offset;
    bool canceled;              ///< true if canceled

    ProxyReadOp(OpRequestRef _op, hobject_t oid, vector<OSDOp>& _ops)
      : op(_op), soid(oid),
        objecter_tid(0), ops(_ops),
        user_version(0), data_offset(0),
        canceled(false) { }
  };
  typedef std::shared_ptr<ProxyReadOp> ProxyReadOpRef;

  struct ProxyWriteOp {
    OpContext *ctx;
    OpRequestRef op;
    hobject_t soid;
    ceph_tid_t objecter_tid;
    vector<OSDOp> &ops;
    version_t user_version;
    bool sent_reply;
    utime_t mtime;
    bool canceled;
    osd_reqid_t reqid;

    ProxyWriteOp(OpRequestRef _op, hobject_t oid, vector<OSDOp>& _ops, osd_reqid_t _reqid)
      : ctx(NULL), op(_op), soid(oid),
        objecter_tid(0), ops(_ops),
        user_version(0), sent_reply(false),
        canceled(false),
        reqid(_reqid) { }
  };
  typedef std::shared_ptr<ProxyWriteOp> ProxyWriteOpRef;

  struct FlushOp {
    ObjectContextRef obc;       ///< obc we are flushing
    OpRequestRef op;            ///< initiating op
    list<OpRequestRef> dup_ops; ///< bandwagon jumpers
    version_t flushed_version;  ///< user version we are flushing
    ceph_tid_t objecter_tid;    ///< copy-from request tid
    int rval;                   ///< copy-from result
    bool blocking;              ///< whether we are blocking updates
    bool removal;               ///< we are removing the backend object
    boost::optional<std::function<void()>> on_flush; ///< callback, may be null
    // for chunked object
    map<uint64_t, int> io_results;
    map<uint64_t, ceph_tid_t> io_tids;
    uint64_t chunks;

    FlushOp()
      : flushed_version(0), objecter_tid(0), rval(0),
        blocking(false), removal(false), chunks(0) {}
    ~FlushOp() { ceph_assert(!on_flush); }
  };
  typedef std::shared_ptr<FlushOp> FlushOpRef;

  boost::scoped_ptr<PGBackend> pgbackend;
  PGBackend *get_pgbackend() override {
    return pgbackend.get();
  }

  const PGBackend *get_pgbackend() const override {
    return pgbackend.get();
  }

  /// Listener methods
  DoutPrefixProvider *get_dpp() override {
    return this;
  }

  void on_local_recover(
    const hobject_t &oid,
    const ObjectRecoveryInfo &recovery_info,
    ObjectContextRef obc,
    bool is_delete,
    ObjectStore::Transaction *t
    ) override;
  void on_peer_recover(
    pg_shard_t peer,
    const hobject_t &oid,
    const ObjectRecoveryInfo &recovery_info
    ) override;
  void begin_peer_recover(
    pg_shard_t peer,
    const hobject_t oid) override;
  void on_global_recover(
    const hobject_t &oid,
    const object_stat_sum_t &stat_diff,
    bool is_delete) override;
  void failed_push(const list<pg_shard_t> &from, const hobject_t &soid) override;
  void primary_failed(const hobject_t &soid) override;
  bool primary_error(const hobject_t& soid, eversion_t v) override;
  void cancel_pull(const hobject_t &soid) override;
  void apply_stats(
    const hobject_t &soid,
    const object_stat_sum_t &delta_stats) override;
  void on_primary_error(const hobject_t &oid, eversion_t v) override;
  void backfill_add_missing(const hobject_t &oid, eversion_t v) override;
  void remove_missing_object(const hobject_t &oid,
			     eversion_t v,
			     Context *on_complete) override;

  template<class T> class BlessedGenContext;
  template<class T> class UnlockedBlessedGenContext;
  class BlessedContext;
  Context *bless_context(Context *c) override;

  GenContext<ThreadPool::TPHandle&> *bless_gencontext(
    GenContext<ThreadPool::TPHandle&> *c) override;
  GenContext<ThreadPool::TPHandle&> *bless_unlocked_gencontext(
    GenContext<ThreadPool::TPHandle&> *c) override;

  void send_message(int to_osd, Message *m) override {
    osd->send_message_osd_cluster(to_osd, m, get_osdmap_epoch());
  }
  void queue_transaction(ObjectStore::Transaction&& t,
			 OpRequestRef op) override {
    osd->store->queue_transaction(ch, std::move(t), op);
  }
  void queue_transactions(vector<ObjectStore::Transaction>& tls,
			  OpRequestRef op) override {
    osd->store->queue_transactions(ch, tls, op, NULL);
  }
  epoch_t get_interval_start_epoch() const override {
    return info.history.same_interval_since;
  }
  epoch_t get_last_peering_reset_epoch() const override {
    return get_last_peering_reset();
  }
  const set<pg_shard_t> &get_acting_recovery_backfill_shards() const override {
    return acting_recovery_backfill;
  }
  const set<pg_shard_t> &get_acting_shards() const override {
    return actingset;
  }
  const set<pg_shard_t> &get_backfill_shards() const override {
    return backfill_targets;
  }

  std::ostream& gen_dbg_prefix(std::ostream& out) const override {
    return gen_prefix(out);
  }

  const map<hobject_t, set<pg_shard_t>>
    &get_missing_loc_shards() const override {
    return missing_loc.get_missing_locs();
  }
  const map<pg_shard_t, pg_missing_t> &get_shard_missing() const override {
    return peer_missing;
  }
  using PGBackend::Listener::get_shard_missing;
  const map<pg_shard_t, pg_info_t> &get_shard_info() const override {
    return peer_info;
  }
  using PGBackend::Listener::get_shard_info;
  const pg_missing_tracker_t &get_local_missing() const override {
    return pg_log.get_missing();
  }
  const PGLog &get_log() const override {
    return pg_log;
  }
  void add_local_next_event(const pg_log_entry_t& e) override {
    pg_log.missing_add_next_entry(e);
  }
  bool pgb_is_primary() const override {
    return is_primary();
  }
  const OSDMapRef& pgb_get_osdmap() const override final {
    return get_osdmap();
  }
  epoch_t pgb_get_osdmap_epoch() const override final {
    return get_osdmap_epoch();
  }
  const pg_info_t &get_info() const override {
    return info;
  }
  const pg_pool_t &get_pool() const override {
    return pool.info;
  }

  ObjectContextRef get_obc(
    const hobject_t &hoid,
    const map<string, bufferlist> &attrs) override {
    return get_object_context(hoid, true, &attrs);
  }

  bool try_lock_for_read(
    const hobject_t &hoid,
    ObcLockManager &manager) override {
    if (is_missing_object(hoid))
      return false;
    auto obc = get_object_context(hoid, false, nullptr);
    if (!obc)
      return false;
    return manager.try_get_read_lock(hoid, obc);
  }

  void release_locks(ObcLockManager &manager) override {
    release_object_locks(manager);
  }

  bool pg_is_repair() override {
    return is_repair();
  }
  void inc_osd_stat_repaired() override {
    osd->inc_osd_stat_repaired();
  }
  bool pg_is_remote_backfilling() override {
    return is_remote_backfilling();
  }
  void pg_add_local_num_bytes(int64_t num_bytes) override {
    add_local_num_bytes(num_bytes);
  }
  void pg_sub_local_num_bytes(int64_t num_bytes) override {
    sub_local_num_bytes(num_bytes);
  }
  void pg_add_num_bytes(int64_t num_bytes) override {
    add_num_bytes(num_bytes);
  }
  void pg_sub_num_bytes(int64_t num_bytes) override {
    sub_num_bytes(num_bytes);
  }

  void pgb_set_object_snap_mapping(
    const hobject_t &soid,
    const set<snapid_t> &snaps,
    ObjectStore::Transaction *t) override {
    return update_object_snap_mapping(t, soid, snaps);
  }
  void pgb_clear_object_snap_mapping(
    const hobject_t &soid,
    ObjectStore::Transaction *t) override {
    return clear_object_snap_mapping(t, soid);
  }

  void log_operation(
    const vector<pg_log_entry_t> &logv,
    const boost::optional<pg_hit_set_history_t> &hset_history,
    const eversion_t &trim_to,
    const eversion_t &roll_forward_to,
    bool transaction_applied,
    ObjectStore::Transaction &t,
    bool async = false) override {
    if (hset_history) {
      info.hit_set = *hset_history;
    }
    append_log(logv, trim_to, roll_forward_to, t, transaction_applied, async);
  }

  void op_applied(const eversion_t &applied_version) override;

  bool should_send_op(
    pg_shard_t peer,
    const hobject_t &hoid) override;

  bool pg_is_undersized() const override {
    return is_undersized();
  }

  bool pg_is_repair() const override {
    return is_repair();
  }

  void update_peer_last_complete_ondisk(
    pg_shard_t fromosd,
    eversion_t lcod) override {
    peer_last_complete_ondisk[fromosd] = lcod;
  }

  void update_last_complete_ondisk(
    eversion_t lcod) override {
    last_complete_ondisk = lcod;
  }

  void update_stats(
    const pg_stat_t &stat) override {
    info.stats = stat;
  }

  void schedule_recovery_work(
    GenContext<ThreadPool::TPHandle&> *c) override;

  pg_shard_t whoami_shard() const override {
    return pg_whoami;
  }
  spg_t primary_spg_t() const override {
    return spg_t(info.pgid.pgid, primary.shard);
  }
  pg_shard_t primary_shard() const override {
    return primary;
  }

  void send_message_osd_cluster(
    int peer, Message *m, epoch_t from_epoch) override;
  void send_message_osd_cluster(
    Message *m, Connection *con) override;
  void send_message_osd_cluster(
    Message *m, const ConnectionRef& con) override;
  ConnectionRef get_con_osd_cluster(int peer, epoch_t from_epoch) override;
  entity_name_t get_cluster_msgr_name() override {
    return osd->get_cluster_msgr_name();
  }

  PerfCounters *get_logger() override;

  ceph_tid_t get_tid() override { return osd->get_tid(); }

  LogClientTemp clog_error() override { return osd->clog->error(); }
  LogClientTemp clog_warn() override { return osd->clog->warn(); }

  struct watch_disconnect_t {
    uint64_t cookie;
    entity_name_t name;
    bool send_disconnect;
    watch_disconnect_t(uint64_t c, entity_name_t n, bool sd)
      : cookie(c), name(n), send_disconnect(sd) {}
  };
  void complete_disconnect_watches(
    ObjectContextRef obc,
    const list<watch_disconnect_t> &to_disconnect);

  struct OpFinisher {
    virtual ~OpFinisher() {
    }

    virtual int execute() = 0;
  };

  /*
   * Capture all object state associated with an in-progress read or write.
   */
  struct OpContext {
    OpRequestRef op;
    osd_reqid_t reqid;
    vector<OSDOp> *ops;

    const ObjectState *obs; // Old objectstate
    const SnapSet *snapset; // Old snapset

    ObjectState new_obs;  // resulting ObjectState
    SnapSet new_snapset;  // resulting SnapSet (in case of a write)
    //pg_stat_t new_stats;  // resulting Stats
    object_stat_sum_t delta_stats;

    bool modify;          // (force) modification (even if op_t is empty)
    bool user_modify;     // user-visible modification
    bool undirty;         // user explicitly un-dirtying this object
    bool cache_evict;     ///< true if this is a cache eviction
    bool ignore_cache;    ///< true if IGNORE_CACHE flag is set
    bool ignore_log_op_stats;  // don't log op stats
    bool update_log_only; ///< this is a write that returned an error - just record in pg log for dup detection

    // side effects
    list<pair<watch_info_t,bool> > watch_connects; ///< new watch + will_ping flag
    list<watch_disconnect_t> watch_disconnects; ///< old watch + send_discon
    list<notify_info_t> notifies;
    struct NotifyAck {
      boost::optional<uint64_t> watch_cookie;
      uint64_t notify_id;
      bufferlist reply_bl;
      explicit NotifyAck(uint64_t notify_id) : notify_id(notify_id) {}
      NotifyAck(uint64_t notify_id, uint64_t cookie, bufferlist& rbl)
	: watch_cookie(cookie), notify_id(notify_id) {
	reply_bl.claim(rbl);
      }
    };
    list<NotifyAck> notify_acks;

    uint64_t bytes_written, bytes_read;

    utime_t mtime;
    SnapContext snapc;           // writer snap context
    eversion_t at_version;       // pg's current version pointer
    version_t user_at_version;   // pg's current user version pointer

    /// index of the current subop - only valid inside of do_osd_ops()
    int current_osd_subop_num;
    /// total number of subops processed in this context for cls_cxx_subop_version()
    int processed_subop_count = 0;

    PGTransactionUPtr op_t;
    vector<pg_log_entry_t> log;
    boost::optional<pg_hit_set_history_t> updated_hset_history;

    interval_set<uint64_t> modified_ranges;
    ObjectContextRef obc;
    ObjectContextRef clone_obc;    // if we created a clone
    ObjectContextRef head_obc;     // if we also update snapset (see trim_object)

    // FIXME: we may want to kill this msgr hint off at some point!
    boost::optional<int> data_off = boost::none;

    MOSDOpReply *reply;

    PrimaryLogPG *pg;

    int num_read;    ///< count read ops
    int num_write;   ///< count update ops

    mempool::osd_pglog::vector<pair<osd_reqid_t, version_t> > extra_reqids;
    mempool::osd_pglog::map<uint32_t, int> extra_reqid_return_codes;

    hobject_t new_temp_oid, discard_temp_oid;  ///< temp objects we should start/stop tracking

    list<std::function<void()>> on_applied;
    list<std::function<void()>> on_committed;
    list<std::function<void()>> on_finish;
    list<std::function<void()>> on_success;
    template <typename F>
    void register_on_finish(F &&f) {
      on_finish.emplace_back(std::forward<F>(f));
    }
    template <typename F>
    void register_on_success(F &&f) {
      on_success.emplace_back(std::forward<F>(f));
    }
    template <typename F>
    void register_on_applied(F &&f) {
      on_applied.emplace_back(std::forward<F>(f));
    }
    template <typename F>
    void register_on_commit(F &&f) {
      on_committed.emplace_back(std::forward<F>(f));
    }
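
    // Hedged usage sketch: completion side effects are queued as lambdas
    // and fired at the matching point in the op's life cycle, e.g.
    //
    //   ctx->register_on_commit([=]() {
    //     // runs once the update is durable
    //   });
    //   ctx->register_on_success([=]() {
    //     // runs if the op sequence succeeds
    //   });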

    bool sent_reply = false;

    // pending async reads <off, len, op_flags> -> <outbl, outr>
    list<pair<boost::tuple<uint64_t, uint64_t, unsigned>,
	      pair<bufferlist*, Context*> > > pending_async_reads;
    int inflightreads;
    friend struct OnReadComplete;
    void start_async_reads(PrimaryLogPG *pg);
    void finish_read(PrimaryLogPG *pg);
    bool async_reads_complete() {
      return inflightreads == 0;
    }

    ObjectContext::RWState::State lock_type;
    ObcLockManager lock_manager;

    std::map<int, std::unique_ptr<OpFinisher>> op_finishers;

    OpContext(const OpContext& other);
    const OpContext& operator=(const OpContext& other);

    OpContext(OpRequestRef _op, osd_reqid_t _reqid, vector<OSDOp>* _ops,
	      ObjectContextRef& obc,
	      PrimaryLogPG *_pg) :
      op(_op), reqid(_reqid), ops(_ops),
      obs(&obc->obs),
      snapset(0),
      new_obs(obs->oi, obs->exists),
      modify(false), user_modify(false), undirty(false), cache_evict(false),
      ignore_cache(false), ignore_log_op_stats(false), update_log_only(false),
      bytes_written(0), bytes_read(0), user_at_version(0),
      current_osd_subop_num(0),
      obc(obc),
      reply(NULL), pg(_pg),
      num_read(0),
      num_write(0),
      sent_reply(false),
      inflightreads(0),
      lock_type(ObjectContext::RWState::RWNONE) {
      if (obc->ssc) {
	new_snapset = obc->ssc->snapset;
	snapset = &obc->ssc->snapset;
      }
    }
    OpContext(OpRequestRef _op, osd_reqid_t _reqid,
	      vector<OSDOp>* _ops, PrimaryLogPG *_pg) :
      op(_op), reqid(_reqid), ops(_ops), obs(NULL), snapset(0),
      modify(false), user_modify(false), undirty(false), cache_evict(false),
      ignore_cache(false), ignore_log_op_stats(false), update_log_only(false),
      bytes_written(0), bytes_read(0), user_at_version(0),
      current_osd_subop_num(0),
      reply(NULL), pg(_pg),
      num_read(0),
      num_write(0),
      inflightreads(0),
      lock_type(ObjectContext::RWState::RWNONE) {}
    void reset_obs(ObjectContextRef obc) {
      new_obs = ObjectState(obc->obs.oi, obc->obs.exists);
      if (obc->ssc) {
	new_snapset = obc->ssc->snapset;
	snapset = &obc->ssc->snapset;
      }
    }
    ~OpContext() {
      ceph_assert(!op_t);
      if (reply)
	reply->put();
      for (list<pair<boost::tuple<uint64_t, uint64_t, unsigned>,
		     pair<bufferlist*, Context*> > >::iterator i =
	     pending_async_reads.begin();
	   i != pending_async_reads.end();
	   pending_async_reads.erase(i++)) {
	delete i->second.second;
      }
    }
    uint64_t get_features() {
      if (op && op->get_req()) {
	return op->get_req()->get_connection()->get_features();
      }
      return -1ull;
    }
  };
  using OpContextUPtr = std::unique_ptr<OpContext>;
  friend struct OpContext;

  /*
   * State on the PG primary associated with the replicated mutation
   */
  class RepGather {
  public:
    hobject_t hoid;
    OpRequestRef op;
    xlist<RepGather*>::item queue_item;
    int nref;

    eversion_t v;
    int r = 0;

    ceph_tid_t rep_tid;

    bool rep_aborted;
    bool all_committed;

    utime_t start;

    eversion_t pg_local_last_complete;

    ObcLockManager lock_manager;

    list<std::function<void()>> on_committed;
    list<std::function<void()>> on_success;
    list<std::function<void()>> on_finish;

    RepGather(
      OpContext *c, ceph_tid_t rt,
      eversion_t lc) :
      hoid(c->obc->obs.oi.soid),
      op(c->op),
      queue_item(this),
      nref(1),
      rep_tid(rt),
      rep_aborted(false),
      all_committed(false),
      pg_local_last_complete(lc),
      lock_manager(std::move(c->lock_manager)),
      on_committed(std::move(c->on_committed)),
      on_success(std::move(c->on_success)),
      on_finish(std::move(c->on_finish)) {}

    RepGather(
      ObcLockManager &&manager,
      OpRequestRef &&o,
      boost::optional<std::function<void(void)> > &&on_complete,
      ceph_tid_t rt,
      eversion_t lc,
      int r) :
      op(o),
      queue_item(this),
      nref(1),
      r(r),
      rep_tid(rt),
      rep_aborted(false),
      all_committed(false),
      pg_local_last_complete(lc),
      lock_manager(std::move(manager)) {
      if (on_complete) {
	on_success.push_back(std::move(*on_complete));
      }
    }

    RepGather *get() {
      nref++;
      return this;
    }
    void put() {
      ceph_assert(nref > 0);
      if (--nref == 0) {
	delete this;
	//generic_dout(0) << "deleting " << this << dendl;
      }
    }
  };


protected:

  /**
   * Grabs locks for OpContext, should be cleaned up in close_op_ctx
   *
   * @param ctx [in,out] ctx to get locks for
   * @return true on success, false if we are queued
   */
  bool get_rw_locks(bool write_ordered, OpContext *ctx) {
    /* If head_obc, !obc->obs->exists and we will always take the
     * snapdir lock *before* the head lock.  Since all callers will do
     * this (read or write) if we get the first we will be guaranteed
     * to get the second.
     */
    if (write_ordered && ctx->op->may_read()) {
      ctx->lock_type = ObjectContext::RWState::RWEXCL;
    } else if (write_ordered) {
      ctx->lock_type = ObjectContext::RWState::RWWRITE;
    } else {
      ceph_assert(ctx->op->may_read());
      ctx->lock_type = ObjectContext::RWState::RWREAD;
    }

    if (ctx->head_obc) {
      ceph_assert(!ctx->obc->obs.exists);
      if (!ctx->lock_manager.get_lock_type(
	    ctx->lock_type,
	    ctx->head_obc->obs.oi.soid,
	    ctx->head_obc,
	    ctx->op)) {
	ctx->lock_type = ObjectContext::RWState::RWNONE;
	return false;
      }
    }
    if (ctx->lock_manager.get_lock_type(
	  ctx->lock_type,
	  ctx->obc->obs.oi.soid,
	  ctx->obc,
	  ctx->op)) {
      return true;
    } else {
      ceph_assert(!ctx->head_obc);
      ctx->lock_type = ObjectContext::RWState::RWNONE;
      return false;
    }
  }
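
  // Summary of the lock-type selection above (same logic, restated):
  //   write-ordered op that may also read -> RWEXCL  (excludes all others)
  //   write-ordered op, write only        -> RWWRITE (excludes other writers)
  //   read-only op                        -> RWREAD  (excludes writers)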

  /**
   * Cleans up OpContext
   *
   * @param ctx [in] ctx to clean up
   */
  void close_op_ctx(OpContext *ctx);

  /**
   * Releases locks
   *
   * @param manager [in] manager with locks to release
   */
  void release_object_locks(
    ObcLockManager &lock_manager) {
    list<pair<ObjectContextRef, list<OpRequestRef> > > to_req;
    bool requeue_recovery = false;
    bool requeue_snaptrim = false;
    lock_manager.put_locks(
      &to_req,
      &requeue_recovery,
      &requeue_snaptrim);
    if (requeue_recovery)
      queue_recovery();
    if (requeue_snaptrim)
      snap_trimmer_machine.process_event(TrimWriteUnblocked());

    if (!to_req.empty()) {
      // requeue at front of scrub blocking queue if we are blocked by scrub
      for (auto &&p: to_req) {
	if (write_blocked_by_scrub(p.first->obs.oi.soid.get_head())) {
	  for (auto& op : p.second) {
	    op->mark_delayed("waiting for scrub");
	  }

	  waiting_for_scrub.splice(
	    waiting_for_scrub.begin(),
	    p.second,
	    p.second.begin(),
	    p.second.end());
	} else {
	  requeue_ops(p.second);
	}
      }
    }
  }

  // replica ops
  // [primary|tail]
  xlist<RepGather*> repop_queue;

  friend class C_OSD_RepopCommit;
  void repop_all_committed(RepGather *repop);
  void eval_repop(RepGather*);
  void issue_repop(RepGather *repop, OpContext *ctx);
  RepGather *new_repop(
    OpContext *ctx,
    ObjectContextRef obc,
    ceph_tid_t rep_tid);
  boost::intrusive_ptr<RepGather> new_repop(
    eversion_t version,
    int r,
    ObcLockManager &&manager,
    OpRequestRef &&op,
    boost::optional<std::function<void(void)> > &&on_complete);
  void remove_repop(RepGather *repop);

  OpContextUPtr simple_opc_create(ObjectContextRef obc);
  void simple_opc_submit(OpContextUPtr ctx);

  /**
   * Merge entries atomically into all acting_recovery_backfill osds
   * adjusting missing and recovery state as necessary.
   *
   * Also used to store error log entries for dup detection.
   */
  void submit_log_entries(
    const mempool::osd_pglog::list<pg_log_entry_t> &entries,
    ObcLockManager &&manager,
    boost::optional<std::function<void(void)> > &&on_complete,
    OpRequestRef op = OpRequestRef(),
    int r = 0);
  struct LogUpdateCtx {
    boost::intrusive_ptr<RepGather> repop;
    set<pg_shard_t> waiting_on;
  };
  void cancel_log_updates();
  map<ceph_tid_t, LogUpdateCtx> log_entry_update_waiting_on;


  // hot/cold tracking
  HitSetRef hit_set;           ///< currently accumulating HitSet
  utime_t hit_set_start_stamp; ///< time the current HitSet started recording


  void hit_set_clear();     ///< discard any HitSet state
  void hit_set_setup();     ///< initialize HitSet state
  void hit_set_create();    ///< create a new HitSet
  void hit_set_persist();   ///< persist hit info
  bool hit_set_apply_log(); ///< apply log entries to update in-memory HitSet
  void hit_set_trim(OpContextUPtr &ctx, unsigned max); ///< discard old HitSets
  void hit_set_in_memory_trim(uint32_t max_in_memory); ///< discard old in memory HitSets
  void hit_set_remove_all();

  hobject_t get_hit_set_current_object(utime_t stamp);
  hobject_t get_hit_set_archive_object(utime_t start,
				       utime_t end,
				       bool using_gmt);

  // agent
  boost::scoped_ptr<TierAgentState> agent_state;

  void agent_setup();       ///< initialize agent state
  bool agent_work(int max) override ///< entry point to do some agent work
  {
    return agent_work(max, max);
  }
  bool agent_work(int max, int agent_flush_quota) override;
  bool agent_maybe_flush(ObjectContextRef& obc);  ///< maybe flush
  bool agent_maybe_evict(ObjectContextRef& obc, bool after_flush);  ///< maybe evict

  void agent_load_hit_sets();  ///< load HitSets, if needed

  /// estimate object atime and temperature
  ///
  /// @param oid [in] object name
  /// @param temperature [out] relative temperature (# consider both access time and frequency)
  void agent_estimate_temp(const hobject_t& oid, int *temperature);

  /// stop the agent
  void agent_stop() override;
  void agent_delay() override;

  /// clear agent state
  void agent_clear() override;

  /// choose (new) agent mode(s), returns true if op is requeued
  bool agent_choose_mode(bool restart = false, OpRequestRef op = OpRequestRef());
  void agent_choose_mode_restart() override;

  /// true if we can send an ondisk/commit for v
  bool already_complete(eversion_t v);
  /// true if we can send an ack for v
  bool already_ack(eversion_t v);

  // projected object info
  SharedLRU<hobject_t, ObjectContext> object_contexts;
  // map from oid.snapdir() to SnapSetContext *
  map<hobject_t, SnapSetContext*> snapset_contexts;
  Mutex snapset_contexts_lock;

  // debug order that client ops are applied
  map<hobject_t, map<client_t, ceph_tid_t>> debug_op_order;

  void populate_obc_watchers(ObjectContextRef obc);
  void check_blacklisted_obc_watchers(ObjectContextRef obc);
  void check_blacklisted_watchers() override;
  void get_watchers(list<obj_watch_item_t> *ls) override;
  void get_obc_watchers(ObjectContextRef obc, list<obj_watch_item_t> &pg_watchers);
public:
  void handle_watch_timeout(WatchRef watch);
protected:

  ObjectContextRef create_object_context(const object_info_t& oi, SnapSetContext *ssc);
  ObjectContextRef get_object_context(
    const hobject_t& soid,
    bool can_create,
    const map<string, bufferlist> *attrs = 0
    );

  void context_registry_on_change();
  void object_context_destructor_callback(ObjectContext *obc);
  class C_PG_ObjectContext;

  int find_object_context(const hobject_t& oid,
			  ObjectContextRef *pobc,
			  bool can_create,
			  bool map_snapid_to_clone=false,
			  hobject_t *missing_oid=NULL);

  void add_object_context_to_pg_stat(ObjectContextRef obc, pg_stat_t *stat);

  void get_src_oloc(const object_t& oid, const object_locator_t& oloc, object_locator_t& src_oloc);

  SnapSetContext *get_snapset_context(
    const hobject_t& oid,
    bool can_create,
    const map<string, bufferlist> *attrs = 0,
    bool oid_existed = true //indicates whether this oid existed in the backend
    );
  void register_snapset_context(SnapSetContext *ssc) {
    std::lock_guard l(snapset_contexts_lock);
    _register_snapset_context(ssc);
  }
  void _register_snapset_context(SnapSetContext *ssc) {
    ceph_assert(snapset_contexts_lock.is_locked());
    if (!ssc->registered) {
      ceph_assert(snapset_contexts.count(ssc->oid) == 0);
      ssc->registered = true;
      snapset_contexts[ssc->oid] = ssc;
    }
  }
  void put_snapset_context(SnapSetContext *ssc);

  map<hobject_t, ObjectContextRef> recovering;

  /*
   * Backfill
   *
   * peer_info[backfill_target].last_backfill == info.last_backfill on the peer.
   *
   * objects prior to peer_info[backfill_target].last_backfill
   *   - are on the peer
   *   - are included in the peer stats
   *
   * objects \in (last_backfill, last_backfill_started]
   *   - are on the peer or are in backfills_in_flight
   *   - are not included in pg stats (yet)
   *   - have their stats in pending_backfill_updates on the primary
   */
  set<hobject_t> backfills_in_flight;
  map<hobject_t, pg_stat_t> pending_backfill_updates;
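
  // A rough picture of the invariant documented above, in hash order:
  //
  //   <= last_backfill        | (last_backfill, last_backfill_started] | rest
  //   on peer, in peer stats  | on peer or in backfills_in_flight;     | not yet
  //                           | stats in pending_backfill_updates      | scanned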

  void dump_recovery_info(Formatter *f) const override {
    f->open_array_section("backfill_targets");
    for (set<pg_shard_t>::const_iterator p = backfill_targets.begin();
	 p != backfill_targets.end(); ++p)
      f->dump_stream("replica") << *p;
    f->close_section();
    f->open_array_section("waiting_on_backfill");
    for (set<pg_shard_t>::const_iterator p = waiting_on_backfill.begin();
	 p != waiting_on_backfill.end(); ++p)
      f->dump_stream("osd") << *p;
    f->close_section();
    f->dump_stream("last_backfill_started") << last_backfill_started;
    {
      f->open_object_section("backfill_info");
      backfill_info.dump(f);
      f->close_section();
    }
    {
      f->open_array_section("peer_backfill_info");
      for (map<pg_shard_t, BackfillInterval>::const_iterator pbi =
	     peer_backfill_info.begin();
	   pbi != peer_backfill_info.end(); ++pbi) {
	f->dump_stream("osd") << pbi->first;
	f->open_object_section("BackfillInterval");
	pbi->second.dump(f);
	f->close_section();
      }
      f->close_section();
    }
    {
      f->open_array_section("backfills_in_flight");
      for (set<hobject_t>::const_iterator i = backfills_in_flight.begin();
	   i != backfills_in_flight.end();
	   ++i) {
	f->dump_stream("object") << *i;
      }
      f->close_section();
    }
    {
      f->open_array_section("recovering");
      for (map<hobject_t, ObjectContextRef>::const_iterator i = recovering.begin();
	   i != recovering.end();
	   ++i) {
	f->dump_stream("object") << i->first;
      }
      f->close_section();
    }
    {
      f->open_object_section("pg_backend");
      pgbackend->dump_recovery_info(f);
      f->close_section();
    }
  }

  /// last backfill operation started
  hobject_t last_backfill_started;
  bool new_backfill;

  int prep_object_replica_pushes(const hobject_t& soid, eversion_t v,
				 PGBackend::RecoveryHandle *h,
				 bool *work_started);
  int prep_object_replica_deletes(const hobject_t& soid, eversion_t v,
				  PGBackend::RecoveryHandle *h,
				  bool *work_started);

  void finish_degraded_object(const hobject_t& oid) override;

  // Cancels/resets pulls from peer
  void check_recovery_sources(const OSDMapRef& map) override;

  int recover_missing(
    const hobject_t& oid,
    eversion_t v,
    int priority,
    PGBackend::RecoveryHandle *h);

  // low level ops

  void _make_clone(
    OpContext *ctx,
    PGTransaction* t,
    ObjectContextRef obc,
    const hobject_t& head, const hobject_t& coid,
    object_info_t *poi);
  void execute_ctx(OpContext *ctx);
  void finish_ctx(OpContext *ctx, int log_op_type);
  void reply_ctx(OpContext *ctx, int err);
  void reply_ctx(OpContext *ctx, int err, eversion_t v, version_t uv);
  void make_writeable(OpContext *ctx);
  void log_op_stats(const OpRequest& op, uint64_t inb, uint64_t outb);

  void write_update_size_and_usage(object_stat_sum_t& stats, object_info_t& oi,
				   interval_set<uint64_t>& modified, uint64_t offset,
				   uint64_t length, bool write_full=false);
  inline void truncate_update_size_and_usage(
    object_stat_sum_t& delta_stats,
    object_info_t& oi,
    uint64_t truncate_size);

  enum class cache_result_t {
    NOOP,
    BLOCKED_FULL,
    BLOCKED_PROMOTE,
    HANDLED_PROXY,
    HANDLED_REDIRECT,
    REPLIED_WITH_EAGAIN,
    BLOCKED_RECOVERY,
  };
  cache_result_t maybe_handle_cache_detail(OpRequestRef op,
					   bool write_ordered,
					   ObjectContextRef obc, int r,
					   hobject_t missing_oid,
					   bool must_promote,
					   bool in_hit_set,
					   ObjectContextRef *promote_obc);
  cache_result_t maybe_handle_manifest_detail(OpRequestRef op,
					      bool write_ordered,
					      ObjectContextRef obc);
  bool maybe_handle_manifest(OpRequestRef op,
			     bool write_ordered,
			     ObjectContextRef obc) {
    return cache_result_t::NOOP != maybe_handle_manifest_detail(
      op,
      write_ordered,
      obc);
  }

  /**
   * This helper function is called from do_op if the ObjectContext lookup fails.
   * @returns true if the caching code is handling the Op, false otherwise.
   */
  bool maybe_handle_cache(OpRequestRef op,
			  bool write_ordered,
			  ObjectContextRef obc, int r,
			  const hobject_t& missing_oid,
			  bool must_promote,
			  bool in_hit_set = false) {
    return cache_result_t::NOOP != maybe_handle_cache_detail(
      op,
      write_ordered,
      obc,
      r,
      missing_oid,
      must_promote,
      in_hit_set,
      nullptr);
  }

  /**
   * This helper function checks if a promotion is needed.
   */
  bool maybe_promote(ObjectContextRef obc,
		     const hobject_t& missing_oid,
		     const object_locator_t& oloc,
		     bool in_hit_set,
		     uint32_t recency,
		     OpRequestRef promote_op,
		     ObjectContextRef *promote_obc = nullptr);
  /**
   * This helper function tells the client to redirect their request elsewhere.
   */
  void do_cache_redirect(OpRequestRef op);
  /**
   * This function attempts to start a promote.  Either it succeeds,
   * or places op on a wait list.  If op is null, failure means that
   * this is a noop.  If a future user wants to be able to distinguish
   * these cases, a return value should be added.
   */
  void promote_object(
    ObjectContextRef obc,            ///< [optional] obc
    const hobject_t& missing_object, ///< oid (if !obc)
    const object_locator_t& oloc,    ///< locator for obc|oid
    OpRequestRef op,                 ///< [optional] client op
    ObjectContextRef *promote_obc = nullptr ///< [optional] new obc for object
    );

  int prepare_transaction(OpContext *ctx);
  list<pair<OpRequestRef, OpContext*> > in_progress_async_reads;
  void complete_read_ctx(int result, OpContext *ctx);

  // pg on-disk content
  void check_local() override;

  void _clear_recovery_state() override;

  bool start_recovery_ops(
    uint64_t max,
    ThreadPool::TPHandle &handle, uint64_t *started) override;

  uint64_t recover_primary(uint64_t max, ThreadPool::TPHandle &handle);
  uint64_t recover_replicas(uint64_t max, ThreadPool::TPHandle &handle,
			    bool *recovery_started);
  hobject_t earliest_peer_backfill() const;
  bool all_peer_done() const;
  /**
   * @param work_started will be set to true if recover_backfill got anywhere
   * @returns the number of operations started
   */
  uint64_t recover_backfill(uint64_t max, ThreadPool::TPHandle &handle,
			    bool *work_started);

  /**
   * scan a (hash) range of objects in the current pg
   *
   * @begin first item should be >= this value
   * @min return at least this many items, unless we are done
   * @max return no more than this many items
   * @bi [out] resulting map of objects to eversion_t's
   */
  void scan_range(
    int min, int max, BackfillInterval *bi,
    ThreadPool::TPHandle &handle
    );

  /// Update a hash range to reflect changes since the last scan
  void update_range(
    BackfillInterval *bi,        ///< [in,out] interval to update
    ThreadPool::TPHandle &handle ///< [in] tp handle
    );

  int prep_backfill_object_push(
    hobject_t oid, eversion_t v, ObjectContextRef obc,
    vector<pg_shard_t> peers,
    PGBackend::RecoveryHandle *h);
  void send_remove_op(const hobject_t& oid, eversion_t v, pg_shard_t peer);


  class C_OSD_AppliedRecoveredObject;
  class C_OSD_CommittedPushedObject;
  class C_OSD_AppliedRecoveredObjectReplica;

  void _applied_recovered_object(ObjectContextRef obc);
  void _applied_recovered_object_replica();
  void _committed_pushed_object(epoch_t epoch, eversion_t lc);
  void recover_got(hobject_t oid, eversion_t v);

  // -- copyfrom --
  map<hobject_t, CopyOpRef> copy_ops;

  int do_copy_get(OpContext *ctx, bufferlist::const_iterator& bp, OSDOp& op,
		  ObjectContextRef& obc);
  int finish_copy_get();

  void fill_in_copy_get_noent(OpRequestRef& op, hobject_t oid,
			      OSDOp& osd_op);

  /**
   * To copy an object, call start_copy.
   *
   * @param cb: The CopyCallback to be activated when the copy is complete
   * @param obc: The ObjectContext we are copying into
   * @param src: The source object
   * @param oloc: the source object locator
   * @param version: the version of the source object to copy (0 for any)
   */
  void start_copy(CopyCallback *cb, ObjectContextRef obc, hobject_t src,
		  object_locator_t oloc, version_t version, unsigned flags,
		  bool mirror_snapset, unsigned src_obj_fadvise_flags,
		  unsigned dest_obj_fadvise_flags);
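
  // Hedged invocation sketch (values illustrative): version 0 means "any
  // version", and the fadvise flag pair maps to the two copy steps noted
  // on CopyOp above:
  //
  //   start_copy(cb, obc, src_oid, src_oloc,
  //              0 /* any version */, flags,
  //              false /* mirror_snapset */,
  //              0 /* src fadvise, step 1 */,
  //              0 /* dest fadvise, step 2 */);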
  void process_copy_chunk(hobject_t oid, ceph_tid_t tid, int r);
  void _write_copy_chunk(CopyOpRef cop, PGTransaction *t);
  uint64_t get_copy_chunk_size() const {
    uint64_t size = cct->_conf->osd_copyfrom_max_chunk;
    if (pool.info.required_alignment()) {
      uint64_t alignment = pool.info.required_alignment();
      if (size % alignment) {
	size += alignment - (size % alignment);
      }
    }
    return size;
  }
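
  // Worked example of the rounding above: with osd_copyfrom_max_chunk =
  // 8388608 (8M) and a hypothetical required_alignment() of 3145728 (a 3M
  // EC stripe), 8388608 % 3145728 = 2097152, so the chunk grows by
  // 3145728 - 2097152 = 1048576 to 9437184 (9M), the next aligned size.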
  void _copy_some(ObjectContextRef obc, CopyOpRef cop);
  void finish_copyfrom(CopyFromCallback *cb);
  void finish_promote(int r, CopyResults *results, ObjectContextRef obc);
  void cancel_copy(CopyOpRef cop, bool requeue, vector<ceph_tid_t> *tids);
  void cancel_copy_ops(bool requeue, vector<ceph_tid_t> *tids);

  friend struct C_Copyfrom;

  // -- flush --
  map<hobject_t, FlushOpRef> flush_ops;

  /// start_flush takes ownership of on_flush iff ret == -EINPROGRESS
  int start_flush(
    OpRequestRef op, ObjectContextRef obc,
    bool blocking, hobject_t *pmissing,
    boost::optional<std::function<void()>> &&on_flush);
  void finish_flush(hobject_t oid, ceph_tid_t tid, int r);
  int try_flush_mark_clean(FlushOpRef fop);
  void cancel_flush(FlushOpRef fop, bool requeue, vector<ceph_tid_t> *tids);
  void cancel_flush_ops(bool requeue, vector<ceph_tid_t> *tids);

  /// @return false if the clone has been evicted
  bool is_present_clone(hobject_t coid);

  friend struct C_Flush;

  // -- scrub --
  bool _range_available_for_scrub(
    const hobject_t &begin, const hobject_t &end) override;
  void scrub_snapshot_metadata(
    ScrubMap &map,
    const std::map<hobject_t,
                   pair<boost::optional<uint32_t>,
                        boost::optional<uint32_t>>> &missing_digest) override;
  void _scrub_clear_state() override;
  void _scrub_finish() override;
  object_stat_collection_t scrub_cstat;

  void _split_into(pg_t child_pgid, PG *child,
		   unsigned split_bits) override;
  void apply_and_flush_repops(bool requeue);

  void calc_trim_to() override;
  void calc_trim_to_aggressive() override;
  int do_xattr_cmp_u64(int op, __u64 v1, bufferlist& xattr);
  int do_xattr_cmp_str(int op, string& v1s, bufferlist& xattr);

  // -- checksum --
  int do_checksum(OpContext *ctx, OSDOp& osd_op, bufferlist::const_iterator *bl_it);
  int finish_checksum(OSDOp& osd_op, Checksummer::CSumType csum_type,
		      bufferlist::const_iterator *init_value_bl_it,
		      const bufferlist &read_bl);

  friend class C_ChecksumRead;

  int do_extent_cmp(OpContext *ctx, OSDOp& osd_op);
  int finish_extent_cmp(OSDOp& osd_op, const bufferlist &read_bl);

  friend class C_ExtentCmpRead;

  int do_read(OpContext *ctx, OSDOp& osd_op);
  int do_sparse_read(OpContext *ctx, OSDOp& osd_op);
  int do_writesame(OpContext *ctx, OSDOp& osd_op);

  bool pgls_filter(PGLSFilter *filter, hobject_t& sobj, bufferlist& outdata);
  int get_pgls_filter(bufferlist::const_iterator& iter, PGLSFilter **pfilter);

  map<hobject_t, list<OpRequestRef>> in_progress_proxy_ops;
  void kick_proxy_ops_blocked(hobject_t& soid);
  void cancel_proxy_ops(bool requeue, vector<ceph_tid_t> *tids);

  // -- proxyread --
  map<ceph_tid_t, ProxyReadOpRef> proxyread_ops;

  void do_proxy_read(OpRequestRef op, ObjectContextRef obc = NULL);
  void finish_proxy_read(hobject_t oid, ceph_tid_t tid, int r);
  void cancel_proxy_read(ProxyReadOpRef prdop, vector<ceph_tid_t> *tids);

  friend struct C_ProxyRead;

  // -- proxywrite --
  map<ceph_tid_t, ProxyWriteOpRef> proxywrite_ops;

  void do_proxy_write(OpRequestRef op, ObjectContextRef obc = NULL);
  void finish_proxy_write(hobject_t oid, ceph_tid_t tid, int r);
  void cancel_proxy_write(ProxyWriteOpRef pwop, vector<ceph_tid_t> *tids);

  friend struct C_ProxyWrite_Commit;

  // -- chunkop --
  void do_proxy_chunked_op(OpRequestRef op, const hobject_t& missing_oid,
			   ObjectContextRef obc, bool write_ordered);
  void do_proxy_chunked_read(OpRequestRef op, ObjectContextRef obc, int op_index,
			     uint64_t chunk_index, uint64_t req_offset, uint64_t req_length,
			     uint64_t req_total_len, bool write_ordered);
  bool can_proxy_chunked_read(OpRequestRef op, ObjectContextRef obc);
  void _copy_some_manifest(ObjectContextRef obc, CopyOpRef cop, uint64_t start_offset);
  void process_copy_chunk_manifest(hobject_t oid, ceph_tid_t tid, int r, uint64_t offset);
  void finish_promote_manifest(int r, CopyResults *results, ObjectContextRef obc);
  void cancel_and_requeue_proxy_ops(hobject_t oid);
  int do_manifest_flush(OpRequestRef op, ObjectContextRef obc, FlushOpRef manifest_fop,
			uint64_t start_offset, bool block);
  int start_manifest_flush(OpRequestRef op, ObjectContextRef obc, bool blocking,
			   boost::optional<std::function<void()>> &&on_flush);
  void finish_manifest_flush(hobject_t oid, ceph_tid_t tid, int r, ObjectContextRef obc,
			     uint64_t last_offset);
  void handle_manifest_flush(hobject_t oid, ceph_tid_t tid, int r,
			     uint64_t offset, uint64_t last_offset, epoch_t lpr);
  void refcount_manifest(ObjectContextRef obc, object_locator_t oloc, hobject_t soid,
			 SnapContext snapc, bool get, Context *cb, uint64_t offset);

  friend struct C_ProxyChunkRead;
  friend class PromoteManifestCallback;
  friend class C_CopyChunk;
  friend struct C_ManifestFlush;
  friend struct RefCountCallback;

public:
  PrimaryLogPG(OSDService *o, OSDMapRef curmap,
	       const PGPool &_pool,
	       const map<string,string>& ec_profile,
	       spg_t p);
  ~PrimaryLogPG() override {}

  int do_command(
    cmdmap_t cmdmap,
    ostream& ss,
    bufferlist& idata,
    bufferlist& odata,
    ConnectionRef conn,
    ceph_tid_t tid) override;

  void clear_cache();
  int get_cache_obj_count() {
    return object_contexts.get_count();
  }
  void do_request(
    OpRequestRef& op,
    ThreadPool::TPHandle &handle) override;
  void do_op(OpRequestRef& op);
  void record_write_error(OpRequestRef op, const hobject_t &soid,
			  MOSDOpReply *orig_reply, int r);
  void do_pg_op(OpRequestRef op);
  void do_scan(
    OpRequestRef op,
    ThreadPool::TPHandle &handle);
  void do_backfill(OpRequestRef op);
  void do_backfill_remove(OpRequestRef op);

  void handle_backoff(OpRequestRef& op);

  int trim_object(bool first, const hobject_t &coid, OpContextUPtr *ctxp);
  void snap_trimmer(epoch_t e) override;
  void kick_snap_trim() override;
  void snap_trimmer_scrub_complete() override;
  int do_osd_ops(OpContext *ctx, vector<OSDOp>& ops);

  int _get_tmap(OpContext *ctx, bufferlist *header, bufferlist *vals);
  int do_tmap2omap(OpContext *ctx, unsigned flags);
  int do_tmapup(OpContext *ctx, bufferlist::const_iterator& bp, OSDOp& osd_op);
  int do_tmapup_slow(OpContext *ctx, bufferlist::const_iterator& bp, OSDOp& osd_op, bufferlist& bl);

  void do_osd_op_effects(OpContext *ctx, const ConnectionRef& conn);
private:
  int do_scrub_ls(MOSDOp *op, OSDOp *osd_op);
  hobject_t earliest_backfill() const;
  bool check_src_targ(const hobject_t& soid, const hobject_t& toid) const;

  uint64_t temp_seq; ///< last id for naming temp objects
  /// generate a new temp object name
  hobject_t generate_temp_object(const hobject_t& target);
  /// generate a new temp object name (for recovery)
  hobject_t get_temp_recovery_object(const hobject_t& target,
				     eversion_t version) override;
  int get_recovery_op_priority() const {
    int64_t pri = 0;
    pool.info.opts.get(pool_opts_t::RECOVERY_OP_PRIORITY, &pri);
    return pri > 0 ? pri : cct->_conf->osd_recovery_op_priority;
  }
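
  // When set, the per-pool option takes precedence over the global
  // osd_recovery_op_priority default; a hedged operator-side example,
  // assuming the usual pool-option plumbing:
  //
  //   ceph osd pool set <pool> recovery_op_priority 5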
  void log_missing(unsigned missing,
		   const boost::optional<hobject_t> &head,
		   LogChannelRef clog,
		   const spg_t &pgid,
		   const char *func,
		   const char *mode,
		   bool allow_incomplete_clones);
  unsigned process_clones_to(const boost::optional<hobject_t> &head,
			     const boost::optional<SnapSet> &snapset,
			     LogChannelRef clog,
			     const spg_t &pgid,
			     const char *mode,
			     bool allow_incomplete_clones,
			     boost::optional<snapid_t> target,
			     vector<snapid_t>::reverse_iterator *curclone,
			     inconsistent_snapset_wrapper &snap_error);

public:
  coll_t get_coll() {
    return coll;
  }
  void split_colls(
    spg_t child,
    int split_bits,
    int seed,
    const pg_pool_t *pool,
    ObjectStore::Transaction *t) override {
    coll_t target = coll_t(child);
    PG::_create(*t, child, split_bits);
    t->split_collection(
      coll,
      split_bits,
      seed,
      target);
    PG::_init(*t, child, pool);
  }
private:

  struct DoSnapWork : boost::statechart::event< DoSnapWork > {
    DoSnapWork() : boost::statechart::event < DoSnapWork >() {}
  };
  struct KickTrim : boost::statechart::event< KickTrim > {
    KickTrim() : boost::statechart::event < KickTrim >() {}
  };
  struct RepopsComplete : boost::statechart::event< RepopsComplete > {
    RepopsComplete() : boost::statechart::event < RepopsComplete >() {}
  };
  struct ScrubComplete : boost::statechart::event< ScrubComplete > {
    ScrubComplete() : boost::statechart::event < ScrubComplete >() {}
  };
  struct TrimWriteUnblocked : boost::statechart::event< TrimWriteUnblocked > {
    TrimWriteUnblocked() : boost::statechart::event < TrimWriteUnblocked >() {}
  };
  struct Reset : boost::statechart::event< Reset > {
    Reset() : boost::statechart::event< Reset >() {}
  };
  struct SnapTrimReserved : boost::statechart::event< SnapTrimReserved > {
    SnapTrimReserved() : boost::statechart::event< SnapTrimReserved >() {}
  };
  struct SnapTrimTimerReady : boost::statechart::event< SnapTrimTimerReady > {
    SnapTrimTimerReady() : boost::statechart::event< SnapTrimTimerReady >() {}
  };

  struct NotTrimming;
  struct SnapTrimmer : public boost::statechart::state_machine< SnapTrimmer, NotTrimming > {
    PrimaryLogPG *pg;
    explicit SnapTrimmer(PrimaryLogPG *pg) : pg(pg) {}
    void log_enter(const char *state_name);
    void log_exit(const char *state_name, utime_t duration);
    bool can_trim() {
      return
	pg->is_clean() &&
	!pg->scrubber.active &&
	!pg->snap_trimq.empty() &&
	!pg->get_osdmap()->test_flag(CEPH_OSDMAP_NOSNAPTRIM);
    }
  } snap_trimmer_machine;
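
  // Hedged usage sketch: the machine is driven by posting events, e.g.
  //
  //   snap_trimmer_machine.process_event(KickTrim());
  //
  // which, when can_trim() holds, lands the machine in Trimming, whose
  // initial substate is WaitReservation (see the state declaration below).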
1589
1590 struct WaitReservation;
1591 struct Trimming : boost::statechart::state< Trimming, SnapTrimmer, WaitReservation >, NamedState {
1592 typedef boost::mpl::list <
1593 boost::statechart::custom_reaction< KickTrim >,
1594 boost::statechart::transition< Reset, NotTrimming >
1595 > reactions;
1596
1597 set<hobject_t> in_flight;
1598 snapid_t snap_to_trim;
1599
1600 explicit Trimming(my_context ctx)
1601 : my_base(ctx),
1602 NamedState(context< SnapTrimmer >().pg, "Trimming") {
1603 context< SnapTrimmer >().log_enter(state_name);
11fdf7f2
TL
1604 ceph_assert(context< SnapTrimmer >().can_trim());
1605 ceph_assert(in_flight.empty());
7c673cae
FG
1606 }
1607 void exit() {
1608 context< SnapTrimmer >().log_exit(state_name, enter_time);
1609 auto *pg = context< SnapTrimmer >().pg;
1610 pg->osd->snap_reserver.cancel_reservation(pg->get_pgid());
1611 pg->state_clear(PG_STATE_SNAPTRIM);
1612 pg->publish_stats_to_osd();
1613 }
1614 boost::statechart::result react(const KickTrim&) {
1615 return discard_event();
1616 }
1617 };
1618
1619 /* SnapTrimmerStates */
1620 struct WaitTrimTimer : boost::statechart::state< WaitTrimTimer, Trimming >, NamedState {
1621 typedef boost::mpl::list <
1622 boost::statechart::custom_reaction< SnapTrimTimerReady >
1623 > reactions;
1624 Context *wakeup = nullptr;
1625 explicit WaitTrimTimer(my_context ctx)
1626 : my_base(ctx),
1627 NamedState(context< SnapTrimmer >().pg, "Trimming/WaitTrimTimer") {
1628 context< SnapTrimmer >().log_enter(state_name);
11fdf7f2 1629 ceph_assert(context<Trimming>().in_flight.empty());
7c673cae
FG
1630 struct OnTimer : Context {
1631 PrimaryLogPGRef pg;
1632 epoch_t epoch;
1633 OnTimer(PrimaryLogPGRef pg, epoch_t epoch) : pg(pg), epoch(epoch) {}
1634 void finish(int) override {
1635 pg->lock();
1636 if (!pg->pg_has_reset_since(epoch))
1637 pg->snap_trimmer_machine.process_event(SnapTrimTimerReady());
1638 pg->unlock();
1639 }
1640 };
1641 auto *pg = context< SnapTrimmer >().pg;
1642 if (pg->cct->_conf->osd_snap_trim_sleep > 0) {
1643 std::lock_guard l(pg->osd->sleep_lock);
1644 wakeup = pg->osd->sleep_timer.add_event_after(
1645 pg->cct->_conf->osd_snap_trim_sleep,
1646 new OnTimer{pg, pg->get_osdmap_epoch()});
1647 } else {
1648 post_event(SnapTrimTimerReady());
1649 }
1650 }
1651 void exit() {
1652 context< SnapTrimmer >().log_exit(state_name, enter_time);
1653 auto *pg = context< SnapTrimmer >().pg;
1654 if (wakeup) {
1655 std::lock_guard l(pg->osd->sleep_lock);
1656 pg->osd->sleep_timer.cancel_event(wakeup);
1657 wakeup = nullptr;
1658 }
1659 }
1660 boost::statechart::result react(const SnapTrimTimerReady &) {
1661 wakeup = nullptr;
1662 if (!context< SnapTrimmer >().can_trim()) {
1663 post_event(KickTrim());
1664 return transit< NotTrimming >();
1665 } else {
1666 return transit< AwaitAsyncWork >();
1667 }
1668 }
1669 };
1670
1671 struct WaitRWLock : boost::statechart::state< WaitRWLock, Trimming >, NamedState {
1672 typedef boost::mpl::list <
1673 boost::statechart::custom_reaction< TrimWriteUnblocked >
1674 > reactions;
1675 explicit WaitRWLock(my_context ctx)
1676 : my_base(ctx),
1677 NamedState(context< SnapTrimmer >().pg, "Trimming/WaitRWLock") {
1678 context< SnapTrimmer >().log_enter(state_name);
1679 ceph_assert(context<Trimming>().in_flight.empty());
1680 }
1681 void exit() {
1682 context< SnapTrimmer >().log_exit(state_name, enter_time);
1683 }
1684 boost::statechart::result react(const TrimWriteUnblocked&) {
1685 if (!context< SnapTrimmer >().can_trim()) {
1686 post_event(KickTrim());
1687 return transit< NotTrimming >();
1688 } else {
1689 return transit< AwaitAsyncWork >();
1690 }
1691 }
1692 };
1693
1694 struct WaitRepops : boost::statechart::state< WaitRepops, Trimming >, NamedState {
1695 typedef boost::mpl::list <
1696 boost::statechart::custom_reaction< RepopsComplete >
1697 > reactions;
1698 explicit WaitRepops(my_context ctx)
1699 : my_base(ctx),
1700 NamedState(context< SnapTrimmer >().pg, "Trimming/WaitRepops") {
1701 context< SnapTrimmer >().log_enter(state_name);
1702 ceph_assert(!context<Trimming>().in_flight.empty());
1703 }
1704 void exit() {
1705 context< SnapTrimmer >().log_exit(state_name, enter_time);
1706 }
1707 boost::statechart::result react(const RepopsComplete&) {
1708 if (!context< SnapTrimmer >().can_trim()) {
1709 post_event(KickTrim());
1710 return transit< NotTrimming >();
1711 } else {
1712 return transit< WaitTrimTimer >();
1713 }
1714 }
1715 };
1716
1717 struct AwaitAsyncWork : boost::statechart::state< AwaitAsyncWork, Trimming >, NamedState {
1718 typedef boost::mpl::list <
1719 boost::statechart::custom_reaction< DoSnapWork >
1720 > reactions;
1721 explicit AwaitAsyncWork(my_context ctx);
1722 void exit() {
1723 context< SnapTrimmer >().log_exit(state_name, enter_time);
1724 }
1725 boost::statechart::result react(const DoSnapWork&);
1726 };
1727
1728 struct WaitReservation : boost::statechart::state< WaitReservation, Trimming >, NamedState {
1729 /* WaitReservation is a sub-state of trimming simply so that exiting Trimming
1730 * always cancels the reservation */
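// (The reservation is requested in the constructor below; releasing it is
// centralized in Trimming::exit() via snap_reserver.cancel_reservation().)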
1731 typedef boost::mpl::list <
1732 boost::statechart::custom_reaction< SnapTrimReserved >
1733 > reactions;
1734 struct ReservationCB : public Context {
1735 PrimaryLogPGRef pg;
1736 bool canceled;
1737 explicit ReservationCB(PrimaryLogPG *pg) : pg(pg), canceled(false) {}
1738 void finish(int) override {
1739 pg->lock();
1740 if (!canceled)
1741 pg->snap_trimmer_machine.process_event(SnapTrimReserved());
1742 pg->unlock();
1743 }
1744 void cancel() {
1745 ceph_assert(pg->is_locked());
1746 ceph_assert(!canceled);
1747 canceled = true;
1748 }
1749 };
1750 ReservationCB *pending = nullptr;
1751
1752 explicit WaitReservation(my_context ctx)
1753 : my_base(ctx),
1754 NamedState(context< SnapTrimmer >().pg, "Trimming/WaitReservation") {
1755 context< SnapTrimmer >().log_enter(state_name);
1756 ceph_assert(context<Trimming>().in_flight.empty());
1757 auto *pg = context< SnapTrimmer >().pg;
1758 pending = new ReservationCB(pg);
1759 pg->osd->snap_reserver.request_reservation(
1760 pg->get_pgid(),
1761 pending,
1762 0);
1763 pg->state_set(PG_STATE_SNAPTRIM_WAIT);
1764 pg->publish_stats_to_osd();
1765 }
1766 boost::statechart::result react(const SnapTrimReserved&);
1767 void exit() {
1768 context< SnapTrimmer >().log_exit(state_name, enter_time);
1769 if (pending)
1770 pending->cancel();
1771 pending = nullptr;
1772 auto *pg = context< SnapTrimmer >().pg;
1773 pg->state_clear(PG_STATE_SNAPTRIM_WAIT);
1774 pg->state_clear(PG_STATE_SNAPTRIM_ERROR);
1775 pg->publish_stats_to_osd();
1776 }
1777 };
1778
1779 struct WaitScrub : boost::statechart::state< WaitScrub, SnapTrimmer >, NamedState {
1780 typedef boost::mpl::list <
1781 boost::statechart::custom_reaction< ScrubComplete >,
1782 boost::statechart::custom_reaction< KickTrim >,
1783 boost::statechart::transition< Reset, NotTrimming >
1784 > reactions;
1785 explicit WaitScrub(my_context ctx)
1786 : my_base(ctx),
1787 NamedState(context< SnapTrimmer >().pg, "WaitScrub") {
1788 context< SnapTrimmer >().log_enter(state_name);
1789 }
1790 void exit() {
1791 context< SnapTrimmer >().log_exit(state_name, enter_time);
1792 }
1793 boost::statechart::result react(const ScrubComplete&) {
1794 post_event(KickTrim());
1795 return transit< NotTrimming >();
1796 }
1797 boost::statechart::result react(const KickTrim&) {
1798 return discard_event();
1799 }
1800 };
1801
1802 struct NotTrimming : boost::statechart::state< NotTrimming, SnapTrimmer >, NamedState {
1803 typedef boost::mpl::list <
1804 boost::statechart::custom_reaction< KickTrim >,
1805 boost::statechart::transition< Reset, NotTrimming >
1806 > reactions;
1807 explicit NotTrimming(my_context ctx);
1808 void exit();
1809 boost::statechart::result react(const KickTrim&);
1810 };
1811
1812 int _verify_no_head_clones(const hobject_t& soid,
1813 const SnapSet& ss);
1814 // ensure the op context reflects a newly created local object when
1815 // this op instantiates one; no-op for a whiteout or no change.
1816 void maybe_create_new_object(OpContext *ctx, bool ignore_transaction=false);
1817 int _delete_oid(OpContext *ctx, bool no_whiteout, bool try_no_whiteout);
1818 int _rollback_to(OpContext *ctx, ceph_osd_op& op);
1819public:
1820 bool is_missing_object(const hobject_t& oid) const;
1821 bool is_unreadable_object(const hobject_t &oid) const {
1822 return is_missing_object(oid) ||
1823 !missing_loc.readable_with_acting(oid, actingset);
1824 }
1825 void maybe_kick_recovery(const hobject_t &soid);
1826 void wait_for_unreadable_object(const hobject_t& oid, OpRequestRef op);
1827 void wait_for_all_missing(OpRequestRef op);
1828
1829 bool is_degraded_or_backfilling_object(const hobject_t& oid);
1830 bool is_degraded_on_async_recovery_target(const hobject_t& soid);
1831 void wait_for_degraded_object(const hobject_t& oid, OpRequestRef op);
1832
1833 void block_write_on_full_cache(
1834 const hobject_t& oid, OpRequestRef op);
1835 void block_for_clean(
1836 const hobject_t& oid, OpRequestRef op);
1837 void block_write_on_snap_rollback(
1838 const hobject_t& oid, ObjectContextRef obc, OpRequestRef op);
1839 void block_write_on_degraded_snap(const hobject_t& oid, OpRequestRef op);
1840
1841 bool maybe_await_blocked_head(const hobject_t &soid, OpRequestRef op);
1842 void wait_for_blocked_object(const hobject_t& soid, OpRequestRef op);
1843 void kick_object_context_blocked(ObjectContextRef obc);
1844
1845 void maybe_force_recovery();
1846
1847 void mark_all_unfound_lost(
1848 int what,
1849 ConnectionRef con,
1850 ceph_tid_t tid);
1851 eversion_t pick_newest_available(const hobject_t& oid);
1852
1853 void do_update_log_missing(
1854 OpRequestRef &op);
1855
1856 void do_update_log_missing_reply(
1857 OpRequestRef &op);
1858
1859 void on_role_change() override;
1860 void on_pool_change() override;
1861 void _on_new_interval() override;
1862 void clear_async_reads();
1863 void on_change(ObjectStore::Transaction *t) override;
1864 void on_activate() override;
1865 void on_flushed() override;
1866 void on_removal(ObjectStore::Transaction *t) override;
1867 void on_shutdown() override;
1868 bool check_failsafe_full() override;
1869 bool check_osdmap_full(const set<pg_shard_t> &missing_on) override;
1870 bool maybe_preempt_replica_scrub(const hobject_t& oid) override {
1871 return write_blocked_by_scrub(oid);
1872 }
1873 int rep_repair_primary_object(const hobject_t& soid, OpContext *ctx);
1874
1875 // attr cache handling
1876 void setattr_maybe_cache(
1877 ObjectContextRef obc,
1878 PGTransaction *t,
1879 const string &key,
1880 bufferlist &val);
1881 void setattrs_maybe_cache(
1882 ObjectContextRef obc,
1883 PGTransaction *t,
1884 map<string, bufferlist> &attrs);
1885 void rmattr_maybe_cache(
1886 ObjectContextRef obc,
1887 PGTransaction *t,
1888 const string &key);
1889 int getattr_maybe_cache(
1890 ObjectContextRef obc,
1891 const string &key,
1892 bufferlist *val);
1893 int getattrs_maybe_cache(
1894 ObjectContextRef obc,
1895 map<string, bufferlist> *out);
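// Editor's gloss from the signatures above: the read variants return an
// error code and may be satisfied from the object-context attr cache; the
// write variants record the mutation in the supplied PGTransaction.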
1896
1897public:
1898 void set_dynamic_perf_stats_queries(
1899 const std::list<OSDPerfMetricQuery> &queries) override;
1900 void get_dynamic_perf_stats(DynamicPerfStats *stats) override;
1901
1902private:
1903 DynamicPerfStats m_dynamic_perf_stats;
1904};
1905
1906inline ostream& operator<<(ostream& out, const PrimaryLogPG::RepGather& repop)
1907{
1908 out << "repgather(" << &repop
1909 << " " << repop.v
1910 << " rep_tid=" << repop.rep_tid
1911 << " committed?=" << repop.all_committed
1912 << " r=" << repop.r
1913 << ")";
1914 return out;
1915}
1916
1917inline ostream& operator<<(ostream& out,
1918 const PrimaryLogPG::ProxyWriteOpRef& pwop)
1919{
1920 out << "proxywrite(" << &pwop
1921 << " " << pwop->user_version
1922 << " pwop_tid=" << pwop->objecter_tid;
1923 if (pwop->ctx->op)
1924 out << " op=" << *(pwop->ctx->op->get_req());
1925 out << ")";
1926 return out;
1927}
1928
1929void intrusive_ptr_add_ref(PrimaryLogPG::RepGather *repop);
1930void intrusive_ptr_release(PrimaryLogPG::RepGather *repop);
1931
1932
1933#endif