// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab

#include "PGPeeringEvent.h"
#include "common/ceph_releases.h"
#include "common/dout.h"
#include "PeeringState.h"

#include "messages/MOSDPGRemove.h"
#include "messages/MBackfillReserve.h"
#include "messages/MRecoveryReserve.h"
#include "messages/MOSDScrubReserve.h"
#include "messages/MOSDPGInfo.h"
#include "messages/MOSDPGInfo2.h"
#include "messages/MOSDPGTrim.h"
#include "messages/MOSDPGLog.h"
#include "messages/MOSDPGNotify.h"
#include "messages/MOSDPGNotify2.h"
#include "messages/MOSDPGQuery.h"
#include "messages/MOSDPGQuery2.h"
#include "messages/MOSDPGLease.h"
#include "messages/MOSDPGLeaseAck.h"

#define dout_context cct
#define dout_subsys ceph_subsys_osd

using std::dec;
using std::hex;
using std::make_pair;
using std::map;
using std::ostream;
using std::pair;
using std::set;
using std::string;  // needed by query_unfound(Formatter*, string) below
using std::stringstream;
using std::vector;

using ceph::Formatter;
using ceph::make_message;

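// BufferedRecoveryMessages batches outgoing peering messages so they can be
// handed back to a PeeringCtx as a unit (see accept_buffered_messages); the
// constructor below takes ownership of any messages already queued on the
// supplied context.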
BufferedRecoveryMessages::BufferedRecoveryMessages(
  ceph_release_t r,
  PeeringCtx &ctx)
  : require_osd_release(r) {
  // steal messages from ctx
  message_map.swap(ctx.message_map);
}

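// The send_* helpers below pick the wire format based on the cluster's
// require_osd_release: octopus and later OSDs understand the per-PG v2
// messages (MOSDPGNotify2, MOSDPGQuery2, MOSDPGInfo2), while older peers
// get the legacy batched variants.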
void BufferedRecoveryMessages::send_notify(int to, const pg_notify_t &n)
{
  if (require_osd_release >= ceph_release_t::octopus) {
    spg_t pgid(n.info.pgid.pgid, n.to);
    send_osd_message(to, make_message<MOSDPGNotify2>(pgid, n));
  } else {
    send_osd_message(to, make_message<MOSDPGNotify>(n.epoch_sent, vector{n}));
  }
}

void BufferedRecoveryMessages::send_query(
  int to,
  spg_t to_spgid,
  const pg_query_t &q)
{
  if (require_osd_release >= ceph_release_t::octopus) {
    send_osd_message(to,
                     make_message<MOSDPGQuery2>(to_spgid, q));
  } else {
    auto m = make_message<MOSDPGQuery>(
      q.epoch_sent,
      MOSDPGQuery::pg_list_t{{to_spgid, q}});
    send_osd_message(to, m);
  }
}

void BufferedRecoveryMessages::send_info(
  int to,
  spg_t to_spgid,
  epoch_t min_epoch,
  epoch_t cur_epoch,
  const pg_info_t &info,
  std::optional<pg_lease_t> lease,
  std::optional<pg_lease_ack_t> lease_ack)
{
  if (require_osd_release >= ceph_release_t::octopus) {
    send_osd_message(
      to,
      make_message<MOSDPGInfo2>(
        to_spgid,
        info,
        cur_epoch,
        min_epoch,
        lease,
        lease_ack)
      );
  } else {
    send_osd_message(
      to,
      make_message<MOSDPGInfo>(
        cur_epoch,
        vector{pg_notify_t{to_spgid.shard,
                           info.pgid.shard,
                           min_epoch, cur_epoch,
                           info, PastIntervals{}}})
      );
  }
}

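// Refresh the cached pool metadata from a new OSDMap.  The snap context is
// only rebuilt when an update may have been missed: either one or more map
// epochs were skipped, or the pool's snap epoch changed in this very epoch.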
void PGPool::update(OSDMapRef map)
{
  const pg_pool_t *pi = map->get_pg_pool(id);
  if (!pi) {
    return; // pool has been deleted
  }
  info = *pi;
  name = map->get_pool_name(id);

  bool updated = false;
  if ((map->get_epoch() != cached_epoch + 1) ||
      (pi->get_snap_epoch() == map->get_epoch())) {
    updated = true;
  }

  if (info.is_pool_snaps_mode() && updated) {
    snapc = pi->get_snap_context();
  }
  cached_epoch = map->get_epoch();
}

/*-------------Peering State Helpers----------------*/
#undef dout_prefix
#define dout_prefix (dpp->gen_prefix(*_dout))
#undef psdout
#define psdout(x) ldout(cct, x)

PeeringState::PeeringState(
  CephContext *cct,
  pg_shard_t pg_whoami,
  spg_t spgid,
  const PGPool &_pool,
  OSDMapRef curmap,
  DoutPrefixProvider *dpp,
  PeeringListener *pl)
  : state_history(*pl),
    cct(cct),
    spgid(spgid),
    dpp(dpp),
    pl(pl),
    orig_ctx(0),
    osdmap_ref(curmap),
    pool(_pool),
    pg_whoami(pg_whoami),
    info(spgid),
    pg_log(cct),
    missing_loc(spgid, this, dpp, cct),
    machine(this, cct, spgid, dpp, pl, &state_history)
{
  machine.initiate();
}

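// Handle/context lifecycle: start_handle() adopts the caller's PeeringCtx
// for the duration of an event; begin_block_outgoing() temporarily redirects
// outgoing messages into messages_pending_flush, end_block_outgoing() folds
// them back into the original context, and end_handle() releases the context
// and accounts the event's wall-clock time.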
void PeeringState::start_handle(PeeringCtx *new_ctx) {
  ceph_assert(!rctx);
  ceph_assert(!orig_ctx);
  orig_ctx = new_ctx;
  if (new_ctx) {
    if (messages_pending_flush) {
      rctx.emplace(*messages_pending_flush, *new_ctx);
    } else {
      rctx.emplace(*new_ctx);
    }
    rctx->start_time = ceph_clock_now();
  }
}

void PeeringState::begin_block_outgoing() {
  ceph_assert(!messages_pending_flush);
  ceph_assert(orig_ctx);
  ceph_assert(rctx);
  messages_pending_flush = BufferedRecoveryMessages(
    orig_ctx->require_osd_release);
  rctx.emplace(*messages_pending_flush, *orig_ctx);
}

void PeeringState::clear_blocked_outgoing() {
  ceph_assert(orig_ctx);
  ceph_assert(rctx);
  messages_pending_flush = std::optional<BufferedRecoveryMessages>();
}

void PeeringState::end_block_outgoing() {
  ceph_assert(messages_pending_flush);
  ceph_assert(orig_ctx);
  ceph_assert(rctx);

  orig_ctx->accept_buffered_messages(*messages_pending_flush);
  rctx.emplace(*orig_ctx);
  messages_pending_flush = std::optional<BufferedRecoveryMessages>();
}

void PeeringState::end_handle() {
  if (rctx) {
    utime_t dur = ceph_clock_now() - rctx->start_time;
    machine.event_time += dur;
  }

  machine.event_count++;
  rctx = std::nullopt;
  orig_ctx = NULL;
}

void PeeringState::check_recovery_sources(const OSDMapRef& osdmap)
{
  /*
   * check that any peers we are planning to pull (or are currently
   * pulling) objects from are dealt with.
   */
  missing_loc.check_recovery_sources(osdmap);
  pl->check_recovery_sources(osdmap);

  for (auto i = peer_log_requested.begin(); i != peer_log_requested.end();) {
    if (!osdmap->is_up(i->osd)) {
      psdout(10) << "peer_log_requested removing " << *i << dendl;
      peer_log_requested.erase(i++);
    } else {
      ++i;
    }
  }

  for (auto i = peer_missing_requested.begin();
       i != peer_missing_requested.end();) {
    if (!osdmap->is_up(i->osd)) {
      psdout(10) << "peer_missing_requested removing " << *i << dendl;
      peer_missing_requested.erase(i++);
    } else {
      ++i;
    }
  }
}

void PeeringState::update_history(const pg_history_t& new_history)
{
  auto mnow = pl->get_mnow();
  info.history.refresh_prior_readable_until_ub(mnow, prior_readable_until_ub);
  if (info.history.merge(new_history)) {
    psdout(20) << __func__ << " advanced history from " << new_history << dendl;
    dirty_info = true;
    if (info.history.last_epoch_clean >= info.history.same_interval_since) {
      psdout(20) << __func__ << " clearing past_intervals" << dendl;
      past_intervals.clear();
      dirty_big_info = true;
    }
    prior_readable_until_ub = info.history.get_prior_readable_until_ub(mnow);
    if (prior_readable_until_ub != ceph::signedspan::zero()) {
      dout(20) << __func__
               << " prior_readable_until_ub " << prior_readable_until_ub
               << " (mnow " << mnow << " + "
               << info.history.prior_readable_until_ub << ")" << dendl;
    }
  }
  pl->on_info_history_change();
}

hobject_t PeeringState::earliest_backfill() const
{
  hobject_t e = hobject_t::get_max();
  for (const pg_shard_t& bt : get_backfill_targets()) {
    const pg_info_t &pi = get_peer_info(bt);
    e = std::min(pi.last_backfill, e);
  }
  return e;
}

void PeeringState::purge_strays()
{
  if (is_premerge()) {
    psdout(10) << "purge_strays " << stray_set << " but premerge, doing nothing"
               << dendl;
    return;
  }
  if (cct->_conf.get_val<bool>("osd_debug_no_purge_strays")) {
    return;
  }
  psdout(10) << "purge_strays " << stray_set << dendl;

  bool removed = false;
  for (auto p = stray_set.begin(); p != stray_set.end(); ++p) {
    ceph_assert(!is_acting_recovery_backfill(*p));
    if (get_osdmap()->is_up(p->osd)) {
      psdout(10) << "sending PGRemove to osd." << *p << dendl;
      vector<spg_t> to_remove;
      to_remove.push_back(spg_t(info.pgid.pgid, p->shard));
      auto m = make_message<MOSDPGRemove>(
        get_osdmap_epoch(),
        to_remove);
      pl->send_cluster_message(p->osd, m, get_osdmap_epoch());
    } else {
      psdout(10) << "not sending PGRemove to down osd." << *p << dendl;
    }
    peer_missing.erase(*p);
    peer_info.erase(*p);
    missing_loc.remove_stray_recovery_sources(*p);
    peer_purged.insert(*p);
    removed = true;
  }

  // if we removed anyone, update peers (which include peer_info)
  if (removed)
    update_heartbeat_peers();

  stray_set.clear();

  // clear _requested maps; we may have to peer() again if we discover
  // (more) stray content
  peer_log_requested.clear();
  peer_missing_requested.clear();
}

void PeeringState::query_unfound(Formatter *f, string state)
{
  psdout(20) << "Enter PeeringState common QueryUnfound" << dendl;
  {
    f->dump_string("state", state);
    f->dump_bool("available_might_have_unfound", true);
    f->open_array_section("might_have_unfound");
    for (auto p = might_have_unfound.begin();
         p != might_have_unfound.end();
         ++p) {
      if (peer_missing.count(*p)) {
        ; // Ignore already probed OSDs
      } else {
        f->open_object_section("osd");
        f->dump_stream("osd") << *p;
        if (peer_missing_requested.count(*p)) {
          f->dump_string("status", "querying");
        } else if (!get_osdmap()->is_up(p->osd)) {
          f->dump_string("status", "osd is down");
        } else {
          f->dump_string("status", "not queried");
        }
        f->close_section();
      }
    }
    f->close_section();
  }
  psdout(20) << "Exit PeeringState common QueryUnfound" << dendl;
  return;
}

bool PeeringState::proc_replica_info(
  pg_shard_t from, const pg_info_t &oinfo, epoch_t send_epoch)
{
  auto p = peer_info.find(from);
  if (p != peer_info.end() && p->second.last_update == oinfo.last_update) {
    psdout(10) << " got dup osd." << from << " info "
               << oinfo << ", identical to ours" << dendl;
    return false;
  }

  if (!get_osdmap()->has_been_up_since(from.osd, send_epoch)) {
    psdout(10) << " got info " << oinfo << " from down osd." << from
               << " discarding" << dendl;
    return false;
  }

  psdout(10) << " got osd." << from << " " << oinfo << dendl;
  ceph_assert(is_primary());
  peer_info[from] = oinfo;
  might_have_unfound.insert(from);

  update_history(oinfo.history);

  // stray?
  if (!is_up(from) && !is_acting(from)) {
    psdout(10) << " osd." << from << " has stray content: " << oinfo << dendl;
    stray_set.insert(from);
    if (is_clean()) {
      purge_strays();
    }
  }

  // was this a new info?  if so, update peers!
  if (p == peer_info.end())
    update_heartbeat_peers();

  return true;
}


void PeeringState::remove_down_peer_info(const OSDMapRef &osdmap)
{
  // Remove any downed osds from peer_info
  bool removed = false;
  auto p = peer_info.begin();
  while (p != peer_info.end()) {
    if (!osdmap->is_up(p->first.osd)) {
      psdout(10) << " dropping down osd." << p->first << " info " << p->second << dendl;
      peer_missing.erase(p->first);
      peer_log_requested.erase(p->first);
      peer_missing_requested.erase(p->first);
      peer_info.erase(p++);
      removed = true;
    } else
      ++p;
  }

  // Remove any downed osds from peer_purged so we can re-purge if necessary
  auto it = peer_purged.begin();
  while (it != peer_purged.end()) {
    if (!osdmap->is_up(it->osd)) {
      psdout(10) << " dropping down osd." << *it << " from peer_purged" << dendl;
      peer_purged.erase(it++);
    } else {
      ++it;
    }
  }

  // if we removed anyone, update peers (which include peer_info)
  if (removed)
    update_heartbeat_peers();

  check_recovery_sources(osdmap);
}

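// The heartbeat peer set is maintained only on the primary: it is the union
// of every osd id in the acting set, the up set, and any osd we hold a
// peer_info entry for.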
void PeeringState::update_heartbeat_peers()
{
  if (!is_primary())
    return;

  set<int> new_peers;
  for (unsigned i=0; i<acting.size(); i++) {
    if (acting[i] != CRUSH_ITEM_NONE)
      new_peers.insert(acting[i]);
  }
  for (unsigned i=0; i<up.size(); i++) {
    if (up[i] != CRUSH_ITEM_NONE)
      new_peers.insert(up[i]);
  }
  for (auto p = peer_info.begin(); p != peer_info.end(); ++p) {
    new_peers.insert(p->first.osd);
  }
  pl->update_heartbeat_peers(std::move(new_peers));
}

void PeeringState::write_if_dirty(ObjectStore::Transaction& t)
{
  pl->prepare_write(
    info,
    last_written_info,
    past_intervals,
    pg_log,
    dirty_info,
    dirty_big_info,
    last_persisted_osdmap < get_osdmap_epoch(),
    t);
  if (dirty_info || dirty_big_info) {
    last_persisted_osdmap = get_osdmap_epoch();
    last_written_info = info;
    dirty_info = false;
    dirty_big_info = false;
  }
}

void PeeringState::advance_map(
  OSDMapRef osdmap, OSDMapRef lastmap,
  vector<int>& newup, int up_primary,
  vector<int>& newacting, int acting_primary,
  PeeringCtx &rctx)
{
  ceph_assert(lastmap == osdmap_ref);
  psdout(10) << "handle_advance_map "
             << newup << "/" << newacting
             << " -- " << up_primary << "/" << acting_primary
             << dendl;

  update_osdmap_ref(osdmap);
  pool.update(osdmap);

  AdvMap evt(
    osdmap, lastmap, newup, up_primary,
    newacting, acting_primary);
  handle_event(evt, &rctx);
  if (pool.info.last_change == osdmap_ref->get_epoch()) {
    pl->on_pool_change();
  }
  readable_interval = pool.get_readable_interval(cct->_conf);
  last_require_osd_release = osdmap->require_osd_release;
}

void PeeringState::activate_map(PeeringCtx &rctx)
{
  psdout(10) << __func__ << dendl;
  ActMap evt;
  handle_event(evt, &rctx);
  if (osdmap_ref->get_epoch() - last_persisted_osdmap >
      cct->_conf->osd_pg_epoch_persisted_max_stale) {
    psdout(20) << __func__ << ": Dirtying info: last_persisted is "
               << last_persisted_osdmap
               << " while current is " << osdmap_ref->get_epoch() << dendl;
    dirty_info = true;
  } else {
    psdout(20) << __func__ << ": Not dirtying info: last_persisted is "
               << last_persisted_osdmap
               << " while current is " << osdmap_ref->get_epoch() << dendl;
  }
  write_if_dirty(rctx.transaction);

  if (get_osdmap()->check_new_blocklist_entries()) {
    pl->check_blocklisted_watchers();
  }
}

void PeeringState::set_last_peering_reset()
{
  psdout(20) << "set_last_peering_reset " << get_osdmap_epoch() << dendl;
  if (last_peering_reset != get_osdmap_epoch()) {
    last_peering_reset = get_osdmap_epoch();
    psdout(10) << "Clearing blocked outgoing recovery messages" << dendl;
    clear_blocked_outgoing();
    if (!pl->try_flush_or_schedule_async()) {
      psdout(10) << "Beginning to block outgoing recovery messages" << dendl;
      begin_block_outgoing();
    } else {
      psdout(10) << "Not blocking outgoing recovery messages" << dendl;
    }
  }
}

void PeeringState::complete_flush()
{
  flushes_in_progress--;
  if (flushes_in_progress == 0) {
    pl->on_flushed();
  }
}

void PeeringState::check_full_transition(OSDMapRef lastmap, OSDMapRef osdmap)
{
  const pg_pool_t *pi = osdmap->get_pg_pool(info.pgid.pool());
  if (!pi) {
    return; // pool deleted
  }
  bool changed = false;
  if (pi->has_flag(pg_pool_t::FLAG_FULL)) {
    const pg_pool_t *opi = lastmap->get_pg_pool(info.pgid.pool());
    if (!opi || !opi->has_flag(pg_pool_t::FLAG_FULL)) {
      psdout(10) << " pool was marked full in " << osdmap->get_epoch() << dendl;
      changed = true;
    }
  }
  if (changed) {
    info.history.last_epoch_marked_full = osdmap->get_epoch();
    dirty_info = true;
  }
}

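// Peering must restart either when the mapping change constitutes a new
// interval (PastIntervals::is_new_interval compares the old and new
// up/acting sets, primaries, and per-pool state across the two maps) or when
// this OSD itself just transitioned from down to up.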
bool PeeringState::should_restart_peering(
  int newupprimary,
  int newactingprimary,
  const vector<int>& newup,
  const vector<int>& newacting,
  OSDMapRef lastmap,
  OSDMapRef osdmap)
{
  if (PastIntervals::is_new_interval(
        primary.osd,
        newactingprimary,
        acting,
        newacting,
        up_primary.osd,
        newupprimary,
        up,
        newup,
        osdmap.get(),
        lastmap.get(),
        info.pgid.pgid)) {
    psdout(20) << "new interval newup " << newup
               << " newacting " << newacting << dendl;
    return true;
  }
  if (!lastmap->is_up(pg_whoami.osd) && osdmap->is_up(pg_whoami.osd)) {
    psdout(10) << __func__ << " osd transitioned from down -> up"
               << dendl;
    return true;
  }
  return false;
}

/* Called before initializing peering during advance_map */
void PeeringState::start_peering_interval(
  const OSDMapRef lastmap,
  const vector<int>& newup, int new_up_primary,
  const vector<int>& newacting, int new_acting_primary,
  ObjectStore::Transaction &t)
{
  const OSDMapRef osdmap = get_osdmap();

  set_last_peering_reset();

  vector<int> oldacting, oldup;
  int oldrole = get_role();

  if (is_primary()) {
    pl->clear_ready_to_merge();
  }


  pg_shard_t old_acting_primary = get_primary();
  pg_shard_t old_up_primary = up_primary;
  bool was_old_primary = is_primary();
  bool was_old_nonprimary = is_nonprimary();

  acting.swap(oldacting);
  up.swap(oldup);
  init_primary_up_acting(
    newup,
    newacting,
    new_up_primary,
    new_acting_primary);

  if (info.stats.up != up ||
      info.stats.acting != acting ||
      info.stats.up_primary != new_up_primary ||
      info.stats.acting_primary != new_acting_primary) {
    info.stats.up = up;
    info.stats.up_primary = new_up_primary;
    info.stats.acting = acting;
    info.stats.acting_primary = new_acting_primary;
    info.stats.mapping_epoch = osdmap->get_epoch();
  }

  pl->clear_publish_stats();

  // This will now be remapped during a backfill in cases
  // where it would not have been before.
  if (up != acting)
    state_set(PG_STATE_REMAPPED);
  else
    state_clear(PG_STATE_REMAPPED);

  int role = osdmap->calc_pg_role(pg_whoami, acting);
  set_role(role);

  // did acting, up, primary|acker change?
  if (!lastmap) {
    psdout(10) << " no lastmap" << dendl;
    dirty_info = true;
    dirty_big_info = true;
    info.history.same_interval_since = osdmap->get_epoch();
  } else {
    std::stringstream debug;
    ceph_assert(info.history.same_interval_since != 0);
    bool new_interval = PastIntervals::check_new_interval(
      old_acting_primary.osd,
      new_acting_primary,
      oldacting, newacting,
      old_up_primary.osd,
      new_up_primary,
      oldup, newup,
      info.history.same_interval_since,
      info.history.last_epoch_clean,
      osdmap.get(),
      lastmap.get(),
      info.pgid.pgid,
      missing_loc.get_recoverable_predicate(),
      &past_intervals,
      &debug);
    psdout(10) << __func__ << ": check_new_interval output: "
               << debug.str() << dendl;
    if (new_interval) {
      if (osdmap->get_epoch() == pl->oldest_stored_osdmap() &&
          info.history.last_epoch_clean < osdmap->get_epoch()) {
        psdout(10) << " map gap, clearing past_intervals and faking" << dendl;
        // our information is incomplete and useless; someone else was clean
        // after everything we know, if osdmaps were trimmed.
        past_intervals.clear();
      } else {
        psdout(10) << " noting past " << past_intervals << dendl;
      }
      dirty_info = true;
      dirty_big_info = true;
      info.history.same_interval_since = osdmap->get_epoch();
      if (osdmap->have_pg_pool(info.pgid.pgid.pool()) &&
          info.pgid.pgid.is_split(lastmap->get_pg_num(info.pgid.pgid.pool()),
                                  osdmap->get_pg_num(info.pgid.pgid.pool()),
                                  nullptr)) {
        info.history.last_epoch_split = osdmap->get_epoch();
      }
    }
  }

  if (old_up_primary != up_primary ||
      oldup != up) {
    info.history.same_up_since = osdmap->get_epoch();
  }
  // this comparison includes primary rank via pg_shard_t
  if (old_acting_primary != get_primary()) {
    info.history.same_primary_since = osdmap->get_epoch();
  }

  on_new_interval();
  pl->on_info_history_change();

  psdout(1) << __func__ << " up " << oldup << " -> " << up
            << ", acting " << oldacting << " -> " << acting
            << ", acting_primary " << old_acting_primary << " -> "
            << new_acting_primary
            << ", up_primary " << old_up_primary << " -> " << new_up_primary
            << ", role " << oldrole << " -> " << role
            << ", features acting " << acting_features
            << " upacting " << upacting_features
            << dendl;

  // deactivate.
  state_clear(PG_STATE_ACTIVE);
  state_clear(PG_STATE_PEERED);
  state_clear(PG_STATE_PREMERGE);
  state_clear(PG_STATE_DOWN);
  state_clear(PG_STATE_RECOVERY_WAIT);
  state_clear(PG_STATE_RECOVERY_TOOFULL);
  state_clear(PG_STATE_RECOVERING);

  peer_purged.clear();
  acting_recovery_backfill.clear();

  // reset primary/replica state?
  if (was_old_primary || is_primary()) {
    pl->clear_want_pg_temp();
  } else if (was_old_nonprimary || is_nonprimary()) {
    pl->clear_want_pg_temp();
  }
  clear_primary_state();

  pl->on_change(t);

  ceph_assert(!deleting);

  // should we tell the primary we are here?
  send_notify = !is_primary();

  if (role != oldrole ||
      was_old_primary != is_primary()) {
    // did primary change?
    if (was_old_primary != is_primary()) {
      state_clear(PG_STATE_CLEAN);
    }

    pl->on_role_change();
  } else {
    // no role change.
    // did primary change?
    if (get_primary() != old_acting_primary) {
      psdout(10) << oldacting << " -> " << acting
                 << ", acting primary "
                 << old_acting_primary << " -> " << get_primary()
                 << dendl;
    } else {
      // primary is the same.
      if (is_primary()) {
        // i am (still) primary. but my replica set changed.
        state_clear(PG_STATE_CLEAN);

        psdout(10) << oldacting << " -> " << acting
                   << ", replicas changed" << dendl;
      }
    }
  }

  if (acting.empty() && !up.empty() && up_primary == pg_whoami) {
    psdout(10) << " acting empty, but i am up[0], clearing pg_temp" << dendl;
    pl->queue_want_pg_temp(acting);
  }
}

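// On a new interval we recompute the feature intersection across the acting
// and up sets, rebuild the missing set if deletes need to be represented,
// reset heartbeat stamps, and roll the lease bounds forward so that any
// readable-until guarantee from the prior interval survives only as an
// upper bound.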
void PeeringState::on_new_interval()
{
  dout(20) << __func__ << dendl;
  const OSDMapRef osdmap = get_osdmap();

  // initialize features
  acting_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
  upacting_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
  for (auto p = acting.begin(); p != acting.end(); ++p) {
    if (*p == CRUSH_ITEM_NONE)
      continue;
    uint64_t f = osdmap->get_xinfo(*p).features;
    acting_features &= f;
    upacting_features &= f;
  }
  for (auto p = up.begin(); p != up.end(); ++p) {
    if (*p == CRUSH_ITEM_NONE)
      continue;
    upacting_features &= osdmap->get_xinfo(*p).features;
  }
  psdout(20) << __func__ << " upacting_features 0x" << std::hex
             << upacting_features << std::dec
             << " from " << acting << "+" << up << dendl;

  psdout(20) << __func__ << " checking missing set deletes flag. missing = "
             << get_pg_log().get_missing() << dendl;

  if (!pg_log.get_missing().may_include_deletes &&
      !perform_deletes_during_peering()) {
    pl->rebuild_missing_set_with_deletes(pg_log);
  }
  ceph_assert(
    pg_log.get_missing().may_include_deletes ==
    !perform_deletes_during_peering());

  init_hb_stamps();

  // update lease bounds for a new interval
  auto mnow = pl->get_mnow();
  prior_readable_until_ub = std::max(prior_readable_until_ub,
                                     readable_until_ub);
  prior_readable_until_ub = info.history.refresh_prior_readable_until_ub(
    mnow, prior_readable_until_ub);
  psdout(10) << __func__ << " prior_readable_until_ub "
             << prior_readable_until_ub << " (mnow " << mnow << " + "
             << info.history.prior_readable_until_ub << ")" << dendl;
  prior_readable_down_osds.clear(); // we populate this when we build the priorset

  readable_until =
    readable_until_ub =
    readable_until_ub_sent =
    readable_until_ub_from_primary = ceph::signedspan::zero();

  acting_readable_until_ub.clear();
  if (is_primary()) {
    acting_readable_until_ub.resize(acting.size(), ceph::signedspan::zero());
  }

  pl->on_new_interval();
}

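// Rebuild primary/up/acting bookkeeping from the raw osd id vectors.  For
// erasure-coded pools each position carries its shard id, so the primary is
// located by scanning for the matching osd id; replicated pools use
// NO_SHARD throughout.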
void PeeringState::init_primary_up_acting(
  const vector<int> &newup,
  const vector<int> &newacting,
  int new_up_primary,
  int new_acting_primary)
{
  actingset.clear();
  acting = newacting;
  for (uint8_t i = 0; i < acting.size(); ++i) {
    if (acting[i] != CRUSH_ITEM_NONE)
      actingset.insert(
        pg_shard_t(
          acting[i],
          pool.info.is_erasure() ? shard_id_t(i) : shard_id_t::NO_SHARD));
  }
  upset.clear();
  up = newup;
  for (uint8_t i = 0; i < up.size(); ++i) {
    if (up[i] != CRUSH_ITEM_NONE)
      upset.insert(
        pg_shard_t(
          up[i],
          pool.info.is_erasure() ? shard_id_t(i) : shard_id_t::NO_SHARD));
  }
  if (!pool.info.is_erasure()) {
    // replicated
    up_primary = pg_shard_t(new_up_primary, shard_id_t::NO_SHARD);
    primary = pg_shard_t(new_acting_primary, shard_id_t::NO_SHARD);
  } else {
    // erasure
    up_primary = pg_shard_t();
    primary = pg_shard_t();
    for (uint8_t i = 0; i < up.size(); ++i) {
      if (up[i] == new_up_primary) {
        up_primary = pg_shard_t(up[i], shard_id_t(i));
        break;
      }
    }
    for (uint8_t i = 0; i < acting.size(); ++i) {
      if (acting[i] == new_acting_primary) {
        primary = pg_shard_t(acting[i], shard_id_t(i));
        break;
      }
    }
    ceph_assert(up_primary.osd == new_up_primary);
    ceph_assert(primary.osd == new_acting_primary);
  }
}

void PeeringState::init_hb_stamps()
{
  if (is_primary()) {
    // we care about all other osds in the acting set
    hb_stamps.resize(acting.size() - 1);
    unsigned i = 0;
    for (auto p : acting) {
      if (p == CRUSH_ITEM_NONE || p == get_primary().osd) {
        continue;
      }
      hb_stamps[i++] = pl->get_hb_stamps(p);
    }
    hb_stamps.resize(i);
  } else if (is_nonprimary()) {
    // we care about just the primary
    hb_stamps.resize(1);
    hb_stamps[0] = pl->get_hb_stamps(get_primary().osd);
  } else {
    hb_stamps.clear();
  }
  dout(10) << __func__ << " now " << hb_stamps << dendl;
}


void PeeringState::clear_recovery_state()
{
  async_recovery_targets.clear();
  backfill_targets.clear();
}

void PeeringState::clear_primary_state()
{
  psdout(10) << "clear_primary_state" << dendl;

  // clear peering state
  stray_set.clear();
  peer_log_requested.clear();
  peer_missing_requested.clear();
  peer_info.clear();
  peer_bytes.clear();
  peer_missing.clear();
  peer_last_complete_ondisk.clear();
  peer_activated.clear();
  min_last_complete_ondisk = eversion_t();
  pg_trim_to = eversion_t();
  might_have_unfound.clear();
  need_up_thru = false;
  missing_loc.clear();
  pg_log.reset_recovery_pointers();

  clear_recovery_state();

  last_update_ondisk = eversion_t();
  missing_loc.clear();
  pl->clear_primary_state();
}

/// return [start,end) bounds for required past_intervals
static pair<epoch_t, epoch_t> get_required_past_interval_bounds(
  const pg_info_t &info,
  epoch_t oldest_map) {
  epoch_t start = std::max(
    info.history.last_epoch_clean ? info.history.last_epoch_clean :
    info.history.epoch_pool_created,
    oldest_map);
  epoch_t end = std::max(
    info.history.same_interval_since,
    info.history.epoch_pool_created);
  return make_pair(start, end);
}
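// Worked example (hypothetical epochs): with last_epoch_clean = 1000,
// oldest_map = 1200, and same_interval_since = 1500, the required bounds are
// [max(1000, 1200), 1500) = [1200, 1500): intervals older than the oldest
// stored map cannot be reconstructed, so they are not required.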


void PeeringState::check_past_interval_bounds() const
{
  auto oldest_epoch = pl->oldest_stored_osdmap();
  auto rpib = get_required_past_interval_bounds(
    info,
    oldest_epoch);
  if (rpib.first >= rpib.second) {
    // do not warn if the start bound is dictated by oldest_map; the
    // past intervals are presumably appropriate given the pg info.
    if (!past_intervals.empty() &&
        rpib.first > oldest_epoch) {
      pl->get_clog_error() << info.pgid << " required past_interval bounds are"
                           << " empty [" << rpib << ") but past_intervals is not: "
                           << past_intervals;
      derr << info.pgid << " required past_interval bounds are"
           << " empty [" << rpib << ") but past_intervals is not: "
           << past_intervals << dendl;
    }
  } else {
    if (past_intervals.empty()) {
      pl->get_clog_error() << info.pgid << " required past_interval bounds are"
                           << " not empty [" << rpib << ") but past_intervals "
                           << past_intervals << " is empty";
      derr << info.pgid << " required past_interval bounds are"
           << " not empty [" << rpib << ") but past_intervals "
           << past_intervals << " is empty" << dendl;
      ceph_assert(!past_intervals.empty());
    }

    auto apib = past_intervals.get_bounds();
    if (apib.first > rpib.first) {
      pl->get_clog_error() << info.pgid << " past_intervals [" << apib
                           << ") start interval does not contain the required"
                           << " bound [" << rpib << ") start";
      derr << info.pgid << " past_intervals [" << apib
           << ") start interval does not contain the required"
           << " bound [" << rpib << ") start" << dendl;
      ceph_abort_msg("past_interval start interval mismatch");
    }
    if (apib.second != rpib.second) {
      pl->get_clog_error() << info.pgid << " past_interval bound [" << apib
                           << ") end does not match required [" << rpib
                           << ") end";
      derr << info.pgid << " past_interval bound [" << apib
           << ") end does not match required [" << rpib
           << ") end" << dendl;
      ceph_abort_msg("past_interval end mismatch");
    }
  }
}

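// clamp_recovery_priority folds the pool's configured recovery priority
// (clamped to [OSD_POOL_PRIORITY_MIN, OSD_POOL_PRIORITY_MAX] and shifted to
// a non-negative offset) into a base priority, then clamps the result to
// [OSD_RECOVERY_PRIORITY_MIN, max].  Worked example with hypothetical
// values, assuming pool priority bounds of -10..10: a pool priority of -3
// becomes offset 7, so a base priority of 180 yields 187 before the final
// clamp.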
int PeeringState::clamp_recovery_priority(int priority, int pool_recovery_priority, int max)
{
  static_assert(OSD_RECOVERY_PRIORITY_MIN < OSD_RECOVERY_PRIORITY_MAX, "Invalid priority range");
  static_assert(OSD_RECOVERY_PRIORITY_MIN >= 0, "Priority range must match unsigned type");

  ceph_assert(max <= OSD_RECOVERY_PRIORITY_MAX);

  // User can't set this too high anymore, but might be a legacy value
  if (pool_recovery_priority > OSD_POOL_PRIORITY_MAX)
    pool_recovery_priority = OSD_POOL_PRIORITY_MAX;
  if (pool_recovery_priority < OSD_POOL_PRIORITY_MIN)
    pool_recovery_priority = OSD_POOL_PRIORITY_MIN;
  // Shift range from min..max to 0..(max - min)
  pool_recovery_priority += (0 - OSD_POOL_PRIORITY_MIN);
  ceph_assert(pool_recovery_priority >= 0 && pool_recovery_priority <= (OSD_POOL_PRIORITY_MAX - OSD_POOL_PRIORITY_MIN));

  priority += pool_recovery_priority;

  // Clamp to valid range
  if (priority > max) {
    return max;
  } else if (priority < OSD_RECOVERY_PRIORITY_MIN) {
    return OSD_RECOVERY_PRIORITY_MIN;
  } else {
    return priority;
  }
}

unsigned PeeringState::get_recovery_priority()
{
  // a higher value -> a higher priority
  int ret = OSD_RECOVERY_PRIORITY_BASE;
  int base = ret;

  if (state & PG_STATE_FORCED_RECOVERY) {
    ret = OSD_RECOVERY_PRIORITY_FORCED;
  } else {
    // XXX: This priority boost isn't so much about inactive, but about data-at-risk
    if (is_degraded() && info.stats.avail_no_missing.size() < pool.info.min_size) {
      base = OSD_RECOVERY_INACTIVE_PRIORITY_BASE;
      // inactive: no. of replicas < min_size, highest priority since it blocks IO
      ret = base + (pool.info.min_size - info.stats.avail_no_missing.size());
    }

    int64_t pool_recovery_priority = 0;
    pool.info.opts.get(pool_opts_t::RECOVERY_PRIORITY, &pool_recovery_priority);

    ret = clamp_recovery_priority(ret, pool_recovery_priority, max_prio_map[base]);
  }
  psdout(20) << __func__ << " recovery priority is " << ret << dendl;
  return static_cast<unsigned>(ret);
}

unsigned PeeringState::get_backfill_priority()
{
  // a higher value -> a higher priority
  int ret = OSD_BACKFILL_PRIORITY_BASE;
  int base = ret;

  if (state & PG_STATE_FORCED_BACKFILL) {
    ret = OSD_BACKFILL_PRIORITY_FORCED;
  } else {
    if (actingset.size() < pool.info.min_size) {
      base = OSD_BACKFILL_INACTIVE_PRIORITY_BASE;
      // inactive: no. of replicas < min_size, highest priority since it blocks IO
      ret = base + (pool.info.min_size - actingset.size());

    } else if (is_undersized()) {
      // undersized: OSD_BACKFILL_DEGRADED_PRIORITY_BASE + num missing replicas
      ceph_assert(pool.info.size > actingset.size());
      base = OSD_BACKFILL_DEGRADED_PRIORITY_BASE;
      ret = base + (pool.info.size - actingset.size());

    } else if (is_degraded()) {
      // degraded: baseline degraded
      base = ret = OSD_BACKFILL_DEGRADED_PRIORITY_BASE;
    }

    // Adjust with pool's recovery priority
    int64_t pool_recovery_priority = 0;
    pool.info.opts.get(pool_opts_t::RECOVERY_PRIORITY, &pool_recovery_priority);

    ret = clamp_recovery_priority(ret, pool_recovery_priority, max_prio_map[base]);
  }

  psdout(20) << __func__ << " backfill priority is " << ret << dendl;
  return static_cast<unsigned>(ret);
}

unsigned PeeringState::get_delete_priority()
{
  auto state = get_osdmap()->get_state(pg_whoami.osd);
  if (state & (CEPH_OSD_BACKFILLFULL |
               CEPH_OSD_FULL)) {
    return OSD_DELETE_PRIORITY_FULL;
  } else if (state & CEPH_OSD_NEARFULL) {
    return OSD_DELETE_PRIORITY_FULLISH;
  } else {
    return OSD_DELETE_PRIORITY_NORMAL;
  }
}

bool PeeringState::set_force_recovery(bool b)
{
  bool did = false;
  if (b) {
    if (!(state & PG_STATE_FORCED_RECOVERY) &&
        (state & (PG_STATE_DEGRADED |
                  PG_STATE_RECOVERY_WAIT |
                  PG_STATE_RECOVERING))) {
      psdout(20) << __func__ << " set" << dendl;
      state_set(PG_STATE_FORCED_RECOVERY);
      pl->publish_stats_to_osd();
      did = true;
    }
  } else if (state & PG_STATE_FORCED_RECOVERY) {
    psdout(20) << __func__ << " clear" << dendl;
    state_clear(PG_STATE_FORCED_RECOVERY);
    pl->publish_stats_to_osd();
    did = true;
  }
  if (did) {
    psdout(20) << __func__ << " state " << get_current_state()
               << dendl;
    pl->update_local_background_io_priority(get_recovery_priority());
  }
  return did;
}

bool PeeringState::set_force_backfill(bool b)
{
  bool did = false;
  if (b) {
    if (!(state & PG_STATE_FORCED_BACKFILL) &&
        (state & (PG_STATE_DEGRADED |
                  PG_STATE_BACKFILL_WAIT |
                  PG_STATE_BACKFILLING))) {
      psdout(10) << __func__ << " set" << dendl;
      state_set(PG_STATE_FORCED_BACKFILL);
      pl->publish_stats_to_osd();
      did = true;
    }
  } else if (state & PG_STATE_FORCED_BACKFILL) {
    psdout(10) << __func__ << " clear" << dendl;
    state_clear(PG_STATE_FORCED_BACKFILL);
    pl->publish_stats_to_osd();
    did = true;
  }
  if (did) {
    psdout(20) << __func__ << " state " << get_current_state()
               << dendl;
    pl->update_local_background_io_priority(get_backfill_priority());
  }
  return did;
}

void PeeringState::schedule_renew_lease()
{
  pl->schedule_renew_lease(
    last_peering_reset,
    readable_interval / 2);
}

void PeeringState::send_lease()
{
  epoch_t epoch = pl->get_osdmap_epoch();
  for (auto peer : actingset) {
    if (peer == pg_whoami) {
      continue;
    }
    pl->send_cluster_message(
      peer.osd,
      make_message<MOSDPGLease>(epoch,
                                spg_t(spgid.pgid, peer.shard),
                                get_lease()),
      epoch);
  }
}

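// Lease handling on a replica.  The primary's lease carries times from the
// primary's monotonic clock, so they are translated using the heartbeat
// clock-delta estimates: readable_until is discounted by the delta upper
// bound (conservative: never claim readability for longer than the primary
// granted), while readable_until_ub uses the lower bound, falling back to
// mnow + interval when no estimate is available.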
void PeeringState::proc_lease(const pg_lease_t& l)
{
  if (!HAVE_FEATURE(upacting_features, SERVER_OCTOPUS)) {
    psdout(20) << __func__ << " no-op, upacting_features 0x" << std::hex
               << upacting_features << std::dec
               << " does not include SERVER_OCTOPUS" << dendl;
    return;
  }
  if (!is_nonprimary()) {
    psdout(20) << __func__ << " no-op, !nonprimary" << dendl;
    return;
  }
  psdout(10) << __func__ << " " << l << dendl;
  if (l.readable_until_ub > readable_until_ub_from_primary) {
    readable_until_ub_from_primary = l.readable_until_ub;
  }

  ceph::signedspan ru = ceph::signedspan::zero();
  if (l.readable_until != ceph::signedspan::zero() &&
      hb_stamps[0]->peer_clock_delta_ub) {
    ru = l.readable_until - *hb_stamps[0]->peer_clock_delta_ub;
    psdout(20) << " peer_clock_delta_ub " << *hb_stamps[0]->peer_clock_delta_ub
               << " -> ru " << ru << dendl;
  }
  if (ru > readable_until) {
    readable_until = ru;
    psdout(20) << __func__ << " readable_until now " << readable_until << dendl;
    // NOTE: if we ever decide to block/queue ops on the replica,
    // we'll need to wake them up here.
  }

  ceph::signedspan ruub;
  if (hb_stamps[0]->peer_clock_delta_lb) {
    ruub = l.readable_until_ub - *hb_stamps[0]->peer_clock_delta_lb;
    psdout(20) << " peer_clock_delta_lb " << *hb_stamps[0]->peer_clock_delta_lb
               << " -> ruub " << ruub << dendl;
  } else {
    ruub = pl->get_mnow() + l.interval;
    psdout(20) << " no peer_clock_delta_lb -> ruub " << ruub << dendl;
  }
  if (ruub > readable_until_ub) {
    readable_until_ub = ruub;
    psdout(20) << __func__ << " readable_until_ub now " << readable_until_ub
               << dendl;
  }
}

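// On the primary, a lease ack records the acked upper bound for that acting
// shard.  If the acking shard was the one pinning the PG-wide
// readable_until, it is recalculated, and recheck_readable() runs if the old
// value had not yet expired.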
void PeeringState::proc_lease_ack(int from, const pg_lease_ack_t& a)
{
  if (!HAVE_FEATURE(upacting_features, SERVER_OCTOPUS)) {
    return;
  }
  auto now = pl->get_mnow();
  bool was_min = false;
  for (unsigned i = 0; i < acting.size(); ++i) {
    if (from == acting[i]) {
      // the lease_ack value is based on the primary's clock
      if (a.readable_until_ub > acting_readable_until_ub[i]) {
        if (acting_readable_until_ub[i] == readable_until) {
          was_min = true;
        }
        acting_readable_until_ub[i] = a.readable_until_ub;
        break;
      }
    }
  }
  if (was_min) {
    auto old_ru = readable_until;
    recalc_readable_until();
    if (now < old_ru) {
      pl->recheck_readable();
    }
  }
}

void PeeringState::proc_renew_lease()
{
  if (!HAVE_FEATURE(upacting_features, SERVER_OCTOPUS)) {
    return;
  }
  renew_lease(pl->get_mnow());
  send_lease();
  schedule_renew_lease();
}

void PeeringState::recalc_readable_until()
{
  assert(is_primary());
  ceph::signedspan min = readable_until_ub_sent;
  for (unsigned i = 0; i < acting.size(); ++i) {
    if (acting[i] == pg_whoami.osd || acting[i] == CRUSH_ITEM_NONE) {
      continue;
    }
    dout(20) << __func__ << " peer osd." << acting[i]
             << " ruub " << acting_readable_until_ub[i] << dendl;
    if (acting_readable_until_ub[i] < min) {
      min = acting_readable_until_ub[i];
    }
  }
  readable_until = min;
  readable_until_ub = min;
  dout(20) << __func__ << " readable_until[_ub] " << readable_until
           << " (sent " << readable_until_ub_sent << ")" << dendl;
}

bool PeeringState::check_prior_readable_down_osds(const OSDMapRef& map)
{
  if (!HAVE_FEATURE(upacting_features, SERVER_OCTOPUS)) {
    return false;
  }
  bool changed = false;
  auto p = prior_readable_down_osds.begin();
  while (p != prior_readable_down_osds.end()) {
    if (map->is_dead(*p)) {
      dout(10) << __func__ << " prior_readable_down_osds osd." << *p
               << " is dead as of epoch " << map->get_epoch()
               << dendl;
      p = prior_readable_down_osds.erase(p);
      changed = true;
    } else {
      ++p;
    }
  }
  if (changed && prior_readable_down_osds.empty()) {
    psdout(10) << " empty prior_readable_down_osds, clearing ub" << dendl;
    clear_prior_readable_until_ub();
    return true;
  }
  return false;
}

bool PeeringState::adjust_need_up_thru(const OSDMapRef osdmap)
{
  epoch_t up_thru = osdmap->get_up_thru(pg_whoami.osd);
  if (need_up_thru &&
      up_thru >= info.history.same_interval_since) {
    psdout(10) << "adjust_need_up_thru now "
               << up_thru << ", need_up_thru now false" << dendl;
    need_up_thru = false;
    return true;
  }
  return false;
}

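// build_prior derives the set of OSDs to probe from past_intervals.  The
// classification lambda maps each prior OSD to UP, DNE (no longer in the
// map), LOST (marked lost after the interval began), or DOWN; a pg_down
// result marks the PG down, and an up_thru older than the current interval
// means the monitor must be notified before peering can complete.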
PastIntervals::PriorSet PeeringState::build_prior()
{
  if (1) {
    // sanity check
    for (auto it = peer_info.begin(); it != peer_info.end(); ++it) {
      ceph_assert(info.history.last_epoch_started >=
                  it->second.history.last_epoch_started);
    }
  }

  const OSDMap &osdmap = *get_osdmap();
  PastIntervals::PriorSet prior = past_intervals.get_prior_set(
    pool.info.is_erasure(),
    info.history.last_epoch_started,
    &missing_loc.get_recoverable_predicate(),
    [&](epoch_t start, int osd, epoch_t *lost_at) {
      const osd_info_t *pinfo = 0;
      if (osdmap.exists(osd)) {
        pinfo = &osdmap.get_info(osd);
        if (lost_at)
          *lost_at = pinfo->lost_at;
      }

      if (osdmap.is_up(osd)) {
        return PastIntervals::UP;
      } else if (!pinfo) {
        return PastIntervals::DNE;
      } else if (pinfo->lost_at > start) {
        return PastIntervals::LOST;
      } else {
        return PastIntervals::DOWN;
      }
    },
    up,
    acting,
    dpp);

  if (prior.pg_down) {
    state_set(PG_STATE_DOWN);
  }

  if (get_osdmap()->get_up_thru(pg_whoami.osd) <
      info.history.same_interval_since) {
    psdout(10) << "up_thru " << get_osdmap()->get_up_thru(pg_whoami.osd)
               << " < same_since " << info.history.same_interval_since
               << ", must notify monitor" << dendl;
    need_up_thru = true;
  } else {
    psdout(10) << "up_thru " << get_osdmap()->get_up_thru(pg_whoami.osd)
               << " >= same_since " << info.history.same_interval_since
               << ", all is well" << dendl;
    need_up_thru = false;
  }
  pl->set_probe_targets(prior.probe);
  return prior;
}

bool PeeringState::needs_recovery() const
{
  ceph_assert(is_primary());

  auto &missing = pg_log.get_missing();

  if (missing.num_missing()) {
    psdout(10) << __func__ << " primary has " << missing.num_missing()
               << " missing" << dendl;
    return true;
  }

  ceph_assert(!acting_recovery_backfill.empty());
  for (const pg_shard_t& peer : acting_recovery_backfill) {
    if (peer == get_primary()) {
      continue;
    }
    auto pm = peer_missing.find(peer);
    if (pm == peer_missing.end()) {
      psdout(10) << __func__ << " osd." << peer << " doesn't have missing set"
                 << dendl;
      continue;
    }
    if (pm->second.num_missing()) {
      psdout(10) << __func__ << " osd." << peer << " has "
                 << pm->second.num_missing() << " missing" << dendl;
      return true;
    }
  }

  psdout(10) << __func__ << " is recovered" << dendl;
  return false;
}

bool PeeringState::needs_backfill() const
{
  ceph_assert(is_primary());

  // We can assume that the only OSDs that might need backfill
  // are those in the backfill_targets vector.
  for (const pg_shard_t& peer : backfill_targets) {
    auto pi = peer_info.find(peer);
    ceph_assert(pi != peer_info.end());
    if (!pi->second.last_backfill.is_max()) {
      psdout(10) << __func__ << " osd." << peer
                 << " has last_backfill " << pi->second.last_backfill << dendl;
      return true;
    }
  }

  psdout(10) << __func__ << " does not need backfill" << dendl;
  return false;
}

/*
 * Returns true unless there is a non-lost OSD in might_have_unfound.
 */
bool PeeringState::all_unfound_are_queried_or_lost(
  const OSDMapRef osdmap) const
{
  ceph_assert(is_primary());

  auto peer = might_have_unfound.begin();
  auto mend = might_have_unfound.end();
  for (; peer != mend; ++peer) {
    if (peer_missing.count(*peer))
      continue;
    auto iter = peer_info.find(*peer);
    if (iter != peer_info.end() &&
        (iter->second.is_empty() || iter->second.dne()))
      continue;
    if (!osdmap->exists(peer->osd))
      continue;
    const osd_info_t &osd_info(osdmap->get_info(peer->osd));
    if (osd_info.lost_at <= osd_info.up_from) {
      // If there is even one OSD in might_have_unfound that isn't lost, we
      // still might retrieve our unfound.
      return false;
    }
  }
  psdout(10) << "all_unfound_are_queried_or_lost all of might_have_unfound "
             << might_have_unfound
             << " have been queried or are marked lost" << dendl;
  return true;
}


void PeeringState::reject_reservation()
{
  pl->unreserve_recovery_space();
  pl->send_cluster_message(
    primary.osd,
    make_message<MBackfillReserve>(
      MBackfillReserve::REJECT_TOOFULL,
      spg_t(info.pgid.pgid, primary.shard),
      get_osdmap_epoch()),
    get_osdmap_epoch());
}

/**
 * find_best_info
 *
 * Returns an iterator to the best info in infos sorted by:
 *  1) Prefer newer last_update
 *  2) Prefer longer tail if it brings another info into contiguity
 *  3) Prefer current primary
 */
map<pg_shard_t, pg_info_t>::const_iterator PeeringState::find_best_info(
  const map<pg_shard_t, pg_info_t> &infos,
  bool restrict_to_up_acting,
  bool *history_les_bound) const
{
  ceph_assert(history_les_bound);
  /* See doc/dev/osd_internals/last_epoch_started.rst before attempting
   * to make changes to this process.  Also, make sure to update it
   * when you find bugs! */
  epoch_t max_last_epoch_started_found = 0;
  for (auto i = infos.begin(); i != infos.end(); ++i) {
    if (!cct->_conf->osd_find_best_info_ignore_history_les &&
        max_last_epoch_started_found < i->second.history.last_epoch_started) {
      *history_les_bound = true;
      max_last_epoch_started_found = i->second.history.last_epoch_started;
    }
    if (!i->second.is_incomplete() &&
        max_last_epoch_started_found < i->second.last_epoch_started) {
      *history_les_bound = false;
      max_last_epoch_started_found = i->second.last_epoch_started;
    }
  }
  eversion_t min_last_update_acceptable = eversion_t::max();
  for (auto i = infos.begin(); i != infos.end(); ++i) {
    if (max_last_epoch_started_found <= i->second.last_epoch_started) {
      if (min_last_update_acceptable > i->second.last_update)
        min_last_update_acceptable = i->second.last_update;
    }
  }
  if (min_last_update_acceptable == eversion_t::max())
    return infos.end();

  auto best = infos.end();
  // find osd with newest last_update (oldest for ec_pool).
  // if there are multiples, prefer
  //  - a longer tail, if it brings another peer into log contiguity
  //  - the current primary
  for (auto p = infos.begin(); p != infos.end(); ++p) {
    if (restrict_to_up_acting && !is_up(p->first) &&
        !is_acting(p->first))
      continue;
    // Only consider peers with last_update >= min_last_update_acceptable
    if (p->second.last_update < min_last_update_acceptable)
      continue;
    // Disqualify anyone with a too old last_epoch_started
    if (p->second.last_epoch_started < max_last_epoch_started_found)
      continue;
    // Disqualify anyone who is incomplete (not fully backfilled)
    if (p->second.is_incomplete())
      continue;
    if (best == infos.end()) {
      best = p;
      continue;
    }
    // Prefer newer last_update
    if (pool.info.require_rollback()) {
      if (p->second.last_update > best->second.last_update)
        continue;
      if (p->second.last_update < best->second.last_update) {
        best = p;
        continue;
      }
    } else {
      if (p->second.last_update < best->second.last_update)
        continue;
      if (p->second.last_update > best->second.last_update) {
        best = p;
        continue;
      }
    }

    // Prefer longer tail
    if (p->second.log_tail > best->second.log_tail) {
      continue;
    } else if (p->second.log_tail < best->second.log_tail) {
      best = p;
      continue;
    }

    if (!p->second.has_missing() && best->second.has_missing()) {
      psdout(10) << __func__ << " prefer osd." << p->first
                 << " because it is complete while best has missing"
                 << dendl;
      best = p;
      continue;
    } else if (p->second.has_missing() && !best->second.has_missing()) {
      psdout(10) << __func__ << " skipping osd." << p->first
                 << " because it has missing while best is complete"
                 << dendl;
      continue;
    } else {
      // both are complete or have missing
      // fall through
    }

    // prefer current primary (usually the caller), all things being equal
    if (p->first == pg_whoami) {
      psdout(10) << "calc_acting prefer osd." << p->first
                 << " because it is current primary" << dendl;
      best = p;
      continue;
    }
  }
  return best;
}

void PeeringState::calc_ec_acting(
  map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
  unsigned size,
  const vector<int> &acting,
  const vector<int> &up,
  const map<pg_shard_t, pg_info_t> &all_info,
  bool restrict_to_up_acting,
  vector<int> *_want,
  set<pg_shard_t> *backfill,
  set<pg_shard_t> *acting_backfill,
  ostream &ss)
{
  vector<int> want(size, CRUSH_ITEM_NONE);
  map<shard_id_t, set<pg_shard_t> > all_info_by_shard;
  for (auto i = all_info.begin();
       i != all_info.end();
       ++i) {
    all_info_by_shard[i->first.shard].insert(i->first);
  }
  for (uint8_t i = 0; i < want.size(); ++i) {
    ss << "For position " << (unsigned)i << ": ";
    if (up.size() > (unsigned)i && up[i] != CRUSH_ITEM_NONE &&
        !all_info.find(pg_shard_t(up[i], shard_id_t(i)))->second.is_incomplete() &&
        all_info.find(pg_shard_t(up[i], shard_id_t(i)))->second.last_update >=
        auth_log_shard->second.log_tail) {
      ss << " selecting up[i]: " << pg_shard_t(up[i], shard_id_t(i)) << std::endl;
      want[i] = up[i];
      continue;
    }
    if (up.size() > (unsigned)i && up[i] != CRUSH_ITEM_NONE) {
      ss << " backfilling up[i]: " << pg_shard_t(up[i], shard_id_t(i))
         << " and ";
      backfill->insert(pg_shard_t(up[i], shard_id_t(i)));
    }

    if (acting.size() > (unsigned)i && acting[i] != CRUSH_ITEM_NONE &&
        !all_info.find(pg_shard_t(acting[i], shard_id_t(i)))->second.is_incomplete() &&
        all_info.find(pg_shard_t(acting[i], shard_id_t(i)))->second.last_update >=
        auth_log_shard->second.log_tail) {
      ss << " selecting acting[i]: " << pg_shard_t(acting[i], shard_id_t(i)) << std::endl;
      want[i] = acting[i];
    } else if (!restrict_to_up_acting) {
      for (auto j = all_info_by_shard[shard_id_t(i)].begin();
           j != all_info_by_shard[shard_id_t(i)].end();
           ++j) {
        ceph_assert(j->shard == i);
        if (!all_info.find(*j)->second.is_incomplete() &&
            all_info.find(*j)->second.last_update >=
            auth_log_shard->second.log_tail) {
          ss << " selecting stray: " << *j << std::endl;
          want[i] = j->osd;
          break;
        }
      }
      if (want[i] == CRUSH_ITEM_NONE)
        ss << " failed to fill position " << (int)i << std::endl;
    }
  }

  for (uint8_t i = 0; i < want.size(); ++i) {
    if (want[i] != CRUSH_ITEM_NONE) {
      acting_backfill->insert(pg_shard_t(want[i], shard_id_t(i)));
    }
  }
  acting_backfill->insert(backfill->begin(), backfill->end());
  _want->swap(want);
}

std::pair<map<pg_shard_t, pg_info_t>::const_iterator, eversion_t>
PeeringState::select_replicated_primary(
  map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
  uint64_t force_auth_primary_missing_objects,
  const std::vector<int> &up,
  pg_shard_t up_primary,
  const map<pg_shard_t, pg_info_t> &all_info,
  const OSDMapRef osdmap,
  ostream &ss)
{
  pg_shard_t auth_log_shard_id = auth_log_shard->first;

  ss << __func__ << " newest update on osd." << auth_log_shard_id
     << " with " << auth_log_shard->second << std::endl;

  // select primary
  auto primary = all_info.find(up_primary);
  if (up.size() &&
      !primary->second.is_incomplete() &&
      primary->second.last_update >=
        auth_log_shard->second.log_tail) {
    if (HAVE_FEATURE(osdmap->get_up_osd_features(), SERVER_NAUTILUS)) {
      auto approx_missing_objects =
        primary->second.stats.stats.sum.num_objects_missing;
      auto auth_version = auth_log_shard->second.last_update.version;
      auto primary_version = primary->second.last_update.version;
      if (auth_version > primary_version) {
        approx_missing_objects += auth_version - primary_version;
      } else {
        approx_missing_objects += primary_version - auth_version;
      }
      if ((uint64_t)approx_missing_objects >
          force_auth_primary_missing_objects) {
        primary = auth_log_shard;
        ss << "up_primary " << up_primary << " has approximately "
           << approx_missing_objects
           << " (>" << force_auth_primary_missing_objects << ") "
           << "missing objects, osd." << auth_log_shard_id
           << " selected as primary instead"
           << std::endl;
      } else {
        ss << "up_primary " << up_primary << " selected as primary"
           << std::endl;
      }
    } else {
      ss << "up_primary " << up_primary << " selected as primary" << std::endl;
    }
  } else {
    ceph_assert(!auth_log_shard->second.is_incomplete());
    ss << "up[0] needs backfill, osd." << auth_log_shard_id
       << " selected as primary instead" << std::endl;
    primary = auth_log_shard;
  }

  ss << __func__ << " primary is osd." << primary->first
     << " with " << primary->second << std::endl;

  /* We include auth_log_shard->second.log_tail because in GetLog,
   * we will request logs back to the min last_update over our
   * acting_backfill set, which will result in our log being extended
   * as far backwards as necessary to pick up any peers which can
   * be log recovered by auth_log_shard's log */
  eversion_t oldest_auth_log_entry =
    std::min(primary->second.log_tail, auth_log_shard->second.log_tail);

  return std::make_pair(primary, oldest_auth_log_entry);
}
1733
1734
1735 /**
1736 * calculate the desired acting set.
1737 *
1738 * Choose an appropriate acting set. Prefer up[0], unless it is
1739 * incomplete, or another osd has a longer tail that allows us to
1740 * bring other up nodes up to date.
1741 */
1742 void PeeringState::calc_replicated_acting(
1743 map<pg_shard_t, pg_info_t>::const_iterator primary,
1744 eversion_t oldest_auth_log_entry,
1745 unsigned size,
1746 const vector<int> &acting,
1747 const vector<int> &up,
1748 pg_shard_t up_primary,
1749 const map<pg_shard_t, pg_info_t> &all_info,
1750 bool restrict_to_up_acting,
1751 vector<int> *want,
1752 set<pg_shard_t> *backfill,
1753 set<pg_shard_t> *acting_backfill,
1754 const OSDMapRef osdmap,
1755 const PGPool& pool,
1756 ostream &ss)
1757 {
1758 ss << __func__ << (restrict_to_up_acting ? " restrict_to_up_acting" : "")
1759 << std::endl;
1760
1761 want->push_back(primary->first.osd);
1762 acting_backfill->insert(primary->first);
1763
1764 // select replicas that have log contiguity with primary.
1765 // prefer up, then acting, then any peer_info osds
1766 for (auto i : up) {
1767 pg_shard_t up_cand = pg_shard_t(i, shard_id_t::NO_SHARD);
1768 if (up_cand == primary->first)
1769 continue;
1770 const pg_info_t &cur_info = all_info.find(up_cand)->second;
1771 if (cur_info.is_incomplete() ||
1772 cur_info.last_update < oldest_auth_log_entry) {
1773 ss << " shard " << up_cand << " (up) backfill " << cur_info << std::endl;
1774 backfill->insert(up_cand);
1775 acting_backfill->insert(up_cand);
1776 } else {
1777 want->push_back(i);
1778 acting_backfill->insert(up_cand);
1779 ss << " osd." << i << " (up) accepted " << cur_info << std::endl;
1780 }
1781 }
1782
1783 if (want->size() >= size) {
1784 return;
1785 }
1786
1787 std::vector<std::pair<eversion_t, int>> candidate_by_last_update;
1788 candidate_by_last_update.reserve(acting.size());
1789 // This no longer has backfill OSDs, but they are covered above.
1790 for (auto i : acting) {
1791 pg_shard_t acting_cand(i, shard_id_t::NO_SHARD);
1792 // skip up osds we already considered above
1793 if (acting_cand == primary->first)
1794 continue;
1795 auto up_it = find(up.begin(), up.end(), i);
1796 if (up_it != up.end())
1797 continue;
1798
1799 const pg_info_t &cur_info = all_info.find(acting_cand)->second;
1800 if (cur_info.is_incomplete() ||
1801 cur_info.last_update < oldest_auth_log_entry) {
1802 ss << " shard " << acting_cand << " (acting) REJECTED "
1803 << cur_info << std::endl;
1804 } else {
1805 candidate_by_last_update.emplace_back(cur_info.last_update, i);
1806 }
1807 }
1808
1809 auto sort_by_eversion =[](const std::pair<eversion_t, int> &lhs,
1810 const std::pair<eversion_t, int> &rhs) {
1811 return lhs.first > rhs.first;
1812 };
1813 // sort by last_update, in descending order.
1814 std::sort(candidate_by_last_update.begin(),
1815 candidate_by_last_update.end(), sort_by_eversion);
1816 for (auto &p: candidate_by_last_update) {
1817 ceph_assert(want->size() < size);
1818 want->push_back(p.second);
1819 pg_shard_t s = pg_shard_t(p.second, shard_id_t::NO_SHARD);
1820 acting_backfill->insert(s);
1821 ss << " shard " << s << " (acting) accepted "
1822 << all_info.find(s)->second << std::endl;
1823 if (want->size() >= size) {
1824 return;
1825 }
1826 }
1827
1828 if (restrict_to_up_acting) {
1829 return;
1830 }
1831 candidate_by_last_update.clear();
1832 candidate_by_last_update.reserve(all_info.size()); // overestimate but fine
1833 // continue to search stray to find more suitable peers
1834 for (auto &i : all_info) {
1835 // skip up osds we already considered above
1836 if (i.first == primary->first)
1837 continue;
1838 auto up_it = find(up.begin(), up.end(), i.first.osd);
1839 if (up_it != up.end())
1840 continue;
1841 auto acting_it = find(
1842 acting.begin(), acting.end(), i.first.osd);
1843 if (acting_it != acting.end())
1844 continue;
1845
1846 if (i.second.is_incomplete() ||
1847 i.second.last_update < oldest_auth_log_entry) {
1848 ss << " shard " << i.first << " (stray) REJECTED " << i.second
1849 << std::endl;
1850 } else {
1851 candidate_by_last_update.emplace_back(
1852 i.second.last_update, i.first.osd);
1853 }
1854 }
1855
1856 if (candidate_by_last_update.empty()) {
1857 // save us some effort
1858 return;
1859 }
1860
1861 // sort by last_update, in descending order.
1862 std::sort(candidate_by_last_update.begin(),
1863 candidate_by_last_update.end(), sort_by_eversion);
1864
1865 for (auto &p: candidate_by_last_update) {
1866 ceph_assert(want->size() < size);
1867 want->push_back(p.second);
1868 pg_shard_t s = pg_shard_t(p.second, shard_id_t::NO_SHARD);
1869 acting_backfill->insert(s);
1870 ss << " shard " << s << " (stray) accepted "
1871 << all_info.find(s)->second << std::endl;
1872 if (want->size() >= size) {
1873 return;
1874 }
1875 }
1876 }
1877
1878 // Defines osd preference order: acting set, then larger last_update
1879 using osd_ord_t = std::tuple<bool, eversion_t>; // <acting, last_update>
1880 using osd_id_t = int;
1881
1882 class bucket_candidates_t {
1883 std::deque<std::pair<osd_ord_t, osd_id_t>> osds;
1884 int selected = 0;
1885
1886 public:
1887 void add_osd(osd_ord_t ord, osd_id_t osd) {
1888 // osds will be added in smallest to largest order
1889 ceph_assert(osds.empty() || osds.back().first <= ord);
1890 osds.push_back(std::make_pair(ord, osd));
1891 }
1892 osd_id_t pop_osd() {
1893 ceph_assert(!is_empty());
1894 auto ret = osds.back();
1895 osds.pop_back();
1896 return ret.second;
1897 }
1898
1899 void inc_selected() { selected++; }
1900 unsigned get_num_selected() const { return selected; }
1901
1902 osd_ord_t get_ord() const {
1903 return osds.empty() ? std::make_tuple(false, eversion_t())
1904 : osds.back().first;
1905 }
1906
1907 bool is_empty() const { return osds.empty(); }
1908
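// Heap-preference sketch: a bucket with fewer osds already selected
// compares greater (so a max-heap pops it first); ties fall back to
// get_ord(), the ordering token of the osd pop_osd() would yield next.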
1909 bool operator<(const bucket_candidates_t &rhs) const {
1910 return std::make_tuple(-selected, get_ord()) <
1911 std::make_tuple(-rhs.selected, rhs.get_ord());
1912 }
1913
1914 friend std::ostream &operator<<(std::ostream &, const bucket_candidates_t &);
1915 };
1916
1917 std::ostream &operator<<(std::ostream &lhs, const bucket_candidates_t &cand)
1918 {
1919 return lhs << "candidates[" << cand.osds << "]";
1920 }
1921
1922 class bucket_heap_t {
1923 using elem_t = std::reference_wrapper<bucket_candidates_t>;
1924 std::vector<elem_t> heap;
1925
1926 // Max heap -- should emit buckets in order of preference
1927 struct comp {
1928 bool operator()(const elem_t &lhs, const elem_t &rhs) {
1929 return lhs.get() < rhs.get();
1930 }
1931 };
1932 public:
1933 void push_if_nonempty(elem_t e) {
1934 if (!e.get().is_empty()) {
1935 heap.push_back(e);
1936 std::push_heap(heap.begin(), heap.end(), comp());
1937 }
1938 }
1939 elem_t pop() {
1940 std::pop_heap(heap.begin(), heap.end(), comp());
1941 auto ret = heap.back();
1942 heap.pop_back();
1943 return ret;
1944 }
1945
1946 bool is_empty() const { return heap.empty(); }
1947 };
1948
1949 /**
1950 * calc_replicated_acting_stretch
1951 *
1952 * Choose an acting set using as much of the up set as possible; filling
1953 * in the remaining slots so as to maximize the number of crush buckets at
1954 * level pool.info.peering_crush_bucket_barrier represented.
1955 *
1956 * Stretch clusters are a bit special: while they have a "size" the
1957 * same way as normal pools, if we happen to lose a data center
1958 * (we call it a "stretch bucket", but really it'll be a data center or
1959 * a cloud availability zone), we don't actually want to shove
1960 * 2 DC's worth of replication into a single site -- it won't fit!
1961 * So we locally calculate a bucket_max, based
1962 * on the targeted number of stretch buckets for the pool and
1963 * its size. Then we won't pull more than bucket_max from any
1964 * given ancestor even if it leaves us undersized.
1965 *
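* A hypothetical example: size=4 spread across two DCs gives
* bucket_max=2, so if one DC goes dark we keep the surviving DC's two
* replicas and stay undersized rather than packing all four copies into
* one site.
*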
1966 * There are two distinct phases: (commented below)
1967 */
1968 void PeeringState::calc_replicated_acting_stretch(
1969 map<pg_shard_t, pg_info_t>::const_iterator primary,
1970 eversion_t oldest_auth_log_entry,
1971 unsigned size,
1972 const vector<int> &acting,
1973 const vector<int> &up,
1974 pg_shard_t up_primary,
1975 const map<pg_shard_t, pg_info_t> &all_info,
1976 bool restrict_to_up_acting,
1977 vector<int> *want,
1978 set<pg_shard_t> *backfill,
1979 set<pg_shard_t> *acting_backfill,
1980 const OSDMapRef osdmap,
1981 const PGPool& pool,
1982 ostream &ss)
1983 {
1984 ceph_assert(want);
1985 ceph_assert(acting_backfill);
1986 ceph_assert(backfill);
1987 ss << __func__ << (restrict_to_up_acting ? " restrict_to_up_acting" : "")
1988 << std::endl;
1989
1990 auto used = [want](int osd) {
1991 return std::find(want->begin(), want->end(), osd) != want->end();
1992 };
1993
1994 auto usable_info = [&](const auto &cur_info) mutable {
1995 return !(cur_info.is_incomplete() ||
1996 cur_info.last_update < oldest_auth_log_entry);
1997 };
1998
1999 auto osd_info = [&](int osd) mutable -> const pg_info_t & {
2000 pg_shard_t cand = pg_shard_t(osd, shard_id_t::NO_SHARD);
2001 const pg_info_t &cur_info = all_info.find(cand)->second;
2002 return cur_info;
2003 };
2004
2005 auto usable_osd = [&](int osd) mutable {
2006 return usable_info(osd_info(osd));
2007 };
2008
2009 std::map<int, bucket_candidates_t> ancestors;
2010 auto get_ancestor = [&](int osd) mutable {
2011 int ancestor = osdmap->crush->get_parent_of_type(
2012 osd,
2013 pool.info.peering_crush_bucket_barrier,
2014 pool.info.crush_rule);
2015 return &ancestors[ancestor];
2016 };
2017
2018 unsigned bucket_max = pool.info.size / pool.info.peering_crush_bucket_target;
2019 if (bucket_max * pool.info.peering_crush_bucket_target < pool.info.size) {
2020 ++bucket_max;
2021 }
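// e.g. (hypothetical values): size=4 with peering_crush_bucket_target=3
// gives bucket_max = ceil(4/3) = 2, so no single barrier-level bucket is
// asked to hold more than two of the four replicas.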
2022
2023 /* 1) Select all usable osds from the up set as well as the primary
2024 *
2025 * We also stash any unusable osds from up into backfill.
2026 */
2027 auto add_required = [&](int osd) {
2028 if (!used(osd)) {
2029 want->push_back(osd);
2030 acting_backfill->insert(
2031 pg_shard_t(osd, shard_id_t::NO_SHARD));
2032 get_ancestor(osd)->inc_selected();
2033 }
2034 };
2035 add_required(primary->first.osd);
2036 ss << " osd " << primary->first.osd << " primary accepted "
2037 << osd_info(primary->first.osd) << std::endl;
2038 for (auto upcand: up) {
2039 auto upshard = pg_shard_t(upcand, shard_id_t::NO_SHARD);
2040 auto &curinfo = osd_info(upcand);
2041 if (usable_osd(upcand)) {
2042 ss << " osd " << upcand << " (up) accepted " << curinfo << std::endl;
2043 add_required(upcand);
2044 } else {
2045 ss << " osd " << upcand << " (up) backfill " << curinfo << std::endl;
2046 backfill->insert(upshard);
2047 acting_backfill->insert(upshard);
2048 }
2049 }
2050
2051 if (want->size() >= pool.info.size) { // non-failed CRUSH mappings are valid
2052 ss << " up set sufficient" << std::endl;
2053 return;
2054 }
2055 ss << " up set insufficient, considering remaining osds" << std::endl;
2056
2057 /* 2) Fill out remaining slots from usable osds in all_info
2058 * while maximizing the number of ancestor nodes at the
2059 * barrier_id crush level.
2060 */
2061 {
2062 std::vector<std::pair<osd_ord_t, osd_id_t>> candidates;
2063 /* To do this, we first filter the candidate osds into an ordered
2064 * list of usable osds
2065 */
2066 auto get_osd_ord = [&](bool is_acting, const pg_info_t &info) -> osd_ord_t {
2067 return std::make_tuple(
2068 !is_acting /* acting should sort first */,
2069 info.last_update);
2070 };
2071 for (auto &cand : acting) {
2072 auto &cand_info = osd_info(cand);
2073 if (!used(cand) && usable_info(cand_info)) {
2074 ss << " acting candidate " << cand << " " << cand_info << std::endl;
2075 candidates.push_back(std::make_pair(get_osd_ord(true, cand_info), cand));
2076 }
2077 }
2078 if (!restrict_to_up_acting) {
2079 for (auto &[cand, info] : all_info) {
2080 if (!used(cand.osd) && usable_info(info) &&
2081 (std::find(acting.begin(), acting.end(), cand.osd)
2082 == acting.end())) {
2083 ss << " other candidate " << cand << " " << info << std::endl;
2084 candidates.push_back(
2085 std::make_pair(get_osd_ord(false, info), cand.osd));
2086 }
2087 }
2088 }
2089 std::sort(candidates.begin(), candidates.end());
2090
2091 // We then group these candidates by ancestor
2092 std::for_each(candidates.begin(), candidates.end(), [&](auto cand) {
2093 get_ancestor(cand.second)->add_osd(cand.first, cand.second);
2094 });
2095 }
2096
2097 auto pop_ancestor = [&](auto &ancestor) {
2098 ceph_assert(!ancestor.is_empty());
2099 auto osd = ancestor.pop_osd();
2100
2101 ss << " accepting candidate " << osd << std::endl;
2102
2103 ceph_assert(!used(osd));
2104 ceph_assert(usable_osd(osd));
2105
2106 want->push_back(osd);
2107 acting_backfill->insert(
2108 pg_shard_t(osd, shard_id_t::NO_SHARD));
2109 ancestor.inc_selected();
2110 };
2111
2112 /* Next, we use the ancestors map to grab a descendant of the
2113 * peering_crush_mandatory_member if not already represented.
2114 *
2115 * Note: the check below compares against CRUSH_ITEM_NONE, the
2116 * sentinel for "no mandatory member configured"; such pools skip
2117 * this step entirely.
2118 */
2119 if (pool.info.peering_crush_mandatory_member != CRUSH_ITEM_NONE) {
2120 auto aiter = ancestors.find(pool.info.peering_crush_mandatory_member);
2121 if (aiter != ancestors.end() &&
2122 !aiter->second.get_num_selected()) {
2123 ss << " adding required ancestor " << aiter->first << std::endl;
2124 ceph_assert(!aiter->second.is_empty()); // wouldn't exist otherwise
2125 pop_ancestor(aiter->second);
2126 }
2127 }
2128
2129 /* We then place the ancestors in a heap ordered by fewest selected
2130 * and then by the ordering token of the next osd */
2131 bucket_heap_t aheap;
2132 std::for_each(ancestors.begin(), ancestors.end(), [&](auto &anc) {
2133 aheap.push_if_nonempty(anc.second);
2134 });
2135
2136 /* and pull from this heap until it's empty or we have enough.
2137 * "We have enough" is a sufficient check here for
2138 * stretch_set_can_peer() because our heap sorting always
2139 * pulls from ancestors with the least number of included OSDs,
2140 * so if it is possible to satisfy the bucket_count constraints we
2141 * will do so.
2142 */
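/* A hypothetical round: buckets A{osd.1,osd.2} and B{osd.3} with two
 * slots left pop as A then B rather than A twice, since a bucket's
 * priority drops each time one of its osds is selected. */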
2143 while (!aheap.is_empty() && want->size() < pool.info.size) {
2144 auto next = aheap.pop();
2145 pop_ancestor(next.get());
2146 if (next.get().get_num_selected() < bucket_max) {
2147 aheap.push_if_nonempty(next);
2148 }
2149 }
2150
2151 /* The end result is that we should have as many buckets covered as
2152 * possible while respecting up, the primary selection,
2153 * the pool size (given bucket count constraints),
2154 * and the mandatory member.
2155 */
2156 }
2157
2158
2159 bool PeeringState::recoverable(const vector<int> &want) const
2160 {
2161 unsigned num_want_acting = 0;
2162 set<pg_shard_t> have;
2163 for (int i = 0; i < (int)want.size(); ++i) {
2164 if (want[i] != CRUSH_ITEM_NONE) {
2165 ++num_want_acting;
2166 have.insert(
2167 pg_shard_t(
2168 want[i],
2169 pool.info.is_erasure() ? shard_id_t(i) : shard_id_t::NO_SHARD));
2170 }
2171 }
2172
2173 if (num_want_acting < pool.info.min_size) {
2174 const bool recovery_ec_pool_below_min_size =
2175 HAVE_FEATURE(get_osdmap()->get_up_osd_features(), SERVER_OCTOPUS);
2176
2177 if (pool.info.is_erasure() && !recovery_ec_pool_below_min_size) {
2178 psdout(10) << __func__ << " failed, ec recovery below min size not supported by pre-octopus" << dendl;
2179 return false;
2180 } else if (!cct->_conf.get_val<bool>("osd_allow_recovery_below_min_size")) {
2181 psdout(10) << __func__ << " failed, recovery below min size not enabled" << dendl;
2182 return false;
2183 }
2184 }
2185 if (missing_loc.get_recoverable_predicate()(have)) {
2186 return true;
2187 } else {
2188 psdout(10) << __func__ << " failed, not recoverable " << dendl;
2189 return false;
2190 }
2191 }
2192
2193 void PeeringState::choose_async_recovery_ec(
2194 const map<pg_shard_t, pg_info_t> &all_info,
2195 const pg_info_t &auth_info,
2196 vector<int> *want,
2197 set<pg_shard_t> *async_recovery,
2198 const OSDMapRef osdmap) const
2199 {
2200 set<pair<int, pg_shard_t> > candidates_by_cost;
2201 for (uint8_t i = 0; i < want->size(); ++i) {
2202 if ((*want)[i] == CRUSH_ITEM_NONE)
2203 continue;
2204
2205 // Considering log entries to recover is accurate enough for
2206 // now. We could use minimum_to_decode_with_cost() later if
2207 // necessary.
2208 pg_shard_t shard_i((*want)[i], shard_id_t(i));
2209 // do not include strays
2210 if (stray_set.find(shard_i) != stray_set.end())
2211 continue;
2212 // Do not include an osd that is not up, since choosing it as
2213 // an async_recovery_target will move it out of the acting set.
2214 // This results in it being identified as a stray during peering,
2215 // because it is no longer in the up or acting set.
2216 if (!is_up(shard_i))
2217 continue;
2218 auto shard_info = all_info.find(shard_i)->second;
2219 // for ec pools we roll back all entries past the authoritative
2220 // last_update *before* activation. This is relatively inexpensive
2221 // compared to recovery, since it is purely local, so treat shards
2222 // past the authoritative last_update the same as those equal to it.
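// A hypothetical illustration: with an authoritative last_update of
// 500'200, a shard at 500'190 adds 10 to its cost, while a shard at
// 500'210 adds nothing beyond its missing-object count -- its extra
// entries are rolled back locally before activation.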
2223 version_t auth_version = auth_info.last_update.version;
2224 version_t candidate_version = shard_info.last_update.version;
2225 if (HAVE_FEATURE(osdmap->get_up_osd_features(), SERVER_NAUTILUS)) {
2226 auto approx_missing_objects =
2227 shard_info.stats.stats.sum.num_objects_missing;
2228 if (auth_version > candidate_version) {
2229 approx_missing_objects += auth_version - candidate_version;
2230 }
2231 if (static_cast<uint64_t>(approx_missing_objects) >
2232 cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) {
2233 candidates_by_cost.emplace(approx_missing_objects, shard_i);
2234 }
2235 } else {
2236 if (auth_version > candidate_version &&
2237 (auth_version - candidate_version) > cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) {
2238 candidates_by_cost.insert(make_pair(auth_version - candidate_version, shard_i));
2239 }
2240 }
2241 }
2242
2243 psdout(20) << __func__ << " candidates by cost are: " << candidates_by_cost
2244 << dendl;
2245
2246 // take out as many osds as we can for async recovery, in order of cost
2247 for (auto rit = candidates_by_cost.rbegin();
2248 rit != candidates_by_cost.rend(); ++rit) {
2249 pg_shard_t cur_shard = rit->second;
2250 vector<int> candidate_want(*want);
2251 candidate_want[cur_shard.shard.id] = CRUSH_ITEM_NONE;
2252 if (recoverable(candidate_want)) {
2253 want->swap(candidate_want);
2254 async_recovery->insert(cur_shard);
2255 }
2256 }
2257 psdout(20) << __func__ << " result want=" << *want
2258 << " async_recovery=" << *async_recovery << dendl;
2259 }
2260
2261 void PeeringState::choose_async_recovery_replicated(
2262 const map<pg_shard_t, pg_info_t> &all_info,
2263 const pg_info_t &auth_info,
2264 vector<int> *want,
2265 set<pg_shard_t> *async_recovery,
2266 const OSDMapRef osdmap) const
2267 {
2268 set<pair<int, pg_shard_t> > candidates_by_cost;
2269 for (auto osd_num : *want) {
2270 pg_shard_t shard_i(osd_num, shard_id_t::NO_SHARD);
2271 // do not include strays
2272 if (stray_set.find(shard_i) != stray_set.end())
2273 continue;
2274 // Do not include an osd that is not up, since choosing it as
2275 // an async_recovery_target will move it out of the acting set.
2276 // This results in it being identified as a stray during peering,
2277 // because it is no longer in the up or acting set.
2278 if (!is_up(shard_i))
2279 continue;
2280 auto shard_info = all_info.find(shard_i)->second;
2281 // use the approximate magnitude of the difference in length of
2282 // logs plus historical missing objects as the cost of recovery
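// Unlike the ec case above, divergence in either direction counts here:
// e.g. (hypothetical) a replica 10 entries behind or 10 entries ahead of
// the authoritative log both add 10 to the cost, since a replicated
// shard cannot cheaply roll its extra entries back locally.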
2283 version_t auth_version = auth_info.last_update.version;
2284 version_t candidate_version = shard_info.last_update.version;
2285 if (HAVE_FEATURE(osdmap->get_up_osd_features(), SERVER_NAUTILUS)) {
2286 auto approx_missing_objects =
2287 shard_info.stats.stats.sum.num_objects_missing;
2288 if (auth_version > candidate_version) {
2289 approx_missing_objects += auth_version - candidate_version;
2290 } else {
2291 approx_missing_objects += candidate_version - auth_version;
2292 }
2293 if (static_cast<uint64_t>(approx_missing_objects) >
2294 cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) {
2295 candidates_by_cost.emplace(approx_missing_objects, shard_i);
2296 }
2297 } else {
2298 size_t approx_entries;
2299 if (auth_version > candidate_version) {
2300 approx_entries = auth_version - candidate_version;
2301 } else {
2302 approx_entries = candidate_version - auth_version;
2303 }
2304 if (approx_entries > cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) {
2305 candidates_by_cost.insert(make_pair(approx_entries, shard_i));
2306 }
2307 }
2308 }
2309
2310 psdout(20) << __func__ << " candidates by cost are: " << candidates_by_cost
2311 << dendl;
2312 // take out as many osds as we can for async recovery, in order of cost
2313 for (auto rit = candidates_by_cost.rbegin();
2314 rit != candidates_by_cost.rend(); ++rit) {
2315 if (want->size() <= pool.info.min_size) {
2316 break;
2317 }
2318 pg_shard_t cur_shard = rit->second;
2319 vector<int> candidate_want(*want);
2320 for (auto it = candidate_want.begin(); it != candidate_want.end(); ++it) {
2321 if (*it == cur_shard.osd) {
2322 candidate_want.erase(it);
2323 if (pool.info.stretch_set_can_peer(candidate_want, *osdmap, NULL)) {
2324 // if we're in stretch mode, we can only remove the osd if it doesn't
2325 // break peering limits.
2326 want->swap(candidate_want);
2327 async_recovery->insert(cur_shard);
2328 }
2329 break;
2330 }
2331 }
2332 }
2333
2334 psdout(20) << __func__ << " result want=" << *want
2335 << " async_recovery=" << *async_recovery << dendl;
2336 }
2337
2338 /**
2339 * choose acting
2340 *
2341 * calculate the desired acting, and request a change with the monitor
2342 * if it differs from the current acting.
2343 *
2344 * if restrict_to_up_acting=true, we filter out anything that's not in
2345 * up/acting. in order to lift this restriction, we need to
2346 * 1) check whether it's worth switching the acting set any time we get
2347 * a new pg info (not just here, when recovery finishes)
2348 * 2) check whether anything in want_acting went down on each new map
2349 * (and, if so, calculate a new want_acting)
2350 * 3) remove the assertion in PG::PeeringState::Active::react(const AdvMap)
2351 * TODO!
2352 */
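/* A hypothetical flow: acting=[0,1,2] but osd.1 must backfill, so want
 * becomes [0,2] (plus a suitable stray, if allowed); we queue that as
 * pg_temp with the monitor and return false, then re-enter here once a
 * later osdmap delivers the new acting set and want == acting.
 */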
2353 bool PeeringState::choose_acting(pg_shard_t &auth_log_shard_id,
2354 bool restrict_to_up_acting,
2355 bool *history_les_bound,
2356 bool request_pg_temp_change_only)
2357 {
2358 map<pg_shard_t, pg_info_t> all_info(peer_info.begin(), peer_info.end());
2359 all_info[pg_whoami] = info;
2360
2361 if (cct->_conf->subsys.should_gather<dout_subsys, 10>()) {
2362 for (auto p = all_info.begin(); p != all_info.end(); ++p) {
2363 psdout(10) << __func__ << " all_info osd." << p->first << " "
2364 << p->second << dendl;
2365 }
2366 }
2367
2368 auto auth_log_shard = find_best_info(all_info, restrict_to_up_acting,
2369 history_les_bound);
2370
2371 if (auth_log_shard == all_info.end()) {
2372 if (up != acting) {
2373 psdout(10) << __func__ << " no suitable info found (incomplete backfills?),"
2374 << " reverting to up" << dendl;
2375 want_acting = up;
2376 vector<int> empty;
2377 pl->queue_want_pg_temp(empty);
2378 } else {
2379 psdout(10) << __func__ << " failed" << dendl;
2380 ceph_assert(want_acting.empty());
2381 }
2382 return false;
2383 }
2384
2385 ceph_assert(!auth_log_shard->second.is_incomplete());
2386 auth_log_shard_id = auth_log_shard->first;
2387
2388 set<pg_shard_t> want_backfill, want_acting_backfill;
2389 vector<int> want;
2390 stringstream ss;
2391 if (pool.info.is_replicated()) {
2392 auto [primary_shard, oldest_log] = select_replicated_primary(
2393 auth_log_shard,
2394 cct->_conf.get_val<uint64_t>(
2395 "osd_force_auth_primary_missing_objects"),
2396 up,
2397 up_primary,
2398 all_info,
2399 get_osdmap(),
2400 ss);
2401 if (pool.info.is_stretch_pool()) {
2402 calc_replicated_acting_stretch(
2403 primary_shard,
2404 oldest_log,
2405 get_osdmap()->get_pg_size(info.pgid.pgid),
2406 acting,
2407 up,
2408 up_primary,
2409 all_info,
2410 restrict_to_up_acting,
2411 &want,
2412 &want_backfill,
2413 &want_acting_backfill,
2414 get_osdmap(),
2415 pool,
2416 ss);
2417 } else {
2418 calc_replicated_acting(
2419 primary_shard,
2420 oldest_log,
2421 get_osdmap()->get_pg_size(info.pgid.pgid),
2422 acting,
2423 up,
2424 up_primary,
2425 all_info,
2426 restrict_to_up_acting,
2427 &want,
2428 &want_backfill,
2429 &want_acting_backfill,
2430 get_osdmap(),
2431 pool,
2432 ss);
2433 }
2434 } else {
2435 calc_ec_acting(
2436 auth_log_shard,
2437 get_osdmap()->get_pg_size(info.pgid.pgid),
2438 acting,
2439 up,
2440 all_info,
2441 restrict_to_up_acting,
2442 &want,
2443 &want_backfill,
2444 &want_acting_backfill,
2445 ss);
2446 }
2447 psdout(10) << ss.str() << dendl;
2448
2449 if (!recoverable(want)) {
2450 want_acting.clear();
2451 return false;
2452 }
2453
2454 set<pg_shard_t> want_async_recovery;
2455 if (HAVE_FEATURE(get_osdmap()->get_up_osd_features(), SERVER_MIMIC)) {
2456 if (pool.info.is_erasure()) {
2457 choose_async_recovery_ec(
2458 all_info, auth_log_shard->second, &want, &want_async_recovery,
2459 get_osdmap());
2460 } else {
2461 choose_async_recovery_replicated(
2462 all_info, auth_log_shard->second, &want, &want_async_recovery,
2463 get_osdmap());
2464 }
2465 }
2466 while (want.size() > pool.info.size) {
2467 // async recovery should have taken out as many osds as it can.
2468 // if not, then always evict the last peer
2469 // (will get synchronously recovered later)
2470 psdout(10) << __func__ << " evicting osd." << want.back()
2471 << " from oversized want " << want << dendl;
2472 want.pop_back();
2473 }
2474 if (want != acting) {
2475 psdout(10) << __func__ << " want " << want << " != acting " << acting
2476 << ", requesting pg_temp change" << dendl;
2477 want_acting = want;
2478
2479 if (!cct->_conf->osd_debug_no_acting_change) {
2480 if (want_acting == up) {
2481 // There can't be any pending backfill if
2482 // want is the same as crush map up OSDs.
2483 ceph_assert(want_backfill.empty());
2484 vector<int> empty;
2485 pl->queue_want_pg_temp(empty);
2486 } else
2487 pl->queue_want_pg_temp(want);
2488 }
2489 return false;
2490 }
2491
2492 if (request_pg_temp_change_only)
2493 return true;
2494 want_acting.clear();
2495 acting_recovery_backfill = want_acting_backfill;
2496 psdout(10) << "acting_recovery_backfill is "
2497 << acting_recovery_backfill << dendl;
2498 ceph_assert(
2499 backfill_targets.empty() ||
2500 backfill_targets == want_backfill);
2501 if (backfill_targets.empty()) {
2502 // Caller is GetInfo
2503 backfill_targets = want_backfill;
2504 }
2505 // Adding !needs_recovery() to let the async_recovery_targets reset after recovery is complete
2506 ceph_assert(
2507 async_recovery_targets.empty() ||
2508 async_recovery_targets == want_async_recovery ||
2509 !needs_recovery());
2510 if (async_recovery_targets.empty() || !needs_recovery()) {
2511 async_recovery_targets = want_async_recovery;
2512 }
2513 // Will not change if already set because up would have had to change
2514 // Verify that nothing in backfill is in stray_set
2515 for (auto i = want_backfill.begin(); i != want_backfill.end(); ++i) {
2516 ceph_assert(stray_set.find(*i) == stray_set.end());
2517 }
2518 psdout(10) << "choose_acting want=" << want << " backfill_targets="
2519 << want_backfill << " async_recovery_targets="
2520 << async_recovery_targets << dendl;
2521 return true;
2522 }
2523
2524 void PeeringState::log_weirdness()
2525 {
2526 if (pg_log.get_tail() != info.log_tail)
2527 pl->get_clog_error() << info.pgid
2528 << " info mismatch, log.tail " << pg_log.get_tail()
2529 << " != info.log_tail " << info.log_tail;
2530 if (pg_log.get_head() != info.last_update)
2531 pl->get_clog_error() << info.pgid
2532 << " info mismatch, log.head " << pg_log.get_head()
2533 << " != info.last_update " << info.last_update;
2534
2535 if (!pg_log.get_log().empty()) {
2536 // sloppy check
2537 if (pg_log.get_log().log.begin()->version <= pg_log.get_tail())
2538 pl->get_clog_error() << info.pgid
2539 << " log bound mismatch, info (tail,head] ("
2540 << pg_log.get_tail() << ","
2541 << pg_log.get_head() << "]"
2542 << " actual ["
2543 << pg_log.get_log().log.begin()->version << ","
2544 << pg_log.get_log().log.rbegin()->version << "]";
2545 }
2546
2547 if (pg_log.get_log().caller_ops.size() > pg_log.get_log().log.size()) {
2548 pl->get_clog_error() << info.pgid
2549 << " caller_ops.size "
2550 << pg_log.get_log().caller_ops.size()
2551 << " > log size " << pg_log.get_log().log.size();
2552 }
2553 }
2554
2555 /*
2556 * Process information from a replica to determine if it could have any
2557 * objects that I need.
2558 *
2559 * TODO: if the missing set becomes very large, this could get expensive.
2560 * Instead, we probably want to just iterate over our unfound set.
2561 */
2562 bool PeeringState::search_for_missing(
2563 const pg_info_t &oinfo, const pg_missing_t &omissing,
2564 pg_shard_t from,
2565 PeeringCtxWrapper &ctx)
2566 {
2567 uint64_t num_unfound_before = missing_loc.num_unfound();
2568 bool found_missing = missing_loc.add_source_info(
2569 from, oinfo, omissing, ctx.handle);
2570 if (found_missing && num_unfound_before != missing_loc.num_unfound())
2571 pl->publish_stats_to_osd();
2572 // avoid doing this if the peer is empty. This is a bit of paranoia
2573 // to avoid doing something rash if add_source_info() above
2574 // incorrectly decided we found something new. (if the peer has
2575 // last_update=0'0 that's impossible.)
2576 if (found_missing &&
2577 oinfo.last_update != eversion_t()) {
2578 pg_info_t tinfo(oinfo);
2579 tinfo.pgid.shard = pg_whoami.shard;
2580 ctx.send_info(
2581 from.osd,
2582 spg_t(info.pgid.pgid, from.shard),
2583 get_osdmap_epoch(), // fixme: use lower epoch?
2584 get_osdmap_epoch(),
2585 tinfo);
2586 }
2587 return found_missing;
2588 }
2589
2590 bool PeeringState::discover_all_missing(
2591 BufferedRecoveryMessages &rctx)
2592 {
2593 auto &missing = pg_log.get_missing();
2594 uint64_t unfound = get_num_unfound();
2595 bool any = false; // did we start any queries
2596
2597 psdout(10) << __func__ << " "
2598 << missing.num_missing() << " missing, "
2599 << unfound << " unfound"
2600 << dendl;
2601
2602 auto m = might_have_unfound.begin();
2603 auto mend = might_have_unfound.end();
2604 for (; m != mend; ++m) {
2605 pg_shard_t peer(*m);
2606
2607 if (!get_osdmap()->is_up(peer.osd)) {
2608 psdout(20) << __func__ << " skipping down osd." << peer << dendl;
2609 continue;
2610 }
2611
2612 if (peer_purged.count(peer)) {
2613 psdout(20) << __func__ << " skipping purged osd." << peer << dendl;
2614 continue;
2615 }
2616
2617 auto iter = peer_info.find(peer);
2618 if (iter != peer_info.end() &&
2619 (iter->second.is_empty() || iter->second.dne())) {
2620 // ignore empty peers
2621 continue;
2622 }
2623
2624 // If we've requested any of this stuff, the pg_missing_t information
2625 // should be on its way.
2626 // TODO: coalesce requested_* into a single data structure
2627 if (peer_missing.find(peer) != peer_missing.end()) {
2628 psdout(20) << __func__ << ": osd." << peer
2629 << ": we already have pg_missing_t" << dendl;
2630 continue;
2631 }
2632 if (peer_log_requested.find(peer) != peer_log_requested.end()) {
2633 psdout(20) << __func__ << ": osd." << peer
2634 << ": in peer_log_requested" << dendl;
2635 continue;
2636 }
2637 if (peer_missing_requested.find(peer) != peer_missing_requested.end()) {
2638 psdout(20) << __func__ << ": osd." << peer
2639 << ": in peer_missing_requested" << dendl;
2640 continue;
2641 }
2642
2643 // Request missing
2644 psdout(10) << __func__ << ": osd." << peer << ": requesting pg_missing_t"
2645 << dendl;
2646 peer_missing_requested.insert(peer);
2647 rctx.send_query(
2648 peer.osd,
2649 spg_t(info.pgid.pgid, peer.shard),
2650 pg_query_t(
2651 pg_query_t::FULLLOG,
2652 peer.shard, pg_whoami.shard,
2653 info.history, get_osdmap_epoch()));
2654 any = true;
2655 }
2656 return any;
2657 }
2658
2659 /* Build the might_have_unfound set.
2660 *
2661 * This is used by the primary OSD during recovery.
2662 *
2663 * This set tracks the OSDs which might have unfound objects that the primary
2664 * OSD needs. As we receive pg_missing_t from each OSD in might_have_unfound, we
2665 * will remove the OSD from the set.
2666 */
2667 void PeeringState::build_might_have_unfound()
2668 {
2669 ceph_assert(might_have_unfound.empty());
2670 ceph_assert(is_primary());
2671
2672 psdout(10) << __func__ << dendl;
2673
2674 check_past_interval_bounds();
2675
2676 might_have_unfound = past_intervals.get_might_have_unfound(
2677 pg_whoami,
2678 pool.info.is_erasure());
2679
2680 // include any (stray) peers
2681 for (auto p = peer_info.begin(); p != peer_info.end(); ++p)
2682 might_have_unfound.insert(p->first);
2683
2684 psdout(15) << __func__ << ": built " << might_have_unfound << dendl;
2685 }
2686
2687 void PeeringState::activate(
2688 ObjectStore::Transaction& t,
2689 epoch_t activation_epoch,
2690 PeeringCtxWrapper &ctx)
2691 {
2692 ceph_assert(!is_peered());
2693
2694 // twiddle pg state
2695 state_clear(PG_STATE_DOWN);
2696
2697 send_notify = false;
2698
2699 if (is_primary()) {
2700 // only update primary last_epoch_started if we will go active
2701 if (acting_set_writeable()) {
2702 ceph_assert(cct->_conf->osd_find_best_info_ignore_history_les ||
2703 info.last_epoch_started <= activation_epoch);
2704 info.last_epoch_started = activation_epoch;
2705 info.last_interval_started = info.history.same_interval_since;
2706 }
2707 } else if (is_acting(pg_whoami)) {
2708 /* update last_epoch_started on acting replica to whatever the primary sent
2709 * unless it's smaller (could happen if we are going peered rather than
2710 * active, see doc/dev/osd_internals/last_epoch_started.rst) */
2711 if (info.last_epoch_started < activation_epoch) {
2712 info.last_epoch_started = activation_epoch;
2713 info.last_interval_started = info.history.same_interval_since;
2714 }
2715 }
2716
2717 auto &missing = pg_log.get_missing();
2718
2719 min_last_complete_ondisk = eversion_t(0,0); // we don't know (yet)!
2720 if (is_primary()) {
2721 last_update_ondisk = info.last_update;
2722 }
2723 last_update_applied = info.last_update;
2724 last_rollback_info_trimmed_to_applied = pg_log.get_can_rollback_to();
2725
2726 need_up_thru = false;
2727
2728 // write pg info, log
2729 dirty_info = true;
2730 dirty_big_info = true; // maybe
2731
2732 pl->schedule_event_on_commit(
2733 t,
2734 std::make_shared<PGPeeringEvent>(
2735 get_osdmap_epoch(),
2736 get_osdmap_epoch(),
2737 ActivateCommitted(
2738 get_osdmap_epoch(),
2739 activation_epoch)));
2740
2741 // init complete pointer
2742 if (missing.num_missing() == 0) {
2743 psdout(10) << "activate - no missing, moving last_complete " << info.last_complete
2744 << " -> " << info.last_update << dendl;
2745 info.last_complete = info.last_update;
2746 info.stats.stats.sum.num_objects_missing = 0;
2747 pg_log.reset_recovery_pointers();
2748 } else {
2749 psdout(10) << "activate - not complete, " << missing << dendl;
2750 info.stats.stats.sum.num_objects_missing = missing.num_missing();
2751 pg_log.activate_not_complete(info);
2752 }
2753
2754 log_weirdness();
2755
2756 if (is_primary()) {
2757 // initialize snap_trimq
2758 interval_set<snapid_t> to_trim;
2759 auto& removed_snaps_queue = get_osdmap()->get_removed_snaps_queue();
2760 auto p = removed_snaps_queue.find(info.pgid.pgid.pool());
2761 if (p != removed_snaps_queue.end()) {
2762 psdout(20) << "activate - purged_snaps " << info.purged_snaps
2763 << " removed_snaps " << p->second
2764 << dendl;
2765 for (auto q : p->second) {
2766 to_trim.insert(q.first, q.second);
2767 }
2768 }
2769 interval_set<snapid_t> purged;
2770 purged.intersection_of(to_trim, info.purged_snaps);
2771 to_trim.subtract(purged);
2772
2773 if (HAVE_FEATURE(upacting_features, SERVER_OCTOPUS)) {
2774 renew_lease(pl->get_mnow());
2775 // do not schedule until we are actually activated
2776 }
2777
2778 // adjust purged_snaps: PG may have been inactive while snaps were pruned
2779 // from the removed_snaps_queue in the osdmap. update local purged_snaps to
2780 // reflect only those snaps that we thought were pruned and were still in
2781 // the queue.
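// e.g. (hypothetical): if the queue holds snaps 4..8 ({4~5}) and our
// purged_snaps was 1..5 ({1~5}), then purged becomes {4~2}, to_trim is
// left with {6~3}, and purged_snaps is reset to {4~2} below.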
2782 info.purged_snaps.swap(purged);
2783
2784 // start up replicas
2785 info.history.refresh_prior_readable_until_ub(pl->get_mnow(),
2786 prior_readable_until_ub);
2787
2788 ceph_assert(!acting_recovery_backfill.empty());
2789 for (auto i = acting_recovery_backfill.begin();
2790 i != acting_recovery_backfill.end();
2791 ++i) {
2792 if (*i == pg_whoami) continue;
2793 pg_shard_t peer = *i;
2794 ceph_assert(peer_info.count(peer));
2795 pg_info_t& pi = peer_info[peer];
2796
2797 psdout(10) << "activate peer osd." << peer << " " << pi << dendl;
2798
2799 MRef<MOSDPGLog> m;
2800 ceph_assert(peer_missing.count(peer));
2801 pg_missing_t& pm = peer_missing[peer];
2802
2803 bool needs_past_intervals = pi.dne();
2804
2805 // Save num_bytes for backfill reservation request, can't be negative
2806 peer_bytes[peer] = std::max<int64_t>(0, pi.stats.stats.sum.num_bytes);
2807
2808 if (pi.last_update == info.last_update) {
2809 // empty log
2810 if (!pi.last_backfill.is_max())
2811 pl->get_clog_info() << info.pgid << " continuing backfill to osd."
2812 << peer
2813 << " from (" << pi.log_tail << "," << pi.last_update
2814 << "] " << pi.last_backfill
2815 << " to " << info.last_update;
2816 if (!pi.is_empty()) {
2817 psdout(10) << "activate peer osd." << peer
2818 << " is up to date, queueing in pending_activators" << dendl;
2819 ctx.send_info(
2820 peer.osd,
2821 spg_t(info.pgid.pgid, peer.shard),
2822 get_osdmap_epoch(), // fixme: use lower epoch?
2823 get_osdmap_epoch(),
2824 info,
2825 get_lease());
2826 } else {
2827 psdout(10) << "activate peer osd." << peer
2828 << " is up to date, but sending pg_log anyway" << dendl;
2829 m = make_message<MOSDPGLog>(
2830 i->shard, pg_whoami.shard,
2831 get_osdmap_epoch(), info,
2832 last_peering_reset);
2833 }
2834 } else if (
2835 pg_log.get_tail() > pi.last_update ||
2836 pi.last_backfill == hobject_t() ||
2837 (backfill_targets.count(*i) && pi.last_backfill.is_max())) {
2838 /* ^ This last case covers a situation where a replica is not contiguous
2839 * with the auth_log, but is contiguous with this primary's log. Reshuffling
2840 * the active set to handle this would be tricky, so instead we just go
2841 * ahead and backfill it anyway. This is probably preferable in any
2842 * case since the replica in question would have to be significantly
2843 * behind.
2844 */
2845 // backfill
2846 pl->get_clog_debug() << info.pgid << " starting backfill to osd." << peer
2847 << " from (" << pi.log_tail << "," << pi.last_update
2848 << "] " << pi.last_backfill
2849 << " to " << info.last_update;
2850
2851 pi.last_update = info.last_update;
2852 pi.last_complete = info.last_update;
2853 pi.set_last_backfill(hobject_t());
2854 pi.last_epoch_started = info.last_epoch_started;
2855 pi.last_interval_started = info.last_interval_started;
2856 pi.history = info.history;
2857 pi.hit_set = info.hit_set;
2858 pi.stats.stats.clear();
2859 pi.stats.stats.sum.num_bytes = peer_bytes[peer];
2860
2861 // initialize peer with our purged_snaps.
2862 pi.purged_snaps = info.purged_snaps;
2863
2864 m = make_message<MOSDPGLog>(
2865 i->shard, pg_whoami.shard,
2866 get_osdmap_epoch(), pi,
2867 last_peering_reset /* epoch to create pg at */);
2868
2869 // send some recent log, so that op dup detection works well.
2870 m->log.copy_up_to(cct, pg_log.get_log(),
2871 cct->_conf->osd_max_pg_log_entries);
2872 m->info.log_tail = m->log.tail;
2873 pi.log_tail = m->log.tail; // sigh...
2874
2875 pm.clear();
2876 } else {
2877 // catch up
2878 ceph_assert(pg_log.get_tail() <= pi.last_update);
2879 m = make_message<MOSDPGLog>(
2880 i->shard, pg_whoami.shard,
2881 get_osdmap_epoch(), info,
2882 last_peering_reset /* epoch to create pg at */);
2883 // send new stuff to append to replicas log
2884 m->log.copy_after(cct, pg_log.get_log(), pi.last_update);
2885 }
2886
2887 // share past_intervals if we are creating the pg on the replica
2888 // based on whether our info for that peer was dne() *before*
2889 // updating pi.history in the backfill block above.
2890 if (m && needs_past_intervals)
2891 m->past_intervals = past_intervals;
2892
2893 // update local version of peer's missing list!
2894 if (m && pi.last_backfill != hobject_t()) {
2895 for (auto p = m->log.log.begin(); p != m->log.log.end(); ++p) {
2896 if (p->soid <= pi.last_backfill &&
2897 !p->is_error()) {
2898 if (perform_deletes_during_peering() && p->is_delete()) {
2899 pm.rm(p->soid, p->version);
2900 } else {
2901 pm.add_next_event(*p);
2902 }
2903 }
2904 }
2905 }
2906
2907 if (m) {
2908 psdout(10) << "activate peer osd." << peer << " sending " << m->log
2909 << dendl;
2910 m->lease = get_lease();
2911 pl->send_cluster_message(peer.osd, m, get_osdmap_epoch());
2912 }
2913
2914 // the peer now has everything up to info.last_update
2915 pi.last_update = info.last_update;
2916
2917 // update our missing
2918 if (pm.num_missing() == 0) {
2919 pi.last_complete = pi.last_update;
2920 psdout(10) << "activate peer osd." << peer << " " << pi
2921 << " uptodate" << dendl;
2922 } else {
2923 psdout(10) << "activate peer osd." << peer << " " << pi
2924 << " missing " << pm << dendl;
2925 }
2926 }
2927
2928 // Set up missing_loc
2929 set<pg_shard_t> complete_shards;
2930 for (auto i = acting_recovery_backfill.begin();
2931 i != acting_recovery_backfill.end();
2932 ++i) {
2933 psdout(20) << __func__ << " setting up missing_loc from shard " << *i
2934 << " " << dendl;
2935 if (*i == get_primary()) {
2936 missing_loc.add_active_missing(missing);
2937 if (!missing.have_missing())
2938 complete_shards.insert(*i);
2939 } else {
2940 auto peer_missing_entry = peer_missing.find(*i);
2941 ceph_assert(peer_missing_entry != peer_missing.end());
2942 missing_loc.add_active_missing(peer_missing_entry->second);
2943 if (!peer_missing_entry->second.have_missing() &&
2944 peer_info[*i].last_backfill.is_max())
2945 complete_shards.insert(*i);
2946 }
2947 }
2948
2949 // If necessary, create might_have_unfound to help us find our unfound objects.
2950 // NOTE: It's important that we build might_have_unfound before trimming the
2951 // past intervals.
2952 might_have_unfound.clear();
2953 if (needs_recovery()) {
2954 // If only one shard has missing, we use a shortcut and add all others
2955 // as recovery sources. This is considered safe since the PGLogs have
2956 // been merged locally, and it covers the vast majority of use cases,
2957 // such as one OSD/host being down for a while for hardware repair.
2958 if (complete_shards.size() + 1 == acting_recovery_backfill.size()) {
2959 missing_loc.add_batch_sources_info(complete_shards, ctx.handle);
2960 } else {
2961 missing_loc.add_source_info(pg_whoami, info, pg_log.get_missing(),
2962 ctx.handle);
2963 for (auto i = acting_recovery_backfill.begin();
2964 i != acting_recovery_backfill.end();
2965 ++i) {
2966 if (*i == pg_whoami) continue;
2967 psdout(10) << __func__ << ": adding " << *i << " as a source" << dendl;
2968 ceph_assert(peer_missing.count(*i));
2969 ceph_assert(peer_info.count(*i));
2970 missing_loc.add_source_info(
2971 *i,
2972 peer_info[*i],
2973 peer_missing[*i],
2974 ctx.handle);
2975 }
2976 }
2977 for (auto i = peer_missing.begin(); i != peer_missing.end(); ++i) {
2978 if (is_acting_recovery_backfill(i->first))
2979 continue;
2980 ceph_assert(peer_info.count(i->first));
2981 search_for_missing(
2982 peer_info[i->first],
2983 i->second,
2984 i->first,
2985 ctx);
2986 }
2987
2988 build_might_have_unfound();
2989
2990 // Always call now so update_calc_stats() will be accurate
2991 discover_all_missing(ctx.msgs);
2992
2993 }
2994
2995 // num_objects_degraded, if calculated, should reflect this too, unless
2996 // nothing is missing and we are about to go clean.
2997 if (get_osdmap()->get_pg_size(info.pgid.pgid) > actingset.size()) {
2998 state_set(PG_STATE_UNDERSIZED);
2999 }
3000
3001 state_set(PG_STATE_ACTIVATING);
3002 pl->on_activate(std::move(to_trim));
3003 }
3004 if (acting_set_writeable()) {
3005 PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
3006 pg_log.roll_forward(rollbacker.get());
3007 }
3008 }
3009
3010 void PeeringState::share_pg_info()
3011 {
3012 psdout(10) << "share_pg_info" << dendl;
3013
3014 info.history.refresh_prior_readable_until_ub(pl->get_mnow(),
3015 prior_readable_until_ub);
3016
3017 // share new pg_info_t with replicas
3018 ceph_assert(!acting_recovery_backfill.empty());
3019 for (auto pg_shard : acting_recovery_backfill) {
3020 if (pg_shard == pg_whoami) continue;
3021 if (auto peer = peer_info.find(pg_shard); peer != peer_info.end()) {
3022 peer->second.last_epoch_started = info.last_epoch_started;
3023 peer->second.last_interval_started = info.last_interval_started;
3024 peer->second.history.merge(info.history);
3025 }
3026 MessageRef m;
3027 if (last_require_osd_release >= ceph_release_t::octopus) {
3028 m = make_message<MOSDPGInfo2>(spg_t{info.pgid.pgid, pg_shard.shard},
3029 info,
3030 get_osdmap_epoch(),
3031 get_osdmap_epoch(),
3032 std::optional<pg_lease_t>{get_lease()},
3033 std::nullopt);
3034 } else {
3035 m = make_message<MOSDPGInfo>(get_osdmap_epoch(),
3036 MOSDPGInfo::pg_list_t{
3037 pg_notify_t{pg_shard.shard,
3038 pg_whoami.shard,
3039 get_osdmap_epoch(),
3040 get_osdmap_epoch(),
3041 info,
3042 past_intervals}});
3043 }
3044 pl->send_cluster_message(pg_shard.osd, m, get_osdmap_epoch());
3045 }
3046 }
3047
3048 void PeeringState::merge_log(
3049 ObjectStore::Transaction& t, pg_info_t &oinfo, pg_log_t&& olog,
3050 pg_shard_t from)
3051 {
3052 PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
3053 pg_log.merge_log(
3054 oinfo, std::move(olog), from, info, rollbacker.get(),
3055 dirty_info, dirty_big_info);
3056 }
3057
3058 void PeeringState::rewind_divergent_log(
3059 ObjectStore::Transaction& t, eversion_t newhead)
3060 {
3061 PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
3062 pg_log.rewind_divergent_log(
3063 newhead, info, rollbacker.get(), dirty_info, dirty_big_info);
3064 }
3065
3066
3067 void PeeringState::proc_primary_info(
3068 ObjectStore::Transaction &t, const pg_info_t &oinfo)
3069 {
3070 ceph_assert(!is_primary());
3071
3072 update_history(oinfo.history);
3073 if (!info.stats.stats_invalid && info.stats.stats.sum.num_scrub_errors) {
3074 info.stats.stats.sum.num_scrub_errors = 0;
3075 info.stats.stats.sum.num_shallow_scrub_errors = 0;
3076 info.stats.stats.sum.num_deep_scrub_errors = 0;
3077 dirty_info = true;
3078 }
3079
3080 if (!(info.purged_snaps == oinfo.purged_snaps)) {
3081 psdout(10) << __func__ << " updating purged_snaps to "
3082 << oinfo.purged_snaps
3083 << dendl;
3084 info.purged_snaps = oinfo.purged_snaps;
3085 dirty_info = true;
3086 dirty_big_info = true;
3087 }
3088 }
3089
3090 void PeeringState::proc_master_log(
3091 ObjectStore::Transaction& t, pg_info_t &oinfo,
3092 pg_log_t&& olog, pg_missing_t&& omissing, pg_shard_t from)
3093 {
3094 psdout(10) << "proc_master_log for osd." << from << ": "
3095 << olog << " " << omissing << dendl;
3096 ceph_assert(!is_peered() && is_primary());
3097
3098 // merge log into our own log to build master log. no need to
3099 // make any adjustments to their missing map; we are taking their
3100 // log to be authoritative (i.e., their entries are by definition
3101 // non-divergent).
3102 merge_log(t, oinfo, std::move(olog), from);
3103 peer_info[from] = oinfo;
3104 psdout(10) << " peer osd." << from << " now " << oinfo
3105 << " " << omissing << dendl;
3106 might_have_unfound.insert(from);
3107
3108 // See doc/dev/osd_internals/last_epoch_started
3109 if (oinfo.last_epoch_started > info.last_epoch_started) {
3110 info.last_epoch_started = oinfo.last_epoch_started;
3111 dirty_info = true;
3112 }
3113 if (oinfo.last_interval_started > info.last_interval_started) {
3114 info.last_interval_started = oinfo.last_interval_started;
3115 dirty_info = true;
3116 }
3117 update_history(oinfo.history);
3118 ceph_assert(cct->_conf->osd_find_best_info_ignore_history_les ||
3119 info.last_epoch_started >= info.history.last_epoch_started);
3120
3121 peer_missing[from].claim(std::move(omissing));
3122 }
3123
3124 void PeeringState::proc_replica_log(
3125 pg_info_t &oinfo,
3126 const pg_log_t &olog,
3127 pg_missing_t&& omissing,
3128 pg_shard_t from)
3129 {
3130 psdout(10) << "proc_replica_log for osd." << from << ": "
3131 << oinfo << " " << olog << " " << omissing << dendl;
3132
3133 pg_log.proc_replica_log(oinfo, olog, omissing, from);
3134
3135 peer_info[from] = oinfo;
3136 psdout(10) << " peer osd." << from << " now "
3137 << oinfo << " " << omissing << dendl;
3138 might_have_unfound.insert(from);
3139
3140 for (auto i = omissing.get_items().begin();
3141 i != omissing.get_items().end();
3142 ++i) {
3143 psdout(20) << " after missing " << i->first
3144 << " need " << i->second.need
3145 << " have " << i->second.have << dendl;
3146 }
3147 peer_missing[from].claim(std::move(omissing));
3148 }
3149
3150 void PeeringState::fulfill_info(
3151 pg_shard_t from, const pg_query_t &query,
3152 pair<pg_shard_t, pg_info_t> &notify_info)
3153 {
3154 ceph_assert(from == primary);
3155 ceph_assert(query.type == pg_query_t::INFO);
3156
3157 // info
3158 psdout(10) << "sending info" << dendl;
3159 notify_info = make_pair(from, info);
3160 }
3161
3162 void PeeringState::fulfill_log(
3163 pg_shard_t from, const pg_query_t &query, epoch_t query_epoch)
3164 {
3165 psdout(10) << "log request from " << from << dendl;
3166 ceph_assert(from == primary);
3167 ceph_assert(query.type != pg_query_t::INFO);
3168
3169 auto mlog = make_message<MOSDPGLog>(
3170 from.shard, pg_whoami.shard,
3171 get_osdmap_epoch(),
3172 info, query_epoch);
3173 mlog->missing = pg_log.get_missing();
3174
3175 // primary -> other, when building master log
3176 if (query.type == pg_query_t::LOG) {
3177 psdout(10) << " sending info+missing+log since " << query.since
3178 << dendl;
3179 if (query.since != eversion_t() && query.since < pg_log.get_tail()) {
3180 pl->get_clog_error() << info.pgid << " got broken pg_query_t::LOG since "
3181 << query.since
3182 << " when my log.tail is " << pg_log.get_tail()
3183 << ", sending full log instead";
3184 mlog->log = pg_log.get_log(); // primary should not have requested this!!
3185 } else
3186 mlog->log.copy_after(cct, pg_log.get_log(), query.since);
3187 }
3188 else if (query.type == pg_query_t::FULLLOG) {
3189 psdout(10) << " sending info+missing+full log" << dendl;
3190 mlog->log = pg_log.get_log();
3191 }
3192
3193 psdout(10) << " sending " << mlog->log << " " << mlog->missing << dendl;
3194
3195 pl->send_cluster_message(from.osd, mlog, get_osdmap_epoch(), true);
3196 }
3197
3198 void PeeringState::fulfill_query(const MQuery& query, PeeringCtxWrapper &rctx)
3199 {
3200 if (query.query.type == pg_query_t::INFO) {
3201 pair<pg_shard_t, pg_info_t> notify_info;
3202 // note this refreshes our prior_readable_until_ub value
3203 update_history(query.query.history);
3204 fulfill_info(query.from, query.query, notify_info);
3205 rctx.send_notify(
3206 notify_info.first.osd,
3207 pg_notify_t(
3208 notify_info.first.shard, pg_whoami.shard,
3209 query.query_epoch,
3210 get_osdmap_epoch(),
3211 notify_info.second,
3212 past_intervals));
3213 } else {
3214 update_history(query.query.history);
3215 fulfill_log(query.from, query.query, query.query_epoch);
3216 }
3217 }
3218
3219 void PeeringState::try_mark_clean()
3220 {
3221 if (actingset.size() == get_osdmap()->get_pg_size(info.pgid.pgid)) {
3222 state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
3223 state_set(PG_STATE_CLEAN);
3224 info.history.last_epoch_clean = get_osdmap_epoch();
3225 info.history.last_interval_clean = info.history.same_interval_since;
3226 past_intervals.clear();
3227 dirty_big_info = true;
3228 dirty_info = true;
3229 }
3230
3231 if (!is_active() && is_peered()) {
3232 if (is_clean()) {
3233 bool target;
3234 if (pool.info.is_pending_merge(info.pgid.pgid, &target)) {
3235 if (target) {
3236 psdout(10) << "ready to merge (target)" << dendl;
3237 pl->set_ready_to_merge_target(
3238 info.last_update,
3239 info.history.last_epoch_started,
3240 info.history.last_epoch_clean);
3241 } else {
3242 psdout(10) << "ready to merge (source)" << dendl;
3243 pl->set_ready_to_merge_source(info.last_update);
3244 }
3245 }
3246 } else {
3247 psdout(10) << "not clean, not ready to merge" << dendl;
3248 // we should have notified OSD in Active state entry point
3249 }
3250 }
3251
3252 state_clear(PG_STATE_FORCED_RECOVERY | PG_STATE_FORCED_BACKFILL);
3253
3254 share_pg_info();
3255 pl->publish_stats_to_osd();
3256 clear_recovery_state();
3257 }
3258
3259 void PeeringState::split_into(
3260 pg_t child_pgid, PeeringState *child, unsigned split_bits)
3261 {
3262 child->update_osdmap_ref(get_osdmap());
3263 child->pool = pool;
3264
3265 // Log
3266 pg_log.split_into(child_pgid, split_bits, &(child->pg_log));
3267 child->info.last_complete = info.last_complete;
3268
3269 info.last_update = pg_log.get_head();
3270 child->info.last_update = child->pg_log.get_head();
3271
3272 child->info.last_user_version = info.last_user_version;
3273
3274 info.log_tail = pg_log.get_tail();
3275 child->info.log_tail = child->pg_log.get_tail();
3276
3277 // reset last_complete, we might have modified pg_log & missing above
3278 pg_log.reset_complete_to(&info);
3279 child->pg_log.reset_complete_to(&child->info);
3280
3281 // Info
3282 child->info.history = info.history;
3283 child->info.history.epoch_created = get_osdmap_epoch();
3284 child->info.purged_snaps = info.purged_snaps;
3285
3286 if (info.last_backfill.is_max()) {
3287 child->info.set_last_backfill(hobject_t::get_max());
3288 } else {
3289 // restart backfill on parent and child to be safe. we could
3290 // probably do better in the bitwise sort case, but it's more
3291 // fragile (there may be special work to do on backfill completion
3292 // in the future).
3293 info.set_last_backfill(hobject_t());
3294 child->info.set_last_backfill(hobject_t());
3295 // restarting backfill implies that the missing set is empty,
3296 // since it is only used for objects prior to last_backfill
3297 pg_log.reset_backfill();
3298 child->pg_log.reset_backfill();
3299 }
3300
3301 child->info.stats = info.stats;
3302 child->info.stats.parent_split_bits = split_bits;
3303 info.stats.stats_invalid = true;
3304 child->info.stats.stats_invalid = true;
3305 child->info.last_epoch_started = info.last_epoch_started;
3306 child->info.last_interval_started = info.last_interval_started;
3307
3308 // There can't be recovery/backfill going on now
3309 int primary, up_primary;
3310 vector<int> newup, newacting;
3311 get_osdmap()->pg_to_up_acting_osds(
3312 child->info.pgid.pgid, &newup, &up_primary, &newacting, &primary);
3313 child->init_primary_up_acting(
3314 newup,
3315 newacting,
3316 up_primary,
3317 primary);
3318 child->role = OSDMap::calc_pg_role(pg_whoami, child->acting);
3319
3320 // this comparison includes primary rank via pg_shard_t
3321 if (get_primary() != child->get_primary())
3322 child->info.history.same_primary_since = get_osdmap_epoch();
3323
3324 child->info.stats.up = newup;
3325 child->info.stats.up_primary = up_primary;
3326 child->info.stats.acting = newacting;
3327 child->info.stats.acting_primary = primary;
3328 child->info.stats.mapping_epoch = get_osdmap_epoch();
3329
3330 // History
3331 child->past_intervals = past_intervals;
3332
3333 child->on_new_interval();
3334
3335 child->send_notify = !child->is_primary();
3336
3337 child->dirty_info = true;
3338 child->dirty_big_info = true;
3339 dirty_info = true;
3340 dirty_big_info = true;
3341 }
3342
3343 void PeeringState::merge_from(
3344 map<spg_t,PeeringState *>& sources,
3345 PeeringCtx &rctx,
3346 unsigned split_bits,
3347 const pg_merge_meta_t& last_pg_merge_meta)
3348 {
3349 bool incomplete = false;
3350 if (info.last_complete != info.last_update ||
3351 info.is_incomplete() ||
3352 info.dne()) {
3353 psdout(10) << __func__ << " target incomplete" << dendl;
3354 incomplete = true;
3355 }
3356 if (last_pg_merge_meta.source_pgid != pg_t()) {
3357 if (info.pgid.pgid != last_pg_merge_meta.source_pgid.get_parent()) {
3358 psdout(10) << __func__ << " target doesn't match expected parent "
3359 << last_pg_merge_meta.source_pgid.get_parent()
3360 << " of source_pgid " << last_pg_merge_meta.source_pgid
3361 << dendl;
3362 incomplete = true;
3363 }
3364 if (info.last_update != last_pg_merge_meta.target_version) {
3365 psdout(10) << __func__ << " target version doesn't match expected "
3366 << last_pg_merge_meta.target_version << dendl;
3367 incomplete = true;
3368 }
3369 }
3370
3371 PGLog::LogEntryHandlerRef handler{pl->get_log_handler(rctx.transaction)};
3372 pg_log.roll_forward(handler.get());
3373
3374 info.last_complete = info.last_update; // to fake out trim()
3375 pg_log.reset_recovery_pointers();
3376 pg_log.trim(info.last_update, info);
3377
3378 vector<PGLog*> log_from;
3379 for (auto& i : sources) {
3380 auto& source = i.second;
3381 if (!source) {
3382 psdout(10) << __func__ << " source " << i.first << " missing" << dendl;
3383 incomplete = true;
3384 continue;
3385 }
3386 if (source->info.last_complete != source->info.last_update ||
3387 source->info.is_incomplete() ||
3388 source->info.dne()) {
3389 psdout(10) << __func__ << " source " << source->pg_whoami
3390 << " incomplete"
3391 << dendl;
3392 incomplete = true;
3393 }
3394 if (last_pg_merge_meta.source_pgid != pg_t()) {
3395 if (source->info.pgid.pgid != last_pg_merge_meta.source_pgid) {
3396 psdout(10) << __func__ << " source " << source->info.pgid.pgid
3397 << " doesn't match expected source pgid "
3398 << last_pg_merge_meta.source_pgid << dendl;
3399 incomplete = true;
3400 }
3401 if (source->info.last_update != last_pg_merge_meta.source_version) {
3402 psdout(10) << __func__ << " source version doesn't match expected "
3403 << last_pg_merge_meta.source_version << dendl;
3404 incomplete = true;
3405 }
3406 }
3407
3408 // prepare log
3409 PGLog::LogEntryHandlerRef handler{
3410 source->pl->get_log_handler(rctx.transaction)};
3411 source->pg_log.roll_forward(handler.get());
3412 source->info.last_complete = source->info.last_update; // to fake out trim()
3413 source->pg_log.reset_recovery_pointers();
3414 source->pg_log.trim(source->info.last_update, source->info);
3415 log_from.push_back(&source->pg_log);
3416
3417 // combine stats
3418 info.stats.add(source->info.stats);
3419
3420 // pull up last_update
3421 info.last_update = std::max(info.last_update, source->info.last_update);
3422
3423 // adopt source's PastIntervals if target has none. we can do this since
3424 // pgp_num has been reduced prior to the merge, so the OSD mappings for
3425 // the PGs are identical.
3426 if (past_intervals.empty() && !source->past_intervals.empty()) {
3427 psdout(10) << __func__ << " taking source's past_intervals" << dendl;
3428 past_intervals = source->past_intervals;
3429 }
3430 }
3431
3432 info.last_complete = info.last_update;
3433 info.log_tail = info.last_update;
3434 if (incomplete) {
3435 info.last_backfill = hobject_t();
3436 }
3437
3438 // merge logs
3439 pg_log.merge_from(log_from, info.last_update);
3440
3441 // make sure we have a meaningful last_epoch_started/clean (if we were a
3442 // placeholder)
3443 if (info.history.epoch_created == 0) {
3444 // start with (a) source's history, since these PGs *should* have been
3445 // remapped in concert with each other...
3446 info.history = sources.begin()->second->info.history;
3447
3448 // we use the last_epoch_{started,clean} we got from
3449 // the caller, which are the epochs that were reported when the PGs
3450 // were found to be ready for merge.
3451 info.history.last_epoch_clean = last_pg_merge_meta.last_epoch_clean;
3452 info.history.last_epoch_started = last_pg_merge_meta.last_epoch_started;
3453 info.last_epoch_started = last_pg_merge_meta.last_epoch_started;
3454 psdout(10) << __func__
3455 << " set les/c to " << last_pg_merge_meta.last_epoch_started << "/"
3456 << last_pg_merge_meta.last_epoch_clean
3457 << " from pool last_dec_*, source pg history was "
3458 << sources.begin()->second->info.history
3459 << dendl;
3460
3461 // above we pulled down the source's history, so we need to check
3462 // history.epoch_created again to confirm that the source is not a
3463 // placeholder too. (peering requires a sane history.same_interval_since
3464 // value for any pg that is not newly created; below we essentially
3465 // iterate back over a series of past maps to fake a merge process,
3466 // hence we need to fix history.same_interval_since first so that
3467 // start_peering_interval() will not complain)
3468 if (info.history.epoch_created == 0) {
3469 psdout(10) << __func__ << " both merge target and source are placeholders,"
3470 << " set sis to lec " << info.history.last_epoch_clean
3471 << dendl;
3472 info.history.same_interval_since = info.history.last_epoch_clean;
3473 }
3474
3475 // if the past_intervals start is later than last_epoch_clean, it
3476 // implies the source repeered again but the target didn't, or
3477 // that the source became clean in a later epoch than the target.
3478 // avoid the discrepancy by adjusting the interval start
3479 // backwards to match so that check_past_interval_bounds() will
3480 // not complain.
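  // (e.g., if last_epoch_clean is 100 but the oldest recorded interval
  // starts at 120, rewind the recorded start to 100)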
3481 auto pib = past_intervals.get_bounds();
3482 if (info.history.last_epoch_clean < pib.first) {
3483 psdout(10) << __func__ << " last_epoch_clean "
3484 << info.history.last_epoch_clean << " < past_interval start "
3485 << pib.first << ", adjusting start backwards" << dendl;
3486 past_intervals.adjust_start_backwards(info.history.last_epoch_clean);
3487 }
3488
3489 // Similarly, if the same_interval_since value is later than
3490 // last_epoch_clean, the next interval change will result in a
3491 // past_interval start that is later than last_epoch_clean. This
3492 // can happen if we use the pg_history values from the merge
3493 // source. Adjust the same_interval_since value backwards if that
3494 // happens. (We trust the les and lec values more because they came from
3495 // the real target, whereas the history value we stole from the source.)
3496 if (info.history.last_epoch_started < info.history.same_interval_since) {
3497 psdout(10) << __func__ << " last_epoch_started "
3498 << info.history.last_epoch_started << " < same_interval_since "
3499 << info.history.same_interval_since
3500 << ", adjusting pg_history backwards" << dendl;
3501 info.history.same_interval_since = info.history.last_epoch_clean;
3502 // make sure same_{up,primary}_since are <= same_interval_since
3503 info.history.same_up_since = std::min(
3504 info.history.same_up_since, info.history.same_interval_since);
3505 info.history.same_primary_since = std::min(
3506 info.history.same_primary_since, info.history.same_interval_since);
3507 }
3508 }
3509
3510 dirty_info = true;
3511 dirty_big_info = true;
3512 }
3513
3514 void PeeringState::start_split_stats(
3515 const set<spg_t>& childpgs, vector<object_stat_sum_t> *out)
3516 {
3517 out->resize(childpgs.size() + 1);
3518 info.stats.stats.sum.split(*out);
3519 }
3520
3521 void PeeringState::finish_split_stats(
3522 const object_stat_sum_t& stats, ObjectStore::Transaction &t)
3523 {
3524 info.stats.stats.sum = stats;
3525 write_if_dirty(t);
3526 }
3527
3528 void PeeringState::update_blocked_by()
3529 {
3530 // set a max on the number of blocking peers we report. if we go
3531 // over, report a random subset. keep the result sorted.
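  // (this is effectively selection sampling: each peer is kept with
  // probability keep/(skip+keep) over the remaining entries, which picks
  // a uniformly random subset while preserving the sorted order)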
3532 unsigned keep = std::min<unsigned>(
3533 blocked_by.size(), cct->_conf->osd_max_pg_blocked_by);
3534 unsigned skip = blocked_by.size() - keep;
3535 info.stats.blocked_by.clear();
3536 info.stats.blocked_by.resize(keep);
3537 unsigned pos = 0;
3538 for (auto p = blocked_by.begin(); p != blocked_by.end() && keep > 0; ++p) {
3539 if (skip > 0 && (rand() % (skip + keep) < skip)) {
3540 --skip;
3541 } else {
3542 info.stats.blocked_by[pos++] = *p;
3543 --keep;
3544 }
3545 }
3546 }
3547
3548 static bool find_shard(const set<pg_shard_t> & pgs, shard_id_t shard)
3549 {
3550 for (auto& p : pgs)
3551 if (p.shard == shard)
3552 return true;
3553 return false;
3554 }
3555
3556 static pg_shard_t get_another_shard(const set<pg_shard_t> & pgs, pg_shard_t skip, shard_id_t shard)
3557 {
3558 for (auto& p : pgs) {
3559 if (p == skip)
3560 continue;
3561 if (p.shard == shard)
3562 return p;
3563 }
3564 return pg_shard_t();
3565 }
3566
3567 void PeeringState::update_calc_stats()
3568 {
3569 info.stats.version = info.last_update;
3570 info.stats.created = info.history.epoch_created;
3571 info.stats.last_scrub = info.history.last_scrub;
3572 info.stats.last_scrub_stamp = info.history.last_scrub_stamp;
3573 info.stats.last_deep_scrub = info.history.last_deep_scrub;
3574 info.stats.last_deep_scrub_stamp = info.history.last_deep_scrub_stamp;
3575 info.stats.last_clean_scrub_stamp = info.history.last_clean_scrub_stamp;
3576 info.stats.last_epoch_clean = info.history.last_epoch_clean;
3577
3578 info.stats.log_size = pg_log.get_head().version - pg_log.get_tail().version;
3579 info.stats.ondisk_log_size = info.stats.log_size;
3580 info.stats.log_start = pg_log.get_tail();
3581 info.stats.ondisk_log_start = pg_log.get_tail();
3582 info.stats.snaptrimq_len = pl->get_snap_trimq_size();
3583
3584 unsigned num_shards = get_osdmap()->get_pg_size(info.pgid.pgid);
3585
3586 // In the rare case that upset is too large (usually transient), use it as
3587 // the target for the calculations below.
3588 unsigned target = std::max(num_shards, (unsigned)upset.size());
3589 // When undersized, actingset may be larger than upset (e.g., with OSDs out)
3590 unsigned nrep = std::max(actingset.size(), upset.size());
3591 // calc num_object_copies
3592 info.stats.stats.calc_copies(std::max(target, nrep));
3593 info.stats.stats.sum.num_objects_degraded = 0;
3594 info.stats.stats.sum.num_objects_unfound = 0;
3595 info.stats.stats.sum.num_objects_misplaced = 0;
3596 info.stats.avail_no_missing.clear();
3597 info.stats.object_location_counts.clear();
3598
3599 // We should never hit this condition, but if we end up hitting it,
3600 // make sure to update num_objects and set PG_STATE_INCONSISTENT.
3601 if (info.stats.stats.sum.num_objects < 0) {
3602 psdout(0) << __func__ << " negative num_objects = "
3603 << info.stats.stats.sum.num_objects << " setting it to 0 "
3604 << dendl;
3605 info.stats.stats.sum.num_objects = 0;
3606 state_set(PG_STATE_INCONSISTENT);
3607 }
3608
3609 if ((is_remapped() || is_undersized() || !is_clean()) &&
3610 (is_peered() || is_activating())) {
3611 psdout(20) << __func__ << " actingset " << actingset << " upset "
3612 << upset << " acting_recovery_backfill " << acting_recovery_backfill << dendl;
3613
3614 ceph_assert(!acting_recovery_backfill.empty());
3615
3616 bool estimate = false;
3617
3618 // NOTE: we only generate degraded, misplaced and unfound
3619 // values for the summation, not individual stat categories.
3620 int64_t num_objects = info.stats.stats.sum.num_objects;
3621
3622 // Objects missing from up nodes, sorted by # objects.
3623 boost::container::flat_set<pair<int64_t,pg_shard_t>> missing_target_objects;
3624 // Objects missing from nodes not in up, sorted by # objects
3625 boost::container::flat_set<pair<int64_t,pg_shard_t>> acting_source_objects;
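    // Both flat_sets order their pairs by missing count first (then by
    // shard), so begin() is the shard with the fewest missing objects and
    // rbegin() the one with the most.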
3626
3627 // Fill missing_target_objects/acting_source_objects
3628
3629 {
3630 int64_t missing;
3631
3632 // Primary first
3633 missing = pg_log.get_missing().num_missing();
3634 ceph_assert(acting_recovery_backfill.count(pg_whoami));
3635 if (upset.count(pg_whoami)) {
3636 missing_target_objects.emplace(missing, pg_whoami);
3637 } else {
3638 acting_source_objects.emplace(missing, pg_whoami);
3639 }
3640 info.stats.stats.sum.num_objects_missing_on_primary = missing;
3641 if (missing == 0)
3642 info.stats.avail_no_missing.push_back(pg_whoami);
3643 psdout(20) << __func__ << " shard " << pg_whoami
3644 << " primary objects " << num_objects
3645 << " missing " << missing
3646 << dendl;
3647 }
3648
3649 // All other peers
3650 for (auto& peer : peer_info) {
3651 // Primary should not be in the peer_info, skip if it is.
3652 if (peer.first == pg_whoami) continue;
3653 int64_t missing = 0;
3654 int64_t peer_num_objects =
3655 std::max((int64_t)0, peer.second.stats.stats.sum.num_objects);
3656 // Backfill targets always track num_objects accurately
3657 // all other peers track missing accurately.
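      // (for a backfill target, num_objects - peer_num_objects approximates
      // how many objects backfill still has to copy to that shard)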
3658 if (is_backfill_target(peer.first)) {
3659 missing = std::max((int64_t)0, num_objects - peer_num_objects);
3660 } else {
3661 if (peer_missing.count(peer.first)) {
3662 missing = peer_missing[peer.first].num_missing();
3663 } else {
3664 psdout(20) << __func__ << " no peer_missing found for "
3665 << peer.first << dendl;
3666 if (is_recovering()) {
3667 estimate = true;
3668 }
3669 missing = std::max((int64_t)0, num_objects - peer_num_objects);
3670 }
3671 }
3672 if (upset.count(peer.first)) {
3673 missing_target_objects.emplace(missing, peer.first);
3674 } else if (actingset.count(peer.first)) {
3675 acting_source_objects.emplace(missing, peer.first);
3676 }
3677 peer.second.stats.stats.sum.num_objects_missing = missing;
3678 if (missing == 0)
3679 info.stats.avail_no_missing.push_back(peer.first);
3680 psdout(20) << __func__ << " shard " << peer.first
3681 << " objects " << peer_num_objects
3682 << " missing " << missing
3683 << dendl;
3684 }
3685
3686 // Compute object_location_counts
3687 for (auto& ml: missing_loc.get_missing_locs()) {
3688 info.stats.object_location_counts[ml.second]++;
3689 psdout(30) << __func__ << " " << ml.first << " object_location_counts["
3690 << ml.second << "]=" << info.stats.object_location_counts[ml.second]
3691 << dendl;
3692 }
3693 int64_t not_missing = num_objects - missing_loc.get_missing_locs().size();
3694 if (not_missing) {
3695 // During recovery we know upset == actingset, and it is being populated
3696 // During backfill we know that all non-missing objects are in the actingset
3697 info.stats.object_location_counts[actingset] = not_missing;
3698 }
3699 psdout(30) << __func__ << " object_location_counts["
3700 << upset << "]=" << info.stats.object_location_counts[upset]
3701 << dendl;
3702 psdout(20) << __func__ << " object_location_counts "
3703 << info.stats.object_location_counts << dendl;
3704
3705 // A misplaced object is not stored on the correct OSD
3706 int64_t misplaced = 0;
3707 // a degraded object has fewer replicas or EC shards than the pool specifies.
3708 int64_t degraded = 0;
3709
3710 if (is_recovering()) {
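      // Worked example (replicated pool, NO_SHARD): with upset.size() == 3,
      // an object present on 1 up OSD (ml.first.up == 1) and on 1 stray OSD
      // (ml.first.other == 1) gives missing_shards = 3 - 1 = 2, hence
      // odegraded = 2; one of the two missing copies exists elsewhere, so
      // omisplaced = 1 and the object counts as 1 misplaced + 1 degraded.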
3711 for (auto& sml: missing_loc.get_missing_by_count()) {
3712 for (auto& ml: sml.second) {
3713 int missing_shards;
3714 if (sml.first == shard_id_t::NO_SHARD) {
3715 psdout(20) << __func__ << " ml " << ml.second
3716 << " upset size " << upset.size()
3717 << " up " << ml.first.up << dendl;
3718 missing_shards = (int)upset.size() - ml.first.up;
3719 } else {
3720 // Handle shards not even in upset below
3721 if (!find_shard(upset, sml.first))
3722 continue;
3723 missing_shards = std::max(0, 1 - ml.first.up);
3724 psdout(20) << __func__
3725 << " shard " << sml.first
3726 << " ml " << ml.second
3727 << " missing shards " << missing_shards << dendl;
3728 }
3729 int odegraded = ml.second * missing_shards;
3730 // Copies on other osds but limited to the possible degraded
3731 int more_osds = std::min(missing_shards, ml.first.other);
3732 int omisplaced = ml.second * more_osds;
3733 ceph_assert(omisplaced <= odegraded);
3734 odegraded -= omisplaced;
3735
3736 misplaced += omisplaced;
3737 degraded += odegraded;
3738 }
3739 }
3740
3741 psdout(20) << __func__ << " missing based degraded "
3742 << degraded << dendl;
3743 psdout(20) << __func__ << " missing based misplaced "
3744 << misplaced << dendl;
3745
3746 // Handle undersized case
3747 if (pool.info.is_replicated()) {
3748 // Add degraded for missing targets (num_objects missing)
3749 ceph_assert(target >= upset.size());
3750 unsigned needed = target - upset.size();
3751 degraded += num_objects * needed;
3752 } else {
3753 for (unsigned i = 0 ; i < num_shards; ++i) {
3754 shard_id_t shard(i);
3755
3756 if (!find_shard(upset, shard)) {
3757 pg_shard_t pgs = get_another_shard(actingset, pg_shard_t(), shard);
3758
3759 if (pgs != pg_shard_t()) {
3760 int64_t missing;
3761
3762 if (pgs == pg_whoami)
3763 missing = info.stats.stats.sum.num_objects_missing_on_primary;
3764 else
3765 missing = peer_info[pgs].stats.stats.sum.num_objects_missing;
3766
3767 degraded += missing;
3768 misplaced += std::max((int64_t)0, num_objects - missing);
3769 } else {
3770 // No shard anywhere
3771 degraded += num_objects;
3772 }
3773 }
3774 }
3775 }
3776 goto out;
3777 }
3778
3779 // Handle undersized case
3780 if (pool.info.is_replicated()) {
3781 // Add to missing_target_objects
3782 ceph_assert(target >= missing_target_objects.size());
3783 unsigned needed = target - missing_target_objects.size();
3784 if (needed)
3785 missing_target_objects.emplace(num_objects * needed, pg_shard_t(pg_shard_t::NO_OSD));
3786 } else {
3787 for (unsigned i = 0 ; i < num_shards; ++i) {
3788 shard_id_t shard(i);
3789 bool found = false;
3790 for (const auto& t : missing_target_objects) {
3791 if (std::get<1>(t).shard == shard) {
3792 found = true;
3793 break;
3794 }
3795 }
3796 if (!found)
3797 missing_target_objects.emplace(num_objects, pg_shard_t(pg_shard_t::NO_OSD,shard));
3798 }
3799 }
3800
3801 for (const auto& item : missing_target_objects)
3802 psdout(20) << __func__ << " missing shard " << std::get<1>(item)
3803 << " missing= " << std::get<0>(item) << dendl;
3804 for (const auto& item : acting_source_objects)
3805 psdout(20) << __func__ << " acting shard " << std::get<1>(item)
3806 << " missing= " << std::get<0>(item) << dendl;
3807
3808 // Handle all objects not in missing for remapped
3809 // or backfill
3810 for (auto m = missing_target_objects.rbegin();
3811 m != missing_target_objects.rend(); ++m) {
3812
3813 int64_t extra_missing = -1;
3814
3815 if (pool.info.is_replicated()) {
3816 if (!acting_source_objects.empty()) {
3817 auto extra_copy = acting_source_objects.begin();
3818 extra_missing = std::get<0>(*extra_copy);
3819 acting_source_objects.erase(extra_copy);
3820 }
3821 } else { // Erasure coded
3822 // Use corresponding shard
3823 for (const auto& a : acting_source_objects) {
3824 if (std::get<1>(a).shard == std::get<1>(*m).shard) {
3825 extra_missing = std::get<0>(a);
3826 acting_source_objects.erase(a);
3827 break;
3828 }
3829 }
3830 }
3831
3832 if (extra_missing >= 0 && std::get<0>(*m) >= extra_missing) {
3833 // We don't know which of the objects on the target
3834 // are part of extra_missing, so assume they are all degraded.
3835 misplaced += std::get<0>(*m) - extra_missing;
3836 degraded += extra_missing;
3837 } else {
3838 // 1. extra_missing == -1: more targets than sources, so degraded
3839 // 2. extra_missing > std::get<0>(*m): some of the extra_missing objects
3840 //    that were previously degraded are now present on the target.
3841 degraded += std::get<0>(*m);
3842 }
3843 }
3844 // If there are still acting that haven't been accounted for
3845 // then they are misplaced
3846 for (const auto& a : acting_source_objects) {
3847 int64_t extra_misplaced = std::max((int64_t)0, num_objects - std::get<0>(a));
3848 psdout(20) << __func__ << " extra acting misplaced " << extra_misplaced
3849 << dendl;
3850 misplaced += extra_misplaced;
3851 }
3852 out:
3853 // NOTE: Tests use these messages to verify this code
3854 psdout(20) << __func__ << " degraded " << degraded
3855 << (estimate ? " (est)": "") << dendl;
3856 psdout(20) << __func__ << " misplaced " << misplaced
3857 << (estimate ? " (est)": "")<< dendl;
3858
3859 info.stats.stats.sum.num_objects_degraded = degraded;
3860 info.stats.stats.sum.num_objects_unfound = get_num_unfound();
3861 info.stats.stats.sum.num_objects_misplaced = misplaced;
3862 }
3863 }
3864
3865 std::optional<pg_stat_t> PeeringState::prepare_stats_for_publish(
3866 bool pg_stats_publish_valid,
3867 const pg_stat_t &pg_stats_publish,
3868 const object_stat_collection_t &unstable_stats)
3869 {
3870 if (info.stats.stats.sum.num_scrub_errors) {
3871 state_set(PG_STATE_INCONSISTENT);
3872 } else {
3873 state_clear(PG_STATE_INCONSISTENT);
3874 state_clear(PG_STATE_FAILED_REPAIR);
3875 }
3876
3877 utime_t now = ceph_clock_now();
3878 if (info.stats.state != state) {
3879 info.stats.last_change = now;
3880 // Optimistic estimate: if we just found out the PG is inactive,
3881 // assume it was active until now.
3882 if (!(state & PG_STATE_ACTIVE) &&
3883 (info.stats.state & PG_STATE_ACTIVE))
3884 info.stats.last_active = now;
3885
3886 if ((state & PG_STATE_ACTIVE) &&
3887 !(info.stats.state & PG_STATE_ACTIVE))
3888 info.stats.last_became_active = now;
3889 if ((state & (PG_STATE_ACTIVE|PG_STATE_PEERED)) &&
3890 !(info.stats.state & (PG_STATE_ACTIVE|PG_STATE_PEERED)))
3891 info.stats.last_became_peered = now;
3892 info.stats.state = state;
3893 }
3894
3895 update_calc_stats();
3896 if (info.stats.stats.sum.num_objects_degraded) {
3897 state_set(PG_STATE_DEGRADED);
3898 } else {
3899 state_clear(PG_STATE_DEGRADED);
3900 }
3901 update_blocked_by();
3902
3903 pg_stat_t pre_publish = info.stats;
3904 pre_publish.stats.add(unstable_stats);
3905 utime_t cutoff = now;
3906 cutoff -= cct->_conf->osd_pg_stat_report_interval_max;
3907
3908 // share (some of) our purged_snaps via the pg_stats. limit # of intervals
3909 // because we don't want to make the pg_stat_t structures too expensive.
3910 unsigned max = cct->_conf->osd_max_snap_prune_intervals_per_epoch;
3911 unsigned num = 0;
3912 auto i = info.purged_snaps.begin();
3913 while (num < max && i != info.purged_snaps.end()) {
3914 pre_publish.purged_snaps.insert(i.get_start(), i.get_len());
3915 ++num;
3916 ++i;
3917 }
3918 psdout(20) << __func__ << " reporting purged_snaps "
3919 << pre_publish.purged_snaps << dendl;
3920
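  // Publish only if something changed or the last report has gone stale:
  // even unchanged stats are re-reported once last_fresh falls behind the
  // osd_pg_stat_report_interval_max cutoff computed above.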
3921 if (pg_stats_publish_valid && pre_publish == pg_stats_publish &&
3922 info.stats.last_fresh > cutoff) {
3923 psdout(15) << "publish_stats_to_osd " << pg_stats_publish.reported_epoch
3924 << ": no change since " << info.stats.last_fresh << dendl;
3925 return std::nullopt;
3926 } else {
3927 // update our stat summary and timestamps
3928 info.stats.reported_epoch = get_osdmap_epoch();
3929 ++info.stats.reported_seq;
3930
3931 info.stats.last_fresh = now;
3932
3933 if (info.stats.state & PG_STATE_CLEAN)
3934 info.stats.last_clean = now;
3935 if (info.stats.state & PG_STATE_ACTIVE)
3936 info.stats.last_active = now;
3937 if (info.stats.state & (PG_STATE_ACTIVE|PG_STATE_PEERED))
3938 info.stats.last_peered = now;
3939 info.stats.last_unstale = now;
3940 if ((info.stats.state & PG_STATE_DEGRADED) == 0)
3941 info.stats.last_undegraded = now;
3942 if ((info.stats.state & PG_STATE_UNDERSIZED) == 0)
3943 info.stats.last_fullsized = now;
3944
3945 psdout(15) << "publish_stats_to_osd " << pg_stats_publish.reported_epoch
3946 << ":" << pg_stats_publish.reported_seq << dendl;
3947 return std::make_optional(std::move(pre_publish));
3948 }
3949 }
3950
3951 void PeeringState::init(
3952 int role,
3953 const vector<int>& newup, int new_up_primary,
3954 const vector<int>& newacting, int new_acting_primary,
3955 const pg_history_t& history,
3956 const PastIntervals& pi,
3957 bool backfill,
3958 ObjectStore::Transaction &t)
3959 {
3960 psdout(10) << "init role " << role << " up "
3961 << newup << " acting " << newacting
3962 << " history " << history
3963 << " past_intervals " << pi
3964 << dendl;
3965
3966 set_role(role);
3967 init_primary_up_acting(
3968 newup,
3969 newacting,
3970 new_up_primary,
3971 new_acting_primary);
3972
3973 info.history = history;
3974 past_intervals = pi;
3975
3976 info.stats.up = up;
3977 info.stats.up_primary = new_up_primary;
3978 info.stats.acting = acting;
3979 info.stats.acting_primary = new_acting_primary;
3980 info.stats.mapping_epoch = info.history.same_interval_since;
3981
3982 if (!perform_deletes_during_peering()) {
3983 pg_log.set_missing_may_contain_deletes();
3984 }
3985
3986 if (backfill) {
3987 psdout(10) << __func__ << ": Setting backfill" << dendl;
3988 info.set_last_backfill(hobject_t());
3989 info.last_complete = info.last_update;
3990 pg_log.mark_log_for_rewrite();
3991 }
3992
3993 on_new_interval();
3994
3995 dirty_info = true;
3996 dirty_big_info = true;
3997 write_if_dirty(t);
3998 }
3999
4000 void PeeringState::dump_peering_state(Formatter *f)
4001 {
4002 f->dump_string("state", get_pg_state_string());
4003 f->dump_unsigned("epoch", get_osdmap_epoch());
4004 f->open_array_section("up");
4005 for (auto p = up.begin(); p != up.end(); ++p)
4006 f->dump_unsigned("osd", *p);
4007 f->close_section();
4008 f->open_array_section("acting");
4009 for (auto p = acting.begin(); p != acting.end(); ++p)
4010 f->dump_unsigned("osd", *p);
4011 f->close_section();
4012 if (!backfill_targets.empty()) {
4013 f->open_array_section("backfill_targets");
4014 for (auto p = backfill_targets.begin(); p != backfill_targets.end(); ++p)
4015 f->dump_stream("shard") << *p;
4016 f->close_section();
4017 }
4018 if (!async_recovery_targets.empty()) {
4019 f->open_array_section("async_recovery_targets");
4020 for (auto p = async_recovery_targets.begin();
4021 p != async_recovery_targets.end();
4022 ++p)
4023 f->dump_stream("shard") << *p;
4024 f->close_section();
4025 }
4026 if (!acting_recovery_backfill.empty()) {
4027 f->open_array_section("acting_recovery_backfill");
4028 for (auto p = acting_recovery_backfill.begin();
4029 p != acting_recovery_backfill.end();
4030 ++p)
4031 f->dump_stream("shard") << *p;
4032 f->close_section();
4033 }
4034 f->open_object_section("info");
4035 update_calc_stats();
4036 info.dump(f);
4037 f->close_section();
4038
4039 f->open_array_section("peer_info");
4040 for (auto p = peer_info.begin(); p != peer_info.end(); ++p) {
4041 f->open_object_section("info");
4042 f->dump_stream("peer") << p->first;
4043 p->second.dump(f);
4044 f->close_section();
4045 }
4046 f->close_section();
4047 }
4048
4049 void PeeringState::update_stats(
4050 std::function<bool(pg_history_t &, pg_stat_t &)> f,
4051 ObjectStore::Transaction *t) {
4052 if (f(info.history, info.stats)) {
4053 pl->publish_stats_to_osd();
4054 }
4055 pl->on_info_history_change();
4056
4057 if (t) {
4058 dirty_info = true;
4059 write_if_dirty(*t);
4060 }
4061 }
4062
4063 bool PeeringState::append_log_entries_update_missing(
4064 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
4065 ObjectStore::Transaction &t, std::optional<eversion_t> trim_to,
4066 std::optional<eversion_t> roll_forward_to)
4067 {
4068 ceph_assert(!entries.empty());
4069 ceph_assert(entries.begin()->version > info.last_update);
4070
4071 PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
4072 bool invalidate_stats =
4073 pg_log.append_new_log_entries(
4074 info.last_backfill,
4075 entries,
4076 rollbacker.get());
4077
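  // As in append_log(), we must not leave rollforward artifacts around
  // past last_backfill, so roll the whole log forward if the newest entry
  // is beyond it; independently, advance the crt to roll_forward_to if
  // requested.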
4078 if (roll_forward_to && entries.rbegin()->soid > info.last_backfill) {
4079 pg_log.roll_forward(rollbacker.get());
4080 }
4081 if (roll_forward_to && *roll_forward_to > pg_log.get_can_rollback_to()) {
4082 pg_log.roll_forward_to(*roll_forward_to, rollbacker.get());
4083 last_rollback_info_trimmed_to_applied = *roll_forward_to;
4084 }
4085
4086 info.last_update = pg_log.get_head();
4087
4088 if (pg_log.get_missing().num_missing() == 0) {
4089 // advance last_complete since nothing else is missing!
4090 info.last_complete = info.last_update;
4091 }
4092 info.stats.stats_invalid = info.stats.stats_invalid || invalidate_stats;
4093
4094 psdout(20) << __func__ << " trim_to bool = " << bool(trim_to)
4095 << " trim_to = " << (trim_to ? *trim_to : eversion_t()) << dendl;
4096 if (trim_to)
4097 pg_log.trim(*trim_to, info);
4098 dirty_info = true;
4099 write_if_dirty(t);
4100 return invalidate_stats;
4101 }
4102
4103 void PeeringState::merge_new_log_entries(
4104 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
4105 ObjectStore::Transaction &t,
4106 std::optional<eversion_t> trim_to,
4107 std::optional<eversion_t> roll_forward_to)
4108 {
4109 psdout(10) << __func__ << " " << entries << dendl;
4110 ceph_assert(is_primary());
4111
4112 bool rebuild_missing = append_log_entries_update_missing(entries, t, trim_to, roll_forward_to);
4113 for (auto i = acting_recovery_backfill.begin();
4114 i != acting_recovery_backfill.end();
4115 ++i) {
4116 pg_shard_t peer(*i);
4117 if (peer == pg_whoami) continue;
4118 ceph_assert(peer_missing.count(peer));
4119 ceph_assert(peer_info.count(peer));
4120 pg_missing_t& pmissing(peer_missing[peer]);
4121 psdout(20) << __func__ << " peer_missing for " << peer
4122 << " = " << pmissing << dendl;
4123 pg_info_t& pinfo(peer_info[peer]);
4124 bool invalidate_stats = PGLog::append_log_entries_update_missing(
4125 pinfo.last_backfill,
4126 entries,
4127 true,
4128 NULL,
4129 pmissing,
4130 NULL,
4131 dpp);
4132 pinfo.last_update = info.last_update;
4133 pinfo.stats.stats_invalid = pinfo.stats.stats_invalid || invalidate_stats;
4134 rebuild_missing = rebuild_missing || invalidate_stats;
4135 }
4136
4137 if (!rebuild_missing) {
4138 return;
4139 }
4140
4141 for (auto &&i: entries) {
4142 missing_loc.rebuild(
4143 i.soid,
4144 pg_whoami,
4145 acting_recovery_backfill,
4146 info,
4147 pg_log.get_missing(),
4148 peer_missing,
4149 peer_info);
4150 }
4151 }
4152
4153 void PeeringState::add_log_entry(const pg_log_entry_t& e, bool applied)
4154 {
4155 // raise last_complete only if we were previously up to date
4156 if (info.last_complete == info.last_update)
4157 info.last_complete = e.version;
4158
4159 // raise last_update.
4160 ceph_assert(e.version > info.last_update);
4161 info.last_update = e.version;
4162
4163 // raise user_version, if it increased (it may not have been bumped
4164 // by all logged updates)
4165 if (e.user_version > info.last_user_version)
4166 info.last_user_version = e.user_version;
4167
4168 // log mutation
4169 pg_log.add(e, applied);
4170 psdout(10) << "add_log_entry " << e << dendl;
4171 }
4172
4173
4174 void PeeringState::append_log(
4175 vector<pg_log_entry_t>&& logv,
4176 eversion_t trim_to,
4177 eversion_t roll_forward_to,
4178 eversion_t mlcod,
4179 ObjectStore::Transaction &t,
4180 bool transaction_applied,
4181 bool async)
4182 {
4183 /* The primary has sent an info updating the history, but it may not
4184 * have arrived yet. We want to make sure that we cannot remember this
4185 * write without remembering that it happened in an interval which went
4186 * active in epoch history.last_epoch_started.
4187 */
4188 if (info.last_epoch_started != info.history.last_epoch_started) {
4189 info.history.last_epoch_started = info.last_epoch_started;
4190 }
4191 if (info.last_interval_started != info.history.last_interval_started) {
4192 info.history.last_interval_started = info.last_interval_started;
4193 }
4194 psdout(10) << "append_log " << pg_log.get_log() << " " << logv << dendl;
4195
4196 PGLog::LogEntryHandlerRef handler{pl->get_log_handler(t)};
4197 if (!transaction_applied) {
4198 /* We must be a backfill or async recovery peer, so it's ok if we apply
4199 * out-of-turn since we won't be considered when
4200 * determining a min possible last_update.
4201 *
4202 * We skip_rollforward() here, which advances the crt, without
4203 * doing an actual rollforward. This avoids cleaning up entries
4204 * from the backend and we do not end up in a situation, where the
4205 * object is deleted before we can _merge_object_divergent_entries().
4206 */
4207 pg_log.skip_rollforward();
4208 }
4209
4210 for (auto p = logv.begin(); p != logv.end(); ++p) {
4211 add_log_entry(*p, transaction_applied);
4212
4213 /* We don't want to leave the rollforward artifacts around
4214 * here past last_backfill. It's ok for the same reason as
4215 * above */
4216 if (transaction_applied &&
4217 p->soid > info.last_backfill) {
4218 pg_log.roll_forward(handler.get());
4219 }
4220 }
4221 if (transaction_applied && roll_forward_to > pg_log.get_can_rollback_to()) {
4222 pg_log.roll_forward_to(
4223 roll_forward_to,
4224 handler.get());
4225 last_rollback_info_trimmed_to_applied = roll_forward_to;
4226 }
4227
4228 psdout(10) << __func__ << " approx pg log length = "
4229 << pg_log.get_log().approx_size() << dendl;
4230 psdout(10) << __func__ << " transaction_applied = "
4231 << transaction_applied << dendl;
4232 if (!transaction_applied || async)
4233 psdout(10) << __func__ << " " << pg_whoami
4234 << " is async_recovery or backfill target" << dendl;
4235 pg_log.trim(trim_to, info, transaction_applied, async);
4236
4237 // update the local pg, pg log
4238 dirty_info = true;
4239 write_if_dirty(t);
4240
4241 if (!is_primary())
4242 min_last_complete_ondisk = mlcod;
4243 }
4244
4245 void PeeringState::recover_got(
4246 const hobject_t &oid, eversion_t v,
4247 bool is_delete,
4248 ObjectStore::Transaction &t)
4249 {
4250 if (v > pg_log.get_can_rollback_to()) {
4251 /* This can only happen during a repair, and even then, it would
4252 * be one heck of a race. If we are repairing the object, the
4253 * write in question must be fully committed, so it's not valid
4254 * to roll it back anyway (and we'll be rolled forward shortly
4255 * anyway) */
4256 PGLog::LogEntryHandlerRef handler{pl->get_log_handler(t)};
4257 pg_log.roll_forward_to(v, handler.get());
4258 }
4259
4260 psdout(10) << "got missing " << oid << " v " << v << dendl;
4261 pg_log.recover_got(oid, v, info);
4262 if (pg_log.get_log().log.empty()) {
4263 psdout(10) << "last_complete now " << info.last_complete
4264 << " while log is empty" << dendl;
4265 } else if (pg_log.get_log().complete_to != pg_log.get_log().log.end()) {
4266 psdout(10) << "last_complete now " << info.last_complete
4267 << " log.complete_to " << pg_log.get_log().complete_to->version
4268 << dendl;
4269 } else {
4270 psdout(10) << "last_complete now " << info.last_complete
4271 << " log.complete_to at end" << dendl;
4272 // below is not true in the repair case.
4273 //assert(missing.num_missing() == 0); // otherwise, complete_to was wrong.
4274 ceph_assert(info.last_complete == info.last_update);
4275 }
4276
4277 if (is_primary()) {
4278 ceph_assert(missing_loc.needs_recovery(oid));
4279 if (!is_delete)
4280 missing_loc.add_location(oid, pg_whoami);
4281 }
4282
4283 // update pg
4284 dirty_info = true;
4285 write_if_dirty(t);
4286 }
4287
4288 void PeeringState::update_backfill_progress(
4289 const hobject_t &updated_backfill,
4290 const pg_stat_t &updated_stats,
4291 bool preserve_local_num_bytes,
4292 ObjectStore::Transaction &t) {
4293 info.set_last_backfill(updated_backfill);
4294 if (preserve_local_num_bytes) {
4295 psdout(25) << __func__ << " primary " << updated_stats.stats.sum.num_bytes
4296 << " local " << info.stats.stats.sum.num_bytes << dendl;
4297 int64_t bytes = info.stats.stats.sum.num_bytes;
4298 info.stats = updated_stats;
4299 info.stats.stats.sum.num_bytes = bytes;
4300 } else {
4301 psdout(20) << __func__ << " final " << updated_stats.stats.sum.num_bytes
4302 << " replaces local " << info.stats.stats.sum.num_bytes << dendl;
4303 info.stats = updated_stats;
4304 }
4305
4306 dirty_info = true;
4307 write_if_dirty(t);
4308 }
4309
4310 void PeeringState::adjust_purged_snaps(
4311 std::function<void(interval_set<snapid_t> &snaps)> f) {
4312 f(info.purged_snaps);
4313 dirty_info = true;
4314 dirty_big_info = true;
4315 }
4316
4317 void PeeringState::on_peer_recover(
4318 pg_shard_t peer,
4319 const hobject_t &soid,
4320 const eversion_t &version)
4321 {
4322 pl->publish_stats_to_osd();
4323 // done!
4324 peer_missing[peer].got(soid, version);
4325 missing_loc.add_location(soid, peer);
4326 }
4327
4328 void PeeringState::begin_peer_recover(
4329 pg_shard_t peer,
4330 const hobject_t soid)
4331 {
4332 peer_missing[peer].revise_have(soid, eversion_t());
4333 }
4334
4335 void PeeringState::force_object_missing(
4336 const set<pg_shard_t> &peers,
4337 const hobject_t &soid,
4338 eversion_t version)
4339 {
4340 for (auto &&peer : peers) {
4341 if (peer != primary) {
4342 peer_missing[peer].add(soid, version, eversion_t(), false);
4343 } else {
4344 pg_log.missing_add(soid, version, eversion_t());
4345 pg_log.reset_complete_to(&info);
4346 pg_log.set_last_requested(0);
4347 }
4348 }
4349
4350 missing_loc.rebuild(
4351 soid,
4352 pg_whoami,
4353 acting_recovery_backfill,
4354 info,
4355 pg_log.get_missing(),
4356 peer_missing,
4357 peer_info);
4358 }
4359
4360 void PeeringState::pre_submit_op(
4361 const hobject_t &hoid,
4362 const vector<pg_log_entry_t>& logv,
4363 eversion_t at_version)
4364 {
4365 if (at_version > eversion_t()) {
4366 for (auto &&i : get_acting_recovery_backfill()) {
4367 if (i == primary) continue;
4368 pg_info_t &pinfo = peer_info[i];
4369 // keep peer_info up to date
4370 if (pinfo.last_complete == pinfo.last_update)
4371 pinfo.last_complete = at_version;
4372 pinfo.last_update = at_version;
4373 }
4374 }
4375
4376 bool requires_missing_loc = false;
4377 for (auto &&i : get_async_recovery_targets()) {
4378 if (i == primary || !get_peer_missing(i).is_missing(hoid))
4379 continue;
4380 requires_missing_loc = true;
4381 for (auto &&entry: logv) {
4382 peer_missing[i].add_next_event(entry);
4383 }
4384 }
4385
4386 if (requires_missing_loc) {
4387 for (auto &&entry: logv) {
4388 psdout(30) << __func__ << " missing_loc before: "
4389 << missing_loc.get_locations(entry.soid) << dendl;
4390 missing_loc.add_missing(entry.soid, entry.version,
4391 eversion_t(), entry.is_delete());
4392 // clear out missing_loc
4393 missing_loc.clear_location(entry.soid);
4394 for (auto &i: get_actingset()) {
4395 if (!get_peer_missing(i).is_missing(entry.soid))
4396 missing_loc.add_location(entry.soid, i);
4397 }
4398 psdout(30) << __func__ << " missing_loc after: "
4399 << missing_loc.get_locations(entry.soid) << dendl;
4400 }
4401 }
4402 }
4403
4404 void PeeringState::recovery_committed_to(eversion_t version)
4405 {
4406 psdout(10) << __func__ << " version " << version
4407 << " now ondisk" << dendl;
4408 last_complete_ondisk = version;
4409
4410 if (last_complete_ondisk == info.last_update) {
4411 if (!is_primary()) {
4412 // Either we are a replica or backfill target.
4413 // we are fully up to date. tell the primary!
4414 pl->send_cluster_message(
4415 get_primary().osd,
4416 make_message<MOSDPGTrim>(
4417 get_osdmap_epoch(),
4418 spg_t(info.pgid.pgid, primary.shard),
4419 last_complete_ondisk),
4420 get_osdmap_epoch());
4421 } else {
4422 calc_min_last_complete_ondisk();
4423 }
4424 }
4425 }
4426
4427 void PeeringState::complete_write(eversion_t v, eversion_t lc)
4428 {
4429 last_update_ondisk = v;
4430 last_complete_ondisk = lc;
4431 calc_min_last_complete_ondisk();
4432 }
4433
4434 void PeeringState::calc_trim_to()
4435 {
4436 size_t target = pl->get_target_pg_log_entries();
4437
4438 eversion_t limit = std::min(
4439 min_last_complete_ondisk,
4440 pg_log.get_can_rollback_to());
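  // Never trim entries that a replica may still need to catch up
  // (min_last_complete_ondisk) or that may still be needed for rollback
  // (can_rollback_to).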
4441 if (limit != eversion_t() &&
4442 limit != pg_trim_to &&
4443 pg_log.get_log().approx_size() > target) {
4444 size_t num_to_trim = std::min(pg_log.get_log().approx_size() - target,
4445 cct->_conf->osd_pg_log_trim_max);
4446 if (num_to_trim < cct->_conf->osd_pg_log_trim_min &&
4447 cct->_conf->osd_pg_log_trim_max >= cct->_conf->osd_pg_log_trim_min) {
4448 return;
4449 }
4450 auto it = pg_log.get_log().log.begin();
4451 eversion_t new_trim_to;
4452 for (size_t i = 0; i < num_to_trim; ++i) {
4453 new_trim_to = it->version;
4454 ++it;
4455 if (new_trim_to > limit) {
4456 new_trim_to = limit;
4457 psdout(10) << "calc_trim_to trimming to min_last_complete_ondisk" << dendl;
4458 break;
4459 }
4460 }
4461 psdout(10) << "calc_trim_to " << pg_trim_to << " -> " << new_trim_to << dendl;
4462 pg_trim_to = new_trim_to;
4463 ceph_assert(pg_trim_to <= pg_log.get_head());
4464 ceph_assert(pg_trim_to <= min_last_complete_ondisk);
4465 }
4466 }
4467
4468 void PeeringState::calc_trim_to_aggressive()
4469 {
4470 size_t target = pl->get_target_pg_log_entries();
4471
4472 // limit pg log trimming up to the can_rollback_to value
4473 eversion_t limit = std::min({
4474 pg_log.get_head(),
4475 pg_log.get_can_rollback_to(),
4476 last_update_ondisk});
4477 psdout(10) << __func__ << " limit = " << limit << dendl;
4478
4479 if (limit != eversion_t() &&
4480 limit != pg_trim_to &&
4481 pg_log.get_log().approx_size() > target) {
4482 psdout(10) << __func__ << " approx pg log length = "
4483 << pg_log.get_log().approx_size() << dendl;
4484 uint64_t num_to_trim = std::min<uint64_t>(pg_log.get_log().approx_size() - target,
4485 cct->_conf->osd_pg_log_trim_max);
4486 psdout(10) << __func__ << " num_to_trim = " << num_to_trim << dendl;
4487 if (num_to_trim < cct->_conf->osd_pg_log_trim_min &&
4488 cct->_conf->osd_pg_log_trim_max >= cct->_conf->osd_pg_log_trim_min) {
4489 return;
4490 }
4491 auto it = pg_log.get_log().log.begin(); // oldest log entry
4492 auto rit = pg_log.get_log().log.rbegin();
4493 eversion_t by_n_to_keep; // start from tail
4494 eversion_t by_n_to_trim = eversion_t::max(); // start from head
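    // One pass, two cursors: rit (from the head) finds the version that
    // keeps the newest `target` entries, while it (from the tail) finds the
    // version reached after trimming num_to_trim entries; the final trim
    // point below is the most conservative of the two, capped at limit.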
4495 for (size_t i = 0; it != pg_log.get_log().log.end(); ++it, ++rit) {
4496 i++;
4497 if (i > target && by_n_to_keep == eversion_t()) {
4498 by_n_to_keep = rit->version;
4499 }
4500 if (i >= num_to_trim && by_n_to_trim == eversion_t::max()) {
4501 by_n_to_trim = it->version;
4502 }
4503 if (by_n_to_keep != eversion_t() &&
4504 by_n_to_trim != eversion_t::max()) {
4505 break;
4506 }
4507 }
4508
4509 if (by_n_to_keep == eversion_t()) {
4510 return;
4511 }
4512
4513 pg_trim_to = std::min({by_n_to_keep, by_n_to_trim, limit});
4514 psdout(10) << __func__ << " pg_trim_to now " << pg_trim_to << dendl;
4515 ceph_assert(pg_trim_to <= pg_log.get_head());
4516 }
4517 }
4518
4519 void PeeringState::apply_op_stats(
4520 const hobject_t &soid,
4521 const object_stat_sum_t &delta_stats)
4522 {
4523 info.stats.stats.add(delta_stats);
4524 info.stats.stats.floor(0);
4525
4526 for (auto i = get_backfill_targets().begin();
4527 i != get_backfill_targets().end();
4528 ++i) {
4529 pg_shard_t bt = *i;
4530 pg_info_t& pinfo = peer_info[bt];
4531 if (soid <= pinfo.last_backfill)
4532 pinfo.stats.stats.add(delta_stats);
4533 }
4534 }
4535
4536 void PeeringState::update_complete_backfill_object_stats(
4537 const hobject_t &hoid,
4538 const pg_stat_t &stats)
4539 {
4540 for (auto &&bt: get_backfill_targets()) {
4541 pg_info_t& pinfo = peer_info[bt];
4542 //Add stats to all peers that were missing object
4543 if (hoid > pinfo.last_backfill)
4544 pinfo.stats.add(stats);
4545 }
4546 }
4547
4548 void PeeringState::update_peer_last_backfill(
4549 pg_shard_t peer,
4550 const hobject_t &new_last_backfill)
4551 {
4552 pg_info_t &pinfo = peer_info[peer];
4553 pinfo.last_backfill = new_last_backfill;
4554 if (new_last_backfill.is_max()) {
4555 /* pinfo.stats might be wrong if we did log-based recovery on the
4556 * backfilled portion in addition to continuing backfill.
4557 */
4558 pinfo.stats = info.stats;
4559 }
4560 }
4561
4562 void PeeringState::set_revert_with_targets(
4563 const hobject_t &soid,
4564 const set<pg_shard_t> &good_peers)
4565 {
4566 for (auto &&peer: good_peers) {
4567 missing_loc.add_location(soid, peer);
4568 }
4569 }
4570
4571 void PeeringState::prepare_backfill_for_missing(
4572 const hobject_t &soid,
4573 const eversion_t &version,
4574 const vector<pg_shard_t> &targets) {
4575 for (auto &&peer: targets) {
4576 peer_missing[peer].add(soid, version, eversion_t(), false);
4577 }
4578 }
4579
4580 void PeeringState::update_hset(const pg_hit_set_history_t &hset_history)
4581 {
4582 info.hit_set = hset_history;
4583 }
4584
4585 /*------------ Peering State Machine----------------*/
4586 #undef dout_prefix
4587 #define dout_prefix (context< PeeringMachine >().dpp->gen_prefix(*_dout) \
4588 << "state<" << get_state_name() << ">: ")
4589 #undef psdout
4590 #define psdout(x) ldout(context< PeeringMachine >().cct, x)
4591
4592 #define DECLARE_LOCALS \
4593 PeeringState *ps = context< PeeringMachine >().state; \
4594 std::ignore = ps; \
4595 PeeringListener *pl = context< PeeringMachine >().pl; \
4596 std::ignore = pl
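// DECLARE_LOCALS brings the PeeringState (ps) and PeeringListener (pl) of
// the enclosing PeeringMachine into scope for the state reactors below;
// the std::ignore assignments silence unused-variable warnings in handlers
// that use only one of them.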
4597
4598
4599 /*------Crashed-------*/
4600 PeeringState::Crashed::Crashed(my_context ctx)
4601 : my_base(ctx),
4602 NamedState(context< PeeringMachine >().state_history, "Crashed")
4603 {
4604 context< PeeringMachine >().log_enter(state_name);
4605 ceph_abort_msg("we got a bad state machine event");
4606 }
4607
4608
4609 /*------Initial-------*/
4610 PeeringState::Initial::Initial(my_context ctx)
4611 : my_base(ctx),
4612 NamedState(context< PeeringMachine >().state_history, "Initial")
4613 {
4614 context< PeeringMachine >().log_enter(state_name);
4615 }
4616
4617 boost::statechart::result PeeringState::Initial::react(const MNotifyRec& notify)
4618 {
4619 DECLARE_LOCALS;
4620 ps->proc_replica_info(
4621 notify.from, notify.notify.info, notify.notify.epoch_sent);
4622 ps->set_last_peering_reset();
4623 return transit< Primary >();
4624 }
4625
4626 boost::statechart::result PeeringState::Initial::react(const MInfoRec& i)
4627 {
4628 DECLARE_LOCALS;
4629 ceph_assert(!ps->is_primary());
4630 post_event(i);
4631 return transit< Stray >();
4632 }
4633
4634 boost::statechart::result PeeringState::Initial::react(const MLogRec& i)
4635 {
4636 DECLARE_LOCALS;
4637 ceph_assert(!ps->is_primary());
4638 post_event(i);
4639 return transit< Stray >();
4640 }
4641
4642 void PeeringState::Initial::exit()
4643 {
4644 context< PeeringMachine >().log_exit(state_name, enter_time);
4645 DECLARE_LOCALS;
4646 utime_t dur = ceph_clock_now() - enter_time;
4647 pl->get_peering_perf().tinc(rs_initial_latency, dur);
4648 }
4649
4650 /*------Started-------*/
4651 PeeringState::Started::Started(my_context ctx)
4652 : my_base(ctx),
4653 NamedState(context< PeeringMachine >().state_history, "Started")
4654 {
4655 context< PeeringMachine >().log_enter(state_name);
4656 }
4657
4658 boost::statechart::result
4659 PeeringState::Started::react(const IntervalFlush&)
4660 {
4661 psdout(10) << "Ending blocked outgoing recovery messages" << dendl;
4662 context< PeeringMachine >().state->end_block_outgoing();
4663 return discard_event();
4664 }
4665
4666 boost::statechart::result PeeringState::Started::react(const AdvMap& advmap)
4667 {
4668 DECLARE_LOCALS;
4669 psdout(10) << "Started advmap" << dendl;
4670 ps->check_full_transition(advmap.lastmap, advmap.osdmap);
4671 if (ps->should_restart_peering(
4672 advmap.up_primary,
4673 advmap.acting_primary,
4674 advmap.newup,
4675 advmap.newacting,
4676 advmap.lastmap,
4677 advmap.osdmap)) {
4678 psdout(10) << "should_restart_peering, transitioning to Reset"
4679 << dendl;
4680 post_event(advmap);
4681 return transit< Reset >();
4682 }
4683 ps->remove_down_peer_info(advmap.osdmap);
4684 return discard_event();
4685 }
4686
4687 boost::statechart::result PeeringState::Started::react(const QueryState& q)
4688 {
4689 q.f->open_object_section("state");
4690 q.f->dump_string("name", state_name);
4691 q.f->dump_stream("enter_time") << enter_time;
4692 q.f->close_section();
4693 return discard_event();
4694 }
4695
4696 boost::statechart::result PeeringState::Started::react(const QueryUnfound& q)
4697 {
4698 q.f->dump_string("state", "Started");
4699 q.f->dump_bool("available_might_have_unfound", false);
4700 return discard_event();
4701 }
4702
4703 void PeeringState::Started::exit()
4704 {
4705 context< PeeringMachine >().log_exit(state_name, enter_time);
4706 DECLARE_LOCALS;
4707 utime_t dur = ceph_clock_now() - enter_time;
4708 pl->get_peering_perf().tinc(rs_started_latency, dur);
4709 ps->state_clear(PG_STATE_WAIT | PG_STATE_LAGGY);
4710 }
4711
4712 /*--------Reset---------*/
4713 PeeringState::Reset::Reset(my_context ctx)
4714 : my_base(ctx),
4715 NamedState(context< PeeringMachine >().state_history, "Reset")
4716 {
4717 context< PeeringMachine >().log_enter(state_name);
4718 DECLARE_LOCALS;
4719
4720 ps->flushes_in_progress = 0;
4721 ps->set_last_peering_reset();
4722 ps->log_weirdness();
4723 }
4724
4725 boost::statechart::result
4726 PeeringState::Reset::react(const IntervalFlush&)
4727 {
4728 psdout(10) << "Ending blocked outgoing recovery messages" << dendl;
4729 context< PeeringMachine >().state->end_block_outgoing();
4730 return discard_event();
4731 }
4732
4733 boost::statechart::result PeeringState::Reset::react(const AdvMap& advmap)
4734 {
4735 DECLARE_LOCALS;
4736 psdout(10) << "Reset advmap" << dendl;
4737
4738 ps->check_full_transition(advmap.lastmap, advmap.osdmap);
4739
4740 if (ps->should_restart_peering(
4741 advmap.up_primary,
4742 advmap.acting_primary,
4743 advmap.newup,
4744 advmap.newacting,
4745 advmap.lastmap,
4746 advmap.osdmap)) {
4747 psdout(10) << "should restart peering, calling start_peering_interval again"
4748 << dendl;
4749 ps->start_peering_interval(
4750 advmap.lastmap,
4751 advmap.newup, advmap.up_primary,
4752 advmap.newacting, advmap.acting_primary,
4753 context< PeeringMachine >().get_cur_transaction());
4754 }
4755 ps->remove_down_peer_info(advmap.osdmap);
4756 ps->check_past_interval_bounds();
4757 return discard_event();
4758 }
4759
4760 boost::statechart::result PeeringState::Reset::react(const ActMap&)
4761 {
4762 DECLARE_LOCALS;
4763 if (ps->should_send_notify() && ps->get_primary().osd >= 0) {
4764 ps->info.history.refresh_prior_readable_until_ub(
4765 pl->get_mnow(),
4766 ps->prior_readable_until_ub);
4767 context< PeeringMachine >().send_notify(
4768 ps->get_primary().osd,
4769 pg_notify_t(
4770 ps->get_primary().shard, ps->pg_whoami.shard,
4771 ps->get_osdmap_epoch(),
4772 ps->get_osdmap_epoch(),
4773 ps->info,
4774 ps->past_intervals));
4775 }
4776
4777 ps->update_heartbeat_peers();
4778
4779 return transit< Started >();
4780 }
4781
4782 boost::statechart::result PeeringState::Reset::react(const QueryState& q)
4783 {
4784 q.f->open_object_section("state");
4785 q.f->dump_string("name", state_name);
4786 q.f->dump_stream("enter_time") << enter_time;
4787 q.f->close_section();
4788 return discard_event();
4789 }
4790
4791 boost::statechart::result PeeringState::Reset::react(const QueryUnfound& q)
4792 {
4793 q.f->dump_string("state", "Reset");
4794 q.f->dump_bool("available_might_have_unfound", false);
4795 return discard_event();
4796 }
4797
4798 void PeeringState::Reset::exit()
4799 {
4800 context< PeeringMachine >().log_exit(state_name, enter_time);
4801 DECLARE_LOCALS;
4802 utime_t dur = ceph_clock_now() - enter_time;
4803 pl->get_peering_perf().tinc(rs_reset_latency, dur);
4804 }
4805
4806 /*-------Start---------*/
4807 PeeringState::Start::Start(my_context ctx)
4808 : my_base(ctx),
4809 NamedState(context< PeeringMachine >().state_history, "Start")
4810 {
4811 context< PeeringMachine >().log_enter(state_name);
4812
4813 DECLARE_LOCALS;
4814 if (ps->is_primary()) {
4815 psdout(1) << "transitioning to Primary" << dendl;
4816 post_event(MakePrimary());
4817 } else { //is_stray
4818 psdout(1) << "transitioning to Stray" << dendl;
4819 post_event(MakeStray());
4820 }
4821 }
4822
4823 void PeeringState::Start::exit()
4824 {
4825 context< PeeringMachine >().log_exit(state_name, enter_time);
4826 DECLARE_LOCALS;
4827 utime_t dur = ceph_clock_now() - enter_time;
4828 pl->get_peering_perf().tinc(rs_start_latency, dur);
4829 }
4830
4831 /*---------Primary--------*/
4832 PeeringState::Primary::Primary(my_context ctx)
4833 : my_base(ctx),
4834 NamedState(context< PeeringMachine >().state_history, "Started/Primary")
4835 {
4836 context< PeeringMachine >().log_enter(state_name);
4837 DECLARE_LOCALS;
4838 ceph_assert(ps->want_acting.empty());
4839
4840 // set CREATING bit until we have peered for the first time.
4841 if (ps->info.history.last_epoch_started == 0) {
4842 ps->state_set(PG_STATE_CREATING);
4843 // use the history timestamp, which ultimately comes from the
4844 // monitor in the create case.
4845 utime_t t = ps->info.history.last_scrub_stamp;
4846 ps->info.stats.last_fresh = t;
4847 ps->info.stats.last_active = t;
4848 ps->info.stats.last_change = t;
4849 ps->info.stats.last_peered = t;
4850 ps->info.stats.last_clean = t;
4851 ps->info.stats.last_unstale = t;
4852 ps->info.stats.last_undegraded = t;
4853 ps->info.stats.last_fullsized = t;
4854 ps->info.stats.last_scrub_stamp = t;
4855 ps->info.stats.last_deep_scrub_stamp = t;
4856 ps->info.stats.last_clean_scrub_stamp = t;
4857 }
4858 }
4859
4860 boost::statechart::result PeeringState::Primary::react(const MNotifyRec& notevt)
4861 {
4862 DECLARE_LOCALS;
4863 psdout(7) << "handle_pg_notify from osd." << notevt.from << dendl;
4864 ps->proc_replica_info(
4865 notevt.from, notevt.notify.info, notevt.notify.epoch_sent);
4866 return discard_event();
4867 }
4868
4869 boost::statechart::result PeeringState::Primary::react(const ActMap&)
4870 {
4871 DECLARE_LOCALS;
4872 psdout(7) << "handle ActMap primary" << dendl;
4873 pl->publish_stats_to_osd();
4874 return discard_event();
4875 }
4876
4877 boost::statechart::result PeeringState::Primary::react(
4878 const SetForceRecovery&)
4879 {
4880 DECLARE_LOCALS;
4881 ps->set_force_recovery(true);
4882 return discard_event();
4883 }
4884
4885 boost::statechart::result PeeringState::Primary::react(
4886 const UnsetForceRecovery&)
4887 {
4888 DECLARE_LOCALS;
4889 ps->set_force_recovery(false);
4890 return discard_event();
4891 }
4892
4893 boost::statechart::result PeeringState::Primary::react(
4894 const RequestScrub& evt)
4895 {
4896 DECLARE_LOCALS;
4897 if (ps->is_primary()) {
4898 pl->scrub_requested(evt.deep, evt.repair);
4899 psdout(10) << "marking for scrub" << dendl;
4900 }
4901 return discard_event();
4902 }
4903
4904 boost::statechart::result PeeringState::Primary::react(
4905 const SetForceBackfill&)
4906 {
4907 DECLARE_LOCALS;
4908 ps->set_force_backfill(true);
4909 return discard_event();
4910 }
4911
4912 boost::statechart::result PeeringState::Primary::react(
4913 const UnsetForceBackfill&)
4914 {
4915 DECLARE_LOCALS;
4916 ps->set_force_backfill(false);
4917 return discard_event();
4918 }
4919
4920 void PeeringState::Primary::exit()
4921 {
4922 context< PeeringMachine >().log_exit(state_name, enter_time);
4923 DECLARE_LOCALS;
4924 ps->want_acting.clear();
4925 utime_t dur = ceph_clock_now() - enter_time;
4926 pl->get_peering_perf().tinc(rs_primary_latency, dur);
4927 pl->clear_primary_state();
4928 ps->state_clear(PG_STATE_CREATING);
4929 }
4930
4931 /*---------Peering--------*/
4932 PeeringState::Peering::Peering(my_context ctx)
4933 : my_base(ctx),
4934 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering"),
4935 history_les_bound(false)
4936 {
4937 context< PeeringMachine >().log_enter(state_name);
4938 DECLARE_LOCALS;
4939
4940 ceph_assert(!ps->is_peered());
4941 ceph_assert(!ps->is_peering());
4942 ceph_assert(ps->is_primary());
4943 ps->state_set(PG_STATE_PEERING);
4944 }
4945
4946 boost::statechart::result PeeringState::Peering::react(const AdvMap& advmap)
4947 {
4948 DECLARE_LOCALS;
4949 psdout(10) << "Peering advmap" << dendl;
4950 if (prior_set.affected_by_map(*(advmap.osdmap), ps->dpp)) {
4951 psdout(1) << "Peering, affected_by_map, going to Reset" << dendl;
4952 post_event(advmap);
4953 return transit< Reset >();
4954 }
4955
4956 ps->adjust_need_up_thru(advmap.osdmap);
4957 ps->check_prior_readable_down_osds(advmap.osdmap);
4958
4959 return forward_event();
4960 }
4961
4962 boost::statechart::result PeeringState::Peering::react(const QueryState& q)
4963 {
4964 DECLARE_LOCALS;
4965
4966 q.f->open_object_section("state");
4967 q.f->dump_string("name", state_name);
4968 q.f->dump_stream("enter_time") << enter_time;
4969
4970 q.f->open_array_section("past_intervals");
4971 ps->past_intervals.dump(q.f);
4972 q.f->close_section();
4973
4974 q.f->open_array_section("probing_osds");
4975 for (auto p = prior_set.probe.begin(); p != prior_set.probe.end(); ++p)
4976 q.f->dump_stream("osd") << *p;
4977 q.f->close_section();
4978
4979 if (prior_set.pg_down)
4980 q.f->dump_string("blocked", "peering is blocked due to down osds");
4981
4982 q.f->open_array_section("down_osds_we_would_probe");
4983 for (auto p = prior_set.down.begin(); p != prior_set.down.end(); ++p)
4984 q.f->dump_int("osd", *p);
4985 q.f->close_section();
4986
4987 q.f->open_array_section("peering_blocked_by");
4988 for (auto p = prior_set.blocked_by.begin();
4989 p != prior_set.blocked_by.end();
4990 ++p) {
4991 q.f->open_object_section("osd");
4992 q.f->dump_int("osd", p->first);
4993 q.f->dump_int("current_lost_at", p->second);
4994 q.f->dump_string("comment", "starting or marking this osd lost may let us proceed");
4995 q.f->close_section();
4996 }
4997 q.f->close_section();
4998
4999 if (history_les_bound) {
5000 q.f->open_array_section("peering_blocked_by_detail");
5001 q.f->open_object_section("item");
5002 q.f->dump_string("detail","peering_blocked_by_history_les_bound");
5003 q.f->close_section();
5004 q.f->close_section();
5005 }
5006
5007 q.f->close_section();
5008 return forward_event();
5009 }
5010
5011 boost::statechart::result PeeringState::Peering::react(const QueryUnfound& q)
5012 {
5013 q.f->dump_string("state", "Peering");
5014 q.f->dump_bool("available_might_have_unfound", false);
5015 return discard_event();
5016 }
5017
5018 void PeeringState::Peering::exit()
5019 {
5020
5021 DECLARE_LOCALS;
5022 psdout(10) << "Leaving Peering" << dendl;
5023 context< PeeringMachine >().log_exit(state_name, enter_time);
5024 ps->state_clear(PG_STATE_PEERING);
5025 pl->clear_probe_targets();
5026
5027 utime_t dur = ceph_clock_now() - enter_time;
5028 pl->get_peering_perf().tinc(rs_peering_latency, dur);
5029 }
5030
5031
5032 /*------Backfilling-------*/
5033 PeeringState::Backfilling::Backfilling(my_context ctx)
5034 : my_base(ctx),
5035 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/Backfilling")
5036 {
5037 context< PeeringMachine >().log_enter(state_name);
5038
5039
5040 DECLARE_LOCALS;
5041 ps->backfill_reserved = true;
5042 pl->on_backfill_reserved();
5043 ps->state_clear(PG_STATE_BACKFILL_TOOFULL);
5044 ps->state_clear(PG_STATE_BACKFILL_WAIT);
5045 ps->state_set(PG_STATE_BACKFILLING);
5046 pl->publish_stats_to_osd();
5047 }
5048
5049 void PeeringState::Backfilling::backfill_release_reservations()
5050 {
5051 DECLARE_LOCALS;
5052 pl->cancel_local_background_io_reservation();
5053 for (auto it = ps->backfill_targets.begin();
5054 it != ps->backfill_targets.end();
5055 ++it) {
5056 ceph_assert(*it != ps->pg_whoami);
5057 pl->send_cluster_message(
5058 it->osd,
5059 make_message<MBackfillReserve>(
5060 MBackfillReserve::RELEASE,
5061 spg_t(ps->info.pgid.pgid, it->shard),
5062 ps->get_osdmap_epoch()),
5063 ps->get_osdmap_epoch());
5064 }
5065 }
5066
5067 void PeeringState::Backfilling::cancel_backfill()
5068 {
5069 DECLARE_LOCALS;
5070 backfill_release_reservations();
5071 pl->on_backfill_canceled();
5072 }
5073
5074 boost::statechart::result
5075 PeeringState::Backfilling::react(const Backfilled &c)
5076 {
5077 backfill_release_reservations();
5078 return transit<Recovered>();
5079 }
5080
5081 boost::statechart::result
5082 PeeringState::Backfilling::react(const DeferBackfill &c)
5083 {
5084 DECLARE_LOCALS;
5085
5086 psdout(10) << "defer backfill, retry delay " << c.delay << dendl;
5087 ps->state_set(PG_STATE_BACKFILL_WAIT);
5088 ps->state_clear(PG_STATE_BACKFILLING);
5089 cancel_backfill();
5090
5091 pl->schedule_event_after(
5092 std::make_shared<PGPeeringEvent>(
5093 ps->get_osdmap_epoch(),
5094 ps->get_osdmap_epoch(),
5095 RequestBackfill()),
5096 c.delay);
5097 return transit<NotBackfilling>();
5098 }
5099
5100 boost::statechart::result
5101 PeeringState::Backfilling::react(const UnfoundBackfill &c)
5102 {
5103 DECLARE_LOCALS;
5104 psdout(10) << "backfill has unfound, can't continue" << dendl;
5105 ps->state_set(PG_STATE_BACKFILL_UNFOUND);
5106 ps->state_clear(PG_STATE_BACKFILLING);
5107 cancel_backfill();
5108 return transit<NotBackfilling>();
5109 }
5110
5111 boost::statechart::result
5112 PeeringState::Backfilling::react(const RemoteReservationRevokedTooFull &)
5113 {
5114 DECLARE_LOCALS;
5115
5116 ps->state_set(PG_STATE_BACKFILL_TOOFULL);
5117 ps->state_clear(PG_STATE_BACKFILLING);
5118 cancel_backfill();
5119
5120 pl->schedule_event_after(
5121 std::make_shared<PGPeeringEvent>(
5122 ps->get_osdmap_epoch(),
5123 ps->get_osdmap_epoch(),
5124 RequestBackfill()),
5125 ps->cct->_conf->osd_backfill_retry_interval);
5126
5127 return transit<NotBackfilling>();
5128 }
5129
5130 boost::statechart::result
5131 PeeringState::Backfilling::react(const RemoteReservationRevoked &)
5132 {
5133 DECLARE_LOCALS;
5134 ps->state_set(PG_STATE_BACKFILL_WAIT);
5135 cancel_backfill();
5136 if (ps->needs_backfill()) {
5137 return transit<WaitLocalBackfillReserved>();
5138 } else {
5139 // raced with MOSDPGBackfill::OP_BACKFILL_FINISH, ignore
5140 return discard_event();
5141 }
5142 }
5143
5144 void PeeringState::Backfilling::exit()
5145 {
5146 context< PeeringMachine >().log_exit(state_name, enter_time);
5147 DECLARE_LOCALS;
5148 ps->backfill_reserved = false;
5149 ps->state_clear(PG_STATE_BACKFILLING);
5150 ps->state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
5151 utime_t dur = ceph_clock_now() - enter_time;
5152 pl->get_peering_perf().tinc(rs_backfilling_latency, dur);
5153 }
5154
5155 /*--WaitRemoteBackfillReserved--*/
5156
5157 PeeringState::WaitRemoteBackfillReserved::WaitRemoteBackfillReserved(my_context ctx)
5158 : my_base(ctx),
5159 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/WaitRemoteBackfillReserved"),
5160 backfill_osd_it(context< Active >().remote_shards_to_reserve_backfill.begin())
5161 {
5162 context< PeeringMachine >().log_enter(state_name);
5163 DECLARE_LOCALS;
5164
5165 ps->state_set(PG_STATE_BACKFILL_WAIT);
5166 pl->publish_stats_to_osd();
5167 post_event(RemoteBackfillReserved());
5168 }
5169
5170 boost::statechart::result
5171 PeeringState::WaitRemoteBackfillReserved::react(const RemoteBackfillReserved &evt)
5172 {
5173 DECLARE_LOCALS;
5174
5175 int64_t num_bytes = ps->info.stats.stats.sum.num_bytes;
5176 psdout(10) << __func__ << " num_bytes " << num_bytes << dendl;
5177 if (backfill_osd_it !=
5178 context< Active >().remote_shards_to_reserve_backfill.end()) {
5179 // The primary never backfills itself
5180 ceph_assert(*backfill_osd_it != ps->pg_whoami);
5181 pl->send_cluster_message(
5182 backfill_osd_it->osd,
5183 make_message<MBackfillReserve>(
5184 MBackfillReserve::REQUEST,
5185 spg_t(context< PeeringMachine >().spgid.pgid, backfill_osd_it->shard),
5186 ps->get_osdmap_epoch(),
5187 ps->get_backfill_priority(),
5188 num_bytes,
5189 ps->peer_bytes[*backfill_osd_it]),
5190 ps->get_osdmap_epoch());
5191 ++backfill_osd_it;
5192 } else {
5193 ps->peer_bytes.clear();
5194 post_event(AllBackfillsReserved());
5195 }
5196 return discard_event();
5197 }
5198
5199 void PeeringState::WaitRemoteBackfillReserved::exit()
5200 {
5201 context< PeeringMachine >().log_exit(state_name, enter_time);
5202 DECLARE_LOCALS;
5203
5204 utime_t dur = ceph_clock_now() - enter_time;
5205 pl->get_peering_perf().tinc(rs_waitremotebackfillreserved_latency, dur);
5206 }
5207
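// retry() is called when a remote reservation is rejected or revoked
// mid-round: drop the local reservation, send RELEASE to every shard
// granted so far (everything before backfill_osd_it), then schedule a
// fresh RequestBackfill after osd_backfill_retry_interval.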
5208 void PeeringState::WaitRemoteBackfillReserved::retry()
5209 {
5210 DECLARE_LOCALS;
5211 pl->cancel_local_background_io_reservation();
5212
5213   // Send RELEASE to all previously acquired reservations
5214 set<pg_shard_t>::const_iterator it, begin, end;
5215 begin = context< Active >().remote_shards_to_reserve_backfill.begin();
5216 end = context< Active >().remote_shards_to_reserve_backfill.end();
5217 ceph_assert(begin != end);
5218 for (it = begin; it != backfill_osd_it; ++it) {
5219 // The primary never backfills itself
5220 ceph_assert(*it != ps->pg_whoami);
5221 pl->send_cluster_message(
5222 it->osd,
5223 make_message<MBackfillReserve>(
5224 MBackfillReserve::RELEASE,
5225 spg_t(context< PeeringMachine >().spgid.pgid, it->shard),
5226 ps->get_osdmap_epoch()),
5227 ps->get_osdmap_epoch());
5228 }
5229
5230 ps->state_clear(PG_STATE_BACKFILL_WAIT);
5231 pl->publish_stats_to_osd();
5232
5233 pl->schedule_event_after(
5234 std::make_shared<PGPeeringEvent>(
5235 ps->get_osdmap_epoch(),
5236 ps->get_osdmap_epoch(),
5237 RequestBackfill()),
5238 ps->cct->_conf->osd_backfill_retry_interval);
5239 }
5240
5241 boost::statechart::result
5242 PeeringState::WaitRemoteBackfillReserved::react(const RemoteReservationRejectedTooFull &evt)
5243 {
5244 DECLARE_LOCALS;
5245 ps->state_set(PG_STATE_BACKFILL_TOOFULL);
5246 retry();
5247 return transit<NotBackfilling>();
5248 }
5249
5250 boost::statechart::result
5251 PeeringState::WaitRemoteBackfillReserved::react(const RemoteReservationRevoked &evt)
5252 {
5253 retry();
5254 return transit<NotBackfilling>();
5255 }
5256
5257 /*--WaitLocalBackfillReserved--*/
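// The local reservation hands the reserver two pre-built events: post
// LocalBackfillReserved when the reservation at get_backfill_priority()
// is granted, or DeferBackfill(0.0) if it is preempted, so the state
// machine is driven entirely by queued peering events.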
5258 PeeringState::WaitLocalBackfillReserved::WaitLocalBackfillReserved(my_context ctx)
5259 : my_base(ctx),
5260 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/WaitLocalBackfillReserved")
5261 {
5262 context< PeeringMachine >().log_enter(state_name);
5263 DECLARE_LOCALS;
5264
5265 ps->state_set(PG_STATE_BACKFILL_WAIT);
5266 pl->request_local_background_io_reservation(
5267 ps->get_backfill_priority(),
5268 std::make_unique<PGPeeringEvent>(
5269 ps->get_osdmap_epoch(),
5270 ps->get_osdmap_epoch(),
5271 LocalBackfillReserved()),
5272 std::make_unique<PGPeeringEvent>(
5273 ps->get_osdmap_epoch(),
5274 ps->get_osdmap_epoch(),
5275 DeferBackfill(0.0)));
5276 pl->publish_stats_to_osd();
5277 }
5278
5279 void PeeringState::WaitLocalBackfillReserved::exit()
5280 {
5281 context< PeeringMachine >().log_exit(state_name, enter_time);
5282 DECLARE_LOCALS;
5283 utime_t dur = ceph_clock_now() - enter_time;
5284 pl->get_peering_perf().tinc(rs_waitlocalbackfillreserved_latency, dur);
5285 }
5286
5287 /*----NotBackfilling------*/
5288 PeeringState::NotBackfilling::NotBackfilling(my_context ctx)
5289 : my_base(ctx),
5290 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/NotBackfilling")
5291 {
5292 context< PeeringMachine >().log_enter(state_name);
5293 DECLARE_LOCALS;
5294 ps->state_clear(PG_STATE_REPAIR);
5295 pl->publish_stats_to_osd();
5296 }
5297
5298 boost::statechart::result PeeringState::NotBackfilling::react(const QueryUnfound& q)
5299 {
5300 DECLARE_LOCALS;
5301
5302 ps->query_unfound(q.f, "NotBackfilling");
5303 return discard_event();
5304 }
5305
5306 boost::statechart::result
5307 PeeringState::NotBackfilling::react(const RemoteBackfillReserved &evt)
5308 {
5309 return discard_event();
5310 }
5311
5312 boost::statechart::result
5313 PeeringState::NotBackfilling::react(const RemoteReservationRejectedTooFull &evt)
5314 {
5315 return discard_event();
5316 }
5317
5318 void PeeringState::NotBackfilling::exit()
5319 {
5320 context< PeeringMachine >().log_exit(state_name, enter_time);
5321
5322 DECLARE_LOCALS;
5323 ps->state_clear(PG_STATE_BACKFILL_UNFOUND);
5324 utime_t dur = ceph_clock_now() - enter_time;
5325 pl->get_peering_perf().tinc(rs_notbackfilling_latency, dur);
5326 }
5327
5328 /*----NotRecovering------*/
5329 PeeringState::NotRecovering::NotRecovering(my_context ctx)
5330 : my_base(ctx),
5331 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/NotRecovering")
5332 {
5333 context< PeeringMachine >().log_enter(state_name);
5334 DECLARE_LOCALS;
5335 ps->state_clear(PG_STATE_REPAIR);
5336 pl->publish_stats_to_osd();
5337 }
5338
5339 boost::statechart::result PeeringState::NotRecovering::react(const QueryUnfound& q)
5340 {
5341 DECLARE_LOCALS;
5342
5343 ps->query_unfound(q.f, "NotRecovering");
5344 return discard_event();
5345 }
5346
5347 void PeeringState::NotRecovering::exit()
5348 {
5349 context< PeeringMachine >().log_exit(state_name, enter_time);
5350
5351 DECLARE_LOCALS;
5352 ps->state_clear(PG_STATE_RECOVERY_UNFOUND);
5353 utime_t dur = ceph_clock_now() - enter_time;
5354 pl->get_peering_perf().tinc(rs_notrecovering_latency, dur);
5355 }
5356
5357 /*---RepNotRecovering----*/
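// Replica-side reservation states, roughly:
//
//   RepNotRecovering --RequestRecoveryPrio--> RepWaitRecoveryReserved
//   RepNotRecovering --RequestBackfillPrio--> RepWaitBackfillReserved
//
// A remote grant replies to the primary with MRecoveryReserve/
// MBackfillReserve::GRANT and moves to RepRecovering; rejection,
// revocation, or cancellation unreserves local space and falls back to
// RepNotRecovering.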
5358 PeeringState::RepNotRecovering::RepNotRecovering(my_context ctx)
5359 : my_base(ctx),
5360 NamedState(context< PeeringMachine >().state_history, "Started/ReplicaActive/RepNotRecovering")
5361 {
5362 context< PeeringMachine >().log_enter(state_name);
5363 }
5364
5365 boost::statechart::result
5366 PeeringState::RepNotRecovering::react(const RejectTooFullRemoteReservation &evt)
5367 {
5368 DECLARE_LOCALS;
5369 ps->reject_reservation();
5370 post_event(RemoteReservationRejectedTooFull());
5371 return discard_event();
5372 }
5373
5374 void PeeringState::RepNotRecovering::exit()
5375 {
5376 context< PeeringMachine >().log_exit(state_name, enter_time);
5377 DECLARE_LOCALS;
5378 utime_t dur = ceph_clock_now() - enter_time;
5379 pl->get_peering_perf().tinc(rs_repnotrecovering_latency, dur);
5380 }
5381
5382 /*---RepWaitRecoveryReserved--*/
5383 PeeringState::RepWaitRecoveryReserved::RepWaitRecoveryReserved(my_context ctx)
5384 : my_base(ctx),
5385 NamedState(context< PeeringMachine >().state_history, "Started/ReplicaActive/RepWaitRecoveryReserved")
5386 {
5387 context< PeeringMachine >().log_enter(state_name);
5388 }
5389
5390 boost::statechart::result
5391 PeeringState::RepWaitRecoveryReserved::react(const RemoteRecoveryReserved &evt)
5392 {
5393 DECLARE_LOCALS;
5394 pl->send_cluster_message(
5395 ps->primary.osd,
5396 make_message<MRecoveryReserve>(
5397 MRecoveryReserve::GRANT,
5398 spg_t(ps->info.pgid.pgid, ps->primary.shard),
5399 ps->get_osdmap_epoch()),
5400 ps->get_osdmap_epoch());
5401 return transit<RepRecovering>();
5402 }
5403
5404 boost::statechart::result
5405 PeeringState::RepWaitRecoveryReserved::react(
5406 const RemoteReservationCanceled &evt)
5407 {
5408 DECLARE_LOCALS;
5409 pl->unreserve_recovery_space();
5410
5411 pl->cancel_remote_recovery_reservation();
5412 return transit<RepNotRecovering>();
5413 }
5414
5415 void PeeringState::RepWaitRecoveryReserved::exit()
5416 {
5417 context< PeeringMachine >().log_exit(state_name, enter_time);
5418 DECLARE_LOCALS;
5419 utime_t dur = ceph_clock_now() - enter_time;
5420 pl->get_peering_perf().tinc(rs_repwaitrecoveryreserved_latency, dur);
5421 }
5422
5423 /*-RepWaitBackfillReserved*/
5424 PeeringState::RepWaitBackfillReserved::RepWaitBackfillReserved(my_context ctx)
5425 : my_base(ctx),
5426 NamedState(context< PeeringMachine >().state_history, "Started/ReplicaActive/RepWaitBackfillReserved")
5427 {
5428 context< PeeringMachine >().log_enter(state_name);
5429 }
5430
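// Note: the two reactions below belong to RepNotRecovering even though
// they appear after the RepWaitBackfillReserved constructor; they are
// the transitions into the two wait states above. Both wire up a
// preemption event only when the whole up/acting set advertises
// RECOVERY_RESERVATION_2, since older peers misread preemption (see the
// inline comments).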
5431 boost::statechart::result
5432 PeeringState::RepNotRecovering::react(const RequestBackfillPrio &evt)
5433 {
5434
5435 DECLARE_LOCALS;
5436
5437 if (!pl->try_reserve_recovery_space(
5438 evt.primary_num_bytes, evt.local_num_bytes)) {
5439 post_event(RejectTooFullRemoteReservation());
5440 } else {
5441 PGPeeringEventURef preempt;
5442 if (HAVE_FEATURE(ps->upacting_features, RECOVERY_RESERVATION_2)) {
5443 // older peers will interpret preemption as TOOFULL
5444 preempt = std::make_unique<PGPeeringEvent>(
5445 pl->get_osdmap_epoch(),
5446 pl->get_osdmap_epoch(),
5447 RemoteBackfillPreempted());
5448 }
5449 pl->request_remote_recovery_reservation(
5450 evt.priority,
5451 std::make_unique<PGPeeringEvent>(
5452 pl->get_osdmap_epoch(),
5453 pl->get_osdmap_epoch(),
5454 RemoteBackfillReserved()),
5455 std::move(preempt));
5456 }
5457 return transit<RepWaitBackfillReserved>();
5458 }
5459
5460 boost::statechart::result
5461 PeeringState::RepNotRecovering::react(const RequestRecoveryPrio &evt)
5462 {
5463 DECLARE_LOCALS;
5464
5465   // fall back to a local reckoning of priority if the primary doesn't
5466   // pass one (pre-mimic compat)
5467 int prio = evt.priority ? evt.priority : ps->get_recovery_priority();
5468
5469 PGPeeringEventURef preempt;
5470 if (HAVE_FEATURE(ps->upacting_features, RECOVERY_RESERVATION_2)) {
5471 // older peers can't handle this
5472 preempt = std::make_unique<PGPeeringEvent>(
5473 ps->get_osdmap_epoch(),
5474 ps->get_osdmap_epoch(),
5475 RemoteRecoveryPreempted());
5476 }
5477
5478 pl->request_remote_recovery_reservation(
5479 prio,
5480 std::make_unique<PGPeeringEvent>(
5481 ps->get_osdmap_epoch(),
5482 ps->get_osdmap_epoch(),
5483 RemoteRecoveryReserved()),
5484 std::move(preempt));
5485 return transit<RepWaitRecoveryReserved>();
5486 }
5487
5488 void PeeringState::RepWaitBackfillReserved::exit()
5489 {
5490 context< PeeringMachine >().log_exit(state_name, enter_time);
5491 DECLARE_LOCALS;
5492 utime_t dur = ceph_clock_now() - enter_time;
5493 pl->get_peering_perf().tinc(rs_repwaitbackfillreserved_latency, dur);
5494 }
5495
5496 boost::statechart::result
5497 PeeringState::RepWaitBackfillReserved::react(const RemoteBackfillReserved &evt)
5498 {
5499 DECLARE_LOCALS;
5500
5501
5502 pl->send_cluster_message(
5503 ps->primary.osd,
5504 make_message<MBackfillReserve>(
5505 MBackfillReserve::GRANT,
5506 spg_t(ps->info.pgid.pgid, ps->primary.shard),
5507 ps->get_osdmap_epoch()),
5508 ps->get_osdmap_epoch());
5509 return transit<RepRecovering>();
5510 }
5511
5512 boost::statechart::result
5513 PeeringState::RepWaitBackfillReserved::react(
5514 const RejectTooFullRemoteReservation &evt)
5515 {
5516 DECLARE_LOCALS;
5517 ps->reject_reservation();
5518 post_event(RemoteReservationRejectedTooFull());
5519 return discard_event();
5520 }
5521
5522 boost::statechart::result
5523 PeeringState::RepWaitBackfillReserved::react(
5524 const RemoteReservationRejectedTooFull &evt)
5525 {
5526 DECLARE_LOCALS;
5527 pl->unreserve_recovery_space();
5528
5529 pl->cancel_remote_recovery_reservation();
5530 return transit<RepNotRecovering>();
5531 }
5532
5533 boost::statechart::result
5534 PeeringState::RepWaitBackfillReserved::react(
5535 const RemoteReservationCanceled &evt)
5536 {
5537 DECLARE_LOCALS;
5538 pl->unreserve_recovery_space();
5539
5540 pl->cancel_remote_recovery_reservation();
5541 return transit<RepNotRecovering>();
5542 }
5543
5544 /*---RepRecovering-------*/
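// While in RepRecovering, a local preemption (RemoteRecoveryPreempted,
// RemoteBackfillPreempted) or a full disk (BackfillTooFull) unreserves
// recovery space and tells the primary via the matching REVOKE or
// REVOKE_TOOFULL message; exit() unconditionally unreserves and cancels
// the remote reservation, so every path out of this state cleans up.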
5545 PeeringState::RepRecovering::RepRecovering(my_context ctx)
5546 : my_base(ctx),
5547 NamedState(context< PeeringMachine >().state_history, "Started/ReplicaActive/RepRecovering")
5548 {
5549 context< PeeringMachine >().log_enter(state_name);
5550 }
5551
5552 boost::statechart::result
5553 PeeringState::RepRecovering::react(const RemoteRecoveryPreempted &)
5554 {
5555 DECLARE_LOCALS;
5556
5557
5558 pl->unreserve_recovery_space();
5559 pl->send_cluster_message(
5560 ps->primary.osd,
5561 make_message<MRecoveryReserve>(
5562 MRecoveryReserve::REVOKE,
5563 spg_t(ps->info.pgid.pgid, ps->primary.shard),
5564 ps->get_osdmap_epoch()),
5565 ps->get_osdmap_epoch());
5566 return discard_event();
5567 }
5568
5569 boost::statechart::result
5570 PeeringState::RepRecovering::react(const BackfillTooFull &)
5571 {
5572 DECLARE_LOCALS;
5573
5574
5575 pl->unreserve_recovery_space();
5576 pl->send_cluster_message(
5577 ps->primary.osd,
5578 make_message<MBackfillReserve>(
5579 MBackfillReserve::REVOKE_TOOFULL,
5580 spg_t(ps->info.pgid.pgid, ps->primary.shard),
5581 ps->get_osdmap_epoch()),
5582 ps->get_osdmap_epoch());
5583 return discard_event();
5584 }
5585
5586 boost::statechart::result
5587 PeeringState::RepRecovering::react(const RemoteBackfillPreempted &)
5588 {
5589 DECLARE_LOCALS;
5590
5591
5592 pl->unreserve_recovery_space();
5593 pl->send_cluster_message(
5594 ps->primary.osd,
5595 make_message<MBackfillReserve>(
5596 MBackfillReserve::REVOKE,
5597 spg_t(ps->info.pgid.pgid, ps->primary.shard),
5598 ps->get_osdmap_epoch()),
5599 ps->get_osdmap_epoch());
5600 return discard_event();
5601 }
5602
5603 void PeeringState::RepRecovering::exit()
5604 {
5605 context< PeeringMachine >().log_exit(state_name, enter_time);
5606 DECLARE_LOCALS;
5607 pl->unreserve_recovery_space();
5608
5609 pl->cancel_remote_recovery_reservation();
5610 utime_t dur = ceph_clock_now() - enter_time;
5611 pl->get_peering_perf().tinc(rs_reprecovering_latency, dur);
5612 }
5613
5614 /*------Activating--------*/
5615 PeeringState::Activating::Activating(my_context ctx)
5616 : my_base(ctx),
5617 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/Activating")
5618 {
5619 context< PeeringMachine >().log_enter(state_name);
5620 }
5621
5622 void PeeringState::Activating::exit()
5623 {
5624 context< PeeringMachine >().log_exit(state_name, enter_time);
5625 DECLARE_LOCALS;
5626 utime_t dur = ceph_clock_now() - enter_time;
5627 pl->get_peering_perf().tinc(rs_activating_latency, dur);
5628 }
5629
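// Primary-side recovery reservations, roughly mirroring the backfill
// track: WaitLocalRecoveryReserved (bails out with RecoveryTooFull if
// any acting_recovery_backfill OSD is full) -> WaitRemoteRecoveryReserved
// (one MRecoveryReserve::REQUEST per remote shard) -> Recovering.
// Success ends in Recovered and eventually Clean; DeferRecovery and
// UnfoundRecovery drop to NotRecovering, and RequestBackfill crosses
// over to the backfill track.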
5630 PeeringState::WaitLocalRecoveryReserved::WaitLocalRecoveryReserved(my_context ctx)
5631 : my_base(ctx),
5632 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/WaitLocalRecoveryReserved")
5633 {
5634 context< PeeringMachine >().log_enter(state_name);
5635 DECLARE_LOCALS;
5636
5637   // Make sure all nodes that are part of the recovery aren't full
5638 if (!ps->cct->_conf->osd_debug_skip_full_check_in_recovery &&
5639 ps->get_osdmap()->check_full(ps->acting_recovery_backfill)) {
5640 post_event(RecoveryTooFull());
5641 return;
5642 }
5643
5644 ps->state_clear(PG_STATE_RECOVERY_TOOFULL);
5645 ps->state_set(PG_STATE_RECOVERY_WAIT);
5646 pl->request_local_background_io_reservation(
5647 ps->get_recovery_priority(),
5648 std::make_unique<PGPeeringEvent>(
5649 ps->get_osdmap_epoch(),
5650 ps->get_osdmap_epoch(),
5651 LocalRecoveryReserved()),
5652 std::make_unique<PGPeeringEvent>(
5653 ps->get_osdmap_epoch(),
5654 ps->get_osdmap_epoch(),
5655 DeferRecovery(0.0)));
5656 pl->publish_stats_to_osd();
5657 }
5658
5659 boost::statechart::result
5660 PeeringState::WaitLocalRecoveryReserved::react(const RecoveryTooFull &evt)
5661 {
5662 DECLARE_LOCALS;
5663 ps->state_set(PG_STATE_RECOVERY_TOOFULL);
5664 pl->schedule_event_after(
5665 std::make_shared<PGPeeringEvent>(
5666 ps->get_osdmap_epoch(),
5667 ps->get_osdmap_epoch(),
5668 DoRecovery()),
5669 ps->cct->_conf->osd_recovery_retry_interval);
5670 return transit<NotRecovering>();
5671 }
5672
5673 void PeeringState::WaitLocalRecoveryReserved::exit()
5674 {
5675 context< PeeringMachine >().log_exit(state_name, enter_time);
5676 DECLARE_LOCALS;
5677 utime_t dur = ceph_clock_now() - enter_time;
5678 pl->get_peering_perf().tinc(rs_waitlocalrecoveryreserved_latency, dur);
5679 }
5680
5681 PeeringState::WaitRemoteRecoveryReserved::WaitRemoteRecoveryReserved(my_context ctx)
5682 : my_base(ctx),
5683 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/WaitRemoteRecoveryReserved"),
5684 remote_recovery_reservation_it(context< Active >().remote_shards_to_reserve_recovery.begin())
5685 {
5686 context< PeeringMachine >().log_enter(state_name);
5687 post_event(RemoteRecoveryReserved());
5688 }
5689
5690 boost::statechart::result
5691 PeeringState::WaitRemoteRecoveryReserved::react(const RemoteRecoveryReserved &evt) {
5692 DECLARE_LOCALS;
5693
5694 if (remote_recovery_reservation_it !=
5695 context< Active >().remote_shards_to_reserve_recovery.end()) {
5696 ceph_assert(*remote_recovery_reservation_it != ps->pg_whoami);
5697 pl->send_cluster_message(
5698 remote_recovery_reservation_it->osd,
5699 make_message<MRecoveryReserve>(
5700 MRecoveryReserve::REQUEST,
5701 spg_t(context< PeeringMachine >().spgid.pgid,
5702 remote_recovery_reservation_it->shard),
5703 ps->get_osdmap_epoch(),
5704 ps->get_recovery_priority()),
5705 ps->get_osdmap_epoch());
5706 ++remote_recovery_reservation_it;
5707 } else {
5708 post_event(AllRemotesReserved());
5709 }
5710 return discard_event();
5711 }
5712
5713 void PeeringState::WaitRemoteRecoveryReserved::exit()
5714 {
5715 context< PeeringMachine >().log_exit(state_name, enter_time);
5716 DECLARE_LOCALS;
5717 utime_t dur = ceph_clock_now() - enter_time;
5718 pl->get_peering_perf().tinc(rs_waitremoterecoveryreserved_latency, dur);
5719 }
5720
5721 PeeringState::Recovering::Recovering(my_context ctx)
5722 : my_base(ctx),
5723 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/Recovering")
5724 {
5725 context< PeeringMachine >().log_enter(state_name);
5726
5727 DECLARE_LOCALS;
5728 ps->state_clear(PG_STATE_RECOVERY_WAIT);
5729 ps->state_clear(PG_STATE_RECOVERY_TOOFULL);
5730 ps->state_set(PG_STATE_RECOVERING);
5731 pl->on_recovery_reserved();
5732 ceph_assert(!ps->state_test(PG_STATE_ACTIVATING));
5733 pl->publish_stats_to_osd();
5734 }
5735
5736 void PeeringState::Recovering::release_reservations(bool cancel)
5737 {
5738 DECLARE_LOCALS;
5739 ceph_assert(cancel || !ps->pg_log.get_missing().have_missing());
5740
5741 // release remote reservations
5742 for (auto i = context< Active >().remote_shards_to_reserve_recovery.begin();
5743 i != context< Active >().remote_shards_to_reserve_recovery.end();
5744 ++i) {
5745 if (*i == ps->pg_whoami) // skip myself
5746 continue;
5747 pl->send_cluster_message(
5748 i->osd,
5749 make_message<MRecoveryReserve>(
5750 MRecoveryReserve::RELEASE,
5751 spg_t(ps->info.pgid.pgid, i->shard),
5752 ps->get_osdmap_epoch()),
5753 ps->get_osdmap_epoch());
5754 }
5755 }
5756
5757 boost::statechart::result
5758 PeeringState::Recovering::react(const AllReplicasRecovered &evt)
5759 {
5760 DECLARE_LOCALS;
5761 ps->state_clear(PG_STATE_FORCED_RECOVERY);
5762 release_reservations();
5763 pl->cancel_local_background_io_reservation();
5764 return transit<Recovered>();
5765 }
5766
5767 boost::statechart::result
5768 PeeringState::Recovering::react(const RequestBackfill &evt)
5769 {
5770 DECLARE_LOCALS;
5771
5772 release_reservations();
5773
5774 ps->state_clear(PG_STATE_FORCED_RECOVERY);
5775 pl->cancel_local_background_io_reservation();
5776 pl->publish_stats_to_osd();
5777   // transition any async_recovery_targets back into acting so the pg
5778   // won't have to stay undersized for long, as backfill might take a
5779   // long time to complete.
5780 if (!ps->async_recovery_targets.empty()) {
5781 pg_shard_t auth_log_shard;
5782 bool history_les_bound = false;
5783 // FIXME: Uh-oh we have to check this return value; choose_acting can fail!
5784 ps->choose_acting(auth_log_shard, true, &history_les_bound);
5785 }
5786 return transit<WaitLocalBackfillReserved>();
5787 }
5788
5789 boost::statechart::result
5790 PeeringState::Recovering::react(const DeferRecovery &evt)
5791 {
5792 DECLARE_LOCALS;
5793 if (!ps->state_test(PG_STATE_RECOVERING)) {
5794 // we may have finished recovery and have an AllReplicasRecovered
5795 // event queued to move us to the next state.
5796 psdout(10) << "got defer recovery but not recovering" << dendl;
5797 return discard_event();
5798 }
5799 psdout(10) << "defer recovery, retry delay " << evt.delay << dendl;
5800 ps->state_set(PG_STATE_RECOVERY_WAIT);
5801 pl->cancel_local_background_io_reservation();
5802 release_reservations(true);
5803 pl->schedule_event_after(
5804 std::make_shared<PGPeeringEvent>(
5805 ps->get_osdmap_epoch(),
5806 ps->get_osdmap_epoch(),
5807 DoRecovery()),
5808 evt.delay);
5809 return transit<NotRecovering>();
5810 }
5811
5812 boost::statechart::result
5813 PeeringState::Recovering::react(const UnfoundRecovery &evt)
5814 {
5815 DECLARE_LOCALS;
5816 psdout(10) << "recovery has unfound, can't continue" << dendl;
5817 ps->state_set(PG_STATE_RECOVERY_UNFOUND);
5818 pl->cancel_local_background_io_reservation();
5819 release_reservations(true);
5820 return transit<NotRecovering>();
5821 }
5822
5823 void PeeringState::Recovering::exit()
5824 {
5825 context< PeeringMachine >().log_exit(state_name, enter_time);
5826
5827 DECLARE_LOCALS;
5828 utime_t dur = ceph_clock_now() - enter_time;
5829 ps->state_clear(PG_STATE_RECOVERING);
5830 pl->get_peering_perf().tinc(rs_recovering_latency, dur);
5831 }
5832
5833 PeeringState::Recovered::Recovered(my_context ctx)
5834 : my_base(ctx),
5835 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/Recovered")
5836 {
5837 pg_shard_t auth_log_shard;
5838
5839 context< PeeringMachine >().log_enter(state_name);
5840
5841 DECLARE_LOCALS;
5842
5843 ceph_assert(!ps->needs_recovery());
5844
5845 // if we finished backfill, all acting are active; recheck if
5846 // DEGRADED | UNDERSIZED is appropriate.
5847 ceph_assert(!ps->acting_recovery_backfill.empty());
5848 if (ps->get_osdmap()->get_pg_size(context< PeeringMachine >().spgid.pgid) <=
5849 ps->acting_recovery_backfill.size()) {
5850 ps->state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
5851 pl->publish_stats_to_osd();
5852 }
5853
5854 // adjust acting set? (e.g. because backfill completed...)
5855 bool history_les_bound = false;
5856 if (ps->acting != ps->up && !ps->choose_acting(auth_log_shard,
5857 true, &history_les_bound)) {
5858 ceph_assert(ps->want_acting.size());
5859 } else if (!ps->async_recovery_targets.empty()) {
5860 // FIXME: Uh-oh we have to check this return value; choose_acting can fail!
5861 ps->choose_acting(auth_log_shard, true, &history_les_bound);
5862 }
5863
5864 if (context< Active >().all_replicas_activated &&
5865 ps->async_recovery_targets.empty())
5866 post_event(GoClean());
5867 }
5868
5869 void PeeringState::Recovered::exit()
5870 {
5871 context< PeeringMachine >().log_exit(state_name, enter_time);
5872 DECLARE_LOCALS;
5873
5874 utime_t dur = ceph_clock_now() - enter_time;
5875 pl->get_peering_perf().tinc(rs_recovered_latency, dur);
5876 }
5877
5878 PeeringState::Clean::Clean(my_context ctx)
5879 : my_base(ctx),
5880 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/Clean")
5881 {
5882 context< PeeringMachine >().log_enter(state_name);
5883
5884 DECLARE_LOCALS;
5885
5886 if (ps->info.last_complete != ps->info.last_update) {
5887 ceph_abort();
5888 }
5889
5890
5891 ps->try_mark_clean();
5892
5893 context< PeeringMachine >().get_cur_transaction().register_on_commit(
5894 pl->on_clean());
5895 }
5896
5897 void PeeringState::Clean::exit()
5898 {
5899 context< PeeringMachine >().log_exit(state_name, enter_time);
5900
5901 DECLARE_LOCALS;
5902 ps->state_clear(PG_STATE_CLEAN);
5903 utime_t dur = ceph_clock_now() - enter_time;
5904 pl->get_peering_perf().tinc(rs_clean_latency, dur);
5905 }
5906
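// Reservations are taken per OSD rather than per shard, so the helper
// below keeps only the first shard seen for each OSD, skipping
// ourselves; an erasure-coded PG can, in some remapped configurations,
// place several shards on one OSD. Illustratively:
//
//   unique_osd_shard_set(0(0), {0(0), 1(1), 1(2), 2(3)}) == {1(1), 2(3)}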
5907 template <typename T>
5908 set<pg_shard_t> unique_osd_shard_set(const pg_shard_t & skip, const T &in)
5909 {
5910 set<int> osds_found;
5911 set<pg_shard_t> out;
5912 for (auto i = in.begin(); i != in.end(); ++i) {
5913 if (*i != skip && !osds_found.count(i->osd)) {
5914 osds_found.insert(i->osd);
5915 out.insert(*i);
5916 }
5917 }
5918 return out;
5919 }
5920
5921 /*---------Active---------*/
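// Active is the primary's steady state: entry calls activate() and
// seeds blocked_by with every acting_recovery_backfill shard other than
// our own. MInfoRec and ActivateCommitted tick peers off via
// peer_activated, and once everyone has committed,
// all_activated_and_committed() posts AllReplicasActivated.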
5922 PeeringState::Active::Active(my_context ctx)
5923 : my_base(ctx),
5924 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active"),
5925 remote_shards_to_reserve_recovery(
5926 unique_osd_shard_set(
5927 context< PeeringMachine >().state->pg_whoami,
5928 context< PeeringMachine >().state->acting_recovery_backfill)),
5929 remote_shards_to_reserve_backfill(
5930 unique_osd_shard_set(
5931 context< PeeringMachine >().state->pg_whoami,
5932 context< PeeringMachine >().state->backfill_targets)),
5933 all_replicas_activated(false)
5934 {
5935 context< PeeringMachine >().log_enter(state_name);
5936
5937
5938 DECLARE_LOCALS;
5939
5940 ceph_assert(!ps->backfill_reserved);
5941 ceph_assert(ps->is_primary());
5942 psdout(10) << "In Active, about to call activate" << dendl;
5943 ps->start_flush(context< PeeringMachine >().get_cur_transaction());
5944 ps->activate(context< PeeringMachine >().get_cur_transaction(),
5945 ps->get_osdmap_epoch(),
5946 context< PeeringMachine >().get_recovery_ctx());
5947
5948 // everyone has to commit/ack before we are truly active
5949 ps->blocked_by.clear();
5950 for (auto p = ps->acting_recovery_backfill.begin();
5951 p != ps->acting_recovery_backfill.end();
5952 ++p) {
5953 if (p->shard != ps->pg_whoami.shard) {
5954 ps->blocked_by.insert(p->shard);
5955 }
5956 }
5957 pl->publish_stats_to_osd();
5958 psdout(10) << "Activate Finished" << dendl;
5959 }
5960
5961 boost::statechart::result PeeringState::Active::react(const AdvMap& advmap)
5962 {
5963 DECLARE_LOCALS;
5964
5965 if (ps->should_restart_peering(
5966 advmap.up_primary,
5967 advmap.acting_primary,
5968 advmap.newup,
5969 advmap.newacting,
5970 advmap.lastmap,
5971 advmap.osdmap)) {
5972 psdout(10) << "Active advmap interval change, fast return" << dendl;
5973 return forward_event();
5974 }
5975 psdout(10) << "Active advmap" << dendl;
5976 bool need_publish = false;
5977
5978 pl->on_active_advmap(advmap.osdmap);
5979 if (ps->dirty_big_info) {
5980 // share updated purged_snaps to mgr/mon so that we (a) stop reporting
5981 // purged snaps and (b) perhaps share more snaps that we have purged
5982 // but didn't fit in pg_stat_t.
5983 need_publish = true;
5984 ps->share_pg_info();
5985 }
5986
5987 bool need_acting_change = false;
5988 for (size_t i = 0; i < ps->want_acting.size(); i++) {
5989 int osd = ps->want_acting[i];
5990 if (!advmap.osdmap->is_up(osd)) {
5991 pg_shard_t osd_with_shard(osd, shard_id_t(i));
5992 if (!ps->is_acting(osd_with_shard) && !ps->is_up(osd_with_shard)) {
5993 psdout(10) << "Active stray osd." << osd << " in want_acting is down"
5994 << dendl;
5995 need_acting_change = true;
5996 }
5997 }
5998 }
5999 if (need_acting_change) {
6000 psdout(10) << "Active need acting change, call choose_acting again"
6001 << dendl;
6002     // this can happen when we re-add strays to the acting set and some
6003     // of them then go down in a subsequent map before we see the map
6004     // changing the pg_temp.
6005     // call choose_acting again to clear them out.
6006     // note that we leave restrict_to_up_acting false so that we don't
6007     // evict any chosen stray that is still alive.
6008 pg_shard_t auth_log_shard;
6009 bool history_les_bound = false;
6010 ps->remove_down_peer_info(advmap.osdmap);
6011 ps->choose_acting(auth_log_shard, false, &history_les_bound, true);
6012 }
6013
6014 /* Check for changes in pool size (if the acting set changed as a result,
6015 * this does not matter) */
6016 if (advmap.lastmap->get_pg_size(ps->info.pgid.pgid) !=
6017 ps->get_osdmap()->get_pg_size(ps->info.pgid.pgid)) {
6018 if (ps->get_osdmap()->get_pg_size(ps->info.pgid.pgid) <=
6019 ps->actingset.size()) {
6020 ps->state_clear(PG_STATE_UNDERSIZED);
6021 } else {
6022 ps->state_set(PG_STATE_UNDERSIZED);
6023 }
6024     // degraded changes will be detected by the call to publish_stats_to_osd()
6025 need_publish = true;
6026 }
6027
6028 // if we haven't reported our PG stats in a long time, do so now.
6029 if (ps->info.stats.reported_epoch + ps->cct->_conf->osd_pg_stat_report_interval_max < advmap.osdmap->get_epoch()) {
6030 psdout(20) << "reporting stats to osd after " << (advmap.osdmap->get_epoch() - ps->info.stats.reported_epoch)
6031 << " epochs" << dendl;
6032 need_publish = true;
6033 }
6034
6035 if (need_publish)
6036 pl->publish_stats_to_osd();
6037
6038 if (ps->check_prior_readable_down_osds(advmap.osdmap)) {
6039 pl->recheck_readable();
6040 }
6041
6042 return forward_event();
6043 }
6044
6045 boost::statechart::result PeeringState::Active::react(const ActMap&)
6046 {
6047 DECLARE_LOCALS;
6048 psdout(10) << "Active: handling ActMap" << dendl;
6049 ceph_assert(ps->is_primary());
6050
6051 pl->on_active_actmap();
6052
6053 if (ps->have_unfound()) {
6054     // objects may have become unfound
6055 ps->discover_all_missing(context<PeeringMachine>().get_recovery_ctx().msgs);
6056 }
6057
6058 uint64_t unfound = ps->missing_loc.num_unfound();
6059 if (unfound > 0 &&
6060 ps->all_unfound_are_queried_or_lost(ps->get_osdmap())) {
6061 if (ps->cct->_conf->osd_auto_mark_unfound_lost) {
6062 pl->get_clog_error() << context< PeeringMachine >().spgid.pgid << " has " << unfound
6063 << " objects unfound and apparently lost, would automatically "
6064 << "mark these objects lost but this feature is not yet implemented "
6065 << "(osd_auto_mark_unfound_lost)";
6066 } else
6067 pl->get_clog_error() << context< PeeringMachine >().spgid.pgid << " has "
6068 << unfound << " objects unfound and apparently lost";
6069 }
6070
6071 return forward_event();
6072 }
6073
6074 boost::statechart::result PeeringState::Active::react(const MNotifyRec& notevt)
6075 {
6076
6077 DECLARE_LOCALS;
6078 ceph_assert(ps->is_primary());
6079 if (ps->peer_info.count(notevt.from)) {
6080 psdout(10) << "Active: got notify from " << notevt.from
6081 << ", already have info from that osd, ignoring"
6082 << dendl;
6083 } else if (ps->peer_purged.count(notevt.from)) {
6084 psdout(10) << "Active: got notify from " << notevt.from
6085 << ", already purged that peer, ignoring"
6086 << dendl;
6087 } else {
6088 psdout(10) << "Active: got notify from " << notevt.from
6089 << ", calling proc_replica_info and discover_all_missing"
6090 << dendl;
6091 ps->proc_replica_info(
6092 notevt.from, notevt.notify.info, notevt.notify.epoch_sent);
6093 if (ps->have_unfound() || (ps->is_degraded() && ps->might_have_unfound.count(notevt.from))) {
6094 ps->discover_all_missing(
6095 context<PeeringMachine>().get_recovery_ctx().msgs);
6096 }
6097     // check if it is a previously down acting member that's coming back.
6098 // if so, request pg_temp change to trigger a new interval transition
6099 pg_shard_t auth_log_shard;
6100 bool history_les_bound = false;
6101 // FIXME: Uh-oh we have to check this return value; choose_acting can fail!
6102 ps->choose_acting(auth_log_shard, false, &history_les_bound, true);
6103 if (!ps->want_acting.empty() && ps->want_acting != ps->acting) {
6104 psdout(10) << "Active: got notify from previous acting member "
6105 << notevt.from << ", requesting pg_temp change"
6106 << dendl;
6107 }
6108 }
6109 return discard_event();
6110 }
6111
6112 boost::statechart::result PeeringState::Active::react(const MTrim& trim)
6113 {
6114 DECLARE_LOCALS;
6115 ceph_assert(ps->is_primary());
6116
6117 // peer is informing us of their last_complete_ondisk
6118 ldout(ps->cct,10) << " replica osd." << trim.from << " lcod " << trim.trim_to << dendl;
6119 ps->update_peer_last_complete_ondisk(pg_shard_t{trim.from, trim.shard},
6120 trim.trim_to);
6121 // trim log when the pg is recovered
6122 ps->calc_min_last_complete_ondisk();
6123 return discard_event();
6124 }
6125
6126 boost::statechart::result PeeringState::Active::react(const MInfoRec& infoevt)
6127 {
6128 DECLARE_LOCALS;
6129 ceph_assert(ps->is_primary());
6130
6131 ceph_assert(!ps->acting_recovery_backfill.empty());
6132 if (infoevt.lease_ack) {
6133 ps->proc_lease_ack(infoevt.from.osd, *infoevt.lease_ack);
6134 }
6135 // don't update history (yet) if we are active and primary; the replica
6136 // may be telling us they have activated (and committed) but we can't
6137 // share that until _everyone_ does the same.
6138 if (ps->is_acting_recovery_backfill(infoevt.from) &&
6139 ps->peer_activated.count(infoevt.from) == 0) {
6140 psdout(10) << " peer osd." << infoevt.from
6141 << " activated and committed" << dendl;
6142 ps->peer_activated.insert(infoevt.from);
6143 ps->blocked_by.erase(infoevt.from.shard);
6144 pl->publish_stats_to_osd();
6145 if (ps->peer_activated.size() == ps->acting_recovery_backfill.size()) {
6146 all_activated_and_committed();
6147 }
6148 }
6149 return discard_event();
6150 }
6151
6152 boost::statechart::result PeeringState::Active::react(const MLogRec& logevt)
6153 {
6154 DECLARE_LOCALS;
6155 psdout(10) << "searching osd." << logevt.from
6156 << " log for unfound items" << dendl;
6157 ps->proc_replica_log(
6158 logevt.msg->info, logevt.msg->log, std::move(logevt.msg->missing), logevt.from);
6159 bool got_missing = ps->search_for_missing(
6160 ps->peer_info[logevt.from],
6161 ps->peer_missing[logevt.from],
6162 logevt.from,
6163 context< PeeringMachine >().get_recovery_ctx());
6164   // If there are missing objects AND we are "fully" active, start recovery now
6165 if (got_missing && ps->state_test(PG_STATE_ACTIVE)) {
6166 post_event(DoRecovery());
6167 }
6168 return discard_event();
6169 }
6170
6171 boost::statechart::result PeeringState::Active::react(const QueryState& q)
6172 {
6173 DECLARE_LOCALS;
6174
6175 q.f->open_object_section("state");
6176 q.f->dump_string("name", state_name);
6177 q.f->dump_stream("enter_time") << enter_time;
6178
6179 {
6180 q.f->open_array_section("might_have_unfound");
6181 for (auto p = ps->might_have_unfound.begin();
6182 p != ps->might_have_unfound.end();
6183 ++p) {
6184 q.f->open_object_section("osd");
6185 q.f->dump_stream("osd") << *p;
6186 if (ps->peer_missing.count(*p)) {
6187 q.f->dump_string("status", "already probed");
6188 } else if (ps->peer_missing_requested.count(*p)) {
6189 q.f->dump_string("status", "querying");
6190 } else if (!ps->get_osdmap()->is_up(p->osd)) {
6191 q.f->dump_string("status", "osd is down");
6192 } else {
6193 q.f->dump_string("status", "not queried");
6194 }
6195 q.f->close_section();
6196 }
6197 q.f->close_section();
6198 }
6199 {
6200 q.f->open_object_section("recovery_progress");
6201 q.f->open_array_section("backfill_targets");
6202 for (auto p = ps->backfill_targets.begin();
6203 p != ps->backfill_targets.end(); ++p)
6204 q.f->dump_stream("replica") << *p;
6205 q.f->close_section();
6206 pl->dump_recovery_info(q.f);
6207 q.f->close_section();
6208 }
6209
6210 q.f->close_section();
6211 return forward_event();
6212 }
6213
6214 boost::statechart::result PeeringState::Active::react(const QueryUnfound& q)
6215 {
6216 DECLARE_LOCALS;
6217
6218 ps->query_unfound(q.f, "Active");
6219 return discard_event();
6220 }
6221
6222 boost::statechart::result PeeringState::Active::react(
6223 const ActivateCommitted &evt)
6224 {
6225 DECLARE_LOCALS;
6226 ceph_assert(!ps->peer_activated.count(ps->pg_whoami));
6227 ps->peer_activated.insert(ps->pg_whoami);
6228 psdout(10) << "_activate_committed " << evt.epoch
6229 << " peer_activated now " << ps->peer_activated
6230 << " last_interval_started "
6231 << ps->info.history.last_interval_started
6232 << " last_epoch_started "
6233 << ps->info.history.last_epoch_started
6234 << " same_interval_since "
6235 << ps->info.history.same_interval_since
6236 << dendl;
6237 ceph_assert(!ps->acting_recovery_backfill.empty());
6238 if (ps->peer_activated.size() == ps->acting_recovery_backfill.size())
6239 all_activated_and_committed();
6240 return discard_event();
6241 }
6242
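// AllReplicasActivated moves the PG out of ACTIVATING. A pending merge
// parks it in PEERED|PREMERGE (flagging not-ready-to-merge when the
// acting set is short); a non-writeable acting set yields PEERED;
// otherwise we go ACTIVE. If the prior interval's read lease upper
// bound is still in the future, we additionally set PG_STATE_WAIT and
// queue a readability check for the remaining
// (prior_readable_until_ub - mnow).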
6243 boost::statechart::result PeeringState::Active::react(const AllReplicasActivated &evt)
6244 {
6245
6246 DECLARE_LOCALS;
6247 pg_t pgid = context< PeeringMachine >().spgid.pgid;
6248
6249 all_replicas_activated = true;
6250
6251 ps->state_clear(PG_STATE_ACTIVATING);
6252 ps->state_clear(PG_STATE_CREATING);
6253 ps->state_clear(PG_STATE_PREMERGE);
6254
6255 bool merge_target;
6256 if (ps->pool.info.is_pending_merge(pgid, &merge_target)) {
6257 ps->state_set(PG_STATE_PEERED);
6258 ps->state_set(PG_STATE_PREMERGE);
6259
6260 if (ps->actingset.size() != ps->get_osdmap()->get_pg_size(pgid)) {
6261 if (merge_target) {
6262 pg_t src = pgid;
6263 src.set_ps(ps->pool.info.get_pg_num_pending());
6264         ceph_assert(src.get_parent() == pgid);
6265 pl->set_not_ready_to_merge_target(pgid, src);
6266 } else {
6267 pl->set_not_ready_to_merge_source(pgid);
6268 }
6269 }
6270 } else if (!ps->acting_set_writeable()) {
6271 ps->state_set(PG_STATE_PEERED);
6272 } else {
6273 ps->state_set(PG_STATE_ACTIVE);
6274 }
6275
6276 auto mnow = pl->get_mnow();
6277 if (ps->prior_readable_until_ub > mnow) {
6278 psdout(10) << " waiting for prior_readable_until_ub "
6279 << ps->prior_readable_until_ub << " > mnow " << mnow << dendl;
6280 ps->state_set(PG_STATE_WAIT);
6281 pl->queue_check_readable(
6282 ps->last_peering_reset,
6283 ps->prior_readable_until_ub - mnow);
6284 } else {
6285 psdout(10) << " mnow " << mnow << " >= prior_readable_until_ub "
6286 << ps->prior_readable_until_ub << dendl;
6287 }
6288
6289 if (ps->pool.info.has_flag(pg_pool_t::FLAG_CREATING)) {
6290 pl->send_pg_created(pgid);
6291 }
6292
6293 ps->info.history.last_epoch_started = ps->info.last_epoch_started;
6294 ps->info.history.last_interval_started = ps->info.last_interval_started;
6295 ps->dirty_info = true;
6296
6297 ps->share_pg_info();
6298 pl->publish_stats_to_osd();
6299
6300 pl->on_activate_complete();
6301
6302 return discard_event();
6303 }
6304
6305 boost::statechart::result PeeringState::Active::react(const RenewLease& rl)
6306 {
6307 DECLARE_LOCALS;
6308 ps->proc_renew_lease();
6309 return discard_event();
6310 }
6311
6312 boost::statechart::result PeeringState::Active::react(const MLeaseAck& la)
6313 {
6314 DECLARE_LOCALS;
6315 ps->proc_lease_ack(la.from, la.lease_ack);
6316 return discard_event();
6317 }
6318
6319
6320 boost::statechart::result PeeringState::Active::react(const CheckReadable &evt)
6321 {
6322 DECLARE_LOCALS;
6323 pl->recheck_readable();
6324 return discard_event();
6325 }
6326
6327 /*
6328 * update info.history.last_epoch_started ONLY after we and all
6329 * replicas have activated AND committed the activate transaction
6330 * (i.e. the peering results are stable on disk).
6331 */
6332 void PeeringState::Active::all_activated_and_committed()
6333 {
6334 DECLARE_LOCALS;
6335 psdout(10) << "all_activated_and_committed" << dendl;
6336 ceph_assert(ps->is_primary());
6337 ceph_assert(ps->peer_activated.size() == ps->acting_recovery_backfill.size());
6338 ceph_assert(!ps->acting_recovery_backfill.empty());
6339 ceph_assert(ps->blocked_by.empty());
6340
6341 if (HAVE_FEATURE(ps->upacting_features, SERVER_OCTOPUS)) {
6342 // this is overkill when the activation is quick, but when it is slow it
6343 // is important, because the lease was renewed by the activate itself but we
6344 // don't know how long ago that was, and simply scheduling now may leave
6345 // a gap in lease coverage. keep it simple and aggressively renew.
6346 ps->renew_lease(pl->get_mnow());
6347 ps->send_lease();
6348 ps->schedule_renew_lease();
6349 }
6350
6351 // Degraded?
6352 ps->update_calc_stats();
6353 if (ps->info.stats.stats.sum.num_objects_degraded) {
6354 ps->state_set(PG_STATE_DEGRADED);
6355 } else {
6356 ps->state_clear(PG_STATE_DEGRADED);
6357 }
6358
6359 post_event(PeeringState::AllReplicasActivated());
6360 }
6361
6362
6363 void PeeringState::Active::exit()
6364 {
6365 context< PeeringMachine >().log_exit(state_name, enter_time);
6366
6367
6368 DECLARE_LOCALS;
6369 pl->cancel_local_background_io_reservation();
6370
6371 ps->blocked_by.clear();
6372 ps->backfill_reserved = false;
6373 ps->state_clear(PG_STATE_ACTIVATING);
6374 ps->state_clear(PG_STATE_DEGRADED);
6375 ps->state_clear(PG_STATE_UNDERSIZED);
6376 ps->state_clear(PG_STATE_BACKFILL_TOOFULL);
6377 ps->state_clear(PG_STATE_BACKFILL_WAIT);
6378 ps->state_clear(PG_STATE_RECOVERY_WAIT);
6379 ps->state_clear(PG_STATE_RECOVERY_TOOFULL);
6380 utime_t dur = ceph_clock_now() - enter_time;
6381 pl->get_peering_perf().tinc(rs_active_latency, dur);
6382 pl->on_active_exit();
6383 }
6384
6385 /*------ReplicaActive-----*/
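// ReplicaActive mirrors Active on non-primary shards: Activate runs the
// same activate() path, ActivateCommitted reports our info (with a
// lease ack) back to the primary, and MLease, MTrim, MInfoRec, and
// MLogRec apply whatever the primary sends us.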
6386 PeeringState::ReplicaActive::ReplicaActive(my_context ctx)
6387 : my_base(ctx),
6388 NamedState(context< PeeringMachine >().state_history, "Started/ReplicaActive")
6389 {
6390 context< PeeringMachine >().log_enter(state_name);
6391
6392 DECLARE_LOCALS;
6393 ps->start_flush(context< PeeringMachine >().get_cur_transaction());
6394 }
6395
6396
6397 boost::statechart::result PeeringState::ReplicaActive::react(
6398 const Activate& actevt) {
6399 DECLARE_LOCALS;
6400 psdout(10) << "In ReplicaActive, about to call activate" << dendl;
6401 ps->activate(
6402 context< PeeringMachine >().get_cur_transaction(),
6403 actevt.activation_epoch,
6404 context< PeeringMachine >().get_recovery_ctx());
6405 psdout(10) << "Activate Finished" << dendl;
6406 return discard_event();
6407 }
6408
6409 boost::statechart::result PeeringState::ReplicaActive::react(
6410 const ActivateCommitted &evt)
6411 {
6412 DECLARE_LOCALS;
6413 psdout(10) << __func__ << " " << evt.epoch << " telling primary" << dendl;
6414
6415 auto &rctx = context<PeeringMachine>().get_recovery_ctx();
6416 auto epoch = ps->get_osdmap_epoch();
6417 pg_info_t i = ps->info;
6418 i.history.last_epoch_started = evt.activation_epoch;
6419 i.history.last_interval_started = i.history.same_interval_since;
6420 rctx.send_info(
6421 ps->get_primary().osd,
6422 spg_t(ps->info.pgid.pgid, ps->get_primary().shard),
6423 epoch,
6424 epoch,
6425 i,
6426 {}, /* lease */
6427 ps->get_lease_ack());
6428
6429 if (ps->acting_set_writeable()) {
6430 ps->state_set(PG_STATE_ACTIVE);
6431 } else {
6432 ps->state_set(PG_STATE_PEERED);
6433 }
6434 pl->on_activate_committed();
6435
6436 return discard_event();
6437 }
6438
6439 boost::statechart::result PeeringState::ReplicaActive::react(const MLease& l)
6440 {
6441 DECLARE_LOCALS;
6442 spg_t spgid = context< PeeringMachine >().spgid;
6443 epoch_t epoch = pl->get_osdmap_epoch();
6444
6445 ps->proc_lease(l.lease);
6446 pl->send_cluster_message(
6447 ps->get_primary().osd,
6448 make_message<MOSDPGLeaseAck>(epoch,
6449 spg_t(spgid.pgid, ps->get_primary().shard),
6450 ps->get_lease_ack()),
6451 epoch);
6452 return discard_event();
6453 }
6454
6455 boost::statechart::result PeeringState::ReplicaActive::react(const MInfoRec& infoevt)
6456 {
6457 DECLARE_LOCALS;
6458 ps->proc_primary_info(context<PeeringMachine>().get_cur_transaction(),
6459 infoevt.info);
6460 return discard_event();
6461 }
6462
6463 boost::statechart::result PeeringState::ReplicaActive::react(const MLogRec& logevt)
6464 {
6465 DECLARE_LOCALS;
6466 psdout(10) << "received log from " << logevt.from << dendl;
6467 ObjectStore::Transaction &t = context<PeeringMachine>().get_cur_transaction();
6468 ps->merge_log(t, logevt.msg->info, std::move(logevt.msg->log), logevt.from);
6469 ceph_assert(ps->pg_log.get_head() == ps->info.last_update);
6470 if (logevt.msg->lease) {
6471 ps->proc_lease(*logevt.msg->lease);
6472 }
6473
6474 return discard_event();
6475 }
6476
6477 boost::statechart::result PeeringState::ReplicaActive::react(const MTrim& trim)
6478 {
6479 DECLARE_LOCALS;
6480 // primary is instructing us to trim
6481 ps->pg_log.trim(trim.trim_to, ps->info);
6482 ps->dirty_info = true;
6483 return discard_event();
6484 }
6485
6486 boost::statechart::result PeeringState::ReplicaActive::react(const ActMap&)
6487 {
6488 DECLARE_LOCALS;
6489 if (ps->should_send_notify() && ps->get_primary().osd >= 0) {
6490 ps->info.history.refresh_prior_readable_until_ub(
6491 pl->get_mnow(), ps->prior_readable_until_ub);
6492 context< PeeringMachine >().send_notify(
6493 ps->get_primary().osd,
6494 pg_notify_t(
6495 ps->get_primary().shard, ps->pg_whoami.shard,
6496 ps->get_osdmap_epoch(),
6497 ps->get_osdmap_epoch(),
6498 ps->info,
6499 ps->past_intervals));
6500 }
6501 return discard_event();
6502 }
6503
6504 boost::statechart::result PeeringState::ReplicaActive::react(
6505 const MQuery& query)
6506 {
6507 DECLARE_LOCALS;
6508 ps->fulfill_query(query, context<PeeringMachine>().get_recovery_ctx());
6509 return discard_event();
6510 }
6511
6512 boost::statechart::result PeeringState::ReplicaActive::react(const QueryState& q)
6513 {
6514 q.f->open_object_section("state");
6515 q.f->dump_string("name", state_name);
6516 q.f->dump_stream("enter_time") << enter_time;
6517 q.f->close_section();
6518 return forward_event();
6519 }
6520
6521 boost::statechart::result PeeringState::ReplicaActive::react(const QueryUnfound& q)
6522 {
6523 q.f->dump_string("state", "ReplicaActive");
6524 q.f->dump_bool("available_might_have_unfound", false);
6525 return discard_event();
6526 }
6527
6528 void PeeringState::ReplicaActive::exit()
6529 {
6530 context< PeeringMachine >().log_exit(state_name, enter_time);
6531 DECLARE_LOCALS;
6532 pl->unreserve_recovery_space();
6533
6534 pl->cancel_remote_recovery_reservation();
6535 utime_t dur = ceph_clock_now() - enter_time;
6536 pl->get_peering_perf().tinc(rs_replicaactive_latency, dur);
6537
6538 ps->min_last_complete_ondisk = eversion_t();
6539 }
6540
6541 /*-------Stray---*/
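// Stray is where a non-primary, non-peered shard waits to learn its
// fate: a deleted pool short-circuits to DeleteStart, while an MLogRec
// (a full log, or a backfill restart when last_backfill is empty) or an
// MInfoRec (possibly rewinding divergent entries first) ends in
// Activate and ReplicaActive.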
6542 PeeringState::Stray::Stray(my_context ctx)
6543 : my_base(ctx),
6544 NamedState(context< PeeringMachine >().state_history, "Started/Stray")
6545 {
6546 context< PeeringMachine >().log_enter(state_name);
6547
6548
6549 DECLARE_LOCALS;
6550 ceph_assert(!ps->is_peered());
6551 ceph_assert(!ps->is_peering());
6552 ceph_assert(!ps->is_primary());
6553
6554 if (!ps->get_osdmap()->have_pg_pool(ps->info.pgid.pgid.pool())) {
6555 ldout(ps->cct,10) << __func__ << " pool is deleted" << dendl;
6556 post_event(DeleteStart());
6557 } else {
6558 ps->start_flush(context< PeeringMachine >().get_cur_transaction());
6559 }
6560 }
6561
6562 boost::statechart::result PeeringState::Stray::react(const MLogRec& logevt)
6563 {
6564 DECLARE_LOCALS;
6565 MOSDPGLog *msg = logevt.msg.get();
6566 psdout(10) << "got info+log from osd." << logevt.from << " " << msg->info << " " << msg->log << dendl;
6567
6568 ObjectStore::Transaction &t = context<PeeringMachine>().get_cur_transaction();
6569 if (msg->info.last_backfill == hobject_t()) {
6570 // restart backfill
6571 ps->info = msg->info;
6572 pl->on_info_history_change();
6573 ps->dirty_info = true;
6574 ps->dirty_big_info = true; // maybe.
6575
6576 PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
6577 ps->pg_log.reset_backfill_claim_log(msg->log, rollbacker.get());
6578
6579 ps->pg_log.reset_backfill();
6580 } else {
6581 ps->merge_log(t, msg->info, std::move(msg->log), logevt.from);
6582 }
6583 if (logevt.msg->lease) {
6584 ps->proc_lease(*logevt.msg->lease);
6585 }
6586
6587 ceph_assert(ps->pg_log.get_head() == ps->info.last_update);
6588
6589 post_event(Activate(logevt.msg->info.last_epoch_started));
6590 return transit<ReplicaActive>();
6591 }
6592
6593 boost::statechart::result PeeringState::Stray::react(const MInfoRec& infoevt)
6594 {
6595 DECLARE_LOCALS;
6596 psdout(10) << "got info from osd." << infoevt.from << " " << infoevt.info << dendl;
6597
6598 if (ps->info.last_update > infoevt.info.last_update) {
6599 // rewind divergent log entries
6600 ObjectStore::Transaction &t = context<PeeringMachine>().get_cur_transaction();
6601 ps->rewind_divergent_log(t, infoevt.info.last_update);
6602 ps->info.stats = infoevt.info.stats;
6603 ps->info.hit_set = infoevt.info.hit_set;
6604 }
6605
6606 if (infoevt.lease) {
6607 ps->proc_lease(*infoevt.lease);
6608 }
6609
6610 ceph_assert(infoevt.info.last_update == ps->info.last_update);
6611 ceph_assert(ps->pg_log.get_head() == ps->info.last_update);
6612
6613 post_event(Activate(infoevt.info.last_epoch_started));
6614 return transit<ReplicaActive>();
6615 }
6616
6617 boost::statechart::result PeeringState::Stray::react(const MQuery& query)
6618 {
6619 DECLARE_LOCALS;
6620 ps->fulfill_query(query, context<PeeringMachine>().get_recovery_ctx());
6621 return discard_event();
6622 }
6623
6624 boost::statechart::result PeeringState::Stray::react(const ActMap&)
6625 {
6626 DECLARE_LOCALS;
6627 if (ps->should_send_notify() && ps->get_primary().osd >= 0) {
6628 ps->info.history.refresh_prior_readable_until_ub(
6629 pl->get_mnow(), ps->prior_readable_until_ub);
6630 context< PeeringMachine >().send_notify(
6631 ps->get_primary().osd,
6632 pg_notify_t(
6633 ps->get_primary().shard, ps->pg_whoami.shard,
6634 ps->get_osdmap_epoch(),
6635 ps->get_osdmap_epoch(),
6636 ps->info,
6637 ps->past_intervals));
6638 }
6639 return discard_event();
6640 }
6641
6642 void PeeringState::Stray::exit()
6643 {
6644 context< PeeringMachine >().log_exit(state_name, enter_time);
6645 DECLARE_LOCALS;
6646 utime_t dur = ceph_clock_now() - enter_time;
6647 pl->get_peering_perf().tinc(rs_stray_latency, dur);
6648 }
6649
6650
6651 /*--------ToDelete----------*/
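// Deletion flow: ToDelete bumps l_osd_pg_removing and, via
// WaitDeleteReserved, competes for the local background io reservation
// at get_delete_priority(); if the priority changes, ActMap re-enters
// ToDelete to re-reserve. Once Deleting, work proceeds in DeleteSome
// increments through do_delete_work() until none remains and the
// machine terminates.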
6652 PeeringState::ToDelete::ToDelete(my_context ctx)
6653 : my_base(ctx),
6654 NamedState(context< PeeringMachine >().state_history, "Started/ToDelete")
6655 {
6656 context< PeeringMachine >().log_enter(state_name);
6657 DECLARE_LOCALS;
6658 pl->get_perf_logger().inc(l_osd_pg_removing);
6659 }
6660
6661 void PeeringState::ToDelete::exit()
6662 {
6663 context< PeeringMachine >().log_exit(state_name, enter_time);
6664 DECLARE_LOCALS;
6665 // note: on a successful removal, this path doesn't execute. see
6666 // _delete_some().
6667 pl->get_perf_logger().dec(l_osd_pg_removing);
6668
6669 pl->cancel_local_background_io_reservation();
6670 }
6671
6672 /*----WaitDeleteReserved----*/
6673 PeeringState::WaitDeleteReserved::WaitDeleteReserved(my_context ctx)
6674 : my_base(ctx),
6675 NamedState(context< PeeringMachine >().state_history,
6676              "Started/ToDelete/WaitDeleteReserved")
6677 {
6678 context< PeeringMachine >().log_enter(state_name);
6679 DECLARE_LOCALS;
6680 context< ToDelete >().priority = ps->get_delete_priority();
6681
6682 pl->cancel_local_background_io_reservation();
6683 pl->request_local_background_io_reservation(
6684 context<ToDelete>().priority,
6685 std::make_unique<PGPeeringEvent>(
6686 ps->get_osdmap_epoch(),
6687 ps->get_osdmap_epoch(),
6688 DeleteReserved()),
6689 std::make_unique<PGPeeringEvent>(
6690 ps->get_osdmap_epoch(),
6691 ps->get_osdmap_epoch(),
6692 DeleteInterrupted()));
6693 }
6694
6695 boost::statechart::result PeeringState::ToDelete::react(
6696 const ActMap& evt)
6697 {
6698 DECLARE_LOCALS;
6699 if (ps->get_delete_priority() != priority) {
6700 psdout(10) << __func__ << " delete priority changed, resetting"
6701 << dendl;
6702 return transit<ToDelete>();
6703 }
6704 return discard_event();
6705 }
6706
6707 void PeeringState::WaitDeleteReserved::exit()
6708 {
6709 context< PeeringMachine >().log_exit(state_name, enter_time);
6710 }
6711
6712 /*----Deleting-----*/
6713 PeeringState::Deleting::Deleting(my_context ctx)
6714 : my_base(ctx),
6715 NamedState(context< PeeringMachine >().state_history, "Started/ToDelete/Deleting")
6716 {
6717 context< PeeringMachine >().log_enter(state_name);
6718
6719 DECLARE_LOCALS;
6720 ps->deleting = true;
6721 ObjectStore::Transaction &t = context<PeeringMachine>().get_cur_transaction();
6722
6723 // clear log
6724 PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
6725 ps->pg_log.roll_forward(rollbacker.get());
6726
6727 // adjust info to backfill
6728 ps->info.set_last_backfill(hobject_t());
6729 ps->pg_log.reset_backfill();
6730 ps->dirty_info = true;
6731
6732 pl->on_removal(t);
6733 }
6734
6735 boost::statechart::result PeeringState::Deleting::react(
6736 const DeleteSome& evt)
6737 {
6738 DECLARE_LOCALS;
6739 std::pair<ghobject_t, bool> p;
6740 p = pl->do_delete_work(context<PeeringMachine>().get_cur_transaction(),
6741 next);
6742 next = p.first;
6743 return p.second ? discard_event() : terminate();
6744 }
6745
6746 void PeeringState::Deleting::exit()
6747 {
6748 context< PeeringMachine >().log_exit(state_name, enter_time);
6749 DECLARE_LOCALS;
6750 ps->deleting = false;
6751 pl->cancel_local_background_io_reservation();
6752 }
6753
6754 /*--------GetInfo---------*/
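// GetInfo opens the peering round: build_prior() computes the probe
// set, get_infos() sends pg_query_t::INFO to every probe peer whose
// info we lack (tracked in peer_info_requested and blocked_by), and
// GotInfo is posted once all replies are in -- unless the prior set
// says the PG is down, in which case IsDown wins.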
6755 PeeringState::GetInfo::GetInfo(my_context ctx)
6756 : my_base(ctx),
6757 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering/GetInfo")
6758 {
6759 context< PeeringMachine >().log_enter(state_name);
6760
6761
6762 DECLARE_LOCALS;
6763 ps->check_past_interval_bounds();
6764 ps->log_weirdness();
6765 PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
6766
6767 ceph_assert(ps->blocked_by.empty());
6768
6769 prior_set = ps->build_prior();
6770 ps->prior_readable_down_osds = prior_set.down;
6771 if (ps->prior_readable_down_osds.empty()) {
6772 psdout(10) << " no prior_set down osds, clearing prior_readable_until_ub"
6773 << dendl;
6774 ps->clear_prior_readable_until_ub();
6775 }
6776
6777 ps->reset_min_peer_features();
6778 get_infos();
6779 if (prior_set.pg_down) {
6780 post_event(IsDown());
6781 } else if (peer_info_requested.empty()) {
6782 post_event(GotInfo());
6783 }
6784 }
6785
6786 void PeeringState::GetInfo::get_infos()
6787 {
6788 DECLARE_LOCALS;
6789 PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
6790
6791 ps->blocked_by.clear();
6792 for (auto it = prior_set.probe.begin(); it != prior_set.probe.end(); ++it) {
6793 pg_shard_t peer = *it;
6794 if (peer == ps->pg_whoami) {
6795 continue;
6796 }
6797 if (ps->peer_info.count(peer)) {
6798 psdout(10) << " have osd." << peer << " info " << ps->peer_info[peer] << dendl;
6799 continue;
6800 }
6801 if (peer_info_requested.count(peer)) {
6802 psdout(10) << " already requested info from osd." << peer << dendl;
6803 ps->blocked_by.insert(peer.osd);
6804 } else if (!ps->get_osdmap()->is_up(peer.osd)) {
6805 psdout(10) << " not querying info from down osd." << peer << dendl;
6806 } else {
6807 psdout(10) << " querying info from osd." << peer << dendl;
6808 context< PeeringMachine >().send_query(
6809 peer.osd,
6810 pg_query_t(pg_query_t::INFO,
6811 it->shard, ps->pg_whoami.shard,
6812 ps->info.history,
6813 ps->get_osdmap_epoch()));
6814 peer_info_requested.insert(peer);
6815 ps->blocked_by.insert(peer.osd);
6816 }
6817 }
6818
6819 ps->check_prior_readable_down_osds(ps->get_osdmap());
6820
6821 pl->publish_stats_to_osd();
6822 }
6823
6824 boost::statechart::result PeeringState::GetInfo::react(const MNotifyRec& infoevt)
6825 {
6826
6827 DECLARE_LOCALS;
6828
6829 auto p = peer_info_requested.find(infoevt.from);
6830 if (p != peer_info_requested.end()) {
6831 peer_info_requested.erase(p);
6832 ps->blocked_by.erase(infoevt.from.osd);
6833 }
6834
6835 epoch_t old_start = ps->info.history.last_epoch_started;
6836 if (ps->proc_replica_info(
6837 infoevt.from, infoevt.notify.info, infoevt.notify.epoch_sent)) {
6838 // we got something new ...
6839 PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
6840 if (old_start < ps->info.history.last_epoch_started) {
6841 psdout(10) << " last_epoch_started moved forward, rebuilding prior" << dendl;
6842 prior_set = ps->build_prior();
6843 ps->prior_readable_down_osds = prior_set.down;
6844
6845 // filter out any osds that got dropped from the probe set from
6846 // peer_info_requested. this is less expensive than restarting
6847 // peering (which would re-probe everyone).
6848 auto p = peer_info_requested.begin();
6849 while (p != peer_info_requested.end()) {
6850 if (prior_set.probe.count(*p) == 0) {
6851 psdout(20) << " dropping osd." << *p << " from info_requested, no longer in probe set" << dendl;
6852 peer_info_requested.erase(p++);
6853 } else {
6854 ++p;
6855 }
6856 }
6857 get_infos();
6858 }
6859 psdout(20) << "Adding osd: " << infoevt.from.osd << " peer features: "
6860 << hex << infoevt.features << dec << dendl;
6861 ps->apply_peer_features(infoevt.features);
6862
6863 // are we done getting everything?
6864 if (peer_info_requested.empty() && !prior_set.pg_down) {
6865 psdout(20) << "Common peer features: " << hex << ps->get_min_peer_features() << dec << dendl;
6866 psdout(20) << "Common acting features: " << hex << ps->get_min_acting_features() << dec << dendl;
6867 psdout(20) << "Common upacting features: " << hex << ps->get_min_upacting_features() << dec << dendl;
6868 post_event(GotInfo());
6869 }
6870 }
6871 return discard_event();
6872 }
6873
6874 boost::statechart::result PeeringState::GetInfo::react(const QueryState& q)
6875 {
6876 DECLARE_LOCALS;
6877 q.f->open_object_section("state");
6878 q.f->dump_string("name", state_name);
6879 q.f->dump_stream("enter_time") << enter_time;
6880
6881 q.f->open_array_section("requested_info_from");
6882 for (auto p = peer_info_requested.begin();
6883 p != peer_info_requested.end();
6884 ++p) {
6885 q.f->open_object_section("osd");
6886 q.f->dump_stream("osd") << *p;
6887 if (ps->peer_info.count(*p)) {
6888 q.f->open_object_section("got_info");
6889 ps->peer_info[*p].dump(q.f);
6890 q.f->close_section();
6891 }
6892 q.f->close_section();
6893 }
6894 q.f->close_section();
6895
6896 q.f->close_section();
6897 return forward_event();
6898 }
6899
6900 boost::statechart::result PeeringState::GetInfo::react(const QueryUnfound& q)
6901 {
6902 q.f->dump_string("state", "GetInfo");
6903 q.f->dump_bool("available_might_have_unfound", false);
6904 return discard_event();
6905 }
6906
6907 void PeeringState::GetInfo::exit()
6908 {
6909 context< PeeringMachine >().log_exit(state_name, enter_time);
6910
6911 DECLARE_LOCALS;
6912 utime_t dur = ceph_clock_now() - enter_time;
6913 pl->get_peering_perf().tinc(rs_getinfo_latency, dur);
6914 ps->blocked_by.clear();
6915 }
6916
6917 /*------GetLog------------*/
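// GetLog: having gathered infos, pick the auth log shard via
// choose_acting() and, unless we are ourselves authoritative, request
// enough of that shard's log to make ours contiguous with it; GotLog
// then merges the master log and we proceed to GetMissing.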
6918 PeeringState::GetLog::GetLog(my_context ctx)
6919 : my_base(ctx),
6920 NamedState(
6921 context< PeeringMachine >().state_history,
6922 "Started/Primary/Peering/GetLog"),
6923 msg(0)
6924 {
6925 context< PeeringMachine >().log_enter(state_name);
6926
6927 DECLARE_LOCALS;
6928
6929 ps->log_weirdness();
6930
6931 // adjust acting?
6932 if (!ps->choose_acting(auth_log_shard, false,
6933 &context< Peering >().history_les_bound)) {
6934 if (!ps->want_acting.empty()) {
6935 post_event(NeedActingChange());
6936 } else {
6937 post_event(IsIncomplete());
6938 }
6939 return;
6940 }
6941
6942 // am i the best?
6943 if (auth_log_shard == ps->pg_whoami) {
6944 post_event(GotLog());
6945 return;
6946 }
6947
6948 const pg_info_t& best = ps->peer_info[auth_log_shard];
6949
6950 // am i broken?
6951 if (ps->info.last_update < best.log_tail) {
6952 psdout(10) << " not contiguous with osd." << auth_log_shard << ", down" << dendl;
6953 post_event(IsIncomplete());
6954 return;
6955 }
6956
6957 // how much log to request?
6958 eversion_t request_log_from = ps->info.last_update;
6959 ceph_assert(!ps->acting_recovery_backfill.empty());
6960 for (auto p = ps->acting_recovery_backfill.begin();
6961 p != ps->acting_recovery_backfill.end();
6962 ++p) {
6963 if (*p == ps->pg_whoami) continue;
6964 pg_info_t& ri = ps->peer_info[*p];
6965 if (ri.last_update < ps->info.log_tail && ri.last_update >= best.log_tail &&
6966 ri.last_update < request_log_from)
6967 request_log_from = ri.last_update;
6968 }
6969
6970 // request the log
6971 psdout(10) << " requesting log from osd." << auth_log_shard << dendl;
6972 context<PeeringMachine>().send_query(
6973 auth_log_shard.osd,
6974 pg_query_t(
6975 pg_query_t::LOG,
6976 auth_log_shard.shard, ps->pg_whoami.shard,
6977 request_log_from, ps->info.history,
6978 ps->get_osdmap_epoch()));
6979
6980 ceph_assert(ps->blocked_by.empty());
6981 ps->blocked_by.insert(auth_log_shard.osd);
6982 pl->publish_stats_to_osd();
6983 }
6984
6985 boost::statechart::result PeeringState::GetLog::react(const AdvMap& advmap)
6986 {
6987 // make sure our log source didn't go down. we need to check
6988 // explicitly because it may not be part of the prior set, which
6989 // means the Peering state check won't catch it going down.
6990 if (!advmap.osdmap->is_up(auth_log_shard.osd)) {
6991 psdout(10) << "GetLog: auth_log_shard osd."
6992 << auth_log_shard.osd << " went down" << dendl;
6993 post_event(advmap);
6994 return transit< Reset >();
6995 }
6996
6997 // let the Peering state do its checks.
6998 return forward_event();
6999 }
7000
7001 boost::statechart::result PeeringState::GetLog::react(const MLogRec& logevt)
7002 {
7003 ceph_assert(!msg);
7004 if (logevt.from != auth_log_shard) {
7005 psdout(10) << "GetLog: discarding log from "
7006 << "non-auth_log_shard osd." << logevt.from << dendl;
7007 return discard_event();
7008 }
7009 psdout(10) << "GetLog: received master log from osd."
7010 << logevt.from << dendl;
7011 msg = logevt.msg;
7012 post_event(GotLog());
7013 return discard_event();
7014 }
7015
7016 boost::statechart::result PeeringState::GetLog::react(const GotLog&)
7017 {
7018
7019 DECLARE_LOCALS;
7020 psdout(10) << "leaving GetLog" << dendl;
7021 if (msg) {
7022 psdout(10) << "processing master log" << dendl;
7023 ps->proc_master_log(context<PeeringMachine>().get_cur_transaction(),
7024 msg->info, std::move(msg->log), std::move(msg->missing),
7025 auth_log_shard);
7026 }
7027 ps->start_flush(context< PeeringMachine >().get_cur_transaction());
7028 return transit< GetMissing >();
7029 }
7030
7031 boost::statechart::result PeeringState::GetLog::react(const QueryState& q)
7032 {
7033 q.f->open_object_section("state");
7034 q.f->dump_string("name", state_name);
7035 q.f->dump_stream("enter_time") << enter_time;
7036 q.f->dump_stream("auth_log_shard") << auth_log_shard;
7037 q.f->close_section();
7038 return forward_event();
7039 }
7040
7041 boost::statechart::result PeeringState::GetLog::react(const QueryUnfound& q)
7042 {
7043 q.f->dump_string("state", "GetLog");
7044 q.f->dump_bool("available_might_have_unfound", false);
7045 return discard_event();
7046 }
7047
7048 void PeeringState::GetLog::exit()
7049 {
7050 context< PeeringMachine >().log_exit(state_name, enter_time);
7051
7052 DECLARE_LOCALS;
7053 utime_t dur = ceph_clock_now() - enter_time;
7054 pl->get_peering_perf().tinc(rs_getlog_latency, dur);
7055 ps->blocked_by.clear();
7056 }
7057
7058 /*------WaitActingChange--------*/
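// WaitActingChange: choose_acting() wants a different acting set (a
// pg_temp change has been requested), so wait for an osdmap that
// reflects the change, discarding log/info/notify traffic in the
// meantime.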
7059 PeeringState::WaitActingChange::WaitActingChange(my_context ctx)
7060 : my_base(ctx),
7061 NamedState(context< PeeringMachine >().state_history, "Started/Primary/WaitActingChange")
7062 {
7063 context< PeeringMachine >().log_enter(state_name);
7064 }
7065
7066 boost::statechart::result PeeringState::WaitActingChange::react(const AdvMap& advmap)
7067 {
7068 DECLARE_LOCALS;
7069 OSDMapRef osdmap = advmap.osdmap;
7070
7071 psdout(10) << "verifying that no want_acting " << ps->want_acting << " targets went down" << dendl;
7072 for (auto p = ps->want_acting.begin(); p != ps->want_acting.end(); ++p) {
7073 if (!osdmap->is_up(*p)) {
7074 psdout(10) << " want_acting target osd." << *p << " went down, resetting" << dendl;
7075 post_event(advmap);
7076 return transit< Reset >();
7077 }
7078 }
7079 return forward_event();
7080 }
7081
7082 boost::statechart::result PeeringState::WaitActingChange::react(const MLogRec& logevt)
7083 {
7084 psdout(10) << "In WaitActingChange, ignoring MLogRec" << dendl;
7085 return discard_event();
7086 }
7087
7088 boost::statechart::result PeeringState::WaitActingChange::react(const MInfoRec& evt)
7089 {
7090 psdout(10) << "In WaitActingChange, ignoring MInfoRec" << dendl;
7091 return discard_event();
7092 }
7093
7094 boost::statechart::result PeeringState::WaitActingChange::react(const MNotifyRec& evt)
7095 {
7096 psdout(10) << "In WaitActingChange, ignoring MNotifyRec" << dendl;
7097 return discard_event();
7098 }
7099
7100 boost::statechart::result PeeringState::WaitActingChange::react(const QueryState& q)
7101 {
7102 q.f->open_object_section("state");
7103 q.f->dump_string("name", state_name);
7104 q.f->dump_stream("enter_time") << enter_time;
7105 q.f->dump_string("comment", "waiting for pg acting set to change");
7106 q.f->close_section();
7107 return forward_event();
7108 }
7109
7110 boost::statechart::result PeeringState::WaitActingChange::react(const QueryUnfound& q)
7111 {
7112 q.f->dump_string("state", "WaitActingChange");
7113 q.f->dump_bool("available_might_have_unfound", false);
7114 return discard_event();
7115 }
7116
7117 void PeeringState::WaitActingChange::exit()
7118 {
7119 context< PeeringMachine >().log_exit(state_name, enter_time);
7120 DECLARE_LOCALS;
7121 utime_t dur = ceph_clock_now() - enter_time;
7122 pl->get_peering_perf().tinc(rs_waitactingchange_latency, dur);
7123 }
7124
7125 /*------Down--------*/
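// Down: peering cannot make progress because OSDs that may hold needed
// data (prior_set.down) are not up; record them in blocked_by and wait
// for one of them to return (see the MNotifyRec reaction below).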
7126 PeeringState::Down::Down(my_context ctx)
7127 : my_base(ctx),
7128 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering/Down")
7129 {
7130 context< PeeringMachine >().log_enter(state_name);
7131 DECLARE_LOCALS;
7132
7133 ps->state_clear(PG_STATE_PEERING);
7134 ps->state_set(PG_STATE_DOWN);
7135
7136 auto &prior_set = context< Peering >().prior_set;
7137 ceph_assert(ps->blocked_by.empty());
7138 ps->blocked_by.insert(prior_set.down.begin(), prior_set.down.end());
7139 pl->publish_stats_to_osd();
7140 }
7141
7142 void PeeringState::Down::exit()
7143 {
7144 context< PeeringMachine >().log_exit(state_name, enter_time);
7145
7146 DECLARE_LOCALS;
7147
7148 ps->state_clear(PG_STATE_DOWN);
7149 utime_t dur = ceph_clock_now() - enter_time;
7150 pl->get_peering_perf().tinc(rs_down_latency, dur);
7151
7152 ps->blocked_by.clear();
7153 }
7154
7155 boost::statechart::result PeeringState::Down::react(const QueryState& q)
7156 {
7157 q.f->open_object_section("state");
7158 q.f->dump_string("name", state_name);
7159 q.f->dump_stream("enter_time") << enter_time;
7160 q.f->dump_string("comment",
7161 "not enough up instances of this PG to go active");
7162 q.f->close_section();
7163 return forward_event();
7164 }
7165
7166 boost::statechart::result PeeringState::Down::react(const QueryUnfound& q)
7167 {
7168 q.f->dump_string("state", "Down");
7169 q.f->dump_bool("available_might_have_unfound", false);
7170 return discard_event();
7171 }
7172
7173 boost::statechart::result PeeringState::Down::react(const MNotifyRec& infoevt)
7174 {
7175 DECLARE_LOCALS;
7176
7177 ceph_assert(ps->is_primary());
7178 epoch_t old_start = ps->info.history.last_epoch_started;
7179 if (!ps->peer_info.count(infoevt.from) &&
7180 ps->get_osdmap()->has_been_up_since(infoevt.from.osd, infoevt.notify.epoch_sent)) {
7181 ps->update_history(infoevt.notify.info.history);
7182 }
7183 // if we got something new that lets the pg escape the down state
7184 if (ps->info.history.last_epoch_started > old_start) {
7185 psdout(10) << " last_epoch_started moved forward, re-enter getinfo" << dendl;
7186 ps->state_clear(PG_STATE_DOWN);
7187 ps->state_set(PG_STATE_PEERING);
7188 return transit< GetInfo >();
7189 }
7190
7191 return discard_event();
7192 }
7193
7194
7195 /*------Incomplete--------*/
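// Incomplete: we have infos, but no acting set can be chosen whose logs
// could be made contiguous, so the PG cannot go active. A lowered pool
// min_size or fresh replica info may unblock us (reactions below).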
7196 PeeringState::Incomplete::Incomplete(my_context ctx)
7197 : my_base(ctx),
7198 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering/Incomplete")
7199 {
7200 context< PeeringMachine >().log_enter(state_name);
7201 DECLARE_LOCALS;
7202
7203 ps->state_clear(PG_STATE_PEERING);
7204 ps->state_set(PG_STATE_INCOMPLETE);
7205
7206 PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
7207 ceph_assert(ps->blocked_by.empty());
7208 ps->blocked_by.insert(prior_set.down.begin(), prior_set.down.end());
7209 pl->publish_stats_to_osd();
7210 }
7211
7212 boost::statechart::result PeeringState::Incomplete::react(const AdvMap &advmap) {
7213 DECLARE_LOCALS;
7214 int64_t poolnum = ps->info.pgid.pool();
7215
7216 // Reset if min_size turned smaller than the previous value; the pg might now be able to go active
7217 if (!advmap.osdmap->have_pg_pool(poolnum) ||
7218 advmap.lastmap->get_pools().find(poolnum)->second.min_size >
7219 advmap.osdmap->get_pools().find(poolnum)->second.min_size) {
7220 post_event(advmap);
7221 return transit< Reset >();
7222 }
7223
7224 return forward_event();
7225 }
7226
7227 boost::statechart::result PeeringState::Incomplete::react(const MNotifyRec& notevt) {
7228 DECLARE_LOCALS;
7229 psdout(7) << "handle_pg_notify from osd." << notevt.from << dendl;
7230 if (ps->proc_replica_info(
7231 notevt.from, notevt.notify.info, notevt.notify.epoch_sent)) {
7232 // We got something new, try again!
7233 return transit< GetLog >();
7234 } else {
7235 return discard_event();
7236 }
7237 }
7238
7239 boost::statechart::result PeeringState::Incomplete::react(
7240 const QueryState& q)
7241 {
7242 q.f->open_object_section("state");
7243 q.f->dump_string("name", state_name);
7244 q.f->dump_stream("enter_time") << enter_time;
7245 q.f->dump_string("comment", "not enough complete instances of this PG");
7246 q.f->close_section();
7247 return forward_event();
7248 }
7249
7250 boost::statechart::result PeeringState::Incomplete::react(const QueryUnfound& q)
7251 {
7252 q.f->dump_string("state", "Incomplete");
7253 q.f->dump_bool("available_might_have_unfound", false);
7254 return discard_event();
7255 }
7256
7257 void PeeringState::Incomplete::exit()
7258 {
7259 context< PeeringMachine >().log_exit(state_name, enter_time);
7260
7261 DECLARE_LOCALS;
7262
7263 ps->state_clear(PG_STATE_INCOMPLETE);
7264 utime_t dur = ceph_clock_now() - enter_time;
7265 pl->get_peering_perf().tinc(rs_incomplete_latency, dur);
7266
7267 ps->blocked_by.clear();
7268 }
7269
7270 /*------GetMissing--------*/
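// GetMissing: with the authoritative log in place, request log+missing
// from every acting_recovery_backfill peer whose log may have diverged
// (since its last_epoch_started), so peer_missing is complete before we
// activate.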
7271 PeeringState::GetMissing::GetMissing(my_context ctx)
7272 : my_base(ctx),
7273 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering/GetMissing")
7274 {
7275 context< PeeringMachine >().log_enter(state_name);
7276
7277 DECLARE_LOCALS;
7278 ps->log_weirdness();
7279 ceph_assert(!ps->acting_recovery_backfill.empty());
7280 eversion_t since;
7281 for (auto i = ps->acting_recovery_backfill.begin();
7282 i != ps->acting_recovery_backfill.end();
7283 ++i) {
7284 if (*i == ps->get_primary()) continue;
7285 const pg_info_t& pi = ps->peer_info[*i];
7286 // reset this to make sure the pg_missing_t is initialized and
7287 // has the correct semantics even if we don't need to get a
7288 // missing set from a shard. This way later additions due to
7289 // lost+unfound delete work properly.
7290 ps->peer_missing[*i].may_include_deletes = !ps->perform_deletes_during_peering();
7291
7292 if (pi.is_empty())
7293 continue; // no pg data, nothing divergent
7294
7295 if (pi.last_update < ps->pg_log.get_tail()) {
7296 psdout(10) << " osd." << *i << " is not contiguous, will restart backfill" << dendl;
7297 ps->peer_missing[*i].clear();
7298 continue;
7299 }
7300 if (pi.last_backfill == hobject_t()) {
7301 psdout(10) << " osd." << *i << " will fully backfill; can infer empty missing set" << dendl;
7302 ps->peer_missing[*i].clear();
7303 continue;
7304 }
7305
7306 if (pi.last_update == pi.last_complete && // peer has no missing
7307 pi.last_update == ps->info.last_update) { // peer is up to date
7308 // replica has no missing and identical log as us. no need to
7309 // pull anything.
7310 // FIXME: we can do better here. if last_update==last_complete we
7311 // can infer the rest!
7312 psdout(10) << " osd." << *i << " has no missing, identical log" << dendl;
7313 ps->peer_missing[*i].clear();
7314 continue;
7315 }
7316
7317 // We pull the log from the peer's last_epoch_started to ensure we
7318 // get enough log to detect divergent updates.
7319 since.epoch = pi.last_epoch_started;
7320 ceph_assert(pi.last_update >= ps->info.log_tail); // or else choose_acting() did a bad thing
7321 if (pi.log_tail <= since) {
7322 psdout(10) << " requesting log+missing since " << since << " from osd." << *i << dendl;
7323 context< PeeringMachine >().send_query(
7324 i->osd,
7325 pg_query_t(
7326 pg_query_t::LOG,
7327 i->shard, ps->pg_whoami.shard,
7328 since, ps->info.history,
7329 ps->get_osdmap_epoch()));
7330 } else {
7331 psdout(10) << " requesting fulllog+missing from osd." << *i
7332 << " (want since " << since << " < log.tail "
7333 << pi.log_tail << ")" << dendl;
7334 context< PeeringMachine >().send_query(
7335 i->osd, pg_query_t(
7336 pg_query_t::FULLLOG,
7337 i->shard, ps->pg_whoami.shard,
7338 ps->info.history, ps->get_osdmap_epoch()));
7339 }
7340 peer_missing_requested.insert(*i);
7341 ps->blocked_by.insert(i->osd);
7342 }
7343
7344 if (peer_missing_requested.empty()) {
7345 if (ps->need_up_thru) {
7346 psdout(10) << " still need up_thru update before going active"
7347 << dendl;
7348 post_event(NeedUpThru());
7349 return;
7350 }
7351
7352 // all good!
7353 post_event(Activate(ps->get_osdmap_epoch()));
7354 } else {
7355 pl->publish_stats_to_osd();
7356 }
7357 }
7358
7359 boost::statechart::result PeeringState::GetMissing::react(const MLogRec& logevt)
7360 {
7361 DECLARE_LOCALS;
7362
7363 peer_missing_requested.erase(logevt.from);
7364 ps->proc_replica_log(logevt.msg->info,
7365 logevt.msg->log,
7366 std::move(logevt.msg->missing),
7367 logevt.from);
7368
7369 if (peer_missing_requested.empty()) {
7370 if (ps->need_up_thru) {
7371 psdout(10) << " still need up_thru update before going active"
7372 << dendl;
7373 post_event(NeedUpThru());
7374 } else {
7375 psdout(10) << "Got last missing, don't need more missing; "
7376 << "posting Activate" << dendl;
7377 post_event(Activate(ps->get_osdmap_epoch()));
7378 }
7379 }
7380 return discard_event();
7381 }
7382
7383 boost::statechart::result PeeringState::GetMissing::react(const QueryState& q)
7384 {
7385 DECLARE_LOCALS;
7386 q.f->open_object_section("state");
7387 q.f->dump_string("name", state_name);
7388 q.f->dump_stream("enter_time") << enter_time;
7389
7390 q.f->open_array_section("peer_missing_requested");
7391 for (auto p = peer_missing_requested.begin();
7392 p != peer_missing_requested.end();
7393 ++p) {
7394 q.f->open_object_section("osd");
7395 q.f->dump_stream("osd") << *p;
7396 if (ps->peer_missing.count(*p)) {
7397 q.f->open_object_section("got_missing");
7398 ps->peer_missing[*p].dump(q.f);
7399 q.f->close_section();
7400 }
7401 q.f->close_section();
7402 }
7403 q.f->close_section();
7404
7405 q.f->close_section();
7406 return forward_event();
7407 }
7408
7409 boost::statechart::result PeeringState::GetMissing::react(const QueryUnfound& q)
7410 {
7411 q.f->dump_string("state", "GetMissing");
7412 q.f->dump_bool("available_might_have_unfound", false);
7413 return discard_event();
7414 }
7415
7416 void PeeringState::GetMissing::exit()
7417 {
7418 context< PeeringMachine >().log_exit(state_name, enter_time);
7419
7420 DECLARE_LOCALS;
7421 utime_t dur = ceph_clock_now() - enter_time;
7422 pl->get_peering_perf().tinc(rs_getmissing_latency, dur);
7423 ps->blocked_by.clear();
7424 }
7425
7426 /*------WaitUpThru--------*/
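// WaitUpThru: we could activate, but first the monitors must publish an
// osdmap whose up_thru for this OSD covers the current interval;
// otherwise a later primary could conclude this interval never went
// active and ignore it. ActMap re-checks need_up_thru until then.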
7427 PeeringState::WaitUpThru::WaitUpThru(my_context ctx)
7428 : my_base(ctx),
7429 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering/WaitUpThru")
7430 {
7431 context< PeeringMachine >().log_enter(state_name);
7432 }
7433
7434 boost::statechart::result PeeringState::WaitUpThru::react(const ActMap& am)
7435 {
7436 DECLARE_LOCALS;
7437 if (!ps->need_up_thru) {
7438 post_event(Activate(ps->get_osdmap_epoch()));
7439 }
7440 return forward_event();
7441 }
7442
7443 boost::statechart::result PeeringState::WaitUpThru::react(const MLogRec& logevt)
7444 {
7445 DECLARE_LOCALS;
7446 psdout(10) << "Noting missing from osd." << logevt.from << dendl;
7447 ps->peer_missing[logevt.from].claim(std::move(logevt.msg->missing));
7448 ps->peer_info[logevt.from] = logevt.msg->info;
7449 return discard_event();
7450 }
7451
7452 boost::statechart::result PeeringState::WaitUpThru::react(const QueryState& q)
7453 {
7454 q.f->open_object_section("state");
7455 q.f->dump_string("name", state_name);
7456 q.f->dump_stream("enter_time") << enter_time;
7457 q.f->dump_string("comment", "waiting for osdmap to reflect a new up_thru for this osd");
7458 q.f->close_section();
7459 return forward_event();
7460 }
7461
7462 boost::statechart::result PeeringState::WaitUpThru::react(const QueryUnfound& q)
7463 {
7464 q.f->dump_string("state", "WaitUpThru");
7465 q.f->dump_bool("available_might_have_unfound", false);
7466 return discard_event();
7467 }
7468
7469 void PeeringState::WaitUpThru::exit()
7470 {
7471 context< PeeringMachine >().log_exit(state_name, enter_time);
7472 DECLARE_LOCALS;
7473 utime_t dur = ceph_clock_now() - enter_time;
7474 pl->get_peering_perf().tinc(rs_waitupthru_latency, dur);
7475 }
7476
7477 /*----PeeringState::PeeringMachine Methods-----*/
7478 #undef dout_prefix
7479 #define dout_prefix dpp->gen_prefix(*_dout)
7480
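// log_enter/log_exit bracket every state transition, recording per-state
// event counts and dwell time via the PeeringListener's
// log_state_enter/log_state_exit hooks.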
7481 void PeeringState::PeeringMachine::log_enter(const char *state_name)
7482 {
7483 DECLARE_LOCALS;
7484 psdout(5) << "enter " << state_name << dendl;
7485 pl->log_state_enter(state_name);
7486 }
7487
7488 void PeeringState::PeeringMachine::log_exit(const char *state_name, utime_t enter_time)
7489 {
7490 DECLARE_LOCALS;
7491 utime_t dur = ceph_clock_now() - enter_time;
7492 psdout(5) << "exit " << state_name << " " << dur << " " << event_count << " " << event_time << dendl;
7493 pl->log_state_exit(state_name, enter_time, event_count, event_time);
7494 event_count = 0;
7495 event_time = utime_t();
7496 }
7497
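// operator<<: render the compact "pg[...]" summary that prefixes PG
// debug-log lines: up/acting sets, role, last peering reset, past
// intervals, log bounds, and flags such as DELETING and NOTIFY.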
7498 ostream &operator<<(ostream &out, const PeeringState &ps) {
7499 out << "pg[" << ps.info
7500 << " " << pg_vector_string(ps.up);
7501 if (ps.acting != ps.up)
7502 out << "/" << pg_vector_string(ps.acting);
7503 if (ps.is_ec_pg())
7504 out << "p" << ps.get_primary();
7505 if (!ps.async_recovery_targets.empty())
7506 out << " async=[" << ps.async_recovery_targets << "]";
7507 if (!ps.backfill_targets.empty())
7508 out << " backfill=[" << ps.backfill_targets << "]";
7509 out << " r=" << ps.get_role();
7510 out << " lpr=" << ps.get_last_peering_reset();
7511
7512 if (ps.deleting)
7513 out << " DELETING";
7514
7515 if (!ps.past_intervals.empty()) {
7516 out << " pi=[" << ps.past_intervals.get_bounds()
7517 << ")/" << ps.past_intervals.size();
7518 }
7519
7520 if (ps.is_peered()) {
7521 if (ps.last_update_ondisk != ps.info.last_update)
7522 out << " luod=" << ps.last_update_ondisk;
7523 if (ps.last_update_applied != ps.info.last_update)
7524 out << " lua=" << ps.last_update_applied;
7525 }
7526
7527 if (ps.pg_log.get_tail() != ps.info.log_tail ||
7528 ps.pg_log.get_head() != ps.info.last_update)
7529 out << " (info mismatch, " << ps.pg_log.get_log() << ")";
7530
7531 if (!ps.pg_log.get_log().empty()) {
7532 if ((ps.pg_log.get_log().log.begin()->version <= ps.pg_log.get_tail())) {
7533 out << " (log bound mismatch, actual=["
7534 << ps.pg_log.get_log().log.begin()->version << ","
7535 << ps.pg_log.get_log().log.rbegin()->version << "]";
7536 out << ")";
7537 }
7538 }
7539
7540 out << " crt=" << ps.pg_log.get_can_rollback_to();
7541
7542 if (ps.last_complete_ondisk != ps.info.last_complete)
7543 out << " lcod " << ps.last_complete_ondisk;
7544
7545 out << " mlcod " << ps.min_last_complete_ondisk;
7546
7547 out << " " << pg_state_string(ps.get_state());
7548 if (ps.should_send_notify())
7549 out << " NOTIFY";
7550
7551 if (ps.prior_readable_until_ub != ceph::signedspan::zero()) {
7552 out << " pruub " << ps.prior_readable_until_ub
7553 << "@" << ps.get_prior_readable_down_osds();
7554 }
7555 return out;
7556 }
7557
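// get_replica_recovery_order: replicas that still have missing objects
// are recovered in ascending order of missing-object count, with the
// regular acting replicas always ahead of async recovery targets. A
// sketch with hypothetical counts:
//
//   peer_missing: osd.1 -> 5, osd.2 -> 9, osd.3 (async) -> 2
//   returned order: [osd.1, osd.2, osd.3]
//
// i.e. an async target sorts last even when it is missing the fewest
// objects.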
7558 std::vector<pg_shard_t> PeeringState::get_replica_recovery_order() const
7559 {
7560 std::vector<std::pair<unsigned int, pg_shard_t>> replicas_by_num_missing,
7561 async_by_num_missing;
7562 replicas_by_num_missing.reserve(get_acting_recovery_backfill().size() - 1);
7563 for (auto &p : get_acting_recovery_backfill()) {
7564 if (p == get_primary()) {
7565 continue;
7566 }
7567 auto pm = get_peer_missing().find(p);
7568 ceph_assert(pm != get_peer_missing().end());
7569 auto nm = pm->second.num_missing();
7570 if (nm != 0) {
7571 if (is_async_recovery_target(p)) {
7572 async_by_num_missing.push_back(make_pair(nm, p));
7573 } else {
7574 replicas_by_num_missing.push_back(make_pair(nm, p));
7575 }
7576 }
7577 }
7578 // sort by number of missing objects, in ascending order.
7579 auto func = [](const std::pair<unsigned int, pg_shard_t> &lhs,
7580 const std::pair<unsigned int, pg_shard_t> &rhs) {
7581 return lhs.first < rhs.first;
7582 };
7583 // acting goes first
7584 std::sort(replicas_by_num_missing.begin(), replicas_by_num_missing.end(), func);
7585 // then async_recovery_targets
7586 std::sort(async_by_num_missing.begin(), async_by_num_missing.end(), func);
7587 replicas_by_num_missing.insert(replicas_by_num_missing.end(),
7588 async_by_num_missing.begin(), async_by_num_missing.end());
7589
7590 std::vector<pg_shard_t> ret;
7591 ret.reserve(replicas_by_num_missing.size());
7592 for (auto p : replicas_by_num_missing) {
7593 ret.push_back(p.second);
7594 }
7595 return ret;
7596 }
7597
7598