// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab

#include "PGPeeringEvent.h"
#include "common/ceph_releases.h"
#include "common/dout.h"
#include "PeeringState.h"

#include "messages/MOSDPGRemove.h"
#include "messages/MBackfillReserve.h"
#include "messages/MRecoveryReserve.h"
#include "messages/MOSDScrubReserve.h"
#include "messages/MOSDPGInfo2.h"
#include "messages/MOSDPGTrim.h"
#include "messages/MOSDPGLog.h"
#include "messages/MOSDPGNotify2.h"
#include "messages/MOSDPGQuery2.h"
#include "messages/MOSDPGLease.h"
#include "messages/MOSDPGLeaseAck.h"

#define dout_context cct
#define dout_subsys ceph_subsys_osd

using std::dec;
using std::hex;
using std::make_pair;
using std::map;
using std::ostream;
using std::pair;
using std::set;
using std::string;
using std::stringstream;
using std::vector;

using ceph::Formatter;
using ceph::make_message;

BufferedRecoveryMessages::BufferedRecoveryMessages(PeeringCtx &ctx)
  // steal messages from ctx
  : message_map{std::move(ctx.message_map)}
{}

void BufferedRecoveryMessages::send_notify(int to, const pg_notify_t &n)
{
  spg_t pgid(n.info.pgid.pgid, n.to);
  send_osd_message(to, TOPNSPC::make_message<MOSDPGNotify2>(pgid, n));
}

void BufferedRecoveryMessages::send_query(
  int to,
  spg_t to_spgid,
  const pg_query_t &q)
{
  send_osd_message(to, TOPNSPC::make_message<MOSDPGQuery2>(to_spgid, q));
}

void BufferedRecoveryMessages::send_info(
  int to,
  spg_t to_spgid,
  epoch_t min_epoch,
  epoch_t cur_epoch,
  const pg_info_t &info,
  std::optional<pg_lease_t> lease,
  std::optional<pg_lease_ack_t> lease_ack)
{
  send_osd_message(
    to,
    TOPNSPC::make_message<MOSDPGInfo2>(
      to_spgid,
      info,
      cur_epoch,
      min_epoch,
      lease,
      lease_ack)
    );
}

void PGPool::update(OSDMapRef map)
{
  const pg_pool_t *pi = map->get_pg_pool(id);
  if (!pi) {
    return; // pool has been deleted
  }
  info = *pi;
  name = map->get_pool_name(id);

  bool updated = false;
  if ((map->get_epoch() != cached_epoch + 1) ||
      (pi->get_snap_epoch() == map->get_epoch())) {
    updated = true;
  }

  if (info.is_pool_snaps_mode() && updated) {
    snapc = pi->get_snap_context();
  }
  cached_epoch = map->get_epoch();
}
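
// Illustrative reading of the condition above (hypothetical epochs): if
// cached_epoch == 100 and the new map is epoch 102, we skipped an epoch and
// may have missed a snap change, so the SnapContext is refreshed; likewise if
// the pool's snap_epoch equals the new map epoch, snaps changed in this very
// epoch. Only a consecutive map with no snap change leaves snapc untouched.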

/*-------------Peering State Helpers----------------*/
#undef dout_prefix
#define dout_prefix (dpp->gen_prefix(*_dout))
#undef psdout
#define psdout(x) ldout(cct, x)

PeeringState::PeeringState(
  CephContext *cct,
  pg_shard_t pg_whoami,
  spg_t spgid,
  const PGPool &_pool,
  OSDMapRef curmap,
  DoutPrefixProvider *dpp,
  PeeringListener *pl)
  : state_history(*pl),
    cct(cct),
    spgid(spgid),
    dpp(dpp),
    pl(pl),
    orig_ctx(0),
    osdmap_ref(curmap),
    pool(_pool),
    pg_whoami(pg_whoami),
    info(spgid),
    pg_log(cct),
    last_require_osd_release(curmap->require_osd_release),
    missing_loc(spgid, this, dpp, cct),
    machine(this, cct, spgid, dpp, pl, &state_history)
{
  machine.initiate();
}

void PeeringState::start_handle(PeeringCtx *new_ctx) {
  ceph_assert(!rctx);
  ceph_assert(!orig_ctx);
  orig_ctx = new_ctx;
  if (new_ctx) {
    if (messages_pending_flush) {
      rctx.emplace(*messages_pending_flush, *new_ctx);
    } else {
      rctx.emplace(*new_ctx);
    }
    rctx->start_time = ceph_clock_now();
  }
}

void PeeringState::begin_block_outgoing() {
  ceph_assert(!messages_pending_flush);
  ceph_assert(orig_ctx);
  ceph_assert(rctx);
  messages_pending_flush.emplace();
  rctx.emplace(*messages_pending_flush, *orig_ctx);
}

void PeeringState::clear_blocked_outgoing() {
  ceph_assert(orig_ctx);
  ceph_assert(rctx);
  messages_pending_flush = std::optional<BufferedRecoveryMessages>();
}

void PeeringState::end_block_outgoing() {
  ceph_assert(messages_pending_flush);
  ceph_assert(orig_ctx);
  ceph_assert(rctx);

  orig_ctx->accept_buffered_messages(*messages_pending_flush);
  rctx.emplace(*orig_ctx);
  messages_pending_flush = std::optional<BufferedRecoveryMessages>();
}

void PeeringState::end_handle() {
  if (rctx) {
    utime_t dur = ceph_clock_now() - rctx->start_time;
    machine.event_time += dur;
  }

  machine.event_count++;
  rctx = std::nullopt;
  orig_ctx = NULL;
}
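
// A sketch of the intended lifecycle of the handlers above (the actual call
// sites live in the event-handling wrappers): start_handle() wires rctx to the
// caller's PeeringCtx; begin_block_outgoing() redirects outgoing messages into
// messages_pending_flush until end_block_outgoing() flushes them back into the
// original ctx, or clear_blocked_outgoing() drops them (e.g. on a peering
// reset); end_handle() detaches rctx and accounts the event, roughly:
//
//   start_handle(&ctx);
//   machine.process_event(evt);  // may block/unblock outgoing messages
//   end_handle();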

void PeeringState::check_recovery_sources(const OSDMapRef& osdmap)
{
  /*
   * check that any peers we are planning to (or currently) pull
   * objects from have been dealt with.
   */
  missing_loc.check_recovery_sources(osdmap);
  pl->check_recovery_sources(osdmap);

  for (auto i = peer_log_requested.begin(); i != peer_log_requested.end();) {
    if (!osdmap->is_up(i->osd)) {
      psdout(10) << "peer_log_requested removing " << *i << dendl;
      peer_log_requested.erase(i++);
    } else {
      ++i;
    }
  }

  for (auto i = peer_missing_requested.begin();
       i != peer_missing_requested.end();) {
    if (!osdmap->is_up(i->osd)) {
      psdout(10) << "peer_missing_requested removing " << *i << dendl;
      peer_missing_requested.erase(i++);
    } else {
      ++i;
    }
  }
}

void PeeringState::update_history(const pg_history_t& new_history)
{
  auto mnow = pl->get_mnow();
  info.history.refresh_prior_readable_until_ub(mnow, prior_readable_until_ub);
  if (info.history.merge(new_history)) {
    psdout(20) << __func__ << " advanced history from " << new_history << dendl;
    dirty_info = true;
    if (info.history.last_epoch_clean >= info.history.same_interval_since) {
      psdout(20) << __func__ << " clearing past_intervals" << dendl;
      past_intervals.clear();
      dirty_big_info = true;
    }
    prior_readable_until_ub = info.history.get_prior_readable_until_ub(mnow);
    if (prior_readable_until_ub != ceph::signedspan::zero()) {
      dout(20) << __func__
               << " prior_readable_until_ub " << prior_readable_until_ub
               << " (mnow " << mnow << " + "
               << info.history.prior_readable_until_ub << ")" << dendl;
    }
  }
  pl->on_info_history_change();
}

hobject_t PeeringState::earliest_backfill() const
{
  hobject_t e = hobject_t::get_max();
  for (const pg_shard_t& bt : get_backfill_targets()) {
    const pg_info_t &pi = get_peer_info(bt);
    e = std::min(pi.last_backfill, e);
  }
  return e;
}

void PeeringState::purge_strays()
{
  if (is_premerge()) {
    psdout(10) << "purge_strays " << stray_set << " but premerge, doing nothing"
               << dendl;
    return;
  }
  if (cct->_conf.get_val<bool>("osd_debug_no_purge_strays")) {
    return;
  }
  psdout(10) << "purge_strays " << stray_set << dendl;

  bool removed = false;
  for (auto p = stray_set.begin(); p != stray_set.end(); ++p) {
    ceph_assert(!is_acting_recovery_backfill(*p));
    if (get_osdmap()->is_up(p->osd)) {
      psdout(10) << "sending PGRemove to osd." << *p << dendl;
      vector<spg_t> to_remove;
      to_remove.push_back(spg_t(info.pgid.pgid, p->shard));
      auto m = TOPNSPC::make_message<MOSDPGRemove>(
        get_osdmap_epoch(),
        to_remove);
      pl->send_cluster_message(p->osd, std::move(m), get_osdmap_epoch());
    } else {
      psdout(10) << "not sending PGRemove to down osd." << *p << dendl;
    }
    peer_missing.erase(*p);
    peer_info.erase(*p);
    missing_loc.remove_stray_recovery_sources(*p);
    peer_purged.insert(*p);
    removed = true;
  }

  // if we removed anyone, update peers (which include peer_info)
  if (removed)
    update_heartbeat_peers();

  stray_set.clear();

  // clear _requested maps; we may have to peer() again if we discover
  // (more) stray content
  peer_log_requested.clear();
  peer_missing_requested.clear();
}

void PeeringState::query_unfound(Formatter *f, string state)
{
  psdout(20) << "Enter PeeringState common QueryUnfound" << dendl;
  {
    f->dump_string("state", state);
    f->dump_bool("available_might_have_unfound", true);
    f->open_array_section("might_have_unfound");
    for (auto p = might_have_unfound.begin();
         p != might_have_unfound.end();
         ++p) {
      if (peer_missing.count(*p)) {
        ; // Ignore already probed OSDs
      } else {
        f->open_object_section("osd");
        f->dump_stream("osd") << *p;
        if (peer_missing_requested.count(*p)) {
          f->dump_string("status", "querying");
        } else if (!get_osdmap()->is_up(p->osd)) {
          f->dump_string("status", "osd is down");
        } else {
          f->dump_string("status", "not queried");
        }
        f->close_section();
      }
    }
    f->close_section();
  }
  psdout(20) << "Exit PeeringState common QueryUnfound" << dendl;
  return;
}

bool PeeringState::proc_replica_info(
  pg_shard_t from, const pg_info_t &oinfo, epoch_t send_epoch)
{
  auto p = peer_info.find(from);
  if (p != peer_info.end() && p->second.last_update == oinfo.last_update) {
    psdout(10) << " got dup osd." << from << " info "
               << oinfo << ", identical to ours" << dendl;
    return false;
  }

  if (!get_osdmap()->has_been_up_since(from.osd, send_epoch)) {
    psdout(10) << " got info " << oinfo << " from down osd." << from
               << " discarding" << dendl;
    return false;
  }

  psdout(10) << " got osd." << from << " " << oinfo << dendl;
  ceph_assert(is_primary());
  peer_info[from] = oinfo;
  might_have_unfound.insert(from);

  update_history(oinfo.history);

  // stray?
  if (!is_up(from) && !is_acting(from)) {
    psdout(10) << " osd." << from << " has stray content: " << oinfo << dendl;
    stray_set.insert(from);
    if (is_clean()) {
      purge_strays();
    }
  }

  // was this a new info? if so, update peers!
  if (p == peer_info.end())
    update_heartbeat_peers();

  return true;
}


void PeeringState::remove_down_peer_info(const OSDMapRef &osdmap)
{
  // Remove any downed osds from peer_info
  bool removed = false;
  auto p = peer_info.begin();
  while (p != peer_info.end()) {
    if (!osdmap->is_up(p->first.osd)) {
      psdout(10) << " dropping down osd." << p->first << " info " << p->second << dendl;
      peer_missing.erase(p->first);
      peer_log_requested.erase(p->first);
      peer_missing_requested.erase(p->first);
      peer_info.erase(p++);
      removed = true;
    } else
      ++p;
  }

  // Remove any downed osds from peer_purged so we can re-purge if necessary
  auto it = peer_purged.begin();
  while (it != peer_purged.end()) {
    if (!osdmap->is_up(it->osd)) {
      psdout(10) << " dropping down osd." << *it << " from peer_purged" << dendl;
      peer_purged.erase(it++);
    } else {
      ++it;
    }
  }

  // if we removed anyone, update peers (which include peer_info)
  if (removed)
    update_heartbeat_peers();

  check_recovery_sources(osdmap);
}

void PeeringState::update_heartbeat_peers()
{
  if (!is_primary())
    return;

  set<int> new_peers;
  for (unsigned i=0; i<acting.size(); i++) {
    if (acting[i] != CRUSH_ITEM_NONE)
      new_peers.insert(acting[i]);
  }
  for (unsigned i=0; i<up.size(); i++) {
    if (up[i] != CRUSH_ITEM_NONE)
      new_peers.insert(up[i]);
  }
  for (auto p = peer_info.begin(); p != peer_info.end(); ++p) {
    new_peers.insert(p->first.osd);
  }
  pl->update_heartbeat_peers(std::move(new_peers));
}

void PeeringState::write_if_dirty(ObjectStore::Transaction& t)
{
  pl->prepare_write(
    info,
    last_written_info,
    past_intervals,
    pg_log,
    dirty_info,
    dirty_big_info,
    last_persisted_osdmap < get_osdmap_epoch(),
    t);
  if (dirty_info || dirty_big_info) {
    last_persisted_osdmap = get_osdmap_epoch();
    last_written_info = info;
    dirty_info = false;
    dirty_big_info = false;
  }
}

void PeeringState::advance_map(
  OSDMapRef osdmap, OSDMapRef lastmap,
  vector<int>& newup, int up_primary,
  vector<int>& newacting, int acting_primary,
  PeeringCtx &rctx)
{
  ceph_assert(lastmap == osdmap_ref);
  psdout(10) << "handle_advance_map "
             << newup << "/" << newacting
             << " -- " << up_primary << "/" << acting_primary
             << dendl;

  update_osdmap_ref(osdmap);
  pool.update(osdmap);

  AdvMap evt(
    osdmap, lastmap, newup, up_primary,
    newacting, acting_primary);
  handle_event(evt, &rctx);
  if (pool.info.last_change == osdmap_ref->get_epoch()) {
    pl->on_pool_change();
  }
  readable_interval = pool.get_readable_interval(cct->_conf);
  last_require_osd_release = osdmap->require_osd_release;
}

void PeeringState::activate_map(PeeringCtx &rctx)
{
  psdout(10) << __func__ << dendl;
  ActMap evt;
  handle_event(evt, &rctx);
  if (osdmap_ref->get_epoch() - last_persisted_osdmap >
      cct->_conf->osd_pg_epoch_persisted_max_stale) {
    psdout(20) << __func__ << ": Dirtying info: last_persisted is "
               << last_persisted_osdmap
               << " while current is " << osdmap_ref->get_epoch() << dendl;
    dirty_info = true;
  } else {
    psdout(20) << __func__ << ": Not dirtying info: last_persisted is "
               << last_persisted_osdmap
               << " while current is " << osdmap_ref->get_epoch() << dendl;
  }
  write_if_dirty(rctx.transaction);

  if (get_osdmap()->check_new_blocklist_entries()) {
    pl->check_blocklisted_watchers();
  }
}

void PeeringState::set_last_peering_reset()
{
  psdout(20) << "set_last_peering_reset " << get_osdmap_epoch() << dendl;
  if (last_peering_reset != get_osdmap_epoch()) {
    last_peering_reset = get_osdmap_epoch();
    psdout(10) << "Clearing blocked outgoing recovery messages" << dendl;
    clear_blocked_outgoing();
    if (!pl->try_flush_or_schedule_async()) {
      psdout(10) << "Beginning to block outgoing recovery messages" << dendl;
      begin_block_outgoing();
    } else {
      psdout(10) << "Not blocking outgoing recovery messages" << dendl;
    }
  }
}

void PeeringState::complete_flush()
{
  flushes_in_progress--;
  if (flushes_in_progress == 0) {
    pl->on_flushed();
  }
}

void PeeringState::check_full_transition(OSDMapRef lastmap, OSDMapRef osdmap)
{
  const pg_pool_t *pi = osdmap->get_pg_pool(info.pgid.pool());
  if (!pi) {
    return; // pool deleted
  }
  bool changed = false;
  if (pi->has_flag(pg_pool_t::FLAG_FULL)) {
    const pg_pool_t *opi = lastmap->get_pg_pool(info.pgid.pool());
    if (!opi || !opi->has_flag(pg_pool_t::FLAG_FULL)) {
      psdout(10) << " pool was marked full in " << osdmap->get_epoch() << dendl;
      changed = true;
    }
  }
  if (changed) {
    info.history.last_epoch_marked_full = osdmap->get_epoch();
    dirty_info = true;
  }
}

bool PeeringState::should_restart_peering(
  int newupprimary,
  int newactingprimary,
  const vector<int>& newup,
  const vector<int>& newacting,
  OSDMapRef lastmap,
  OSDMapRef osdmap)
{
  if (PastIntervals::is_new_interval(
        primary.osd,
        newactingprimary,
        acting,
        newacting,
        up_primary.osd,
        newupprimary,
        up,
        newup,
        osdmap.get(),
        lastmap.get(),
        info.pgid.pgid)) {
    psdout(20) << "new interval newup " << newup
               << " newacting " << newacting << dendl;
    return true;
  }
  if (!lastmap->is_up(pg_whoami.osd) && osdmap->is_up(pg_whoami.osd)) {
    psdout(10) << __func__ << " osd transitioned from down -> up"
               << dendl;
    return true;
  }
  return false;
}
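
// Illustrative cases (not exhaustive; the full rules live in
// PastIntervals::is_new_interval): up changing from [1,2,3] to [1,2,4], a
// changed acting primary, or a pg_num change that splits this PG all start a
// new interval and restart peering, while a map advance that leaves up,
// acting, and both primaries unchanged does not.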

/* Called before initializing peering during advance_map */
void PeeringState::start_peering_interval(
  const OSDMapRef lastmap,
  const vector<int>& newup, int new_up_primary,
  const vector<int>& newacting, int new_acting_primary,
  ObjectStore::Transaction &t)
{
  const OSDMapRef osdmap = get_osdmap();

  set_last_peering_reset();

  vector<int> oldacting, oldup;
  int oldrole = get_role();

  if (is_primary()) {
    pl->clear_ready_to_merge();
  }

  pg_shard_t old_acting_primary = get_primary();
  pg_shard_t old_up_primary = up_primary;
  bool was_old_primary = is_primary();
  bool was_old_nonprimary = is_nonprimary();

  acting.swap(oldacting);
  up.swap(oldup);
  init_primary_up_acting(
    newup,
    newacting,
    new_up_primary,
    new_acting_primary);

  if (info.stats.up != up ||
      info.stats.acting != acting ||
      info.stats.up_primary != new_up_primary ||
      info.stats.acting_primary != new_acting_primary) {
    info.stats.up = up;
    info.stats.up_primary = new_up_primary;
    info.stats.acting = acting;
    info.stats.acting_primary = new_acting_primary;
    info.stats.mapping_epoch = osdmap->get_epoch();
  }

  pl->clear_publish_stats();

  // This will now be remapped during a backfill in cases
  // where it would not have been before.
  if (up != acting)
    state_set(PG_STATE_REMAPPED);
  else
    state_clear(PG_STATE_REMAPPED);

  int role = osdmap->calc_pg_role(pg_whoami, acting);
  set_role(role);

  // did acting, up, primary|acker change?
  if (!lastmap) {
    psdout(10) << " no lastmap" << dendl;
    dirty_info = true;
    dirty_big_info = true;
    info.history.same_interval_since = osdmap->get_epoch();
  } else {
    std::stringstream debug;
    ceph_assert(info.history.same_interval_since != 0);
    bool new_interval = PastIntervals::check_new_interval(
      old_acting_primary.osd,
      new_acting_primary,
      oldacting, newacting,
      old_up_primary.osd,
      new_up_primary,
      oldup, newup,
      info.history.same_interval_since,
      info.history.last_epoch_clean,
      osdmap.get(),
      lastmap.get(),
      info.pgid.pgid,
      missing_loc.get_recoverable_predicate(),
      &past_intervals,
      &debug);
    psdout(10) << __func__ << ": check_new_interval output: "
               << debug.str() << dendl;
    if (new_interval) {
      if (osdmap->get_epoch() == pl->oldest_stored_osdmap() &&
          info.history.last_epoch_clean < osdmap->get_epoch()) {
        psdout(10) << " map gap, clearing past_intervals and faking" << dendl;
        // our information is incomplete and useless; someone else was clean
        // after everything we know if osdmaps were trimmed.
        past_intervals.clear();
      } else {
        psdout(10) << " noting past " << past_intervals << dendl;
      }
      dirty_info = true;
      dirty_big_info = true;
      info.history.same_interval_since = osdmap->get_epoch();
      if (osdmap->have_pg_pool(info.pgid.pgid.pool()) &&
          info.pgid.pgid.is_split(lastmap->get_pg_num(info.pgid.pgid.pool()),
                                  osdmap->get_pg_num(info.pgid.pgid.pool()),
                                  nullptr)) {
        info.history.last_epoch_split = osdmap->get_epoch();
      }
    }
  }

  if (old_up_primary != up_primary ||
      oldup != up) {
    info.history.same_up_since = osdmap->get_epoch();
  }
  // this comparison includes primary rank via pg_shard_t
  if (old_acting_primary != get_primary()) {
    info.history.same_primary_since = osdmap->get_epoch();
  }

  on_new_interval();
  pl->on_info_history_change();

  psdout(1) << __func__ << " up " << oldup << " -> " << up
            << ", acting " << oldacting << " -> " << acting
            << ", acting_primary " << old_acting_primary << " -> "
            << new_acting_primary
            << ", up_primary " << old_up_primary << " -> " << new_up_primary
            << ", role " << oldrole << " -> " << role
            << ", features acting " << acting_features
            << " upacting " << upacting_features
            << dendl;

  // deactivate.
  state_clear(PG_STATE_ACTIVE);
  state_clear(PG_STATE_PEERED);
  state_clear(PG_STATE_PREMERGE);
  state_clear(PG_STATE_DOWN);
  state_clear(PG_STATE_RECOVERY_WAIT);
  state_clear(PG_STATE_RECOVERY_TOOFULL);
  state_clear(PG_STATE_RECOVERING);

  peer_purged.clear();
  acting_recovery_backfill.clear();

  // reset primary/replica state?
  if (was_old_primary || is_primary()) {
    pl->clear_want_pg_temp();
  } else if (was_old_nonprimary || is_nonprimary()) {
    pl->clear_want_pg_temp();
  }
  clear_primary_state();

  pl->on_change(t);

  ceph_assert(!deleting);

  // should we tell the primary we are here?
  send_notify = !is_primary();

  if (role != oldrole ||
      was_old_primary != is_primary()) {
    // did primary change?
    if (was_old_primary != is_primary()) {
      state_clear(PG_STATE_CLEAN);
      // queue/dequeue the scrubber
      pl->on_primary_status_change(was_old_primary, is_primary());
    }

    pl->on_role_change();
  } else {
    // no role change.
    // did primary change?
    if (get_primary() != old_acting_primary) {
      psdout(10) << oldacting << " -> " << acting
                 << ", acting primary "
                 << old_acting_primary << " -> " << get_primary()
                 << dendl;
    } else {
      // primary is the same.
      if (is_primary()) {
        // i am (still) primary. but my replica set changed.
        state_clear(PG_STATE_CLEAN);

        psdout(10) << oldacting << " -> " << acting
                   << ", replicas changed" << dendl;
      }
    }
  }

  if (is_primary() && was_old_primary) {
    pl->reschedule_scrub();
  }

  if (acting.empty() && !up.empty() && up_primary == pg_whoami) {
    psdout(10) << " acting empty, but i am up[0], clearing pg_temp" << dendl;
    pl->queue_want_pg_temp(acting);
  }
}

void PeeringState::on_new_interval()
{
  dout(20) << __func__ << dendl;
  const OSDMapRef osdmap = get_osdmap();

  // initialize features
  acting_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
  upacting_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
  for (auto p = acting.begin(); p != acting.end(); ++p) {
    if (*p == CRUSH_ITEM_NONE)
      continue;
    uint64_t f = osdmap->get_xinfo(*p).features;
    acting_features &= f;
    upacting_features &= f;
  }
  for (auto p = up.begin(); p != up.end(); ++p) {
    if (*p == CRUSH_ITEM_NONE)
      continue;
    upacting_features &= osdmap->get_xinfo(*p).features;
  }
  psdout(20) << __func__ << " upacting_features 0x" << std::hex
             << upacting_features << std::dec
             << " from " << acting << "+" << up << dendl;

  psdout(20) << __func__ << " checking missing set deletes flag. missing = "
             << get_pg_log().get_missing() << dendl;

  if (!pg_log.get_missing().may_include_deletes &&
      !perform_deletes_during_peering()) {
    pl->rebuild_missing_set_with_deletes(pg_log);
  }
  ceph_assert(
    pg_log.get_missing().may_include_deletes ==
    !perform_deletes_during_peering());

  init_hb_stamps();

  // update lease bounds for a new interval
  auto mnow = pl->get_mnow();
  prior_readable_until_ub = std::max(prior_readable_until_ub,
                                     readable_until_ub);
  prior_readable_until_ub = info.history.refresh_prior_readable_until_ub(
    mnow, prior_readable_until_ub);
  psdout(10) << __func__ << " prior_readable_until_ub "
             << prior_readable_until_ub << " (mnow " << mnow << " + "
             << info.history.prior_readable_until_ub << ")" << dendl;
  prior_readable_down_osds.clear(); // we populate this when we build the priorset

  readable_until =
    readable_until_ub =
    readable_until_ub_sent =
    readable_until_ub_from_primary = ceph::signedspan::zero();

  acting_readable_until_ub.clear();
  if (is_primary()) {
    acting_readable_until_ub.resize(acting.size(), ceph::signedspan::zero());
  }

  pl->on_new_interval();
}

void PeeringState::init_primary_up_acting(
  const vector<int> &newup,
  const vector<int> &newacting,
  int new_up_primary,
  int new_acting_primary)
{
  actingset.clear();
  acting = newacting;
  for (uint8_t i = 0; i < acting.size(); ++i) {
    if (acting[i] != CRUSH_ITEM_NONE)
      actingset.insert(
        pg_shard_t(
          acting[i],
          pool.info.is_erasure() ? shard_id_t(i) : shard_id_t::NO_SHARD));
  }
  upset.clear();
  up = newup;
  for (uint8_t i = 0; i < up.size(); ++i) {
    if (up[i] != CRUSH_ITEM_NONE)
      upset.insert(
        pg_shard_t(
          up[i],
          pool.info.is_erasure() ? shard_id_t(i) : shard_id_t::NO_SHARD));
  }
  if (!pool.info.is_erasure()) {
    // replicated
    up_primary = pg_shard_t(new_up_primary, shard_id_t::NO_SHARD);
    primary = pg_shard_t(new_acting_primary, shard_id_t::NO_SHARD);
  } else {
    // erasure
    up_primary = pg_shard_t();
    primary = pg_shard_t();
    for (uint8_t i = 0; i < up.size(); ++i) {
      if (up[i] == new_up_primary) {
        up_primary = pg_shard_t(up[i], shard_id_t(i));
        break;
      }
    }
    for (uint8_t i = 0; i < acting.size(); ++i) {
      if (acting[i] == new_acting_primary) {
        primary = pg_shard_t(acting[i], shard_id_t(i));
        break;
      }
    }
    ceph_assert(up_primary.osd == new_up_primary);
    ceph_assert(primary.osd == new_acting_primary);
  }
}
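
// Worked example (hypothetical values): for an erasure-coded PG with
// up = [3, 1, 4] and new_up_primary = 1, the loop above matches at index 1
// and yields up_primary = pg_shard_t(1, shard_id_t(1)), i.e. osd.1 serving
// shard 1; for a replicated PG the shard is always shard_id_t::NO_SHARD.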

void PeeringState::init_hb_stamps()
{
  if (is_primary()) {
    // we care about all other osds in the acting set
    hb_stamps.resize(acting.size() - 1);
    unsigned i = 0;
    for (auto p : acting) {
      if (p == CRUSH_ITEM_NONE || p == get_primary().osd) {
        continue;
      }
      hb_stamps[i++] = pl->get_hb_stamps(p);
    }
    hb_stamps.resize(i);
  } else if (is_nonprimary()) {
    // we care about just the primary
    hb_stamps.resize(1);
    hb_stamps[0] = pl->get_hb_stamps(get_primary().osd);
  } else {
    hb_stamps.clear();
  }
  dout(10) << __func__ << " now " << hb_stamps << dendl;
}


void PeeringState::clear_recovery_state()
{
  async_recovery_targets.clear();
  backfill_targets.clear();
}

void PeeringState::clear_primary_state()
{
  psdout(10) << "clear_primary_state" << dendl;

  // clear peering state
  stray_set.clear();
  peer_log_requested.clear();
  peer_missing_requested.clear();
  peer_info.clear();
  peer_bytes.clear();
  peer_missing.clear();
  peer_last_complete_ondisk.clear();
  peer_activated.clear();
  min_last_complete_ondisk = eversion_t();
  pg_trim_to = eversion_t();
  might_have_unfound.clear();
  need_up_thru = false;
  missing_loc.clear();
  pg_log.reset_recovery_pointers();

  clear_recovery_state();

  last_update_ondisk = eversion_t();
  missing_loc.clear();
  pl->clear_primary_state();
}

/// return [start,end) bounds for required past_intervals
static pair<epoch_t, epoch_t> get_required_past_interval_bounds(
  const pg_info_t &info,
  epoch_t oldest_map) {
  epoch_t start = std::max(
    info.history.last_epoch_clean ? info.history.last_epoch_clean :
    info.history.epoch_pool_created,
    oldest_map);
  epoch_t end = std::max(
    info.history.same_interval_since,
    info.history.epoch_pool_created);
  return make_pair(start, end);
}
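
// Worked example (hypothetical epochs): with last_epoch_clean = 520,
// oldest_map = 500, and same_interval_since = 530, the required bounds are
// [max(520, 500), 530) = [520, 530); if the OSD had already trimmed maps up
// to epoch 525, the start would instead be clamped up to 525.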


void PeeringState::check_past_interval_bounds() const
{
  auto oldest_epoch = pl->oldest_stored_osdmap();
  auto rpib = get_required_past_interval_bounds(
    info,
    oldest_epoch);
  if (rpib.first >= rpib.second) {
    // do not warn if the start bound is dictated by oldest_map; the
    // past intervals are presumably appropriate given the pg info.
    if (!past_intervals.empty() &&
        rpib.first > oldest_epoch) {
      pl->get_clog_error() << info.pgid << " required past_interval bounds are"
                           << " empty [" << rpib << ") but past_intervals is not: "
                           << past_intervals;
      derr << info.pgid << " required past_interval bounds are"
           << " empty [" << rpib << ") but past_intervals is not: "
           << past_intervals << dendl;
    }
  } else {
    if (past_intervals.empty()) {
      pl->get_clog_error() << info.pgid << " required past_interval bounds are"
                           << " not empty [" << rpib << ") but past_intervals "
                           << past_intervals << " is empty";
      derr << info.pgid << " required past_interval bounds are"
           << " not empty [" << rpib << ") but past_intervals "
           << past_intervals << " is empty" << dendl;
      ceph_assert(!past_intervals.empty());
    }

    auto apib = past_intervals.get_bounds();
    if (apib.first > rpib.first) {
      pl->get_clog_error() << info.pgid << " past_intervals [" << apib
                           << ") start interval does not contain the required"
                           << " bound [" << rpib << ") start";
      derr << info.pgid << " past_intervals [" << apib
           << ") start interval does not contain the required"
           << " bound [" << rpib << ") start" << dendl;
      ceph_abort_msg("past_interval start interval mismatch");
    }
    if (apib.second != rpib.second) {
      pl->get_clog_error() << info.pgid << " past_interval bound [" << apib
                           << ") end does not match required [" << rpib
                           << ") end";
      derr << info.pgid << " past_interval bound [" << apib
           << ") end does not match required [" << rpib
           << ") end" << dendl;
      ceph_abort_msg("past_interval end mismatch");
    }
  }
}

int PeeringState::clamp_recovery_priority(int priority, int pool_recovery_priority, int max)
{
  static_assert(OSD_RECOVERY_PRIORITY_MIN < OSD_RECOVERY_PRIORITY_MAX, "Invalid priority range");
  static_assert(OSD_RECOVERY_PRIORITY_MIN >= 0, "Priority range must match unsigned type");

  ceph_assert(max <= OSD_RECOVERY_PRIORITY_MAX);

  // User can't set this too high anymore, but might be a legacy value
  if (pool_recovery_priority > OSD_POOL_PRIORITY_MAX)
    pool_recovery_priority = OSD_POOL_PRIORITY_MAX;
  if (pool_recovery_priority < OSD_POOL_PRIORITY_MIN)
    pool_recovery_priority = OSD_POOL_PRIORITY_MIN;
  // Shift range from min to max to 0 to max - min
  pool_recovery_priority += (0 - OSD_POOL_PRIORITY_MIN);
  ceph_assert(pool_recovery_priority >= 0 && pool_recovery_priority <= (OSD_POOL_PRIORITY_MAX - OSD_POOL_PRIORITY_MIN));

  priority += pool_recovery_priority;

  // Clamp to valid range
  if (priority > max) {
    return max;
  } else if (priority < OSD_RECOVERY_PRIORITY_MIN) {
    return OSD_RECOVERY_PRIORITY_MIN;
  } else {
    return priority;
  }
}
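
// Worked example (illustrative, assuming the usual constants such as
// OSD_POOL_PRIORITY_MIN = -10 and OSD_RECOVERY_PRIORITY_BASE = 180): a pool
// recovery_priority of +10 is shifted from [-10,10] into [0,20], giving 20,
// so a base priority of 180 becomes 180 + 20 = 200, which is then clamped
// into [OSD_RECOVERY_PRIORITY_MIN, max].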

unsigned PeeringState::get_recovery_priority()
{
  // a higher value -> a higher priority
  int ret = OSD_RECOVERY_PRIORITY_BASE;
  int base = ret;

  if (state & PG_STATE_FORCED_RECOVERY) {
    ret = OSD_RECOVERY_PRIORITY_FORCED;
  } else {
    // XXX: This priority boost isn't so much about inactive, but about data-at-risk
    if (is_degraded() && info.stats.avail_no_missing.size() < pool.info.min_size) {
      base = OSD_RECOVERY_INACTIVE_PRIORITY_BASE;
      // inactive: no. of replicas < min_size, highest priority since it blocks IO
      ret = base + (pool.info.min_size - info.stats.avail_no_missing.size());
    }

    int64_t pool_recovery_priority = 0;
    pool.info.opts.get(pool_opts_t::RECOVERY_PRIORITY, &pool_recovery_priority);

    ret = clamp_recovery_priority(ret, pool_recovery_priority, max_prio_map[base]);
  }
  psdout(20) << __func__ << " recovery priority is " << ret << dendl;
  return static_cast<unsigned>(ret);
}

unsigned PeeringState::get_backfill_priority()
{
  // a higher value -> a higher priority
  int ret = OSD_BACKFILL_PRIORITY_BASE;
  int base = ret;

  if (state & PG_STATE_FORCED_BACKFILL) {
    ret = OSD_BACKFILL_PRIORITY_FORCED;
  } else {
    if (actingset.size() < pool.info.min_size) {
      base = OSD_BACKFILL_INACTIVE_PRIORITY_BASE;
      // inactive: no. of replicas < min_size, highest priority since it blocks IO
      ret = base + (pool.info.min_size - actingset.size());

    } else if (is_undersized()) {
      // undersized: OSD_BACKFILL_DEGRADED_PRIORITY_BASE + num missing replicas
      ceph_assert(pool.info.size > actingset.size());
      base = OSD_BACKFILL_DEGRADED_PRIORITY_BASE;
      ret = base + (pool.info.size - actingset.size());

    } else if (is_degraded()) {
      // degraded: baseline degraded
      base = ret = OSD_BACKFILL_DEGRADED_PRIORITY_BASE;
    }

    // Adjust with pool's recovery priority
    int64_t pool_recovery_priority = 0;
    pool.info.opts.get(pool_opts_t::RECOVERY_PRIORITY, &pool_recovery_priority);

    ret = clamp_recovery_priority(ret, pool_recovery_priority, max_prio_map[base]);
  }

  psdout(20) << __func__ << " backfill priority is " << ret << dendl;
  return static_cast<unsigned>(ret);
}

unsigned PeeringState::get_delete_priority()
{
  auto state = get_osdmap()->get_state(pg_whoami.osd);
  if (state & (CEPH_OSD_BACKFILLFULL |
               CEPH_OSD_FULL)) {
    return OSD_DELETE_PRIORITY_FULL;
  } else if (state & CEPH_OSD_NEARFULL) {
    return OSD_DELETE_PRIORITY_FULLISH;
  } else {
    return OSD_DELETE_PRIORITY_NORMAL;
  }
}

bool PeeringState::set_force_recovery(bool b)
{
  bool did = false;
  if (b) {
    if (!(state & PG_STATE_FORCED_RECOVERY) &&
        (state & (PG_STATE_DEGRADED |
                  PG_STATE_RECOVERY_WAIT |
                  PG_STATE_RECOVERING))) {
      psdout(20) << __func__ << " set" << dendl;
      state_set(PG_STATE_FORCED_RECOVERY);
      pl->publish_stats_to_osd();
      did = true;
    }
  } else if (state & PG_STATE_FORCED_RECOVERY) {
    psdout(20) << __func__ << " clear" << dendl;
    state_clear(PG_STATE_FORCED_RECOVERY);
    pl->publish_stats_to_osd();
    did = true;
  }
  if (did) {
    psdout(20) << __func__ << " state " << get_current_state()
               << dendl;
    pl->update_local_background_io_priority(get_recovery_priority());
  }
  return did;
}

bool PeeringState::set_force_backfill(bool b)
{
  bool did = false;
  if (b) {
    if (!(state & PG_STATE_FORCED_BACKFILL) &&
        (state & (PG_STATE_DEGRADED |
                  PG_STATE_BACKFILL_WAIT |
                  PG_STATE_BACKFILLING))) {
      psdout(10) << __func__ << " set" << dendl;
      state_set(PG_STATE_FORCED_BACKFILL);
      pl->publish_stats_to_osd();
      did = true;
    }
  } else if (state & PG_STATE_FORCED_BACKFILL) {
    psdout(10) << __func__ << " clear" << dendl;
    state_clear(PG_STATE_FORCED_BACKFILL);
    pl->publish_stats_to_osd();
    did = true;
  }
  if (did) {
    psdout(20) << __func__ << " state " << get_current_state()
               << dendl;
    pl->update_local_background_io_priority(get_backfill_priority());
  }
  return did;
}

void PeeringState::schedule_renew_lease()
{
  pl->schedule_renew_lease(
    last_peering_reset,
    readable_interval / 2);
}

void PeeringState::send_lease()
{
  epoch_t epoch = pl->get_osdmap_epoch();
  for (auto peer : actingset) {
    if (peer == pg_whoami) {
      continue;
    }
    pl->send_cluster_message(
      peer.osd,
      TOPNSPC::make_message<MOSDPGLease>(epoch,
                                         spg_t(spgid.pgid, peer.shard),
                                         get_lease()),
      epoch);
  }
}

void PeeringState::proc_lease(const pg_lease_t& l)
{
  assert(HAVE_FEATURE(upacting_features, SERVER_OCTOPUS));
  if (!is_nonprimary()) {
    psdout(20) << __func__ << " no-op, !nonprimary" << dendl;
    return;
  }
  psdout(10) << __func__ << " " << l << dendl;
  if (l.readable_until_ub > readable_until_ub_from_primary) {
    readable_until_ub_from_primary = l.readable_until_ub;
  }

  ceph::signedspan ru = ceph::signedspan::zero();
  if (l.readable_until != ceph::signedspan::zero() &&
      hb_stamps[0]->peer_clock_delta_ub) {
    ru = l.readable_until - *hb_stamps[0]->peer_clock_delta_ub;
    psdout(20) << " peer_clock_delta_ub " << *hb_stamps[0]->peer_clock_delta_ub
               << " -> ru " << ru << dendl;
  }
  if (ru > readable_until) {
    readable_until = ru;
    psdout(20) << __func__ << " readable_until now " << readable_until << dendl;
    // NOTE: if we ever decide to block/queue ops on the replica,
    // we'll need to wake them up here.
  }

  ceph::signedspan ruub;
  if (hb_stamps[0]->peer_clock_delta_lb) {
    ruub = l.readable_until_ub - *hb_stamps[0]->peer_clock_delta_lb;
    psdout(20) << " peer_clock_delta_lb " << *hb_stamps[0]->peer_clock_delta_lb
               << " -> ruub " << ruub << dendl;
  } else {
    ruub = pl->get_mnow() + l.interval;
    psdout(20) << " no peer_clock_delta_lb -> ruub " << ruub << dendl;
  }
  if (ruub > readable_until_ub) {
    readable_until_ub = ruub;
    psdout(20) << __func__ << " readable_until_ub now " << readable_until_ub
               << dendl;
  }
}
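
// Worked example (hypothetical values): if the primary's lease carries
// readable_until = 100s on the primary's clock and our measured clock-delta
// upper bound to the primary is +2s, the conservative local expiry is
// ru = 100s - 2s = 98s. The upper bound goes the other way and uses the
// delta lower bound, so with peer_clock_delta_lb = -1s we get
// ruub = readable_until_ub - (-1s), i.e. a slightly later local bound.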

void PeeringState::proc_lease_ack(int from, const pg_lease_ack_t& a)
{
  assert(HAVE_FEATURE(upacting_features, SERVER_OCTOPUS));
  auto now = pl->get_mnow();
  bool was_min = false;
  for (unsigned i = 0; i < acting.size(); ++i) {
    if (from == acting[i]) {
      // the lease_ack value is based on the primary's clock
      if (a.readable_until_ub > acting_readable_until_ub[i]) {
        if (acting_readable_until_ub[i] == readable_until) {
          was_min = true;
        }
        acting_readable_until_ub[i] = a.readable_until_ub;
        break;
      }
    }
  }
  if (was_min) {
    auto old_ru = readable_until;
    recalc_readable_until();
    if (now < old_ru) {
      pl->recheck_readable();
    }
  }
}

void PeeringState::proc_renew_lease()
{
  assert(HAVE_FEATURE(upacting_features, SERVER_OCTOPUS));
  renew_lease(pl->get_mnow());
  send_lease();
  schedule_renew_lease();
}

void PeeringState::recalc_readable_until()
{
  assert(is_primary());
  ceph::signedspan min = readable_until_ub_sent;
  for (unsigned i = 0; i < acting.size(); ++i) {
    if (acting[i] == pg_whoami.osd || acting[i] == CRUSH_ITEM_NONE) {
      continue;
    }
    dout(20) << __func__ << " peer osd." << acting[i]
             << " ruub " << acting_readable_until_ub[i] << dendl;
    if (acting_readable_until_ub[i] < min) {
      min = acting_readable_until_ub[i];
    }
  }
  readable_until = min;
  readable_until_ub = min;
  dout(20) << __func__ << " readable_until[_ub] " << readable_until
           << " (sent " << readable_until_ub_sent << ")" << dendl;
}

bool PeeringState::check_prior_readable_down_osds(const OSDMapRef& map)
{
  assert(HAVE_FEATURE(upacting_features, SERVER_OCTOPUS));
  bool changed = false;
  auto p = prior_readable_down_osds.begin();
  while (p != prior_readable_down_osds.end()) {
    if (map->is_dead(*p)) {
      dout(10) << __func__ << " prior_readable_down_osds osd." << *p
               << " is dead as of epoch " << map->get_epoch()
               << dendl;
      p = prior_readable_down_osds.erase(p);
      changed = true;
    } else {
      ++p;
    }
  }
  if (changed && prior_readable_down_osds.empty()) {
    psdout(10) << " empty prior_readable_down_osds, clearing ub" << dendl;
    clear_prior_readable_until_ub();
    return true;
  }
  return false;
}

bool PeeringState::adjust_need_up_thru(const OSDMapRef osdmap)
{
  epoch_t up_thru = osdmap->get_up_thru(pg_whoami.osd);
  if (need_up_thru &&
      up_thru >= info.history.same_interval_since) {
    psdout(10) << "adjust_need_up_thru now "
               << up_thru << ", need_up_thru now false" << dendl;
    need_up_thru = false;
    return true;
  }
  return false;
}

PastIntervals::PriorSet PeeringState::build_prior()
{
  if (1) {
    // sanity check
    for (auto it = peer_info.begin(); it != peer_info.end(); ++it) {
      ceph_assert(info.history.last_epoch_started >=
                  it->second.history.last_epoch_started);
    }
  }

  const OSDMap &osdmap = *get_osdmap();
  PastIntervals::PriorSet prior = past_intervals.get_prior_set(
    pool.info.is_erasure(),
    info.history.last_epoch_started,
    &missing_loc.get_recoverable_predicate(),
    [&](epoch_t start, int osd, epoch_t *lost_at) {
      const osd_info_t *pinfo = 0;
      if (osdmap.exists(osd)) {
        pinfo = &osdmap.get_info(osd);
        if (lost_at)
          *lost_at = pinfo->lost_at;
      }

      if (osdmap.is_up(osd)) {
        return PastIntervals::UP;
      } else if (!pinfo) {
        return PastIntervals::DNE;
      } else if (pinfo->lost_at > start) {
        return PastIntervals::LOST;
      } else {
        return PastIntervals::DOWN;
      }
    },
    up,
    acting,
    dpp);

  if (prior.pg_down) {
    state_set(PG_STATE_DOWN);
  }

  if (get_osdmap()->get_up_thru(pg_whoami.osd) <
      info.history.same_interval_since) {
    psdout(10) << "up_thru " << get_osdmap()->get_up_thru(pg_whoami.osd)
               << " < same_since " << info.history.same_interval_since
               << ", must notify monitor" << dendl;
    need_up_thru = true;
  } else {
    psdout(10) << "up_thru " << get_osdmap()->get_up_thru(pg_whoami.osd)
               << " >= same_since " << info.history.same_interval_since
               << ", all is well" << dendl;
    need_up_thru = false;
  }
  pl->set_probe_targets(prior.probe);
  return prior;
}
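
// The lambda above classifies each prior-interval OSD for the PriorSet
// (illustrative summary): an OSD still up is UP; one absent from the map is
// DNE; one marked lost after the interval started (lost_at > start) is LOST
// and need not be waited for; anything else is DOWN and may still hold data
// we need, which is what can drive prior.pg_down and PG_STATE_DOWN.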

bool PeeringState::needs_recovery() const
{
  ceph_assert(is_primary());

  auto &missing = pg_log.get_missing();

  if (missing.num_missing()) {
    psdout(10) << __func__ << " primary has " << missing.num_missing()
               << " missing" << dendl;
    return true;
  }

  ceph_assert(!acting_recovery_backfill.empty());
  for (const pg_shard_t& peer : acting_recovery_backfill) {
    if (peer == get_primary()) {
      continue;
    }
    auto pm = peer_missing.find(peer);
    if (pm == peer_missing.end()) {
      psdout(10) << __func__ << " osd." << peer << " doesn't have missing set"
                 << dendl;
      continue;
    }
    if (pm->second.num_missing()) {
      psdout(10) << __func__ << " osd." << peer << " has "
                 << pm->second.num_missing() << " missing" << dendl;
      return true;
    }
  }

  psdout(10) << __func__ << " is recovered" << dendl;
  return false;
}

bool PeeringState::needs_backfill() const
{
  ceph_assert(is_primary());

  // We can assume that the only OSDs that might need backfill
  // are those in backfill_targets.
  for (const pg_shard_t& peer : backfill_targets) {
    auto pi = peer_info.find(peer);
    ceph_assert(pi != peer_info.end());
    if (!pi->second.last_backfill.is_max()) {
      psdout(10) << __func__ << " osd." << peer
                 << " has last_backfill " << pi->second.last_backfill << dendl;
      return true;
    }
  }

  psdout(10) << __func__ << " does not need backfill" << dendl;
  return false;
}

/*
 * Returns true unless there is an unqueried, non-lost OSD in
 * might_have_unfound.
 */
bool PeeringState::all_unfound_are_queried_or_lost(
  const OSDMapRef osdmap) const
{
  ceph_assert(is_primary());

  auto peer = might_have_unfound.begin();
  auto mend = might_have_unfound.end();
  for (; peer != mend; ++peer) {
    if (peer_missing.count(*peer))
      continue;
    auto iter = peer_info.find(*peer);
    if (iter != peer_info.end() &&
        (iter->second.is_empty() || iter->second.dne()))
      continue;
    if (!osdmap->exists(peer->osd))
      continue;
    const osd_info_t &osd_info(osdmap->get_info(peer->osd));
    if (osd_info.lost_at <= osd_info.up_from) {
      // If there is even one OSD in might_have_unfound that isn't lost, we
      // still might retrieve our unfound.
      return false;
    }
  }
  psdout(10) << "all_unfound_are_queried_or_lost all of might_have_unfound "
             << might_have_unfound
             << " have been queried or are marked lost" << dendl;
  return true;
}


void PeeringState::reject_reservation()
{
  pl->unreserve_recovery_space();
  pl->send_cluster_message(
    primary.osd,
    TOPNSPC::make_message<MBackfillReserve>(
      MBackfillReserve::REJECT_TOOFULL,
      spg_t(info.pgid.pgid, primary.shard),
      get_osdmap_epoch()),
    get_osdmap_epoch());
}

/**
 * find_best_info
 *
 * Returns an iterator to the best info in infos sorted by:
 *  1) Prefer newer last_update
 *  2) Prefer longer tail if it brings another info into contiguity
 *  3) Prefer current primary
 */
map<pg_shard_t, pg_info_t>::const_iterator PeeringState::find_best_info(
  const map<pg_shard_t, pg_info_t> &infos,
  bool restrict_to_up_acting,
  bool *history_les_bound) const
{
  ceph_assert(history_les_bound);
  /* See doc/dev/osd_internals/last_epoch_started.rst before attempting
   * to make changes to this process.  Also, make sure to update it
   * when you find bugs! */
  epoch_t max_last_epoch_started_found = 0;
  for (auto i = infos.begin(); i != infos.end(); ++i) {
    if (!cct->_conf->osd_find_best_info_ignore_history_les &&
        max_last_epoch_started_found < i->second.history.last_epoch_started) {
      *history_les_bound = true;
      max_last_epoch_started_found = i->second.history.last_epoch_started;
    }
    if (!i->second.is_incomplete() &&
        max_last_epoch_started_found < i->second.last_epoch_started) {
      *history_les_bound = false;
      max_last_epoch_started_found = i->second.last_epoch_started;
    }
  }
  eversion_t min_last_update_acceptable = eversion_t::max();
  for (auto i = infos.begin(); i != infos.end(); ++i) {
    if (max_last_epoch_started_found <= i->second.last_epoch_started) {
      if (min_last_update_acceptable > i->second.last_update)
        min_last_update_acceptable = i->second.last_update;
    }
  }
  if (min_last_update_acceptable == eversion_t::max())
    return infos.end();

  auto best = infos.end();
  // find osd with newest last_update (oldest for ec_pool).
  // if there are multiples, prefer
  //  - a longer tail, if it brings another peer into log contiguity
  //  - the current primary
  for (auto p = infos.begin(); p != infos.end(); ++p) {
    if (restrict_to_up_acting && !is_up(p->first) &&
        !is_acting(p->first))
      continue;
    // Only consider peers with last_update >= min_last_update_acceptable
    if (p->second.last_update < min_last_update_acceptable)
      continue;
    // Disqualify anyone with a too old last_epoch_started
    if (p->second.last_epoch_started < max_last_epoch_started_found)
      continue;
    // Disqualify anyone who is incomplete (not fully backfilled)
    if (p->second.is_incomplete())
      continue;
    if (best == infos.end()) {
      best = p;
      continue;
    }
    // Prefer newer last_update
    if (pool.info.require_rollback()) {
      if (p->second.last_update > best->second.last_update)
        continue;
      if (p->second.last_update < best->second.last_update) {
        best = p;
        continue;
      }
    } else {
      if (p->second.last_update < best->second.last_update)
        continue;
      if (p->second.last_update > best->second.last_update) {
        best = p;
        continue;
      }
    }

    // Prefer longer tail
    if (p->second.log_tail > best->second.log_tail) {
      continue;
    } else if (p->second.log_tail < best->second.log_tail) {
      best = p;
      continue;
    }

    if (!p->second.has_missing() && best->second.has_missing()) {
      psdout(10) << __func__ << " prefer osd." << p->first
                 << " because it is complete while best has missing"
                 << dendl;
      best = p;
      continue;
    } else if (p->second.has_missing() && !best->second.has_missing()) {
      psdout(10) << __func__ << " skipping osd." << p->first
                 << " because it has missing while best is complete"
                 << dendl;
      continue;
    } else {
      // both are complete or have missing
      // fall through
    }

    // prefer current primary (usually the caller), all things being equal
    if (p->first == pg_whoami) {
      psdout(10) << "calc_acting prefer osd." << p->first
                 << " because it is current primary" << dendl;
      best = p;
      continue;
    }
  }
  return best;
}
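
// Worked example (hypothetical infos, replicated pool): with osd.0 at
// last_update 50'12, osd.1 also at 50'12 but with a smaller (i.e. longer)
// log_tail, and osd.2 at 50'10, osd.2 loses on last_update and osd.1 beats
// osd.0 on the longer tail; for an EC pool (require_rollback()) the
// comparison is inverted and the *oldest* last_update is preferred.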

void PeeringState::calc_ec_acting(
  map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
  unsigned size,
  const vector<int> &acting,
  const vector<int> &up,
  const map<pg_shard_t, pg_info_t> &all_info,
  bool restrict_to_up_acting,
  vector<int> *_want,
  set<pg_shard_t> *backfill,
  set<pg_shard_t> *acting_backfill,
  ostream &ss)
{
  vector<int> want(size, CRUSH_ITEM_NONE);
  map<shard_id_t, set<pg_shard_t> > all_info_by_shard;
  for (auto i = all_info.begin();
       i != all_info.end();
       ++i) {
    all_info_by_shard[i->first.shard].insert(i->first);
  }
  for (uint8_t i = 0; i < want.size(); ++i) {
    ss << "For position " << (unsigned)i << ": ";
    if (up.size() > (unsigned)i && up[i] != CRUSH_ITEM_NONE &&
        !all_info.find(pg_shard_t(up[i], shard_id_t(i)))->second.is_incomplete() &&
        all_info.find(pg_shard_t(up[i], shard_id_t(i)))->second.last_update >=
        auth_log_shard->second.log_tail) {
      ss << " selecting up[i]: " << pg_shard_t(up[i], shard_id_t(i)) << std::endl;
      want[i] = up[i];
      continue;
    }
    if (up.size() > (unsigned)i && up[i] != CRUSH_ITEM_NONE) {
      ss << " backfilling up[i]: " << pg_shard_t(up[i], shard_id_t(i))
         << " and ";
      backfill->insert(pg_shard_t(up[i], shard_id_t(i)));
    }

    if (acting.size() > (unsigned)i && acting[i] != CRUSH_ITEM_NONE &&
        !all_info.find(pg_shard_t(acting[i], shard_id_t(i)))->second.is_incomplete() &&
        all_info.find(pg_shard_t(acting[i], shard_id_t(i)))->second.last_update >=
        auth_log_shard->second.log_tail) {
      ss << " selecting acting[i]: " << pg_shard_t(acting[i], shard_id_t(i)) << std::endl;
      want[i] = acting[i];
    } else if (!restrict_to_up_acting) {
      for (auto j = all_info_by_shard[shard_id_t(i)].begin();
           j != all_info_by_shard[shard_id_t(i)].end();
           ++j) {
        ceph_assert(j->shard == i);
        if (!all_info.find(*j)->second.is_incomplete() &&
            all_info.find(*j)->second.last_update >=
            auth_log_shard->second.log_tail) {
          ss << " selecting stray: " << *j << std::endl;
          want[i] = j->osd;
          break;
        }
      }
      if (want[i] == CRUSH_ITEM_NONE)
        ss << " failed to fill position " << (int)i << std::endl;
    }
  }

  for (uint8_t i = 0; i < want.size(); ++i) {
    if (want[i] != CRUSH_ITEM_NONE) {
      acting_backfill->insert(pg_shard_t(want[i], shard_id_t(i)));
    }
  }
  acting_backfill->insert(backfill->begin(), backfill->end());
  _want->swap(want);
}

std::pair<map<pg_shard_t, pg_info_t>::const_iterator, eversion_t>
PeeringState::select_replicated_primary(
  map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
  uint64_t force_auth_primary_missing_objects,
  const std::vector<int> &up,
  pg_shard_t up_primary,
  const map<pg_shard_t, pg_info_t> &all_info,
  const OSDMapRef osdmap,
  ostream &ss)
{
  pg_shard_t auth_log_shard_id = auth_log_shard->first;

  ss << __func__ << " newest update on osd." << auth_log_shard_id
     << " with " << auth_log_shard->second << std::endl;

  // select primary
  auto primary = all_info.find(up_primary);
  if (up.size() &&
      !primary->second.is_incomplete() &&
      primary->second.last_update >=
        auth_log_shard->second.log_tail) {
    assert(HAVE_FEATURE(osdmap->get_up_osd_features(), SERVER_NAUTILUS));
    auto approx_missing_objects =
      primary->second.stats.stats.sum.num_objects_missing;
    auto auth_version = auth_log_shard->second.last_update.version;
    auto primary_version = primary->second.last_update.version;
    if (auth_version > primary_version) {
      approx_missing_objects += auth_version - primary_version;
    } else {
      approx_missing_objects += primary_version - auth_version;
    }
    if ((uint64_t)approx_missing_objects >
        force_auth_primary_missing_objects) {
      primary = auth_log_shard;
      ss << "up_primary: " << up_primary << " has approximately "
         << approx_missing_objects
         << " (>" << force_auth_primary_missing_objects << ") "
         << "missing objects, osd." << auth_log_shard_id
         << " selected as primary instead"
         << std::endl;
    } else {
      ss << "up_primary: " << up_primary << " selected as primary"
         << std::endl;
    }
  } else {
    ceph_assert(!auth_log_shard->second.is_incomplete());
    ss << "up[0] needs backfill, osd." << auth_log_shard_id
       << " selected as primary instead" << std::endl;
    primary = auth_log_shard;
  }

  ss << __func__ << " primary is osd." << primary->first
     << " with " << primary->second << std::endl;

  /* We include auth_log_shard->second.log_tail because in GetLog,
   * we will request logs back to the min last_update over our
   * acting_backfill set, which will result in our log being extended
   * as far backwards as necessary to pick up any peers which can
   * be log recovered by auth_log_shard's log */
  eversion_t oldest_auth_log_entry =
    std::min(primary->second.log_tail, auth_log_shard->second.log_tail);

  return std::make_pair(primary, oldest_auth_log_entry);
}


/**
 * calculate the desired acting set.
 *
 * Choose an appropriate acting set.  Prefer up[0], unless it is
 * incomplete, or another osd has a longer tail that allows us to
 * bring other up nodes up to date.
 */
void PeeringState::calc_replicated_acting(
  map<pg_shard_t, pg_info_t>::const_iterator primary,
  eversion_t oldest_auth_log_entry,
  unsigned size,
  const vector<int> &acting,
  const vector<int> &up,
  pg_shard_t up_primary,
  const map<pg_shard_t, pg_info_t> &all_info,
  bool restrict_to_up_acting,
  vector<int> *want,
  set<pg_shard_t> *backfill,
  set<pg_shard_t> *acting_backfill,
  const OSDMapRef osdmap,
  const PGPool& pool,
  ostream &ss)
{
  ss << __func__ << (restrict_to_up_acting ? " restrict_to_up_acting" : "")
     << std::endl;

  want->push_back(primary->first.osd);
  acting_backfill->insert(primary->first);

  // select replicas that have log contiguity with primary.
  // prefer up, then acting, then any peer_info osds
  for (auto i : up) {
    pg_shard_t up_cand = pg_shard_t(i, shard_id_t::NO_SHARD);
    if (up_cand == primary->first)
      continue;
    const pg_info_t &cur_info = all_info.find(up_cand)->second;
    if (cur_info.is_incomplete() ||
        cur_info.last_update < oldest_auth_log_entry) {
      ss << " shard " << up_cand << " (up) backfill " << cur_info << std::endl;
      backfill->insert(up_cand);
      acting_backfill->insert(up_cand);
    } else {
      want->push_back(i);
      acting_backfill->insert(up_cand);
      ss << " osd." << i << " (up) accepted " << cur_info << std::endl;
    }
  }

  if (want->size() >= size) {
    return;
  }

  std::vector<std::pair<eversion_t, int>> candidate_by_last_update;
  candidate_by_last_update.reserve(acting.size());
  // This no longer has backfill OSDs, but they are covered above.
  for (auto i : acting) {
    pg_shard_t acting_cand(i, shard_id_t::NO_SHARD);
    // skip up osds we already considered above
    if (acting_cand == primary->first)
      continue;
    auto up_it = find(up.begin(), up.end(), i);
    if (up_it != up.end())
      continue;

    const pg_info_t &cur_info = all_info.find(acting_cand)->second;
    if (cur_info.is_incomplete() ||
        cur_info.last_update < oldest_auth_log_entry) {
      ss << " shard " << acting_cand << " (acting) REJECTED "
         << cur_info << std::endl;
    } else {
      candidate_by_last_update.emplace_back(cur_info.last_update, i);
    }
  }

  auto sort_by_eversion = [](const std::pair<eversion_t, int> &lhs,
                             const std::pair<eversion_t, int> &rhs) {
    return lhs.first > rhs.first;
  };
  // sort by last_update, in descending order.
  std::sort(candidate_by_last_update.begin(),
            candidate_by_last_update.end(), sort_by_eversion);
  for (auto &p: candidate_by_last_update) {
    ceph_assert(want->size() < size);
    want->push_back(p.second);
    pg_shard_t s = pg_shard_t(p.second, shard_id_t::NO_SHARD);
    acting_backfill->insert(s);
    ss << " shard " << s << " (acting) accepted "
       << all_info.find(s)->second << std::endl;
    if (want->size() >= size) {
      return;
    }
  }

  if (restrict_to_up_acting) {
    return;
  }
  candidate_by_last_update.clear();
  candidate_by_last_update.reserve(all_info.size()); // overestimate but fine
  // continue to search stray to find more suitable peers
  for (auto &i : all_info) {
    // skip up osds we already considered above
    if (i.first == primary->first)
      continue;
    auto up_it = find(up.begin(), up.end(), i.first.osd);
    if (up_it != up.end())
      continue;
    auto acting_it = find(
      acting.begin(), acting.end(), i.first.osd);
    if (acting_it != acting.end())
      continue;

    if (i.second.is_incomplete() ||
        i.second.last_update < oldest_auth_log_entry) {
      ss << " shard " << i.first << " (stray) REJECTED " << i.second
         << std::endl;
    } else {
      candidate_by_last_update.emplace_back(
        i.second.last_update, i.first.osd);
    }
  }

  if (candidate_by_last_update.empty()) {
    // save us some effort
    return;
  }

  // sort by last_update, in descending order.
  std::sort(candidate_by_last_update.begin(),
            candidate_by_last_update.end(), sort_by_eversion);

  for (auto &p: candidate_by_last_update) {
    ceph_assert(want->size() < size);
    want->push_back(p.second);
    pg_shard_t s = pg_shard_t(p.second, shard_id_t::NO_SHARD);
    acting_backfill->insert(s);
    ss << " shard " << s << " (stray) accepted "
       << all_info.find(s)->second << std::endl;
    if (want->size() >= size) {
      return;
    }
  }
}
1840
1841 // Defines osd preference order: acting set, then larger last_update
1842 using osd_ord_t = std::tuple<bool, eversion_t>; // <acting, last_update>
1843 using osd_id_t = int;
1844
1845 class bucket_candidates_t {
1846 std::deque<std::pair<osd_ord_t, osd_id_t>> osds;
1847 int selected = 0;
1848
1849 public:
1850 void add_osd(osd_ord_t ord, osd_id_t osd) {
1851 // osds will be added in smallest to largest order
1852 assert(osds.empty() || osds.back().first <= ord);
1853 osds.push_back(std::make_pair(ord, osd));
1854 }
1855 osd_id_t pop_osd() {
1856 ceph_assert(!is_empty());
1857 auto ret = osds.front();
1858 osds.pop_front();
1859 return ret.second;
1860 }
1861
1862 void inc_selected() { selected++; }
1863 unsigned get_num_selected() const { return selected; }
1864
1865 osd_ord_t get_ord() const {
1866 return osds.empty() ? std::make_tuple(false, eversion_t())
1867 : osds.front().first;
1868 }
1869
1870 bool is_empty() const { return osds.empty(); }
1871
1872 bool operator<(const bucket_candidates_t &rhs) const {
1873 return std::make_tuple(-selected, get_ord()) <
1874 std::make_tuple(-rhs.selected, rhs.get_ord());
1875 }
1876
1877 friend std::ostream &operator<<(std::ostream &, const bucket_candidates_t &);
1878 };
1879
1880 std::ostream &operator<<(std::ostream &lhs, const bucket_candidates_t &cand)
1881 {
1882 return lhs << "candidates[" << cand.osds << "]";
1883 }
1884
1885 class bucket_heap_t {
1886 using elem_t = std::reference_wrapper<bucket_candidates_t>;
1887 std::vector<elem_t> heap;
1888
1889 // Max heap -- should emit buckets in order of preference
1890 struct comp {
1891 bool operator()(const elem_t &lhs, const elem_t &rhs) {
1892 return lhs.get() < rhs.get();
1893 }
1894 };
1895 public:
1896 void push_if_nonempty(elem_t e) {
1897 if (!e.get().is_empty()) {
1898 heap.push_back(e);
1899 std::push_heap(heap.begin(), heap.end(), comp());
1900 }
1901 }
1902 elem_t pop() {
1903 std::pop_heap(heap.begin(), heap.end(), comp());
1904 auto ret = heap.back();
1905 heap.pop_back();
1906 return ret;
1907 }
1908
1909 bool is_empty() const { return heap.empty(); }
1910 };
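// A sketch of how this heap is used below (see
// calc_replicated_acting_stretch): push each ancestor bucket once, then
// repeatedly pop the most preferred bucket, take a single osd from it, and
// re-push it while it still has candidates and has not hit bucket_max.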
1911
1912 /**
1913 * calc_replicated_acting_stretch
1914 *
1915 * Choose an acting set using as much of the up set as possible; filling
1916 * in the remaining slots so as to maximize the number of crush buckets at
1917 * level pool.info.peering_crush_bucket_barrier represented.
1918 *
1919 * Stretch clusters are a bit special: while they have a "size" the
1920 * same way as normal pools, if we happen to lose a data center
1921 * (we call it a "stretch bucket", but really it'll be a data center or
1922 * a cloud availability zone), we don't actually want to shove
1923 * 2 DC's worth of replication into a single site -- it won't fit!
1924 * So we locally calculate a bucket_max, based
1925 * on the targeted number of stretch buckets for the pool and
1926 * its size. Then we won't pull more than bucket_max from any
1927 * given ancestor even if it leaves us undersized.
1928 *
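 * As a hypothetical example (numbers assumed for illustration): a
 * two-datacenter stretch pool with size=4 and
 * peering_crush_bucket_target=2 gets bucket_max=2, so even with one DC
 * down we take at most 2 OSDs from the survivor rather than packing all
 * 4 replicas into it.
 *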
1929 * There are two distinct phases: (commented below)
1930 */
1931 void PeeringState::calc_replicated_acting_stretch(
1932 map<pg_shard_t, pg_info_t>::const_iterator primary,
1933 eversion_t oldest_auth_log_entry,
1934 unsigned size,
1935 const vector<int> &acting,
1936 const vector<int> &up,
1937 pg_shard_t up_primary,
1938 const map<pg_shard_t, pg_info_t> &all_info,
1939 bool restrict_to_up_acting,
1940 vector<int> *want,
1941 set<pg_shard_t> *backfill,
1942 set<pg_shard_t> *acting_backfill,
1943 const OSDMapRef osdmap,
1944 const PGPool& pool,
1945 ostream &ss)
1946 {
1947 ceph_assert(want);
1948 ceph_assert(acting_backfill);
1949 ceph_assert(backfill);
1950 ss << __func__ << (restrict_to_up_acting ? " restrict_to_up_acting" : "")
1951 << std::endl;
1952
1953 auto used = [want](int osd) {
1954 return std::find(want->begin(), want->end(), osd) != want->end();
1955 };
1956
1957 auto usable_info = [&](const auto &cur_info) mutable {
1958 return !(cur_info.is_incomplete() ||
1959 cur_info.last_update < oldest_auth_log_entry);
1960 };
1961
1962 auto osd_info = [&](int osd) mutable -> const pg_info_t & {
1963 pg_shard_t cand = pg_shard_t(osd, shard_id_t::NO_SHARD);
1964 const pg_info_t &cur_info = all_info.find(cand)->second;
1965 return cur_info;
1966 };
1967
1968 auto usable_osd = [&](int osd) mutable {
1969 return usable_info(osd_info(osd));
1970 };
1971
1972 std::map<int, bucket_candidates_t> ancestors;
1973 auto get_ancestor = [&](int osd) mutable {
1974 int ancestor = osdmap->crush->get_parent_of_type(
1975 osd,
1976 pool.info.peering_crush_bucket_barrier,
1977 pool.info.crush_rule);
1978 return &ancestors[ancestor];
1979 };
1980
1981 unsigned bucket_max = pool.info.size / pool.info.peering_crush_bucket_target;
1982 if (bucket_max * pool.info.peering_crush_bucket_target < pool.info.size) {
1983 ++bucket_max;
1984 }
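// The two lines above are just ceil(size / bucket_target); e.g. (values
// assumed) size=4 with peering_crush_bucket_target=3 gives 4/3=1, and
// 1*3 < 4 bumps it to 2, capping any single bucket at 2 of the 4 replicas.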
1985
1986 /* 1) Select all usable osds from the up set as well as the primary
1987 *
1988 * We also stash any unusable osds from up into backfill.
1989 */
1990 auto add_required = [&](int osd) {
1991 if (!used(osd)) {
1992 want->push_back(osd);
1993 acting_backfill->insert(
1994 pg_shard_t(osd, shard_id_t::NO_SHARD));
1995 get_ancestor(osd)->inc_selected();
1996 }
1997 };
1998 add_required(primary->first.osd);
1999 ss << " osd " << primary->first.osd << " primary accepted "
2000 << osd_info(primary->first.osd) << std::endl;
2001 for (auto upcand: up) {
2002 auto upshard = pg_shard_t(upcand, shard_id_t::NO_SHARD);
2003 auto &curinfo = osd_info(upcand);
2004 if (usable_osd(upcand)) {
2005 ss << " osd " << upcand << " (up) accepted " << curinfo << std::endl;
2006 add_required(upcand);
2007 } else {
2008 ss << " osd " << upcand << " (up) backfill " << curinfo << std::endl;
2009 backfill->insert(upshard);
2010 acting_backfill->insert(upshard);
2011 }
2012 }
2013
2014 if (want->size() >= pool.info.size) { // non-failed CRUSH mappings are valid
2015 ss << " up set sufficient" << std::endl;
2016 return;
2017 }
2018 ss << " up set insufficient, considering remaining osds" << std::endl;
2019
2020 /* 2) Fill out remaining slots from usable osds in all_info
2021 * while maximizing the number of ancestor nodes at the
2022 * barrier_id crush level.
2023 */
2024 {
2025 std::vector<std::pair<osd_ord_t, osd_id_t>> candidates;
2026 /* To do this, we first filter the set of usable osd into an ordered
2027 * list of usable osds
2028 */
2029 auto get_osd_ord = [&](bool is_acting, const pg_info_t &info) -> osd_ord_t {
2030 return std::make_tuple(
2031 !is_acting /* acting should sort first */,
2032 info.last_update);
2033 };
2034 for (auto &cand : acting) {
2035 auto &cand_info = osd_info(cand);
2036 if (!used(cand) && usable_info(cand_info)) {
2037 ss << " acting candidate " << cand << " " << cand_info << std::endl;
2038 candidates.push_back(std::make_pair(get_osd_ord(true, cand_info), cand));
2039 }
2040 }
2041 if (!restrict_to_up_acting) {
2042 for (auto &[cand, info] : all_info) {
2043 if (!used(cand.osd) && usable_info(info) &&
2044 (std::find(acting.begin(), acting.end(), cand.osd)
2045 == acting.end())) {
2046 ss << " other candidate " << cand << " " << info << std::endl;
2047 candidates.push_back(
2048 std::make_pair(get_osd_ord(false, info), cand.osd));
2049 }
2050 }
2051 }
2052 std::sort(candidates.begin(), candidates.end());
2053
2054 // We then group these candidates by ancestor
2055 std::for_each(candidates.begin(), candidates.end(), [&](auto cand) {
2056 get_ancestor(cand.second)->add_osd(cand.first, cand.second);
2057 });
2058 }
2059
2060 auto pop_ancestor = [&](auto &ancestor) {
2061 ceph_assert(!ancestor.is_empty());
2062 auto osd = ancestor.pop_osd();
2063
2064 ss << " accepting candidate " << osd << std::endl;
2065
2066 ceph_assert(!used(osd));
2067 ceph_assert(usable_osd(osd));
2068
2069 want->push_back(osd);
2070 acting_backfill->insert(
2071 pg_shard_t(osd, shard_id_t::NO_SHARD));
2072 ancestor.inc_selected();
2073 };
2074
2075 /* Next, we use the ancestors map to grab a descendant of the
2076 * peering_crush_mandatory_member if not already represented.
2077 *
2078 * TODO: this check now compares against CRUSH_ITEM_NONE; any other
2079 * users that still compare against 0 should be updated to do the
2080 * same.
2081 */
2082 if (pool.info.peering_crush_mandatory_member != CRUSH_ITEM_NONE) {
2083 auto aiter = ancestors.find(pool.info.peering_crush_mandatory_member);
2084 if (aiter != ancestors.end() &&
2085 !aiter->second.get_num_selected()) {
2086 ss << " adding required ancestor " << aiter->first << std::endl;
2087 ceph_assert(!aiter->second.is_empty()); // wouldn't exist otherwise
2088 pop_ancestor(aiter->second);
2089 }
2090 }
2091
2092 /* We then place the ancestors in a heap ordered by fewest selected
2093 * and then by the ordering token of the next osd */
2094 bucket_heap_t aheap;
2095 std::for_each(ancestors.begin(), ancestors.end(), [&](auto &anc) {
2096 aheap.push_if_nonempty(anc.second);
2097 });
2098
2099 /* and pull from this heap until it's empty or we have enough.
2100 * "We have enough" is a sufficient check here for
2101 * stretch_set_can_peer() because our heap sorting always
2102 * pulls from ancestors with the least number of included OSDs,
2103 * so if it is possible to satisfy the bucket_count constraints we
2104 * will do so.
2105 */
2106 while (!aheap.is_empty() && want->size() < pool.info.size) {
2107 auto next = aheap.pop();
2108 pop_ancestor(next.get());
2109 if (next.get().get_num_selected() < bucket_max) {
2110 aheap.push_if_nonempty(next);
2111 }
2112 }
2113
2114 /* The end result is that we should have as many buckets covered as
2115 * possible while respecting up, the primary selection,
2116 * the pool size (given bucket count constraints),
2117 * and the mandatory member.
2118 */
2119 }
2120
2121
2122 bool PeeringState::recoverable(const vector<int> &want) const
2123 {
2124 unsigned num_want_acting = 0;
2125 set<pg_shard_t> have;
2126 for (int i = 0; i < (int)want.size(); ++i) {
2127 if (want[i] != CRUSH_ITEM_NONE) {
2128 ++num_want_acting;
2129 have.insert(
2130 pg_shard_t(
2131 want[i],
2132 pool.info.is_erasure() ? shard_id_t(i) : shard_id_t::NO_SHARD));
2133 }
2134 }
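// For illustration (assumed want vector): an EC pg with
// want=[3, CRUSH_ITEM_NONE, 7] yields num_want_acting=2 and
// have={3(0), 7(2)}; the hole at slot 1 contributes nothing, and
// recoverability is then judged by min_size and the predicate below.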
2135
2136 if (num_want_acting < pool.info.min_size) {
2137 if (!cct->_conf.get_val<bool>("osd_allow_recovery_below_min_size")) {
2138 psdout(10) << __func__ << " failed, recovery below min size not enabled" << dendl;
2139 return false;
2140 }
2141 }
2142 if (missing_loc.get_recoverable_predicate()(have)) {
2143 return true;
2144 } else {
2145 psdout(10) << __func__ << " failed, not recoverable " << dendl;
2146 return false;
2147 }
2148 }
2149
2150 void PeeringState::choose_async_recovery_ec(
2151 const map<pg_shard_t, pg_info_t> &all_info,
2152 const pg_info_t &auth_info,
2153 vector<int> *want,
2154 set<pg_shard_t> *async_recovery,
2155 const OSDMapRef osdmap) const
2156 {
2157 set<pair<int, pg_shard_t> > candidates_by_cost;
2158 for (uint8_t i = 0; i < want->size(); ++i) {
2159 if ((*want)[i] == CRUSH_ITEM_NONE)
2160 continue;
2161
2162 // Considering log entries to recover is accurate enough for
2163 // now. We could use minimum_to_decode_with_cost() later if
2164 // necessary.
2165 pg_shard_t shard_i((*want)[i], shard_id_t(i));
2166 // do not include strays
2167 if (stray_set.find(shard_i) != stray_set.end())
2168 continue;
2169 // Do not include an osd that is not up, since choosing it as
2170 // an async_recovery_target will move it out of the acting set.
2171 // This results in it being identified as a stray during peering,
2172 // because it is no longer in the up or acting set.
2173 if (!is_up(shard_i))
2174 continue;
2175 auto shard_info = all_info.find(shard_i)->second;
2176 // for ec pools we rollback all entries past the authoritative
2177 // last_update *before* activation. This is relatively inexpensive
2178 // compared to recovery, since it is purely local, so treat shards
2179 // past the authoritative last_update the same as those equal to it.
2180 version_t auth_version = auth_info.last_update.version;
2181 version_t candidate_version = shard_info.last_update.version;
2182 assert(HAVE_FEATURE(osdmap->get_up_osd_features(), SERVER_NAUTILUS));
2183 auto approx_missing_objects =
2184 shard_info.stats.stats.sum.num_objects_missing;
2185 if (auth_version > candidate_version) {
2186 approx_missing_objects += auth_version - candidate_version;
2187 }
2188 if (static_cast<uint64_t>(approx_missing_objects) >
2189 cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) {
2190 candidates_by_cost.emplace(approx_missing_objects, shard_i);
2191 }
2192 }
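// Worked example (values assumed): auth last_update 20'150 and a shard at
// 20'120 with 10 historically missing objects costs 10 + (150-120) = 40.
// Only shards whose cost exceeds osd_async_recovery_min_cost (100 by
// default) become async recovery candidates.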
2193
2194 psdout(20) << __func__ << " candidates by cost are: " << candidates_by_cost
2195 << dendl;
2196
2197 // take out as many osds as we can for async recovery, in order of cost
2198 for (auto rit = candidates_by_cost.rbegin();
2199 rit != candidates_by_cost.rend(); ++rit) {
2200 pg_shard_t cur_shard = rit->second;
2201 vector<int> candidate_want(*want);
2202 candidate_want[cur_shard.shard.id] = CRUSH_ITEM_NONE;
2203 if (recoverable(candidate_want)) {
2204 want->swap(candidate_want);
2205 async_recovery->insert(cur_shard);
2206 }
2207 }
2208 psdout(20) << __func__ << " result want=" << *want
2209 << " async_recovery=" << *async_recovery << dendl;
2210 }
2211
2212 void PeeringState::choose_async_recovery_replicated(
2213 const map<pg_shard_t, pg_info_t> &all_info,
2214 const pg_info_t &auth_info,
2215 vector<int> *want,
2216 set<pg_shard_t> *async_recovery,
2217 const OSDMapRef osdmap) const
2218 {
2219 set<pair<int, pg_shard_t> > candidates_by_cost;
2220 for (auto osd_num : *want) {
2221 pg_shard_t shard_i(osd_num, shard_id_t::NO_SHARD);
2222 // do not include strays
2223 if (stray_set.find(shard_i) != stray_set.end())
2224 continue;
2225 // Do not include an osd that is not up, since choosing it as
2226 // an async_recovery_target will move it out of the acting set.
2227 // This results in it being identified as a stray during peering,
2228 // because it is no longer in the up or acting set.
2229 if (!is_up(shard_i))
2230 continue;
2231 auto shard_info = all_info.find(shard_i)->second;
2232 // use the approximate magnitude of the difference in length of
2233 // logs plus historical missing objects as the cost of recovery
2234 version_t auth_version = auth_info.last_update.version;
2235 version_t candidate_version = shard_info.last_update.version;
2236 assert(HAVE_FEATURE(osdmap->get_up_osd_features(), SERVER_NAUTILUS));
2237 auto approx_missing_objects =
2238 shard_info.stats.stats.sum.num_objects_missing;
2239 if (auth_version > candidate_version) {
2240 approx_missing_objects += auth_version - candidate_version;
2241 } else {
2242 approx_missing_objects += candidate_version - auth_version;
2243 }
2244 if (static_cast<uint64_t>(approx_missing_objects) >
2245 cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) {
2246 candidates_by_cost.emplace(approx_missing_objects, shard_i);
2247 }
2248 }
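// Unlike the EC variant above, the replicated cost counts divergence in
// both directions: e.g. (values assumed) auth 20'120 vs. candidate 20'150
// still contributes a log cost of 30, since those extra entries must be
// reconciled as well.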
2249
2250 psdout(20) << __func__ << " candidates by cost are: " << candidates_by_cost
2251 << dendl;
2252 // take out as many osds as we can for async recovery, in order of cost
2253 for (auto rit = candidates_by_cost.rbegin();
2254 rit != candidates_by_cost.rend(); ++rit) {
2255 if (want->size() <= pool.info.min_size) {
2256 break;
2257 }
2258 pg_shard_t cur_shard = rit->second;
2259 vector<int> candidate_want(*want);
2260 for (auto it = candidate_want.begin(); it != candidate_want.end(); ++it) {
2261 if (*it == cur_shard.osd) {
2262 candidate_want.erase(it);
2263 if (pool.info.stretch_set_can_peer(candidate_want, *osdmap, NULL)) {
2264 // if we're in stretch mode, we can only remove the osd if it doesn't
2265 // break peering limits.
2266 want->swap(candidate_want);
2267 async_recovery->insert(cur_shard);
2268 }
2269 break;
2270 }
2271 }
2272 }
2273
2274 psdout(20) << __func__ << " result want=" << *want
2275 << " async_recovery=" << *async_recovery << dendl;
2276 }
2277
2278 /**
2279 * choose acting
2280 *
2281 * calculate the desired acting, and request a change with the monitor
2282 * if it differs from the current acting.
2283 *
2284 * if restrict_to_up_acting=true, we filter out anything that's not in
2285 * up/acting. in order to lift this restriction, we need to
2286 * 1) check whether it's worth switching the acting set any time we get
2287 * a new pg info (not just here, when recovery finishes)
2288 * 2) check whether anything in want_acting went down on each new map
2289 * (and, if so, calculate a new want_acting)
2290 * 3) remove the assertion in PG::PeeringState::Active::react(const AdvMap)
2291 * TODO!
2292 */
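/* A rough sketch of the return semantics below: false means "no change
 * yet" (no usable auth log was found, want is not recoverable, or a
 * pg_temp change was just requested); true means the computed acting set
 * is already in effect (or, with request_pg_temp_change_only, that no
 * pg_temp change is needed).
 */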
2293 bool PeeringState::choose_acting(pg_shard_t &auth_log_shard_id,
2294 bool restrict_to_up_acting,
2295 bool *history_les_bound,
2296 bool request_pg_temp_change_only)
2297 {
2298 map<pg_shard_t, pg_info_t> all_info(peer_info.begin(), peer_info.end());
2299 all_info[pg_whoami] = info;
2300
2301 if (cct->_conf->subsys.should_gather<dout_subsys, 10>()) {
2302 for (auto p = all_info.begin(); p != all_info.end(); ++p) {
2303 psdout(10) << __func__ << " all_info osd." << p->first << " "
2304 << p->second << dendl;
2305 }
2306 }
2307
2308 auto auth_log_shard = find_best_info(all_info, restrict_to_up_acting,
2309 history_les_bound);
2310
2311 if (auth_log_shard == all_info.end()) {
2312 if (up != acting) {
2313 psdout(10) << __func__ << " no suitable info found (incomplete backfills?),"
2314 << " reverting to up" << dendl;
2315 want_acting = up;
2316 vector<int> empty;
2317 pl->queue_want_pg_temp(empty);
2318 } else {
2319 psdout(10) << __func__ << " failed" << dendl;
2320 ceph_assert(want_acting.empty());
2321 }
2322 return false;
2323 }
2324
2325 ceph_assert(!auth_log_shard->second.is_incomplete());
2326 auth_log_shard_id = auth_log_shard->first;
2327
2328 set<pg_shard_t> want_backfill, want_acting_backfill;
2329 vector<int> want;
2330 stringstream ss;
2331 if (pool.info.is_replicated()) {
2332 auto [primary_shard, oldest_log] = select_replicated_primary(
2333 auth_log_shard,
2334 cct->_conf.get_val<uint64_t>(
2335 "osd_force_auth_primary_missing_objects"),
2336 up,
2337 up_primary,
2338 all_info,
2339 get_osdmap(),
2340 ss);
2341 if (pool.info.is_stretch_pool()) {
2342 calc_replicated_acting_stretch(
2343 primary_shard,
2344 oldest_log,
2345 get_osdmap()->get_pg_size(info.pgid.pgid),
2346 acting,
2347 up,
2348 up_primary,
2349 all_info,
2350 restrict_to_up_acting,
2351 &want,
2352 &want_backfill,
2353 &want_acting_backfill,
2354 get_osdmap(),
2355 pool,
2356 ss);
2357 } else {
2358 calc_replicated_acting(
2359 primary_shard,
2360 oldest_log,
2361 get_osdmap()->get_pg_size(info.pgid.pgid),
2362 acting,
2363 up,
2364 up_primary,
2365 all_info,
2366 restrict_to_up_acting,
2367 &want,
2368 &want_backfill,
2369 &want_acting_backfill,
2370 get_osdmap(),
2371 pool,
2372 ss);
2373 }
2374 } else {
2375 calc_ec_acting(
2376 auth_log_shard,
2377 get_osdmap()->get_pg_size(info.pgid.pgid),
2378 acting,
2379 up,
2380 all_info,
2381 restrict_to_up_acting,
2382 &want,
2383 &want_backfill,
2384 &want_acting_backfill,
2385 ss);
2386 }
2387 psdout(10) << ss.str() << dendl;
2388
2389 if (!recoverable(want)) {
2390 want_acting.clear();
2391 return false;
2392 }
2393
2394 set<pg_shard_t> want_async_recovery;
2395 if (HAVE_FEATURE(get_osdmap()->get_up_osd_features(), SERVER_MIMIC)) {
2396 if (pool.info.is_erasure()) {
2397 choose_async_recovery_ec(
2398 all_info, auth_log_shard->second, &want, &want_async_recovery,
2399 get_osdmap());
2400 } else {
2401 choose_async_recovery_replicated(
2402 all_info, auth_log_shard->second, &want, &want_async_recovery,
2403 get_osdmap());
2404 }
2405 }
2406 while (want.size() > pool.info.size) {
2407 // async recovery should have taken out as many osds as it can.
2408 // if not, then always evict the last peer
2409 // (will get synchronously recovered later)
2410 psdout(10) << __func__ << " evicting osd." << want.back()
2411 << " from oversized want " << want << dendl;
2412 want.pop_back();
2413 }
2414 if (want != acting) {
2415 psdout(10) << __func__ << " want " << want << " != acting " << acting
2416 << ", requesting pg_temp change" << dendl;
2417 want_acting = want;
2418
2419 if (!cct->_conf->osd_debug_no_acting_change) {
2420 if (want_acting == up) {
2421 // There can't be any pending backfill if
2422 // want is the same as crush map up OSDs.
2423 ceph_assert(want_backfill.empty());
2424 vector<int> empty;
2425 pl->queue_want_pg_temp(empty);
2426 } else
2427 pl->queue_want_pg_temp(want);
2428 }
2429 return false;
2430 }
2431
2432 if (request_pg_temp_change_only)
2433 return true;
2434 want_acting.clear();
2435 acting_recovery_backfill = want_acting_backfill;
2436 psdout(10) << "acting_recovery_backfill is "
2437 << acting_recovery_backfill << dendl;
2438 ceph_assert(
2439 backfill_targets.empty() ||
2440 backfill_targets == want_backfill);
2441 if (backfill_targets.empty()) {
2442 // Caller is GetInfo
2443 backfill_targets = want_backfill;
2444 }
2445 // Adding !needs_recovery() to let the async_recovery_targets reset after recovery is complete
2446 ceph_assert(
2447 async_recovery_targets.empty() ||
2448 async_recovery_targets == want_async_recovery ||
2449 !needs_recovery());
2450 if (async_recovery_targets.empty() || !needs_recovery()) {
2451 async_recovery_targets = want_async_recovery;
2452 }
2453 // backfill_targets will not change if already set because up would have had to change
2454 // Verify that nothing in backfill is in stray_set
2455 for (auto i = want_backfill.begin(); i != want_backfill.end(); ++i) {
2456 ceph_assert(stray_set.find(*i) == stray_set.end());
2457 }
2458 psdout(10) << "choose_acting want=" << want << " backfill_targets="
2459 << want_backfill << " async_recovery_targets="
2460 << async_recovery_targets << dendl;
2461 return true;
2462 }
2463
2464 void PeeringState::log_weirdness()
2465 {
2466 if (pg_log.get_tail() != info.log_tail)
2467 pl->get_clog_error() << info.pgid
2468 << " info mismatch, log.tail " << pg_log.get_tail()
2469 << " != info.log_tail " << info.log_tail;
2470 if (pg_log.get_head() != info.last_update)
2471 pl->get_clog_error() << info.pgid
2472 << " info mismatch, log.head " << pg_log.get_head()
2473 << " != info.last_update " << info.last_update;
2474
2475 if (!pg_log.get_log().empty()) {
2476 // sloppy check
2477 if ((pg_log.get_log().log.begin()->version <= pg_log.get_tail()))
2478 pl->get_clog_error() << info.pgid
2479 << " log bound mismatch, info (tail,head] ("
2480 << pg_log.get_tail() << ","
2481 << pg_log.get_head() << "]"
2482 << " actual ["
2483 << pg_log.get_log().log.begin()->version << ","
2484 << pg_log.get_log().log.rbegin()->version << "]";
2485 }
2486
2487 if (pg_log.get_log().caller_ops.size() > pg_log.get_log().log.size()) {
2488 pl->get_clog_error() << info.pgid
2489 << " caller_ops.size "
2490 << pg_log.get_log().caller_ops.size()
2491 << " > log size " << pg_log.get_log().log.size();
2492 }
2493 }
2494
2495 /*
2496 * Process information from a replica to determine if it could have any
2497 * objects that I need.
2498 *
2499 * TODO: if the missing set becomes very large, this could get expensive.
2500 * Instead, we probably want to just iterate over our unfound set.
2501 */
2502 bool PeeringState::search_for_missing(
2503 const pg_info_t &oinfo, const pg_missing_t &omissing,
2504 pg_shard_t from,
2505 PeeringCtxWrapper &ctx)
2506 {
2507 uint64_t num_unfound_before = missing_loc.num_unfound();
2508 bool found_missing = missing_loc.add_source_info(
2509 from, oinfo, omissing, ctx.handle);
2510 if (found_missing && num_unfound_before != missing_loc.num_unfound())
2511 pl->publish_stats_to_osd();
2512 // avoid doing this if the peer is empty. This is a bit of paranoia
2513 // to avoid doing something rash if add_source_info() above
2514 // incorrectly decided we found something new. (if the peer has
2515 // last_update=0'0 that's impossible.)
2516 if (found_missing &&
2517 oinfo.last_update != eversion_t()) {
2518 pg_info_t tinfo(oinfo);
2519 tinfo.pgid.shard = pg_whoami.shard;
2520 ctx.send_info(
2521 from.osd,
2522 spg_t(info.pgid.pgid, from.shard),
2523 get_osdmap_epoch(), // fixme: use lower epoch?
2524 get_osdmap_epoch(),
2525 tinfo);
2526 }
2527 return found_missing;
2528 }
2529
2530 bool PeeringState::discover_all_missing(
2531 BufferedRecoveryMessages &rctx)
2532 {
2533 auto &missing = pg_log.get_missing();
2534 uint64_t unfound = get_num_unfound();
2535 bool any = false; // did we start any queries
2536
2537 psdout(10) << __func__ << " "
2538 << missing.num_missing() << " missing, "
2539 << unfound << " unfound"
2540 << dendl;
2541
2542 auto m = might_have_unfound.begin();
2543 auto mend = might_have_unfound.end();
2544 for (; m != mend; ++m) {
2545 pg_shard_t peer(*m);
2546
2547 if (!get_osdmap()->is_up(peer.osd)) {
2548 psdout(20) << __func__ << " skipping down osd." << peer << dendl;
2549 continue;
2550 }
2551
2552 if (peer_purged.count(peer)) {
2553 psdout(20) << __func__ << " skipping purged osd." << peer << dendl;
2554 continue;
2555 }
2556
2557 auto iter = peer_info.find(peer);
2558 if (iter != peer_info.end() &&
2559 (iter->second.is_empty() || iter->second.dne())) {
2560 // ignore empty peers
2561 continue;
2562 }
2563
2564 // If we've requested any of this stuff, the pg_missing_t information
2565 // should be on its way.
2566 // TODO: coalesce requested_* into a single data structure
2567 if (peer_missing.find(peer) != peer_missing.end()) {
2568 psdout(20) << __func__ << ": osd." << peer
2569 << ": we already have pg_missing_t" << dendl;
2570 continue;
2571 }
2572 if (peer_log_requested.find(peer) != peer_log_requested.end()) {
2573 psdout(20) << __func__ << ": osd." << peer
2574 << ": in peer_log_requested" << dendl;
2575 continue;
2576 }
2577 if (peer_missing_requested.find(peer) != peer_missing_requested.end()) {
2578 psdout(20) << __func__ << ": osd." << peer
2579 << ": in peer_missing_requested" << dendl;
2580 continue;
2581 }
2582
2583 // Request missing
2584 psdout(10) << __func__ << ": osd." << peer << ": requesting pg_missing_t"
2585 << dendl;
2586 peer_missing_requested.insert(peer);
2587 rctx.send_query(
2588 peer.osd,
2589 spg_t(info.pgid.pgid, peer.shard),
2590 pg_query_t(
2591 pg_query_t::FULLLOG,
2592 peer.shard, pg_whoami.shard,
2593 info.history, get_osdmap_epoch()));
2594 any = true;
2595 }
2596 return any;
2597 }
2598
2599 /* Build the might_have_unfound set.
2600 *
2601 * This is used by the primary OSD during recovery.
2602 *
2603 * This set tracks the OSDs which might have unfound objects that the primary
2604 * OSD needs. As we receive pg_missing_t from each OSD in might_have_unfound, we
2605 * will remove the OSD from the set.
2606 */
2607 void PeeringState::build_might_have_unfound()
2608 {
2609 ceph_assert(might_have_unfound.empty());
2610 ceph_assert(is_primary());
2611
2612 psdout(10) << __func__ << dendl;
2613
2614 check_past_interval_bounds();
2615
2616 might_have_unfound = past_intervals.get_might_have_unfound(
2617 pg_whoami,
2618 pool.info.is_erasure());
2619
2620 // include any (stray) peers
2621 for (auto p = peer_info.begin(); p != peer_info.end(); ++p)
2622 might_have_unfound.insert(p->first);
2623
2624 psdout(15) << __func__ << ": built " << might_have_unfound << dendl;
2625 }
2626
2627 void PeeringState::activate(
2628 ObjectStore::Transaction& t,
2629 epoch_t activation_epoch,
2630 PeeringCtxWrapper &ctx)
2631 {
2632 ceph_assert(!is_peered());
2633
2634 // twiddle pg state
2635 state_clear(PG_STATE_DOWN);
2636
2637 send_notify = false;
2638
2639 if (is_primary()) {
2640 // only update primary last_epoch_started if we will go active
2641 if (acting_set_writeable()) {
2642 ceph_assert(cct->_conf->osd_find_best_info_ignore_history_les ||
2643 info.last_epoch_started <= activation_epoch);
2644 info.last_epoch_started = activation_epoch;
2645 info.last_interval_started = info.history.same_interval_since;
2646 }
2647 } else if (is_acting(pg_whoami)) {
2648 /* update last_epoch_started on acting replica to whatever the primary sent
2649 * unless it's smaller (could happen if we are going peered rather than
2650 * active, see doc/dev/osd_internals/last_epoch_started.rst) */
2651 if (info.last_epoch_started < activation_epoch) {
2652 info.last_epoch_started = activation_epoch;
2653 info.last_interval_started = info.history.same_interval_since;
2654 }
2655 }
2656
2657 auto &missing = pg_log.get_missing();
2658
2659 min_last_complete_ondisk = eversion_t(0,0); // we don't know (yet)!
2660 if (is_primary()) {
2661 last_update_ondisk = info.last_update;
2662 }
2663 last_update_applied = info.last_update;
2664 last_rollback_info_trimmed_to_applied = pg_log.get_can_rollback_to();
2665
2666 need_up_thru = false;
2667
2668 // write pg info, log
2669 dirty_info = true;
2670 dirty_big_info = true; // maybe
2671
2672 pl->schedule_event_on_commit(
2673 t,
2674 std::make_shared<PGPeeringEvent>(
2675 get_osdmap_epoch(),
2676 get_osdmap_epoch(),
2677 ActivateCommitted(
2678 get_osdmap_epoch(),
2679 activation_epoch)));
2680
2681 // init complete pointer
2682 if (missing.num_missing() == 0) {
2683 psdout(10) << "activate - no missing, moving last_complete " << info.last_complete
2684 << " -> " << info.last_update << dendl;
2685 info.last_complete = info.last_update;
2686 info.stats.stats.sum.num_objects_missing = 0;
2687 pg_log.reset_recovery_pointers();
2688 } else {
2689 psdout(10) << "activate - not complete, " << missing << dendl;
2690 info.stats.stats.sum.num_objects_missing = missing.num_missing();
2691 pg_log.activate_not_complete(info);
2692 }
2693
2694 log_weirdness();
2695
2696 if (is_primary()) {
2697 // initialize snap_trimq
2698 interval_set<snapid_t> to_trim;
2699 auto& removed_snaps_queue = get_osdmap()->get_removed_snaps_queue();
2700 auto p = removed_snaps_queue.find(info.pgid.pgid.pool());
2701 if (p != removed_snaps_queue.end()) {
2702 dout(20) << "activate - purged_snaps " << info.purged_snaps
2703 << " removed_snaps " << p->second
2704 << dendl;
2705 for (auto q : p->second) {
2706 to_trim.insert(q.first, q.second);
2707 }
2708 }
2709 interval_set<snapid_t> purged;
2710 purged.intersection_of(to_trim, info.purged_snaps);
2711 to_trim.subtract(purged);
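// e.g. (assumed intervals): if the queue contributes [1,5) and
// purged_snaps is [1,3), then purged becomes [1,3), to_trim shrinks to
// [3,5), and the swap below resets purged_snaps to the still-queued [1,3).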
2712
2713 assert(HAVE_FEATURE(upacting_features, SERVER_OCTOPUS));
2714 renew_lease(pl->get_mnow());
2715 // do not schedule until we are actually activated
2716
2717 // adjust purged_snaps: PG may have been inactive while snaps were pruned
2718 // from the removed_snaps_queue in the osdmap. update local purged_snaps to
2719 // reflect only those snaps that we thought were pruned and were still in
2720 // the queue.
2721 info.purged_snaps.swap(purged);
2722
2723 // start up replicas
2724 if (prior_readable_down_osds.empty()) {
2725 dout(10) << __func__ << " no prior_readable_down_osds to wait on, clearing ub"
2726 << dendl;
2727 clear_prior_readable_until_ub();
2728 }
2729 info.history.refresh_prior_readable_until_ub(pl->get_mnow(),
2730 prior_readable_until_ub);
2731
2732 ceph_assert(!acting_recovery_backfill.empty());
2733 for (auto i = acting_recovery_backfill.begin();
2734 i != acting_recovery_backfill.end();
2735 ++i) {
2736 if (*i == pg_whoami) continue;
2737 pg_shard_t peer = *i;
2738 ceph_assert(peer_info.count(peer));
2739 pg_info_t& pi = peer_info[peer];
2740
2741 psdout(10) << "activate peer osd." << peer << " " << pi << dendl;
2742
2743 #if defined(WITH_SEASTAR)
2744 MURef<MOSDPGLog> m;
2745 #else
2746 MRef<MOSDPGLog> m;
2747 #endif
2748 ceph_assert(peer_missing.count(peer));
2749 pg_missing_t& pm = peer_missing[peer];
2750
2751 bool needs_past_intervals = pi.dne();
2752
2753 // Save num_bytes for backfill reservation request, can't be negative
2754 peer_bytes[peer] = std::max<int64_t>(0, pi.stats.stats.sum.num_bytes);
2755
2756 if (pi.last_update == info.last_update) {
2757 // empty log
2758 if (!pi.last_backfill.is_max())
2759 pl->get_clog_info() << info.pgid << " continuing backfill to osd."
2760 << peer
2761 << " from (" << pi.log_tail << "," << pi.last_update
2762 << "] " << pi.last_backfill
2763 << " to " << info.last_update;
2764 if (!pi.is_empty()) {
2765 psdout(10) << "activate peer osd." << peer
2766 << " is up to date, queueing in pending_activators" << dendl;
2767 ctx.send_info(
2768 peer.osd,
2769 spg_t(info.pgid.pgid, peer.shard),
2770 get_osdmap_epoch(), // fixme: use lower epoch?
2771 get_osdmap_epoch(),
2772 info,
2773 get_lease());
2774 } else {
2775 psdout(10) << "activate peer osd." << peer
2776 << " is up to date, but sending pg_log anyway" << dendl;
2777 m = TOPNSPC::make_message<MOSDPGLog>(
2778 i->shard, pg_whoami.shard,
2779 get_osdmap_epoch(), info,
2780 last_peering_reset);
2781 }
2782 } else if (
2783 pg_log.get_tail() > pi.last_update ||
2784 pi.last_backfill == hobject_t() ||
2785 (backfill_targets.count(*i) && pi.last_backfill.is_max())) {
2786 /* ^ This last case covers a situation where a replica is not contiguous
2787 * with the auth_log, but is contiguous with this replica. Reshuffling
2788 * the active set to handle this would be tricky, so instead we just go
2789 * ahead and backfill it anyway. This is probably preferable in any
2790 * case since the replica in question would have to be significantly
2791 * behind.
2792 */
2793 // backfill
2794 pl->get_clog_debug() << info.pgid << " starting backfill to osd." << peer
2795 << " from (" << pi.log_tail << "," << pi.last_update
2796 << "] " << pi.last_backfill
2797 << " to " << info.last_update;
2798
2799 pi.last_update = info.last_update;
2800 pi.last_complete = info.last_update;
2801 pi.set_last_backfill(hobject_t());
2802 pi.last_epoch_started = info.last_epoch_started;
2803 pi.last_interval_started = info.last_interval_started;
2804 pi.history = info.history;
2805 pi.hit_set = info.hit_set;
2806 pi.stats.stats.clear();
2807 pi.stats.stats.sum.num_bytes = peer_bytes[peer];
2808
2809 // initialize peer with our purged_snaps.
2810 pi.purged_snaps = info.purged_snaps;
2811
2812 m = TOPNSPC::make_message<MOSDPGLog>(
2813 i->shard, pg_whoami.shard,
2814 get_osdmap_epoch(), pi,
2815 last_peering_reset /* epoch to create pg at */);
2816
2817 // send some recent log, so that op dup detection works well.
2818 m->log.copy_up_to(cct, pg_log.get_log(),
2819 cct->_conf->osd_max_pg_log_entries);
2820 m->info.log_tail = m->log.tail;
2821 pi.log_tail = m->log.tail; // sigh...
2822
2823 pm.clear();
2824 } else {
2825 // catch up
2826 ceph_assert(pg_log.get_tail() <= pi.last_update);
2827 m = TOPNSPC::make_message<MOSDPGLog>(
2828 i->shard, pg_whoami.shard,
2829 get_osdmap_epoch(), info,
2830 last_peering_reset /* epoch to create pg at */);
2831 // send new stuff to append to replicas log
2832 m->log.copy_after(cct, pg_log.get_log(), pi.last_update);
2833 }
2834
2835 // share past_intervals if we are creating the pg on the replica
2836 // based on whether our info for that peer was dne() *before*
2837 // updating pi.history in the backfill block above.
2838 if (m && needs_past_intervals)
2839 m->past_intervals = past_intervals;
2840
2841 // update local version of peer's missing list!
2842 if (m && pi.last_backfill != hobject_t()) {
2843 for (auto p = m->log.log.begin(); p != m->log.log.end(); ++p) {
2844 if (p->soid <= pi.last_backfill &&
2845 !p->is_error()) {
2846 if (perform_deletes_during_peering() && p->is_delete()) {
2847 pm.rm(p->soid, p->version);
2848 } else {
2849 pm.add_next_event(*p);
2850 }
2851 }
2852 }
2853 }
2854
2855 if (m) {
2856 dout(10) << "activate peer osd." << peer << " sending " << m->log
2857 << dendl;
2858 m->lease = get_lease();
2859 pl->send_cluster_message(peer.osd, std::move(m), get_osdmap_epoch());
2860 }
2861
2862 // peer now has
2863 pi.last_update = info.last_update;
2864
2865 // update our missing
2866 if (pm.num_missing() == 0) {
2867 pi.last_complete = pi.last_update;
2868 psdout(10) << "activate peer osd." << peer << " " << pi
2869 << " uptodate" << dendl;
2870 } else {
2871 psdout(10) << "activate peer osd." << peer << " " << pi
2872 << " missing " << pm << dendl;
2873 }
2874 }
2875
2876 // Set up missing_loc
2877 set<pg_shard_t> complete_shards;
2878 for (auto i = acting_recovery_backfill.begin();
2879 i != acting_recovery_backfill.end();
2880 ++i) {
2881 psdout(20) << __func__ << " setting up missing_loc from shard " << *i
2882 << " " << dendl;
2883 if (*i == get_primary()) {
2884 missing_loc.add_active_missing(missing);
2885 if (!missing.have_missing())
2886 complete_shards.insert(*i);
2887 } else {
2888 auto peer_missing_entry = peer_missing.find(*i);
2889 ceph_assert(peer_missing_entry != peer_missing.end());
2890 missing_loc.add_active_missing(peer_missing_entry->second);
2891 if (!peer_missing_entry->second.have_missing() &&
2892 peer_info[*i].last_backfill.is_max())
2893 complete_shards.insert(*i);
2894 }
2895 }
2896
2897 // If necessary, create might_have_unfound to help us find our unfound objects.
2898 // NOTE: It's important that we build might_have_unfound before trimming the
2899 // past intervals.
2900 might_have_unfound.clear();
2901 if (needs_recovery()) {
2902 // If only one shard has missing, we use a shortcut and add all other
2903 // shards as recovery sources. This is considered safe since the PGLogs
2904 // have been merged locally, and it covers the vast majority of use
2905 // cases, such as one OSD/host being down for a while for hardware repair.
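// For example (illustrative): in a 3-way replicated pg where exactly one
// shard has missing objects, the other two land in complete_shards, so
// complete_shards.size() + 1 == acting_recovery_backfill.size() holds and
// both are registered as recovery sources in a single batch.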
2906 if (complete_shards.size() + 1 == acting_recovery_backfill.size()) {
2907 missing_loc.add_batch_sources_info(complete_shards, ctx.handle);
2908 } else {
2909 missing_loc.add_source_info(pg_whoami, info, pg_log.get_missing(),
2910 ctx.handle);
2911 for (auto i = acting_recovery_backfill.begin();
2912 i != acting_recovery_backfill.end();
2913 ++i) {
2914 if (*i == pg_whoami) continue;
2915 psdout(10) << __func__ << ": adding " << *i << " as a source" << dendl;
2916 ceph_assert(peer_missing.count(*i));
2917 ceph_assert(peer_info.count(*i));
2918 missing_loc.add_source_info(
2919 *i,
2920 peer_info[*i],
2921 peer_missing[*i],
2922 ctx.handle);
2923 }
2924 }
2925 for (auto i = peer_missing.begin(); i != peer_missing.end(); ++i) {
2926 if (is_acting_recovery_backfill(i->first))
2927 continue;
2928 ceph_assert(peer_info.count(i->first));
2929 search_for_missing(
2930 peer_info[i->first],
2931 i->second,
2932 i->first,
2933 ctx);
2934 }
2935
2936 build_might_have_unfound();
2937
2938 // Always call now so update_calc_stats() will be accurate
2939 discover_all_missing(ctx.msgs);
2940
2941 }
2942
2943 // num_objects_degraded if calculated should reflect this too, unless no
2944 // missing and we are about to go clean.
2945 if (get_osdmap()->get_pg_size(info.pgid.pgid) > actingset.size()) {
2946 state_set(PG_STATE_UNDERSIZED);
2947 }
2948
2949 state_set(PG_STATE_ACTIVATING);
2950 pl->on_activate(std::move(to_trim));
2951 }
2952 if (acting_set_writeable()) {
2953 PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
2954 pg_log.roll_forward(rollbacker.get());
2955 }
2956 }
2957
2958 void PeeringState::share_pg_info()
2959 {
2960 psdout(10) << "share_pg_info" << dendl;
2961
2962 info.history.refresh_prior_readable_until_ub(pl->get_mnow(),
2963 prior_readable_until_ub);
2964
2965 // share new pg_info_t with replicas
2966 ceph_assert(!acting_recovery_backfill.empty());
2967 for (auto pg_shard : acting_recovery_backfill) {
2968 if (pg_shard == pg_whoami) continue;
2969 if (auto peer = peer_info.find(pg_shard); peer != peer_info.end()) {
2970 peer->second.last_epoch_started = info.last_epoch_started;
2971 peer->second.last_interval_started = info.last_interval_started;
2972 peer->second.history.merge(info.history);
2973 }
2974 auto m = TOPNSPC::make_message<MOSDPGInfo2>(spg_t{info.pgid.pgid, pg_shard.shard},
2975 info,
2976 get_osdmap_epoch(),
2977 get_osdmap_epoch(),
2978 std::optional<pg_lease_t>{get_lease()},
2979 std::nullopt);
2980 pl->send_cluster_message(pg_shard.osd, std::move(m), get_osdmap_epoch());
2981 }
2982 }
2983
2984 void PeeringState::merge_log(
2985 ObjectStore::Transaction& t, pg_info_t &oinfo, pg_log_t&& olog,
2986 pg_shard_t from)
2987 {
2988 PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
2989 pg_log.merge_log(
2990 oinfo, std::move(olog), from, info, rollbacker.get(),
2991 dirty_info, dirty_big_info);
2992 }
2993
2994 void PeeringState::rewind_divergent_log(
2995 ObjectStore::Transaction& t, eversion_t newhead)
2996 {
2997 PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
2998 pg_log.rewind_divergent_log(
2999 newhead, info, rollbacker.get(), dirty_info, dirty_big_info);
3000 }
3001
3002
3003 void PeeringState::proc_primary_info(
3004 ObjectStore::Transaction &t, const pg_info_t &oinfo)
3005 {
3006 ceph_assert(!is_primary());
3007
3008 update_history(oinfo.history);
3009 if (!info.stats.stats_invalid && info.stats.stats.sum.num_scrub_errors) {
3010 info.stats.stats.sum.num_scrub_errors = 0;
3011 info.stats.stats.sum.num_shallow_scrub_errors = 0;
3012 info.stats.stats.sum.num_deep_scrub_errors = 0;
3013 dirty_info = true;
3014 }
3015
3016 if (!(info.purged_snaps == oinfo.purged_snaps)) {
3017 psdout(10) << __func__ << " updating purged_snaps to "
3018 << oinfo.purged_snaps
3019 << dendl;
3020 info.purged_snaps = oinfo.purged_snaps;
3021 dirty_info = true;
3022 dirty_big_info = true;
3023 }
3024 }
3025
3026 void PeeringState::proc_master_log(
3027 ObjectStore::Transaction& t, pg_info_t &oinfo,
3028 pg_log_t&& olog, pg_missing_t&& omissing, pg_shard_t from)
3029 {
3030 psdout(10) << "proc_master_log for osd." << from << ": "
3031 << olog << " " << omissing << dendl;
3032 ceph_assert(!is_peered() && is_primary());
3033
3034 // merge log into our own log to build master log. no need to
3035 // make any adjustments to their missing map; we are taking their
3036 // log to be authoritative (i.e., their entries are by definition
3037 // non-divergent).
3038 merge_log(t, oinfo, std::move(olog), from);
3039 peer_info[from] = oinfo;
3040 psdout(10) << " peer osd." << from << " now " << oinfo
3041 << " " << omissing << dendl;
3042 might_have_unfound.insert(from);
3043
3044 // See doc/dev/osd_internals/last_epoch_started
3045 if (oinfo.last_epoch_started > info.last_epoch_started) {
3046 info.last_epoch_started = oinfo.last_epoch_started;
3047 dirty_info = true;
3048 }
3049 if (oinfo.last_interval_started > info.last_interval_started) {
3050 info.last_interval_started = oinfo.last_interval_started;
3051 dirty_info = true;
3052 }
3053 update_history(oinfo.history);
3054 ceph_assert(cct->_conf->osd_find_best_info_ignore_history_les ||
3055 info.last_epoch_started >= info.history.last_epoch_started);
3056
3057 peer_missing[from].claim(std::move(omissing));
3058 }
3059
3060 void PeeringState::proc_replica_log(
3061 pg_info_t &oinfo,
3062 const pg_log_t &olog,
3063 pg_missing_t&& omissing,
3064 pg_shard_t from)
3065 {
3066 psdout(10) << "proc_replica_log for osd." << from << ": "
3067 << oinfo << " " << olog << " " << omissing << dendl;
3068
3069 pg_log.proc_replica_log(oinfo, olog, omissing, from);
3070
3071 peer_info[from] = oinfo;
3072 psdout(10) << " peer osd." << from << " now "
3073 << oinfo << " " << omissing << dendl;
3074 might_have_unfound.insert(from);
3075
3076 for (auto i = omissing.get_items().begin();
3077 i != omissing.get_items().end();
3078 ++i) {
3079 psdout(20) << " after missing " << i->first
3080 << " need " << i->second.need
3081 << " have " << i->second.have << dendl;
3082 }
3083 peer_missing[from].claim(std::move(omissing));
3084 }
3085
3086 void PeeringState::fulfill_info(
3087 pg_shard_t from, const pg_query_t &query,
3088 pair<pg_shard_t, pg_info_t> &notify_info)
3089 {
3090 ceph_assert(from == primary);
3091 ceph_assert(query.type == pg_query_t::INFO);
3092
3093 // info
3094 psdout(10) << "sending info" << dendl;
3095 notify_info = make_pair(from, info);
3096 }
3097
3098 void PeeringState::fulfill_log(
3099 pg_shard_t from, const pg_query_t &query, epoch_t query_epoch)
3100 {
3101 psdout(10) << "log request from " << from << dendl;
3102 ceph_assert(from == primary);
3103 ceph_assert(query.type != pg_query_t::INFO);
3104
3105 auto mlog = TOPNSPC::make_message<MOSDPGLog>(
3106 from.shard, pg_whoami.shard,
3107 get_osdmap_epoch(),
3108 info, query_epoch);
3109 mlog->missing = pg_log.get_missing();
3110
3111 // primary -> other, when building master log
3112 if (query.type == pg_query_t::LOG) {
3113 psdout(10) << " sending info+missing+log since " << query.since
3114 << dendl;
3115 if (query.since != eversion_t() && query.since < pg_log.get_tail()) {
3116 pl->get_clog_error() << info.pgid << " got broken pg_query_t::LOG since "
3117 << query.since
3118 << " when my log.tail is " << pg_log.get_tail()
3119 << ", sending full log instead";
3120 mlog->log = pg_log.get_log(); // primary should not have requested this!!
3121 } else
3122 mlog->log.copy_after(cct, pg_log.get_log(), query.since);
3123 }
3124 else if (query.type == pg_query_t::FULLLOG) {
3125 psdout(10) << " sending info+missing+full log" << dendl;
3126 mlog->log = pg_log.get_log();
3127 }
3128
3129 psdout(10) << " sending " << mlog->log << " " << mlog->missing << dendl;
3130
3131 pl->send_cluster_message(from.osd, std::move(mlog), get_osdmap_epoch(), true);
3132 }
3133
3134 void PeeringState::fulfill_query(const MQuery& query, PeeringCtxWrapper &rctx)
3135 {
3136 if (query.query.type == pg_query_t::INFO) {
3137 pair<pg_shard_t, pg_info_t> notify_info;
3138 // note this refreshes our prior_readable_until_ub value
3139 update_history(query.query.history);
3140 fulfill_info(query.from, query.query, notify_info);
3141 rctx.send_notify(
3142 notify_info.first.osd,
3143 pg_notify_t(
3144 notify_info.first.shard, pg_whoami.shard,
3145 query.query_epoch,
3146 get_osdmap_epoch(),
3147 notify_info.second,
3148 past_intervals));
3149 } else {
3150 update_history(query.query.history);
3151 fulfill_log(query.from, query.query, query.query_epoch);
3152 }
3153 }
3154
3155 void PeeringState::try_mark_clean()
3156 {
3157 if (actingset.size() == get_osdmap()->get_pg_size(info.pgid.pgid)) {
3158 state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
3159 state_set(PG_STATE_CLEAN);
3160 info.history.last_epoch_clean = get_osdmap_epoch();
3161 info.history.last_interval_clean = info.history.same_interval_since;
3162 past_intervals.clear();
3163 dirty_big_info = true;
3164 dirty_info = true;
3165 }
3166
3167 if (!is_active() && is_peered()) {
3168 if (is_clean()) {
3169 bool target;
3170 if (pool.info.is_pending_merge(info.pgid.pgid, &target)) {
3171 if (target) {
3172 psdout(10) << "ready to merge (target)" << dendl;
3173 pl->set_ready_to_merge_target(
3174 info.last_update,
3175 info.history.last_epoch_started,
3176 info.history.last_epoch_clean);
3177 } else {
3178 psdout(10) << "ready to merge (source)" << dendl;
3179 pl->set_ready_to_merge_source(info.last_update);
3180 }
3181 }
3182 } else {
3183 psdout(10) << "not clean, not ready to merge" << dendl;
3184 // we should have notified OSD in Active state entry point
3185 }
3186 }
3187
3188 state_clear(PG_STATE_FORCED_RECOVERY | PG_STATE_FORCED_BACKFILL);
3189
3190 share_pg_info();
3191 pl->publish_stats_to_osd();
3192 clear_recovery_state();
3193 }
3194
3195 void PeeringState::split_into(
3196 pg_t child_pgid, PeeringState *child, unsigned split_bits)
3197 {
3198 child->update_osdmap_ref(get_osdmap());
3199 child->pool = pool;
3200
3201 // Log
3202 pg_log.split_into(child_pgid, split_bits, &(child->pg_log));
3203 child->info.last_complete = info.last_complete;
3204
3205 info.last_update = pg_log.get_head();
3206 child->info.last_update = child->pg_log.get_head();
3207
3208 child->info.last_user_version = info.last_user_version;
3209
3210 info.log_tail = pg_log.get_tail();
3211 child->info.log_tail = child->pg_log.get_tail();
3212
3213 // reset last_complete, we might have modified pg_log & missing above
3214 pg_log.reset_complete_to(&info);
3215 child->pg_log.reset_complete_to(&child->info);
3216
3217 // Info
3218 child->info.history = info.history;
3219 child->info.history.epoch_created = get_osdmap_epoch();
3220 child->info.purged_snaps = info.purged_snaps;
3221
3222 if (info.last_backfill.is_max()) {
3223 child->info.set_last_backfill(hobject_t::get_max());
3224 } else {
3225 // restart backfill on parent and child to be safe. we could
3226 // probably do better in the bitwise sort case, but it's more
3227 // fragile (there may be special work to do on backfill completion
3228 // in the future).
3229 info.set_last_backfill(hobject_t());
3230 child->info.set_last_backfill(hobject_t());
3231 // restarting backfill implies that the missing set is empty,
3232 // since it is only used for objects prior to last_backfill
3233 pg_log.reset_backfill();
3234 child->pg_log.reset_backfill();
3235 }
3236
3237 child->info.stats = info.stats;
3238 child->info.stats.parent_split_bits = split_bits;
3239 info.stats.stats_invalid = true;
3240 child->info.stats.stats_invalid = true;
3241 child->info.last_epoch_started = info.last_epoch_started;
3242 child->info.last_interval_started = info.last_interval_started;
3243
3244 // There can't be recovery/backfill going on now
3245 int primary, up_primary;
3246 vector<int> newup, newacting;
3247 get_osdmap()->pg_to_up_acting_osds(
3248 child->info.pgid.pgid, &newup, &up_primary, &newacting, &primary);
3249 child->init_primary_up_acting(
3250 newup,
3251 newacting,
3252 up_primary,
3253 primary);
3254 child->role = OSDMap::calc_pg_role(pg_whoami, child->acting);
3255
3256 // this comparison includes primary rank via pg_shard_t
3257 if (get_primary() != child->get_primary())
3258 child->info.history.same_primary_since = get_osdmap_epoch();
3259
3260 child->info.stats.up = newup;
3261 child->info.stats.up_primary = up_primary;
3262 child->info.stats.acting = newacting;
3263 child->info.stats.acting_primary = primary;
3264 child->info.stats.mapping_epoch = get_osdmap_epoch();
3265
3266 // History
3267 child->past_intervals = past_intervals;
3268
3269 child->on_new_interval();
3270
3271 child->send_notify = !child->is_primary();
3272
3273 child->dirty_info = true;
3274 child->dirty_big_info = true;
3275 dirty_info = true;
3276 dirty_big_info = true;
3277 }
3278
3279 void PeeringState::merge_from(
3280 map<spg_t,PeeringState *>& sources,
3281 PeeringCtx &rctx,
3282 unsigned split_bits,
3283 const pg_merge_meta_t& last_pg_merge_meta)
3284 {
3285 bool incomplete = false;
3286 if (info.last_complete != info.last_update ||
3287 info.is_incomplete() ||
3288 info.dne()) {
3289 psdout(10) << __func__ << " target incomplete" << dendl;
3290 incomplete = true;
3291 }
3292 if (last_pg_merge_meta.source_pgid != pg_t()) {
3293 if (info.pgid.pgid != last_pg_merge_meta.source_pgid.get_parent()) {
3294 psdout(10) << __func__ << " target doesn't match expected parent "
3295 << last_pg_merge_meta.source_pgid.get_parent()
3296 << " of source_pgid " << last_pg_merge_meta.source_pgid
3297 << dendl;
3298 incomplete = true;
3299 }
3300 if (info.last_update != last_pg_merge_meta.target_version) {
3301 psdout(10) << __func__ << " target version doesn't match expected "
3302 << last_pg_merge_meta.target_version << dendl;
3303 incomplete = true;
3304 }
3305 }
3306
3307 PGLog::LogEntryHandlerRef handler{pl->get_log_handler(rctx.transaction)};
3308 pg_log.roll_forward(handler.get());
3309
3310 info.last_complete = info.last_update; // to fake out trim()
3311 pg_log.reset_recovery_pointers();
3312 pg_log.trim(info.last_update, info);
3313
3314 vector<PGLog*> log_from;
3315 for (auto& i : sources) {
3316 auto& source = i.second;
3317 if (!source) {
3318 psdout(10) << __func__ << " source " << i.first << " missing" << dendl;
3319 incomplete = true;
3320 continue;
3321 }
3322 if (source->info.last_complete != source->info.last_update ||
3323 source->info.is_incomplete() ||
3324 source->info.dne()) {
3325 psdout(10) << __func__ << " source " << source->pg_whoami
3326 << " incomplete"
3327 << dendl;
3328 incomplete = true;
3329 }
3330 if (last_pg_merge_meta.source_pgid != pg_t()) {
3331 if (source->info.pgid.pgid != last_pg_merge_meta.source_pgid) {
3332 dout(10) << __func__ << " source " << source->info.pgid.pgid
3333 << " doesn't match expected source pgid "
3334 << last_pg_merge_meta.source_pgid << dendl;
3335 incomplete = true;
3336 }
3337 if (source->info.last_update != last_pg_merge_meta.source_version) {
3338 dout(10) << __func__ << " source version doesn't match expected "
3339 << last_pg_merge_meta.source_version << dendl;
3340 incomplete = true;
3341 }
3342 }
3343
3344 // prepare log
3345 PGLog::LogEntryHandlerRef handler{
3346 source->pl->get_log_handler(rctx.transaction)};
3347 source->pg_log.roll_forward(handler.get());
3348 source->info.last_complete = source->info.last_update; // to fake out trim()
3349 source->pg_log.reset_recovery_pointers();
3350 source->pg_log.trim(source->info.last_update, source->info);
3351 log_from.push_back(&source->pg_log);
3352
3353 // combine stats
3354 info.stats.add(source->info.stats);
3355
3356 // pull up last_update
3357 info.last_update = std::max(info.last_update, source->info.last_update);
3358
3359 // adopt source's PastIntervals if target has none. we can do this since
3360 // pgp_num has been reduced prior to the merge, so the OSD mappings for
3361 // the PGs are identical.
3362 if (past_intervals.empty() && !source->past_intervals.empty()) {
3363 psdout(10) << __func__ << " taking source's past_intervals" << dendl;
3364 past_intervals = source->past_intervals;
3365 }
3366 }
3367
3368 info.last_complete = info.last_update;
3369 info.log_tail = info.last_update;
3370 if (incomplete) {
3371 info.last_backfill = hobject_t();
3372 }
3373
3374 // merge logs
3375 pg_log.merge_from(log_from, info.last_update);
3376
3377 // make sure we have a meaningful last_epoch_started/clean (if we were a
3378 // placeholder)
3379 if (info.history.epoch_created == 0) {
3380 // start with (a) source's history, since these PGs *should* have been
3381 // remapped in concert with each other...
3382 info.history = sources.begin()->second->info.history;
3383
3384 // we use the last_epoch_{started,clean} we got from
3385 // the caller, which are the epochs that were reported when the PGs were
3386 // found to be ready for merge.
3387 info.history.last_epoch_clean = last_pg_merge_meta.last_epoch_clean;
3388 info.history.last_epoch_started = last_pg_merge_meta.last_epoch_started;
3389 info.last_epoch_started = last_pg_merge_meta.last_epoch_started;
3390 psdout(10) << __func__
3391 << " set les/c to " << last_pg_merge_meta.last_epoch_started << "/"
3392 << last_pg_merge_meta.last_epoch_clean
3393 << " from pool last_dec_*, source pg history was "
3394 << sources.begin()->second->info.history
3395 << dendl;
3396
3397 // above we have pulled down source's history and we need to check
3398 // history.epoch_created again to confirm that source is not a placeholder
3399 // too. (peering requires a sane history.same_interval_since value for any
3400 // non-newly created pg and below here we know we are basically iterating
3401 // back a series of past maps to fake a merge process, hence we need to
3402 // fix history.same_interval_since first so that start_peering_interval()
3403 // will not complain)
3404 if (info.history.epoch_created == 0) {
3405 dout(10) << __func__ << " both merge target and source are placeholders,"
3406 << " set sis to lec " << info.history.last_epoch_clean
3407 << dendl;
3408 info.history.same_interval_since = info.history.last_epoch_clean;
3409 }
3410
3411 // if the past_intervals start is later than last_epoch_clean, it
3412 // implies the source repeered again but the target didn't, or
3413 // that the source became clean in a later epoch than the target.
3414 // avoid the discrepancy by adjusting the interval start
3415 // backwards to match so that check_past_interval_bounds() will
3416 // not complain.
3417 auto pib = past_intervals.get_bounds();
3418 if (info.history.last_epoch_clean < pib.first) {
3419 psdout(10) << __func__ << " last_epoch_clean "
3420 << info.history.last_epoch_clean << " < past_interval start "
3421 << pib.first << ", adjusting start backwards" << dendl;
3422 past_intervals.adjust_start_backwards(info.history.last_epoch_clean);
3423 }
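// e.g., if last_epoch_clean is 200 but the oldest recorded past interval
// starts at 250, pull the interval start back to 200 so the two agree.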
3424
3425 // Similarly, if the same_interval_since value is later than
3426 // last_epoch_clean, the next interval change will result in a
3427 // past_interval start that is later than last_epoch_clean. This
3428 // can happen if we use the pg_history values from the merge
3429 // source. Adjust the same_interval_since value backwards if that
3430 // happens. (We trust the les and lec values more because they came from
3431 // the real target, whereas we stole the history value from the source.)
3432 if (info.history.last_epoch_started < info.history.same_interval_since) {
3433 psdout(10) << __func__ << " last_epoch_started "
3434 << info.history.last_epoch_started << " < same_interval_since "
3435 << info.history.same_interval_since
3436 << ", adjusting pg_history backwards" << dendl;
3437 info.history.same_interval_since = info.history.last_epoch_clean;
3438 // make sure same_{up,primary}_since are <= same_interval_since
3439 info.history.same_up_since = std::min(
3440 info.history.same_up_since, info.history.same_interval_since);
3441 info.history.same_primary_since = std::min(
3442 info.history.same_primary_since, info.history.same_interval_since);
3443 }
3444 }
3445
3446 dirty_info = true;
3447 dirty_big_info = true;
3448 }
3449
3450 void PeeringState::start_split_stats(
3451 const set<spg_t>& childpgs, vector<object_stat_sum_t> *out)
3452 {
3453 out->resize(childpgs.size() + 1);
3454 info.stats.stats.sum.split(*out);
3455 }
3456
3457 void PeeringState::finish_split_stats(
3458 const object_stat_sum_t& stats, ObjectStore::Transaction &t)
3459 {
3460 info.stats.stats.sum = stats;
3461 write_if_dirty(t);
3462 }
3463
3464 void PeeringState::update_blocked_by()
3465 {
3466 // set a max on the number of blocking peers we report. if we go
3467 // over, report a random subset. keep the result sorted.
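// (this is classic selection sampling: each element is kept with
// probability keep/(skip+keep) over the remaining slots, giving a
// uniformly random subset while preserving sorted iteration order)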
3468 unsigned keep = std::min<unsigned>(
3469 blocked_by.size(), cct->_conf->osd_max_pg_blocked_by);
3470 unsigned skip = blocked_by.size() - keep;
3471 info.stats.blocked_by.clear();
3472 info.stats.blocked_by.resize(keep);
3473 unsigned pos = 0;
3474 for (auto p = blocked_by.begin(); p != blocked_by.end() && keep > 0; ++p) {
3475 if (skip > 0 && (rand() % (skip + keep) < skip)) {
3476 --skip;
3477 } else {
3478 info.stats.blocked_by[pos++] = *p;
3479 --keep;
3480 }
3481 }
3482 }
3483
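// true if any shard in pgs carries the given EC shard id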
3484 static bool find_shard(const set<pg_shard_t> & pgs, shard_id_t shard)
3485 {
3486 for (auto& p : pgs)
3487 if (p.shard == shard)
3488 return true;
3489 return false;
3490 }
3491
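// return some member of pgs other than skip that carries the given EC
// shard id, or a default-constructed pg_shard_t if there is none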
3492 static pg_shard_t get_another_shard(const set<pg_shard_t> & pgs, pg_shard_t skip, shard_id_t shard)
3493 {
3494 for (auto& p : pgs) {
3495 if (p == skip)
3496 continue;
3497 if (p.shard == shard)
3498 return p;
3499 }
3500 return pg_shard_t();
3501 }
3502
3503 void PeeringState::update_calc_stats()
3504 {
3505 info.stats.version = info.last_update;
3506 info.stats.created = info.history.epoch_created;
3507 info.stats.last_scrub = info.history.last_scrub;
3508 info.stats.last_scrub_stamp = info.history.last_scrub_stamp;
3509 info.stats.last_deep_scrub = info.history.last_deep_scrub;
3510 info.stats.last_deep_scrub_stamp = info.history.last_deep_scrub_stamp;
3511 info.stats.last_clean_scrub_stamp = info.history.last_clean_scrub_stamp;
3512 info.stats.last_epoch_clean = info.history.last_epoch_clean;
3513
3514 info.stats.log_size = pg_log.get_head().version - pg_log.get_tail().version;
3515 info.stats.ondisk_log_size = info.stats.log_size;
3516 info.stats.log_start = pg_log.get_tail();
3517 info.stats.ondisk_log_start = pg_log.get_tail();
3518 info.stats.snaptrimq_len = pl->get_snap_trimq_size();
3519
3520 unsigned num_shards = get_osdmap()->get_pg_size(info.pgid.pgid);
3521
3522 // In the rare case that upset is too large (usually transient), use it
3523 // as the target for calculations below.
3524 unsigned target = std::max(num_shards, (unsigned)upset.size());
3525 // For undersized PGs, actingset may be larger when OSDs are out
3526 unsigned nrep = std::max(actingset.size(), upset.size());
3527 // calc num_object_copies
3528 info.stats.stats.calc_copies(std::max(target, nrep));
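// e.g., a 3-replica pool with upset={0,1} (one OSD down) and
// actingset={0,1,2} yields target=3 and nrep=3, so copies are still
// computed against the full pool size.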
3529 info.stats.stats.sum.num_objects_degraded = 0;
3530 info.stats.stats.sum.num_objects_unfound = 0;
3531 info.stats.stats.sum.num_objects_misplaced = 0;
3532 info.stats.avail_no_missing.clear();
3533 info.stats.object_location_counts.clear();
3534
3535 // We should never hit this condition, but if we end up hitting it,
3536 // make sure to update num_objects and set PG_STATE_INCONSISTENT.
3537 if (info.stats.stats.sum.num_objects < 0) {
3538 psdout(0) << __func__ << " negative num_objects = "
3539 << info.stats.stats.sum.num_objects << " setting it to 0 "
3540 << dendl;
3541 info.stats.stats.sum.num_objects = 0;
3542 state_set(PG_STATE_INCONSISTENT);
3543 }
3544
3545 if ((is_remapped() || is_undersized() || !is_clean()) &&
3546 (is_peered() || is_activating())) {
3547 psdout(20) << __func__ << " actingset " << actingset << " upset "
3548 << upset << " acting_recovery_backfill " << acting_recovery_backfill << dendl;
3549
3550 ceph_assert(!acting_recovery_backfill.empty());
3551
3552 bool estimate = false;
3553
3554 // NOTE: we only generate degraded, misplaced and unfound
3555 // values for the summation, not individual stat categories.
3556 int64_t num_objects = info.stats.stats.sum.num_objects;
3557
3558 // Objects missing from up nodes, sorted by # objects.
3559 boost::container::flat_set<pair<int64_t,pg_shard_t>> missing_target_objects;
3560 // Objects missing from nodes not in up, sorted by # objects
3561 boost::container::flat_set<pair<int64_t,pg_shard_t>> acting_source_objects;
3562
3563 // Fill missing_target_objects/acting_source_objects
3564
3565 {
3566 int64_t missing;
3567
3568 // Primary first
3569 missing = pg_log.get_missing().num_missing();
3570 ceph_assert(acting_recovery_backfill.count(pg_whoami));
3571 if (upset.count(pg_whoami)) {
3572 missing_target_objects.emplace(missing, pg_whoami);
3573 } else {
3574 acting_source_objects.emplace(missing, pg_whoami);
3575 }
3576 info.stats.stats.sum.num_objects_missing_on_primary = missing;
3577 if (missing == 0)
3578 info.stats.avail_no_missing.push_back(pg_whoami);
3579 psdout(20) << __func__ << " shard " << pg_whoami
3580 << " primary objects " << num_objects
3581 << " missing " << missing
3582 << dendl;
3583 }
3584
3585 // All other peers
3586 for (auto& peer : peer_info) {
3587 // Primary should not be in the peer_info, skip if it is.
3588 if (peer.first == pg_whoami) continue;
3589 int64_t missing = 0;
3590 int64_t peer_num_objects =
3591 std::max((int64_t)0, peer.second.stats.stats.sum.num_objects);
3592 // Backfill targets always track num_objects accurately;
3593 // all other peers track missing accurately.
3594 if (is_backfill_target(peer.first)) {
3595 missing = std::max((int64_t)0, num_objects - peer_num_objects);
3596 } else {
3597 if (peer_missing.count(peer.first)) {
3598 missing = peer_missing[peer.first].num_missing();
3599 } else {
3600 psdout(20) << __func__ << " no peer_missing found for "
3601 << peer.first << dendl;
3602 if (is_recovering()) {
3603 estimate = true;
3604 }
3605 missing = std::max((int64_t)0, num_objects - peer_num_objects);
3606 }
3607 }
3608 if (upset.count(peer.first)) {
3609 missing_target_objects.emplace(missing, peer.first);
3610 } else if (actingset.count(peer.first)) {
3611 acting_source_objects.emplace(missing, peer.first);
3612 }
3613 peer.second.stats.stats.sum.num_objects_missing = missing;
3614 if (missing == 0)
3615 info.stats.avail_no_missing.push_back(peer.first);
3616 psdout(20) << __func__ << " shard " << peer.first
3617 << " objects " << peer_num_objects
3618 << " missing " << missing
3619 << dendl;
3620 }
3621
3622 // Compute object_location_counts
3623 for (auto& ml: missing_loc.get_missing_locs()) {
3624 info.stats.object_location_counts[ml.second]++;
3625 psdout(30) << __func__ << " " << ml.first << " object_location_counts["
3626 << ml.second << "]=" << info.stats.object_location_counts[ml.second]
3627 << dendl;
3628 }
3629 int64_t not_missing = num_objects - missing_loc.get_missing_locs().size();
3630 if (not_missing) {
3631 // During recovery we know upset == actingset, and it is being populated
3632 // During backfill we know that all non-missing objects are in the actingset
3633 info.stats.object_location_counts[actingset] = not_missing;
3634 }
3635 psdout(30) << __func__ << " object_location_counts["
3636 << upset << "]=" << info.stats.object_location_counts[upset]
3637 << dendl;
3638 psdout(20) << __func__ << " object_location_counts "
3639 << info.stats.object_location_counts << dendl;
3640
3641 // A misplaced object is not stored on the correct OSD
3642 int64_t misplaced = 0;
3643 // a degraded object has fewer replicas or EC shards than the pool specifies.
3644 int64_t degraded = 0;
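// worked example for the recovering case below: with a 3-replica pool,
// 10 objects having one copy in upset and one copy elsewhere give
// missing_shards = 3 - 1 = 2, so 20 copies are short; min(2,1) = 1 of
// those per object is merely misplaced, so degraded += 10, misplaced += 10.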
3645
3646 if (is_recovering()) {
3647 for (auto& sml: missing_loc.get_missing_by_count()) {
3648 for (auto& ml: sml.second) {
3649 int missing_shards;
3650 if (sml.first == shard_id_t::NO_SHARD) {
3651 psdout(20) << __func__ << " ml " << ml.second
3652 << " upset size " << upset.size()
3653 << " up " << ml.first.up << dendl;
3654 missing_shards = (int)upset.size() - ml.first.up;
3655 } else {
3656 // Handle shards not even in upset below
3657 if (!find_shard(upset, sml.first))
3658 continue;
3659 missing_shards = std::max(0, 1 - ml.first.up);
3660 psdout(20) << __func__
3661 << " shard " << sml.first
3662 << " ml " << ml.second
3663 << " missing shards " << missing_shards << dendl;
3664 }
3665 int odegraded = ml.second * missing_shards;
3666 // Copies on other OSDs, but limited to the possible degraded count
3667 int more_osds = std::min(missing_shards, ml.first.other);
3668 int omisplaced = ml.second * more_osds;
3669 ceph_assert(omisplaced <= odegraded);
3670 odegraded -= omisplaced;
3671
3672 misplaced += omisplaced;
3673 degraded += odegraded;
3674 }
3675 }
3676
3677 psdout(20) << __func__ << " missing based degraded "
3678 << degraded << dendl;
3679 psdout(20) << __func__ << " missing based misplaced "
3680 << misplaced << dendl;
3681
3682 // Handle undersized case
3683 if (pool.info.is_replicated()) {
3684 // Add degraded for missing targets (num_objects missing)
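// e.g., target=3 with only 2 shards in upset leaves needed=1, so every
// object in the PG is short exactly one copy.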
3685 ceph_assert(target >= upset.size());
3686 unsigned needed = target - upset.size();
3687 degraded += num_objects * needed;
3688 } else {
3689 for (unsigned i = 0 ; i < num_shards; ++i) {
3690 shard_id_t shard(i);
3691
3692 if (!find_shard(upset, shard)) {
3693 pg_shard_t pgs = get_another_shard(actingset, pg_shard_t(), shard);
3694
3695 if (pgs != pg_shard_t()) {
3696 int64_t missing;
3697
3698 if (pgs == pg_whoami)
3699 missing = info.stats.stats.sum.num_objects_missing_on_primary;
3700 else
3701 missing = peer_info[pgs].stats.stats.sum.num_objects_missing;
3702
3703 degraded += missing;
3704 misplaced += std::max((int64_t)0, num_objects - missing);
3705 } else {
3706 // No shard anywhere
3707 degraded += num_objects;
3708 }
3709 }
3710 }
3711 }
3712 goto out;
3713 }
3714
3715 // Handle undersized case
3716 if (pool.info.is_replicated()) {
3717 // Add to missing_target_objects
3718 ceph_assert(target >= missing_target_objects.size());
3719 unsigned needed = target - missing_target_objects.size();
3720 if (needed)
3721 missing_target_objects.emplace(num_objects * needed, pg_shard_t(pg_shard_t::NO_OSD));
3722 } else {
3723 for (unsigned i = 0 ; i < num_shards; ++i) {
3724 shard_id_t shard(i);
3725 bool found = false;
3726 for (const auto& t : missing_target_objects) {
3727 if (std::get<1>(t).shard == shard) {
3728 found = true;
3729 break;
3730 }
3731 }
3732 if (!found)
3733 missing_target_objects.emplace(num_objects, pg_shard_t(pg_shard_t::NO_OSD, shard));
3734 }
3735 }
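// missing_target_objects now (roughly) covers every slot the pool
// expects: real up shards plus NO_OSD placeholders for absent ones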
3736
3737 for (const auto& item : missing_target_objects)
3738 psdout(20) << __func__ << " missing shard " << std::get<1>(item)
3739 << " missing= " << std::get<0>(item) << dendl;
3740 for (const auto& item : acting_source_objects)
3741 psdout(20) << __func__ << " acting shard " << std::get<1>(item)
3742 << " missing= " << std::get<0>(item) << dendl;
3743
3744 // Handle all objects not in missing for remapped
3745 // or backfill
3746 for (auto m = missing_target_objects.rbegin();
3747 m != missing_target_objects.rend(); ++m) {
3748
3749 int64_t extra_missing = -1;
3750
3751 if (pool.info.is_replicated()) {
3752 if (!acting_source_objects.empty()) {
3753 auto extra_copy = acting_source_objects.begin();
3754 extra_missing = std::get<0>(*extra_copy);
3755 acting_source_objects.erase(extra_copy);
3756 }
3757 } else { // Erasure coded
3758 // Use corresponding shard
3759 for (const auto& a : acting_source_objects) {
3760 if (std::get<1>(a).shard == std::get<1>(*m).shard) {
3761 extra_missing = std::get<0>(a);
3762 acting_source_objects.erase(a);
3763 break;
3764 }
3765 }
3766 }
3767
3768 if (extra_missing >= 0 && std::get<0>(*m) >= extra_missing) {
3769 // We don't know which of the objects on the target
3770 // are part of extra_missing, so assume they are all degraded.
3771 misplaced += std::get<0>(*m) - extra_missing;
3772 degraded += extra_missing;
3773 } else {
3774 // 1. extra_missing == -1: more targets than sources, so degraded
3775 // 2. extra_missing > std::get<0>(*m): some objects that were
3776 //    previously degraded are now present on the target.
3777 degraded += std::get<0>(*m);
3778 }
3779 }
3780 // If there are still acting that haven't been accounted for
3781 // then they are misplaced
3782 for (const auto& a : acting_source_objects) {
3783 int64_t extra_misplaced = std::max((int64_t)0, num_objects - std::get<0>(a));
3784 psdout(20) << __func__ << " extra acting misplaced " << extra_misplaced
3785 << dendl;
3786 misplaced += extra_misplaced;
3787 }
3788 out:
3789 // NOTE: Tests use these messages to verify this code
3790 psdout(20) << __func__ << " degraded " << degraded
3791 << (estimate ? " (est)": "") << dendl;
3792 psdout(20) << __func__ << " misplaced " << misplaced
3793 << (estimate ? " (est)": "")<< dendl;
3794
3795 info.stats.stats.sum.num_objects_degraded = degraded;
3796 info.stats.stats.sum.num_objects_unfound = get_num_unfound();
3797 info.stats.stats.sum.num_objects_misplaced = misplaced;
3798 }
3799 }
3800
3801 std::optional<pg_stat_t> PeeringState::prepare_stats_for_publish(
3802 const std::optional<pg_stat_t> &pg_stats_publish,
3803 const object_stat_collection_t &unstable_stats)
3804 {
3805 if (info.stats.stats.sum.num_scrub_errors) {
3806 psdout(10) << __func__ << " inconsistent due to " <<
3807 info.stats.stats.sum.num_scrub_errors << " scrub errors" << dendl;
3808 state_set(PG_STATE_INCONSISTENT);
3809 } else {
3810 state_clear(PG_STATE_INCONSISTENT);
3811 state_clear(PG_STATE_FAILED_REPAIR);
3812 }
3813
3814 utime_t now = ceph_clock_now();
3815 if (info.stats.state != state) {
3816 info.stats.last_change = now;
3817 // Optimistic estimate: if we just found out the PG is inactive,
3818 // assume it was active until now.
3819 if (!(state & PG_STATE_ACTIVE) &&
3820 (info.stats.state & PG_STATE_ACTIVE))
3821 info.stats.last_active = now;
3822
3823 if ((state & PG_STATE_ACTIVE) &&
3824 !(info.stats.state & PG_STATE_ACTIVE))
3825 info.stats.last_became_active = now;
3826 if ((state & (PG_STATE_ACTIVE|PG_STATE_PEERED)) &&
3827 !(info.stats.state & (PG_STATE_ACTIVE|PG_STATE_PEERED)))
3828 info.stats.last_became_peered = now;
3829 info.stats.state = state;
3830 }
3831
3832 update_calc_stats();
3833 if (info.stats.stats.sum.num_objects_degraded) {
3834 state_set(PG_STATE_DEGRADED);
3835 } else {
3836 state_clear(PG_STATE_DEGRADED);
3837 }
3838 update_blocked_by();
3839
3840 pg_stat_t pre_publish = info.stats;
3841 pre_publish.stats.add(unstable_stats);
3842 utime_t cutoff = now;
3843 cutoff -= cct->_conf->osd_pg_stat_report_interval_max;
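// i.e., even unchanged stats are republished once the last report is
// older than osd_pg_stat_report_interval_max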
3844
3845 // share (some of) our purged_snaps via the pg_stats. limit # of intervals
3846 // because we don't want to make the pg_stat_t structures too expensive.
3847 unsigned max = cct->_conf->osd_max_snap_prune_intervals_per_epoch;
3848 unsigned num = 0;
3849 auto i = info.purged_snaps.begin();
3850 while (num < max && i != info.purged_snaps.end()) {
3851 pre_publish.purged_snaps.insert(i.get_start(), i.get_len());
3852 ++num;
3853 ++i;
3854 }
3855 psdout(20) << __func__ << " reporting purged_snaps "
3856 << pre_publish.purged_snaps << dendl;
3857
3858 if (pg_stats_publish && pre_publish == *pg_stats_publish &&
3859 info.stats.last_fresh > cutoff) {
3860 psdout(15) << "publish_stats_to_osd " << pg_stats_publish->reported_epoch
3861 << ": no change since " << info.stats.last_fresh << dendl;
3862 return std::nullopt;
3863 } else {
3864 // update our stat summary and timestamps
3865 info.stats.reported_epoch = get_osdmap_epoch();
3866 ++info.stats.reported_seq;
3867
3868 info.stats.last_fresh = now;
3869
3870 if (info.stats.state & PG_STATE_CLEAN)
3871 info.stats.last_clean = now;
3872 if (info.stats.state & PG_STATE_ACTIVE)
3873 info.stats.last_active = now;
3874 if (info.stats.state & (PG_STATE_ACTIVE|PG_STATE_PEERED))
3875 info.stats.last_peered = now;
3876 info.stats.last_unstale = now;
3877 if ((info.stats.state & PG_STATE_DEGRADED) == 0)
3878 info.stats.last_undegraded = now;
3879 if ((info.stats.state & PG_STATE_UNDERSIZED) == 0)
3880 info.stats.last_fullsized = now;
3881
3882 psdout(15) << "publish_stats_to_osd " << pre_publish.reported_epoch
3883 << ":" << pre_publish.reported_seq << dendl;
3884 return std::make_optional(std::move(pre_publish));
3885 }
3886 }
3887
3888 void PeeringState::init(
3889 int role,
3890 const vector<int>& newup, int new_up_primary,
3891 const vector<int>& newacting, int new_acting_primary,
3892 const pg_history_t& history,
3893 const PastIntervals& pi,
3894 ObjectStore::Transaction &t)
3895 {
3896 psdout(10) << "init role " << role << " up "
3897 << newup << " acting " << newacting
3898 << " history " << history
3899 << " past_intervals " << pi
3900 << dendl;
3901
3902 set_role(role);
3903 init_primary_up_acting(
3904 newup,
3905 newacting,
3906 new_up_primary,
3907 new_acting_primary);
3908
3909 info.history = history;
3910 past_intervals = pi;
3911
3912 info.stats.up = up;
3913 info.stats.up_primary = new_up_primary;
3914 info.stats.acting = acting;
3915 info.stats.acting_primary = new_acting_primary;
3916 info.stats.mapping_epoch = info.history.same_interval_since;
3917
3918 if (!perform_deletes_during_peering()) {
3919 pg_log.set_missing_may_contain_deletes();
3920 }
3921
3922 on_new_interval();
3923
3924 dirty_info = true;
3925 dirty_big_info = true;
3926 write_if_dirty(t);
3927 }
3928
3929 void PeeringState::dump_peering_state(Formatter *f)
3930 {
3931 f->dump_string("state", get_pg_state_string());
3932 f->dump_unsigned("epoch", get_osdmap_epoch());
3933 f->open_array_section("up");
3934 for (auto p = up.begin(); p != up.end(); ++p)
3935 f->dump_unsigned("osd", *p);
3936 f->close_section();
3937 f->open_array_section("acting");
3938 for (auto p = acting.begin(); p != acting.end(); ++p)
3939 f->dump_unsigned("osd", *p);
3940 f->close_section();
3941 if (!backfill_targets.empty()) {
3942 f->open_array_section("backfill_targets");
3943 for (auto p = backfill_targets.begin(); p != backfill_targets.end(); ++p)
3944 f->dump_stream("shard") << *p;
3945 f->close_section();
3946 }
3947 if (!async_recovery_targets.empty()) {
3948 f->open_array_section("async_recovery_targets");
3949 for (auto p = async_recovery_targets.begin();
3950 p != async_recovery_targets.end();
3951 ++p)
3952 f->dump_stream("shard") << *p;
3953 f->close_section();
3954 }
3955 if (!acting_recovery_backfill.empty()) {
3956 f->open_array_section("acting_recovery_backfill");
3957 for (auto p = acting_recovery_backfill.begin();
3958 p != acting_recovery_backfill.end();
3959 ++p)
3960 f->dump_stream("shard") << *p;
3961 f->close_section();
3962 }
3963 f->open_object_section("info");
3964 update_calc_stats();
3965 info.dump(f);
3966 f->close_section();
3967
3968 f->open_array_section("peer_info");
3969 for (auto p = peer_info.begin(); p != peer_info.end(); ++p) {
3970 f->open_object_section("info");
3971 f->dump_stream("peer") << p->first;
3972 p->second.dump(f);
3973 f->close_section();
3974 }
3975 f->close_section();
3976 }
3977
3978 void PeeringState::update_stats(
3979 std::function<bool(pg_history_t &, pg_stat_t &)> f,
3980 ObjectStore::Transaction *t) {
3981 if (f(info.history, info.stats)) {
3982 pl->publish_stats_to_osd();
3983 }
3984 pl->reschedule_scrub();
3985
3986 if (t) {
3987 dirty_info = true;
3988 write_if_dirty(*t);
3989 }
3990 }
3991
3992 void PeeringState::update_stats_wo_resched(
3993 std::function<void(pg_history_t &, pg_stat_t &)> f)
3994 {
3995 f(info.history, info.stats);
3996 }
3997
3998 bool PeeringState::append_log_entries_update_missing(
3999 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
4000 ObjectStore::Transaction &t, std::optional<eversion_t> trim_to,
4001 std::optional<eversion_t> roll_forward_to)
4002 {
4003 ceph_assert(!entries.empty());
4004 ceph_assert(entries.begin()->version > info.last_update);
4005
4006 PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
4007 bool invalidate_stats =
4008 pg_log.append_new_log_entries(
4009 info.last_backfill,
4010 entries,
4011 rollbacker.get());
4012
4013 if (roll_forward_to && entries.rbegin()->soid > info.last_backfill) {
4014 pg_log.roll_forward(rollbacker.get());
4015 }
4016 if (roll_forward_to && *roll_forward_to > pg_log.get_can_rollback_to()) {
4017 pg_log.roll_forward_to(*roll_forward_to, rollbacker.get());
4018 last_rollback_info_trimmed_to_applied = *roll_forward_to;
4019 }
4020
4021 info.last_update = pg_log.get_head();
4022
4023 if (pg_log.get_missing().num_missing() == 0) {
4024 // advance last_complete since nothing else is missing!
4025 info.last_complete = info.last_update;
4026 }
4027 info.stats.stats_invalid = info.stats.stats_invalid || invalidate_stats;
4028
4029 psdout(20) << __func__ << " trim_to bool = " << bool(trim_to)
4030 << " trim_to = " << (trim_to ? *trim_to : eversion_t()) << dendl;
4031 if (trim_to)
4032 pg_log.trim(*trim_to, info);
4033 dirty_info = true;
4034 write_if_dirty(t);
4035 return invalidate_stats;
4036 }
4037
4038 void PeeringState::merge_new_log_entries(
4039 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
4040 ObjectStore::Transaction &t,
4041 std::optional<eversion_t> trim_to,
4042 std::optional<eversion_t> roll_forward_to)
4043 {
4044 psdout(10) << __func__ << " " << entries << dendl;
4045 ceph_assert(is_primary());
4046
4047 bool rebuild_missing = append_log_entries_update_missing(entries, t, trim_to, roll_forward_to);
4048 for (auto i = acting_recovery_backfill.begin();
4049 i != acting_recovery_backfill.end();
4050 ++i) {
4051 pg_shard_t peer(*i);
4052 if (peer == pg_whoami) continue;
4053 ceph_assert(peer_missing.count(peer));
4054 ceph_assert(peer_info.count(peer));
4055 pg_missing_t& pmissing(peer_missing[peer]);
4056 psdout(20) << __func__ << " peer_missing for " << peer
4057 << " = " << pmissing << dendl;
4058 pg_info_t& pinfo(peer_info[peer]);
4059 bool invalidate_stats = PGLog::append_log_entries_update_missing(
4060 pinfo.last_backfill,
4061 entries,
4062 true,
4063 NULL,
4064 pmissing,
4065 NULL,
4066 dpp);
4067 pinfo.last_update = info.last_update;
4068 pinfo.stats.stats_invalid = pinfo.stats.stats_invalid || invalidate_stats;
4069 rebuild_missing = rebuild_missing || invalidate_stats;
4070 }
4071
4072 if (!rebuild_missing) {
4073 return;
4074 }
4075
4076 for (auto &&i: entries) {
4077 missing_loc.rebuild(
4078 i.soid,
4079 pg_whoami,
4080 acting_recovery_backfill,
4081 info,
4082 pg_log.get_missing(),
4083 peer_missing,
4084 peer_info);
4085 }
4086 }
4087
4088 void PeeringState::add_log_entry(const pg_log_entry_t& e, bool applied)
4089 {
4090 // raise last_complete only if we were previously up to date
4091 if (info.last_complete == info.last_update)
4092 info.last_complete = e.version;
4093
4094 // raise last_update.
4095 ceph_assert(e.version > info.last_update);
4096 info.last_update = e.version;
4097
4098 // raise user_version if it increased (it may not get bumped
4099 // by all logged updates)
4100 if (e.user_version > info.last_user_version)
4101 info.last_user_version = e.user_version;
4102
4103 // log mutation
4104 pg_log.add(e, applied);
4105 psdout(10) << "add_log_entry " << e << dendl;
4106 }
4107
4108
4109 void PeeringState::append_log(
4110 vector<pg_log_entry_t>&& logv,
4111 eversion_t trim_to,
4112 eversion_t roll_forward_to,
4113 eversion_t mlcod,
4114 ObjectStore::Transaction &t,
4115 bool transaction_applied,
4116 bool async)
4117 {
4118 /* The primary has sent an info updating the history, but it may not
4119 * have arrived yet. We want to make sure that we cannot remember this
4120 * write without remembering that it happened in an interval which went
4121 * active in epoch history.last_epoch_started.
4122 */
4123 if (info.last_epoch_started != info.history.last_epoch_started) {
4124 info.history.last_epoch_started = info.last_epoch_started;
4125 }
4126 if (info.last_interval_started != info.history.last_interval_started) {
4127 info.history.last_interval_started = info.last_interval_started;
4128 }
4129 psdout(10) << "append_log " << pg_log.get_log() << " " << logv << dendl;
4130
4131 PGLog::LogEntryHandlerRef handler{pl->get_log_handler(t)};
4132 if (!transaction_applied) {
4133 /* We must be a backfill or async recovery peer, so it's ok if we apply
4134 * out-of-turn since we won't be considered when
4135 * determining a min possible last_update.
4136 *
4137 * We skip_rollforward() here, which advances the crt without
4138 * doing an actual rollforward. This avoids cleaning up entries
4139 * from the backend, so we do not end up in a situation where the
4140 * object is deleted before we can _merge_object_divergent_entries().
4141 */
4142 pg_log.skip_rollforward();
4143 }
4144
4145 for (auto p = logv.begin(); p != logv.end(); ++p) {
4146 add_log_entry(*p, transaction_applied);
4147
4148 /* We don't want to leave the rollforward artifacts around
4149 * here past last_backfill. It's ok for the same reason as
4150 * above */
4151 if (transaction_applied &&
4152 p->soid > info.last_backfill) {
4153 pg_log.roll_forward(handler.get());
4154 }
4155 }
4156 if (transaction_applied && roll_forward_to > pg_log.get_can_rollback_to()) {
4157 pg_log.roll_forward_to(
4158 roll_forward_to,
4159 handler.get());
4160 last_rollback_info_trimmed_to_applied = roll_forward_to;
4161 }
4162
4163 psdout(10) << __func__ << " approx pg log length = "
4164 << pg_log.get_log().approx_size() << dendl;
4165 psdout(10) << __func__ << " transaction_applied = "
4166 << transaction_applied << dendl;
4167 if (!transaction_applied || async)
4168 psdout(10) << __func__ << " " << pg_whoami
4169 << " is async_recovery or backfill target" << dendl;
4170 pg_log.trim(trim_to, info, transaction_applied, async);
4171
4172 // update the local pg, pg log
4173 dirty_info = true;
4174 write_if_dirty(t);
4175
4176 if (!is_primary())
4177 min_last_complete_ondisk = mlcod;
4178 }
4179
4180 void PeeringState::recover_got(
4181 const hobject_t &oid, eversion_t v,
4182 bool is_delete,
4183 ObjectStore::Transaction &t)
4184 {
4185 if (v > pg_log.get_can_rollback_to()) {
4186 /* This can only happen during a repair, and even then, it would
4187 * be one heck of a race. If we are repairing the object, the
4188 * write in question must be fully committed, so it's not valid
4189 * to roll it back anyway (and we'll be rolled forward shortly
4190 * anyway) */
4191 PGLog::LogEntryHandlerRef handler{pl->get_log_handler(t)};
4192 pg_log.roll_forward_to(v, handler.get());
4193 }
4194
4195 psdout(10) << "got missing " << oid << " v " << v << dendl;
4196 pg_log.recover_got(oid, v, info);
4197 if (pg_log.get_log().log.empty()) {
4198 psdout(10) << "last_complete now " << info.last_complete
4199 << " while log is empty" << dendl;
4200 } else if (pg_log.get_log().complete_to != pg_log.get_log().log.end()) {
4201 psdout(10) << "last_complete now " << info.last_complete
4202 << " log.complete_to " << pg_log.get_log().complete_to->version
4203 << dendl;
4204 } else {
4205 psdout(10) << "last_complete now " << info.last_complete
4206 << " log.complete_to at end" << dendl;
4207 // below is not true in the repair case.
4208 //assert(missing.num_missing() == 0); // otherwise, complete_to was wrong.
4209 ceph_assert(info.last_complete == info.last_update);
4210 }
4211
4212 if (is_primary()) {
4213 ceph_assert(missing_loc.needs_recovery(oid));
4214 if (!is_delete)
4215 missing_loc.add_location(oid, pg_whoami);
4216 }
4217
4218 // update pg
4219 dirty_info = true;
4220 write_if_dirty(t);
4221 }
4222
4223 void PeeringState::update_backfill_progress(
4224 const hobject_t &updated_backfill,
4225 const pg_stat_t &updated_stats,
4226 bool preserve_local_num_bytes,
4227 ObjectStore::Transaction &t) {
4228 info.set_last_backfill(updated_backfill);
4229 if (preserve_local_num_bytes) {
4230 psdout(25) << __func__ << " primary " << updated_stats.stats.sum.num_bytes
4231 << " local " << info.stats.stats.sum.num_bytes << dendl;
4232 int64_t bytes = info.stats.stats.sum.num_bytes;
4233 info.stats = updated_stats;
4234 info.stats.stats.sum.num_bytes = bytes;
4235 } else {
4236 psdout(20) << __func__ << " final " << updated_stats.stats.sum.num_bytes
4237 << " replaces local " << info.stats.stats.sum.num_bytes << dendl;
4238 info.stats = updated_stats;
4239 }
4240
4241 dirty_info = true;
4242 write_if_dirty(t);
4243 }
4244
4245 void PeeringState::adjust_purged_snaps(
4246 std::function<void(interval_set<snapid_t> &snaps)> f) {
4247 f(info.purged_snaps);
4248 dirty_info = true;
4249 dirty_big_info = true;
4250 }
4251
4252 void PeeringState::on_peer_recover(
4253 pg_shard_t peer,
4254 const hobject_t &soid,
4255 const eversion_t &version)
4256 {
4257 pl->publish_stats_to_osd();
4258 // done!
4259 peer_missing[peer].got(soid, version);
4260 missing_loc.add_location(soid, peer);
4261 }
4262
4263 void PeeringState::begin_peer_recover(
4264 pg_shard_t peer,
4265 const hobject_t soid)
4266 {
4267 peer_missing[peer].revise_have(soid, eversion_t());
4268 }
4269
4270 void PeeringState::force_object_missing(
4271 const set<pg_shard_t> &peers,
4272 const hobject_t &soid,
4273 eversion_t version)
4274 {
4275 for (auto &&peer : peers) {
4276 if (peer != primary) {
4277 peer_missing[peer].add(soid, version, eversion_t(), false);
4278 } else {
4279 pg_log.missing_add(soid, version, eversion_t());
4280 pg_log.reset_complete_to(&info);
4281 pg_log.set_last_requested(0);
4282 }
4283 }
4284
4285 missing_loc.rebuild(
4286 soid,
4287 pg_whoami,
4288 acting_recovery_backfill,
4289 info,
4290 pg_log.get_missing(),
4291 peer_missing,
4292 peer_info);
4293 }
4294
4295 void PeeringState::pre_submit_op(
4296 const hobject_t &hoid,
4297 const vector<pg_log_entry_t>& logv,
4298 eversion_t at_version)
4299 {
4300 if (at_version > eversion_t()) {
4301 for (auto &&i : get_acting_recovery_backfill()) {
4302 if (i == primary) continue;
4303 pg_info_t &pinfo = peer_info[i];
4304 // keep peer_info up to date
4305 if (pinfo.last_complete == pinfo.last_update)
4306 pinfo.last_complete = at_version;
4307 pinfo.last_update = at_version;
4308 }
4309 }
4310
4311 bool requires_missing_loc = false;
4312 for (auto &&i : get_async_recovery_targets()) {
4313 if (i == primary || !get_peer_missing(i).is_missing(hoid))
4314 continue;
4315 requires_missing_loc = true;
4316 for (auto &&entry: logv) {
4317 peer_missing[i].add_next_event(entry);
4318 }
4319 }
4320
4321 if (requires_missing_loc) {
4322 for (auto &&entry: logv) {
4323 psdout(30) << __func__ << " missing_loc before: "
4324 << missing_loc.get_locations(entry.soid) << dendl;
4325 missing_loc.add_missing(entry.soid, entry.version,
4326 eversion_t(), entry.is_delete());
4327 // clear out missing_loc
4328 missing_loc.clear_location(entry.soid);
4329 for (auto &i: get_actingset()) {
4330 if (!get_peer_missing(i).is_missing(entry.soid))
4331 missing_loc.add_location(entry.soid, i);
4332 }
4333 psdout(30) << __func__ << " missing_loc after: "
4334 << missing_loc.get_locations(entry.soid) << dendl;
4335 }
4336 }
4337 }
4338
4339 void PeeringState::recovery_committed_to(eversion_t version)
4340 {
4341 psdout(10) << __func__ << " version " << version
4342 << " now ondisk" << dendl;
4343 last_complete_ondisk = version;
4344
4345 if (last_complete_ondisk == info.last_update) {
4346 if (!is_primary()) {
4347 // Either we are a replica or backfill target.
4348 // we are fully up to date. tell the primary!
4349 pl->send_cluster_message(
4350 get_primary().osd,
4351 TOPNSPC::make_message<MOSDPGTrim>(
4352 get_osdmap_epoch(),
4353 spg_t(info.pgid.pgid, primary.shard),
4354 last_complete_ondisk),
4355 get_osdmap_epoch());
4356 } else {
4357 calc_min_last_complete_ondisk();
4358 }
4359 }
4360 }
4361
4362 void PeeringState::complete_write(eversion_t v, eversion_t lc)
4363 {
4364 last_update_ondisk = v;
4365 last_complete_ondisk = lc;
4366 calc_min_last_complete_ondisk();
4367 }
4368
4369 void PeeringState::calc_trim_to()
4370 {
4371 size_t target = pl->get_target_pg_log_entries();
4372
4373 eversion_t limit = std::min(
4374 min_last_complete_ondisk,
4375 pg_log.get_can_rollback_to());
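// never trim entries a replica might still need to roll back, nor
// entries not yet committed to disk on all replicas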
4376 if (limit != eversion_t() &&
4377 limit != pg_trim_to &&
4378 pg_log.get_log().approx_size() > target) {
4379 size_t num_to_trim = std::min(pg_log.get_log().approx_size() - target,
4380 cct->_conf->osd_pg_log_trim_max);
4381 if (num_to_trim < cct->_conf->osd_pg_log_trim_min &&
4382 cct->_conf->osd_pg_log_trim_max >= cct->_conf->osd_pg_log_trim_min) {
4383 return;
4384 }
4385 auto it = pg_log.get_log().log.begin();
4386 eversion_t new_trim_to;
4387 for (size_t i = 0; i < num_to_trim; ++i) {
4388 new_trim_to = it->version;
4389 ++it;
4390 if (new_trim_to > limit) {
4391 new_trim_to = limit;
4392 psdout(10) << "calc_trim_to trimming to min_last_complete_ondisk" << dendl;
4393 break;
4394 }
4395 }
4396 psdout(10) << "calc_trim_to " << pg_trim_to << " -> " << new_trim_to << dendl;
4397 pg_trim_to = new_trim_to;
4398 ceph_assert(pg_trim_to <= pg_log.get_head());
4399 ceph_assert(pg_trim_to <= min_last_complete_ondisk);
4400 }
4401 }
4402
4403 void PeeringState::calc_trim_to_aggressive()
4404 {
4405 size_t target = pl->get_target_pg_log_entries();
4406
4407 // limit pg log trimming up to the can_rollback_to value
4408 eversion_t limit = std::min({
4409 pg_log.get_head(),
4410 pg_log.get_can_rollback_to(),
4411 last_update_ondisk});
4412 psdout(10) << __func__ << " limit = " << limit << dendl;
4413
4414 if (limit != eversion_t() &&
4415 limit != pg_trim_to &&
4416 pg_log.get_log().approx_size() > target) {
4417 psdout(10) << __func__ << " approx pg log length = "
4418 << pg_log.get_log().approx_size() << dendl;
4419 uint64_t num_to_trim = std::min<uint64_t>(pg_log.get_log().approx_size() - target,
4420 cct->_conf->osd_pg_log_trim_max);
4421 psdout(10) << __func__ << " num_to_trim = " << num_to_trim << dendl;
4422 if (num_to_trim < cct->_conf->osd_pg_log_trim_min &&
4423 cct->_conf->osd_pg_log_trim_max >= cct->_conf->osd_pg_log_trim_min) {
4424 return;
4425 }
4426 auto it = pg_log.get_log().log.begin(); // oldest log entry
4427 auto rit = pg_log.get_log().log.rbegin();
4428 eversion_t by_n_to_keep; // found scanning from the head (newest entry)
4429 eversion_t by_n_to_trim = eversion_t::max(); // found scanning from the tail (oldest)
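// walk inward from both ends at once: by_n_to_keep ends up as the newest
// version we may trim while still keeping `target` entries, while
// by_n_to_trim caps how many entries this round may remove.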
4430 for (size_t i = 0; it != pg_log.get_log().log.end(); ++it, ++rit) {
4431 i++;
4432 if (i > target && by_n_to_keep == eversion_t()) {
4433 by_n_to_keep = rit->version;
4434 }
4435 if (i >= num_to_trim && by_n_to_trim == eversion_t::max()) {
4436 by_n_to_trim = it->version;
4437 }
4438 if (by_n_to_keep != eversion_t() &&
4439 by_n_to_trim != eversion_t::max()) {
4440 break;
4441 }
4442 }
4443
4444 if (by_n_to_keep == eversion_t()) {
4445 return;
4446 }
4447
4448 pg_trim_to = std::min({by_n_to_keep, by_n_to_trim, limit});
4449 psdout(10) << __func__ << " pg_trim_to now " << pg_trim_to << dendl;
4450 ceph_assert(pg_trim_to <= pg_log.get_head());
4451 }
4452 }
4453
4454 void PeeringState::apply_op_stats(
4455 const hobject_t &soid,
4456 const object_stat_sum_t &delta_stats)
4457 {
4458 info.stats.stats.add(delta_stats);
4459 info.stats.stats.floor(0);
4460
4461 for (auto i = get_backfill_targets().begin();
4462 i != get_backfill_targets().end();
4463 ++i) {
4464 pg_shard_t bt = *i;
4465 pg_info_t& pinfo = peer_info[bt];
4466 if (soid <= pinfo.last_backfill)
4467 pinfo.stats.stats.add(delta_stats);
4468 }
4469 }
4470
4471 void PeeringState::update_complete_backfill_object_stats(
4472 const hobject_t &hoid,
4473 const pg_stat_t &stats)
4474 {
4475 for (auto &&bt: get_backfill_targets()) {
4476 pg_info_t& pinfo = peer_info[bt];
4477 //Add stats to all peers that were missing object
4478 if (hoid > pinfo.last_backfill)
4479 pinfo.stats.add(stats);
4480 }
4481 }
4482
4483 void PeeringState::update_peer_last_backfill(
4484 pg_shard_t peer,
4485 const hobject_t &new_last_backfill)
4486 {
4487 pg_info_t &pinfo = peer_info[peer];
4488 pinfo.last_backfill = new_last_backfill;
4489 if (new_last_backfill.is_max()) {
4490 /* pinfo.stats might be wrong if we did log-based recovery on the
4491 * backfilled portion in addition to continuing backfill.
4492 */
4493 pinfo.stats = info.stats;
4494 }
4495 }
4496
4497 void PeeringState::set_revert_with_targets(
4498 const hobject_t &soid,
4499 const set<pg_shard_t> &good_peers)
4500 {
4501 for (auto &&peer: good_peers) {
4502 missing_loc.add_location(soid, peer);
4503 }
4504 }
4505
4506 void PeeringState::prepare_backfill_for_missing(
4507 const hobject_t &soid,
4508 const eversion_t &version,
4509 const vector<pg_shard_t> &targets) {
4510 for (auto &&peer: targets) {
4511 peer_missing[peer].add(soid, version, eversion_t(), false);
4512 }
4513 }
4514
4515 void PeeringState::update_hset(const pg_hit_set_history_t &hset_history)
4516 {
4517 info.hit_set = hset_history;
4518 }
4519
4520 /*------------ Peering State Machine----------------*/
4521 #undef dout_prefix
4522 #define dout_prefix (context< PeeringMachine >().dpp->gen_prefix(*_dout) \
4523 << "state<" << get_state_name() << ">: ")
4524 #undef psdout
4525 #define psdout(x) ldout(context< PeeringMachine >().cct, x)
4526
4527 #define DECLARE_LOCALS \
4528 PeeringState *ps = context< PeeringMachine >().state; \
4529 std::ignore = ps; \
4530 PeeringListener *pl = context< PeeringMachine >().pl; \
4531 std::ignore = pl
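// (assigning to std::ignore counts as a use, keeping unused-variable
// warnings quiet in handlers that only touch one of ps/pl)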
4532
4533
4534 /*------Crashed-------*/
4535 PeeringState::Crashed::Crashed(my_context ctx)
4536 : my_base(ctx),
4537 NamedState(context< PeeringMachine >().state_history, "Crashed")
4538 {
4539 context< PeeringMachine >().log_enter(state_name);
4540 ceph_abort_msg("we got a bad state machine event");
4541 }
4542
4543
4544 /*------Initial-------*/
4545 PeeringState::Initial::Initial(my_context ctx)
4546 : my_base(ctx),
4547 NamedState(context< PeeringMachine >().state_history, "Initial")
4548 {
4549 context< PeeringMachine >().log_enter(state_name);
4550 }
4551
4552 boost::statechart::result PeeringState::Initial::react(const MNotifyRec& notify)
4553 {
4554 DECLARE_LOCALS;
4555 ps->proc_replica_info(
4556 notify.from, notify.notify.info, notify.notify.epoch_sent);
4557 ps->set_last_peering_reset();
4558 return transit< Primary >();
4559 }
4560
4561 boost::statechart::result PeeringState::Initial::react(const MInfoRec& i)
4562 {
4563 DECLARE_LOCALS;
4564 ceph_assert(!ps->is_primary());
4565 post_event(i);
4566 return transit< Stray >();
4567 }
4568
4569 boost::statechart::result PeeringState::Initial::react(const MLogRec& i)
4570 {
4571 DECLARE_LOCALS;
4572 ceph_assert(!ps->is_primary());
4573 post_event(i);
4574 return transit< Stray >();
4575 }
4576
4577 void PeeringState::Initial::exit()
4578 {
4579 context< PeeringMachine >().log_exit(state_name, enter_time);
4580 DECLARE_LOCALS;
4581 utime_t dur = ceph_clock_now() - enter_time;
4582 pl->get_peering_perf().tinc(rs_initial_latency, dur);
4583 }
4584
4585 /*------Started-------*/
4586 PeeringState::Started::Started(my_context ctx)
4587 : my_base(ctx),
4588 NamedState(context< PeeringMachine >().state_history, "Started")
4589 {
4590 context< PeeringMachine >().log_enter(state_name);
4591 }
4592
4593 boost::statechart::result
4594 PeeringState::Started::react(const IntervalFlush&)
4595 {
4596 psdout(10) << "Ending blocked outgoing recovery messages" << dendl;
4597 context< PeeringMachine >().state->end_block_outgoing();
4598 return discard_event();
4599 }
4600
4601 boost::statechart::result PeeringState::Started::react(const AdvMap& advmap)
4602 {
4603 DECLARE_LOCALS;
4604 psdout(10) << "Started advmap" << dendl;
4605 ps->check_full_transition(advmap.lastmap, advmap.osdmap);
4606 if (ps->should_restart_peering(
4607 advmap.up_primary,
4608 advmap.acting_primary,
4609 advmap.newup,
4610 advmap.newacting,
4611 advmap.lastmap,
4612 advmap.osdmap)) {
4613 psdout(10) << "should_restart_peering, transitioning to Reset"
4614 << dendl;
4615 post_event(advmap);
4616 return transit< Reset >();
4617 }
4618 ps->remove_down_peer_info(advmap.osdmap);
4619 return discard_event();
4620 }
4621
4622 boost::statechart::result PeeringState::Started::react(const QueryState& q)
4623 {
4624 q.f->open_object_section("state");
4625 q.f->dump_string("name", state_name);
4626 q.f->dump_stream("enter_time") << enter_time;
4627 q.f->close_section();
4628 return discard_event();
4629 }
4630
4631 boost::statechart::result PeeringState::Started::react(const QueryUnfound& q)
4632 {
4633 q.f->dump_string("state", "Started");
4634 q.f->dump_bool("available_might_have_unfound", false);
4635 return discard_event();
4636 }
4637
4638 void PeeringState::Started::exit()
4639 {
4640 context< PeeringMachine >().log_exit(state_name, enter_time);
4641 DECLARE_LOCALS;
4642 utime_t dur = ceph_clock_now() - enter_time;
4643 pl->get_peering_perf().tinc(rs_started_latency, dur);
4644 ps->state_clear(PG_STATE_WAIT | PG_STATE_LAGGY);
4645 }
4646
4647 /*--------Reset---------*/
4648 PeeringState::Reset::Reset(my_context ctx)
4649 : my_base(ctx),
4650 NamedState(context< PeeringMachine >().state_history, "Reset")
4651 {
4652 context< PeeringMachine >().log_enter(state_name);
4653 DECLARE_LOCALS;
4654
4655 ps->flushes_in_progress = 0;
4656 ps->set_last_peering_reset();
4657 ps->log_weirdness();
4658 }
4659
4660 boost::statechart::result
4661 PeeringState::Reset::react(const IntervalFlush&)
4662 {
4663 psdout(10) << "Ending blocked outgoing recovery messages" << dendl;
4664 context< PeeringMachine >().state->end_block_outgoing();
4665 return discard_event();
4666 }
4667
4668 boost::statechart::result PeeringState::Reset::react(const AdvMap& advmap)
4669 {
4670 DECLARE_LOCALS;
4671 psdout(10) << "Reset advmap" << dendl;
4672
4673 ps->check_full_transition(advmap.lastmap, advmap.osdmap);
4674
4675 if (ps->should_restart_peering(
4676 advmap.up_primary,
4677 advmap.acting_primary,
4678 advmap.newup,
4679 advmap.newacting,
4680 advmap.lastmap,
4681 advmap.osdmap)) {
4682 psdout(10) << "should restart peering, calling start_peering_interval again"
4683 << dendl;
4684 ps->start_peering_interval(
4685 advmap.lastmap,
4686 advmap.newup, advmap.up_primary,
4687 advmap.newacting, advmap.acting_primary,
4688 context< PeeringMachine >().get_cur_transaction());
4689 }
4690 ps->remove_down_peer_info(advmap.osdmap);
4691 ps->check_past_interval_bounds();
4692 return discard_event();
4693 }
4694
4695 boost::statechart::result PeeringState::Reset::react(const ActMap&)
4696 {
4697 DECLARE_LOCALS;
4698 if (ps->should_send_notify() && ps->get_primary().osd >= 0) {
4699 ps->info.history.refresh_prior_readable_until_ub(
4700 pl->get_mnow(),
4701 ps->prior_readable_until_ub);
4702 context< PeeringMachine >().send_notify(
4703 ps->get_primary().osd,
4704 pg_notify_t(
4705 ps->get_primary().shard, ps->pg_whoami.shard,
4706 ps->get_osdmap_epoch(),
4707 ps->get_osdmap_epoch(),
4708 ps->info,
4709 ps->past_intervals));
4710 }
4711
4712 ps->update_heartbeat_peers();
4713
4714 return transit< Started >();
4715 }
4716
4717 boost::statechart::result PeeringState::Reset::react(const QueryState& q)
4718 {
4719 q.f->open_object_section("state");
4720 q.f->dump_string("name", state_name);
4721 q.f->dump_stream("enter_time") << enter_time;
4722 q.f->close_section();
4723 return discard_event();
4724 }
4725
4726 boost::statechart::result PeeringState::Reset::react(const QueryUnfound& q)
4727 {
4728 q.f->dump_string("state", "Reset");
4729 q.f->dump_bool("available_might_have_unfound", false);
4730 return discard_event();
4731 }
4732
4733 void PeeringState::Reset::exit()
4734 {
4735 context< PeeringMachine >().log_exit(state_name, enter_time);
4736 DECLARE_LOCALS;
4737 utime_t dur = ceph_clock_now() - enter_time;
4738 pl->get_peering_perf().tinc(rs_reset_latency, dur);
4739 }
4740
4741 /*-------Start---------*/
4742 PeeringState::Start::Start(my_context ctx)
4743 : my_base(ctx),
4744 NamedState(context< PeeringMachine >().state_history, "Start")
4745 {
4746 context< PeeringMachine >().log_enter(state_name);
4747
4748 DECLARE_LOCALS;
4749 if (ps->is_primary()) {
4750 psdout(1) << "transitioning to Primary" << dendl;
4751 post_event(MakePrimary());
4752 } else { //is_stray
4753 psdout(1) << "transitioning to Stray" << dendl;
4754 post_event(MakeStray());
4755 }
4756 }
4757
4758 void PeeringState::Start::exit()
4759 {
4760 context< PeeringMachine >().log_exit(state_name, enter_time);
4761 DECLARE_LOCALS;
4762 utime_t dur = ceph_clock_now() - enter_time;
4763 pl->get_peering_perf().tinc(rs_start_latency, dur);
4764 }
4765
4766 /*---------Primary--------*/
4767 PeeringState::Primary::Primary(my_context ctx)
4768 : my_base(ctx),
4769 NamedState(context< PeeringMachine >().state_history, "Started/Primary")
4770 {
4771 context< PeeringMachine >().log_enter(state_name);
4772 DECLARE_LOCALS;
4773 ceph_assert(ps->want_acting.empty());
4774
4775 // set CREATING bit until we have peered for the first time.
4776 if (ps->info.history.last_epoch_started == 0) {
4777 ps->state_set(PG_STATE_CREATING);
4778 // use the history timestamp, which ultimately comes from the
4779 // monitor in the create case.
4780 utime_t t = ps->info.history.last_scrub_stamp;
4781 ps->info.stats.last_fresh = t;
4782 ps->info.stats.last_active = t;
4783 ps->info.stats.last_change = t;
4784 ps->info.stats.last_peered = t;
4785 ps->info.stats.last_clean = t;
4786 ps->info.stats.last_unstale = t;
4787 ps->info.stats.last_undegraded = t;
4788 ps->info.stats.last_fullsized = t;
4789 ps->info.stats.last_scrub_stamp = t;
4790 ps->info.stats.last_deep_scrub_stamp = t;
4791 ps->info.stats.last_clean_scrub_stamp = t;
4792 }
4793 }
4794
4795 boost::statechart::result PeeringState::Primary::react(const MNotifyRec& notevt)
4796 {
4797 DECLARE_LOCALS;
4798 psdout(7) << "handle_pg_notify from osd." << notevt.from << dendl;
4799 ps->proc_replica_info(
4800 notevt.from, notevt.notify.info, notevt.notify.epoch_sent);
4801 return discard_event();
4802 }
4803
4804 boost::statechart::result PeeringState::Primary::react(const ActMap&)
4805 {
4806 DECLARE_LOCALS;
4807 psdout(7) << "handle ActMap primary" << dendl;
4808 pl->publish_stats_to_osd();
4809 return discard_event();
4810 }
4811
4812 boost::statechart::result PeeringState::Primary::react(
4813 const SetForceRecovery&)
4814 {
4815 DECLARE_LOCALS;
4816 ps->set_force_recovery(true);
4817 return discard_event();
4818 }
4819
4820 boost::statechart::result PeeringState::Primary::react(
4821 const UnsetForceRecovery&)
4822 {
4823 DECLARE_LOCALS;
4824 ps->set_force_recovery(false);
4825 return discard_event();
4826 }
4827
4828 boost::statechart::result PeeringState::Primary::react(
4829 const RequestScrub& evt)
4830 {
4831 DECLARE_LOCALS;
4832 if (ps->is_primary()) {
4833 pl->scrub_requested(evt.deep, evt.repair);
4834 psdout(10) << "marking for scrub" << dendl;
4835 }
4836 return discard_event();
4837 }
4838
4839 boost::statechart::result PeeringState::Primary::react(
4840 const SetForceBackfill&)
4841 {
4842 DECLARE_LOCALS;
4843 ps->set_force_backfill(true);
4844 return discard_event();
4845 }
4846
4847 boost::statechart::result PeeringState::Primary::react(
4848 const UnsetForceBackfill&)
4849 {
4850 DECLARE_LOCALS;
4851 ps->set_force_backfill(false);
4852 return discard_event();
4853 }
4854
4855 void PeeringState::Primary::exit()
4856 {
4857 context< PeeringMachine >().log_exit(state_name, enter_time);
4858 DECLARE_LOCALS;
4859 ps->want_acting.clear();
4860 utime_t dur = ceph_clock_now() - enter_time;
4861 pl->get_peering_perf().tinc(rs_primary_latency, dur);
4862 pl->clear_primary_state();
4863 ps->state_clear(PG_STATE_CREATING);
4864 }
4865
4866 /*---------Peering--------*/
4867 PeeringState::Peering::Peering(my_context ctx)
4868 : my_base(ctx),
4869 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering"),
4870 history_les_bound(false)
4871 {
4872 context< PeeringMachine >().log_enter(state_name);
4873 DECLARE_LOCALS;
4874
4875 ceph_assert(!ps->is_peered());
4876 ceph_assert(!ps->is_peering());
4877 ceph_assert(ps->is_primary());
4878 ps->state_set(PG_STATE_PEERING);
4879 }
4880
4881 boost::statechart::result PeeringState::Peering::react(const AdvMap& advmap)
4882 {
4883 DECLARE_LOCALS;
4884 psdout(10) << "Peering advmap" << dendl;
4885 if (prior_set.affected_by_map(*(advmap.osdmap), ps->dpp)) {
4886 psdout(1) << "Peering, affected_by_map, going to Reset" << dendl;
4887 post_event(advmap);
4888 return transit< Reset >();
4889 }
4890
4891 ps->adjust_need_up_thru(advmap.osdmap);
4892 ps->check_prior_readable_down_osds(advmap.osdmap);
4893
4894 return forward_event();
4895 }
4896
4897 boost::statechart::result PeeringState::Peering::react(const QueryState& q)
4898 {
4899 DECLARE_LOCALS;
4900
4901 q.f->open_object_section("state");
4902 q.f->dump_string("name", state_name);
4903 q.f->dump_stream("enter_time") << enter_time;
4904
4905 q.f->open_array_section("past_intervals");
4906 ps->past_intervals.dump(q.f);
4907 q.f->close_section();
4908
4909 q.f->open_array_section("probing_osds");
4910 for (auto p = prior_set.probe.begin(); p != prior_set.probe.end(); ++p)
4911 q.f->dump_stream("osd") << *p;
4912 q.f->close_section();
4913
4914 if (prior_set.pg_down)
4915 q.f->dump_string("blocked", "peering is blocked due to down osds");
4916
4917 q.f->open_array_section("down_osds_we_would_probe");
4918 for (auto p = prior_set.down.begin(); p != prior_set.down.end(); ++p)
4919 q.f->dump_int("osd", *p);
4920 q.f->close_section();
4921
4922 q.f->open_array_section("peering_blocked_by");
4923 for (auto p = prior_set.blocked_by.begin();
4924 p != prior_set.blocked_by.end();
4925 ++p) {
4926 q.f->open_object_section("osd");
4927 q.f->dump_int("osd", p->first);
4928 q.f->dump_int("current_lost_at", p->second);
4929 q.f->dump_string("comment", "starting or marking this osd lost may let us proceed");
4930 q.f->close_section();
4931 }
4932 q.f->close_section();
4933
4934 if (history_les_bound) {
4935 q.f->open_array_section("peering_blocked_by_detail");
4936 q.f->open_object_section("item");
4937 q.f->dump_string("detail","peering_blocked_by_history_les_bound");
4938 q.f->close_section();
4939 q.f->close_section();
4940 }
4941
4942 q.f->close_section();
4943 return forward_event();
4944 }
4945
4946 boost::statechart::result PeeringState::Peering::react(const QueryUnfound& q)
4947 {
4948 q.f->dump_string("state", "Peering");
4949 q.f->dump_bool("available_might_have_unfound", false);
4950 return discard_event();
4951 }
4952
4953 void PeeringState::Peering::exit()
4954 {
4955
4956 DECLARE_LOCALS;
4957 psdout(10) << "Leaving Peering" << dendl;
4958 context< PeeringMachine >().log_exit(state_name, enter_time);
4959 ps->state_clear(PG_STATE_PEERING);
4960 pl->clear_probe_targets();
4961
4962 utime_t dur = ceph_clock_now() - enter_time;
4963 pl->get_peering_perf().tinc(rs_peering_latency, dur);
4964 }
4965
4966
4967 /*------Backfilling-------*/
4968 PeeringState::Backfilling::Backfilling(my_context ctx)
4969 : my_base(ctx),
4970 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/Backfilling")
4971 {
4972 context< PeeringMachine >().log_enter(state_name);
4973
4974
4975 DECLARE_LOCALS;
4976 ps->backfill_reserved = true;
4977 pl->on_backfill_reserved();
4978 ps->state_clear(PG_STATE_BACKFILL_TOOFULL);
4979 ps->state_clear(PG_STATE_BACKFILL_WAIT);
4980 ps->state_set(PG_STATE_BACKFILLING);
4981 pl->publish_stats_to_osd();
4982 }
4983
4984 void PeeringState::Backfilling::backfill_release_reservations()
4985 {
4986 DECLARE_LOCALS;
4987 pl->cancel_local_background_io_reservation();
4988 for (auto it = ps->backfill_targets.begin();
4989 it != ps->backfill_targets.end();
4990 ++it) {
4991 ceph_assert(*it != ps->pg_whoami);
4992 pl->send_cluster_message(
4993 it->osd,
4994 TOPNSPC::make_message<MBackfillReserve>(
4995 MBackfillReserve::RELEASE,
4996 spg_t(ps->info.pgid.pgid, it->shard),
4997 ps->get_osdmap_epoch()),
4998 ps->get_osdmap_epoch());
4999 }
5000 }
5001
5002 void PeeringState::Backfilling::cancel_backfill()
5003 {
5004 DECLARE_LOCALS;
5005 backfill_release_reservations();
5006 pl->on_backfill_canceled();
5007 }
5008
5009 boost::statechart::result
5010 PeeringState::Backfilling::react(const Backfilled &c)
5011 {
5012 backfill_release_reservations();
5013 return transit<Recovered>();
5014 }
5015
5016 boost::statechart::result
5017 PeeringState::Backfilling::react(const DeferBackfill &c)
5018 {
5019 DECLARE_LOCALS;
5020
5021 psdout(10) << "defer backfill, retry delay " << c.delay << dendl;
5022 ps->state_set(PG_STATE_BACKFILL_WAIT);
5023 ps->state_clear(PG_STATE_BACKFILLING);
5024 cancel_backfill();
5025
5026 pl->schedule_event_after(
5027 std::make_shared<PGPeeringEvent>(
5028 ps->get_osdmap_epoch(),
5029 ps->get_osdmap_epoch(),
5030 RequestBackfill()),
5031 c.delay);
5032 return transit<NotBackfilling>();
5033 }
5034
5035 boost::statechart::result
5036 PeeringState::Backfilling::react(const UnfoundBackfill &c)
5037 {
5038 DECLARE_LOCALS;
5039 psdout(10) << "backfill has unfound, can't continue" << dendl;
5040 ps->state_set(PG_STATE_BACKFILL_UNFOUND);
5041 ps->state_clear(PG_STATE_BACKFILLING);
5042 cancel_backfill();
5043 return transit<NotBackfilling>();
5044 }
5045
5046 boost::statechart::result
5047 PeeringState::Backfilling::react(const RemoteReservationRevokedTooFull &)
5048 {
5049 DECLARE_LOCALS;
5050
5051 ps->state_set(PG_STATE_BACKFILL_TOOFULL);
5052 ps->state_clear(PG_STATE_BACKFILLING);
5053 cancel_backfill();
5054
5055 pl->schedule_event_after(
5056 std::make_shared<PGPeeringEvent>(
5057 ps->get_osdmap_epoch(),
5058 ps->get_osdmap_epoch(),
5059 RequestBackfill()),
5060 ps->cct->_conf->osd_backfill_retry_interval);
5061
5062 return transit<NotBackfilling>();
5063 }
5064
5065 boost::statechart::result
5066 PeeringState::Backfilling::react(const RemoteReservationRevoked &)
5067 {
5068 DECLARE_LOCALS;
5069 ps->state_set(PG_STATE_BACKFILL_WAIT);
5070 cancel_backfill();
5071 if (ps->needs_backfill()) {
5072 return transit<WaitLocalBackfillReserved>();
5073 } else {
5074 // raced with MOSDPGBackfill::OP_BACKFILL_FINISH, ignore
5075 return discard_event();
5076 }
5077 }
5078
5079 void PeeringState::Backfilling::exit()
5080 {
5081 context< PeeringMachine >().log_exit(state_name, enter_time);
5082 DECLARE_LOCALS;
5083 ps->backfill_reserved = false;
5084 ps->state_clear(PG_STATE_BACKFILLING);
5085 ps->state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
5086 utime_t dur = ceph_clock_now() - enter_time;
5087 pl->get_peering_perf().tinc(rs_backfilling_latency, dur);
5088 }
5089
5090 /*--WaitRemoteBackfillReserved--*/
5091
5092 PeeringState::WaitRemoteBackfillReserved::WaitRemoteBackfillReserved(my_context ctx)
5093 : my_base(ctx),
5094 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/WaitRemoteBackfillReserved"),
5095 backfill_osd_it(context< Active >().remote_shards_to_reserve_backfill.begin())
5096 {
5097 context< PeeringMachine >().log_enter(state_name);
5098 DECLARE_LOCALS;
5099
5100 ps->state_set(PG_STATE_BACKFILL_WAIT);
5101 pl->publish_stats_to_osd();
5102 post_event(RemoteBackfillReserved());
5103 }
5104
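// Reservations are acquired one peer at a time: the self-posted event
// above kicks off the first request, and each grant that arrives sends
// the next one until AllBackfillsReserved is posted.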
5105 boost::statechart::result
5106 PeeringState::WaitRemoteBackfillReserved::react(const RemoteBackfillReserved &evt)
5107 {
5108 DECLARE_LOCALS;
5109
5110 int64_t num_bytes = ps->info.stats.stats.sum.num_bytes;
5111 psdout(10) << __func__ << " num_bytes " << num_bytes << dendl;
5112 if (backfill_osd_it !=
5113 context< Active >().remote_shards_to_reserve_backfill.end()) {
5114 // The primary never backfills itself
5115 ceph_assert(*backfill_osd_it != ps->pg_whoami);
5116 pl->send_cluster_message(
5117 backfill_osd_it->osd,
5118 TOPNSPC::make_message<MBackfillReserve>(
5119 MBackfillReserve::REQUEST,
5120 spg_t(context< PeeringMachine >().spgid.pgid, backfill_osd_it->shard),
5121 ps->get_osdmap_epoch(),
5122 ps->get_backfill_priority(),
5123 num_bytes,
5124 ps->peer_bytes[*backfill_osd_it]),
5125 ps->get_osdmap_epoch());
5126 ++backfill_osd_it;
5127 } else {
5128 ps->peer_bytes.clear();
5129 post_event(AllBackfillsReserved());
5130 }
5131 return discard_event();
5132 }
5133
5134 void PeeringState::WaitRemoteBackfillReserved::exit()
5135 {
5136 context< PeeringMachine >().log_exit(state_name, enter_time);
5137 DECLARE_LOCALS;
5138
5139 utime_t dur = ceph_clock_now() - enter_time;
5140 pl->get_peering_perf().tinc(rs_waitremotebackfillreserved_latency, dur);
5141 }
5142
5143 void PeeringState::WaitRemoteBackfillReserved::retry()
5144 {
5145 DECLARE_LOCALS;
5146 pl->cancel_local_background_io_reservation();
5147
5148 // Send RELEASE to all previously acquired reservations
5149 set<pg_shard_t>::const_iterator it, begin, end;
5150 begin = context< Active >().remote_shards_to_reserve_backfill.begin();
5151 end = context< Active >().remote_shards_to_reserve_backfill.end();
5152 ceph_assert(begin != end);
5153 for (it = begin; it != backfill_osd_it; ++it) {
5154 // The primary never backfills itself
5155 ceph_assert(*it != ps->pg_whoami);
5156 pl->send_cluster_message(
5157 it->osd,
5158 TOPNSPC::make_message<MBackfillReserve>(
5159 MBackfillReserve::RELEASE,
5160 spg_t(context< PeeringMachine >().spgid.pgid, it->shard),
5161 ps->get_osdmap_epoch()),
5162 ps->get_osdmap_epoch());
5163 }
5164
5165 ps->state_clear(PG_STATE_BACKFILL_WAIT);
5166 pl->publish_stats_to_osd();
5167
5168 pl->schedule_event_after(
5169 std::make_shared<PGPeeringEvent>(
5170 ps->get_osdmap_epoch(),
5171 ps->get_osdmap_epoch(),
5172 RequestBackfill()),
5173 ps->cct->_conf->osd_backfill_retry_interval);
5174 }
5175
5176 boost::statechart::result
5177 PeeringState::WaitRemoteBackfillReserved::react(const RemoteReservationRejectedTooFull &evt)
5178 {
5179 DECLARE_LOCALS;
5180 ps->state_set(PG_STATE_BACKFILL_TOOFULL);
5181 retry();
5182 return transit<NotBackfilling>();
5183 }
5184
5185 boost::statechart::result
5186 PeeringState::WaitRemoteBackfillReserved::react(const RemoteReservationRevoked &evt)
5187 {
5188 retry();
5189 return transit<NotBackfilling>();
5190 }
5191
5192 /*--WaitLocalBackfillReserved--*/
5193 PeeringState::WaitLocalBackfillReserved::WaitLocalBackfillReserved(my_context ctx)
5194 : my_base(ctx),
5195 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/WaitLocalBackfillReserved")
5196 {
5197 context< PeeringMachine >().log_enter(state_name);
5198 DECLARE_LOCALS;
5199
5200 ps->state_set(PG_STATE_BACKFILL_WAIT);
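// ask the local async reserver for a backfill slot: the first event is
// delivered back to us when the reservation is granted, the second if
// it is preempted by higher-priority work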
5201 pl->request_local_background_io_reservation(
5202 ps->get_backfill_priority(),
5203 std::make_unique<PGPeeringEvent>(
5204 ps->get_osdmap_epoch(),
5205 ps->get_osdmap_epoch(),
5206 LocalBackfillReserved()),
5207 std::make_unique<PGPeeringEvent>(
5208 ps->get_osdmap_epoch(),
5209 ps->get_osdmap_epoch(),
5210 DeferBackfill(0.0)));
5211 pl->publish_stats_to_osd();
5212 }
5213
5214 void PeeringState::WaitLocalBackfillReserved::exit()
5215 {
5216 context< PeeringMachine >().log_exit(state_name, enter_time);
5217 DECLARE_LOCALS;
5218 utime_t dur = ceph_clock_now() - enter_time;
5219 pl->get_peering_perf().tinc(rs_waitlocalbackfillreserved_latency, dur);
5220 }
5221
5222 /*----NotBackfilling------*/
5223 PeeringState::NotBackfilling::NotBackfilling(my_context ctx)
5224 : my_base(ctx),
5225 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/NotBackfilling")
5226 {
5227 context< PeeringMachine >().log_enter(state_name);
5228 DECLARE_LOCALS;
5229 ps->state_clear(PG_STATE_REPAIR);
5230 pl->publish_stats_to_osd();
5231 }
5232
5233 boost::statechart::result PeeringState::NotBackfilling::react(const QueryUnfound& q)
5234 {
5235 DECLARE_LOCALS;
5236
5237 ps->query_unfound(q.f, "NotBackfilling");
5238 return discard_event();
5239 }
5240
5241 boost::statechart::result
5242 PeeringState::NotBackfilling::react(const RemoteBackfillReserved &evt)
5243 {
5244 return discard_event();
5245 }
5246
5247 boost::statechart::result
5248 PeeringState::NotBackfilling::react(const RemoteReservationRejectedTooFull &evt)
5249 {
5250 return discard_event();
5251 }
5252
5253 void PeeringState::NotBackfilling::exit()
5254 {
5255 context< PeeringMachine >().log_exit(state_name, enter_time);
5256
5257 DECLARE_LOCALS;
5258 ps->state_clear(PG_STATE_BACKFILL_UNFOUND);
5259 utime_t dur = ceph_clock_now() - enter_time;
5260 pl->get_peering_perf().tinc(rs_notbackfilling_latency, dur);
5261 }
5262
5263 /*----NotRecovering------*/
5264 PeeringState::NotRecovering::NotRecovering(my_context ctx)
5265 : my_base(ctx),
5266 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/NotRecovering")
5267 {
5268 context< PeeringMachine >().log_enter(state_name);
5269 DECLARE_LOCALS;
5270 ps->state_clear(PG_STATE_REPAIR);
5271 pl->publish_stats_to_osd();
5272 }
5273
5274 boost::statechart::result PeeringState::NotRecovering::react(const QueryUnfound& q)
5275 {
5276 DECLARE_LOCALS;
5277
5278 ps->query_unfound(q.f, "NotRecovering");
5279 return discard_event();
5280 }
5281
5282 void PeeringState::NotRecovering::exit()
5283 {
5284 context< PeeringMachine >().log_exit(state_name, enter_time);
5285
5286 DECLARE_LOCALS;
5287 ps->state_clear(PG_STATE_RECOVERY_UNFOUND);
5288 utime_t dur = ceph_clock_now() - enter_time;
5289 pl->get_peering_perf().tinc(rs_notrecovering_latency, dur);
5290 }
5291
5292 /*---RepNotRecovering----*/
5293 PeeringState::RepNotRecovering::RepNotRecovering(my_context ctx)
5294 : my_base(ctx),
5295 NamedState(context< PeeringMachine >().state_history, "Started/ReplicaActive/RepNotRecovering")
5296 {
5297 context< PeeringMachine >().log_enter(state_name);
5298 }
5299
5300 boost::statechart::result
5301 PeeringState::RepNotRecovering::react(const RejectTooFullRemoteReservation &evt)
5302 {
5303 DECLARE_LOCALS;
5304 ps->reject_reservation();
5305 post_event(RemoteReservationRejectedTooFull());
5306 return discard_event();
5307 }
5308
5309 void PeeringState::RepNotRecovering::exit()
5310 {
5311 context< PeeringMachine >().log_exit(state_name, enter_time);
5312 DECLARE_LOCALS;
5313 utime_t dur = ceph_clock_now() - enter_time;
5314 pl->get_peering_perf().tinc(rs_repnotrecovering_latency, dur);
5315 }
5316
5317 /*---RepWaitRecoveryReserved--*/
5318 PeeringState::RepWaitRecoveryReserved::RepWaitRecoveryReserved(my_context ctx)
5319 : my_base(ctx),
5320 NamedState(context< PeeringMachine >().state_history, "Started/ReplicaActive/RepWaitRecoveryReserved")
5321 {
5322 context< PeeringMachine >().log_enter(state_name);
5323 }
5324
5325 boost::statechart::result
5326 PeeringState::RepWaitRecoveryReserved::react(const RemoteRecoveryReserved &evt)
5327 {
5328 DECLARE_LOCALS;
5329 pl->send_cluster_message(
5330 ps->primary.osd,
5331 TOPNSPC::make_message<MRecoveryReserve>(
5332 MRecoveryReserve::GRANT,
5333 spg_t(ps->info.pgid.pgid, ps->primary.shard),
5334 ps->get_osdmap_epoch()),
5335 ps->get_osdmap_epoch());
5336 return transit<RepRecovering>();
5337 }
5338
5339 boost::statechart::result
5340 PeeringState::RepWaitRecoveryReserved::react(
5341 const RemoteReservationCanceled &evt)
5342 {
5343 DECLARE_LOCALS;
5344 pl->unreserve_recovery_space();
5345
5346 pl->cancel_remote_recovery_reservation();
5347 return transit<RepNotRecovering>();
5348 }
5349
5350 void PeeringState::RepWaitRecoveryReserved::exit()
5351 {
5352 context< PeeringMachine >().log_exit(state_name, enter_time);
5353 DECLARE_LOCALS;
5354 utime_t dur = ceph_clock_now() - enter_time;
5355 pl->get_peering_perf().tinc(rs_repwaitrecoveryreserved_latency, dur);
5356 }
5357
5358 /*-RepWaitBackfillReserved*/
5359 PeeringState::RepWaitBackfillReserved::RepWaitBackfillReserved(my_context ctx)
5360 : my_base(ctx),
5361 NamedState(context< PeeringMachine >().state_history, "Started/ReplicaActive/RepWaitBackfillReserved")
5362 {
5363 context< PeeringMachine >().log_enter(state_name);
5364 }
5365
5366 boost::statechart::result
5367 PeeringState::RepNotRecovering::react(const RequestBackfillPrio &evt)
5368 {
5369
5370 DECLARE_LOCALS;
5371
5372 if (!pl->try_reserve_recovery_space(
5373 evt.primary_num_bytes, evt.local_num_bytes)) {
5374 post_event(RejectTooFullRemoteReservation());
5375 } else {
5376 PGPeeringEventURef preempt;
5377 if (HAVE_FEATURE(ps->upacting_features, RECOVERY_RESERVATION_2)) {
5378 // older peers will interpret preemption as TOOFULL
5379 preempt = std::make_unique<PGPeeringEvent>(
5380 pl->get_osdmap_epoch(),
5381 pl->get_osdmap_epoch(),
5382 RemoteBackfillPreempted());
5383 }
5384 pl->request_remote_recovery_reservation(
5385 evt.priority,
5386 std::make_unique<PGPeeringEvent>(
5387 pl->get_osdmap_epoch(),
5388 pl->get_osdmap_epoch(),
5389 RemoteBackfillReserved()),
5390 std::move(preempt));
5391 }
5392 return transit<RepWaitBackfillReserved>();
5393 }
5394
5395 boost::statechart::result
5396 PeeringState::RepNotRecovering::react(const RequestRecoveryPrio &evt)
5397 {
5398 DECLARE_LOCALS;
5399
5400 // fall back to a local reckoning of the priority if the primary
5401 // doesn't pass one (pre-mimic compat)
5402 int prio = evt.priority ? evt.priority : ps->get_recovery_priority();
5403
5404 PGPeeringEventURef preempt;
5405 if (HAVE_FEATURE(ps->upacting_features, RECOVERY_RESERVATION_2)) {
5406 // older peers can't handle this
5407 preempt = std::make_unique<PGPeeringEvent>(
5408 ps->get_osdmap_epoch(),
5409 ps->get_osdmap_epoch(),
5410 RemoteRecoveryPreempted());
5411 }
5412
5413 pl->request_remote_recovery_reservation(
5414 prio,
5415 std::make_unique<PGPeeringEvent>(
5416 ps->get_osdmap_epoch(),
5417 ps->get_osdmap_epoch(),
5418 RemoteRecoveryReserved()),
5419 std::move(preempt));
5420 return transit<RepWaitRecoveryReserved>();
5421 }
5422
5423 void PeeringState::RepWaitBackfillReserved::exit()
5424 {
5425 context< PeeringMachine >().log_exit(state_name, enter_time);
5426 DECLARE_LOCALS;
5427 utime_t dur = ceph_clock_now() - enter_time;
5428 pl->get_peering_perf().tinc(rs_repwaitbackfillreserved_latency, dur);
5429 }
5430
5431 boost::statechart::result
5432 PeeringState::RepWaitBackfillReserved::react(const RemoteBackfillReserved &evt)
5433 {
5434 DECLARE_LOCALS;
5435
5436
5437 pl->send_cluster_message(
5438 ps->primary.osd,
5439 TOPNSPC::make_message<MBackfillReserve>(
5440 MBackfillReserve::GRANT,
5441 spg_t(ps->info.pgid.pgid, ps->primary.shard),
5442 ps->get_osdmap_epoch()),
5443 ps->get_osdmap_epoch());
5444 return transit<RepRecovering>();
5445 }
5446
5447 boost::statechart::result
5448 PeeringState::RepWaitBackfillReserved::react(
5449 const RejectTooFullRemoteReservation &evt)
5450 {
5451 DECLARE_LOCALS;
5452 ps->reject_reservation();
5453 post_event(RemoteReservationRejectedTooFull());
5454 return discard_event();
5455 }
5456
5457 boost::statechart::result
5458 PeeringState::RepWaitBackfillReserved::react(
5459 const RemoteReservationRejectedTooFull &evt)
5460 {
5461 DECLARE_LOCALS;
5462 pl->unreserve_recovery_space();
5463
5464 pl->cancel_remote_recovery_reservation();
5465 return transit<RepNotRecovering>();
5466 }
5467
5468 boost::statechart::result
5469 PeeringState::RepWaitBackfillReserved::react(
5470 const RemoteReservationCanceled &evt)
5471 {
5472 DECLARE_LOCALS;
5473 pl->unreserve_recovery_space();
5474
5475 pl->cancel_remote_recovery_reservation();
5476 return transit<RepNotRecovering>();
5477 }
5478
5479 /*---RepRecovering-------*/
5480 PeeringState::RepRecovering::RepRecovering(my_context ctx)
5481 : my_base(ctx),
5482 NamedState(context< PeeringMachine >().state_history, "Started/ReplicaActive/RepRecovering")
5483 {
5484 context< PeeringMachine >().log_enter(state_name);
5485 }
5486
5487 boost::statechart::result
5488 PeeringState::RepRecovering::react(const RemoteRecoveryPreempted &)
5489 {
5490 DECLARE_LOCALS;
5491
5492
5493 pl->unreserve_recovery_space();
5494 pl->send_cluster_message(
5495 ps->primary.osd,
5496 TOPNSPC::make_message<MRecoveryReserve>(
5497 MRecoveryReserve::REVOKE,
5498 spg_t(ps->info.pgid.pgid, ps->primary.shard),
5499 ps->get_osdmap_epoch()),
5500 ps->get_osdmap_epoch());
5501 return discard_event();
5502 }
5503
5504 boost::statechart::result
5505 PeeringState::RepRecovering::react(const BackfillTooFull &)
5506 {
5507 DECLARE_LOCALS;
5508
5509
5510 pl->unreserve_recovery_space();
5511 pl->send_cluster_message(
5512 ps->primary.osd,
5513 TOPNSPC::make_message<MBackfillReserve>(
5514 MBackfillReserve::REVOKE_TOOFULL,
5515 spg_t(ps->info.pgid.pgid, ps->primary.shard),
5516 ps->get_osdmap_epoch()),
5517 ps->get_osdmap_epoch());
5518 return discard_event();
5519 }
5520
5521 boost::statechart::result
5522 PeeringState::RepRecovering::react(const RemoteBackfillPreempted &)
5523 {
5524 DECLARE_LOCALS;
5525
5526
5527 pl->unreserve_recovery_space();
5528 pl->send_cluster_message(
5529 ps->primary.osd,
5530 TOPNSPC::make_message<MBackfillReserve>(
5531 MBackfillReserve::REVOKE,
5532 spg_t(ps->info.pgid.pgid, ps->primary.shard),
5533 ps->get_osdmap_epoch()),
5534 ps->get_osdmap_epoch());
5535 return discard_event();
5536 }
5537
5538 void PeeringState::RepRecovering::exit()
5539 {
5540 context< PeeringMachine >().log_exit(state_name, enter_time);
5541 DECLARE_LOCALS;
5542 pl->unreserve_recovery_space();
5543
5544 pl->cancel_remote_recovery_reservation();
5545 utime_t dur = ceph_clock_now() - enter_time;
5546 pl->get_peering_perf().tinc(rs_reprecovering_latency, dur);
5547 }
5548
5549 /*------Activating--------*/
5550 PeeringState::Activating::Activating(my_context ctx)
5551 : my_base(ctx),
5552 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/Activating")
5553 {
5554 context< PeeringMachine >().log_enter(state_name);
5555 }
5556
5557 void PeeringState::Activating::exit()
5558 {
5559 context< PeeringMachine >().log_exit(state_name, enter_time);
5560 DECLARE_LOCALS;
5561 utime_t dur = ceph_clock_now() - enter_time;
5562 pl->get_peering_perf().tinc(rs_activating_latency, dur);
5563 }
5564
5565 PeeringState::WaitLocalRecoveryReserved::WaitLocalRecoveryReserved(my_context ctx)
5566 : my_base(ctx),
5567 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/WaitLocalRecoveryReserved")
5568 {
5569 context< PeeringMachine >().log_enter(state_name);
5570 DECLARE_LOCALS;
5571
5572 // Make sure all nodes that are part of the recovery aren't full
5573 if (!ps->cct->_conf->osd_debug_skip_full_check_in_recovery &&
5574 ps->get_osdmap()->check_full(ps->acting_recovery_backfill)) {
5575 post_event(RecoveryTooFull());
5576 return;
5577 }
5578
5579 ps->state_clear(PG_STATE_RECOVERY_TOOFULL);
5580 ps->state_set(PG_STATE_RECOVERY_WAIT);
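// request a local recovery slot: LocalRecoveryReserved is delivered on
// grant, DeferRecovery(0.0) if the reservation is preempted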
5581 pl->request_local_background_io_reservation(
5582 ps->get_recovery_priority(),
5583 std::make_unique<PGPeeringEvent>(
5584 ps->get_osdmap_epoch(),
5585 ps->get_osdmap_epoch(),
5586 LocalRecoveryReserved()),
5587 std::make_unique<PGPeeringEvent>(
5588 ps->get_osdmap_epoch(),
5589 ps->get_osdmap_epoch(),
5590 DeferRecovery(0.0)));
5591 pl->publish_stats_to_osd();
5592 }
5593
5594 boost::statechart::result
5595 PeeringState::WaitLocalRecoveryReserved::react(const RecoveryTooFull &evt)
5596 {
5597 DECLARE_LOCALS;
5598 ps->state_set(PG_STATE_RECOVERY_TOOFULL);
5599 pl->schedule_event_after(
5600 std::make_shared<PGPeeringEvent>(
5601 ps->get_osdmap_epoch(),
5602 ps->get_osdmap_epoch(),
5603 DoRecovery()),
5604 ps->cct->_conf->osd_recovery_retry_interval);
5605 return transit<NotRecovering>();
5606 }
5607
5608 void PeeringState::WaitLocalRecoveryReserved::exit()
5609 {
5610 context< PeeringMachine >().log_exit(state_name, enter_time);
5611 DECLARE_LOCALS;
5612 utime_t dur = ceph_clock_now() - enter_time;
5613 pl->get_peering_perf().tinc(rs_waitlocalrecoveryreserved_latency, dur);
5614 }
5615
5616 PeeringState::WaitRemoteRecoveryReserved::WaitRemoteRecoveryReserved(my_context ctx)
5617 : my_base(ctx),
5618 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/WaitRemoteRecoveryReserved"),
5619 remote_recovery_reservation_it(context< Active >().remote_shards_to_reserve_recovery.begin())
5620 {
5621 context< PeeringMachine >().log_enter(state_name);
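// seed the request loop: each RemoteRecoveryReserved grant advances the
// iterator below until every remote shard has been reserved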
5622 post_event(RemoteRecoveryReserved());
5623 }
5624
5625 boost::statechart::result
5626 PeeringState::WaitRemoteRecoveryReserved::react(const RemoteRecoveryReserved &evt) {
5627 DECLARE_LOCALS;
5628
5629 if (remote_recovery_reservation_it !=
5630 context< Active >().remote_shards_to_reserve_recovery.end()) {
5631 ceph_assert(*remote_recovery_reservation_it != ps->pg_whoami);
5632 pl->send_cluster_message(
5633 remote_recovery_reservation_it->osd,
5634 TOPNSPC::make_message<MRecoveryReserve>(
5635 MRecoveryReserve::REQUEST,
5636 spg_t(context< PeeringMachine >().spgid.pgid,
5637 remote_recovery_reservation_it->shard),
5638 ps->get_osdmap_epoch(),
5639 ps->get_recovery_priority()),
5640 ps->get_osdmap_epoch());
5641 ++remote_recovery_reservation_it;
5642 } else {
5643 post_event(AllRemotesReserved());
5644 }
5645 return discard_event();
5646 }
5647
5648 void PeeringState::WaitRemoteRecoveryReserved::exit()
5649 {
5650 context< PeeringMachine >().log_exit(state_name, enter_time);
5651 DECLARE_LOCALS;
5652 utime_t dur = ceph_clock_now() - enter_time;
5653 pl->get_peering_perf().tinc(rs_waitremoterecoveryreserved_latency, dur);
5654 }
5655
5656 PeeringState::Recovering::Recovering(my_context ctx)
5657 : my_base(ctx),
5658 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/Recovering")
5659 {
5660 context< PeeringMachine >().log_enter(state_name);
5661
5662 DECLARE_LOCALS;
5663 ps->state_clear(PG_STATE_RECOVERY_WAIT);
5664 ps->state_clear(PG_STATE_RECOVERY_TOOFULL);
5665 ps->state_set(PG_STATE_RECOVERING);
5666 pl->on_recovery_reserved();
5667 ceph_assert(!ps->state_test(PG_STATE_ACTIVATING));
5668 pl->publish_stats_to_osd();
5669 }
5670
5671 void PeeringState::Recovering::release_reservations(bool cancel)
5672 {
5673 DECLARE_LOCALS;
5674 ceph_assert(cancel || !ps->pg_log.get_missing().have_missing());
5675
5676 // release remote reservations
5677 for (auto i = context< Active >().remote_shards_to_reserve_recovery.begin();
5678 i != context< Active >().remote_shards_to_reserve_recovery.end();
5679 ++i) {
5680 if (*i == ps->pg_whoami) // skip myself
5681 continue;
5682 pl->send_cluster_message(
5683 i->osd,
5684 TOPNSPC::make_message<MRecoveryReserve>(
5685 MRecoveryReserve::RELEASE,
5686 spg_t(ps->info.pgid.pgid, i->shard),
5687 ps->get_osdmap_epoch()),
5688 ps->get_osdmap_epoch());
5689 }
5690 }
5691
5692 boost::statechart::result
5693 PeeringState::Recovering::react(const AllReplicasRecovered &evt)
5694 {
5695 DECLARE_LOCALS;
5696 ps->state_clear(PG_STATE_FORCED_RECOVERY);
5697 release_reservations();
5698 pl->cancel_local_background_io_reservation();
5699 return transit<Recovered>();
5700 }
5701
5702 boost::statechart::result
5703 PeeringState::Recovering::react(const RequestBackfill &evt)
5704 {
5705 DECLARE_LOCALS;
5706
5707 release_reservations();
5708
5709 ps->state_clear(PG_STATE_FORCED_RECOVERY);
5710 pl->cancel_local_background_io_reservation();
5711 pl->publish_stats_to_osd();
5712 // move any async_recovery_targets back into the acting set now, so
5713 // the pg won't have to stay undersized while backfill (which may take
5714 // a long time) completes.
5715 if (!ps->async_recovery_targets.empty()) {
5716 pg_shard_t auth_log_shard;
5717 bool history_les_bound = false;
5718 // FIXME: Uh-oh we have to check this return value; choose_acting can fail!
5719 ps->choose_acting(auth_log_shard, true, &history_les_bound);
5720 }
5721 return transit<WaitLocalBackfillReserved>();
5722 }
5723
5724 boost::statechart::result
5725 PeeringState::Recovering::react(const DeferRecovery &evt)
5726 {
5727 DECLARE_LOCALS;
5728 if (!ps->state_test(PG_STATE_RECOVERING)) {
5729 // we may have finished recovery and have an AllReplicasRecovered
5730 // event queued to move us to the next state.
5731 psdout(10) << "got defer recovery but not recovering" << dendl;
5732 return discard_event();
5733 }
5734 psdout(10) << "defer recovery, retry delay " << evt.delay << dendl;
5735 ps->state_set(PG_STATE_RECOVERY_WAIT);
5736 pl->cancel_local_background_io_reservation();
5737 release_reservations(true);
5738 pl->schedule_event_after(
5739 std::make_shared<PGPeeringEvent>(
5740 ps->get_osdmap_epoch(),
5741 ps->get_osdmap_epoch(),
5742 DoRecovery()),
5743 evt.delay);
5744 return transit<NotRecovering>();
5745 }
5746
5747 boost::statechart::result
5748 PeeringState::Recovering::react(const UnfoundRecovery &evt)
5749 {
5750 DECLARE_LOCALS;
5751 psdout(10) << "recovery has unfound, can't continue" << dendl;
5752 ps->state_set(PG_STATE_RECOVERY_UNFOUND);
5753 pl->cancel_local_background_io_reservation();
5754 release_reservations(true);
5755 return transit<NotRecovering>();
5756 }
5757
5758 void PeeringState::Recovering::exit()
5759 {
5760 context< PeeringMachine >().log_exit(state_name, enter_time);
5761
5762 DECLARE_LOCALS;
5763 utime_t dur = ceph_clock_now() - enter_time;
5764 ps->state_clear(PG_STATE_RECOVERING);
5765 pl->get_peering_perf().tinc(rs_recovering_latency, dur);
5766 }
5767
5768 PeeringState::Recovered::Recovered(my_context ctx)
5769 : my_base(ctx),
5770 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/Recovered")
5771 {
5772 pg_shard_t auth_log_shard;
5773
5774 context< PeeringMachine >().log_enter(state_name);
5775
5776 DECLARE_LOCALS;
5777
5778 ceph_assert(!ps->needs_recovery());
5779
5780 // if we finished backfill, all acting are active; recheck if
5781 // DEGRADED | UNDERSIZED is appropriate.
5782 ceph_assert(!ps->acting_recovery_backfill.empty());
5783 if (ps->get_osdmap()->get_pg_size(context< PeeringMachine >().spgid.pgid) <=
5784 ps->acting_recovery_backfill.size()) {
5785 ps->state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
5786 pl->publish_stats_to_osd();
5787 }
5788
5789 // adjust acting set? (e.g. because backfill completed...)
5790 bool history_les_bound = false;
5791 if (ps->acting != ps->up && !ps->choose_acting(auth_log_shard,
5792 true, &history_les_bound)) {
5793 ceph_assert(ps->want_acting.size());
5794 } else if (!ps->async_recovery_targets.empty()) {
5795 // FIXME: Uh-oh we have to check this return value; choose_acting can fail!
5796 ps->choose_acting(auth_log_shard, true, &history_les_bound);
5797 }
5798
5799 if (context< Active >().all_replicas_activated &&
5800 ps->async_recovery_targets.empty())
5801 post_event(GoClean());
5802 }
5803
5804 void PeeringState::Recovered::exit()
5805 {
5806 context< PeeringMachine >().log_exit(state_name, enter_time);
5807 DECLARE_LOCALS;
5808
5809 utime_t dur = ceph_clock_now() - enter_time;
5810 pl->get_peering_perf().tinc(rs_recovered_latency, dur);
5811 }
5812
5813 PeeringState::Clean::Clean(my_context ctx)
5814 : my_base(ctx),
5815 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/Clean")
5816 {
5817 context< PeeringMachine >().log_enter(state_name);
5818
5819 DECLARE_LOCALS;
5820
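// a pg may only enter Clean once recovery has fully caught up, i.e.
// last_complete has reached last_update; anything else is a bug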
5821 if (ps->info.last_complete != ps->info.last_update) {
5822 ceph_abort();
5823 }
5824
5825
5826 ps->try_mark_clean();
5827
5828 context< PeeringMachine >().get_cur_transaction().register_on_commit(
5829 pl->on_clean());
5830 }
5831
5832 void PeeringState::Clean::exit()
5833 {
5834 context< PeeringMachine >().log_exit(state_name, enter_time);
5835
5836 DECLARE_LOCALS;
5837 ps->state_clear(PG_STATE_CLEAN);
5838 utime_t dur = ceph_clock_now() - enter_time;
5839 pl->get_peering_perf().tinc(rs_clean_latency, dur);
5840 }
5841
5842 template <typename T>
5843 set<pg_shard_t> unique_osd_shard_set(const pg_shard_t & skip, const T &in)
5844 {
5845 set<int> osds_found;
5846 set<pg_shard_t> out;
5847 for (auto i = in.begin(); i != in.end(); ++i) {
5848 if (*i != skip && !osds_found.count(i->osd)) {
5849 osds_found.insert(i->osd);
5850 out.insert(*i);
5851 }
5852 }
5853 return out;
5854 }
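// A minimal usage sketch (editorial illustration, hence not compiled):
// dedup by OSD id while skipping ourselves.
#if 0
set<pg_shard_t> in = {
  pg_shard_t(0, shard_id_t(0)),  // pg_whoami -- skipped
  pg_shard_t(1, shard_id_t(0)),  // kept
  pg_shard_t(1, shard_id_t(1)),  // same OSD as above -- dropped
  pg_shard_t(2, shard_id_t(0)),  // kept
};
// yields { 1(0), 2(0) }: at most one shard per distinct OSD
auto out = unique_osd_shard_set(pg_shard_t(0, shard_id_t(0)), in);
#endif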
5855
5856 /*---------Active---------*/
5857 PeeringState::Active::Active(my_context ctx)
5858 : my_base(ctx),
5859 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active"),
5860 remote_shards_to_reserve_recovery(
5861 unique_osd_shard_set(
5862 context< PeeringMachine >().state->pg_whoami,
5863 context< PeeringMachine >().state->acting_recovery_backfill)),
5864 remote_shards_to_reserve_backfill(
5865 unique_osd_shard_set(
5866 context< PeeringMachine >().state->pg_whoami,
5867 context< PeeringMachine >().state->backfill_targets)),
5868 all_replicas_activated(false)
5869 {
5870 context< PeeringMachine >().log_enter(state_name);
5871
5872
5873 DECLARE_LOCALS;
5874
5875 ceph_assert(!ps->backfill_reserved);
5876 ceph_assert(ps->is_primary());
5877 psdout(10) << "In Active, about to call activate" << dendl;
5878 ps->start_flush(context< PeeringMachine >().get_cur_transaction());
5879 ps->activate(context< PeeringMachine >().get_cur_transaction(),
5880 ps->get_osdmap_epoch(),
5881 context< PeeringMachine >().get_recovery_ctx());
5882
5883 // everyone has to commit/ack before we are truly active
5884 ps->blocked_by.clear();
5885 for (auto p = ps->acting_recovery_backfill.begin();
5886 p != ps->acting_recovery_backfill.end();
5887 ++p) {
5888 if (p->shard != ps->pg_whoami.shard) {
5889 ps->blocked_by.insert(p->shard);
5890 }
5891 }
5892 pl->publish_stats_to_osd();
5893 psdout(10) << "Activate Finished" << dendl;
5894 }
5895
5896 boost::statechart::result PeeringState::Active::react(const AdvMap& advmap)
5897 {
5898 DECLARE_LOCALS;
5899
5900 if (ps->should_restart_peering(
5901 advmap.up_primary,
5902 advmap.acting_primary,
5903 advmap.newup,
5904 advmap.newacting,
5905 advmap.lastmap,
5906 advmap.osdmap)) {
5907 psdout(10) << "Active advmap interval change, fast return" << dendl;
5908 return forward_event();
5909 }
5910 psdout(10) << "Active advmap" << dendl;
5911 bool need_publish = false;
5912
5913 pl->on_active_advmap(advmap.osdmap);
5914 if (ps->dirty_big_info) {
5915 // share updated purged_snaps with the mgr/mon so that we (a) stop
5916 // reporting already-purged snaps and (b) perhaps share more snaps
5917 // that we have purged but that didn't fit in pg_stat_t.
5918 need_publish = true;
5919 ps->share_pg_info();
5920 }
5921
5922 bool need_acting_change = false;
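// if an osd we want in the acting set has gone down (and is not already
// up or acting under some other shard), we need to recompute the acting
// set below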
5923 for (size_t i = 0; i < ps->want_acting.size(); i++) {
5924 int osd = ps->want_acting[i];
5925 if (!advmap.osdmap->is_up(osd)) {
5926 pg_shard_t osd_with_shard(osd, shard_id_t(i));
5927 if (!ps->is_acting(osd_with_shard) && !ps->is_up(osd_with_shard)) {
5928 psdout(10) << "Active stray osd." << osd << " in want_acting is down"
5929 << dendl;
5930 need_acting_change = true;
5931 }
5932 }
5933 }
5934 if (need_acting_change) {
5935 psdout(10) << "Active need acting change, call choose_acting again"
5936 << dendl;
5937 // possibly because we re-added some strays to the acting set and
5938 // some of them then went down in a subsequent map before we could
5939 // see the map change the pg_temp.
5940 // call choose_acting again to clear them out.
5941 // note that we leave restrict_to_up_acting false so that we don't
5942 // evict any chosen stray that is still alive.
5943 pg_shard_t auth_log_shard;
5944 bool history_les_bound = false;
5945 ps->remove_down_peer_info(advmap.osdmap);
5946 ps->choose_acting(auth_log_shard, false, &history_les_bound, true);
5947 }
5948
5949 /* Check for changes in pool size (if the acting set changed as a result,
5950 * this does not matter) */
5951 if (advmap.lastmap->get_pg_size(ps->info.pgid.pgid) !=
5952 ps->get_osdmap()->get_pg_size(ps->info.pgid.pgid)) {
5953 if (ps->get_osdmap()->get_pg_size(ps->info.pgid.pgid) <=
5954 ps->actingset.size()) {
5955 ps->state_clear(PG_STATE_UNDERSIZED);
5956 } else {
5957 ps->state_set(PG_STATE_UNDERSIZED);
5958 }
5959 // degraded changes will be detected by the call to publish_stats_to_osd()
5960 need_publish = true;
5961 }
5962
5963 // if we haven't reported our PG stats in a long time, do so now.
5964 if (ps->info.stats.reported_epoch + ps->cct->_conf->osd_pg_stat_report_interval_max < advmap.osdmap->get_epoch()) {
5965 psdout(20) << "reporting stats to osd after " << (advmap.osdmap->get_epoch() - ps->info.stats.reported_epoch)
5966 << " epochs" << dendl;
5967 need_publish = true;
5968 }
5969
5970 if (need_publish)
5971 pl->publish_stats_to_osd();
5972
5973 if (ps->check_prior_readable_down_osds(advmap.osdmap)) {
5974 pl->recheck_readable();
5975 }
5976
5977 return forward_event();
5978 }
5979
5980 boost::statechart::result PeeringState::Active::react(const ActMap&)
5981 {
5982 DECLARE_LOCALS;
5983 psdout(10) << "Active: handling ActMap" << dendl;
5984 ceph_assert(ps->is_primary());
5985
5986 pl->on_active_actmap();
5987
5988 if (ps->have_unfound()) {
5989 // object may have become unfound
5990 ps->discover_all_missing(context<PeeringMachine>().get_recovery_ctx().msgs);
5991 }
5992
5993 uint64_t unfound = ps->missing_loc.num_unfound();
5994 if (unfound > 0 &&
5995 ps->all_unfound_are_queried_or_lost(ps->get_osdmap())) {
5996 if (ps->cct->_conf->osd_auto_mark_unfound_lost) {
5997 pl->get_clog_error() << context< PeeringMachine >().spgid.pgid << " has " << unfound
5998 << " objects unfound and apparently lost, would automatically "
5999 << "mark these objects lost but this feature is not yet implemented "
6000 << "(osd_auto_mark_unfound_lost)";
6001 } else
6002 pl->get_clog_error() << context< PeeringMachine >().spgid.pgid << " has "
6003 << unfound << " objects unfound and apparently lost";
6004 }
6005
6006 return forward_event();
6007 }
6008
6009 boost::statechart::result PeeringState::Active::react(const MNotifyRec& notevt)
6010 {
6011
6012 DECLARE_LOCALS;
6013 ceph_assert(ps->is_primary());
6014 if (ps->peer_info.count(notevt.from)) {
6015 psdout(10) << "Active: got notify from " << notevt.from
6016 << ", already have info from that osd, ignoring"
6017 << dendl;
6018 } else if (ps->peer_purged.count(notevt.from)) {
6019 psdout(10) << "Active: got notify from " << notevt.from
6020 << ", already purged that peer, ignoring"
6021 << dendl;
6022 } else {
6023 psdout(10) << "Active: got notify from " << notevt.from
6024 << ", calling proc_replica_info and discover_all_missing"
6025 << dendl;
6026 ps->proc_replica_info(
6027 notevt.from, notevt.notify.info, notevt.notify.epoch_sent);
6028 if (ps->have_unfound() || (ps->is_degraded() && ps->might_have_unfound.count(notevt.from))) {
6029 ps->discover_all_missing(
6030 context<PeeringMachine>().get_recovery_ctx().msgs);
6031 }
6032 // check if a previously-down acting member is coming back;
6033 // if so, request a pg_temp change to trigger a new interval transition
6034 pg_shard_t auth_log_shard;
6035 bool history_les_bound = false;
6036 // FIXME: Uh-oh we have to check this return value; choose_acting can fail!
6037 ps->choose_acting(auth_log_shard, false, &history_les_bound, true);
6038 if (!ps->want_acting.empty() && ps->want_acting != ps->acting) {
6039 psdout(10) << "Active: got notify from previous acting member "
6040 << notevt.from << ", requesting pg_temp change"
6041 << dendl;
6042 }
6043 }
6044 return discard_event();
6045 }
6046
6047 boost::statechart::result PeeringState::Active::react(const MTrim& trim)
6048 {
6049 DECLARE_LOCALS;
6050 ceph_assert(ps->is_primary());
6051
6052 // peer is informing us of their last_complete_ondisk
6053 ldout(ps->cct,10) << " replica osd." << trim.from << " lcod " << trim.trim_to << dendl;
6054 ps->update_peer_last_complete_ondisk(pg_shard_t{trim.from, trim.shard},
6055 trim.trim_to);
6056 // recompute min last_complete_ondisk so the log can be trimmed once recovered
6057 ps->calc_min_last_complete_ondisk();
6058 return discard_event();
6059 }
6060
6061 boost::statechart::result PeeringState::Active::react(const MInfoRec& infoevt)
6062 {
6063 DECLARE_LOCALS;
6064 ceph_assert(ps->is_primary());
6065
6066 ceph_assert(!ps->acting_recovery_backfill.empty());
6067 if (infoevt.lease_ack) {
6068 ps->proc_lease_ack(infoevt.from.osd, *infoevt.lease_ack);
6069 }
6070 // don't update history (yet) if we are active and primary; the replica
6071 // may be telling us they have activated (and committed) but we can't
6072 // share that until _everyone_ does the same.
6073 if (ps->is_acting_recovery_backfill(infoevt.from) &&
6074 ps->peer_activated.count(infoevt.from) == 0) {
6075 psdout(10) << " peer osd." << infoevt.from
6076 << " activated and committed" << dendl;
6077 ps->peer_activated.insert(infoevt.from);
6078 ps->blocked_by.erase(infoevt.from.shard);
6079 pl->publish_stats_to_osd();
6080 if (ps->peer_activated.size() == ps->acting_recovery_backfill.size()) {
6081 all_activated_and_committed();
6082 }
6083 }
6084 return discard_event();
6085 }
6086
6087 boost::statechart::result PeeringState::Active::react(const MLogRec& logevt)
6088 {
6089 DECLARE_LOCALS;
6090 psdout(10) << "searching osd." << logevt.from
6091 << " log for unfound items" << dendl;
6092 ps->proc_replica_log(
6093 logevt.msg->info, logevt.msg->log, std::move(logevt.msg->missing), logevt.from);
6094 bool got_missing = ps->search_for_missing(
6095 ps->peer_info[logevt.from],
6096 ps->peer_missing[logevt.from],
6097 logevt.from,
6098 context< PeeringMachine >().get_recovery_ctx());
6099 // If there are missing objects AND we are "fully" active, start recovery now
6100 if (got_missing && ps->state_test(PG_STATE_ACTIVE)) {
6101 post_event(DoRecovery());
6102 }
6103 return discard_event();
6104 }
6105
6106 boost::statechart::result PeeringState::Active::react(const QueryState& q)
6107 {
6108 DECLARE_LOCALS;
6109
6110 q.f->open_object_section("state");
6111 q.f->dump_string("name", state_name);
6112 q.f->dump_stream("enter_time") << enter_time;
6113
6114 {
6115 q.f->open_array_section("might_have_unfound");
6116 for (auto p = ps->might_have_unfound.begin();
6117 p != ps->might_have_unfound.end();
6118 ++p) {
6119 q.f->open_object_section("osd");
6120 q.f->dump_stream("osd") << *p;
6121 if (ps->peer_missing.count(*p)) {
6122 q.f->dump_string("status", "already probed");
6123 } else if (ps->peer_missing_requested.count(*p)) {
6124 q.f->dump_string("status", "querying");
6125 } else if (!ps->get_osdmap()->is_up(p->osd)) {
6126 q.f->dump_string("status", "osd is down");
6127 } else {
6128 q.f->dump_string("status", "not queried");
6129 }
6130 q.f->close_section();
6131 }
6132 q.f->close_section();
6133 }
6134 {
6135 q.f->open_object_section("recovery_progress");
6136 q.f->open_array_section("backfill_targets");
6137 for (auto p = ps->backfill_targets.begin();
6138 p != ps->backfill_targets.end(); ++p)
6139 q.f->dump_stream("replica") << *p;
6140 q.f->close_section();
6141 pl->dump_recovery_info(q.f);
6142 q.f->close_section();
6143 }
6144
6145 q.f->close_section();
6146 return forward_event();
6147 }
6148
6149 boost::statechart::result PeeringState::Active::react(const QueryUnfound& q)
6150 {
6151 DECLARE_LOCALS;
6152
6153 ps->query_unfound(q.f, "Active");
6154 return discard_event();
6155 }
6156
6157 boost::statechart::result PeeringState::Active::react(
6158 const ActivateCommitted &evt)
6159 {
6160 DECLARE_LOCALS;
6161 ceph_assert(!ps->peer_activated.count(ps->pg_whoami));
6162 ps->peer_activated.insert(ps->pg_whoami);
6163 psdout(10) << "_activate_committed " << evt.epoch
6164 << " peer_activated now " << ps->peer_activated
6165 << " last_interval_started "
6166 << ps->info.history.last_interval_started
6167 << " last_epoch_started "
6168 << ps->info.history.last_epoch_started
6169 << " same_interval_since "
6170 << ps->info.history.same_interval_since
6171 << dendl;
6172 ceph_assert(!ps->acting_recovery_backfill.empty());
6173 if (ps->peer_activated.size() == ps->acting_recovery_backfill.size())
6174 all_activated_and_committed();
6175 return discard_event();
6176 }
6177
6178 boost::statechart::result PeeringState::Active::react(const AllReplicasActivated &evt)
6179 {
6180
6181 DECLARE_LOCALS;
6182 pg_t pgid = context< PeeringMachine >().spgid.pgid;
6183
6184 all_replicas_activated = true;
6185
6186 ps->state_clear(PG_STATE_ACTIVATING);
6187 ps->state_clear(PG_STATE_CREATING);
6188 ps->state_clear(PG_STATE_PREMERGE);
6189
6190 bool merge_target;
6191 if (ps->pool.info.is_pending_merge(pgid, &merge_target)) {
6192 ps->state_set(PG_STATE_PEERED);
6193 ps->state_set(PG_STATE_PREMERGE);
6194
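// a merge can only proceed once both source and target are fully sized;
// if we are short, tell the mon we are not ready, naming the source pg
// if we are the merge target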
6195 if (ps->actingset.size() != ps->get_osdmap()->get_pg_size(pgid)) {
6196 if (merge_target) {
6197 pg_t src = pgid;
6198 src.set_ps(ps->pool.info.get_pg_num_pending());
6199 ceph_assert(src.get_parent() == pgid);
6200 pl->set_not_ready_to_merge_target(pgid, src);
6201 } else {
6202 pl->set_not_ready_to_merge_source(pgid);
6203 }
6204 }
6205 } else if (!ps->acting_set_writeable()) {
6206 ps->state_set(PG_STATE_PEERED);
6207 } else {
6208 ps->state_set(PG_STATE_ACTIVE);
6209 }
6210
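// we cannot claim readability until the prior interval's lease upper
// bound has passed; if it is still in the future, mark the pg as
// waiting and schedule a readable recheck for when it expires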
6211 auto mnow = pl->get_mnow();
6212 if (ps->prior_readable_until_ub > mnow) {
6213 psdout(10) << " waiting for prior_readable_until_ub "
6214 << ps->prior_readable_until_ub << " > mnow " << mnow << dendl;
6215 ps->state_set(PG_STATE_WAIT);
6216 pl->queue_check_readable(
6217 ps->last_peering_reset,
6218 ps->prior_readable_until_ub - mnow);
6219 } else {
6220 psdout(10) << " mnow " << mnow << " >= prior_readable_until_ub "
6221 << ps->prior_readable_until_ub << dendl;
6222 }
6223
6224 if (ps->pool.info.has_flag(pg_pool_t::FLAG_CREATING)) {
6225 pl->send_pg_created(pgid);
6226 }
6227
6228 ps->info.history.last_epoch_started = ps->info.last_epoch_started;
6229 ps->info.history.last_interval_started = ps->info.last_interval_started;
6230 ps->dirty_info = true;
6231
6232 ps->share_pg_info();
6233 pl->publish_stats_to_osd();
6234
6235 pl->on_activate_complete();
6236
6237 return discard_event();
6238 }
6239
6240 boost::statechart::result PeeringState::Active::react(const RenewLease& rl)
6241 {
6242 DECLARE_LOCALS;
6243 ps->proc_renew_lease();
6244 return discard_event();
6245 }
6246
6247 boost::statechart::result PeeringState::Active::react(const MLeaseAck& la)
6248 {
6249 DECLARE_LOCALS;
6250 ps->proc_lease_ack(la.from, la.lease_ack);
6251 return discard_event();
6252 }
6253
6254
6255 boost::statechart::result PeeringState::Active::react(const CheckReadable &evt)
6256 {
6257 DECLARE_LOCALS;
6258 pl->recheck_readable();
6259 return discard_event();
6260 }
6261
6262 /*
6263 * update info.history.last_epoch_started ONLY after we and all
6264 * replicas have activated AND committed the activate transaction
6265 * (i.e. the peering results are stable on disk).
6266 */
6267 void PeeringState::Active::all_activated_and_committed()
6268 {
6269 DECLARE_LOCALS;
6270 psdout(10) << "all_activated_and_committed" << dendl;
6271 ceph_assert(ps->is_primary());
6272 ceph_assert(ps->peer_activated.size() == ps->acting_recovery_backfill.size());
6273 ceph_assert(!ps->acting_recovery_backfill.empty());
6274 ceph_assert(ps->blocked_by.empty());
6275
6276 ceph_assert(HAVE_FEATURE(ps->upacting_features, SERVER_OCTOPUS));
6277 // this is overkill when the activation is quick, but when it is slow it
6278 // is important, because the lease was renewed by the activate itself but we
6279 // don't know how long ago that was, and simply scheduling now may leave
6280 // a gap in lease coverage. keep it simple and aggressively renew.
6281 ps->renew_lease(pl->get_mnow());
6282 ps->send_lease();
6283 ps->schedule_renew_lease();
6284
6285 // Degraded?
6286 ps->update_calc_stats();
6287 if (ps->info.stats.stats.sum.num_objects_degraded) {
6288 ps->state_set(PG_STATE_DEGRADED);
6289 } else {
6290 ps->state_clear(PG_STATE_DEGRADED);
6291 }
6292
6293 post_event(PeeringState::AllReplicasActivated());
6294 }
6295
6296
6297 void PeeringState::Active::exit()
6298 {
6299 context< PeeringMachine >().log_exit(state_name, enter_time);
6300
6301
6302 DECLARE_LOCALS;
6303 pl->cancel_local_background_io_reservation();
6304
6305 ps->blocked_by.clear();
6306 ps->backfill_reserved = false;
6307 ps->state_clear(PG_STATE_ACTIVATING);
6308 ps->state_clear(PG_STATE_DEGRADED);
6309 ps->state_clear(PG_STATE_UNDERSIZED);
6310 ps->state_clear(PG_STATE_BACKFILL_TOOFULL);
6311 ps->state_clear(PG_STATE_BACKFILL_WAIT);
6312 ps->state_clear(PG_STATE_RECOVERY_WAIT);
6313 ps->state_clear(PG_STATE_RECOVERY_TOOFULL);
6314 utime_t dur = ceph_clock_now() - enter_time;
6315 pl->get_peering_perf().tinc(rs_active_latency, dur);
6316 pl->on_active_exit();
6317 }
6318
6319 /*------ReplicaActive-----*/
6320 PeeringState::ReplicaActive::ReplicaActive(my_context ctx)
6321 : my_base(ctx),
6322 NamedState(context< PeeringMachine >().state_history, "Started/ReplicaActive")
6323 {
6324 context< PeeringMachine >().log_enter(state_name);
6325
6326 DECLARE_LOCALS;
6327 ps->start_flush(context< PeeringMachine >().get_cur_transaction());
6328 }
6329
6330
6331 boost::statechart::result PeeringState::ReplicaActive::react(
6332 const Activate& actevt) {
6333 DECLARE_LOCALS;
6334 psdout(10) << "In ReplicaActive, about to call activate" << dendl;
6335 ps->activate(
6336 context< PeeringMachine >().get_cur_transaction(),
6337 actevt.activation_epoch,
6338 context< PeeringMachine >().get_recovery_ctx());
6339 psdout(10) << "Activate Finished" << dendl;
6340 return discard_event();
6341 }
6342
6343 boost::statechart::result PeeringState::ReplicaActive::react(
6344 const ActivateCommitted &evt)
6345 {
6346 DECLARE_LOCALS;
6347 psdout(10) << __func__ << " " << evt.epoch << " telling primary" << dendl;
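// report the committed activation to the primary: send our info with
// the history epochs stamped from this activation, plus a lease ack so
// the primary can account for us when extending readable_until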
6348
6349 auto &rctx = context<PeeringMachine>().get_recovery_ctx();
6350 auto epoch = ps->get_osdmap_epoch();
6351 pg_info_t i = ps->info;
6352 i.history.last_epoch_started = evt.activation_epoch;
6353 i.history.last_interval_started = i.history.same_interval_since;
6354 rctx.send_info(
6355 ps->get_primary().osd,
6356 spg_t(ps->info.pgid.pgid, ps->get_primary().shard),
6357 epoch,
6358 epoch,
6359 i,
6360 {}, /* lease */
6361 ps->get_lease_ack());
6362
6363 if (ps->acting_set_writeable()) {
6364 ps->state_set(PG_STATE_ACTIVE);
6365 } else {
6366 ps->state_set(PG_STATE_PEERED);
6367 }
6368 pl->on_activate_committed();
6369
6370 return discard_event();
6371 }
6372
6373 boost::statechart::result PeeringState::ReplicaActive::react(const MLease& l)
6374 {
6375 DECLARE_LOCALS;
6376 spg_t spgid = context< PeeringMachine >().spgid;
6377 epoch_t epoch = pl->get_osdmap_epoch();
6378
6379 ps->proc_lease(l.lease);
6380 pl->send_cluster_message(
6381 ps->get_primary().osd,
6382 TOPNSPC::make_message<MOSDPGLeaseAck>(epoch,
6383 spg_t(spgid.pgid, ps->get_primary().shard),
6384 ps->get_lease_ack()),
6385 epoch);
6386 return discard_event();
6387 }
6388
6389 boost::statechart::result PeeringState::ReplicaActive::react(const MInfoRec& infoevt)
6390 {
6391 DECLARE_LOCALS;
6392 ps->proc_primary_info(context<PeeringMachine>().get_cur_transaction(),
6393 infoevt.info);
6394 return discard_event();
6395 }
6396
6397 boost::statechart::result PeeringState::ReplicaActive::react(const MLogRec& logevt)
6398 {
6399 DECLARE_LOCALS;
6400 psdout(10) << "received log from " << logevt.from << dendl;
6401 ObjectStore::Transaction &t = context<PeeringMachine>().get_cur_transaction();
6402 ps->merge_log(t, logevt.msg->info, std::move(logevt.msg->log), logevt.from);
6403 ceph_assert(ps->pg_log.get_head() == ps->info.last_update);
6404 if (logevt.msg->lease) {
6405 ps->proc_lease(*logevt.msg->lease);
6406 }
6407
6408 return discard_event();
6409 }
6410
6411 boost::statechart::result PeeringState::ReplicaActive::react(const MTrim& trim)
6412 {
6413 DECLARE_LOCALS;
6414 // primary is instructing us to trim
6415 ps->pg_log.trim(trim.trim_to, ps->info);
6416 ps->dirty_info = true;
6417 return discard_event();
6418 }
6419
6420 boost::statechart::result PeeringState::ReplicaActive::react(const ActMap&)
6421 {
6422 DECLARE_LOCALS;
6423 if (ps->should_send_notify() && ps->get_primary().osd >= 0) {
6424 ps->info.history.refresh_prior_readable_until_ub(
6425 pl->get_mnow(), ps->prior_readable_until_ub);
6426 context< PeeringMachine >().send_notify(
6427 ps->get_primary().osd,
6428 pg_notify_t(
6429 ps->get_primary().shard, ps->pg_whoami.shard,
6430 ps->get_osdmap_epoch(),
6431 ps->get_osdmap_epoch(),
6432 ps->info,
6433 ps->past_intervals));
6434 }
6435 return discard_event();
6436 }
6437
6438 boost::statechart::result PeeringState::ReplicaActive::react(
6439 const MQuery& query)
6440 {
6441 DECLARE_LOCALS;
6442 ps->fulfill_query(query, context<PeeringMachine>().get_recovery_ctx());
6443 return discard_event();
6444 }
6445
6446 boost::statechart::result PeeringState::ReplicaActive::react(const QueryState& q)
6447 {
6448 q.f->open_object_section("state");
6449 q.f->dump_string("name", state_name);
6450 q.f->dump_stream("enter_time") << enter_time;
6451 q.f->close_section();
6452 return forward_event();
6453 }
6454
6455 boost::statechart::result PeeringState::ReplicaActive::react(const QueryUnfound& q)
6456 {
6457 q.f->dump_string("state", "ReplicaActive");
6458 q.f->dump_bool("available_might_have_unfound", false);
6459 return discard_event();
6460 }
6461
6462 void PeeringState::ReplicaActive::exit()
6463 {
6464 context< PeeringMachine >().log_exit(state_name, enter_time);
6465 DECLARE_LOCALS;
6466 pl->unreserve_recovery_space();
6467
6468 pl->cancel_remote_recovery_reservation();
6469 utime_t dur = ceph_clock_now() - enter_time;
6470 pl->get_peering_perf().tinc(rs_replicaactive_latency, dur);
6471
6472 ps->min_last_complete_ondisk = eversion_t();
6473 }
6474
6475 /*-------Stray---*/
6476 PeeringState::Stray::Stray(my_context ctx)
6477 : my_base(ctx),
6478 NamedState(context< PeeringMachine >().state_history, "Started/Stray")
6479 {
6480 context< PeeringMachine >().log_enter(state_name);
6481
6482
6483 DECLARE_LOCALS;
6484 ceph_assert(!ps->is_peered());
6485 ceph_assert(!ps->is_peering());
6486 ceph_assert(!ps->is_primary());
6487
6488 if (!ps->get_osdmap()->have_pg_pool(ps->info.pgid.pgid.pool())) {
6489 ldout(ps->cct,10) << __func__ << " pool is deleted" << dendl;
6490 post_event(DeleteStart());
6491 } else {
6492 ps->start_flush(context< PeeringMachine >().get_cur_transaction());
6493 }
6494 }
6495
6496 boost::statechart::result PeeringState::Stray::react(const MLogRec& logevt)
6497 {
6498 DECLARE_LOCALS;
6499 MOSDPGLog *msg = logevt.msg.get();
6500 psdout(10) << "got info+log from osd." << logevt.from << " " << msg->info << " " << msg->log << dendl;
6501
6502 ObjectStore::Transaction &t = context<PeeringMachine>().get_cur_transaction();
6503 if (msg->info.last_backfill == hobject_t()) {
6504 // restart backfill
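// last_backfill == hobject_t() means nothing has been backfilled yet,
// so adopt the primary's info as-is and claim its log outright instead
// of merging it with our own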
6505 ps->info = msg->info;
6506 pl->on_info_history_change();
6507 ps->dirty_info = true;
6508 ps->dirty_big_info = true; // maybe.
6509
6510 PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
6511 ps->pg_log.reset_backfill_claim_log(msg->log, rollbacker.get());
6512
6513 ps->pg_log.reset_backfill();
6514 } else {
6515 ps->merge_log(t, msg->info, std::move(msg->log), logevt.from);
6516 }
6517 if (logevt.msg->lease) {
6518 ps->proc_lease(*logevt.msg->lease);
6519 }
6520
6521 ceph_assert(ps->pg_log.get_head() == ps->info.last_update);
6522
6523 post_event(Activate(logevt.msg->info.last_epoch_started));
6524 return transit<ReplicaActive>();
6525 }
6526
6527 boost::statechart::result PeeringState::Stray::react(const MInfoRec& infoevt)
6528 {
6529 DECLARE_LOCALS;
6530 psdout(10) << "got info from osd." << infoevt.from << " " << infoevt.info << dendl;
6531
6532 if (ps->info.last_update > infoevt.info.last_update) {
6533 // rewind divergent log entries
6534 ObjectStore::Transaction &t = context<PeeringMachine>().get_cur_transaction();
6535 ps->rewind_divergent_log(t, infoevt.info.last_update);
6536 ps->info.stats = infoevt.info.stats;
6537 ps->info.hit_set = infoevt.info.hit_set;
6538 }
6539
6540 if (infoevt.lease) {
6541 ps->proc_lease(*infoevt.lease);
6542 }
6543
6544 ceph_assert(infoevt.info.last_update == ps->info.last_update);
6545 ceph_assert(ps->pg_log.get_head() == ps->info.last_update);
6546
6547 post_event(Activate(infoevt.info.last_epoch_started));
6548 return transit<ReplicaActive>();
6549 }
6550
6551 boost::statechart::result PeeringState::Stray::react(const MQuery& query)
6552 {
6553 DECLARE_LOCALS;
6554 ps->fulfill_query(query, context<PeeringMachine>().get_recovery_ctx());
6555 return discard_event();
6556 }
6557
6558 boost::statechart::result PeeringState::Stray::react(const ActMap&)
6559 {
6560 DECLARE_LOCALS;
6561 if (ps->should_send_notify() && ps->get_primary().osd >= 0) {
6562 ps->info.history.refresh_prior_readable_until_ub(
6563 pl->get_mnow(), ps->prior_readable_until_ub);
6564 context< PeeringMachine >().send_notify(
6565 ps->get_primary().osd,
6566 pg_notify_t(
6567 ps->get_primary().shard, ps->pg_whoami.shard,
6568 ps->get_osdmap_epoch(),
6569 ps->get_osdmap_epoch(),
6570 ps->info,
6571 ps->past_intervals));
6572 }
6573 return discard_event();
6574 }
6575
6576 void PeeringState::Stray::exit()
6577 {
6578 context< PeeringMachine >().log_exit(state_name, enter_time);
6579 DECLARE_LOCALS;
6580 utime_t dur = ceph_clock_now() - enter_time;
6581 pl->get_peering_perf().tinc(rs_stray_latency, dur);
6582 }
6583
6584
6585 /*--------ToDelete----------*/
6586 PeeringState::ToDelete::ToDelete(my_context ctx)
6587 : my_base(ctx),
6588 NamedState(context< PeeringMachine >().state_history, "Started/ToDelete")
6589 {
6590 context< PeeringMachine >().log_enter(state_name);
6591 DECLARE_LOCALS;
6592 pl->get_perf_logger().inc(l_osd_pg_removing);
6593 }
6594
6595 void PeeringState::ToDelete::exit()
6596 {
6597 context< PeeringMachine >().log_exit(state_name, enter_time);
6598 DECLARE_LOCALS;
6599 // note: on a successful removal, this path doesn't execute. see
6600 // do_delete_work().
6601 pl->get_perf_logger().dec(l_osd_pg_removing);
6602
6603 pl->cancel_local_background_io_reservation();
6604 }
6605
6606 /*----WaitDeleteReserved----*/
6607 PeeringState::WaitDeleteReserved::WaitDeleteReserved(my_context ctx)
6608 : my_base(ctx),
6609 NamedState(context< PeeringMachine >().state_history,
6610 "Started/ToDelete/WaitDeleteReseved")
6611 {
6612 context< PeeringMachine >().log_enter(state_name);
6613 DECLARE_LOCALS;
6614 context< ToDelete >().priority = ps->get_delete_priority();
6615
6616 pl->cancel_local_background_io_reservation();
6617 pl->request_local_background_io_reservation(
6618 context<ToDelete>().priority,
6619 std::make_unique<PGPeeringEvent>(
6620 ps->get_osdmap_epoch(),
6621 ps->get_osdmap_epoch(),
6622 DeleteReserved()),
6623 std::make_unique<PGPeeringEvent>(
6624 ps->get_osdmap_epoch(),
6625 ps->get_osdmap_epoch(),
6626 DeleteInterrupted()));
6627 }
6628
6629 boost::statechart::result PeeringState::ToDelete::react(
6630 const ActMap& evt)
6631 {
6632 DECLARE_LOCALS;
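// the delete priority can change over time (e.g. as the osd fills up);
// if it has, restart ToDelete so WaitDeleteReserved re-requests the
// reservation at the new priority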
6633 if (ps->get_delete_priority() != priority) {
6634 psdout(10) << __func__ << " delete priority changed, resetting"
6635 << dendl;
6636 return transit<ToDelete>();
6637 }
6638 return discard_event();
6639 }
6640
6641 void PeeringState::WaitDeleteReserved::exit()
6642 {
6643 context< PeeringMachine >().log_exit(state_name, enter_time);
6644 }
6645
6646 /*----Deleting-----*/
6647 PeeringState::Deleting::Deleting(my_context ctx)
6648 : my_base(ctx),
6649 NamedState(context< PeeringMachine >().state_history, "Started/ToDelete/Deleting")
6650 {
6651 context< PeeringMachine >().log_enter(state_name);
6652
6653 DECLARE_LOCALS;
6654 ps->deleting = true;
6655 ObjectStore::Transaction &t = context<PeeringMachine>().get_cur_transaction();
6656
6657 // clear log
6658 PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
6659 ps->pg_log.roll_forward(rollbacker.get());
6660
6661 // adjust info to backfill
6662 ps->info.set_last_backfill(hobject_t());
6663 ps->pg_log.reset_backfill();
6664 ps->dirty_info = true;
6665
6666 pl->on_removal(t);
6667 }
6668
6669 boost::statechart::result PeeringState::Deleting::react(
6670 const DeleteSome& evt)
6671 {
6672 DECLARE_LOCALS;
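// do_delete_work() removes one batch of objects and returns the cursor
// for the next batch plus whether more work remains; once it reports no
// more work, terminate() ends this state machine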
6673 std::pair<ghobject_t, bool> p;
6674 p = pl->do_delete_work(context<PeeringMachine>().get_cur_transaction(),
6675 next);
6676 next = p.first;
6677 return p.second ? discard_event() : terminate();
6678 }
6679
6680 void PeeringState::Deleting::exit()
6681 {
6682 context< PeeringMachine >().log_exit(state_name, enter_time);
6683 DECLARE_LOCALS;
6684 ps->deleting = false;
6685 pl->cancel_local_background_io_reservation();
6686 }
6687
6688 /*--------GetInfo---------*/
6689 PeeringState::GetInfo::GetInfo(my_context ctx)
6690 : my_base(ctx),
6691 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering/GetInfo")
6692 {
6693 context< PeeringMachine >().log_enter(state_name);
6694
6695
6696 DECLARE_LOCALS;
6697 ps->check_past_interval_bounds();
6698 ps->log_weirdness();
6699 PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
6700
6701 ceph_assert(ps->blocked_by.empty());
6702
6703 prior_set = ps->build_prior();
6704 ps->prior_readable_down_osds = prior_set.down;
6705
6706 if (ps->prior_readable_down_osds.empty()) {
6707 psdout(10) << " no prior_set down osds, will clear prior_readable_until_ub before activating"
6708 << dendl;
6709 }
6710
6711 ps->reset_min_peer_features();
6712 get_infos();
6713 if (prior_set.pg_down) {
6714 post_event(IsDown());
6715 } else if (peer_info_requested.empty()) {
6716 post_event(GotInfo());
6717 }
6718 }
6719
6720 void PeeringState::GetInfo::get_infos()
6721 {
6722 DECLARE_LOCALS;
6723 PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
6724
6725 ps->blocked_by.clear();
6726 for (auto it = prior_set.probe.begin(); it != prior_set.probe.end(); ++it) {
6727 pg_shard_t peer = *it;
6728 if (peer == ps->pg_whoami) {
6729 continue;
6730 }
6731 if (ps->peer_info.count(peer)) {
6732 psdout(10) << " have osd." << peer << " info " << ps->peer_info[peer] << dendl;
6733 continue;
6734 }
6735 if (peer_info_requested.count(peer)) {
6736 psdout(10) << " already requested info from osd." << peer << dendl;
6737 ps->blocked_by.insert(peer.osd);
6738 } else if (!ps->get_osdmap()->is_up(peer.osd)) {
6739 psdout(10) << " not querying info from down osd." << peer << dendl;
6740 } else {
6741 psdout(10) << " querying info from osd." << peer << dendl;
6742 context< PeeringMachine >().send_query(
6743 peer.osd,
6744 pg_query_t(pg_query_t::INFO,
6745 it->shard, ps->pg_whoami.shard,
6746 ps->info.history,
6747 ps->get_osdmap_epoch()));
6748 peer_info_requested.insert(peer);
6749 ps->blocked_by.insert(peer.osd);
6750 }
6751 }
6752
6753 ps->check_prior_readable_down_osds(ps->get_osdmap());
6754
6755 pl->publish_stats_to_osd();
6756 }
6757
6758 boost::statechart::result PeeringState::GetInfo::react(const MNotifyRec& infoevt)
6759 {
6760
6761 DECLARE_LOCALS;
6762
6763 auto p = peer_info_requested.find(infoevt.from);
6764 if (p != peer_info_requested.end()) {
6765 peer_info_requested.erase(p);
6766 ps->blocked_by.erase(infoevt.from.osd);
6767 }
6768
6769 epoch_t old_start = ps->info.history.last_epoch_started;
6770 if (ps->proc_replica_info(
6771 infoevt.from, infoevt.notify.info, infoevt.notify.epoch_sent)) {
6772 // we got something new ...
6773 PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
6774 if (old_start < ps->info.history.last_epoch_started) {
6775 psdout(10) << " last_epoch_started moved forward, rebuilding prior" << dendl;
6776 prior_set = ps->build_prior();
6777 ps->prior_readable_down_osds = prior_set.down;
6778
6779 // filter out any osds that got dropped from the probe set from
6780 // peer_info_requested. this is less expensive than restarting
6781 // peering (which would re-probe everyone).
6782 auto p = peer_info_requested.begin();
6783 while (p != peer_info_requested.end()) {
6784 if (prior_set.probe.count(*p) == 0) {
6785 psdout(20) << " dropping osd." << *p << " from info_requested, no longer in probe set" << dendl;
6786 peer_info_requested.erase(p++);
6787 } else {
6788 ++p;
6789 }
6790 }
6791 get_infos();
6792 }
6793 psdout(20) << "Adding osd: " << infoevt.from.osd << " peer features: "
6794 << hex << infoevt.features << dec << dendl;
6795 ps->apply_peer_features(infoevt.features);
6796
6797 // are we done getting everything?
6798 if (peer_info_requested.empty() && !prior_set.pg_down) {
6799 psdout(20) << "Common peer features: " << hex << ps->get_min_peer_features() << dec << dendl;
6800 psdout(20) << "Common acting features: " << hex << ps->get_min_acting_features() << dec << dendl;
6801 psdout(20) << "Common upacting features: " << hex << ps->get_min_upacting_features() << dec << dendl;
6802 post_event(GotInfo());
6803 }
6804 }
6805 return discard_event();
6806 }
6807
6808 boost::statechart::result PeeringState::GetInfo::react(const QueryState& q)
6809 {
6810 DECLARE_LOCALS;
6811 q.f->open_object_section("state");
6812 q.f->dump_string("name", state_name);
6813 q.f->dump_stream("enter_time") << enter_time;
6814
6815 q.f->open_array_section("requested_info_from");
6816 for (auto p = peer_info_requested.begin();
6817 p != peer_info_requested.end();
6818 ++p) {
6819 q.f->open_object_section("osd");
6820 q.f->dump_stream("osd") << *p;
6821 if (ps->peer_info.count(*p)) {
6822 q.f->open_object_section("got_info");
6823 ps->peer_info[*p].dump(q.f);
6824 q.f->close_section();
6825 }
6826 q.f->close_section();
6827 }
6828 q.f->close_section();
6829
6830 q.f->close_section();
6831 return forward_event();
6832 }
6833
6834 boost::statechart::result PeeringState::GetInfo::react(const QueryUnfound& q)
6835 {
6836 q.f->dump_string("state", "GetInfo");
6837 q.f->dump_bool("available_might_have_unfound", false);
6838 return discard_event();
6839 }
6840
6841 void PeeringState::GetInfo::exit()
6842 {
6843 context< PeeringMachine >().log_exit(state_name, enter_time);
6844
6845 DECLARE_LOCALS;
6846 utime_t dur = ceph_clock_now() - enter_time;
6847 pl->get_peering_perf().tinc(rs_getinfo_latency, dur);
6848 ps->blocked_by.clear();
6849 }
6850
6851 /*------GetLog------------*/
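// GetLog: choose_acting() picks the acting set and nominates
// auth_log_shard, the OSD whose log we treat as authoritative. If that
// shard is not us, pull just enough of its log (back to the oldest
// last_update among peers not contiguous with ours) to reconcile
// divergent entries before moving on to GetMissing.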
6852 PeeringState::GetLog::GetLog(my_context ctx)
6853 : my_base(ctx),
6854 NamedState(
6855 context< PeeringMachine >().state_history,
6856 "Started/Primary/Peering/GetLog"),
6857 msg(0)
6858 {
6859 context< PeeringMachine >().log_enter(state_name);
6860
6861 DECLARE_LOCALS;
6862
6863 ps->log_weirdness();
6864
6865 // adjust acting?
6866 if (!ps->choose_acting(auth_log_shard, false,
6867 &context< Peering >().history_les_bound)) {
6868 if (!ps->want_acting.empty()) {
6869 post_event(NeedActingChange());
6870 } else {
6871 post_event(IsIncomplete());
6872 }
6873 return;
6874 }
6875
6876 // am i the best?
6877 if (auth_log_shard == ps->pg_whoami) {
6878 post_event(GotLog());
6879 return;
6880 }
6881
6882 const pg_info_t& best = ps->peer_info[auth_log_shard];
6883
6884 // am i broken?
6885 if (ps->info.last_update < best.log_tail) {
6886     psdout(10) << " not contiguous with osd." << auth_log_shard << ", marking incomplete" << dendl;
6887 post_event(IsIncomplete());
6888 return;
6889 }
6890
6891 // how much log to request?
6892 eversion_t request_log_from = ps->info.last_update;
6893 ceph_assert(!ps->acting_recovery_backfill.empty());
6894 for (auto p = ps->acting_recovery_backfill.begin();
6895 p != ps->acting_recovery_backfill.end();
6896 ++p) {
6897 if (*p == ps->pg_whoami) continue;
6898 pg_info_t& ri = ps->peer_info[*p];
6899 if (ri.last_update < ps->info.log_tail && ri.last_update >= best.log_tail &&
6900 ri.last_update < request_log_from)
6901 request_log_from = ri.last_update;
6902 }
6903
6904   // request the log from the auth shard
6905 psdout(10) << " requesting log from osd." << auth_log_shard << dendl;
6906 context<PeeringMachine>().send_query(
6907 auth_log_shard.osd,
6908 pg_query_t(
6909 pg_query_t::LOG,
6910 auth_log_shard.shard, ps->pg_whoami.shard,
6911 request_log_from, ps->info.history,
6912 ps->get_osdmap_epoch()));
6913
6914 ceph_assert(ps->blocked_by.empty());
6915 ps->blocked_by.insert(auth_log_shard.osd);
6916 pl->publish_stats_to_osd();
6917 }
6918
6919 boost::statechart::result PeeringState::GetLog::react(const AdvMap& advmap)
6920 {
6921 // make sure our log source didn't go down. we need to check
6922 // explicitly because it may not be part of the prior set, which
6923 // means the Peering state check won't catch it going down.
6924 if (!advmap.osdmap->is_up(auth_log_shard.osd)) {
6925 psdout(10) << "GetLog: auth_log_shard osd."
6926 << auth_log_shard.osd << " went down" << dendl;
6927 post_event(advmap);
6928 return transit< Reset >();
6929 }
6930
6931 // let the Peering state do its checks.
6932 return forward_event();
6933 }
6934
6935 boost::statechart::result PeeringState::GetLog::react(const MLogRec& logevt)
6936 {
6937 ceph_assert(!msg);
6938 if (logevt.from != auth_log_shard) {
6939 psdout(10) << "GetLog: discarding log from "
6940 << "non-auth_log_shard osd." << logevt.from << dendl;
6941 return discard_event();
6942 }
6943 psdout(10) << "GetLog: received master log from osd."
6944 << logevt.from << dendl;
6945 msg = logevt.msg;
6946 post_event(GotLog());
6947 return discard_event();
6948 }
6949
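// GotLog is posted either from the constructor (we already hold the
// authoritative log, so msg is still null) or from the MLogRec reaction
// above once the master log arrives. Either way, merge what we have,
// start a flush, and move to GetMissing.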
6950 boost::statechart::result PeeringState::GetLog::react(const GotLog&)
6951 {
6952
6953 DECLARE_LOCALS;
6954 psdout(10) << "leaving GetLog" << dendl;
6955 if (msg) {
6956 psdout(10) << "processing master log" << dendl;
6957 ps->proc_master_log(context<PeeringMachine>().get_cur_transaction(),
6958 msg->info, std::move(msg->log), std::move(msg->missing),
6959 auth_log_shard);
6960 }
6961 ps->start_flush(context< PeeringMachine >().get_cur_transaction());
6962 return transit< GetMissing >();
6963 }
6964
6965 boost::statechart::result PeeringState::GetLog::react(const QueryState& q)
6966 {
6967 q.f->open_object_section("state");
6968 q.f->dump_string("name", state_name);
6969 q.f->dump_stream("enter_time") << enter_time;
6970 q.f->dump_stream("auth_log_shard") << auth_log_shard;
6971 q.f->close_section();
6972 return forward_event();
6973 }
6974
6975 boost::statechart::result PeeringState::GetLog::react(const QueryUnfound& q)
6976 {
6977 q.f->dump_string("state", "GetLog");
6978 q.f->dump_bool("available_might_have_unfound", false);
6979 return discard_event();
6980 }
6981
6982 void PeeringState::GetLog::exit()
6983 {
6984 context< PeeringMachine >().log_exit(state_name, enter_time);
6985
6986 DECLARE_LOCALS;
6987 utime_t dur = ceph_clock_now() - enter_time;
6988 pl->get_peering_perf().tinc(rs_getlog_latency, dur);
6989 ps->blocked_by.clear();
6990 }
6991
6992 /*------WaitActingChange--------*/
6993 PeeringState::WaitActingChange::WaitActingChange(my_context ctx)
6994 : my_base(ctx),
6995 NamedState(context< PeeringMachine >().state_history, "Started/Primary/WaitActingChange")
6996 {
6997 context< PeeringMachine >().log_enter(state_name);
6998 }
6999
7000 boost::statechart::result PeeringState::WaitActingChange::react(const AdvMap& advmap)
7001 {
7002 DECLARE_LOCALS;
7003 OSDMapRef osdmap = advmap.osdmap;
7004
7005   psdout(10) << "verifying no want_acting " << ps->want_acting << " targets went down" << dendl;
7006 for (auto p = ps->want_acting.begin(); p != ps->want_acting.end(); ++p) {
7007 if (!osdmap->is_up(*p)) {
7008 psdout(10) << " want_acting target osd." << *p << " went down, resetting" << dendl;
7009 post_event(advmap);
7010 return transit< Reset >();
7011 }
7012 }
7013 return forward_event();
7014 }
7015
7016 boost::statechart::result PeeringState::WaitActingChange::react(const MLogRec& logevt)
7017 {
7018   psdout(10) << "In WaitActingChange, ignoring MLogRec" << dendl;
7019 return discard_event();
7020 }
7021
7022 boost::statechart::result PeeringState::WaitActingChange::react(const MInfoRec& evt)
7023 {
7024 psdout(10) << "In WaitActingChange, ignoring MInfoRec" << dendl;
7025 return discard_event();
7026 }
7027
7028 boost::statechart::result PeeringState::WaitActingChange::react(const MNotifyRec& evt)
7029 {
7030 psdout(10) << "In WaitActingChange, ignoring MNotifyRec" << dendl;
7031 return discard_event();
7032 }
7033
7034 boost::statechart::result PeeringState::WaitActingChange::react(const QueryState& q)
7035 {
7036 q.f->open_object_section("state");
7037 q.f->dump_string("name", state_name);
7038 q.f->dump_stream("enter_time") << enter_time;
7039 q.f->dump_string("comment", "waiting for pg acting set to change");
7040 q.f->close_section();
7041 return forward_event();
7042 }
7043
7044 boost::statechart::result PeeringState::WaitActingChange::react(const QueryUnfound& q)
7045 {
7046 q.f->dump_string("state", "WaitActingChange");
7047 q.f->dump_bool("available_might_have_unfound", false);
7048 return discard_event();
7049 }
7050
7051 void PeeringState::WaitActingChange::exit()
7052 {
7053 context< PeeringMachine >().log_exit(state_name, enter_time);
7054 DECLARE_LOCALS;
7055 utime_t dur = ceph_clock_now() - enter_time;
7056 pl->get_peering_perf().tinc(rs_waitactingchange_latency, dur);
7057 }
7058
7059 /*------Down--------*/
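// Down: peering stalled because OSDs from prior intervals that may hold
// needed writes are down. We surface them through blocked_by and wait;
// the MNotifyRec reaction below re-enters GetInfo if a returning OSD
// moves last_epoch_started forward.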
7060 PeeringState::Down::Down(my_context ctx)
7061 : my_base(ctx),
7062 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering/Down")
7063 {
7064 context< PeeringMachine >().log_enter(state_name);
7065 DECLARE_LOCALS;
7066
7067 ps->state_clear(PG_STATE_PEERING);
7068 ps->state_set(PG_STATE_DOWN);
7069
7070 auto &prior_set = context< Peering >().prior_set;
7071 ceph_assert(ps->blocked_by.empty());
7072 ps->blocked_by.insert(prior_set.down.begin(), prior_set.down.end());
7073 pl->publish_stats_to_osd();
7074 }
7075
7076 void PeeringState::Down::exit()
7077 {
7078 context< PeeringMachine >().log_exit(state_name, enter_time);
7079
7080 DECLARE_LOCALS;
7081
7082 ps->state_clear(PG_STATE_DOWN);
7083 utime_t dur = ceph_clock_now() - enter_time;
7084 pl->get_peering_perf().tinc(rs_down_latency, dur);
7085
7086 ps->blocked_by.clear();
7087 }
7088
7089 boost::statechart::result PeeringState::Down::react(const QueryState& q)
7090 {
7091 q.f->open_object_section("state");
7092 q.f->dump_string("name", state_name);
7093 q.f->dump_stream("enter_time") << enter_time;
7094 q.f->dump_string("comment",
7095 "not enough up instances of this PG to go active");
7096 q.f->close_section();
7097 return forward_event();
7098 }
7099
7100 boost::statechart::result PeeringState::Down::react(const QueryUnfound& q)
7101 {
7102 q.f->dump_string("state", "Down");
7103 q.f->dump_bool("available_might_have_unfound", false);
7104 return discard_event();
7105 }
7106
7107 boost::statechart::result PeeringState::Down::react(const MNotifyRec& infoevt)
7108 {
7109 DECLARE_LOCALS;
7110
7111 ceph_assert(ps->is_primary());
7112 epoch_t old_start = ps->info.history.last_epoch_started;
7113 if (!ps->peer_info.count(infoevt.from) &&
7114 ps->get_osdmap()->has_been_up_since(infoevt.from.osd, infoevt.notify.epoch_sent)) {
7115 ps->update_history(infoevt.notify.info.history);
7116 }
7117   // if we got something new that lets the pg escape the down state
7118 if (ps->info.history.last_epoch_started > old_start) {
7119 psdout(10) << " last_epoch_started moved forward, re-enter getinfo" << dendl;
7120 ps->state_clear(PG_STATE_DOWN);
7121 ps->state_set(PG_STATE_PEERING);
7122 return transit< GetInfo >();
7123 }
7124
7125 return discard_event();
7126 }
7127
7128
7129 /*------Incomplete--------*/
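// Incomplete: choose_acting() could not assemble an acting set with a
// usable log (IsIncomplete). We stay here until either an osdmap change
// (pool deleted or min_size lowered) forces a Reset, or a replica's
// notify brings new info and lets us retry GetLog.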
7130 PeeringState::Incomplete::Incomplete(my_context ctx)
7131 : my_base(ctx),
7132 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering/Incomplete")
7133 {
7134 context< PeeringMachine >().log_enter(state_name);
7135 DECLARE_LOCALS;
7136
7137 ps->state_clear(PG_STATE_PEERING);
7138 ps->state_set(PG_STATE_INCOMPLETE);
7139
7140 PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
7141 ceph_assert(ps->blocked_by.empty());
7142 ps->blocked_by.insert(prior_set.down.begin(), prior_set.down.end());
7143 pl->publish_stats_to_osd();
7144 }
7145
7146 boost::statechart::result PeeringState::Incomplete::react(const AdvMap &advmap) {
7147 DECLARE_LOCALS;
7148 int64_t poolnum = ps->info.pgid.pool();
7149
7150   // Reset if the pool was deleted or min_size was lowered; the pg might now be able to go active
7151 if (!advmap.osdmap->have_pg_pool(poolnum) ||
7152 advmap.lastmap->get_pools().find(poolnum)->second.min_size >
7153 advmap.osdmap->get_pools().find(poolnum)->second.min_size) {
7154 post_event(advmap);
7155 return transit< Reset >();
7156 }
7157
7158 return forward_event();
7159 }
7160
7161 boost::statechart::result PeeringState::Incomplete::react(const MNotifyRec& notevt) {
7162 DECLARE_LOCALS;
7163 psdout(7) << "handle_pg_notify from osd." << notevt.from << dendl;
7164 if (ps->proc_replica_info(
7165 notevt.from, notevt.notify.info, notevt.notify.epoch_sent)) {
7166 // We got something new, try again!
7167 return transit< GetLog >();
7168 } else {
7169 return discard_event();
7170 }
7171 }
7172
7173 boost::statechart::result PeeringState::Incomplete::react(
7174 const QueryState& q)
7175 {
7176 q.f->open_object_section("state");
7177 q.f->dump_string("name", state_name);
7178 q.f->dump_stream("enter_time") << enter_time;
7179 q.f->dump_string("comment", "not enough complete instances of this PG");
7180 q.f->close_section();
7181 return forward_event();
7182 }
7183
7184 boost::statechart::result PeeringState::Incomplete::react(const QueryUnfound& q)
7185 {
7186 q.f->dump_string("state", "Incomplete");
7187 q.f->dump_bool("available_might_have_unfound", false);
7188 return discard_event();
7189 }
7190
7191 void PeeringState::Incomplete::exit()
7192 {
7193 context< PeeringMachine >().log_exit(state_name, enter_time);
7194
7195 DECLARE_LOCALS;
7196
7197 ps->state_clear(PG_STATE_INCOMPLETE);
7198 utime_t dur = ceph_clock_now() - enter_time;
7199 pl->get_peering_perf().tinc(rs_incomplete_latency, dur);
7200
7201 ps->blocked_by.clear();
7202 }
7203
7204 /*------GetMissing--------*/
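// GetMissing: with the authoritative log in place, ask each peer in
// acting_recovery_backfill whose history might diverge for its log and
// missing set. Peers that are empty, due for a full backfill, or exactly
// up to date can be skipped with an inferred empty missing set.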
7205 PeeringState::GetMissing::GetMissing(my_context ctx)
7206 : my_base(ctx),
7207 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering/GetMissing")
7208 {
7209 context< PeeringMachine >().log_enter(state_name);
7210
7211 DECLARE_LOCALS;
7212 ps->log_weirdness();
7213 ceph_assert(!ps->acting_recovery_backfill.empty());
7214 eversion_t since;
7215 for (auto i = ps->acting_recovery_backfill.begin();
7216 i != ps->acting_recovery_backfill.end();
7217 ++i) {
7218 if (*i == ps->get_primary()) continue;
7219 const pg_info_t& pi = ps->peer_info[*i];
7220     // reset this to make sure the pg_missing_t is initialized and
7221     // has the correct semantics even if we don't need to get a
7222     // missing set from a shard. This way later additions due to
7223     // lost+unfound delete work properly.
7224 ps->peer_missing[*i].may_include_deletes = !ps->perform_deletes_during_peering();
7225
7226 if (pi.is_empty())
7227 continue; // no pg data, nothing divergent
7228
7229 if (pi.last_update < ps->pg_log.get_tail()) {
7230 psdout(10) << " osd." << *i << " is not contiguous, will restart backfill" << dendl;
7231 ps->peer_missing[*i].clear();
7232 continue;
7233 }
7234 if (pi.last_backfill == hobject_t()) {
7235 psdout(10) << " osd." << *i << " will fully backfill; can infer empty missing set" << dendl;
7236 ps->peer_missing[*i].clear();
7237 continue;
7238 }
7239
7240 if (pi.last_update == pi.last_complete && // peer has no missing
7241 pi.last_update == ps->info.last_update) { // peer is up to date
7242 // replica has no missing and identical log as us. no need to
7243 // pull anything.
7244 // FIXME: we can do better here. if last_update==last_complete we
7245 // can infer the rest!
7246 psdout(10) << " osd." << *i << " has no missing, identical log" << dendl;
7247 ps->peer_missing[*i].clear();
7248 continue;
7249 }
7250
7251 // We pull the log from the peer's last_epoch_started to ensure we
7252 // get enough log to detect divergent updates.
7253 since.epoch = pi.last_epoch_started;
7254 ceph_assert(pi.last_update >= ps->info.log_tail); // or else choose_acting() did a bad thing
7255 if (pi.log_tail <= since) {
7256 psdout(10) << " requesting log+missing since " << since << " from osd." << *i << dendl;
7257 context< PeeringMachine >().send_query(
7258 i->osd,
7259 pg_query_t(
7260 pg_query_t::LOG,
7261 i->shard, ps->pg_whoami.shard,
7262 since, ps->info.history,
7263 ps->get_osdmap_epoch()));
7264 } else {
7265 psdout(10) << " requesting fulllog+missing from osd." << *i
7266 << " (want since " << since << " < log.tail "
7267 << pi.log_tail << ")" << dendl;
7268 context< PeeringMachine >().send_query(
7269 i->osd, pg_query_t(
7270 pg_query_t::FULLLOG,
7271 i->shard, ps->pg_whoami.shard,
7272 ps->info.history, ps->get_osdmap_epoch()));
7273 }
7274 peer_missing_requested.insert(*i);
7275 ps->blocked_by.insert(i->osd);
7276 }
7277
7278 if (peer_missing_requested.empty()) {
7279 if (ps->need_up_thru) {
7280 psdout(10) << " still need up_thru update before going active"
7281 << dendl;
7282 post_event(NeedUpThru());
7283 return;
7284 }
7285
7286 // all good!
7287 post_event(Activate(ps->get_osdmap_epoch()));
7288 } else {
7289 pl->publish_stats_to_osd();
7290 }
7291 }
7292
7293 boost::statechart::result PeeringState::GetMissing::react(const MLogRec& logevt)
7294 {
7295 DECLARE_LOCALS;
7296
7297 peer_missing_requested.erase(logevt.from);
7298 ps->proc_replica_log(logevt.msg->info,
7299 logevt.msg->log,
7300 std::move(logevt.msg->missing),
7301 logevt.from);
7302
7303 if (peer_missing_requested.empty()) {
7304 if (ps->need_up_thru) {
7305 psdout(10) << " still need up_thru update before going active"
7306 << dendl;
7307 post_event(NeedUpThru());
7308 } else {
7309       psdout(10) << "Got last missing, nothing more needed; "
7310                  << "posting Activate" << dendl;
7311 post_event(Activate(ps->get_osdmap_epoch()));
7312 }
7313 }
7314 return discard_event();
7315 }
7316
7317 boost::statechart::result PeeringState::GetMissing::react(const QueryState& q)
7318 {
7319 DECLARE_LOCALS;
7320 q.f->open_object_section("state");
7321 q.f->dump_string("name", state_name);
7322 q.f->dump_stream("enter_time") << enter_time;
7323
7324 q.f->open_array_section("peer_missing_requested");
7325 for (auto p = peer_missing_requested.begin();
7326 p != peer_missing_requested.end();
7327 ++p) {
7328 q.f->open_object_section("osd");
7329 q.f->dump_stream("osd") << *p;
7330 if (ps->peer_missing.count(*p)) {
7331 q.f->open_object_section("got_missing");
7332 ps->peer_missing[*p].dump(q.f);
7333 q.f->close_section();
7334 }
7335 q.f->close_section();
7336 }
7337 q.f->close_section();
7338
7339 q.f->close_section();
7340 return forward_event();
7341 }
7342
7343 boost::statechart::result PeeringState::GetMissing::react(const QueryUnfound& q)
7344 {
7345   q.f->dump_string("state", "GetMissing");
7346 q.f->dump_bool("available_might_have_unfound", false);
7347 return discard_event();
7348 }
7349
7350 void PeeringState::GetMissing::exit()
7351 {
7352 context< PeeringMachine >().log_exit(state_name, enter_time);
7353
7354 DECLARE_LOCALS;
7355 utime_t dur = ceph_clock_now() - enter_time;
7356 pl->get_peering_perf().tinc(rs_getmissing_latency, dur);
7357 ps->blocked_by.clear();
7358 }
7359
7360 /*------WaitUpThru--------*/
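// WaitUpThru: everything needed to activate is in hand, but the osdmap
// must first record an up_thru for this OSD so later intervals can tell
// this one may have gone active. Each ActMap re-checks need_up_thru and
// posts Activate once the map has caught up.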
7361 PeeringState::WaitUpThru::WaitUpThru(my_context ctx)
7362 : my_base(ctx),
7363 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering/WaitUpThru")
7364 {
7365 context< PeeringMachine >().log_enter(state_name);
7366 }
7367
7368 boost::statechart::result PeeringState::WaitUpThru::react(const ActMap& am)
7369 {
7370 DECLARE_LOCALS;
7371 if (!ps->need_up_thru) {
7372 post_event(Activate(ps->get_osdmap_epoch()));
7373 }
7374 return forward_event();
7375 }
7376
7377 boost::statechart::result PeeringState::WaitUpThru::react(const MLogRec& logevt)
7378 {
7379 DECLARE_LOCALS;
7380 psdout(10) << "Noting missing from osd." << logevt.from << dendl;
7381 ps->peer_missing[logevt.from].claim(std::move(logevt.msg->missing));
7382 ps->peer_info[logevt.from] = logevt.msg->info;
7383 return discard_event();
7384 }
7385
7386 boost::statechart::result PeeringState::WaitUpThru::react(const QueryState& q)
7387 {
7388 q.f->open_object_section("state");
7389 q.f->dump_string("name", state_name);
7390 q.f->dump_stream("enter_time") << enter_time;
7391 q.f->dump_string("comment", "waiting for osdmap to reflect a new up_thru for this osd");
7392 q.f->close_section();
7393 return forward_event();
7394 }
7395
7396 boost::statechart::result PeeringState::WaitUpThru::react(const QueryUnfound& q)
7397 {
7398 q.f->dump_string("state", "WaitUpThru");
7399 q.f->dump_bool("available_might_have_unfound", false);
7400 return discard_event();
7401 }
7402
7403 void PeeringState::WaitUpThru::exit()
7404 {
7405 context< PeeringMachine >().log_exit(state_name, enter_time);
7406 DECLARE_LOCALS;
7407 utime_t dur = ceph_clock_now() - enter_time;
7408 pl->get_peering_perf().tinc(rs_waitupthru_latency, dur);
7409 }
7410
7411 /*----PeeringState::PeeringMachine Methods-----*/
7412 #undef dout_prefix
7413 #define dout_prefix dpp->gen_prefix(*_dout)
7414
7415 void PeeringState::PeeringMachine::log_enter(const char *state_name)
7416 {
7417 DECLARE_LOCALS;
7418 psdout(5) << "enter " << state_name << dendl;
7419 pl->log_state_enter(state_name);
7420 }
7421
7422 void PeeringState::PeeringMachine::log_exit(const char *state_name, utime_t enter_time)
7423 {
7424 DECLARE_LOCALS;
7425 utime_t dur = ceph_clock_now() - enter_time;
7426 psdout(5) << "exit " << state_name << " " << dur << " " << event_count << " " << event_time << dendl;
7427 pl->log_state_exit(state_name, enter_time, event_count, event_time);
7428 event_count = 0;
7429 event_time = utime_t();
7430 }
7431
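// Render the one-line "pg[...]" summary that appears in OSD debug logs:
// info, up/acting sets, role, last peering reset, past intervals, log
// bounds, on-disk watermarks, state flags, and (when set) the prior
// readable-until upper bound.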
7432 ostream &operator<<(ostream &out, const PeeringState &ps) {
7433 out << "pg[" << ps.info
7434 << " " << pg_vector_string(ps.up);
7435 if (ps.acting != ps.up)
7436 out << "/" << pg_vector_string(ps.acting);
7437 if (ps.is_ec_pg())
7438 out << "p" << ps.get_primary();
7439 if (!ps.async_recovery_targets.empty())
7440 out << " async=[" << ps.async_recovery_targets << "]";
7441 if (!ps.backfill_targets.empty())
7442 out << " backfill=[" << ps.backfill_targets << "]";
7443 out << " r=" << ps.get_role();
7444 out << " lpr=" << ps.get_last_peering_reset();
7445
7446 if (ps.deleting)
7447 out << " DELETING";
7448
7449 if (!ps.past_intervals.empty()) {
7450 out << " pi=[" << ps.past_intervals.get_bounds()
7451 << ")/" << ps.past_intervals.size();
7452 }
7453
7454 if (ps.is_peered()) {
7455 if (ps.last_update_ondisk != ps.info.last_update)
7456 out << " luod=" << ps.last_update_ondisk;
7457 if (ps.last_update_applied != ps.info.last_update)
7458 out << " lua=" << ps.last_update_applied;
7459 }
7460
7461 if (ps.pg_log.get_tail() != ps.info.log_tail ||
7462 ps.pg_log.get_head() != ps.info.last_update)
7463 out << " (info mismatch, " << ps.pg_log.get_log() << ")";
7464
7465 if (!ps.pg_log.get_log().empty()) {
7466     if (ps.pg_log.get_log().log.begin()->version <= ps.pg_log.get_tail()) {
7467 out << " (log bound mismatch, actual=["
7468 << ps.pg_log.get_log().log.begin()->version << ","
7469 << ps.pg_log.get_log().log.rbegin()->version << "]";
7470 out << ")";
7471 }
7472 }
7473
7474 out << " crt=" << ps.pg_log.get_can_rollback_to();
7475
7476 if (ps.last_complete_ondisk != ps.info.last_complete)
7477 out << " lcod " << ps.last_complete_ondisk;
7478
7479 out << " mlcod " << ps.min_last_complete_ondisk;
7480
7481 out << " " << pg_state_string(ps.get_state());
7482 if (ps.should_send_notify())
7483 out << " NOTIFY";
7484
7485 if (ps.prior_readable_until_ub != ceph::signedspan::zero()) {
7486 out << " pruub " << ps.prior_readable_until_ub
7487 << "@" << ps.get_prior_readable_down_osds();
7488 }
7489 return out;
7490 }
7491
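// Recover the emptiest shards first: acting replicas sorted by ascending
// missing-object count, then async recovery targets sorted the same way.
// For example, with acting missing counts {osd.2: 5, osd.1: 12} and an
// async target osd.7 missing 3, the order is [osd.2, osd.1, osd.7].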
7492 std::vector<pg_shard_t> PeeringState::get_replica_recovery_order() const
7493 {
7494 std::vector<std::pair<unsigned int, pg_shard_t>> replicas_by_num_missing,
7495 async_by_num_missing;
7496 replicas_by_num_missing.reserve(get_acting_recovery_backfill().size() - 1);
7497 for (auto &p : get_acting_recovery_backfill()) {
7498 if (p == get_primary()) {
7499 continue;
7500 }
7501 auto pm = get_peer_missing().find(p);
7502     ceph_assert(pm != get_peer_missing().end());
7503 auto nm = pm->second.num_missing();
7504 if (nm != 0) {
7505 if (is_async_recovery_target(p)) {
7506 async_by_num_missing.push_back(make_pair(nm, p));
7507 } else {
7508 replicas_by_num_missing.push_back(make_pair(nm, p));
7509 }
7510 }
7511 }
7512 // sort by number of missing objects, in ascending order.
7513 auto func = [](const std::pair<unsigned int, pg_shard_t> &lhs,
7514 const std::pair<unsigned int, pg_shard_t> &rhs) {
7515 return lhs.first < rhs.first;
7516 };
7517 // acting goes first
7518 std::sort(replicas_by_num_missing.begin(), replicas_by_num_missing.end(), func);
7519 // then async_recovery_targets
7520 std::sort(async_by_num_missing.begin(), async_by_num_missing.end(), func);
7521 replicas_by_num_missing.insert(replicas_by_num_missing.end(),
7522 async_by_num_missing.begin(), async_by_num_missing.end());
7523
7524 std::vector<pg_shard_t> ret;
7525 ret.reserve(replicas_by_num_missing.size());
7526 for (auto p : replicas_by_num_missing) {
7527 ret.push_back(p.second);
7528 }
7529 return ret;
7530 }
7531
7532