1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3
4 #include "PGPeeringEvent.h"
5 #include "common/ceph_releases.h"
6 #include "common/dout.h"
7 #include "PeeringState.h"
8
9 #include "messages/MOSDPGRemove.h"
10 #include "messages/MBackfillReserve.h"
11 #include "messages/MRecoveryReserve.h"
12 #include "messages/MOSDScrubReserve.h"
13 #include "messages/MOSDPGInfo.h"
14 #include "messages/MOSDPGInfo2.h"
15 #include "messages/MOSDPGTrim.h"
16 #include "messages/MOSDPGLog.h"
17 #include "messages/MOSDPGNotify.h"
18 #include "messages/MOSDPGNotify2.h"
19 #include "messages/MOSDPGQuery.h"
20 #include "messages/MOSDPGQuery2.h"
21 #include "messages/MOSDPGLease.h"
22 #include "messages/MOSDPGLeaseAck.h"
23
24 #define dout_context cct
25 #define dout_subsys ceph_subsys_osd
26
27 BufferedRecoveryMessages::BufferedRecoveryMessages(
28 ceph_release_t r,
29 PeeringCtx &ctx)
30 : require_osd_release(r) {
31 // steal messages from ctx
32 message_map.swap(ctx.message_map);
33 }
34
35 void BufferedRecoveryMessages::send_notify(int to, const pg_notify_t &n)
36 {
37 if (require_osd_release >= ceph_release_t::octopus) {
38 spg_t pgid(n.info.pgid.pgid, n.to);
39 send_osd_message(to, make_message<MOSDPGNotify2>(pgid, n));
40 } else {
41 send_osd_message(to, make_message<MOSDPGNotify>(n.epoch_sent, vector{n}));
42 }
43 }
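// A minimal sketch of the release gating above (the PeeringCtx `ctx` and
// notify `n` below are illustrative, not taken from real call sites):
//
//   BufferedRecoveryMessages buf(ceph_release_t::octopus, ctx);
//   buf.send_notify(3 /* osd.3 */, n);  // queues an MOSDPGNotify2
//
// With require_osd_release older than octopus, the same call would instead
// queue a legacy MOSDPGNotify carrying a one-element vector of notifies.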
44
45 void BufferedRecoveryMessages::send_query(
46 int to,
47 spg_t to_spgid,
48 const pg_query_t &q)
49 {
50 if (require_osd_release >= ceph_release_t::octopus) {
51 send_osd_message(to,
52 make_message<MOSDPGQuery2>(to_spgid, q));
53 } else {
54 auto m = make_message<MOSDPGQuery>(
55 q.epoch_sent,
56 MOSDPGQuery::pg_list_t{{to_spgid, q}});
57 send_osd_message(to, m);
58 }
59 }
60
61 void BufferedRecoveryMessages::send_info(
62 int to,
63 spg_t to_spgid,
64 epoch_t min_epoch,
65 epoch_t cur_epoch,
66 const pg_info_t &info,
67 std::optional<pg_lease_t> lease,
68 std::optional<pg_lease_ack_t> lease_ack)
69 {
70 if (require_osd_release >= ceph_release_t::octopus) {
71 send_osd_message(
72 to,
73 make_message<MOSDPGInfo2>(
74 to_spgid,
75 info,
76 cur_epoch,
77 min_epoch,
78 lease,
79 lease_ack)
80 );
81 } else {
82 send_osd_message(
83 to,
84 make_message<MOSDPGInfo>(
85 cur_epoch,
86 vector{pg_notify_t{to_spgid.shard,
87 info.pgid.shard,
88 min_epoch, cur_epoch,
89 info, PastIntervals{}}})
90 );
91 }
92 }
93
94 void PGPool::update(CephContext *cct, OSDMapRef map)
95 {
96 const pg_pool_t *pi = map->get_pg_pool(id);
97 if (!pi) {
98 return; // pool has been deleted
99 }
100 info = *pi;
101 name = map->get_pool_name(id);
102
103 bool updated = false;
104 if ((map->get_epoch() != cached_epoch + 1) ||
105 (pi->get_snap_epoch() == map->get_epoch())) {
106 updated = true;
107 }
108
109 if (info.is_pool_snaps_mode() && updated) {
110 snapc = pi->get_snap_context();
111 }
112 cached_epoch = map->get_epoch();
113 }
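// Worked example of the refresh condition above (map_at_101/map_at_105 are
// illustrative OSDMapRefs): with cached_epoch == 100,
//
//   pool.update(cct, map_at_101);  // refresh snapc only if snap_epoch == 101
//   pool.update(cct, map_at_105);  // epoch gap (102..104 skipped): always refresh
//
// The unconditional refresh on a gap ensures snap changes made in the
// skipped epochs are not missed.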
114
115 /*-------------Peering State Helpers----------------*/
116 #undef dout_prefix
117 #define dout_prefix (dpp->gen_prefix(*_dout))
118 #undef psdout
119 #define psdout(x) ldout(cct, x)
120
121 PeeringState::PeeringState(
122 CephContext *cct,
123 pg_shard_t pg_whoami,
124 spg_t spgid,
125 const PGPool &_pool,
126 OSDMapRef curmap,
127 DoutPrefixProvider *dpp,
128 PeeringListener *pl)
129 : state_history(*pl),
130 cct(cct),
131 spgid(spgid),
132 dpp(dpp),
133 pl(pl),
134 orig_ctx(0),
135 osdmap_ref(curmap),
136 pool(_pool),
137 pg_whoami(pg_whoami),
138 info(spgid),
139 pg_log(cct),
140 missing_loc(spgid, this, dpp, cct),
141 machine(this, cct, spgid, dpp, pl, &state_history)
142 {
143 machine.initiate();
144 }
145
146 void PeeringState::start_handle(PeeringCtx *new_ctx) {
147 ceph_assert(!rctx);
148 ceph_assert(!orig_ctx);
149 orig_ctx = new_ctx;
150 if (new_ctx) {
151 if (messages_pending_flush) {
152 rctx.emplace(*messages_pending_flush, *new_ctx);
153 } else {
154 rctx.emplace(*new_ctx);
155 }
156 rctx->start_time = ceph_clock_now();
157 }
158 }
159
160 void PeeringState::begin_block_outgoing() {
161 ceph_assert(!messages_pending_flush);
162 ceph_assert(orig_ctx);
163 ceph_assert(rctx);
164 messages_pending_flush = BufferedRecoveryMessages(
165 orig_ctx->require_osd_release);
166 rctx.emplace(*messages_pending_flush, *orig_ctx);
167 }
168
169 void PeeringState::clear_blocked_outgoing() {
170 ceph_assert(orig_ctx);
171 ceph_assert(rctx);
172 messages_pending_flush = std::optional<BufferedRecoveryMessages>();
173 }
174
175 void PeeringState::end_block_outgoing() {
176 ceph_assert(messages_pending_flush);
177 ceph_assert(orig_ctx);
178 ceph_assert(rctx);
179
180 orig_ctx->accept_buffered_messages(*messages_pending_flush);
181 rctx.emplace(*orig_ctx);
182 messages_pending_flush = std::optional<BufferedRecoveryMessages>();
183 }
184
185 void PeeringState::end_handle() {
186 if (rctx) {
187 utime_t dur = ceph_clock_now() - rctx->start_time;
188 machine.event_time += dur;
189 }
190
191 machine.event_count++;
192 rctx = std::nullopt;
193 orig_ctx = NULL;
194 }
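// Sketch of how these helpers bracket a peering event (the event-loop shape
// here is illustrative; the real call sites live in the OSD code):
//
//   PeeringCtx ctx(...);
//   ps.start_handle(&ctx);     // wrap ctx, stamp rctx->start_time
//   // ... deliver the event to `machine` ...
//   // during a peering reset, set_last_peering_reset() may call
//   // begin_block_outgoing() to divert messages into
//   // messages_pending_flush; end_block_outgoing() later folds them
//   // back into the original ctx.
//   ps.end_handle();           // accumulate event_time / event_count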
195
196 void PeeringState::check_recovery_sources(const OSDMapRef& osdmap)
197 {
198 /*
199  * check that any peers we are planning to pull (or are currently
200  * pulling) objects from are dealt with.
201 */
202 missing_loc.check_recovery_sources(osdmap);
203 pl->check_recovery_sources(osdmap);
204
205 for (set<pg_shard_t>::iterator i = peer_log_requested.begin();
206 i != peer_log_requested.end();
207 ) {
208 if (!osdmap->is_up(i->osd)) {
209 psdout(10) << "peer_log_requested removing " << *i << dendl;
210 peer_log_requested.erase(i++);
211 } else {
212 ++i;
213 }
214 }
215
216 for (set<pg_shard_t>::iterator i = peer_missing_requested.begin();
217 i != peer_missing_requested.end();
218 ) {
219 if (!osdmap->is_up(i->osd)) {
220 psdout(10) << "peer_missing_requested removing " << *i << dendl;
221 peer_missing_requested.erase(i++);
222 } else {
223 ++i;
224 }
225 }
226 }
227
228 void PeeringState::update_history(const pg_history_t& new_history)
229 {
230 auto mnow = pl->get_mnow();
231 info.history.refresh_prior_readable_until_ub(mnow, prior_readable_until_ub);
232 if (info.history.merge(new_history)) {
233 psdout(20) << __func__ << " advanced history from " << new_history << dendl;
234 dirty_info = true;
235 if (info.history.last_epoch_clean >= info.history.same_interval_since) {
236 psdout(20) << __func__ << " clearing past_intervals" << dendl;
237 past_intervals.clear();
238 dirty_big_info = true;
239 }
240 prior_readable_until_ub = info.history.get_prior_readable_until_ub(mnow);
241 if (prior_readable_until_ub != ceph::signedspan::zero()) {
242 dout(20) << __func__
243 << " prior_readable_until_ub " << prior_readable_until_ub
244 << " (mnow " << mnow << " + "
245 << info.history.prior_readable_until_ub << ")" << dendl;
246 }
247 }
248 pl->on_info_history_change();
249 }
250
251 void PeeringState::purge_strays()
252 {
253 if (is_premerge()) {
254 psdout(10) << "purge_strays " << stray_set << " but premerge, doing nothing"
255 << dendl;
256 return;
257 }
258 if (cct->_conf.get_val<bool>("osd_debug_no_purge_strays")) {
259 return;
260 }
261 psdout(10) << "purge_strays " << stray_set << dendl;
262
263 bool removed = false;
264 for (set<pg_shard_t>::iterator p = stray_set.begin();
265 p != stray_set.end();
266 ++p) {
267 ceph_assert(!is_acting_recovery_backfill(*p));
268 if (get_osdmap()->is_up(p->osd)) {
269 psdout(10) << "sending PGRemove to osd." << *p << dendl;
270 vector<spg_t> to_remove;
271 to_remove.push_back(spg_t(info.pgid.pgid, p->shard));
272 MOSDPGRemove *m = new MOSDPGRemove(
273 get_osdmap_epoch(),
274 to_remove);
275 pl->send_cluster_message(p->osd, m, get_osdmap_epoch());
276 } else {
277 psdout(10) << "not sending PGRemove to down osd." << *p << dendl;
278 }
279 peer_missing.erase(*p);
280 peer_info.erase(*p);
281 missing_loc.remove_stray_recovery_sources(*p);
282 peer_purged.insert(*p);
283 removed = true;
284 }
285
286 // if we removed anyone, update peers (which include peer_info)
287 if (removed)
288 update_heartbeat_peers();
289
290 stray_set.clear();
291
292 // clear _requested maps; we may have to peer() again if we discover
293 // (more) stray content
294 peer_log_requested.clear();
295 peer_missing_requested.clear();
296 }
297
298
299 bool PeeringState::proc_replica_info(
300 pg_shard_t from, const pg_info_t &oinfo, epoch_t send_epoch)
301 {
302 map<pg_shard_t, pg_info_t>::iterator p = peer_info.find(from);
303 if (p != peer_info.end() && p->second.last_update == oinfo.last_update) {
304 psdout(10) << " got dup osd." << from << " info "
305 << oinfo << ", identical to ours" << dendl;
306 return false;
307 }
308
309 if (!get_osdmap()->has_been_up_since(from.osd, send_epoch)) {
310 psdout(10) << " got info " << oinfo << " from down osd." << from
311 << " discarding" << dendl;
312 return false;
313 }
314
315 psdout(10) << " got osd." << from << " " << oinfo << dendl;
316 ceph_assert(is_primary());
317 peer_info[from] = oinfo;
318 might_have_unfound.insert(from);
319
320 update_history(oinfo.history);
321
322 // stray?
323 if (!is_up(from) && !is_acting(from)) {
324 psdout(10) << " osd." << from << " has stray content: " << oinfo << dendl;
325 stray_set.insert(from);
326 if (is_clean()) {
327 purge_strays();
328 }
329 }
330
331 // was this a new info? if so, update peers!
332 if (p == peer_info.end())
333 update_heartbeat_peers();
334
335 return true;
336 }
337
338
339 void PeeringState::remove_down_peer_info(const OSDMapRef &osdmap)
340 {
341 // Remove any downed osds from peer_info
342 bool removed = false;
343 map<pg_shard_t, pg_info_t>::iterator p = peer_info.begin();
344 while (p != peer_info.end()) {
345 if (!osdmap->is_up(p->first.osd)) {
346 psdout(10) << " dropping down osd." << p->first << " info " << p->second << dendl;
347 peer_missing.erase(p->first);
348 peer_log_requested.erase(p->first);
349 peer_missing_requested.erase(p->first);
350 peer_purged.erase(p->first);
351 peer_info.erase(p++);
352 removed = true;
353 } else
354 ++p;
355 }
356
357 // if we removed anyone, update peers (which include peer_info)
358 if (removed)
359 update_heartbeat_peers();
360
361 check_recovery_sources(osdmap);
362 }
363
364 void PeeringState::update_heartbeat_peers()
365 {
366 if (!is_primary())
367 return;
368
369 set<int> new_peers;
370 for (unsigned i=0; i<acting.size(); i++) {
371 if (acting[i] != CRUSH_ITEM_NONE)
372 new_peers.insert(acting[i]);
373 }
374 for (unsigned i=0; i<up.size(); i++) {
375 if (up[i] != CRUSH_ITEM_NONE)
376 new_peers.insert(up[i]);
377 }
378 for (map<pg_shard_t,pg_info_t>::iterator p = peer_info.begin();
379 p != peer_info.end();
380 ++p) {
381 new_peers.insert(p->first.osd);
382 }
383 pl->update_heartbeat_peers(std::move(new_peers));
384 }
385
386 void PeeringState::write_if_dirty(ObjectStore::Transaction& t)
387 {
388 pl->prepare_write(
389 info,
390 last_written_info,
391 past_intervals,
392 pg_log,
393 dirty_info,
394 dirty_big_info,
395 last_persisted_osdmap < get_osdmap_epoch(),
396 t);
397 if (dirty_info || dirty_big_info) {
398 last_persisted_osdmap = get_osdmap_epoch();
399 last_written_info = info;
400 dirty_info = false;
401 dirty_big_info = false;
402 }
403 }
404
405 void PeeringState::advance_map(
406 OSDMapRef osdmap, OSDMapRef lastmap,
407 vector<int>& newup, int up_primary,
408 vector<int>& newacting, int acting_primary,
409 PeeringCtx &rctx)
410 {
411 ceph_assert(lastmap == osdmap_ref);
412 psdout(10) << "handle_advance_map "
413 << newup << "/" << newacting
414 << " -- " << up_primary << "/" << acting_primary
415 << dendl;
416
417 update_osdmap_ref(osdmap);
418 pool.update(cct, osdmap);
419
420 AdvMap evt(
421 osdmap, lastmap, newup, up_primary,
422 newacting, acting_primary);
423 handle_event(evt, &rctx);
424 if (pool.info.last_change == osdmap_ref->get_epoch()) {
425 pl->on_pool_change();
426 }
427 readable_interval = pool.get_readable_interval();
428 last_require_osd_release = osdmap->require_osd_release;
429 }
430
431 void PeeringState::activate_map(PeeringCtx &rctx)
432 {
433 psdout(10) << __func__ << dendl;
434 ActMap evt;
435 handle_event(evt, &rctx);
436 if (osdmap_ref->get_epoch() - last_persisted_osdmap >
437 cct->_conf->osd_pg_epoch_persisted_max_stale) {
438 psdout(20) << __func__ << ": Dirtying info: last_persisted is "
439 << last_persisted_osdmap
440 << " while current is " << osdmap_ref->get_epoch() << dendl;
441 dirty_info = true;
442 } else {
443 psdout(20) << __func__ << ": Not dirtying info: last_persisted is "
444 << last_persisted_osdmap
445 << " while current is " << osdmap_ref->get_epoch() << dendl;
446 }
447 write_if_dirty(rctx.transaction);
448
449 if (get_osdmap()->check_new_blacklist_entries()) {
450 pl->check_blacklisted_watchers();
451 }
452 }
453
454 void PeeringState::set_last_peering_reset()
455 {
456 psdout(20) << "set_last_peering_reset " << get_osdmap_epoch() << dendl;
457 if (last_peering_reset != get_osdmap_epoch()) {
458 last_peering_reset = get_osdmap_epoch();
459 psdout(10) << "Clearing blocked outgoing recovery messages" << dendl;
460 clear_blocked_outgoing();
461 if (!pl->try_flush_or_schedule_async()) {
462 psdout(10) << "Beginning to block outgoing recovery messages" << dendl;
463 begin_block_outgoing();
464 } else {
465 psdout(10) << "Not blocking outgoing recovery messages" << dendl;
466 }
467 }
468 }
469
470 void PeeringState::complete_flush()
471 {
472 flushes_in_progress--;
473 if (flushes_in_progress == 0) {
474 pl->on_flushed();
475 }
476 }
477
478 void PeeringState::check_full_transition(OSDMapRef lastmap, OSDMapRef osdmap)
479 {
480 const pg_pool_t *pi = osdmap->get_pg_pool(info.pgid.pool());
481 if (!pi) {
482 return; // pool deleted
483 }
484 bool changed = false;
485 if (pi->has_flag(pg_pool_t::FLAG_FULL)) {
486 const pg_pool_t *opi = lastmap->get_pg_pool(info.pgid.pool());
487 if (!opi || !opi->has_flag(pg_pool_t::FLAG_FULL)) {
488 psdout(10) << " pool was marked full in " << osdmap->get_epoch() << dendl;
489 changed = true;
490 }
491 }
492 if (changed) {
493 info.history.last_epoch_marked_full = osdmap->get_epoch();
494 dirty_info = true;
495 }
496 }
497
498 bool PeeringState::should_restart_peering(
499 int newupprimary,
500 int newactingprimary,
501 const vector<int>& newup,
502 const vector<int>& newacting,
503 OSDMapRef lastmap,
504 OSDMapRef osdmap)
505 {
506 if (PastIntervals::is_new_interval(
507 primary.osd,
508 newactingprimary,
509 acting,
510 newacting,
511 up_primary.osd,
512 newupprimary,
513 up,
514 newup,
515 osdmap.get(),
516 lastmap.get(),
517 info.pgid.pgid)) {
518 psdout(20) << "new interval newup " << newup
519 << " newacting " << newacting << dendl;
520 return true;
521 }
522 if (!lastmap->is_up(pg_whoami.osd) && osdmap->is_up(pg_whoami.osd)) {
523 psdout(10) << __func__ << " osd transitioned from down -> up"
524 << dendl;
525 return true;
526 }
527 return false;
528 }
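// Example (illustrative values): with acting == [3,1,2] and a new map where
// newacting == [1,3,2], PastIntervals::is_new_interval() sees a different
// acting vector and this returns true, restarting peering. A map change
// that touches only unrelated OSDs returns false here, unless this OSD
// itself just transitioned from down to up.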
529
530 /* Called before initializing peering during advance_map */
531 void PeeringState::start_peering_interval(
532 const OSDMapRef lastmap,
533 const vector<int>& newup, int new_up_primary,
534 const vector<int>& newacting, int new_acting_primary,
535 ObjectStore::Transaction &t)
536 {
537 const OSDMapRef osdmap = get_osdmap();
538
539 set_last_peering_reset();
540
541 vector<int> oldacting, oldup;
542 int oldrole = get_role();
543
544 if (is_primary()) {
545 pl->clear_ready_to_merge();
546 }
547
548
549 pg_shard_t old_acting_primary = get_primary();
550 pg_shard_t old_up_primary = up_primary;
551 bool was_old_primary = is_primary();
552 bool was_old_nonprimary = is_nonprimary();
553
554 acting.swap(oldacting);
555 up.swap(oldup);
556 init_primary_up_acting(
557 newup,
558 newacting,
559 new_up_primary,
560 new_acting_primary);
561
562 if (info.stats.up != up ||
563 info.stats.acting != acting ||
564 info.stats.up_primary != new_up_primary ||
565 info.stats.acting_primary != new_acting_primary) {
566 info.stats.up = up;
567 info.stats.up_primary = new_up_primary;
568 info.stats.acting = acting;
569 info.stats.acting_primary = new_acting_primary;
570 info.stats.mapping_epoch = osdmap->get_epoch();
571 }
572
573 pl->clear_publish_stats();
574
575 // This will now be remapped during a backfill in cases
576 // where it would not have been before.
577 if (up != acting)
578 state_set(PG_STATE_REMAPPED);
579 else
580 state_clear(PG_STATE_REMAPPED);
581
582 int role = osdmap->calc_pg_role(pg_whoami, acting);
583 set_role(role);
584
585 // did acting, up, primary|acker change?
586 if (!lastmap) {
587 psdout(10) << " no lastmap" << dendl;
588 dirty_info = true;
589 dirty_big_info = true;
590 info.history.same_interval_since = osdmap->get_epoch();
591 } else {
592 std::stringstream debug;
593 ceph_assert(info.history.same_interval_since != 0);
594 bool new_interval = PastIntervals::check_new_interval(
595 old_acting_primary.osd,
596 new_acting_primary,
597 oldacting, newacting,
598 old_up_primary.osd,
599 new_up_primary,
600 oldup, newup,
601 info.history.same_interval_since,
602 info.history.last_epoch_clean,
603 osdmap.get(),
604 lastmap.get(),
605 info.pgid.pgid,
606 missing_loc.get_recoverable_predicate(),
607 &past_intervals,
608 &debug);
609 psdout(10) << __func__ << ": check_new_interval output: "
610 << debug.str() << dendl;
611 if (new_interval) {
612 if (osdmap->get_epoch() == pl->oldest_stored_osdmap() &&
613 info.history.last_epoch_clean < osdmap->get_epoch()) {
614 psdout(10) << " map gap, clearing past_intervals and faking" << dendl;
615 // our information is incomplete and useless; someone else was clean
616 // after everything we know about, since the osdmaps have been trimmed.
617 past_intervals.clear();
618 } else {
619 psdout(10) << " noting past " << past_intervals << dendl;
620 }
621 dirty_info = true;
622 dirty_big_info = true;
623 info.history.same_interval_since = osdmap->get_epoch();
624 if (osdmap->have_pg_pool(info.pgid.pgid.pool()) &&
625 info.pgid.pgid.is_split(lastmap->get_pg_num(info.pgid.pgid.pool()),
626 osdmap->get_pg_num(info.pgid.pgid.pool()),
627 nullptr)) {
628 info.history.last_epoch_split = osdmap->get_epoch();
629 }
630 }
631 }
632
633 if (old_up_primary != up_primary ||
634 oldup != up) {
635 info.history.same_up_since = osdmap->get_epoch();
636 }
637 // this comparison includes primary rank via pg_shard_t
638 if (old_acting_primary != get_primary()) {
639 info.history.same_primary_since = osdmap->get_epoch();
640 }
641
642 on_new_interval();
643 pl->on_info_history_change();
644
645 psdout(1) << __func__ << " up " << oldup << " -> " << up
646 << ", acting " << oldacting << " -> " << acting
647 << ", acting_primary " << old_acting_primary << " -> "
648 << new_acting_primary
649 << ", up_primary " << old_up_primary << " -> " << new_up_primary
650 << ", role " << oldrole << " -> " << role
651 << ", features acting " << acting_features
652 << " upacting " << upacting_features
653 << dendl;
654
655 // deactivate.
656 state_clear(PG_STATE_ACTIVE);
657 state_clear(PG_STATE_PEERED);
658 state_clear(PG_STATE_PREMERGE);
659 state_clear(PG_STATE_DOWN);
660 state_clear(PG_STATE_RECOVERY_WAIT);
661 state_clear(PG_STATE_RECOVERY_TOOFULL);
662 state_clear(PG_STATE_RECOVERING);
663
664 peer_purged.clear();
665 acting_recovery_backfill.clear();
666
667 // reset primary/replica state?
668 if (was_old_primary || is_primary()) {
669 pl->clear_want_pg_temp();
670 } else if (was_old_nonprimary || is_nonprimary()) {
671 pl->clear_want_pg_temp();
672 }
673 clear_primary_state();
674
675 pl->on_change(t);
676
677 ceph_assert(!deleting);
678
679 // should we tell the primary we are here?
680 send_notify = !is_primary();
681
682 if (role != oldrole ||
683 was_old_primary != is_primary()) {
684 // did primary change?
685 if (was_old_primary != is_primary()) {
686 state_clear(PG_STATE_CLEAN);
687 }
688
689 pl->on_role_change();
690 } else {
691 // no role change.
692 // did primary change?
693 if (get_primary() != old_acting_primary) {
694 psdout(10) << oldacting << " -> " << acting
695 << ", acting primary "
696 << old_acting_primary << " -> " << get_primary()
697 << dendl;
698 } else {
699 // primary is the same.
700 if (is_primary()) {
701 // i am (still) primary. but my replica set changed.
702 state_clear(PG_STATE_CLEAN);
703
704 psdout(10) << oldacting << " -> " << acting
705 << ", replicas changed" << dendl;
706 }
707 }
708 }
709
710 if (acting.empty() && !up.empty() && up_primary == pg_whoami) {
711 psdout(10) << " acting empty, but i am up[0], clearing pg_temp" << dendl;
712 pl->queue_want_pg_temp(acting);
713 }
714 }
715
716 void PeeringState::on_new_interval()
717 {
718 dout(20) << __func__ << dendl;
719 const OSDMapRef osdmap = get_osdmap();
720
721 // initialize features
722 acting_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
723 upacting_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
724 for (vector<int>::iterator p = acting.begin(); p != acting.end(); ++p) {
725 if (*p == CRUSH_ITEM_NONE)
726 continue;
727 uint64_t f = osdmap->get_xinfo(*p).features;
728 acting_features &= f;
729 upacting_features &= f;
730 }
731 for (vector<int>::iterator p = up.begin(); p != up.end(); ++p) {
732 if (*p == CRUSH_ITEM_NONE)
733 continue;
734 upacting_features &= osdmap->get_xinfo(*p).features;
735 }
736 psdout(20) << __func__ << " upacting_features 0x" << std::hex
737 << upacting_features << std::dec
738 << " from " << acting << "+" << up << dendl;
739
740 psdout(20) << __func__ << " checking missing set deletes flag. missing = "
741 << get_pg_log().get_missing() << dendl;
742
743 if (!pg_log.get_missing().may_include_deletes &&
744 !perform_deletes_during_peering()) {
745 pl->rebuild_missing_set_with_deletes(pg_log);
746 }
747 ceph_assert(
748 pg_log.get_missing().may_include_deletes ==
749 !perform_deletes_during_peering());
750
751 init_hb_stamps();
752
753 // update lease bounds for a new interval
754 auto mnow = pl->get_mnow();
755 prior_readable_until_ub = std::max(prior_readable_until_ub,
756 readable_until_ub);
757 prior_readable_until_ub = info.history.refresh_prior_readable_until_ub(
758 mnow, prior_readable_until_ub);
759 psdout(10) << __func__ << " prior_readable_until_ub "
760 << prior_readable_until_ub << " (mnow " << mnow << " + "
761 << info.history.prior_readable_until_ub << ")" << dendl;
762 prior_readable_down_osds.clear(); // we populate this when we build the priorset
763
764 readable_until =
765 readable_until_ub =
766 readable_until_ub_sent =
767 readable_until_ub_from_primary = ceph::signedspan::zero();
768
769 acting_readable_until_ub.clear();
770 if (is_primary()) {
771 acting_readable_until_ub.resize(acting.size(), ceph::signedspan::zero());
772 }
773
774 pl->on_new_interval();
775 }
776
777 void PeeringState::init_primary_up_acting(
778 const vector<int> &newup,
779 const vector<int> &newacting,
780 int new_up_primary,
781 int new_acting_primary)
782 {
783 actingset.clear();
784 acting = newacting;
785 for (uint8_t i = 0; i < acting.size(); ++i) {
786 if (acting[i] != CRUSH_ITEM_NONE)
787 actingset.insert(
788 pg_shard_t(
789 acting[i],
790 pool.info.is_erasure() ? shard_id_t(i) : shard_id_t::NO_SHARD));
791 }
792 upset.clear();
793 up = newup;
794 for (uint8_t i = 0; i < up.size(); ++i) {
795 if (up[i] != CRUSH_ITEM_NONE)
796 upset.insert(
797 pg_shard_t(
798 up[i],
799 pool.info.is_erasure() ? shard_id_t(i) : shard_id_t::NO_SHARD));
800 }
801 if (!pool.info.is_erasure()) {
802 // replicated
803 up_primary = pg_shard_t(new_up_primary, shard_id_t::NO_SHARD);
804 primary = pg_shard_t(new_acting_primary, shard_id_t::NO_SHARD);
805 } else {
806 // erasure
807 up_primary = pg_shard_t();
808 primary = pg_shard_t();
809 for (uint8_t i = 0; i < up.size(); ++i) {
810 if (up[i] == new_up_primary) {
811 up_primary = pg_shard_t(up[i], shard_id_t(i));
812 break;
813 }
814 }
815 for (uint8_t i = 0; i < acting.size(); ++i) {
816 if (acting[i] == new_acting_primary) {
817 primary = pg_shard_t(acting[i], shard_id_t(i));
818 break;
819 }
820 }
821 ceph_assert(up_primary.osd == new_up_primary);
822 ceph_assert(primary.osd == new_acting_primary);
823 }
824 }
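// Example (illustrative values): for an erasure-coded pool with
// newacting == [4, CRUSH_ITEM_NONE, 7] and new_acting_primary == 4,
// actingset becomes { 4(0), 7(2) } (shard ids follow vector positions) and
// primary == pg_shard_t(4, shard_id_t(0)). For a replicated pool the same
// inputs would use shard_id_t::NO_SHARD throughout, giving
// primary == pg_shard_t(4, NO_SHARD).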
825
826 void PeeringState::init_hb_stamps()
827 {
828 if (is_primary()) {
829 // we care about all other osds in the acting set
830 hb_stamps.resize(acting.size() - 1);
831 unsigned i = 0;
832 for (auto p : acting) {
833 if (p == CRUSH_ITEM_NONE || p == get_primary().osd) {
834 continue;
835 }
836 hb_stamps[i++] = pl->get_hb_stamps(p);
837 }
838 hb_stamps.resize(i);
839 } else if (is_nonprimary()) {
840 // we care about just the primary
841 hb_stamps.resize(1);
842 hb_stamps[0] = pl->get_hb_stamps(get_primary().osd);
843 } else {
844 hb_stamps.clear();
845 }
846 dout(10) << __func__ << " now " << hb_stamps << dendl;
847 }
848
849
850 void PeeringState::clear_recovery_state()
851 {
852 async_recovery_targets.clear();
853 backfill_targets.clear();
854 }
855
856 void PeeringState::clear_primary_state()
857 {
858 psdout(10) << "clear_primary_state" << dendl;
859
860 // clear peering state
861 stray_set.clear();
862 peer_log_requested.clear();
863 peer_missing_requested.clear();
864 peer_info.clear();
865 peer_bytes.clear();
866 peer_missing.clear();
867 peer_last_complete_ondisk.clear();
868 peer_activated.clear();
869 min_last_complete_ondisk = eversion_t();
870 pg_trim_to = eversion_t();
871 might_have_unfound.clear();
872 need_up_thru = false;
873 missing_loc.clear();
874 pg_log.reset_recovery_pointers();
875
876 clear_recovery_state();
877
878 last_update_ondisk = eversion_t();
879 missing_loc.clear();
880 pl->clear_primary_state();
881 }
882
883 /// return [start,end) bounds for required past_intervals
884 static pair<epoch_t, epoch_t> get_required_past_interval_bounds(
885 const pg_info_t &info,
886 epoch_t oldest_map) {
887 epoch_t start = std::max(
888 info.history.last_epoch_clean ? info.history.last_epoch_clean :
889 info.history.epoch_pool_created,
890 oldest_map);
891 epoch_t end = std::max(
892 info.history.same_interval_since,
893 info.history.epoch_pool_created);
894 return make_pair(start, end);
895 }
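// Worked example: with last_epoch_clean == 880, epoch_pool_created == 500,
// same_interval_since == 900 and oldest_map == 700, this returns
// [max(880, 700), max(900, 500)) == [880, 900). Had the OSD trimmed maps
// up through epoch 890, the start would be clamped up to 890 instead.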
896
897
898 void PeeringState::check_past_interval_bounds() const
899 {
900 auto oldest_epoch = pl->oldest_stored_osdmap();
901 auto rpib = get_required_past_interval_bounds(
902 info,
903 oldest_epoch);
904 if (rpib.first >= rpib.second) {
905 // do not warn if the start bound is dictated by oldest_map; the
906 // past intervals are presumably appropriate given the pg info.
907 if (!past_intervals.empty() &&
908 rpib.first > oldest_epoch) {
909 pl->get_clog_error() << info.pgid << " required past_interval bounds are"
910 << " empty [" << rpib << ") but past_intervals is not: "
911 << past_intervals;
912 derr << info.pgid << " required past_interval bounds are"
913 << " empty [" << rpib << ") but past_intervals is not: "
914 << past_intervals << dendl;
915 }
916 } else {
917 if (past_intervals.empty()) {
918 pl->get_clog_error() << info.pgid << " required past_interval bounds are"
919 << " not empty [" << rpib << ") but past_intervals "
920 << past_intervals << " is empty";
921 derr << info.pgid << " required past_interval bounds are"
922 << " not empty [" << rpib << ") but past_intervals "
923 << past_intervals << " is empty" << dendl;
924 ceph_assert(!past_intervals.empty());
925 }
926
927 auto apib = past_intervals.get_bounds();
928 if (apib.first > rpib.first) {
929 pl->get_clog_error() << info.pgid << " past_intervals [" << apib
930 << ") start interval does not contain the required"
931 << " bound [" << rpib << ") start";
932 derr << info.pgid << " past_intervals [" << apib
933 << ") start interval does not contain the required"
934 << " bound [" << rpib << ") start" << dendl;
935 ceph_abort_msg("past_interval start interval mismatch");
936 }
937 if (apib.second != rpib.second) {
938     pl->get_clog_error() << info.pgid << " past_interval bound [" << apib
939 				 << ") end does not match required [" << rpib
940 				 << ") end";
941     derr << info.pgid << " past_interval bound [" << apib
942 	 << ") end does not match required [" << rpib
943 	 << ") end" << dendl;
944 ceph_abort_msg("past_interval end mismatch");
945 }
946 }
947 }
948
949 int PeeringState::clamp_recovery_priority(int priority, int pool_recovery_priority, int max)
950 {
951 static_assert(OSD_RECOVERY_PRIORITY_MIN < OSD_RECOVERY_PRIORITY_MAX, "Invalid priority range");
952 static_assert(OSD_RECOVERY_PRIORITY_MIN >= 0, "Priority range must match unsigned type");
953
954 ceph_assert(max <= OSD_RECOVERY_PRIORITY_MAX);
955
956 // User can't set this too high anymore, but might be a legacy value
957 if (pool_recovery_priority > OSD_POOL_PRIORITY_MAX)
958 pool_recovery_priority = OSD_POOL_PRIORITY_MAX;
959 if (pool_recovery_priority < OSD_POOL_PRIORITY_MIN)
960 pool_recovery_priority = OSD_POOL_PRIORITY_MIN;
961   // Shift the range [min, max] to [0, max - min]
962 pool_recovery_priority += (0 - OSD_POOL_PRIORITY_MIN);
963 ceph_assert(pool_recovery_priority >= 0 && pool_recovery_priority <= (OSD_POOL_PRIORITY_MAX - OSD_POOL_PRIORITY_MIN));
964
965 priority += pool_recovery_priority;
966
967 // Clamp to valid range
968 if (priority > max) {
969 return max;
970 } else if (priority < OSD_RECOVERY_PRIORITY_MIN) {
971 return OSD_RECOVERY_PRIORITY_MIN;
972 } else {
973 return priority;
974 }
975 }
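// Worked example (assuming the usual constants: OSD_POOL_PRIORITY_MIN == -10
// and OSD_RECOVERY_PRIORITY_MIN == 0, so the shift adds 10): a pool
// recovery_priority of -3 becomes 7, and
//
//   clamp_recovery_priority(180, -3, 253);  // 180 + 7 = 187, within range
//
// returns 187. A legacy pool value of 50 is first clamped to
// OSD_POOL_PRIORITY_MAX before the shift is applied.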
976
977 unsigned PeeringState::get_recovery_priority()
978 {
979 // a higher value -> a higher priority
980 int ret = OSD_RECOVERY_PRIORITY_BASE;
981 int base = ret;
982
983 if (state & PG_STATE_FORCED_RECOVERY) {
984 ret = OSD_RECOVERY_PRIORITY_FORCED;
985 } else {
986 // XXX: This priority boost isn't so much about inactive, but about data-at-risk
987 if (is_degraded() && info.stats.avail_no_missing.size() < pool.info.min_size) {
988 base = OSD_RECOVERY_INACTIVE_PRIORITY_BASE;
989 // inactive: no. of replicas < min_size, highest priority since it blocks IO
990 ret = base + (pool.info.min_size - info.stats.avail_no_missing.size());
991 }
992
993 int64_t pool_recovery_priority = 0;
994 pool.info.opts.get(pool_opts_t::RECOVERY_PRIORITY, &pool_recovery_priority);
995
996 ret = clamp_recovery_priority(ret, pool_recovery_priority, max_prio_map[base]);
997 }
998 psdout(20) << __func__ << " recovery priority is " << ret << dendl;
999 return static_cast<unsigned>(ret);
1000 }
1001
1002 unsigned PeeringState::get_backfill_priority()
1003 {
1004 // a higher value -> a higher priority
1005 int ret = OSD_BACKFILL_PRIORITY_BASE;
1006 int base = ret;
1007
1008 if (state & PG_STATE_FORCED_BACKFILL) {
1009 ret = OSD_BACKFILL_PRIORITY_FORCED;
1010 } else {
1011 if (acting.size() < pool.info.min_size) {
1012 base = OSD_BACKFILL_INACTIVE_PRIORITY_BASE;
1013 // inactive: no. of replicas < min_size, highest priority since it blocks IO
1014 ret = base + (pool.info.min_size - acting.size());
1015
1016 } else if (is_undersized()) {
1017 // undersized: OSD_BACKFILL_DEGRADED_PRIORITY_BASE + num missing replicas
1018 ceph_assert(pool.info.size > actingset.size());
1019 base = OSD_BACKFILL_DEGRADED_PRIORITY_BASE;
1020 ret = base + (pool.info.size - actingset.size());
1021
1022 } else if (is_degraded()) {
1023 // degraded: baseline degraded
1024 base = ret = OSD_BACKFILL_DEGRADED_PRIORITY_BASE;
1025 }
1026
1027 // Adjust with pool's recovery priority
1028 int64_t pool_recovery_priority = 0;
1029 pool.info.opts.get(pool_opts_t::RECOVERY_PRIORITY, &pool_recovery_priority);
1030
1031 ret = clamp_recovery_priority(ret, pool_recovery_priority, max_prio_map[base]);
1032 }
1033
1034 psdout(20) << __func__ << " backfill priority is " << ret << dendl;
1035 return static_cast<unsigned>(ret);
1036 }
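// Example of the resulting tiers (assuming the usual constants, e.g.
// OSD_BACKFILL_PRIORITY_BASE == 100, OSD_BACKFILL_DEGRADED_PRIORITY_BASE
// == 140, OSD_BACKFILL_INACTIVE_PRIORITY_BASE == 220): a size-3/min_size-2
// pool down to one acting member scores 220 + (2 - 1) = 221; merely
// undersized with two of three shards scores 140 + (3 - 2) = 141; degraded
// but full-sized scores 140; plain backfill stays at 100 — all before the
// pool's recovery-priority adjustment and clamping above.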
1037
1038 unsigned PeeringState::get_delete_priority()
1039 {
1040 auto state = get_osdmap()->get_state(pg_whoami.osd);
1041 if (state & (CEPH_OSD_BACKFILLFULL |
1042 CEPH_OSD_FULL)) {
1043 return OSD_DELETE_PRIORITY_FULL;
1044 } else if (state & CEPH_OSD_NEARFULL) {
1045 return OSD_DELETE_PRIORITY_FULLISH;
1046 } else {
1047 return OSD_DELETE_PRIORITY_NORMAL;
1048 }
1049 }
1050
1051 bool PeeringState::set_force_recovery(bool b)
1052 {
1053 bool did = false;
1054 if (b) {
1055 if (!(state & PG_STATE_FORCED_RECOVERY) &&
1056 (state & (PG_STATE_DEGRADED |
1057 PG_STATE_RECOVERY_WAIT |
1058 PG_STATE_RECOVERING))) {
1059 psdout(20) << __func__ << " set" << dendl;
1060 state_set(PG_STATE_FORCED_RECOVERY);
1061 pl->publish_stats_to_osd();
1062 did = true;
1063 }
1064 } else if (state & PG_STATE_FORCED_RECOVERY) {
1065 psdout(20) << __func__ << " clear" << dendl;
1066 state_clear(PG_STATE_FORCED_RECOVERY);
1067 pl->publish_stats_to_osd();
1068 did = true;
1069 }
1070 if (did) {
1071 psdout(20) << __func__ << " state " << get_current_state()
1072 << dendl;
1073 pl->update_local_background_io_priority(get_recovery_priority());
1074 }
1075 return did;
1076 }
1077
1078 bool PeeringState::set_force_backfill(bool b)
1079 {
1080 bool did = false;
1081 if (b) {
1082 if (!(state & PG_STATE_FORCED_BACKFILL) &&
1083 (state & (PG_STATE_DEGRADED |
1084 PG_STATE_BACKFILL_WAIT |
1085 PG_STATE_BACKFILLING))) {
1086 psdout(10) << __func__ << " set" << dendl;
1087 state_set(PG_STATE_FORCED_BACKFILL);
1088 pl->publish_stats_to_osd();
1089 did = true;
1090 }
1091 } else if (state & PG_STATE_FORCED_BACKFILL) {
1092 psdout(10) << __func__ << " clear" << dendl;
1093 state_clear(PG_STATE_FORCED_BACKFILL);
1094 pl->publish_stats_to_osd();
1095 did = true;
1096 }
1097 if (did) {
1098 psdout(20) << __func__ << " state " << get_current_state()
1099 << dendl;
1100 pl->update_local_background_io_priority(get_backfill_priority());
1101 }
1102 return did;
1103 }
1104
1105 void PeeringState::schedule_renew_lease()
1106 {
1107 pl->schedule_renew_lease(
1108 last_peering_reset,
1109 readable_interval / 2);
1110 }
1111
1112 void PeeringState::send_lease()
1113 {
1114 epoch_t epoch = pl->get_osdmap_epoch();
1115 for (auto peer : actingset) {
1116 if (peer == pg_whoami) {
1117 continue;
1118 }
1119 pl->send_cluster_message(
1120 peer.osd,
1121 new MOSDPGLease(epoch,
1122 spg_t(spgid.pgid, peer.shard),
1123 get_lease()),
1124 epoch);
1125 }
1126 }
1127
1128 void PeeringState::proc_lease(const pg_lease_t& l)
1129 {
1130 if (!HAVE_FEATURE(upacting_features, SERVER_OCTOPUS)) {
1131 psdout(20) << __func__ << " no-op, upacting_features 0x" << std::hex
1132 << upacting_features << std::dec
1133 << " does not include SERVER_OCTOPUS" << dendl;
1134 return;
1135 }
1136 if (!is_nonprimary()) {
1137 psdout(20) << __func__ << " no-op, !nonprimary" << dendl;
1138 return;
1139 }
1140 psdout(10) << __func__ << " " << l << dendl;
1141 if (l.readable_until_ub > readable_until_ub_from_primary) {
1142 readable_until_ub_from_primary = l.readable_until_ub;
1143 }
1144
1145 ceph::signedspan ru = ceph::signedspan::zero();
1146 if (l.readable_until != ceph::signedspan::zero() &&
1147 hb_stamps[0]->peer_clock_delta_ub) {
1148 ru = l.readable_until - *hb_stamps[0]->peer_clock_delta_ub;
1149 psdout(20) << " peer_clock_delta_ub " << *hb_stamps[0]->peer_clock_delta_ub
1150 << " -> ru " << ru << dendl;
1151 }
1152 if (ru > readable_until) {
1153 readable_until = ru;
1154 psdout(20) << __func__ << " readable_until now " << readable_until << dendl;
1155 // NOTE: if we ever decide to block/queue ops on the replica,
1156 // we'll need to wake them up here.
1157 }
1158
1159 ceph::signedspan ruub;
1160 if (hb_stamps[0]->peer_clock_delta_lb) {
1161 ruub = l.readable_until_ub - *hb_stamps[0]->peer_clock_delta_lb;
1162 psdout(20) << " peer_clock_delta_lb " << *hb_stamps[0]->peer_clock_delta_lb
1163 << " -> ruub " << ruub << dendl;
1164 } else {
1165 ruub = pl->get_mnow() + l.interval;
1166 psdout(20) << " no peer_clock_delta_lb -> ruub " << ruub << dendl;
1167 }
1168 if (ruub > readable_until_ub) {
1169 readable_until_ub = ruub;
1170 psdout(20) << __func__ << " readable_until_ub now " << readable_until_ub
1171 << dendl;
1172 }
1173 }
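// Example of the clock-delta arithmetic above (illustrative numbers): if
// the primary's lease says readable_until == 125s on its own clock, and our
// upper bound on (primary_clock - local_clock) is +2s, the conservative
// local value is ru = 125s - 2s = 123s. The upper bound ruub instead
// subtracts the lower bound on the delta (or, lacking one, falls back to
// mnow + interval), so it errs on the large side.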
1174
1175 void PeeringState::proc_lease_ack(int from, const pg_lease_ack_t& a)
1176 {
1177 if (!HAVE_FEATURE(upacting_features, SERVER_OCTOPUS)) {
1178 return;
1179 }
1180 auto now = pl->get_mnow();
1181 bool was_min = false;
1182 for (unsigned i = 0; i < acting.size(); ++i) {
1183 if (from == acting[i]) {
1184 // the lease_ack value is based on the primary's clock
1185 if (a.readable_until_ub > acting_readable_until_ub[i]) {
1186 if (acting_readable_until_ub[i] == readable_until) {
1187 was_min = true;
1188 }
1189 acting_readable_until_ub[i] = a.readable_until_ub;
1190 break;
1191 }
1192 }
1193 }
1194 if (was_min) {
1195 auto old_ru = readable_until;
1196 recalc_readable_until();
1197 if (now < old_ru) {
1198 pl->recheck_readable();
1199 }
1200 }
1201 }
1202
1203 void PeeringState::proc_renew_lease()
1204 {
1205 if (!HAVE_FEATURE(upacting_features, SERVER_OCTOPUS)) {
1206 return;
1207 }
1208 renew_lease(pl->get_mnow());
1209 send_lease();
1210 schedule_renew_lease();
1211 }
1212
1213 void PeeringState::recalc_readable_until()
1214 {
1215   ceph_assert(is_primary());
1216 ceph::signedspan min = readable_until_ub_sent;
1217 for (unsigned i = 0; i < acting.size(); ++i) {
1218 if (acting[i] == pg_whoami.osd || acting[i] == CRUSH_ITEM_NONE) {
1219 continue;
1220 }
1221 dout(20) << __func__ << " peer osd." << acting[i]
1222 << " ruub " << acting_readable_until_ub[i] << dendl;
1223 if (acting_readable_until_ub[i] < min) {
1224 min = acting_readable_until_ub[i];
1225 }
1226 }
1227 readable_until = min;
1228 readable_until_ub = min;
1229 dout(20) << __func__ << " readable_until[_ub] " << readable_until
1230 << " (sent " << readable_until_ub_sent << ")" << dendl;
1231 }
1232
1233 bool PeeringState::check_prior_readable_down_osds(const OSDMapRef& map)
1234 {
1235 if (!HAVE_FEATURE(upacting_features, SERVER_OCTOPUS)) {
1236 return false;
1237 }
1238 bool changed = false;
1239 auto p = prior_readable_down_osds.begin();
1240 while (p != prior_readable_down_osds.end()) {
1241 if (map->is_dead(*p)) {
1242 dout(10) << __func__ << " prior_readable_down_osds osd." << *p
1243 << " is dead as of epoch " << map->get_epoch()
1244 << dendl;
1245 p = prior_readable_down_osds.erase(p);
1246 changed = true;
1247 } else {
1248 ++p;
1249 }
1250 }
1251 if (changed && prior_readable_down_osds.empty()) {
1252 psdout(10) << " empty prior_readable_down_osds, clearing ub" << dendl;
1253 clear_prior_readable_until_ub();
1254 return true;
1255 }
1256 return false;
1257 }
1258
1259 bool PeeringState::adjust_need_up_thru(const OSDMapRef osdmap)
1260 {
1261 epoch_t up_thru = osdmap->get_up_thru(pg_whoami.osd);
1262 if (need_up_thru &&
1263 up_thru >= info.history.same_interval_since) {
1264 psdout(10) << "adjust_need_up_thru now "
1265 << up_thru << ", need_up_thru now false" << dendl;
1266 need_up_thru = false;
1267 return true;
1268 }
1269 return false;
1270 }
1271
1272 PastIntervals::PriorSet PeeringState::build_prior()
1273 {
1274 if (1) {
1275 // sanity check
1276 for (map<pg_shard_t,pg_info_t>::iterator it = peer_info.begin();
1277 it != peer_info.end();
1278 ++it) {
1279 ceph_assert(info.history.last_epoch_started >=
1280 it->second.history.last_epoch_started);
1281 }
1282 }
1283
1284 const OSDMap &osdmap = *get_osdmap();
1285 PastIntervals::PriorSet prior = past_intervals.get_prior_set(
1286 pool.info.is_erasure(),
1287 info.history.last_epoch_started,
1288 &missing_loc.get_recoverable_predicate(),
1289 [&](epoch_t start, int osd, epoch_t *lost_at) {
1290 const osd_info_t *pinfo = 0;
1291 if (osdmap.exists(osd)) {
1292 pinfo = &osdmap.get_info(osd);
1293 if (lost_at)
1294 *lost_at = pinfo->lost_at;
1295 }
1296
1297 if (osdmap.is_up(osd)) {
1298 return PastIntervals::UP;
1299 } else if (!pinfo) {
1300 return PastIntervals::DNE;
1301 } else if (pinfo->lost_at > start) {
1302 return PastIntervals::LOST;
1303 } else {
1304 return PastIntervals::DOWN;
1305 }
1306 },
1307 up,
1308 acting,
1309 dpp);
1310
1311 if (prior.pg_down) {
1312 state_set(PG_STATE_DOWN);
1313 }
1314
1315 if (get_osdmap()->get_up_thru(pg_whoami.osd) <
1316 info.history.same_interval_since) {
1317 psdout(10) << "up_thru " << get_osdmap()->get_up_thru(pg_whoami.osd)
1318 << " < same_since " << info.history.same_interval_since
1319 << ", must notify monitor" << dendl;
1320 need_up_thru = true;
1321 } else {
1322 psdout(10) << "up_thru " << get_osdmap()->get_up_thru(pg_whoami.osd)
1323 << " >= same_since " << info.history.same_interval_since
1324 << ", all is well" << dendl;
1325 need_up_thru = false;
1326 }
1327 pl->set_probe_targets(prior.probe);
1328 return prior;
1329 }
1330
1331 bool PeeringState::needs_recovery() const
1332 {
1333 ceph_assert(is_primary());
1334
1335 auto &missing = pg_log.get_missing();
1336
1337 if (missing.num_missing()) {
1338 psdout(10) << __func__ << " primary has " << missing.num_missing()
1339 << " missing" << dendl;
1340 return true;
1341 }
1342
1343 ceph_assert(!acting_recovery_backfill.empty());
1344 set<pg_shard_t>::const_iterator end = acting_recovery_backfill.end();
1345 set<pg_shard_t>::const_iterator a = acting_recovery_backfill.begin();
1346 for (; a != end; ++a) {
1347 if (*a == get_primary()) continue;
1348 pg_shard_t peer = *a;
1349 map<pg_shard_t, pg_missing_t>::const_iterator pm = peer_missing.find(peer);
1350 if (pm == peer_missing.end()) {
1351 psdout(10) << __func__ << " osd." << peer << " doesn't have missing set"
1352 << dendl;
1353 continue;
1354 }
1355 if (pm->second.num_missing()) {
1356 psdout(10) << __func__ << " osd." << peer << " has "
1357 << pm->second.num_missing() << " missing" << dendl;
1358 return true;
1359 }
1360 }
1361
1362 psdout(10) << __func__ << " is recovered" << dendl;
1363 return false;
1364 }
1365
1366 bool PeeringState::needs_backfill() const
1367 {
1368 ceph_assert(is_primary());
1369
1370   // We can assume that the only OSDs that might need backfill
1371   // are the ones in the backfill_targets set.
1372 set<pg_shard_t>::const_iterator end = backfill_targets.end();
1373 set<pg_shard_t>::const_iterator a = backfill_targets.begin();
1374 for (; a != end; ++a) {
1375 pg_shard_t peer = *a;
1376 map<pg_shard_t, pg_info_t>::const_iterator pi = peer_info.find(peer);
1377 if (!pi->second.last_backfill.is_max()) {
1378 psdout(10) << __func__ << " osd." << peer
1379 << " has last_backfill " << pi->second.last_backfill << dendl;
1380 return true;
1381 }
1382 }
1383
1384 psdout(10) << __func__ << " does not need backfill" << dendl;
1385 return false;
1386 }
1387
1388 /*
1389 * Returns true unless there is a non-lost OSD in might_have_unfound.
1390 */
1391 bool PeeringState::all_unfound_are_queried_or_lost(
1392 const OSDMapRef osdmap) const
1393 {
1394 ceph_assert(is_primary());
1395
1396 set<pg_shard_t>::const_iterator peer = might_have_unfound.begin();
1397 set<pg_shard_t>::const_iterator mend = might_have_unfound.end();
1398 for (; peer != mend; ++peer) {
1399 if (peer_missing.count(*peer))
1400 continue;
1401 map<pg_shard_t, pg_info_t>::const_iterator iter = peer_info.find(*peer);
1402 if (iter != peer_info.end() &&
1403 (iter->second.is_empty() || iter->second.dne()))
1404 continue;
1405 if (!osdmap->exists(peer->osd))
1406 continue;
1407 const osd_info_t &osd_info(osdmap->get_info(peer->osd));
1408 if (osd_info.lost_at <= osd_info.up_from) {
1409 // If there is even one OSD in might_have_unfound that isn't lost, we
1410 // still might retrieve our unfound.
1411 return false;
1412 }
1413 }
1414 psdout(10) << "all_unfound_are_queried_or_lost all of might_have_unfound "
1415 << might_have_unfound
1416 << " have been queried or are marked lost" << dendl;
1417 return true;
1418 }
1419
1420
1421 void PeeringState::reject_reservation()
1422 {
1423 pl->unreserve_recovery_space();
1424 pl->send_cluster_message(
1425 primary.osd,
1426 new MBackfillReserve(
1427 MBackfillReserve::REJECT_TOOFULL,
1428 spg_t(info.pgid.pgid, primary.shard),
1429 get_osdmap_epoch()),
1430 get_osdmap_epoch());
1431 }
1432
1433 /**
1434 * find_best_info
1435 *
1436  * Returns an iterator to the best info in infos, chosen by:
1437 * 1) Prefer newer last_update
1438 * 2) Prefer longer tail if it brings another info into contiguity
1439 * 3) Prefer current primary
1440 */
1441 map<pg_shard_t, pg_info_t>::const_iterator PeeringState::find_best_info(
1442 const map<pg_shard_t, pg_info_t> &infos,
1443 bool restrict_to_up_acting,
1444 bool *history_les_bound) const
1445 {
1446 ceph_assert(history_les_bound);
1447 /* See doc/dev/osd_internals/last_epoch_started.rst before attempting
1448 * to make changes to this process. Also, make sure to update it
1449 * when you find bugs! */
1450 eversion_t min_last_update_acceptable = eversion_t::max();
1451 epoch_t max_last_epoch_started_found = 0;
1452 for (map<pg_shard_t, pg_info_t>::const_iterator i = infos.begin();
1453 i != infos.end();
1454 ++i) {
1455 if (!cct->_conf->osd_find_best_info_ignore_history_les &&
1456 max_last_epoch_started_found < i->second.history.last_epoch_started) {
1457 *history_les_bound = true;
1458 max_last_epoch_started_found = i->second.history.last_epoch_started;
1459 }
1460 if (!i->second.is_incomplete() &&
1461 max_last_epoch_started_found < i->second.last_epoch_started) {
1462 *history_les_bound = false;
1463 max_last_epoch_started_found = i->second.last_epoch_started;
1464 }
1465 }
1466 for (map<pg_shard_t, pg_info_t>::const_iterator i = infos.begin();
1467 i != infos.end();
1468 ++i) {
1469 if (max_last_epoch_started_found <= i->second.last_epoch_started) {
1470 if (min_last_update_acceptable > i->second.last_update)
1471 min_last_update_acceptable = i->second.last_update;
1472 }
1473 }
1474 if (min_last_update_acceptable == eversion_t::max())
1475 return infos.end();
1476
1477 map<pg_shard_t, pg_info_t>::const_iterator best = infos.end();
1478 // find osd with newest last_update (oldest for ec_pool).
1479 // if there are multiples, prefer
1480 // - a longer tail, if it brings another peer into log contiguity
1481 // - the current primary
1482 for (map<pg_shard_t, pg_info_t>::const_iterator p = infos.begin();
1483 p != infos.end();
1484 ++p) {
1485 if (restrict_to_up_acting && !is_up(p->first) &&
1486 !is_acting(p->first))
1487 continue;
1488 // Only consider peers with last_update >= min_last_update_acceptable
1489 if (p->second.last_update < min_last_update_acceptable)
1490 continue;
1491 // Disqualify anyone with a too old last_epoch_started
1492 if (p->second.last_epoch_started < max_last_epoch_started_found)
1493 continue;
1494 // Disqualify anyone who is incomplete (not fully backfilled)
1495 if (p->second.is_incomplete())
1496 continue;
1497 if (best == infos.end()) {
1498 best = p;
1499 continue;
1500 }
1501 // Prefer newer last_update
1502 if (pool.info.require_rollback()) {
1503 if (p->second.last_update > best->second.last_update)
1504 continue;
1505 if (p->second.last_update < best->second.last_update) {
1506 best = p;
1507 continue;
1508 }
1509 } else {
1510 if (p->second.last_update < best->second.last_update)
1511 continue;
1512 if (p->second.last_update > best->second.last_update) {
1513 best = p;
1514 continue;
1515 }
1516 }
1517
1518 // Prefer longer tail
1519 if (p->second.log_tail > best->second.log_tail) {
1520 continue;
1521 } else if (p->second.log_tail < best->second.log_tail) {
1522 best = p;
1523 continue;
1524 }
1525
1526 if (!p->second.has_missing() && best->second.has_missing()) {
1527 psdout(10) << __func__ << " prefer osd." << p->first
1528 << " because it is complete while best has missing"
1529 << dendl;
1530 best = p;
1531 continue;
1532 } else if (p->second.has_missing() && !best->second.has_missing()) {
1533 psdout(10) << __func__ << " skipping osd." << p->first
1534 << " because it has missing while best is complete"
1535 << dendl;
1536 continue;
1537 } else {
1538 // both are complete or have missing
1539 // fall through
1540 }
1541
1542 // prefer current primary (usually the caller), all things being equal
1543 if (p->first == pg_whoami) {
1544 psdout(10) << "calc_acting prefer osd." << p->first
1545 << " because it is current primary" << dendl;
1546 best = p;
1547 continue;
1548 }
1549 }
1550 return best;
1551 }
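// Worked example of the tie-breaking above for a replicated pool
// (illustrative eversions, printed epoch'version): two complete candidates
// with last_update 50'200 tie on criterion 1; the one with log_tail 40'100
// beats one with 45'100 (longer log, criterion 2); if tails also match, a
// candidate without missing objects is preferred, and finally the current
// primary wins.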
1552
1553 void PeeringState::calc_ec_acting(
1554 map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
1555 unsigned size,
1556 const vector<int> &acting,
1557 const vector<int> &up,
1558 const map<pg_shard_t, pg_info_t> &all_info,
1559 bool restrict_to_up_acting,
1560 vector<int> *_want,
1561 set<pg_shard_t> *backfill,
1562 set<pg_shard_t> *acting_backfill,
1563 ostream &ss)
1564 {
1565 vector<int> want(size, CRUSH_ITEM_NONE);
1566 map<shard_id_t, set<pg_shard_t> > all_info_by_shard;
1567 for (map<pg_shard_t, pg_info_t>::const_iterator i = all_info.begin();
1568 i != all_info.end();
1569 ++i) {
1570 all_info_by_shard[i->first.shard].insert(i->first);
1571 }
1572 for (uint8_t i = 0; i < want.size(); ++i) {
1573 ss << "For position " << (unsigned)i << ": ";
1574 if (up.size() > (unsigned)i && up[i] != CRUSH_ITEM_NONE &&
1575 !all_info.find(pg_shard_t(up[i], shard_id_t(i)))->second.is_incomplete() &&
1576 all_info.find(pg_shard_t(up[i], shard_id_t(i)))->second.last_update >=
1577 auth_log_shard->second.log_tail) {
1578 ss << " selecting up[i]: " << pg_shard_t(up[i], shard_id_t(i)) << std::endl;
1579 want[i] = up[i];
1580 continue;
1581 }
1582 if (up.size() > (unsigned)i && up[i] != CRUSH_ITEM_NONE) {
1583 ss << " backfilling up[i]: " << pg_shard_t(up[i], shard_id_t(i))
1584 << " and ";
1585 backfill->insert(pg_shard_t(up[i], shard_id_t(i)));
1586 }
1587
1588 if (acting.size() > (unsigned)i && acting[i] != CRUSH_ITEM_NONE &&
1589 !all_info.find(pg_shard_t(acting[i], shard_id_t(i)))->second.is_incomplete() &&
1590 all_info.find(pg_shard_t(acting[i], shard_id_t(i)))->second.last_update >=
1591 auth_log_shard->second.log_tail) {
1592 ss << " selecting acting[i]: " << pg_shard_t(acting[i], shard_id_t(i)) << std::endl;
1593 want[i] = acting[i];
1594 } else if (!restrict_to_up_acting) {
1595 for (set<pg_shard_t>::iterator j = all_info_by_shard[shard_id_t(i)].begin();
1596 j != all_info_by_shard[shard_id_t(i)].end();
1597 ++j) {
1598 ceph_assert(j->shard == i);
1599 if (!all_info.find(*j)->second.is_incomplete() &&
1600 all_info.find(*j)->second.last_update >=
1601 auth_log_shard->second.log_tail) {
1602 ss << " selecting stray: " << *j << std::endl;
1603 want[i] = j->osd;
1604 break;
1605 }
1606 }
1607 if (want[i] == CRUSH_ITEM_NONE)
1608 ss << " failed to fill position " << (int)i << std::endl;
1609 }
1610 }
1611
1612 for (uint8_t i = 0; i < want.size(); ++i) {
1613 if (want[i] != CRUSH_ITEM_NONE) {
1614 acting_backfill->insert(pg_shard_t(want[i], shard_id_t(i)));
1615 }
1616 }
1617 acting_backfill->insert(backfill->begin(), backfill->end());
1618 _want->swap(want);
1619 }
1620
1621 /**
1622 * calculate the desired acting set.
1623 *
1624 * Choose an appropriate acting set. Prefer up[0], unless it is
1625 * incomplete, or another osd has a longer tail that allows us to
1626 * bring other up nodes up to date.
1627 */
1628 void PeeringState::calc_replicated_acting(
1629 map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
1630 uint64_t force_auth_primary_missing_objects,
1631 unsigned size,
1632 const vector<int> &acting,
1633 const vector<int> &up,
1634 pg_shard_t up_primary,
1635 const map<pg_shard_t, pg_info_t> &all_info,
1636 bool restrict_to_up_acting,
1637 vector<int> *want,
1638 set<pg_shard_t> *backfill,
1639 set<pg_shard_t> *acting_backfill,
1640 const OSDMapRef osdmap,
1641 ostream &ss)
1642 {
1643 pg_shard_t auth_log_shard_id = auth_log_shard->first;
1644
1645 ss << __func__ << " newest update on osd." << auth_log_shard_id
1646 << " with " << auth_log_shard->second
1647 << (restrict_to_up_acting ? " restrict_to_up_acting" : "") << std::endl;
1648
1649 // select primary
1650 auto primary = all_info.find(up_primary);
1651 if (up.size() &&
1652 !primary->second.is_incomplete() &&
1653 primary->second.last_update >=
1654 auth_log_shard->second.log_tail) {
1655 if (HAVE_FEATURE(osdmap->get_up_osd_features(), SERVER_NAUTILUS)) {
1656 auto approx_missing_objects =
1657 primary->second.stats.stats.sum.num_objects_missing;
1658 auto auth_version = auth_log_shard->second.last_update.version;
1659 auto primary_version = primary->second.last_update.version;
1660 if (auth_version > primary_version) {
1661 approx_missing_objects += auth_version - primary_version;
1662 } else {
1663 approx_missing_objects += primary_version - auth_version;
1664 }
1665 if ((uint64_t)approx_missing_objects >
1666 force_auth_primary_missing_objects) {
1667 primary = auth_log_shard;
1668 ss << "up_primary: " << up_primary << ") has approximate "
1669 << approx_missing_objects
1670 << "(>" << force_auth_primary_missing_objects <<") "
1671 << "missing objects, osd." << auth_log_shard_id
1672 << " selected as primary instead"
1673 << std::endl;
1674 } else {
1675 ss << "up_primary: " << up_primary << ") selected as primary"
1676 << std::endl;
1677 }
1678 } else {
1679 ss << "up_primary: " << up_primary << ") selected as primary" << std::endl;
1680 }
1681 } else {
1682 ceph_assert(!auth_log_shard->second.is_incomplete());
1683 ss << "up[0] needs backfill, osd." << auth_log_shard_id
1684 << " selected as primary instead" << std::endl;
1685 primary = auth_log_shard;
1686 }
1687
1688 ss << __func__ << " primary is osd." << primary->first
1689 << " with " << primary->second << std::endl;
1690 want->push_back(primary->first.osd);
1691 acting_backfill->insert(primary->first);
1692
1693 /* We include auth_log_shard->second.log_tail because in GetLog,
1694 * we will request logs back to the min last_update over our
1695 * acting_backfill set, which will result in our log being extended
1696 * as far backwards as necessary to pick up any peers which can
1697 * be log recovered by auth_log_shard's log */
1698 eversion_t oldest_auth_log_entry =
1699 std::min(primary->second.log_tail, auth_log_shard->second.log_tail);
1700
1701 // select replicas that have log contiguity with primary.
1702 // prefer up, then acting, then any peer_info osds
1703 for (auto i : up) {
1704 pg_shard_t up_cand = pg_shard_t(i, shard_id_t::NO_SHARD);
1705 if (up_cand == primary->first)
1706 continue;
1707 const pg_info_t &cur_info = all_info.find(up_cand)->second;
1708 if (cur_info.is_incomplete() ||
1709 cur_info.last_update < oldest_auth_log_entry) {
1710 ss << " shard " << up_cand << " (up) backfill " << cur_info << std::endl;
1711 backfill->insert(up_cand);
1712 acting_backfill->insert(up_cand);
1713 } else {
1714 want->push_back(i);
1715 acting_backfill->insert(up_cand);
1716 ss << " osd." << i << " (up) accepted " << cur_info << std::endl;
1717 }
1718 }
1719
1720 if (want->size() >= size) {
1721 return;
1722 }
1723
1724 std::vector<std::pair<eversion_t, int>> candidate_by_last_update;
1725 candidate_by_last_update.reserve(acting.size());
1726   // The acting set no longer contains backfill OSDs; those were handled
1726   // in the up loop above.
1727 for (auto i : acting) {
1728 pg_shard_t acting_cand(i, shard_id_t::NO_SHARD);
1729 // skip up osds we already considered above
1730 if (acting_cand == primary->first)
1731 continue;
1732 vector<int>::const_iterator up_it = find(up.begin(), up.end(), i);
1733 if (up_it != up.end())
1734 continue;
1735
1736 const pg_info_t &cur_info = all_info.find(acting_cand)->second;
1737 if (cur_info.is_incomplete() ||
1738 cur_info.last_update < oldest_auth_log_entry) {
1739 ss << " shard " << acting_cand << " (acting) REJECTED "
1740 << cur_info << std::endl;
1741 } else {
1742 candidate_by_last_update.emplace_back(cur_info.last_update, i);
1743 }
1744 }
1745
1746   auto sort_by_eversion = [](const std::pair<eversion_t, int> &lhs,
1747 const std::pair<eversion_t, int> &rhs) {
1748 return lhs.first > rhs.first;
1749 };
1750 // sort by last_update, in descending order.
1751 std::sort(candidate_by_last_update.begin(),
1752 candidate_by_last_update.end(), sort_by_eversion);
1753 for (auto &p: candidate_by_last_update) {
1754 ceph_assert(want->size() < size);
1755 want->push_back(p.second);
1756 pg_shard_t s = pg_shard_t(p.second, shard_id_t::NO_SHARD);
1757 acting_backfill->insert(s);
1758 ss << " shard " << s << " (acting) accepted "
1759 << all_info.find(s)->second << std::endl;
1760 if (want->size() >= size) {
1761 return;
1762 }
1763 }
1764
1765 if (restrict_to_up_acting) {
1766 return;
1767 }
1768 candidate_by_last_update.clear();
1769 candidate_by_last_update.reserve(all_info.size()); // overestimate but fine
1770 // continue to search stray to find more suitable peers
1771 for (auto &i : all_info) {
1772 // skip up osds we already considered above
1773 if (i.first == primary->first)
1774 continue;
1775 vector<int>::const_iterator up_it = find(up.begin(), up.end(), i.first.osd);
1776 if (up_it != up.end())
1777 continue;
1778 vector<int>::const_iterator acting_it = find(
1779 acting.begin(), acting.end(), i.first.osd);
1780 if (acting_it != acting.end())
1781 continue;
1782
1783 if (i.second.is_incomplete() ||
1784 i.second.last_update < oldest_auth_log_entry) {
1785 ss << " shard " << i.first << " (stray) REJECTED " << i.second
1786 << std::endl;
1787 } else {
1788 candidate_by_last_update.emplace_back(
1789 i.second.last_update, i.first.osd);
1790 }
1791 }
1792
1793 if (candidate_by_last_update.empty()) {
1794 // save us some effort
1795 return;
1796 }
1797
1798 // sort by last_update, in descending order.
1799 std::sort(candidate_by_last_update.begin(),
1800 candidate_by_last_update.end(), sort_by_eversion);
1801
1802 for (auto &p: candidate_by_last_update) {
1803 ceph_assert(want->size() < size);
1804 want->push_back(p.second);
1805 pg_shard_t s = pg_shard_t(p.second, shard_id_t::NO_SHARD);
1806 acting_backfill->insert(s);
1807 ss << " shard " << s << " (stray) accepted "
1808 << all_info.find(s)->second << std::endl;
1809 if (want->size() >= size) {
1810 return;
1811 }
1812 }
1813 }
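/* Editor's sketch (illustrative only, not part of the build): the selection
 * above proceeds in three tiers -- up, then acting, then stray -- and within
 * the last two tiers candidates are ranked by last_update, newest first.
 * The ranking step in isolation, with hypothetical inputs:
 *
 *   std::vector<std::pair<eversion_t, int>> cand = {
 *     {eversion_t(10, 5), 3},   // osd.3, oldest log head
 *     {eversion_t(12, 9), 1},   // osd.1, newest log head
 *     {eversion_t(11, 2), 7},   // osd.7
 *   };
 *   std::sort(cand.begin(), cand.end(),
 *             [](auto &l, auto &r) { return l.first > r.first; });
 *   // cand is now osd.1, osd.7, osd.3; osds are appended to *want in this
 *   // order until want->size() reaches the pool size.
 */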
1814
1815 bool PeeringState::recoverable(const vector<int> &want) const
1816 {
1817 unsigned num_want_acting = 0;
1818 set<pg_shard_t> have;
1819 for (int i = 0; i < (int)want.size(); ++i) {
1820 if (want[i] != CRUSH_ITEM_NONE) {
1821 ++num_want_acting;
1822 have.insert(
1823 pg_shard_t(
1824 want[i],
1825 pool.info.is_erasure() ? shard_id_t(i) : shard_id_t::NO_SHARD));
1826 }
1827 }
1828
1829 if (num_want_acting < pool.info.min_size) {
1830 const bool recovery_ec_pool_below_min_size =
1831 HAVE_FEATURE(get_osdmap()->get_up_osd_features(), SERVER_OCTOPUS);
1832
1833 if (pool.info.is_erasure() && !recovery_ec_pool_below_min_size) {
1834 psdout(10) << __func__ << " failed, ec recovery below min size not supported by pre-octopus OSDs" << dendl;
1835 return false;
1836 } else if (!cct->_conf.get_val<bool>("osd_allow_recovery_below_min_size")) {
1837 psdout(10) << __func__ << " failed, recovery below min size not enabled" << dendl;
1838 return false;
1839 }
1840 }
1841 if (missing_loc.get_recoverable_predicate()(have)) {
1842 return true;
1843 } else {
1844 psdout(10) << __func__ << " failed, not recoverable " << dendl;
1845 return false;
1846 }
1847 }
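/* Editor's note (worked example, hypothetical values): how the want vector
 * above maps to shards and interacts with min_size. For a replicated pool
 * with size=3, min_size=2 and want={0, CRUSH_ITEM_NONE, 2}, num_want_acting
 * is 2 and have = {pg_shard_t(0), pg_shard_t(2)} (NO_SHARD, since the pool
 * is not erasure coded); the PG is at min_size, so the below-min-size
 * checks are not consulted. With want={0, CRUSH_ITEM_NONE, CRUSH_ITEM_NONE},
 * num_want_acting drops to 1 < min_size and the feature/config checks above
 * decide whether recovery may proceed.
 */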
1848
1849 void PeeringState::choose_async_recovery_ec(
1850 const map<pg_shard_t, pg_info_t> &all_info,
1851 const pg_info_t &auth_info,
1852 vector<int> *want,
1853 set<pg_shard_t> *async_recovery,
1854 const OSDMapRef osdmap) const
1855 {
1856 set<pair<int, pg_shard_t> > candidates_by_cost;
1857 for (uint8_t i = 0; i < want->size(); ++i) {
1858 if ((*want)[i] == CRUSH_ITEM_NONE)
1859 continue;
1860
1861 // Considering log entries to recover is accurate enough for
1862 // now. We could use minimum_to_decode_with_cost() later if
1863 // necessary.
1864 pg_shard_t shard_i((*want)[i], shard_id_t(i));
1865 // do not include strays
1866 if (stray_set.find(shard_i) != stray_set.end())
1867 continue;
1868 // Do not include an osd that is not up, since choosing it as
1869 // an async_recovery_target will move it out of the acting set.
1870 // This results in it being identified as a stray during peering,
1871 // because it is no longer in the up or acting set.
1872 if (!is_up(shard_i))
1873 continue;
1874 auto shard_info = all_info.find(shard_i)->second;
1875 // for ec pools we rollback all entries past the authoritative
1876 // last_update *before* activation. This is relatively inexpensive
1877 // compared to recovery, since it is purely local, so treat shards
1878 // past the authoritative last_update the same as those equal to it.
1879 version_t auth_version = auth_info.last_update.version;
1880 version_t candidate_version = shard_info.last_update.version;
1881 if (HAVE_FEATURE(osdmap->get_up_osd_features(), SERVER_NAUTILUS)) {
1882 auto approx_missing_objects =
1883 shard_info.stats.stats.sum.num_objects_missing;
1884 if (auth_version > candidate_version) {
1885 approx_missing_objects += auth_version - candidate_version;
1886 }
1887 if (static_cast<uint64_t>(approx_missing_objects) >
1888 cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) {
1889 candidates_by_cost.emplace(approx_missing_objects, shard_i);
1890 }
1891 } else {
1892 if (auth_version > candidate_version &&
1893 (auth_version - candidate_version) > cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) {
1894 candidates_by_cost.insert(make_pair(auth_version - candidate_version, shard_i));
1895 }
1896 }
1897 }
1898
1899 psdout(20) << __func__ << " candidates by cost are: " << candidates_by_cost
1900 << dendl;
1901
1902 // take out as many osds as we can for async recovery, in order of cost
1903 for (auto rit = candidates_by_cost.rbegin();
1904 rit != candidates_by_cost.rend(); ++rit) {
1905 pg_shard_t cur_shard = rit->second;
1906 vector<int> candidate_want(*want);
1907 candidate_want[cur_shard.shard.id] = CRUSH_ITEM_NONE;
1908 if (recoverable(candidate_want)) {
1909 want->swap(candidate_want);
1910 async_recovery->insert(cur_shard);
1911 }
1912 }
1913 psdout(20) << __func__ << " result want=" << *want
1914 << " async_recovery=" << *async_recovery << dendl;
1915 }
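/* Editor's note: a worked example of the cost heuristic above, assuming
 * nautilus-or-later peers and osd_async_recovery_min_cost = 100 (the
 * shipped default may differ). If the authoritative shard is at
 * last_update.version = 1500 and a candidate shard is at version 1350 with
 * num_objects_missing = 20, then
 *
 *   approx_missing_objects = 20 + (1500 - 1350) = 170 > 100
 *
 * so the shard becomes a candidate; it is actually moved to async recovery
 * only if recoverable() still holds for the reduced want set.
 */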
1916
1917 void PeeringState::choose_async_recovery_replicated(
1918 const map<pg_shard_t, pg_info_t> &all_info,
1919 const pg_info_t &auth_info,
1920 vector<int> *want,
1921 set<pg_shard_t> *async_recovery,
1922 const OSDMapRef osdmap) const
1923 {
1924 set<pair<int, pg_shard_t> > candidates_by_cost;
1925 for (auto osd_num : *want) {
1926 pg_shard_t shard_i(osd_num, shard_id_t::NO_SHARD);
1927 // do not include strays
1928 if (stray_set.find(shard_i) != stray_set.end())
1929 continue;
1930 // Do not include an osd that is not up, since choosing it as
1931 // an async_recovery_target will move it out of the acting set.
1932 // This results in it being identified as a stray during peering,
1933 // because it is no longer in the up or acting set.
1934 if (!is_up(shard_i))
1935 continue;
1936 auto shard_info = all_info.find(shard_i)->second;
1937 // use the approximate magnitude of the difference in length of
1938 // logs plus historical missing objects as the cost of recovery
1939 version_t auth_version = auth_info.last_update.version;
1940 version_t candidate_version = shard_info.last_update.version;
1941 if (HAVE_FEATURE(osdmap->get_up_osd_features(), SERVER_NAUTILUS)) {
1942 auto approx_missing_objects =
1943 shard_info.stats.stats.sum.num_objects_missing;
1944 if (auth_version > candidate_version) {
1945 approx_missing_objects += auth_version - candidate_version;
1946 } else {
1947 approx_missing_objects += candidate_version - auth_version;
1948 }
1949 if (static_cast<uint64_t>(approx_missing_objects) >
1950 cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) {
1951 candidates_by_cost.emplace(approx_missing_objects, shard_i);
1952 }
1953 } else {
1954 size_t approx_entries;
1955 if (auth_version > candidate_version) {
1956 approx_entries = auth_version - candidate_version;
1957 } else {
1958 approx_entries = candidate_version - auth_version;
1959 }
1960 if (approx_entries > cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) {
1961 candidates_by_cost.insert(make_pair(approx_entries, shard_i));
1962 }
1963 }
1964 }
1965
1966 psdout(20) << __func__ << " candidates by cost are: " << candidates_by_cost
1967 << dendl;
1968 // take out as many osds as we can for async recovery, in order of cost
1969 for (auto rit = candidates_by_cost.rbegin();
1970 rit != candidates_by_cost.rend(); ++rit) {
1971 if (want->size() <= pool.info.min_size) {
1972 break;
1973 }
1974 pg_shard_t cur_shard = rit->second;
1975 vector<int> candidate_want(*want);
1976 for (auto it = candidate_want.begin(); it != candidate_want.end(); ++it) {
1977 if (*it == cur_shard.osd) {
1978 candidate_want.erase(it);
1979 want->swap(candidate_want);
1980 async_recovery->insert(cur_shard);
1981 break;
1982 }
1983 }
1984 }
1985 psdout(20) << __func__ << " result want=" << *want
1986 << " async_recovery=" << *async_recovery << dendl;
1987 }
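/* Editor's note: unlike the EC variant above, the replicated cost counts
 * the version delta in both directions (a replica ahead of the
 * authoritative head still has entries to reconcile), and an OSD is only
 * erased from want while want->size() stays above pool.info.min_size,
 * since a replicated want list has no per-shard positions to preserve
 * with CRUSH_ITEM_NONE.
 */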
1988
1989
1990
1991 /**
1992 * choose acting
1993 *
1994 * calculate the desired acting, and request a change with the monitor
1995 * if it differs from the current acting.
1996 *
1997 * if restrict_to_up_acting=true, we filter out anything that's not in
1998 * up/acting. in order to lift this restriction, we need to
1999 * 1) check whether it's worth switching the acting set any time we get
2000 * a new pg info (not just here, when recovery finishes)
2001 * 2) check whether anything in want_acting went down on each new map
2002 * (and, if so, calculate a new want_acting)
2003 * 3) remove the assertion in PG::PeeringState::Active::react(const AdvMap)
2004 * TODO!
2005 */
2006 bool PeeringState::choose_acting(pg_shard_t &auth_log_shard_id,
2007 bool restrict_to_up_acting,
2008 bool *history_les_bound,
2009 bool request_pg_temp_change_only)
2010 {
2011 map<pg_shard_t, pg_info_t> all_info(peer_info.begin(), peer_info.end());
2012 all_info[pg_whoami] = info;
2013
2014 if (cct->_conf->subsys.should_gather<dout_subsys, 10>()) {
2015 for (map<pg_shard_t, pg_info_t>::iterator p = all_info.begin();
2016 p != all_info.end();
2017 ++p) {
2018 psdout(10) << __func__ << " all_info osd." << p->first << " "
2019 << p->second << dendl;
2020 }
2021 }
2022
2023 map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard =
2024 find_best_info(all_info, restrict_to_up_acting, history_les_bound);
2025
2026 if (auth_log_shard == all_info.end()) {
2027 if (up != acting) {
2028 psdout(10) << __func__ << " no suitable info found (incomplete backfills?),"
2029 << " reverting to up" << dendl;
2030 want_acting = up;
2031 vector<int> empty;
2032 pl->queue_want_pg_temp(empty);
2033 } else {
2034 psdout(10) << __func__ << " failed" << dendl;
2035 ceph_assert(want_acting.empty());
2036 }
2037 return false;
2038 }
2039
2040 ceph_assert(!auth_log_shard->second.is_incomplete());
2041 auth_log_shard_id = auth_log_shard->first;
2042
2043 set<pg_shard_t> want_backfill, want_acting_backfill;
2044 vector<int> want;
2045 stringstream ss;
2046 if (pool.info.is_replicated())
2047 calc_replicated_acting(
2048 auth_log_shard,
2049 cct->_conf.get_val<uint64_t>(
2050 "osd_force_auth_primary_missing_objects"),
2051 get_osdmap()->get_pg_size(info.pgid.pgid),
2052 acting,
2053 up,
2054 up_primary,
2055 all_info,
2056 restrict_to_up_acting,
2057 &want,
2058 &want_backfill,
2059 &want_acting_backfill,
2060 get_osdmap(),
2061 ss);
2062 else
2063 calc_ec_acting(
2064 auth_log_shard,
2065 get_osdmap()->get_pg_size(info.pgid.pgid),
2066 acting,
2067 up,
2068 all_info,
2069 restrict_to_up_acting,
2070 &want,
2071 &want_backfill,
2072 &want_acting_backfill,
2073 ss);
2074 psdout(10) << ss.str() << dendl;
2075
2076 if (!recoverable(want)) {
2077 want_acting.clear();
2078 return false;
2079 }
2080
2081 set<pg_shard_t> want_async_recovery;
2082 if (HAVE_FEATURE(get_osdmap()->get_up_osd_features(), SERVER_MIMIC)) {
2083 if (pool.info.is_erasure()) {
2084 choose_async_recovery_ec(
2085 all_info, auth_log_shard->second, &want, &want_async_recovery,
2086 get_osdmap());
2087 } else {
2088 choose_async_recovery_replicated(
2089 all_info, auth_log_shard->second, &want, &want_async_recovery,
2090 get_osdmap());
2091 }
2092 }
2093 while (want.size() > pool.info.size) {
2094 // async recovery should have taken out as many osds as it can.
2095 // if not, then always evict the last peer
2096 // (will get synchronously recovered later)
2097 psdout(10) << __func__ << " evicting osd." << want.back()
2098 << " from oversized want " << want << dendl;
2099 want.pop_back();
2100 }
2101 if (want != acting) {
2102 psdout(10) << __func__ << " want " << want << " != acting " << acting
2103 << ", requesting pg_temp change" << dendl;
2104 want_acting = want;
2105
2106 if (!cct->_conf->osd_debug_no_acting_change) {
2107 if (want_acting == up) {
2108 // There can't be any pending backfill if
2109 // want is the same as crush map up OSDs.
2110 ceph_assert(want_backfill.empty());
2111 vector<int> empty;
2112 pl->queue_want_pg_temp(empty);
2113 } else
2114 pl->queue_want_pg_temp(want);
2115 }
2116 return false;
2117 }
2118 if (request_pg_temp_change_only)
2119 return true;
2120 want_acting.clear();
2121 acting_recovery_backfill = want_acting_backfill;
2122 psdout(10) << "acting_recovery_backfill is "
2123 << acting_recovery_backfill << dendl;
2124 ceph_assert(
2125 backfill_targets.empty() ||
2126 backfill_targets == want_backfill);
2127 if (backfill_targets.empty()) {
2128 // Caller is GetInfo
2129 backfill_targets = want_backfill;
2130 }
2131 // Adding !needs_recovery() to let the async_recovery_targets reset after recovery is complete
2132 ceph_assert(
2133 async_recovery_targets.empty() ||
2134 async_recovery_targets == want_async_recovery ||
2135 !needs_recovery());
2136 if (async_recovery_targets.empty() || !needs_recovery()) {
2137 async_recovery_targets = want_async_recovery;
2138 }
2139 // Will not change if already set because up would have had to change
2140 // Verify that nothing in backfill is in stray_set
2141 for (set<pg_shard_t>::iterator i = want_backfill.begin();
2142 i != want_backfill.end();
2143 ++i) {
2144 ceph_assert(stray_set.find(*i) == stray_set.end());
2145 }
2146 psdout(10) << "choose_acting want=" << want << " backfill_targets="
2147 << want_backfill << " async_recovery_targets="
2148 << async_recovery_targets << dendl;
2149 return true;
2150 }
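/* Editor's note: a compact summary of choose_acting()'s outcomes, derived
 * from the code above. (1) No authoritative info: want_acting is set to up
 * (or left empty) and false is returned. (2) want differs from acting: want
 * is recorded in want_acting, a pg_temp change is queued with the monitor
 * via queue_want_pg_temp(), and false is returned; peering restarts when
 * the new map arrives. (3) want == acting: the backfill and async-recovery
 * target sets are committed and true is returned.
 */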
2151
2152 void PeeringState::log_weirdness()
2153 {
2154 if (pg_log.get_tail() != info.log_tail)
2155 pl->get_clog_error() << info.pgid
2156 << " info mismatch, log.tail " << pg_log.get_tail()
2157 << " != info.log_tail " << info.log_tail;
2158 if (pg_log.get_head() != info.last_update)
2159 pl->get_clog_error() << info.pgid
2160 << " info mismatch, log.head " << pg_log.get_head()
2161 << " != info.last_update " << info.last_update;
2162
2163 if (!pg_log.get_log().empty()) {
2164 // sloppy check
2165 if (pg_log.get_log().log.begin()->version <= pg_log.get_tail())
2166 pl->get_clog_error() << info.pgid
2167 << " log bound mismatch, info (tail,head] ("
2168 << pg_log.get_tail() << ","
2169 << pg_log.get_head() << "]"
2170 << " actual ["
2171 << pg_log.get_log().log.begin()->version << ","
2172 << pg_log.get_log().log.rbegin()->version << "]";
2173 }
2174
2175 if (pg_log.get_log().caller_ops.size() > pg_log.get_log().log.size()) {
2176 pl->get_clog_error() << info.pgid
2177 << " caller_ops.size "
2178 << pg_log.get_log().caller_ops.size()
2179 << " > log size " << pg_log.get_log().log.size();
2180 }
2181 }
2182
2183 /*
2184 * Process information from a replica to determine if it could have any
2185 * objects that I need.
2186 *
2187 * TODO: if the missing set becomes very large, this could get expensive.
2188 * Instead, we probably want to just iterate over our unfound set.
2189 */
2190 bool PeeringState::search_for_missing(
2191 const pg_info_t &oinfo, const pg_missing_t &omissing,
2192 pg_shard_t from,
2193 PeeringCtxWrapper &ctx)
2194 {
2195 uint64_t num_unfound_before = missing_loc.num_unfound();
2196 bool found_missing = missing_loc.add_source_info(
2197 from, oinfo, omissing, ctx.handle);
2198 if (found_missing && num_unfound_before != missing_loc.num_unfound())
2199 pl->publish_stats_to_osd();
2200 // avoid doing this if the peer is empty. This is a bit of paranoia
2201 // to avoid doing something rash if add_source_info() above
2202 // incorrectly decided we found something new. (If the peer has
2203 // last_update=0'0, that's impossible.)
2204 if (found_missing &&
2205 oinfo.last_update != eversion_t()) {
2206 pg_info_t tinfo(oinfo);
2207 tinfo.pgid.shard = pg_whoami.shard;
2208 ctx.send_info(
2209 from.osd,
2210 spg_t(info.pgid.pgid, from.shard),
2211 get_osdmap_epoch(), // fixme: use lower epoch?
2212 get_osdmap_epoch(),
2213 tinfo);
2214 }
2215 return found_missing;
2216 }
2217
2218 bool PeeringState::discover_all_missing(
2219 BufferedRecoveryMessages &rctx)
2220 {
2221 auto &missing = pg_log.get_missing();
2222 uint64_t unfound = get_num_unfound();
2223 bool any = false; // did we start any queries
2224
2225 psdout(10) << __func__ << " "
2226 << missing.num_missing() << " missing, "
2227 << unfound << " unfound"
2228 << dendl;
2229
2230 std::set<pg_shard_t>::const_iterator m = might_have_unfound.begin();
2231 std::set<pg_shard_t>::const_iterator mend = might_have_unfound.end();
2232 for (; m != mend; ++m) {
2233 pg_shard_t peer(*m);
2234
2235 if (!get_osdmap()->is_up(peer.osd)) {
2236 psdout(20) << __func__ << " skipping down osd." << peer << dendl;
2237 continue;
2238 }
2239
2240 if (peer_purged.count(peer)) {
2241 psdout(20) << __func__ << " skipping purged osd." << peer << dendl;
2242 continue;
2243 }
2244
2245 map<pg_shard_t, pg_info_t>::const_iterator iter = peer_info.find(peer);
2246 if (iter != peer_info.end() &&
2247 (iter->second.is_empty() || iter->second.dne())) {
2248 // ignore empty peers
2249 continue;
2250 }
2251
2252 // If we've requested any of this stuff, the pg_missing_t information
2253 // should be on its way.
2254 // TODO: coalesce requested_* into a single data structure
2255 if (peer_missing.find(peer) != peer_missing.end()) {
2256 psdout(20) << __func__ << ": osd." << peer
2257 << ": we already have pg_missing_t" << dendl;
2258 continue;
2259 }
2260 if (peer_log_requested.find(peer) != peer_log_requested.end()) {
2261 psdout(20) << __func__ << ": osd." << peer
2262 << ": in peer_log_requested" << dendl;
2263 continue;
2264 }
2265 if (peer_missing_requested.find(peer) != peer_missing_requested.end()) {
2266 psdout(20) << __func__ << ": osd." << peer
2267 << ": in peer_missing_requested" << dendl;
2268 continue;
2269 }
2270
2271 // Request missing
2272 psdout(10) << __func__ << ": osd." << peer << ": requesting pg_missing_t"
2273 << dendl;
2274 peer_missing_requested.insert(peer);
2275 rctx.send_query(
2276 peer.osd,
2277 spg_t(info.pgid.pgid, peer.shard),
2278 pg_query_t(
2279 pg_query_t::FULLLOG,
2280 peer.shard, pg_whoami.shard,
2281 info.history, get_osdmap_epoch()));
2282 any = true;
2283 }
2284 return any;
2285 }
2286
2287 /* Build the might_have_unfound set.
2288 *
2289 * This is used by the primary OSD during recovery.
2290 *
2291 * This set tracks the OSDs which might have unfound objects that the primary
2292 * OSD needs. As we receive pg_missing_t from each OSD in might_have_unfound, we
2293 * will remove the OSD from the set.
2294 */
2295 void PeeringState::build_might_have_unfound()
2296 {
2297 ceph_assert(might_have_unfound.empty());
2298 ceph_assert(is_primary());
2299
2300 psdout(10) << __func__ << dendl;
2301
2302 check_past_interval_bounds();
2303
2304 might_have_unfound = past_intervals.get_might_have_unfound(
2305 pg_whoami,
2306 pool.info.is_erasure());
2307
2308 // include any (stray) peers
2309 for (map<pg_shard_t, pg_info_t>::iterator p = peer_info.begin();
2310 p != peer_info.end();
2311 ++p)
2312 might_have_unfound.insert(p->first);
2313
2314 psdout(15) << __func__ << ": built " << might_have_unfound << dendl;
2315 }
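/* Editor's note: might_have_unfound is thus the union of (a) OSDs that
 * PastIntervals believes may have served this PG in a prior interval and
 * (b) every peer that has sent us a pg_info_t, including strays. Each
 * entry is a candidate source that discover_all_missing() may query for a
 * full log.
 */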
2316
2317 void PeeringState::activate(
2318 ObjectStore::Transaction& t,
2319 epoch_t activation_epoch,
2320 PeeringCtxWrapper &ctx)
2321 {
2322 ceph_assert(!is_peered());
2323
2324 // twiddle pg state
2325 state_clear(PG_STATE_DOWN);
2326
2327 send_notify = false;
2328
2329 if (is_primary()) {
2330 // only update primary last_epoch_started if we will go active
2331 if (acting.size() >= pool.info.min_size) {
2332 ceph_assert(cct->_conf->osd_find_best_info_ignore_history_les ||
2333 info.last_epoch_started <= activation_epoch);
2334 info.last_epoch_started = activation_epoch;
2335 info.last_interval_started = info.history.same_interval_since;
2336 }
2337 } else if (is_acting(pg_whoami)) {
2338 /* update last_epoch_started on acting replica to whatever the primary sent
2339 * unless it's smaller (could happen if we are going peered rather than
2340 * active, see doc/dev/osd_internals/last_epoch_started.rst) */
2341 if (info.last_epoch_started < activation_epoch) {
2342 info.last_epoch_started = activation_epoch;
2343 info.last_interval_started = info.history.same_interval_since;
2344 }
2345 }
2346
2347 auto &missing = pg_log.get_missing();
2348
2349 min_last_complete_ondisk = eversion_t(0,0); // we don't know (yet)!
2350 if (is_primary()) {
2351 last_update_ondisk = info.last_update;
2352 }
2353 last_update_applied = info.last_update;
2354 last_rollback_info_trimmed_to_applied = pg_log.get_can_rollback_to();
2355
2356 need_up_thru = false;
2357
2358 // write pg info, log
2359 dirty_info = true;
2360 dirty_big_info = true; // maybe
2361
2362 pl->schedule_event_on_commit(
2363 t,
2364 std::make_shared<PGPeeringEvent>(
2365 get_osdmap_epoch(),
2366 get_osdmap_epoch(),
2367 ActivateCommitted(
2368 get_osdmap_epoch(),
2369 activation_epoch)));
2370
2371 // init complete pointer
2372 if (missing.num_missing() == 0) {
2373 psdout(10) << "activate - no missing, moving last_complete " << info.last_complete
2374 << " -> " << info.last_update << dendl;
2375 info.last_complete = info.last_update;
2376 info.stats.stats.sum.num_objects_missing = 0;
2377 pg_log.reset_recovery_pointers();
2378 } else {
2379 psdout(10) << "activate - not complete, " << missing << dendl;
2380 info.stats.stats.sum.num_objects_missing = missing.num_missing();
2381 pg_log.activate_not_complete(info);
2382 }
2383
2384 log_weirdness();
2385
2386 if (is_primary()) {
2387 // initialize snap_trimq
2388 interval_set<snapid_t> to_trim;
2389 auto& removed_snaps_queue = get_osdmap()->get_removed_snaps_queue();
2390 auto p = removed_snaps_queue.find(info.pgid.pgid.pool());
2391 if (p != removed_snaps_queue.end()) {
2392 dout(20) << "activate - purged_snaps " << info.purged_snaps
2393 << " removed_snaps " << p->second
2394 << dendl;
2395 for (auto q : p->second) {
2396 to_trim.insert(q.first, q.second);
2397 }
2398 }
2399 interval_set<snapid_t> purged;
2400 purged.intersection_of(to_trim, info.purged_snaps);
2401 to_trim.subtract(purged);
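/* Editor's note: a worked example of the interval arithmetic above, with
 * hypothetical snap ids. If the osdmap's removed_snaps_queue yields
 * to_trim = [4..10] and this PG had already recorded purged_snaps = [4..6],
 * then purged = [4..6] and to_trim becomes [7..10]: we re-report only what
 * we actually purged and still queue the remainder for trimming.
 */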
2402
2403 if (HAVE_FEATURE(upacting_features, SERVER_OCTOPUS)) {
2404 renew_lease(pl->get_mnow());
2405 // do not schedule until we are actually activated
2406 }
2407
2408 // adjust purged_snaps: the PG may have been inactive while snaps were
2409 // pruned from the removed_snaps_queue in the osdmap. update local
2410 // purged_snaps to reflect only those snaps that we thought were pruned
2411 // and were still in the queue.
2412 info.purged_snaps.swap(purged);
2413
2414 // start up replicas
2415 info.history.refresh_prior_readable_until_ub(pl->get_mnow(),
2416 prior_readable_until_ub);
2417
2418 ceph_assert(!acting_recovery_backfill.empty());
2419 for (set<pg_shard_t>::iterator i = acting_recovery_backfill.begin();
2420 i != acting_recovery_backfill.end();
2421 ++i) {
2422 if (*i == pg_whoami) continue;
2423 pg_shard_t peer = *i;
2424 ceph_assert(peer_info.count(peer));
2425 pg_info_t& pi = peer_info[peer];
2426
2427 psdout(10) << "activate peer osd." << peer << " " << pi << dendl;
2428
2429 MOSDPGLog *m = 0;
2430 ceph_assert(peer_missing.count(peer));
2431 pg_missing_t& pm = peer_missing[peer];
2432
2433 bool needs_past_intervals = pi.dne();
2434
2435 if (pi.last_update == info.last_update) {
2436 // empty log
2437 if (!pi.last_backfill.is_max())
2438 pl->get_clog_info() << info.pgid << " continuing backfill to osd."
2439 << peer
2440 << " from (" << pi.log_tail << "," << pi.last_update
2441 << "] " << pi.last_backfill
2442 << " to " << info.last_update;
2443 if (!pi.is_empty()) {
2444 psdout(10) << "activate peer osd." << peer
2445 << " is up to date, queueing in pending_activators" << dendl;
2446 ctx.send_info(
2447 peer.osd,
2448 spg_t(info.pgid.pgid, peer.shard),
2449 get_osdmap_epoch(), // fixme: use lower epoch?
2450 get_osdmap_epoch(),
2451 info,
2452 get_lease());
2453 } else {
2454 psdout(10) << "activate peer osd." << peer
2455 << " is up to date, but sending pg_log anyway" << dendl;
2456 m = new MOSDPGLog(
2457 i->shard, pg_whoami.shard,
2458 get_osdmap_epoch(), info,
2459 last_peering_reset);
2460 }
2461 } else if (
2462 pg_log.get_tail() > pi.last_update ||
2463 pi.last_backfill == hobject_t() ||
2464 (backfill_targets.count(*i) && pi.last_backfill.is_max())) {
2465 /* ^ This last case covers a situation where a replica is not contiguous
2466 * with the auth_log, but is contiguous with this primary's log.
2467 * Reshuffling the acting set to handle this would be tricky, so instead
2468 * we just go ahead and backfill it anyway. This is probably preferable
2469 * in any case, since the replica in question would have to be
2470 * significantly behind.
2471 */
2472 // backfill
2473 pl->get_clog_debug() << info.pgid << " starting backfill to osd." << peer
2474 << " from (" << pi.log_tail << "," << pi.last_update
2475 << "] " << pi.last_backfill
2476 << " to " << info.last_update;
2477
2478 pi.last_update = info.last_update;
2479 pi.last_complete = info.last_update;
2480 pi.set_last_backfill(hobject_t());
2481 pi.last_epoch_started = info.last_epoch_started;
2482 pi.last_interval_started = info.last_interval_started;
2483 pi.history = info.history;
2484 pi.hit_set = info.hit_set;
2485 // Save num_bytes for reservation request, can't be negative
2486 peer_bytes[peer] = std::max<int64_t>(0, pi.stats.stats.sum.num_bytes);
2487 pi.stats.stats.clear();
2488 pi.stats.stats.sum.num_bytes = peer_bytes[peer];
2489
2490 // initialize peer with our purged_snaps.
2491 pi.purged_snaps = info.purged_snaps;
2492
2493 m = new MOSDPGLog(
2494 i->shard, pg_whoami.shard,
2495 get_osdmap_epoch(), pi,
2496 last_peering_reset /* epoch to create pg at */);
2497
2498 // send some recent log, so that op dup detection works well.
2499 m->log.copy_up_to(cct, pg_log.get_log(),
2500 cct->_conf->osd_max_pg_log_entries);
2501 m->info.log_tail = m->log.tail;
2502 pi.log_tail = m->log.tail; // sigh...
2503
2504 pm.clear();
2505 } else {
2506 // catch up
2507 ceph_assert(pg_log.get_tail() <= pi.last_update);
2508 m = new MOSDPGLog(
2509 i->shard, pg_whoami.shard,
2510 get_osdmap_epoch(), info,
2511 last_peering_reset /* epoch to create pg at */);
2512 // send new stuff to append to replicas log
2513 m->log.copy_after(cct, pg_log.get_log(), pi.last_update);
2514 }
2515
2516 // share past_intervals if we are creating the pg on the replica
2517 // based on whether our info for that peer was dne() *before*
2518 // updating pi.history in the backfill block above.
2519 if (m && needs_past_intervals)
2520 m->past_intervals = past_intervals;
2521
2522 // update local version of peer's missing list!
2523 if (m && pi.last_backfill != hobject_t()) {
2524 for (list<pg_log_entry_t>::iterator p = m->log.log.begin();
2525 p != m->log.log.end();
2526 ++p) {
2527 if (p->soid <= pi.last_backfill &&
2528 !p->is_error()) {
2529 if (perform_deletes_during_peering() && p->is_delete()) {
2530 pm.rm(p->soid, p->version);
2531 } else {
2532 pm.add_next_event(*p);
2533 }
2534 }
2535 }
2536 }
2537
2538 if (m) {
2539 dout(10) << "activate peer osd." << peer << " sending " << m->log
2540 << dendl;
2541 m->lease = get_lease();
2542 pl->send_cluster_message(peer.osd, m, get_osdmap_epoch());
2543 }
2544
2545 // peer now has
2546 pi.last_update = info.last_update;
2547
2548 // update our missing
2549 if (pm.num_missing() == 0) {
2550 pi.last_complete = pi.last_update;
2551 psdout(10) << "activate peer osd." << peer << " " << pi
2552 << " uptodate" << dendl;
2553 } else {
2554 psdout(10) << "activate peer osd." << peer << " " << pi
2555 << " missing " << pm << dendl;
2556 }
2557 }
2558
2559 // Set up missing_loc
2560 set<pg_shard_t> complete_shards;
2561 for (set<pg_shard_t>::iterator i = acting_recovery_backfill.begin();
2562 i != acting_recovery_backfill.end();
2563 ++i) {
2564 psdout(20) << __func__ << " setting up missing_loc from shard " << *i
2565 << " " << dendl;
2566 if (*i == get_primary()) {
2567 missing_loc.add_active_missing(missing);
2568 if (!missing.have_missing())
2569 complete_shards.insert(*i);
2570 } else {
2571 auto peer_missing_entry = peer_missing.find(*i);
2572 ceph_assert(peer_missing_entry != peer_missing.end());
2573 missing_loc.add_active_missing(peer_missing_entry->second);
2574 if (!peer_missing_entry->second.have_missing() &&
2575 peer_info[*i].last_backfill.is_max())
2576 complete_shards.insert(*i);
2577 }
2578 }
2579
2580 // If necessary, create might_have_unfound to help us find our unfound objects.
2581 // NOTE: It's important that we build might_have_unfound before trimming the
2582 // past intervals.
2583 might_have_unfound.clear();
2584 if (needs_recovery()) {
2585 // If only one shard has missing objects, we add all the others as recovery
2586 // sources. This is considered safe since the PGLogs have been merged
2587 // locally, and it covers the vast majority of use cases, such as one
2588 // OSD/host being down for a while for hardware repair.
2589 if (complete_shards.size() + 1 == acting_recovery_backfill.size()) {
2590 missing_loc.add_batch_sources_info(complete_shards, ctx.handle);
2591 } else {
2592 missing_loc.add_source_info(pg_whoami, info, pg_log.get_missing(),
2593 ctx.handle);
2594 for (set<pg_shard_t>::iterator i = acting_recovery_backfill.begin();
2595 i != acting_recovery_backfill.end();
2596 ++i) {
2597 if (*i == pg_whoami) continue;
2598 psdout(10) << __func__ << ": adding " << *i << " as a source" << dendl;
2599 ceph_assert(peer_missing.count(*i));
2600 ceph_assert(peer_info.count(*i));
2601 missing_loc.add_source_info(
2602 *i,
2603 peer_info[*i],
2604 peer_missing[*i],
2605 ctx.handle);
2606 }
2607 }
2608 for (map<pg_shard_t, pg_missing_t>::iterator i = peer_missing.begin();
2609 i != peer_missing.end();
2610 ++i) {
2611 if (is_acting_recovery_backfill(i->first))
2612 continue;
2613 ceph_assert(peer_info.count(i->first));
2614 search_for_missing(
2615 peer_info[i->first],
2616 i->second,
2617 i->first,
2618 ctx);
2619 }
2620
2621 build_might_have_unfound();
2622
2623 // Always call now so update_calc_stats() will be accurate
2624 discover_all_missing(ctx.msgs);
2625
2626 }
2627
2628 // num_objects_degraded, if calculated, should reflect this too, unless
2629 // nothing is missing and we are about to go clean.
2630 if (get_osdmap()->get_pg_size(info.pgid.pgid) > actingset.size()) {
2631 state_set(PG_STATE_UNDERSIZED);
2632 }
2633
2634 state_set(PG_STATE_ACTIVATING);
2635 pl->on_activate(std::move(to_trim));
2636 }
2637 if (acting.size() >= pool.info.min_size) {
2638 PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
2639 pg_log.roll_forward(rollbacker.get());
2640 }
2641 }
2642
2643 void PeeringState::share_pg_info()
2644 {
2645 psdout(10) << "share_pg_info" << dendl;
2646
2647 info.history.refresh_prior_readable_until_ub(pl->get_mnow(),
2648 prior_readable_until_ub);
2649
2650 // share new pg_info_t with replicas
2651 ceph_assert(!acting_recovery_backfill.empty());
2652 for (auto pg_shard : acting_recovery_backfill) {
2653 if (pg_shard == pg_whoami) continue;
2654 if (auto peer = peer_info.find(pg_shard); peer != peer_info.end()) {
2655 peer->second.last_epoch_started = info.last_epoch_started;
2656 peer->second.last_interval_started = info.last_interval_started;
2657 peer->second.history.merge(info.history);
2658 }
2659 Message* m = nullptr;
2660 if (last_require_osd_release >= ceph_release_t::octopus) {
2661 m = new MOSDPGInfo2{spg_t{info.pgid.pgid, pg_shard.shard},
2662 info,
2663 get_osdmap_epoch(),
2664 get_osdmap_epoch(),
2665 get_lease(), {}};
2666 } else {
2667 m = new MOSDPGInfo{get_osdmap_epoch(),
2668 {pg_notify_t{pg_shard.shard,
2669 pg_whoami.shard,
2670 get_osdmap_epoch(),
2671 get_osdmap_epoch(),
2672 info,
2673 past_intervals}}};
2674 }
2675 pl->send_cluster_message(pg_shard.osd, m, get_osdmap_epoch());
2676 }
2677 }
2678
2679 void PeeringState::merge_log(
2680 ObjectStore::Transaction& t, pg_info_t &oinfo, pg_log_t &olog,
2681 pg_shard_t from)
2682 {
2683 PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
2684 pg_log.merge_log(
2685 oinfo, olog, from, info, rollbacker.get(), dirty_info, dirty_big_info);
2686 }
2687
2688 void PeeringState::rewind_divergent_log(
2689 ObjectStore::Transaction& t, eversion_t newhead)
2690 {
2691 PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
2692 pg_log.rewind_divergent_log(
2693 newhead, info, rollbacker.get(), dirty_info, dirty_big_info);
2694 }
2695
2696
2697 void PeeringState::proc_primary_info(
2698 ObjectStore::Transaction &t, const pg_info_t &oinfo)
2699 {
2700 ceph_assert(!is_primary());
2701
2702 update_history(oinfo.history);
2703 if (!info.stats.stats_invalid && info.stats.stats.sum.num_scrub_errors) {
2704 info.stats.stats.sum.num_scrub_errors = 0;
2705 info.stats.stats.sum.num_shallow_scrub_errors = 0;
2706 info.stats.stats.sum.num_deep_scrub_errors = 0;
2707 dirty_info = true;
2708 }
2709
2710 if (!(info.purged_snaps == oinfo.purged_snaps)) {
2711 psdout(10) << __func__ << " updating purged_snaps to "
2712 << oinfo.purged_snaps
2713 << dendl;
2714 info.purged_snaps = oinfo.purged_snaps;
2715 dirty_info = true;
2716 dirty_big_info = true;
2717 }
2718 }
2719
2720 void PeeringState::proc_master_log(
2721 ObjectStore::Transaction& t, pg_info_t &oinfo,
2722 pg_log_t &olog, pg_missing_t& omissing, pg_shard_t from)
2723 {
2724 psdout(10) << "proc_master_log for osd." << from << ": "
2725 << olog << " " << omissing << dendl;
2726 ceph_assert(!is_peered() && is_primary());
2727
2728 // merge log into our own log to build master log. no need to
2729 // make any adjustments to their missing map; we are taking their
2730 // log to be authoritative (i.e., their entries are by definition
2731 // non-divergent).
2732 merge_log(t, oinfo, olog, from);
2733 peer_info[from] = oinfo;
2734 psdout(10) << " peer osd." << from << " now " << oinfo
2735 << " " << omissing << dendl;
2736 might_have_unfound.insert(from);
2737
2738 // See doc/dev/osd_internals/last_epoch_started
2739 if (oinfo.last_epoch_started > info.last_epoch_started) {
2740 info.last_epoch_started = oinfo.last_epoch_started;
2741 dirty_info = true;
2742 }
2743 if (oinfo.last_interval_started > info.last_interval_started) {
2744 info.last_interval_started = oinfo.last_interval_started;
2745 dirty_info = true;
2746 }
2747 update_history(oinfo.history);
2748 ceph_assert(cct->_conf->osd_find_best_info_ignore_history_les ||
2749 info.last_epoch_started >= info.history.last_epoch_started);
2750
2751 peer_missing[from].claim(omissing);
2752 }
2753
2754 void PeeringState::proc_replica_log(
2755 pg_info_t &oinfo,
2756 const pg_log_t &olog,
2757 pg_missing_t& omissing,
2758 pg_shard_t from)
2759 {
2760 psdout(10) << "proc_replica_log for osd." << from << ": "
2761 << oinfo << " " << olog << " " << omissing << dendl;
2762
2763 pg_log.proc_replica_log(oinfo, olog, omissing, from);
2764
2765 peer_info[from] = oinfo;
2766 psdout(10) << " peer osd." << from << " now "
2767 << oinfo << " " << omissing << dendl;
2768 might_have_unfound.insert(from);
2769
2770 for (map<hobject_t, pg_missing_item>::const_iterator i =
2771 omissing.get_items().begin();
2772 i != omissing.get_items().end();
2773 ++i) {
2774 psdout(20) << " after missing " << i->first
2775 << " need " << i->second.need
2776 << " have " << i->second.have << dendl;
2777 }
2778 peer_missing[from].claim(omissing);
2779 }
2780
2781 void PeeringState::fulfill_info(
2782 pg_shard_t from, const pg_query_t &query,
2783 pair<pg_shard_t, pg_info_t> &notify_info)
2784 {
2785 ceph_assert(from == primary);
2786 ceph_assert(query.type == pg_query_t::INFO);
2787
2788 // info
2789 psdout(10) << "sending info" << dendl;
2790 notify_info = make_pair(from, info);
2791 }
2792
2793 void PeeringState::fulfill_log(
2794 pg_shard_t from, const pg_query_t &query, epoch_t query_epoch)
2795 {
2796 psdout(10) << "log request from " << from << dendl;
2797 ceph_assert(from == primary);
2798 ceph_assert(query.type != pg_query_t::INFO);
2799
2800 MOSDPGLog *mlog = new MOSDPGLog(
2801 from.shard, pg_whoami.shard,
2802 get_osdmap_epoch(),
2803 info, query_epoch);
2804 mlog->missing = pg_log.get_missing();
2805
2806 // primary -> other, when building master log
2807 if (query.type == pg_query_t::LOG) {
2808 psdout(10) << " sending info+missing+log since " << query.since
2809 << dendl;
2810 if (query.since != eversion_t() && query.since < pg_log.get_tail()) {
2811 pl->get_clog_error() << info.pgid << " got broken pg_query_t::LOG since "
2812 << query.since
2813 << " when my log.tail is " << pg_log.get_tail()
2814 << ", sending full log instead";
2815 mlog->log = pg_log.get_log(); // primary should not have requested this!!
2816 } else
2817 mlog->log.copy_after(cct, pg_log.get_log(), query.since);
2818 }
2819 else if (query.type == pg_query_t::FULLLOG) {
2820 psdout(10) << " sending info+missing+full log" << dendl;
2821 mlog->log = pg_log.get_log();
2822 }
2823
2824 psdout(10) << " sending " << mlog->log << " " << mlog->missing << dendl;
2825
2826 pl->send_cluster_message(from.osd, mlog, get_osdmap_epoch(), true);
2827 }
2828
2829 void PeeringState::fulfill_query(const MQuery& query, PeeringCtxWrapper &rctx)
2830 {
2831 if (query.query.type == pg_query_t::INFO) {
2832 pair<pg_shard_t, pg_info_t> notify_info;
2833 // note this refreshes our prior_readable_until_ub value
2834 update_history(query.query.history);
2835 fulfill_info(query.from, query.query, notify_info);
2836 rctx.send_notify(
2837 notify_info.first.osd,
2838 pg_notify_t(
2839 notify_info.first.shard, pg_whoami.shard,
2840 query.query_epoch,
2841 get_osdmap_epoch(),
2842 notify_info.second,
2843 past_intervals));
2844 } else {
2845 update_history(query.query.history);
2846 fulfill_log(query.from, query.query, query.query_epoch);
2847 }
2848 }
2849
2850 void PeeringState::try_mark_clean()
2851 {
2852 if (actingset.size() == get_osdmap()->get_pg_size(info.pgid.pgid)) {
2853 state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
2854 state_set(PG_STATE_CLEAN);
2855 info.history.last_epoch_clean = get_osdmap_epoch();
2856 info.history.last_interval_clean = info.history.same_interval_since;
2857 past_intervals.clear();
2858 dirty_big_info = true;
2859 dirty_info = true;
2860 }
2861
2862 if (!is_active() && is_peered()) {
2863 if (is_clean()) {
2864 bool target;
2865 if (pool.info.is_pending_merge(info.pgid.pgid, &target)) {
2866 if (target) {
2867 psdout(10) << "ready to merge (target)" << dendl;
2868 pl->set_ready_to_merge_target(
2869 info.last_update,
2870 info.history.last_epoch_started,
2871 info.history.last_epoch_clean);
2872 } else {
2873 psdout(10) << "ready to merge (source)" << dendl;
2874 pl->set_ready_to_merge_source(info.last_update);
2875 }
2876 }
2877 } else {
2878 psdout(10) << "not clean, not ready to merge" << dendl;
2879 // we should have notified OSD in Active state entry point
2880 }
2881 }
2882
2883 state_clear(PG_STATE_FORCED_RECOVERY | PG_STATE_FORCED_BACKFILL);
2884
2885 share_pg_info();
2886 pl->publish_stats_to_osd();
2887 clear_recovery_state();
2888 }
2889
2890 void PeeringState::split_into(
2891 pg_t child_pgid, PeeringState *child, unsigned split_bits)
2892 {
2893 child->update_osdmap_ref(get_osdmap());
2894 child->pool = pool;
2895
2896 // Log
2897 pg_log.split_into(child_pgid, split_bits, &(child->pg_log));
2898 child->info.last_complete = info.last_complete;
2899
2900 info.last_update = pg_log.get_head();
2901 child->info.last_update = child->pg_log.get_head();
2902
2903 child->info.last_user_version = info.last_user_version;
2904
2905 info.log_tail = pg_log.get_tail();
2906 child->info.log_tail = child->pg_log.get_tail();
2907
2908 // reset last_complete, we might have modified pg_log & missing above
2909 pg_log.reset_complete_to(&info);
2910 child->pg_log.reset_complete_to(&child->info);
2911
2912 // Info
2913 child->info.history = info.history;
2914 child->info.history.epoch_created = get_osdmap_epoch();
2915 child->info.purged_snaps = info.purged_snaps;
2916
2917 if (info.last_backfill.is_max()) {
2918 child->info.set_last_backfill(hobject_t::get_max());
2919 } else {
2920 // restart backfill on parent and child to be safe. we could
2921 // probably do better in the bitwise sort case, but it's more
2922 // fragile (there may be special work to do on backfill completion
2923 // in the future).
2924 info.set_last_backfill(hobject_t());
2925 child->info.set_last_backfill(hobject_t());
2926 // restarting backfill implies that the missing set is empty,
2927 // since it is only used for objects prior to last_backfill
2928 pg_log.reset_backfill();
2929 child->pg_log.reset_backfill();
2930 }
2931
2932 child->info.stats = info.stats;
2933 child->info.stats.parent_split_bits = split_bits;
2934 info.stats.stats_invalid = true;
2935 child->info.stats.stats_invalid = true;
2936 child->info.last_epoch_started = info.last_epoch_started;
2937 child->info.last_interval_started = info.last_interval_started;
2938
2939 // There can't be recovery/backfill going on now
2940 int primary, up_primary;
2941 vector<int> newup, newacting;
2942 get_osdmap()->pg_to_up_acting_osds(
2943 child->info.pgid.pgid, &newup, &up_primary, &newacting, &primary);
2944 child->init_primary_up_acting(
2945 newup,
2946 newacting,
2947 up_primary,
2948 primary);
2949 child->role = OSDMap::calc_pg_role(pg_whoami, child->acting);
2950
2951 // this comparison includes primary rank via pg_shard_t
2952 if (get_primary() != child->get_primary())
2953 child->info.history.same_primary_since = get_osdmap_epoch();
2954
2955 child->info.stats.up = up;
2956 child->info.stats.up_primary = up_primary;
2957 child->info.stats.acting = acting;
2958 child->info.stats.acting_primary = primary;
2959 child->info.stats.mapping_epoch = get_osdmap_epoch();
2960
2961 // History
2962 child->past_intervals = past_intervals;
2963
2964 child->on_new_interval();
2965
2966 child->send_notify = !child->is_primary();
2967
2968 child->dirty_info = true;
2969 child->dirty_big_info = true;
2970 dirty_info = true;
2971 dirty_big_info = true;
2972 }
2973
2974 void PeeringState::merge_from(
2975 map<spg_t,PeeringState *>& sources,
2976 PeeringCtx &rctx,
2977 unsigned split_bits,
2978 const pg_merge_meta_t& last_pg_merge_meta)
2979 {
2980 bool incomplete = false;
2981 if (info.last_complete != info.last_update ||
2982 info.is_incomplete() ||
2983 info.dne()) {
2984 psdout(10) << __func__ << " target incomplete" << dendl;
2985 incomplete = true;
2986 }
2987 if (last_pg_merge_meta.source_pgid != pg_t()) {
2988 if (info.pgid.pgid != last_pg_merge_meta.source_pgid.get_parent()) {
2989 psdout(10) << __func__ << " target doesn't match expected parent "
2990 << last_pg_merge_meta.source_pgid.get_parent()
2991 << " of source_pgid " << last_pg_merge_meta.source_pgid
2992 << dendl;
2993 incomplete = true;
2994 }
2995 if (info.last_update != last_pg_merge_meta.target_version) {
2996 psdout(10) << __func__ << " target version doesn't match expected "
2997 << last_pg_merge_meta.target_version << dendl;
2998 incomplete = true;
2999 }
3000 }
3001
3002 PGLog::LogEntryHandlerRef handler{pl->get_log_handler(rctx.transaction)};
3003 pg_log.roll_forward(handler.get());
3004
3005 info.last_complete = info.last_update; // to fake out trim()
3006 pg_log.reset_recovery_pointers();
3007 pg_log.trim(info.last_update, info);
3008
3009 vector<PGLog*> log_from;
3010 for (auto& i : sources) {
3011 auto& source = i.second;
3012 if (!source) {
3013 psdout(10) << __func__ << " source " << i.first << " missing" << dendl;
3014 incomplete = true;
3015 continue;
3016 }
3017 if (source->info.last_complete != source->info.last_update ||
3018 source->info.is_incomplete() ||
3019 source->info.dne()) {
3020 psdout(10) << __func__ << " source " << source->pg_whoami
3021 << " incomplete"
3022 << dendl;
3023 incomplete = true;
3024 }
3025 if (last_pg_merge_meta.source_pgid != pg_t()) {
3026 if (source->info.pgid.pgid != last_pg_merge_meta.source_pgid) {
3027 dout(10) << __func__ << " source " << source->info.pgid.pgid
3028 << " doesn't match expected source pgid "
3029 << last_pg_merge_meta.source_pgid << dendl;
3030 incomplete = true;
3031 }
3032 if (source->info.last_update != last_pg_merge_meta.source_version) {
3033 dout(10) << __func__ << " source version doesn't match expected "
3034 << last_pg_merge_meta.source_version << dendl;
3035 incomplete = true;
3036 }
3037 }
3038
3039 // prepare log
3040 PGLog::LogEntryHandlerRef handler{
3041 source->pl->get_log_handler(rctx.transaction)};
3042 source->pg_log.roll_forward(handler.get());
3043 source->info.last_complete = source->info.last_update; // to fake out trim()
3044 source->pg_log.reset_recovery_pointers();
3045 source->pg_log.trim(source->info.last_update, source->info);
3046 log_from.push_back(&source->pg_log);
3047
3048 // combine stats
3049 info.stats.add(source->info.stats);
3050
3051 // pull up last_update
3052 info.last_update = std::max(info.last_update, source->info.last_update);
3053
3054 // adopt source's PastIntervals if target has none. we can do this since
3055 // pgp_num has been reduced prior to the merge, so the OSD mappings for
3056 // the PGs are identical.
3057 if (past_intervals.empty() && !source->past_intervals.empty()) {
3058 psdout(10) << __func__ << " taking source's past_intervals" << dendl;
3059 past_intervals = source->past_intervals;
3060 }
3061 }
3062
3063 info.last_complete = info.last_update;
3064 info.log_tail = info.last_update;
3065 if (incomplete) {
3066 info.last_backfill = hobject_t();
3067 }
3068
3069 // merge logs
3070 pg_log.merge_from(log_from, info.last_update);
3071
3072 // make sure we have a meaningful last_epoch_started/clean (if we were a
3073 // placeholder)
3074 if (info.history.epoch_created == 0) {
3075 // start with (a) source's history, since these PGs *should* have been
3076 // remapped in concert with each other...
3077 info.history = sources.begin()->second->info.history;
3078
3079 // we use the last_epoch_{started,clean} we got from
3080 // the caller, which are the epochs that were reported when the PGs
3081 // were found to be ready for merge.
3082 info.history.last_epoch_clean = last_pg_merge_meta.last_epoch_clean;
3083 info.history.last_epoch_started = last_pg_merge_meta.last_epoch_started;
3084 info.last_epoch_started = last_pg_merge_meta.last_epoch_started;
3085 psdout(10) << __func__
3086 << " set les/c to " << last_pg_merge_meta.last_epoch_started << "/"
3087 << last_pg_merge_meta.last_epoch_clean
3088 << " from pool last_dec_*, source pg history was "
3089 << sources.begin()->second->info.history
3090 << dendl;
3091
3092 // above we pulled down the source's history, so we need to check
3093 // history.epoch_created again to confirm that the source is not a
3094 // placeholder too. (peering requires a sane history.same_interval_since
3095 // value for any non-newly-created pg, and below here we are basically
3096 // iterating back through a series of past maps to fake a merge process,
3097 // hence we need to fix history.same_interval_since first so that
3098 // start_peering_interval() will not complain)
3099 if (info.history.epoch_created == 0) {
3100 dout(10) << __func__ << " both merge target and source are placeholders,"
3101 << " set sis to lec " << info.history.last_epoch_clean
3102 << dendl;
3103 info.history.same_interval_since = info.history.last_epoch_clean;
3104 }
3105
3106 // if the past_intervals start is later than last_epoch_clean, it
3107 // implies the source re-peered again but the target didn't, or
3108 // that the source became clean in a later epoch than the target.
3109 // avoid the discrepancy by adjusting the interval start
3110 // backwards to match so that check_past_interval_bounds() will
3111 // not complain.
3112 auto pib = past_intervals.get_bounds();
3113 if (info.history.last_epoch_clean < pib.first) {
3114 psdout(10) << __func__ << " last_epoch_clean "
3115 << info.history.last_epoch_clean << " < past_interval start "
3116 << pib.first << ", adjusting start backwards" << dendl;
3117 past_intervals.adjust_start_backwards(info.history.last_epoch_clean);
3118 }
3119
3120 // Similarly, if the same_interval_since value is later than
3121 // last_epoch_clean, the next interval change will result in a
3122 // past_interval start that is later than last_epoch_clean. This
3123 // can happen if we use the pg_history values from the merge
3124 // source. Adjust the same_interval_since value backwards if that
3125 // happens. (We trust the les and lec values more because they came from
3126 // the real target, whereas the history value we stole from the source.)
3127 if (info.history.last_epoch_started < info.history.same_interval_since) {
3128 psdout(10) << __func__ << " last_epoch_started "
3129 << info.history.last_epoch_started << " < same_interval_since "
3130 << info.history.same_interval_since
3131 << ", adjusting pg_history backwards" << dendl;
3132 info.history.same_interval_since = info.history.last_epoch_clean;
3133 // make sure same_{up,primary}_since are <= same_interval_since
3134 info.history.same_up_since = std::min(
3135 info.history.same_up_since, info.history.same_interval_since);
3136 info.history.same_primary_since = std::min(
3137 info.history.same_primary_since, info.history.same_interval_since);
3138 }
3139 }
3140
3141 dirty_info = true;
3142 dirty_big_info = true;
3143 }
3144
3145 void PeeringState::start_split_stats(
3146 const set<spg_t>& childpgs, vector<object_stat_sum_t> *out)
3147 {
3148 out->resize(childpgs.size() + 1);
3149 info.stats.stats.sum.split(*out);
3150 }
3151
3152 void PeeringState::finish_split_stats(
3153 const object_stat_sum_t& stats, ObjectStore::Transaction &t)
3154 {
3155 info.stats.stats.sum = stats;
3156 write_if_dirty(t);
3157 }
3158
3159 void PeeringState::update_blocked_by()
3160 {
3161 // set a max on the number of blocking peers we report. if we go
3162 // over, report a random subset. keep the result sorted.
3163 unsigned keep = std::min<unsigned>(
3164 blocked_by.size(), cct->_conf->osd_max_pg_blocked_by);
3165 unsigned skip = blocked_by.size() - keep;
3166 info.stats.blocked_by.clear();
3167 info.stats.blocked_by.resize(keep);
3168 unsigned pos = 0;
3169 for (set<int>::iterator p = blocked_by.begin();
3170 p != blocked_by.end() && keep > 0;
3171 ++p) {
3172 if (skip > 0 && (rand() % (skip + keep) < skip)) {
3173 --skip;
3174 } else {
3175 info.stats.blocked_by[pos++] = *p;
3176 --keep;
3177 }
3178 }
3179 }
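/* Editor's sketch (illustrative only, not part of the build): the loop
 * above is selection sampling -- walk the ordered set once and keep each
 * element with probability keep/(skip+keep), yielding a uniformly random,
 * still-sorted subset of the requested size. The same idea on a plain
 * vector, with hypothetical names:
 *
 *   #include <cstdlib>
 *   #include <vector>
 *   std::vector<int> sample_sorted(const std::vector<int> &in, unsigned keep)
 *   {
 *     unsigned skip = in.size() > keep ? in.size() - keep : 0;
 *     std::vector<int> out;
 *     for (int v : in) {
 *       if (keep == 0)
 *         break;
 *       if (skip > 0 && (rand() % (skip + keep) < skip)) {
 *         --skip;           // v not chosen; one fewer element to skip
 *       } else {
 *         out.push_back(v); // v chosen; result stays in input order
 *         --keep;
 *       }
 *     }
 *     return out;
 *   }
 */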
3180
3181 static bool find_shard(const set<pg_shard_t> & pgs, shard_id_t shard)
3182 {
3183 for (auto &p : pgs)
3184 if (p.shard == shard)
3185 return true;
3186 return false;
3187 }
3188
3189 static pg_shard_t get_another_shard(const set<pg_shard_t> & pgs, pg_shard_t skip, shard_id_t shard)
3190 {
3191 for (auto &p : pgs) {
3192 if (p == skip)
3193 continue;
3194 if (p.shard == shard)
3195 return p;
3196 }
3197 return pg_shard_t();
3198 }
3199
3200 void PeeringState::update_calc_stats()
3201 {
3202 info.stats.version = info.last_update;
3203 info.stats.created = info.history.epoch_created;
3204 info.stats.last_scrub = info.history.last_scrub;
3205 info.stats.last_scrub_stamp = info.history.last_scrub_stamp;
3206 info.stats.last_deep_scrub = info.history.last_deep_scrub;
3207 info.stats.last_deep_scrub_stamp = info.history.last_deep_scrub_stamp;
3208 info.stats.last_clean_scrub_stamp = info.history.last_clean_scrub_stamp;
3209 info.stats.last_epoch_clean = info.history.last_epoch_clean;
3210
3211 info.stats.log_size = pg_log.get_head().version - pg_log.get_tail().version;
3212 info.stats.ondisk_log_size = info.stats.log_size;
3213 info.stats.log_start = pg_log.get_tail();
3214 info.stats.ondisk_log_start = pg_log.get_tail();
3215 info.stats.snaptrimq_len = pl->get_snap_trimq_size();
3216
3217 unsigned num_shards = get_osdmap()->get_pg_size(info.pgid.pgid);
3218
3219 // In the rare case that upset is too large (usually transient), use it
3220 // as the target for the calculations below.
3221 unsigned target = std::max(num_shards, (unsigned)upset.size());
3222 // When undersized, actingset may be larger than upset, with OSDs out
3223 unsigned nrep = std::max(actingset.size(), upset.size());
3224 // calc num_object_copies
3225 info.stats.stats.calc_copies(std::max(target, nrep));
3226 info.stats.stats.sum.num_objects_degraded = 0;
3227 info.stats.stats.sum.num_objects_unfound = 0;
3228 info.stats.stats.sum.num_objects_misplaced = 0;
3229 info.stats.avail_no_missing.clear();
3230 info.stats.object_location_counts.clear();
3231
3232 // We should never hit this condition, but if we end up hitting it,
3233 // make sure to update num_objects and set PG_STATE_INCONSISTENT.
3234 if (info.stats.stats.sum.num_objects < 0) {
3235 psdout(0) << __func__ << " negative num_objects = "
3236 << info.stats.stats.sum.num_objects << " setting it to 0 "
3237 << dendl;
3238 info.stats.stats.sum.num_objects = 0;
3239 state_set(PG_STATE_INCONSISTENT);
3240 }
3241
3242 if ((is_remapped() || is_undersized() || !is_clean()) &&
3243 (is_peered()|| is_activating())) {
3244 psdout(20) << __func__ << " actingset " << actingset << " upset "
3245 << upset << " acting_recovery_backfill " << acting_recovery_backfill << dendl;
3246
3247 ceph_assert(!acting_recovery_backfill.empty());
3248
3249 bool estimate = false;
3250
3251 // NOTE: we only generate degraded, misplaced and unfound
3252 // values for the summation, not individual stat categories.
3253 int64_t num_objects = info.stats.stats.sum.num_objects;
3254
3255 // Objects missing from up nodes, sorted by # objects.
3256 boost::container::flat_set<pair<int64_t,pg_shard_t>> missing_target_objects;
3257 // Objects missing from nodes not in up, sort by # objects
3258 boost::container::flat_set<pair<int64_t,pg_shard_t>> acting_source_objects;
3259
3260 // Fill missing_target_objects/acting_source_objects
3261
3262 {
3263 int64_t missing;
3264
3265 // Primary first
3266 missing = pg_log.get_missing().num_missing();
3267 ceph_assert(acting_recovery_backfill.count(pg_whoami));
3268 if (upset.count(pg_whoami)) {
3269 missing_target_objects.emplace(missing, pg_whoami);
3270 } else {
3271 acting_source_objects.emplace(missing, pg_whoami);
3272 }
3273 info.stats.stats.sum.num_objects_missing_on_primary = missing;
3274 if (missing == 0)
3275 info.stats.avail_no_missing.push_back(pg_whoami);
3276 psdout(20) << __func__ << " shard " << pg_whoami
3277 << " primary objects " << num_objects
3278 << " missing " << missing
3279 << dendl;
3280 }
3281
3282 // All other peers
3283 for (auto& peer : peer_info) {
3284 // Primary should not be in the peer_info, skip if it is.
3285 if (peer.first == pg_whoami) continue;
3286 int64_t missing = 0;
3287 int64_t peer_num_objects =
3288 std::max((int64_t)0, peer.second.stats.stats.sum.num_objects);
3289 // Backfill targets always track num_objects accurately;
3290 // all other peers track missing accurately.
3291 if (is_backfill_target(peer.first)) {
3292 missing = std::max((int64_t)0, num_objects - peer_num_objects);
3293 } else {
3294 if (peer_missing.count(peer.first)) {
3295 missing = peer_missing[peer.first].num_missing();
3296 } else {
3297 psdout(20) << __func__ << " no peer_missing found for "
3298 << peer.first << dendl;
3299 if (is_recovering()) {
3300 estimate = true;
3301 }
3302 missing = std::max((int64_t)0, num_objects - peer_num_objects);
3303 }
3304 }
3305 if (upset.count(peer.first)) {
3306 missing_target_objects.emplace(missing, peer.first);
3307 } else if (actingset.count(peer.first)) {
3308 acting_source_objects.emplace(missing, peer.first);
3309 }
3310 peer.second.stats.stats.sum.num_objects_missing = missing;
3311 if (missing == 0)
3312 info.stats.avail_no_missing.push_back(peer.first);
3313 psdout(20) << __func__ << " shard " << peer.first
3314 << " objects " << peer_num_objects
3315 << " missing " << missing
3316 << dendl;
3317 }
3318
3319 // Compute object_location_counts
3320 for (auto& ml: missing_loc.get_missing_locs()) {
3321 info.stats.object_location_counts[ml.second]++;
3322 psdout(30) << __func__ << " " << ml.first << " object_location_counts["
3323 << ml.second << "]=" << info.stats.object_location_counts[ml.second]
3324 << dendl;
3325 }
3326 int64_t not_missing = num_objects - missing_loc.get_missing_locs().size();
3327 if (not_missing) {
3328 // During recovery we know upset == actingset and that it is being populated.
3329 // During backfill we know that all non-missing objects are in the actingset.
3330 info.stats.object_location_counts[actingset] = not_missing;
3331 }
3332 psdout(30) << __func__ << " object_location_counts["
3333 << upset << "]=" << info.stats.object_location_counts[upset]
3334 << dendl;
3335 psdout(20) << __func__ << " object_location_counts "
3336 << info.stats.object_location_counts << dendl;
3337
3338 // A misplaced object is not stored on the correct OSD
3339 int64_t misplaced = 0;
3340 // A degraded object has fewer replicas or EC shards than the pool specifies.
3341 int64_t degraded = 0;
3342
3343 if (is_recovering()) {
3344 for (auto& sml: missing_loc.get_missing_by_count()) {
3345 for (auto& ml: sml.second) {
3346 int missing_shards;
3347 if (sml.first == shard_id_t::NO_SHARD) {
3348 psdout(20) << __func__ << " ml " << ml.second
3349 << " upset size " << upset.size()
3350 << " up " << ml.first.up << dendl;
3351 missing_shards = (int)upset.size() - ml.first.up;
3352 } else {
3353 // Handle shards not even in upset below
3354 if (!find_shard(upset, sml.first))
3355 continue;
3356 missing_shards = std::max(0, 1 - ml.first.up);
3357 psdout(20) << __func__
3358 << " shard " << sml.first
3359 << " ml " << ml.second
3360 << " missing shards " << missing_shards << dendl;
3361 }
3362 int odegraded = ml.second * missing_shards;
3363 // Copies on other osds, but limited to the possible degraded count
3364 int more_osds = std::min(missing_shards, ml.first.other);
3365 int omisplaced = ml.second * more_osds;
3366 ceph_assert(omisplaced <= odegraded);
3367 odegraded -= omisplaced;
3368
3369 misplaced += omisplaced;
3370 degraded += odegraded;
3371 }
3372 }
3373
3374 psdout(20) << __func__ << " missing based degraded "
3375 << degraded << dendl;
3376 psdout(20) << __func__ << " missing based misplaced "
3377 << misplaced << dendl;
3378
3379 // Handle undersized case
3380 if (pool.info.is_replicated()) {
3381 // Add degraded for missing targets (num_objects missing)
3382 ceph_assert(target >= upset.size());
3383 unsigned needed = target - upset.size();
3384 degraded += num_objects * needed;
3385 } else {
3386 for (unsigned i = 0 ; i < num_shards; ++i) {
3387 shard_id_t shard(i);
3388
3389 if (!find_shard(upset, shard)) {
3390 pg_shard_t pgs = get_another_shard(actingset, pg_shard_t(), shard);
3391
3392 if (pgs != pg_shard_t()) {
3393 int64_t missing;
3394
3395 if (pgs == pg_whoami)
3396 missing = info.stats.stats.sum.num_objects_missing_on_primary;
3397 else
3398 missing = peer_info[pgs].stats.stats.sum.num_objects_missing;
3399
3400 degraded += missing;
3401 misplaced += std::max((int64_t)0, num_objects - missing);
3402 } else {
3403 // No shard anywhere
3404 degraded += num_objects;
3405 }
3406 }
3407 }
3408 }
3409 goto out;
3410 }
3411
3412 // Handle undersized case
3413 if (pool.info.is_replicated()) {
3414 // Add to missing_target_objects
3415 ceph_assert(target >= missing_target_objects.size());
3416 unsigned needed = target - missing_target_objects.size();
3417 if (needed)
3418 missing_target_objects.emplace(num_objects * needed, pg_shard_t(pg_shard_t::NO_OSD));
3419 } else {
3420 for (unsigned i = 0 ; i < num_shards; ++i) {
3421 shard_id_t shard(i);
3422 bool found = false;
3423 for (const auto& t : missing_target_objects) {
3424 if (std::get<1>(t).shard == shard) {
3425 found = true;
3426 break;
3427 }
3428 }
3429 if (!found)
3430 missing_target_objects.emplace(num_objects, pg_shard_t(pg_shard_t::NO_OSD,shard));
3431 }
3432 }
3433
3434 for (const auto& item : missing_target_objects)
3435 psdout(20) << __func__ << " missing shard " << std::get<1>(item)
3436 << " missing= " << std::get<0>(item) << dendl;
3437 for (const auto& item : acting_source_objects)
3438 psdout(20) << __func__ << " acting shard " << std::get<1>(item)
3439 << " missing= " << std::get<0>(item) << dendl;
3440
3441 // Handle all objects not in missing, for the remapped
3442 // or backfill cases
3443 for (auto m = missing_target_objects.rbegin();
3444 m != missing_target_objects.rend(); ++m) {
3445
3446 int64_t extra_missing = -1;
3447
3448 if (pool.info.is_replicated()) {
3449 if (!acting_source_objects.empty()) {
3450 auto extra_copy = acting_source_objects.begin();
3451 extra_missing = std::get<0>(*extra_copy);
3452 acting_source_objects.erase(extra_copy);
3453 }
3454 } else { // Erasure coded
3455 // Use corresponding shard
3456 for (const auto& a : acting_source_objects) {
3457 if (std::get<1>(a).shard == std::get<1>(*m).shard) {
3458 extra_missing = std::get<0>(a);
3459 acting_source_objects.erase(a);
3460 break;
3461 }
3462 }
3463 }
3464
3465 if (extra_missing >= 0 && std::get<0>(*m) >= extra_missing) {
3466 // We don't know which of the objects on the target
3467 // are part of extra_missing, so assume they are all degraded.
3468 misplaced += std::get<0>(*m) - extra_missing;
3469 degraded += extra_missing;
3470 } else {
3471 // 1. extra_missing == -1: more targets than sources, so degraded
3472 // 2. extra_missing > std::get<0>(*m): some objects that were previously
3473 // degraded are now present on the target.
3474 degraded += std::get<0>(*m);
3475 }
3476 }
3477 // If there are still acting shards that haven't been accounted for,
3478 // then they are misplaced
3479 for (const auto& a : acting_source_objects) {
3480 int64_t extra_misplaced = std::max((int64_t)0, num_objects - std::get<0>(a));
3481 psdout(20) << __func__ << " extra acting misplaced " << extra_misplaced
3482 << dendl;
3483 misplaced += extra_misplaced;
3484 }
3485 out:
3486 // NOTE: Tests use these messages to verify this code
3487 psdout(20) << __func__ << " degraded " << degraded
3488 << (estimate ? " (est)": "") << dendl;
3489 psdout(20) << __func__ << " misplaced " << misplaced
3490 << (estimate ? " (est)": "")<< dendl;
3491
3492 info.stats.stats.sum.num_objects_degraded = degraded;
3493 info.stats.stats.sum.num_objects_unfound = get_num_unfound();
3494 info.stats.stats.sum.num_objects_misplaced = misplaced;
3495 }
3496 }
3497
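// Worked example (hypothetical numbers, not part of the build) of the
// recovering-case arithmetic in update_calc_stats above: objects in one
// missing bucket are counted degraded once per shard absent from the up
// set, except that copies held on osds outside the up set can be moved
// into place and count as misplaced instead.
#if 0
#include <algorithm>
#include <cassert>
#include <cstdint>

int main() {
  int64_t objects = 4;      // ml.second: objects sharing this missing pattern
  int missing_shards = 2;   // shards absent from the up set
  int other = 1;            // copies available on osds outside the up set
  int64_t odegraded = objects * missing_shards;     // 8
  int more_osds = std::min(missing_shards, other);  // 1
  int64_t omisplaced = objects * more_osds;         // 4
  odegraded -= omisplaced;                          // 4 remain degraded
  assert(omisplaced == 4 && odegraded == 4);
  return 0;
}
#endif
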
3498 std::optional<pg_stat_t> PeeringState::prepare_stats_for_publish(
3499 bool pg_stats_publish_valid,
3500 const pg_stat_t &pg_stats_publish,
3501 const object_stat_collection_t &unstable_stats)
3502 {
3503 if (info.stats.stats.sum.num_scrub_errors) {
3504 state_set(PG_STATE_INCONSISTENT);
3505 } else {
3506 state_clear(PG_STATE_INCONSISTENT);
3507 state_clear(PG_STATE_FAILED_REPAIR);
3508 }
3509
3510 utime_t now = ceph_clock_now();
3511 if (info.stats.state != state) {
3512 info.stats.last_change = now;
3513 // Optimistic estimation: if we just found the PG to be inactive,
3514 // assume it was active until now.
3515 if (!(state & PG_STATE_ACTIVE) &&
3516 (info.stats.state & PG_STATE_ACTIVE))
3517 info.stats.last_active = now;
3518
3519 if ((state & PG_STATE_ACTIVE) &&
3520 !(info.stats.state & PG_STATE_ACTIVE))
3521 info.stats.last_became_active = now;
3522 if ((state & (PG_STATE_ACTIVE|PG_STATE_PEERED)) &&
3523 !(info.stats.state & (PG_STATE_ACTIVE|PG_STATE_PEERED)))
3524 info.stats.last_became_peered = now;
3525 info.stats.state = state;
3526 }
3527
3528 update_calc_stats();
3529 if (info.stats.stats.sum.num_objects_degraded) {
3530 state_set(PG_STATE_DEGRADED);
3531 } else {
3532 state_clear(PG_STATE_DEGRADED);
3533 }
3534 update_blocked_by();
3535
3536 pg_stat_t pre_publish = info.stats;
3537 pre_publish.stats.add(unstable_stats);
3538 utime_t cutoff = now;
3539 cutoff -= cct->_conf->osd_pg_stat_report_interval_max;
3540
3541 // share (some of) our purged_snaps via the pg_stats. limit # of intervals
3542 // because we don't want to make the pg_stat_t structures too expensive.
3543 unsigned max = cct->_conf->osd_max_snap_prune_intervals_per_epoch;
3544 unsigned num = 0;
3545 auto i = info.purged_snaps.begin();
3546 while (num < max && i != info.purged_snaps.end()) {
3547 pre_publish.purged_snaps.insert(i.get_start(), i.get_len());
3548 ++num;
3549 ++i;
3550 }
3551 psdout(20) << __func__ << " reporting purged_snaps "
3552 << pre_publish.purged_snaps << dendl;
3553
3554 if (pg_stats_publish_valid && pre_publish == pg_stats_publish &&
3555 info.stats.last_fresh > cutoff) {
3556 psdout(15) << "publish_stats_to_osd " << pg_stats_publish.reported_epoch
3557 << ": no change since " << info.stats.last_fresh << dendl;
3558 return std::nullopt;
3559 } else {
3560 // update our stat summary and timestamps
3561 info.stats.reported_epoch = get_osdmap_epoch();
3562 ++info.stats.reported_seq;
3563
3564 info.stats.last_fresh = now;
3565
3566 if (info.stats.state & PG_STATE_CLEAN)
3567 info.stats.last_clean = now;
3568 if (info.stats.state & PG_STATE_ACTIVE)
3569 info.stats.last_active = now;
3570 if (info.stats.state & (PG_STATE_ACTIVE|PG_STATE_PEERED))
3571 info.stats.last_peered = now;
3572 info.stats.last_unstale = now;
3573 if ((info.stats.state & PG_STATE_DEGRADED) == 0)
3574 info.stats.last_undegraded = now;
3575 if ((info.stats.state & PG_STATE_UNDERSIZED) == 0)
3576 info.stats.last_fullsized = now;
3577
3578 psdout(15) << "publish_stats_to_osd " << pg_stats_publish.reported_epoch
3579 << ":" << pg_stats_publish.reported_seq << dendl;
3580 return std::make_optional(std::move(pre_publish));
3581 }
3582 }
3583
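// Minimal sketch (simplified stand-in types, not part of the build) of the
// publish throttle in prepare_stats_for_publish above: a report is skipped
// only when the previous publication is still valid, nothing has changed,
// and last_fresh is newer than now - osd_pg_stat_report_interval_max.
#if 0
#include <optional>

struct SimpleStats {
  unsigned reported_seq = 0;
  bool operator==(const SimpleStats &o) const {
    return reported_seq == o.reported_seq;
  }
};

std::optional<SimpleStats> maybe_publish(
  bool publish_valid, const SimpleStats &published, SimpleStats current,
  double last_fresh, double now, double interval_max)
{
  double cutoff = now - interval_max;
  if (publish_valid && current == published && last_fresh > cutoff)
    return std::nullopt;      // unchanged and still fresh: skip the report
  ++current.reported_seq;     // otherwise bump the sequence and publish
  return current;
}
#endif
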
3584 void PeeringState::init(
3585 int role,
3586 const vector<int>& newup, int new_up_primary,
3587 const vector<int>& newacting, int new_acting_primary,
3588 const pg_history_t& history,
3589 const PastIntervals& pi,
3590 bool backfill,
3591 ObjectStore::Transaction &t)
3592 {
3593 psdout(10) << "init role " << role << " up "
3594 << newup << " acting " << newacting
3595 << " history " << history
3596 << " past_intervals " << pi
3597 << dendl;
3598
3599 set_role(role);
3600 init_primary_up_acting(
3601 newup,
3602 newacting,
3603 new_up_primary,
3604 new_acting_primary);
3605
3606 info.history = history;
3607 past_intervals = pi;
3608
3609 info.stats.up = up;
3610 info.stats.up_primary = new_up_primary;
3611 info.stats.acting = acting;
3612 info.stats.acting_primary = new_acting_primary;
3613 info.stats.mapping_epoch = info.history.same_interval_since;
3614
3615 if (!perform_deletes_during_peering()) {
3616 pg_log.set_missing_may_contain_deletes();
3617 }
3618
3619 if (backfill) {
3620 psdout(10) << __func__ << ": Setting backfill" << dendl;
3621 info.set_last_backfill(hobject_t());
3622 info.last_complete = info.last_update;
3623 pg_log.mark_log_for_rewrite();
3624 }
3625
3626 on_new_interval();
3627
3628 dirty_info = true;
3629 dirty_big_info = true;
3630 write_if_dirty(t);
3631 }
3632
3633 void PeeringState::dump_peering_state(Formatter *f)
3634 {
3635 f->dump_string("state", get_pg_state_string());
3636 f->dump_unsigned("epoch", get_osdmap_epoch());
3637 f->open_array_section("up");
3638 for (vector<int>::const_iterator p = up.begin(); p != up.end(); ++p)
3639 f->dump_unsigned("osd", *p);
3640 f->close_section();
3641 f->open_array_section("acting");
3642 for (vector<int>::const_iterator p = acting.begin(); p != acting.end(); ++p)
3643 f->dump_unsigned("osd", *p);
3644 f->close_section();
3645 if (!backfill_targets.empty()) {
3646 f->open_array_section("backfill_targets");
3647 for (set<pg_shard_t>::iterator p = backfill_targets.begin();
3648 p != backfill_targets.end();
3649 ++p)
3650 f->dump_stream("shard") << *p;
3651 f->close_section();
3652 }
3653 if (!async_recovery_targets.empty()) {
3654 f->open_array_section("async_recovery_targets");
3655 for (set<pg_shard_t>::iterator p = async_recovery_targets.begin();
3656 p != async_recovery_targets.end();
3657 ++p)
3658 f->dump_stream("shard") << *p;
3659 f->close_section();
3660 }
3661 if (!acting_recovery_backfill.empty()) {
3662 f->open_array_section("acting_recovery_backfill");
3663 for (set<pg_shard_t>::iterator p = acting_recovery_backfill.begin();
3664 p != acting_recovery_backfill.end();
3665 ++p)
3666 f->dump_stream("shard") << *p;
3667 f->close_section();
3668 }
3669 f->open_object_section("info");
3670 update_calc_stats();
3671 info.dump(f);
3672 f->close_section();
3673
3674 f->open_array_section("peer_info");
3675 for (map<pg_shard_t, pg_info_t>::const_iterator p = peer_info.begin();
3676 p != peer_info.end();
3677 ++p) {
3678 f->open_object_section("info");
3679 f->dump_stream("peer") << p->first;
3680 p->second.dump(f);
3681 f->close_section();
3682 }
3683 }
3684
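// For reference, a sketch of the JSON shape dump_peering_state produces via
// the Formatter calls above (field values hypothetical):
//   { "state": "active+clean", "epoch": 4213, "up": [3,1,2],
//     "acting": [3,1,2], "info": { ... }, "peer_info": [ ... ] }
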
3685 void PeeringState::update_stats(
3686 std::function<bool(pg_history_t &, pg_stat_t &)> f,
3687 ObjectStore::Transaction *t) {
3688 if (f(info.history, info.stats)) {
3689 pl->publish_stats_to_osd();
3690 }
3691 pl->on_info_history_change();
3692
3693 if (t) {
3694 dirty_info = true;
3695 write_if_dirty(*t);
3696 }
3697 }
3698
3699 bool PeeringState::append_log_entries_update_missing(
3700 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
3701 ObjectStore::Transaction &t, std::optional<eversion_t> trim_to,
3702 std::optional<eversion_t> roll_forward_to)
3703 {
3704 ceph_assert(!entries.empty());
3705 ceph_assert(entries.begin()->version > info.last_update);
3706
3707 PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
3708 bool invalidate_stats =
3709 pg_log.append_new_log_entries(
3710 info.last_backfill,
3711 entries,
3712 rollbacker.get());
3713
3714 if (roll_forward_to && entries.rbegin()->soid > info.last_backfill) {
3715 pg_log.roll_forward(rollbacker.get());
3716 }
3717 if (roll_forward_to && *roll_forward_to > pg_log.get_can_rollback_to()) {
3718 pg_log.roll_forward_to(*roll_forward_to, rollbacker.get());
3719 last_rollback_info_trimmed_to_applied = *roll_forward_to;
3720 }
3721
3722 info.last_update = pg_log.get_head();
3723
3724 if (pg_log.get_missing().num_missing() == 0) {
3725 // advance last_complete since nothing else is missing!
3726 info.last_complete = info.last_update;
3727 }
3728 info.stats.stats_invalid = info.stats.stats_invalid || invalidate_stats;
3729
3730 psdout(20) << __func__ << " trim_to bool = " << bool(trim_to)
3731 << " trim_to = " << (trim_to ? *trim_to : eversion_t()) << dendl;
3732 if (trim_to)
3733 pg_log.trim(*trim_to, info);
3734 dirty_info = true;
3735 write_if_dirty(t);
3736 return invalidate_stats;
3737 }
3738
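// Sketch (simplified stand-in types) of the completeness rule applied in
// append_log_entries_update_missing above: last_update always advances to
// the new log head, while last_complete may only catch up once nothing is
// missing.
#if 0
#include <cstdint>

struct simple_eversion { unsigned epoch = 0, version = 0; };

void advance_after_append(const simple_eversion &head, uint64_t num_missing,
                          simple_eversion &last_update,
                          simple_eversion &last_complete)
{
  last_update = head;             // info.last_update = pg_log.get_head()
  if (num_missing == 0)
    last_complete = last_update;  // nothing missing: fully complete
}
#endif
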
3739 void PeeringState::merge_new_log_entries(
3740 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
3741 ObjectStore::Transaction &t,
3742 std::optional<eversion_t> trim_to,
3743 std::optional<eversion_t> roll_forward_to)
3744 {
3745 psdout(10) << __func__ << " " << entries << dendl;
3746 ceph_assert(is_primary());
3747
3748 bool rebuild_missing = append_log_entries_update_missing(entries, t, trim_to, roll_forward_to);
3749 for (set<pg_shard_t>::const_iterator i = acting_recovery_backfill.begin();
3750 i != acting_recovery_backfill.end();
3751 ++i) {
3752 pg_shard_t peer(*i);
3753 if (peer == pg_whoami) continue;
3754 ceph_assert(peer_missing.count(peer));
3755 ceph_assert(peer_info.count(peer));
3756 pg_missing_t& pmissing(peer_missing[peer]);
3757 psdout(20) << __func__ << " peer_missing for " << peer
3758 << " = " << pmissing << dendl;
3759 pg_info_t& pinfo(peer_info[peer]);
3760 bool invalidate_stats = PGLog::append_log_entries_update_missing(
3761 pinfo.last_backfill,
3762 entries,
3763 true,
3764 NULL,
3765 pmissing,
3766 NULL,
3767 dpp);
3768 pinfo.last_update = info.last_update;
3769 pinfo.stats.stats_invalid = pinfo.stats.stats_invalid || invalidate_stats;
3770 rebuild_missing = rebuild_missing || invalidate_stats;
3771 }
3772
3773 if (!rebuild_missing) {
3774 return;
3775 }
3776
3777 for (auto &&i: entries) {
3778 missing_loc.rebuild(
3779 i.soid,
3780 pg_whoami,
3781 acting_recovery_backfill,
3782 info,
3783 pg_log.get_missing(),
3784 peer_missing,
3785 peer_info);
3786 }
3787 }
3788
3789 void PeeringState::add_log_entry(const pg_log_entry_t& e, bool applied)
3790 {
3791 // raise last_complete only if we were previously up to date
3792 if (info.last_complete == info.last_update)
3793 info.last_complete = e.version;
3794
3795 // raise last_update.
3796 ceph_assert(e.version > info.last_update);
3797 info.last_update = e.version;
3798
3799 // raise user_version, if it increased (it may not have been bumped
3800 // by all logged updates)
3801 if (e.user_version > info.last_user_version)
3802 info.last_user_version = e.user_version;
3803
3804 // log mutation
3805 pg_log.add(e, applied);
3806 psdout(10) << "add_log_entry " << e << dendl;
3807 }
3808
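// Sketch of the invariant in add_log_entry above: last_complete only keeps
// pace with last_update while we were already fully caught up; once they
// diverge, only recovery can raise last_complete again.
#if 0
struct simple_ver { unsigned v = 0; };

void add_entry(const simple_ver &e, simple_ver &last_update,
               simple_ver &last_complete)
{
  if (last_complete.v == last_update.v)  // previously up to date
    last_complete = e;
  last_update = e;                       // the head always advances
}
#endif
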
3809
3810 void PeeringState::append_log(
3811 const vector<pg_log_entry_t>& logv,
3812 eversion_t trim_to,
3813 eversion_t roll_forward_to,
3814 eversion_t mlcod,
3815 ObjectStore::Transaction &t,
3816 bool transaction_applied,
3817 bool async)
3818 {
3819 /* The primary has sent an info updating the history, but it may not
3820 * have arrived yet. We want to make sure that we cannot remember this
3821 * write without remembering that it happened in an interval which went
3822 * active in epoch history.last_epoch_started.
3823 */
3824 if (info.last_epoch_started != info.history.last_epoch_started) {
3825 info.history.last_epoch_started = info.last_epoch_started;
3826 }
3827 if (info.last_interval_started != info.history.last_interval_started) {
3828 info.history.last_interval_started = info.last_interval_started;
3829 }
3830 psdout(10) << "append_log " << pg_log.get_log() << " " << logv << dendl;
3831
3832 PGLog::LogEntryHandlerRef handler{pl->get_log_handler(t)};
3833 if (!transaction_applied) {
3834 /* We must be a backfill or async recovery peer, so it's ok if we apply
3835 * out-of-turn since we won't be considered when
3836 * determining a min possible last_update.
3837 *
3838 * We skip_rollforward() here, which advances the crt, without
3839 * doing an actual rollforward. This avoids cleaning up entries
3840 * from the backend and we do not end up in a situation, where the
3841 * object is deleted before we can _merge_object_divergent_entries().
3842 */
3843 pg_log.skip_rollforward();
3844 }
3845
3846 for (vector<pg_log_entry_t>::const_iterator p = logv.begin();
3847 p != logv.end();
3848 ++p) {
3849 add_log_entry(*p, transaction_applied);
3850
3851 /* We don't want to leave the rollforward artifacts around
3852 * here past last_backfill. It's ok for the same reason as
3853 * above */
3854 if (transaction_applied &&
3855 p->soid > info.last_backfill) {
3856 pg_log.roll_forward(handler.get());
3857 }
3858 }
3859 if (transaction_applied && roll_forward_to > pg_log.get_can_rollback_to()) {
3860 pg_log.roll_forward_to(
3861 roll_forward_to,
3862 handler.get());
3863 last_rollback_info_trimmed_to_applied = roll_forward_to;
3864 }
3865
3866 psdout(10) << __func__ << " approx pg log length = "
3867 << pg_log.get_log().approx_size() << dendl;
3868 psdout(10) << __func__ << " transaction_applied = "
3869 << transaction_applied << dendl;
3870 if (!transaction_applied || async)
3871 psdout(10) << __func__ << " " << pg_whoami
3872 << " is async_recovery or backfill target" << dendl;
3873 pg_log.trim(trim_to, info, transaction_applied, async);
3874
3875 // update the local pg, pg log
3876 dirty_info = true;
3877 write_if_dirty(t);
3878
3879 if (!is_primary())
3880 min_last_complete_ondisk = mlcod;
3881 }
3882
3883 void PeeringState::recover_got(
3884 const hobject_t &oid, eversion_t v,
3885 bool is_delete,
3886 ObjectStore::Transaction &t)
3887 {
3888 if (v > pg_log.get_can_rollback_to()) {
3889 /* This can only happen during a repair, and even then, it would
3890 * be one heck of a race. If we are repairing the object, the
3891 * write in question must be fully committed, so it's not valid
3892 * to roll it back anyway (and we'll be rolled forward shortly
3893 * anyway) */
3894 PGLog::LogEntryHandlerRef handler{pl->get_log_handler(t)};
3895 pg_log.roll_forward_to(v, handler.get());
3896 }
3897
3898 psdout(10) << "got missing " << oid << " v " << v << dendl;
3899 pg_log.recover_got(oid, v, info);
3900 if (pg_log.get_log().log.empty()) {
3901 psdout(10) << "last_complete now " << info.last_complete
3902 << " while log is empty" << dendl;
3903 } else if (pg_log.get_log().complete_to != pg_log.get_log().log.end()) {
3904 psdout(10) << "last_complete now " << info.last_complete
3905 << " log.complete_to " << pg_log.get_log().complete_to->version
3906 << dendl;
3907 } else {
3908 psdout(10) << "last_complete now " << info.last_complete
3909 << " log.complete_to at end" << dendl;
3910 // below is not true in the repair case.
3911 //assert(missing.num_missing() == 0); // otherwise, complete_to was wrong.
3912 ceph_assert(info.last_complete == info.last_update);
3913 }
3914
3915 if (is_primary()) {
3916 ceph_assert(missing_loc.needs_recovery(oid));
3917 if (!is_delete)
3918 missing_loc.add_location(oid, pg_whoami);
3919 }
3920
3921 // update pg
3922 dirty_info = true;
3923 write_if_dirty(t);
3924 }
3925
3926 void PeeringState::update_backfill_progress(
3927 const hobject_t &updated_backfill,
3928 const pg_stat_t &updated_stats,
3929 bool preserve_local_num_bytes,
3930 ObjectStore::Transaction &t) {
3931 info.set_last_backfill(updated_backfill);
3932 if (preserve_local_num_bytes) {
3933 psdout(25) << __func__ << " primary " << updated_stats.stats.sum.num_bytes
3934 << " local " << info.stats.stats.sum.num_bytes << dendl;
3935 int64_t bytes = info.stats.stats.sum.num_bytes;
3936 info.stats = updated_stats;
3937 info.stats.stats.sum.num_bytes = bytes;
3938 } else {
3939 psdout(20) << __func__ << " final " << updated_stats.stats.sum.num_bytes
3940 << " replaces local " << info.stats.stats.sum.num_bytes << dendl;
3941 info.stats = updated_stats;
3942 }
3943
3944 dirty_info = true;
3945 write_if_dirty(t);
3946 }
3947
3948 void PeeringState::adjust_purged_snaps(
3949 std::function<void(interval_set<snapid_t> &snaps)> f) {
3950 f(info.purged_snaps);
3951 dirty_info = true;
3952 dirty_big_info = true;
3953 }
3954
3955 void PeeringState::on_peer_recover(
3956 pg_shard_t peer,
3957 const hobject_t &soid,
3958 const eversion_t &version)
3959 {
3960 pl->publish_stats_to_osd();
3961 // done!
3962 peer_missing[peer].got(soid, version);
3963 missing_loc.add_location(soid, peer);
3964 }
3965
3966 void PeeringState::begin_peer_recover(
3967 pg_shard_t peer,
3968 const hobject_t soid)
3969 {
3970 peer_missing[peer].revise_have(soid, eversion_t());
3971 }
3972
3973 void PeeringState::force_object_missing(
3974 const set<pg_shard_t> &peers,
3975 const hobject_t &soid,
3976 eversion_t version)
3977 {
3978 for (auto &&peer : peers) {
3979 if (peer != primary) {
3980 peer_missing[peer].add(soid, version, eversion_t(), false);
3981 } else {
3982 pg_log.missing_add(soid, version, eversion_t());
3983 pg_log.reset_complete_to(&info);
3984 pg_log.set_last_requested(0);
3985 }
3986 }
3987
3988 missing_loc.rebuild(
3989 soid,
3990 pg_whoami,
3991 acting_recovery_backfill,
3992 info,
3993 pg_log.get_missing(),
3994 peer_missing,
3995 peer_info);
3996 }
3997
3998 void PeeringState::pre_submit_op(
3999 const hobject_t &hoid,
4000 const vector<pg_log_entry_t>& logv,
4001 eversion_t at_version)
4002 {
4003 if (at_version > eversion_t()) {
4004 for (auto &&i : get_acting_recovery_backfill()) {
4005 if (i == primary) continue;
4006 pg_info_t &pinfo = peer_info[i];
4007 // keep peer_info up to date
4008 if (pinfo.last_complete == pinfo.last_update)
4009 pinfo.last_complete = at_version;
4010 pinfo.last_update = at_version;
4011 }
4012 }
4013
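// Async recovery targets are allowed to lag behind the acting set, so any
// op submitted while such a target is still missing this object also
// becomes missing on that target; pre-mark it here and refresh missing_loc
// below so reads/recovery know which shards actually hold each version.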
4014 bool requires_missing_loc = false;
4015 for (auto &&i : get_async_recovery_targets()) {
4016 if (i == primary || !get_peer_missing(i).is_missing(hoid))
4017 continue;
4018 requires_missing_loc = true;
4019 for (auto &&entry: logv) {
4020 peer_missing[i].add_next_event(entry);
4021 }
4022 }
4023
4024 if (requires_missing_loc) {
4025 for (auto &&entry: logv) {
4026 psdout(30) << __func__ << " missing_loc before: "
4027 << missing_loc.get_locations(entry.soid) << dendl;
4028 missing_loc.add_missing(entry.soid, entry.version,
4029 eversion_t(), entry.is_delete());
4030 // clear out missing_loc
4031 missing_loc.clear_location(entry.soid);
4032 for (auto &i: get_actingset()) {
4033 if (!get_peer_missing(i).is_missing(entry.soid))
4034 missing_loc.add_location(entry.soid, i);
4035 }
4036 psdout(30) << __func__ << " missing_loc after: "
4037 << missing_loc.get_locations(entry.soid) << dendl;
4038 }
4039 }
4040 }
4041
4042 void PeeringState::recovery_committed_to(eversion_t version)
4043 {
4044 psdout(10) << __func__ << " version " << version
4045 << " now ondisk" << dendl;
4046 last_complete_ondisk = version;
4047
4048 if (last_complete_ondisk == info.last_update) {
4049 if (!is_primary()) {
4050 // Either we are a replica or a backfill target;
4051 // we are fully up to date. Tell the primary!
4052 pl->send_cluster_message(
4053 get_primary().osd,
4054 new MOSDPGTrim(
4055 get_osdmap_epoch(),
4056 spg_t(info.pgid.pgid, primary.shard),
4057 last_complete_ondisk),
4058 get_osdmap_epoch());
4059 } else {
4060 calc_min_last_complete_ondisk();
4061 }
4062 }
4063 }
4064
4065 void PeeringState::complete_write(eversion_t v, eversion_t lc)
4066 {
4067 last_update_ondisk = v;
4068 last_complete_ondisk = lc;
4069 calc_min_last_complete_ondisk();
4070 }
4071
4072 void PeeringState::calc_trim_to()
4073 {
4074 size_t target = pl->get_target_pg_log_entries();
4075
4076 eversion_t limit = std::min(
4077 min_last_complete_ondisk,
4078 pg_log.get_can_rollback_to());
4079 if (limit != eversion_t() &&
4080 limit != pg_trim_to &&
4081 pg_log.get_log().approx_size() > target) {
4082 size_t num_to_trim = std::min(pg_log.get_log().approx_size() - target,
4083 cct->_conf->osd_pg_log_trim_max);
4084 if (num_to_trim < cct->_conf->osd_pg_log_trim_min &&
4085 cct->_conf->osd_pg_log_trim_max >= cct->_conf->osd_pg_log_trim_min) {
4086 return;
4087 }
4088 list<pg_log_entry_t>::const_iterator it = pg_log.get_log().log.begin();
4089 eversion_t new_trim_to;
4090 for (size_t i = 0; i < num_to_trim; ++i) {
4091 new_trim_to = it->version;
4092 ++it;
4093 if (new_trim_to > limit) {
4094 new_trim_to = limit;
4095 psdout(10) << "calc_trim_to trimming to min_last_complete_ondisk" << dendl;
4096 break;
4097 }
4098 }
4099 psdout(10) << "calc_trim_to " << pg_trim_to << " -> " << new_trim_to << dendl;
4100 pg_trim_to = new_trim_to;
4101 ceph_assert(pg_trim_to <= pg_log.get_head());
4102 ceph_assert(pg_trim_to <= min_last_complete_ondisk);
4103 }
4104 }
4105
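// Worked example (hypothetical config values) of the sizing logic in
// calc_trim_to above: with 3100 log entries, a target of 3000,
// osd_pg_log_trim_max = 500 and osd_pg_log_trim_min = 100, num_to_trim =
// min(3100 - 3000, 500) = 100, which meets the minimum, so up to 100
// entries are trimmed -- but never past min_last_complete_ondisk or
// can_rollback_to.
#if 0
#include <algorithm>
#include <cstddef>

std::size_t entries_to_trim(std::size_t log_size, std::size_t target,
                            std::size_t trim_min, std::size_t trim_max)
{
  if (log_size <= target)
    return 0;                  // log within budget, nothing to trim
  std::size_t n = std::min(log_size - target, trim_max);
  if (n < trim_min && trim_max >= trim_min)
    return 0;                  // batch too small, wait for more entries
  return n;
}
#endif
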
4106 void PeeringState::calc_trim_to_aggressive()
4107 {
4108 size_t target = pl->get_target_pg_log_entries();
4109
4110 // limit pg log trimming up to the can_rollback_to value
4111 eversion_t limit = std::min({
4112 pg_log.get_head(),
4113 pg_log.get_can_rollback_to(),
4114 last_update_ondisk});
4115 psdout(10) << __func__ << " limit = " << limit << dendl;
4116
4117 if (limit != eversion_t() &&
4118 limit != pg_trim_to &&
4119 pg_log.get_log().approx_size() > target) {
4120 psdout(10) << __func__ << " approx pg log length = "
4121 << pg_log.get_log().approx_size() << dendl;
4122 uint64_t num_to_trim = std::min<uint64_t>(pg_log.get_log().approx_size() - target,
4123 cct->_conf->osd_pg_log_trim_max);
4124 psdout(10) << __func__ << " num_to_trim = " << num_to_trim << dendl;
4125 if (num_to_trim < cct->_conf->osd_pg_log_trim_min &&
4126 cct->_conf->osd_pg_log_trim_max >= cct->_conf->osd_pg_log_trim_min) {
4127 return;
4128 }
4129 auto it = pg_log.get_log().log.begin(); // oldest log entry
4130 auto rit = pg_log.get_log().log.rbegin();
4131 eversion_t by_n_to_keep; // scanned back from the head (newest)
4132 eversion_t by_n_to_trim = eversion_t::max(); // scanned forward from the tail (oldest)
4133 for (size_t i = 0; it != pg_log.get_log().log.end(); ++it, ++rit) {
4134 i++;
4135 if (i > target && by_n_to_keep == eversion_t()) {
4136 by_n_to_keep = rit->version;
4137 }
4138 if (i >= num_to_trim && by_n_to_trim == eversion_t::max()) {
4139 by_n_to_trim = it->version;
4140 }
4141 if (by_n_to_keep != eversion_t() &&
4142 by_n_to_trim != eversion_t::max()) {
4143 break;
4144 }
4145 }
4146
4147 if (by_n_to_keep == eversion_t()) {
4148 return;
4149 }
4150
4151 pg_trim_to = std::min({by_n_to_keep, by_n_to_trim, limit});
4152 psdout(10) << __func__ << " pg_trim_to now " << pg_trim_to << dendl;
4153 ceph_assert(pg_trim_to <= pg_log.get_head());
4154 }
4155 }
4156
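// Sketch (hypothetical versions; assumes log.size() > target and
// num_to_trim >= 1) of the two-ended scan in calc_trim_to_aggressive above:
// by_n_to_keep is found walking back from the head (keep the newest
// `target` entries), by_n_to_trim walking forward from the tail (drop the
// oldest num_to_trim), and the trim point is the minimum of both and the
// rollback/ondisk limit.
#if 0
#include <algorithm>
#include <cstddef>
#include <vector>

int aggressive_trim_point(const std::vector<int> &log,  // oldest..newest
                          std::size_t target, std::size_t num_to_trim,
                          int limit)
{
  int by_n_to_keep = log[log.size() - target - 1];  // just below newest `target`
  int by_n_to_trim = log[num_to_trim - 1];          // last of the oldest batch
  return std::min({by_n_to_keep, by_n_to_trim, limit});
}
#endif
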
4157 void PeeringState::apply_op_stats(
4158 const hobject_t &soid,
4159 const object_stat_sum_t &delta_stats)
4160 {
4161 info.stats.stats.add(delta_stats);
4162 info.stats.stats.floor(0);
4163
4164 for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
4165 i != get_backfill_targets().end();
4166 ++i) {
4167 pg_shard_t bt = *i;
4168 pg_info_t& pinfo = peer_info[bt];
4169 if (soid <= pinfo.last_backfill)
4170 pinfo.stats.stats.add(delta_stats);
4171 }
4172 }
4173
4174 void PeeringState::update_complete_backfill_object_stats(
4175 const hobject_t &hoid,
4176 const pg_stat_t &stats)
4177 {
4178 for (auto &&bt: get_backfill_targets()) {
4179 pg_info_t& pinfo = peer_info[bt];
4180 // Add stats to all peers that were missing the object
4181 if (hoid > pinfo.last_backfill)
4182 pinfo.stats.add(stats);
4183 }
4184 }
4185
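// Sketch of the last_backfill split used by the two functions above: a
// backfill target's stats only cover objects at or below its last_backfill
// horizon, so op deltas are mirrored for objects it already has
// (apply_op_stats) while completed-backfill stats are added for objects
// beyond the horizon (update_complete_backfill_object_stats).
#if 0
bool target_accounts_for(const hobject_t &oid, const pg_info_t &pinfo)
{
  return oid <= pinfo.last_backfill;  // already backfilled to this target
}
#endif
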
4186 void PeeringState::update_peer_last_backfill(
4187 pg_shard_t peer,
4188 const hobject_t &new_last_backfill)
4189 {
4190 pg_info_t &pinfo = peer_info[peer];
4191 pinfo.last_backfill = new_last_backfill;
4192 if (new_last_backfill.is_max()) {
4193 /* pinfo.stats might be wrong if we did log-based recovery on the
4194 * backfilled portion in addition to continuing backfill.
4195 */
4196 pinfo.stats = info.stats;
4197 }
4198 }
4199
4200 void PeeringState::set_revert_with_targets(
4201 const hobject_t &soid,
4202 const set<pg_shard_t> &good_peers)
4203 {
4204 for (auto &&peer: good_peers) {
4205 missing_loc.add_location(soid, peer);
4206 }
4207 }
4208
4209 void PeeringState::prepare_backfill_for_missing(
4210 const hobject_t &soid,
4211 const eversion_t &version,
4212 const vector<pg_shard_t> &targets) {
4213 for (auto &&peer: targets) {
4214 peer_missing[peer].add(soid, version, eversion_t(), false);
4215 }
4216 }
4217
4218 void PeeringState::update_hset(const pg_hit_set_history_t &hset_history)
4219 {
4220 info.hit_set = hset_history;
4221 }
4222
4223 /*------------ Peering State Machine----------------*/
4224 #undef dout_prefix
4225 #define dout_prefix (context< PeeringMachine >().dpp->gen_prefix(*_dout) \
4226 << "state<" << get_state_name() << ">: ")
4227 #undef psdout
4228 #define psdout(x) ldout(context< PeeringMachine >().cct, x)
4229
4230 #define DECLARE_LOCALS \
4231 PeeringState *ps = context< PeeringMachine >().state; \
4232 std::ignore = ps; \
4233 PeeringListener *pl = context< PeeringMachine >().pl; \
4234 std::ignore = pl
4235
4236
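// Usage sketch (hypothetical state and event names): every react handler
// below opens with DECLARE_LOCALS to pull `ps` (the shared PeeringState)
// and `pl` (the PeeringListener callbacks) out of the enclosing
// PeeringMachine context.
#if 0
boost::statechart::result PeeringState::SomeState::react(const SomeEvent &)
{
  DECLARE_LOCALS;
  ps->state_set(PG_STATE_PEERING);  // mutate shared peering state
  pl->publish_stats_to_osd();       // call back into the owning PG/OSD
  return discard_event();
}
#endif
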
4237 /*------Crashed-------*/
4238 PeeringState::Crashed::Crashed(my_context ctx)
4239 : my_base(ctx),
4240 NamedState(context< PeeringMachine >().state_history, "Crashed")
4241 {
4242 context< PeeringMachine >().log_enter(state_name);
4243 ceph_abort_msg("we got a bad state machine event");
4244 }
4245
4246
4247 /*------Initial-------*/
4248 PeeringState::Initial::Initial(my_context ctx)
4249 : my_base(ctx),
4250 NamedState(context< PeeringMachine >().state_history, "Initial")
4251 {
4252 context< PeeringMachine >().log_enter(state_name);
4253 }
4254
4255 boost::statechart::result PeeringState::Initial::react(const MNotifyRec& notify)
4256 {
4257 DECLARE_LOCALS;
4258 ps->proc_replica_info(
4259 notify.from, notify.notify.info, notify.notify.epoch_sent);
4260 ps->set_last_peering_reset();
4261 return transit< Primary >();
4262 }
4263
4264 boost::statechart::result PeeringState::Initial::react(const MInfoRec& i)
4265 {
4266 DECLARE_LOCALS;
4267 ceph_assert(!ps->is_primary());
4268 post_event(i);
4269 return transit< Stray >();
4270 }
4271
4272 boost::statechart::result PeeringState::Initial::react(const MLogRec& i)
4273 {
4274 DECLARE_LOCALS;
4275 ceph_assert(!ps->is_primary());
4276 post_event(i);
4277 return transit< Stray >();
4278 }
4279
4280 void PeeringState::Initial::exit()
4281 {
4282 context< PeeringMachine >().log_exit(state_name, enter_time);
4283 DECLARE_LOCALS;
4284 utime_t dur = ceph_clock_now() - enter_time;
4285 pl->get_peering_perf().tinc(rs_initial_latency, dur);
4286 }
4287
4288 /*------Started-------*/
4289 PeeringState::Started::Started(my_context ctx)
4290 : my_base(ctx),
4291 NamedState(context< PeeringMachine >().state_history, "Started")
4292 {
4293 context< PeeringMachine >().log_enter(state_name);
4294 }
4295
4296 boost::statechart::result
4297 PeeringState::Started::react(const IntervalFlush&)
4298 {
4299 psdout(10) << "Ending blocked outgoing recovery messages" << dendl;
4300 context< PeeringMachine >().state->end_block_outgoing();
4301 return discard_event();
4302 }
4303
4304 boost::statechart::result PeeringState::Started::react(const AdvMap& advmap)
4305 {
4306 DECLARE_LOCALS;
4307 psdout(10) << "Started advmap" << dendl;
4308 ps->check_full_transition(advmap.lastmap, advmap.osdmap);
4309 if (ps->should_restart_peering(
4310 advmap.up_primary,
4311 advmap.acting_primary,
4312 advmap.newup,
4313 advmap.newacting,
4314 advmap.lastmap,
4315 advmap.osdmap)) {
4316 psdout(10) << "should_restart_peering, transitioning to Reset"
4317 << dendl;
4318 post_event(advmap);
4319 return transit< Reset >();
4320 }
4321 ps->remove_down_peer_info(advmap.osdmap);
4322 return discard_event();
4323 }
4324
4325 boost::statechart::result PeeringState::Started::react(const QueryState& q)
4326 {
4327 q.f->open_object_section("state");
4328 q.f->dump_string("name", state_name);
4329 q.f->dump_stream("enter_time") << enter_time;
4330 q.f->close_section();
4331 return discard_event();
4332 }
4333
4334 void PeeringState::Started::exit()
4335 {
4336 context< PeeringMachine >().log_exit(state_name, enter_time);
4337 DECLARE_LOCALS;
4338 utime_t dur = ceph_clock_now() - enter_time;
4339 pl->get_peering_perf().tinc(rs_started_latency, dur);
4340 ps->state_clear(PG_STATE_WAIT | PG_STATE_LAGGY);
4341 }
4342
4343 /*--------Reset---------*/
4344 PeeringState::Reset::Reset(my_context ctx)
4345 : my_base(ctx),
4346 NamedState(context< PeeringMachine >().state_history, "Reset")
4347 {
4348 context< PeeringMachine >().log_enter(state_name);
4349 DECLARE_LOCALS;
4350
4351 ps->flushes_in_progress = 0;
4352 ps->set_last_peering_reset();
4353 ps->log_weirdness();
4354 }
4355
4356 boost::statechart::result
4357 PeeringState::Reset::react(const IntervalFlush&)
4358 {
4359 psdout(10) << "Ending blocked outgoing recovery messages" << dendl;
4360 context< PeeringMachine >().state->end_block_outgoing();
4361 return discard_event();
4362 }
4363
4364 boost::statechart::result PeeringState::Reset::react(const AdvMap& advmap)
4365 {
4366 DECLARE_LOCALS;
4367 psdout(10) << "Reset advmap" << dendl;
4368
4369 ps->check_full_transition(advmap.lastmap, advmap.osdmap);
4370
4371 if (ps->should_restart_peering(
4372 advmap.up_primary,
4373 advmap.acting_primary,
4374 advmap.newup,
4375 advmap.newacting,
4376 advmap.lastmap,
4377 advmap.osdmap)) {
4378 psdout(10) << "should restart peering, calling start_peering_interval again"
4379 << dendl;
4380 ps->start_peering_interval(
4381 advmap.lastmap,
4382 advmap.newup, advmap.up_primary,
4383 advmap.newacting, advmap.acting_primary,
4384 context< PeeringMachine >().get_cur_transaction());
4385 }
4386 ps->remove_down_peer_info(advmap.osdmap);
4387 ps->check_past_interval_bounds();
4388 return discard_event();
4389 }
4390
4391 boost::statechart::result PeeringState::Reset::react(const ActMap&)
4392 {
4393 DECLARE_LOCALS;
4394 if (ps->should_send_notify() && ps->get_primary().osd >= 0) {
4395 ps->info.history.refresh_prior_readable_until_ub(
4396 pl->get_mnow(),
4397 ps->prior_readable_until_ub);
4398 context< PeeringMachine >().send_notify(
4399 ps->get_primary().osd,
4400 pg_notify_t(
4401 ps->get_primary().shard, ps->pg_whoami.shard,
4402 ps->get_osdmap_epoch(),
4403 ps->get_osdmap_epoch(),
4404 ps->info,
4405 ps->past_intervals));
4406 }
4407
4408 ps->update_heartbeat_peers();
4409
4410 return transit< Started >();
4411 }
4412
4413 boost::statechart::result PeeringState::Reset::react(const QueryState& q)
4414 {
4415 q.f->open_object_section("state");
4416 q.f->dump_string("name", state_name);
4417 q.f->dump_stream("enter_time") << enter_time;
4418 q.f->close_section();
4419 return discard_event();
4420 }
4421
4422 void PeeringState::Reset::exit()
4423 {
4424 context< PeeringMachine >().log_exit(state_name, enter_time);
4425 DECLARE_LOCALS;
4426 utime_t dur = ceph_clock_now() - enter_time;
4427 pl->get_peering_perf().tinc(rs_reset_latency, dur);
4428 }
4429
4430 /*-------Start---------*/
4431 PeeringState::Start::Start(my_context ctx)
4432 : my_base(ctx),
4433 NamedState(context< PeeringMachine >().state_history, "Start")
4434 {
4435 context< PeeringMachine >().log_enter(state_name);
4436
4437 DECLARE_LOCALS;
4438 if (ps->is_primary()) {
4439 psdout(1) << "transitioning to Primary" << dendl;
4440 post_event(MakePrimary());
4441 } else { //is_stray
4442 psdout(1) << "transitioning to Stray" << dendl;
4443 post_event(MakeStray());
4444 }
4445 }
4446
4447 void PeeringState::Start::exit()
4448 {
4449 context< PeeringMachine >().log_exit(state_name, enter_time);
4450 DECLARE_LOCALS;
4451 utime_t dur = ceph_clock_now() - enter_time;
4452 pl->get_peering_perf().tinc(rs_start_latency, dur);
4453 }
4454
4455 /*---------Primary--------*/
4456 PeeringState::Primary::Primary(my_context ctx)
4457 : my_base(ctx),
4458 NamedState(context< PeeringMachine >().state_history, "Started/Primary")
4459 {
4460 context< PeeringMachine >().log_enter(state_name);
4461 DECLARE_LOCALS;
4462 ceph_assert(ps->want_acting.empty());
4463
4464 // set CREATING bit until we have peered for the first time.
4465 if (ps->info.history.last_epoch_started == 0) {
4466 ps->state_set(PG_STATE_CREATING);
4467 // use the history timestamp, which ultimately comes from the
4468 // monitor in the create case.
4469 utime_t t = ps->info.history.last_scrub_stamp;
4470 ps->info.stats.last_fresh = t;
4471 ps->info.stats.last_active = t;
4472 ps->info.stats.last_change = t;
4473 ps->info.stats.last_peered = t;
4474 ps->info.stats.last_clean = t;
4475 ps->info.stats.last_unstale = t;
4476 ps->info.stats.last_undegraded = t;
4477 ps->info.stats.last_fullsized = t;
4478 ps->info.stats.last_scrub_stamp = t;
4479 ps->info.stats.last_deep_scrub_stamp = t;
4480 ps->info.stats.last_clean_scrub_stamp = t;
4481 }
4482 }
4483
4484 boost::statechart::result PeeringState::Primary::react(const MNotifyRec& notevt)
4485 {
4486 DECLARE_LOCALS;
4487 psdout(7) << "handle_pg_notify from osd." << notevt.from << dendl;
4488 ps->proc_replica_info(
4489 notevt.from, notevt.notify.info, notevt.notify.epoch_sent);
4490 return discard_event();
4491 }
4492
4493 boost::statechart::result PeeringState::Primary::react(const ActMap&)
4494 {
4495 DECLARE_LOCALS;
4496 psdout(7) << "handle ActMap primary" << dendl;
4497 pl->publish_stats_to_osd();
4498 return discard_event();
4499 }
4500
4501 boost::statechart::result PeeringState::Primary::react(
4502 const SetForceRecovery&)
4503 {
4504 DECLARE_LOCALS;
4505 ps->set_force_recovery(true);
4506 return discard_event();
4507 }
4508
4509 boost::statechart::result PeeringState::Primary::react(
4510 const UnsetForceRecovery&)
4511 {
4512 DECLARE_LOCALS;
4513 ps->set_force_recovery(false);
4514 return discard_event();
4515 }
4516
4517 boost::statechart::result PeeringState::Primary::react(
4518 const RequestScrub& evt)
4519 {
4520 DECLARE_LOCALS;
4521 if (ps->is_primary()) {
4522 pl->scrub_requested(evt.deep, evt.repair);
4523 psdout(10) << "marking for scrub" << dendl;
4524 }
4525 return discard_event();
4526 }
4527
4528 boost::statechart::result PeeringState::Primary::react(
4529 const SetForceBackfill&)
4530 {
4531 DECLARE_LOCALS;
4532 ps->set_force_backfill(true);
4533 return discard_event();
4534 }
4535
4536 boost::statechart::result PeeringState::Primary::react(
4537 const UnsetForceBackfill&)
4538 {
4539 DECLARE_LOCALS;
4540 ps->set_force_backfill(false);
4541 return discard_event();
4542 }
4543
4544 void PeeringState::Primary::exit()
4545 {
4546 context< PeeringMachine >().log_exit(state_name, enter_time);
4547 DECLARE_LOCALS;
4548 ps->want_acting.clear();
4549 utime_t dur = ceph_clock_now() - enter_time;
4550 pl->get_peering_perf().tinc(rs_primary_latency, dur);
4551 pl->clear_primary_state();
4552 ps->state_clear(PG_STATE_CREATING);
4553 }
4554
4555 /*---------Peering--------*/
4556 PeeringState::Peering::Peering(my_context ctx)
4557 : my_base(ctx),
4558 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering"),
4559 history_les_bound(false)
4560 {
4561 context< PeeringMachine >().log_enter(state_name);
4562 DECLARE_LOCALS;
4563
4564 ceph_assert(!ps->is_peered());
4565 ceph_assert(!ps->is_peering());
4566 ceph_assert(ps->is_primary());
4567 ps->state_set(PG_STATE_PEERING);
4568 }
4569
4570 boost::statechart::result PeeringState::Peering::react(const AdvMap& advmap)
4571 {
4572 DECLARE_LOCALS;
4573 psdout(10) << "Peering advmap" << dendl;
4574 if (prior_set.affected_by_map(*(advmap.osdmap), ps->dpp)) {
4575 psdout(1) << "Peering, affected_by_map, going to Reset" << dendl;
4576 post_event(advmap);
4577 return transit< Reset >();
4578 }
4579
4580 ps->adjust_need_up_thru(advmap.osdmap);
4581 ps->check_prior_readable_down_osds(advmap.osdmap);
4582
4583 return forward_event();
4584 }
4585
4586 boost::statechart::result PeeringState::Peering::react(const QueryState& q)
4587 {
4588 DECLARE_LOCALS;
4589
4590 q.f->open_object_section("state");
4591 q.f->dump_string("name", state_name);
4592 q.f->dump_stream("enter_time") << enter_time;
4593
4594 q.f->open_array_section("past_intervals");
4595 ps->past_intervals.dump(q.f);
4596 q.f->close_section();
4597
4598 q.f->open_array_section("probing_osds");
4599 for (set<pg_shard_t>::iterator p = prior_set.probe.begin();
4600 p != prior_set.probe.end();
4601 ++p)
4602 q.f->dump_stream("osd") << *p;
4603 q.f->close_section();
4604
4605 if (prior_set.pg_down)
4606 q.f->dump_string("blocked", "peering is blocked due to down osds");
4607
4608 q.f->open_array_section("down_osds_we_would_probe");
4609 for (set<int>::iterator p = prior_set.down.begin();
4610 p != prior_set.down.end();
4611 ++p)
4612 q.f->dump_int("osd", *p);
4613 q.f->close_section();
4614
4615 q.f->open_array_section("peering_blocked_by");
4616 for (map<int,epoch_t>::iterator p = prior_set.blocked_by.begin();
4617 p != prior_set.blocked_by.end();
4618 ++p) {
4619 q.f->open_object_section("osd");
4620 q.f->dump_int("osd", p->first);
4621 q.f->dump_int("current_lost_at", p->second);
4622 q.f->dump_string("comment", "starting or marking this osd lost may let us proceed");
4623 q.f->close_section();
4624 }
4625 q.f->close_section();
4626
4627 if (history_les_bound) {
4628 q.f->open_array_section("peering_blocked_by_detail");
4629 q.f->open_object_section("item");
4630 q.f->dump_string("detail","peering_blocked_by_history_les_bound");
4631 q.f->close_section();
4632 q.f->close_section();
4633 }
4634
4635 q.f->close_section();
4636 return forward_event();
4637 }
4638
4639 void PeeringState::Peering::exit()
4640 {
4641
4642 DECLARE_LOCALS;
4643 psdout(10) << "Leaving Peering" << dendl;
4644 context< PeeringMachine >().log_exit(state_name, enter_time);
4645 ps->state_clear(PG_STATE_PEERING);
4646 pl->clear_probe_targets();
4647
4648 utime_t dur = ceph_clock_now() - enter_time;
4649 pl->get_peering_perf().tinc(rs_peering_latency, dur);
4650 }
4651
4652
4653 /*------Backfilling-------*/
4654 PeeringState::Backfilling::Backfilling(my_context ctx)
4655 : my_base(ctx),
4656 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/Backfilling")
4657 {
4658 context< PeeringMachine >().log_enter(state_name);
4659
4660
4661 DECLARE_LOCALS;
4662 ps->backfill_reserved = true;
4663 pl->on_backfill_reserved();
4664 ps->state_clear(PG_STATE_BACKFILL_TOOFULL);
4665 ps->state_clear(PG_STATE_BACKFILL_WAIT);
4666 ps->state_set(PG_STATE_BACKFILLING);
4667 pl->publish_stats_to_osd();
4668 }
4669
4670 void PeeringState::Backfilling::backfill_release_reservations()
4671 {
4672 DECLARE_LOCALS;
4673 pl->cancel_local_background_io_reservation();
4674 for (set<pg_shard_t>::iterator it = ps->backfill_targets.begin();
4675 it != ps->backfill_targets.end();
4676 ++it) {
4677 ceph_assert(*it != ps->pg_whoami);
4678 pl->send_cluster_message(
4679 it->osd,
4680 new MBackfillReserve(
4681 MBackfillReserve::RELEASE,
4682 spg_t(ps->info.pgid.pgid, it->shard),
4683 ps->get_osdmap_epoch()),
4684 ps->get_osdmap_epoch());
4685 }
4686 }
4687
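// Reservation lifecycle, as used above and in the Wait*Reserved states
// below: the primary sends MBackfillReserve::REQUEST to each target, the
// target answers with GRANT (or rejects when too full), and the primary
// sends RELEASE when backfill completes or is cancelled; REVOKE and
// REVOKE_TOOFULL travel the other way when a target must back out.
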
4688 void PeeringState::Backfilling::cancel_backfill()
4689 {
4690 DECLARE_LOCALS;
4691 backfill_release_reservations();
4692 pl->on_backfill_canceled();
4693 }
4694
4695 boost::statechart::result
4696 PeeringState::Backfilling::react(const Backfilled &c)
4697 {
4698 backfill_release_reservations();
4699 return transit<Recovered>();
4700 }
4701
4702 boost::statechart::result
4703 PeeringState::Backfilling::react(const DeferBackfill &c)
4704 {
4705 DECLARE_LOCALS;
4706
4707 psdout(10) << "defer backfill, retry delay " << c.delay << dendl;
4708 ps->state_set(PG_STATE_BACKFILL_WAIT);
4709 ps->state_clear(PG_STATE_BACKFILLING);
4710 cancel_backfill();
4711
4712 pl->schedule_event_after(
4713 std::make_shared<PGPeeringEvent>(
4714 ps->get_osdmap_epoch(),
4715 ps->get_osdmap_epoch(),
4716 RequestBackfill()),
4717 c.delay);
4718 return transit<NotBackfilling>();
4719 }
4720
4721 boost::statechart::result
4722 PeeringState::Backfilling::react(const UnfoundBackfill &c)
4723 {
4724 DECLARE_LOCALS;
4725 psdout(10) << "backfill has unfound, can't continue" << dendl;
4726 ps->state_set(PG_STATE_BACKFILL_UNFOUND);
4727 ps->state_clear(PG_STATE_BACKFILLING);
4728 cancel_backfill();
4729 return transit<NotBackfilling>();
4730 }
4731
4732 boost::statechart::result
4733 PeeringState::Backfilling::react(const RemoteReservationRevokedTooFull &)
4734 {
4735 DECLARE_LOCALS;
4736
4737 ps->state_set(PG_STATE_BACKFILL_TOOFULL);
4738 ps->state_clear(PG_STATE_BACKFILLING);
4739 cancel_backfill();
4740
4741 pl->schedule_event_after(
4742 std::make_shared<PGPeeringEvent>(
4743 ps->get_osdmap_epoch(),
4744 ps->get_osdmap_epoch(),
4745 RequestBackfill()),
4746 ps->cct->_conf->osd_backfill_retry_interval);
4747
4748 return transit<NotBackfilling>();
4749 }
4750
4751 boost::statechart::result
4752 PeeringState::Backfilling::react(const RemoteReservationRevoked &)
4753 {
4754 DECLARE_LOCALS;
4755 ps->state_set(PG_STATE_BACKFILL_WAIT);
4756 cancel_backfill();
4757 if (ps->needs_backfill()) {
4758 return transit<WaitLocalBackfillReserved>();
4759 } else {
4760 // raced with MOSDPGBackfill::OP_BACKFILL_FINISH, ignore
4761 return discard_event();
4762 }
4763 }
4764
4765 void PeeringState::Backfilling::exit()
4766 {
4767 context< PeeringMachine >().log_exit(state_name, enter_time);
4768 DECLARE_LOCALS;
4769 ps->backfill_reserved = false;
4770 ps->state_clear(PG_STATE_BACKFILLING);
4771 ps->state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
4772 utime_t dur = ceph_clock_now() - enter_time;
4773 pl->get_peering_perf().tinc(rs_backfilling_latency, dur);
4774 }
4775
4776 /*--WaitRemoteBackfillReserved--*/
4777
4778 PeeringState::WaitRemoteBackfillReserved::WaitRemoteBackfillReserved(my_context ctx)
4779 : my_base(ctx),
4780 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/WaitRemoteBackfillReserved"),
4781 backfill_osd_it(context< Active >().remote_shards_to_reserve_backfill.begin())
4782 {
4783 context< PeeringMachine >().log_enter(state_name);
4784 DECLARE_LOCALS;
4785
4786 ps->state_set(PG_STATE_BACKFILL_WAIT);
4787 pl->publish_stats_to_osd();
4788 post_event(RemoteBackfillReserved());
4789 }
4790
4791 boost::statechart::result
4792 PeeringState::WaitRemoteBackfillReserved::react(const RemoteBackfillReserved &evt)
4793 {
4794 DECLARE_LOCALS;
4795
4796 int64_t num_bytes = ps->info.stats.stats.sum.num_bytes;
4797 psdout(10) << __func__ << " num_bytes " << num_bytes << dendl;
4798 if (backfill_osd_it !=
4799 context< Active >().remote_shards_to_reserve_backfill.end()) {
4800 // The primary never backfills itself
4801 ceph_assert(*backfill_osd_it != ps->pg_whoami);
4802 pl->send_cluster_message(
4803 backfill_osd_it->osd,
4804 new MBackfillReserve(
4805 MBackfillReserve::REQUEST,
4806 spg_t(context< PeeringMachine >().spgid.pgid, backfill_osd_it->shard),
4807 ps->get_osdmap_epoch(),
4808 ps->get_backfill_priority(),
4809 num_bytes,
4810 ps->peer_bytes[*backfill_osd_it]),
4811 ps->get_osdmap_epoch());
4812 ++backfill_osd_it;
4813 } else {
4814 ps->peer_bytes.clear();
4815 post_event(AllBackfillsReserved());
4816 }
4817 return discard_event();
4818 }
4819
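// Simulation sketch (hypothetical two-target PG, not part of the build) of
// the iterator walk above: the constructor posts the first
// RemoteBackfillReserved to itself, and each grant re-enters the handler to
// request the next reservation until AllBackfillsReserved is posted.
#if 0
#include <cstdio>
#include <set>

int main() {
  std::set<int> targets{1, 2};  // remote_shards_to_reserve_backfill
  auto it = targets.begin();    // backfill_osd_it
  while (it != targets.end()) {
    std::printf("send MBackfillReserve::REQUEST to osd.%d\n", *it);
    ++it;                       // a GRANT arrives and the handler re-runs
  }
  std::printf("all reservations held -> AllBackfillsReserved\n");
  return 0;
}
#endif
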
4820 void PeeringState::WaitRemoteBackfillReserved::exit()
4821 {
4822 context< PeeringMachine >().log_exit(state_name, enter_time);
4823 DECLARE_LOCALS;
4824
4825 utime_t dur = ceph_clock_now() - enter_time;
4826 pl->get_peering_perf().tinc(rs_waitremotebackfillreserved_latency, dur);
4827 }
4828
4829 void PeeringState::WaitRemoteBackfillReserved::retry()
4830 {
4831 DECLARE_LOCALS;
4832 pl->cancel_local_background_io_reservation();
4833
4834 // Send RELEASE to all previously acquired reservations
4835 set<pg_shard_t>::const_iterator it, begin, end;
4836 begin = context< Active >().remote_shards_to_reserve_backfill.begin();
4837 end = context< Active >().remote_shards_to_reserve_backfill.end();
4838 ceph_assert(begin != end);
4839 for (it = begin; it != backfill_osd_it; ++it) {
4840 // The primary never backfills itself
4841 ceph_assert(*it != ps->pg_whoami);
4842 pl->send_cluster_message(
4843 it->osd,
4844 new MBackfillReserve(
4845 MBackfillReserve::RELEASE,
4846 spg_t(context< PeeringMachine >().spgid.pgid, it->shard),
4847 ps->get_osdmap_epoch()),
4848 ps->get_osdmap_epoch());
4849 }
4850
4851 ps->state_clear(PG_STATE_BACKFILL_WAIT);
4852 pl->publish_stats_to_osd();
4853
4854 pl->schedule_event_after(
4855 std::make_shared<PGPeeringEvent>(
4856 ps->get_osdmap_epoch(),
4857 ps->get_osdmap_epoch(),
4858 RequestBackfill()),
4859 ps->cct->_conf->osd_backfill_retry_interval);
4860 }
4861
4862 boost::statechart::result
4863 PeeringState::WaitRemoteBackfillReserved::react(const RemoteReservationRejectedTooFull &evt)
4864 {
4865 DECLARE_LOCALS;
4866 ps->state_set(PG_STATE_BACKFILL_TOOFULL);
4867 retry();
4868 return transit<NotBackfilling>();
4869 }
4870
4871 boost::statechart::result
4872 PeeringState::WaitRemoteBackfillReserved::react(const RemoteReservationRevoked &evt)
4873 {
4874 retry();
4875 return transit<NotBackfilling>();
4876 }
4877
4878 /*--WaitLocalBackfillReserved--*/
4879 PeeringState::WaitLocalBackfillReserved::WaitLocalBackfillReserved(my_context ctx)
4880 : my_base(ctx),
4881 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/WaitLocalBackfillReserved")
4882 {
4883 context< PeeringMachine >().log_enter(state_name);
4884 DECLARE_LOCALS;
4885
4886 ps->state_set(PG_STATE_BACKFILL_WAIT);
4887 pl->request_local_background_io_reservation(
4888 ps->get_backfill_priority(),
4889 std::make_shared<PGPeeringEvent>(
4890 ps->get_osdmap_epoch(),
4891 ps->get_osdmap_epoch(),
4892 LocalBackfillReserved()),
4893 std::make_shared<PGPeeringEvent>(
4894 ps->get_osdmap_epoch(),
4895 ps->get_osdmap_epoch(),
4896 DeferBackfill(0.0)));
4897 pl->publish_stats_to_osd();
4898 }
4899
4900 void PeeringState::WaitLocalBackfillReserved::exit()
4901 {
4902 context< PeeringMachine >().log_exit(state_name, enter_time);
4903 DECLARE_LOCALS;
4904 utime_t dur = ceph_clock_now() - enter_time;
4905 pl->get_peering_perf().tinc(rs_waitlocalbackfillreserved_latency, dur);
4906 }
4907
4908 /*----NotBackfilling------*/
4909 PeeringState::NotBackfilling::NotBackfilling(my_context ctx)
4910 : my_base(ctx),
4911 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/NotBackfilling")
4912 {
4913 context< PeeringMachine >().log_enter(state_name);
4914 DECLARE_LOCALS;
4915 ps->state_clear(PG_STATE_REPAIR);
4916 pl->publish_stats_to_osd();
4917 }
4918
4919 boost::statechart::result
4920 PeeringState::NotBackfilling::react(const RemoteBackfillReserved &evt)
4921 {
4922 return discard_event();
4923 }
4924
4925 boost::statechart::result
4926 PeeringState::NotBackfilling::react(const RemoteReservationRejectedTooFull &evt)
4927 {
4928 return discard_event();
4929 }
4930
4931 void PeeringState::NotBackfilling::exit()
4932 {
4933 context< PeeringMachine >().log_exit(state_name, enter_time);
4934
4935 DECLARE_LOCALS;
4936 ps->state_clear(PG_STATE_BACKFILL_UNFOUND);
4937 utime_t dur = ceph_clock_now() - enter_time;
4938 pl->get_peering_perf().tinc(rs_notbackfilling_latency, dur);
4939 }
4940
4941 /*----NotRecovering------*/
4942 PeeringState::NotRecovering::NotRecovering(my_context ctx)
4943 : my_base(ctx),
4944 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/NotRecovering")
4945 {
4946 context< PeeringMachine >().log_enter(state_name);
4947 DECLARE_LOCALS;
4948 ps->state_clear(PG_STATE_REPAIR);
4949 pl->publish_stats_to_osd();
4950 }
4951
4952 void PeeringState::NotRecovering::exit()
4953 {
4954 context< PeeringMachine >().log_exit(state_name, enter_time);
4955
4956 DECLARE_LOCALS;
4957 ps->state_clear(PG_STATE_RECOVERY_UNFOUND);
4958 utime_t dur = ceph_clock_now() - enter_time;
4959 pl->get_peering_perf().tinc(rs_notrecovering_latency, dur);
4960 }
4961
4962 /*---RepNotRecovering----*/
4963 PeeringState::RepNotRecovering::RepNotRecovering(my_context ctx)
4964 : my_base(ctx),
4965 NamedState(context< PeeringMachine >().state_history, "Started/ReplicaActive/RepNotRecovering")
4966 {
4967 context< PeeringMachine >().log_enter(state_name);
4968 }
4969
4970 boost::statechart::result
4971 PeeringState::RepNotRecovering::react(const RejectTooFullRemoteReservation &evt)
4972 {
4973 DECLARE_LOCALS;
4974 ps->reject_reservation();
4975 post_event(RemoteReservationRejectedTooFull());
4976 return discard_event();
4977 }
4978
4979 void PeeringState::RepNotRecovering::exit()
4980 {
4981 context< PeeringMachine >().log_exit(state_name, enter_time);
4982 DECLARE_LOCALS;
4983 utime_t dur = ceph_clock_now() - enter_time;
4984 pl->get_peering_perf().tinc(rs_repnotrecovering_latency, dur);
4985 }
4986
4987 /*---RepWaitRecoveryReserved--*/
4988 PeeringState::RepWaitRecoveryReserved::RepWaitRecoveryReserved(my_context ctx)
4989 : my_base(ctx),
4990 NamedState(context< PeeringMachine >().state_history, "Started/ReplicaActive/RepWaitRecoveryReserved")
4991 {
4992 context< PeeringMachine >().log_enter(state_name);
4993 }
4994
4995 boost::statechart::result
4996 PeeringState::RepWaitRecoveryReserved::react(const RemoteRecoveryReserved &evt)
4997 {
4998 DECLARE_LOCALS;
4999 pl->send_cluster_message(
5000 ps->primary.osd,
5001 new MRecoveryReserve(
5002 MRecoveryReserve::GRANT,
5003 spg_t(ps->info.pgid.pgid, ps->primary.shard),
5004 ps->get_osdmap_epoch()),
5005 ps->get_osdmap_epoch());
5006 return transit<RepRecovering>();
5007 }
5008
5009 boost::statechart::result
5010 PeeringState::RepWaitRecoveryReserved::react(
5011 const RemoteReservationCanceled &evt)
5012 {
5013 DECLARE_LOCALS;
5014 pl->unreserve_recovery_space();
5015
5016 pl->cancel_remote_recovery_reservation();
5017 return transit<RepNotRecovering>();
5018 }
5019
5020 void PeeringState::RepWaitRecoveryReserved::exit()
5021 {
5022 context< PeeringMachine >().log_exit(state_name, enter_time);
5023 DECLARE_LOCALS;
5024 utime_t dur = ceph_clock_now() - enter_time;
5025 pl->get_peering_perf().tinc(rs_repwaitrecoveryreserved_latency, dur);
5026 }
5027
5028 /*-RepWaitBackfillReserved*/
5029 PeeringState::RepWaitBackfillReserved::RepWaitBackfillReserved(my_context ctx)
5030 : my_base(ctx),
5031 NamedState(context< PeeringMachine >().state_history, "Started/ReplicaActive/RepWaitBackfillReserved")
5032 {
5033 context< PeeringMachine >().log_enter(state_name);
5034 }
5035
5036 boost::statechart::result
5037 PeeringState::RepNotRecovering::react(const RequestBackfillPrio &evt)
5038 {
5039
5040 DECLARE_LOCALS;
5041
5042 if (!pl->try_reserve_recovery_space(
5043 evt.primary_num_bytes, evt.local_num_bytes)) {
5044 post_event(RejectTooFullRemoteReservation());
5045 } else {
5046 PGPeeringEventRef preempt;
5047 if (HAVE_FEATURE(ps->upacting_features, RECOVERY_RESERVATION_2)) {
5048 // older peers will interpret preemption as TOOFULL
5049 preempt = std::make_shared<PGPeeringEvent>(
5050 pl->get_osdmap_epoch(),
5051 pl->get_osdmap_epoch(),
5052 RemoteBackfillPreempted());
5053 }
5054 pl->request_remote_recovery_reservation(
5055 evt.priority,
5056 std::make_shared<PGPeeringEvent>(
5057 pl->get_osdmap_epoch(),
5058 pl->get_osdmap_epoch(),
5059 RemoteBackfillReserved()),
5060 preempt);
5061 }
5062 return transit<RepWaitBackfillReserved>();
5063 }
5064
5065 boost::statechart::result
5066 PeeringState::RepNotRecovering::react(const RequestRecoveryPrio &evt)
5067 {
5068 DECLARE_LOCALS;
5069
5070 // fall back to a local reckoning of priority if the primary doesn't pass one
5071 // (pre-mimic compat)
5072 int prio = evt.priority ? evt.priority : ps->get_recovery_priority();
5073
5074 PGPeeringEventRef preempt;
5075 if (HAVE_FEATURE(ps->upacting_features, RECOVERY_RESERVATION_2)) {
5076 // older peers can't handle this
5077 preempt = std::make_shared<PGPeeringEvent>(
5078 ps->get_osdmap_epoch(),
5079 ps->get_osdmap_epoch(),
5080 RemoteRecoveryPreempted());
5081 }
5082
5083 pl->request_remote_recovery_reservation(
5084 prio,
5085 std::make_shared<PGPeeringEvent>(
5086 ps->get_osdmap_epoch(),
5087 ps->get_osdmap_epoch(),
5088 RemoteRecoveryReserved()),
5089 preempt);
5090 return transit<RepWaitRecoveryReserved>();
5091 }
5092
5093 void PeeringState::RepWaitBackfillReserved::exit()
5094 {
5095 context< PeeringMachine >().log_exit(state_name, enter_time);
5096 DECLARE_LOCALS;
5097 utime_t dur = ceph_clock_now() - enter_time;
5098 pl->get_peering_perf().tinc(rs_repwaitbackfillreserved_latency, dur);
5099 }
5100
5101 boost::statechart::result
5102 PeeringState::RepWaitBackfillReserved::react(const RemoteBackfillReserved &evt)
5103 {
5104 DECLARE_LOCALS;
5105
5106
5107 pl->send_cluster_message(
5108 ps->primary.osd,
5109 new MBackfillReserve(
5110 MBackfillReserve::GRANT,
5111 spg_t(ps->info.pgid.pgid, ps->primary.shard),
5112 ps->get_osdmap_epoch()),
5113 ps->get_osdmap_epoch());
5114 return transit<RepRecovering>();
5115 }
5116
5117 boost::statechart::result
5118 PeeringState::RepWaitBackfillReserved::react(
5119 const RejectTooFullRemoteReservation &evt)
5120 {
5121 DECLARE_LOCALS;
5122 ps->reject_reservation();
5123 post_event(RemoteReservationRejectedTooFull());
5124 return discard_event();
5125 }
5126
5127 boost::statechart::result
5128 PeeringState::RepWaitBackfillReserved::react(
5129 const RemoteReservationRejectedTooFull &evt)
5130 {
5131 DECLARE_LOCALS;
5132 pl->unreserve_recovery_space();
5133
5134 pl->cancel_remote_recovery_reservation();
5135 return transit<RepNotRecovering>();
5136 }
5137
5138 boost::statechart::result
5139 PeeringState::RepWaitBackfillReserved::react(
5140 const RemoteReservationCanceled &evt)
5141 {
5142 DECLARE_LOCALS;
5143 pl->unreserve_recovery_space();
5144
5145 pl->cancel_remote_recovery_reservation();
5146 return transit<RepNotRecovering>();
5147 }
5148
5149 /*---RepRecovering-------*/
5150 PeeringState::RepRecovering::RepRecovering(my_context ctx)
5151 : my_base(ctx),
5152 NamedState(context< PeeringMachine >().state_history, "Started/ReplicaActive/RepRecovering")
5153 {
5154 context< PeeringMachine >().log_enter(state_name);
5155 }
5156
5157 boost::statechart::result
5158 PeeringState::RepRecovering::react(const RemoteRecoveryPreempted &)
5159 {
5160 DECLARE_LOCALS;
5161
5162
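// Our in-progress recovery reservation was preempted by a higher
// priority pg: release the local space and send REVOKE so the primary
// knows to stop recovering through us and re-request later.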
5163 pl->unreserve_recovery_space();
5164 pl->send_cluster_message(
5165 ps->primary.osd,
5166 new MRecoveryReserve(
5167 MRecoveryReserve::REVOKE,
5168 spg_t(ps->info.pgid.pgid, ps->primary.shard),
5169 ps->get_osdmap_epoch()),
5170 ps->get_osdmap_epoch());
5171 return discard_event();
5172 }
5173
5174 boost::statechart::result
5175 PeeringState::RepRecovering::react(const BackfillTooFull &)
5176 {
5177 DECLARE_LOCALS;
5178
5179
5180 pl->unreserve_recovery_space();
5181 pl->send_cluster_message(
5182 ps->primary.osd,
5183 new MBackfillReserve(
5184 MBackfillReserve::REVOKE_TOOFULL,
5185 spg_t(ps->info.pgid.pgid, ps->primary.shard),
5186 ps->get_osdmap_epoch()),
5187 ps->get_osdmap_epoch());
5188 return discard_event();
5189 }
5190
5191 boost::statechart::result
5192 PeeringState::RepRecovering::react(const RemoteBackfillPreempted &)
5193 {
5194 DECLARE_LOCALS;
5195
5196
5197 pl->unreserve_recovery_space();
5198 pl->send_cluster_message(
5199 ps->primary.osd,
5200 new MBackfillReserve(
5201 MBackfillReserve::REVOKE,
5202 spg_t(ps->info.pgid.pgid, ps->primary.shard),
5203 ps->get_osdmap_epoch()),
5204 ps->get_osdmap_epoch());
5205 return discard_event();
5206 }
5207
5208 void PeeringState::RepRecovering::exit()
5209 {
5210 context< PeeringMachine >().log_exit(state_name, enter_time);
5211 DECLARE_LOCALS;
5212 pl->unreserve_recovery_space();
5213
5214 pl->cancel_remote_recovery_reservation();
5215 utime_t dur = ceph_clock_now() - enter_time;
5216 pl->get_peering_perf().tinc(rs_reprecovering_latency, dur);
5217 }
5218
5219 /*------Activating--------*/
5220 PeeringState::Activating::Activating(my_context ctx)
5221 : my_base(ctx),
5222 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/Activating")
5223 {
5224 context< PeeringMachine >().log_enter(state_name);
5225 }
5226
5227 void PeeringState::Activating::exit()
5228 {
5229 context< PeeringMachine >().log_exit(state_name, enter_time);
5230 DECLARE_LOCALS;
5231 utime_t dur = ceph_clock_now() - enter_time;
5232 pl->get_peering_perf().tinc(rs_activating_latency, dur);
5233 }
5234
5235 PeeringState::WaitLocalRecoveryReserved::WaitLocalRecoveryReserved(my_context ctx)
5236 : my_base(ctx),
5237 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/WaitLocalRecoveryReserved")
5238 {
5239 context< PeeringMachine >().log_enter(state_name);
5240 DECLARE_LOCALS;
5241
5242 // Make sure all nodes that are part of the recovery aren't full
5243 if (!ps->cct->_conf->osd_debug_skip_full_check_in_recovery &&
5244 ps->get_osdmap()->check_full(ps->acting_recovery_backfill)) {
5245 post_event(RecoveryTooFull());
5246 return;
5247 }
5248
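// Local half of the two-phase reservation: queue with this OSD's local
// reserver at our recovery priority. LocalRecoveryReserved fires when
// the slot is granted; DeferRecovery fires if we are preempted by a
// higher priority pg.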
5249 ps->state_clear(PG_STATE_RECOVERY_TOOFULL);
5250 ps->state_set(PG_STATE_RECOVERY_WAIT);
5251 pl->request_local_background_io_reservation(
5252 ps->get_recovery_priority(),
5253 std::make_shared<PGPeeringEvent>(
5254 ps->get_osdmap_epoch(),
5255 ps->get_osdmap_epoch(),
5256 LocalRecoveryReserved()),
5257 std::make_shared<PGPeeringEvent>(
5258 ps->get_osdmap_epoch(),
5259 ps->get_osdmap_epoch(),
5260 DeferRecovery(0.0)));
5261 pl->publish_stats_to_osd();
5262 }
5263
5264 boost::statechart::result
5265 PeeringState::WaitLocalRecoveryReserved::react(const RecoveryTooFull &evt)
5266 {
5267 DECLARE_LOCALS;
5268 ps->state_set(PG_STATE_RECOVERY_TOOFULL);
5269 pl->schedule_event_after(
5270 std::make_shared<PGPeeringEvent>(
5271 ps->get_osdmap_epoch(),
5272 ps->get_osdmap_epoch(),
5273 DoRecovery()),
5274 ps->cct->_conf->osd_recovery_retry_interval);
5275 return transit<NotRecovering>();
5276 }
5277
5278 void PeeringState::WaitLocalRecoveryReserved::exit()
5279 {
5280 context< PeeringMachine >().log_exit(state_name, enter_time);
5281 DECLARE_LOCALS;
5282 utime_t dur = ceph_clock_now() - enter_time;
5283 pl->get_peering_perf().tinc(rs_waitlocalrecoveryreserved_latency, dur);
5284 }
5285
5286 PeeringState::WaitRemoteRecoveryReserved::WaitRemoteRecoveryReserved(my_context ctx)
5287 : my_base(ctx),
5288 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/WaitRemoteRecoveryReserved"),
5289 remote_recovery_reservation_it(context< Active >().remote_shards_to_reserve_recovery.begin())
5290 {
5291 context< PeeringMachine >().log_enter(state_name);
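// Seed the reservation chain: the RemoteRecoveryReserved handler below
// sends a REQUEST to one remote shard per event, so posting the event
// to ourselves kicks off the first request (or AllRemotesReserved if
// the set is empty).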
5292 post_event(RemoteRecoveryReserved());
5293 }
5294
5295 boost::statechart::result
5296 PeeringState::WaitRemoteRecoveryReserved::react(const RemoteRecoveryReserved &evt) {
5297 DECLARE_LOCALS;
5298
5299 if (remote_recovery_reservation_it !=
5300 context< Active >().remote_shards_to_reserve_recovery.end()) {
5301 ceph_assert(*remote_recovery_reservation_it != ps->pg_whoami);
5302 pl->send_cluster_message(
5303 remote_recovery_reservation_it->osd,
5304 new MRecoveryReserve(
5305 MRecoveryReserve::REQUEST,
5306 spg_t(context< PeeringMachine >().spgid.pgid,
5307 remote_recovery_reservation_it->shard),
5308 ps->get_osdmap_epoch(),
5309 ps->get_recovery_priority()),
5310 ps->get_osdmap_epoch());
5311 ++remote_recovery_reservation_it;
5312 } else {
5313 post_event(AllRemotesReserved());
5314 }
5315 return discard_event();
5316 }
5317
5318 void PeeringState::WaitRemoteRecoveryReserved::exit()
5319 {
5320 context< PeeringMachine >().log_exit(state_name, enter_time);
5321 DECLARE_LOCALS;
5322 utime_t dur = ceph_clock_now() - enter_time;
5323 pl->get_peering_perf().tinc(rs_waitremoterecoveryreserved_latency, dur);
5324 }
5325
5326 PeeringState::Recovering::Recovering(my_context ctx)
5327 : my_base(ctx),
5328 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/Recovering")
5329 {
5330 context< PeeringMachine >().log_enter(state_name);
5331
5332 DECLARE_LOCALS;
5333 ps->state_clear(PG_STATE_RECOVERY_WAIT);
5334 ps->state_clear(PG_STATE_RECOVERY_TOOFULL);
5335 ps->state_set(PG_STATE_RECOVERING);
5336 pl->on_recovery_reserved();
5337 ceph_assert(!ps->state_test(PG_STATE_ACTIVATING));
5338 pl->publish_stats_to_osd();
5339 }
5340
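// Hand back all remote recovery reservations. cancel=true means
// recovery is being aborted (we may still have missing objects);
// otherwise recovery must have completed.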
5341 void PeeringState::Recovering::release_reservations(bool cancel)
5342 {
5343 DECLARE_LOCALS;
5344 ceph_assert(cancel || !ps->pg_log.get_missing().have_missing());
5345
5346 // release remote reservations
5347 for (set<pg_shard_t>::const_iterator i =
5348 context< Active >().remote_shards_to_reserve_recovery.begin();
5349 i != context< Active >().remote_shards_to_reserve_recovery.end();
5350 ++i) {
5351 if (*i == ps->pg_whoami) // skip myself
5352 continue;
5353 pl->send_cluster_message(
5354 i->osd,
5355 new MRecoveryReserve(
5356 MRecoveryReserve::RELEASE,
5357 spg_t(ps->info.pgid.pgid, i->shard),
5358 ps->get_osdmap_epoch()),
5359 ps->get_osdmap_epoch());
5360 }
5361 }
5362
5363 boost::statechart::result
5364 PeeringState::Recovering::react(const AllReplicasRecovered &evt)
5365 {
5366 DECLARE_LOCALS;
5367 ps->state_clear(PG_STATE_FORCED_RECOVERY);
5368 release_reservations();
5369 pl->cancel_local_background_io_reservation();
5370 return transit<Recovered>();
5371 }
5372
5373 boost::statechart::result
5374 PeeringState::Recovering::react(const RequestBackfill &evt)
5375 {
5376 DECLARE_LOCALS;
5377
5378 release_reservations();
5379
5380 ps->state_clear(PG_STATE_FORCED_RECOVERY);
5381 pl->cancel_local_background_io_reservation();
5382 pl->publish_stats_to_osd();
5383 // transition any async_recovery_targets back into acting
5384 // so the pg won't have to stay undersized for long,
5385 // as backfill might take a long time to complete.
5386 if (!ps->async_recovery_targets.empty()) {
5387 pg_shard_t auth_log_shard;
5388 bool history_les_bound = false;
5389 ps->choose_acting(auth_log_shard, true, &history_les_bound);
5390 }
5391 return transit<WaitLocalBackfillReserved>();
5392 }
5393
5394 boost::statechart::result
5395 PeeringState::Recovering::react(const DeferRecovery &evt)
5396 {
5397 DECLARE_LOCALS;
5398 if (!ps->state_test(PG_STATE_RECOVERING)) {
5399 // we may have finished recovery and have an AllReplicasRecovered
5400 // event queued to move us to the next state.
5401 psdout(10) << "got defer recovery but not recovering" << dendl;
5402 return discard_event();
5403 }
5404 psdout(10) << "defer recovery, retry delay " << evt.delay << dendl;
5405 ps->state_set(PG_STATE_RECOVERY_WAIT);
5406 pl->cancel_local_background_io_reservation();
5407 release_reservations(true);
5408 pl->schedule_event_after(
5409 std::make_shared<PGPeeringEvent>(
5410 ps->get_osdmap_epoch(),
5411 ps->get_osdmap_epoch(),
5412 DoRecovery()),
5413 evt.delay);
5414 return transit<NotRecovering>();
5415 }
5416
5417 boost::statechart::result
5418 PeeringState::Recovering::react(const UnfoundRecovery &evt)
5419 {
5420 DECLARE_LOCALS;
5421 psdout(10) << "recovery has unfound, can't continue" << dendl;
5422 ps->state_set(PG_STATE_RECOVERY_UNFOUND);
5423 pl->cancel_local_background_io_reservation();
5424 release_reservations(true);
5425 return transit<NotRecovering>();
5426 }
5427
5428 void PeeringState::Recovering::exit()
5429 {
5430 context< PeeringMachine >().log_exit(state_name, enter_time);
5431
5432 DECLARE_LOCALS;
5433 utime_t dur = ceph_clock_now() - enter_time;
5434 ps->state_clear(PG_STATE_RECOVERING);
5435 pl->get_peering_perf().tinc(rs_recovering_latency, dur);
5436 }
5437
5438 PeeringState::Recovered::Recovered(my_context ctx)
5439 : my_base(ctx),
5440 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/Recovered")
5441 {
5442 pg_shard_t auth_log_shard;
5443
5444 context< PeeringMachine >().log_enter(state_name);
5445
5446 DECLARE_LOCALS;
5447
5448 ceph_assert(!ps->needs_recovery());
5449
5450 // if we finished backfill, all acting are active; recheck if
5451 // DEGRADED | UNDERSIZED is appropriate.
5452 ceph_assert(!ps->acting_recovery_backfill.empty());
5453 if (ps->get_osdmap()->get_pg_size(context< PeeringMachine >().spgid.pgid) <=
5454 ps->acting_recovery_backfill.size()) {
5455 ps->state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
5456 pl->publish_stats_to_osd();
5457 }
5458
5459 // adjust acting set? (e.g. because backfill completed...)
5460 bool history_les_bound = false;
5461 if (ps->acting != ps->up && !ps->choose_acting(auth_log_shard,
5462 true, &history_les_bound)) {
5463 ceph_assert(ps->want_acting.size());
5464 } else if (!ps->async_recovery_targets.empty()) {
5465 ps->choose_acting(auth_log_shard, true, &history_les_bound);
5466 }
5467
5468 if (context< Active >().all_replicas_activated &&
5469 ps->async_recovery_targets.empty())
5470 post_event(GoClean());
5471 }
5472
5473 void PeeringState::Recovered::exit()
5474 {
5475 context< PeeringMachine >().log_exit(state_name, enter_time);
5476 DECLARE_LOCALS;
5477
5478 utime_t dur = ceph_clock_now() - enter_time;
5479 pl->get_peering_perf().tinc(rs_recovered_latency, dur);
5480 }
5481
5482 PeeringState::Clean::Clean(my_context ctx)
5483 : my_base(ctx),
5484 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/Clean")
5485 {
5486 context< PeeringMachine >().log_enter(state_name);
5487
5488 DECLARE_LOCALS;
5489
5490 if (ps->info.last_complete != ps->info.last_update) {
5491 ceph_abort();
5492 }
5493
5494
5495 ps->try_mark_clean();
5496
5497 context< PeeringMachine >().get_cur_transaction().register_on_commit(
5498 pl->on_clean());
5499 }
5500
5501 void PeeringState::Clean::exit()
5502 {
5503 context< PeeringMachine >().log_exit(state_name, enter_time);
5504
5505 DECLARE_LOCALS;
5506 ps->state_clear(PG_STATE_CLEAN);
5507 utime_t dur = ceph_clock_now() - enter_time;
5508 pl->get_peering_perf().tinc(rs_clean_latency, dur);
5509 }
5510
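// Collapse a collection of pg_shard_t to at most one shard per OSD,
// skipping `skip` (normally ourselves): an OSD that hosts several
// shards of an EC pg only needs a single reservation.
// e.g. in = { 0(0), 0(1), 1(0) }, skip = 1(0)  =>  out = { 0(0) }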
5511 template <typename T>
5512 set<pg_shard_t> unique_osd_shard_set(const pg_shard_t & skip, const T &in)
5513 {
5514 set<int> osds_found;
5515 set<pg_shard_t> out;
5516 for (typename T::const_iterator i = in.begin();
5517 i != in.end();
5518 ++i) {
5519 if (*i != skip && !osds_found.count(i->osd)) {
5520 osds_found.insert(i->osd);
5521 out.insert(*i);
5522 }
5523 }
5524 return out;
5525 }
5526
5527 /*---------Active---------*/
5528 PeeringState::Active::Active(my_context ctx)
5529 : my_base(ctx),
5530 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active"),
5531 remote_shards_to_reserve_recovery(
5532 unique_osd_shard_set(
5533 context< PeeringMachine >().state->pg_whoami,
5534 context< PeeringMachine >().state->acting_recovery_backfill)),
5535 remote_shards_to_reserve_backfill(
5536 unique_osd_shard_set(
5537 context< PeeringMachine >().state->pg_whoami,
5538 context< PeeringMachine >().state->backfill_targets)),
5539 all_replicas_activated(false)
5540 {
5541 context< PeeringMachine >().log_enter(state_name);
5542
5543
5544 DECLARE_LOCALS;
5545
5546 ceph_assert(!ps->backfill_reserved);
5547 ceph_assert(ps->is_primary());
5548 psdout(10) << "In Active, about to call activate" << dendl;
5549 ps->start_flush(context< PeeringMachine >().get_cur_transaction());
5550 ps->activate(context< PeeringMachine >().get_cur_transaction(),
5551 ps->get_osdmap_epoch(),
5552 context< PeeringMachine >().get_recovery_ctx());
5553
5554 // everyone has to commit/ack before we are truly active
5555 ps->blocked_by.clear();
5556 for (set<pg_shard_t>::iterator p = ps->acting_recovery_backfill.begin();
5557 p != ps->acting_recovery_backfill.end();
5558 ++p) {
5559 if (p->shard != ps->pg_whoami.shard) {
5560 ps->blocked_by.insert(p->shard);
5561 }
5562 }
5563 pl->publish_stats_to_osd();
5564 psdout(10) << "Activate Finished" << dendl;
5565 }
5566
5567 boost::statechart::result PeeringState::Active::react(const AdvMap& advmap)
5568 {
5569 DECLARE_LOCALS;
5570
5571 if (ps->should_restart_peering(
5572 advmap.up_primary,
5573 advmap.acting_primary,
5574 advmap.newup,
5575 advmap.newacting,
5576 advmap.lastmap,
5577 advmap.osdmap)) {
5578 psdout(10) << "Active advmap interval change, fast return" << dendl;
5579 return forward_event();
5580 }
5581 psdout(10) << "Active advmap" << dendl;
5582 bool need_publish = false;
5583
5584 pl->on_active_advmap(advmap.osdmap);
5585 if (ps->dirty_big_info) {
5586 // share updated purged_snaps with the mgr/mon so that we (a) stop reporting
5587 // already-purged snaps and (b) perhaps share more snaps that we have purged
5588 // but that didn't fit in pg_stat_t.
5589 need_publish = true;
5590 ps->share_pg_info();
5591 }
5592
5593 bool need_acting_change = false;
5594 for (size_t i = 0; i < ps->want_acting.size(); i++) {
5595 int osd = ps->want_acting[i];
5596 if (!advmap.osdmap->is_up(osd)) {
5597 pg_shard_t osd_with_shard(osd, shard_id_t(i));
5598 if (!ps->is_acting(osd_with_shard) && !ps->is_up(osd_with_shard)) {
5599 psdout(10) << "Active stray osd." << osd << " in want_acting is down"
5600 << dendl;
5601 need_acting_change = true;
5602 }
5603 }
5604 }
5605 if (need_acting_change) {
5606 psdout(10) << "Active need acting change, call choose_acting again"
5607 << dendl;
5608 // possibly because we re-added some strays into the acting set and
5609 // some of them then went down in a subsequent map before we could see
5610 // the map changing the pg_temp.
5611 // call choose_acting again to clear them out.
5612 // note that we leave restrict_to_up_acting false so that we do not
5613 // kick out any chosen stray that is still alive.
5614 pg_shard_t auth_log_shard;
5615 bool history_les_bound = false;
5616 ps->remove_down_peer_info(advmap.osdmap);
5617 ps->choose_acting(auth_log_shard, false, &history_les_bound, true);
5618 }
5619
5620 /* Check for changes in pool size (if the acting set changed as a result,
5621 * this does not matter) */
5622 if (advmap.lastmap->get_pg_size(ps->info.pgid.pgid) !=
5623 ps->get_osdmap()->get_pg_size(ps->info.pgid.pgid)) {
5624 if (ps->get_osdmap()->get_pg_size(ps->info.pgid.pgid) <=
5625 ps->actingset.size()) {
5626 ps->state_clear(PG_STATE_UNDERSIZED);
5627 } else {
5628 ps->state_set(PG_STATE_UNDERSIZED);
5629 }
5630 // degraded changes will be detected by the call to publish_stats_to_osd()
5631 need_publish = true;
5632 }
5633
5634 // if we haven't reported our PG stats in a long time, do so now.
5635 if (ps->info.stats.reported_epoch + ps->cct->_conf->osd_pg_stat_report_interval_max < advmap.osdmap->get_epoch()) {
5636 psdout(20) << "reporting stats to osd after " << (advmap.osdmap->get_epoch() - ps->info.stats.reported_epoch)
5637 << " epochs" << dendl;
5638 need_publish = true;
5639 }
5640
5641 if (need_publish)
5642 pl->publish_stats_to_osd();
5643
5644 if (ps->check_prior_readable_down_osds(advmap.osdmap)) {
5645 pl->recheck_readable();
5646 }
5647
5648 return forward_event();
5649 }
5650
5651 boost::statechart::result PeeringState::Active::react(const ActMap&)
5652 {
5653 DECLARE_LOCALS;
5654 psdout(10) << "Active: handling ActMap" << dendl;
5655 ceph_assert(ps->is_primary());
5656
5657 pl->on_active_actmap();
5658
5659 if (ps->have_unfound()) {
5660 // object may have become unfound
5661 ps->discover_all_missing(context<PeeringMachine>().get_recovery_ctx().msgs);
5662 }
5663
5664 uint64_t unfound = ps->missing_loc.num_unfound();
5665 if (unfound > 0 &&
5666 ps->all_unfound_are_queried_or_lost(ps->get_osdmap())) {
5667 if (ps->cct->_conf->osd_auto_mark_unfound_lost) {
5668 pl->get_clog_error() << context< PeeringMachine >().spgid.pgid << " has " << unfound
5669 << " objects unfound and apparently lost, would automatically "
5670 << "mark these objects lost but this feature is not yet implemented "
5671 << "(osd_auto_mark_unfound_lost)";
5672 } else
5673 pl->get_clog_error() << context< PeeringMachine >().spgid.pgid << " has "
5674 << unfound << " objects unfound and apparently lost";
5675 }
5676
5677 return forward_event();
5678 }
5679
5680 boost::statechart::result PeeringState::Active::react(const MNotifyRec& notevt)
5681 {
5682
5683 DECLARE_LOCALS;
5684 ceph_assert(ps->is_primary());
5685 if (ps->peer_info.count(notevt.from)) {
5686 psdout(10) << "Active: got notify from " << notevt.from
5687 << ", already have info from that osd, ignoring"
5688 << dendl;
5689 } else if (ps->peer_purged.count(notevt.from)) {
5690 psdout(10) << "Active: got notify from " << notevt.from
5691 << ", already purged that peer, ignoring"
5692 << dendl;
5693 } else {
5694 psdout(10) << "Active: got notify from " << notevt.from
5695 << ", calling proc_replica_info and discover_all_missing"
5696 << dendl;
5697 ps->proc_replica_info(
5698 notevt.from, notevt.notify.info, notevt.notify.epoch_sent);
5699 if (ps->have_unfound() || (ps->is_degraded() && ps->might_have_unfound.count(notevt.from))) {
5700 ps->discover_all_missing(
5701 context<PeeringMachine>().get_recovery_ctx().msgs);
5702 }
5703 // check if it is a previously-down acting member that's coming back.
5704 // if so, request a pg_temp change to trigger a new interval transition
5705 pg_shard_t auth_log_shard;
5706 bool history_les_bound = false;
5707 ps->choose_acting(auth_log_shard, false, &history_les_bound, true);
5708 if (!ps->want_acting.empty() && ps->want_acting != ps->acting) {
5709 psdout(10) << "Active: got notify from previous acting member "
5710 << notevt.from << ", requesting pg_temp change"
5711 << dendl;
5712 }
5713 }
5714 return discard_event();
5715 }
5716
5717 boost::statechart::result PeeringState::Active::react(const MTrim& trim)
5718 {
5719 DECLARE_LOCALS;
5720 ceph_assert(ps->is_primary());
5721
5722 // peer is informing us of their last_complete_ondisk
5723 ldout(ps->cct,10) << " replica osd." << trim.from << " lcod " << trim.trim_to << dendl;
5724 ps->update_peer_last_complete_ondisk(pg_shard_t{trim.from, trim.shard},
5725 trim.trim_to);
5726 // trim log when the pg is recovered
5727 ps->calc_min_last_complete_ondisk();
5728 return discard_event();
5729 }
5730
5731 boost::statechart::result PeeringState::Active::react(const MInfoRec& infoevt)
5732 {
5733 DECLARE_LOCALS;
5734 ceph_assert(ps->is_primary());
5735
5736 ceph_assert(!ps->acting_recovery_backfill.empty());
5737 if (infoevt.lease_ack) {
5738 ps->proc_lease_ack(infoevt.from.osd, *infoevt.lease_ack);
5739 }
5740 // don't update history (yet) if we are active and primary; the replica
5741 // may be telling us they have activated (and committed) but we can't
5742 // share that until _everyone_ does the same.
5743 if (ps->is_acting_recovery_backfill(infoevt.from) &&
5744 ps->peer_activated.count(infoevt.from) == 0) {
5745 psdout(10) << " peer osd." << infoevt.from
5746 << " activated and committed" << dendl;
5747 ps->peer_activated.insert(infoevt.from);
5748 ps->blocked_by.erase(infoevt.from.shard);
5749 pl->publish_stats_to_osd();
5750 if (ps->peer_activated.size() == ps->acting_recovery_backfill.size()) {
5751 all_activated_and_committed();
5752 }
5753 }
5754 return discard_event();
5755 }
5756
5757 boost::statechart::result PeeringState::Active::react(const MLogRec& logevt)
5758 {
5759 DECLARE_LOCALS;
5760 psdout(10) << "searching osd." << logevt.from
5761 << " log for unfound items" << dendl;
5762 ps->proc_replica_log(
5763 logevt.msg->info, logevt.msg->log, logevt.msg->missing, logevt.from);
5764 bool got_missing = ps->search_for_missing(
5765 ps->peer_info[logevt.from],
5766 ps->peer_missing[logevt.from],
5767 logevt.from,
5768 context< PeeringMachine >().get_recovery_ctx());
5769 // If there are missing objects AND we are "fully" active then start recovery now
5770 if (got_missing && ps->state_test(PG_STATE_ACTIVE)) {
5771 post_event(DoRecovery());
5772 }
5773 return discard_event();
5774 }
5775
5776 boost::statechart::result PeeringState::Active::react(const QueryState& q)
5777 {
5778 DECLARE_LOCALS;
5779
5780 q.f->open_object_section("state");
5781 q.f->dump_string("name", state_name);
5782 q.f->dump_stream("enter_time") << enter_time;
5783
5784 {
5785 q.f->open_array_section("might_have_unfound");
5786 for (set<pg_shard_t>::iterator p = ps->might_have_unfound.begin();
5787 p != ps->might_have_unfound.end();
5788 ++p) {
5789 q.f->open_object_section("osd");
5790 q.f->dump_stream("osd") << *p;
5791 if (ps->peer_missing.count(*p)) {
5792 q.f->dump_string("status", "already probed");
5793 } else if (ps->peer_missing_requested.count(*p)) {
5794 q.f->dump_string("status", "querying");
5795 } else if (!ps->get_osdmap()->is_up(p->osd)) {
5796 q.f->dump_string("status", "osd is down");
5797 } else {
5798 q.f->dump_string("status", "not queried");
5799 }
5800 q.f->close_section();
5801 }
5802 q.f->close_section();
5803 }
5804 {
5805 q.f->open_object_section("recovery_progress");
5806 q.f->open_array_section("backfill_targets");
5807 for (set<pg_shard_t>::const_iterator p = ps->backfill_targets.begin();
5808 p != ps->backfill_targets.end(); ++p)
5809 q.f->dump_stream("replica") << *p;
5810 q.f->close_section();
5811 pl->dump_recovery_info(q.f);
5812 q.f->close_section();
5813 }
5814
5815 q.f->close_section();
5816 return forward_event();
5817 }
5818
5819 boost::statechart::result PeeringState::Active::react(
5820 const ActivateCommitted &evt)
5821 {
5822 DECLARE_LOCALS;
5823 ceph_assert(!ps->peer_activated.count(ps->pg_whoami));
5824 ps->peer_activated.insert(ps->pg_whoami);
5825 psdout(10) << "_activate_committed " << evt.epoch
5826 << " peer_activated now " << ps->peer_activated
5827 << " last_interval_started "
5828 << ps->info.history.last_interval_started
5829 << " last_epoch_started "
5830 << ps->info.history.last_epoch_started
5831 << " same_interval_since "
5832 << ps->info.history.same_interval_since
5833 << dendl;
5834 ceph_assert(!ps->acting_recovery_backfill.empty());
5835 if (ps->peer_activated.size() == ps->acting_recovery_backfill.size())
5836 all_activated_and_committed();
5837 return discard_event();
5838 }
5839
5840 boost::statechart::result PeeringState::Active::react(const AllReplicasActivated &evt)
5841 {
5842
5843 DECLARE_LOCALS;
5844 pg_t pgid = context< PeeringMachine >().spgid.pgid;
5845
5846 all_replicas_activated = true;
5847
5848 ps->state_clear(PG_STATE_ACTIVATING);
5849 ps->state_clear(PG_STATE_CREATING);
5850 ps->state_clear(PG_STATE_PREMERGE);
5851
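// Merge handling: a pg that is about to merge only goes PEERED (plus
// PREMERGE), and we report to the OSD whether the merge source/target
// is fully ready (its acting set is at full size) so the merge can be
// coordinated cluster-wide.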
5852 bool merge_target;
5853 if (ps->pool.info.is_pending_merge(pgid, &merge_target)) {
5854 ps->state_set(PG_STATE_PEERED);
5855 ps->state_set(PG_STATE_PREMERGE);
5856
5857 if (ps->actingset.size() != ps->get_osdmap()->get_pg_size(pgid)) {
5858 if (merge_target) {
5859 pg_t src = pgid;
5860 src.set_ps(ps->pool.info.get_pg_num_pending());
5861 ceph_assert(src.get_parent() == pgid);
5862 pl->set_not_ready_to_merge_target(pgid, src);
5863 } else {
5864 pl->set_not_ready_to_merge_source(pgid);
5865 }
5866 }
5867 } else if (ps->acting.size() < ps->pool.info.min_size) {
5868 ps->state_set(PG_STATE_PEERED);
5869 } else {
5870 ps->state_set(PG_STATE_ACTIVE);
5871 }
5872
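// Read-lease gate: we cannot claim readability until any lease granted
// by the prior interval has provably expired, so if
// prior_readable_until_ub is still in the future, set WAIT and schedule
// a recheck.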
5873 auto mnow = pl->get_mnow();
5874 if (ps->prior_readable_until_ub > mnow) {
5875 psdout(10) << " waiting for prior_readable_until_ub "
5876 << ps->prior_readable_until_ub << " > mnow " << mnow << dendl;
5877 ps->state_set(PG_STATE_WAIT);
5878 pl->queue_check_readable(
5879 ps->last_peering_reset,
5880 ps->prior_readable_until_ub - mnow);
5881 } else {
5882 psdout(10) << " mnow " << mnow << " >= prior_readable_until_ub "
5883 << ps->prior_readable_until_ub << dendl;
5884 }
5885
5886 if (ps->pool.info.has_flag(pg_pool_t::FLAG_CREATING)) {
5887 pl->send_pg_created(pgid);
5888 }
5889
5890 ps->info.history.last_epoch_started = ps->info.last_epoch_started;
5891 ps->info.history.last_interval_started = ps->info.last_interval_started;
5892 ps->dirty_info = true;
5893
5894 ps->share_pg_info();
5895 pl->publish_stats_to_osd();
5896
5897 pl->on_activate_complete();
5898
5899 return discard_event();
5900 }
5901
5902 boost::statechart::result PeeringState::Active::react(const RenewLease& rl)
5903 {
5904 DECLARE_LOCALS;
5905 ps->proc_renew_lease();
5906 return discard_event();
5907 }
5908
5909 boost::statechart::result PeeringState::Active::react(const MLeaseAck& la)
5910 {
5911 DECLARE_LOCALS;
5912 ps->proc_lease_ack(la.from, la.lease_ack);
5913 return discard_event();
5914 }
5915
5916
5917 boost::statechart::result PeeringState::Active::react(const CheckReadable &evt)
5918 {
5919 DECLARE_LOCALS;
5920 pl->recheck_readable();
5921 return discard_event();
5922 }
5923
5924 /*
5925 * update info.history.last_epoch_started ONLY after we and all
5926 * replicas have activated AND committed the activate transaction
5927 * (i.e. the peering results are stable on disk).
5928 */
5929 void PeeringState::Active::all_activated_and_committed()
5930 {
5931 DECLARE_LOCALS;
5932 psdout(10) << "all_activated_and_committed" << dendl;
5933 ceph_assert(ps->is_primary());
5934 ceph_assert(ps->peer_activated.size() == ps->acting_recovery_backfill.size());
5935 ceph_assert(!ps->acting_recovery_backfill.empty());
5936 ceph_assert(ps->blocked_by.empty());
5937
5938 if (HAVE_FEATURE(ps->upacting_features, SERVER_OCTOPUS)) {
5939 // this is overkill when the activation is quick, but when it is slow it
5940 // is important, because the lease was renewed by the activate itself but we
5941 // don't know how long ago that was, and simply scheduling now may leave
5942 // a gap in lease coverage. keep it simple and aggressively renew.
5943 ps->renew_lease(pl->get_mnow());
5944 ps->send_lease();
5945 ps->schedule_renew_lease();
5946 }
5947
5948 // Degraded?
5949 ps->update_calc_stats();
5950 if (ps->info.stats.stats.sum.num_objects_degraded) {
5951 ps->state_set(PG_STATE_DEGRADED);
5952 } else {
5953 ps->state_clear(PG_STATE_DEGRADED);
5954 }
5955
5956 post_event(PeeringState::AllReplicasActivated());
5957 }
5958
5959
5960 void PeeringState::Active::exit()
5961 {
5962 context< PeeringMachine >().log_exit(state_name, enter_time);
5963
5964
5965 DECLARE_LOCALS;
5966 pl->cancel_local_background_io_reservation();
5967
5968 ps->blocked_by.clear();
5969 ps->backfill_reserved = false;
5970 ps->state_clear(PG_STATE_ACTIVATING);
5971 ps->state_clear(PG_STATE_DEGRADED);
5972 ps->state_clear(PG_STATE_UNDERSIZED);
5973 ps->state_clear(PG_STATE_BACKFILL_TOOFULL);
5974 ps->state_clear(PG_STATE_BACKFILL_WAIT);
5975 ps->state_clear(PG_STATE_RECOVERY_WAIT);
5976 ps->state_clear(PG_STATE_RECOVERY_TOOFULL);
5977 utime_t dur = ceph_clock_now() - enter_time;
5978 pl->get_peering_perf().tinc(rs_active_latency, dur);
5979 pl->on_active_exit();
5980 }
5981
5982 /*------ReplicaActive-----*/
5983 PeeringState::ReplicaActive::ReplicaActive(my_context ctx)
5984 : my_base(ctx),
5985 NamedState(context< PeeringMachine >().state_history, "Started/ReplicaActive")
5986 {
5987 context< PeeringMachine >().log_enter(state_name);
5988
5989 DECLARE_LOCALS;
5990 ps->start_flush(context< PeeringMachine >().get_cur_transaction());
5991 }
5992
5993
5994 boost::statechart::result PeeringState::ReplicaActive::react(
5995 const Activate& actevt) {
5996 DECLARE_LOCALS;
5997 psdout(10) << "In ReplicaActive, about to call activate" << dendl;
5998 ps->activate(
5999 context< PeeringMachine >().get_cur_transaction(),
6000 actevt.activation_epoch,
6001 context< PeeringMachine >().get_recovery_ctx());
6002 psdout(10) << "Activate Finished" << dendl;
6003 return discard_event();
6004 }
6005
6006 boost::statechart::result PeeringState::ReplicaActive::react(
6007 const ActivateCommitted &evt)
6008 {
6009 DECLARE_LOCALS;
6010 psdout(10) << __func__ << " " << evt.epoch << " telling primary" << dendl;
6011
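// Our activate transaction has committed; send the primary an info
// whose history reflects the activation epoch so it can count us in
// peer_activated (see Active::react(const MInfoRec&)).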
6012 auto &rctx = context<PeeringMachine>().get_recovery_ctx();
6013 auto epoch = ps->get_osdmap_epoch();
6014 pg_info_t i = ps->info;
6015 i.history.last_epoch_started = evt.activation_epoch;
6016 i.history.last_interval_started = i.history.same_interval_since;
6017 rctx.send_info(
6018 ps->get_primary().osd,
6019 spg_t(ps->info.pgid.pgid, ps->get_primary().shard),
6020 epoch,
6021 epoch,
6022 i,
6023 {}, /* lease */
6024 ps->get_lease_ack());
6025
6026 if (ps->acting.size() >= ps->pool.info.min_size) {
6027 ps->state_set(PG_STATE_ACTIVE);
6028 } else {
6029 ps->state_set(PG_STATE_PEERED);
6030 }
6031 pl->on_activate_committed();
6032
6033 return discard_event();
6034 }
6035
6036 boost::statechart::result PeeringState::ReplicaActive::react(const MLease& l)
6037 {
6038 DECLARE_LOCALS;
6039 spg_t spgid = context< PeeringMachine >().spgid;
6040 epoch_t epoch = pl->get_osdmap_epoch();
6041
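// The primary is renewing our read lease: record it and ack, allowing
// the primary to extend readable_until for the acting set.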
6042 ps->proc_lease(l.lease);
6043 pl->send_cluster_message(
6044 ps->get_primary().osd,
6045 new MOSDPGLeaseAck(epoch,
6046 spg_t(spgid.pgid, ps->get_primary().shard),
6047 ps->get_lease_ack()),
6048 epoch);
6049 return discard_event();
6050 }
6051
6052 boost::statechart::result PeeringState::ReplicaActive::react(const MInfoRec& infoevt)
6053 {
6054 DECLARE_LOCALS;
6055 ps->proc_primary_info(context<PeeringMachine>().get_cur_transaction(),
6056 infoevt.info);
6057 return discard_event();
6058 }
6059
6060 boost::statechart::result PeeringState::ReplicaActive::react(const MLogRec& logevt)
6061 {
6062 DECLARE_LOCALS;
6063 psdout(10) << "received log from " << logevt.from << dendl;
6064 ObjectStore::Transaction &t = context<PeeringMachine>().get_cur_transaction();
6065 ps->merge_log(t, logevt.msg->info, logevt.msg->log, logevt.from);
6066 ceph_assert(ps->pg_log.get_head() == ps->info.last_update);
6067 if (logevt.msg->lease) {
6068 ps->proc_lease(*logevt.msg->lease);
6069 }
6070
6071 return discard_event();
6072 }
6073
6074 boost::statechart::result PeeringState::ReplicaActive::react(const MTrim& trim)
6075 {
6076 DECLARE_LOCALS;
6077 // primary is instructing us to trim
6078 ps->pg_log.trim(trim.trim_to, ps->info);
6079 ps->dirty_info = true;
6080 return discard_event();
6081 }
6082
6083 boost::statechart::result PeeringState::ReplicaActive::react(const ActMap&)
6084 {
6085 DECLARE_LOCALS;
6086 if (ps->should_send_notify() && ps->get_primary().osd >= 0) {
6087 ps->info.history.refresh_prior_readable_until_ub(
6088 pl->get_mnow(), ps->prior_readable_until_ub);
6089 context< PeeringMachine >().send_notify(
6090 ps->get_primary().osd,
6091 pg_notify_t(
6092 ps->get_primary().shard, ps->pg_whoami.shard,
6093 ps->get_osdmap_epoch(),
6094 ps->get_osdmap_epoch(),
6095 ps->info,
6096 ps->past_intervals));
6097 }
6098 return discard_event();
6099 }
6100
6101 boost::statechart::result PeeringState::ReplicaActive::react(
6102 const MQuery& query)
6103 {
6104 DECLARE_LOCALS;
6105 ps->fulfill_query(query, context<PeeringMachine>().get_recovery_ctx());
6106 return discard_event();
6107 }
6108
6109 boost::statechart::result PeeringState::ReplicaActive::react(const QueryState& q)
6110 {
6111 q.f->open_object_section("state");
6112 q.f->dump_string("name", state_name);
6113 q.f->dump_stream("enter_time") << enter_time;
6114 q.f->close_section();
6115 return forward_event();
6116 }
6117
6118 void PeeringState::ReplicaActive::exit()
6119 {
6120 context< PeeringMachine >().log_exit(state_name, enter_time);
6121 DECLARE_LOCALS;
6122 pl->unreserve_recovery_space();
6123
6124 pl->cancel_remote_recovery_reservation();
6125 utime_t dur = ceph_clock_now() - enter_time;
6126 pl->get_peering_perf().tinc(rs_replicaactive_latency, dur);
6127
6128 ps->min_last_complete_ondisk = eversion_t();
6129 }
6130
6131 /*-------Stray---*/
6132 PeeringState::Stray::Stray(my_context ctx)
6133 : my_base(ctx),
6134 NamedState(context< PeeringMachine >().state_history, "Started/Stray")
6135 {
6136 context< PeeringMachine >().log_enter(state_name);
6137
6138
6139 DECLARE_LOCALS;
6140 ceph_assert(!ps->is_peered());
6141 ceph_assert(!ps->is_peering());
6142 ceph_assert(!ps->is_primary());
6143
6144 if (!ps->get_osdmap()->have_pg_pool(ps->info.pgid.pgid.pool())) {
6145 ldout(ps->cct,10) << __func__ << " pool is deleted" << dendl;
6146 post_event(DeleteStart());
6147 } else {
6148 ps->start_flush(context< PeeringMachine >().get_cur_transaction());
6149 }
6150 }
6151
6152 boost::statechart::result PeeringState::Stray::react(const MLogRec& logevt)
6153 {
6154 DECLARE_LOCALS;
6155 MOSDPGLog *msg = logevt.msg.get();
6156 psdout(10) << "got info+log from osd." << logevt.from << " " << msg->info << " " << msg->log << dendl;
6157
6158 ObjectStore::Transaction &t = context<PeeringMachine>().get_cur_transaction();
6159 if (msg->info.last_backfill == hobject_t()) {
6160 // restart backfill
6161 ps->info = msg->info;
6162 pl->on_info_history_change();
6163 ps->dirty_info = true;
6164 ps->dirty_big_info = true; // maybe.
6165
6166 PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
6167 ps->pg_log.reset_backfill_claim_log(msg->log, rollbacker.get());
6168
6169 ps->pg_log.reset_backfill();
6170 } else {
6171 ps->merge_log(t, msg->info, msg->log, logevt.from);
6172 }
6173 if (logevt.msg->lease) {
6174 ps->proc_lease(*logevt.msg->lease);
6175 }
6176
6177 ceph_assert(ps->pg_log.get_head() == ps->info.last_update);
6178
6179 post_event(Activate(logevt.msg->info.last_epoch_started));
6180 return transit<ReplicaActive>();
6181 }
6182
6183 boost::statechart::result PeeringState::Stray::react(const MInfoRec& infoevt)
6184 {
6185 DECLARE_LOCALS;
6186 psdout(10) << "got info from osd." << infoevt.from << " " << infoevt.info << dendl;
6187
6188 if (ps->info.last_update > infoevt.info.last_update) {
6189 // rewind divergent log entries
6190 ObjectStore::Transaction &t = context<PeeringMachine>().get_cur_transaction();
6191 ps->rewind_divergent_log(t, infoevt.info.last_update);
6192 ps->info.stats = infoevt.info.stats;
6193 ps->info.hit_set = infoevt.info.hit_set;
6194 }
6195
6196 if (infoevt.lease) {
6197 ps->proc_lease(*infoevt.lease);
6198 }
6199
6200 ceph_assert(infoevt.info.last_update == ps->info.last_update);
6201 ceph_assert(ps->pg_log.get_head() == ps->info.last_update);
6202
6203 post_event(Activate(infoevt.info.last_epoch_started));
6204 return transit<ReplicaActive>();
6205 }
6206
6207 boost::statechart::result PeeringState::Stray::react(const MQuery& query)
6208 {
6209 DECLARE_LOCALS;
6210 ps->fulfill_query(query, context<PeeringMachine>().get_recovery_ctx());
6211 return discard_event();
6212 }
6213
6214 boost::statechart::result PeeringState::Stray::react(const ActMap&)
6215 {
6216 DECLARE_LOCALS;
6217 if (ps->should_send_notify() && ps->get_primary().osd >= 0) {
6218 ps->info.history.refresh_prior_readable_until_ub(
6219 pl->get_mnow(), ps->prior_readable_until_ub);
6220 context< PeeringMachine >().send_notify(
6221 ps->get_primary().osd,
6222 pg_notify_t(
6223 ps->get_primary().shard, ps->pg_whoami.shard,
6224 ps->get_osdmap_epoch(),
6225 ps->get_osdmap_epoch(),
6226 ps->info,
6227 ps->past_intervals));
6228 }
6229 return discard_event();
6230 }
6231
6232 void PeeringState::Stray::exit()
6233 {
6234 context< PeeringMachine >().log_exit(state_name, enter_time);
6235 DECLARE_LOCALS;
6236 utime_t dur = ceph_clock_now() - enter_time;
6237 pl->get_peering_perf().tinc(rs_stray_latency, dur);
6238 }
6239
6240
6241 /*--------ToDelete----------*/
6242 PeeringState::ToDelete::ToDelete(my_context ctx)
6243 : my_base(ctx),
6244 NamedState(context< PeeringMachine >().state_history, "Started/ToDelete")
6245 {
6246 context< PeeringMachine >().log_enter(state_name);
6247 DECLARE_LOCALS;
6248 pl->get_perf_logger().inc(l_osd_pg_removing);
6249 }
6250
6251 void PeeringState::ToDelete::exit()
6252 {
6253 context< PeeringMachine >().log_exit(state_name, enter_time);
6254 DECLARE_LOCALS;
6255 // note: on a successful removal, this path doesn't execute. see
6256 // _delete_some().
6257 pl->get_perf_logger().dec(l_osd_pg_removing);
6258
6259 pl->cancel_local_background_io_reservation();
6260 }
6261
6262 /*----WaitDeleteReserved----*/
6263 PeeringState::WaitDeleteReserved::WaitDeleteReserved(my_context ctx)
6264 : my_base(ctx),
6265 NamedState(context< PeeringMachine >().state_history,
6266 "Started/ToDelete/WaitDeleteReseved")
6267 {
6268 context< PeeringMachine >().log_enter(state_name);
6269 DECLARE_LOCALS;
6270 context< ToDelete >().priority = ps->get_delete_priority();
6271
6272 pl->cancel_local_background_io_reservation();
6273 pl->request_local_background_io_reservation(
6274 context<ToDelete>().priority,
6275 std::make_shared<PGPeeringEvent>(
6276 ps->get_osdmap_epoch(),
6277 ps->get_osdmap_epoch(),
6278 DeleteReserved()),
6279 std::make_shared<PGPeeringEvent>(
6280 ps->get_osdmap_epoch(),
6281 ps->get_osdmap_epoch(),
6282 DeleteInterrupted()));
6283 }
6284
6285 boost::statechart::result PeeringState::ToDelete::react(
6286 const ActMap& evt)
6287 {
6288 DECLARE_LOCALS;
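// The delete priority can change as the OSD fills up; restart ToDelete
// so the reservation is re-queued at the new priority.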
6289 if (ps->get_delete_priority() != priority) {
6290 psdout(10) << __func__ << " delete priority changed, resetting"
6291 << dendl;
6292 return transit<ToDelete>();
6293 }
6294 return discard_event();
6295 }
6296
6297 void PeeringState::WaitDeleteReserved::exit()
6298 {
6299 context< PeeringMachine >().log_exit(state_name, enter_time);
6300 }
6301
6302 /*----Deleting-----*/
6303 PeeringState::Deleting::Deleting(my_context ctx)
6304 : my_base(ctx),
6305 NamedState(context< PeeringMachine >().state_history, "Started/ToDelete/Deleting")
6306 {
6307 start = ceph::mono_clock::now();
6308
6309 context< PeeringMachine >().log_enter(state_name);
6310
6311 DECLARE_LOCALS;
6312 ps->deleting = true;
6313 ObjectStore::Transaction &t = context<PeeringMachine>().get_cur_transaction();
6314
6315 // clear log
6316 PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
6317 ps->pg_log.roll_forward(rollbacker.get());
6318
6319 // adjust info to backfill
6320 ps->info.set_last_backfill(hobject_t());
6321 ps->pg_log.reset_backfill();
6322 ps->dirty_info = true;
6323
6324 pl->on_removal(t);
6325 }
6326
6327 boost::statechart::result PeeringState::Deleting::react(
6328 const DeleteSome& evt)
6329 {
6330 DECLARE_LOCALS;
6331 next = pl->do_delete_work(context<PeeringMachine>().get_cur_transaction(),
6332 next);
6333 return discard_event();
6334 }
6335
6336 void PeeringState::Deleting::exit()
6337 {
6338 context< PeeringMachine >().log_exit(state_name, enter_time);
6339 DECLARE_LOCALS;
6340 ps->deleting = false;
6341 pl->cancel_local_background_io_reservation();
6342 psdout(20) << "Deleting::" << __func__ << this <<" finished in "
6343 << ceph::mono_clock::now() - start
6344 << dendl;
6345 }
6346
6347 /*--------GetInfo---------*/
6348 PeeringState::GetInfo::GetInfo(my_context ctx)
6349 : my_base(ctx),
6350 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering/GetInfo")
6351 {
6352 context< PeeringMachine >().log_enter(state_name);
6353
6354
6355 DECLARE_LOCALS;
6356 ps->check_past_interval_bounds();
6357 ps->log_weirdness();
6358 PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
6359
6360 ceph_assert(ps->blocked_by.empty());
6361
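// The prior set is the collection of peers from recent past intervals
// that may hold a more authoritative log; each one must be probed (or
// ruled out) before we can pick an auth log shard.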
6362 prior_set = ps->build_prior();
6363 ps->prior_readable_down_osds = prior_set.down;
6364 if (ps->prior_readable_down_osds.empty()) {
6365 psdout(10) << " no prior_set down osds, clearing prior_readable_until_ub"
6366 << dendl;
6367 ps->clear_prior_readable_until_ub();
6368 }
6369
6370 ps->reset_min_peer_features();
6371 get_infos();
6372 if (prior_set.pg_down) {
6373 post_event(IsDown());
6374 } else if (peer_info_requested.empty()) {
6375 post_event(GotInfo());
6376 }
6377 }
6378
6379 void PeeringState::GetInfo::get_infos()
6380 {
6381 DECLARE_LOCALS;
6382 PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
6383
6384 ps->blocked_by.clear();
6385 for (set<pg_shard_t>::const_iterator it = prior_set.probe.begin();
6386 it != prior_set.probe.end();
6387 ++it) {
6388 pg_shard_t peer = *it;
6389 if (peer == ps->pg_whoami) {
6390 continue;
6391 }
6392 if (ps->peer_info.count(peer)) {
6393 psdout(10) << " have osd." << peer << " info " << ps->peer_info[peer] << dendl;
6394 continue;
6395 }
6396 if (peer_info_requested.count(peer)) {
6397 psdout(10) << " already requested info from osd." << peer << dendl;
6398 ps->blocked_by.insert(peer.osd);
6399 } else if (!ps->get_osdmap()->is_up(peer.osd)) {
6400 psdout(10) << " not querying info from down osd." << peer << dendl;
6401 } else {
6402 psdout(10) << " querying info from osd." << peer << dendl;
6403 context< PeeringMachine >().send_query(
6404 peer.osd,
6405 pg_query_t(pg_query_t::INFO,
6406 it->shard, ps->pg_whoami.shard,
6407 ps->info.history,
6408 ps->get_osdmap_epoch()));
6409 peer_info_requested.insert(peer);
6410 ps->blocked_by.insert(peer.osd);
6411 }
6412 }
6413
6414 ps->check_prior_readable_down_osds(ps->get_osdmap());
6415
6416 pl->publish_stats_to_osd();
6417 }
6418
6419 boost::statechart::result PeeringState::GetInfo::react(const MNotifyRec& infoevt)
6420 {
6421
6422 DECLARE_LOCALS;
6423
6424 set<pg_shard_t>::iterator p = peer_info_requested.find(infoevt.from);
6425 if (p != peer_info_requested.end()) {
6426 peer_info_requested.erase(p);
6427 ps->blocked_by.erase(infoevt.from.osd);
6428 }
6429
6430 epoch_t old_start = ps->info.history.last_epoch_started;
6431 if (ps->proc_replica_info(
6432 infoevt.from, infoevt.notify.info, infoevt.notify.epoch_sent)) {
6433 // we got something new ...
6434 PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
6435 if (old_start < ps->info.history.last_epoch_started) {
6436 psdout(10) << " last_epoch_started moved forward, rebuilding prior" << dendl;
6437 prior_set = ps->build_prior();
6438 ps->prior_readable_down_osds = prior_set.down;
6439
6440 // filter out any osds that got dropped from the probe set from
6441 // peer_info_requested. this is less expensive than restarting
6442 // peering (which would re-probe everyone).
6443 set<pg_shard_t>::iterator p = peer_info_requested.begin();
6444 while (p != peer_info_requested.end()) {
6445 if (prior_set.probe.count(*p) == 0) {
6446 psdout(20) << " dropping osd." << *p << " from info_requested, no longer in probe set" << dendl;
6447 peer_info_requested.erase(p++);
6448 } else {
6449 ++p;
6450 }
6451 }
6452 get_infos();
6453 }
6454 psdout(20) << "Adding osd: " << infoevt.from.osd << " peer features: "
6455 << hex << infoevt.features << dec << dendl;
6456 ps->apply_peer_features(infoevt.features);
6457
6458 // are we done getting everything?
6459 if (peer_info_requested.empty() && !prior_set.pg_down) {
6460 psdout(20) << "Common peer features: " << hex << ps->get_min_peer_features() << dec << dendl;
6461 psdout(20) << "Common acting features: " << hex << ps->get_min_acting_features() << dec << dendl;
6462 psdout(20) << "Common upacting features: " << hex << ps->get_min_upacting_features() << dec << dendl;
6463 post_event(GotInfo());
6464 }
6465 }
6466 return discard_event();
6467 }
6468
6469 boost::statechart::result PeeringState::GetInfo::react(const QueryState& q)
6470 {
6471 DECLARE_LOCALS;
6472 q.f->open_object_section("state");
6473 q.f->dump_string("name", state_name);
6474 q.f->dump_stream("enter_time") << enter_time;
6475
6476 q.f->open_array_section("requested_info_from");
6477 for (set<pg_shard_t>::iterator p = peer_info_requested.begin();
6478 p != peer_info_requested.end();
6479 ++p) {
6480 q.f->open_object_section("osd");
6481 q.f->dump_stream("osd") << *p;
6482 if (ps->peer_info.count(*p)) {
6483 q.f->open_object_section("got_info");
6484 ps->peer_info[*p].dump(q.f);
6485 q.f->close_section();
6486 }
6487 q.f->close_section();
6488 }
6489 q.f->close_section();
6490
6491 q.f->close_section();
6492 return forward_event();
6493 }
6494
6495 void PeeringState::GetInfo::exit()
6496 {
6497 context< PeeringMachine >().log_exit(state_name, enter_time);
6498
6499 DECLARE_LOCALS;
6500 utime_t dur = ceph_clock_now() - enter_time;
6501 pl->get_peering_perf().tinc(rs_getinfo_latency, dur);
6502 ps->blocked_by.clear();
6503 }
6504
6505 /*------GetLog------------*/
6506 PeeringState::GetLog::GetLog(my_context ctx)
6507 : my_base(ctx),
6508 NamedState(
6509 context< PeeringMachine >().state_history,
6510 "Started/Primary/Peering/GetLog"),
6511 msg(0)
6512 {
6513 context< PeeringMachine >().log_enter(state_name);
6514
6515 DECLARE_LOCALS;
6516
6517 ps->log_weirdness();
6518
6519 // adjust acting?
6520 if (!ps->choose_acting(auth_log_shard, false,
6521 &context< Peering >().history_les_bound)) {
6522 if (!ps->want_acting.empty()) {
6523 post_event(NeedActingChange());
6524 } else {
6525 post_event(IsIncomplete());
6526 }
6527 return;
6528 }
6529
6530 // am i the best?
6531 if (auth_log_shard == ps->pg_whoami) {
6532 post_event(GotLog());
6533 return;
6534 }
6535
6536 const pg_info_t& best = ps->peer_info[auth_log_shard];
6537
6538 // am i broken?
6539 if (ps->info.last_update < best.log_tail) {
6540 psdout(10) << " not contiguous with osd." << auth_log_shard << ", down" << dendl;
6541 post_event(IsIncomplete());
6542 return;
6543 }
6544
6545 // how much log to request?
6546 eversion_t request_log_from = ps->info.last_update;
6547 ceph_assert(!ps->acting_recovery_backfill.empty());
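// Walk the peers to lower request_log_from far enough that the fetched
// auth log will also cover any peer whose log we ourselves cannot roll
// forward (their last_update is behind our log_tail).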
6548 for (set<pg_shard_t>::iterator p = ps->acting_recovery_backfill.begin();
6549 p != ps->acting_recovery_backfill.end();
6550 ++p) {
6551 if (*p == ps->pg_whoami) continue;
6552 pg_info_t& ri = ps->peer_info[*p];
6553 if (ri.last_update < ps->info.log_tail && ri.last_update >= best.log_tail &&
6554 ri.last_update < request_log_from)
6555 request_log_from = ri.last_update;
6556 }
6557
6558 // request the log from the auth shard
6559 psdout(10) << " requesting log from osd." << auth_log_shard << dendl;
6560 context<PeeringMachine>().send_query(
6561 auth_log_shard.osd,
6562 pg_query_t(
6563 pg_query_t::LOG,
6564 auth_log_shard.shard, ps->pg_whoami.shard,
6565 request_log_from, ps->info.history,
6566 ps->get_osdmap_epoch()));
6567
6568 ceph_assert(ps->blocked_by.empty());
6569 ps->blocked_by.insert(auth_log_shard.osd);
6570 pl->publish_stats_to_osd();
6571 }
6572
6573 boost::statechart::result PeeringState::GetLog::react(const AdvMap& advmap)
6574 {
6575 // make sure our log source didn't go down. we need to check
6576 // explicitly because it may not be part of the prior set, which
6577 // means the Peering state check won't catch it going down.
6578 if (!advmap.osdmap->is_up(auth_log_shard.osd)) {
6579 psdout(10) << "GetLog: auth_log_shard osd."
6580 << auth_log_shard.osd << " went down" << dendl;
6581 post_event(advmap);
6582 return transit< Reset >();
6583 }
6584
6585 // let the Peering state do its checks.
6586 return forward_event();
6587 }
6588
6589 boost::statechart::result PeeringState::GetLog::react(const MLogRec& logevt)
6590 {
6591 ceph_assert(!msg);
6592 if (logevt.from != auth_log_shard) {
6593 psdout(10) << "GetLog: discarding log from "
6594 << "non-auth_log_shard osd." << logevt.from << dendl;
6595 return discard_event();
6596 }
6597 psdout(10) << "GetLog: received master log from osd."
6598 << logevt.from << dendl;
6599 msg = logevt.msg;
6600 post_event(GotLog());
6601 return discard_event();
6602 }
6603
6604 boost::statechart::result PeeringState::GetLog::react(const GotLog&)
6605 {
6606
6607 DECLARE_LOCALS;
6608 psdout(10) << "leaving GetLog" << dendl;
6609 if (msg) {
6610 psdout(10) << "processing master log" << dendl;
6611 ps->proc_master_log(context<PeeringMachine>().get_cur_transaction(),
6612 msg->info, msg->log, msg->missing,
6613 auth_log_shard);
6614 }
6615 ps->start_flush(context< PeeringMachine >().get_cur_transaction());
6616 return transit< GetMissing >();
6617 }
6618
6619 boost::statechart::result PeeringState::GetLog::react(const QueryState& q)
6620 {
6621 q.f->open_object_section("state");
6622 q.f->dump_string("name", state_name);
6623 q.f->dump_stream("enter_time") << enter_time;
6624 q.f->dump_stream("auth_log_shard") << auth_log_shard;
6625 q.f->close_section();
6626 return forward_event();
6627 }
6628
6629 void PeeringState::GetLog::exit()
6630 {
6631 context< PeeringMachine >().log_exit(state_name, enter_time);
6632
6633 DECLARE_LOCALS;
6634 utime_t dur = ceph_clock_now() - enter_time;
6635 pl->get_peering_perf().tinc(rs_getlog_latency, dur);
6636 ps->blocked_by.clear();
6637 }
6638
6639 /*------WaitActingChange--------*/
6640 PeeringState::WaitActingChange::WaitActingChange(my_context ctx)
6641 : my_base(ctx),
6642 NamedState(context< PeeringMachine >().state_history, "Started/Primary/WaitActingChange")
6643 {
6644 context< PeeringMachine >().log_enter(state_name);
6645 }
6646
6647 boost::statechart::result PeeringState::WaitActingChange::react(const AdvMap& advmap)
6648 {
6649 DECLARE_LOCALS;
6650 OSDMapRef osdmap = advmap.osdmap;
6651
6652 psdout(10) << "verifying no want_acting " << ps->want_acting << " targets didn't go down" << dendl;
6653 for (vector<int>::iterator p = ps->want_acting.begin(); p != ps->want_acting.end(); ++p) {
6654 if (!osdmap->is_up(*p)) {
6655 psdout(10) << " want_acting target osd." << *p << " went down, resetting" << dendl;
6656 post_event(advmap);
6657 return transit< Reset >();
6658 }
6659 }
6660 return forward_event();
6661 }
6662
6663 boost::statechart::result PeeringState::WaitActingChange::react(const MLogRec& logevt)
6664 {
6665 psdout(10) << "In WaitActingChange, ignoring MLocRec" << dendl;
6666 return discard_event();
6667 }
6668
6669 boost::statechart::result PeeringState::WaitActingChange::react(const MInfoRec& evt)
6670 {
6671 psdout(10) << "In WaitActingChange, ignoring MInfoRec" << dendl;
6672 return discard_event();
6673 }
6674
6675 boost::statechart::result PeeringState::WaitActingChange::react(const MNotifyRec& evt)
6676 {
6677 psdout(10) << "In WaitActingChange, ignoring MNotifyRec" << dendl;
6678 return discard_event();
6679 }
6680
6681 boost::statechart::result PeeringState::WaitActingChange::react(const QueryState& q)
6682 {
6683 q.f->open_object_section("state");
6684 q.f->dump_string("name", state_name);
6685 q.f->dump_stream("enter_time") << enter_time;
6686 q.f->dump_string("comment", "waiting for pg acting set to change");
6687 q.f->close_section();
6688 return forward_event();
6689 }
6690
6691 void PeeringState::WaitActingChange::exit()
6692 {
6693 context< PeeringMachine >().log_exit(state_name, enter_time);
6694 DECLARE_LOCALS;
6695 utime_t dur = ceph_clock_now() - enter_time;
6696 pl->get_peering_perf().tinc(rs_waitactingchange_latency, dur);
6697 }
6698
6699 /*------Down--------*/
6700 PeeringState::Down::Down(my_context ctx)
6701 : my_base(ctx),
6702 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering/Down")
6703 {
6704 context< PeeringMachine >().log_enter(state_name);
6705 DECLARE_LOCALS;
6706
6707 ps->state_clear(PG_STATE_PEERING);
6708 ps->state_set(PG_STATE_DOWN);
6709
6710 auto &prior_set = context< Peering >().prior_set;
6711 ceph_assert(ps->blocked_by.empty());
6712 ps->blocked_by.insert(prior_set.down.begin(), prior_set.down.end());
6713 pl->publish_stats_to_osd();
6714 }
6715
6716 void PeeringState::Down::exit()
6717 {
6718 context< PeeringMachine >().log_exit(state_name, enter_time);
6719
6720 DECLARE_LOCALS;
6721
6722 ps->state_clear(PG_STATE_DOWN);
6723 utime_t dur = ceph_clock_now() - enter_time;
6724 pl->get_peering_perf().tinc(rs_down_latency, dur);
6725
6726 ps->blocked_by.clear();
6727 }
6728
6729 boost::statechart::result PeeringState::Down::react(const QueryState& q)
6730 {
6731 q.f->open_object_section("state");
6732 q.f->dump_string("name", state_name);
6733 q.f->dump_stream("enter_time") << enter_time;
6734 q.f->dump_string("comment",
6735 "not enough up instances of this PG to go active");
6736 q.f->close_section();
6737 return forward_event();
6738 }
6739
6740 boost::statechart::result PeeringState::Down::react(const MNotifyRec& infoevt)
6741 {
6742 DECLARE_LOCALS;
6743
6744 ceph_assert(ps->is_primary());
6745 epoch_t old_start = ps->info.history.last_epoch_started;
6746 if (!ps->peer_info.count(infoevt.from) &&
6747 ps->get_osdmap()->has_been_up_since(infoevt.from.osd, infoevt.notify.epoch_sent)) {
6748 ps->update_history(infoevt.notify.info.history);
6749 }
6750 // if we got something new to make pg escape down state
6751 if (ps->info.history.last_epoch_started > old_start) {
6752 psdout(10) << " last_epoch_started moved forward, re-enter getinfo" << dendl;
6753 ps->state_clear(PG_STATE_DOWN);
6754 ps->state_set(PG_STATE_PEERING);
6755 return transit< GetInfo >();
6756 }
6757
6758 return discard_event();
6759 }
6760
6761
6762 /*------Incomplete--------*/
6763 PeeringState::Incomplete::Incomplete(my_context ctx)
6764 : my_base(ctx),
6765 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering/Incomplete")
6766 {
6767 context< PeeringMachine >().log_enter(state_name);
6768 DECLARE_LOCALS;
6769
6770 ps->state_clear(PG_STATE_PEERING);
6771 ps->state_set(PG_STATE_INCOMPLETE);
6772
6773 PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
6774 ceph_assert(ps->blocked_by.empty());
6775 ps->blocked_by.insert(prior_set.down.begin(), prior_set.down.end());
6776 pl->publish_stats_to_osd();
6777 }
6778
6779 boost::statechart::result PeeringState::Incomplete::react(const AdvMap &advmap) {
6780 DECLARE_LOCALS;
6781 int64_t poolnum = ps->info.pgid.pool();
6782
6783 // Reset if min_size became smaller than the previous value; the pg might now be able to go active
6784 if (!advmap.osdmap->have_pg_pool(poolnum) ||
6785 advmap.lastmap->get_pools().find(poolnum)->second.min_size >
6786 advmap.osdmap->get_pools().find(poolnum)->second.min_size) {
6787 post_event(advmap);
6788 return transit< Reset >();
6789 }
6790
6791 return forward_event();
6792 }
6793
6794 boost::statechart::result PeeringState::Incomplete::react(const MNotifyRec& notevt) {
6795 DECLARE_LOCALS;
6796 psdout(7) << "handle_pg_notify from osd." << notevt.from << dendl;
6797 if (ps->proc_replica_info(
6798 notevt.from, notevt.notify.info, notevt.notify.epoch_sent)) {
6799 // We got something new, try again!
6800 return transit< GetLog >();
6801 } else {
6802 return discard_event();
6803 }
6804 }
6805
6806 boost::statechart::result PeeringState::Incomplete::react(
6807 const QueryState& q)
6808 {
6809 q.f->open_object_section("state");
6810 q.f->dump_string("name", state_name);
6811 q.f->dump_stream("enter_time") << enter_time;
6812 q.f->dump_string("comment", "not enough complete instances of this PG");
6813 q.f->close_section();
6814 return forward_event();
6815 }
6816
6817 void PeeringState::Incomplete::exit()
6818 {
6819 context< PeeringMachine >().log_exit(state_name, enter_time);
6820
6821 DECLARE_LOCALS;
6822
6823 ps->state_clear(PG_STATE_INCOMPLETE);
6824 utime_t dur = ceph_clock_now() - enter_time;
6825 pl->get_peering_perf().tinc(rs_incomplete_latency, dur);
6826
6827 ps->blocked_by.clear();
6828 }
6829
6830 /*------GetMissing--------*/
6831 PeeringState::GetMissing::GetMissing(my_context ctx)
6832 : my_base(ctx),
6833 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering/GetMissing")
6834 {
6835 context< PeeringMachine >().log_enter(state_name);
6836
6837 DECLARE_LOCALS;
6838 ps->log_weirdness();
6839 ceph_assert(!ps->acting_recovery_backfill.empty());
6840 eversion_t since;
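// Walk every shard in the acting/recovery/backfill set and decide
// whether its missing set can be inferred locally or must be fetched
// with a LOG/FULLLOG query.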
6841 for (set<pg_shard_t>::iterator i = ps->acting_recovery_backfill.begin();
6842 i != ps->acting_recovery_backfill.end();
6843 ++i) {
6844 if (*i == ps->get_primary()) continue;
6845 const pg_info_t& pi = ps->peer_info[*i];
6846 // Reset this to make sure the pg_missing_t is initialized and
6847 // has the correct semantics even if we don't need to get a
6848 // missing set from a shard. This way later additions due to
6849 // lost+unfound deletes work properly.
6850 ps->peer_missing[*i].may_include_deletes = !ps->perform_deletes_during_peering();
6851
6852 if (pi.is_empty())
6853 continue; // no pg data, nothing divergent
6854
6855 if (pi.last_update < ps->pg_log.get_tail()) {
6856 psdout(10) << " osd." << *i << " is not contiguous, will restart backfill" << dendl;
6857 ps->peer_missing[*i].clear();
6858 continue;
6859 }
6860 if (pi.last_backfill == hobject_t()) {
6861 psdout(10) << " osd." << *i << " will fully backfill; can infer empty missing set" << dendl;
6862 ps->peer_missing[*i].clear();
6863 continue;
6864 }
6865
6866 if (pi.last_update == pi.last_complete && // peer has no missing
6867 pi.last_update == ps->info.last_update) { // peer is up to date
6868 // The replica has no missing objects and an identical log to ours;
6869 // there is nothing to pull.
6870 // FIXME: we can do better here. if last_update==last_complete we
6871 // can infer the rest!
6872 psdout(10) << " osd." << *i << " has no missing, identical log" << dendl;
6873 ps->peer_missing[*i].clear();
6874 continue;
6875 }
6876
6877 // We pull the log from the peer's last_epoch_started to ensure we
6878 // get enough log to detect divergent updates.
6879 since.epoch = pi.last_epoch_started;
6880 ceph_assert(pi.last_update >= ps->info.log_tail); // or else choose_acting() did a bad thing
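// 'since' has version 0, so pi.log_tail <= since means the peer's log
// already extends back to the start of last_epoch_started and an
// incremental log is enough; otherwise ask for the full log.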
6881 if (pi.log_tail <= since) {
6882 psdout(10) << " requesting log+missing since " << since << " from osd." << *i << dendl;
6883 context< PeeringMachine >().send_query(
6884 i->osd,
6885 pg_query_t(
6886 pg_query_t::LOG,
6887 i->shard, ps->pg_whoami.shard,
6888 since, ps->info.history,
6889 ps->get_osdmap_epoch()));
6890 } else {
6891 psdout(10) << " requesting fulllog+missing from osd." << *i
6892 << " (want since " << since << " < log.tail "
6893 << pi.log_tail << ")" << dendl;
6894 context< PeeringMachine >().send_query(
6895 i->osd, pg_query_t(
6896 pg_query_t::FULLLOG,
6897 i->shard, ps->pg_whoami.shard,
6898 ps->info.history, ps->get_osdmap_epoch()));
6899 }
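// Either way we are now waiting on this shard: record the outstanding
// request and mark the OSD as blocking peering.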
6900 peer_missing_requested.insert(*i);
6901 ps->blocked_by.insert(i->osd);
6902 }
6903
6904 if (peer_missing_requested.empty()) {
6905 if (ps->need_up_thru) {
6906 psdout(10) << " still need up_thru update before going active"
6907 << dendl;
6908 post_event(NeedUpThru());
6909 return;
6910 }
6911
6912 // all good!
6913 post_event(Activate(ps->get_osdmap_epoch()));
6914 } else {
6915 pl->publish_stats_to_osd();
6916 }
6917 }
6918
6919 boost::statechart::result PeeringState::GetMissing::react(const MLogRec& logevt)
6920 {
6921 DECLARE_LOCALS;
6922
6923 peer_missing_requested.erase(logevt.from);
6924 ps->proc_replica_log(logevt.msg->info, logevt.msg->log, logevt.msg->missing, logevt.from);
6925
6926 if (peer_missing_requested.empty()) {
6927 if (ps->need_up_thru) {
6928 psdout(10) << " still need up_thru update before going active"
6929 << dendl;
6930 post_event(NeedUpThru());
6931 } else {
6932 psdout(10) << "Got last missing, don't need any more; "
6933 << "posting Activate" << dendl;
6934 post_event(Activate(ps->get_osdmap_epoch()));
6935 }
6936 }
6937 return discard_event();
6938 }
6939
6940 boost::statechart::result PeeringState::GetMissing::react(const QueryState& q)
6941 {
6942 DECLARE_LOCALS;
6943 q.f->open_object_section("state");
6944 q.f->dump_string("name", state_name);
6945 q.f->dump_stream("enter_time") << enter_time;
6946
6947 q.f->open_array_section("peer_missing_requested");
6948 for (set<pg_shard_t>::iterator p = peer_missing_requested.begin();
6949 p != peer_missing_requested.end();
6950 ++p) {
6951 q.f->open_object_section("osd");
6952 q.f->dump_stream("osd") << *p;
6953 if (ps->peer_missing.count(*p)) {
6954 q.f->open_object_section("got_missing");
6955 ps->peer_missing[*p].dump(q.f);
6956 q.f->close_section();
6957 }
6958 q.f->close_section();
6959 }
6960 q.f->close_section();
6961
6962 q.f->close_section();
6963 return forward_event();
6964 }
6965
6966 void PeeringState::GetMissing::exit()
6967 {
6968 context< PeeringMachine >().log_exit(state_name, enter_time);
6969
6970 DECLARE_LOCALS;
6971 utime_t dur = ceph_clock_now() - enter_time;
6972 pl->get_peering_perf().tinc(rs_getmissing_latency, dur);
6973 ps->blocked_by.clear();
6974 }
6975
6976 /*------WaitUpThru--------*/
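// up_thru is the monitors' record of the most recent epoch through
// which this OSD may have served PGs. Peering cannot complete until the
// osdmap reflects an up_thru covering the current interval; otherwise a
// later peering attempt could wrongly conclude that this interval never
// went active.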
6977 PeeringState::WaitUpThru::WaitUpThru(my_context ctx)
6978 : my_base(ctx),
6979 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering/WaitUpThru")
6980 {
6981 context< PeeringMachine >().log_enter(state_name);
6982 }
6983
6984 boost::statechart::result PeeringState::WaitUpThru::react(const ActMap& am)
6985 {
6986 DECLARE_LOCALS;
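// ActMap signals a new osdmap; once it reflects our up_thru,
// need_up_thru is cleared and we can finally activate.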
6987 if (!ps->need_up_thru) {
6988 post_event(Activate(ps->get_osdmap_epoch()));
6989 }
6990 return forward_event();
6991 }
6992
6993 boost::statechart::result PeeringState::WaitUpThru::react(const MLogRec& logevt)
6994 {
6995 DECLARE_LOCALS;
6996 psdout(10) << "Noting missing from osd." << logevt.from << dendl;
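// A log+missing reply can still arrive while we wait on up_thru; stash
// the peer's missing set and info so they are ready once we activate.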
6997 ps->peer_missing[logevt.from].claim(logevt.msg->missing);
6998 ps->peer_info[logevt.from] = logevt.msg->info;
6999 return discard_event();
7000 }
7001
7002 boost::statechart::result PeeringState::WaitUpThru::react(const QueryState& q)
7003 {
7004 q.f->open_object_section("state");
7005 q.f->dump_string("name", state_name);
7006 q.f->dump_stream("enter_time") << enter_time;
7007 q.f->dump_string("comment", "waiting for osdmap to reflect a new up_thru for this osd");
7008 q.f->close_section();
7009 return forward_event();
7010 }
7011
7012 void PeeringState::WaitUpThru::exit()
7013 {
7014 context< PeeringMachine >().log_exit(state_name, enter_time);
7015 DECLARE_LOCALS;
7016 utime_t dur = ceph_clock_now() - enter_time;
7017 pl->get_peering_perf().tinc(rs_waitupthru_latency, dur);
7018 }
7019
7020 /*----PeeringState::PeeringMachine Methods-----*/
7021 #undef dout_prefix
7022 #define dout_prefix dpp->gen_prefix(*_dout)
7023
7024 void PeeringState::PeeringMachine::log_enter(const char *state_name)
7025 {
7026 DECLARE_LOCALS;
7027 psdout(5) << "enter " << state_name << dendl;
7028 pl->log_state_enter(state_name);
7029 }
7030
7031 void PeeringState::PeeringMachine::log_exit(const char *state_name, utime_t enter_time)
7032 {
7033 DECLARE_LOCALS;
7034 utime_t dur = ceph_clock_now() - enter_time;
7035 psdout(5) << "exit " << state_name << " " << dur << " " << event_count << " " << event_time << dendl;
7036 pl->log_state_exit(state_name, enter_time, event_count, event_time);
7037 event_count = 0;
7038 event_time = utime_t();
7039 }
7040
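// Compact one-line summary of the PG's peering state. Abbreviations:
// r=role, lpr=last_peering_reset, pi=past_intervals,
// luod=last_update_ondisk, lua=last_update_applied,
// crt=can_rollback_to, lcod=last_complete_ondisk,
// mlcod=min_last_complete_ondisk, pruub=prior_readable_until_ub.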
7041 ostream &operator<<(ostream &out, const PeeringState &ps) {
7042 out << "pg[" << ps.info
7043 << " " << pg_vector_string(ps.up);
7044 if (ps.acting != ps.up)
7045 out << "/" << pg_vector_string(ps.acting);
7046 if (ps.is_ec_pg())
7047 out << "p" << ps.get_primary();
7048 if (!ps.async_recovery_targets.empty())
7049 out << " async=[" << ps.async_recovery_targets << "]";
7050 if (!ps.backfill_targets.empty())
7051 out << " backfill=[" << ps.backfill_targets << "]";
7052 out << " r=" << ps.get_role();
7053 out << " lpr=" << ps.get_last_peering_reset();
7054
7055 if (ps.deleting)
7056 out << " DELETING";
7057
7058 if (!ps.past_intervals.empty()) {
7059 out << " pi=[" << ps.past_intervals.get_bounds()
7060 << ")/" << ps.past_intervals.size();
7061 }
7062
7063 if (ps.is_peered()) {
7064 if (ps.last_update_ondisk != ps.info.last_update)
7065 out << " luod=" << ps.last_update_ondisk;
7066 if (ps.last_update_applied != ps.info.last_update)
7067 out << " lua=" << ps.last_update_applied;
7068 }
7069
7070 if (ps.pg_log.get_tail() != ps.info.log_tail ||
7071 ps.pg_log.get_head() != ps.info.last_update)
7072 out << " (info mismatch, " << ps.pg_log.get_log() << ")";
7073
7074 if (!ps.pg_log.get_log().empty()) {
7075 if ((ps.pg_log.get_log().log.begin()->version <= ps.pg_log.get_tail())) {
7076 out << " (log bound mismatch, actual=["
7077 << ps.pg_log.get_log().log.begin()->version << ","
7078 << ps.pg_log.get_log().log.rbegin()->version << "]";
7079 out << ")";
7080 }
7081 }
7082
7083 out << " crt=" << ps.pg_log.get_can_rollback_to();
7084
7085 if (ps.last_complete_ondisk != ps.info.last_complete)
7086 out << " lcod " << ps.last_complete_ondisk;
7087
7088 out << " mlcod " << ps.min_last_complete_ondisk;
7089
7090 out << " " << pg_state_string(ps.get_state());
7091 if (ps.should_send_notify())
7092 out << " NOTIFY";
7093
7094 if (ps.prior_readable_until_ub != ceph::signedspan::zero()) {
7095 out << " pruub " << ps.prior_readable_until_ub
7096 << "@" << ps.get_prior_readable_down_osds();
7097 }
7098 return out;
7099 }