// ceph/src/osd/PeeringState.cc: Ceph 15.2.2 (octopus) source
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab

#include "PGPeeringEvent.h"
#include "common/ceph_releases.h"
#include "common/dout.h"
#include "PeeringState.h"

#include "messages/MOSDPGRemove.h"
#include "messages/MBackfillReserve.h"
#include "messages/MRecoveryReserve.h"
#include "messages/MOSDScrubReserve.h"
#include "messages/MOSDPGInfo.h"
#include "messages/MOSDPGInfo2.h"
#include "messages/MOSDPGTrim.h"
#include "messages/MOSDPGLog.h"
#include "messages/MOSDPGNotify.h"
#include "messages/MOSDPGNotify2.h"
#include "messages/MOSDPGQuery.h"
#include "messages/MOSDPGQuery2.h"
#include "messages/MOSDPGLease.h"
#include "messages/MOSDPGLeaseAck.h"

#define dout_context cct
#define dout_subsys ceph_subsys_osd

BufferedRecoveryMessages::BufferedRecoveryMessages(
  ceph_release_t r,
  PeeringCtx &ctx)
  : require_osd_release(r) {
  // steal messages from ctx
  message_map.swap(ctx.message_map);
}

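// Wire compatibility note: octopus and later peers understand the
// spg_t-addressed v2 messages (MOSDPGNotify2, MOSDPGQuery2, MOSDPGInfo2),
// while older peers get the legacy per-epoch container forms;
// require_osd_release selects the encoding in the senders below.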
void BufferedRecoveryMessages::send_notify(int to, const pg_notify_t &n)
{
  if (require_osd_release >= ceph_release_t::octopus) {
    spg_t pgid(n.info.pgid.pgid, n.to);
    send_osd_message(to, make_message<MOSDPGNotify2>(pgid, n));
  } else {
    send_osd_message(to, make_message<MOSDPGNotify>(n.epoch_sent, vector{n}));
  }
}

void BufferedRecoveryMessages::send_query(
  int to,
  spg_t to_spgid,
  const pg_query_t &q)
{
  if (require_osd_release >= ceph_release_t::octopus) {
    send_osd_message(to,
                     make_message<MOSDPGQuery2>(to_spgid, q));
  } else {
    auto m = make_message<MOSDPGQuery>(
      q.epoch_sent,
      MOSDPGQuery::pg_list_t{{to_spgid, q}});
    send_osd_message(to, m);
  }
}

void BufferedRecoveryMessages::send_info(
  int to,
  spg_t to_spgid,
  epoch_t min_epoch,
  epoch_t cur_epoch,
  const pg_info_t &info,
  std::optional<pg_lease_t> lease,
  std::optional<pg_lease_ack_t> lease_ack)
{
  if (require_osd_release >= ceph_release_t::octopus) {
    send_osd_message(
      to,
      make_message<MOSDPGInfo2>(
        to_spgid,
        info,
        cur_epoch,
        min_epoch,
        lease,
        lease_ack)
      );
  } else {
    send_osd_message(
      to,
      make_message<MOSDPGInfo>(
        cur_epoch,
        vector{pg_notify_t{to_spgid.shard,
                           info.pgid.shard,
                           min_epoch, cur_epoch,
                           info, PastIntervals{}}})
      );
  }
}

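// Refresh the cached pool metadata from a new osdmap. The snap context is
// rebuilt only when we may have skipped an epoch (cached_epoch is stale) or
// when the pool's snaps changed in this very epoch; otherwise the cached
// snapc is still valid.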
void PGPool::update(CephContext *cct, OSDMapRef map)
{
  const pg_pool_t *pi = map->get_pg_pool(id);
  if (!pi) {
    return; // pool has been deleted
  }
  info = *pi;
  name = map->get_pool_name(id);

  bool updated = false;
  if ((map->get_epoch() != cached_epoch + 1) ||
      (pi->get_snap_epoch() == map->get_epoch())) {
    updated = true;
  }

  if (info.is_pool_snaps_mode() && updated) {
    snapc = pi->get_snap_context();
  }
  cached_epoch = map->get_epoch();
}

/*-------------Peering State Helpers----------------*/
#undef dout_prefix
#define dout_prefix (dpp->gen_prefix(*_dout))
#undef psdout
#define psdout(x) ldout(cct, x)

PeeringState::PeeringState(
  CephContext *cct,
  pg_shard_t pg_whoami,
  spg_t spgid,
  const PGPool &_pool,
  OSDMapRef curmap,
  DoutPrefixProvider *dpp,
  PeeringListener *pl)
  : state_history(*pl),
    cct(cct),
    spgid(spgid),
    dpp(dpp),
    pl(pl),
    orig_ctx(0),
    osdmap_ref(curmap),
    pool(_pool),
    pg_whoami(pg_whoami),
    info(spgid),
    pg_log(cct),
    missing_loc(spgid, this, dpp, cct),
    machine(this, cct, spgid, dpp, pl, &state_history)
{
  machine.initiate();
}

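// Event-handling context lifecycle: every peering event is bracketed by
// start_handle()/end_handle(). While a flush is pending, outgoing messages
// are diverted into messages_pending_flush (begin_block_outgoing) and later
// either merged back into the caller's PeeringCtx (end_block_outgoing) or
// dropped (clear_blocked_outgoing). A rough sketch of a caller, under the
// assumption that handle_event() wraps these calls:
//
//   state.start_handle(&rctx);
//   machine.process_event(evt);   // may begin/end blocking internally
//   state.end_handle();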
void PeeringState::start_handle(PeeringCtx *new_ctx) {
  ceph_assert(!rctx);
  ceph_assert(!orig_ctx);
  orig_ctx = new_ctx;
  if (new_ctx) {
    if (messages_pending_flush) {
      rctx.emplace(*messages_pending_flush, *new_ctx);
    } else {
      rctx.emplace(*new_ctx);
    }
    rctx->start_time = ceph_clock_now();
  }
}

void PeeringState::begin_block_outgoing() {
  ceph_assert(!messages_pending_flush);
  ceph_assert(orig_ctx);
  ceph_assert(rctx);
  messages_pending_flush = BufferedRecoveryMessages(
    orig_ctx->require_osd_release);
  rctx.emplace(*messages_pending_flush, *orig_ctx);
}

void PeeringState::clear_blocked_outgoing() {
  ceph_assert(orig_ctx);
  ceph_assert(rctx);
  messages_pending_flush = std::optional<BufferedRecoveryMessages>();
}

void PeeringState::end_block_outgoing() {
  ceph_assert(messages_pending_flush);
  ceph_assert(orig_ctx);
  ceph_assert(rctx);

  orig_ctx->accept_buffered_messages(*messages_pending_flush);
  rctx.emplace(*orig_ctx);
  messages_pending_flush = std::optional<BufferedRecoveryMessages>();
}

void PeeringState::end_handle() {
  if (rctx) {
    utime_t dur = ceph_clock_now() - rctx->start_time;
    machine.event_time += dur;
  }

  machine.event_count++;
  rctx = std::nullopt;
  orig_ctx = NULL;
}

void PeeringState::check_recovery_sources(const OSDMapRef& osdmap)
{
  /*
   * check that any peers we are planning to (or currently) pull
   * objects from have been dealt with.
   */
  missing_loc.check_recovery_sources(osdmap);
  pl->check_recovery_sources(osdmap);

  for (set<pg_shard_t>::iterator i = peer_log_requested.begin();
       i != peer_log_requested.end();
       ) {
    if (!osdmap->is_up(i->osd)) {
      psdout(10) << "peer_log_requested removing " << *i << dendl;
      peer_log_requested.erase(i++);
    } else {
      ++i;
    }
  }

  for (set<pg_shard_t>::iterator i = peer_missing_requested.begin();
       i != peer_missing_requested.end();
       ) {
    if (!osdmap->is_up(i->osd)) {
      psdout(10) << "peer_missing_requested removing " << *i << dendl;
      peer_missing_requested.erase(i++);
    } else {
      ++i;
    }
  }
}

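// Merge a peer's pg_history_t into ours. If the merge advances anything, the
// info is marked dirty; and once last_epoch_clean has caught up to the
// current interval, past_intervals are no longer needed and are dropped.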
void PeeringState::update_history(const pg_history_t& new_history)
{
  auto mnow = pl->get_mnow();
  info.history.refresh_prior_readable_until_ub(mnow, prior_readable_until_ub);
  if (info.history.merge(new_history)) {
    psdout(20) << __func__ << " advanced history from " << new_history << dendl;
    dirty_info = true;
    if (info.history.last_epoch_clean >= info.history.same_interval_since) {
      psdout(20) << __func__ << " clearing past_intervals" << dendl;
      past_intervals.clear();
      dirty_big_info = true;
    }
    prior_readable_until_ub = info.history.get_prior_readable_until_ub(mnow);
    if (prior_readable_until_ub != ceph::signedspan::zero()) {
      dout(20) << __func__
               << " prior_readable_until_ub " << prior_readable_until_ub
               << " (mnow " << mnow << " + "
               << info.history.prior_readable_until_ub << ")" << dendl;
    }
  }
  pl->on_info_history_change();
}

void PeeringState::purge_strays()
{
  if (is_premerge()) {
    psdout(10) << "purge_strays " << stray_set << " but premerge, doing nothing"
               << dendl;
    return;
  }
  if (cct->_conf.get_val<bool>("osd_debug_no_purge_strays")) {
    return;
  }
  psdout(10) << "purge_strays " << stray_set << dendl;

  bool removed = false;
  for (set<pg_shard_t>::iterator p = stray_set.begin();
       p != stray_set.end();
       ++p) {
    ceph_assert(!is_acting_recovery_backfill(*p));
    if (get_osdmap()->is_up(p->osd)) {
      psdout(10) << "sending PGRemove to osd." << *p << dendl;
      vector<spg_t> to_remove;
      to_remove.push_back(spg_t(info.pgid.pgid, p->shard));
      MOSDPGRemove *m = new MOSDPGRemove(
        get_osdmap_epoch(),
        to_remove);
      pl->send_cluster_message(p->osd, m, get_osdmap_epoch());
    } else {
      psdout(10) << "not sending PGRemove to down osd." << *p << dendl;
    }
    peer_missing.erase(*p);
    peer_info.erase(*p);
    missing_loc.remove_stray_recovery_sources(*p);
    peer_purged.insert(*p);
    removed = true;
  }

  // if we removed anyone, update peers (which include peer_info)
  if (removed)
    update_heartbeat_peers();

  stray_set.clear();

  // clear _requested maps; we may have to peer() again if we discover
  // (more) stray content
  peer_log_requested.clear();
  peer_missing_requested.clear();
}

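// Process a pg_info_t received from a replica. Returns false (and ignores
// the info) if it duplicates what we already have or if the sender has not
// stayed up since send_epoch; otherwise records it, merges its history, and
// flags the sender as a stray if it is in neither up nor acting.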
bool PeeringState::proc_replica_info(
  pg_shard_t from, const pg_info_t &oinfo, epoch_t send_epoch)
{
  map<pg_shard_t, pg_info_t>::iterator p = peer_info.find(from);
  if (p != peer_info.end() && p->second.last_update == oinfo.last_update) {
    psdout(10) << " got dup osd." << from << " info "
               << oinfo << ", identical to ours" << dendl;
    return false;
  }

  if (!get_osdmap()->has_been_up_since(from.osd, send_epoch)) {
    psdout(10) << " got info " << oinfo << " from down osd." << from
               << " discarding" << dendl;
    return false;
  }

  psdout(10) << " got osd." << from << " " << oinfo << dendl;
  ceph_assert(is_primary());
  peer_info[from] = oinfo;
  might_have_unfound.insert(from);

  update_history(oinfo.history);

  // stray?
  if (!is_up(from) && !is_acting(from)) {
    psdout(10) << " osd." << from << " has stray content: " << oinfo << dendl;
    stray_set.insert(from);
    if (is_clean()) {
      purge_strays();
    }
  }

  // was this a new info?  if so, update peers!
  if (p == peer_info.end())
    update_heartbeat_peers();

  return true;
}


void PeeringState::remove_down_peer_info(const OSDMapRef &osdmap)
{
  // Remove any downed osds from peer_info
  bool removed = false;
  map<pg_shard_t, pg_info_t>::iterator p = peer_info.begin();
  while (p != peer_info.end()) {
    if (!osdmap->is_up(p->first.osd)) {
      psdout(10) << " dropping down osd." << p->first << " info " << p->second << dendl;
      peer_missing.erase(p->first);
      peer_log_requested.erase(p->first);
      peer_missing_requested.erase(p->first);
      peer_purged.erase(p->first);
      peer_info.erase(p++);
      removed = true;
    } else
      ++p;
  }

  // if we removed anyone, update peers (which include peer_info)
  if (removed)
    update_heartbeat_peers();

  check_recovery_sources(osdmap);
}

void PeeringState::update_heartbeat_peers()
{
  if (!is_primary())
    return;

  set<int> new_peers;
  for (unsigned i=0; i<acting.size(); i++) {
    if (acting[i] != CRUSH_ITEM_NONE)
      new_peers.insert(acting[i]);
  }
  for (unsigned i=0; i<up.size(); i++) {
    if (up[i] != CRUSH_ITEM_NONE)
      new_peers.insert(up[i]);
  }
  for (map<pg_shard_t,pg_info_t>::iterator p = peer_info.begin();
       p != peer_info.end();
       ++p) {
    new_peers.insert(p->first.osd);
  }
  pl->update_heartbeat_peers(std::move(new_peers));
}

void PeeringState::write_if_dirty(ObjectStore::Transaction& t)
{
  pl->prepare_write(
    info,
    last_written_info,
    past_intervals,
    pg_log,
    dirty_info,
    dirty_big_info,
    last_persisted_osdmap < get_osdmap_epoch(),
    t);
  if (dirty_info || dirty_big_info) {
    last_persisted_osdmap = get_osdmap_epoch();
    last_written_info = info;
    dirty_info = false;
    dirty_big_info = false;
  }
}

void PeeringState::advance_map(
  OSDMapRef osdmap, OSDMapRef lastmap,
  vector<int>& newup, int up_primary,
  vector<int>& newacting, int acting_primary,
  PeeringCtx &rctx)
{
  ceph_assert(lastmap == osdmap_ref);
  psdout(10) << "handle_advance_map "
             << newup << "/" << newacting
             << " -- " << up_primary << "/" << acting_primary
             << dendl;

  update_osdmap_ref(osdmap);
  pool.update(cct, osdmap);

  AdvMap evt(
    osdmap, lastmap, newup, up_primary,
    newacting, acting_primary);
  handle_event(evt, &rctx);
  if (pool.info.last_change == osdmap_ref->get_epoch()) {
    pl->on_pool_change();
  }
  readable_interval = pool.get_readable_interval();
  last_require_osd_release = osdmap->require_osd_release;
}

void PeeringState::activate_map(PeeringCtx &rctx)
{
  psdout(10) << __func__ << dendl;
  ActMap evt;
  handle_event(evt, &rctx);
  if (osdmap_ref->get_epoch() - last_persisted_osdmap >
      cct->_conf->osd_pg_epoch_persisted_max_stale) {
    psdout(20) << __func__ << ": Dirtying info: last_persisted is "
               << last_persisted_osdmap
               << " while current is " << osdmap_ref->get_epoch() << dendl;
    dirty_info = true;
  } else {
    psdout(20) << __func__ << ": Not dirtying info: last_persisted is "
               << last_persisted_osdmap
               << " while current is " << osdmap_ref->get_epoch() << dendl;
  }
  write_if_dirty(rctx.transaction);

  if (get_osdmap()->check_new_blacklist_entries()) {
    pl->check_blacklisted_watchers();
  }
}

void PeeringState::set_last_peering_reset()
{
  psdout(20) << "set_last_peering_reset " << get_osdmap_epoch() << dendl;
  if (last_peering_reset != get_osdmap_epoch()) {
    last_peering_reset = get_osdmap_epoch();
    psdout(10) << "Clearing blocked outgoing recovery messages" << dendl;
    clear_blocked_outgoing();
    if (!pl->try_flush_or_schedule_async()) {
      psdout(10) << "Beginning to block outgoing recovery messages" << dendl;
      begin_block_outgoing();
    } else {
      psdout(10) << "Not blocking outgoing recovery messages" << dendl;
    }
  }
}

void PeeringState::complete_flush()
{
  flushes_in_progress--;
  if (flushes_in_progress == 0) {
    pl->on_flushed();
  }
}

void PeeringState::check_full_transition(OSDMapRef lastmap, OSDMapRef osdmap)
{
  const pg_pool_t *pi = osdmap->get_pg_pool(info.pgid.pool());
  if (!pi) {
    return; // pool deleted
  }
  bool changed = false;
  if (pi->has_flag(pg_pool_t::FLAG_FULL)) {
    const pg_pool_t *opi = lastmap->get_pg_pool(info.pgid.pool());
    if (!opi || !opi->has_flag(pg_pool_t::FLAG_FULL)) {
      psdout(10) << " pool was marked full in " << osdmap->get_epoch() << dendl;
      changed = true;
    }
  }
  if (changed) {
    info.history.last_epoch_marked_full = osdmap->get_epoch();
    dirty_info = true;
  }
}

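// Peering must restart whenever the map transition constitutes a new
// interval (an up/acting/primary change, as judged by
// PastIntervals::is_new_interval) or when this OSD itself just came back up.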
bool PeeringState::should_restart_peering(
  int newupprimary,
  int newactingprimary,
  const vector<int>& newup,
  const vector<int>& newacting,
  OSDMapRef lastmap,
  OSDMapRef osdmap)
{
  if (PastIntervals::is_new_interval(
        primary.osd,
        newactingprimary,
        acting,
        newacting,
        up_primary.osd,
        newupprimary,
        up,
        newup,
        osdmap.get(),
        lastmap.get(),
        info.pgid.pgid)) {
    psdout(20) << "new interval newup " << newup
               << " newacting " << newacting << dendl;
    return true;
  }
  if (!lastmap->is_up(pg_whoami.osd) && osdmap->is_up(pg_whoami.osd)) {
    psdout(10) << __func__ << " osd transitioned from down -> up"
               << dendl;
    return true;
  }
  return false;
}

/* Called before initializing peering during advance_map */
void PeeringState::start_peering_interval(
  const OSDMapRef lastmap,
  const vector<int>& newup, int new_up_primary,
  const vector<int>& newacting, int new_acting_primary,
  ObjectStore::Transaction &t)
{
  const OSDMapRef osdmap = get_osdmap();

  set_last_peering_reset();

  vector<int> oldacting, oldup;
  int oldrole = get_role();

  if (is_primary()) {
    pl->clear_ready_to_merge();
  }


  pg_shard_t old_acting_primary = get_primary();
  pg_shard_t old_up_primary = up_primary;
  bool was_old_primary = is_primary();
  bool was_old_nonprimary = is_nonprimary();

  acting.swap(oldacting);
  up.swap(oldup);
  init_primary_up_acting(
    newup,
    newacting,
    new_up_primary,
    new_acting_primary);

  if (info.stats.up != up ||
      info.stats.acting != acting ||
      info.stats.up_primary != new_up_primary ||
      info.stats.acting_primary != new_acting_primary) {
    info.stats.up = up;
    info.stats.up_primary = new_up_primary;
    info.stats.acting = acting;
    info.stats.acting_primary = new_acting_primary;
    info.stats.mapping_epoch = osdmap->get_epoch();
  }

  pl->clear_publish_stats();

  // This will now be remapped during a backfill in cases
  // where it would not have been before.
  if (up != acting)
    state_set(PG_STATE_REMAPPED);
  else
    state_clear(PG_STATE_REMAPPED);

  int role = osdmap->calc_pg_role(pg_whoami, acting);
  set_role(role);

  // did acting, up, primary|acker change?
  if (!lastmap) {
    psdout(10) << " no lastmap" << dendl;
    dirty_info = true;
    dirty_big_info = true;
    info.history.same_interval_since = osdmap->get_epoch();
  } else {
    std::stringstream debug;
    ceph_assert(info.history.same_interval_since != 0);
    bool new_interval = PastIntervals::check_new_interval(
      old_acting_primary.osd,
      new_acting_primary,
      oldacting, newacting,
      old_up_primary.osd,
      new_up_primary,
      oldup, newup,
      info.history.same_interval_since,
      info.history.last_epoch_clean,
      osdmap.get(),
      lastmap.get(),
      info.pgid.pgid,
      missing_loc.get_recoverable_predicate(),
      &past_intervals,
      &debug);
    psdout(10) << __func__ << ": check_new_interval output: "
               << debug.str() << dendl;
    if (new_interval) {
      if (osdmap->get_epoch() == pl->oldest_stored_osdmap() &&
          info.history.last_epoch_clean < osdmap->get_epoch()) {
        psdout(10) << " map gap, clearing past_intervals and faking" << dendl;
        // our information is incomplete and useless; someone else was clean
        // after everything we know, if osdmaps were trimmed.
        past_intervals.clear();
      } else {
        psdout(10) << " noting past " << past_intervals << dendl;
      }
      dirty_info = true;
      dirty_big_info = true;
      info.history.same_interval_since = osdmap->get_epoch();
      if (osdmap->have_pg_pool(info.pgid.pgid.pool()) &&
          info.pgid.pgid.is_split(lastmap->get_pg_num(info.pgid.pgid.pool()),
                                  osdmap->get_pg_num(info.pgid.pgid.pool()),
                                  nullptr)) {
        info.history.last_epoch_split = osdmap->get_epoch();
      }
    }
  }

  if (old_up_primary != up_primary ||
      oldup != up) {
    info.history.same_up_since = osdmap->get_epoch();
  }
  // this comparison includes primary rank via pg_shard_t
  if (old_acting_primary != get_primary()) {
    info.history.same_primary_since = osdmap->get_epoch();
  }

  on_new_interval();
  pl->on_info_history_change();

  psdout(1) << __func__ << " up " << oldup << " -> " << up
            << ", acting " << oldacting << " -> " << acting
            << ", acting_primary " << old_acting_primary << " -> "
            << new_acting_primary
            << ", up_primary " << old_up_primary << " -> " << new_up_primary
            << ", role " << oldrole << " -> " << role
            << ", features acting " << acting_features
            << " upacting " << upacting_features
            << dendl;

  // deactivate.
  state_clear(PG_STATE_ACTIVE);
  state_clear(PG_STATE_PEERED);
  state_clear(PG_STATE_PREMERGE);
  state_clear(PG_STATE_DOWN);
  state_clear(PG_STATE_RECOVERY_WAIT);
  state_clear(PG_STATE_RECOVERY_TOOFULL);
  state_clear(PG_STATE_RECOVERING);

  peer_purged.clear();
  acting_recovery_backfill.clear();

  // reset primary/replica state?
  if (was_old_primary || is_primary()) {
    pl->clear_want_pg_temp();
  } else if (was_old_nonprimary || is_nonprimary()) {
    pl->clear_want_pg_temp();
  }
  clear_primary_state();

  pl->on_change(t);

  ceph_assert(!deleting);

  // should we tell the primary we are here?
  send_notify = !is_primary();

  if (role != oldrole ||
      was_old_primary != is_primary()) {
    // did primary change?
    if (was_old_primary != is_primary()) {
      state_clear(PG_STATE_CLEAN);
    }

    pl->on_role_change();
  } else {
    // no role change.
    // did primary change?
    if (get_primary() != old_acting_primary) {
      psdout(10) << oldacting << " -> " << acting
                 << ", acting primary "
                 << old_acting_primary << " -> " << get_primary()
                 << dendl;
    } else {
      // primary is the same.
      if (is_primary()) {
        // i am (still) primary. but my replica set changed.
        state_clear(PG_STATE_CLEAN);

        psdout(10) << oldacting << " -> " << acting
                   << ", replicas changed" << dendl;
      }
    }
  }

  if (acting.empty() && !up.empty() && up_primary == pg_whoami) {
    psdout(10) << " acting empty, but i am up[0], clearing pg_temp" << dendl;
    pl->queue_want_pg_temp(acting);
  }
}

void PeeringState::on_new_interval()
{
  dout(20) << __func__ << dendl;
  const OSDMapRef osdmap = get_osdmap();

  // initialize features
  acting_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
  upacting_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
  for (vector<int>::iterator p = acting.begin(); p != acting.end(); ++p) {
    if (*p == CRUSH_ITEM_NONE)
      continue;
    uint64_t f = osdmap->get_xinfo(*p).features;
    acting_features &= f;
    upacting_features &= f;
  }
  for (vector<int>::iterator p = up.begin(); p != up.end(); ++p) {
    if (*p == CRUSH_ITEM_NONE)
      continue;
    upacting_features &= osdmap->get_xinfo(*p).features;
  }
  psdout(20) << __func__ << " upacting_features 0x" << std::hex
             << upacting_features << std::dec
             << " from " << acting << "+" << up << dendl;

  psdout(20) << __func__ << " checking missing set deletes flag. missing = "
             << get_pg_log().get_missing() << dendl;

  if (!pg_log.get_missing().may_include_deletes &&
      !perform_deletes_during_peering()) {
    pl->rebuild_missing_set_with_deletes(pg_log);
  }
  ceph_assert(
    pg_log.get_missing().may_include_deletes ==
    !perform_deletes_during_peering());

  init_hb_stamps();

  // update lease bounds for a new interval
  auto mnow = pl->get_mnow();
  prior_readable_until_ub = std::max(prior_readable_until_ub,
                                     readable_until_ub);
  prior_readable_until_ub = info.history.refresh_prior_readable_until_ub(
    mnow, prior_readable_until_ub);
  psdout(10) << __func__ << " prior_readable_until_ub "
             << prior_readable_until_ub << " (mnow " << mnow << " + "
             << info.history.prior_readable_until_ub << ")" << dendl;
  prior_readable_down_osds.clear(); // we populate this when we build the priorset

  readable_until =
    readable_until_ub =
    readable_until_ub_sent =
    readable_until_ub_from_primary = ceph::signedspan::zero();

  acting_readable_until_ub.clear();
  if (is_primary()) {
    acting_readable_until_ub.resize(acting.size(), ceph::signedspan::zero());
  }

  pl->on_new_interval();
}

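// Rebuild actingset/upset and the primary shards from the raw OSD vectors.
// For erasure-coded pools the position in the vector is the shard id, so
// e.g. acting = [3, CRUSH_ITEM_NONE, 7] yields actingset = {3(0), 7(2)};
// replicated pools always use NO_SHARD.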
void PeeringState::init_primary_up_acting(
  const vector<int> &newup,
  const vector<int> &newacting,
  int new_up_primary,
  int new_acting_primary)
{
  actingset.clear();
  acting = newacting;
  for (uint8_t i = 0; i < acting.size(); ++i) {
    if (acting[i] != CRUSH_ITEM_NONE)
      actingset.insert(
        pg_shard_t(
          acting[i],
          pool.info.is_erasure() ? shard_id_t(i) : shard_id_t::NO_SHARD));
  }
  upset.clear();
  up = newup;
  for (uint8_t i = 0; i < up.size(); ++i) {
    if (up[i] != CRUSH_ITEM_NONE)
      upset.insert(
        pg_shard_t(
          up[i],
          pool.info.is_erasure() ? shard_id_t(i) : shard_id_t::NO_SHARD));
  }
  if (!pool.info.is_erasure()) {
    // replicated
    up_primary = pg_shard_t(new_up_primary, shard_id_t::NO_SHARD);
    primary = pg_shard_t(new_acting_primary, shard_id_t::NO_SHARD);
  } else {
    // erasure
    up_primary = pg_shard_t();
    primary = pg_shard_t();
    for (uint8_t i = 0; i < up.size(); ++i) {
      if (up[i] == new_up_primary) {
        up_primary = pg_shard_t(up[i], shard_id_t(i));
        break;
      }
    }
    for (uint8_t i = 0; i < acting.size(); ++i) {
      if (acting[i] == new_acting_primary) {
        primary = pg_shard_t(acting[i], shard_id_t(i));
        break;
      }
    }
    ceph_assert(up_primary.osd == new_up_primary);
    ceph_assert(primary.osd == new_acting_primary);
  }
}

void PeeringState::init_hb_stamps()
{
  if (is_primary()) {
    // we care about all other osds in the acting set
    hb_stamps.resize(acting.size() - 1);
    unsigned i = 0;
    for (auto p : acting) {
      if (p == CRUSH_ITEM_NONE || p == get_primary().osd) {
        continue;
      }
      hb_stamps[i++] = pl->get_hb_stamps(p);
    }
    hb_stamps.resize(i);
  } else if (is_nonprimary()) {
    // we care about just the primary
    hb_stamps.resize(1);
    hb_stamps[0] = pl->get_hb_stamps(get_primary().osd);
  } else {
    hb_stamps.clear();
  }
  dout(10) << __func__ << " now " << hb_stamps << dendl;
}


void PeeringState::clear_recovery_state()
{
  async_recovery_targets.clear();
  backfill_targets.clear();
}

void PeeringState::clear_primary_state()
{
  psdout(10) << "clear_primary_state" << dendl;

  // clear peering state
  stray_set.clear();
  peer_log_requested.clear();
  peer_missing_requested.clear();
  peer_info.clear();
  peer_bytes.clear();
  peer_missing.clear();
  peer_last_complete_ondisk.clear();
  peer_activated.clear();
  min_last_complete_ondisk = eversion_t();
  pg_trim_to = eversion_t();
  might_have_unfound.clear();
  need_up_thru = false;
  missing_loc.clear();
  pg_log.reset_recovery_pointers();

  clear_recovery_state();

  last_update_ondisk = eversion_t();
  missing_loc.clear();
  pl->clear_primary_state();
}

/// return [start,end) bounds for required past_intervals
static pair<epoch_t, epoch_t> get_required_past_interval_bounds(
  const pg_info_t &info,
  epoch_t oldest_map) {
  epoch_t start = std::max(
    info.history.last_epoch_clean ? info.history.last_epoch_clean :
    info.history.epoch_pool_created,
    oldest_map);
  epoch_t end = std::max(
    info.history.same_interval_since,
    info.history.epoch_pool_created);
  return make_pair(start, end);
}

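// Sanity-check that the stored past_intervals cover exactly the required
// [start, end) range computed above; an inconsistency here means the PG
// metadata is corrupt, so we log to the cluster log and assert/abort rather
// than limp along.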
void PeeringState::check_past_interval_bounds() const
{
  auto oldest_epoch = pl->oldest_stored_osdmap();
  auto rpib = get_required_past_interval_bounds(
    info,
    oldest_epoch);
  if (rpib.first >= rpib.second) {
    // do not warn if the start bound is dictated by oldest_map; the
    // past intervals are presumably appropriate given the pg info.
    if (!past_intervals.empty() &&
        rpib.first > oldest_epoch) {
      pl->get_clog_error() << info.pgid << " required past_interval bounds are"
                           << " empty [" << rpib << ") but past_intervals is not: "
                           << past_intervals;
      derr << info.pgid << " required past_interval bounds are"
           << " empty [" << rpib << ") but past_intervals is not: "
           << past_intervals << dendl;
    }
  } else {
    if (past_intervals.empty()) {
      pl->get_clog_error() << info.pgid << " required past_interval bounds are"
                           << " not empty [" << rpib << ") but past_intervals "
                           << past_intervals << " is empty";
      derr << info.pgid << " required past_interval bounds are"
           << " not empty [" << rpib << ") but past_intervals "
           << past_intervals << " is empty" << dendl;
      ceph_assert(!past_intervals.empty());
    }

    auto apib = past_intervals.get_bounds();
    if (apib.first > rpib.first) {
      pl->get_clog_error() << info.pgid << " past_intervals [" << apib
                           << ") start interval does not contain the required"
                           << " bound [" << rpib << ") start";
      derr << info.pgid << " past_intervals [" << apib
           << ") start interval does not contain the required"
           << " bound [" << rpib << ") start" << dendl;
      ceph_abort_msg("past_interval start interval mismatch");
    }
    if (apib.second != rpib.second) {
      pl->get_clog_error() << info.pgid << " past_interval bound [" << apib
                           << ") end does not match required [" << rpib
                           << ") end";
      derr << info.pgid << " past_interval bound [" << apib
           << ") end does not match required [" << rpib
           << ") end" << dendl;
      ceph_abort_msg("past_interval end mismatch");
    }
  }
}

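// Worked example (illustrative; assumes the usual Ceph constants
// OSD_POOL_PRIORITY_MIN = -10, OSD_POOL_PRIORITY_MAX = 10,
// OSD_RECOVERY_PRIORITY_MIN = 0):
//   clamp_recovery_priority(180, 25, 210)
//     pool priority 25 -> clamped to 10 -> shifted by +10 -> 20
//     180 + 20 = 200, already within [0, 210] -> returns 200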
int PeeringState::clamp_recovery_priority(int priority, int pool_recovery_priority, int max)
{
  static_assert(OSD_RECOVERY_PRIORITY_MIN < OSD_RECOVERY_PRIORITY_MAX, "Invalid priority range");
  static_assert(OSD_RECOVERY_PRIORITY_MIN >= 0, "Priority range must match unsigned type");

  ceph_assert(max <= OSD_RECOVERY_PRIORITY_MAX);

  // User can't set this too high anymore, but might be a legacy value
  if (pool_recovery_priority > OSD_POOL_PRIORITY_MAX)
    pool_recovery_priority = OSD_POOL_PRIORITY_MAX;
  if (pool_recovery_priority < OSD_POOL_PRIORITY_MIN)
    pool_recovery_priority = OSD_POOL_PRIORITY_MIN;
  // Shift range from min to max to 0 to max - min
  pool_recovery_priority += (0 - OSD_POOL_PRIORITY_MIN);
  ceph_assert(pool_recovery_priority >= 0 && pool_recovery_priority <= (OSD_POOL_PRIORITY_MAX - OSD_POOL_PRIORITY_MIN));

  priority += pool_recovery_priority;

  // Clamp to valid range
  if (priority > max) {
    return max;
  } else if (priority < OSD_RECOVERY_PRIORITY_MIN) {
    return OSD_RECOVERY_PRIORITY_MIN;
  } else {
    return priority;
  }
}

unsigned PeeringState::get_recovery_priority()
{
  // a higher value -> a higher priority
  int ret = OSD_RECOVERY_PRIORITY_BASE;
  int base = ret;

  if (state & PG_STATE_FORCED_RECOVERY) {
    ret = OSD_RECOVERY_PRIORITY_FORCED;
  } else {
    // XXX: This priority boost isn't so much about inactive, but about data-at-risk
    if (is_degraded() && info.stats.avail_no_missing.size() < pool.info.min_size) {
      base = OSD_RECOVERY_INACTIVE_PRIORITY_BASE;
      // inactive: no. of replicas < min_size, highest priority since it blocks IO
      ret = base + (pool.info.min_size - info.stats.avail_no_missing.size());
    }

    int64_t pool_recovery_priority = 0;
    pool.info.opts.get(pool_opts_t::RECOVERY_PRIORITY, &pool_recovery_priority);

    ret = clamp_recovery_priority(ret, pool_recovery_priority, max_prio_map[base]);
  }
  psdout(20) << __func__ << " recovery priority is " << ret << dendl;
  return static_cast<unsigned>(ret);
}

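// Backfill priority tiers, highest first: forced > inactive (acting below
// min_size, which blocks IO) > undersized/degraded > baseline, each adjusted
// by the pool's RECOVERY_PRIORITY option via clamp_recovery_priority().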
unsigned PeeringState::get_backfill_priority()
{
  // a higher value -> a higher priority
  int ret = OSD_BACKFILL_PRIORITY_BASE;
  int base = ret;

  if (state & PG_STATE_FORCED_BACKFILL) {
    ret = OSD_BACKFILL_PRIORITY_FORCED;
  } else {
    if (acting.size() < pool.info.min_size) {
      base = OSD_BACKFILL_INACTIVE_PRIORITY_BASE;
      // inactive: no. of replicas < min_size, highest priority since it blocks IO
      ret = base + (pool.info.min_size - acting.size());

    } else if (is_undersized()) {
      // undersized: OSD_BACKFILL_DEGRADED_PRIORITY_BASE + num missing replicas
      ceph_assert(pool.info.size > actingset.size());
      base = OSD_BACKFILL_DEGRADED_PRIORITY_BASE;
      ret = base + (pool.info.size - actingset.size());

    } else if (is_degraded()) {
      // degraded: baseline degraded
      base = ret = OSD_BACKFILL_DEGRADED_PRIORITY_BASE;
    }

    // Adjust with pool's recovery priority
    int64_t pool_recovery_priority = 0;
    pool.info.opts.get(pool_opts_t::RECOVERY_PRIORITY, &pool_recovery_priority);

    ret = clamp_recovery_priority(ret, pool_recovery_priority, max_prio_map[base]);
  }

  psdout(20) << __func__ << " backfill priority is " << ret << dendl;
  return static_cast<unsigned>(ret);
}

unsigned PeeringState::get_delete_priority()
{
  auto state = get_osdmap()->get_state(pg_whoami.osd);
  if (state & (CEPH_OSD_BACKFILLFULL |
               CEPH_OSD_FULL)) {
    return OSD_DELETE_PRIORITY_FULL;
  } else if (state & CEPH_OSD_NEARFULL) {
    return OSD_DELETE_PRIORITY_FULLISH;
  } else {
    return OSD_DELETE_PRIORITY_NORMAL;
  }
}

bool PeeringState::set_force_recovery(bool b)
{
  bool did = false;
  if (b) {
    if (!(state & PG_STATE_FORCED_RECOVERY) &&
        (state & (PG_STATE_DEGRADED |
                  PG_STATE_RECOVERY_WAIT |
                  PG_STATE_RECOVERING))) {
      psdout(20) << __func__ << " set" << dendl;
      state_set(PG_STATE_FORCED_RECOVERY);
      pl->publish_stats_to_osd();
      did = true;
    }
  } else if (state & PG_STATE_FORCED_RECOVERY) {
    psdout(20) << __func__ << " clear" << dendl;
    state_clear(PG_STATE_FORCED_RECOVERY);
    pl->publish_stats_to_osd();
    did = true;
  }
  if (did) {
    psdout(20) << __func__ << " state " << get_current_state()
               << dendl;
    pl->update_local_background_io_priority(get_recovery_priority());
  }
  return did;
}

bool PeeringState::set_force_backfill(bool b)
{
  bool did = false;
  if (b) {
    if (!(state & PG_STATE_FORCED_BACKFILL) &&
        (state & (PG_STATE_DEGRADED |
                  PG_STATE_BACKFILL_WAIT |
                  PG_STATE_BACKFILLING))) {
      psdout(10) << __func__ << " set" << dendl;
      state_set(PG_STATE_FORCED_BACKFILL);
      pl->publish_stats_to_osd();
      did = true;
    }
  } else if (state & PG_STATE_FORCED_BACKFILL) {
    psdout(10) << __func__ << " clear" << dendl;
    state_clear(PG_STATE_FORCED_BACKFILL);
    pl->publish_stats_to_osd();
    did = true;
  }
  if (did) {
    psdout(20) << __func__ << " state " << get_current_state()
               << dendl;
    pl->update_local_background_io_priority(get_backfill_priority());
  }
  return did;
}

void PeeringState::schedule_renew_lease()
{
  pl->schedule_renew_lease(
    last_peering_reset,
    readable_interval / 2);
}

void PeeringState::send_lease()
{
  epoch_t epoch = pl->get_osdmap_epoch();
  for (auto peer : actingset) {
    if (peer == pg_whoami) {
      continue;
    }
    pl->send_cluster_message(
      peer.osd,
      new MOSDPGLease(epoch,
                      spg_t(spgid.pgid, peer.shard),
                      get_lease()),
      epoch);
  }
}

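// A replica receiving a lease converts the primary's clock values into its
// own time frame using the heartbeat clock-delta bounds: the conservative
// readable_until uses peer_clock_delta_ub, while the upper bound uses
// peer_clock_delta_lb, falling back to mnow + interval when no lower bound
// on the clock delta is known.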
void PeeringState::proc_lease(const pg_lease_t& l)
{
  if (!HAVE_FEATURE(upacting_features, SERVER_OCTOPUS)) {
    psdout(20) << __func__ << " no-op, upacting_features 0x" << std::hex
               << upacting_features << std::dec
               << " does not include SERVER_OCTOPUS" << dendl;
    return;
  }
  if (!is_nonprimary()) {
    psdout(20) << __func__ << " no-op, !nonprimary" << dendl;
    return;
  }
  psdout(10) << __func__ << " " << l << dendl;
  if (l.readable_until_ub > readable_until_ub_from_primary) {
    readable_until_ub_from_primary = l.readable_until_ub;
  }

  ceph::signedspan ru = ceph::signedspan::zero();
  if (l.readable_until != ceph::signedspan::zero() &&
      hb_stamps[0]->peer_clock_delta_ub) {
    ru = l.readable_until - *hb_stamps[0]->peer_clock_delta_ub;
    psdout(20) << " peer_clock_delta_ub " << *hb_stamps[0]->peer_clock_delta_ub
               << " -> ru " << ru << dendl;
  }
  if (ru > readable_until) {
    readable_until = ru;
    psdout(20) << __func__ << " readable_until now " << readable_until << dendl;
    // NOTE: if we ever decide to block/queue ops on the replica,
    // we'll need to wake them up here.
  }

  ceph::signedspan ruub;
  if (hb_stamps[0]->peer_clock_delta_lb) {
    ruub = l.readable_until_ub - *hb_stamps[0]->peer_clock_delta_lb;
    psdout(20) << " peer_clock_delta_lb " << *hb_stamps[0]->peer_clock_delta_lb
               << " -> ruub " << ruub << dendl;
  } else {
    ruub = pl->get_mnow() + l.interval;
    psdout(20) << " no peer_clock_delta_lb -> ruub " << ruub << dendl;
  }
  if (ruub > readable_until_ub) {
    readable_until_ub = ruub;
    psdout(20) << __func__ << " readable_until_ub now " << readable_until_ub
               << dendl;
  }
}

void PeeringState::proc_lease_ack(int from, const pg_lease_ack_t& a)
{
  if (!HAVE_FEATURE(upacting_features, SERVER_OCTOPUS)) {
    return;
  }
  auto now = pl->get_mnow();
  bool was_min = false;
  for (unsigned i = 0; i < acting.size(); ++i) {
    if (from == acting[i]) {
      // the lease_ack value is based on the primary's clock
      if (a.readable_until_ub > acting_readable_until_ub[i]) {
        if (acting_readable_until_ub[i] == readable_until) {
          was_min = true;
        }
        acting_readable_until_ub[i] = a.readable_until_ub;
        break;
      }
    }
  }
  if (was_min) {
    auto old_ru = readable_until;
    recalc_readable_until();
    if (now < old_ru) {
      pl->recheck_readable();
    }
  }
}

void PeeringState::proc_renew_lease()
{
  if (!HAVE_FEATURE(upacting_features, SERVER_OCTOPUS)) {
    return;
  }
  renew_lease(pl->get_mnow());
  send_lease();
  schedule_renew_lease();
}

void PeeringState::recalc_readable_until()
{
  assert(is_primary());
  ceph::signedspan min = readable_until_ub_sent;
  for (unsigned i = 0; i < acting.size(); ++i) {
    if (acting[i] == pg_whoami.osd || acting[i] == CRUSH_ITEM_NONE) {
      continue;
    }
    dout(20) << __func__ << " peer osd." << acting[i]
             << " ruub " << acting_readable_until_ub[i] << dendl;
    if (acting_readable_until_ub[i] < min) {
      min = acting_readable_until_ub[i];
    }
  }
  readable_until = min;
  readable_until_ub = min;
  dout(20) << __func__ << " readable_until[_ub] " << readable_until
           << " (sent " << readable_until_ub_sent << ")" << dendl;
}

bool PeeringState::check_prior_readable_down_osds(const OSDMapRef& map)
{
  if (!HAVE_FEATURE(upacting_features, SERVER_OCTOPUS)) {
    return false;
  }
  bool changed = false;
  auto p = prior_readable_down_osds.begin();
  while (p != prior_readable_down_osds.end()) {
    if (map->is_dead(*p)) {
      dout(10) << __func__ << " prior_readable_down_osds osd." << *p
               << " is dead as of epoch " << map->get_epoch()
               << dendl;
      p = prior_readable_down_osds.erase(p);
      changed = true;
    } else {
      ++p;
    }
  }
  if (changed && prior_readable_down_osds.empty()) {
    psdout(10) << " empty prior_readable_down_osds, clearing ub" << dendl;
    clear_prior_readable_until_ub();
    return true;
  }
  return false;
}

bool PeeringState::adjust_need_up_thru(const OSDMapRef osdmap)
{
  epoch_t up_thru = osdmap->get_up_thru(pg_whoami.osd);
  if (need_up_thru &&
      up_thru >= info.history.same_interval_since) {
    psdout(10) << "adjust_need_up_thru now "
               << up_thru << ", need_up_thru now false" << dendl;
    need_up_thru = false;
    return true;
  }
  return false;
}

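// Build the prior set from past_intervals: the set of OSDs we must probe
// before the PG can go active, plus whether any past interval is blocked on
// a down OSD (pg_down). As a side effect, decides whether we must first
// publish up_thru to the monitor before peering can complete.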
PastIntervals::PriorSet PeeringState::build_prior()
{
  if (1) {
    // sanity check
    for (map<pg_shard_t,pg_info_t>::iterator it = peer_info.begin();
         it != peer_info.end();
         ++it) {
      ceph_assert(info.history.last_epoch_started >=
                  it->second.history.last_epoch_started);
    }
  }

  const OSDMap &osdmap = *get_osdmap();
  PastIntervals::PriorSet prior = past_intervals.get_prior_set(
    pool.info.is_erasure(),
    info.history.last_epoch_started,
    &missing_loc.get_recoverable_predicate(),
    [&](epoch_t start, int osd, epoch_t *lost_at) {
      const osd_info_t *pinfo = 0;
      if (osdmap.exists(osd)) {
        pinfo = &osdmap.get_info(osd);
        if (lost_at)
          *lost_at = pinfo->lost_at;
      }

      if (osdmap.is_up(osd)) {
        return PastIntervals::UP;
      } else if (!pinfo) {
        return PastIntervals::DNE;
      } else if (pinfo->lost_at > start) {
        return PastIntervals::LOST;
      } else {
        return PastIntervals::DOWN;
      }
    },
    up,
    acting,
    dpp);

  if (prior.pg_down) {
    state_set(PG_STATE_DOWN);
  }

  if (get_osdmap()->get_up_thru(pg_whoami.osd) <
      info.history.same_interval_since) {
    psdout(10) << "up_thru " << get_osdmap()->get_up_thru(pg_whoami.osd)
               << " < same_since " << info.history.same_interval_since
               << ", must notify monitor" << dendl;
    need_up_thru = true;
  } else {
    psdout(10) << "up_thru " << get_osdmap()->get_up_thru(pg_whoami.osd)
               << " >= same_since " << info.history.same_interval_since
               << ", all is well" << dendl;
    need_up_thru = false;
  }
  pl->set_probe_targets(prior.probe);
  return prior;
}

bool PeeringState::needs_recovery() const
{
  ceph_assert(is_primary());

  auto &missing = pg_log.get_missing();

  if (missing.num_missing()) {
    psdout(10) << __func__ << " primary has " << missing.num_missing()
               << " missing" << dendl;
    return true;
  }

  ceph_assert(!acting_recovery_backfill.empty());
  set<pg_shard_t>::const_iterator end = acting_recovery_backfill.end();
  set<pg_shard_t>::const_iterator a = acting_recovery_backfill.begin();
  for (; a != end; ++a) {
    if (*a == get_primary()) continue;
    pg_shard_t peer = *a;
    map<pg_shard_t, pg_missing_t>::const_iterator pm = peer_missing.find(peer);
    if (pm == peer_missing.end()) {
      psdout(10) << __func__ << " osd." << peer << " doesn't have missing set"
                 << dendl;
      continue;
    }
    if (pm->second.num_missing()) {
      psdout(10) << __func__ << " osd." << peer << " has "
                 << pm->second.num_missing() << " missing" << dendl;
      return true;
    }
  }

  psdout(10) << __func__ << " is recovered" << dendl;
  return false;
}

bool PeeringState::needs_backfill() const
{
  ceph_assert(is_primary());

  // We can assume that the only OSDs that might need backfill
  // are those in backfill_targets.
  set<pg_shard_t>::const_iterator end = backfill_targets.end();
  set<pg_shard_t>::const_iterator a = backfill_targets.begin();
  for (; a != end; ++a) {
    pg_shard_t peer = *a;
    map<pg_shard_t, pg_info_t>::const_iterator pi = peer_info.find(peer);
    if (!pi->second.last_backfill.is_max()) {
      psdout(10) << __func__ << " osd." << peer
                 << " has last_backfill " << pi->second.last_backfill << dendl;
      return true;
    }
  }

  psdout(10) << __func__ << " does not need backfill" << dendl;
  return false;
}

/*
 * Returns true unless there is a non-lost OSD in might_have_unfound.
 */
bool PeeringState::all_unfound_are_queried_or_lost(
  const OSDMapRef osdmap) const
{
  ceph_assert(is_primary());

  set<pg_shard_t>::const_iterator peer = might_have_unfound.begin();
  set<pg_shard_t>::const_iterator mend = might_have_unfound.end();
  for (; peer != mend; ++peer) {
    if (peer_missing.count(*peer))
      continue;
    map<pg_shard_t, pg_info_t>::const_iterator iter = peer_info.find(*peer);
    if (iter != peer_info.end() &&
        (iter->second.is_empty() || iter->second.dne()))
      continue;
    if (!osdmap->exists(peer->osd))
      continue;
    const osd_info_t &osd_info(osdmap->get_info(peer->osd));
    if (osd_info.lost_at <= osd_info.up_from) {
      // If there is even one OSD in might_have_unfound that isn't lost, we
      // still might retrieve our unfound.
      return false;
    }
  }
  psdout(10) << "all_unfound_are_queried_or_lost all of might_have_unfound "
             << might_have_unfound
             << " have been queried or are marked lost" << dendl;
  return true;
}


void PeeringState::reject_reservation()
{
  pl->unreserve_recovery_space();
  pl->send_cluster_message(
    primary.osd,
    new MBackfillReserve(
      MBackfillReserve::REJECT_TOOFULL,
      spg_t(info.pgid.pgid, primary.shard),
      get_osdmap_epoch()),
    get_osdmap_epoch());
}

/**
 * find_best_info
 *
 * Returns an iterator to the best info in infos sorted by:
 *  1) Prefer newer last_update
 *  2) Prefer longer tail if it brings another info into contiguity
 *  3) Prefer current primary
 */
map<pg_shard_t, pg_info_t>::const_iterator PeeringState::find_best_info(
  const map<pg_shard_t, pg_info_t> &infos,
  bool restrict_to_up_acting,
  bool *history_les_bound) const
{
  ceph_assert(history_les_bound);
  /* See doc/dev/osd_internals/last_epoch_started.rst before attempting
   * to make changes to this process.  Also, make sure to update it
   * when you find bugs! */
  eversion_t min_last_update_acceptable = eversion_t::max();
  epoch_t max_last_epoch_started_found = 0;
  for (map<pg_shard_t, pg_info_t>::const_iterator i = infos.begin();
       i != infos.end();
       ++i) {
    if (!cct->_conf->osd_find_best_info_ignore_history_les &&
        max_last_epoch_started_found < i->second.history.last_epoch_started) {
      *history_les_bound = true;
      max_last_epoch_started_found = i->second.history.last_epoch_started;
    }
    if (!i->second.is_incomplete() &&
        max_last_epoch_started_found < i->second.last_epoch_started) {
      *history_les_bound = false;
      max_last_epoch_started_found = i->second.last_epoch_started;
    }
  }
  for (map<pg_shard_t, pg_info_t>::const_iterator i = infos.begin();
       i != infos.end();
       ++i) {
    if (max_last_epoch_started_found <= i->second.last_epoch_started) {
      if (min_last_update_acceptable > i->second.last_update)
        min_last_update_acceptable = i->second.last_update;
    }
  }
  if (min_last_update_acceptable == eversion_t::max())
    return infos.end();

  map<pg_shard_t, pg_info_t>::const_iterator best = infos.end();
  // find osd with newest last_update (oldest for ec_pool).
  // if there are multiples, prefer
  //  - a longer tail, if it brings another peer into log contiguity
  //  - the current primary
  for (map<pg_shard_t, pg_info_t>::const_iterator p = infos.begin();
       p != infos.end();
       ++p) {
    if (restrict_to_up_acting && !is_up(p->first) &&
        !is_acting(p->first))
      continue;
    // Only consider peers with last_update >= min_last_update_acceptable
    if (p->second.last_update < min_last_update_acceptable)
      continue;
    // Disqualify anyone with a too old last_epoch_started
    if (p->second.last_epoch_started < max_last_epoch_started_found)
      continue;
    // Disqualify anyone who is incomplete (not fully backfilled)
    if (p->second.is_incomplete())
      continue;
    if (best == infos.end()) {
      best = p;
      continue;
    }
    // Prefer newer last_update
    if (pool.info.require_rollback()) {
      if (p->second.last_update > best->second.last_update)
        continue;
      if (p->second.last_update < best->second.last_update) {
        best = p;
        continue;
      }
    } else {
      if (p->second.last_update < best->second.last_update)
        continue;
      if (p->second.last_update > best->second.last_update) {
        best = p;
        continue;
      }
    }

    // Prefer longer tail
    if (p->second.log_tail > best->second.log_tail) {
      continue;
    } else if (p->second.log_tail < best->second.log_tail) {
      best = p;
      continue;
    }

    if (!p->second.has_missing() && best->second.has_missing()) {
      psdout(10) << __func__ << " prefer osd." << p->first
                 << " because it is complete while best has missing"
                 << dendl;
      best = p;
      continue;
    } else if (p->second.has_missing() && !best->second.has_missing()) {
      psdout(10) << __func__ << " skipping osd." << p->first
                 << " because it has missing while best is complete"
                 << dendl;
      continue;
    } else {
      // both are complete or have missing
      // fall through
    }

    // prefer current primary (usually the caller), all things being equal
    if (p->first == pg_whoami) {
      psdout(10) << "calc_acting prefer osd." << p->first
                 << " because it is current primary" << dendl;
      best = p;
      continue;
    }
  }
  return best;
}

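// EC acting-set selection is positional: each shard slot i is filled
// independently, preferring up[i], then acting[i], then any stray that holds
// the shard, as long as the candidate is complete and its log overlaps the
// authoritative log (last_update >= auth log_tail); a present but stale
// up[i] becomes a backfill target instead.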
void PeeringState::calc_ec_acting(
  map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
  unsigned size,
  const vector<int> &acting,
  const vector<int> &up,
  const map<pg_shard_t, pg_info_t> &all_info,
  bool restrict_to_up_acting,
  vector<int> *_want,
  set<pg_shard_t> *backfill,
  set<pg_shard_t> *acting_backfill,
  ostream &ss)
{
  vector<int> want(size, CRUSH_ITEM_NONE);
  map<shard_id_t, set<pg_shard_t> > all_info_by_shard;
  for (map<pg_shard_t, pg_info_t>::const_iterator i = all_info.begin();
       i != all_info.end();
       ++i) {
    all_info_by_shard[i->first.shard].insert(i->first);
  }
  for (uint8_t i = 0; i < want.size(); ++i) {
    ss << "For position " << (unsigned)i << ": ";
    if (up.size() > (unsigned)i && up[i] != CRUSH_ITEM_NONE &&
        !all_info.find(pg_shard_t(up[i], shard_id_t(i)))->second.is_incomplete() &&
        all_info.find(pg_shard_t(up[i], shard_id_t(i)))->second.last_update >=
        auth_log_shard->second.log_tail) {
      ss << " selecting up[i]: " << pg_shard_t(up[i], shard_id_t(i)) << std::endl;
      want[i] = up[i];
      continue;
    }
    if (up.size() > (unsigned)i && up[i] != CRUSH_ITEM_NONE) {
      ss << " backfilling up[i]: " << pg_shard_t(up[i], shard_id_t(i))
         << " and ";
      backfill->insert(pg_shard_t(up[i], shard_id_t(i)));
    }

    if (acting.size() > (unsigned)i && acting[i] != CRUSH_ITEM_NONE &&
        !all_info.find(pg_shard_t(acting[i], shard_id_t(i)))->second.is_incomplete() &&
        all_info.find(pg_shard_t(acting[i], shard_id_t(i)))->second.last_update >=
        auth_log_shard->second.log_tail) {
      ss << " selecting acting[i]: " << pg_shard_t(acting[i], shard_id_t(i)) << std::endl;
      want[i] = acting[i];
    } else if (!restrict_to_up_acting) {
      for (set<pg_shard_t>::iterator j = all_info_by_shard[shard_id_t(i)].begin();
           j != all_info_by_shard[shard_id_t(i)].end();
           ++j) {
        ceph_assert(j->shard == i);
        if (!all_info.find(*j)->second.is_incomplete() &&
            all_info.find(*j)->second.last_update >=
            auth_log_shard->second.log_tail) {
          ss << " selecting stray: " << *j << std::endl;
          want[i] = j->osd;
          break;
        }
      }
      if (want[i] == CRUSH_ITEM_NONE)
        ss << " failed to fill position " << (int)i << std::endl;
    }
  }

  for (uint8_t i = 0; i < want.size(); ++i) {
    if (want[i] != CRUSH_ITEM_NONE) {
      acting_backfill->insert(pg_shard_t(want[i], shard_id_t(i)));
    }
  }
  acting_backfill->insert(backfill->begin(), backfill->end());
  _want->swap(want);
}

/**
 * calculate the desired acting set.
 *
 * Choose an appropriate acting set.  Prefer up[0], unless it is
 * incomplete, or another osd has a longer tail that allows us to
 * bring other up nodes up to date.
 */
void PeeringState::calc_replicated_acting(
  map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
  uint64_t force_auth_primary_missing_objects,
  unsigned size,
  const vector<int> &acting,
  const vector<int> &up,
  pg_shard_t up_primary,
  const map<pg_shard_t, pg_info_t> &all_info,
  bool restrict_to_up_acting,
  vector<int> *want,
  set<pg_shard_t> *backfill,
  set<pg_shard_t> *acting_backfill,
  const OSDMapRef osdmap,
  ostream &ss)
{
  pg_shard_t auth_log_shard_id = auth_log_shard->first;

  ss << __func__ << " newest update on osd." << auth_log_shard_id
     << " with " << auth_log_shard->second
     << (restrict_to_up_acting ? " restrict_to_up_acting" : "") << std::endl;

  // select primary
  auto primary = all_info.find(up_primary);
  if (up.size() &&
      !primary->second.is_incomplete() &&
      primary->second.last_update >=
      auth_log_shard->second.log_tail) {
    if (HAVE_FEATURE(osdmap->get_up_osd_features(), SERVER_NAUTILUS)) {
      auto approx_missing_objects =
        primary->second.stats.stats.sum.num_objects_missing;
      auto auth_version = auth_log_shard->second.last_update.version;
      auto primary_version = primary->second.last_update.version;
      if (auth_version > primary_version) {
        approx_missing_objects += auth_version - primary_version;
      } else {
        approx_missing_objects += primary_version - auth_version;
      }
      if ((uint64_t)approx_missing_objects >
          force_auth_primary_missing_objects) {
        primary = auth_log_shard;
        ss << "up_primary: " << up_primary << " has approximately "
           << approx_missing_objects
           << " (>" << force_auth_primary_missing_objects << ") "
           << "missing objects, osd." << auth_log_shard_id
           << " selected as primary instead"
           << std::endl;
      } else {
        ss << "up_primary: " << up_primary << " selected as primary"
           << std::endl;
      }
    } else {
      ss << "up_primary: " << up_primary << " selected as primary" << std::endl;
    }
  } else {
    ceph_assert(!auth_log_shard->second.is_incomplete());
    ss << "up[0] needs backfill, osd." << auth_log_shard_id
       << " selected as primary instead" << std::endl;
    primary = auth_log_shard;
  }

  ss << __func__ << " primary is osd." << primary->first
     << " with " << primary->second << std::endl;
  want->push_back(primary->first.osd);
  acting_backfill->insert(primary->first);

  /* We include auth_log_shard->second.log_tail because in GetLog,
   * we will request logs back to the min last_update over our
   * acting_backfill set, which will result in our log being extended
   * as far backwards as necessary to pick up any peers which can
   * be log recovered by auth_log_shard's log */
  eversion_t oldest_auth_log_entry =
    std::min(primary->second.log_tail, auth_log_shard->second.log_tail);

  // select replicas that have log contiguity with primary.
  // prefer up, then acting, then any peer_info osds
  for (auto i : up) {
    pg_shard_t up_cand = pg_shard_t(i, shard_id_t::NO_SHARD);
    if (up_cand == primary->first)
      continue;
    const pg_info_t &cur_info = all_info.find(up_cand)->second;
    if (cur_info.is_incomplete() ||
        cur_info.last_update < oldest_auth_log_entry) {
      ss << " shard " << up_cand << " (up) backfill " << cur_info << std::endl;
      backfill->insert(up_cand);
      acting_backfill->insert(up_cand);
    } else {
      want->push_back(i);
      acting_backfill->insert(up_cand);
      ss << " osd." << i << " (up) accepted " << cur_info << std::endl;
    }
  }

  if (want->size() >= size) {
    return;
  }

  std::vector<std::pair<eversion_t, int>> candidate_by_last_update;
  candidate_by_last_update.reserve(acting.size());
  // This no longer has backfill OSDs, but they are covered above.
  for (auto i : acting) {
    pg_shard_t acting_cand(i, shard_id_t::NO_SHARD);
    // skip up osds we already considered above
    if (acting_cand == primary->first)
      continue;
    vector<int>::const_iterator up_it = find(up.begin(), up.end(), i);
    if (up_it != up.end())
      continue;

    const pg_info_t &cur_info = all_info.find(acting_cand)->second;
    if (cur_info.is_incomplete() ||
        cur_info.last_update < oldest_auth_log_entry) {
      ss << " shard " << acting_cand << " (acting) REJECTED "
         << cur_info << std::endl;
    } else {
      candidate_by_last_update.emplace_back(cur_info.last_update, i);
    }
  }

  auto sort_by_eversion = [](const std::pair<eversion_t, int> &lhs,
                             const std::pair<eversion_t, int> &rhs) {
    return lhs.first > rhs.first;
  };
  // sort by last_update, in descending order.
  std::sort(candidate_by_last_update.begin(),
            candidate_by_last_update.end(), sort_by_eversion);
  for (auto &p: candidate_by_last_update) {
    ceph_assert(want->size() < size);
    want->push_back(p.second);
    pg_shard_t s = pg_shard_t(p.second, shard_id_t::NO_SHARD);
    acting_backfill->insert(s);
    ss << " shard " << s << " (acting) accepted "
       << all_info.find(s)->second << std::endl;
    if (want->size() >= size) {
      return;
    }
  }

  if (restrict_to_up_acting) {
    return;
  }
  candidate_by_last_update.clear();
  candidate_by_last_update.reserve(all_info.size()); // overestimate but fine
  // continue to search stray to find more suitable peers
  for (auto &i : all_info) {
    // skip up osds we already considered above
    if (i.first == primary->first)
      continue;
    vector<int>::const_iterator up_it = find(up.begin(), up.end(), i.first.osd);
    if (up_it != up.end())
      continue;
    vector<int>::const_iterator acting_it = find(
      acting.begin(), acting.end(), i.first.osd);
    if (acting_it != acting.end())
      continue;

    if (i.second.is_incomplete() ||
        i.second.last_update < oldest_auth_log_entry) {
      ss << " shard " << i.first << " (stray) REJECTED " << i.second
         << std::endl;
    } else {
      candidate_by_last_update.emplace_back(
        i.second.last_update, i.first.osd);
    }
  }

  if (candidate_by_last_update.empty()) {
    // save us some effort
    return;
  }

  // sort by last_update, in descending order.
  std::sort(candidate_by_last_update.begin(),
            candidate_by_last_update.end(), sort_by_eversion);

  for (auto &p: candidate_by_last_update) {
    ceph_assert(want->size() < size);
    want->push_back(p.second);
    pg_shard_t s = pg_shard_t(p.second, shard_id_t::NO_SHARD);
    acting_backfill->insert(s);
    ss << " shard " << s << " (stray) accepted "
       << all_info.find(s)->second << std::endl;
    if (want->size() >= size) {
      return;
    }
  }
}

1815 bool PeeringState::recoverable(const vector<int> &want) const
1816 {
1817 unsigned num_want_acting = 0;
1818 set<pg_shard_t> have;
1819 for (int i = 0; i < (int)want.size(); ++i) {
1820 if (want[i] != CRUSH_ITEM_NONE) {
1821 ++num_want_acting;
1822 have.insert(
1823 pg_shard_t(
1824 want[i],
1825 pool.info.is_erasure() ? shard_id_t(i) : shard_id_t::NO_SHARD));
1826 }
1827 }
1828
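  // For illustration (assumed values): in a k=2,m=1 EC pool,
  // want = [3, CRUSH_ITEM_NONE, 5] yields num_want_acting = 2 and
  // have = { 3(0), 5(2) }; for a replicated pool the shard is always
  // NO_SHARD.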
1829 if (num_want_acting < pool.info.min_size) {
1830 const bool recovery_ec_pool_below_min_size =
1831 HAVE_FEATURE(get_osdmap()->get_up_osd_features(), SERVER_OCTOPUS);
1832
1833 if (pool.info.is_erasure() && !recovery_ec_pool_below_min_size) {
1834 psdout(10) << __func__ << " failed, ec recovery below min size not supported by pre-octopus" << dendl;
1835 return false;
1836 } else if (!cct->_conf.get_val<bool>("osd_allow_recovery_below_min_size")) {
1837 psdout(10) << __func__ << " failed, recovery below min size not enabled" << dendl;
1838 return false;
1839 }
1840 }
1841 if (missing_loc.get_recoverable_predicate()(have)) {
1842 return true;
1843 } else {
1844 psdout(10) << __func__ << " failed, not recoverable " << dendl;
1845 return false;
1846 }
1847 }
1848
1849 void PeeringState::choose_async_recovery_ec(
1850 const map<pg_shard_t, pg_info_t> &all_info,
1851 const pg_info_t &auth_info,
1852 vector<int> *want,
1853 set<pg_shard_t> *async_recovery,
1854 const OSDMapRef osdmap) const
1855 {
1856 set<pair<int, pg_shard_t> > candidates_by_cost;
1857 for (uint8_t i = 0; i < want->size(); ++i) {
1858 if ((*want)[i] == CRUSH_ITEM_NONE)
1859 continue;
1860
1861 // Considering log entries to recover is accurate enough for
1862 // now. We could use minimum_to_decode_with_cost() later if
1863 // necessary.
1864 pg_shard_t shard_i((*want)[i], shard_id_t(i));
1865 // do not include strays
1866 if (stray_set.find(shard_i) != stray_set.end())
1867 continue;
1868 // Do not include an osd that is not up, since choosing it as
1869 // an async_recovery_target will move it out of the acting set.
1870 // This results in it being identified as a stray during peering,
1871 // because it is no longer in the up or acting set.
1872 if (!is_up(shard_i))
1873 continue;
1874 auto shard_info = all_info.find(shard_i)->second;
1875 // for ec pools we roll back all entries past the authoritative
1876 // last_update *before* activation. This is relatively inexpensive
1877 // compared to recovery, since it is purely local, so treat shards
1878 // past the authoritative last_update the same as those equal to it.
1879 version_t auth_version = auth_info.last_update.version;
1880 version_t candidate_version = shard_info.last_update.version;
1881 if (HAVE_FEATURE(osdmap->get_up_osd_features(), SERVER_NAUTILUS)) {
1882 auto approx_missing_objects =
1883 shard_info.stats.stats.sum.num_objects_missing;
1884 if (auth_version > candidate_version) {
1885 approx_missing_objects += auth_version - candidate_version;
1886 }
1887 if (static_cast<uint64_t>(approx_missing_objects) >
1888 cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) {
1889 candidates_by_cost.emplace(approx_missing_objects, shard_i);
1890 }
1891 } else {
1892 if (auth_version > candidate_version &&
1893 (auth_version - candidate_version) > cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) {
1894 candidates_by_cost.insert(make_pair(auth_version - candidate_version, shard_i));
1895 }
1896 }
1897 }
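  // Worked example (illustrative): auth last_update 120'500 vs candidate
  // 120'460 with 15 missing objects gives an approximate cost of
  // 40 + 15 = 55; with osd_async_recovery_min_cost at its (assumed)
  // default of 100, that shard would not become an async recovery
  // candidate.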
1898
1899 psdout(20) << __func__ << " candidates by cost are: " << candidates_by_cost
1900 << dendl;
1901
1902 // take out as many osds as we can for async recovery, in order of cost
1903 for (auto rit = candidates_by_cost.rbegin();
1904 rit != candidates_by_cost.rend(); ++rit) {
1905 pg_shard_t cur_shard = rit->second;
1906 vector<int> candidate_want(*want);
1907 candidate_want[cur_shard.shard.id] = CRUSH_ITEM_NONE;
1908 if (recoverable(candidate_want)) {
1909 want->swap(candidate_want);
1910 async_recovery->insert(cur_shard);
1911 }
1912 }
1913 psdout(20) << __func__ << " result want=" << *want
1914 << " async_recovery=" << *async_recovery << dendl;
1915 }
1916
1917 void PeeringState::choose_async_recovery_replicated(
1918 const map<pg_shard_t, pg_info_t> &all_info,
1919 const pg_info_t &auth_info,
1920 vector<int> *want,
1921 set<pg_shard_t> *async_recovery,
1922 const OSDMapRef osdmap) const
1923 {
1924 set<pair<int, pg_shard_t> > candidates_by_cost;
1925 for (auto osd_num : *want) {
1926 pg_shard_t shard_i(osd_num, shard_id_t::NO_SHARD);
1927 // do not include strays
1928 if (stray_set.find(shard_i) != stray_set.end())
1929 continue;
1930 // Do not include an osd that is not up, since choosing it as
1931 // an async_recovery_target will move it out of the acting set.
1932 // This results in it being identified as a stray during peering,
1933 // because it is no longer in the up or acting set.
1934 if (!is_up(shard_i))
1935 continue;
1936 auto shard_info = all_info.find(shard_i)->second;
1937 // use the approximate magnitude of the difference in length of
1938 // logs plus historical missing objects as the cost of recovery
1939 version_t auth_version = auth_info.last_update.version;
1940 version_t candidate_version = shard_info.last_update.version;
1941 if (HAVE_FEATURE(osdmap->get_up_osd_features(), SERVER_NAUTILUS)) {
1942 auto approx_missing_objects =
1943 shard_info.stats.stats.sum.num_objects_missing;
1944 if (auth_version > candidate_version) {
1945 approx_missing_objects += auth_version - candidate_version;
1946 } else {
1947 approx_missing_objects += candidate_version - auth_version;
1948 }
1949 if (static_cast<uint64_t>(approx_missing_objects) >
1950 cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) {
1951 candidates_by_cost.emplace(approx_missing_objects, shard_i);
1952 }
1953 } else {
1954 size_t approx_entries;
1955 if (auth_version > candidate_version) {
1956 approx_entries = auth_version - candidate_version;
1957 } else {
1958 approx_entries = candidate_version - auth_version;
1959 }
1960 if (approx_entries > cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) {
1961 candidates_by_cost.insert(make_pair(approx_entries, shard_i));
1962 }
1963 }
1964 }
1965
1966 psdout(20) << __func__ << " candidates by cost are: " << candidates_by_cost
1967 << dendl;
1968 // take out as many osds as we can for async recovery, in order of cost
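  // Note (illustrative): the min_size guard below means that with, e.g.,
  // size=3 and min_size=2, at most one OSD can be moved to async recovery.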
1969 for (auto rit = candidates_by_cost.rbegin();
1970 rit != candidates_by_cost.rend(); ++rit) {
1971 if (want->size() <= pool.info.min_size) {
1972 break;
1973 }
1974 pg_shard_t cur_shard = rit->second;
1975 vector<int> candidate_want(*want);
1976 for (auto it = candidate_want.begin(); it != candidate_want.end(); ++it) {
1977 if (*it == cur_shard.osd) {
1978 candidate_want.erase(it);
1979 want->swap(candidate_want);
1980 async_recovery->insert(cur_shard);
1981 break;
1982 }
1983 }
1984 }
1985 psdout(20) << __func__ << " result want=" << *want
1986 << " async_recovery=" << *async_recovery << dendl;
1987 }
1988
1989
1990
1991 /**
1992 * choose acting
1993 *
1994 * calculate the desired acting, and request a change with the monitor
1995 * if it differs from the current acting.
1996 *
1997 * if restrict_to_up_acting=true, we filter out anything that's not in
1998 * up/acting. in order to lift this restriction, we need to
1999 * 1) check whether it's worth switching the acting set any time we get
2000 * a new pg info (not just here, when recovery finishes)
2001 * 2) check whether anything in want_acting went down on each new map
2002 * (and, if so, calculate a new want_acting)
2003 * 3) remove the assertion in PG::PeeringState::Active::react(const AdvMap)
2004 * TODO!
2005 */
2006 bool PeeringState::choose_acting(pg_shard_t &auth_log_shard_id,
2007 bool restrict_to_up_acting,
2008 bool *history_les_bound,
2009 bool request_pg_temp_change_only)
2010 {
2011 map<pg_shard_t, pg_info_t> all_info(peer_info.begin(), peer_info.end());
2012 all_info[pg_whoami] = info;
2013
2014 if (cct->_conf->subsys.should_gather<dout_subsys, 10>()) {
2015 for (map<pg_shard_t, pg_info_t>::iterator p = all_info.begin();
2016 p != all_info.end();
2017 ++p) {
2018 psdout(10) << __func__ << " all_info osd." << p->first << " "
2019 << p->second << dendl;
2020 }
2021 }
2022
2023 map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard =
2024 find_best_info(all_info, restrict_to_up_acting, history_les_bound);
2025
2026 if (auth_log_shard == all_info.end()) {
2027 if (up != acting) {
2028 psdout(10) << __func__ << " no suitable info found (incomplete backfills?),"
2029 << " reverting to up" << dendl;
2030 want_acting = up;
2031 vector<int> empty;
2032 pl->queue_want_pg_temp(empty);
2033 } else {
2034 psdout(10) << __func__ << " failed" << dendl;
2035 ceph_assert(want_acting.empty());
2036 }
2037 return false;
2038 }
2039
2040 ceph_assert(!auth_log_shard->second.is_incomplete());
2041 auth_log_shard_id = auth_log_shard->first;
2042
2043 set<pg_shard_t> want_backfill, want_acting_backfill;
2044 vector<int> want;
2045 stringstream ss;
2046 if (pool.info.is_replicated())
2047 calc_replicated_acting(
2048 auth_log_shard,
2049 cct->_conf.get_val<uint64_t>(
2050 "osd_force_auth_primary_missing_objects"),
2051 get_osdmap()->get_pg_size(info.pgid.pgid),
2052 acting,
2053 up,
2054 up_primary,
2055 all_info,
2056 restrict_to_up_acting,
2057 &want,
2058 &want_backfill,
2059 &want_acting_backfill,
2060 get_osdmap(),
2061 ss);
2062 else
2063 calc_ec_acting(
2064 auth_log_shard,
2065 get_osdmap()->get_pg_size(info.pgid.pgid),
2066 acting,
2067 up,
2068 all_info,
2069 restrict_to_up_acting,
2070 &want,
2071 &want_backfill,
2072 &want_acting_backfill,
2073 ss);
2074 psdout(10) << ss.str() << dendl;
2075
2076 if (!recoverable(want)) {
2077 want_acting.clear();
2078 return false;
2079 }
2080
2081 set<pg_shard_t> want_async_recovery;
2082 if (HAVE_FEATURE(get_osdmap()->get_up_osd_features(), SERVER_MIMIC)) {
2083 if (pool.info.is_erasure()) {
2084 choose_async_recovery_ec(
2085 all_info, auth_log_shard->second, &want, &want_async_recovery,
2086 get_osdmap());
2087 } else {
2088 choose_async_recovery_replicated(
2089 all_info, auth_log_shard->second, &want, &want_async_recovery,
2090 get_osdmap());
2091 }
2092 }
2093 while (want.size() > pool.info.size) {
2094 // async recovery should have taken out as many osds as it can.
2095 // if not, then always evict the last peer
2096 // (will get synchronously recovered later)
2097 psdout(10) << __func__ << " evicting osd." << want.back()
2098 << " from oversized want " << want << dendl;
2099 want.pop_back();
2100 }
2101 if (want != acting) {
2102 psdout(10) << __func__ << " want " << want << " != acting " << acting
2103 << ", requesting pg_temp change" << dendl;
2104 want_acting = want;
2105
2106 if (!cct->_conf->osd_debug_no_acting_change) {
2107 if (want_acting == up) {
2108 // There can't be any pending backfill if
2109 // want is the same as crush map up OSDs.
2110 ceph_assert(want_backfill.empty());
2111 vector<int> empty;
2112 pl->queue_want_pg_temp(empty);
2113 } else
2114 pl->queue_want_pg_temp(want);
2115 }
2116 return false;
2117 }
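  // Returning false here does not fail peering permanently: roughly, once
  // the monitor publishes an osdmap with the requested pg_temp mapping,
  // the acting set changes, peering restarts, and choose_acting runs
  // again (a sketch of the flow, not an exhaustive description).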
2118 if (request_pg_temp_change_only)
2119 return true;
2120 want_acting.clear();
2121 acting_recovery_backfill = want_acting_backfill;
2122 psdout(10) << "acting_recovery_backfill is "
2123 << acting_recovery_backfill << dendl;
2124 ceph_assert(
2125 backfill_targets.empty() ||
2126 backfill_targets == want_backfill);
2127 if (backfill_targets.empty()) {
2128 // Caller is GetInfo
2129 backfill_targets = want_backfill;
2130 }
2131 // Adding !needs_recovery() to let the async_recovery_targets reset after recovery is complete
2132 ceph_assert(
2133 async_recovery_targets.empty() ||
2134 async_recovery_targets == want_async_recovery ||
2135 !needs_recovery());
2136 if (async_recovery_targets.empty() || !needs_recovery()) {
2137 async_recovery_targets = want_async_recovery;
2138 }
2139 // Will not change if already set because up would have had to change
2140 // Verify that nothing in backfill is in stray_set
2141 for (set<pg_shard_t>::iterator i = want_backfill.begin();
2142 i != want_backfill.end();
2143 ++i) {
2144 ceph_assert(stray_set.find(*i) == stray_set.end());
2145 }
2146 psdout(10) << "choose_acting want=" << want << " backfill_targets="
2147 << want_backfill << " async_recovery_targets="
2148 << async_recovery_targets << dendl;
2149 return true;
2150 }
2151
2152 void PeeringState::log_weirdness()
2153 {
2154 if (pg_log.get_tail() != info.log_tail)
2155 pl->get_clog_error() << info.pgid
2156 << " info mismatch, log.tail " << pg_log.get_tail()
2157 << " != info.log_tail " << info.log_tail;
2158 if (pg_log.get_head() != info.last_update)
2159 pl->get_clog_error() << info.pgid
2160 << " info mismatch, log.head " << pg_log.get_head()
2161 << " != info.last_update " << info.last_update;
2162
2163 if (!pg_log.get_log().empty()) {
2164 // sloppy check
2165 if ((pg_log.get_log().log.begin()->version <= pg_log.get_tail()))
2166 pl->get_clog_error() << info.pgid
2167 << " log bound mismatch, info (tail,head] ("
2168 << pg_log.get_tail() << ","
2169 << pg_log.get_head() << "]"
2170 << " actual ["
2171 << pg_log.get_log().log.begin()->version << ","
2172 << pg_log.get_log().log.rbegin()->version << "]";
2173 }
2174
2175 if (pg_log.get_log().caller_ops.size() > pg_log.get_log().log.size()) {
2176 pl->get_clog_error() << info.pgid
2177 << " caller_ops.size "
2178 << pg_log.get_log().caller_ops.size()
2179 << " > log size " << pg_log.get_log().log.size();
2180 }
2181 }
2182
2183 /*
2184 * Process information from a replica to determine if it could have any
2185 * objects that I need.
2186 *
2187 * TODO: if the missing set becomes very large, this could get expensive.
2188 * Instead, we probably want to just iterate over our unfound set.
2189 */
2190 bool PeeringState::search_for_missing(
2191 const pg_info_t &oinfo, const pg_missing_t &omissing,
2192 pg_shard_t from,
2193 PeeringCtxWrapper &ctx)
2194 {
2195 uint64_t num_unfound_before = missing_loc.num_unfound();
2196 bool found_missing = missing_loc.add_source_info(
2197 from, oinfo, omissing, ctx.handle);
2198 if (found_missing && num_unfound_before != missing_loc.num_unfound())
2199 pl->publish_stats_to_osd();
2200 // avoid doing this if the peer is empty. This is a bit of paranoia
2201 // to avoid doing something rash if add_source_info() above
2202 // incorrectly decided we found something new. (if the peer has
2203 // last_update=0'0 that's impossible.)
2204 if (found_missing &&
2205 oinfo.last_update != eversion_t()) {
2206 pg_info_t tinfo(oinfo);
2207 tinfo.pgid.shard = pg_whoami.shard;
2208 ctx.send_info(
2209 from.osd,
2210 spg_t(info.pgid.pgid, from.shard),
2211 get_osdmap_epoch(), // fixme: use lower epoch?
2212 get_osdmap_epoch(),
2213 tinfo);
2214 }
2215 return found_missing;
2216 }
2217
2218 bool PeeringState::discover_all_missing(
2219 BufferedRecoveryMessages &rctx)
2220 {
2221 auto &missing = pg_log.get_missing();
2222 uint64_t unfound = get_num_unfound();
2223 bool any = false; // did we start any queries
2224
2225 psdout(10) << __func__ << " "
2226 << missing.num_missing() << " missing, "
2227 << unfound << " unfound"
2228 << dendl;
2229
2230 std::set<pg_shard_t>::const_iterator m = might_have_unfound.begin();
2231 std::set<pg_shard_t>::const_iterator mend = might_have_unfound.end();
2232 for (; m != mend; ++m) {
2233 pg_shard_t peer(*m);
2234
2235 if (!get_osdmap()->is_up(peer.osd)) {
2236 psdout(20) << __func__ << " skipping down osd." << peer << dendl;
2237 continue;
2238 }
2239
2240 if (peer_purged.count(peer)) {
2241 psdout(20) << __func__ << " skipping purged osd." << peer << dendl;
2242 continue;
2243 }
2244
2245 map<pg_shard_t, pg_info_t>::const_iterator iter = peer_info.find(peer);
2246 if (iter != peer_info.end() &&
2247 (iter->second.is_empty() || iter->second.dne())) {
2248 // ignore empty peers
2249 continue;
2250 }
2251
2252 // If we've requested any of this stuff, the pg_missing_t information
2253 // should be on its way.
2254 // TODO: coalesce requested_* into a single data structure
2255 if (peer_missing.find(peer) != peer_missing.end()) {
2256 psdout(20) << __func__ << ": osd." << peer
2257 << ": we already have pg_missing_t" << dendl;
2258 continue;
2259 }
2260 if (peer_log_requested.find(peer) != peer_log_requested.end()) {
2261 psdout(20) << __func__ << ": osd." << peer
2262 << ": in peer_log_requested" << dendl;
2263 continue;
2264 }
2265 if (peer_missing_requested.find(peer) != peer_missing_requested.end()) {
2266 psdout(20) << __func__ << ": osd." << peer
2267 << ": in peer_missing_requested" << dendl;
2268 continue;
2269 }
2270
2271 // Request missing
2272 psdout(10) << __func__ << ": osd." << peer << ": requesting pg_missing_t"
2273 << dendl;
2274 peer_missing_requested.insert(peer);
2275 rctx.send_query(
2276 peer.osd,
2277 spg_t(info.pgid.pgid, peer.shard),
2278 pg_query_t(
2279 pg_query_t::FULLLOG,
2280 peer.shard, pg_whoami.shard,
2281 info.history, get_osdmap_epoch()));
2282 any = true;
2283 }
2284 return any;
2285 }
2286
2287 /* Build the might_have_unfound set.
2288 *
2289 * This is used by the primary OSD during recovery.
2290 *
2291 * This set tracks the OSDs which might have unfound objects that the primary
2292 * OSD needs. As we receive pg_missing_t from each OSD in might_have_unfound, we
2293 * will remove the OSD from the set.
2294 */
2295 void PeeringState::build_might_have_unfound()
2296 {
2297 ceph_assert(might_have_unfound.empty());
2298 ceph_assert(is_primary());
2299
2300 psdout(10) << __func__ << dendl;
2301
2302 check_past_interval_bounds();
2303
2304 might_have_unfound = past_intervals.get_might_have_unfound(
2305 pg_whoami,
2306 pool.info.is_erasure());
2307
2308 // include any (stray) peers
2309 for (map<pg_shard_t, pg_info_t>::iterator p = peer_info.begin();
2310 p != peer_info.end();
2311 ++p)
2312 might_have_unfound.insert(p->first);
2313
2314 psdout(15) << __func__ << ": built " << might_have_unfound << dendl;
2315 }
2316
2317 void PeeringState::activate(
2318 ObjectStore::Transaction& t,
2319 epoch_t activation_epoch,
2320 PeeringCtxWrapper &ctx)
2321 {
2322 ceph_assert(!is_peered());
2323
2324 // twiddle pg state
2325 state_clear(PG_STATE_DOWN);
2326
2327 send_notify = false;
2328
2329 if (is_primary()) {
2330 // only update primary last_epoch_started if we will go active
2331 if (acting.size() >= pool.info.min_size) {
2332 ceph_assert(cct->_conf->osd_find_best_info_ignore_history_les ||
2333 info.last_epoch_started <= activation_epoch);
2334 info.last_epoch_started = activation_epoch;
2335 info.last_interval_started = info.history.same_interval_since;
2336 }
2337 } else if (is_acting(pg_whoami)) {
2338 /* update last_epoch_started on acting replica to whatever the primary sent
2339 * unless it's smaller (could happen if we are going peered rather than
2340 * active, see doc/dev/osd_internals/last_epoch_started.rst) */
2341 if (info.last_epoch_started < activation_epoch) {
2342 info.last_epoch_started = activation_epoch;
2343 info.last_interval_started = info.history.same_interval_since;
2344 }
2345 }
2346
2347 auto &missing = pg_log.get_missing();
2348
2349 min_last_complete_ondisk = eversion_t(0,0); // we don't know (yet)!
2350 if (is_primary()) {
2351 last_update_ondisk = info.last_update;
2352 }
2353 last_update_applied = info.last_update;
2354 last_rollback_info_trimmed_to_applied = pg_log.get_can_rollback_to();
2355
2356 need_up_thru = false;
2357
2358 // write pg info, log
2359 dirty_info = true;
2360 dirty_big_info = true; // maybe
2361
2362 pl->schedule_event_on_commit(
2363 t,
2364 std::make_shared<PGPeeringEvent>(
2365 get_osdmap_epoch(),
2366 get_osdmap_epoch(),
2367 ActivateCommitted(
2368 get_osdmap_epoch(),
2369 activation_epoch)));
2370
2371 // init complete pointer
2372 if (missing.num_missing() == 0) {
2373 psdout(10) << "activate - no missing, moving last_complete " << info.last_complete
2374 << " -> " << info.last_update << dendl;
2375 info.last_complete = info.last_update;
2376 info.stats.stats.sum.num_objects_missing = 0;
2377 pg_log.reset_recovery_pointers();
2378 } else {
2379 psdout(10) << "activate - not complete, " << missing << dendl;
2380 info.stats.stats.sum.num_objects_missing = missing.num_missing();
2381 pg_log.activate_not_complete(info);
2382 }
2383
2384 log_weirdness();
2385
2386 if (is_primary()) {
2387 // initialize snap_trimq
2388 interval_set<snapid_t> to_trim;
2389 auto& removed_snaps_queue = get_osdmap()->get_removed_snaps_queue();
2390 auto p = removed_snaps_queue.find(info.pgid.pgid.pool());
2391 if (p != removed_snaps_queue.end()) {
2392 dout(20) << "activate - purged_snaps " << info.purged_snaps
2393 << " removed_snaps " << p->second
2394 << dendl;
2395 for (auto q : p->second) {
2396 to_trim.insert(q.first, q.second);
2397 }
2398 }
2399 interval_set<snapid_t> purged;
2400 purged.intersection_of(to_trim, info.purged_snaps);
2401 to_trim.subtract(purged);
2402
2403 if (HAVE_FEATURE(upacting_features, SERVER_OCTOPUS)) {
2404 renew_lease(pl->get_mnow());
2405 // do not schedule until we are actually activated
2406 }
2407
2408 // adjust purged_snaps: the PG may have been inactive while snaps were
2409 // pruned from the removed_snaps_queue in the osdmap. update local
2410 // purged_snaps to reflect only those snaps that we thought were pruned
2411 // and were still in the queue.
2412 info.purged_snaps.swap(purged);
2413
2414 // start up replicas
2415 info.history.refresh_prior_readable_until_ub(pl->get_mnow(),
2416 prior_readable_until_ub);
2417
2418 ceph_assert(!acting_recovery_backfill.empty());
2419 for (set<pg_shard_t>::iterator i = acting_recovery_backfill.begin();
2420 i != acting_recovery_backfill.end();
2421 ++i) {
2422 if (*i == pg_whoami) continue;
2423 pg_shard_t peer = *i;
2424 ceph_assert(peer_info.count(peer));
2425 pg_info_t& pi = peer_info[peer];
2426
2427 psdout(10) << "activate peer osd." << peer << " " << pi << dendl;
2428
2429 MOSDPGLog *m = 0;
2430 ceph_assert(peer_missing.count(peer));
2431 pg_missing_t& pm = peer_missing[peer];
2432
2433 bool needs_past_intervals = pi.dne();
2434
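  // The peer falls into one of three cases (sketch): (1) identical
  // last_update -> send info (or an empty log), (2) too far behind our
  // log tail or mid-backfill -> restart/continue backfill, or
  // (3) contiguous with our log -> send just the missing log segment.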
2435 if (pi.last_update == info.last_update) {
2436 // empty log
2437 if (!pi.last_backfill.is_max())
2438 pl->get_clog_info() << info.pgid << " continuing backfill to osd."
2439 << peer
2440 << " from (" << pi.log_tail << "," << pi.last_update
2441 << "] " << pi.last_backfill
2442 << " to " << info.last_update;
2443 if (!pi.is_empty()) {
2444 psdout(10) << "activate peer osd." << peer
2445 << " is up to date, queueing in pending_activators" << dendl;
2446 ctx.send_info(
2447 peer.osd,
2448 spg_t(info.pgid.pgid, peer.shard),
2449 get_osdmap_epoch(), // fixme: use lower epoch?
2450 get_osdmap_epoch(),
2451 info,
2452 get_lease());
2453 } else {
2454 psdout(10) << "activate peer osd." << peer
2455 << " is up to date, but sending pg_log anyway" << dendl;
2456 m = new MOSDPGLog(
2457 i->shard, pg_whoami.shard,
2458 get_osdmap_epoch(), info,
2459 last_peering_reset);
2460 }
2461 } else if (
2462 pg_log.get_tail() > pi.last_update ||
2463 pi.last_backfill == hobject_t() ||
2464 (backfill_targets.count(*i) && pi.last_backfill.is_max())) {
2465 /* ^ This last case covers a situation where a replica is not contiguous
2466 * with the auth_log, but is contiguous with our (the primary's) log.
2467 * Reshuffling the active set to handle this would be tricky, so instead
2468 * we just go ahead and backfill it anyway. This is probably preferable
2469 * in any case since the replica in question would have to be
2470 * significantly behind.
2471 */
2472 // backfill
2473 pl->get_clog_debug() << info.pgid << " starting backfill to osd." << peer
2474 << " from (" << pi.log_tail << "," << pi.last_update
2475 << "] " << pi.last_backfill
2476 << " to " << info.last_update;
2477
2478 pi.last_update = info.last_update;
2479 pi.last_complete = info.last_update;
2480 pi.set_last_backfill(hobject_t());
2481 pi.last_epoch_started = info.last_epoch_started;
2482 pi.last_interval_started = info.last_interval_started;
2483 pi.history = info.history;
2484 pi.hit_set = info.hit_set;
2485 // Save num_bytes for reservation request, can't be negative
2486 peer_bytes[peer] = std::max<int64_t>(0, pi.stats.stats.sum.num_bytes);
2487 pi.stats.stats.clear();
2488 pi.stats.stats.sum.num_bytes = peer_bytes[peer];
2489
2490 // initialize peer with our purged_snaps.
2491 pi.purged_snaps = info.purged_snaps;
2492
2493 m = new MOSDPGLog(
2494 i->shard, pg_whoami.shard,
2495 get_osdmap_epoch(), pi,
2496 last_peering_reset /* epoch to create pg at */);
2497
2498 // send some recent log, so that op dup detection works well.
2499 m->log.copy_up_to(cct, pg_log.get_log(),
2500 cct->_conf->osd_max_pg_log_entries);
2501 m->info.log_tail = m->log.tail;
2502 pi.log_tail = m->log.tail; // sigh...
2503
2504 pm.clear();
2505 } else {
2506 // catch up
2507 ceph_assert(pg_log.get_tail() <= pi.last_update);
2508 m = new MOSDPGLog(
2509 i->shard, pg_whoami.shard,
2510 get_osdmap_epoch(), info,
2511 last_peering_reset /* epoch to create pg at */);
2512 // send new stuff to append to replicas log
2513 m->log.copy_after(cct, pg_log.get_log(), pi.last_update);
2514 }
2515
2516 // share past_intervals if we are creating the pg on the replica
2517 // based on whether our info for that peer was dne() *before*
2518 // updating pi.history in the backfill block above.
2519 if (m && needs_past_intervals)
2520 m->past_intervals = past_intervals;
2521
2522 // update local version of peer's missing list!
2523 if (m && pi.last_backfill != hobject_t()) {
2524 for (list<pg_log_entry_t>::iterator p = m->log.log.begin();
2525 p != m->log.log.end();
2526 ++p) {
2527 if (p->soid <= pi.last_backfill &&
2528 !p->is_error()) {
2529 if (perform_deletes_during_peering() && p->is_delete()) {
2530 pm.rm(p->soid, p->version);
2531 } else {
2532 pm.add_next_event(*p);
2533 }
2534 }
2535 }
2536 }
2537
2538 if (m) {
2539 dout(10) << "activate peer osd." << peer << " sending " << m->log
2540 << dendl;
2541 m->lease = get_lease();
2542 pl->send_cluster_message(peer.osd, m, get_osdmap_epoch());
2543 }
2544
2545 // peer now has
2546 pi.last_update = info.last_update;
2547
2548 // update our missing
2549 if (pm.num_missing() == 0) {
2550 pi.last_complete = pi.last_update;
2551 psdout(10) << "activate peer osd." << peer << " " << pi
2552 << " uptodate" << dendl;
2553 } else {
2554 psdout(10) << "activate peer osd." << peer << " " << pi
2555 << " missing " << pm << dendl;
2556 }
2557 }
2558
2559 // Set up missing_loc
2560 set<pg_shard_t> complete_shards;
2561 for (set<pg_shard_t>::iterator i = acting_recovery_backfill.begin();
2562 i != acting_recovery_backfill.end();
2563 ++i) {
2564 psdout(20) << __func__ << " setting up missing_loc from shard " << *i
2565 << " " << dendl;
2566 if (*i == get_primary()) {
2567 missing_loc.add_active_missing(missing);
2568 if (!missing.have_missing())
2569 complete_shards.insert(*i);
2570 } else {
2571 auto peer_missing_entry = peer_missing.find(*i);
2572 ceph_assert(peer_missing_entry != peer_missing.end());
2573 missing_loc.add_active_missing(peer_missing_entry->second);
2574 if (!peer_missing_entry->second.have_missing() &&
2575 peer_info[*i].last_backfill.is_max())
2576 complete_shards.insert(*i);
2577 }
2578 }
2579
2580 // If necessary, create might_have_unfound to help us find our unfound objects.
2581 // NOTE: It's important that we build might_have_unfound before trimming the
2582 // past intervals.
2583 might_have_unfound.clear();
2584 if (needs_recovery()) {
2585 // If only one shard has missing objects, we can add all of the others as
2586 // recovery sources. This is considered safe since the PGLogs have been
2587 // merged locally, and it covers the vast majority of use cases, such as
2588 // one OSD/host being down for a while for hardware repair.
2589 if (complete_shards.size() + 1 == acting_recovery_backfill.size()) {
2590 missing_loc.add_batch_sources_info(complete_shards, ctx.handle);
2591 } else {
2592 missing_loc.add_source_info(pg_whoami, info, pg_log.get_missing(),
2593 ctx.handle);
2594 for (set<pg_shard_t>::iterator i = acting_recovery_backfill.begin();
2595 i != acting_recovery_backfill.end();
2596 ++i) {
2597 if (*i == pg_whoami) continue;
2598 psdout(10) << __func__ << ": adding " << *i << " as a source" << dendl;
2599 ceph_assert(peer_missing.count(*i));
2600 ceph_assert(peer_info.count(*i));
2601 missing_loc.add_source_info(
2602 *i,
2603 peer_info[*i],
2604 peer_missing[*i],
2605 ctx.handle);
2606 }
2607 }
2608 for (map<pg_shard_t, pg_missing_t>::iterator i = peer_missing.begin();
2609 i != peer_missing.end();
2610 ++i) {
2611 if (is_acting_recovery_backfill(i->first))
2612 continue;
2613 ceph_assert(peer_info.count(i->first));
2614 search_for_missing(
2615 peer_info[i->first],
2616 i->second,
2617 i->first,
2618 ctx);
2619 }
2620
2621 build_might_have_unfound();
2622
2623 // Always call now so update_calc_stats() will be accurate
2624 discover_all_missing(ctx.msgs);
2625
2626 }
2627
2628 // num_objects_degraded, if calculated, should reflect this too, unless
2629 // nothing is missing and we are about to go clean.
2630 if (get_osdmap()->get_pg_size(info.pgid.pgid) > actingset.size()) {
2631 state_set(PG_STATE_UNDERSIZED);
2632 }
2633
2634 state_set(PG_STATE_ACTIVATING);
2635 pl->on_activate(std::move(to_trim));
2636 }
2637 if (acting.size() >= pool.info.min_size) {
2638 PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
2639 pg_log.roll_forward(rollbacker.get());
2640 }
2641 }
2642
2643 void PeeringState::share_pg_info()
2644 {
2645 psdout(10) << "share_pg_info" << dendl;
2646
2647 info.history.refresh_prior_readable_until_ub(pl->get_mnow(),
2648 prior_readable_until_ub);
2649
2650 // share new pg_info_t with replicas
2651 ceph_assert(!acting_recovery_backfill.empty());
2652 for (auto pg_shard : acting_recovery_backfill) {
2653 if (pg_shard == pg_whoami) continue;
2654 if (auto peer = peer_info.find(pg_shard); peer != peer_info.end()) {
2655 peer->second.last_epoch_started = info.last_epoch_started;
2656 peer->second.last_interval_started = info.last_interval_started;
2657 peer->second.history.merge(info.history);
2658 }
2659 Message* m = nullptr;
2660 if (last_require_osd_release >= ceph_release_t::octopus) {
2661 m = new MOSDPGInfo2{spg_t{info.pgid.pgid, pg_shard.shard},
2662 info,
2663 get_osdmap_epoch(),
2664 get_osdmap_epoch(),
2665 get_lease(), {}};
2666 } else {
2667 m = new MOSDPGInfo{get_osdmap_epoch(),
2668 {pg_notify_t{pg_shard.shard,
2669 pg_whoami.shard,
2670 get_osdmap_epoch(),
2671 get_osdmap_epoch(),
2672 info,
2673 past_intervals}}};
2674 }
2675 pl->send_cluster_message(pg_shard.osd, m, get_osdmap_epoch());
2676 }
2677 }
2678
2679 void PeeringState::merge_log(
2680 ObjectStore::Transaction& t, pg_info_t &oinfo, pg_log_t &olog,
2681 pg_shard_t from)
2682 {
2683 PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
2684 pg_log.merge_log(
2685 oinfo, olog, from, info, rollbacker.get(), dirty_info, dirty_big_info);
2686 }
2687
2688 void PeeringState::rewind_divergent_log(
2689 ObjectStore::Transaction& t, eversion_t newhead)
2690 {
2691 PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
2692 pg_log.rewind_divergent_log(
2693 newhead, info, rollbacker.get(), dirty_info, dirty_big_info);
2694 }
2695
2696
2697 void PeeringState::proc_primary_info(
2698 ObjectStore::Transaction &t, const pg_info_t &oinfo)
2699 {
2700 ceph_assert(!is_primary());
2701
2702 update_history(oinfo.history);
2703 if (!info.stats.stats_invalid && info.stats.stats.sum.num_scrub_errors) {
2704 info.stats.stats.sum.num_scrub_errors = 0;
2705 info.stats.stats.sum.num_shallow_scrub_errors = 0;
2706 info.stats.stats.sum.num_deep_scrub_errors = 0;
2707 dirty_info = true;
2708 }
2709
2710 if (!(info.purged_snaps == oinfo.purged_snaps)) {
2711 psdout(10) << __func__ << " updating purged_snaps to "
2712 << oinfo.purged_snaps
2713 << dendl;
2714 info.purged_snaps = oinfo.purged_snaps;
2715 dirty_info = true;
2716 dirty_big_info = true;
2717 }
2718 }
2719
2720 void PeeringState::proc_master_log(
2721 ObjectStore::Transaction& t, pg_info_t &oinfo,
2722 pg_log_t &olog, pg_missing_t& omissing, pg_shard_t from)
2723 {
2724 psdout(10) << "proc_master_log for osd." << from << ": "
2725 << olog << " " << omissing << dendl;
2726 ceph_assert(!is_peered() && is_primary());
2727
2728 // merge log into our own log to build master log. no need to
2729 // make any adjustments to their missing map; we are taking their
2730 // log to be authoritative (i.e., their entries are by definition
2731 // non-divergent).
2732 merge_log(t, oinfo, olog, from);
2733 peer_info[from] = oinfo;
2734 psdout(10) << " peer osd." << from << " now " << oinfo
2735 << " " << omissing << dendl;
2736 might_have_unfound.insert(from);
2737
2738 // See doc/dev/osd_internals/last_epoch_started
2739 if (oinfo.last_epoch_started > info.last_epoch_started) {
2740 info.last_epoch_started = oinfo.last_epoch_started;
2741 dirty_info = true;
2742 }
2743 if (oinfo.last_interval_started > info.last_interval_started) {
2744 info.last_interval_started = oinfo.last_interval_started;
2745 dirty_info = true;
2746 }
2747 update_history(oinfo.history);
2748 ceph_assert(cct->_conf->osd_find_best_info_ignore_history_les ||
2749 info.last_epoch_started >= info.history.last_epoch_started);
2750
2751 peer_missing[from].claim(omissing);
2752 }
2753
2754 void PeeringState::proc_replica_log(
2755 pg_info_t &oinfo,
2756 const pg_log_t &olog,
2757 pg_missing_t& omissing,
2758 pg_shard_t from)
2759 {
2760 psdout(10) << "proc_replica_log for osd." << from << ": "
2761 << oinfo << " " << olog << " " << omissing << dendl;
2762
2763 pg_log.proc_replica_log(oinfo, olog, omissing, from);
2764
2765 peer_info[from] = oinfo;
2766 psdout(10) << " peer osd." << from << " now "
2767 << oinfo << " " << omissing << dendl;
2768 might_have_unfound.insert(from);
2769
2770 for (map<hobject_t, pg_missing_item>::const_iterator i =
2771 omissing.get_items().begin();
2772 i != omissing.get_items().end();
2773 ++i) {
2774 psdout(20) << " after missing " << i->first
2775 << " need " << i->second.need
2776 << " have " << i->second.have << dendl;
2777 }
2778 peer_missing[from].claim(omissing);
2779 }
2780
2781 void PeeringState::fulfill_info(
2782 pg_shard_t from, const pg_query_t &query,
2783 pair<pg_shard_t, pg_info_t> &notify_info)
2784 {
2785 ceph_assert(from == primary);
2786 ceph_assert(query.type == pg_query_t::INFO);
2787
2788 // info
2789 psdout(10) << "sending info" << dendl;
2790 notify_info = make_pair(from, info);
2791 }
2792
2793 void PeeringState::fulfill_log(
2794 pg_shard_t from, const pg_query_t &query, epoch_t query_epoch)
2795 {
2796 psdout(10) << "log request from " << from << dendl;
2797 ceph_assert(from == primary);
2798 ceph_assert(query.type != pg_query_t::INFO);
2799
2800 MOSDPGLog *mlog = new MOSDPGLog(
2801 from.shard, pg_whoami.shard,
2802 get_osdmap_epoch(),
2803 info, query_epoch);
2804 mlog->missing = pg_log.get_missing();
2805
2806 // primary -> other, when building master log
2807 if (query.type == pg_query_t::LOG) {
2808 psdout(10) << " sending info+missing+log since " << query.since
2809 << dendl;
2810 if (query.since != eversion_t() && query.since < pg_log.get_tail()) {
2811 pl->get_clog_error() << info.pgid << " got broken pg_query_t::LOG since "
2812 << query.since
2813 << " when my log.tail is " << pg_log.get_tail()
2814 << ", sending full log instead";
2815 mlog->log = pg_log.get_log(); // primary should not have requested this!!
2816 } else
2817 mlog->log.copy_after(cct, pg_log.get_log(), query.since);
2818 }
2819 else if (query.type == pg_query_t::FULLLOG) {
2820 psdout(10) << " sending info+missing+full log" << dendl;
2821 mlog->log = pg_log.get_log();
2822 }
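  // For illustration (assumed eversions): if our log spans (30'100, 30'250]
  // and a pg_query_t::LOG asks for since = 30'120, we copy the entries in
  // (30'120, 30'250]; a since of 30'050 (older than our tail) triggers the
  // full-log fallback above.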
2823
2824 psdout(10) << " sending " << mlog->log << " " << mlog->missing << dendl;
2825
2826 pl->send_cluster_message(from.osd, mlog, get_osdmap_epoch(), true);
2827 }
2828
2829 void PeeringState::fulfill_query(const MQuery& query, PeeringCtxWrapper &rctx)
2830 {
2831 if (query.query.type == pg_query_t::INFO) {
2832 pair<pg_shard_t, pg_info_t> notify_info;
2833 // note this refreshes our prior_readable_until_ub value
2834 update_history(query.query.history);
2835 fulfill_info(query.from, query.query, notify_info);
2836 rctx.send_notify(
2837 notify_info.first.osd,
2838 pg_notify_t(
2839 notify_info.first.shard, pg_whoami.shard,
2840 query.query_epoch,
2841 get_osdmap_epoch(),
2842 notify_info.second,
2843 past_intervals));
2844 } else {
2845 update_history(query.query.history);
2846 fulfill_log(query.from, query.query, query.query_epoch);
2847 }
2848 }
2849
2850 void PeeringState::try_mark_clean()
2851 {
2852 if (actingset.size() == get_osdmap()->get_pg_size(info.pgid.pgid)) {
2853 state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
2854 state_set(PG_STATE_CLEAN);
2855 info.history.last_epoch_clean = get_osdmap_epoch();
2856 info.history.last_interval_clean = info.history.same_interval_since;
2857 past_intervals.clear();
2858 dirty_big_info = true;
2859 dirty_info = true;
2860 }
2861
2862 if (!is_active() && is_peered()) {
2863 if (is_clean()) {
2864 bool target;
2865 if (pool.info.is_pending_merge(info.pgid.pgid, &target)) {
2866 if (target) {
2867 psdout(10) << "ready to merge (target)" << dendl;
2868 pl->set_ready_to_merge_target(
2869 info.last_update,
2870 info.history.last_epoch_started,
2871 info.history.last_epoch_clean);
2872 } else {
2873 psdout(10) << "ready to merge (source)" << dendl;
2874 pl->set_ready_to_merge_source(info.last_update);
2875 }
2876 }
2877 } else {
2878 psdout(10) << "not clean, not ready to merge" << dendl;
2879 // we should have notified OSD in Active state entry point
2880 }
2881 }
2882
2883 state_clear(PG_STATE_FORCED_RECOVERY | PG_STATE_FORCED_BACKFILL);
2884
2885 share_pg_info();
2886 pl->publish_stats_to_osd();
2887 clear_recovery_state();
2888 }
2889
2890 void PeeringState::split_into(
2891 pg_t child_pgid, PeeringState *child, unsigned split_bits)
2892 {
2893 child->update_osdmap_ref(get_osdmap());
2894 child->pool = pool;
2895
2896 // Log
2897 pg_log.split_into(child_pgid, split_bits, &(child->pg_log));
2898 child->info.last_complete = info.last_complete;
2899
2900 info.last_update = pg_log.get_head();
2901 child->info.last_update = child->pg_log.get_head();
2902
2903 child->info.last_user_version = info.last_user_version;
2904
2905 info.log_tail = pg_log.get_tail();
2906 child->info.log_tail = child->pg_log.get_tail();
2907
2908 // reset last_complete, we might have modified pg_log & missing above
2909 pg_log.reset_complete_to(&info);
2910 child->pg_log.reset_complete_to(&child->info);
2911
2912 // Info
2913 child->info.history = info.history;
2914 child->info.history.epoch_created = get_osdmap_epoch();
2915 child->info.purged_snaps = info.purged_snaps;
2916
2917 if (info.last_backfill.is_max()) {
2918 child->info.set_last_backfill(hobject_t::get_max());
2919 } else {
2920 // restart backfill on parent and child to be safe. we could
2921 // probably do better in the bitwise sort case, but it's more
2922 // fragile (there may be special work to do on backfill completion
2923 // in the future).
2924 info.set_last_backfill(hobject_t());
2925 child->info.set_last_backfill(hobject_t());
2926 // restarting backfill implies that the missing set is empty,
2927 // since it is only used for objects prior to last_backfill
2928 pg_log.reset_backfill();
2929 child->pg_log.reset_backfill();
2930 }
2931
2932 child->info.stats = info.stats;
2933 child->info.stats.parent_split_bits = split_bits;
2934 info.stats.stats_invalid = true;
2935 child->info.stats.stats_invalid = true;
2936 child->info.last_epoch_started = info.last_epoch_started;
2937 child->info.last_interval_started = info.last_interval_started;
2938
2939 // There can't be recovery/backfill going on now
2940 int primary, up_primary;
2941 vector<int> newup, newacting;
2942 get_osdmap()->pg_to_up_acting_osds(
2943 child->info.pgid.pgid, &newup, &up_primary, &newacting, &primary);
2944 child->init_primary_up_acting(
2945 newup,
2946 newacting,
2947 up_primary,
2948 primary);
2949 child->role = OSDMap::calc_pg_role(pg_whoami, child->acting);
2950
2951 // this comparison includes primary rank via pg_shard_t
2952 if (get_primary() != child->get_primary())
2953 child->info.history.same_primary_since = get_osdmap_epoch();
2954
2955 child->info.stats.up = up;
2956 child->info.stats.up_primary = up_primary;
2957 child->info.stats.acting = acting;
2958 child->info.stats.acting_primary = primary;
2959 child->info.stats.mapping_epoch = get_osdmap_epoch();
2960
2961 // History
2962 child->past_intervals = past_intervals;
2963
2964 child->on_new_interval();
2965
2966 child->send_notify = !child->is_primary();
2967
2968 child->dirty_info = true;
2969 child->dirty_big_info = true;
2970 dirty_info = true;
2971 dirty_big_info = true;
2972 }
2973
2974 void PeeringState::merge_from(
2975 map<spg_t,PeeringState *>& sources,
2976 PeeringCtx &rctx,
2977 unsigned split_bits,
2978 const pg_merge_meta_t& last_pg_merge_meta)
2979 {
2980 bool incomplete = false;
2981 if (info.last_complete != info.last_update ||
2982 info.is_incomplete() ||
2983 info.dne()) {
2984 psdout(10) << __func__ << " target incomplete" << dendl;
2985 incomplete = true;
2986 }
2987 if (last_pg_merge_meta.source_pgid != pg_t()) {
2988 if (info.pgid.pgid != last_pg_merge_meta.source_pgid.get_parent()) {
2989 psdout(10) << __func__ << " target doesn't match expected parent "
2990 << last_pg_merge_meta.source_pgid.get_parent()
2991 << " of source_pgid " << last_pg_merge_meta.source_pgid
2992 << dendl;
2993 incomplete = true;
2994 }
2995 if (info.last_update != last_pg_merge_meta.target_version) {
2996 psdout(10) << __func__ << " target version doesn't match expected "
2997 << last_pg_merge_meta.target_version << dendl;
2998 incomplete = true;
2999 }
3000 }
3001
3002 PGLog::LogEntryHandlerRef handler{pl->get_log_handler(rctx.transaction)};
3003 pg_log.roll_forward(handler.get());
3004
3005 info.last_complete = info.last_update; // to fake out trim()
3006 pg_log.reset_recovery_pointers();
3007 pg_log.trim(info.last_update, info);
3008
3009 vector<PGLog*> log_from;
3010 for (auto& i : sources) {
3011 auto& source = i.second;
3012 if (!source) {
3013 psdout(10) << __func__ << " source " << i.first << " missing" << dendl;
3014 incomplete = true;
3015 continue;
3016 }
3017 if (source->info.last_complete != source->info.last_update ||
3018 source->info.is_incomplete() ||
3019 source->info.dne()) {
3020 psdout(10) << __func__ << " source " << source->pg_whoami
3021 << " incomplete"
3022 << dendl;
3023 incomplete = true;
3024 }
3025 if (last_pg_merge_meta.source_pgid != pg_t()) {
3026 if (source->info.pgid.pgid != last_pg_merge_meta.source_pgid) {
3027 dout(10) << __func__ << " source " << source->info.pgid.pgid
3028 << " doesn't match expected source pgid "
3029 << last_pg_merge_meta.source_pgid << dendl;
3030 incomplete = true;
3031 }
3032 if (source->info.last_update != last_pg_merge_meta.source_version) {
3033 dout(10) << __func__ << " source version doesn't match expected "
3034 << last_pg_merge_meta.source_version << dendl;
3035 incomplete = true;
3036 }
3037 }
3038
3039 // prepare log
3040 PGLog::LogEntryHandlerRef handler{
3041 source->pl->get_log_handler(rctx.transaction)};
3042 source->pg_log.roll_forward(handler.get());
3043 source->info.last_complete = source->info.last_update; // to fake out trim()
3044 source->pg_log.reset_recovery_pointers();
3045 source->pg_log.trim(source->info.last_update, source->info);
3046 log_from.push_back(&source->pg_log);
3047
3048 // combine stats
3049 info.stats.add(source->info.stats);
3050
3051 // pull up last_update
3052 info.last_update = std::max(info.last_update, source->info.last_update);
3053
3054 // adopt source's PastIntervals if target has none. we can do this since
3055 // pgp_num has been reduced prior to the merge, so the OSD mappings for
3056 // the PGs are identical.
3057 if (past_intervals.empty() && !source->past_intervals.empty()) {
3058 psdout(10) << __func__ << " taking source's past_intervals" << dendl;
3059 past_intervals = source->past_intervals;
3060 }
3061 }
3062
3063 info.last_complete = info.last_update;
3064 info.log_tail = info.last_update;
3065 if (incomplete) {
3066 info.last_backfill = hobject_t();
3067 }
3068
3069 // merge logs
3070 pg_log.merge_from(log_from, info.last_update);
3071
3072 // make sure we have a meaningful last_epoch_started/clean (if we were a
3073 // placeholder)
3074 if (info.history.epoch_created == 0) {
3075 // start with (a) source's history, since these PGs *should* have been
3076 // remapped in concert with each other...
3077 info.history = sources.begin()->second->info.history;
3078
3079 // we use the last_epoch_{started,clean} we got from
3080 // the caller, which are the epochs that were reported when the PGs
3081 // were found to be ready for merge.
3082 info.history.last_epoch_clean = last_pg_merge_meta.last_epoch_clean;
3083 info.history.last_epoch_started = last_pg_merge_meta.last_epoch_started;
3084 info.last_epoch_started = last_pg_merge_meta.last_epoch_started;
3085 psdout(10) << __func__
3086 << " set les/c to " << last_pg_merge_meta.last_epoch_started << "/"
3087 << last_pg_merge_meta.last_epoch_clean
3088 << " from pool last_dec_*, source pg history was "
3089 << sources.begin()->second->info.history
3090 << dendl;
3091
3092 // if the past_intervals start is later than last_epoch_clean, it
3093 // implies the source repeered again but the target didn't, or
3094 // that the source became clean in a later epoch than the target.
3095 // avoid the discrepancy by adjusting the interval start
3096 // backwards to match so that check_past_interval_bounds() will
3097 // not complain.
3098 auto pib = past_intervals.get_bounds();
3099 if (info.history.last_epoch_clean < pib.first) {
3100 psdout(10) << __func__ << " last_epoch_clean "
3101 << info.history.last_epoch_clean << " < past_interval start "
3102 << pib.first << ", adjusting start backwards" << dendl;
3103 past_intervals.adjust_start_backwards(info.history.last_epoch_clean);
3104 }
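  // e.g. (illustrative epochs): last_epoch_clean = 1000 with a
  // past_intervals start of 1005 would be rewound to start at 1000 so
  // that check_past_interval_bounds() does not complain.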
3105
3106 // Similarly, if the same_interval_since value is later than
3107 // last_epoch_clean, the next interval change will result in a
3108 // past_interval start that is later than last_epoch_clean. This
3109 // can happen if we use the pg_history values from the merge
3110 // source. Adjust the same_interval_since value backwards if that
3111 // happens. (We trust the les and lec values more because they came from
3112 // the real target, whereas the history value we stole from the source.)
3113 if (info.history.last_epoch_started < info.history.same_interval_since) {
3114 psdout(10) << __func__ << " last_epoch_started "
3115 << info.history.last_epoch_started << " < same_interval_since "
3116 << info.history.same_interval_since
3117 << ", adjusting pg_history backwards" << dendl;
3118 info.history.same_interval_since = info.history.last_epoch_clean;
3119 // make sure same_{up,primary}_since are <= same_interval_since
3120 info.history.same_up_since = std::min(
3121 info.history.same_up_since, info.history.same_interval_since);
3122 info.history.same_primary_since = std::min(
3123 info.history.same_primary_since, info.history.same_interval_since);
3124 }
3125 }
3126
3127 dirty_info = true;
3128 dirty_big_info = true;
3129 }
3130
3131 void PeeringState::start_split_stats(
3132 const set<spg_t>& childpgs, vector<object_stat_sum_t> *out)
3133 {
3134 out->resize(childpgs.size() + 1);
3135 info.stats.stats.sum.split(*out);
3136 }
3137
3138 void PeeringState::finish_split_stats(
3139 const object_stat_sum_t& stats, ObjectStore::Transaction &t)
3140 {
3141 info.stats.stats.sum = stats;
3142 write_if_dirty(t);
3143 }
3144
3145 void PeeringState::update_blocked_by()
3146 {
3147 // set a max on the number of blocking peers we report. if we go
3148 // over, report a random subset. keep the result sorted.
3149 unsigned keep = std::min<unsigned>(
3150 blocked_by.size(), cct->_conf->osd_max_pg_blocked_by);
3151 unsigned skip = blocked_by.size() - keep;
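  // e.g. (illustrative): 50 blockers with osd_max_pg_blocked_by = 16
  // gives keep = 16, skip = 34; each peer is then kept with probability
  // keep / (skip + keep) as we scan, yielding a uniform random subset
  // while preserving sorted order.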
3152 info.stats.blocked_by.clear();
3153 info.stats.blocked_by.resize(keep);
3154 unsigned pos = 0;
3155 for (set<int>::iterator p = blocked_by.begin();
3156 p != blocked_by.end() && keep > 0;
3157 ++p) {
3158 if (skip > 0 && (rand() % (skip + keep) < skip)) {
3159 --skip;
3160 } else {
3161 info.stats.blocked_by[pos++] = *p;
3162 --keep;
3163 }
3164 }
3165 }
3166
3167 static bool find_shard(const set<pg_shard_t> & pgs, shard_id_t shard)
3168 {
3169 for (auto&p : pgs)
3170 if (p.shard == shard)
3171 return true;
3172 return false;
3173 }
3174
3175 static pg_shard_t get_another_shard(const set<pg_shard_t> & pgs, pg_shard_t skip, shard_id_t shard)
3176 {
3177 for (auto&p : pgs) {
3178 if (p == skip)
3179 continue;
3180 if (p.shard == shard)
3181 return p;
3182 }
3183 return pg_shard_t();
3184 }
3185
3186 void PeeringState::update_calc_stats()
3187 {
3188 info.stats.version = info.last_update;
3189 info.stats.created = info.history.epoch_created;
3190 info.stats.last_scrub = info.history.last_scrub;
3191 info.stats.last_scrub_stamp = info.history.last_scrub_stamp;
3192 info.stats.last_deep_scrub = info.history.last_deep_scrub;
3193 info.stats.last_deep_scrub_stamp = info.history.last_deep_scrub_stamp;
3194 info.stats.last_clean_scrub_stamp = info.history.last_clean_scrub_stamp;
3195 info.stats.last_epoch_clean = info.history.last_epoch_clean;
3196
3197 info.stats.log_size = pg_log.get_head().version - pg_log.get_tail().version;
3198 info.stats.ondisk_log_size = info.stats.log_size;
3199 info.stats.log_start = pg_log.get_tail();
3200 info.stats.ondisk_log_start = pg_log.get_tail();
3201 info.stats.snaptrimq_len = pl->get_snap_trimq_size();
3202
3203 unsigned num_shards = get_osdmap()->get_pg_size(info.pgid.pgid);
3204
3205 // In the rare case that upset is too large (usually transient), use it
3206 // as the target for the calculations below.
3207 unsigned target = std::max(num_shards, (unsigned)upset.size());
3208 // For the undersized case, actingset may be larger than upset with OSDs out
3209 unsigned nrep = std::max(actingset.size(), upset.size());
3210 // calc num_object_copies
3211 info.stats.stats.calc_copies(std::max(target, nrep));
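  // e.g. (illustrative): a size-3 pool with upset={0,1} and
  // actingset={0,1,4} gives target = 3 and nrep = 3, so copies are
  // calculated against 3 shards.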
3212 info.stats.stats.sum.num_objects_degraded = 0;
3213 info.stats.stats.sum.num_objects_unfound = 0;
3214 info.stats.stats.sum.num_objects_misplaced = 0;
3215 info.stats.avail_no_missing.clear();
3216 info.stats.object_location_counts.clear();
3217
3218 // We should never hit this condition, but if we end up hitting it,
3219 // make sure to update num_objects and set PG_STATE_INCONSISTENT.
3220 if (info.stats.stats.sum.num_objects < 0) {
3221 psdout(0) << __func__ << " negative num_objects = "
3222 << info.stats.stats.sum.num_objects << " setting it to 0 "
3223 << dendl;
3224 info.stats.stats.sum.num_objects = 0;
3225 state_set(PG_STATE_INCONSISTENT);
3226 }
3227
3228 if ((is_remapped() || is_undersized() || !is_clean()) &&
3229 (is_peered()|| is_activating())) {
3230 psdout(20) << __func__ << " actingset " << actingset << " upset "
3231 << upset << " acting_recovery_backfill " << acting_recovery_backfill << dendl;
3232
3233 ceph_assert(!acting_recovery_backfill.empty());
3234
3235 bool estimate = false;
3236
3237 // NOTE: we only generate degraded, misplaced and unfound
3238 // values for the summation, not individual stat categories.
3239 int64_t num_objects = info.stats.stats.sum.num_objects;
3240
3241 // Objects missing from up nodes, sorted by # objects.
3242 boost::container::flat_set<pair<int64_t,pg_shard_t>> missing_target_objects;
3243 // Objects missing from nodes not in up, sorted by # objects.
3244 boost::container::flat_set<pair<int64_t,pg_shard_t>> acting_source_objects;
3245
3246 // Fill missing_target_objects/acting_source_objects
3247
3248 {
3249 int64_t missing;
3250
3251 // Primary first
3252 missing = pg_log.get_missing().num_missing();
3253 ceph_assert(acting_recovery_backfill.count(pg_whoami));
3254 if (upset.count(pg_whoami)) {
3255 missing_target_objects.emplace(missing, pg_whoami);
3256 } else {
3257 acting_source_objects.emplace(missing, pg_whoami);
3258 }
3259 info.stats.stats.sum.num_objects_missing_on_primary = missing;
3260 if (missing == 0)
3261 info.stats.avail_no_missing.push_back(pg_whoami);
3262 psdout(20) << __func__ << " shard " << pg_whoami
3263 << " primary objects " << num_objects
3264 << " missing " << missing
3265 << dendl;
3266 }
3267
3268 // All other peers
3269 for (auto& peer : peer_info) {
3270 // Primary should not be in the peer_info, skip if it is.
3271 if (peer.first == pg_whoami) continue;
3272 int64_t missing = 0;
3273 int64_t peer_num_objects = peer.second.stats.stats.sum.num_objects;
3274 // Backfill targets always track num_objects accurately;
3275 // all other peers track missing accurately.
3276 if (is_backfill_target(peer.first)) {
3277 missing = std::max((int64_t)0, num_objects - peer_num_objects);
3278 } else {
3279 if (peer_missing.count(peer.first)) {
3280 missing = peer_missing[peer.first].num_missing();
3281 } else {
3282 psdout(20) << __func__ << " no peer_missing found for "
3283 << peer.first << dendl;
3284 if (is_recovering()) {
3285 estimate = true;
3286 }
3287 missing = std::max((int64_t)0, num_objects - peer_num_objects);
3288 }
3289 }
3290 if (upset.count(peer.first)) {
3291 missing_target_objects.emplace(missing, peer.first);
3292 } else if (actingset.count(peer.first)) {
3293 acting_source_objects.emplace(missing, peer.first);
3294 }
3295 peer.second.stats.stats.sum.num_objects_missing = missing;
3296 if (missing == 0)
3297 info.stats.avail_no_missing.push_back(peer.first);
3298 psdout(20) << __func__ << " shard " << peer.first
3299 << " objects " << peer_num_objects
3300 << " missing " << missing
3301 << dendl;
3302 }
3303
3304 // Compute object_location_counts
3305 for (auto& ml: missing_loc.get_missing_locs()) {
3306 info.stats.object_location_counts[ml.second]++;
3307 psdout(30) << __func__ << " " << ml.first << " object_location_counts["
3308 << ml.second << "]=" << info.stats.object_location_counts[ml.second]
3309 << dendl;
3310 }
3311 int64_t not_missing = num_objects - missing_loc.get_missing_locs().size();
3312 if (not_missing) {
3313 // During recovery we know upset == actingset and is being populated
3314 // During backfill we know that all non-missing objects are in the actingset
3315 info.stats.object_location_counts[actingset] = not_missing;
3316 }
3317 psdout(30) << __func__ << " object_location_counts["
3318 << upset << "]=" << info.stats.object_location_counts[upset]
3319 << dendl;
3320 psdout(20) << __func__ << " object_location_counts "
3321 << info.stats.object_location_counts << dendl;
3322
3323 // A misplaced object is not stored on the correct OSD
3324 int64_t misplaced = 0;
3325 // A degraded object has fewer replicas or EC shards than the pool specifies.
3326 int64_t degraded = 0;
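  // e.g. (illustrative, size-3 replicated pool): an object with only two
  // replicas, both on up OSDs, is degraded; an object with three replicas
  // where one sits on an OSD outside up is misplaced.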
3327
3328 if (is_recovering()) {
3329 for (auto& sml: missing_loc.get_missing_by_count()) {
3330 for (auto& ml: sml.second) {
3331 int missing_shards;
3332 if (sml.first == shard_id_t::NO_SHARD) {
3333 psdout(20) << __func__ << " ml " << ml.second
3334 << " upset size " << upset.size()
3335 << " up " << ml.first.up << dendl;
3336 missing_shards = (int)upset.size() - ml.first.up;
3337 } else {
3338 // Handle shards not even in upset below
3339 if (!find_shard(upset, sml.first))
3340 continue;
3341 missing_shards = std::max(0, 1 - ml.first.up);
3342 psdout(20) << __func__
3343 << " shard " << sml.first
3344 << " ml " << ml.second
3345 << " missing shards " << missing_shards << dendl;
3346 }
3347 int odegraded = ml.second * missing_shards;
3348 // Copies on other osds but limited to the possible degraded
3349 int more_osds = std::min(missing_shards, ml.first.other);
3350 int omisplaced = ml.second * more_osds;
3351 ceph_assert(omisplaced <= odegraded);
3352 odegraded -= omisplaced;
3353
3354 misplaced += omisplaced;
3355 degraded += odegraded;
3356 }
3357 }
3358
3359 psdout(20) << __func__ << " missing based degraded "
3360 << degraded << dendl;
3361 psdout(20) << __func__ << " missing based misplaced "
3362 << misplaced << dendl;
3363
3364 // Handle undersized case
3365 if (pool.info.is_replicated()) {
3366 // Add degraded for missing targets (num_objects missing)
3367 ceph_assert(target >= upset.size());
3368 unsigned needed = target - upset.size();
3369 degraded += num_objects * needed;
3370 } else {
3371 for (unsigned i = 0 ; i < num_shards; ++i) {
3372 shard_id_t shard(i);
3373
3374 if (!find_shard(upset, shard)) {
3375 pg_shard_t pgs = get_another_shard(actingset, pg_shard_t(), shard);
3376
3377 if (pgs != pg_shard_t()) {
3378 int64_t missing;
3379
3380 if (pgs == pg_whoami)
3381 missing = info.stats.stats.sum.num_objects_missing_on_primary;
3382 else
3383 missing = peer_info[pgs].stats.stats.sum.num_objects_missing;
3384
3385 degraded += missing;
3386 misplaced += std::max((int64_t)0, num_objects - missing);
3387 } else {
3388 // No shard anywhere
3389 degraded += num_objects;
3390 }
3391 }
3392 }
3393 }
3394 goto out;
3395 }
3396
3397 // Handle undersized case
3398 if (pool.info.is_replicated()) {
3399 // Add to missing_target_objects
3400 ceph_assert(target >= missing_target_objects.size());
3401 unsigned needed = target - missing_target_objects.size();
3402 if (needed)
3403 missing_target_objects.emplace(num_objects * needed, pg_shard_t(pg_shard_t::NO_OSD));
3404 } else {
3405 for (unsigned i = 0 ; i < num_shards; ++i) {
3406 shard_id_t shard(i);
3407 bool found = false;
3408 for (const auto& t : missing_target_objects) {
3409 if (std::get<1>(t).shard == shard) {
3410 found = true;
3411 break;
3412 }
3413 }
3414 if (!found)
3415 missing_target_objects.emplace(num_objects, pg_shard_t(pg_shard_t::NO_OSD,shard));
3416 }
3417 }
3418
3419 for (const auto& item : missing_target_objects)
3420 psdout(20) << __func__ << " missing shard " << std::get<1>(item)
3421 << " missing= " << std::get<0>(item) << dendl;
3422 for (const auto& item : acting_source_objects)
3423 psdout(20) << __func__ << " acting shard " << std::get<1>(item)
3424 << " missing= " << std::get<0>(item) << dendl;
3425
3426 // Handle all objects not in missing, for the remapped
3427 // or backfill cases
3428 for (auto m = missing_target_objects.rbegin();
3429 m != missing_target_objects.rend(); ++m) {
3430
3431 int64_t extra_missing = -1;
3432
3433 if (pool.info.is_replicated()) {
3434 if (!acting_source_objects.empty()) {
3435 auto extra_copy = acting_source_objects.begin();
3436 extra_missing = std::get<0>(*extra_copy);
3437 acting_source_objects.erase(extra_copy);
3438 }
3439 } else { // Erasure coded
3440 // Use corresponding shard
3441 for (const auto& a : acting_source_objects) {
3442 if (std::get<1>(a).shard == std::get<1>(*m).shard) {
3443 extra_missing = std::get<0>(a);
3444 acting_source_objects.erase(a);
3445 break;
3446 }
3447 }
3448 }
3449
3450 if (extra_missing >= 0 && std::get<0>(*m) >= extra_missing) {
3451 // We don't know which of the objects on the target
3452 // are part of extra_missing, so assume they are all degraded.
3453 misplaced += std::get<0>(*m) - extra_missing;
3454 degraded += extra_missing;
3455 } else {
3456 // 1. extra_missing == -1: more targets than sources, so degraded
3457 // 2. extra_missing > std::get<0>(*m): some of the previously degraded
3458 // objects are now present on the target.
3459 degraded += std::get<0>(*m);
3460 }
3461 }
3462 // If there are still acting shards that haven't been accounted
3463 // for, then they are misplaced
3464 for (const auto& a : acting_source_objects) {
3465 int64_t extra_misplaced = std::max((int64_t)0, num_objects - std::get<0>(a));
3466 psdout(20) << __func__ << " extra acting misplaced " << extra_misplaced
3467 << dendl;
3468 misplaced += extra_misplaced;
3469 }
3470 out:
3471 // NOTE: Tests use these messages to verify this code
3472 psdout(20) << __func__ << " degraded " << degraded
3473 << (estimate ? " (est)": "") << dendl;
3474 psdout(20) << __func__ << " misplaced " << misplaced
3475 << (estimate ? " (est)": "")<< dendl;
3476
3477 info.stats.stats.sum.num_objects_degraded = degraded;
3478 info.stats.stats.sum.num_objects_unfound = get_num_unfound();
3479 info.stats.stats.sum.num_objects_misplaced = misplaced;
3480 }
3481 }
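// (Annotation, not part of the original source.) A worked example of the
// degraded/misplaced definitions used above, with hypothetical numbers.
// Pool size 3, num_objects = 100:
//
//   // one of the three replicas is lost: every object still has 2 of 3
//   // copies, so each object contributes one missing shard copy
//   int64_t degraded  = 100 * (3 - 2);   // == 100
//
//   // instead, the PG is remapped and a new OSD awaits backfill: all
//   // copies exist, but one copy per object sits on the wrong OSD
//   int64_t misplaced = 100;             // wrong location, not lost
//
// The code above refines this rough picture by pairing each target
// shard's missing count with an acting source, so objects already copied
// to the target stop counting as degraded or misplaced.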
3482
3483 std::optional<pg_stat_t> PeeringState::prepare_stats_for_publish(
3484 bool pg_stats_publish_valid,
3485 const pg_stat_t &pg_stats_publish,
3486 const object_stat_collection_t &unstable_stats)
3487 {
3488 if (info.stats.stats.sum.num_scrub_errors) {
3489 state_set(PG_STATE_INCONSISTENT);
3490 } else {
3491 state_clear(PG_STATE_INCONSISTENT);
3492 state_clear(PG_STATE_FAILED_REPAIR);
3493 }
3494
3495 utime_t now = ceph_clock_now();
3496 if (info.stats.state != state) {
3497 info.stats.last_change = now;
3498 // Optimistic estimation: if we just found out the PG is inactive,
3499 // assume it was active until now.
3500 if (!(state & PG_STATE_ACTIVE) &&
3501 (info.stats.state & PG_STATE_ACTIVE))
3502 info.stats.last_active = now;
3503
3504 if ((state & PG_STATE_ACTIVE) &&
3505 !(info.stats.state & PG_STATE_ACTIVE))
3506 info.stats.last_became_active = now;
3507 if ((state & (PG_STATE_ACTIVE|PG_STATE_PEERED)) &&
3508 !(info.stats.state & (PG_STATE_ACTIVE|PG_STATE_PEERED)))
3509 info.stats.last_became_peered = now;
3510 info.stats.state = state;
3511 }
3512
3513 update_calc_stats();
3514 if (info.stats.stats.sum.num_objects_degraded) {
3515 state_set(PG_STATE_DEGRADED);
3516 } else {
3517 state_clear(PG_STATE_DEGRADED);
3518 }
3519 update_blocked_by();
3520
3521 pg_stat_t pre_publish = info.stats;
3522 pre_publish.stats.add(unstable_stats);
3523 utime_t cutoff = now;
3524 cutoff -= cct->_conf->osd_pg_stat_report_interval_max;
3525
3526 // share (some of) our purged_snaps via the pg_stats. limit # of intervals
3527 // because we don't want to make the pg_stat_t structures too expensive.
3528 unsigned max = cct->_conf->osd_max_snap_prune_intervals_per_epoch;
3529 unsigned num = 0;
3530 auto i = info.purged_snaps.begin();
3531 while (num < max && i != info.purged_snaps.end()) {
3532 pre_publish.purged_snaps.insert(i.get_start(), i.get_len());
3533 ++num;
3534 ++i;
3535 }
3536 psdout(20) << __func__ << " reporting purged_snaps "
3537 << pre_publish.purged_snaps << dendl;
3538
3539 if (pg_stats_publish_valid && pre_publish == pg_stats_publish &&
3540 info.stats.last_fresh > cutoff) {
3541 psdout(15) << "publish_stats_to_osd " << pg_stats_publish.reported_epoch
3542 << ": no change since " << info.stats.last_fresh << dendl;
3543 return std::nullopt;
3544 } else {
3545 // update our stat summary and timestamps
3546 info.stats.reported_epoch = get_osdmap_epoch();
3547 ++info.stats.reported_seq;
3548
3549 info.stats.last_fresh = now;
3550
3551 if (info.stats.state & PG_STATE_CLEAN)
3552 info.stats.last_clean = now;
3553 if (info.stats.state & PG_STATE_ACTIVE)
3554 info.stats.last_active = now;
3555 if (info.stats.state & (PG_STATE_ACTIVE|PG_STATE_PEERED))
3556 info.stats.last_peered = now;
3557 info.stats.last_unstale = now;
3558 if ((info.stats.state & PG_STATE_DEGRADED) == 0)
3559 info.stats.last_undegraded = now;
3560 if ((info.stats.state & PG_STATE_UNDERSIZED) == 0)
3561 info.stats.last_fullsized = now;
3562
3563 psdout(15) << "publish_stats_to_osd " << pg_stats_publish.reported_epoch
3564 << ":" << pg_stats_publish.reported_seq << dendl;
3565 return std::make_optional(std::move(pre_publish));
3566 }
3567 }
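// (Annotation, not part of the original source.) The std::nullopt return
// tells the caller there is nothing new to publish. A minimal sketch of
// the expected call site (names hypothetical):
//
//   if (auto stats = ps.prepare_stats_for_publish(valid, last_published,
//                                                 unstable)) {
//     last_published = *stats;       // remember what we sent
//     send_pg_stats_upward(*stats);  // hypothetical transport call
//   }
//
// Note that stats are re-sent even when unchanged once last_fresh falls
// behind now - osd_pg_stat_report_interval_max, so a PG keeps reporting
// periodically even when nothing changed.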
3568
3569 void PeeringState::init(
3570 int role,
3571 const vector<int>& newup, int new_up_primary,
3572 const vector<int>& newacting, int new_acting_primary,
3573 const pg_history_t& history,
3574 const PastIntervals& pi,
3575 bool backfill,
3576 ObjectStore::Transaction &t)
3577 {
3578 psdout(10) << "init role " << role << " up "
3579 << newup << " acting " << newacting
3580 << " history " << history
3581 << " past_intervals " << pi
3582 << dendl;
3583
3584 set_role(role);
3585 init_primary_up_acting(
3586 newup,
3587 newacting,
3588 new_up_primary,
3589 new_acting_primary);
3590
3591 info.history = history;
3592 past_intervals = pi;
3593
3594 info.stats.up = up;
3595 info.stats.up_primary = new_up_primary;
3596 info.stats.acting = acting;
3597 info.stats.acting_primary = new_acting_primary;
3598 info.stats.mapping_epoch = info.history.same_interval_since;
3599
3600 if (!perform_deletes_during_peering()) {
3601 pg_log.set_missing_may_contain_deletes();
3602 }
3603
3604 if (backfill) {
3605 psdout(10) << __func__ << ": Setting backfill" << dendl;
3606 info.set_last_backfill(hobject_t());
3607 info.last_complete = info.last_update;
3608 pg_log.mark_log_for_rewrite();
3609 }
3610
3611 on_new_interval();
3612
3613 dirty_info = true;
3614 dirty_big_info = true;
3615 write_if_dirty(t);
3616 }
3617
3618 void PeeringState::dump_peering_state(Formatter *f)
3619 {
3620 f->dump_string("state", get_pg_state_string());
3621 f->dump_unsigned("epoch", get_osdmap_epoch());
3622 f->open_array_section("up");
3623 for (vector<int>::const_iterator p = up.begin(); p != up.end(); ++p)
3624 f->dump_unsigned("osd", *p);
3625 f->close_section();
3626 f->open_array_section("acting");
3627 for (vector<int>::const_iterator p = acting.begin(); p != acting.end(); ++p)
3628 f->dump_unsigned("osd", *p);
3629 f->close_section();
3630 if (!backfill_targets.empty()) {
3631 f->open_array_section("backfill_targets");
3632 for (set<pg_shard_t>::iterator p = backfill_targets.begin();
3633 p != backfill_targets.end();
3634 ++p)
3635 f->dump_stream("shard") << *p;
3636 f->close_section();
3637 }
3638 if (!async_recovery_targets.empty()) {
3639 f->open_array_section("async_recovery_targets");
3640 for (set<pg_shard_t>::iterator p = async_recovery_targets.begin();
3641 p != async_recovery_targets.end();
3642 ++p)
3643 f->dump_stream("shard") << *p;
3644 f->close_section();
3645 }
3646 if (!acting_recovery_backfill.empty()) {
3647 f->open_array_section("acting_recovery_backfill");
3648 for (set<pg_shard_t>::iterator p = acting_recovery_backfill.begin();
3649 p != acting_recovery_backfill.end();
3650 ++p)
3651 f->dump_stream("shard") << *p;
3652 f->close_section();
3653 }
3654 f->open_object_section("info");
3655 update_calc_stats();
3656 info.dump(f);
3657 f->close_section();
3658
3659 f->open_array_section("peer_info");
3660 for (map<pg_shard_t, pg_info_t>::const_iterator p = peer_info.begin();
3661 p != peer_info.end();
3662 ++p) {
3663 f->open_object_section("info");
3664 f->dump_stream("peer") << p->first;
3665 p->second.dump(f);
3666 f->close_section();
3667 }
3668 }
3669
3670 void PeeringState::update_stats(
3671 std::function<bool(pg_history_t &, pg_stat_t &)> f,
3672 ObjectStore::Transaction *t) {
3673 if (f(info.history, info.stats)) {
3674 pl->publish_stats_to_osd();
3675 }
3676 pl->on_info_history_change();
3677
3678 if (t) {
3679 dirty_info = true;
3680 write_if_dirty(*t);
3681 }
3682 }
3683
3684 bool PeeringState::append_log_entries_update_missing(
3685 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
3686 ObjectStore::Transaction &t, std::optional<eversion_t> trim_to,
3687 std::optional<eversion_t> roll_forward_to)
3688 {
3689 ceph_assert(!entries.empty());
3690 ceph_assert(entries.begin()->version > info.last_update);
3691
3692 PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
3693 bool invalidate_stats =
3694 pg_log.append_new_log_entries(
3695 info.last_backfill,
3696 entries,
3697 rollbacker.get());
3698
3699 if (roll_forward_to && entries.rbegin()->soid > info.last_backfill) {
3700 pg_log.roll_forward(rollbacker.get());
3701 }
3702 if (roll_forward_to && *roll_forward_to > pg_log.get_can_rollback_to()) {
3703 pg_log.roll_forward_to(*roll_forward_to, rollbacker.get());
3704 last_rollback_info_trimmed_to_applied = *roll_forward_to;
3705 }
3706
3707 info.last_update = pg_log.get_head();
3708
3709 if (pg_log.get_missing().num_missing() == 0) {
3710 // advance last_complete since nothing else is missing!
3711 info.last_complete = info.last_update;
3712 }
3713 info.stats.stats_invalid = info.stats.stats_invalid || invalidate_stats;
3714
3715 psdout(20) << __func__ << " trim_to bool = " << bool(trim_to)
3716 << " trim_to = " << (trim_to ? *trim_to : eversion_t()) << dendl;
3717 if (trim_to)
3718 pg_log.trim(*trim_to, info);
3719 dirty_info = true;
3720 write_if_dirty(t);
3721 return invalidate_stats;
3722 }
3723
3724 void PeeringState::merge_new_log_entries(
3725 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
3726 ObjectStore::Transaction &t,
3727 std::optional<eversion_t> trim_to,
3728 std::optional<eversion_t> roll_forward_to)
3729 {
3730 psdout(10) << __func__ << " " << entries << dendl;
3731 ceph_assert(is_primary());
3732
3733 bool rebuild_missing = append_log_entries_update_missing(entries, t, trim_to, roll_forward_to);
3734 for (set<pg_shard_t>::const_iterator i = acting_recovery_backfill.begin();
3735 i != acting_recovery_backfill.end();
3736 ++i) {
3737 pg_shard_t peer(*i);
3738 if (peer == pg_whoami) continue;
3739 ceph_assert(peer_missing.count(peer));
3740 ceph_assert(peer_info.count(peer));
3741 pg_missing_t& pmissing(peer_missing[peer]);
3742 psdout(20) << __func__ << " peer_missing for " << peer
3743 << " = " << pmissing << dendl;
3744 pg_info_t& pinfo(peer_info[peer]);
3745 bool invalidate_stats = PGLog::append_log_entries_update_missing(
3746 pinfo.last_backfill,
3747 entries,
3748 true,
3749 NULL,
3750 pmissing,
3751 NULL,
3752 dpp);
3753 pinfo.last_update = info.last_update;
3754 pinfo.stats.stats_invalid = pinfo.stats.stats_invalid || invalidate_stats;
3755 rebuild_missing = rebuild_missing || invalidate_stats;
3756 }
3757
3758 if (!rebuild_missing) {
3759 return;
3760 }
3761
3762 for (auto &&i: entries) {
3763 missing_loc.rebuild(
3764 i.soid,
3765 pg_whoami,
3766 acting_recovery_backfill,
3767 info,
3768 pg_log.get_missing(),
3769 peer_missing,
3770 peer_info);
3771 }
3772 }
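// (Annotation, not part of the original source.) Condensed primary-side
// flow of merge_new_log_entries(), as implemented above:
//
//   bool rebuild = append_log_entries_update_missing(entries, t, ...);
//   for (pg_shard_t peer : acting_recovery_backfill)     // skip pg_whoami
//     rebuild |= PGLog::append_log_entries_update_missing( // mirror into
//         pinfo.last_backfill, entries, ..., pmissing, ...);// peer state
//   if (rebuild)
//     for (auto &e : entries)
//       missing_loc.rebuild(e.soid, ...);  // re-derive object locations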
3773
3774 void PeeringState::add_log_entry(const pg_log_entry_t& e, bool applied)
3775 {
3776 // raise last_complete only if we were previously up to date
3777 if (info.last_complete == info.last_update)
3778 info.last_complete = e.version;
3779
3780 // raise last_update.
3781 ceph_assert(e.version > info.last_update);
3782 info.last_update = e.version;
3783
3784 // raise user_version, if it increased (it may not have been bumped
3785 // by all logged updates)
3786 if (e.user_version > info.last_user_version)
3787 info.last_user_version = e.user_version;
3788
3789 // log mutation
3790 pg_log.add(e, applied);
3791 psdout(10) << "add_log_entry " << e << dendl;
3792 }
3793
3794
3795 void PeeringState::append_log(
3796 const vector<pg_log_entry_t>& logv,
3797 eversion_t trim_to,
3798 eversion_t roll_forward_to,
3799 eversion_t mlcod,
3800 ObjectStore::Transaction &t,
3801 bool transaction_applied,
3802 bool async)
3803 {
3804 /* The primary has sent an info updating the history, but it may not
3805 * have arrived yet. We want to make sure that we cannot remember this
3806 * write without remembering that it happened in an interval which went
3807 * active in epoch history.last_epoch_started.
3808 */
3809 if (info.last_epoch_started != info.history.last_epoch_started) {
3810 info.history.last_epoch_started = info.last_epoch_started;
3811 }
3812 if (info.last_interval_started != info.history.last_interval_started) {
3813 info.history.last_interval_started = info.last_interval_started;
3814 }
3815 psdout(10) << "append_log " << pg_log.get_log() << " " << logv << dendl;
3816
3817 PGLog::LogEntryHandlerRef handler{pl->get_log_handler(t)};
3818 if (!transaction_applied) {
3819 /* We must be a backfill or async recovery peer, so it's ok if we apply
3820 * out-of-turn since we won't be considered when
3821 * determining a min possible last_update.
3822 *
3823 * We skip_rollforward() here, which advances the crt without
3824 * doing an actual rollforward. This avoids cleaning up entries
3825 * from the backend, so we do not end up in a situation where the
3826 * object is deleted before we can _merge_object_divergent_entries().
3827 */
3828 pg_log.skip_rollforward();
3829 }
3830
3831 for (vector<pg_log_entry_t>::const_iterator p = logv.begin();
3832 p != logv.end();
3833 ++p) {
3834 add_log_entry(*p, transaction_applied);
3835
3836 /* We don't want to leave the rollforward artifacts around
3837 * here past last_backfill. It's ok for the same reason as
3838 * above */
3839 if (transaction_applied &&
3840 p->soid > info.last_backfill) {
3841 pg_log.roll_forward(handler.get());
3842 }
3843 }
3844 if (transaction_applied && roll_forward_to > pg_log.get_can_rollback_to()) {
3845 pg_log.roll_forward_to(
3846 roll_forward_to,
3847 handler.get());
3848 last_rollback_info_trimmed_to_applied = roll_forward_to;
3849 }
3850
3851 psdout(10) << __func__ << " approx pg log length = "
3852 << pg_log.get_log().approx_size() << dendl;
3853 psdout(10) << __func__ << " transaction_applied = "
3854 << transaction_applied << dendl;
3855 if (!transaction_applied || async)
3856 psdout(10) << __func__ << " " << pg_whoami
3857 << " is async_recovery or backfill target" << dendl;
3858 pg_log.trim(trim_to, info, transaction_applied, async);
3859
3860 // update the local pg, pg log
3861 dirty_info = true;
3862 write_if_dirty(t);
3863
3864 if (!is_primary())
3865 min_last_complete_ondisk = mlcod;
3866 }
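// (Annotation, not part of the original source.) transaction_applied is
// false only on backfill / async-recovery peers, which may apply log
// entries out of turn. For them the code above advances the crt via
// skip_rollforward() instead of rolling forward, e.g.:
//
//   // replica, in-turn write:    add_log_entry(e, true);  roll forward
//   // backfill peer, same write: add_log_entry(e, false); skip_rollforward()
//
// so rollback state is retired without touching backend objects that may
// still need _merge_object_divergent_entries().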
3867
3868 void PeeringState::recover_got(
3869 const hobject_t &oid, eversion_t v,
3870 bool is_delete,
3871 ObjectStore::Transaction &t)
3872 {
3873 if (v > pg_log.get_can_rollback_to()) {
3874 /* This can only happen during a repair, and even then, it would
3875 * be one heck of a race. If we are repairing the object, the
3876 * write in question must be fully committed, so it's not valid
3877 * to roll it back (and we'll be rolled forward shortly
3878 * anyway) */
3879 PGLog::LogEntryHandlerRef handler{pl->get_log_handler(t)};
3880 pg_log.roll_forward_to(v, handler.get());
3881 }
3882
3883 psdout(10) << "got missing " << oid << " v " << v << dendl;
3884 pg_log.recover_got(oid, v, info);
3885 if (pg_log.get_log().log.empty()) {
3886 psdout(10) << "last_complete now " << info.last_complete
3887 << " while log is empty" << dendl;
3888 } else if (pg_log.get_log().complete_to != pg_log.get_log().log.end()) {
3889 psdout(10) << "last_complete now " << info.last_complete
3890 << " log.complete_to " << pg_log.get_log().complete_to->version
3891 << dendl;
3892 } else {
3893 psdout(10) << "last_complete now " << info.last_complete
3894 << " log.complete_to at end" << dendl;
3895 // below is not true in the repair case.
3896 // assert(missing.num_missing() == 0); // otherwise, complete_to was wrong.
3897 ceph_assert(info.last_complete == info.last_update);
3898 }
3899
3900 if (is_primary()) {
3901 ceph_assert(missing_loc.needs_recovery(oid));
3902 if (!is_delete)
3903 missing_loc.add_location(oid, pg_whoami);
3904 }
3905
3906 // update pg
3907 dirty_info = true;
3908 write_if_dirty(t);
3909 }
3910
3911 void PeeringState::update_backfill_progress(
3912 const hobject_t &updated_backfill,
3913 const pg_stat_t &updated_stats,
3914 bool preserve_local_num_bytes,
3915 ObjectStore::Transaction &t) {
3916 info.set_last_backfill(updated_backfill);
3917 if (preserve_local_num_bytes) {
3918 psdout(25) << __func__ << " primary " << updated_stats.stats.sum.num_bytes
3919 << " local " << info.stats.stats.sum.num_bytes << dendl;
3920 int64_t bytes = info.stats.stats.sum.num_bytes;
3921 info.stats = updated_stats;
3922 info.stats.stats.sum.num_bytes = bytes;
3923 } else {
3924 psdout(20) << __func__ << " final " << updated_stats.stats.sum.num_bytes
3925 << " replaces local " << info.stats.stats.sum.num_bytes << dendl;
3926 info.stats = updated_stats;
3927 }
3928
3929 dirty_info = true;
3930 write_if_dirty(t);
3931 }
3932
3933 void PeeringState::adjust_purged_snaps(
3934 std::function<void(interval_set<snapid_t> &snaps)> f) {
3935 f(info.purged_snaps);
3936 dirty_info = true;
3937 dirty_big_info = true;
3938 }
3939
3940 void PeeringState::on_peer_recover(
3941 pg_shard_t peer,
3942 const hobject_t &soid,
3943 const eversion_t &version)
3944 {
3945 pl->publish_stats_to_osd();
3946 // done!
3947 peer_missing[peer].got(soid, version);
3948 missing_loc.add_location(soid, peer);
3949 }
3950
3951 void PeeringState::begin_peer_recover(
3952 pg_shard_t peer,
3953 const hobject_t soid)
3954 {
3955 peer_missing[peer].revise_have(soid, eversion_t());
3956 }
3957
3958 void PeeringState::force_object_missing(
3959 const set<pg_shard_t> &peers,
3960 const hobject_t &soid,
3961 eversion_t version)
3962 {
3963 for (auto &&peer : peers) {
3964 if (peer != primary) {
3965 peer_missing[peer].add(soid, version, eversion_t(), false);
3966 } else {
3967 pg_log.missing_add(soid, version, eversion_t());
3968 pg_log.reset_complete_to(&info);
3969 pg_log.set_last_requested(0);
3970 }
3971 }
3972
3973 missing_loc.rebuild(
3974 soid,
3975 pg_whoami,
3976 acting_recovery_backfill,
3977 info,
3978 pg_log.get_missing(),
3979 peer_missing,
3980 peer_info);
3981 }
3982
3983 void PeeringState::pre_submit_op(
3984 const hobject_t &hoid,
3985 const vector<pg_log_entry_t>& logv,
3986 eversion_t at_version)
3987 {
3988 if (at_version > eversion_t()) {
3989 for (auto &&i : get_acting_recovery_backfill()) {
3990 if (i == primary) continue;
3991 pg_info_t &pinfo = peer_info[i];
3992 // keep peer_info up to date
3993 if (pinfo.last_complete == pinfo.last_update)
3994 pinfo.last_complete = at_version;
3995 pinfo.last_update = at_version;
3996 }
3997 }
3998
3999 bool requires_missing_loc = false;
4000 for (auto &&i : get_async_recovery_targets()) {
4001 if (i == primary || !get_peer_missing(i).is_missing(hoid))
4002 continue;
4003 requires_missing_loc = true;
4004 for (auto &&entry: logv) {
4005 peer_missing[i].add_next_event(entry);
4006 }
4007 }
4008
4009 if (requires_missing_loc) {
4010 for (auto &&entry: logv) {
4011 psdout(30) << __func__ << " missing_loc before: "
4012 << missing_loc.get_locations(entry.soid) << dendl;
4013 missing_loc.add_missing(entry.soid, entry.version,
4014 eversion_t(), entry.is_delete());
4015 // clear out missing_loc
4016 missing_loc.clear_location(entry.soid);
4017 for (auto &i: get_actingset()) {
4018 if (!get_peer_missing(i).is_missing(entry.soid))
4019 missing_loc.add_location(entry.soid, i);
4020 }
4021 psdout(30) << __func__ << " missing_loc after: "
4022 << missing_loc.get_locations(entry.soid) << dendl;
4023 }
4024 }
4025 }
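// (Annotation, not part of the original source.) The missing_loc upkeep
// above matters for reads during async recovery. Sketch of the effect of
// one log entry e on a PG with actingset {0,1,2} where osd.2 is an async
// recovery target still missing e.soid (shard ids simplified):
//
//   peer_missing[2].add_next_event(e);     // osd.2 now also misses e
//   missing_loc.add_missing(e.soid, ...);  // object needs recovery
//   missing_loc.clear_location(e.soid);    // forget stale locations
//   missing_loc.add_location(e.soid, 0);   // osds 0 and 1 hold the new
//   missing_loc.add_location(e.soid, 1);   // version once it commits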
4026
4027 void PeeringState::recovery_committed_to(eversion_t version)
4028 {
4029 psdout(10) << __func__ << " version " << version
4030 << " now ondisk" << dendl;
4031 last_complete_ondisk = version;
4032
4033 if (last_complete_ondisk == info.last_update) {
4034 if (!is_primary()) {
4035 // Either we are a replica or a backfill target.
4036 // We are fully up to date; tell the primary!
4037 pl->send_cluster_message(
4038 get_primary().osd,
4039 new MOSDPGTrim(
4040 get_osdmap_epoch(),
4041 spg_t(info.pgid.pgid, primary.shard),
4042 last_complete_ondisk),
4043 get_osdmap_epoch());
4044 } else {
4045 calc_min_last_complete_ondisk();
4046 }
4047 }
4048 }
4049
4050 void PeeringState::complete_write(eversion_t v, eversion_t lc)
4051 {
4052 last_update_ondisk = v;
4053 last_complete_ondisk = lc;
4054 calc_min_last_complete_ondisk();
4055 }
4056
4057 void PeeringState::calc_trim_to()
4058 {
4059 size_t target = pl->get_target_pg_log_entries();
4060
4061 eversion_t limit = std::min(
4062 min_last_complete_ondisk,
4063 pg_log.get_can_rollback_to());
4064 if (limit != eversion_t() &&
4065 limit != pg_trim_to &&
4066 pg_log.get_log().approx_size() > target) {
4067 size_t num_to_trim = std::min(pg_log.get_log().approx_size() - target,
4068 cct->_conf->osd_pg_log_trim_max);
4069 if (num_to_trim < cct->_conf->osd_pg_log_trim_min &&
4070 cct->_conf->osd_pg_log_trim_max >= cct->_conf->osd_pg_log_trim_min) {
4071 return;
4072 }
4073 list<pg_log_entry_t>::const_iterator it = pg_log.get_log().log.begin();
4074 eversion_t new_trim_to;
4075 for (size_t i = 0; i < num_to_trim; ++i) {
4076 new_trim_to = it->version;
4077 ++it;
4078 if (new_trim_to > limit) {
4079 new_trim_to = limit;
4080 psdout(10) << "calc_trim_to trimming to min_last_complete_ondisk" << dendl;
4081 break;
4082 }
4083 }
4084 psdout(10) << "calc_trim_to " << pg_trim_to << " -> " << new_trim_to << dendl;
4085 pg_trim_to = new_trim_to;
4086 ceph_assert(pg_trim_to <= pg_log.get_head());
4087 ceph_assert(pg_trim_to <= min_last_complete_ondisk);
4088 }
4089 }
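// (Annotation, not part of the original source.) A worked example of the
// sizing logic above, with hypothetical config values:
//
//   target              = 3000   // pl->get_target_pg_log_entries()
//   approx_size         = 3500
//   osd_pg_log_trim_max = 10000, osd_pg_log_trim_min = 100
//
//   num_to_trim = min(3500 - 3000, 10000) = 500  // >= trim_min, proceed
//
// The walk over the 500 oldest entries then clamps new_trim_to at `limit`
// (the min of min_last_complete_ondisk and can_rollback_to), so we never
// trim an entry some replica might still need.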
4090
4091 void PeeringState::calc_trim_to_aggressive()
4092 {
4093 size_t target = pl->get_target_pg_log_entries();
4094
4095 // limit pg log trimming up to the can_rollback_to value
4096 eversion_t limit = std::min({
4097 pg_log.get_head(),
4098 pg_log.get_can_rollback_to(),
4099 last_update_ondisk});
4100 psdout(10) << __func__ << " limit = " << limit << dendl;
4101
4102 if (limit != eversion_t() &&
4103 limit != pg_trim_to &&
4104 pg_log.get_log().approx_size() > target) {
4105 psdout(10) << __func__ << " approx pg log length = "
4106 << pg_log.get_log().approx_size() << dendl;
4107 uint64_t num_to_trim = std::min<uint64_t>(pg_log.get_log().approx_size() - target,
4108 cct->_conf->osd_pg_log_trim_max);
4109 psdout(10) << __func__ << " num_to_trim = " << num_to_trim << dendl;
4110 if (num_to_trim < cct->_conf->osd_pg_log_trim_min &&
4111 cct->_conf->osd_pg_log_trim_max >= cct->_conf->osd_pg_log_trim_min) {
4112 return;
4113 }
4114 auto it = pg_log.get_log().log.begin(); // oldest log entry
4115 auto rit = pg_log.get_log().log.rbegin();
4116 eversion_t by_n_to_keep; // start from tail
4117 eversion_t by_n_to_trim = eversion_t::max(); // start from head
4118 for (size_t i = 0; it != pg_log.get_log().log.end(); ++it, ++rit) {
4119 i++;
4120 if (i > target && by_n_to_keep == eversion_t()) {
4121 by_n_to_keep = rit->version;
4122 }
4123 if (i >= num_to_trim && by_n_to_trim == eversion_t::max()) {
4124 by_n_to_trim = it->version;
4125 }
4126 if (by_n_to_keep != eversion_t() &&
4127 by_n_to_trim != eversion_t::max()) {
4128 break;
4129 }
4130 }
4131
4132 if (by_n_to_keep == eversion_t()) {
4133 return;
4134 }
4135
4136 pg_trim_to = std::min({by_n_to_keep, by_n_to_trim, limit});
4137 psdout(10) << __func__ << " pg_trim_to now " << pg_trim_to << dendl;
4138 ceph_assert(pg_trim_to <= pg_log.get_head());
4139 }
4140 }
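// (Annotation, not part of the original source.) The forward/reverse scan
// above computes two candidate cut points. For a log v1..v10
// (oldest..newest) with target = 4 and num_to_trim = 5:
//
//   by_n_to_keep = v6   // first i > target, counting from the head:
//                       // trimming to v6 keeps the 4 newest, v7..v10
//   by_n_to_trim = v5   // the 5th entry from the tail
//   pg_trim_to   = min(v6, v5, limit) = v5   // assuming limit >= v5
//
// i.e. trim the 5 oldest entries unless that would cut into the `target`
// newest entries or exceed `limit`.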
4141
4142 void PeeringState::apply_op_stats(
4143 const hobject_t &soid,
4144 const object_stat_sum_t &delta_stats)
4145 {
4146 info.stats.stats.add(delta_stats);
4147 info.stats.stats.floor(0);
4148
4149 for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
4150 i != get_backfill_targets().end();
4151 ++i) {
4152 pg_shard_t bt = *i;
4153 pg_info_t& pinfo = peer_info[bt];
4154 if (soid <= pinfo.last_backfill)
4155 pinfo.stats.stats.add(delta_stats);
4156 }
4157 }
4158
4159 void PeeringState::update_complete_backfill_object_stats(
4160 const hobject_t &hoid,
4161 const pg_stat_t &stats)
4162 {
4163 for (auto &&bt: get_backfill_targets()) {
4164 pg_info_t& pinfo = peer_info[bt];
4165 // Add stats to all peers that were missing the object
4166 if (hoid > pinfo.last_backfill)
4167 pinfo.stats.add(stats);
4168 }
4169 }
4170
4171 void PeeringState::update_peer_last_backfill(
4172 pg_shard_t peer,
4173 const hobject_t &new_last_backfill)
4174 {
4175 pg_info_t &pinfo = peer_info[peer];
4176 pinfo.last_backfill = new_last_backfill;
4177 if (new_last_backfill.is_max()) {
4178 /* pinfo.stats might be wrong if we did log-based recovery on the
4179 * backfilled portion in addition to continuing backfill.
4180 */
4181 pinfo.stats = info.stats;
4182 }
4183 }
4184
4185 void PeeringState::set_revert_with_targets(
4186 const hobject_t &soid,
4187 const set<pg_shard_t> &good_peers)
4188 {
4189 for (auto &&peer: good_peers) {
4190 missing_loc.add_location(soid, peer);
4191 }
4192 }
4193
4194 void PeeringState::prepare_backfill_for_missing(
4195 const hobject_t &soid,
4196 const eversion_t &version,
4197 const vector<pg_shard_t> &targets) {
4198 for (auto &&peer: targets) {
4199 peer_missing[peer].add(soid, version, eversion_t(), false);
4200 }
4201 }
4202
4203 void PeeringState::update_hset(const pg_hit_set_history_t &hset_history)
4204 {
4205 info.hit_set = hset_history;
4206 }
4207
4208 /*------------ Peering State Machine----------------*/
4209 #undef dout_prefix
4210 #define dout_prefix (context< PeeringMachine >().dpp->gen_prefix(*_dout) \
4211 << "state<" << get_state_name() << ">: ")
4212 #undef psdout
4213 #define psdout(x) ldout(context< PeeringMachine >().cct, x)
4214
4215 #define DECLARE_LOCALS \
4216 PeeringState *ps = context< PeeringMachine >().state; \
4217 std::ignore = ps; \
4218 PeeringListener *pl = context< PeeringMachine >().pl; \
4219 std::ignore = pl
4220
4221
4222 /*------Crashed-------*/
4223 PeeringState::Crashed::Crashed(my_context ctx)
4224 : my_base(ctx),
4225 NamedState(context< PeeringMachine >().state_history, "Crashed")
4226 {
4227 context< PeeringMachine >().log_enter(state_name);
4228 ceph_abort_msg("we got a bad state machine event");
4229 }
4230
4231
4232 /*------Initial-------*/
4233 PeeringState::Initial::Initial(my_context ctx)
4234 : my_base(ctx),
4235 NamedState(context< PeeringMachine >().state_history, "Initial")
4236 {
4237 context< PeeringMachine >().log_enter(state_name);
4238 }
4239
4240 boost::statechart::result PeeringState::Initial::react(const MNotifyRec& notify)
4241 {
4242 DECLARE_LOCALS;
4243 ps->proc_replica_info(
4244 notify.from, notify.notify.info, notify.notify.epoch_sent);
4245 ps->set_last_peering_reset();
4246 return transit< Primary >();
4247 }
4248
4249 boost::statechart::result PeeringState::Initial::react(const MInfoRec& i)
4250 {
4251 DECLARE_LOCALS;
4252 ceph_assert(!ps->is_primary());
4253 post_event(i);
4254 return transit< Stray >();
4255 }
4256
4257 boost::statechart::result PeeringState::Initial::react(const MLogRec& i)
4258 {
4259 DECLARE_LOCALS;
4260 ceph_assert(!ps->is_primary());
4261 post_event(i);
4262 return transit< Stray >();
4263 }
4264
4265 void PeeringState::Initial::exit()
4266 {
4267 context< PeeringMachine >().log_exit(state_name, enter_time);
4268 DECLARE_LOCALS;
4269 utime_t dur = ceph_clock_now() - enter_time;
4270 pl->get_peering_perf().tinc(rs_initial_latency, dur);
4271 }
4272
4273 /*------Started-------*/
4274 PeeringState::Started::Started(my_context ctx)
4275 : my_base(ctx),
4276 NamedState(context< PeeringMachine >().state_history, "Started")
4277 {
4278 context< PeeringMachine >().log_enter(state_name);
4279 }
4280
4281 boost::statechart::result
4282 PeeringState::Started::react(const IntervalFlush&)
4283 {
4284 psdout(10) << "Ending blocked outgoing recovery messages" << dendl;
4285 context< PeeringMachine >().state->end_block_outgoing();
4286 return discard_event();
4287 }
4288
4289 boost::statechart::result PeeringState::Started::react(const AdvMap& advmap)
4290 {
4291 DECLARE_LOCALS;
4292 psdout(10) << "Started advmap" << dendl;
4293 ps->check_full_transition(advmap.lastmap, advmap.osdmap);
4294 if (ps->should_restart_peering(
4295 advmap.up_primary,
4296 advmap.acting_primary,
4297 advmap.newup,
4298 advmap.newacting,
4299 advmap.lastmap,
4300 advmap.osdmap)) {
4301 psdout(10) << "should_restart_peering, transitioning to Reset"
4302 << dendl;
4303 post_event(advmap);
4304 return transit< Reset >();
4305 }
4306 ps->remove_down_peer_info(advmap.osdmap);
4307 return discard_event();
4308 }
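// (Annotation, not part of the original source.) post_event() before
// transit<>() is the recurring idiom here: the event is queued again so
// the destination state gets a chance to handle it. A minimal
// boost::statechart sketch of the pattern (hypothetical states):
//
//   #include <boost/statechart/event.hpp>
//   #include <boost/statechart/state_machine.hpp>
//   #include <boost/statechart/simple_state.hpp>
//   #include <boost/statechart/custom_reaction.hpp>
//   namespace sc = boost::statechart;
//
//   struct EvMap : sc::event<EvMap> {};
//   struct SStarted;
//   struct SReset;
//   struct Machine : sc::state_machine<Machine, SStarted> {};
//   struct SStarted : sc::simple_state<SStarted, Machine> {
//     typedef sc::custom_reaction<EvMap> reactions;
//     sc::result react(const EvMap &ev);
//   };
//   struct SReset : sc::simple_state<SReset, Machine> {
//     typedef sc::custom_reaction<EvMap> reactions;
//     sc::result react(const EvMap&) { return discard_event(); }
//   };
//   sc::result SStarted::react(const EvMap &ev) {
//     post_event(ev);            // requeue for the next state
//     return transit<SReset>();  // SReset::react(EvMap) runs next
//   }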
4309
4310 boost::statechart::result PeeringState::Started::react(const QueryState& q)
4311 {
4312 q.f->open_object_section("state");
4313 q.f->dump_string("name", state_name);
4314 q.f->dump_stream("enter_time") << enter_time;
4315 q.f->close_section();
4316 return discard_event();
4317 }
4318
4319 void PeeringState::Started::exit()
4320 {
4321 context< PeeringMachine >().log_exit(state_name, enter_time);
4322 DECLARE_LOCALS;
4323 utime_t dur = ceph_clock_now() - enter_time;
4324 pl->get_peering_perf().tinc(rs_started_latency, dur);
4325 ps->state_clear(PG_STATE_WAIT | PG_STATE_LAGGY);
4326 }
4327
4328 /*--------Reset---------*/
4329 PeeringState::Reset::Reset(my_context ctx)
4330 : my_base(ctx),
4331 NamedState(context< PeeringMachine >().state_history, "Reset")
4332 {
4333 context< PeeringMachine >().log_enter(state_name);
4334 DECLARE_LOCALS;
4335
4336 ps->flushes_in_progress = 0;
4337 ps->set_last_peering_reset();
4338 ps->log_weirdness();
4339 }
4340
4341 boost::statechart::result
4342 PeeringState::Reset::react(const IntervalFlush&)
4343 {
4344 psdout(10) << "Ending blocked outgoing recovery messages" << dendl;
4345 context< PeeringMachine >().state->end_block_outgoing();
4346 return discard_event();
4347 }
4348
4349 boost::statechart::result PeeringState::Reset::react(const AdvMap& advmap)
4350 {
4351 DECLARE_LOCALS;
4352 psdout(10) << "Reset advmap" << dendl;
4353
4354 ps->check_full_transition(advmap.lastmap, advmap.osdmap);
4355
4356 if (ps->should_restart_peering(
4357 advmap.up_primary,
4358 advmap.acting_primary,
4359 advmap.newup,
4360 advmap.newacting,
4361 advmap.lastmap,
4362 advmap.osdmap)) {
4363 psdout(10) << "should restart peering, calling start_peering_interval again"
4364 << dendl;
4365 ps->start_peering_interval(
4366 advmap.lastmap,
4367 advmap.newup, advmap.up_primary,
4368 advmap.newacting, advmap.acting_primary,
4369 context< PeeringMachine >().get_cur_transaction());
4370 }
4371 ps->remove_down_peer_info(advmap.osdmap);
4372 ps->check_past_interval_bounds();
4373 return discard_event();
4374 }
4375
4376 boost::statechart::result PeeringState::Reset::react(const ActMap&)
4377 {
4378 DECLARE_LOCALS;
4379 if (ps->should_send_notify() && ps->get_primary().osd >= 0) {
4380 ps->info.history.refresh_prior_readable_until_ub(
4381 pl->get_mnow(),
4382 ps->prior_readable_until_ub);
4383 context< PeeringMachine >().send_notify(
4384 ps->get_primary().osd,
4385 pg_notify_t(
4386 ps->get_primary().shard, ps->pg_whoami.shard,
4387 ps->get_osdmap_epoch(),
4388 ps->get_osdmap_epoch(),
4389 ps->info,
4390 ps->past_intervals));
4391 }
4392
4393 ps->update_heartbeat_peers();
4394
4395 return transit< Started >();
4396 }
4397
4398 boost::statechart::result PeeringState::Reset::react(const QueryState& q)
4399 {
4400 q.f->open_object_section("state");
4401 q.f->dump_string("name", state_name);
4402 q.f->dump_stream("enter_time") << enter_time;
4403 q.f->close_section();
4404 return discard_event();
4405 }
4406
4407 void PeeringState::Reset::exit()
4408 {
4409 context< PeeringMachine >().log_exit(state_name, enter_time);
4410 DECLARE_LOCALS;
4411 utime_t dur = ceph_clock_now() - enter_time;
4412 pl->get_peering_perf().tinc(rs_reset_latency, dur);
4413 }
4414
4415 /*-------Start---------*/
4416 PeeringState::Start::Start(my_context ctx)
4417 : my_base(ctx),
4418 NamedState(context< PeeringMachine >().state_history, "Start")
4419 {
4420 context< PeeringMachine >().log_enter(state_name);
4421
4422 DECLARE_LOCALS;
4423 if (ps->is_primary()) {
4424 psdout(1) << "transitioning to Primary" << dendl;
4425 post_event(MakePrimary());
4426 } else { // is_stray
4427 psdout(1) << "transitioning to Stray" << dendl;
4428 post_event(MakeStray());
4429 }
4430 }
4431
4432 void PeeringState::Start::exit()
4433 {
4434 context< PeeringMachine >().log_exit(state_name, enter_time);
4435 DECLARE_LOCALS;
4436 utime_t dur = ceph_clock_now() - enter_time;
4437 pl->get_peering_perf().tinc(rs_start_latency, dur);
4438 }
4439
4440 /*---------Primary--------*/
4441 PeeringState::Primary::Primary(my_context ctx)
4442 : my_base(ctx),
4443 NamedState(context< PeeringMachine >().state_history, "Started/Primary")
4444 {
4445 context< PeeringMachine >().log_enter(state_name);
4446 DECLARE_LOCALS;
4447 ceph_assert(ps->want_acting.empty());
4448
4449 // set CREATING bit until we have peered for the first time.
4450 if (ps->info.history.last_epoch_started == 0) {
4451 ps->state_set(PG_STATE_CREATING);
4452 // use the history timestamp, which ultimately comes from the
4453 // monitor in the create case.
4454 utime_t t = ps->info.history.last_scrub_stamp;
4455 ps->info.stats.last_fresh = t;
4456 ps->info.stats.last_active = t;
4457 ps->info.stats.last_change = t;
4458 ps->info.stats.last_peered = t;
4459 ps->info.stats.last_clean = t;
4460 ps->info.stats.last_unstale = t;
4461 ps->info.stats.last_undegraded = t;
4462 ps->info.stats.last_fullsized = t;
4463 ps->info.stats.last_scrub_stamp = t;
4464 ps->info.stats.last_deep_scrub_stamp = t;
4465 ps->info.stats.last_clean_scrub_stamp = t;
4466 }
4467 }
4468
4469 boost::statechart::result PeeringState::Primary::react(const MNotifyRec& notevt)
4470 {
4471 DECLARE_LOCALS;
4472 psdout(7) << "handle_pg_notify from osd." << notevt.from << dendl;
4473 ps->proc_replica_info(
4474 notevt.from, notevt.notify.info, notevt.notify.epoch_sent);
4475 return discard_event();
4476 }
4477
4478 boost::statechart::result PeeringState::Primary::react(const ActMap&)
4479 {
4480 DECLARE_LOCALS;
4481 psdout(7) << "handle ActMap primary" << dendl;
4482 pl->publish_stats_to_osd();
4483 return discard_event();
4484 }
4485
4486 boost::statechart::result PeeringState::Primary::react(
4487 const SetForceRecovery&)
4488 {
4489 DECLARE_LOCALS;
4490 ps->set_force_recovery(true);
4491 return discard_event();
4492 }
4493
4494 boost::statechart::result PeeringState::Primary::react(
4495 const UnsetForceRecovery&)
4496 {
4497 DECLARE_LOCALS;
4498 ps->set_force_recovery(false);
4499 return discard_event();
4500 }
4501
4502 boost::statechart::result PeeringState::Primary::react(
4503 const RequestScrub& evt)
4504 {
4505 DECLARE_LOCALS;
4506 if (ps->is_primary()) {
4507 pl->scrub_requested(evt.deep, evt.repair);
4508 psdout(10) << "marking for scrub" << dendl;
4509 }
4510 return discard_event();
4511 }
4512
4513 boost::statechart::result PeeringState::Primary::react(
4514 const SetForceBackfill&)
4515 {
4516 DECLARE_LOCALS;
4517 ps->set_force_backfill(true);
4518 return discard_event();
4519 }
4520
4521 boost::statechart::result PeeringState::Primary::react(
4522 const UnsetForceBackfill&)
4523 {
4524 DECLARE_LOCALS;
4525 ps->set_force_backfill(false);
4526 return discard_event();
4527 }
4528
4529 void PeeringState::Primary::exit()
4530 {
4531 context< PeeringMachine >().log_exit(state_name, enter_time);
4532 DECLARE_LOCALS;
4533 ps->want_acting.clear();
4534 utime_t dur = ceph_clock_now() - enter_time;
4535 pl->get_peering_perf().tinc(rs_primary_latency, dur);
4536 pl->clear_primary_state();
4537 ps->state_clear(PG_STATE_CREATING);
4538 }
4539
4540 /*---------Peering--------*/
4541 PeeringState::Peering::Peering(my_context ctx)
4542 : my_base(ctx),
4543 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering"),
4544 history_les_bound(false)
4545 {
4546 context< PeeringMachine >().log_enter(state_name);
4547 DECLARE_LOCALS;
4548
4549 ceph_assert(!ps->is_peered());
4550 ceph_assert(!ps->is_peering());
4551 ceph_assert(ps->is_primary());
4552 ps->state_set(PG_STATE_PEERING);
4553 }
4554
4555 boost::statechart::result PeeringState::Peering::react(const AdvMap& advmap)
4556 {
4557 DECLARE_LOCALS;
4558 psdout(10) << "Peering advmap" << dendl;
4559 if (prior_set.affected_by_map(*(advmap.osdmap), ps->dpp)) {
4560 psdout(1) << "Peering, affected_by_map, going to Reset" << dendl;
4561 post_event(advmap);
4562 return transit< Reset >();
4563 }
4564
4565 ps->adjust_need_up_thru(advmap.osdmap);
4566 ps->check_prior_readable_down_osds(advmap.osdmap);
4567
4568 return forward_event();
4569 }
4570
4571 boost::statechart::result PeeringState::Peering::react(const QueryState& q)
4572 {
4573 DECLARE_LOCALS;
4574
4575 q.f->open_object_section("state");
4576 q.f->dump_string("name", state_name);
4577 q.f->dump_stream("enter_time") << enter_time;
4578
4579 q.f->open_array_section("past_intervals");
4580 ps->past_intervals.dump(q.f);
4581 q.f->close_section();
4582
4583 q.f->open_array_section("probing_osds");
4584 for (set<pg_shard_t>::iterator p = prior_set.probe.begin();
4585 p != prior_set.probe.end();
4586 ++p)
4587 q.f->dump_stream("osd") << *p;
4588 q.f->close_section();
4589
4590 if (prior_set.pg_down)
4591 q.f->dump_string("blocked", "peering is blocked due to down osds");
4592
4593 q.f->open_array_section("down_osds_we_would_probe");
4594 for (set<int>::iterator p = prior_set.down.begin();
4595 p != prior_set.down.end();
4596 ++p)
4597 q.f->dump_int("osd", *p);
4598 q.f->close_section();
4599
4600 q.f->open_array_section("peering_blocked_by");
4601 for (map<int,epoch_t>::iterator p = prior_set.blocked_by.begin();
4602 p != prior_set.blocked_by.end();
4603 ++p) {
4604 q.f->open_object_section("osd");
4605 q.f->dump_int("osd", p->first);
4606 q.f->dump_int("current_lost_at", p->second);
4607 q.f->dump_string("comment", "starting or marking this osd lost may let us proceed");
4608 q.f->close_section();
4609 }
4610 q.f->close_section();
4611
4612 if (history_les_bound) {
4613 q.f->open_array_section("peering_blocked_by_detail");
4614 q.f->open_object_section("item");
4615 q.f->dump_string("detail","peering_blocked_by_history_les_bound");
4616 q.f->close_section();
4617 q.f->close_section();
4618 }
4619
4620 q.f->close_section();
4621 return forward_event();
4622 }
4623
4624 void PeeringState::Peering::exit()
4625 {
4626
4627 DECLARE_LOCALS;
4628 psdout(10) << "Leaving Peering" << dendl;
4629 context< PeeringMachine >().log_exit(state_name, enter_time);
4630 ps->state_clear(PG_STATE_PEERING);
4631 pl->clear_probe_targets();
4632
4633 utime_t dur = ceph_clock_now() - enter_time;
4634 pl->get_peering_perf().tinc(rs_peering_latency, dur);
4635 }
4636
4637
4638 /*------Backfilling-------*/
4639 PeeringState::Backfilling::Backfilling(my_context ctx)
4640 : my_base(ctx),
4641 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/Backfilling")
4642 {
4643 context< PeeringMachine >().log_enter(state_name);
4644
4646 DECLARE_LOCALS;
4647 ps->backfill_reserved = true;
4648 pl->on_backfill_reserved();
4649 ps->state_clear(PG_STATE_BACKFILL_TOOFULL);
4650 ps->state_clear(PG_STATE_BACKFILL_WAIT);
4651 ps->state_set(PG_STATE_BACKFILLING);
4652 pl->publish_stats_to_osd();
4653 }
4654
4655 void PeeringState::Backfilling::backfill_release_reservations()
4656 {
4657 DECLARE_LOCALS;
4658 pl->cancel_local_background_io_reservation();
4659 for (set<pg_shard_t>::iterator it = ps->backfill_targets.begin();
4660 it != ps->backfill_targets.end();
4661 ++it) {
4662 ceph_assert(*it != ps->pg_whoami);
4663 pl->send_cluster_message(
4664 it->osd,
4665 new MBackfillReserve(
4666 MBackfillReserve::RELEASE,
4667 spg_t(ps->info.pgid.pgid, it->shard),
4668 ps->get_osdmap_epoch()),
4669 ps->get_osdmap_epoch());
4670 }
4671 }
4672
4673 void PeeringState::Backfilling::cancel_backfill()
4674 {
4675 DECLARE_LOCALS;
4676 backfill_release_reservations();
4677 pl->on_backfill_canceled();
4678 }
4679
4680 boost::statechart::result
4681 PeeringState::Backfilling::react(const Backfilled &c)
4682 {
4683 backfill_release_reservations();
4684 return transit<Recovered>();
4685 }
4686
4687 boost::statechart::result
4688 PeeringState::Backfilling::react(const DeferBackfill &c)
4689 {
4690 DECLARE_LOCALS;
4691
4692 psdout(10) << "defer backfill, retry delay " << c.delay << dendl;
4693 ps->state_set(PG_STATE_BACKFILL_WAIT);
4694 ps->state_clear(PG_STATE_BACKFILLING);
4695 cancel_backfill();
4696
4697 pl->schedule_event_after(
4698 std::make_shared<PGPeeringEvent>(
4699 ps->get_osdmap_epoch(),
4700 ps->get_osdmap_epoch(),
4701 RequestBackfill()),
4702 c.delay);
4703 return transit<NotBackfilling>();
4704 }
4705
4706 boost::statechart::result
4707 PeeringState::Backfilling::react(const UnfoundBackfill &c)
4708 {
4709 DECLARE_LOCALS;
4710 psdout(10) << "backfill has unfound, can't continue" << dendl;
4711 ps->state_set(PG_STATE_BACKFILL_UNFOUND);
4712 ps->state_clear(PG_STATE_BACKFILLING);
4713 cancel_backfill();
4714 return transit<NotBackfilling>();
4715 }
4716
4717 boost::statechart::result
4718 PeeringState::Backfilling::react(const RemoteReservationRevokedTooFull &)
4719 {
4720 DECLARE_LOCALS;
4721
4722 ps->state_set(PG_STATE_BACKFILL_TOOFULL);
4723 ps->state_clear(PG_STATE_BACKFILLING);
4724 cancel_backfill();
4725
4726 pl->schedule_event_after(
4727 std::make_shared<PGPeeringEvent>(
4728 ps->get_osdmap_epoch(),
4729 ps->get_osdmap_epoch(),
4730 RequestBackfill()),
4731 ps->cct->_conf->osd_backfill_retry_interval);
4732
4733 return transit<NotBackfilling>();
4734 }
4735
4736 boost::statechart::result
4737 PeeringState::Backfilling::react(const RemoteReservationRevoked &)
4738 {
4739 DECLARE_LOCALS;
4740 ps->state_set(PG_STATE_BACKFILL_WAIT);
4741 cancel_backfill();
4742 if (ps->needs_backfill()) {
4743 return transit<WaitLocalBackfillReserved>();
4744 } else {
4745 // raced with MOSDPGBackfill::OP_BACKFILL_FINISH, ignore
4746 return discard_event();
4747 }
4748 }
4749
4750 void PeeringState::Backfilling::exit()
4751 {
4752 context< PeeringMachine >().log_exit(state_name, enter_time);
4753 DECLARE_LOCALS;
4754 ps->backfill_reserved = false;
4755 ps->state_clear(PG_STATE_BACKFILLING);
4756 ps->state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
4757 utime_t dur = ceph_clock_now() - enter_time;
4758 pl->get_peering_perf().tinc(rs_backfilling_latency, dur);
4759 }
4760
4761 /*--WaitRemoteBackfillReserved--*/
4762
4763 PeeringState::WaitRemoteBackfillReserved::WaitRemoteBackfillReserved(my_context ctx)
4764 : my_base(ctx),
4765 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/WaitRemoteBackfillReserved"),
4766 backfill_osd_it(context< Active >().remote_shards_to_reserve_backfill.begin())
4767 {
4768 context< PeeringMachine >().log_enter(state_name);
4769 DECLARE_LOCALS;
4770
4771 ps->state_set(PG_STATE_BACKFILL_WAIT);
4772 pl->publish_stats_to_osd();
4773 post_event(RemoteBackfillReserved());
4774 }
4775
4776 boost::statechart::result
4777 PeeringState::WaitRemoteBackfillReserved::react(const RemoteBackfillReserved &evt)
4778 {
4779 DECLARE_LOCALS;
4780
4781 int64_t num_bytes = ps->info.stats.stats.sum.num_bytes;
4782 psdout(10) << __func__ << " num_bytes " << num_bytes << dendl;
4783 if (backfill_osd_it !=
4784 context< Active >().remote_shards_to_reserve_backfill.end()) {
4785 // The primary never backfills itself
4786 ceph_assert(*backfill_osd_it != ps->pg_whoami);
4787 pl->send_cluster_message(
4788 backfill_osd_it->osd,
4789 new MBackfillReserve(
4790 MBackfillReserve::REQUEST,
4791 spg_t(context< PeeringMachine >().spgid.pgid, backfill_osd_it->shard),
4792 ps->get_osdmap_epoch(),
4793 ps->get_backfill_priority(),
4794 num_bytes,
4795 ps->peer_bytes[*backfill_osd_it]),
4796 ps->get_osdmap_epoch());
4797 ++backfill_osd_it;
4798 } else {
4799 ps->peer_bytes.clear();
4800 post_event(AllBackfillsReserved());
4801 }
4802 return discard_event();
4803 }
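// (Annotation, not part of the original source.) Reservations are
// acquired one backfill shard at a time. The constructor above posts a
// synthetic RemoteBackfillReserved to start the chain, then each GRANT
// re-enters react() and requests the next shard:
//
//   post_event(RemoteBackfillReserved())   // kick-start, no peer yet
//   react(): REQUEST -> shard[0], ++it     // wait for its GRANT
//   react(): REQUEST -> shard[1], ++it     // ... and so on ...
//   react(): it == end -> post_event(AllBackfillsReserved())
//
// A rejection or revocation at any point runs retry(), which RELEASEs
// everything acquired so far (the shards before backfill_osd_it).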
4804
4805 void PeeringState::WaitRemoteBackfillReserved::exit()
4806 {
4807 context< PeeringMachine >().log_exit(state_name, enter_time);
4808 DECLARE_LOCALS;
4809
4810 utime_t dur = ceph_clock_now() - enter_time;
4811 pl->get_peering_perf().tinc(rs_waitremotebackfillreserved_latency, dur);
4812 }
4813
4814 void PeeringState::WaitRemoteBackfillReserved::retry()
4815 {
4816 DECLARE_LOCALS;
4817 pl->cancel_local_background_io_reservation();
4818
4819 // Send CANCEL to all previously acquired reservations
4820 set<pg_shard_t>::const_iterator it, begin, end;
4821 begin = context< Active >().remote_shards_to_reserve_backfill.begin();
4822 end = context< Active >().remote_shards_to_reserve_backfill.end();
4823 ceph_assert(begin != end);
4824 for (it = begin; it != backfill_osd_it; ++it) {
4825 // The primary never backfills itself
4826 ceph_assert(*it != ps->pg_whoami);
4827 pl->send_cluster_message(
4828 it->osd,
4829 new MBackfillReserve(
4830 MBackfillReserve::RELEASE,
4831 spg_t(context< PeeringMachine >().spgid.pgid, it->shard),
4832 ps->get_osdmap_epoch()),
4833 ps->get_osdmap_epoch());
4834 }
4835
4836 ps->state_clear(PG_STATE_BACKFILL_WAIT);
4837 pl->publish_stats_to_osd();
4838
4839 pl->schedule_event_after(
4840 std::make_shared<PGPeeringEvent>(
4841 ps->get_osdmap_epoch(),
4842 ps->get_osdmap_epoch(),
4843 RequestBackfill()),
4844 ps->cct->_conf->osd_backfill_retry_interval);
4845 }
4846
4847 boost::statechart::result
4848 PeeringState::WaitRemoteBackfillReserved::react(const RemoteReservationRejectedTooFull &evt)
4849 {
4850 DECLARE_LOCALS;
4851 ps->state_set(PG_STATE_BACKFILL_TOOFULL);
4852 retry();
4853 return transit<NotBackfilling>();
4854 }
4855
4856 boost::statechart::result
4857 PeeringState::WaitRemoteBackfillReserved::react(const RemoteReservationRevoked &evt)
4858 {
4859 retry();
4860 return transit<NotBackfilling>();
4861 }
4862
4863 /*--WaitLocalBackfillReserved--*/
4864 PeeringState::WaitLocalBackfillReserved::WaitLocalBackfillReserved(my_context ctx)
4865 : my_base(ctx),
4866 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/WaitLocalBackfillReserved")
4867 {
4868 context< PeeringMachine >().log_enter(state_name);
4869 DECLARE_LOCALS;
4870
4871 ps->state_set(PG_STATE_BACKFILL_WAIT);
4872 pl->request_local_background_io_reservation(
4873 ps->get_backfill_priority(),
4874 std::make_shared<PGPeeringEvent>(
4875 ps->get_osdmap_epoch(),
4876 ps->get_osdmap_epoch(),
4877 LocalBackfillReserved()),
4878 std::make_shared<PGPeeringEvent>(
4879 ps->get_osdmap_epoch(),
4880 ps->get_osdmap_epoch(),
4881 DeferBackfill(0.0)));
4882 pl->publish_stats_to_osd();
4883 }
4884
4885 void PeeringState::WaitLocalBackfillReserved::exit()
4886 {
4887 context< PeeringMachine >().log_exit(state_name, enter_time);
4888 DECLARE_LOCALS;
4889 utime_t dur = ceph_clock_now() - enter_time;
4890 pl->get_peering_perf().tinc(rs_waitlocalbackfillreserved_latency, dur);
4891 }
4892
4893 /*----NotBackfilling------*/
4894 PeeringState::NotBackfilling::NotBackfilling(my_context ctx)
4895 : my_base(ctx),
4896 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/NotBackfilling")
4897 {
4898 context< PeeringMachine >().log_enter(state_name);
4899 DECLARE_LOCALS;
4900 ps->state_clear(PG_STATE_REPAIR);
4901 pl->publish_stats_to_osd();
4902 }
4903
4904 boost::statechart::result
4905 PeeringState::NotBackfilling::react(const RemoteBackfillReserved &evt)
4906 {
4907 return discard_event();
4908 }
4909
4910 boost::statechart::result
4911 PeeringState::NotBackfilling::react(const RemoteReservationRejectedTooFull &evt)
4912 {
4913 return discard_event();
4914 }
4915
4916 void PeeringState::NotBackfilling::exit()
4917 {
4918 context< PeeringMachine >().log_exit(state_name, enter_time);
4919
4920 DECLARE_LOCALS;
4921 ps->state_clear(PG_STATE_BACKFILL_UNFOUND);
4922 utime_t dur = ceph_clock_now() - enter_time;
4923 pl->get_peering_perf().tinc(rs_notbackfilling_latency, dur);
4924 }
4925
4926 /*----NotRecovering------*/
4927 PeeringState::NotRecovering::NotRecovering(my_context ctx)
4928 : my_base(ctx),
4929 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/NotRecovering")
4930 {
4931 context< PeeringMachine >().log_enter(state_name);
4932 DECLARE_LOCALS;
4933 ps->state_clear(PG_STATE_REPAIR);
4934 pl->publish_stats_to_osd();
4935 }
4936
4937 void PeeringState::NotRecovering::exit()
4938 {
4939 context< PeeringMachine >().log_exit(state_name, enter_time);
4940
4941 DECLARE_LOCALS;
4942 ps->state_clear(PG_STATE_RECOVERY_UNFOUND);
4943 utime_t dur = ceph_clock_now() - enter_time;
4944 pl->get_peering_perf().tinc(rs_notrecovering_latency, dur);
4945 }
4946
4947 /*---RepNotRecovering----*/
4948 PeeringState::RepNotRecovering::RepNotRecovering(my_context ctx)
4949 : my_base(ctx),
4950 NamedState(context< PeeringMachine >().state_history, "Started/ReplicaActive/RepNotRecovering")
4951 {
4952 context< PeeringMachine >().log_enter(state_name);
4953 }
4954
4955 boost::statechart::result
4956 PeeringState::RepNotRecovering::react(const RejectTooFullRemoteReservation &evt)
4957 {
4958 DECLARE_LOCALS;
4959 ps->reject_reservation();
4960 post_event(RemoteReservationRejectedTooFull());
4961 return discard_event();
4962 }
4963
4964 void PeeringState::RepNotRecovering::exit()
4965 {
4966 context< PeeringMachine >().log_exit(state_name, enter_time);
4967 DECLARE_LOCALS;
4968 utime_t dur = ceph_clock_now() - enter_time;
4969 pl->get_peering_perf().tinc(rs_repnotrecovering_latency, dur);
4970 }
4971
4972 /*---RepWaitRecoveryReserved--*/
4973 PeeringState::RepWaitRecoveryReserved::RepWaitRecoveryReserved(my_context ctx)
4974 : my_base(ctx),
4975 NamedState(context< PeeringMachine >().state_history, "Started/ReplicaActive/RepWaitRecoveryReserved")
4976 {
4977 context< PeeringMachine >().log_enter(state_name);
4978 }
4979
4980 boost::statechart::result
4981 PeeringState::RepWaitRecoveryReserved::react(const RemoteRecoveryReserved &evt)
4982 {
4983 DECLARE_LOCALS;
4984 pl->send_cluster_message(
4985 ps->primary.osd,
4986 new MRecoveryReserve(
4987 MRecoveryReserve::GRANT,
4988 spg_t(ps->info.pgid.pgid, ps->primary.shard),
4989 ps->get_osdmap_epoch()),
4990 ps->get_osdmap_epoch());
4991 return transit<RepRecovering>();
4992 }
4993
4994 boost::statechart::result
4995 PeeringState::RepWaitRecoveryReserved::react(
4996 const RemoteReservationCanceled &evt)
4997 {
4998 DECLARE_LOCALS;
4999 pl->unreserve_recovery_space();
5000
5001 pl->cancel_remote_recovery_reservation();
5002 return transit<RepNotRecovering>();
5003 }
5004
5005 void PeeringState::RepWaitRecoveryReserved::exit()
5006 {
5007 context< PeeringMachine >().log_exit(state_name, enter_time);
5008 DECLARE_LOCALS;
5009 utime_t dur = ceph_clock_now() - enter_time;
5010 pl->get_peering_perf().tinc(rs_repwaitrecoveryreserved_latency, dur);
5011 }
5012
5013 /*-RepWaitBackfillReserved*/
5014 PeeringState::RepWaitBackfillReserved::RepWaitBackfillReserved(my_context ctx)
5015 : my_base(ctx),
5016 NamedState(context< PeeringMachine >().state_history, "Started/ReplicaActive/RepWaitBackfillReserved")
5017 {
5018 context< PeeringMachine >().log_enter(state_name);
5019 }
5020
5021 boost::statechart::result
5022 PeeringState::RepNotRecovering::react(const RequestBackfillPrio &evt)
5023 {
5024
5025 DECLARE_LOCALS;
5026
5027 if (!pl->try_reserve_recovery_space(
5028 evt.primary_num_bytes, evt.local_num_bytes)) {
5029 post_event(RejectTooFullRemoteReservation());
5030 } else {
5031 PGPeeringEventRef preempt;
5032 if (HAVE_FEATURE(ps->upacting_features, RECOVERY_RESERVATION_2)) {
5033 // older peers will interpret preemption as TOOFULL
5034 preempt = std::make_shared<PGPeeringEvent>(
5035 pl->get_osdmap_epoch(),
5036 pl->get_osdmap_epoch(),
5037 RemoteBackfillPreempted());
5038 }
5039 pl->request_remote_recovery_reservation(
5040 evt.priority,
5041 std::make_shared<PGPeeringEvent>(
5042 pl->get_osdmap_epoch(),
5043 pl->get_osdmap_epoch(),
5044 RemoteBackfillReserved()),
5045 preempt);
5046 }
5047 return transit<RepWaitBackfillReserved>();
5048 }
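// (Annotation, not part of the original source.) The preempt handle is
// only wired up when every up/acting peer speaks RECOVERY_RESERVATION_2;
// otherwise preemption would be misread as TOOFULL. Shape of the call:
//
//   pl->request_remote_recovery_reservation(
//     evt.priority,
//     on_grant,   // queues RemoteBackfillReserved when space is granted
//     preempt);   // queues RemoteBackfillPreempted, or null for old peers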
5049
5050 boost::statechart::result
5051 PeeringState::RepNotRecovering::react(const RequestRecoveryPrio &evt)
5052 {
5053 DECLARE_LOCALS;
5054
5055 // fall back to a local reckoning of priority if the primary doesn't pass one
5056 // (pre-mimic compat)
5057 int prio = evt.priority ? evt.priority : ps->get_recovery_priority();
5058
5059 PGPeeringEventRef preempt;
5060 if (HAVE_FEATURE(ps->upacting_features, RECOVERY_RESERVATION_2)) {
5061 // older peers can't handle this
5062 preempt = std::make_shared<PGPeeringEvent>(
5063 ps->get_osdmap_epoch(),
5064 ps->get_osdmap_epoch(),
5065 RemoteRecoveryPreempted());
5066 }
5067
5068 pl->request_remote_recovery_reservation(
5069 prio,
5070 std::make_shared<PGPeeringEvent>(
5071 ps->get_osdmap_epoch(),
5072 ps->get_osdmap_epoch(),
5073 RemoteRecoveryReserved()),
5074 preempt);
5075 return transit<RepWaitRecoveryReserved>();
5076 }
5077
5078 void PeeringState::RepWaitBackfillReserved::exit()
5079 {
5080 context< PeeringMachine >().log_exit(state_name, enter_time);
5081 DECLARE_LOCALS;
5082 utime_t dur = ceph_clock_now() - enter_time;
5083 pl->get_peering_perf().tinc(rs_repwaitbackfillreserved_latency, dur);
5084 }
5085
5086 boost::statechart::result
5087 PeeringState::RepWaitBackfillReserved::react(const RemoteBackfillReserved &evt)
5088 {
5089 DECLARE_LOCALS;
5090
5092 pl->send_cluster_message(
5093 ps->primary.osd,
5094 new MBackfillReserve(
5095 MBackfillReserve::GRANT,
5096 spg_t(ps->info.pgid.pgid, ps->primary.shard),
5097 ps->get_osdmap_epoch()),
5098 ps->get_osdmap_epoch());
5099 return transit<RepRecovering>();
5100 }
5101
5102 boost::statechart::result
5103 PeeringState::RepWaitBackfillReserved::react(
5104 const RejectTooFullRemoteReservation &evt)
5105 {
5106 DECLARE_LOCALS;
5107 ps->reject_reservation();
5108 post_event(RemoteReservationRejectedTooFull());
5109 return discard_event();
5110 }
5111
5112 boost::statechart::result
5113 PeeringState::RepWaitBackfillReserved::react(
5114 const RemoteReservationRejectedTooFull &evt)
5115 {
5116 DECLARE_LOCALS;
5117 pl->unreserve_recovery_space();
5118
5119 pl->cancel_remote_recovery_reservation();
5120 return transit<RepNotRecovering>();
5121 }
5122
5123 boost::statechart::result
5124 PeeringState::RepWaitBackfillReserved::react(
5125 const RemoteReservationCanceled &evt)
5126 {
5127 DECLARE_LOCALS;
5128 pl->unreserve_recovery_space();
5129
5130 pl->cancel_remote_recovery_reservation();
5131 return transit<RepNotRecovering>();
5132 }
5133
5134 /*---RepRecovering-------*/
5135 PeeringState::RepRecovering::RepRecovering(my_context ctx)
5136 : my_base(ctx),
5137 NamedState(context< PeeringMachine >().state_history, "Started/ReplicaActive/RepRecovering")
5138 {
5139 context< PeeringMachine >().log_enter(state_name);
5140 }
5141
5142 boost::statechart::result
5143 PeeringState::RepRecovering::react(const RemoteRecoveryPreempted &)
5144 {
5145 DECLARE_LOCALS;
5146
5147
5148 pl->unreserve_recovery_space();
5149 pl->send_cluster_message(
5150 ps->primary.osd,
5151 new MRecoveryReserve(
5152 MRecoveryReserve::REVOKE,
5153 spg_t(ps->info.pgid.pgid, ps->primary.shard),
5154 ps->get_osdmap_epoch()),
5155 ps->get_osdmap_epoch());
5156 return discard_event();
5157 }
5158
5159 boost::statechart::result
5160 PeeringState::RepRecovering::react(const BackfillTooFull &)
5161 {
5162 DECLARE_LOCALS;
5163
5164
5165 pl->unreserve_recovery_space();
5166 pl->send_cluster_message(
5167 ps->primary.osd,
5168 new MBackfillReserve(
5169 MBackfillReserve::REVOKE_TOOFULL,
5170 spg_t(ps->info.pgid.pgid, ps->primary.shard),
5171 ps->get_osdmap_epoch()),
5172 ps->get_osdmap_epoch());
5173 return discard_event();
5174 }
5175
5176 boost::statechart::result
5177 PeeringState::RepRecovering::react(const RemoteBackfillPreempted &)
5178 {
5179 DECLARE_LOCALS;
5180
5181
5182 pl->unreserve_recovery_space();
5183 pl->send_cluster_message(
5184 ps->primary.osd,
5185 new MBackfillReserve(
5186 MBackfillReserve::REVOKE,
5187 spg_t(ps->info.pgid.pgid, ps->primary.shard),
5188 ps->get_osdmap_epoch()),
5189 ps->get_osdmap_epoch());
5190 return discard_event();
5191 }
5192
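// Each preemption/too-full handler in RepRecovering follows the same
// pattern: free the locally reserved space, send the matching REVOKE-style
// message back to the primary (MRecoveryReserve::REVOKE,
// MBackfillReserve::REVOKE_TOOFULL, or MBackfillReserve::REVOKE), and stay
// in this state until the primary reacts; exit() below then performs the
// final cleanup unconditionally.
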
5193 void PeeringState::RepRecovering::exit()
5194 {
5195 context< PeeringMachine >().log_exit(state_name, enter_time);
5196 DECLARE_LOCALS;
5197 pl->unreserve_recovery_space();
5198
5199 pl->cancel_remote_recovery_reservation();
5200 utime_t dur = ceph_clock_now() - enter_time;
5201 pl->get_peering_perf().tinc(rs_reprecovering_latency, dur);
5202 }
5203
5204 /*------Activating--------*/
5205 PeeringState::Activating::Activating(my_context ctx)
5206 : my_base(ctx),
5207 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/Activating")
5208 {
5209 context< PeeringMachine >().log_enter(state_name);
5210 }
5211
5212 void PeeringState::Activating::exit()
5213 {
5214 context< PeeringMachine >().log_exit(state_name, enter_time);
5215 DECLARE_LOCALS;
5216 utime_t dur = ceph_clock_now() - enter_time;
5217 pl->get_peering_perf().tinc(rs_activating_latency, dur);
5218 }
5219
5220 PeeringState::WaitLocalRecoveryReserved::WaitLocalRecoveryReserved(my_context ctx)
5221 : my_base(ctx),
5222 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/WaitLocalRecoveryReserved")
5223 {
5224 context< PeeringMachine >().log_enter(state_name);
5225 DECLARE_LOCALS;
5226
5227   // Make sure none of the nodes taking part in the recovery are full
5228 if (!ps->cct->_conf->osd_debug_skip_full_check_in_recovery &&
5229 ps->get_osdmap()->check_full(ps->acting_recovery_backfill)) {
5230 post_event(RecoveryTooFull());
5231 return;
5232 }
5233
5234 ps->state_clear(PG_STATE_RECOVERY_TOOFULL);
5235 ps->state_set(PG_STATE_RECOVERY_WAIT);
5236 pl->request_local_background_io_reservation(
5237 ps->get_recovery_priority(),
5238 std::make_shared<PGPeeringEvent>(
5239 ps->get_osdmap_epoch(),
5240 ps->get_osdmap_epoch(),
5241 LocalRecoveryReserved()),
5242 std::make_shared<PGPeeringEvent>(
5243 ps->get_osdmap_epoch(),
5244 ps->get_osdmap_epoch(),
5245 DeferRecovery(0.0)));
5246 pl->publish_stats_to_osd();
5247 }
5248
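// Local reservations use the same grant/preempt event pairing as the
// remote path: LocalRecoveryReserved on grant, DeferRecovery(0.0) if the
// local reserver preempts us.  A minimal sketch of rescheduling after a
// too-full bounce, mirroring the handler below:
//
//   pl->schedule_event_after(
//     std::make_shared<PGPeeringEvent>(epoch, epoch, DoRecovery()),
//     ps->cct->_conf->osd_recovery_retry_interval);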
5249 boost::statechart::result
5250 PeeringState::WaitLocalRecoveryReserved::react(const RecoveryTooFull &evt)
5251 {
5252 DECLARE_LOCALS;
5253 ps->state_set(PG_STATE_RECOVERY_TOOFULL);
5254 pl->schedule_event_after(
5255 std::make_shared<PGPeeringEvent>(
5256 ps->get_osdmap_epoch(),
5257 ps->get_osdmap_epoch(),
5258 DoRecovery()),
5259 ps->cct->_conf->osd_recovery_retry_interval);
5260 return transit<NotRecovering>();
5261 }
5262
5263 void PeeringState::WaitLocalRecoveryReserved::exit()
5264 {
5265 context< PeeringMachine >().log_exit(state_name, enter_time);
5266 DECLARE_LOCALS;
5267 utime_t dur = ceph_clock_now() - enter_time;
5268 pl->get_peering_perf().tinc(rs_waitlocalrecoveryreserved_latency, dur);
5269 }
5270
5271 PeeringState::WaitRemoteRecoveryReserved::WaitRemoteRecoveryReserved(my_context ctx)
5272 : my_base(ctx),
5273 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/WaitRemoteRecoveryReserved"),
5274 remote_recovery_reservation_it(context< Active >().remote_shards_to_reserve_recovery.begin())
5275 {
5276 context< PeeringMachine >().log_enter(state_name);
5277 post_event(RemoteRecoveryReserved());
5278 }
5279
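// WaitRemoteRecoveryReserved serializes its requests: the constructor
// posts a synthetic RemoteRecoveryReserved to itself, and each grant
// (including that first synthetic one) sends a single
// MRecoveryReserve::REQUEST and advances the iterator, so at most one
// remote reservation is in flight at a time.  Once the iterator is
// exhausted, AllRemotesReserved moves the PG on to Recovering.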
5280 boost::statechart::result
5281 PeeringState::WaitRemoteRecoveryReserved::react(const RemoteRecoveryReserved &evt) {
5282 DECLARE_LOCALS;
5283
5284 if (remote_recovery_reservation_it !=
5285 context< Active >().remote_shards_to_reserve_recovery.end()) {
5286 ceph_assert(*remote_recovery_reservation_it != ps->pg_whoami);
5287 pl->send_cluster_message(
5288 remote_recovery_reservation_it->osd,
5289 new MRecoveryReserve(
5290 MRecoveryReserve::REQUEST,
5291 spg_t(context< PeeringMachine >().spgid.pgid,
5292 remote_recovery_reservation_it->shard),
5293 ps->get_osdmap_epoch(),
5294 ps->get_recovery_priority()),
5295 ps->get_osdmap_epoch());
5296 ++remote_recovery_reservation_it;
5297 } else {
5298 post_event(AllRemotesReserved());
5299 }
5300 return discard_event();
5301 }
5302
5303 void PeeringState::WaitRemoteRecoveryReserved::exit()
5304 {
5305 context< PeeringMachine >().log_exit(state_name, enter_time);
5306 DECLARE_LOCALS;
5307 utime_t dur = ceph_clock_now() - enter_time;
5308 pl->get_peering_perf().tinc(rs_waitremoterecoveryreserved_latency, dur);
5309 }
5310
5311 PeeringState::Recovering::Recovering(my_context ctx)
5312 : my_base(ctx),
5313 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/Recovering")
5314 {
5315 context< PeeringMachine >().log_enter(state_name);
5316
5317 DECLARE_LOCALS;
5318 ps->state_clear(PG_STATE_RECOVERY_WAIT);
5319 ps->state_clear(PG_STATE_RECOVERY_TOOFULL);
5320 ps->state_set(PG_STATE_RECOVERING);
5321 pl->on_recovery_reserved();
5322 ceph_assert(!ps->state_test(PG_STATE_ACTIVATING));
5323 pl->publish_stats_to_osd();
5324 }
5325
5326 void PeeringState::Recovering::release_reservations(bool cancel)
5327 {
5328 DECLARE_LOCALS;
5329 ceph_assert(cancel || !ps->pg_log.get_missing().have_missing());
5330
5331 // release remote reservations
5332 for (set<pg_shard_t>::const_iterator i =
5333 context< Active >().remote_shards_to_reserve_recovery.begin();
5334 i != context< Active >().remote_shards_to_reserve_recovery.end();
5335 ++i) {
5336 if (*i == ps->pg_whoami) // skip myself
5337 continue;
5338 pl->send_cluster_message(
5339 i->osd,
5340 new MRecoveryReserve(
5341 MRecoveryReserve::RELEASE,
5342 spg_t(ps->info.pgid.pgid, i->shard),
5343 ps->get_osdmap_epoch()),
5344 ps->get_osdmap_epoch());
5345 }
5346 }
5347
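// release_reservations() distinguishes a clean release (cancel == false,
// only legal once nothing is missing, per the assert above) from a
// cancellation (cancel == true, used by DeferRecovery and UnfoundRecovery
// below, where objects may still be missing).  Either way, every remote
// shard holding a reservation receives an MRecoveryReserve::RELEASE.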
5348 boost::statechart::result
5349 PeeringState::Recovering::react(const AllReplicasRecovered &evt)
5350 {
5351 DECLARE_LOCALS;
5352 ps->state_clear(PG_STATE_FORCED_RECOVERY);
5353 release_reservations();
5354 pl->cancel_local_background_io_reservation();
5355 return transit<Recovered>();
5356 }
5357
5358 boost::statechart::result
5359 PeeringState::Recovering::react(const RequestBackfill &evt)
5360 {
5361 DECLARE_LOCALS;
5362
5363 release_reservations();
5364
5365 ps->state_clear(PG_STATE_FORCED_RECOVERY);
5366 pl->cancel_local_background_io_reservation();
5367 pl->publish_stats_to_osd();
5368   // transition any async_recovery_targets back into acting
5369   // so the pg won't have to stay undersized for long,
5370   // as backfill might take a long time to complete.
5371 if (!ps->async_recovery_targets.empty()) {
5372 pg_shard_t auth_log_shard;
5373 bool history_les_bound = false;
5374 ps->choose_acting(auth_log_shard, true, &history_les_bound);
5375 }
5376 return transit<WaitLocalBackfillReserved>();
5377 }
5378
5379 boost::statechart::result
5380 PeeringState::Recovering::react(const DeferRecovery &evt)
5381 {
5382 DECLARE_LOCALS;
5383 if (!ps->state_test(PG_STATE_RECOVERING)) {
5384 // we may have finished recovery and have an AllReplicasRecovered
5385 // event queued to move us to the next state.
5386 psdout(10) << "got defer recovery but not recovering" << dendl;
5387 return discard_event();
5388 }
5389 psdout(10) << "defer recovery, retry delay " << evt.delay << dendl;
5390 ps->state_set(PG_STATE_RECOVERY_WAIT);
5391 pl->cancel_local_background_io_reservation();
5392 release_reservations(true);
5393 pl->schedule_event_after(
5394 std::make_shared<PGPeeringEvent>(
5395 ps->get_osdmap_epoch(),
5396 ps->get_osdmap_epoch(),
5397 DoRecovery()),
5398 evt.delay);
5399 return transit<NotRecovering>();
5400 }
5401
5402 boost::statechart::result
5403 PeeringState::Recovering::react(const UnfoundRecovery &evt)
5404 {
5405 DECLARE_LOCALS;
5406 psdout(10) << "recovery has unfound, can't continue" << dendl;
5407 ps->state_set(PG_STATE_RECOVERY_UNFOUND);
5408 pl->cancel_local_background_io_reservation();
5409 release_reservations(true);
5410 return transit<NotRecovering>();
5411 }
5412
5413 void PeeringState::Recovering::exit()
5414 {
5415 context< PeeringMachine >().log_exit(state_name, enter_time);
5416
5417 DECLARE_LOCALS;
5418 utime_t dur = ceph_clock_now() - enter_time;
5419 ps->state_clear(PG_STATE_RECOVERING);
5420 pl->get_peering_perf().tinc(rs_recovering_latency, dur);
5421 }
5422
5423 PeeringState::Recovered::Recovered(my_context ctx)
5424 : my_base(ctx),
5425 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/Recovered")
5426 {
5427 pg_shard_t auth_log_shard;
5428
5429 context< PeeringMachine >().log_enter(state_name);
5430
5431 DECLARE_LOCALS;
5432
5433 ceph_assert(!ps->needs_recovery());
5434
5435 // if we finished backfill, all acting are active; recheck if
5436 // DEGRADED | UNDERSIZED is appropriate.
5437 ceph_assert(!ps->acting_recovery_backfill.empty());
5438 if (ps->get_osdmap()->get_pg_size(context< PeeringMachine >().spgid.pgid) <=
5439 ps->acting_recovery_backfill.size()) {
5440 ps->state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
5441 pl->publish_stats_to_osd();
5442 }
5443
5444 // adjust acting set? (e.g. because backfill completed...)
5445 bool history_les_bound = false;
5446 if (ps->acting != ps->up && !ps->choose_acting(auth_log_shard,
5447 true, &history_les_bound)) {
5448 ceph_assert(ps->want_acting.size());
5449 } else if (!ps->async_recovery_targets.empty()) {
5450 ps->choose_acting(auth_log_shard, true, &history_les_bound);
5451 }
5452
5453 if (context< Active >().all_replicas_activated &&
5454 ps->async_recovery_targets.empty())
5455 post_event(GoClean());
5456 }
5457
5458 void PeeringState::Recovered::exit()
5459 {
5460 context< PeeringMachine >().log_exit(state_name, enter_time);
5461 DECLARE_LOCALS;
5462
5463 utime_t dur = ceph_clock_now() - enter_time;
5464 pl->get_peering_perf().tinc(rs_recovered_latency, dur);
5465 }
5466
5467 PeeringState::Clean::Clean(my_context ctx)
5468 : my_base(ctx),
5469 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/Clean")
5470 {
5471 context< PeeringMachine >().log_enter(state_name);
5472
5473 DECLARE_LOCALS;
5474
5475 if (ps->info.last_complete != ps->info.last_update) {
5476 ceph_abort();
5477 }
5478
5479
5480 ps->try_mark_clean();
5481
5482 context< PeeringMachine >().get_cur_transaction().register_on_commit(
5483 pl->on_clean());
5484 }
5485
5486 void PeeringState::Clean::exit()
5487 {
5488 context< PeeringMachine >().log_exit(state_name, enter_time);
5489
5490 DECLARE_LOCALS;
5491 ps->state_clear(PG_STATE_CLEAN);
5492 utime_t dur = ceph_clock_now() - enter_time;
5493 pl->get_peering_perf().tinc(rs_clean_latency, dur);
5494 }
5495
5496 template <typename T>
5497 set<pg_shard_t> unique_osd_shard_set(const pg_shard_t & skip, const T &in)
5498 {
5499 set<int> osds_found;
5500 set<pg_shard_t> out;
5501 for (typename T::const_iterator i = in.begin();
5502 i != in.end();
5503 ++i) {
5504 if (*i != skip && !osds_found.count(i->osd)) {
5505 osds_found.insert(i->osd);
5506 out.insert(*i);
5507 }
5508 }
5509 return out;
5510 }
5511
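// unique_osd_shard_set() collapses multiple shards on one OSD to a single
// entry keyed by osd id, so an erasure-coded PG with several shards on the
// same OSD requests only one reservation from it.  Illustrative use with
// hypothetical shard values (not taken from this file):
//
//   // acting = { 1(0), 1(1), 2(2) }  ->  reserve from { 1(0), 2(2) }
//   set<pg_shard_t> shards = unique_osd_shard_set(pg_whoami, acting);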
5512 /*---------Active---------*/
5513 PeeringState::Active::Active(my_context ctx)
5514 : my_base(ctx),
5515 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active"),
5516 remote_shards_to_reserve_recovery(
5517 unique_osd_shard_set(
5518 context< PeeringMachine >().state->pg_whoami,
5519 context< PeeringMachine >().state->acting_recovery_backfill)),
5520 remote_shards_to_reserve_backfill(
5521 unique_osd_shard_set(
5522 context< PeeringMachine >().state->pg_whoami,
5523 context< PeeringMachine >().state->backfill_targets)),
5524 all_replicas_activated(false)
5525 {
5526 context< PeeringMachine >().log_enter(state_name);
5527
5528
5529 DECLARE_LOCALS;
5530
5531 ceph_assert(!ps->backfill_reserved);
5532 ceph_assert(ps->is_primary());
5533 psdout(10) << "In Active, about to call activate" << dendl;
5534 ps->start_flush(context< PeeringMachine >().get_cur_transaction());
5535 ps->activate(context< PeeringMachine >().get_cur_transaction(),
5536 ps->get_osdmap_epoch(),
5537 context< PeeringMachine >().get_recovery_ctx());
5538
5539 // everyone has to commit/ack before we are truly active
5540 ps->blocked_by.clear();
5541 for (set<pg_shard_t>::iterator p = ps->acting_recovery_backfill.begin();
5542 p != ps->acting_recovery_backfill.end();
5543 ++p) {
5544 if (p->shard != ps->pg_whoami.shard) {
5545 ps->blocked_by.insert(p->shard);
5546 }
5547 }
5548 pl->publish_stats_to_osd();
5549 psdout(10) << "Activate Finished" << dendl;
5550 }
5551
5552 boost::statechart::result PeeringState::Active::react(const AdvMap& advmap)
5553 {
5554 DECLARE_LOCALS;
5555
5556 if (ps->should_restart_peering(
5557 advmap.up_primary,
5558 advmap.acting_primary,
5559 advmap.newup,
5560 advmap.newacting,
5561 advmap.lastmap,
5562 advmap.osdmap)) {
5563 psdout(10) << "Active advmap interval change, fast return" << dendl;
5564 return forward_event();
5565 }
5566 psdout(10) << "Active advmap" << dendl;
5567 bool need_publish = false;
5568
5569 pl->on_active_advmap(advmap.osdmap);
5570 if (ps->dirty_big_info) {
5571 // share updated purged_snaps to mgr/mon so that we (a) stop reporting
5572 // purged snaps and (b) perhaps share more snaps that we have purged
5573 // but didn't fit in pg_stat_t.
5574 need_publish = true;
5575 ps->share_pg_info();
5576 }
5577
5578 bool need_acting_change = false;
5579 for (size_t i = 0; i < ps->want_acting.size(); i++) {
5580 int osd = ps->want_acting[i];
5581 if (!advmap.osdmap->is_up(osd)) {
5582 pg_shard_t osd_with_shard(osd, shard_id_t(i));
5583 if (!ps->is_acting(osd_with_shard) && !ps->is_up(osd_with_shard)) {
5584 psdout(10) << "Active stray osd." << osd << " in want_acting is down"
5585 << dendl;
5586 need_acting_change = true;
5587 }
5588 }
5589 }
5590 if (need_acting_change) {
5591 psdout(10) << "Active need acting change, call choose_acting again"
5592 << dendl;
5593 // possibly because we re-add some strays into the acting set and
5594 // some of them then go down in a subsequent map before we could see
5595 // the map changing the pg temp.
5596 // call choose_acting again to clear them out.
5597     // note that we leave restrict_to_up_acting false so that we
5598     // don't needlessly evict any chosen stray that is still alive.
5599 pg_shard_t auth_log_shard;
5600 bool history_les_bound = false;
5601 ps->remove_down_peer_info(advmap.osdmap);
5602 ps->choose_acting(auth_log_shard, false, &history_les_bound, true);
5603 }
5604
5605 /* Check for changes in pool size (if the acting set changed as a result,
5606 * this does not matter) */
5607 if (advmap.lastmap->get_pg_size(ps->info.pgid.pgid) !=
5608 ps->get_osdmap()->get_pg_size(ps->info.pgid.pgid)) {
5609 if (ps->get_osdmap()->get_pg_size(ps->info.pgid.pgid) <=
5610 ps->actingset.size()) {
5611 ps->state_clear(PG_STATE_UNDERSIZED);
5612 } else {
5613 ps->state_set(PG_STATE_UNDERSIZED);
5614 }
5615     // degraded changes will be detected by the call to publish_stats_to_osd()
5616 need_publish = true;
5617 }
5618
5619 // if we haven't reported our PG stats in a long time, do so now.
5620 if (ps->info.stats.reported_epoch + ps->cct->_conf->osd_pg_stat_report_interval_max < advmap.osdmap->get_epoch()) {
5621 psdout(20) << "reporting stats to osd after " << (advmap.osdmap->get_epoch() - ps->info.stats.reported_epoch)
5622 << " epochs" << dendl;
5623 need_publish = true;
5624 }
5625
5626 if (need_publish)
5627 pl->publish_stats_to_osd();
5628
5629 if (ps->check_prior_readable_down_osds(advmap.osdmap)) {
5630 pl->recheck_readable();
5631 }
5632
5633 return forward_event();
5634 }
5635
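// Active::react(AdvMap) batches its stat updates: need_publish is set by
// (a) dirty_big_info forcing a share_pg_info(), (b) a pool size change
// flipping PG_STATE_UNDERSIZED, or (c) hitting the
// osd_pg_stat_report_interval_max deadline, and one publish_stats_to_osd()
// at the end covers all three.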
5636 boost::statechart::result PeeringState::Active::react(const ActMap&)
5637 {
5638 DECLARE_LOCALS;
5639 psdout(10) << "Active: handling ActMap" << dendl;
5640 ceph_assert(ps->is_primary());
5641
5642 pl->on_active_actmap();
5643
5644 if (ps->have_unfound()) {
5645 // object may have become unfound
5646 ps->discover_all_missing(context<PeeringMachine>().get_recovery_ctx().msgs);
5647 }
5648
5649 uint64_t unfound = ps->missing_loc.num_unfound();
5650 if (unfound > 0 &&
5651 ps->all_unfound_are_queried_or_lost(ps->get_osdmap())) {
5652 if (ps->cct->_conf->osd_auto_mark_unfound_lost) {
5653 pl->get_clog_error() << context< PeeringMachine >().spgid.pgid << " has " << unfound
5654 << " objects unfound and apparently lost, would automatically "
5655 << "mark these objects lost but this feature is not yet implemented "
5656 << "(osd_auto_mark_unfound_lost)";
5657 } else
5658 pl->get_clog_error() << context< PeeringMachine >().spgid.pgid << " has "
5659 << unfound << " objects unfound and apparently lost";
5660 }
5661
5662 return forward_event();
5663 }
5664
5665 boost::statechart::result PeeringState::Active::react(const MNotifyRec& notevt)
5666 {
5667
5668 DECLARE_LOCALS;
5669 ceph_assert(ps->is_primary());
5670 if (ps->peer_info.count(notevt.from)) {
5671 psdout(10) << "Active: got notify from " << notevt.from
5672 << ", already have info from that osd, ignoring"
5673 << dendl;
5674 } else if (ps->peer_purged.count(notevt.from)) {
5675 psdout(10) << "Active: got notify from " << notevt.from
5676 << ", already purged that peer, ignoring"
5677 << dendl;
5678 } else {
5679 psdout(10) << "Active: got notify from " << notevt.from
5680 << ", calling proc_replica_info and discover_all_missing"
5681 << dendl;
5682 ps->proc_replica_info(
5683 notevt.from, notevt.notify.info, notevt.notify.epoch_sent);
5684 if (ps->have_unfound() || (ps->is_degraded() && ps->might_have_unfound.count(notevt.from))) {
5685 ps->discover_all_missing(
5686 context<PeeringMachine>().get_recovery_ctx().msgs);
5687 }
5688 // check if it is a previous down acting member that's coming back.
5689 // if so, request pg_temp change to trigger a new interval transition
5690 pg_shard_t auth_log_shard;
5691 bool history_les_bound = false;
5692 ps->choose_acting(auth_log_shard, false, &history_les_bound, true);
5693 if (!ps->want_acting.empty() && ps->want_acting != ps->acting) {
5694 psdout(10) << "Active: got notify from previous acting member "
5695 << notevt.from << ", requesting pg_temp change"
5696 << dendl;
5697 }
5698 }
5699 return discard_event();
5700 }
5701
5702 boost::statechart::result PeeringState::Active::react(const MTrim& trim)
5703 {
5704 DECLARE_LOCALS;
5705 ceph_assert(ps->is_primary());
5706
5707 // peer is informing us of their last_complete_ondisk
5708 ldout(ps->cct,10) << " replica osd." << trim.from << " lcod " << trim.trim_to << dendl;
5709 ps->update_peer_last_complete_ondisk(pg_shard_t{trim.from, trim.shard},
5710 trim.trim_to);
5711 // trim log when the pg is recovered
5712 ps->calc_min_last_complete_ondisk();
5713 return discard_event();
5714 }
5715
5716 boost::statechart::result PeeringState::Active::react(const MInfoRec& infoevt)
5717 {
5718 DECLARE_LOCALS;
5719 ceph_assert(ps->is_primary());
5720
5721 ceph_assert(!ps->acting_recovery_backfill.empty());
5722 if (infoevt.lease_ack) {
5723 ps->proc_lease_ack(infoevt.from.osd, *infoevt.lease_ack);
5724 }
5725 // don't update history (yet) if we are active and primary; the replica
5726 // may be telling us they have activated (and committed) but we can't
5727 // share that until _everyone_ does the same.
5728 if (ps->is_acting_recovery_backfill(infoevt.from) &&
5729 ps->peer_activated.count(infoevt.from) == 0) {
5730 psdout(10) << " peer osd." << infoevt.from
5731 << " activated and committed" << dendl;
5732 ps->peer_activated.insert(infoevt.from);
5733 ps->blocked_by.erase(infoevt.from.shard);
5734 pl->publish_stats_to_osd();
5735 if (ps->peer_activated.size() == ps->acting_recovery_backfill.size()) {
5736 all_activated_and_committed();
5737 }
5738 }
5739 return discard_event();
5740 }
5741
5742 boost::statechart::result PeeringState::Active::react(const MLogRec& logevt)
5743 {
5744 DECLARE_LOCALS;
5745 psdout(10) << "searching osd." << logevt.from
5746 << " log for unfound items" << dendl;
5747 ps->proc_replica_log(
5748 logevt.msg->info, logevt.msg->log, logevt.msg->missing, logevt.from);
5749 bool got_missing = ps->search_for_missing(
5750 ps->peer_info[logevt.from],
5751 ps->peer_missing[logevt.from],
5752 logevt.from,
5753 context< PeeringMachine >().get_recovery_ctx());
5754 // If there are missing AND we are "fully" active then start recovery now
5755 if (got_missing && ps->state_test(PG_STATE_ACTIVE)) {
5756 post_event(DoRecovery());
5757 }
5758 return discard_event();
5759 }
5760
5761 boost::statechart::result PeeringState::Active::react(const QueryState& q)
5762 {
5763 DECLARE_LOCALS;
5764
5765 q.f->open_object_section("state");
5766 q.f->dump_string("name", state_name);
5767 q.f->dump_stream("enter_time") << enter_time;
5768
5769 {
5770 q.f->open_array_section("might_have_unfound");
5771 for (set<pg_shard_t>::iterator p = ps->might_have_unfound.begin();
5772 p != ps->might_have_unfound.end();
5773 ++p) {
5774 q.f->open_object_section("osd");
5775 q.f->dump_stream("osd") << *p;
5776 if (ps->peer_missing.count(*p)) {
5777 q.f->dump_string("status", "already probed");
5778 } else if (ps->peer_missing_requested.count(*p)) {
5779 q.f->dump_string("status", "querying");
5780 } else if (!ps->get_osdmap()->is_up(p->osd)) {
5781 q.f->dump_string("status", "osd is down");
5782 } else {
5783 q.f->dump_string("status", "not queried");
5784 }
5785 q.f->close_section();
5786 }
5787 q.f->close_section();
5788 }
5789 {
5790 q.f->open_object_section("recovery_progress");
5791 q.f->open_array_section("backfill_targets");
5792 for (set<pg_shard_t>::const_iterator p = ps->backfill_targets.begin();
5793 p != ps->backfill_targets.end(); ++p)
5794 q.f->dump_stream("replica") << *p;
5795 q.f->close_section();
5796 pl->dump_recovery_info(q.f);
5797 q.f->close_section();
5798 }
5799
5800 q.f->close_section();
5801 return forward_event();
5802 }
5803
5804 boost::statechart::result PeeringState::Active::react(
5805 const ActivateCommitted &evt)
5806 {
5807 DECLARE_LOCALS;
5808 ceph_assert(!ps->peer_activated.count(ps->pg_whoami));
5809 ps->peer_activated.insert(ps->pg_whoami);
5810 psdout(10) << "_activate_committed " << evt.epoch
5811 << " peer_activated now " << ps->peer_activated
5812 << " last_interval_started "
5813 << ps->info.history.last_interval_started
5814 << " last_epoch_started "
5815 << ps->info.history.last_epoch_started
5816 << " same_interval_since "
5817 << ps->info.history.same_interval_since
5818 << dendl;
5819 ceph_assert(!ps->acting_recovery_backfill.empty());
5820 if (ps->peer_activated.size() == ps->acting_recovery_backfill.size())
5821 all_activated_and_committed();
5822 return discard_event();
5823 }
5824
5825 boost::statechart::result PeeringState::Active::react(const AllReplicasActivated &evt)
5826 {
5827
5828 DECLARE_LOCALS;
5829 pg_t pgid = context< PeeringMachine >().spgid.pgid;
5830
5831 all_replicas_activated = true;
5832
5833 ps->state_clear(PG_STATE_ACTIVATING);
5834 ps->state_clear(PG_STATE_CREATING);
5835 ps->state_clear(PG_STATE_PREMERGE);
5836
5837 bool merge_target;
5838 if (ps->pool.info.is_pending_merge(pgid, &merge_target)) {
5839 ps->state_set(PG_STATE_PEERED);
5840 ps->state_set(PG_STATE_PREMERGE);
5841
5842 if (ps->actingset.size() != ps->get_osdmap()->get_pg_size(pgid)) {
5843 if (merge_target) {
5844 pg_t src = pgid;
5845 src.set_ps(ps->pool.info.get_pg_num_pending());
5846 	  ceph_assert(src.get_parent() == pgid);
5847 pl->set_not_ready_to_merge_target(pgid, src);
5848 } else {
5849 pl->set_not_ready_to_merge_source(pgid);
5850 }
5851 }
5852 } else if (ps->acting.size() < ps->pool.info.min_size) {
5853 ps->state_set(PG_STATE_PEERED);
5854 } else {
5855 ps->state_set(PG_STATE_ACTIVE);
5856 }
5857
5858 auto mnow = pl->get_mnow();
5859 if (ps->prior_readable_until_ub > mnow) {
5860 psdout(10) << " waiting for prior_readable_until_ub "
5861 << ps->prior_readable_until_ub << " > mnow " << mnow << dendl;
5862 ps->state_set(PG_STATE_WAIT);
5863 pl->queue_check_readable(
5864 ps->last_peering_reset,
5865 ps->prior_readable_until_ub - mnow);
5866 } else {
5867 psdout(10) << " mnow " << mnow << " >= prior_readable_until_ub "
5868 << ps->prior_readable_until_ub << dendl;
5869 }
5870
5871 if (ps->pool.info.has_flag(pg_pool_t::FLAG_CREATING)) {
5872 pl->send_pg_created(pgid);
5873 }
5874
5875 ps->info.history.last_epoch_started = ps->info.last_epoch_started;
5876 ps->info.history.last_interval_started = ps->info.last_interval_started;
5877 ps->dirty_info = true;
5878
5879 ps->share_pg_info();
5880 pl->publish_stats_to_osd();
5881
5882 pl->on_activate_complete();
5883
5884 return discard_event();
5885 }
5886
5887 boost::statechart::result PeeringState::Active::react(const RenewLease& rl)
5888 {
5889 DECLARE_LOCALS;
5890 ps->proc_renew_lease();
5891 return discard_event();
5892 }
5893
5894 boost::statechart::result PeeringState::Active::react(const MLeaseAck& la)
5895 {
5896 DECLARE_LOCALS;
5897 ps->proc_lease_ack(la.from, la.lease_ack);
5898 return discard_event();
5899 }
5900
5901
5902 boost::statechart::result PeeringState::Active::react(const CheckReadable &evt)
5903 {
5904 DECLARE_LOCALS;
5905 pl->recheck_readable();
5906 return discard_event();
5907 }
5908
5909 /*
5910 * update info.history.last_epoch_started ONLY after we and all
5911 * replicas have activated AND committed the activate transaction
5912 * (i.e. the peering results are stable on disk).
5913 */
5914 void PeeringState::Active::all_activated_and_committed()
5915 {
5916 DECLARE_LOCALS;
5917 psdout(10) << "all_activated_and_committed" << dendl;
5918 ceph_assert(ps->is_primary());
5919 ceph_assert(ps->peer_activated.size() == ps->acting_recovery_backfill.size());
5920 ceph_assert(!ps->acting_recovery_backfill.empty());
5921 ceph_assert(ps->blocked_by.empty());
5922
5923 if (HAVE_FEATURE(ps->upacting_features, SERVER_OCTOPUS)) {
5924 // this is overkill when the activation is quick, but when it is slow it
5925 // is important, because the lease was renewed by the activate itself but we
5926 // don't know how long ago that was, and simply scheduling now may leave
5927 // a gap in lease coverage. keep it simple and aggressively renew.
5928 ps->renew_lease(pl->get_mnow());
5929 ps->send_lease();
5930 ps->schedule_renew_lease();
5931 }
5932
5933 // Degraded?
5934 ps->update_calc_stats();
5935 if (ps->info.stats.stats.sum.num_objects_degraded) {
5936 ps->state_set(PG_STATE_DEGRADED);
5937 } else {
5938 ps->state_clear(PG_STATE_DEGRADED);
5939 }
5940
5941 post_event(PeeringState::AllReplicasActivated());
5942 }
5943
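// Note that all_activated_and_committed() only posts AllReplicasActivated;
// the actual state flips (ACTIVE vs PEERED, PREMERGE handling, the
// readable-wait bookkeeping) happen in the AllReplicasActivated handler
// above once the event is dequeued.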
5944
5945 void PeeringState::Active::exit()
5946 {
5947 context< PeeringMachine >().log_exit(state_name, enter_time);
5948
5949
5950 DECLARE_LOCALS;
5951 pl->cancel_local_background_io_reservation();
5952
5953 ps->blocked_by.clear();
5954 ps->backfill_reserved = false;
5955 ps->state_clear(PG_STATE_ACTIVATING);
5956 ps->state_clear(PG_STATE_DEGRADED);
5957 ps->state_clear(PG_STATE_UNDERSIZED);
5958 ps->state_clear(PG_STATE_BACKFILL_TOOFULL);
5959 ps->state_clear(PG_STATE_BACKFILL_WAIT);
5960 ps->state_clear(PG_STATE_RECOVERY_WAIT);
5961 ps->state_clear(PG_STATE_RECOVERY_TOOFULL);
5962 utime_t dur = ceph_clock_now() - enter_time;
5963 pl->get_peering_perf().tinc(rs_active_latency, dur);
5964 pl->on_active_exit();
5965 }
5966
5967 /*------ReplicaActive-----*/
5968 PeeringState::ReplicaActive::ReplicaActive(my_context ctx)
5969 : my_base(ctx),
5970 NamedState(context< PeeringMachine >().state_history, "Started/ReplicaActive")
5971 {
5972 context< PeeringMachine >().log_enter(state_name);
5973
5974 DECLARE_LOCALS;
5975 ps->start_flush(context< PeeringMachine >().get_cur_transaction());
5976 }
5977
5978
5979 boost::statechart::result PeeringState::ReplicaActive::react(
5980 const Activate& actevt) {
5981 DECLARE_LOCALS;
5982 psdout(10) << "In ReplicaActive, about to call activate" << dendl;
5983 ps->activate(
5984 context< PeeringMachine >().get_cur_transaction(),
5985 actevt.activation_epoch,
5986 context< PeeringMachine >().get_recovery_ctx());
5987 psdout(10) << "Activate Finished" << dendl;
5988 return discard_event();
5989 }
5990
5991 boost::statechart::result PeeringState::ReplicaActive::react(
5992 const ActivateCommitted &evt)
5993 {
5994 DECLARE_LOCALS;
5995 psdout(10) << __func__ << " " << evt.epoch << " telling primary" << dendl;
5996
5997 auto &rctx = context<PeeringMachine>().get_recovery_ctx();
5998 auto epoch = ps->get_osdmap_epoch();
5999 pg_info_t i = ps->info;
6000 i.history.last_epoch_started = evt.activation_epoch;
6001 i.history.last_interval_started = i.history.same_interval_since;
6002 rctx.send_info(
6003 ps->get_primary().osd,
6004 spg_t(ps->info.pgid.pgid, ps->get_primary().shard),
6005 epoch,
6006 epoch,
6007 i,
6008 {}, /* lease */
6009 ps->get_lease_ack());
6010
6011 if (ps->acting.size() >= ps->pool.info.min_size) {
6012 ps->state_set(PG_STATE_ACTIVE);
6013 } else {
6014 ps->state_set(PG_STATE_PEERED);
6015 }
6016 pl->on_activate_committed();
6017
6018 return discard_event();
6019 }
6020
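// On ActivateCommitted the replica reports back with a pg_info_t whose
// history claims last_epoch_started = activation_epoch; the primary's
// MInfoRec handler in Active counts these acks and fires
// all_activated_and_committed() once every member of
// acting_recovery_backfill has reported in.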
6021 boost::statechart::result PeeringState::ReplicaActive::react(const MLease& l)
6022 {
6023 DECLARE_LOCALS;
6024 spg_t spgid = context< PeeringMachine >().spgid;
6025 epoch_t epoch = pl->get_osdmap_epoch();
6026
6027 ps->proc_lease(l.lease);
6028 pl->send_cluster_message(
6029 ps->get_primary().osd,
6030 new MOSDPGLeaseAck(epoch,
6031 spg_t(spgid.pgid, ps->get_primary().shard),
6032 ps->get_lease_ack()),
6033 epoch);
6034 return discard_event();
6035 }
6036
6037 boost::statechart::result PeeringState::ReplicaActive::react(const MInfoRec& infoevt)
6038 {
6039 DECLARE_LOCALS;
6040 ps->proc_primary_info(context<PeeringMachine>().get_cur_transaction(),
6041 infoevt.info);
6042 return discard_event();
6043 }
6044
6045 boost::statechart::result PeeringState::ReplicaActive::react(const MLogRec& logevt)
6046 {
6047 DECLARE_LOCALS;
6048 psdout(10) << "received log from " << logevt.from << dendl;
6049 ObjectStore::Transaction &t = context<PeeringMachine>().get_cur_transaction();
6050 ps->merge_log(t, logevt.msg->info, logevt.msg->log, logevt.from);
6051 ceph_assert(ps->pg_log.get_head() == ps->info.last_update);
6052 if (logevt.msg->lease) {
6053 ps->proc_lease(*logevt.msg->lease);
6054 }
6055
6056 return discard_event();
6057 }
6058
6059 boost::statechart::result PeeringState::ReplicaActive::react(const MTrim& trim)
6060 {
6061 DECLARE_LOCALS;
6062 // primary is instructing us to trim
6063 ps->pg_log.trim(trim.trim_to, ps->info);
6064 ps->dirty_info = true;
6065 return discard_event();
6066 }
6067
6068 boost::statechart::result PeeringState::ReplicaActive::react(const ActMap&)
6069 {
6070 DECLARE_LOCALS;
6071 if (ps->should_send_notify() && ps->get_primary().osd >= 0) {
6072 ps->info.history.refresh_prior_readable_until_ub(
6073 pl->get_mnow(), ps->prior_readable_until_ub);
6074 context< PeeringMachine >().send_notify(
6075 ps->get_primary().osd,
6076 pg_notify_t(
6077 ps->get_primary().shard, ps->pg_whoami.shard,
6078 ps->get_osdmap_epoch(),
6079 ps->get_osdmap_epoch(),
6080 ps->info,
6081 ps->past_intervals));
6082 }
6083 return discard_event();
6084 }
6085
6086 boost::statechart::result PeeringState::ReplicaActive::react(
6087 const MQuery& query)
6088 {
6089 DECLARE_LOCALS;
6090 ps->fulfill_query(query, context<PeeringMachine>().get_recovery_ctx());
6091 return discard_event();
6092 }
6093
6094 boost::statechart::result PeeringState::ReplicaActive::react(const QueryState& q)
6095 {
6096 q.f->open_object_section("state");
6097 q.f->dump_string("name", state_name);
6098 q.f->dump_stream("enter_time") << enter_time;
6099 q.f->close_section();
6100 return forward_event();
6101 }
6102
6103 void PeeringState::ReplicaActive::exit()
6104 {
6105 context< PeeringMachine >().log_exit(state_name, enter_time);
6106 DECLARE_LOCALS;
6107 pl->unreserve_recovery_space();
6108
6109 pl->cancel_remote_recovery_reservation();
6110 utime_t dur = ceph_clock_now() - enter_time;
6111 pl->get_peering_perf().tinc(rs_replicaactive_latency, dur);
6112
6113 ps->min_last_complete_ondisk = eversion_t();
6114 }
6115
6116 /*-------Stray---*/
6117 PeeringState::Stray::Stray(my_context ctx)
6118 : my_base(ctx),
6119 NamedState(context< PeeringMachine >().state_history, "Started/Stray")
6120 {
6121 context< PeeringMachine >().log_enter(state_name);
6122
6123
6124 DECLARE_LOCALS;
6125 ceph_assert(!ps->is_peered());
6126 ceph_assert(!ps->is_peering());
6127 ceph_assert(!ps->is_primary());
6128
6129 if (!ps->get_osdmap()->have_pg_pool(ps->info.pgid.pgid.pool())) {
6130 ldout(ps->cct,10) << __func__ << " pool is deleted" << dendl;
6131 post_event(DeleteStart());
6132 } else {
6133 ps->start_flush(context< PeeringMachine >().get_cur_transaction());
6134 }
6135 }
6136
6137 boost::statechart::result PeeringState::Stray::react(const MLogRec& logevt)
6138 {
6139 DECLARE_LOCALS;
6140 MOSDPGLog *msg = logevt.msg.get();
6141 psdout(10) << "got info+log from osd." << logevt.from << " " << msg->info << " " << msg->log << dendl;
6142
6143 ObjectStore::Transaction &t = context<PeeringMachine>().get_cur_transaction();
6144 if (msg->info.last_backfill == hobject_t()) {
6145 // restart backfill
6146 ps->info = msg->info;
6147 pl->on_info_history_change();
6148 ps->dirty_info = true;
6149 ps->dirty_big_info = true; // maybe.
6150
6151 PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
6152 ps->pg_log.reset_backfill_claim_log(msg->log, rollbacker.get());
6153
6154 ps->pg_log.reset_backfill();
6155 } else {
6156 ps->merge_log(t, msg->info, msg->log, logevt.from);
6157 }
6158 if (logevt.msg->lease) {
6159 ps->proc_lease(*logevt.msg->lease);
6160 }
6161
6162 ceph_assert(ps->pg_log.get_head() == ps->info.last_update);
6163
6164 post_event(Activate(logevt.msg->info.last_epoch_started));
6165 return transit<ReplicaActive>();
6166 }
6167
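// Two paths through the MLogRec handler above: a last_backfill of
// hobject_t() marks a shard with no usable data, so it adopts the
// primary's info and claims the incoming log wholesale via
// reset_backfill_claim_log() before restarting backfill; otherwise the
// shard merges the log normally and any divergent entries are rewound.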
6168 boost::statechart::result PeeringState::Stray::react(const MInfoRec& infoevt)
6169 {
6170 DECLARE_LOCALS;
6171 psdout(10) << "got info from osd." << infoevt.from << " " << infoevt.info << dendl;
6172
6173 if (ps->info.last_update > infoevt.info.last_update) {
6174 // rewind divergent log entries
6175 ObjectStore::Transaction &t = context<PeeringMachine>().get_cur_transaction();
6176 ps->rewind_divergent_log(t, infoevt.info.last_update);
6177 ps->info.stats = infoevt.info.stats;
6178 ps->info.hit_set = infoevt.info.hit_set;
6179 }
6180
6181 if (infoevt.lease) {
6182 ps->proc_lease(*infoevt.lease);
6183 }
6184
6185 ceph_assert(infoevt.info.last_update == ps->info.last_update);
6186 ceph_assert(ps->pg_log.get_head() == ps->info.last_update);
6187
6188 post_event(Activate(infoevt.info.last_epoch_started));
6189 return transit<ReplicaActive>();
6190 }
6191
6192 boost::statechart::result PeeringState::Stray::react(const MQuery& query)
6193 {
6194 DECLARE_LOCALS;
6195 ps->fulfill_query(query, context<PeeringMachine>().get_recovery_ctx());
6196 return discard_event();
6197 }
6198
6199 boost::statechart::result PeeringState::Stray::react(const ActMap&)
6200 {
6201 DECLARE_LOCALS;
6202 if (ps->should_send_notify() && ps->get_primary().osd >= 0) {
6203 ps->info.history.refresh_prior_readable_until_ub(
6204 pl->get_mnow(), ps->prior_readable_until_ub);
6205 context< PeeringMachine >().send_notify(
6206 ps->get_primary().osd,
6207 pg_notify_t(
6208 ps->get_primary().shard, ps->pg_whoami.shard,
6209 ps->get_osdmap_epoch(),
6210 ps->get_osdmap_epoch(),
6211 ps->info,
6212 ps->past_intervals));
6213 }
6214 return discard_event();
6215 }
6216
6217 void PeeringState::Stray::exit()
6218 {
6219 context< PeeringMachine >().log_exit(state_name, enter_time);
6220 DECLARE_LOCALS;
6221 utime_t dur = ceph_clock_now() - enter_time;
6222 pl->get_peering_perf().tinc(rs_stray_latency, dur);
6223 }
6224
6225
6226 /*--------ToDelete----------*/
6227 PeeringState::ToDelete::ToDelete(my_context ctx)
6228 : my_base(ctx),
6229 NamedState(context< PeeringMachine >().state_history, "Started/ToDelete")
6230 {
6231 context< PeeringMachine >().log_enter(state_name);
6232 DECLARE_LOCALS;
6233 pl->get_perf_logger().inc(l_osd_pg_removing);
6234 }
6235
6236 void PeeringState::ToDelete::exit()
6237 {
6238 context< PeeringMachine >().log_exit(state_name, enter_time);
6239 DECLARE_LOCALS;
6240 // note: on a successful removal, this path doesn't execute. see
6241 // _delete_some().
6242 pl->get_perf_logger().dec(l_osd_pg_removing);
6243
6244 pl->cancel_local_background_io_reservation();
6245 }
6246
6247 /*----WaitDeleteReserved----*/
6248 PeeringState::WaitDeleteReserved::WaitDeleteReserved(my_context ctx)
6249 : my_base(ctx),
6250 NamedState(context< PeeringMachine >().state_history,
6251 "Started/ToDelete/WaitDeleteReseved")
6252 {
6253 context< PeeringMachine >().log_enter(state_name);
6254 DECLARE_LOCALS;
6255 context< ToDelete >().priority = ps->get_delete_priority();
6256
6257 pl->cancel_local_background_io_reservation();
6258 pl->request_local_background_io_reservation(
6259 context<ToDelete>().priority,
6260 std::make_shared<PGPeeringEvent>(
6261 ps->get_osdmap_epoch(),
6262 ps->get_osdmap_epoch(),
6263 DeleteReserved()),
6264 std::make_shared<PGPeeringEvent>(
6265 ps->get_osdmap_epoch(),
6266 ps->get_osdmap_epoch(),
6267 DeleteInterrupted()));
6268 }
6269
6270 boost::statechart::result PeeringState::ToDelete::react(
6271 const ActMap& evt)
6272 {
6273 DECLARE_LOCALS;
6274 if (ps->get_delete_priority() != priority) {
6275 psdout(10) << __func__ << " delete priority changed, resetting"
6276 << dendl;
6277 return transit<ToDelete>();
6278 }
6279 return discard_event();
6280 }
6281
6282 void PeeringState::WaitDeleteReserved::exit()
6283 {
6284 context< PeeringMachine >().log_exit(state_name, enter_time);
6285 }
6286
6287 /*----Deleting-----*/
6288 PeeringState::Deleting::Deleting(my_context ctx)
6289 : my_base(ctx),
6290 NamedState(context< PeeringMachine >().state_history, "Started/ToDelete/Deleting")
6291 {
6292 context< PeeringMachine >().log_enter(state_name);
6293 DECLARE_LOCALS;
6294 ps->deleting = true;
6295 ObjectStore::Transaction &t = context<PeeringMachine>().get_cur_transaction();
6296
6297 // clear log
6298 PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
6299 ps->pg_log.roll_forward(rollbacker.get());
6300
6301 // adjust info to backfill
6302 ps->info.set_last_backfill(hobject_t());
6303 ps->pg_log.reset_backfill();
6304 ps->dirty_info = true;
6305
6306 pl->on_removal(t);
6307 }
6308
6309 boost::statechart::result PeeringState::Deleting::react(
6310 const DeleteSome& evt)
6311 {
6312 DECLARE_LOCALS;
6313 pl->do_delete_work(context<PeeringMachine>().get_cur_transaction());
6314 return discard_event();
6315 }
6316
6317 void PeeringState::Deleting::exit()
6318 {
6319 context< PeeringMachine >().log_exit(state_name, enter_time);
6320 DECLARE_LOCALS;
6321 ps->deleting = false;
6322 pl->cancel_local_background_io_reservation();
6323 }
6324
6325 /*--------GetInfo---------*/
6326 PeeringState::GetInfo::GetInfo(my_context ctx)
6327 : my_base(ctx),
6328 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering/GetInfo")
6329 {
6330 context< PeeringMachine >().log_enter(state_name);
6331
6332
6333 DECLARE_LOCALS;
6334 ps->check_past_interval_bounds();
6335 ps->log_weirdness();
6336 PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
6337
6338 ceph_assert(ps->blocked_by.empty());
6339
6340 prior_set = ps->build_prior();
6341 ps->prior_readable_down_osds = prior_set.down;
6342 if (ps->prior_readable_down_osds.empty()) {
6343 psdout(10) << " no prior_set down osds, clearing prior_readable_until_ub"
6344 << dendl;
6345 ps->clear_prior_readable_until_ub();
6346 }
6347
6348 ps->reset_min_peer_features();
6349 get_infos();
6350 if (prior_set.pg_down) {
6351 post_event(IsDown());
6352 } else if (peer_info_requested.empty()) {
6353 post_event(GotInfo());
6354 }
6355 }
6356
6357 void PeeringState::GetInfo::get_infos()
6358 {
6359 DECLARE_LOCALS;
6360 PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
6361
6362 ps->blocked_by.clear();
6363 for (set<pg_shard_t>::const_iterator it = prior_set.probe.begin();
6364 it != prior_set.probe.end();
6365 ++it) {
6366 pg_shard_t peer = *it;
6367 if (peer == ps->pg_whoami) {
6368 continue;
6369 }
6370 if (ps->peer_info.count(peer)) {
6371 psdout(10) << " have osd." << peer << " info " << ps->peer_info[peer] << dendl;
6372 continue;
6373 }
6374 if (peer_info_requested.count(peer)) {
6375 psdout(10) << " already requested info from osd." << peer << dendl;
6376 ps->blocked_by.insert(peer.osd);
6377 } else if (!ps->get_osdmap()->is_up(peer.osd)) {
6378 psdout(10) << " not querying info from down osd." << peer << dendl;
6379 } else {
6380 psdout(10) << " querying info from osd." << peer << dendl;
6381 context< PeeringMachine >().send_query(
6382 peer.osd,
6383 pg_query_t(pg_query_t::INFO,
6384 it->shard, ps->pg_whoami.shard,
6385 ps->info.history,
6386 ps->get_osdmap_epoch()));
6387 peer_info_requested.insert(peer);
6388 ps->blocked_by.insert(peer.osd);
6389 }
6390 }
6391
6392 ps->check_prior_readable_down_osds(ps->get_osdmap());
6393
6394 pl->publish_stats_to_osd();
6395 }
6396
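// get_infos() is re-entrant: it runs again whenever a notify moves
// last_epoch_started forward and the prior set is rebuilt (see the
// MNotifyRec handler below).  blocked_by is recomputed from scratch on
// each pass so it always reflects only the peers we are still waiting on.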
6397 boost::statechart::result PeeringState::GetInfo::react(const MNotifyRec& infoevt)
6398 {
6399
6400 DECLARE_LOCALS;
6401
6402 set<pg_shard_t>::iterator p = peer_info_requested.find(infoevt.from);
6403 if (p != peer_info_requested.end()) {
6404 peer_info_requested.erase(p);
6405 ps->blocked_by.erase(infoevt.from.osd);
6406 }
6407
6408 epoch_t old_start = ps->info.history.last_epoch_started;
6409 if (ps->proc_replica_info(
6410 infoevt.from, infoevt.notify.info, infoevt.notify.epoch_sent)) {
6411 // we got something new ...
6412 PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
6413 if (old_start < ps->info.history.last_epoch_started) {
6414 psdout(10) << " last_epoch_started moved forward, rebuilding prior" << dendl;
6415 prior_set = ps->build_prior();
6416 ps->prior_readable_down_osds = prior_set.down;
6417
6418 // filter out any osds that got dropped from the probe set from
6419 // peer_info_requested. this is less expensive than restarting
6420 // peering (which would re-probe everyone).
6421 set<pg_shard_t>::iterator p = peer_info_requested.begin();
6422 while (p != peer_info_requested.end()) {
6423 if (prior_set.probe.count(*p) == 0) {
6424 psdout(20) << " dropping osd." << *p << " from info_requested, no longer in probe set" << dendl;
6425 peer_info_requested.erase(p++);
6426 } else {
6427 ++p;
6428 }
6429 }
6430 get_infos();
6431 }
6432 psdout(20) << "Adding osd: " << infoevt.from.osd << " peer features: "
6433 << hex << infoevt.features << dec << dendl;
6434 ps->apply_peer_features(infoevt.features);
6435
6436 // are we done getting everything?
6437 if (peer_info_requested.empty() && !prior_set.pg_down) {
6438 psdout(20) << "Common peer features: " << hex << ps->get_min_peer_features() << dec << dendl;
6439 psdout(20) << "Common acting features: " << hex << ps->get_min_acting_features() << dec << dendl;
6440 psdout(20) << "Common upacting features: " << hex << ps->get_min_upacting_features() << dec << dendl;
6441 post_event(GotInfo());
6442 }
6443 }
6444 return discard_event();
6445 }
6446
6447 boost::statechart::result PeeringState::GetInfo::react(const QueryState& q)
6448 {
6449 DECLARE_LOCALS;
6450 q.f->open_object_section("state");
6451 q.f->dump_string("name", state_name);
6452 q.f->dump_stream("enter_time") << enter_time;
6453
6454 q.f->open_array_section("requested_info_from");
6455 for (set<pg_shard_t>::iterator p = peer_info_requested.begin();
6456 p != peer_info_requested.end();
6457 ++p) {
6458 q.f->open_object_section("osd");
6459 q.f->dump_stream("osd") << *p;
6460 if (ps->peer_info.count(*p)) {
6461 q.f->open_object_section("got_info");
6462 ps->peer_info[*p].dump(q.f);
6463 q.f->close_section();
6464 }
6465 q.f->close_section();
6466 }
6467 q.f->close_section();
6468
6469 q.f->close_section();
6470 return forward_event();
6471 }
6472
6473 void PeeringState::GetInfo::exit()
6474 {
6475 context< PeeringMachine >().log_exit(state_name, enter_time);
6476
6477 DECLARE_LOCALS;
6478 utime_t dur = ceph_clock_now() - enter_time;
6479 pl->get_peering_perf().tinc(rs_getinfo_latency, dur);
6480 ps->blocked_by.clear();
6481 }
6482
6483 /*------GetLog------------*/
6484 PeeringState::GetLog::GetLog(my_context ctx)
6485 : my_base(ctx),
6486 NamedState(
6487 context< PeeringMachine >().state_history,
6488 "Started/Primary/Peering/GetLog"),
6489 msg(0)
6490 {
6491 context< PeeringMachine >().log_enter(state_name);
6492
6493 DECLARE_LOCALS;
6494
6495 ps->log_weirdness();
6496
6497 // adjust acting?
6498 if (!ps->choose_acting(auth_log_shard, false,
6499 &context< Peering >().history_les_bound)) {
6500 if (!ps->want_acting.empty()) {
6501 post_event(NeedActingChange());
6502 } else {
6503 post_event(IsIncomplete());
6504 }
6505 return;
6506 }
6507
6508 // am i the best?
6509 if (auth_log_shard == ps->pg_whoami) {
6510 post_event(GotLog());
6511 return;
6512 }
6513
6514 const pg_info_t& best = ps->peer_info[auth_log_shard];
6515
6516 // am i broken?
6517 if (ps->info.last_update < best.log_tail) {
6518 psdout(10) << " not contiguous with osd." << auth_log_shard << ", down" << dendl;
6519 post_event(IsIncomplete());
6520 return;
6521 }
6522
6523 // how much log to request?
6524 eversion_t request_log_from = ps->info.last_update;
6525 ceph_assert(!ps->acting_recovery_backfill.empty());
6526 for (set<pg_shard_t>::iterator p = ps->acting_recovery_backfill.begin();
6527 p != ps->acting_recovery_backfill.end();
6528 ++p) {
6529 if (*p == ps->pg_whoami) continue;
6530 pg_info_t& ri = ps->peer_info[*p];
6531 if (ri.last_update < ps->info.log_tail && ri.last_update >= best.log_tail &&
6532 ri.last_update < request_log_from)
6533 request_log_from = ri.last_update;
6534 }
6535
6536 // how much?
6537 psdout(10) << " requesting log from osd." << auth_log_shard << dendl;
6538 context<PeeringMachine>().send_query(
6539 auth_log_shard.osd,
6540 pg_query_t(
6541 pg_query_t::LOG,
6542 auth_log_shard.shard, ps->pg_whoami.shard,
6543 request_log_from, ps->info.history,
6544 ps->get_osdmap_epoch()));
6545
6546 ceph_assert(ps->blocked_by.empty());
6547 ps->blocked_by.insert(auth_log_shard.osd);
6548 pl->publish_stats_to_osd();
6549 }
6550
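// The loop in the constructor above widens the request: request_log_from
// starts at our own last_update and is pulled back to the oldest peer
// last_update that still falls inside the authoritative log
// (>= best.log_tail), so the single LOG query sent at the end returns
// enough history to reconcile every usable peer in
// acting_recovery_backfill.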
6551 boost::statechart::result PeeringState::GetLog::react(const AdvMap& advmap)
6552 {
6553 // make sure our log source didn't go down. we need to check
6554 // explicitly because it may not be part of the prior set, which
6555 // means the Peering state check won't catch it going down.
6556 if (!advmap.osdmap->is_up(auth_log_shard.osd)) {
6557 psdout(10) << "GetLog: auth_log_shard osd."
6558 << auth_log_shard.osd << " went down" << dendl;
6559 post_event(advmap);
6560 return transit< Reset >();
6561 }
6562
6563 // let the Peering state do its checks.
6564 return forward_event();
6565 }
6566
6567 boost::statechart::result PeeringState::GetLog::react(const MLogRec& logevt)
6568 {
6569 ceph_assert(!msg);
6570 if (logevt.from != auth_log_shard) {
6571 psdout(10) << "GetLog: discarding log from "
6572 << "non-auth_log_shard osd." << logevt.from << dendl;
6573 return discard_event();
6574 }
6575 psdout(10) << "GetLog: received master log from osd."
6576 << logevt.from << dendl;
6577 msg = logevt.msg;
6578 post_event(GotLog());
6579 return discard_event();
6580 }
6581
6582 boost::statechart::result PeeringState::GetLog::react(const GotLog&)
6583 {
6584
6585 DECLARE_LOCALS;
6586 psdout(10) << "leaving GetLog" << dendl;
6587 if (msg) {
6588 psdout(10) << "processing master log" << dendl;
6589 ps->proc_master_log(context<PeeringMachine>().get_cur_transaction(),
6590 msg->info, msg->log, msg->missing,
6591 auth_log_shard);
6592 }
6593 ps->start_flush(context< PeeringMachine >().get_cur_transaction());
6594 return transit< GetMissing >();
6595 }
6596
6597 boost::statechart::result PeeringState::GetLog::react(const QueryState& q)
6598 {
6599 q.f->open_object_section("state");
6600 q.f->dump_string("name", state_name);
6601 q.f->dump_stream("enter_time") << enter_time;
6602 q.f->dump_stream("auth_log_shard") << auth_log_shard;
6603 q.f->close_section();
6604 return forward_event();
6605 }
6606
6607 void PeeringState::GetLog::exit()
6608 {
6609 context< PeeringMachine >().log_exit(state_name, enter_time);
6610
6611 DECLARE_LOCALS;
6612 utime_t dur = ceph_clock_now() - enter_time;
6613 pl->get_peering_perf().tinc(rs_getlog_latency, dur);
6614 ps->blocked_by.clear();
6615 }
6616
6617 /*------WaitActingChange--------*/
6618 PeeringState::WaitActingChange::WaitActingChange(my_context ctx)
6619 : my_base(ctx),
6620 NamedState(context< PeeringMachine >().state_history, "Started/Primary/WaitActingChange")
6621 {
6622 context< PeeringMachine >().log_enter(state_name);
6623 }
6624
6625 boost::statechart::result PeeringState::WaitActingChange::react(const AdvMap& advmap)
6626 {
6627 DECLARE_LOCALS;
6628 OSDMapRef osdmap = advmap.osdmap;
6629
6630   psdout(10) << "verifying want_acting " << ps->want_acting << " targets are still up" << dendl;
6631 for (vector<int>::iterator p = ps->want_acting.begin(); p != ps->want_acting.end(); ++p) {
6632 if (!osdmap->is_up(*p)) {
6633 psdout(10) << " want_acting target osd." << *p << " went down, resetting" << dendl;
6634 post_event(advmap);
6635 return transit< Reset >();
6636 }
6637 }
6638 return forward_event();
6639 }
6640
6641 boost::statechart::result PeeringState::WaitActingChange::react(const MLogRec& logevt)
6642 {
6643   psdout(10) << "In WaitActingChange, ignoring MLogRec" << dendl;
6644 return discard_event();
6645 }
6646
6647 boost::statechart::result PeeringState::WaitActingChange::react(const MInfoRec& evt)
6648 {
6649 psdout(10) << "In WaitActingChange, ignoring MInfoRec" << dendl;
6650 return discard_event();
6651 }
6652
6653 boost::statechart::result PeeringState::WaitActingChange::react(const MNotifyRec& evt)
6654 {
6655 psdout(10) << "In WaitActingChange, ignoring MNotifyRec" << dendl;
6656 return discard_event();
6657 }
6658
6659 boost::statechart::result PeeringState::WaitActingChange::react(const QueryState& q)
6660 {
6661 q.f->open_object_section("state");
6662 q.f->dump_string("name", state_name);
6663 q.f->dump_stream("enter_time") << enter_time;
6664 q.f->dump_string("comment", "waiting for pg acting set to change");
6665 q.f->close_section();
6666 return forward_event();
6667 }
6668
6669 void PeeringState::WaitActingChange::exit()
6670 {
6671 context< PeeringMachine >().log_exit(state_name, enter_time);
6672 DECLARE_LOCALS;
6673 utime_t dur = ceph_clock_now() - enter_time;
6674 pl->get_peering_perf().tinc(rs_waitactingchange_latency, dur);
6675 }
6676
6677 /*------Down--------*/
6678 PeeringState::Down::Down(my_context ctx)
6679 : my_base(ctx),
6680 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering/Down")
6681 {
6682 context< PeeringMachine >().log_enter(state_name);
6683 DECLARE_LOCALS;
6684
6685 ps->state_clear(PG_STATE_PEERING);
6686 ps->state_set(PG_STATE_DOWN);
6687
6688 auto &prior_set = context< Peering >().prior_set;
6689 ceph_assert(ps->blocked_by.empty());
6690 ps->blocked_by.insert(prior_set.down.begin(), prior_set.down.end());
6691 pl->publish_stats_to_osd();
6692 }
6693
6694 void PeeringState::Down::exit()
6695 {
6696 context< PeeringMachine >().log_exit(state_name, enter_time);
6697
6698 DECLARE_LOCALS;
6699
6700 ps->state_clear(PG_STATE_DOWN);
6701 utime_t dur = ceph_clock_now() - enter_time;
6702 pl->get_peering_perf().tinc(rs_down_latency, dur);
6703
6704 ps->blocked_by.clear();
6705 }
6706
6707 boost::statechart::result PeeringState::Down::react(const QueryState& q)
6708 {
6709 q.f->open_object_section("state");
6710 q.f->dump_string("name", state_name);
6711 q.f->dump_stream("enter_time") << enter_time;
6712 q.f->dump_string("comment",
6713 "not enough up instances of this PG to go active");
6714 q.f->close_section();
6715 return forward_event();
6716 }
6717
6718 boost::statechart::result PeeringState::Down::react(const MNotifyRec& infoevt)
6719 {
6720 DECLARE_LOCALS;
6721
6722 ceph_assert(ps->is_primary());
6723 epoch_t old_start = ps->info.history.last_epoch_started;
6724 if (!ps->peer_info.count(infoevt.from) &&
6725 ps->get_osdmap()->has_been_up_since(infoevt.from.osd, infoevt.notify.epoch_sent)) {
6726 ps->update_history(infoevt.notify.info.history);
6727 }
6728 // if we got something new to make pg escape down state
6729 if (ps->info.history.last_epoch_started > old_start) {
6730 psdout(10) << " last_epoch_started moved forward, re-enter getinfo" << dendl;
6731 ps->state_clear(PG_STATE_DOWN);
6732 ps->state_set(PG_STATE_PEERING);
6733 return transit< GetInfo >();
6734 }
6735
6736 return discard_event();
6737 }
6738
6739
6740 /*------Incomplete--------*/
6741 PeeringState::Incomplete::Incomplete(my_context ctx)
6742 : my_base(ctx),
6743 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering/Incomplete")
6744 {
6745 context< PeeringMachine >().log_enter(state_name);
6746 DECLARE_LOCALS;
6747
6748 ps->state_clear(PG_STATE_PEERING);
6749 ps->state_set(PG_STATE_INCOMPLETE);
6750
6751 PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
6752 ceph_assert(ps->blocked_by.empty());
6753 ps->blocked_by.insert(prior_set.down.begin(), prior_set.down.end());
6754 pl->publish_stats_to_osd();
6755 }
6756
6757 boost::statechart::result PeeringState::Incomplete::react(const AdvMap &advmap) {
6758 DECLARE_LOCALS;
6759 int64_t poolnum = ps->info.pgid.pool();
6760
6761   // Reset if min_size has become smaller than its previous value; the pg might now be able to go active
6762 if (!advmap.osdmap->have_pg_pool(poolnum) ||
6763 advmap.lastmap->get_pools().find(poolnum)->second.min_size >
6764 advmap.osdmap->get_pools().find(poolnum)->second.min_size) {
6765 post_event(advmap);
6766 return transit< Reset >();
6767 }
6768
6769 return forward_event();
6770 }
6771
6772 boost::statechart::result PeeringState::Incomplete::react(const MNotifyRec& notevt) {
6773 DECLARE_LOCALS;
6774 psdout(7) << "handle_pg_notify from osd." << notevt.from << dendl;
6775 if (ps->proc_replica_info(
6776 notevt.from, notevt.notify.info, notevt.notify.epoch_sent)) {
6777 // We got something new, try again!
6778 return transit< GetLog >();
6779 } else {
6780 return discard_event();
6781 }
6782 }
6783
6784 boost::statechart::result PeeringState::Incomplete::react(
6785 const QueryState& q)
6786 {
6787 q.f->open_object_section("state");
6788 q.f->dump_string("name", state_name);
6789 q.f->dump_stream("enter_time") << enter_time;
6790 q.f->dump_string("comment", "not enough complete instances of this PG");
6791 q.f->close_section();
6792 return forward_event();
6793 }
6794
6795 void PeeringState::Incomplete::exit()
6796 {
6797 context< PeeringMachine >().log_exit(state_name, enter_time);
6798
6799 DECLARE_LOCALS;
6800
6801 ps->state_clear(PG_STATE_INCOMPLETE);
6802 utime_t dur = ceph_clock_now() - enter_time;
6803 pl->get_peering_perf().tinc(rs_incomplete_latency, dur);
6804
6805 ps->blocked_by.clear();
6806 }
6807
6808 /*------GetMissing--------*/
6809 PeeringState::GetMissing::GetMissing(my_context ctx)
6810 : my_base(ctx),
6811 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering/GetMissing")
6812 {
6813 context< PeeringMachine >().log_enter(state_name);
6814
6815 DECLARE_LOCALS;
6816 ps->log_weirdness();
6817 ceph_assert(!ps->acting_recovery_backfill.empty());
6818 eversion_t since;
6819 for (set<pg_shard_t>::iterator i = ps->acting_recovery_backfill.begin();
6820 i != ps->acting_recovery_backfill.end();
6821 ++i) {
6822 if (*i == ps->get_primary()) continue;
6823 const pg_info_t& pi = ps->peer_info[*i];
6824     // reset this to make sure the pg_missing_t is initialized and
6825 // has the correct semantics even if we don't need to get a
6826 // missing set from a shard. This way later additions due to
6827 // lost+unfound delete work properly.
6828 ps->peer_missing[*i].may_include_deletes = !ps->perform_deletes_during_peering();
6829
6830 if (pi.is_empty())
6831 continue; // no pg data, nothing divergent
6832
6833 if (pi.last_update < ps->pg_log.get_tail()) {
6834 psdout(10) << " osd." << *i << " is not contiguous, will restart backfill" << dendl;
6835 ps->peer_missing[*i].clear();
6836 continue;
6837 }
6838 if (pi.last_backfill == hobject_t()) {
6839 psdout(10) << " osd." << *i << " will fully backfill; can infer empty missing set" << dendl;
6840 ps->peer_missing[*i].clear();
6841 continue;
6842 }
6843
6844 if (pi.last_update == pi.last_complete && // peer has no missing
6845 pi.last_update == ps->info.last_update) { // peer is up to date
6846 // replica has no missing and an identical log to ours; no need
6847 // to pull anything.
6848 // FIXME: we can do better here. if last_update==last_complete we
6849 // can infer the rest!
6850 psdout(10) << " osd." << *i << " has no missing, identical log" << dendl;
6851 ps->peer_missing[*i].clear();
6852 continue;
6853 }
6854
6855 // We pull the log from the peer's last_epoch_started to ensure we
6856 // get enough log to detect divergent updates.
6857 since.epoch = pi.last_epoch_started;
6858 ceph_assert(pi.last_update >= ps->info.log_tail); // or else choose_acting() did a bad thing
6859 if (pi.log_tail <= since) {
6860 psdout(10) << " requesting log+missing since " << since << " from osd." << *i << dendl;
6861 context< PeeringMachine >().send_query(
6862 i->osd,
6863 pg_query_t(
6864 pg_query_t::LOG,
6865 i->shard, ps->pg_whoami.shard,
6866 since, ps->info.history,
6867 ps->get_osdmap_epoch()));
6868 } else {
6869 psdout(10) << " requesting fulllog+missing from osd." << *i
6870 << " (want since " << since << " < log.tail "
6871 << pi.log_tail << ")" << dendl;
6872 context< PeeringMachine >().send_query(
6873 i->osd, pg_query_t(
6874 pg_query_t::FULLLOG,
6875 i->shard, ps->pg_whoami.shard,
6876 ps->info.history, ps->get_osdmap_epoch()));
6877 }
6878 peer_missing_requested.insert(*i);
6879 ps->blocked_by.insert(i->osd);
6880 }
6881
6882 if (peer_missing_requested.empty()) {
6883 if (ps->need_up_thru) {
6884 psdout(10) << " still need up_thru update before going active"
6885 << dendl;
6886 post_event(NeedUpThru());
6887 return;
6888 }
6889
6890 // all good!
6891 post_event(Activate(ps->get_osdmap_epoch()));
6892 } else {
6893 pl->publish_stats_to_osd();
6894 }
6895 }
6896
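// A peer has answered one of the LOG/FULLLOG queries sent above: fold
// its log and missing set into our state, and once the last
// outstanding reply is in, either wait for up_thru or post Activate.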
6897 boost::statechart::result PeeringState::GetMissing::react(const MLogRec& logevt)
6898 {
6899 DECLARE_LOCALS;
6900
6901 peer_missing_requested.erase(logevt.from);
6902 ps->proc_replica_log(logevt.msg->info, logevt.msg->log, logevt.msg->missing, logevt.from);
6903
6904 if (peer_missing_requested.empty()) {
6905 if (ps->need_up_thru) {
6906 psdout(10) << " still need up_thru update before going active"
6907 << dendl;
6908 post_event(NeedUpThru());
6909 } else {
6910 psdout(10) << "Got last missing, don't need missing "
6911 << "posting Activate" << dendl;
6912 post_event(Activate(ps->get_osdmap_epoch()));
6913 }
6914 }
6915 return discard_event();
6916 }
6917
6918 boost::statechart::result PeeringState::GetMissing::react(const QueryState& q)
6919 {
6920 DECLARE_LOCALS;
6921 q.f->open_object_section("state");
6922 q.f->dump_string("name", state_name);
6923 q.f->dump_stream("enter_time") << enter_time;
6924
6925 q.f->open_array_section("peer_missing_requested");
6926 for (set<pg_shard_t>::iterator p = peer_missing_requested.begin();
6927 p != peer_missing_requested.end();
6928 ++p) {
6929 q.f->open_object_section("osd");
6930 q.f->dump_stream("osd") << *p;
6931 if (ps->peer_missing.count(*p)) {
6932 q.f->open_object_section("got_missing");
6933 ps->peer_missing[*p].dump(q.f);
6934 q.f->close_section();
6935 }
6936 q.f->close_section();
6937 }
6938 q.f->close_section();
6939
6940 q.f->close_section();
6941 return forward_event();
6942 }
6943
6944 void PeeringState::GetMissing::exit()
6945 {
6946 context< PeeringMachine >().log_exit(state_name, enter_time);
6947
6948 DECLARE_LOCALS;
6949 utime_t dur = ceph_clock_now() - enter_time;
6950 pl->get_peering_perf().tinc(rs_getmissing_latency, dur);
6951 ps->blocked_by.clear();
6952 }
6953
6954 /*------WaitUpThru--------*/
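// WaitUpThru parks an otherwise-ready PG until the osdmap records a
// new up_thru for this OSD (see the QueryState comment below). Every
// ActMap rechecks need_up_thru and posts Activate once the map has
// caught up; log messages arriving in the meantime are still noted.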
6955 PeeringState::WaitUpThru::WaitUpThru(my_context ctx)
6956 : my_base(ctx),
6957 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering/WaitUpThru")
6958 {
6959 context< PeeringMachine >().log_enter(state_name);
6960 }
6961
6962 boost::statechart::result PeeringState::WaitUpThru::react(const ActMap& am)
6963 {
6964 DECLARE_LOCALS;
6965 if (!ps->need_up_thru) {
6966 post_event(Activate(ps->get_osdmap_epoch()));
6967 }
6968 return forward_event();
6969 }
6970
6971 boost::statechart::result PeeringState::WaitUpThru::react(const MLogRec& logevt)
6972 {
6973 DECLARE_LOCALS;
6974 psdout(10) << "Noting missing from osd." << logevt.from << dendl;
6975 ps->peer_missing[logevt.from].claim(logevt.msg->missing);
6976 ps->peer_info[logevt.from] = logevt.msg->info;
6977 return discard_event();
6978 }
6979
6980 boost::statechart::result PeeringState::WaitUpThru::react(const QueryState& q)
6981 {
6982 q.f->open_object_section("state");
6983 q.f->dump_string("name", state_name);
6984 q.f->dump_stream("enter_time") << enter_time;
6985 q.f->dump_string("comment", "waiting for osdmap to reflect a new up_thru for this osd");
6986 q.f->close_section();
6987 return forward_event();
6988 }
6989
6990 void PeeringState::WaitUpThru::exit()
6991 {
6992 context< PeeringMachine >().log_exit(state_name, enter_time);
6993 DECLARE_LOCALS;
6994 utime_t dur = ceph_clock_now() - enter_time;
6995 pl->get_peering_perf().tinc(rs_waitupthru_latency, dur);
6996 }
6997
6998 /*----PeeringState::PeeringMachine Methods-----*/
6999 #undef dout_prefix
7000 #define dout_prefix dpp->gen_prefix(*_dout)
7001
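// log_enter/log_exit bracket every state transition: each emits a
// level-5 debug line, and log_exit additionally reports the state's
// residency time and event counts to the listener before resetting
// the per-state counters.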
7002 void PeeringState::PeeringMachine::log_enter(const char *state_name)
7003 {
7004 DECLARE_LOCALS;
7005 psdout(5) << "enter " << state_name << dendl;
7006 pl->log_state_enter(state_name);
7007 }
7008
7009 void PeeringState::PeeringMachine::log_exit(const char *state_name, utime_t enter_time)
7010 {
7011 DECLARE_LOCALS;
7012 utime_t dur = ceph_clock_now() - enter_time;
7013 psdout(5) << "exit " << state_name << " " << dur << " " << event_count << " " << event_time << dendl;
7014 pl->log_state_exit(state_name, enter_time, event_count, event_time);
7015 event_count = 0;
7016 event_time = utime_t();
7017 }
7018
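// One-line PeeringState summary used in OSD debug output. Role, last
// peering reset, crt, mlcod and the state string always appear;
// everything else (acting set, EC primary, async/backfill targets,
// DELETING, past intervals, luod/lua, log-mismatch notes, lcod,
// NOTIFY, pruub) is printed only when it differs from the default or
// from the info already shown.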
7019 ostream &operator<<(ostream &out, const PeeringState &ps) {
7020 out << "pg[" << ps.info
7021 << " " << pg_vector_string(ps.up);
7022 if (ps.acting != ps.up)
7023 out << "/" << pg_vector_string(ps.acting);
7024 if (ps.is_ec_pg())
7025 out << "p" << ps.get_primary();
7026 if (!ps.async_recovery_targets.empty())
7027 out << " async=[" << ps.async_recovery_targets << "]";
7028 if (!ps.backfill_targets.empty())
7029 out << " backfill=[" << ps.backfill_targets << "]";
7030 out << " r=" << ps.get_role();
7031 out << " lpr=" << ps.get_last_peering_reset();
7032
7033 if (ps.deleting)
7034 out << " DELETING";
7035
7036 if (!ps.past_intervals.empty()) {
7037 out << " pi=[" << ps.past_intervals.get_bounds()
7038 << ")/" << ps.past_intervals.size();
7039 }
7040
7041 if (ps.is_peered()) {
7042 if (ps.last_update_ondisk != ps.info.last_update)
7043 out << " luod=" << ps.last_update_ondisk;
7044 if (ps.last_update_applied != ps.info.last_update)
7045 out << " lua=" << ps.last_update_applied;
7046 }
7047
7048 if (ps.pg_log.get_tail() != ps.info.log_tail ||
7049 ps.pg_log.get_head() != ps.info.last_update)
7050 out << " (info mismatch, " << ps.pg_log.get_log() << ")";
7051
7052 if (!ps.pg_log.get_log().empty()) {
7053 if (ps.pg_log.get_log().log.begin()->version <= ps.pg_log.get_tail()) {
7054 out << " (log bound mismatch, actual=["
7055 << ps.pg_log.get_log().log.begin()->version << ","
7056 << ps.pg_log.get_log().log.rbegin()->version << "]";
7057 out << ")";
7058 }
7059 }
7060
7061 out << " crt=" << ps.pg_log.get_can_rollback_to();
7062
7063 if (ps.last_complete_ondisk != ps.info.last_complete)
7064 out << " lcod " << ps.last_complete_ondisk;
7065
7066 out << " mlcod " << ps.min_last_complete_ondisk;
7067
7068 out << " " << pg_state_string(ps.get_state());
7069 if (ps.should_send_notify())
7070 out << " NOTIFY";
7071
7072 if (ps.prior_readable_until_ub != ceph::signedspan::zero()) {
7073 out << " pruub " << ps.prior_readable_until_ub
7074 << "@" << ps.get_prior_readable_down_osds();
7075 }
7076 return out;
7077 }