// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab

#include "PGPeeringEvent.h"
#include "common/ceph_releases.h"
#include "common/dout.h"
#include "PeeringState.h"

#include "messages/MOSDPGRemove.h"
#include "messages/MBackfillReserve.h"
#include "messages/MRecoveryReserve.h"
#include "messages/MOSDScrubReserve.h"
#include "messages/MOSDPGInfo.h"
#include "messages/MOSDPGInfo2.h"
#include "messages/MOSDPGTrim.h"
#include "messages/MOSDPGLog.h"
#include "messages/MOSDPGNotify.h"
#include "messages/MOSDPGNotify2.h"
#include "messages/MOSDPGQuery.h"
#include "messages/MOSDPGQuery2.h"
#include "messages/MOSDPGLease.h"
#include "messages/MOSDPGLeaseAck.h"

#define dout_context cct
#define dout_subsys ceph_subsys_osd

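// BufferedRecoveryMessages batches outgoing peering messages so they can
// be delivered (or deliberately held back) as a unit; the constructor
// below takes ownership of anything already queued on the supplied ctx.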
BufferedRecoveryMessages::BufferedRecoveryMessages(
  ceph_release_t r,
  PeeringCtx &ctx)
  : require_osd_release(r) {
  // steal messages from ctx
  message_map.swap(ctx.message_map);
}

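// The send_* helpers below choose the wire format from the cluster-wide
// require_osd_release: octopus and later use the per-PG *2 message types
// (MOSDPGNotify2, MOSDPGQuery2, MOSDPGInfo2), while older releases get the
// legacy vector-based messages so mixed-version clusters interoperate.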
void BufferedRecoveryMessages::send_notify(int to, const pg_notify_t &n)
{
  if (require_osd_release >= ceph_release_t::octopus) {
    spg_t pgid(n.info.pgid.pgid, n.to);
    send_osd_message(to, make_message<MOSDPGNotify2>(pgid, n));
  } else {
    send_osd_message(to, make_message<MOSDPGNotify>(n.epoch_sent, vector{n}));
  }
}

void BufferedRecoveryMessages::send_query(
  int to,
  spg_t to_spgid,
  const pg_query_t &q)
{
  if (require_osd_release >= ceph_release_t::octopus) {
    send_osd_message(to,
                     make_message<MOSDPGQuery2>(to_spgid, q));
  } else {
    auto m = make_message<MOSDPGQuery>(
      q.epoch_sent,
      MOSDPGQuery::pg_list_t{{to_spgid, q}});
    send_osd_message(to, m);
  }
}

void BufferedRecoveryMessages::send_info(
  int to,
  spg_t to_spgid,
  epoch_t min_epoch,
  epoch_t cur_epoch,
  const pg_info_t &info,
  std::optional<pg_lease_t> lease,
  std::optional<pg_lease_ack_t> lease_ack)
{
  if (require_osd_release >= ceph_release_t::octopus) {
    send_osd_message(
      to,
      make_message<MOSDPGInfo2>(
        to_spgid,
        info,
        cur_epoch,
        min_epoch,
        lease,
        lease_ack)
      );
  } else {
    send_osd_message(
      to,
      make_message<MOSDPGInfo>(
        cur_epoch,
        vector{pg_notify_t{to_spgid.shard,
                           info.pgid.shard,
                           min_epoch, cur_epoch,
                           info, PastIntervals{}}})
      );
  }
}

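// Refresh cached pool metadata from a new OSDMap. snapc is only rebuilt
// when we may have skipped an epoch (cached_epoch + 1 != current epoch) or
// the pool's snap epoch changed in this very epoch, which avoids calling
// get_snap_context() on every single map.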
void PGPool::update(CephContext *cct, OSDMapRef map)
{
  const pg_pool_t *pi = map->get_pg_pool(id);
  if (!pi) {
    return; // pool has been deleted
  }
  info = *pi;
  name = map->get_pool_name(id);

  bool updated = false;
  if ((map->get_epoch() != cached_epoch + 1) ||
      (pi->get_snap_epoch() == map->get_epoch())) {
    updated = true;
  }

  assert(map->require_osd_release >= ceph_release_t::mimic);
  if (info.is_pool_snaps_mode() && updated) {
    snapc = pi->get_snap_context();
  }
  cached_epoch = map->get_epoch();
}

/*-------------Peering State Helpers----------------*/
#undef dout_prefix
#define dout_prefix (dpp->gen_prefix(*_dout))
#undef psdout
#define psdout(x) ldout(cct, x)

PeeringState::PeeringState(
  CephContext *cct,
  pg_shard_t pg_whoami,
  spg_t spgid,
  const PGPool &_pool,
  OSDMapRef curmap,
  DoutPrefixProvider *dpp,
  PeeringListener *pl)
  : state_history(*pl),
    cct(cct),
    spgid(spgid),
    dpp(dpp),
    pl(pl),
    orig_ctx(0),
    osdmap_ref(curmap),
    pool(_pool),
    pg_whoami(pg_whoami),
    info(spgid),
    pg_log(cct),
    missing_loc(spgid, this, dpp, cct),
    machine(this, cct, spgid, dpp, pl, &state_history)
{
  machine.initiate();
}

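// start_handle()/end_handle() bracket the processing of a single peering
// event: start_handle wraps the caller's PeeringCtx (routing messages via
// messages_pending_flush while a flush is outstanding) and end_handle
// accounts the elapsed time and drops the wrapper.  Roughly, a caller does:
//
//   start_handle(&ctx);
//   machine.process_event(evt);  // may begin/end_block_outgoing
//   end_handle();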
void PeeringState::start_handle(PeeringCtx *new_ctx) {
  ceph_assert(!rctx);
  ceph_assert(!orig_ctx);
  orig_ctx = new_ctx;
  if (new_ctx) {
    if (messages_pending_flush) {
      rctx.emplace(*messages_pending_flush, *new_ctx);
    } else {
      rctx.emplace(*new_ctx);
    }
    rctx->start_time = ceph_clock_now();
  }
}

void PeeringState::begin_block_outgoing() {
  ceph_assert(!messages_pending_flush);
  ceph_assert(orig_ctx);
  ceph_assert(rctx);
  messages_pending_flush = BufferedRecoveryMessages(
    orig_ctx->require_osd_release);
  rctx.emplace(*messages_pending_flush, *orig_ctx);
}

void PeeringState::clear_blocked_outgoing() {
  ceph_assert(orig_ctx);
  ceph_assert(rctx);
  messages_pending_flush = std::optional<BufferedRecoveryMessages>();
}

void PeeringState::end_block_outgoing() {
  ceph_assert(messages_pending_flush);
  ceph_assert(orig_ctx);
  ceph_assert(rctx);

  orig_ctx->accept_buffered_messages(*messages_pending_flush);
  rctx.emplace(*orig_ctx);
  messages_pending_flush = std::optional<BufferedRecoveryMessages>();
}

void PeeringState::end_handle() {
  if (rctx) {
    utime_t dur = ceph_clock_now() - rctx->start_time;
    machine.event_time += dur;
  }

  machine.event_count++;
  rctx = std::nullopt;
  orig_ctx = NULL;
}

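// Besides letting missing_loc and the listener react to the new map, this
// drops peer_log_requested / peer_missing_requested entries for OSDs that
// are no longer up, so we do not wait on replies that can never arrive.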
void PeeringState::check_recovery_sources(const OSDMapRef& osdmap)
{
  /*
   * check that any peers we are planning to (or are currently) pull
   * objects from are dealt with.
   */
  missing_loc.check_recovery_sources(osdmap);
  pl->check_recovery_sources(osdmap);

  for (set<pg_shard_t>::iterator i = peer_log_requested.begin();
       i != peer_log_requested.end();
       ) {
    if (!osdmap->is_up(i->osd)) {
      psdout(10) << "peer_log_requested removing " << *i << dendl;
      peer_log_requested.erase(i++);
    } else {
      ++i;
    }
  }

  for (set<pg_shard_t>::iterator i = peer_missing_requested.begin();
       i != peer_missing_requested.end();
       ) {
    if (!osdmap->is_up(i->osd)) {
      psdout(10) << "peer_missing_requested removing " << *i << dendl;
      peer_missing_requested.erase(i++);
    } else {
      ++i;
    }
  }
}

void PeeringState::update_history(const pg_history_t& new_history)
{
  auto mnow = pl->get_mnow();
  info.history.refresh_prior_readable_until_ub(mnow, prior_readable_until_ub);
  if (info.history.merge(new_history)) {
    psdout(20) << __func__ << " advanced history from " << new_history << dendl;
    dirty_info = true;
    if (info.history.last_epoch_clean >= info.history.same_interval_since) {
      psdout(20) << __func__ << " clearing past_intervals" << dendl;
      past_intervals.clear();
      dirty_big_info = true;
    }
    prior_readable_until_ub = info.history.get_prior_readable_until_ub(mnow);
    if (prior_readable_until_ub != ceph::signedspan::zero()) {
      dout(20) << __func__
               << " prior_readable_until_ub " << prior_readable_until_ub
               << " (mnow " << mnow << " + "
               << info.history.prior_readable_until_ub << ")" << dendl;
    }
  }
  pl->on_info_history_change();
}

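// Tell stray replicas (peers that are neither up nor acting) to remove
// their copy of the PG.  Skipped entirely while a merge is pending and
// under the osd_debug_no_purge_strays debug option.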
void PeeringState::purge_strays()
{
  if (is_premerge()) {
    psdout(10) << "purge_strays " << stray_set << " but premerge, doing nothing"
               << dendl;
    return;
  }
  if (cct->_conf.get_val<bool>("osd_debug_no_purge_strays")) {
    return;
  }
  psdout(10) << "purge_strays " << stray_set << dendl;

  bool removed = false;
  for (set<pg_shard_t>::iterator p = stray_set.begin();
       p != stray_set.end();
       ++p) {
    ceph_assert(!is_acting_recovery_backfill(*p));
    if (get_osdmap()->is_up(p->osd)) {
      psdout(10) << "sending PGRemove to osd." << *p << dendl;
      vector<spg_t> to_remove;
      to_remove.push_back(spg_t(info.pgid.pgid, p->shard));
      MOSDPGRemove *m = new MOSDPGRemove(
        get_osdmap_epoch(),
        to_remove);
      pl->send_cluster_message(p->osd, m, get_osdmap_epoch());
    } else {
      psdout(10) << "not sending PGRemove to down osd." << *p << dendl;
    }
    peer_missing.erase(*p);
    peer_info.erase(*p);
    missing_loc.remove_stray_recovery_sources(*p);
    peer_purged.insert(*p);
    removed = true;
  }

  // if we removed anyone, update peers (which include peer_info)
  if (removed)
    update_heartbeat_peers();

  stray_set.clear();

  // clear _requested maps; we may have to peer() again if we discover
  // (more) stray content
  peer_log_requested.clear();
  peer_missing_requested.clear();
}


bool PeeringState::proc_replica_info(
  pg_shard_t from, const pg_info_t &oinfo, epoch_t send_epoch)
{
  map<pg_shard_t, pg_info_t>::iterator p = peer_info.find(from);
  if (p != peer_info.end() && p->second.last_update == oinfo.last_update) {
    psdout(10) << " got dup osd." << from << " info "
               << oinfo << ", identical to ours" << dendl;
    return false;
  }

  if (!get_osdmap()->has_been_up_since(from.osd, send_epoch)) {
    psdout(10) << " got info " << oinfo << " from down osd." << from
               << " discarding" << dendl;
    return false;
  }

  psdout(10) << " got osd." << from << " " << oinfo << dendl;
  ceph_assert(is_primary());
  peer_info[from] = oinfo;
  might_have_unfound.insert(from);

  update_history(oinfo.history);

  // stray?
  if (!is_up(from) && !is_acting(from)) {
    psdout(10) << " osd." << from << " has stray content: " << oinfo << dendl;
    stray_set.insert(from);
    if (is_clean()) {
      purge_strays();
    }
  }

  // was this a new info? if so, update peers!
  if (p == peer_info.end())
    update_heartbeat_peers();

  return true;
}


void PeeringState::remove_down_peer_info(const OSDMapRef &osdmap)
{
  // Remove any downed osds from peer_info
  bool removed = false;
  map<pg_shard_t, pg_info_t>::iterator p = peer_info.begin();
  while (p != peer_info.end()) {
    if (!osdmap->is_up(p->first.osd)) {
      psdout(10) << " dropping down osd." << p->first << " info " << p->second << dendl;
      peer_missing.erase(p->first);
      peer_log_requested.erase(p->first);
      peer_missing_requested.erase(p->first);
      peer_purged.erase(p->first);
      peer_info.erase(p++);
      removed = true;
    } else
      ++p;
  }

  // if we removed anyone, update peers (which include peer_info)
  if (removed)
    update_heartbeat_peers();

  check_recovery_sources(osdmap);
}

void PeeringState::update_heartbeat_peers()
{
  if (!is_primary())
    return;

  set<int> new_peers;
  for (unsigned i=0; i<acting.size(); i++) {
    if (acting[i] != CRUSH_ITEM_NONE)
      new_peers.insert(acting[i]);
  }
  for (unsigned i=0; i<up.size(); i++) {
    if (up[i] != CRUSH_ITEM_NONE)
      new_peers.insert(up[i]);
  }
  for (map<pg_shard_t,pg_info_t>::iterator p = peer_info.begin();
       p != peer_info.end();
       ++p) {
    new_peers.insert(p->first.osd);
  }
  pl->update_heartbeat_peers(std::move(new_peers));
}

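// Persist PG metadata if anything changed.  dirty_info covers the small,
// frequently updated portion (pg_info_t); dirty_big_info covers the larger
// state such as past_intervals, which changes less often.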
void PeeringState::write_if_dirty(ObjectStore::Transaction& t)
{
  pl->prepare_write(
    info,
    last_written_info,
    past_intervals,
    pg_log,
    dirty_info,
    dirty_big_info,
    last_persisted_osdmap < get_osdmap_epoch(),
    t);
  if (dirty_info || dirty_big_info) {
    last_persisted_osdmap = get_osdmap_epoch();
    last_written_info = info;
    dirty_info = false;
    dirty_big_info = false;
  }
}

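// advance_map/activate_map drive the state machine across OSDMap epochs:
// advance_map feeds one AdvMap event per map, and activate_map sends the
// final ActMap, additionally dirtying our info if we have fallen more than
// osd_pg_epoch_persisted_max_stale epochs behind the last persisted map.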
void PeeringState::advance_map(
  OSDMapRef osdmap, OSDMapRef lastmap,
  vector<int>& newup, int up_primary,
  vector<int>& newacting, int acting_primary,
  PeeringCtx &rctx)
{
  ceph_assert(lastmap == osdmap_ref);
  psdout(10) << "handle_advance_map "
             << newup << "/" << newacting
             << " -- " << up_primary << "/" << acting_primary
             << dendl;

  update_osdmap_ref(osdmap);
  pool.update(cct, osdmap);

  AdvMap evt(
    osdmap, lastmap, newup, up_primary,
    newacting, acting_primary);
  handle_event(evt, &rctx);
  if (pool.info.last_change == osdmap_ref->get_epoch()) {
    pl->on_pool_change();
  }
  readable_interval = pool.get_readable_interval();
  last_require_osd_release = osdmap->require_osd_release;
}

void PeeringState::activate_map(PeeringCtx &rctx)
{
  psdout(10) << __func__ << dendl;
  ActMap evt;
  handle_event(evt, &rctx);
  if (osdmap_ref->get_epoch() - last_persisted_osdmap >
      cct->_conf->osd_pg_epoch_persisted_max_stale) {
    psdout(20) << __func__ << ": Dirtying info: last_persisted is "
               << last_persisted_osdmap
               << " while current is " << osdmap_ref->get_epoch() << dendl;
    dirty_info = true;
  } else {
    psdout(20) << __func__ << ": Not dirtying info: last_persisted is "
               << last_persisted_osdmap
               << " while current is " << osdmap_ref->get_epoch() << dendl;
  }
  write_if_dirty(rctx.transaction);

  if (get_osdmap()->check_new_blacklist_entries()) {
    pl->check_blacklisted_watchers();
  }
}

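// Record that peering restarted in this epoch.  Buffered outgoing recovery
// messages from the previous attempt are discarded, and new ones are held
// back until the listener confirms any in-flight work has been flushed.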
void PeeringState::set_last_peering_reset()
{
  psdout(20) << "set_last_peering_reset " << get_osdmap_epoch() << dendl;
  if (last_peering_reset != get_osdmap_epoch()) {
    last_peering_reset = get_osdmap_epoch();
    psdout(10) << "Clearing blocked outgoing recovery messages" << dendl;
    clear_blocked_outgoing();
    if (!pl->try_flush_or_schedule_async()) {
      psdout(10) << "Beginning to block outgoing recovery messages" << dendl;
      begin_block_outgoing();
    } else {
      psdout(10) << "Not blocking outgoing recovery messages" << dendl;
    }
  }
}

void PeeringState::complete_flush()
{
  flushes_in_progress--;
  if (flushes_in_progress == 0) {
    pl->on_flushed();
  }
}

void PeeringState::check_full_transition(OSDMapRef lastmap, OSDMapRef osdmap)
{
  const pg_pool_t *pi = osdmap->get_pg_pool(info.pgid.pool());
  if (!pi) {
    return; // pool deleted
  }
  bool changed = false;
  if (pi->has_flag(pg_pool_t::FLAG_FULL)) {
    const pg_pool_t *opi = lastmap->get_pg_pool(info.pgid.pool());
    if (!opi || !opi->has_flag(pg_pool_t::FLAG_FULL)) {
      psdout(10) << " pool was marked full in " << osdmap->get_epoch() << dendl;
      changed = true;
    }
  }
  if (changed) {
    info.history.last_epoch_marked_full = osdmap->get_epoch();
    dirty_info = true;
  }
}

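// Peering must restart either when the up/acting sets define a new past
// interval, or when this OSD itself just transitioned from down to up.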
bool PeeringState::should_restart_peering(
  int newupprimary,
  int newactingprimary,
  const vector<int>& newup,
  const vector<int>& newacting,
  OSDMapRef lastmap,
  OSDMapRef osdmap)
{
  if (PastIntervals::is_new_interval(
        primary.osd,
        newactingprimary,
        acting,
        newacting,
        up_primary.osd,
        newupprimary,
        up,
        newup,
        osdmap.get(),
        lastmap.get(),
        info.pgid.pgid)) {
    psdout(20) << "new interval newup " << newup
               << " newacting " << newacting << dendl;
    return true;
  }
  if (!lastmap->is_up(pg_whoami.osd) && osdmap->is_up(pg_whoami.osd)) {
    psdout(10) << __func__ << " osd transitioned from down -> up"
               << dendl;
    return true;
  }
  return false;
}

/* Called before initializing peering during advance_map */
void PeeringState::start_peering_interval(
  const OSDMapRef lastmap,
  const vector<int>& newup, int new_up_primary,
  const vector<int>& newacting, int new_acting_primary,
  ObjectStore::Transaction &t)
{
  const OSDMapRef osdmap = get_osdmap();

  set_last_peering_reset();

  vector<int> oldacting, oldup;
  int oldrole = get_role();

  if (is_primary()) {
    pl->clear_ready_to_merge();
  }


  pg_shard_t old_acting_primary = get_primary();
  pg_shard_t old_up_primary = up_primary;
  bool was_old_primary = is_primary();
  bool was_old_nonprimary = is_nonprimary();

  acting.swap(oldacting);
  up.swap(oldup);
  init_primary_up_acting(
    newup,
    newacting,
    new_up_primary,
    new_acting_primary);

  if (info.stats.up != up ||
      info.stats.acting != acting ||
      info.stats.up_primary != new_up_primary ||
      info.stats.acting_primary != new_acting_primary) {
    info.stats.up = up;
    info.stats.up_primary = new_up_primary;
    info.stats.acting = acting;
    info.stats.acting_primary = new_acting_primary;
    info.stats.mapping_epoch = osdmap->get_epoch();
  }

  pl->clear_publish_stats();

  // This will now be remapped during a backfill in cases
  // where it would not have been before.
  if (up != acting)
    state_set(PG_STATE_REMAPPED);
  else
    state_clear(PG_STATE_REMAPPED);

  int role = osdmap->calc_pg_role(pg_whoami, acting);
  set_role(role);

  // did acting, up, primary|acker change?
  if (!lastmap) {
    psdout(10) << " no lastmap" << dendl;
    dirty_info = true;
    dirty_big_info = true;
    info.history.same_interval_since = osdmap->get_epoch();
  } else {
    std::stringstream debug;
    ceph_assert(info.history.same_interval_since != 0);
    bool new_interval = PastIntervals::check_new_interval(
      old_acting_primary.osd,
      new_acting_primary,
      oldacting, newacting,
      old_up_primary.osd,
      new_up_primary,
      oldup, newup,
      info.history.same_interval_since,
      info.history.last_epoch_clean,
      osdmap.get(),
      lastmap.get(),
      info.pgid.pgid,
      missing_loc.get_recoverable_predicate(),
      &past_intervals,
      &debug);
    psdout(10) << __func__ << ": check_new_interval output: "
               << debug.str() << dendl;
    if (new_interval) {
      if (osdmap->get_epoch() == pl->oldest_stored_osdmap() &&
          info.history.last_epoch_clean < osdmap->get_epoch()) {
        psdout(10) << " map gap, clearing past_intervals and faking" << dendl;
        // our information is incomplete and useless; someone else was clean
        // after everything we know about, but the intervening osdmaps were
        // trimmed.
        past_intervals.clear();
      } else {
        psdout(10) << " noting past " << past_intervals << dendl;
      }
      dirty_info = true;
      dirty_big_info = true;
      info.history.same_interval_since = osdmap->get_epoch();
      if (osdmap->have_pg_pool(info.pgid.pgid.pool()) &&
          info.pgid.pgid.is_split(lastmap->get_pg_num(info.pgid.pgid.pool()),
                                  osdmap->get_pg_num(info.pgid.pgid.pool()),
                                  nullptr)) {
        info.history.last_epoch_split = osdmap->get_epoch();
      }
    }
  }

  if (old_up_primary != up_primary ||
      oldup != up) {
    info.history.same_up_since = osdmap->get_epoch();
  }
  // this comparison includes primary rank via pg_shard_t
  if (old_acting_primary != get_primary()) {
    info.history.same_primary_since = osdmap->get_epoch();
  }

  on_new_interval();
  pl->on_info_history_change();

  psdout(1) << __func__ << " up " << oldup << " -> " << up
            << ", acting " << oldacting << " -> " << acting
            << ", acting_primary " << old_acting_primary << " -> "
            << new_acting_primary
            << ", up_primary " << old_up_primary << " -> " << new_up_primary
            << ", role " << oldrole << " -> " << role
            << ", features acting " << acting_features
            << " upacting " << upacting_features
            << dendl;

  // deactivate.
  state_clear(PG_STATE_ACTIVE);
  state_clear(PG_STATE_PEERED);
  state_clear(PG_STATE_PREMERGE);
  state_clear(PG_STATE_DOWN);
  state_clear(PG_STATE_RECOVERY_WAIT);
  state_clear(PG_STATE_RECOVERY_TOOFULL);
  state_clear(PG_STATE_RECOVERING);

  peer_purged.clear();
  acting_recovery_backfill.clear();

  // reset primary/replica state?
  if (was_old_primary || is_primary()) {
    pl->clear_want_pg_temp();
  } else if (was_old_nonprimary || is_nonprimary()) {
    pl->clear_want_pg_temp();
  }
  clear_primary_state();

  pl->on_change(t);

  ceph_assert(!deleting);

  // should we tell the primary we are here?
  send_notify = !is_primary();

  if (role != oldrole ||
      was_old_primary != is_primary()) {
    // did primary change?
    if (was_old_primary != is_primary()) {
      state_clear(PG_STATE_CLEAN);
    }

    pl->on_role_change();
  } else {
    // no role change.
    // did primary change?
    if (get_primary() != old_acting_primary) {
      psdout(10) << oldacting << " -> " << acting
                 << ", acting primary "
                 << old_acting_primary << " -> " << get_primary()
                 << dendl;
    } else {
      // primary is the same.
      if (is_primary()) {
        // i am (still) primary. but my replica set changed.
        state_clear(PG_STATE_CLEAN);

        psdout(10) << oldacting << " -> " << acting
                   << ", replicas changed" << dendl;
      }
    }
  }

  if (acting.empty() && !up.empty() && up_primary == pg_whoami) {
    psdout(10) << " acting empty, but i am up[0], clearing pg_temp" << dendl;
    pl->queue_want_pg_temp(acting);
  }
}

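// Reset per-interval state: recompute the feature intersection of the
// acting and up sets, rebuild the missing set if delete handling changed,
// refresh heartbeat stamps, and fold the previous interval's readable_until
// upper bound into prior_readable_until_ub before zeroing the lease state.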
void PeeringState::on_new_interval()
{
  dout(20) << __func__ << dendl;
  const OSDMapRef osdmap = get_osdmap();

  // initialize features
  acting_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
  upacting_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
  for (vector<int>::iterator p = acting.begin(); p != acting.end(); ++p) {
    if (*p == CRUSH_ITEM_NONE)
      continue;
    uint64_t f = osdmap->get_xinfo(*p).features;
    acting_features &= f;
    upacting_features &= f;
  }
  for (vector<int>::iterator p = up.begin(); p != up.end(); ++p) {
    if (*p == CRUSH_ITEM_NONE)
      continue;
    upacting_features &= osdmap->get_xinfo(*p).features;
  }
  psdout(20) << __func__ << " upacting_features 0x" << std::hex
             << upacting_features << std::dec
             << " from " << acting << "+" << up << dendl;

  psdout(20) << __func__ << " checking missing set deletes flag. missing = "
             << get_pg_log().get_missing() << dendl;

  if (!pg_log.get_missing().may_include_deletes &&
      !perform_deletes_during_peering()) {
    pl->rebuild_missing_set_with_deletes(pg_log);
  }
  ceph_assert(
    pg_log.get_missing().may_include_deletes ==
    !perform_deletes_during_peering());

  init_hb_stamps();

  // update lease bounds for a new interval
  auto mnow = pl->get_mnow();
  prior_readable_until_ub = std::max(prior_readable_until_ub,
                                     readable_until_ub);
  prior_readable_until_ub = info.history.refresh_prior_readable_until_ub(
    mnow, prior_readable_until_ub);
  psdout(10) << __func__ << " prior_readable_until_ub "
             << prior_readable_until_ub << " (mnow " << mnow << " + "
             << info.history.prior_readable_until_ub << ")" << dendl;
  prior_readable_down_osds.clear(); // we populate this when we build the priorset

  readable_until =
    readable_until_ub =
    readable_until_ub_sent =
    readable_until_ub_from_primary = ceph::signedspan::zero();

  acting_readable_until_ub.clear();
  if (is_primary()) {
    acting_readable_until_ub.resize(acting.size(), ceph::signedspan::zero());
  }

  pl->on_new_interval();
}

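// Rebuild the acting/up shard sets and primaries from the raw OSD id
// vectors.  For erasure-coded pools the vector position is the shard id,
// so e.g. acting = [3, CRUSH_ITEM_NONE, 7] yields actingset {3(0), 7(2)};
// replicated pools use NO_SHARD throughout.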
void PeeringState::init_primary_up_acting(
  const vector<int> &newup,
  const vector<int> &newacting,
  int new_up_primary,
  int new_acting_primary)
{
  actingset.clear();
  acting = newacting;
  for (uint8_t i = 0; i < acting.size(); ++i) {
    if (acting[i] != CRUSH_ITEM_NONE)
      actingset.insert(
        pg_shard_t(
          acting[i],
          pool.info.is_erasure() ? shard_id_t(i) : shard_id_t::NO_SHARD));
  }
  upset.clear();
  up = newup;
  for (uint8_t i = 0; i < up.size(); ++i) {
    if (up[i] != CRUSH_ITEM_NONE)
      upset.insert(
        pg_shard_t(
          up[i],
          pool.info.is_erasure() ? shard_id_t(i) : shard_id_t::NO_SHARD));
  }
  if (!pool.info.is_erasure()) {
    // replicated
    up_primary = pg_shard_t(new_up_primary, shard_id_t::NO_SHARD);
    primary = pg_shard_t(new_acting_primary, shard_id_t::NO_SHARD);
  } else {
    // erasure
    up_primary = pg_shard_t();
    primary = pg_shard_t();
    for (uint8_t i = 0; i < up.size(); ++i) {
      if (up[i] == new_up_primary) {
        up_primary = pg_shard_t(up[i], shard_id_t(i));
        break;
      }
    }
    for (uint8_t i = 0; i < acting.size(); ++i) {
      if (acting[i] == new_acting_primary) {
        primary = pg_shard_t(acting[i], shard_id_t(i));
        break;
      }
    }
    ceph_assert(up_primary.osd == new_up_primary);
    ceph_assert(primary.osd == new_acting_primary);
  }
}

void PeeringState::init_hb_stamps()
{
  if (is_primary()) {
    // we care about all other osds in the acting set
    hb_stamps.resize(acting.size() - 1);
    unsigned i = 0;
    for (auto p : acting) {
      if (p == CRUSH_ITEM_NONE || p == get_primary().osd) {
        continue;
      }
      hb_stamps[i++] = pl->get_hb_stamps(p);
    }
    hb_stamps.resize(i);
  } else if (is_nonprimary()) {
    // we care about just the primary
    hb_stamps.resize(1);
    hb_stamps[0] = pl->get_hb_stamps(get_primary().osd);
  } else {
    hb_stamps.clear();
  }
  dout(10) << __func__ << " now " << hb_stamps << dendl;
}


void PeeringState::clear_recovery_state()
{
  async_recovery_targets.clear();
  backfill_targets.clear();
}

void PeeringState::clear_primary_state()
{
  psdout(10) << "clear_primary_state" << dendl;

  // clear peering state
  stray_set.clear();
  peer_log_requested.clear();
  peer_missing_requested.clear();
  peer_info.clear();
  peer_bytes.clear();
  peer_missing.clear();
  peer_last_complete_ondisk.clear();
  peer_activated.clear();
  min_last_complete_ondisk = eversion_t();
  pg_trim_to = eversion_t();
  might_have_unfound.clear();
  need_up_thru = false;
  missing_loc.clear();
  pg_log.reset_recovery_pointers();

  clear_recovery_state();

  last_update_ondisk = eversion_t();
  missing_loc.clear();
  pl->clear_primary_state();
}

/// return [start,end) bounds for required past_intervals
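/// e.g. last_epoch_clean=700, same_interval_since=900, oldest_map=650,
/// epoch_pool_created=100 gives [700, 900): we must cover every interval
/// since the PG was last clean, but can never look further back than
/// oldest_map.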
static pair<epoch_t, epoch_t> get_required_past_interval_bounds(
  const pg_info_t &info,
  epoch_t oldest_map) {
  epoch_t start = std::max(
    info.history.last_epoch_clean ? info.history.last_epoch_clean :
    info.history.epoch_pool_created,
    oldest_map);
  epoch_t end = std::max(
    info.history.same_interval_since,
    info.history.epoch_pool_created);
  return make_pair(start, end);
}


void PeeringState::check_past_interval_bounds() const
{
  auto oldest_epoch = pl->oldest_stored_osdmap();
  auto rpib = get_required_past_interval_bounds(
    info,
    oldest_epoch);
  if (rpib.first >= rpib.second) {
    // do not warn if the start bound is dictated by oldest_map; the
    // past intervals are presumably appropriate given the pg info.
    if (!past_intervals.empty() &&
        rpib.first > oldest_epoch) {
      pl->get_clog_error() << info.pgid << " required past_interval bounds are"
                           << " empty [" << rpib << ") but past_intervals is not: "
                           << past_intervals;
      derr << info.pgid << " required past_interval bounds are"
           << " empty [" << rpib << ") but past_intervals is not: "
           << past_intervals << dendl;
    }
  } else {
    if (past_intervals.empty()) {
      pl->get_clog_error() << info.pgid << " required past_interval bounds are"
                           << " not empty [" << rpib << ") but past_intervals "
                           << past_intervals << " is empty";
      derr << info.pgid << " required past_interval bounds are"
           << " not empty [" << rpib << ") but past_intervals "
           << past_intervals << " is empty" << dendl;
      ceph_assert(!past_intervals.empty());
    }

    auto apib = past_intervals.get_bounds();
    if (apib.first > rpib.first) {
      pl->get_clog_error() << info.pgid << " past_intervals [" << apib
                           << ") start interval does not contain the required"
                           << " bound [" << rpib << ") start";
      derr << info.pgid << " past_intervals [" << apib
           << ") start interval does not contain the required"
           << " bound [" << rpib << ") start" << dendl;
      ceph_abort_msg("past_interval start interval mismatch");
    }
    if (apib.second != rpib.second) {
      pl->get_clog_error() << info.pgid << " past_interval bound [" << apib
                           << ") end does not match required [" << rpib
                           << ") end";
      derr << info.pgid << " past_interval bound [" << apib
           << ") end does not match required [" << rpib
           << ") end" << dendl;
      ceph_abort_msg("past_interval end mismatch");
    }
  }
}

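// Worked example (assuming OSD_POOL_PRIORITY_MIN == -10 and
// OSD_POOL_PRIORITY_MAX == 10): priority=210, pool_recovery_priority=5,
// max=253 -> the pool value shifts to 15, the sum 225 lies within
// [OSD_RECOVERY_PRIORITY_MIN, max] and is returned; a larger sum would be
// clamped to max.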
int PeeringState::clamp_recovery_priority(int priority, int pool_recovery_priority, int max)
{
  static_assert(OSD_RECOVERY_PRIORITY_MIN < OSD_RECOVERY_PRIORITY_MAX, "Invalid priority range");
  static_assert(OSD_RECOVERY_PRIORITY_MIN >= 0, "Priority range must match unsigned type");

  ceph_assert(max <= OSD_RECOVERY_PRIORITY_MAX);

  // User can't set this too high anymore, but might be a legacy value
  if (pool_recovery_priority > OSD_POOL_PRIORITY_MAX)
    pool_recovery_priority = OSD_POOL_PRIORITY_MAX;
  if (pool_recovery_priority < OSD_POOL_PRIORITY_MIN)
    pool_recovery_priority = OSD_POOL_PRIORITY_MIN;
  // Shift range from [min, max] to [0, max - min]
  pool_recovery_priority += (0 - OSD_POOL_PRIORITY_MIN);
  ceph_assert(pool_recovery_priority >= 0 && pool_recovery_priority <= (OSD_POOL_PRIORITY_MAX - OSD_POOL_PRIORITY_MIN));

  priority += pool_recovery_priority;

  // Clamp to valid range
  if (priority > max) {
    return max;
  } else if (priority < OSD_RECOVERY_PRIORITY_MIN) {
    return OSD_RECOVERY_PRIORITY_MIN;
  } else {
    return priority;
  }
}

unsigned PeeringState::get_recovery_priority()
{
  // a higher value -> a higher priority
  int ret = OSD_RECOVERY_PRIORITY_BASE;
  int base = ret;

  if (state & PG_STATE_FORCED_RECOVERY) {
    ret = OSD_RECOVERY_PRIORITY_FORCED;
  } else {
    // XXX: This priority boost isn't so much about inactive, but about data-at-risk
    if (is_degraded() && info.stats.avail_no_missing.size() < pool.info.min_size) {
      base = OSD_RECOVERY_INACTIVE_PRIORITY_BASE;
      // inactive: no. of replicas < min_size, highest priority since it blocks IO
      ret = base + (pool.info.min_size - info.stats.avail_no_missing.size());
    }

    int64_t pool_recovery_priority = 0;
    pool.info.opts.get(pool_opts_t::RECOVERY_PRIORITY, &pool_recovery_priority);

    ret = clamp_recovery_priority(ret, pool_recovery_priority, max_prio_map[base]);
  }
  psdout(20) << __func__ << " recovery priority is " << ret << dendl;
  return static_cast<unsigned>(ret);
}

unsigned PeeringState::get_backfill_priority()
{
  // a higher value -> a higher priority
  int ret = OSD_BACKFILL_PRIORITY_BASE;
  int base = ret;

  if (state & PG_STATE_FORCED_BACKFILL) {
    ret = OSD_BACKFILL_PRIORITY_FORCED;
  } else {
    if (acting.size() < pool.info.min_size) {
      base = OSD_BACKFILL_INACTIVE_PRIORITY_BASE;
      // inactive: no. of replicas < min_size, highest priority since it blocks IO
      ret = base + (pool.info.min_size - acting.size());

    } else if (is_undersized()) {
      // undersized: OSD_BACKFILL_DEGRADED_PRIORITY_BASE + num missing replicas
      ceph_assert(pool.info.size > actingset.size());
      base = OSD_BACKFILL_DEGRADED_PRIORITY_BASE;
      ret = base + (pool.info.size - actingset.size());

    } else if (is_degraded()) {
      // degraded: baseline degraded
      base = ret = OSD_BACKFILL_DEGRADED_PRIORITY_BASE;
    }

    // Adjust with pool's recovery priority
    int64_t pool_recovery_priority = 0;
    pool.info.opts.get(pool_opts_t::RECOVERY_PRIORITY, &pool_recovery_priority);

    ret = clamp_recovery_priority(ret, pool_recovery_priority, max_prio_map[base]);
  }

  psdout(20) << __func__ << " backfill priority is " << ret << dendl;
  return static_cast<unsigned>(ret);
}

unsigned PeeringState::get_delete_priority()
{
  auto state = get_osdmap()->get_state(pg_whoami.osd);
  if (state & (CEPH_OSD_BACKFILLFULL |
               CEPH_OSD_FULL)) {
    return OSD_DELETE_PRIORITY_FULL;
  } else if (state & CEPH_OSD_NEARFULL) {
    return OSD_DELETE_PRIORITY_FULLISH;
  } else {
    return OSD_DELETE_PRIORITY_NORMAL;
  }
}

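// set_force_recovery/set_force_backfill toggle the PG_STATE_FORCED_* flags
// (as driven by the force-recovery/force-backfill commands); on any change
// stats are republished and the local background io priority recomputed.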
bool PeeringState::set_force_recovery(bool b)
{
  bool did = false;
  if (b) {
    if (!(state & PG_STATE_FORCED_RECOVERY) &&
        (state & (PG_STATE_DEGRADED |
                  PG_STATE_RECOVERY_WAIT |
                  PG_STATE_RECOVERING))) {
      psdout(20) << __func__ << " set" << dendl;
      state_set(PG_STATE_FORCED_RECOVERY);
      pl->publish_stats_to_osd();
      did = true;
    }
  } else if (state & PG_STATE_FORCED_RECOVERY) {
    psdout(20) << __func__ << " clear" << dendl;
    state_clear(PG_STATE_FORCED_RECOVERY);
    pl->publish_stats_to_osd();
    did = true;
  }
  if (did) {
    psdout(20) << __func__ << " state " << get_current_state()
               << dendl;
    pl->update_local_background_io_priority(get_recovery_priority());
  }
  return did;
}

bool PeeringState::set_force_backfill(bool b)
{
  bool did = false;
  if (b) {
    if (!(state & PG_STATE_FORCED_BACKFILL) &&
        (state & (PG_STATE_DEGRADED |
                  PG_STATE_BACKFILL_WAIT |
                  PG_STATE_BACKFILLING))) {
      psdout(10) << __func__ << " set" << dendl;
      state_set(PG_STATE_FORCED_BACKFILL);
      pl->publish_stats_to_osd();
      did = true;
    }
  } else if (state & PG_STATE_FORCED_BACKFILL) {
    psdout(10) << __func__ << " clear" << dendl;
    state_clear(PG_STATE_FORCED_BACKFILL);
    pl->publish_stats_to_osd();
    did = true;
  }
  if (did) {
    psdout(20) << __func__ << " state " << get_current_state()
               << dendl;
    pl->update_local_background_io_priority(get_backfill_priority());
  }
  return did;
}

void PeeringState::schedule_renew_lease()
{
  pl->schedule_renew_lease(
    last_peering_reset,
    readable_interval / 2);
}

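// Read leases: the primary periodically sends its readable_until upper
// bound to every other acting-set member (send_lease); a replica applies
// it adjusted by the estimated clock delta (proc_lease), and the primary
// folds the acks back into its own readable_until (proc_lease_ack /
// recalc_readable_until).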
void PeeringState::send_lease()
{
  epoch_t epoch = pl->get_osdmap_epoch();
  for (auto peer : actingset) {
    if (peer == pg_whoami) {
      continue;
    }
    pl->send_cluster_message(
      peer.osd,
      new MOSDPGLease(epoch,
                      spg_t(spgid.pgid, peer.shard),
                      get_lease()),
      epoch);
  }
}

void PeeringState::proc_lease(const pg_lease_t& l)
{
  if (!HAVE_FEATURE(upacting_features, SERVER_OCTOPUS)) {
    psdout(20) << __func__ << " no-op, upacting_features 0x" << std::hex
               << upacting_features << std::dec
               << " does not include SERVER_OCTOPUS" << dendl;
    return;
  }
  if (!is_nonprimary()) {
    psdout(20) << __func__ << " no-op, !nonprimary" << dendl;
    return;
  }
  psdout(10) << __func__ << " " << l << dendl;
  if (l.readable_until_ub > readable_until_ub_from_primary) {
    readable_until_ub_from_primary = l.readable_until_ub;
  }

  ceph::signedspan ru = ceph::signedspan::zero();
  if (l.readable_until != ceph::signedspan::zero() &&
      hb_stamps[0]->peer_clock_delta_ub) {
    ru = l.readable_until - *hb_stamps[0]->peer_clock_delta_ub;
    psdout(20) << " peer_clock_delta_ub " << *hb_stamps[0]->peer_clock_delta_ub
               << " -> ru " << ru << dendl;
  }
  if (ru > readable_until) {
    readable_until = ru;
    psdout(20) << __func__ << " readable_until now " << readable_until << dendl;
    // NOTE: if we ever decide to block/queue ops on the replica,
    // we'll need to wake them up here.
  }

  ceph::signedspan ruub;
  if (hb_stamps[0]->peer_clock_delta_lb) {
    ruub = l.readable_until_ub - *hb_stamps[0]->peer_clock_delta_lb;
    psdout(20) << " peer_clock_delta_lb " << *hb_stamps[0]->peer_clock_delta_lb
               << " -> ruub " << ruub << dendl;
  } else {
    ruub = pl->get_mnow() + l.interval;
    psdout(20) << " no peer_clock_delta_lb -> ruub " << ruub << dendl;
  }
  if (ruub > readable_until_ub) {
    readable_until_ub = ruub;
    psdout(20) << __func__ << " readable_until_ub now " << readable_until_ub
               << dendl;
  }
}

void PeeringState::proc_lease_ack(int from, const pg_lease_ack_t& a)
{
  if (!HAVE_FEATURE(upacting_features, SERVER_OCTOPUS)) {
    return;
  }
  auto now = pl->get_mnow();
  bool was_min = false;
  for (unsigned i = 0; i < acting.size(); ++i) {
    if (from == acting[i]) {
      // the lease_ack value is based on the primary's clock
      if (a.readable_until_ub > acting_readable_until_ub[i]) {
        if (acting_readable_until_ub[i] == readable_until) {
          was_min = true;
        }
        acting_readable_until_ub[i] = a.readable_until_ub;
        break;
      }
    }
  }
  if (was_min) {
    auto old_ru = readable_until;
    recalc_readable_until();
    if (now < old_ru) {
      pl->recheck_readable();
    }
  }
}

void PeeringState::proc_renew_lease()
{
  if (!HAVE_FEATURE(upacting_features, SERVER_OCTOPUS)) {
    return;
  }
  renew_lease(pl->get_mnow());
  send_lease();
  schedule_renew_lease();
}

void PeeringState::recalc_readable_until()
{
  assert(is_primary());
  ceph::signedspan min = readable_until_ub_sent;
  for (unsigned i = 0; i < acting.size(); ++i) {
    if (acting[i] == pg_whoami.osd || acting[i] == CRUSH_ITEM_NONE) {
      continue;
    }
    dout(20) << __func__ << " peer osd." << acting[i]
             << " ruub " << acting_readable_until_ub[i] << dendl;
    if (acting_readable_until_ub[i] < min) {
      min = acting_readable_until_ub[i];
    }
  }
  readable_until = min;
  readable_until_ub = min;
  dout(20) << __func__ << " readable_until[_ub] " << readable_until
           << " (sent " << readable_until_ub_sent << ")" << dendl;
}

bool PeeringState::check_prior_readable_down_osds(const OSDMapRef& map)
{
  if (!HAVE_FEATURE(upacting_features, SERVER_OCTOPUS)) {
    return false;
  }
  bool changed = false;
  auto p = prior_readable_down_osds.begin();
  while (p != prior_readable_down_osds.end()) {
    if (map->is_dead(*p)) {
      dout(10) << __func__ << " prior_readable_down_osds osd." << *p
               << " is dead as of epoch " << map->get_epoch()
               << dendl;
      p = prior_readable_down_osds.erase(p);
      changed = true;
    } else {
      ++p;
    }
  }
  if (changed && prior_readable_down_osds.empty()) {
    psdout(10) << " empty prior_readable_down_osds, clearing ub" << dendl;
    clear_prior_readable_until_ub();
    return true;
  }
  return false;
}

bool PeeringState::adjust_need_up_thru(const OSDMapRef osdmap)
{
  epoch_t up_thru = osdmap->get_up_thru(pg_whoami.osd);
  if (need_up_thru &&
      up_thru >= info.history.same_interval_since) {
    psdout(10) << "adjust_need_up_thru now "
               << up_thru << ", need_up_thru now false" << dendl;
    need_up_thru = false;
    return true;
  }
  return false;
}

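// Build the set of OSDs to probe from past_intervals, classifying each OSD
// as UP, DNE, LOST or DOWN; a prior set with pg_down marks the PG down,
// and we also decide whether we must wait for the monitor to record our
// up_thru (need_up_thru) before proceeding.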
PastIntervals::PriorSet PeeringState::build_prior()
{
  if (1) {
    // sanity check
    for (map<pg_shard_t,pg_info_t>::iterator it = peer_info.begin();
         it != peer_info.end();
         ++it) {
      ceph_assert(info.history.last_epoch_started >=
                  it->second.history.last_epoch_started);
    }
  }

  const OSDMap &osdmap = *get_osdmap();
  PastIntervals::PriorSet prior = past_intervals.get_prior_set(
    pool.info.is_erasure(),
    info.history.last_epoch_started,
    &missing_loc.get_recoverable_predicate(),
    [&](epoch_t start, int osd, epoch_t *lost_at) {
      const osd_info_t *pinfo = 0;
      if (osdmap.exists(osd)) {
        pinfo = &osdmap.get_info(osd);
        if (lost_at)
          *lost_at = pinfo->lost_at;
      }

      if (osdmap.is_up(osd)) {
        return PastIntervals::UP;
      } else if (!pinfo) {
        return PastIntervals::DNE;
      } else if (pinfo->lost_at > start) {
        return PastIntervals::LOST;
      } else {
        return PastIntervals::DOWN;
      }
    },
    up,
    acting,
    dpp);

  if (prior.pg_down) {
    state_set(PG_STATE_DOWN);
  }

  if (get_osdmap()->get_up_thru(pg_whoami.osd) <
      info.history.same_interval_since) {
    psdout(10) << "up_thru " << get_osdmap()->get_up_thru(pg_whoami.osd)
               << " < same_since " << info.history.same_interval_since
               << ", must notify monitor" << dendl;
    need_up_thru = true;
  } else {
    psdout(10) << "up_thru " << get_osdmap()->get_up_thru(pg_whoami.osd)
               << " >= same_since " << info.history.same_interval_since
               << ", all is well" << dendl;
    need_up_thru = false;
  }
  pl->set_probe_targets(prior.probe);
  return prior;
}

bool PeeringState::needs_recovery() const
{
  ceph_assert(is_primary());

  auto &missing = pg_log.get_missing();

  if (missing.num_missing()) {
    psdout(10) << __func__ << " primary has " << missing.num_missing()
               << " missing" << dendl;
    return true;
  }

  ceph_assert(!acting_recovery_backfill.empty());
  set<pg_shard_t>::const_iterator end = acting_recovery_backfill.end();
  set<pg_shard_t>::const_iterator a = acting_recovery_backfill.begin();
  for (; a != end; ++a) {
    if (*a == get_primary()) continue;
    pg_shard_t peer = *a;
    map<pg_shard_t, pg_missing_t>::const_iterator pm = peer_missing.find(peer);
    if (pm == peer_missing.end()) {
      psdout(10) << __func__ << " osd." << peer << " doesn't have missing set"
                 << dendl;
      continue;
    }
    if (pm->second.num_missing()) {
      psdout(10) << __func__ << " osd." << peer << " has "
                 << pm->second.num_missing() << " missing" << dendl;
      return true;
    }
  }

  psdout(10) << __func__ << " is recovered" << dendl;
  return false;
}

bool PeeringState::needs_backfill() const
{
  ceph_assert(is_primary());

  // We can assume that the only OSDs that can possibly need backfill
  // are the ones in backfill_targets.
  set<pg_shard_t>::const_iterator end = backfill_targets.end();
  set<pg_shard_t>::const_iterator a = backfill_targets.begin();
  for (; a != end; ++a) {
    pg_shard_t peer = *a;
    map<pg_shard_t, pg_info_t>::const_iterator pi = peer_info.find(peer);
    if (!pi->second.last_backfill.is_max()) {
      psdout(10) << __func__ << " osd." << peer
                 << " has last_backfill " << pi->second.last_backfill << dendl;
      return true;
    }
  }

  psdout(10) << __func__ << " does not need backfill" << dendl;
  return false;
}

/*
 * Returns true unless there is a non-lost OSD in might_have_unfound.
 */
bool PeeringState::all_unfound_are_queried_or_lost(
  const OSDMapRef osdmap) const
{
  ceph_assert(is_primary());

  set<pg_shard_t>::const_iterator peer = might_have_unfound.begin();
  set<pg_shard_t>::const_iterator mend = might_have_unfound.end();
  for (; peer != mend; ++peer) {
    if (peer_missing.count(*peer))
      continue;
    map<pg_shard_t, pg_info_t>::const_iterator iter = peer_info.find(*peer);
    if (iter != peer_info.end() &&
        (iter->second.is_empty() || iter->second.dne()))
      continue;
    if (!osdmap->exists(peer->osd))
      continue;
    const osd_info_t &osd_info(osdmap->get_info(peer->osd));
    if (osd_info.lost_at <= osd_info.up_from) {
      // If there is even one OSD in might_have_unfound that isn't lost, we
      // still might retrieve our unfound.
      return false;
    }
  }
  psdout(10) << "all_unfound_are_queried_or_lost all of might_have_unfound "
             << might_have_unfound
             << " have been queried or are marked lost" << dendl;
  return true;
}


void PeeringState::reject_reservation()
{
  pl->unreserve_recovery_space();
  pl->send_cluster_message(
    primary.osd,
    new MBackfillReserve(
      MBackfillReserve::REJECT_TOOFULL,
      spg_t(info.pgid.pgid, primary.shard),
      get_osdmap_epoch()),
    get_osdmap_epoch());
}

/**
 * find_best_info
 *
 * Returns an iterator to the best info in infos sorted by:
 *  1) Prefer newer last_update
 *  2) Prefer longer tail if it brings another info into contiguity
 *  3) Prefer current primary
 */
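// Note: for pools that require rollback (erasure coded), "newer" is
// inverted below: the oldest last_update is preferred (see the
// require_rollback() branch).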
map<pg_shard_t, pg_info_t>::const_iterator PeeringState::find_best_info(
  const map<pg_shard_t, pg_info_t> &infos,
  bool restrict_to_up_acting,
  bool *history_les_bound) const
{
  ceph_assert(history_les_bound);
  /* See doc/dev/osd_internals/last_epoch_started.rst before attempting
   * to make changes to this process.  Also, make sure to update it
   * when you find bugs! */
  eversion_t min_last_update_acceptable = eversion_t::max();
  epoch_t max_last_epoch_started_found = 0;
  for (map<pg_shard_t, pg_info_t>::const_iterator i = infos.begin();
       i != infos.end();
       ++i) {
    if (!cct->_conf->osd_find_best_info_ignore_history_les &&
        max_last_epoch_started_found < i->second.history.last_epoch_started) {
      *history_les_bound = true;
      max_last_epoch_started_found = i->second.history.last_epoch_started;
    }
    if (!i->second.is_incomplete() &&
        max_last_epoch_started_found < i->second.last_epoch_started) {
      *history_les_bound = false;
      max_last_epoch_started_found = i->second.last_epoch_started;
    }
  }
  for (map<pg_shard_t, pg_info_t>::const_iterator i = infos.begin();
       i != infos.end();
       ++i) {
    if (max_last_epoch_started_found <= i->second.last_epoch_started) {
      if (min_last_update_acceptable > i->second.last_update)
        min_last_update_acceptable = i->second.last_update;
    }
  }
  if (min_last_update_acceptable == eversion_t::max())
    return infos.end();

  map<pg_shard_t, pg_info_t>::const_iterator best = infos.end();
  // find osd with newest last_update (oldest for ec_pool).
  // if there are multiples, prefer
  //  - a longer tail, if it brings another peer into log contiguity
  //  - the current primary
  for (map<pg_shard_t, pg_info_t>::const_iterator p = infos.begin();
       p != infos.end();
       ++p) {
    if (restrict_to_up_acting && !is_up(p->first) &&
        !is_acting(p->first))
      continue;
    // Only consider peers with last_update >= min_last_update_acceptable
    if (p->second.last_update < min_last_update_acceptable)
      continue;
    // Disqualify anyone with a too old last_epoch_started
    if (p->second.last_epoch_started < max_last_epoch_started_found)
      continue;
    // Disqualify anyone who is incomplete (not fully backfilled)
    if (p->second.is_incomplete())
      continue;
    if (best == infos.end()) {
      best = p;
      continue;
    }
    // Prefer newer last_update
    if (pool.info.require_rollback()) {
      if (p->second.last_update > best->second.last_update)
        continue;
      if (p->second.last_update < best->second.last_update) {
        best = p;
        continue;
      }
    } else {
      if (p->second.last_update < best->second.last_update)
        continue;
      if (p->second.last_update > best->second.last_update) {
        best = p;
        continue;
      }
    }

    // Prefer longer tail
    if (p->second.log_tail > best->second.log_tail) {
      continue;
    } else if (p->second.log_tail < best->second.log_tail) {
      best = p;
      continue;
    }

    if (!p->second.has_missing() && best->second.has_missing()) {
      psdout(10) << __func__ << " prefer osd." << p->first
                 << " because it is complete while best has missing"
                 << dendl;
      best = p;
      continue;
    } else if (p->second.has_missing() && !best->second.has_missing()) {
      psdout(10) << __func__ << " skipping osd." << p->first
                 << " because it has missing while best is complete"
                 << dendl;
      continue;
    } else {
      // both are complete or have missing
      // fall through
    }

    // prefer current primary (usually the caller), all things being equal
    if (p->first == pg_whoami) {
      psdout(10) << "calc_acting prefer osd." << p->first
                 << " because it is current primary" << dendl;
      best = p;
      continue;
    }
  }
  return best;
}

void PeeringState::calc_ec_acting(
  map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
  unsigned size,
  const vector<int> &acting,
  const vector<int> &up,
  const map<pg_shard_t, pg_info_t> &all_info,
  bool restrict_to_up_acting,
  vector<int> *_want,
  set<pg_shard_t> *backfill,
  set<pg_shard_t> *acting_backfill,
  ostream &ss)
{
  vector<int> want(size, CRUSH_ITEM_NONE);
  map<shard_id_t, set<pg_shard_t> > all_info_by_shard;
  for (map<pg_shard_t, pg_info_t>::const_iterator i = all_info.begin();
       i != all_info.end();
       ++i) {
    all_info_by_shard[i->first.shard].insert(i->first);
  }
  for (uint8_t i = 0; i < want.size(); ++i) {
    ss << "For position " << (unsigned)i << ": ";
    if (up.size() > (unsigned)i && up[i] != CRUSH_ITEM_NONE &&
        !all_info.find(pg_shard_t(up[i], shard_id_t(i)))->second.is_incomplete() &&
        all_info.find(pg_shard_t(up[i], shard_id_t(i)))->second.last_update >=
        auth_log_shard->second.log_tail) {
      ss << " selecting up[i]: " << pg_shard_t(up[i], shard_id_t(i)) << std::endl;
      want[i] = up[i];
      continue;
    }
    if (up.size() > (unsigned)i && up[i] != CRUSH_ITEM_NONE) {
      ss << " backfilling up[i]: " << pg_shard_t(up[i], shard_id_t(i))
         << " and ";
      backfill->insert(pg_shard_t(up[i], shard_id_t(i)));
    }

    if (acting.size() > (unsigned)i && acting[i] != CRUSH_ITEM_NONE &&
        !all_info.find(pg_shard_t(acting[i], shard_id_t(i)))->second.is_incomplete() &&
        all_info.find(pg_shard_t(acting[i], shard_id_t(i)))->second.last_update >=
        auth_log_shard->second.log_tail) {
      ss << " selecting acting[i]: " << pg_shard_t(acting[i], shard_id_t(i)) << std::endl;
      want[i] = acting[i];
    } else if (!restrict_to_up_acting) {
      for (set<pg_shard_t>::iterator j = all_info_by_shard[shard_id_t(i)].begin();
           j != all_info_by_shard[shard_id_t(i)].end();
           ++j) {
        ceph_assert(j->shard == i);
        if (!all_info.find(*j)->second.is_incomplete() &&
            all_info.find(*j)->second.last_update >=
            auth_log_shard->second.log_tail) {
          ss << " selecting stray: " << *j << std::endl;
          want[i] = j->osd;
          break;
        }
      }
      if (want[i] == CRUSH_ITEM_NONE)
        ss << " failed to fill position " << (int)i << std::endl;
    }
  }

  for (uint8_t i = 0; i < want.size(); ++i) {
    if (want[i] != CRUSH_ITEM_NONE) {
      acting_backfill->insert(pg_shard_t(want[i], shard_id_t(i)));
    }
  }
  acting_backfill->insert(backfill->begin(), backfill->end());
  _want->swap(want);
}

/**
 * calculate the desired acting set.
 *
 * Choose an appropriate acting set.  Prefer up[0], unless it is
 * incomplete, or another osd has a longer tail that allows us to
 * bring other up nodes up to date.
 */
1629 void PeeringState::calc_replicated_acting(
1630 map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
1631 uint64_t force_auth_primary_missing_objects,
1632 unsigned size,
1633 const vector<int> &acting,
1634 const vector<int> &up,
1635 pg_shard_t up_primary,
1636 const map<pg_shard_t, pg_info_t> &all_info,
1637 bool restrict_to_up_acting,
1638 vector<int> *want,
1639 set<pg_shard_t> *backfill,
1640 set<pg_shard_t> *acting_backfill,
1641 const OSDMapRef osdmap,
1642 ostream &ss)
1643 {
1644 pg_shard_t auth_log_shard_id = auth_log_shard->first;
1645
1646 ss << __func__ << " newest update on osd." << auth_log_shard_id
1647 << " with " << auth_log_shard->second
1648 << (restrict_to_up_acting ? " restrict_to_up_acting" : "") << std::endl;
1649
1650 // select primary
1651 auto primary = all_info.find(up_primary);
1652 if (up.size() &&
1653 !primary->second.is_incomplete() &&
1654 primary->second.last_update >=
1655 auth_log_shard->second.log_tail) {
1656 if (HAVE_FEATURE(osdmap->get_up_osd_features(), SERVER_NAUTILUS)) {
1657 auto approx_missing_objects =
1658 primary->second.stats.stats.sum.num_objects_missing;
1659 auto auth_version = auth_log_shard->second.last_update.version;
1660 auto primary_version = primary->second.last_update.version;
1661 if (auth_version > primary_version) {
1662 approx_missing_objects += auth_version - primary_version;
1663 } else {
1664 approx_missing_objects += primary_version - auth_version;
1665 }
1666 if ((uint64_t)approx_missing_objects >
1667 force_auth_primary_missing_objects) {
1668 primary = auth_log_shard;
1669 ss << "up_primary: " << up_primary << ") has approximate "
1670 << approx_missing_objects
1671 << "(>" << force_auth_primary_missing_objects <<") "
1672 << "missing objects, osd." << auth_log_shard_id
1673 << " selected as primary instead"
1674 << std::endl;
1675 } else {
1676 ss << "up_primary: " << up_primary << ") selected as primary"
1677 << std::endl;
1678 }
1679 } else {
1680 ss << "up_primary: " << up_primary << ") selected as primary" << std::endl;
1681 }
1682 } else {
1683 ceph_assert(!auth_log_shard->second.is_incomplete());
1684 ss << "up[0] needs backfill, osd." << auth_log_shard_id
1685 << " selected as primary instead" << std::endl;
1686 primary = auth_log_shard;
1687 }
1688
1689 ss << __func__ << " primary is osd." << primary->first
1690 << " with " << primary->second << std::endl;
1691 want->push_back(primary->first.osd);
1692 acting_backfill->insert(primary->first);
1693
1694 /* We include auth_log_shard->second.log_tail because in GetLog,
1695 * we will request logs back to the min last_update over our
1696 * acting_backfill set, which will result in our log being extended
1697 * as far backwards as necessary to pick up any peers which can
1698 * be log recovered by auth_log_shard's log */
1699 eversion_t oldest_auth_log_entry =
1700 std::min(primary->second.log_tail, auth_log_shard->second.log_tail);
1701
1702 // select replicas that have log contiguity with primary.
1703 // prefer up, then acting, then any peer_info osds
1704 for (auto i : up) {
1705 pg_shard_t up_cand = pg_shard_t(i, shard_id_t::NO_SHARD);
1706 if (up_cand == primary->first)
1707 continue;
1708 const pg_info_t &cur_info = all_info.find(up_cand)->second;
1709 if (cur_info.is_incomplete() ||
1710 cur_info.last_update < oldest_auth_log_entry) {
1711 ss << " shard " << up_cand << " (up) backfill " << cur_info << std::endl;
1712 backfill->insert(up_cand);
1713 acting_backfill->insert(up_cand);
1714 } else {
1715 want->push_back(i);
1716 acting_backfill->insert(up_cand);
1717 ss << " osd." << i << " (up) accepted " << cur_info << std::endl;
1718 }
1719 }
1720
1721 if (want->size() >= size) {
1722 return;
1723 }
1724
1725 std::vector<std::pair<eversion_t, int>> candidate_by_last_update;
1726 candidate_by_last_update.reserve(acting.size());
1727 // This no longer has backfill OSDs, but they are covered above.
1728 for (auto i : acting) {
1729 pg_shard_t acting_cand(i, shard_id_t::NO_SHARD);
1730 // skip up osds we already considered above
1731 if (acting_cand == primary->first)
1732 continue;
1733 vector<int>::const_iterator up_it = find(up.begin(), up.end(), i);
1734 if (up_it != up.end())
1735 continue;
1736
1737 const pg_info_t &cur_info = all_info.find(acting_cand)->second;
1738 if (cur_info.is_incomplete() ||
1739 cur_info.last_update < oldest_auth_log_entry) {
1740 ss << " shard " << acting_cand << " (acting) REJECTED "
1741 << cur_info << std::endl;
1742 } else {
1743 candidate_by_last_update.emplace_back(cur_info.last_update, i);
1744 }
1745 }
1746
1747 auto sort_by_eversion =[](const std::pair<eversion_t, int> &lhs,
1748 const std::pair<eversion_t, int> &rhs) {
1749 return lhs.first > rhs.first;
1750 };
1751 // sort by last_update, in descending order.
1752 std::sort(candidate_by_last_update.begin(),
1753 candidate_by_last_update.end(), sort_by_eversion);
1754 for (auto &p: candidate_by_last_update) {
1755 ceph_assert(want->size() < size);
1756 want->push_back(p.second);
1757 pg_shard_t s = pg_shard_t(p.second, shard_id_t::NO_SHARD);
1758 acting_backfill->insert(s);
1759 ss << " shard " << s << " (acting) accepted "
1760 << all_info.find(s)->second << std::endl;
1761 if (want->size() >= size) {
1762 return;
1763 }
1764 }
1765
1766 if (restrict_to_up_acting) {
1767 return;
1768 }
1769 candidate_by_last_update.clear();
1770 candidate_by_last_update.reserve(all_info.size()); // overestimate but fine
1771 // continue to search stray to find more suitable peers
1772 for (auto &i : all_info) {
1773 // skip up osds we already considered above
1774 if (i.first == primary->first)
1775 continue;
1776 vector<int>::const_iterator up_it = find(up.begin(), up.end(), i.first.osd);
1777 if (up_it != up.end())
1778 continue;
1779 vector<int>::const_iterator acting_it = find(
1780 acting.begin(), acting.end(), i.first.osd);
1781 if (acting_it != acting.end())
1782 continue;
1783
1784 if (i.second.is_incomplete() ||
1785 i.second.last_update < oldest_auth_log_entry) {
1786 ss << " shard " << i.first << " (stray) REJECTED " << i.second
1787 << std::endl;
1788 } else {
1789 candidate_by_last_update.emplace_back(
1790 i.second.last_update, i.first.osd);
1791 }
1792 }
1793
1794 if (candidate_by_last_update.empty()) {
1795 // save us some effort
1796 return;
1797 }
1798
1799 // sort by last_update, in descending order.
1800 std::sort(candidate_by_last_update.begin(),
1801 candidate_by_last_update.end(), sort_by_eversion);
1802
1803 for (auto &p: candidate_by_last_update) {
1804 ceph_assert(want->size() < size);
1805 want->push_back(p.second);
1806 pg_shard_t s = pg_shard_t(p.second, shard_id_t::NO_SHARD);
1807 acting_backfill->insert(s);
1808 ss << " shard " << s << " (stray) accepted "
1809 << all_info.find(s)->second << std::endl;
1810 if (want->size() >= size) {
1811 return;
1812 }
1813 }
1814 }
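/* Editor's note: a worked example of the selection order above, using
 * assumed values (not taken from a real cluster). With size=3,
 * primary=osd.0 (already in want), up=[0,1], acting=[0,2], osd.1
 * incomplete and osd.2 log-contiguous:
 *   - up pass:     osd.1 -> backfill + acting_backfill (incomplete)
 *   - acting pass: osd.2 -> accepted, want=[0,2]
 *   - want still < size, so unless restrict_to_up_acting is set the
 *     stray pass may add one more peer from all_info, again preferring
 *     the highest last_update among log-contiguous candidates.
 */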
1815
1816 bool PeeringState::recoverable(const vector<int> &want) const
1817 {
1818 unsigned num_want_acting = 0;
1819 set<pg_shard_t> have;
1820 for (int i = 0; i < (int)want.size(); ++i) {
1821 if (want[i] != CRUSH_ITEM_NONE) {
1822 ++num_want_acting;
1823 have.insert(
1824 pg_shard_t(
1825 want[i],
1826 pool.info.is_erasure() ? shard_id_t(i) : shard_id_t::NO_SHARD));
1827 }
1828 }
1829
1830 if (num_want_acting < pool.info.min_size) {
1831 const bool recovery_ec_pool_below_min_size =
1832 HAVE_FEATURE(get_osdmap()->get_up_osd_features(), SERVER_OCTOPUS);
1833
1834 if (pool.info.is_erasure() && !recovery_ec_pool_below_min_size) {
1835 psdout(10) << __func__ << " failed, ec recovery below min size not supported by pre-octopus" << dendl;
1836 return false;
1837 } else if (!cct->_conf.get_val<bool>("osd_allow_recovery_below_min_size")) {
1838 psdout(10) << __func__ << " failed, recovery below min size not enabled" << dendl;
1839 return false;
1840 }
1841 }
1842 if (missing_loc.get_recoverable_predicate()(have)) {
1843 return true;
1844 } else {
1845 psdout(10) << __func__ << " failed, not recoverable " << dendl;
1846 return false;
1847 }
1848 }
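/* Editor's note: a minimal sketch of the min_size gate above, with
 * assumed numbers. For a replicated pool with min_size=2, want=[3,7]
 * gives num_want_acting=2 >= min_size, so only the recoverable
 * predicate is consulted. With want=[3] (1 < min_size), recovery also
 * requires osd_allow_recovery_below_min_size=true, and for an
 * erasure-coded pool every up OSD must additionally advertise
 * SERVER_OCTOPUS before recovery below min_size is even attempted.
 */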
1849
1850 void PeeringState::choose_async_recovery_ec(
1851 const map<pg_shard_t, pg_info_t> &all_info,
1852 const pg_info_t &auth_info,
1853 vector<int> *want,
1854 set<pg_shard_t> *async_recovery,
1855 const OSDMapRef osdmap) const
1856 {
1857 set<pair<int, pg_shard_t> > candidates_by_cost;
1858 for (uint8_t i = 0; i < want->size(); ++i) {
1859 if ((*want)[i] == CRUSH_ITEM_NONE)
1860 continue;
1861
1862 // Considering log entries to recover is accurate enough for
1863 // now. We could use minimum_to_decode_with_cost() later if
1864 // necessary.
1865 pg_shard_t shard_i((*want)[i], shard_id_t(i));
1866 // do not include strays
1867 if (stray_set.find(shard_i) != stray_set.end())
1868 continue;
1869 // Do not include an osd that is not up, since choosing it as
1870 // an async_recovery_target will move it out of the acting set.
1871 // This results in it being identified as a stray during peering,
1872 // because it is no longer in the up or acting set.
1873 if (!is_up(shard_i))
1874 continue;
1875 auto shard_info = all_info.find(shard_i)->second;
1876 // for ec pools we rollback all entries past the authoritative
1877 // last_update *before* activation. This is relatively inexpensive
1878 // compared to recovery, since it is purely local, so treat shards
1879 // past the authoritative last_update the same as those equal to it.
1880 version_t auth_version = auth_info.last_update.version;
1881 version_t candidate_version = shard_info.last_update.version;
1882 if (HAVE_FEATURE(osdmap->get_up_osd_features(), SERVER_NAUTILUS)) {
1883 auto approx_missing_objects =
1884 shard_info.stats.stats.sum.num_objects_missing;
1885 if (auth_version > candidate_version) {
1886 approx_missing_objects += auth_version - candidate_version;
1887 }
1888 if (static_cast<uint64_t>(approx_missing_objects) >
1889 cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) {
1890 candidates_by_cost.emplace(approx_missing_objects, shard_i);
1891 }
1892 } else {
1893 if (auth_version > candidate_version &&
1894 (auth_version - candidate_version) > cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) {
1895 candidates_by_cost.insert(make_pair(auth_version - candidate_version, shard_i));
1896 }
1897 }
1898 }
1899
1900 psdout(20) << __func__ << " candidates by cost are: " << candidates_by_cost
1901 << dendl;
1902
1903 // take out as many osds as we can for async recovery, in order of cost
1904 for (auto rit = candidates_by_cost.rbegin();
1905 rit != candidates_by_cost.rend(); ++rit) {
1906 pg_shard_t cur_shard = rit->second;
1907 vector<int> candidate_want(*want);
1908 candidate_want[cur_shard.shard.id] = CRUSH_ITEM_NONE;
1909 if (recoverable(candidate_want)) {
1910 want->swap(candidate_want);
1911 async_recovery->insert(cur_shard);
1912 }
1913 }
1914 psdout(20) << __func__ << " result want=" << *want
1915 << " async_recovery=" << *async_recovery << dendl;
1916 }
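/* Editor's note: an illustrative cost computation for the nautilus+
 * branch above, with assumed values. If the authoritative last_update
 * is 5'120 and a candidate shard sits at 5'100 with
 * num_objects_missing=30, then approx_missing_objects = 30 +
 * (120 - 100) = 50. The shard becomes an async-recovery candidate
 * only if 50 exceeds osd_async_recovery_min_cost, and it is actually
 * dropped from want (its slot set to CRUSH_ITEM_NONE) only if the
 * remaining set still passes recoverable().
 */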
1917
1918 void PeeringState::choose_async_recovery_replicated(
1919 const map<pg_shard_t, pg_info_t> &all_info,
1920 const pg_info_t &auth_info,
1921 vector<int> *want,
1922 set<pg_shard_t> *async_recovery,
1923 const OSDMapRef osdmap) const
1924 {
1925 set<pair<int, pg_shard_t> > candidates_by_cost;
1926 for (auto osd_num : *want) {
1927 pg_shard_t shard_i(osd_num, shard_id_t::NO_SHARD);
1928 // do not include strays
1929 if (stray_set.find(shard_i) != stray_set.end())
1930 continue;
1931 // Do not include an osd that is not up, since choosing it as
1932 // an async_recovery_target will move it out of the acting set.
1933 // This results in it being identified as a stray during peering,
1934 // because it is no longer in the up or acting set.
1935 if (!is_up(shard_i))
1936 continue;
1937 auto shard_info = all_info.find(shard_i)->second;
1938 // use the approximate magnitude of the difference in length of
1939 // logs plus historical missing objects as the cost of recovery
1940 version_t auth_version = auth_info.last_update.version;
1941 version_t candidate_version = shard_info.last_update.version;
1942 if (HAVE_FEATURE(osdmap->get_up_osd_features(), SERVER_NAUTILUS)) {
1943 auto approx_missing_objects =
1944 shard_info.stats.stats.sum.num_objects_missing;
1945 if (auth_version > candidate_version) {
1946 approx_missing_objects += auth_version - candidate_version;
1947 } else {
1948 approx_missing_objects += candidate_version - auth_version;
1949 }
1950 if (static_cast<uint64_t>(approx_missing_objects) >
1951 cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) {
1952 candidates_by_cost.emplace(approx_missing_objects, shard_i);
1953 }
1954 } else {
1955 size_t approx_entries;
1956 if (auth_version > candidate_version) {
1957 approx_entries = auth_version - candidate_version;
1958 } else {
1959 approx_entries = candidate_version - auth_version;
1960 }
1961 if (approx_entries > cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) {
1962 candidates_by_cost.insert(make_pair(approx_entries, shard_i));
1963 }
1964 }
1965 }
1966
1967 psdout(20) << __func__ << " candidates by cost are: " << candidates_by_cost
1968 << dendl;
1969 // take out as many osds as we can for async recovery, in order of cost
1970 for (auto rit = candidates_by_cost.rbegin();
1971 rit != candidates_by_cost.rend(); ++rit) {
1972 if (want->size() <= pool.info.min_size) {
1973 break;
1974 }
1975 pg_shard_t cur_shard = rit->second;
1976 vector<int> candidate_want(*want);
1977 for (auto it = candidate_want.begin(); it != candidate_want.end(); ++it) {
1978 if (*it == cur_shard.osd) {
1979 candidate_want.erase(it);
1980 want->swap(candidate_want);
1981 async_recovery->insert(cur_shard);
1982 break;
1983 }
1984 }
1985 }
1986 psdout(20) << __func__ << " result want=" << *want
1987 << " async_recovery=" << *async_recovery << dendl;
1988 }
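/* Editor's note on the two variants: the EC path blanks a shard's
 * slot with CRUSH_ITEM_NONE so the positional shard ids in want stay
 * intact, while the replicated path erases the osd outright and
 * breaks out once want would shrink to pool.info.min_size. The
 * replicated cost is also symmetric (|auth - candidate| in log
 * versions), since a replica ahead of the authoritative head has
 * divergent entries to deal with as well.
 */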
1989
1990
1991
1992 /**
1993 * choose acting
1994 *
1995 * calculate the desired acting, and request a change with the monitor
1996 * if it differs from the current acting.
1997 *
1998 * if restrict_to_up_acting=true, we filter out anything that's not in
1999 * up/acting. in order to lift this restriction, we need to
2000 * 1) check whether it's worth switching the acting set any time we get
2001 * a new pg info (not just here, when recovery finishes)
2002 * 2) check whether anything in want_acting went down on each new map
2003 * (and, if so, calculate a new want_acting)
2004 * 3) remove the assertion in PG::PeeringState::Active::react(const AdvMap)
2005 * TODO!
2006 */
2007 bool PeeringState::choose_acting(pg_shard_t &auth_log_shard_id,
2008 bool restrict_to_up_acting,
2009 bool *history_les_bound,
2010 bool request_pg_temp_change_only)
2011 {
2012 map<pg_shard_t, pg_info_t> all_info(peer_info.begin(), peer_info.end());
2013 all_info[pg_whoami] = info;
2014
2015 if (cct->_conf->subsys.should_gather<dout_subsys, 10>()) {
2016 for (map<pg_shard_t, pg_info_t>::iterator p = all_info.begin();
2017 p != all_info.end();
2018 ++p) {
2019 psdout(10) << __func__ << " all_info osd." << p->first << " "
2020 << p->second << dendl;
2021 }
2022 }
2023
2024 map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard =
2025 find_best_info(all_info, restrict_to_up_acting, history_les_bound);
2026
2027 if (auth_log_shard == all_info.end()) {
2028 if (up != acting) {
2029 psdout(10) << __func__ << " no suitable info found (incomplete backfills?),"
2030 << " reverting to up" << dendl;
2031 want_acting = up;
2032 vector<int> empty;
2033 pl->queue_want_pg_temp(empty);
2034 } else {
2035 psdout(10) << __func__ << " failed" << dendl;
2036 ceph_assert(want_acting.empty());
2037 }
2038 return false;
2039 }
2040
2041 ceph_assert(!auth_log_shard->second.is_incomplete());
2042 auth_log_shard_id = auth_log_shard->first;
2043
2044 set<pg_shard_t> want_backfill, want_acting_backfill;
2045 vector<int> want;
2046 stringstream ss;
2047 if (pool.info.is_replicated())
2048 calc_replicated_acting(
2049 auth_log_shard,
2050 cct->_conf.get_val<uint64_t>(
2051 "osd_force_auth_primary_missing_objects"),
2052 get_osdmap()->get_pg_size(info.pgid.pgid),
2053 acting,
2054 up,
2055 up_primary,
2056 all_info,
2057 restrict_to_up_acting,
2058 &want,
2059 &want_backfill,
2060 &want_acting_backfill,
2061 get_osdmap(),
2062 ss);
2063 else
2064 calc_ec_acting(
2065 auth_log_shard,
2066 get_osdmap()->get_pg_size(info.pgid.pgid),
2067 acting,
2068 up,
2069 all_info,
2070 restrict_to_up_acting,
2071 &want,
2072 &want_backfill,
2073 &want_acting_backfill,
2074 ss);
2075 psdout(10) << ss.str() << dendl;
2076
2077 if (!recoverable(want)) {
2078 want_acting.clear();
2079 return false;
2080 }
2081
2082 set<pg_shard_t> want_async_recovery;
2083 if (HAVE_FEATURE(get_osdmap()->get_up_osd_features(), SERVER_MIMIC)) {
2084 if (pool.info.is_erasure()) {
2085 choose_async_recovery_ec(
2086 all_info, auth_log_shard->second, &want, &want_async_recovery,
2087 get_osdmap());
2088 } else {
2089 choose_async_recovery_replicated(
2090 all_info, auth_log_shard->second, &want, &want_async_recovery,
2091 get_osdmap());
2092 }
2093 }
2094 while (want.size() > pool.info.size) {
2095 // async recovery should have taken out as many osds as it can.
2096 // if not, then always evict the last peer
2097 // (will get synchronously recovered later)
2098 psdout(10) << __func__ << " evicting osd." << want.back()
2099 << " from oversized want " << want << dendl;
2100 want.pop_back();
2101 }
2102 if (want != acting) {
2103 psdout(10) << __func__ << " want " << want << " != acting " << acting
2104 << ", requesting pg_temp change" << dendl;
2105 want_acting = want;
2106
2107 if (!cct->_conf->osd_debug_no_acting_change) {
2108 if (want_acting == up) {
2109 // There can't be any pending backfill if
2110 // want is the same as the CRUSH map's up OSDs.
2111 ceph_assert(want_backfill.empty());
2112 vector<int> empty;
2113 pl->queue_want_pg_temp(empty);
2114 } else
2115 pl->queue_want_pg_temp(want);
2116 }
2117 return false;
2118 }
2119 if (request_pg_temp_change_only)
2120 return true;
2121 want_acting.clear();
2122 acting_recovery_backfill = want_acting_backfill;
2123 psdout(10) << "acting_recovery_backfill is "
2124 << acting_recovery_backfill << dendl;
2125 ceph_assert(
2126 backfill_targets.empty() ||
2127 backfill_targets == want_backfill);
2128 if (backfill_targets.empty()) {
2129 // Caller is GetInfo
2130 backfill_targets = want_backfill;
2131 }
2132 // Adding !needs_recovery() to let the async_recovery_targets reset after recovery is complete
2133 ceph_assert(
2134 async_recovery_targets.empty() ||
2135 async_recovery_targets == want_async_recovery ||
2136 !needs_recovery());
2137 if (async_recovery_targets.empty() || !needs_recovery()) {
2138 async_recovery_targets = want_async_recovery;
2139 }
2140 // Will not change if already set because up would have had to change
2141 // Verify that nothing in backfill is in stray_set
2142 for (set<pg_shard_t>::iterator i = want_backfill.begin();
2143 i != want_backfill.end();
2144 ++i) {
2145 ceph_assert(stray_set.find(*i) == stray_set.end());
2146 }
2147 psdout(10) << "choose_acting want=" << want << " backfill_targets="
2148 << want_backfill << " async_recovery_targets="
2149 << async_recovery_targets << dendl;
2150 return true;
2151 }
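/* Editor's note: a rough decision table for choose_acting(), as the
 * editor reads the code above (a summary, not authoritative):
 *   - no auth log shard found -> revert want to up (or nothing),
 *                                return false
 *   - want not recoverable()  -> clear want_acting, return false
 *   - want != acting          -> queue a pg_temp change, return false
 *   - want == acting          -> commit backfill/async targets (or
 *                                return early if
 *                                request_pg_temp_change_only),
 *                                return true
 * A false return therefore means "wait for a new map or pg_temp",
 * not a hard failure.
 */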
2152
2153 void PeeringState::log_weirdness()
2154 {
2155 if (pg_log.get_tail() != info.log_tail)
2156 pl->get_clog_error() << info.pgid
2157 << " info mismatch, log.tail " << pg_log.get_tail()
2158 << " != info.log_tail " << info.log_tail;
2159 if (pg_log.get_head() != info.last_update)
2160 pl->get_clog_error() << info.pgid
2161 << " info mismatch, log.head " << pg_log.get_head()
2162 << " != info.last_update " << info.last_update;
2163
2164 if (!pg_log.get_log().empty()) {
2165 // sloppy check
2166 if (pg_log.get_log().log.begin()->version <= pg_log.get_tail())
2167 pl->get_clog_error() << info.pgid
2168 << " log bound mismatch, info (tail,head] ("
2169 << pg_log.get_tail() << ","
2170 << pg_log.get_head() << "]"
2171 << " actual ["
2172 << pg_log.get_log().log.begin()->version << ","
2173 << pg_log.get_log().log.rbegin()->version << "]";
2174 }
2175
2176 if (pg_log.get_log().caller_ops.size() > pg_log.get_log().log.size()) {
2177 pl->get_clog_error() << info.pgid
2178 << " caller_ops.size "
2179 << pg_log.get_log().caller_ops.size()
2180 << " > log size " << pg_log.get_log().log.size();
2181 }
2182 }
2183
2184 /*
2185 * Process information from a replica to determine if it could have any
2186 * objects that i need.
2187 *
2188 * TODO: if the missing set becomes very large, this could get expensive.
2189 * Instead, we probably want to just iterate over our unfound set.
2190 */
2191 bool PeeringState::search_for_missing(
2192 const pg_info_t &oinfo, const pg_missing_t &omissing,
2193 pg_shard_t from,
2194 PeeringCtxWrapper &ctx)
2195 {
2196 uint64_t num_unfound_before = missing_loc.num_unfound();
2197 bool found_missing = missing_loc.add_source_info(
2198 from, oinfo, omissing, ctx.handle);
2199 if (found_missing && num_unfound_before != missing_loc.num_unfound())
2200 pl->publish_stats_to_osd();
2201 // avoid doing this if the peer is empty. This is a bit of paranoia
2202 // to avoid doing something rash if add_source_info() above
2203 // incorrectly decided we found something new. (if the peer has
2204 // last_update=0'0 that's impossible.)
2205 if (found_missing &&
2206 oinfo.last_update != eversion_t()) {
2207 pg_info_t tinfo(oinfo);
2208 tinfo.pgid.shard = pg_whoami.shard;
2209 ctx.send_info(
2210 from.osd,
2211 spg_t(info.pgid.pgid, from.shard),
2212 get_osdmap_epoch(), // fixme: use lower epoch?
2213 get_osdmap_epoch(),
2214 tinfo);
2215 }
2216 return found_missing;
2217 }
2218
2219 bool PeeringState::discover_all_missing(
2220 BufferedRecoveryMessages &rctx)
2221 {
2222 auto &missing = pg_log.get_missing();
2223 uint64_t unfound = get_num_unfound();
2224 bool any = false; // did we start any queries
2225
2226 psdout(10) << __func__ << " "
2227 << missing.num_missing() << " missing, "
2228 << unfound << " unfound"
2229 << dendl;
2230
2231 std::set<pg_shard_t>::const_iterator m = might_have_unfound.begin();
2232 std::set<pg_shard_t>::const_iterator mend = might_have_unfound.end();
2233 for (; m != mend; ++m) {
2234 pg_shard_t peer(*m);
2235
2236 if (!get_osdmap()->is_up(peer.osd)) {
2237 psdout(20) << __func__ << " skipping down osd." << peer << dendl;
2238 continue;
2239 }
2240
2241 if (peer_purged.count(peer)) {
2242 psdout(20) << __func__ << " skipping purged osd." << peer << dendl;
2243 continue;
2244 }
2245
2246 map<pg_shard_t, pg_info_t>::const_iterator iter = peer_info.find(peer);
2247 if (iter != peer_info.end() &&
2248 (iter->second.is_empty() || iter->second.dne())) {
2249 // ignore empty peers
2250 continue;
2251 }
2252
2253 // If we've requested any of this stuff, the pg_missing_t information
2254 // should be on its way.
2255 // TODO: coalesce requested_* into a single data structure
2256 if (peer_missing.find(peer) != peer_missing.end()) {
2257 psdout(20) << __func__ << ": osd." << peer
2258 << ": we already have pg_missing_t" << dendl;
2259 continue;
2260 }
2261 if (peer_log_requested.find(peer) != peer_log_requested.end()) {
2262 psdout(20) << __func__ << ": osd." << peer
2263 << ": in peer_log_requested" << dendl;
2264 continue;
2265 }
2266 if (peer_missing_requested.find(peer) != peer_missing_requested.end()) {
2267 psdout(20) << __func__ << ": osd." << peer
2268 << ": in peer_missing_requested" << dendl;
2269 continue;
2270 }
2271
2272 // Request missing
2273 psdout(10) << __func__ << ": osd." << peer << ": requesting pg_missing_t"
2274 << dendl;
2275 peer_missing_requested.insert(peer);
2276 rctx.send_query(
2277 peer.osd,
2278 spg_t(info.pgid.pgid, peer.shard),
2279 pg_query_t(
2280 pg_query_t::FULLLOG,
2281 peer.shard, pg_whoami.shard,
2282 info.history, get_osdmap_epoch()));
2283 any = true;
2284 }
2285 return any;
2286 }
2287
2288 /* Build the might_have_unfound set.
2289 *
2290 * This is used by the primary OSD during recovery.
2291 *
2292 * This set tracks the OSDs which might have unfound objects that the primary
2293 * OSD needs. As we receive pg_missing_t from each OSD in might_have_unfound, we
2294 * will remove the OSD from the set.
2295 */
2296 void PeeringState::build_might_have_unfound()
2297 {
2298 ceph_assert(might_have_unfound.empty());
2299 ceph_assert(is_primary());
2300
2301 psdout(10) << __func__ << dendl;
2302
2303 check_past_interval_bounds();
2304
2305 might_have_unfound = past_intervals.get_might_have_unfound(
2306 pg_whoami,
2307 pool.info.is_erasure());
2308
2309 // include any (stray) peers
2310 for (map<pg_shard_t, pg_info_t>::iterator p = peer_info.begin();
2311 p != peer_info.end();
2312 ++p)
2313 might_have_unfound.insert(p->first);
2314
2315 psdout(15) << __func__ << ": built " << might_have_unfound << dendl;
2316 }
2317
2318 void PeeringState::activate(
2319 ObjectStore::Transaction& t,
2320 epoch_t activation_epoch,
2321 PeeringCtxWrapper &ctx)
2322 {
2323 ceph_assert(!is_peered());
2324
2325 // twiddle pg state
2326 state_clear(PG_STATE_DOWN);
2327
2328 send_notify = false;
2329
2330 if (is_primary()) {
2331 // only update primary last_epoch_started if we will go active
2332 if (acting.size() >= pool.info.min_size) {
2333 ceph_assert(cct->_conf->osd_find_best_info_ignore_history_les ||
2334 info.last_epoch_started <= activation_epoch);
2335 info.last_epoch_started = activation_epoch;
2336 info.last_interval_started = info.history.same_interval_since;
2337 }
2338 } else if (is_acting(pg_whoami)) {
2339 /* update last_epoch_started on acting replica to whatever the primary sent
2340 * unless it's smaller (could happen if we are going peered rather than
2341 * active, see doc/dev/osd_internals/last_epoch_started.rst) */
2342 if (info.last_epoch_started < activation_epoch) {
2343 info.last_epoch_started = activation_epoch;
2344 info.last_interval_started = info.history.same_interval_since;
2345 }
2346 }
2347
2348 auto &missing = pg_log.get_missing();
2349
2350 min_last_complete_ondisk = eversion_t(0,0); // we don't know (yet)!
2351 if (is_primary()) {
2352 last_update_ondisk = info.last_update;
2353 }
2354 last_update_applied = info.last_update;
2355 last_rollback_info_trimmed_to_applied = pg_log.get_can_rollback_to();
2356
2357 need_up_thru = false;
2358
2359 // write pg info, log
2360 dirty_info = true;
2361 dirty_big_info = true; // maybe
2362
2363 pl->schedule_event_on_commit(
2364 t,
2365 std::make_shared<PGPeeringEvent>(
2366 get_osdmap_epoch(),
2367 get_osdmap_epoch(),
2368 ActivateCommitted(
2369 get_osdmap_epoch(),
2370 activation_epoch)));
2371
2372 // init complete pointer
2373 if (missing.num_missing() == 0) {
2374 psdout(10) << "activate - no missing, moving last_complete " << info.last_complete
2375 << " -> " << info.last_update << dendl;
2376 info.last_complete = info.last_update;
2377 info.stats.stats.sum.num_objects_missing = 0;
2378 pg_log.reset_recovery_pointers();
2379 } else {
2380 psdout(10) << "activate - not complete, " << missing << dendl;
2381 info.stats.stats.sum.num_objects_missing = missing.num_missing();
2382 pg_log.activate_not_complete(info);
2383 }
2384
2385 log_weirdness();
2386
2387 if (is_primary()) {
2388 // initialize snap_trimq
2389 interval_set<snapid_t> to_trim;
2390 auto& removed_snaps_queue = get_osdmap()->get_removed_snaps_queue();
2391 auto p = removed_snaps_queue.find(info.pgid.pgid.pool());
2392 if (p != removed_snaps_queue.end()) {
2393 dout(20) << "activate - purged_snaps " << info.purged_snaps
2394 << " removed_snaps " << p->second
2395 << dendl;
2396 for (auto q : p->second) {
2397 to_trim.insert(q.first, q.second);
2398 }
2399 }
2400 interval_set<snapid_t> purged;
2401 purged.intersection_of(to_trim, info.purged_snaps);
2402 to_trim.subtract(purged);
2403
2404 if (HAVE_FEATURE(upacting_features, SERVER_OCTOPUS)) {
2405 renew_lease(pl->get_mnow());
2406 // do not schedule until we are actually activated
2407 }
2408
2409 // adjust purged_snaps: PG may have been inactive while snaps were pruned
2410 // from the removed_snaps_queue in the osdmap. update local purged_snaps
2411 // to reflect only those snaps that we thought were pruned and were still in
2412 // the queue.
2413 info.purged_snaps.swap(purged);
2414
2415 // start up replicas
2416 info.history.refresh_prior_readable_until_ub(pl->get_mnow(),
2417 prior_readable_until_ub);
2418
2419 ceph_assert(!acting_recovery_backfill.empty());
2420 for (set<pg_shard_t>::iterator i = acting_recovery_backfill.begin();
2421 i != acting_recovery_backfill.end();
2422 ++i) {
2423 if (*i == pg_whoami) continue;
2424 pg_shard_t peer = *i;
2425 ceph_assert(peer_info.count(peer));
2426 pg_info_t& pi = peer_info[peer];
2427
2428 psdout(10) << "activate peer osd." << peer << " " << pi << dendl;
2429
2430 MOSDPGLog *m = nullptr;
2431 ceph_assert(peer_missing.count(peer));
2432 pg_missing_t& pm = peer_missing[peer];
2433
2434 bool needs_past_intervals = pi.dne();
2435
2436 if (pi.last_update == info.last_update) {
2437 // empty log
2438 if (!pi.last_backfill.is_max())
2439 pl->get_clog_info() << info.pgid << " continuing backfill to osd."
2440 << peer
2441 << " from (" << pi.log_tail << "," << pi.last_update
2442 << "] " << pi.last_backfill
2443 << " to " << info.last_update;
2444 if (!pi.is_empty()) {
2445 psdout(10) << "activate peer osd." << peer
2446 << " is up to date, queueing in pending_activators" << dendl;
2447 ctx.send_info(
2448 peer.osd,
2449 spg_t(info.pgid.pgid, peer.shard),
2450 get_osdmap_epoch(), // fixme: use lower epoch?
2451 get_osdmap_epoch(),
2452 info,
2453 get_lease());
2454 } else {
2455 psdout(10) << "activate peer osd." << peer
2456 << " is up to date, but sending pg_log anyway" << dendl;
2457 m = new MOSDPGLog(
2458 i->shard, pg_whoami.shard,
2459 get_osdmap_epoch(), info,
2460 last_peering_reset);
2461 }
2462 } else if (
2463 pg_log.get_tail() > pi.last_update ||
2464 pi.last_backfill == hobject_t() ||
2465 (backfill_targets.count(*i) && pi.last_backfill.is_max())) {
2466 /* ^ This last case covers a situation where a replica is not contiguous
2467 * with the auth_log, but is contiguous with this replica. Reshuffling
2468 * the active set to handle this would be tricky, so instead we just go
2469 * ahead and backfill it anyway. This is probably preferable in any
2470 * case since the replica in question would have to be significantly
2471 * behind.
2472 */
2473 // backfill
2474 pl->get_clog_debug() << info.pgid << " starting backfill to osd." << peer
2475 << " from (" << pi.log_tail << "," << pi.last_update
2476 << "] " << pi.last_backfill
2477 << " to " << info.last_update;
2478
2479 pi.last_update = info.last_update;
2480 pi.last_complete = info.last_update;
2481 pi.set_last_backfill(hobject_t());
2482 pi.last_epoch_started = info.last_epoch_started;
2483 pi.last_interval_started = info.last_interval_started;
2484 pi.history = info.history;
2485 pi.hit_set = info.hit_set;
2486 // Save num_bytes for reservation request, can't be negative
2487 peer_bytes[peer] = std::max<int64_t>(0, pi.stats.stats.sum.num_bytes);
2488 pi.stats.stats.clear();
2489 pi.stats.stats.sum.num_bytes = peer_bytes[peer];
2490
2491 // initialize peer with our purged_snaps.
2492 pi.purged_snaps = info.purged_snaps;
2493
2494 m = new MOSDPGLog(
2495 i->shard, pg_whoami.shard,
2496 get_osdmap_epoch(), pi,
2497 last_peering_reset /* epoch to create pg at */);
2498
2499 // send some recent log, so that op dup detection works well.
2500 m->log.copy_up_to(cct, pg_log.get_log(),
2501 cct->_conf->osd_max_pg_log_entries);
2502 m->info.log_tail = m->log.tail;
2503 pi.log_tail = m->log.tail; // sigh...
2504
2505 pm.clear();
2506 } else {
2507 // catch up
2508 ceph_assert(pg_log.get_tail() <= pi.last_update);
2509 m = new MOSDPGLog(
2510 i->shard, pg_whoami.shard,
2511 get_osdmap_epoch(), info,
2512 last_peering_reset /* epoch to create pg at */);
2513 // send new stuff to append to replicas log
2514 m->log.copy_after(cct, pg_log.get_log(), pi.last_update);
2515 }
2516
2517 // share past_intervals if we are creating the pg on the replica
2518 // based on whether our info for that peer was dne() *before*
2519 // updating pi.history in the backfill block above.
2520 if (m && needs_past_intervals)
2521 m->past_intervals = past_intervals;
2522
2523 // update local version of peer's missing list!
2524 if (m && pi.last_backfill != hobject_t()) {
2525 for (list<pg_log_entry_t>::iterator p = m->log.log.begin();
2526 p != m->log.log.end();
2527 ++p) {
2528 if (p->soid <= pi.last_backfill &&
2529 !p->is_error()) {
2530 if (perform_deletes_during_peering() && p->is_delete()) {
2531 pm.rm(p->soid, p->version);
2532 } else {
2533 pm.add_next_event(*p);
2534 }
2535 }
2536 }
2537 }
2538
2539 if (m) {
2540 dout(10) << "activate peer osd." << peer << " sending " << m->log
2541 << dendl;
2542 m->lease = get_lease();
2543 pl->send_cluster_message(peer.osd, m, get_osdmap_epoch());
2544 }
2545
2546 // peer now has our info.last_update
2547 pi.last_update = info.last_update;
2548
2549 // update our missing
2550 if (pm.num_missing() == 0) {
2551 pi.last_complete = pi.last_update;
2552 psdout(10) << "activate peer osd." << peer << " " << pi
2553 << " uptodate" << dendl;
2554 } else {
2555 psdout(10) << "activate peer osd." << peer << " " << pi
2556 << " missing " << pm << dendl;
2557 }
2558 }
2559
2560 // Set up missing_loc
2561 set<pg_shard_t> complete_shards;
2562 for (set<pg_shard_t>::iterator i = acting_recovery_backfill.begin();
2563 i != acting_recovery_backfill.end();
2564 ++i) {
2565 psdout(20) << __func__ << " setting up missing_loc from shard " << *i
2566 << " " << dendl;
2567 if (*i == get_primary()) {
2568 missing_loc.add_active_missing(missing);
2569 if (!missing.have_missing())
2570 complete_shards.insert(*i);
2571 } else {
2572 auto peer_missing_entry = peer_missing.find(*i);
2573 ceph_assert(peer_missing_entry != peer_missing.end());
2574 missing_loc.add_active_missing(peer_missing_entry->second);
2575 if (!peer_missing_entry->second.have_missing() &&
2576 peer_info[*i].last_backfill.is_max())
2577 complete_shards.insert(*i);
2578 }
2579 }
2580
2581 // If necessary, create might_have_unfound to help us find our unfound objects.
2582 // NOTE: It's important that we build might_have_unfound before trimming the
2583 // past intervals.
2584 might_have_unfound.clear();
2585 if (needs_recovery()) {
2586 // If only one shard has missing, we take a shortcut and add all other
2587 // shards as recovery sources. This is considered safe since the PGLogs
2588 // have been merged locally, and it covers the vast majority of use
2589 // cases, like one OSD/host being down for a while for hardware repair.
2590 if (complete_shards.size() + 1 == acting_recovery_backfill.size()) {
2591 missing_loc.add_batch_sources_info(complete_shards, ctx.handle);
2592 } else {
2593 missing_loc.add_source_info(pg_whoami, info, pg_log.get_missing(),
2594 ctx.handle);
2595 for (set<pg_shard_t>::iterator i = acting_recovery_backfill.begin();
2596 i != acting_recovery_backfill.end();
2597 ++i) {
2598 if (*i == pg_whoami) continue;
2599 psdout(10) << __func__ << ": adding " << *i << " as a source" << dendl;
2600 ceph_assert(peer_missing.count(*i));
2601 ceph_assert(peer_info.count(*i));
2602 missing_loc.add_source_info(
2603 *i,
2604 peer_info[*i],
2605 peer_missing[*i],
2606 ctx.handle);
2607 }
2608 }
2609 for (map<pg_shard_t, pg_missing_t>::iterator i = peer_missing.begin();
2610 i != peer_missing.end();
2611 ++i) {
2612 if (is_acting_recovery_backfill(i->first))
2613 continue;
2614 ceph_assert(peer_info.count(i->first));
2615 search_for_missing(
2616 peer_info[i->first],
2617 i->second,
2618 i->first,
2619 ctx);
2620 }
2621
2622 build_might_have_unfound();
2623
2624 // Always call now so update_calc_stats() will be accurate
2625 discover_all_missing(ctx.msgs);
2626
2627 }
2628
2629 // num_objects_degraded, if calculated, should reflect this too, unless
2630 // nothing is missing and we are about to go clean.
2631 if (get_osdmap()->get_pg_size(info.pgid.pgid) > actingset.size()) {
2632 state_set(PG_STATE_UNDERSIZED);
2633 }
2634
2635 state_set(PG_STATE_ACTIVATING);
2636 pl->on_activate(std::move(to_trim));
2637 }
2638 if (acting.size() >= pool.info.min_size) {
2639 PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
2640 pg_log.roll_forward(rollbacker.get());
2641 }
2642 }
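/* Editor's note: the per-peer branch in activate() above reduces to
 * three cases (editor's summary for an assumed peer pi):
 *   1. pi.last_update == info.last_update: the peer is current; send
 *      just our info (or an empty-log MOSDPGLog if pi is empty).
 *   2. log not contiguous (pg_log tail > pi.last_update) or the peer
 *      needs backfill: reset pi to a backfill target and send a
 *      recent log slice so op dup detection keeps working.
 *   3. otherwise: send the log entries after pi.last_update so the
 *      replica can catch up by log recovery.
 */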
2643
2644 void PeeringState::share_pg_info()
2645 {
2646 psdout(10) << "share_pg_info" << dendl;
2647
2648 info.history.refresh_prior_readable_until_ub(pl->get_mnow(),
2649 prior_readable_until_ub);
2650
2651 // share new pg_info_t with replicas
2652 ceph_assert(!acting_recovery_backfill.empty());
2653 for (auto pg_shard : acting_recovery_backfill) {
2654 if (pg_shard == pg_whoami) continue;
2655 if (auto peer = peer_info.find(pg_shard); peer != peer_info.end()) {
2656 peer->second.last_epoch_started = info.last_epoch_started;
2657 peer->second.last_interval_started = info.last_interval_started;
2658 peer->second.history.merge(info.history);
2659 }
2660 Message* m = nullptr;
2661 if (last_require_osd_release >= ceph_release_t::octopus) {
2662 m = new MOSDPGInfo2{spg_t{info.pgid.pgid, pg_shard.shard},
2663 info,
2664 get_osdmap_epoch(),
2665 get_osdmap_epoch(),
2666 get_lease(), {}};
2667 } else {
2668 m = new MOSDPGInfo{get_osdmap_epoch(),
2669 {pg_notify_t{pg_shard.shard,
2670 pg_whoami.shard,
2671 get_osdmap_epoch(),
2672 get_osdmap_epoch(),
2673 info,
2674 past_intervals}}};
2675 }
2676 pl->send_cluster_message(pg_shard.osd, m, get_osdmap_epoch());
2677 }
2678 }
2679
2680 void PeeringState::merge_log(
2681 ObjectStore::Transaction& t, pg_info_t &oinfo, pg_log_t &olog,
2682 pg_shard_t from)
2683 {
2684 PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
2685 pg_log.merge_log(
2686 oinfo, olog, from, info, rollbacker.get(), dirty_info, dirty_big_info);
2687 }
2688
2689 void PeeringState::rewind_divergent_log(
2690 ObjectStore::Transaction& t, eversion_t newhead)
2691 {
2692 PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
2693 pg_log.rewind_divergent_log(
2694 newhead, info, rollbacker.get(), dirty_info, dirty_big_info);
2695 }
2696
2697
2698 void PeeringState::proc_primary_info(
2699 ObjectStore::Transaction &t, const pg_info_t &oinfo)
2700 {
2701 ceph_assert(!is_primary());
2702
2703 update_history(oinfo.history);
2704 if (!info.stats.stats_invalid && info.stats.stats.sum.num_scrub_errors) {
2705 info.stats.stats.sum.num_scrub_errors = 0;
2706 info.stats.stats.sum.num_shallow_scrub_errors = 0;
2707 info.stats.stats.sum.num_deep_scrub_errors = 0;
2708 dirty_info = true;
2709 }
2710
2711 if (!(info.purged_snaps == oinfo.purged_snaps)) {
2712 psdout(10) << __func__ << " updating purged_snaps to "
2713 << oinfo.purged_snaps
2714 << dendl;
2715 info.purged_snaps = oinfo.purged_snaps;
2716 dirty_info = true;
2717 dirty_big_info = true;
2718 }
2719 }
2720
2721 void PeeringState::proc_master_log(
2722 ObjectStore::Transaction& t, pg_info_t &oinfo,
2723 pg_log_t &olog, pg_missing_t& omissing, pg_shard_t from)
2724 {
2725 psdout(10) << "proc_master_log for osd." << from << ": "
2726 << olog << " " << omissing << dendl;
2727 ceph_assert(!is_peered() && is_primary());
2728
2729 // merge log into our own log to build master log. no need to
2730 // make any adjustments to their missing map; we are taking their
2731 // log to be authoritative (i.e., their entries are by definition
2732 // non-divergent).
2733 merge_log(t, oinfo, olog, from);
2734 peer_info[from] = oinfo;
2735 psdout(10) << " peer osd." << from << " now " << oinfo
2736 << " " << omissing << dendl;
2737 might_have_unfound.insert(from);
2738
2739 // See doc/dev/osd_internals/last_epoch_started
2740 if (oinfo.last_epoch_started > info.last_epoch_started) {
2741 info.last_epoch_started = oinfo.last_epoch_started;
2742 dirty_info = true;
2743 }
2744 if (oinfo.last_interval_started > info.last_interval_started) {
2745 info.last_interval_started = oinfo.last_interval_started;
2746 dirty_info = true;
2747 }
2748 update_history(oinfo.history);
2749 ceph_assert(cct->_conf->osd_find_best_info_ignore_history_les ||
2750 info.last_epoch_started >= info.history.last_epoch_started);
2751
2752 peer_missing[from].claim(omissing);
2753 }
2754
2755 void PeeringState::proc_replica_log(
2756 pg_info_t &oinfo,
2757 const pg_log_t &olog,
2758 pg_missing_t& omissing,
2759 pg_shard_t from)
2760 {
2761 psdout(10) << "proc_replica_log for osd." << from << ": "
2762 << oinfo << " " << olog << " " << omissing << dendl;
2763
2764 pg_log.proc_replica_log(oinfo, olog, omissing, from);
2765
2766 peer_info[from] = oinfo;
2767 psdout(10) << " peer osd." << from << " now "
2768 << oinfo << " " << omissing << dendl;
2769 might_have_unfound.insert(from);
2770
2771 for (map<hobject_t, pg_missing_item>::const_iterator i =
2772 omissing.get_items().begin();
2773 i != omissing.get_items().end();
2774 ++i) {
2775 psdout(20) << " after missing " << i->first
2776 << " need " << i->second.need
2777 << " have " << i->second.have << dendl;
2778 }
2779 peer_missing[from].claim(omissing);
2780 }
2781
2782 void PeeringState::fulfill_info(
2783 pg_shard_t from, const pg_query_t &query,
2784 pair<pg_shard_t, pg_info_t> &notify_info)
2785 {
2786 ceph_assert(from == primary);
2787 ceph_assert(query.type == pg_query_t::INFO);
2788
2789 // info
2790 psdout(10) << "sending info" << dendl;
2791 notify_info = make_pair(from, info);
2792 }
2793
2794 void PeeringState::fulfill_log(
2795 pg_shard_t from, const pg_query_t &query, epoch_t query_epoch)
2796 {
2797 psdout(10) << "log request from " << from << dendl;
2798 ceph_assert(from == primary);
2799 ceph_assert(query.type != pg_query_t::INFO);
2800
2801 MOSDPGLog *mlog = new MOSDPGLog(
2802 from.shard, pg_whoami.shard,
2803 get_osdmap_epoch(),
2804 info, query_epoch);
2805 mlog->missing = pg_log.get_missing();
2806
2807 // primary -> other, when building master log
2808 if (query.type == pg_query_t::LOG) {
2809 psdout(10) << " sending info+missing+log since " << query.since
2810 << dendl;
2811 if (query.since != eversion_t() && query.since < pg_log.get_tail()) {
2812 pl->get_clog_error() << info.pgid << " got broken pg_query_t::LOG since "
2813 << query.since
2814 << " when my log.tail is " << pg_log.get_tail()
2815 << ", sending full log instead";
2816 mlog->log = pg_log.get_log(); // primary should not have requested this!!
2817 } else
2818 mlog->log.copy_after(cct, pg_log.get_log(), query.since);
2819 }
2820 else if (query.type == pg_query_t::FULLLOG) {
2821 psdout(10) << " sending info+missing+full log" << dendl;
2822 mlog->log = pg_log.get_log();
2823 }
2824
2825 psdout(10) << " sending " << mlog->log << " " << mlog->missing << dendl;
2826
2827 pl->send_cluster_message(from.osd, mlog, get_osdmap_epoch(), true);
2828 }
2829
2830 void PeeringState::fulfill_query(const MQuery& query, PeeringCtxWrapper &rctx)
2831 {
2832 if (query.query.type == pg_query_t::INFO) {
2833 pair<pg_shard_t, pg_info_t> notify_info;
2834 // note this refreshes our prior_readable_until_ub value
2835 update_history(query.query.history);
2836 fulfill_info(query.from, query.query, notify_info);
2837 rctx.send_notify(
2838 notify_info.first.osd,
2839 pg_notify_t(
2840 notify_info.first.shard, pg_whoami.shard,
2841 query.query_epoch,
2842 get_osdmap_epoch(),
2843 notify_info.second,
2844 past_intervals));
2845 } else {
2846 update_history(query.query.history);
2847 fulfill_log(query.from, query.query, query.query_epoch);
2848 }
2849 }
2850
2851 void PeeringState::try_mark_clean()
2852 {
2853 if (actingset.size() == get_osdmap()->get_pg_size(info.pgid.pgid)) {
2854 state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
2855 state_set(PG_STATE_CLEAN);
2856 info.history.last_epoch_clean = get_osdmap_epoch();
2857 info.history.last_interval_clean = info.history.same_interval_since;
2858 past_intervals.clear();
2859 dirty_big_info = true;
2860 dirty_info = true;
2861 }
2862
2863 if (!is_active() && is_peered()) {
2864 if (is_clean()) {
2865 bool target;
2866 if (pool.info.is_pending_merge(info.pgid.pgid, &target)) {
2867 if (target) {
2868 psdout(10) << "ready to merge (target)" << dendl;
2869 pl->set_ready_to_merge_target(
2870 info.last_update,
2871 info.history.last_epoch_started,
2872 info.history.last_epoch_clean);
2873 } else {
2874 psdout(10) << "ready to merge (source)" << dendl;
2875 pl->set_ready_to_merge_source(info.last_update);
2876 }
2877 }
2878 } else {
2879 psdout(10) << "not clean, not ready to merge" << dendl;
2880 // we should have notified OSD in Active state entry point
2881 }
2882 }
2883
2884 state_clear(PG_STATE_FORCED_RECOVERY | PG_STATE_FORCED_BACKFILL);
2885
2886 share_pg_info();
2887 pl->publish_stats_to_osd();
2888 clear_recovery_state();
2889 }
2890
2891 void PeeringState::split_into(
2892 pg_t child_pgid, PeeringState *child, unsigned split_bits)
2893 {
2894 child->update_osdmap_ref(get_osdmap());
2895 child->pool = pool;
2896
2897 // Log
2898 pg_log.split_into(child_pgid, split_bits, &(child->pg_log));
2899 child->info.last_complete = info.last_complete;
2900
2901 info.last_update = pg_log.get_head();
2902 child->info.last_update = child->pg_log.get_head();
2903
2904 child->info.last_user_version = info.last_user_version;
2905
2906 info.log_tail = pg_log.get_tail();
2907 child->info.log_tail = child->pg_log.get_tail();
2908
2909 // reset last_complete, we might have modified pg_log & missing above
2910 pg_log.reset_complete_to(&info);
2911 child->pg_log.reset_complete_to(&child->info);
2912
2913 // Info
2914 child->info.history = info.history;
2915 child->info.history.epoch_created = get_osdmap_epoch();
2916 child->info.purged_snaps = info.purged_snaps;
2917
2918 if (info.last_backfill.is_max()) {
2919 child->info.set_last_backfill(hobject_t::get_max());
2920 } else {
2921 // restart backfill on parent and child to be safe. we could
2922 // probably do better in the bitwise sort case, but it's more
2923 // fragile (there may be special work to do on backfill completion
2924 // in the future).
2925 info.set_last_backfill(hobject_t());
2926 child->info.set_last_backfill(hobject_t());
2927 // restarting backfill implies that the missing set is empty,
2928 // since it is only used for objects prior to last_backfill
2929 pg_log.reset_backfill();
2930 child->pg_log.reset_backfill();
2931 }
2932
2933 child->info.stats = info.stats;
2934 child->info.stats.parent_split_bits = split_bits;
2935 info.stats.stats_invalid = true;
2936 child->info.stats.stats_invalid = true;
2937 child->info.last_epoch_started = info.last_epoch_started;
2938 child->info.last_interval_started = info.last_interval_started;
2939
2940 // There can't be recovery/backfill going on now
2941 int primary, up_primary;
2942 vector<int> newup, newacting;
2943 get_osdmap()->pg_to_up_acting_osds(
2944 child->info.pgid.pgid, &newup, &up_primary, &newacting, &primary);
2945 child->init_primary_up_acting(
2946 newup,
2947 newacting,
2948 up_primary,
2949 primary);
2950 child->role = OSDMap::calc_pg_role(pg_whoami, child->acting);
2951
2952 // this comparison includes primary rank via pg_shard_t
2953 if (get_primary() != child->get_primary())
2954 child->info.history.same_primary_since = get_osdmap_epoch();
2955
2956 child->info.stats.up = up;
2957 child->info.stats.up_primary = up_primary;
2958 child->info.stats.acting = acting;
2959 child->info.stats.acting_primary = primary;
2960 child->info.stats.mapping_epoch = get_osdmap_epoch();
2961
2962 // History
2963 child->past_intervals = past_intervals;
2964
2965 child->on_new_interval();
2966
2967 child->send_notify = !child->is_primary();
2968
2969 child->dirty_info = true;
2970 child->dirty_big_info = true;
2971 dirty_info = true;
2972 dirty_big_info = true;
2973 }
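/* Editor's note: split_bits is the number of hash bits that now
 * select a PG. As an assumed example, growing a pool from pg_num 8
 * to 16 splits pg 1.3 into 1.3 and 1.b (ps 0x3 | 0x8) with
 * split_bits=4; the child differs from the parent in the newly
 * significant bit. Note that both halves restart backfill above
 * unless the parent's last_backfill was already MAX.
 */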
2974
2975 void PeeringState::merge_from(
2976 map<spg_t,PeeringState *>& sources,
2977 PeeringCtx &rctx,
2978 unsigned split_bits,
2979 const pg_merge_meta_t& last_pg_merge_meta)
2980 {
2981 bool incomplete = false;
2982 if (info.last_complete != info.last_update ||
2983 info.is_incomplete() ||
2984 info.dne()) {
2985 psdout(10) << __func__ << " target incomplete" << dendl;
2986 incomplete = true;
2987 }
2988 if (last_pg_merge_meta.source_pgid != pg_t()) {
2989 if (info.pgid.pgid != last_pg_merge_meta.source_pgid.get_parent()) {
2990 psdout(10) << __func__ << " target doesn't match expected parent "
2991 << last_pg_merge_meta.source_pgid.get_parent()
2992 << " of source_pgid " << last_pg_merge_meta.source_pgid
2993 << dendl;
2994 incomplete = true;
2995 }
2996 if (info.last_update != last_pg_merge_meta.target_version) {
2997 psdout(10) << __func__ << " target version doesn't match expected "
2998 << last_pg_merge_meta.target_version << dendl;
2999 incomplete = true;
3000 }
3001 }
3002
3003 PGLog::LogEntryHandlerRef handler{pl->get_log_handler(rctx.transaction)};
3004 pg_log.roll_forward(handler.get());
3005
3006 info.last_complete = info.last_update; // to fake out trim()
3007 pg_log.reset_recovery_pointers();
3008 pg_log.trim(info.last_update, info);
3009
3010 vector<PGLog*> log_from;
3011 for (auto& i : sources) {
3012 auto& source = i.second;
3013 if (!source) {
3014 psdout(10) << __func__ << " source " << i.first << " missing" << dendl;
3015 incomplete = true;
3016 continue;
3017 }
3018 if (source->info.last_complete != source->info.last_update ||
3019 source->info.is_incomplete() ||
3020 source->info.dne()) {
3021 psdout(10) << __func__ << " source " << source->pg_whoami
3022 << " incomplete"
3023 << dendl;
3024 incomplete = true;
3025 }
3026 if (last_pg_merge_meta.source_pgid != pg_t()) {
3027 if (source->info.pgid.pgid != last_pg_merge_meta.source_pgid) {
3028 psdout(10) << __func__ << " source " << source->info.pgid.pgid
3029 << " doesn't match expected source pgid "
3030 << last_pg_merge_meta.source_pgid << dendl;
3031 incomplete = true;
3032 }
3033 if (source->info.last_update != last_pg_merge_meta.source_version) {
3034 psdout(10) << __func__ << " source version doesn't match expected "
3035 << last_pg_merge_meta.source_version << dendl;
3036 incomplete = true;
3037 }
3038 }
3039
3040 // prepare log
3041 PGLog::LogEntryHandlerRef handler{
3042 source->pl->get_log_handler(rctx.transaction)};
3043 source->pg_log.roll_forward(handler.get());
3044 source->info.last_complete = source->info.last_update; // to fake out trim()
3045 source->pg_log.reset_recovery_pointers();
3046 source->pg_log.trim(source->info.last_update, source->info);
3047 log_from.push_back(&source->pg_log);
3048
3049 // combine stats
3050 info.stats.add(source->info.stats);
3051
3052 // pull up last_update
3053 info.last_update = std::max(info.last_update, source->info.last_update);
3054
3055 // adopt source's PastIntervals if target has none. we can do this since
3056 // pgp_num has been reduced prior to the merge, so the OSD mappings for
3057 // the PGs are identical.
3058 if (past_intervals.empty() && !source->past_intervals.empty()) {
3059 psdout(10) << __func__ << " taking source's past_intervals" << dendl;
3060 past_intervals = source->past_intervals;
3061 }
3062 }
3063
3064 info.last_complete = info.last_update;
3065 info.log_tail = info.last_update;
3066 if (incomplete) {
3067 info.last_backfill = hobject_t();
3068 }
3069
3070 // merge logs
3071 pg_log.merge_from(log_from, info.last_update);
3072
3073 // make sure we have a meaningful last_epoch_started/clean (if we were a
3074 // placeholder)
3075 if (info.history.epoch_created == 0) {
3076 // start with (a) source's history, since these PGs *should* have been
3077 // remapped in concert with each other...
3078 info.history = sources.begin()->second->info.history;
3079
3080 // we use the last_epoch_{started,clean} we got from
3081 // the caller, which are the epochs that were reported when the PGs
3082 // were found to be ready for merge.
3083 info.history.last_epoch_clean = last_pg_merge_meta.last_epoch_clean;
3084 info.history.last_epoch_started = last_pg_merge_meta.last_epoch_started;
3085 info.last_epoch_started = last_pg_merge_meta.last_epoch_started;
3086 psdout(10) << __func__
3087 << " set les/c to " << last_pg_merge_meta.last_epoch_started << "/"
3088 << last_pg_merge_meta.last_epoch_clean
3089 << " from pool last_dec_*, source pg history was "
3090 << sources.begin()->second->info.history
3091 << dendl;
3092
3093 // if the past_intervals start is later than last_epoch_clean, it
3094 // implies the source repeered again but the target didn't, or
3095 // that the source became clean in a later epoch than the target.
3096 // avoid the discrepancy by adjusting the interval start
3097 // backwards to match so that check_past_interval_bounds() will
3098 // not complain.
3099 auto pib = past_intervals.get_bounds();
3100 if (info.history.last_epoch_clean < pib.first) {
3101 psdout(10) << __func__ << " last_epoch_clean "
3102 << info.history.last_epoch_clean << " < past_interval start "
3103 << pib.first << ", adjusting start backwards" << dendl;
3104 past_intervals.adjust_start_backwards(info.history.last_epoch_clean);
3105 }
3106
3107 // Similarly, if the same_interval_since value is later than
3108 // last_epoch_clean, the next interval change will result in a
3109 // past_interval start that is later than last_epoch_clean. This
3110 // can happen if we use the pg_history values from the merge
3111 // source. Adjust the same_interval_since value backwards if that
3112 // happens. (We trust the les and lec values more because they came from
3113 // the real target, whereas the history value we stole from the source.)
3114 if (info.history.last_epoch_started < info.history.same_interval_since) {
3115 psdout(10) << __func__ << " last_epoch_started "
3116 << info.history.last_epoch_started << " < same_interval_since "
3117 << info.history.same_interval_since
3118 << ", adjusting pg_history backwards" << dendl;
3119 info.history.same_interval_since = info.history.last_epoch_clean;
3120 // make sure same_{up,primary}_since are <= same_interval_since
3121 info.history.same_up_since = std::min(
3122 info.history.same_up_since, info.history.same_interval_since);
3123 info.history.same_primary_since = std::min(
3124 info.history.same_primary_since, info.history.same_interval_since);
3125 }
3126 }
3127
3128 dirty_info = true;
3129 dirty_big_info = true;
3130 }
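/* Editor's note: merge_from() is the inverse of split_into(). As an
 * assumed example, shrinking a pool from pg_num 16 to 8 merges source
 * 1.b back into target 1.3. Both sides roll their logs forward and
 * trim up to last_update before pg_log.merge_from() stitches them at
 * the new head, and if either side is incomplete the merged PG
 * conservatively restarts backfill (last_backfill = hobject_t()).
 */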
3131
3132 void PeeringState::start_split_stats(
3133 const set<spg_t>& childpgs, vector<object_stat_sum_t> *out)
3134 {
3135 out->resize(childpgs.size() + 1);
3136 info.stats.stats.sum.split(*out);
3137 }
3138
3139 void PeeringState::finish_split_stats(
3140 const object_stat_sum_t& stats, ObjectStore::Transaction &t)
3141 {
3142 info.stats.stats.sum = stats;
3143 write_if_dirty(t);
3144 }
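/* Editor's note: intended usage of the two helpers above, as the
 * editor reads them (a sketch, not authoritative): the parent calls
 * start_split_stats() once with all child pgids and gets back
 * childpgs.size()+1 partial object_stat_sum_t buckets from
 * object_stat_sum_t::split(); each resulting PG (parent included)
 * then receives one bucket via finish_split_stats(), which overwrites
 * its sum and persists it with the transaction.
 */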
3145
3146 void PeeringState::update_blocked_by()
3147 {
3148 // set a max on the number of blocking peers we report. if we go
3149 // over, report a random subset. keep the result sorted.
3150 unsigned keep = std::min<unsigned>(
3151 blocked_by.size(), cct->_conf->osd_max_pg_blocked_by);
3152 unsigned skip = blocked_by.size() - keep;
3153 info.stats.blocked_by.clear();
3154 info.stats.blocked_by.resize(keep);
3155 unsigned pos = 0;
3156 for (set<int>::iterator p = blocked_by.begin();
3157 p != blocked_by.end() && keep > 0;
3158 ++p) {
3159 if (skip > 0 && (rand() % (skip + keep) < skip)) {
3160 --skip;
3161 } else {
3162 info.stats.blocked_by[pos++] = *p;
3163 --keep;
3164 }
3165 }
3166 }
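/* Editor's note: the loop above is selection sampling (Knuth's
 * Algorithm S): each element is rejected with probability
 * skip/(skip+keep), which yields a uniformly random size-`keep`
 * subset that stays sorted because set iteration order is preserved.
 * A minimal standalone sketch of the same idea (hypothetical helper,
 * not part of this file):
 *
 *   std::vector<int> sample_sorted(const std::set<int>& in, unsigned keep) {
 *     unsigned skip = in.size() - keep;  // assumes keep <= in.size()
 *     std::vector<int> out;
 *     for (int v : in) {
 *       if (keep == 0)
 *         break;
 *       if (skip > 0 && (rand() % (skip + keep)) < skip) {
 *         --skip;              // reject this element
 *       } else {
 *         out.push_back(v);    // accept, preserving sorted order
 *         --keep;
 *       }
 *     }
 *     return out;
 *   }
 */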
3167
3168 static bool find_shard(const set<pg_shard_t> & pgs, shard_id_t shard)
3169 {
3170 for (auto& p : pgs)
3171 if (p.shard == shard)
3172 return true;
3173 return false;
3174 }
3175
3176 static pg_shard_t get_another_shard(const set<pg_shard_t> & pgs, pg_shard_t skip, shard_id_t shard)
3177 {
3178 for (auto& p : pgs) {
3179 if (p == skip)
3180 continue;
3181 if (p.shard == shard)
3182 return p;
3183 }
3184 return pg_shard_t();
3185 }
3186
3187 void PeeringState::update_calc_stats()
3188 {
3189 info.stats.version = info.last_update;
3190 info.stats.created = info.history.epoch_created;
3191 info.stats.last_scrub = info.history.last_scrub;
3192 info.stats.last_scrub_stamp = info.history.last_scrub_stamp;
3193 info.stats.last_deep_scrub = info.history.last_deep_scrub;
3194 info.stats.last_deep_scrub_stamp = info.history.last_deep_scrub_stamp;
3195 info.stats.last_clean_scrub_stamp = info.history.last_clean_scrub_stamp;
3196 info.stats.last_epoch_clean = info.history.last_epoch_clean;
3197
3198 info.stats.log_size = pg_log.get_head().version - pg_log.get_tail().version;
3199 info.stats.ondisk_log_size = info.stats.log_size;
3200 info.stats.log_start = pg_log.get_tail();
3201 info.stats.ondisk_log_start = pg_log.get_tail();
3202 info.stats.snaptrimq_len = pl->get_snap_trimq_size();
3203
3204 unsigned num_shards = get_osdmap()->get_pg_size(info.pgid.pgid);
3205
3206 // In the rare case that upset is too large (usually transient), use it
3207 // as the target for the calculations below.
3208 unsigned target = std::max(num_shards, (unsigned)upset.size());
3209 // When undersized, actingset may be larger than upset (e.g. with OSDs out)
3210 unsigned nrep = std::max(actingset.size(), upset.size());
3211 // calc num_object_copies
3212 info.stats.stats.calc_copies(std::max(target, nrep));
3213 info.stats.stats.sum.num_objects_degraded = 0;
3214 info.stats.stats.sum.num_objects_unfound = 0;
3215 info.stats.stats.sum.num_objects_misplaced = 0;
3216 info.stats.avail_no_missing.clear();
3217 info.stats.object_location_counts.clear();
3218
3219 // We should never hit this condition, but if we end up hitting it,
3220 // make sure to update num_objects and set PG_STATE_INCONSISTENT.
3221 if (info.stats.stats.sum.num_objects < 0) {
3222 psdout(0) << __func__ << " negative num_objects = "
3223 << info.stats.stats.sum.num_objects << " setting it to 0 "
3224 << dendl;
3225 info.stats.stats.sum.num_objects = 0;
3226 state_set(PG_STATE_INCONSISTENT);
3227 }
3228
3229 if ((is_remapped() || is_undersized() || !is_clean()) &&
3230 (is_peered() || is_activating())) {
3231 psdout(20) << __func__ << " actingset " << actingset << " upset "
3232 << upset << " acting_recovery_backfill " << acting_recovery_backfill << dendl;
3233
3234 ceph_assert(!acting_recovery_backfill.empty());
3235
3236 bool estimate = false;
3237
3238 // NOTE: we only generate degraded, misplaced and unfound
3239 // values for the summation, not individual stat categories.
3240 int64_t num_objects = info.stats.stats.sum.num_objects;
3241
3242 // Objects missing from up nodes, sorted by # objects.
3243 boost::container::flat_set<pair<int64_t,pg_shard_t>> missing_target_objects;
3244 // Objects missing from nodes not in up, sort by # objects
3245 boost::container::flat_set<pair<int64_t,pg_shard_t>> acting_source_objects;
3246
3247 // Fill missing_target_objects/acting_source_objects
3248
3249 {
3250 int64_t missing;
3251
3252 // Primary first
3253 missing = pg_log.get_missing().num_missing();
3254 ceph_assert(acting_recovery_backfill.count(pg_whoami));
3255 if (upset.count(pg_whoami)) {
3256 missing_target_objects.emplace(missing, pg_whoami);
3257 } else {
3258 acting_source_objects.emplace(missing, pg_whoami);
3259 }
3260 info.stats.stats.sum.num_objects_missing_on_primary = missing;
3261 if (missing == 0)
3262 info.stats.avail_no_missing.push_back(pg_whoami);
3263 psdout(20) << __func__ << " shard " << pg_whoami
3264 << " primary objects " << num_objects
3265 << " missing " << missing
3266 << dendl;
3267 }
3268
3269 // All other peers
3270 for (auto& peer : peer_info) {
3271 // Primary should not be in the peer_info, skip if it is.
3272 if (peer.first == pg_whoami) continue;
3273 int64_t missing = 0;
3274 int64_t peer_num_objects = peer.second.stats.stats.sum.num_objects;
3275 // Backfill targets always track num_objects accurately;
3276 // all other peers track missing accurately.
3277 if (is_backfill_target(peer.first)) {
3278 missing = std::max((int64_t)0, num_objects - peer_num_objects);
3279 } else {
3280 if (peer_missing.count(peer.first)) {
3281 missing = peer_missing[peer.first].num_missing();
3282 } else {
3283 psdout(20) << __func__ << " no peer_missing found for "
3284 << peer.first << dendl;
3285 if (is_recovering()) {
3286 estimate = true;
3287 }
3288 missing = std::max((int64_t)0, num_objects - peer_num_objects);
3289 }
3290 }
3291 if (upset.count(peer.first)) {
3292 missing_target_objects.emplace(missing, peer.first);
3293 } else if (actingset.count(peer.first)) {
3294 acting_source_objects.emplace(missing, peer.first);
3295 }
3296 peer.second.stats.stats.sum.num_objects_missing = missing;
3297 if (missing == 0)
3298 info.stats.avail_no_missing.push_back(peer.first);
3299 psdout(20) << __func__ << " shard " << peer.first
3300 << " objects " << peer_num_objects
3301 << " missing " << missing
3302 << dendl;
3303 }
3304
3305 // Compute object_location_counts
3306 for (auto& ml: missing_loc.get_missing_locs()) {
3307 info.stats.object_location_counts[ml.second]++;
3308 psdout(30) << __func__ << " " << ml.first << " object_location_counts["
3309 << ml.second << "]=" << info.stats.object_location_counts[ml.second]
3310 << dendl;
3311 }
3312 int64_t not_missing = num_objects - missing_loc.get_missing_locs().size();
3313 if (not_missing) {
3314 // During recovery we know upset == actingset and it is being populated;
3315 // during backfill we know that all non-missing objects are in the actingset.
3316 info.stats.object_location_counts[actingset] = not_missing;
3317 }
3318 psdout(30) << __func__ << " object_location_counts["
3319 << upset << "]=" << info.stats.object_location_counts[upset]
3320 << dendl;
3321 psdout(20) << __func__ << " object_location_counts "
3322 << info.stats.object_location_counts << dendl;
3323
3324 // A misplaced object is not stored on the correct OSD
3325 int64_t misplaced = 0;
3326 // A degraded object has fewer replicas or EC shards than the pool specifies.
3327 int64_t degraded = 0;
3328
3329 if (is_recovering()) {
3330 for (auto& sml: missing_loc.get_missing_by_count()) {
3331 for (auto& ml: sml.second) {
3332 int missing_shards;
3333 if (sml.first == shard_id_t::NO_SHARD) {
3334 psdout(20) << __func__ << " ml " << ml.second
3335 << " upset size " << upset.size()
3336 << " up " << ml.first.up << dendl;
3337 missing_shards = (int)upset.size() - ml.first.up;
3338 } else {
3339 // Handle shards not even in upset below
3340 if (!find_shard(upset, sml.first))
3341 continue;
3342 missing_shards = std::max(0, 1 - ml.first.up);
3343 psdout(20) << __func__
3344 << " shard " << sml.first
3345 << " ml " << ml.second
3346 << " missing shards " << missing_shards << dendl;
3347 }
3348 int odegraded = ml.second * missing_shards;
3349 // Copies on other OSDs, but limited to the possible degraded count
3350 int more_osds = std::min(missing_shards, ml.first.other);
3351 int omisplaced = ml.second * more_osds;
3352 ceph_assert(omisplaced <= odegraded);
3353 odegraded -= omisplaced;
3354
3355 misplaced += omisplaced;
3356 degraded += odegraded;
3357 }
3358 }
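// Editor's note: a worked example for the loop above, with assumed
// counts. Replicated pool, upset.size()=3; missing_by_count reports
// ml.second=10 objects present on ml.first.up=1 up OSD with
// ml.first.other=1 stray copy. Then missing_shards = 3-1 = 2,
// odegraded = 10*2 = 20, more_osds = min(2,1) = 1, omisplaced =
// 10*1 = 10, leaving degraded += 10 and misplaced += 10: the stray
// copies downgrade "degraded" to merely "misplaced".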
3359
3360 psdout(20) << __func__ << " missing based degraded "
3361 << degraded << dendl;
3362 psdout(20) << __func__ << " missing based misplaced "
3363 << misplaced << dendl;
3364
3365 // Handle undersized case
3366 if (pool.info.is_replicated()) {
3367 // Add degraded for missing targets (num_objects missing)
3368 ceph_assert(target >= upset.size());
3369 unsigned needed = target - upset.size();
3370 degraded += num_objects * needed;
3371 } else {
3372 for (unsigned i = 0 ; i < num_shards; ++i) {
3373 shard_id_t shard(i);
3374
3375 if (!find_shard(upset, shard)) {
3376 pg_shard_t pgs = get_another_shard(actingset, pg_shard_t(), shard);
3377
3378 if (pgs != pg_shard_t()) {
3379 int64_t missing;
3380
3381 if (pgs == pg_whoami)
3382 missing = info.stats.stats.sum.num_objects_missing_on_primary;
3383 else
3384 missing = peer_info[pgs].stats.stats.sum.num_objects_missing;
3385
3386 degraded += missing;
3387 misplaced += std::max((int64_t)0, num_objects - missing);
3388 } else {
3389 // No shard anywhere
3390 degraded += num_objects;
3391 }
3392 }
3393 }
3394 }
3395 goto out;
3396 }
3397
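// Not recovering: derive degraded/misplaced from the per-shard
// missing_target_objects/acting_source_objects sets built above
// (covers the backfill and remapped cases).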
3398 // Handle undersized case
3399 if (pool.info.is_replicated()) {
3400 // Add to missing_target_objects
3401 ceph_assert(target >= missing_target_objects.size());
3402 unsigned needed = target - missing_target_objects.size();
3403 if (needed)
3404 missing_target_objects.emplace(num_objects * needed, pg_shard_t(pg_shard_t::NO_OSD));
3405 } else {
3406 for (unsigned i = 0 ; i < num_shards; ++i) {
3407 shard_id_t shard(i);
3408 bool found = false;
3409 for (const auto& t : missing_target_objects) {
3410 if (std::get<1>(t).shard == shard) {
3411 found = true;
3412 break;
3413 }
3414 }
3415 if (!found)
3416 missing_target_objects.emplace(num_objects, pg_shard_t(pg_shard_t::NO_OSD,shard));
3417 }
3418 }
3419
3420 for (const auto& item : missing_target_objects)
3421 psdout(20) << __func__ << " missing shard " << std::get<1>(item)
3422 << " missing= " << std::get<0>(item) << dendl;
3423 for (const auto& item : acting_source_objects)
3424 psdout(20) << __func__ << " acting shard " << std::get<1>(item)
3425 << " missing= " << std::get<0>(item) << dendl;
3426
3427 // Handle all objects not in missing, for the remapped
3428 // or backfill cases
3429 for (auto m = missing_target_objects.rbegin();
3430 m != missing_target_objects.rend(); ++m) {
3431
3432 int64_t extra_missing = -1;
3433
3434 if (pool.info.is_replicated()) {
3435 if (!acting_source_objects.empty()) {
3436 auto extra_copy = acting_source_objects.begin();
3437 extra_missing = std::get<0>(*extra_copy);
3438 acting_source_objects.erase(extra_copy);
3439 }
3440 } else { // Erasure coded
3441 // Use corresponding shard
3442 for (const auto& a : acting_source_objects) {
3443 if (std::get<1>(a).shard == std::get<1>(*m).shard) {
3444 extra_missing = std::get<0>(a);
3445 acting_source_objects.erase(a);
3446 break;
3447 }
3448 }
3449 }
3450
3451 if (extra_missing >= 0 && std::get<0>(*m) >= extra_missing) {
3452 // We don't know which of the objects on the target
3453 // are part of extra_missing, so assume they are all degraded.
3454 misplaced += std::get<0>(*m) - extra_missing;
3455 degraded += extra_missing;
3456 } else {
3457 // 1. extra_missing == -1: more targets than sources, so degraded
3458 // 2. extra_missing > std::get<0>(*m): some of the previously degraded
3459 //    objects are now present on the target.
3460 degraded += std::get<0>(*m);
3461 }
3462 }
3463 // If there are still acting shards that haven't been accounted for,
3464 // then they are misplaced
3465 for (const auto& a : acting_source_objects) {
3466 int64_t extra_misplaced = std::max((int64_t)0, num_objects - std::get<0>(a));
3467 psdout(20) << __func__ << " extra acting misplaced " << extra_misplaced
3468 << dendl;
3469 misplaced += extra_misplaced;
3470 }
3471 out:
3472 // NOTE: Tests use these messages to verify this code
3473 psdout(20) << __func__ << " degraded " << degraded
3474 << (estimate ? " (est)": "") << dendl;
3475 psdout(20) << __func__ << " misplaced " << misplaced
3476 << (estimate ? " (est)": "") << dendl;
3477
3478 info.stats.stats.sum.num_objects_degraded = degraded;
3479 info.stats.stats.sum.num_objects_unfound = get_num_unfound();
3480 info.stats.stats.sum.num_objects_misplaced = misplaced;
3481 }
3482 }
3483
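// Returns the stats to publish, or std::nullopt when nothing has changed
// since the last publish and the previous report is still fresh (i.e.
// newer than osd_pg_stat_report_interval_max).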
3484 std::optional<pg_stat_t> PeeringState::prepare_stats_for_publish(
3485 bool pg_stats_publish_valid,
3486 const pg_stat_t &pg_stats_publish,
3487 const object_stat_collection_t &unstable_stats)
3488 {
3489 if (info.stats.stats.sum.num_scrub_errors) {
3490 state_set(PG_STATE_INCONSISTENT);
3491 } else {
3492 state_clear(PG_STATE_INCONSISTENT);
3493 state_clear(PG_STATE_FAILED_REPAIR);
3494 }
3495
3496 utime_t now = ceph_clock_now();
3497 if (info.stats.state != state) {
3498 info.stats.last_change = now;
3499 // Optimistic estimate: if we just found out the PG is inactive,
3500 // assume it was active until now.
3501 if (!(state & PG_STATE_ACTIVE) &&
3502 (info.stats.state & PG_STATE_ACTIVE))
3503 info.stats.last_active = now;
3504
3505 if ((state & PG_STATE_ACTIVE) &&
3506 !(info.stats.state & PG_STATE_ACTIVE))
3507 info.stats.last_became_active = now;
3508 if ((state & (PG_STATE_ACTIVE|PG_STATE_PEERED)) &&
3509 !(info.stats.state & (PG_STATE_ACTIVE|PG_STATE_PEERED)))
3510 info.stats.last_became_peered = now;
3511 info.stats.state = state;
3512 }
3513
3514 update_calc_stats();
3515 if (info.stats.stats.sum.num_objects_degraded) {
3516 state_set(PG_STATE_DEGRADED);
3517 } else {
3518 state_clear(PG_STATE_DEGRADED);
3519 }
3520 update_blocked_by();
3521
3522 pg_stat_t pre_publish = info.stats;
3523 pre_publish.stats.add(unstable_stats);
3524 utime_t cutoff = now;
3525 cutoff -= cct->_conf->osd_pg_stat_report_interval_max;
3526
3527 // share (some of) our purged_snaps via the pg_stats. limit # of intervals
3528 // because we don't want to make the pg_stat_t structures too expensive.
3529 unsigned max = cct->_conf->osd_max_snap_prune_intervals_per_epoch;
3530 unsigned num = 0;
3531 auto i = info.purged_snaps.begin();
3532 while (num < max && i != info.purged_snaps.end()) {
3533 pre_publish.purged_snaps.insert(i.get_start(), i.get_len());
3534 ++num;
3535 ++i;
3536 }
3537 psdout(20) << __func__ << " reporting purged_snaps "
3538 << pre_publish.purged_snaps << dendl;
3539
3540 if (pg_stats_publish_valid && pre_publish == pg_stats_publish &&
3541 info.stats.last_fresh > cutoff) {
3542 psdout(15) << "publish_stats_to_osd " << pg_stats_publish.reported_epoch
3543 << ": no change since " << info.stats.last_fresh << dendl;
3544 return std::nullopt;
3545 } else {
3546 // update our stat summary and timestamps
3547 info.stats.reported_epoch = get_osdmap_epoch();
3548 ++info.stats.reported_seq;
3549
3550 info.stats.last_fresh = now;
3551
3552 if (info.stats.state & PG_STATE_CLEAN)
3553 info.stats.last_clean = now;
3554 if (info.stats.state & PG_STATE_ACTIVE)
3555 info.stats.last_active = now;
3556 if (info.stats.state & (PG_STATE_ACTIVE|PG_STATE_PEERED))
3557 info.stats.last_peered = now;
3558 info.stats.last_unstale = now;
3559 if ((info.stats.state & PG_STATE_DEGRADED) == 0)
3560 info.stats.last_undegraded = now;
3561 if ((info.stats.state & PG_STATE_UNDERSIZED) == 0)
3562 info.stats.last_fullsized = now;
3563
3564 psdout(15) << "publish_stats_to_osd " << pg_stats_publish.reported_epoch
3565 << ":" << pg_stats_publish.reported_seq << dendl;
3566 return std::make_optional(std::move(pre_publish));
3567 }
3568 }
3569
3570 void PeeringState::init(
3571 int role,
3572 const vector<int>& newup, int new_up_primary,
3573 const vector<int>& newacting, int new_acting_primary,
3574 const pg_history_t& history,
3575 const PastIntervals& pi,
3576 bool backfill,
3577 ObjectStore::Transaction &t)
3578 {
3579 psdout(10) << "init role " << role << " up "
3580 << newup << " acting " << newacting
3581 << " history " << history
3582 << " past_intervals " << pi
3583 << dendl;
3584
3585 set_role(role);
3586 init_primary_up_acting(
3587 newup,
3588 newacting,
3589 new_up_primary,
3590 new_acting_primary);
3591
3592 info.history = history;
3593 past_intervals = pi;
3594
3595 info.stats.up = up;
3596 info.stats.up_primary = new_up_primary;
3597 info.stats.acting = acting;
3598 info.stats.acting_primary = new_acting_primary;
3599 info.stats.mapping_epoch = info.history.same_interval_since;
3600
3601 if (!perform_deletes_during_peering()) {
3602 pg_log.set_missing_may_contain_deletes();
3603 }
3604
3605 if (backfill) {
3606 psdout(10) << __func__ << ": Setting backfill" << dendl;
3607 info.set_last_backfill(hobject_t());
3608 info.last_complete = info.last_update;
3609 pg_log.mark_log_for_rewrite();
3610 }
3611
3612 on_new_interval();
3613
3614 dirty_info = true;
3615 dirty_big_info = true;
3616 write_if_dirty(t);
3617 }
3618
3619 void PeeringState::dump_peering_state(Formatter *f)
3620 {
3621 f->dump_string("state", get_pg_state_string());
3622 f->dump_unsigned("epoch", get_osdmap_epoch());
3623 f->open_array_section("up");
3624 for (vector<int>::const_iterator p = up.begin(); p != up.end(); ++p)
3625 f->dump_unsigned("osd", *p);
3626 f->close_section();
3627 f->open_array_section("acting");
3628 for (vector<int>::const_iterator p = acting.begin(); p != acting.end(); ++p)
3629 f->dump_unsigned("osd", *p);
3630 f->close_section();
3631 if (!backfill_targets.empty()) {
3632 f->open_array_section("backfill_targets");
3633 for (set<pg_shard_t>::iterator p = backfill_targets.begin();
3634 p != backfill_targets.end();
3635 ++p)
3636 f->dump_stream("shard") << *p;
3637 f->close_section();
3638 }
3639 if (!async_recovery_targets.empty()) {
3640 f->open_array_section("async_recovery_targets");
3641 for (set<pg_shard_t>::iterator p = async_recovery_targets.begin();
3642 p != async_recovery_targets.end();
3643 ++p)
3644 f->dump_stream("shard") << *p;
3645 f->close_section();
3646 }
3647 if (!acting_recovery_backfill.empty()) {
3648 f->open_array_section("acting_recovery_backfill");
3649 for (set<pg_shard_t>::iterator p = acting_recovery_backfill.begin();
3650 p != acting_recovery_backfill.end();
3651 ++p)
3652 f->dump_stream("shard") << *p;
3653 f->close_section();
3654 }
3655 f->open_object_section("info");
3656 update_calc_stats();
3657 info.dump(f);
3658 f->close_section();
3659
3660 f->open_array_section("peer_info");
3661 for (map<pg_shard_t, pg_info_t>::const_iterator p = peer_info.begin();
3662 p != peer_info.end();
3663 ++p) {
3664 f->open_object_section("info");
3665 f->dump_stream("peer") << p->first;
3666 p->second.dump(f);
3667 f->close_section();
3668 }
3669 }
3670
3671 void PeeringState::update_stats(
3672 std::function<bool(pg_history_t &, pg_stat_t &)> f,
3673 ObjectStore::Transaction *t) {
3674 if (f(info.history, info.stats)) {
3675 pl->publish_stats_to_osd();
3676 }
3677 pl->on_info_history_change();
3678
3679 if (t) {
3680 dirty_info = true;
3681 write_if_dirty(*t);
3682 }
3683 }
3684
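// Append a batch of log entries, update the local missing set, and
// advance last_update (and last_complete when nothing is missing);
// optionally rolls the log forward and trims it. Returns true if the
// appended entries invalidate cached stats.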
3685 bool PeeringState::append_log_entries_update_missing(
3686 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
3687 ObjectStore::Transaction &t, std::optional<eversion_t> trim_to,
3688 std::optional<eversion_t> roll_forward_to)
3689 {
3690 ceph_assert(!entries.empty());
3691 ceph_assert(entries.begin()->version > info.last_update);
3692
3693 PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
3694 bool invalidate_stats =
3695 pg_log.append_new_log_entries(
3696 info.last_backfill,
3697 entries,
3698 rollbacker.get());
3699
3700 if (roll_forward_to && entries.rbegin()->soid > info.last_backfill) {
3701 pg_log.roll_forward(rollbacker.get());
3702 }
3703 if (roll_forward_to && *roll_forward_to > pg_log.get_can_rollback_to()) {
3704 pg_log.roll_forward_to(*roll_forward_to, rollbacker.get());
3705 last_rollback_info_trimmed_to_applied = *roll_forward_to;
3706 }
3707
3708 info.last_update = pg_log.get_head();
3709
3710 if (pg_log.get_missing().num_missing() == 0) {
3711 // advance last_complete since nothing else is missing!
3712 info.last_complete = info.last_update;
3713 }
3714 info.stats.stats_invalid = info.stats.stats_invalid || invalidate_stats;
3715
3716 psdout(20) << __func__ << " trim_to bool = " << bool(trim_to)
3717 << " trim_to = " << (trim_to ? *trim_to : eversion_t()) << dendl;
3718 if (trim_to)
3719 pg_log.trim(*trim_to, info);
3720 dirty_info = true;
3721 write_if_dirty(t);
3722 return invalidate_stats;
3723 }
3724
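// Primary-side wrapper: append the entries locally, mirror them into
// each peer's missing set and info, and rebuild missing_loc for the
// affected objects when stats were invalidated.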
3725 void PeeringState::merge_new_log_entries(
3726 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
3727 ObjectStore::Transaction &t,
3728 std::optional<eversion_t> trim_to,
3729 std::optional<eversion_t> roll_forward_to)
3730 {
3731 psdout(10) << __func__ << " " << entries << dendl;
3732 ceph_assert(is_primary());
3733
3734 bool rebuild_missing = append_log_entries_update_missing(entries, t, trim_to, roll_forward_to);
3735 for (set<pg_shard_t>::const_iterator i = acting_recovery_backfill.begin();
3736 i != acting_recovery_backfill.end();
3737 ++i) {
3738 pg_shard_t peer(*i);
3739 if (peer == pg_whoami) continue;
3740 ceph_assert(peer_missing.count(peer));
3741 ceph_assert(peer_info.count(peer));
3742 pg_missing_t& pmissing(peer_missing[peer]);
3743 psdout(20) << __func__ << " peer_missing for " << peer
3744 << " = " << pmissing << dendl;
3745 pg_info_t& pinfo(peer_info[peer]);
3746 bool invalidate_stats = PGLog::append_log_entries_update_missing(
3747 pinfo.last_backfill,
3748 entries,
3749 true,
3750 NULL,
3751 pmissing,
3752 NULL,
3753 dpp);
3754 pinfo.last_update = info.last_update;
3755 pinfo.stats.stats_invalid = pinfo.stats.stats_invalid || invalidate_stats;
3756 rebuild_missing = rebuild_missing || invalidate_stats;
3757 }
3758
3759 if (!rebuild_missing) {
3760 return;
3761 }
3762
3763 for (auto &&i: entries) {
3764 missing_loc.rebuild(
3765 i.soid,
3766 pg_whoami,
3767 acting_recovery_backfill,
3768 info,
3769 pg_log.get_missing(),
3770 peer_missing,
3771 peer_info);
3772 }
3773 }
3774
3775 void PeeringState::add_log_entry(const pg_log_entry_t& e, bool applied)
3776 {
3777 // raise last_complete only if we were previously up to date
3778 if (info.last_complete == info.last_update)
3779 info.last_complete = e.version;
3780
3781 // raise last_update.
3782 ceph_assert(e.version > info.last_update);
3783 info.last_update = e.version;
3784
3785 // raise user_version, if it increased (it may not have been bumped
3786 // by all logged updates)
3787 if (e.user_version > info.last_user_version)
3788 info.last_user_version = e.user_version;
3789
3790 // log mutation
3791 pg_log.add(e, applied);
3792 psdout(10) << "add_log_entry " << e << dendl;
3793 }
3794
3795
3796 void PeeringState::append_log(
3797 const vector<pg_log_entry_t>& logv,
3798 eversion_t trim_to,
3799 eversion_t roll_forward_to,
3800 eversion_t mlcod,
3801 ObjectStore::Transaction &t,
3802 bool transaction_applied,
3803 bool async)
3804 {
3805 /* The primary has sent an info updating the history, but it may not
3806 * have arrived yet. We want to make sure that we cannot remember this
3807 * write without remembering that it happened in an interval which went
3808 * active in epoch history.last_epoch_started.
3809 */
3810 if (info.last_epoch_started != info.history.last_epoch_started) {
3811 info.history.last_epoch_started = info.last_epoch_started;
3812 }
3813 if (info.last_interval_started != info.history.last_interval_started) {
3814 info.history.last_interval_started = info.last_interval_started;
3815 }
3816 psdout(10) << "append_log " << pg_log.get_log() << " " << logv << dendl;
3817
3818 PGLog::LogEntryHandlerRef handler{pl->get_log_handler(t)};
3819 if (!transaction_applied) {
3820 /* We must be a backfill or async recovery peer, so it's ok if we apply
3821 * out-of-turn since we won't be considered when
3822 * determining a min possible last_update.
3823 *
3824 * We skip_rollforward() here, which advances the crt, without
3825 * doing an actual rollforward. This avoids cleaning up entries
3826 * from the backend, and we do not end up in a situation where the
3827 * object is deleted before we can _merge_object_divergent_entries().
3828 */
3829 pg_log.skip_rollforward();
3830 }
3831
3832 for (vector<pg_log_entry_t>::const_iterator p = logv.begin();
3833 p != logv.end();
3834 ++p) {
3835 add_log_entry(*p, transaction_applied);
3836
3837 /* We don't want to leave the rollforward artifacts around
3838 * here past last_backfill. It's ok for the same reason as
3839 * above */
3840 if (transaction_applied &&
3841 p->soid > info.last_backfill) {
3842 pg_log.roll_forward(handler.get());
3843 }
3844 }
3845 if (transaction_applied && roll_forward_to > pg_log.get_can_rollback_to()) {
3846 pg_log.roll_forward_to(
3847 roll_forward_to,
3848 handler.get());
3849 last_rollback_info_trimmed_to_applied = roll_forward_to;
3850 }
3851
3852 psdout(10) << __func__ << " approx pg log length = "
3853 << pg_log.get_log().approx_size() << dendl;
3854 psdout(10) << __func__ << " transaction_applied = "
3855 << transaction_applied << dendl;
3856 if (!transaction_applied || async)
3857 psdout(10) << __func__ << " " << pg_whoami
3858 << " is async_recovery or backfill target" << dendl;
3859 pg_log.trim(trim_to, info, transaction_applied, async);
3860
3861 // update the local pg, pg log
3862 dirty_info = true;
3863 write_if_dirty(t);
3864
3865 if (!is_primary())
3866 min_last_complete_ondisk = mlcod;
3867 }
3868
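// Record that a missing object has been recovered locally at version v:
// roll the log forward past v if necessary, let the log advance
// last_complete, and (on the primary) register this OSD as a location.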
3869 void PeeringState::recover_got(
3870 const hobject_t &oid, eversion_t v,
3871 bool is_delete,
3872 ObjectStore::Transaction &t)
3873 {
3874 if (v > pg_log.get_can_rollback_to()) {
3875 /* This can only happen during a repair, and even then, it would
3876 * be one heck of a race. If we are repairing the object, the
3877 * write in question must be fully committed, so it's not valid
3878 * to roll it back anyway (and we'll be rolled forward shortly
3879 * anyway) */
3880 PGLog::LogEntryHandlerRef handler{pl->get_log_handler(t)};
3881 pg_log.roll_forward_to(v, handler.get());
3882 }
3883
3884 psdout(10) << "got missing " << oid << " v " << v << dendl;
3885 pg_log.recover_got(oid, v, info);
3886 if (pg_log.get_log().log.empty()) {
3887 psdout(10) << "last_complete now " << info.last_complete
3888 << " while log is empty" << dendl;
3889 } else if (pg_log.get_log().complete_to != pg_log.get_log().log.end()) {
3890 psdout(10) << "last_complete now " << info.last_complete
3891 << " log.complete_to " << pg_log.get_log().complete_to->version
3892 << dendl;
3893 } else {
3894 psdout(10) << "last_complete now " << info.last_complete
3895 << " log.complete_to at end" << dendl;
3896 // below is not true in the repair case.
3897 //assert(missing.num_missing() == 0); // otherwise, complete_to was wrong.
3898 ceph_assert(info.last_complete == info.last_update);
3899 }
3900
3901 if (is_primary()) {
3902 ceph_assert(missing_loc.needs_recovery(oid));
3903 if (!is_delete)
3904 missing_loc.add_location(oid, pg_whoami);
3905 }
3906
3907 // update pg
3908 dirty_info = true;
3909 write_if_dirty(t);
3910 }
3911
3912 void PeeringState::update_backfill_progress(
3913 const hobject_t &updated_backfill,
3914 const pg_stat_t &updated_stats,
3915 bool preserve_local_num_bytes,
3916 ObjectStore::Transaction &t) {
3917 info.set_last_backfill(updated_backfill);
3918 if (preserve_local_num_bytes) {
3919 psdout(25) << __func__ << " primary " << updated_stats.stats.sum.num_bytes
3920 << " local " << info.stats.stats.sum.num_bytes << dendl;
3921 int64_t bytes = info.stats.stats.sum.num_bytes;
3922 info.stats = updated_stats;
3923 info.stats.stats.sum.num_bytes = bytes;
3924 } else {
3925 psdout(20) << __func__ << " final " << updated_stats.stats.sum.num_bytes
3926 << " replaces local " << info.stats.stats.sum.num_bytes << dendl;
3927 info.stats = updated_stats;
3928 }
3929
3930 dirty_info = true;
3931 write_if_dirty(t);
3932 }
3933
3934 void PeeringState::adjust_purged_snaps(
3935 std::function<void(interval_set<snapid_t> &snaps)> f) {
3936 f(info.purged_snaps);
3937 dirty_info = true;
3938 dirty_big_info = true;
3939 }
3940
3941 void PeeringState::on_peer_recover(
3942 pg_shard_t peer,
3943 const hobject_t &soid,
3944 const eversion_t &version)
3945 {
3946 pl->publish_stats_to_osd();
3947 // done!
3948 peer_missing[peer].got(soid, version);
3949 missing_loc.add_location(soid, peer);
3950 }
3951
3952 void PeeringState::begin_peer_recover(
3953 pg_shard_t peer,
3954 const hobject_t soid)
3955 {
3956 peer_missing[peer].revise_have(soid, eversion_t());
3957 }
3958
3959 void PeeringState::force_object_missing(
3960 const set<pg_shard_t> &peers,
3961 const hobject_t &soid,
3962 eversion_t version)
3963 {
3964 for (auto &&peer : peers) {
3965 if (peer != primary) {
3966 peer_missing[peer].add(soid, version, eversion_t(), false);
3967 } else {
3968 pg_log.missing_add(soid, version, eversion_t());
3969 pg_log.reset_complete_to(&info);
3970 pg_log.set_last_requested(0);
3971 }
3972 }
3973
3974 missing_loc.rebuild(
3975 soid,
3976 pg_whoami,
3977 acting_recovery_backfill,
3978 info,
3979 pg_log.get_missing(),
3980 peer_missing,
3981 peer_info);
3982 }
3983
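// Called before an op is submitted: pre-advance peer last_update /
// last_complete, and for async recovery targets still missing hoid,
// fold the new entries into their missing sets and refresh missing_loc
// so the object's remaining locations stay accurate.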
3984 void PeeringState::pre_submit_op(
3985 const hobject_t &hoid,
3986 const vector<pg_log_entry_t>& logv,
3987 eversion_t at_version)
3988 {
3989 if (at_version > eversion_t()) {
3990 for (auto &&i : get_acting_recovery_backfill()) {
3991 if (i == primary) continue;
3992 pg_info_t &pinfo = peer_info[i];
3993 // keep peer_info up to date
3994 if (pinfo.last_complete == pinfo.last_update)
3995 pinfo.last_complete = at_version;
3996 pinfo.last_update = at_version;
3997 }
3998 }
3999
4000 bool requires_missing_loc = false;
4001 for (auto &&i : get_async_recovery_targets()) {
4002 if (i == primary || !get_peer_missing(i).is_missing(hoid))
4003 continue;
4004 requires_missing_loc = true;
4005 for (auto &&entry: logv) {
4006 peer_missing[i].add_next_event(entry);
4007 }
4008 }
4009
4010 if (requires_missing_loc) {
4011 for (auto &&entry: logv) {
4012 psdout(30) << __func__ << " missing_loc before: "
4013 << missing_loc.get_locations(entry.soid) << dendl;
4014 missing_loc.add_missing(entry.soid, entry.version,
4015 eversion_t(), entry.is_delete());
4016 // clear out missing_loc
4017 missing_loc.clear_location(entry.soid);
4018 for (auto &i: get_actingset()) {
4019 if (!get_peer_missing(i).is_missing(entry.soid))
4020 missing_loc.add_location(entry.soid, i);
4021 }
4022 psdout(30) << __func__ << " missing_loc after: "
4023 << missing_loc.get_locations(entry.soid) << dendl;
4024 }
4025 }
4026 }
4027
4028 void PeeringState::recovery_committed_to(eversion_t version)
4029 {
4030 psdout(10) << __func__ << " version " << version
4031 << " now ondisk" << dendl;
4032 last_complete_ondisk = version;
4033
4034 if (last_complete_ondisk == info.last_update) {
4035 if (!is_primary()) {
4036 // Either we are a replica or backfill target.
4037 // We are fully up to date; tell the primary!
4038 pl->send_cluster_message(
4039 get_primary().osd,
4040 new MOSDPGTrim(
4041 get_osdmap_epoch(),
4042 spg_t(info.pgid.pgid, primary.shard),
4043 last_complete_ondisk),
4044 get_osdmap_epoch());
4045 } else {
4046 calc_min_last_complete_ondisk();
4047 }
4048 }
4049 }
4050
4051 void PeeringState::complete_write(eversion_t v, eversion_t lc)
4052 {
4053 last_update_ondisk = v;
4054 last_complete_ondisk = lc;
4055 calc_min_last_complete_ondisk();
4056 }
4057
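// Compute pg_trim_to: trim the oldest entries down to roughly the
// target log length, but never past min_last_complete_ondisk or the
// log's can_rollback_to bound.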
4058 void PeeringState::calc_trim_to()
4059 {
4060 size_t target = pl->get_target_pg_log_entries();
4061
4062 eversion_t limit = std::min(
4063 min_last_complete_ondisk,
4064 pg_log.get_can_rollback_to());
4065 if (limit != eversion_t() &&
4066 limit != pg_trim_to &&
4067 pg_log.get_log().approx_size() > target) {
4068 size_t num_to_trim = std::min(pg_log.get_log().approx_size() - target,
4069 cct->_conf->osd_pg_log_trim_max);
4070 if (num_to_trim < cct->_conf->osd_pg_log_trim_min &&
4071 cct->_conf->osd_pg_log_trim_max >= cct->_conf->osd_pg_log_trim_min) {
4072 return;
4073 }
4074 list<pg_log_entry_t>::const_iterator it = pg_log.get_log().log.begin();
4075 eversion_t new_trim_to;
4076 for (size_t i = 0; i < num_to_trim; ++i) {
4077 new_trim_to = it->version;
4078 ++it;
4079 if (new_trim_to > limit) {
4080 new_trim_to = limit;
4081 psdout(10) << "calc_trim_to trimming to min_last_complete_ondisk" << dendl;
4082 break;
4083 }
4084 }
4085 psdout(10) << "calc_trim_to " << pg_trim_to << " -> " << new_trim_to << dendl;
4086 pg_trim_to = new_trim_to;
4087 assert(pg_trim_to <= pg_log.get_head());
4088 assert(pg_trim_to <= min_last_complete_ondisk);
4089 }
4090 }
4091
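// A more aggressive variant: trimming is bounded only by the log head
// and can_rollback_to, not by min_last_complete_ondisk.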
4092 void PeeringState::calc_trim_to_aggressive()
4093 {
4094 size_t target = pl->get_target_pg_log_entries();
4095
4096 // limit pg log trimming up to the can_rollback_to value
4097 eversion_t limit = std::min(
4098 pg_log.get_head(),
4099 pg_log.get_can_rollback_to());
4100 psdout(10) << __func__ << " limit = " << limit << dendl;
4101
4102 if (limit != eversion_t() &&
4103 limit != pg_trim_to &&
4104 pg_log.get_log().approx_size() > target) {
4105 psdout(10) << __func__ << " approx pg log length = "
4106 << pg_log.get_log().approx_size() << dendl;
4107 uint64_t num_to_trim = std::min<uint64_t>(pg_log.get_log().approx_size() - target,
4108 cct->_conf->osd_pg_log_trim_max);
4109 psdout(10) << __func__ << " num_to_trim = " << num_to_trim << dendl;
4110 if (num_to_trim < cct->_conf->osd_pg_log_trim_min &&
4111 cct->_conf->osd_pg_log_trim_max >= cct->_conf->osd_pg_log_trim_min) {
4112 return;
4113 }
4114 auto it = pg_log.get_log().log.begin(); // oldest log entry
4115 auto rit = pg_log.get_log().log.rbegin();
4116 eversion_t by_n_to_keep; // found scanning from the head (newest entry)
4117 eversion_t by_n_to_trim = eversion_t::max(); // found scanning from the tail (oldest entry)
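// Walk the log from both ends at once: from the head (rit) find the
// version that keeps `target` entries, and from the tail (it) find the
// version reached after trimming num_to_trim entries; stop as soon as
// both are known.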
4118 for (size_t i = 0; it != pg_log.get_log().log.end(); ++it, ++rit) {
4119 i++;
4120 if (i > target && by_n_to_keep == eversion_t()) {
4121 by_n_to_keep = rit->version;
4122 }
4123 if (i >= num_to_trim && by_n_to_trim == eversion_t::max()) {
4124 by_n_to_trim = it->version;
4125 }
4126 if (by_n_to_keep != eversion_t() &&
4127 by_n_to_trim != eversion_t::max()) {
4128 break;
4129 }
4130 }
4131
4132 if (by_n_to_keep == eversion_t()) {
4133 return;
4134 }
4135
4136 pg_trim_to = std::min({by_n_to_keep, by_n_to_trim, limit});
4137 psdout(10) << __func__ << " pg_trim_to now " << pg_trim_to << dendl;
4138 ceph_assert(pg_trim_to <= pg_log.get_head());
4139 }
4140 }
4141
4142 void PeeringState::apply_op_stats(
4143 const hobject_t &soid,
4144 const object_stat_sum_t &delta_stats)
4145 {
4146 info.stats.stats.add(delta_stats);
4147 info.stats.stats.floor(0);
4148
4149 for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
4150 i != get_backfill_targets().end();
4151 ++i) {
4152 pg_shard_t bt = *i;
4153 pg_info_t& pinfo = peer_info[bt];
4154 if (soid <= pinfo.last_backfill)
4155 pinfo.stats.stats.add(delta_stats);
4156 }
4157 }
4158
4159 void PeeringState::update_complete_backfill_object_stats(
4160 const hobject_t &hoid,
4161 const pg_stat_t &stats)
4162 {
4163 for (auto &&bt: get_backfill_targets()) {
4164 pg_info_t& pinfo = peer_info[bt];
4165 // Add stats to all peers that were missing the object
4166 if (hoid > pinfo.last_backfill)
4167 pinfo.stats.add(stats);
4168 }
4169 }
4170
4171 void PeeringState::update_peer_last_backfill(
4172 pg_shard_t peer,
4173 const hobject_t &new_last_backfill)
4174 {
4175 pg_info_t &pinfo = peer_info[peer];
4176 pinfo.last_backfill = new_last_backfill;
4177 if (new_last_backfill.is_max()) {
4178 /* pinfo.stats might be wrong if we did log-based recovery on the
4179 * backfilled portion in addition to continuing backfill.
4180 */
4181 pinfo.stats = info.stats;
4182 }
4183 }
4184
4185 void PeeringState::set_revert_with_targets(
4186 const hobject_t &soid,
4187 const set<pg_shard_t> &good_peers)
4188 {
4189 for (auto &&peer: good_peers) {
4190 missing_loc.add_location(soid, peer);
4191 }
4192 }
4193
4194 void PeeringState::prepare_backfill_for_missing(
4195 const hobject_t &soid,
4196 const eversion_t &version,
4197 const vector<pg_shard_t> &targets) {
4198 for (auto &&peer: targets) {
4199 peer_missing[peer].add(soid, version, eversion_t(), false);
4200 }
4201 }
4202
4203 void PeeringState::update_hset(const pg_hit_set_history_t &hset_history)
4204 {
4205 info.hit_set = hset_history;
4206 }
4207
4208 /*------------ Peering State Machine----------------*/
4209 #undef dout_prefix
4210 #define dout_prefix (context< PeeringMachine >().dpp->gen_prefix(*_dout) \
4211 << "state<" << get_state_name() << ">: ")
4212 #undef psdout
4213 #define psdout(x) ldout(context< PeeringMachine >().cct, x)
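// The states below are boost::statechart states. A react() handler
// returns transit<NextState>() to change state, discard_event() to
// consume the event, or forward_event() to defer it to the enclosing
// state.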
4214
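// Convenience macro: pull the PeeringState and PeeringListener out of
// the state machine context; the std::ignore assignments keep the
// compiler from warning in handlers that use only one of them.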
4215 #define DECLARE_LOCALS \
4216 PeeringState *ps = context< PeeringMachine >().state; \
4217 std::ignore = ps; \
4218 PeeringListener *pl = context< PeeringMachine >().pl; \
4219 std::ignore = pl
4220
4221
4222 /*------Crashed-------*/
4223 PeeringState::Crashed::Crashed(my_context ctx)
4224 : my_base(ctx),
4225 NamedState(context< PeeringMachine >().state_history, "Crashed")
4226 {
4227 context< PeeringMachine >().log_enter(state_name);
4228 ceph_abort_msg("we got a bad state machine event");
4229 }
4230
4231
4232 /*------Initial-------*/
4233 PeeringState::Initial::Initial(my_context ctx)
4234 : my_base(ctx),
4235 NamedState(context< PeeringMachine >().state_history, "Initial")
4236 {
4237 context< PeeringMachine >().log_enter(state_name);
4238 }
4239
4240 boost::statechart::result PeeringState::Initial::react(const MNotifyRec& notify)
4241 {
4242 DECLARE_LOCALS;
4243 ps->proc_replica_info(
4244 notify.from, notify.notify.info, notify.notify.epoch_sent);
4245 ps->set_last_peering_reset();
4246 return transit< Primary >();
4247 }
4248
4249 boost::statechart::result PeeringState::Initial::react(const MInfoRec& i)
4250 {
4251 DECLARE_LOCALS;
4252 ceph_assert(!ps->is_primary());
4253 post_event(i);
4254 return transit< Stray >();
4255 }
4256
4257 boost::statechart::result PeeringState::Initial::react(const MLogRec& i)
4258 {
4259 DECLARE_LOCALS;
4260 ceph_assert(!ps->is_primary());
4261 post_event(i);
4262 return transit< Stray >();
4263 }
4264
4265 void PeeringState::Initial::exit()
4266 {
4267 context< PeeringMachine >().log_exit(state_name, enter_time);
4268 DECLARE_LOCALS;
4269 utime_t dur = ceph_clock_now() - enter_time;
4270 pl->get_peering_perf().tinc(rs_initial_latency, dur);
4271 }
4272
4273 /*------Started-------*/
4274 PeeringState::Started::Started(my_context ctx)
4275 : my_base(ctx),
4276 NamedState(context< PeeringMachine >().state_history, "Started")
4277 {
4278 context< PeeringMachine >().log_enter(state_name);
4279 }
4280
4281 boost::statechart::result
4282 PeeringState::Started::react(const IntervalFlush&)
4283 {
4284 psdout(10) << "Ending blocked outgoing recovery messages" << dendl;
4285 context< PeeringMachine >().state->end_block_outgoing();
4286 return discard_event();
4287 }
4288
4289 boost::statechart::result PeeringState::Started::react(const AdvMap& advmap)
4290 {
4291 DECLARE_LOCALS;
4292 psdout(10) << "Started advmap" << dendl;
4293 ps->check_full_transition(advmap.lastmap, advmap.osdmap);
4294 if (ps->should_restart_peering(
4295 advmap.up_primary,
4296 advmap.acting_primary,
4297 advmap.newup,
4298 advmap.newacting,
4299 advmap.lastmap,
4300 advmap.osdmap)) {
4301 psdout(10) << "should_restart_peering, transitioning to Reset"
4302 << dendl;
4303 post_event(advmap);
4304 return transit< Reset >();
4305 }
4306 ps->remove_down_peer_info(advmap.osdmap);
4307 return discard_event();
4308 }
4309
4310 boost::statechart::result PeeringState::Started::react(const QueryState& q)
4311 {
4312 q.f->open_object_section("state");
4313 q.f->dump_string("name", state_name);
4314 q.f->dump_stream("enter_time") << enter_time;
4315 q.f->close_section();
4316 return discard_event();
4317 }
4318
4319 void PeeringState::Started::exit()
4320 {
4321 context< PeeringMachine >().log_exit(state_name, enter_time);
4322 DECLARE_LOCALS;
4323 utime_t dur = ceph_clock_now() - enter_time;
4324 pl->get_peering_perf().tinc(rs_started_latency, dur);
4325 ps->state_clear(PG_STATE_WAIT | PG_STATE_LAGGY);
4326 }
4327
4328 /*--------Reset---------*/
4329 PeeringState::Reset::Reset(my_context ctx)
4330 : my_base(ctx),
4331 NamedState(context< PeeringMachine >().state_history, "Reset")
4332 {
4333 context< PeeringMachine >().log_enter(state_name);
4334 DECLARE_LOCALS;
4335
4336 ps->flushes_in_progress = 0;
4337 ps->set_last_peering_reset();
4338 ps->log_weirdness();
4339 }
4340
4341 boost::statechart::result
4342 PeeringState::Reset::react(const IntervalFlush&)
4343 {
4344 psdout(10) << "Ending blocked outgoing recovery messages" << dendl;
4345 context< PeeringMachine >().state->end_block_outgoing();
4346 return discard_event();
4347 }
4348
4349 boost::statechart::result PeeringState::Reset::react(const AdvMap& advmap)
4350 {
4351 DECLARE_LOCALS;
4352 psdout(10) << "Reset advmap" << dendl;
4353
4354 ps->check_full_transition(advmap.lastmap, advmap.osdmap);
4355
4356 if (ps->should_restart_peering(
4357 advmap.up_primary,
4358 advmap.acting_primary,
4359 advmap.newup,
4360 advmap.newacting,
4361 advmap.lastmap,
4362 advmap.osdmap)) {
4363 psdout(10) << "should restart peering, calling start_peering_interval again"
4364 << dendl;
4365 ps->start_peering_interval(
4366 advmap.lastmap,
4367 advmap.newup, advmap.up_primary,
4368 advmap.newacting, advmap.acting_primary,
4369 context< PeeringMachine >().get_cur_transaction());
4370 }
4371 ps->remove_down_peer_info(advmap.osdmap);
4372 ps->check_past_interval_bounds();
4373 return discard_event();
4374 }
4375
4376 boost::statechart::result PeeringState::Reset::react(const ActMap&)
4377 {
4378 DECLARE_LOCALS;
4379 if (ps->should_send_notify() && ps->get_primary().osd >= 0) {
4380 ps->info.history.refresh_prior_readable_until_ub(
4381 pl->get_mnow(),
4382 ps->prior_readable_until_ub);
4383 context< PeeringMachine >().send_notify(
4384 ps->get_primary().osd,
4385 pg_notify_t(
4386 ps->get_primary().shard, ps->pg_whoami.shard,
4387 ps->get_osdmap_epoch(),
4388 ps->get_osdmap_epoch(),
4389 ps->info,
4390 ps->past_intervals));
4391 }
4392
4393 ps->update_heartbeat_peers();
4394
4395 return transit< Started >();
4396 }
4397
4398 boost::statechart::result PeeringState::Reset::react(const QueryState& q)
4399 {
4400 q.f->open_object_section("state");
4401 q.f->dump_string("name", state_name);
4402 q.f->dump_stream("enter_time") << enter_time;
4403 q.f->close_section();
4404 return discard_event();
4405 }
4406
4407 void PeeringState::Reset::exit()
4408 {
4409 context< PeeringMachine >().log_exit(state_name, enter_time);
4410 DECLARE_LOCALS;
4411 utime_t dur = ceph_clock_now() - enter_time;
4412 pl->get_peering_perf().tinc(rs_reset_latency, dur);
4413 }
4414
4415 /*-------Start---------*/
4416 PeeringState::Start::Start(my_context ctx)
4417 : my_base(ctx),
4418 NamedState(context< PeeringMachine >().state_history, "Start")
4419 {
4420 context< PeeringMachine >().log_enter(state_name);
4421
4422 DECLARE_LOCALS;
4423 if (ps->is_primary()) {
4424 psdout(1) << "transitioning to Primary" << dendl;
4425 post_event(MakePrimary());
4426 } else { // is_stray
4427 psdout(1) << "transitioning to Stray" << dendl;
4428 post_event(MakeStray());
4429 }
4430 }
4431
4432 void PeeringState::Start::exit()
4433 {
4434 context< PeeringMachine >().log_exit(state_name, enter_time);
4435 DECLARE_LOCALS;
4436 utime_t dur = ceph_clock_now() - enter_time;
4437 pl->get_peering_perf().tinc(rs_start_latency, dur);
4438 }
4439
4440 /*---------Primary--------*/
4441 PeeringState::Primary::Primary(my_context ctx)
4442 : my_base(ctx),
4443 NamedState(context< PeeringMachine >().state_history, "Started/Primary")
4444 {
4445 context< PeeringMachine >().log_enter(state_name);
4446 DECLARE_LOCALS;
4447 ceph_assert(ps->want_acting.empty());
4448
4449 // set CREATING bit until we have peered for the first time.
4450 if (ps->info.history.last_epoch_started == 0) {
4451 ps->state_set(PG_STATE_CREATING);
4452 // use the history timestamp, which ultimately comes from the
4453 // monitor in the create case.
4454 utime_t t = ps->info.history.last_scrub_stamp;
4455 ps->info.stats.last_fresh = t;
4456 ps->info.stats.last_active = t;
4457 ps->info.stats.last_change = t;
4458 ps->info.stats.last_peered = t;
4459 ps->info.stats.last_clean = t;
4460 ps->info.stats.last_unstale = t;
4461 ps->info.stats.last_undegraded = t;
4462 ps->info.stats.last_fullsized = t;
4463 ps->info.stats.last_scrub_stamp = t;
4464 ps->info.stats.last_deep_scrub_stamp = t;
4465 ps->info.stats.last_clean_scrub_stamp = t;
4466 }
4467 }
4468
4469 boost::statechart::result PeeringState::Primary::react(const MNotifyRec& notevt)
4470 {
4471 DECLARE_LOCALS;
4472 psdout(7) << "handle_pg_notify from osd." << notevt.from << dendl;
4473 ps->proc_replica_info(
4474 notevt.from, notevt.notify.info, notevt.notify.epoch_sent);
4475 return discard_event();
4476 }
4477
4478 boost::statechart::result PeeringState::Primary::react(const ActMap&)
4479 {
4480 DECLARE_LOCALS;
4481 psdout(7) << "handle ActMap primary" << dendl;
4482 pl->publish_stats_to_osd();
4483 return discard_event();
4484 }
4485
4486 boost::statechart::result PeeringState::Primary::react(
4487 const SetForceRecovery&)
4488 {
4489 DECLARE_LOCALS;
4490 ps->set_force_recovery(true);
4491 return discard_event();
4492 }
4493
4494 boost::statechart::result PeeringState::Primary::react(
4495 const UnsetForceRecovery&)
4496 {
4497 DECLARE_LOCALS;
4498 ps->set_force_recovery(false);
4499 return discard_event();
4500 }
4501
4502 boost::statechart::result PeeringState::Primary::react(
4503 const RequestScrub& evt)
4504 {
4505 DECLARE_LOCALS;
4506 if (ps->is_primary()) {
4507 pl->scrub_requested(evt.deep, evt.repair);
4508 psdout(10) << "marking for scrub" << dendl;
4509 }
4510 return discard_event();
4511 }
4512
4513 boost::statechart::result PeeringState::Primary::react(
4514 const SetForceBackfill&)
4515 {
4516 DECLARE_LOCALS;
4517 ps->set_force_backfill(true);
4518 return discard_event();
4519 }
4520
4521 boost::statechart::result PeeringState::Primary::react(
4522 const UnsetForceBackfill&)
4523 {
4524 DECLARE_LOCALS;
4525 ps->set_force_backfill(false);
4526 return discard_event();
4527 }
4528
4529 void PeeringState::Primary::exit()
4530 {
4531 context< PeeringMachine >().log_exit(state_name, enter_time);
4532 DECLARE_LOCALS;
4533 ps->want_acting.clear();
4534 utime_t dur = ceph_clock_now() - enter_time;
4535 pl->get_peering_perf().tinc(rs_primary_latency, dur);
4536 pl->clear_primary_state();
4537 ps->state_clear(PG_STATE_CREATING);
4538 }
4539
4540 /*---------Peering--------*/
4541 PeeringState::Peering::Peering(my_context ctx)
4542 : my_base(ctx),
4543 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering"),
4544 history_les_bound(false)
4545 {
4546 context< PeeringMachine >().log_enter(state_name);
4547 DECLARE_LOCALS;
4548
4549 ceph_assert(!ps->is_peered());
4550 ceph_assert(!ps->is_peering());
4551 ceph_assert(ps->is_primary());
4552 ps->state_set(PG_STATE_PEERING);
4553 }
4554
4555 boost::statechart::result PeeringState::Peering::react(const AdvMap& advmap)
4556 {
4557 DECLARE_LOCALS;
4558 psdout(10) << "Peering advmap" << dendl;
4559 if (prior_set.affected_by_map(*(advmap.osdmap), ps->dpp)) {
4560 psdout(1) << "Peering, affected_by_map, going to Reset" << dendl;
4561 post_event(advmap);
4562 return transit< Reset >();
4563 }
4564
4565 ps->adjust_need_up_thru(advmap.osdmap);
4566 ps->check_prior_readable_down_osds(advmap.osdmap);
4567
4568 return forward_event();
4569 }
4570
4571 boost::statechart::result PeeringState::Peering::react(const QueryState& q)
4572 {
4573 DECLARE_LOCALS;
4574
4575 q.f->open_object_section("state");
4576 q.f->dump_string("name", state_name);
4577 q.f->dump_stream("enter_time") << enter_time;
4578
4579 q.f->open_array_section("past_intervals");
4580 ps->past_intervals.dump(q.f);
4581 q.f->close_section();
4582
4583 q.f->open_array_section("probing_osds");
4584 for (set<pg_shard_t>::iterator p = prior_set.probe.begin();
4585 p != prior_set.probe.end();
4586 ++p)
4587 q.f->dump_stream("osd") << *p;
4588 q.f->close_section();
4589
4590 if (prior_set.pg_down)
4591 q.f->dump_string("blocked", "peering is blocked due to down osds");
4592
4593 q.f->open_array_section("down_osds_we_would_probe");
4594 for (set<int>::iterator p = prior_set.down.begin();
4595 p != prior_set.down.end();
4596 ++p)
4597 q.f->dump_int("osd", *p);
4598 q.f->close_section();
4599
4600 q.f->open_array_section("peering_blocked_by");
4601 for (map<int,epoch_t>::iterator p = prior_set.blocked_by.begin();
4602 p != prior_set.blocked_by.end();
4603 ++p) {
4604 q.f->open_object_section("osd");
4605 q.f->dump_int("osd", p->first);
4606 q.f->dump_int("current_lost_at", p->second);
4607 q.f->dump_string("comment", "starting or marking this osd lost may let us proceed");
4608 q.f->close_section();
4609 }
4610 q.f->close_section();
4611
4612 if (history_les_bound) {
4613 q.f->open_array_section("peering_blocked_by_detail");
4614 q.f->open_object_section("item");
4615 q.f->dump_string("detail","peering_blocked_by_history_les_bound");
4616 q.f->close_section();
4617 q.f->close_section();
4618 }
4619
4620 q.f->close_section();
4621 return forward_event();
4622 }
4623
4624 void PeeringState::Peering::exit()
4625 {
4626
4627 DECLARE_LOCALS;
4628 psdout(10) << "Leaving Peering" << dendl;
4629 context< PeeringMachine >().log_exit(state_name, enter_time);
4630 ps->state_clear(PG_STATE_PEERING);
4631 pl->clear_probe_targets();
4632
4633 utime_t dur = ceph_clock_now() - enter_time;
4634 pl->get_peering_perf().tinc(rs_peering_latency, dur);
4635 }
4636
4637
4638 /*------Backfilling-------*/
4639 PeeringState::Backfilling::Backfilling(my_context ctx)
4640 : my_base(ctx),
4641 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/Backfilling")
4642 {
4643 context< PeeringMachine >().log_enter(state_name);
4644
4645
4646 DECLARE_LOCALS;
4647 ps->backfill_reserved = true;
4648 pl->on_backfill_reserved();
4649 ps->state_clear(PG_STATE_BACKFILL_TOOFULL);
4650 ps->state_clear(PG_STATE_BACKFILL_WAIT);
4651 ps->state_set(PG_STATE_BACKFILLING);
4652 pl->publish_stats_to_osd();
4653 }
4654
4655 void PeeringState::Backfilling::backfill_release_reservations()
4656 {
4657 DECLARE_LOCALS;
4658 pl->cancel_local_background_io_reservation();
4659 for (set<pg_shard_t>::iterator it = ps->backfill_targets.begin();
4660 it != ps->backfill_targets.end();
4661 ++it) {
4662 ceph_assert(*it != ps->pg_whoami);
4663 pl->send_cluster_message(
4664 it->osd,
4665 new MBackfillReserve(
4666 MBackfillReserve::RELEASE,
4667 spg_t(ps->info.pgid.pgid, it->shard),
4668 ps->get_osdmap_epoch()),
4669 ps->get_osdmap_epoch());
4670 }
4671 }
4672
4673 void PeeringState::Backfilling::cancel_backfill()
4674 {
4675 DECLARE_LOCALS;
4676 backfill_release_reservations();
4677 pl->on_backfill_canceled();
4678 }
4679
4680 boost::statechart::result
4681 PeeringState::Backfilling::react(const Backfilled &c)
4682 {
4683 backfill_release_reservations();
4684 return transit<Recovered>();
4685 }
4686
4687 boost::statechart::result
4688 PeeringState::Backfilling::react(const DeferBackfill &c)
4689 {
4690 DECLARE_LOCALS;
4691
4692 psdout(10) << "defer backfill, retry delay " << c.delay << dendl;
4693 ps->state_set(PG_STATE_BACKFILL_WAIT);
4694 ps->state_clear(PG_STATE_BACKFILLING);
4695 cancel_backfill();
4696
4697 pl->schedule_event_after(
4698 std::make_shared<PGPeeringEvent>(
4699 ps->get_osdmap_epoch(),
4700 ps->get_osdmap_epoch(),
4701 RequestBackfill()),
4702 c.delay);
4703 return transit<NotBackfilling>();
4704 }
4705
4706 boost::statechart::result
4707 PeeringState::Backfilling::react(const UnfoundBackfill &c)
4708 {
4709 DECLARE_LOCALS;
4710 psdout(10) << "backfill has unfound, can't continue" << dendl;
4711 ps->state_set(PG_STATE_BACKFILL_UNFOUND);
4712 ps->state_clear(PG_STATE_BACKFILLING);
4713 cancel_backfill();
4714 return transit<NotBackfilling>();
4715 }
4716
4717 boost::statechart::result
4718 PeeringState::Backfilling::react(const RemoteReservationRevokedTooFull &)
4719 {
4720 DECLARE_LOCALS;
4721
4722 ps->state_set(PG_STATE_BACKFILL_TOOFULL);
4723 ps->state_clear(PG_STATE_BACKFILLING);
4724 cancel_backfill();
4725
4726 pl->schedule_event_after(
4727 std::make_shared<PGPeeringEvent>(
4728 ps->get_osdmap_epoch(),
4729 ps->get_osdmap_epoch(),
4730 RequestBackfill()),
4731 ps->cct->_conf->osd_backfill_retry_interval);
4732
4733 return transit<NotBackfilling>();
4734 }
4735
4736 boost::statechart::result
4737 PeeringState::Backfilling::react(const RemoteReservationRevoked &)
4738 {
4739 DECLARE_LOCALS;
4740 ps->state_set(PG_STATE_BACKFILL_WAIT);
4741 cancel_backfill();
4742 if (ps->needs_backfill()) {
4743 return transit<WaitLocalBackfillReserved>();
4744 } else {
4745 // raced with MOSDPGBackfill::OP_BACKFILL_FINISH, ignore
4746 return discard_event();
4747 }
4748 }
4749
4750 void PeeringState::Backfilling::exit()
4751 {
4752 context< PeeringMachine >().log_exit(state_name, enter_time);
4753 DECLARE_LOCALS;
4754 ps->backfill_reserved = false;
4755 ps->state_clear(PG_STATE_BACKFILLING);
4756 ps->state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
4757 utime_t dur = ceph_clock_now() - enter_time;
4758 pl->get_peering_perf().tinc(rs_backfilling_latency, dur);
4759 }
4760
4761 /*--WaitRemoteBackfillReserved--*/
4762
4763 PeeringState::WaitRemoteBackfillReserved::WaitRemoteBackfillReserved(my_context ctx)
4764 : my_base(ctx),
4765 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/WaitRemoteBackfillReserved"),
4766 backfill_osd_it(context< Active >().remote_shards_to_reserve_backfill.begin())
4767 {
4768 context< PeeringMachine >().log_enter(state_name);
4769 DECLARE_LOCALS;
4770
4771 ps->state_set(PG_STATE_BACKFILL_WAIT);
4772 pl->publish_stats_to_osd();
4773 post_event(RemoteBackfillReserved());
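// Seed the reservation chain: this self-posted event (and each GRANT
// received afterwards) triggers a request to the next backfill target,
// until AllBackfillsReserved is posted.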
4774 }
4775
4776 boost::statechart::result
4777 PeeringState::WaitRemoteBackfillReserved::react(const RemoteBackfillReserved &evt)
4778 {
4779 DECLARE_LOCALS;
4780
4781 int64_t num_bytes = ps->info.stats.stats.sum.num_bytes;
4782 psdout(10) << __func__ << " num_bytes " << num_bytes << dendl;
4783 if (backfill_osd_it !=
4784 context< Active >().remote_shards_to_reserve_backfill.end()) {
4785 // The primary never backfills itself
4786 ceph_assert(*backfill_osd_it != ps->pg_whoami);
4787 pl->send_cluster_message(
4788 backfill_osd_it->osd,
4789 new MBackfillReserve(
4790 MBackfillReserve::REQUEST,
4791 spg_t(context< PeeringMachine >().spgid.pgid, backfill_osd_it->shard),
4792 ps->get_osdmap_epoch(),
4793 ps->get_backfill_priority(),
4794 num_bytes,
4795 ps->peer_bytes[*backfill_osd_it]),
4796 ps->get_osdmap_epoch());
4797 ++backfill_osd_it;
4798 } else {
4799 ps->peer_bytes.clear();
4800 post_event(AllBackfillsReserved());
4801 }
4802 return discard_event();
4803 }
4804
4805 void PeeringState::WaitRemoteBackfillReserved::exit()
4806 {
4807 context< PeeringMachine >().log_exit(state_name, enter_time);
4808 DECLARE_LOCALS;
4809
4810 utime_t dur = ceph_clock_now() - enter_time;
4811 pl->get_peering_perf().tinc(rs_waitremotebackfillreserved_latency, dur);
4812 }
4813
4814 void PeeringState::WaitRemoteBackfillReserved::retry()
4815 {
4816 DECLARE_LOCALS;
4817 pl->cancel_local_background_io_reservation();
4818
4819 // Send RELEASE to all previously acquired reservations
4820 set<pg_shard_t>::const_iterator it, begin, end;
4821 begin = context< Active >().remote_shards_to_reserve_backfill.begin();
4822 end = context< Active >().remote_shards_to_reserve_backfill.end();
4823 ceph_assert(begin != end);
4824 for (it = begin; it != backfill_osd_it; ++it) {
4825 // The primary never backfills itself
4826 ceph_assert(*it != ps->pg_whoami);
4827 pl->send_cluster_message(
4828 it->osd,
4829 new MBackfillReserve(
4830 MBackfillReserve::RELEASE,
4831 spg_t(context< PeeringMachine >().spgid.pgid, it->shard),
4832 ps->get_osdmap_epoch()),
4833 ps->get_osdmap_epoch());
4834 }
4835
4836 ps->state_clear(PG_STATE_BACKFILL_WAIT);
4837 pl->publish_stats_to_osd();
4838
4839 pl->schedule_event_after(
4840 std::make_shared<PGPeeringEvent>(
4841 ps->get_osdmap_epoch(),
4842 ps->get_osdmap_epoch(),
4843 RequestBackfill()),
4844 ps->cct->_conf->osd_backfill_retry_interval);
4845 }
4846
4847 boost::statechart::result
4848 PeeringState::WaitRemoteBackfillReserved::react(const RemoteReservationRejectedTooFull &evt)
4849 {
4850 DECLARE_LOCALS;
4851 ps->state_set(PG_STATE_BACKFILL_TOOFULL);
4852 retry();
4853 return transit<NotBackfilling>();
4854 }
4855
4856 boost::statechart::result
4857 PeeringState::WaitRemoteBackfillReserved::react(const RemoteReservationRevoked &evt)
4858 {
4859 retry();
4860 return transit<NotBackfilling>();
4861 }
4862
4863 /*--WaitLocalBackfillReserved--*/
4864 PeeringState::WaitLocalBackfillReserved::WaitLocalBackfillReserved(my_context ctx)
4865 : my_base(ctx),
4866 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/WaitLocalBackfillReserved")
4867 {
4868 context< PeeringMachine >().log_enter(state_name);
4869 DECLARE_LOCALS;
4870
4871 ps->state_set(PG_STATE_BACKFILL_WAIT);
4872 pl->request_local_background_io_reservation(
4873 ps->get_backfill_priority(),
4874 std::make_shared<PGPeeringEvent>(
4875 ps->get_osdmap_epoch(),
4876 ps->get_osdmap_epoch(),
4877 LocalBackfillReserved()),
4878 std::make_shared<PGPeeringEvent>(
4879 ps->get_osdmap_epoch(),
4880 ps->get_osdmap_epoch(),
4881 DeferBackfill(0.0)));
4882 pl->publish_stats_to_osd();
4883 }
4884
4885 void PeeringState::WaitLocalBackfillReserved::exit()
4886 {
4887 context< PeeringMachine >().log_exit(state_name, enter_time);
4888 DECLARE_LOCALS;
4889 utime_t dur = ceph_clock_now() - enter_time;
4890 pl->get_peering_perf().tinc(rs_waitlocalbackfillreserved_latency, dur);
4891 }
4892
4893 /*----NotBackfilling------*/
4894 PeeringState::NotBackfilling::NotBackfilling(my_context ctx)
4895 : my_base(ctx),
4896 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/NotBackfilling")
4897 {
4898 context< PeeringMachine >().log_enter(state_name);
4899 DECLARE_LOCALS;
4900 ps->state_clear(PG_STATE_REPAIR);
4901 pl->publish_stats_to_osd();
4902 }
4903
4904 boost::statechart::result
4905 PeeringState::NotBackfilling::react(const RemoteBackfillReserved &evt)
4906 {
4907 return discard_event();
4908 }
4909
4910 boost::statechart::result
4911 PeeringState::NotBackfilling::react(const RemoteReservationRejectedTooFull &evt)
4912 {
4913 return discard_event();
4914 }
4915
4916 void PeeringState::NotBackfilling::exit()
4917 {
4918 context< PeeringMachine >().log_exit(state_name, enter_time);
4919
4920 DECLARE_LOCALS;
4921 ps->state_clear(PG_STATE_BACKFILL_UNFOUND);
4922 utime_t dur = ceph_clock_now() - enter_time;
4923 pl->get_peering_perf().tinc(rs_notbackfilling_latency, dur);
4924 }
4925
4926 /*----NotRecovering------*/
4927 PeeringState::NotRecovering::NotRecovering(my_context ctx)
4928 : my_base(ctx),
4929 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/NotRecovering")
4930 {
4931 context< PeeringMachine >().log_enter(state_name);
4932 DECLARE_LOCALS;
4933 ps->state_clear(PG_STATE_REPAIR);
4934 pl->publish_stats_to_osd();
4935 }
4936
4937 void PeeringState::NotRecovering::exit()
4938 {
4939 context< PeeringMachine >().log_exit(state_name, enter_time);
4940
4941 DECLARE_LOCALS;
4942 ps->state_clear(PG_STATE_RECOVERY_UNFOUND);
4943 utime_t dur = ceph_clock_now() - enter_time;
4944 pl->get_peering_perf().tinc(rs_notrecovering_latency, dur);
4945 }
4946
4947 /*---RepNotRecovering----*/
4948 PeeringState::RepNotRecovering::RepNotRecovering(my_context ctx)
4949 : my_base(ctx),
4950 NamedState(context< PeeringMachine >().state_history, "Started/ReplicaActive/RepNotRecovering")
4951 {
4952 context< PeeringMachine >().log_enter(state_name);
4953 }
4954
4955 boost::statechart::result
4956 PeeringState::RepNotRecovering::react(const RejectTooFullRemoteReservation &evt)
4957 {
4958 DECLARE_LOCALS;
4959 ps->reject_reservation();
4960 post_event(RemoteReservationRejectedTooFull());
4961 return discard_event();
4962 }
4963
4964 void PeeringState::RepNotRecovering::exit()
4965 {
4966 context< PeeringMachine >().log_exit(state_name, enter_time);
4967 DECLARE_LOCALS;
4968 utime_t dur = ceph_clock_now() - enter_time;
4969 pl->get_peering_perf().tinc(rs_repnotrecovering_latency, dur);
4970 }
4971
4972 /*---RepWaitRecoveryReserved--*/
4973 PeeringState::RepWaitRecoveryReserved::RepWaitRecoveryReserved(my_context ctx)
4974 : my_base(ctx),
4975 NamedState(context< PeeringMachine >().state_history, "Started/ReplicaActive/RepWaitRecoveryReserved")
4976 {
4977 context< PeeringMachine >().log_enter(state_name);
4978 }
4979
4980 boost::statechart::result
4981 PeeringState::RepWaitRecoveryReserved::react(const RemoteRecoveryReserved &evt)
4982 {
4983 DECLARE_LOCALS;
4984 pl->send_cluster_message(
4985 ps->primary.osd,
4986 new MRecoveryReserve(
4987 MRecoveryReserve::GRANT,
4988 spg_t(ps->info.pgid.pgid, ps->primary.shard),
4989 ps->get_osdmap_epoch()),
4990 ps->get_osdmap_epoch());
4991 return transit<RepRecovering>();
4992 }
4993
4994 boost::statechart::result
4995 PeeringState::RepWaitRecoveryReserved::react(
4996 const RemoteReservationCanceled &evt)
4997 {
4998 DECLARE_LOCALS;
4999 pl->unreserve_recovery_space();
5000
5001 pl->cancel_remote_recovery_reservation();
5002 return transit<RepNotRecovering>();
5003 }
5004
5005 void PeeringState::RepWaitRecoveryReserved::exit()
5006 {
5007 context< PeeringMachine >().log_exit(state_name, enter_time);
5008 DECLARE_LOCALS;
5009 utime_t dur = ceph_clock_now() - enter_time;
5010 pl->get_peering_perf().tinc(rs_repwaitrecoveryreserved_latency, dur);
5011 }
5012
5013 /*-RepWaitBackfillReserved*/
5014 PeeringState::RepWaitBackfillReserved::RepWaitBackfillReserved(my_context ctx)
5015 : my_base(ctx),
5016 NamedState(context< PeeringMachine >().state_history, "Started/ReplicaActive/RepWaitBackfillReserved")
5017 {
5018 context< PeeringMachine >().log_enter(state_name);
5019 }
5020
5021 boost::statechart::result
5022 PeeringState::RepNotRecovering::react(const RequestBackfillPrio &evt)
5023 {
5024
5025 DECLARE_LOCALS;
5026
5027 if (!pl->try_reserve_recovery_space(
5028 evt.primary_num_bytes, evt.local_num_bytes)) {
5029 post_event(RejectTooFullRemoteReservation());
5030 } else {
5031 PGPeeringEventRef preempt;
5032 if (HAVE_FEATURE(ps->upacting_features, RECOVERY_RESERVATION_2)) {
5033 // older peers will interpret preemption as TOOFULL
5034 preempt = std::make_shared<PGPeeringEvent>(
5035 pl->get_osdmap_epoch(),
5036 pl->get_osdmap_epoch(),
5037 RemoteBackfillPreempted());
5038 }
5039 pl->request_remote_recovery_reservation(
5040 evt.priority,
5041 std::make_shared<PGPeeringEvent>(
5042 pl->get_osdmap_epoch(),
5043 pl->get_osdmap_epoch(),
5044 RemoteBackfillReserved()),
5045 preempt);
5046 }
5047 return transit<RepWaitBackfillReserved>();
5048 }
5049
5050 boost::statechart::result
5051 PeeringState::RepNotRecovering::react(const RequestRecoveryPrio &evt)
5052 {
5053 DECLARE_LOCALS;
5054
5055 // fall back to a local reckoning of priority if the primary doesn't pass one
5056 // (pre-mimic compat)
5057 int prio = evt.priority ? evt.priority : ps->get_recovery_priority();
5058
5059 PGPeeringEventRef preempt;
5060 if (HAVE_FEATURE(ps->upacting_features, RECOVERY_RESERVATION_2)) {
5061 // older peers can't handle this
5062 preempt = std::make_shared<PGPeeringEvent>(
5063 ps->get_osdmap_epoch(),
5064 ps->get_osdmap_epoch(),
5065 RemoteRecoveryPreempted());
5066 }
5067
5068 pl->request_remote_recovery_reservation(
5069 prio,
5070 std::make_shared<PGPeeringEvent>(
5071 ps->get_osdmap_epoch(),
5072 ps->get_osdmap_epoch(),
5073 RemoteRecoveryReserved()),
5074 preempt);
5075 return transit<RepWaitRecoveryReserved>();
5076 }
5077
5078 void PeeringState::RepWaitBackfillReserved::exit()
5079 {
5080 context< PeeringMachine >().log_exit(state_name, enter_time);
5081 DECLARE_LOCALS;
5082 utime_t dur = ceph_clock_now() - enter_time;
5083 pl->get_peering_perf().tinc(rs_repwaitbackfillreserved_latency, dur);
5084 }
5085
5086 boost::statechart::result
5087 PeeringState::RepWaitBackfillReserved::react(const RemoteBackfillReserved &evt)
5088 {
5089 DECLARE_LOCALS;
5090
5091
5092 pl->send_cluster_message(
5093 ps->primary.osd,
5094 new MBackfillReserve(
5095 MBackfillReserve::GRANT,
5096 spg_t(ps->info.pgid.pgid, ps->primary.shard),
5097 ps->get_osdmap_epoch()),
5098 ps->get_osdmap_epoch());
5099 return transit<RepRecovering>();
5100 }
5101
5102 boost::statechart::result
5103 PeeringState::RepWaitBackfillReserved::react(
5104 const RejectTooFullRemoteReservation &evt)
5105 {
5106 DECLARE_LOCALS;
5107 ps->reject_reservation();
5108 post_event(RemoteReservationRejectedTooFull());
5109 return discard_event();
5110 }
5111
5112 boost::statechart::result
5113 PeeringState::RepWaitBackfillReserved::react(
5114 const RemoteReservationRejectedTooFull &evt)
5115 {
5116 DECLARE_LOCALS;
5117 pl->unreserve_recovery_space();
5118
5119 pl->cancel_remote_recovery_reservation();
5120 return transit<RepNotRecovering>();
5121 }
5122
5123 boost::statechart::result
5124 PeeringState::RepWaitBackfillReserved::react(
5125 const RemoteReservationCanceled &evt)
5126 {
5127 DECLARE_LOCALS;
5128 pl->unreserve_recovery_space();
5129
5130 pl->cancel_remote_recovery_reservation();
5131 return transit<RepNotRecovering>();
5132 }
5133
5134 /*---RepRecovering-------*/
5135 PeeringState::RepRecovering::RepRecovering(my_context ctx)
5136 : my_base(ctx),
5137 NamedState(context< PeeringMachine >().state_history, "Started/ReplicaActive/RepRecovering")
5138 {
5139 context< PeeringMachine >().log_enter(state_name);
5140 }
5141
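// While we are granting recovery or backfill to the primary, three
// events can revoke the grant: RemoteRecoveryPreempted and
// RemoteBackfillPreempted (a higher-priority PG preempted us) and
// BackfillTooFull (we ran out of space mid-backfill). In all cases we
// free the reserved space and send a REVOKE (or REVOKE_TOOFULL) back
// to the primary, which is then expected to re-request later.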
5142 boost::statechart::result
5143 PeeringState::RepRecovering::react(const RemoteRecoveryPreempted &)
5144 {
5145 DECLARE_LOCALS;
5146
5147
5148 pl->unreserve_recovery_space();
5149 pl->send_cluster_message(
5150 ps->primary.osd,
5151 new MRecoveryReserve(
5152 MRecoveryReserve::REVOKE,
5153 spg_t(ps->info.pgid.pgid, ps->primary.shard),
5154 ps->get_osdmap_epoch()),
5155 ps->get_osdmap_epoch());
5156 return discard_event();
5157 }
5158
5159 boost::statechart::result
5160 PeeringState::RepRecovering::react(const BackfillTooFull &)
5161 {
5162 DECLARE_LOCALS;
5163
5164
5165 pl->unreserve_recovery_space();
5166 pl->send_cluster_message(
5167 ps->primary.osd,
5168 new MBackfillReserve(
5169 MBackfillReserve::REVOKE_TOOFULL,
5170 spg_t(ps->info.pgid.pgid, ps->primary.shard),
5171 ps->get_osdmap_epoch()),
5172 ps->get_osdmap_epoch());
5173 return discard_event();
5174 }
5175
5176 boost::statechart::result
5177 PeeringState::RepRecovering::react(const RemoteBackfillPreempted &)
5178 {
5179 DECLARE_LOCALS;
5180
5181
5182 pl->unreserve_recovery_space();
5183 pl->send_cluster_message(
5184 ps->primary.osd,
5185 new MBackfillReserve(
5186 MBackfillReserve::REVOKE,
5187 spg_t(ps->info.pgid.pgid, ps->primary.shard),
5188 ps->get_osdmap_epoch()),
5189 ps->get_osdmap_epoch());
5190 return discard_event();
5191 }
5192
5193 void PeeringState::RepRecovering::exit()
5194 {
5195 context< PeeringMachine >().log_exit(state_name, enter_time);
5196 DECLARE_LOCALS;
5197 pl->unreserve_recovery_space();
5198
5199 pl->cancel_remote_recovery_reservation();
5200 utime_t dur = ceph_clock_now() - enter_time;
5201 pl->get_peering_perf().tinc(rs_reprecovering_latency, dur);
5202 }
5203
5204 /*------Activating--------*/
5205 PeeringState::Activating::Activating(my_context ctx)
5206 : my_base(ctx),
5207 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/Activating")
5208 {
5209 context< PeeringMachine >().log_enter(state_name);
5210 }
5211
5212 void PeeringState::Activating::exit()
5213 {
5214 context< PeeringMachine >().log_exit(state_name, enter_time);
5215 DECLARE_LOCALS;
5216 utime_t dur = ceph_clock_now() - enter_time;
5217 pl->get_peering_perf().tinc(rs_activating_latency, dur);
5218 }
5219
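// Reservations are acquired in two stages: the local background-io
// reservation first (this state), then each remote shard in turn
// (WaitRemoteRecoveryReserved). Taking them in a fixed order should
// help keep competing PGs from deadlocking on partially acquired
// reservation sets.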
5220 PeeringState::WaitLocalRecoveryReserved::WaitLocalRecoveryReserved(my_context ctx)
5221 : my_base(ctx),
5222 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/WaitLocalRecoveryReserved")
5223 {
5224 context< PeeringMachine >().log_enter(state_name);
5225 DECLARE_LOCALS;
5226
5227 // Make sure all nodes that are part of the recovery aren't full
5228 if (!ps->cct->_conf->osd_debug_skip_full_check_in_recovery &&
5229 ps->get_osdmap()->check_full(ps->acting_recovery_backfill)) {
5230 post_event(RecoveryTooFull());
5231 return;
5232 }
5233
5234 ps->state_clear(PG_STATE_RECOVERY_TOOFULL);
5235 ps->state_set(PG_STATE_RECOVERY_WAIT);
5236 pl->request_local_background_io_reservation(
5237 ps->get_recovery_priority(),
5238 std::make_shared<PGPeeringEvent>(
5239 ps->get_osdmap_epoch(),
5240 ps->get_osdmap_epoch(),
5241 LocalRecoveryReserved()),
5242 std::make_shared<PGPeeringEvent>(
5243 ps->get_osdmap_epoch(),
5244 ps->get_osdmap_epoch(),
5245 DeferRecovery(0.0)));
5246 pl->publish_stats_to_osd();
5247 }
5248
5249 boost::statechart::result
5250 PeeringState::WaitLocalRecoveryReserved::react(const RecoveryTooFull &evt)
5251 {
5252 DECLARE_LOCALS;
5253 ps->state_set(PG_STATE_RECOVERY_TOOFULL);
5254 pl->schedule_event_after(
5255 std::make_shared<PGPeeringEvent>(
5256 ps->get_osdmap_epoch(),
5257 ps->get_osdmap_epoch(),
5258 DoRecovery()),
5259 ps->cct->_conf->osd_recovery_retry_interval);
5260 return transit<NotRecovering>();
5261 }
5262
5263 void PeeringState::WaitLocalRecoveryReserved::exit()
5264 {
5265 context< PeeringMachine >().log_exit(state_name, enter_time);
5266 DECLARE_LOCALS;
5267 utime_t dur = ceph_clock_now() - enter_time;
5268 pl->get_peering_perf().tinc(rs_waitlocalrecoveryreserved_latency, dur);
5269 }
5270
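// Remote reservations are requested one peer at a time: the
// constructor posts a synthetic RemoteRecoveryReserved to prime the
// pump, and each grant advances remote_recovery_reservation_it and
// sends the next MRecoveryReserve::REQUEST until AllRemotesReserved
// fires.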
5271 PeeringState::WaitRemoteRecoveryReserved::WaitRemoteRecoveryReserved(my_context ctx)
5272 : my_base(ctx),
5273 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/WaitRemoteRecoveryReserved"),
5274 remote_recovery_reservation_it(context< Active >().remote_shards_to_reserve_recovery.begin())
5275 {
5276 context< PeeringMachine >().log_enter(state_name);
5277 post_event(RemoteRecoveryReserved());
5278 }
5279
5280 boost::statechart::result
5281 PeeringState::WaitRemoteRecoveryReserved::react(const RemoteRecoveryReserved &evt) {
5282 DECLARE_LOCALS;
5283
5284 if (remote_recovery_reservation_it !=
5285 context< Active >().remote_shards_to_reserve_recovery.end()) {
5286 ceph_assert(*remote_recovery_reservation_it != ps->pg_whoami);
5287 pl->send_cluster_message(
5288 remote_recovery_reservation_it->osd,
5289 new MRecoveryReserve(
5290 MRecoveryReserve::REQUEST,
5291 spg_t(context< PeeringMachine >().spgid.pgid,
5292 remote_recovery_reservation_it->shard),
5293 ps->get_osdmap_epoch(),
5294 ps->get_recovery_priority()),
5295 ps->get_osdmap_epoch());
5296 ++remote_recovery_reservation_it;
5297 } else {
5298 post_event(AllRemotesReserved());
5299 }
5300 return discard_event();
5301 }
5302
5303 void PeeringState::WaitRemoteRecoveryReserved::exit()
5304 {
5305 context< PeeringMachine >().log_exit(state_name, enter_time);
5306 DECLARE_LOCALS;
5307 utime_t dur = ceph_clock_now() - enter_time;
5308 pl->get_peering_perf().tinc(rs_waitremoterecoveryreserved_latency, dur);
5309 }
5310
5311 PeeringState::Recovering::Recovering(my_context ctx)
5312 : my_base(ctx),
5313 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/Recovering")
5314 {
5315 context< PeeringMachine >().log_enter(state_name);
5316
5317 DECLARE_LOCALS;
5318 ps->state_clear(PG_STATE_RECOVERY_WAIT);
5319 ps->state_clear(PG_STATE_RECOVERY_TOOFULL);
5320 ps->state_set(PG_STATE_RECOVERING);
5321 pl->on_recovery_reserved();
5322 ceph_assert(!ps->state_test(PG_STATE_ACTIVATING));
5323 pl->publish_stats_to_osd();
5324 }
5325
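// Send MRecoveryReserve::RELEASE to every remote shard we reserved,
// skipping ourselves. cancel=true is used on the abort paths
// (DeferRecovery, UnfoundRecovery), where objects may still be
// missing; otherwise we assert that recovery really finished.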
5326 void PeeringState::Recovering::release_reservations(bool cancel)
5327 {
5328 DECLARE_LOCALS;
5329 ceph_assert(cancel || !ps->pg_log.get_missing().have_missing());
5330
5331 // release remote reservations
5332 for (set<pg_shard_t>::const_iterator i =
5333 context< Active >().remote_shards_to_reserve_recovery.begin();
5334 i != context< Active >().remote_shards_to_reserve_recovery.end();
5335 ++i) {
5336 if (*i == ps->pg_whoami) // skip myself
5337 continue;
5338 pl->send_cluster_message(
5339 i->osd,
5340 new MRecoveryReserve(
5341 MRecoveryReserve::RELEASE,
5342 spg_t(ps->info.pgid.pgid, i->shard),
5343 ps->get_osdmap_epoch()),
5344 ps->get_osdmap_epoch());
5345 }
5346 }
5347
5348 boost::statechart::result
5349 PeeringState::Recovering::react(const AllReplicasRecovered &evt)
5350 {
5351 DECLARE_LOCALS;
5352 ps->state_clear(PG_STATE_FORCED_RECOVERY);
5353 release_reservations();
5354 pl->cancel_local_background_io_reservation();
5355 return transit<Recovered>();
5356 }
5357
5358 boost::statechart::result
5359 PeeringState::Recovering::react(const RequestBackfill &evt)
5360 {
5361 DECLARE_LOCALS;
5362
5363 release_reservations();
5364
5365 ps->state_clear(PG_STATE_FORCED_RECOVERY);
5366 pl->cancel_local_background_io_reservation();
5367 pl->publish_stats_to_osd();
5368 // transition any async_recovery_targets back into acting
5369 // so the pg won't have to stay undersized for long,
5370 // as backfill might take a long time to complete.
5371 if (!ps->async_recovery_targets.empty()) {
5372 pg_shard_t auth_log_shard;
5373 bool history_les_bound = false;
5374 ps->choose_acting(auth_log_shard, true, &history_les_bound);
5375 }
5376 return transit<WaitLocalBackfillReserved>();
5377 }
5378
5379 boost::statechart::result
5380 PeeringState::Recovering::react(const DeferRecovery &evt)
5381 {
5382 DECLARE_LOCALS;
5383 if (!ps->state_test(PG_STATE_RECOVERING)) {
5384 // we may have finished recovery and have an AllReplicasRecovered
5385 // event queued to move us to the next state.
5386 psdout(10) << "got defer recovery but not recovering" << dendl;
5387 return discard_event();
5388 }
5389 psdout(10) << "defer recovery, retry delay " << evt.delay << dendl;
5390 ps->state_set(PG_STATE_RECOVERY_WAIT);
5391 pl->cancel_local_background_io_reservation();
5392 release_reservations(true);
5393 pl->schedule_event_after(
5394 std::make_shared<PGPeeringEvent>(
5395 ps->get_osdmap_epoch(),
5396 ps->get_osdmap_epoch(),
5397 DoRecovery()),
5398 evt.delay);
5399 return transit<NotRecovering>();
5400 }
5401
5402 boost::statechart::result
5403 PeeringState::Recovering::react(const UnfoundRecovery &evt)
5404 {
5405 DECLARE_LOCALS;
5406 psdout(10) << "recovery has unfound, can't continue" << dendl;
5407 ps->state_set(PG_STATE_RECOVERY_UNFOUND);
5408 pl->cancel_local_background_io_reservation();
5409 release_reservations(true);
5410 return transit<NotRecovering>();
5411 }
5412
5413 void PeeringState::Recovering::exit()
5414 {
5415 context< PeeringMachine >().log_exit(state_name, enter_time);
5416
5417 DECLARE_LOCALS;
5418 utime_t dur = ceph_clock_now() - enter_time;
5419 ps->state_clear(PG_STATE_RECOVERING);
5420 pl->get_peering_perf().tinc(rs_recovering_latency, dur);
5421 }
5422
5423 PeeringState::Recovered::Recovered(my_context ctx)
5424 : my_base(ctx),
5425 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/Recovered")
5426 {
5427 pg_shard_t auth_log_shard;
5428
5429 context< PeeringMachine >().log_enter(state_name);
5430
5431 DECLARE_LOCALS;
5432
5433 ceph_assert(!ps->needs_recovery());
5434
5435 // if we finished backfill, all acting are active; recheck if
5436 // DEGRADED | UNDERSIZED is appropriate.
5437 ceph_assert(!ps->acting_recovery_backfill.empty());
5438 if (ps->get_osdmap()->get_pg_size(context< PeeringMachine >().spgid.pgid) <=
5439 ps->acting_recovery_backfill.size()) {
5440 ps->state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
5441 pl->publish_stats_to_osd();
5442 }
5443
5444 // adjust acting set? (e.g. because backfill completed...)
5445 bool history_les_bound = false;
5446 if (ps->acting != ps->up && !ps->choose_acting(auth_log_shard,
5447 true, &history_les_bound)) {
5448 ceph_assert(ps->want_acting.size());
5449 } else if (!ps->async_recovery_targets.empty()) {
5450 ps->choose_acting(auth_log_shard, true, &history_les_bound);
5451 }
5452
5453 if (context< Active >().all_replicas_activated &&
5454 ps->async_recovery_targets.empty())
5455 post_event(GoClean());
5456 }
5457
5458 void PeeringState::Recovered::exit()
5459 {
5460 context< PeeringMachine >().log_exit(state_name, enter_time);
5461 DECLARE_LOCALS;
5462
5463 utime_t dur = ceph_clock_now() - enter_time;
5464 pl->get_peering_perf().tinc(rs_recovered_latency, dur);
5465 }
5466
5467 PeeringState::Clean::Clean(my_context ctx)
5468 : my_base(ctx),
5469 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/Clean")
5470 {
5471 context< PeeringMachine >().log_enter(state_name);
5472
5473 DECLARE_LOCALS;
5474
5475 if (ps->info.last_complete != ps->info.last_update) {
5476 ceph_abort();
5477 }
5478
5479
5480 ps->try_mark_clean();
5481
5482 context< PeeringMachine >().get_cur_transaction().register_on_commit(
5483 pl->on_clean());
5484 }
5485
5486 void PeeringState::Clean::exit()
5487 {
5488 context< PeeringMachine >().log_exit(state_name, enter_time);
5489
5490 DECLARE_LOCALS;
5491 ps->state_clear(PG_STATE_CLEAN);
5492 utime_t dur = ceph_clock_now() - enter_time;
5493 pl->get_peering_perf().tinc(rs_clean_latency, dur);
5494 }
5495
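// Collapse a set of pg shards down to at most one shard per OSD,
// skipping the given shard (callers pass pg_whoami). For erasure-coded
// pools one OSD may hold several shards of the same PG; deduplicating
// here means we only ask each OSD for one reservation.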
5496 template <typename T>
5497 set<pg_shard_t> unique_osd_shard_set(const pg_shard_t & skip, const T &in)
5498 {
5499 set<int> osds_found;
5500 set<pg_shard_t> out;
5501 for (typename T::const_iterator i = in.begin();
5502 i != in.end();
5503 ++i) {
5504 if (*i != skip && !osds_found.count(i->osd)) {
5505 osds_found.insert(i->osd);
5506 out.insert(*i);
5507 }
5508 }
5509 return out;
5510 }
5511
5512 /*---------Active---------*/
5513 PeeringState::Active::Active(my_context ctx)
5514 : my_base(ctx),
5515 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active"),
5516 remote_shards_to_reserve_recovery(
5517 unique_osd_shard_set(
5518 context< PeeringMachine >().state->pg_whoami,
5519 context< PeeringMachine >().state->acting_recovery_backfill)),
5520 remote_shards_to_reserve_backfill(
5521 unique_osd_shard_set(
5522 context< PeeringMachine >().state->pg_whoami,
5523 context< PeeringMachine >().state->backfill_targets)),
5524 all_replicas_activated(false)
5525 {
5526 context< PeeringMachine >().log_enter(state_name);
5527
5528
5529 DECLARE_LOCALS;
5530
5531 ceph_assert(!ps->backfill_reserved);
5532 ceph_assert(ps->is_primary());
5533 psdout(10) << "In Active, about to call activate" << dendl;
5534 ps->start_flush(context< PeeringMachine >().get_cur_transaction());
5535 ps->activate(context< PeeringMachine >().get_cur_transaction(),
5536 ps->get_osdmap_epoch(),
5537 context< PeeringMachine >().get_recovery_ctx());
5538
5539 // everyone has to commit/ack before we are truly active
5540 ps->blocked_by.clear();
5541 for (set<pg_shard_t>::iterator p = ps->acting_recovery_backfill.begin();
5542 p != ps->acting_recovery_backfill.end();
5543 ++p) {
5544 if (p->shard != ps->pg_whoami.shard) {
5545 ps->blocked_by.insert(p->shard);
5546 }
5547 }
5548 pl->publish_stats_to_osd();
5549 psdout(10) << "Activate Finished" << dendl;
5550 }
5551
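// Absorb osdmap updates that do not change the interval: re-share info
// when purged_snaps changed, recompute UNDERSIZED when the pool size
// changes, force a stats report if too many epochs have passed, and
// recheck readability when prior-interval down osds change state.
// Interval changes are forwarded on so the enclosing states can
// restart peering.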
5552 boost::statechart::result PeeringState::Active::react(const AdvMap& advmap)
5553 {
5554 DECLARE_LOCALS;
5555
5556 if (ps->should_restart_peering(
5557 advmap.up_primary,
5558 advmap.acting_primary,
5559 advmap.newup,
5560 advmap.newacting,
5561 advmap.lastmap,
5562 advmap.osdmap)) {
5563 psdout(10) << "Active advmap interval change, fast return" << dendl;
5564 return forward_event();
5565 }
5566 psdout(10) << "Active advmap" << dendl;
5567 bool need_publish = false;
5568
5569 pl->on_active_advmap(advmap.osdmap);
5570 if (ps->dirty_big_info) {
5571 // share updated purged_snaps with the mgr/mon so that we (a) stop reporting
5572 // already-purged snaps and (b) perhaps share more snaps that we have purged
5573 // but that didn't fit in pg_stat_t.
5574 need_publish = true;
5575 ps->share_pg_info();
5576 }
5577
5578 for (size_t i = 0; i < ps->want_acting.size(); i++) {
5579 int osd = ps->want_acting[i];
5580 if (!advmap.osdmap->is_up(osd)) {
5581 pg_shard_t osd_with_shard(osd, shard_id_t(i));
5582 ceph_assert(ps->is_acting(osd_with_shard) || ps->is_up(osd_with_shard));
5583 }
5584 }
5585
5586 /* Check for changes in pool size (if the acting set changed as a result,
5587 * this does not matter) */
5588 if (advmap.lastmap->get_pg_size(ps->info.pgid.pgid) !=
5589 ps->get_osdmap()->get_pg_size(ps->info.pgid.pgid)) {
5590 if (ps->get_osdmap()->get_pg_size(ps->info.pgid.pgid) <=
5591 ps->actingset.size()) {
5592 ps->state_clear(PG_STATE_UNDERSIZED);
5593 } else {
5594 ps->state_set(PG_STATE_UNDERSIZED);
5595 }
5596 // degraded changes will be detected by the call to publish_stats_to_osd()
5597 need_publish = true;
5598 }
5599
5600 // if we haven't reported our PG stats in a long time, do so now.
5601 if (ps->info.stats.reported_epoch + ps->cct->_conf->osd_pg_stat_report_interval_max < advmap.osdmap->get_epoch()) {
5602 psdout(20) << "reporting stats to osd after " << (advmap.osdmap->get_epoch() - ps->info.stats.reported_epoch)
5603 << " epochs" << dendl;
5604 need_publish = true;
5605 }
5606
5607 if (need_publish)
5608 pl->publish_stats_to_osd();
5609
5610 if (ps->check_prior_readable_down_osds(advmap.osdmap)) {
5611 pl->recheck_readable();
5612 }
5613
5614 return forward_event();
5615 }
5616
5617 boost::statechart::result PeeringState::Active::react(const ActMap&)
5618 {
5619 DECLARE_LOCALS;
5620 psdout(10) << "Active: handling ActMap" << dendl;
5621 ceph_assert(ps->is_primary());
5622
5623 pl->on_active_actmap();
5624
5625 if (ps->have_unfound()) {
5626 // object may have become unfound
5627 ps->discover_all_missing(context<PeeringMachine>().get_recovery_ctx().msgs);
5628 }
5629
5630 uint64_t unfound = ps->missing_loc.num_unfound();
5631 if (unfound > 0 &&
5632 ps->all_unfound_are_queried_or_lost(ps->get_osdmap())) {
5633 if (ps->cct->_conf->osd_auto_mark_unfound_lost) {
5634 pl->get_clog_error() << context< PeeringMachine >().spgid.pgid << " has " << unfound
5635 << " objects unfound and apparently lost, would automatically "
5636 << "mark these objects lost but this feature is not yet implemented "
5637 << "(osd_auto_mark_unfound_lost)";
5638 } else
5639 pl->get_clog_error() << context< PeeringMachine >().spgid.pgid << " has "
5640 << unfound << " objects unfound and apparently lost";
5641 }
5642
5643 return forward_event();
5644 }
5645
5646 boost::statechart::result PeeringState::Active::react(const MNotifyRec& notevt)
5647 {
5648
5649 DECLARE_LOCALS;
5650 ceph_assert(ps->is_primary());
5651 if (ps->peer_info.count(notevt.from)) {
5652 psdout(10) << "Active: got notify from " << notevt.from
5653 << ", already have info from that osd, ignoring"
5654 << dendl;
5655 } else if (ps->peer_purged.count(notevt.from)) {
5656 psdout(10) << "Active: got notify from " << notevt.from
5657 << ", already purged that peer, ignoring"
5658 << dendl;
5659 } else {
5660 psdout(10) << "Active: got notify from " << notevt.from
5661 << ", calling proc_replica_info and discover_all_missing"
5662 << dendl;
5663 ps->proc_replica_info(
5664 notevt.from, notevt.notify.info, notevt.notify.epoch_sent);
5665 if (ps->have_unfound() || (ps->is_degraded() && ps->might_have_unfound.count(notevt.from))) {
5666 ps->discover_all_missing(
5667 context<PeeringMachine>().get_recovery_ctx().msgs);
5668 }
5669 // check if it is a previously down acting member that's coming back.
5670 // if so, request pg_temp change to trigger a new interval transition
5671 pg_shard_t auth_log_shard;
5672 bool history_les_bound = false;
5673 ps->choose_acting(auth_log_shard, false, &history_les_bound, true);
5674 if (!ps->want_acting.empty() && ps->want_acting != ps->acting) {
5675 psdout(10) << "Active: got notify from previous acting member "
5676 << notevt.from << ", requesting pg_temp change"
5677 << dendl;
5678 }
5679 }
5680 return discard_event();
5681 }
5682
5683 boost::statechart::result PeeringState::Active::react(const MTrim& trim)
5684 {
5685 DECLARE_LOCALS;
5686 ceph_assert(ps->is_primary());
5687
5688 // peer is informing us of their last_complete_ondisk
5689 ldout(ps->cct,10) << " replica osd." << trim.from << " lcod " << trim.trim_to << dendl;
5690 ps->update_peer_last_complete_ondisk(pg_shard_t{trim.from, trim.shard},
5691 trim.trim_to);
5692 // trim log when the pg is recovered
5693 ps->calc_min_last_complete_ondisk();
5694 return discard_event();
5695 }
5696
5697 boost::statechart::result PeeringState::Active::react(const MInfoRec& infoevt)
5698 {
5699 DECLARE_LOCALS;
5700 ceph_assert(ps->is_primary());
5701
5702 ceph_assert(!ps->acting_recovery_backfill.empty());
5703 if (infoevt.lease_ack) {
5704 ps->proc_lease_ack(infoevt.from.osd, *infoevt.lease_ack);
5705 }
5706 // don't update history (yet) if we are active and primary; the replica
5707 // may be telling us they have activated (and committed) but we can't
5708 // share that until _everyone_ does the same.
5709 if (ps->is_acting_recovery_backfill(infoevt.from) &&
5710 ps->peer_activated.count(infoevt.from) == 0) {
5711 psdout(10) << " peer osd." << infoevt.from
5712 << " activated and committed" << dendl;
5713 ps->peer_activated.insert(infoevt.from);
5714 ps->blocked_by.erase(infoevt.from.shard);
5715 pl->publish_stats_to_osd();
5716 if (ps->peer_activated.size() == ps->acting_recovery_backfill.size()) {
5717 all_activated_and_committed();
5718 }
5719 }
5720 return discard_event();
5721 }
5722
5723 boost::statechart::result PeeringState::Active::react(const MLogRec& logevt)
5724 {
5725 DECLARE_LOCALS;
5726 psdout(10) << "searching osd." << logevt.from
5727 << " log for unfound items" << dendl;
5728 ps->proc_replica_log(
5729 logevt.msg->info, logevt.msg->log, logevt.msg->missing, logevt.from);
5730 bool got_missing = ps->search_for_missing(
5731 ps->peer_info[logevt.from],
5732 ps->peer_missing[logevt.from],
5733 logevt.from,
5734 context< PeeringMachine >().get_recovery_ctx());
5735 // If there are missing AND we are "fully" active then start recovery now
5736 if (got_missing && ps->state_test(PG_STATE_ACTIVE)) {
5737 post_event(DoRecovery());
5738 }
5739 return discard_event();
5740 }
5741
5742 boost::statechart::result PeeringState::Active::react(const QueryState& q)
5743 {
5744 DECLARE_LOCALS;
5745
5746 q.f->open_object_section("state");
5747 q.f->dump_string("name", state_name);
5748 q.f->dump_stream("enter_time") << enter_time;
5749
5750 {
5751 q.f->open_array_section("might_have_unfound");
5752 for (set<pg_shard_t>::iterator p = ps->might_have_unfound.begin();
5753 p != ps->might_have_unfound.end();
5754 ++p) {
5755 q.f->open_object_section("osd");
5756 q.f->dump_stream("osd") << *p;
5757 if (ps->peer_missing.count(*p)) {
5758 q.f->dump_string("status", "already probed");
5759 } else if (ps->peer_missing_requested.count(*p)) {
5760 q.f->dump_string("status", "querying");
5761 } else if (!ps->get_osdmap()->is_up(p->osd)) {
5762 q.f->dump_string("status", "osd is down");
5763 } else {
5764 q.f->dump_string("status", "not queried");
5765 }
5766 q.f->close_section();
5767 }
5768 q.f->close_section();
5769 }
5770 {
5771 q.f->open_object_section("recovery_progress");
5772 q.f->open_array_section("backfill_targets");
5773 for (set<pg_shard_t>::const_iterator p = ps->backfill_targets.begin();
5774 p != ps->backfill_targets.end(); ++p)
5775 q.f->dump_stream("replica") << *p;
5776 q.f->close_section();
5777 pl->dump_recovery_info(q.f);
5778 q.f->close_section();
5779 }
5780
5781 q.f->close_section();
5782 return forward_event();
5783 }
5784
5785 boost::statechart::result PeeringState::Active::react(
5786 const ActivateCommitted &evt)
5787 {
5788 DECLARE_LOCALS;
5789 ceph_assert(!ps->peer_activated.count(ps->pg_whoami));
5790 ps->peer_activated.insert(ps->pg_whoami);
5791 psdout(10) << "_activate_committed " << evt.epoch
5792 << " peer_activated now " << ps->peer_activated
5793 << " last_interval_started "
5794 << ps->info.history.last_interval_started
5795 << " last_epoch_started "
5796 << ps->info.history.last_epoch_started
5797 << " same_interval_since "
5798 << ps->info.history.same_interval_since
5799 << dendl;
5800 ceph_assert(!ps->acting_recovery_backfill.empty());
5801 if (ps->peer_activated.size() == ps->acting_recovery_backfill.size())
5802 all_activated_and_committed();
5803 return discard_event();
5804 }
5805
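// Everyone has activated and committed; pick the externally visible
// state: PEERED (+PREMERGE) while a merge is pending, PEERED when the
// acting set is below min_size, otherwise ACTIVE. With octopus read
// leases we may also set PG_STATE_WAIT until the prior interval's
// readable_until upper bound has passed.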
5806 boost::statechart::result PeeringState::Active::react(const AllReplicasActivated &evt)
5807 {
5808
5809 DECLARE_LOCALS;
5810 pg_t pgid = context< PeeringMachine >().spgid.pgid;
5811
5812 all_replicas_activated = true;
5813
5814 ps->state_clear(PG_STATE_ACTIVATING);
5815 ps->state_clear(PG_STATE_CREATING);
5816 ps->state_clear(PG_STATE_PREMERGE);
5817
5818 bool merge_target;
5819 if (ps->pool.info.is_pending_merge(pgid, &merge_target)) {
5820 ps->state_set(PG_STATE_PEERED);
5821 ps->state_set(PG_STATE_PREMERGE);
5822
5823 if (ps->actingset.size() != ps->get_osdmap()->get_pg_size(pgid)) {
5824 if (merge_target) {
5825 pg_t src = pgid;
5826 src.set_ps(ps->pool.info.get_pg_num_pending());
5827 assert(src.get_parent() == pgid);
5828 pl->set_not_ready_to_merge_target(pgid, src);
5829 } else {
5830 pl->set_not_ready_to_merge_source(pgid);
5831 }
5832 }
5833 } else if (ps->acting.size() < ps->pool.info.min_size) {
5834 ps->state_set(PG_STATE_PEERED);
5835 } else {
5836 ps->state_set(PG_STATE_ACTIVE);
5837 }
5838
5839 auto mnow = pl->get_mnow();
5840 if (ps->prior_readable_until_ub > mnow) {
5841 psdout(10) << " waiting for prior_readable_until_ub "
5842 << ps->prior_readable_until_ub << " > mnow " << mnow << dendl;
5843 ps->state_set(PG_STATE_WAIT);
5844 pl->queue_check_readable(
5845 ps->last_peering_reset,
5846 ps->prior_readable_until_ub - mnow);
5847 } else {
5848 psdout(10) << " mnow " << mnow << " >= prior_readable_until_ub "
5849 << ps->prior_readable_until_ub << dendl;
5850 }
5851
5852 if (ps->pool.info.has_flag(pg_pool_t::FLAG_CREATING)) {
5853 pl->send_pg_created(pgid);
5854 }
5855
5856 ps->info.history.last_epoch_started = ps->info.last_epoch_started;
5857 ps->info.history.last_interval_started = ps->info.last_interval_started;
5858 ps->dirty_info = true;
5859
5860 ps->share_pg_info();
5861 pl->publish_stats_to_osd();
5862
5863 pl->on_activate_complete();
5864
5865 return discard_event();
5866 }
5867
5868 boost::statechart::result PeeringState::Active::react(const RenewLease& rl)
5869 {
5870 DECLARE_LOCALS;
5871 ps->proc_renew_lease();
5872 return discard_event();
5873 }
5874
5875 boost::statechart::result PeeringState::Active::react(const MLeaseAck& la)
5876 {
5877 DECLARE_LOCALS;
5878 ps->proc_lease_ack(la.from, la.lease_ack);
5879 return discard_event();
5880 }
5881
5882
5883 boost::statechart::result PeeringState::Active::react(const CheckReadable &evt)
5884 {
5885 DECLARE_LOCALS;
5886 pl->recheck_readable();
5887 return discard_event();
5888 }
5889
5890 /*
5891 * update info.history.last_epoch_started ONLY after we and all
5892 * replicas have activated AND committed the activate transaction
5893 * (i.e. the peering results are stable on disk).
5894 */
5895 void PeeringState::Active::all_activated_and_committed()
5896 {
5897 DECLARE_LOCALS;
5898 psdout(10) << "all_activated_and_committed" << dendl;
5899 ceph_assert(ps->is_primary());
5900 ceph_assert(ps->peer_activated.size() == ps->acting_recovery_backfill.size());
5901 ceph_assert(!ps->acting_recovery_backfill.empty());
5902 ceph_assert(ps->blocked_by.empty());
5903
5904 if (HAVE_FEATURE(ps->upacting_features, SERVER_OCTOPUS)) {
5905 // this is overkill when the activation is quick, but when it is slow it
5906 // is important, because the lease was renewed by the activate itself but we
5907 // don't know how long ago that was, and simply scheduling now may leave
5908 // a gap in lease coverage. keep it simple and aggressively renew.
5909 ps->renew_lease(pl->get_mnow());
5910 ps->send_lease();
5911 ps->schedule_renew_lease();
5912 }
5913
5914 // Degraded?
5915 ps->update_calc_stats();
5916 if (ps->info.stats.stats.sum.num_objects_degraded) {
5917 ps->state_set(PG_STATE_DEGRADED);
5918 } else {
5919 ps->state_clear(PG_STATE_DEGRADED);
5920 }
5921
5922 post_event(PeeringState::AllReplicasActivated());
5923 }
5924
5925
5926 void PeeringState::Active::exit()
5927 {
5928 context< PeeringMachine >().log_exit(state_name, enter_time);
5929
5930
5931 DECLARE_LOCALS;
5932 pl->cancel_local_background_io_reservation();
5933
5934 ps->blocked_by.clear();
5935 ps->backfill_reserved = false;
5936 ps->state_clear(PG_STATE_ACTIVATING);
5937 ps->state_clear(PG_STATE_DEGRADED);
5938 ps->state_clear(PG_STATE_UNDERSIZED);
5939 ps->state_clear(PG_STATE_BACKFILL_TOOFULL);
5940 ps->state_clear(PG_STATE_BACKFILL_WAIT);
5941 ps->state_clear(PG_STATE_RECOVERY_WAIT);
5942 ps->state_clear(PG_STATE_RECOVERY_TOOFULL);
5943 utime_t dur = ceph_clock_now() - enter_time;
5944 pl->get_peering_perf().tinc(rs_active_latency, dur);
5945 pl->on_active_exit();
5946 }
5947
5948 /*------ReplicaActive-----*/
5949 PeeringState::ReplicaActive::ReplicaActive(my_context ctx)
5950 : my_base(ctx),
5951 NamedState(context< PeeringMachine >().state_history, "Started/ReplicaActive")
5952 {
5953 context< PeeringMachine >().log_enter(state_name);
5954
5955 DECLARE_LOCALS;
5956 ps->start_flush(context< PeeringMachine >().get_cur_transaction());
5957 }
5958
5959
5960 boost::statechart::result PeeringState::ReplicaActive::react(
5961 const Activate& actevt) {
5962 DECLARE_LOCALS;
5963 psdout(10) << "In ReplicaActive, about to call activate" << dendl;
5964 ps->activate(
5965 context< PeeringMachine >().get_cur_transaction(),
5966 actevt.activation_epoch,
5967 context< PeeringMachine >().get_recovery_ctx());
5968 psdout(10) << "Activate Finished" << dendl;
5969 return discard_event();
5970 }
5971
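// Our activation transaction committed; report back to the primary
// with an info whose last_epoch_started/last_interval_started reflect
// the activation epoch, plus a lease ack. The primary tallies these in
// peer_activated (see Active::react(MInfoRec)) before declaring the PG
// fully active.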
5972 boost::statechart::result PeeringState::ReplicaActive::react(
5973 const ActivateCommitted &evt)
5974 {
5975 DECLARE_LOCALS;
5976 psdout(10) << __func__ << " " << evt.epoch << " telling primary" << dendl;
5977
5978 auto &rctx = context<PeeringMachine>().get_recovery_ctx();
5979 auto epoch = ps->get_osdmap_epoch();
5980 pg_info_t i = ps->info;
5981 i.history.last_epoch_started = evt.activation_epoch;
5982 i.history.last_interval_started = i.history.same_interval_since;
5983 rctx.send_info(
5984 ps->get_primary().osd,
5985 spg_t(ps->info.pgid.pgid, ps->get_primary().shard),
5986 epoch,
5987 epoch,
5988 i,
5989 {}, /* lease */
5990 ps->get_lease_ack());
5991
5992 if (ps->acting.size() >= ps->pool.info.min_size) {
5993 ps->state_set(PG_STATE_ACTIVE);
5994 } else {
5995 ps->state_set(PG_STATE_PEERED);
5996 }
5997 pl->on_activate_committed();
5998
5999 return discard_event();
6000 }
6001
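// Read-lease handshake: the primary periodically sends MOSDPGLease; we
// apply it and reply with MOSDPGLeaseAck so the primary can advance
// its notion of when every replica's lease expires and extend
// readable_until accordingly.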
6002 boost::statechart::result PeeringState::ReplicaActive::react(const MLease& l)
6003 {
6004 DECLARE_LOCALS;
6005 spg_t spgid = context< PeeringMachine >().spgid;
6006 epoch_t epoch = pl->get_osdmap_epoch();
6007
6008 ps->proc_lease(l.lease);
6009 pl->send_cluster_message(
6010 ps->get_primary().osd,
6011 new MOSDPGLeaseAck(epoch,
6012 spg_t(spgid.pgid, ps->get_primary().shard),
6013 ps->get_lease_ack()),
6014 epoch);
6015 return discard_event();
6016 }
6017
6018 boost::statechart::result PeeringState::ReplicaActive::react(const MInfoRec& infoevt)
6019 {
6020 DECLARE_LOCALS;
6021 ps->proc_primary_info(context<PeeringMachine>().get_cur_transaction(),
6022 infoevt.info);
6023 return discard_event();
6024 }
6025
6026 boost::statechart::result PeeringState::ReplicaActive::react(const MLogRec& logevt)
6027 {
6028 DECLARE_LOCALS;
6029 psdout(10) << "received log from " << logevt.from << dendl;
6030 ObjectStore::Transaction &t = context<PeeringMachine>().get_cur_transaction();
6031 ps->merge_log(t, logevt.msg->info, logevt.msg->log, logevt.from);
6032 ceph_assert(ps->pg_log.get_head() == ps->info.last_update);
6033 if (logevt.msg->lease) {
6034 ps->proc_lease(*logevt.msg->lease);
6035 }
6036
6037 return discard_event();
6038 }
6039
6040 boost::statechart::result PeeringState::ReplicaActive::react(const MTrim& trim)
6041 {
6042 DECLARE_LOCALS;
6043 // primary is instructing us to trim
6044 ps->pg_log.trim(trim.trim_to, ps->info);
6045 ps->dirty_info = true;
6046 return discard_event();
6047 }
6048
6049 boost::statechart::result PeeringState::ReplicaActive::react(const ActMap&)
6050 {
6051 DECLARE_LOCALS;
6052 if (ps->should_send_notify() && ps->get_primary().osd >= 0) {
6053 ps->info.history.refresh_prior_readable_until_ub(
6054 pl->get_mnow(), ps->prior_readable_until_ub);
6055 context< PeeringMachine >().send_notify(
6056 ps->get_primary().osd,
6057 pg_notify_t(
6058 ps->get_primary().shard, ps->pg_whoami.shard,
6059 ps->get_osdmap_epoch(),
6060 ps->get_osdmap_epoch(),
6061 ps->info,
6062 ps->past_intervals));
6063 }
6064 return discard_event();
6065 }
6066
6067 boost::statechart::result PeeringState::ReplicaActive::react(
6068 const MQuery& query)
6069 {
6070 DECLARE_LOCALS;
6071 ps->fulfill_query(query, context<PeeringMachine>().get_recovery_ctx());
6072 return discard_event();
6073 }
6074
6075 boost::statechart::result PeeringState::ReplicaActive::react(const QueryState& q)
6076 {
6077 q.f->open_object_section("state");
6078 q.f->dump_string("name", state_name);
6079 q.f->dump_stream("enter_time") << enter_time;
6080 q.f->close_section();
6081 return forward_event();
6082 }
6083
6084 void PeeringState::ReplicaActive::exit()
6085 {
6086 context< PeeringMachine >().log_exit(state_name, enter_time);
6087 DECLARE_LOCALS;
6088 pl->unreserve_recovery_space();
6089
6090 pl->cancel_remote_recovery_reservation();
6091 utime_t dur = ceph_clock_now() - enter_time;
6092 pl->get_peering_perf().tinc(rs_replicaactive_latency, dur);
6093
6094 ps->min_last_complete_ondisk = eversion_t();
6095 }
6096
6097 /*-------Stray---*/
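// A Stray replica holds data for a PG but is not part of the acting
// set. It waits for the primary to either send a log or info (upon
// which it activates and transits to ReplicaActive) or purge it; if
// the pool was deleted out from under us, deletion starts immediately.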
6098 PeeringState::Stray::Stray(my_context ctx)
6099 : my_base(ctx),
6100 NamedState(context< PeeringMachine >().state_history, "Started/Stray")
6101 {
6102 context< PeeringMachine >().log_enter(state_name);
6103
6104
6105 DECLARE_LOCALS;
6106 ceph_assert(!ps->is_peered());
6107 ceph_assert(!ps->is_peering());
6108 ceph_assert(!ps->is_primary());
6109
6110 if (!ps->get_osdmap()->have_pg_pool(ps->info.pgid.pgid.pool())) {
6111 ldout(ps->cct,10) << __func__ << " pool is deleted" << dendl;
6112 post_event(DeleteStart());
6113 } else {
6114 ps->start_flush(context< PeeringMachine >().get_cur_transaction());
6115 }
6116 }
6117
6118 boost::statechart::result PeeringState::Stray::react(const MLogRec& logevt)
6119 {
6120 DECLARE_LOCALS;
6121 MOSDPGLog *msg = logevt.msg.get();
6122 psdout(10) << "got info+log from osd." << logevt.from << " " << msg->info << " " << msg->log << dendl;
6123
6124 ObjectStore::Transaction &t = context<PeeringMachine>().get_cur_transaction();
6125 if (msg->info.last_backfill == hobject_t()) {
6126 // restart backfill
6127 ps->info = msg->info;
6128 pl->on_info_history_change();
6129 ps->dirty_info = true;
6130 ps->dirty_big_info = true; // maybe.
6131
6132 PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
6133 ps->pg_log.reset_backfill_claim_log(msg->log, rollbacker.get());
6134
6135 ps->pg_log.reset_backfill();
6136 } else {
6137 ps->merge_log(t, msg->info, msg->log, logevt.from);
6138 }
6139 if (logevt.msg->lease) {
6140 ps->proc_lease(*logevt.msg->lease);
6141 }
6142
6143 ceph_assert(ps->pg_log.get_head() == ps->info.last_update);
6144
6145 post_event(Activate(logevt.msg->info.last_epoch_started));
6146 return transit<ReplicaActive>();
6147 }
6148
6149 boost::statechart::result PeeringState::Stray::react(const MInfoRec& infoevt)
6150 {
6151 DECLARE_LOCALS;
6152 psdout(10) << "got info from osd." << infoevt.from << " " << infoevt.info << dendl;
6153
6154 if (ps->info.last_update > infoevt.info.last_update) {
6155 // rewind divergent log entries
6156 ObjectStore::Transaction &t = context<PeeringMachine>().get_cur_transaction();
6157 ps->rewind_divergent_log(t, infoevt.info.last_update);
6158 ps->info.stats = infoevt.info.stats;
6159 ps->info.hit_set = infoevt.info.hit_set;
6160 }
6161
6162 if (infoevt.lease) {
6163 ps->proc_lease(*infoevt.lease);
6164 }
6165
6166 ceph_assert(infoevt.info.last_update == ps->info.last_update);
6167 ceph_assert(ps->pg_log.get_head() == ps->info.last_update);
6168
6169 post_event(Activate(infoevt.info.last_epoch_started));
6170 return transit<ReplicaActive>();
6171 }
6172
6173 boost::statechart::result PeeringState::Stray::react(const MQuery& query)
6174 {
6175 DECLARE_LOCALS;
6176 ps->fulfill_query(query, context<PeeringMachine>().get_recovery_ctx());
6177 return discard_event();
6178 }
6179
6180 boost::statechart::result PeeringState::Stray::react(const ActMap&)
6181 {
6182 DECLARE_LOCALS;
6183 if (ps->should_send_notify() && ps->get_primary().osd >= 0) {
6184 ps->info.history.refresh_prior_readable_until_ub(
6185 pl->get_mnow(), ps->prior_readable_until_ub);
6186 context< PeeringMachine >().send_notify(
6187 ps->get_primary().osd,
6188 pg_notify_t(
6189 ps->get_primary().shard, ps->pg_whoami.shard,
6190 ps->get_osdmap_epoch(),
6191 ps->get_osdmap_epoch(),
6192 ps->info,
6193 ps->past_intervals));
6194 }
6195 return discard_event();
6196 }
6197
6198 void PeeringState::Stray::exit()
6199 {
6200 context< PeeringMachine >().log_exit(state_name, enter_time);
6201 DECLARE_LOCALS;
6202 utime_t dur = ceph_clock_now() - enter_time;
6203 pl->get_peering_perf().tinc(rs_stray_latency, dur);
6204 }
6205
6206
6207 /*--------ToDelete----------*/
6208 PeeringState::ToDelete::ToDelete(my_context ctx)
6209 : my_base(ctx),
6210 NamedState(context< PeeringMachine >().state_history, "Started/ToDelete")
6211 {
6212 context< PeeringMachine >().log_enter(state_name);
6213 DECLARE_LOCALS;
6214 pl->get_perf_logger().inc(l_osd_pg_removing);
6215 }
6216
6217 void PeeringState::ToDelete::exit()
6218 {
6219 context< PeeringMachine >().log_exit(state_name, enter_time);
6220 DECLARE_LOCALS;
6221 // note: on a successful removal, this path doesn't execute. see
6222 // _delete_some().
6223 pl->get_perf_logger().dec(l_osd_pg_removing);
6224
6225 pl->cancel_local_background_io_reservation();
6226 }
6227
6228 /*----WaitDeleteReserved----*/
6229 PeeringState::WaitDeleteReserved::WaitDeleteReserved(my_context ctx)
6230 : my_base(ctx),
6231 NamedState(context< PeeringMachine >().state_history,
6232 "Started/ToDelete/WaitDeleteReseved")
6233 {
6234 context< PeeringMachine >().log_enter(state_name);
6235 DECLARE_LOCALS;
6236 context< ToDelete >().priority = ps->get_delete_priority();
6237
6238 pl->cancel_local_background_io_reservation();
6239 pl->request_local_background_io_reservation(
6240 context<ToDelete>().priority,
6241 std::make_shared<PGPeeringEvent>(
6242 ps->get_osdmap_epoch(),
6243 ps->get_osdmap_epoch(),
6244 DeleteReserved()),
6245 std::make_shared<PGPeeringEvent>(
6246 ps->get_osdmap_epoch(),
6247 ps->get_osdmap_epoch(),
6248 DeleteInterrupted()));
6249 }
6250
6251 boost::statechart::result PeeringState::ToDelete::react(
6252 const ActMap& evt)
6253 {
6254 DECLARE_LOCALS;
6255 if (ps->get_delete_priority() != priority) {
6256 psdout(10) << __func__ << " delete priority changed, resetting"
6257 << dendl;
6258 return transit<ToDelete>();
6259 }
6260 return discard_event();
6261 }
6262
6263 void PeeringState::WaitDeleteReserved::exit()
6264 {
6265 context< PeeringMachine >().log_exit(state_name, enter_time);
6266 }
6267
6268 /*----Deleting-----*/
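// Actual removal. Mark the PG deleting, roll the log forward, and
// clear last_backfill so info reflects the shrinking on-disk state.
// Deletion then proceeds in chunks: each DeleteSome triggers
// do_delete_work(), which removes a batch and, per the note in
// ToDelete::exit() above, keeps re-queueing itself until the PG is
// gone.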
6269 PeeringState::Deleting::Deleting(my_context ctx)
6270 : my_base(ctx),
6271 NamedState(context< PeeringMachine >().state_history, "Started/ToDelete/Deleting")
6272 {
6273 context< PeeringMachine >().log_enter(state_name);
6274 DECLARE_LOCALS;
6275 ps->deleting = true;
6276 ObjectStore::Transaction &t = context<PeeringMachine>().get_cur_transaction();
6277
6278 // clear log
6279 PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
6280 ps->pg_log.roll_forward(rollbacker.get());
6281
6282 // adjust info to backfill
6283 ps->info.set_last_backfill(hobject_t());
6284 ps->pg_log.reset_backfill();
6285 ps->dirty_info = true;
6286
6287 pl->on_removal(t);
6288 }
6289
6290 boost::statechart::result PeeringState::Deleting::react(
6291 const DeleteSome& evt)
6292 {
6293 DECLARE_LOCALS;
6294 pl->do_delete_work(context<PeeringMachine>().get_cur_transaction());
6295 return discard_event();
6296 }
6297
6298 void PeeringState::Deleting::exit()
6299 {
6300 context< PeeringMachine >().log_exit(state_name, enter_time);
6301 DECLARE_LOCALS;
6302 ps->deleting = false;
6303 pl->cancel_local_background_io_reservation();
6304 }
6305
6306 /*--------GetInfo---------*/
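// First peering phase: build the prior set from past intervals and
// query pg_info_t from every osd we must probe. blocked_by tracks the
// osds still outstanding; GotInfo fires once every reply is in, or
// IsDown immediately if the prior set already tells us the PG is down.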
6307 PeeringState::GetInfo::GetInfo(my_context ctx)
6308 : my_base(ctx),
6309 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering/GetInfo")
6310 {
6311 context< PeeringMachine >().log_enter(state_name);
6312
6313
6314 DECLARE_LOCALS;
6315 ps->check_past_interval_bounds();
6316 ps->log_weirdness();
6317 PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
6318
6319 ceph_assert(ps->blocked_by.empty());
6320
6321 prior_set = ps->build_prior();
6322 ps->prior_readable_down_osds = prior_set.down;
6323 if (ps->prior_readable_down_osds.empty()) {
6324 psdout(10) << " no prior_set down osds, clearing prior_readable_until_ub"
6325 << dendl;
6326 ps->clear_prior_readable_until_ub();
6327 }
6328
6329 ps->reset_min_peer_features();
6330 get_infos();
6331 if (prior_set.pg_down) {
6332 post_event(IsDown());
6333 } else if (peer_info_requested.empty()) {
6334 post_event(GotInfo());
6335 }
6336 }
6337
6338 void PeeringState::GetInfo::get_infos()
6339 {
6340 DECLARE_LOCALS;
6341 PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
6342
6343 ps->blocked_by.clear();
6344 for (set<pg_shard_t>::const_iterator it = prior_set.probe.begin();
6345 it != prior_set.probe.end();
6346 ++it) {
6347 pg_shard_t peer = *it;
6348 if (peer == ps->pg_whoami) {
6349 continue;
6350 }
6351 if (ps->peer_info.count(peer)) {
6352 psdout(10) << " have osd." << peer << " info " << ps->peer_info[peer] << dendl;
6353 continue;
6354 }
6355 if (peer_info_requested.count(peer)) {
6356 psdout(10) << " already requested info from osd." << peer << dendl;
6357 ps->blocked_by.insert(peer.osd);
6358 } else if (!ps->get_osdmap()->is_up(peer.osd)) {
6359 psdout(10) << " not querying info from down osd." << peer << dendl;
6360 } else {
6361 psdout(10) << " querying info from osd." << peer << dendl;
6362 context< PeeringMachine >().send_query(
6363 peer.osd,
6364 pg_query_t(pg_query_t::INFO,
6365 it->shard, ps->pg_whoami.shard,
6366 ps->info.history,
6367 ps->get_osdmap_epoch()));
6368 peer_info_requested.insert(peer);
6369 ps->blocked_by.insert(peer.osd);
6370 }
6371 }
6372
6373 ps->check_prior_readable_down_osds(ps->get_osdmap());
6374
6375 pl->publish_stats_to_osd();
6376 }
6377
6378 boost::statechart::result PeeringState::GetInfo::react(const MNotifyRec& infoevt)
6379 {
6380
6381 DECLARE_LOCALS;
6382
6383 set<pg_shard_t>::iterator p = peer_info_requested.find(infoevt.from);
6384 if (p != peer_info_requested.end()) {
6385 peer_info_requested.erase(p);
6386 ps->blocked_by.erase(infoevt.from.osd);
6387 }
6388
6389 epoch_t old_start = ps->info.history.last_epoch_started;
6390 if (ps->proc_replica_info(
6391 infoevt.from, infoevt.notify.info, infoevt.notify.epoch_sent)) {
6392 // we got something new ...
6393 PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
6394 if (old_start < ps->info.history.last_epoch_started) {
6395 psdout(10) << " last_epoch_started moved forward, rebuilding prior" << dendl;
6396 prior_set = ps->build_prior();
6397 ps->prior_readable_down_osds = prior_set.down;
6398
6399 // filter out any osds that got dropped from the probe set from
6400 // peer_info_requested. this is less expensive than restarting
6401 // peering (which would re-probe everyone).
6402 set<pg_shard_t>::iterator p = peer_info_requested.begin();
6403 while (p != peer_info_requested.end()) {
6404 if (prior_set.probe.count(*p) == 0) {
6405 psdout(20) << " dropping osd." << *p << " from info_requested, no longer in probe set" << dendl;
6406 peer_info_requested.erase(p++);
6407 } else {
6408 ++p;
6409 }
6410 }
6411 get_infos();
6412 }
6413 psdout(20) << "Adding osd: " << infoevt.from.osd << " peer features: "
6414 << hex << infoevt.features << dec << dendl;
6415 ps->apply_peer_features(infoevt.features);
6416
6417 // are we done getting everything?
6418 if (peer_info_requested.empty() && !prior_set.pg_down) {
6419 psdout(20) << "Common peer features: " << hex << ps->get_min_peer_features() << dec << dendl;
6420 psdout(20) << "Common acting features: " << hex << ps->get_min_acting_features() << dec << dendl;
6421 psdout(20) << "Common upacting features: " << hex << ps->get_min_upacting_features() << dec << dendl;
6422 post_event(GotInfo());
6423 }
6424 }
6425 return discard_event();
6426 }
6427
6428 boost::statechart::result PeeringState::GetInfo::react(const QueryState& q)
6429 {
6430 DECLARE_LOCALS;
6431 q.f->open_object_section("state");
6432 q.f->dump_string("name", state_name);
6433 q.f->dump_stream("enter_time") << enter_time;
6434
6435 q.f->open_array_section("requested_info_from");
6436 for (set<pg_shard_t>::iterator p = peer_info_requested.begin();
6437 p != peer_info_requested.end();
6438 ++p) {
6439 q.f->open_object_section("osd");
6440 q.f->dump_stream("osd") << *p;
6441 if (ps->peer_info.count(*p)) {
6442 q.f->open_object_section("got_info");
6443 ps->peer_info[*p].dump(q.f);
6444 q.f->close_section();
6445 }
6446 q.f->close_section();
6447 }
6448 q.f->close_section();
6449
6450 q.f->close_section();
6451 return forward_event();
6452 }
6453
6454 void PeeringState::GetInfo::exit()
6455 {
6456 context< PeeringMachine >().log_exit(state_name, enter_time);
6457
6458 DECLARE_LOCALS;
6459 utime_t dur = ceph_clock_now() - enter_time;
6460 pl->get_peering_perf().tinc(rs_getinfo_latency, dur);
6461 ps->blocked_by.clear();
6462 }
6463
6464 /*------GetLog------------*/
6465 PeeringState::GetLog::GetLog(my_context ctx)
6466 : my_base(ctx),
6467 NamedState(
6468 context< PeeringMachine >().state_history,
6469 "Started/Primary/Peering/GetLog"),
6470 msg(0)
6471 {
6472 context< PeeringMachine >().log_enter(state_name);
6473
6474 DECLARE_LOCALS;
6475
6476 ps->log_weirdness();
6477
6478 // adjust acting?
6479 if (!ps->choose_acting(auth_log_shard, false,
6480 &context< Peering >().history_les_bound)) {
6481 if (!ps->want_acting.empty()) {
6482 post_event(NeedActingChange());
6483 } else {
6484 post_event(IsIncomplete());
6485 }
6486 return;
6487 }
6488
6489 // am i the best?
6490 if (auth_log_shard == ps->pg_whoami) {
6491 post_event(GotLog());
6492 return;
6493 }
6494
6495 const pg_info_t& best = ps->peer_info[auth_log_shard];
6496
6497 // am i broken?
6498 if (ps->info.last_update < best.log_tail) {
6499 psdout(10) << " not contiguous with osd." << auth_log_shard << ", down" << dendl;
6500 post_event(IsIncomplete());
6501 return;
6502 }
6503
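// request_log_from starts at our own last_update and is walked back to
// the oldest peer last_update that is still contiguous with the auth
// shard's log (>= best.log_tail), so the single log fetched below is
// long enough to later drive log-based recovery of those peers too.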
6504 // how much log to request?
6505 eversion_t request_log_from = ps->info.last_update;
6506 ceph_assert(!ps->acting_recovery_backfill.empty());
6507 for (set<pg_shard_t>::iterator p = ps->acting_recovery_backfill.begin();
6508 p != ps->acting_recovery_backfill.end();
6509 ++p) {
6510 if (*p == ps->pg_whoami) continue;
6511 pg_info_t& ri = ps->peer_info[*p];
6512 if (ri.last_update < ps->info.log_tail && ri.last_update >= best.log_tail &&
6513 ri.last_update < request_log_from)
6514 request_log_from = ri.last_update;
6515 }
6516
6517 // request it
6518 psdout(10) << " requesting log from osd." << auth_log_shard << dendl;
6519 context<PeeringMachine>().send_query(
6520 auth_log_shard.osd,
6521 pg_query_t(
6522 pg_query_t::LOG,
6523 auth_log_shard.shard, ps->pg_whoami.shard,
6524 request_log_from, ps->info.history,
6525 ps->get_osdmap_epoch()));
6526
6527 ceph_assert(ps->blocked_by.empty());
6528 ps->blocked_by.insert(auth_log_shard.osd);
6529 pl->publish_stats_to_osd();
6530 }
6531
6532 boost::statechart::result PeeringState::GetLog::react(const AdvMap& advmap)
6533 {
6534 // make sure our log source didn't go down. we need to check
6535 // explicitly because it may not be part of the prior set, which
6536 // means the Peering state check won't catch it going down.
6537 if (!advmap.osdmap->is_up(auth_log_shard.osd)) {
6538 psdout(10) << "GetLog: auth_log_shard osd."
6539 << auth_log_shard.osd << " went down" << dendl;
6540 post_event(advmap);
6541 return transit< Reset >();
6542 }
6543
6544 // let the Peering state do its checks.
6545 return forward_event();
6546 }
6547
6548 boost::statechart::result PeeringState::GetLog::react(const MLogRec& logevt)
6549 {
6550 ceph_assert(!msg);
6551 if (logevt.from != auth_log_shard) {
6552 psdout(10) << "GetLog: discarding log from "
6553 << "non-auth_log_shard osd." << logevt.from << dendl;
6554 return discard_event();
6555 }
6556 psdout(10) << "GetLog: received master log from osd."
6557 << logevt.from << dendl;
6558 msg = logevt.msg;
6559 post_event(GotLog());
6560 return discard_event();
6561 }
6562
6563 boost::statechart::result PeeringState::GetLog::react(const GotLog&)
6564 {
6565
6566 DECLARE_LOCALS;
6567 psdout(10) << "leaving GetLog" << dendl;
6568 if (msg) {
6569 psdout(10) << "processing master log" << dendl;
6570 ps->proc_master_log(context<PeeringMachine>().get_cur_transaction(),
6571 msg->info, msg->log, msg->missing,
6572 auth_log_shard);
6573 }
6574 ps->start_flush(context< PeeringMachine >().get_cur_transaction());
6575 return transit< GetMissing >();
6576 }
6577
6578 boost::statechart::result PeeringState::GetLog::react(const QueryState& q)
6579 {
6580 q.f->open_object_section("state");
6581 q.f->dump_string("name", state_name);
6582 q.f->dump_stream("enter_time") << enter_time;
6583 q.f->dump_stream("auth_log_shard") << auth_log_shard;
6584 q.f->close_section();
6585 return forward_event();
6586 }
6587
6588 void PeeringState::GetLog::exit()
6589 {
6590 context< PeeringMachine >().log_exit(state_name, enter_time);
6591
6592 DECLARE_LOCALS;
6593 utime_t dur = ceph_clock_now() - enter_time;
6594 pl->get_peering_perf().tinc(rs_getlog_latency, dur);
6595 ps->blocked_by.clear();
6596 }
6597
6598 /*------WaitActingChange--------*/
6599 PeeringState::WaitActingChange::WaitActingChange(my_context ctx)
6600 : my_base(ctx),
6601 NamedState(context< PeeringMachine >().state_history, "Started/Primary/WaitActingChange")
6602 {
6603 context< PeeringMachine >().log_enter(state_name);
6604 }
6605
6606 boost::statechart::result PeeringState::WaitActingChange::react(const AdvMap& advmap)
6607 {
6608 DECLARE_LOCALS;
6609 OSDMapRef osdmap = advmap.osdmap;
6610
6611 psdout(10) << "verifying no want_acting " << ps->want_acting << " targets didn't go down" << dendl;
6612 for (vector<int>::iterator p = ps->want_acting.begin(); p != ps->want_acting.end(); ++p) {
6613 if (!osdmap->is_up(*p)) {
6614 psdout(10) << " want_acting target osd." << *p << " went down, resetting" << dendl;
6615 post_event(advmap);
6616 return transit< Reset >();
6617 }
6618 }
6619 return forward_event();
6620 }
6621
6622 boost::statechart::result PeeringState::WaitActingChange::react(const MLogRec& logevt)
6623 {
6624 psdout(10) << "In WaitActingChange, ignoring MLocRec" << dendl;
6625 return discard_event();
6626 }
6627
6628 boost::statechart::result PeeringState::WaitActingChange::react(const MInfoRec& evt)
6629 {
6630 psdout(10) << "In WaitActingChange, ignoring MInfoRec" << dendl;
6631 return discard_event();
6632 }
6633
6634 boost::statechart::result PeeringState::WaitActingChange::react(const MNotifyRec& evt)
6635 {
6636 psdout(10) << "In WaitActingChange, ignoring MNotifyRec" << dendl;
6637 return discard_event();
6638 }
6639
6640 boost::statechart::result PeeringState::WaitActingChange::react(const QueryState& q)
6641 {
6642 q.f->open_object_section("state");
6643 q.f->dump_string("name", state_name);
6644 q.f->dump_stream("enter_time") << enter_time;
6645 q.f->dump_string("comment", "waiting for pg acting set to change");
6646 q.f->close_section();
6647 return forward_event();
6648 }
6649
6650 void PeeringState::WaitActingChange::exit()
6651 {
6652 context< PeeringMachine >().log_exit(state_name, enter_time);
6653 DECLARE_LOCALS;
6654 utime_t dur = ceph_clock_now() - enter_time;
6655 pl->get_peering_perf().tinc(rs_waitactingchange_latency, dur);
6656 }
6657
6658 /*------Down--------*/
6659 PeeringState::Down::Down(my_context ctx)
6660 : my_base(ctx),
6661 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering/Down")
6662 {
6663 context< PeeringMachine >().log_enter(state_name);
6664 DECLARE_LOCALS;
6665
6666 ps->state_clear(PG_STATE_PEERING);
6667 ps->state_set(PG_STATE_DOWN);
6668
6669 auto &prior_set = context< Peering >().prior_set;
6670 ceph_assert(ps->blocked_by.empty());
6671 ps->blocked_by.insert(prior_set.down.begin(), prior_set.down.end());
6672 pl->publish_stats_to_osd();
6673 }
6674
6675 void PeeringState::Down::exit()
6676 {
6677 context< PeeringMachine >().log_exit(state_name, enter_time);
6678
6679 DECLARE_LOCALS;
6680
6681 ps->state_clear(PG_STATE_DOWN);
6682 utime_t dur = ceph_clock_now() - enter_time;
6683 pl->get_peering_perf().tinc(rs_down_latency, dur);
6684
6685 ps->blocked_by.clear();
6686 }
6687
6688 boost::statechart::result PeeringState::Down::react(const QueryState& q)
6689 {
6690 q.f->open_object_section("state");
6691 q.f->dump_string("name", state_name);
6692 q.f->dump_stream("enter_time") << enter_time;
6693 q.f->dump_string("comment",
6694 "not enough up instances of this PG to go active");
6695 q.f->close_section();
6696 return forward_event();
6697 }
6698
6699 boost::statechart::result PeeringState::Down::react(const MNotifyRec& infoevt)
6700 {
6701 DECLARE_LOCALS;
6702
6703 ceph_assert(ps->is_primary());
6704 epoch_t old_start = ps->info.history.last_epoch_started;
6705 if (!ps->peer_info.count(infoevt.from) &&
6706 ps->get_osdmap()->has_been_up_since(infoevt.from.osd, infoevt.notify.epoch_sent)) {
6707 ps->update_history(infoevt.notify.info.history);
6708 }
6709 // if we got something new to make pg escape down state
6710 if (ps->info.history.last_epoch_started > old_start) {
6711 psdout(10) << " last_epoch_started moved forward, re-enter getinfo" << dendl;
6712 ps->state_clear(PG_STATE_DOWN);
6713 ps->state_set(PG_STATE_PEERING);
6714 return transit< GetInfo >();
6715 }
6716
6717 return discard_event();
6718 }
6719
6720
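// Down (above) parks the PG because too many prior-interval osds are
// unreachable to rule out missed updates; Incomplete (below) parks it
// because the reachable peers cannot provide a contiguous log or
// backfill source. Both record blocked_by so operators can see which
// osds must return.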
6721 /*------Incomplete--------*/
6722 PeeringState::Incomplete::Incomplete(my_context ctx)
6723 : my_base(ctx),
6724 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering/Incomplete")
6725 {
6726 context< PeeringMachine >().log_enter(state_name);
6727 DECLARE_LOCALS;
6728
6729 ps->state_clear(PG_STATE_PEERING);
6730 ps->state_set(PG_STATE_INCOMPLETE);
6731
6732 PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
6733 ceph_assert(ps->blocked_by.empty());
6734 ps->blocked_by.insert(prior_set.down.begin(), prior_set.down.end());
6735 pl->publish_stats_to_osd();
6736 }
6737
6738 boost::statechart::result PeeringState::Incomplete::react(const AdvMap &advmap) {
6739 DECLARE_LOCALS;
6740 int64_t poolnum = ps->info.pgid.pool();
6741
6742 // Reset if min_size became smaller than its previous value; the pg might now be able to go active
6743 if (!advmap.osdmap->have_pg_pool(poolnum) ||
6744 advmap.lastmap->get_pools().find(poolnum)->second.min_size >
6745 advmap.osdmap->get_pools().find(poolnum)->second.min_size) {
6746 post_event(advmap);
6747 return transit< Reset >();
6748 }
6749
6750 return forward_event();
6751 }
6752
6753 boost::statechart::result PeeringState::Incomplete::react(const MNotifyRec& notevt) {
6754 DECLARE_LOCALS;
6755 psdout(7) << "handle_pg_notify from osd." << notevt.from << dendl;
6756 if (ps->proc_replica_info(
6757 notevt.from, notevt.notify.info, notevt.notify.epoch_sent)) {
6758 // We got something new, try again!
6759 return transit< GetLog >();
6760 } else {
6761 return discard_event();
6762 }
6763 }
6764
6765 boost::statechart::result PeeringState::Incomplete::react(
6766 const QueryState& q)
6767 {
6768 q.f->open_object_section("state");
6769 q.f->dump_string("name", state_name);
6770 q.f->dump_stream("enter_time") << enter_time;
6771 q.f->dump_string("comment", "not enough complete instances of this PG");
6772 q.f->close_section();
6773 return forward_event();
6774 }
6775
6776 void PeeringState::Incomplete::exit()
6777 {
6778 context< PeeringMachine >().log_exit(state_name, enter_time);
6779
6780 DECLARE_LOCALS;
6781
6782 ps->state_clear(PG_STATE_INCOMPLETE);
6783 utime_t dur = ceph_clock_now() - enter_time;
6784 pl->get_peering_perf().tinc(rs_incomplete_latency, dur);
6785
6786 ps->blocked_by.clear();
6787 }
6788
6789 /*------GetMissing--------*/
6790 PeeringState::GetMissing::GetMissing(my_context ctx)
6791 : my_base(ctx),
6792 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering/GetMissing")
6793 {
6794 context< PeeringMachine >().log_enter(state_name);
6795
6796 DECLARE_LOCALS;
6797 ps->log_weirdness();
6798 ceph_assert(!ps->acting_recovery_backfill.empty());
6799 eversion_t since;
6800 for (set<pg_shard_t>::iterator i = ps->acting_recovery_backfill.begin();
6801 i != ps->acting_recovery_backfill.end();
6802 ++i) {
6803 if (*i == ps->get_primary()) continue;
6804 const pg_info_t& pi = ps->peer_info[*i];
6805 // reset this to make sure the pg_missing_t is initialized and
6806 // has the correct semantics even if we don't need to get a
6807 // missing set from a shard. This way later additions due to
6808 // lost+unfound delete work properly.
6809 ps->peer_missing[*i].may_include_deletes = !ps->perform_deletes_during_peering();
6810
6811 if (pi.is_empty())
6812 continue; // no pg data, nothing divergent
6813
6814 if (pi.last_update < ps->pg_log.get_tail()) {
6815 psdout(10) << " osd." << *i << " is not contiguous, will restart backfill" << dendl;
6816 ps->peer_missing[*i].clear();
6817 continue;
6818 }
6819 if (pi.last_backfill == hobject_t()) {
6820 psdout(10) << " osd." << *i << " will fully backfill; can infer empty missing set" << dendl;
6821 ps->peer_missing[*i].clear();
6822 continue;
6823 }
6824
6825 if (pi.last_update == pi.last_complete && // peer has no missing
6826 pi.last_update == ps->info.last_update) { // peer is up to date
6827 // replica has no missing and identical log as us. no need to
6828 // pull anything.
6829 // FIXME: we can do better here. if last_update==last_complete we
6830 // can infer the rest!
6831 psdout(10) << " osd." << *i << " has no missing, identical log" << dendl;
6832 ps->peer_missing[*i].clear();
6833 continue;
6834 }
6835
6836 // We pull the log from the peer's last_epoch_started to ensure we
6837 // get enough log to detect divergent updates.
6838 since.epoch = pi.last_epoch_started;
6839 ceph_assert(pi.last_update >= ps->info.log_tail); // or else choose_acting() did a bad thing
6840 if (pi.log_tail <= since) {
6841 psdout(10) << " requesting log+missing since " << since << " from osd." << *i << dendl;
6842 context< PeeringMachine >().send_query(
6843 i->osd,
6844 pg_query_t(
6845 pg_query_t::LOG,
6846 i->shard, ps->pg_whoami.shard,
6847 since, ps->info.history,
6848 ps->get_osdmap_epoch()));
6849 } else {
6850 psdout(10) << " requesting fulllog+missing from osd." << *i
6851 << " (want since " << since << " < log.tail "
6852 << pi.log_tail << ")" << dendl;
6853 context< PeeringMachine >().send_query(
6854 i->osd, pg_query_t(
6855 pg_query_t::FULLLOG,
6856 i->shard, ps->pg_whoami.shard,
6857 ps->info.history, ps->get_osdmap_epoch()));
6858 }
6859 peer_missing_requested.insert(*i);
6860 ps->blocked_by.insert(i->osd);
6861 }
6862
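// Nothing left to request: either activate now, or first wait for the
// osdmap to record a new up_thru for this OSD (the NeedUpThru event
// leads to the WaitUpThru state below).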
6863 if (peer_missing_requested.empty()) {
6864 if (ps->need_up_thru) {
6865 psdout(10) << " still need up_thru update before going active"
6866 << dendl;
6867 post_event(NeedUpThru());
6868 return;
6869 }
6870
6871 // all good!
6872 post_event(Activate(ps->get_osdmap_epoch()));
6873 } else {
6874 pl->publish_stats_to_osd();
6875 }
6876 }
6877
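// One MLogRec arrives per outstanding query; fold it in via
// proc_replica_log, and once the last response is in, activate (or
// wait for up_thru, as above).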
6878 boost::statechart::result PeeringState::GetMissing::react(const MLogRec& logevt)
6879 {
6880 DECLARE_LOCALS;
6881
6882 peer_missing_requested.erase(logevt.from);
6883 ps->proc_replica_log(logevt.msg->info, logevt.msg->log, logevt.msg->missing, logevt.from);
6884
6885 if (peer_missing_requested.empty()) {
6886 if (ps->need_up_thru) {
6887 psdout(10) << " still need up_thru update before going active"
6888 << dendl;
6889 post_event(NeedUpThru());
6890 } else {
6891 psdout(10) << "Got last missing, don't need missing "
6892 << "posting Activate" << dendl;
6893 post_event(Activate(ps->get_osdmap_epoch()));
6894 }
6895 }
6896 return discard_event();
6897 }
6898
6899 boost::statechart::result PeeringState::GetMissing::react(const QueryState& q)
6900 {
6901 DECLARE_LOCALS;
6902 q.f->open_object_section("state");
6903 q.f->dump_string("name", state_name);
6904 q.f->dump_stream("enter_time") << enter_time;
6905
6906 q.f->open_array_section("peer_missing_requested");
6907 for (set<pg_shard_t>::iterator p = peer_missing_requested.begin();
6908 p != peer_missing_requested.end();
6909 ++p) {
6910 q.f->open_object_section("osd");
6911 q.f->dump_stream("osd") << *p;
6912 if (ps->peer_missing.count(*p)) {
6913 q.f->open_object_section("got_missing");
6914 ps->peer_missing[*p].dump(q.f);
6915 q.f->close_section();
6916 }
6917 q.f->close_section();
6918 }
6919 q.f->close_section();
6920
6921 q.f->close_section();
6922 return forward_event();
6923 }
6924
6925 void PeeringState::GetMissing::exit()
6926 {
6927 context< PeeringMachine >().log_exit(state_name, enter_time);
6928
6929 DECLARE_LOCALS;
6930 utime_t dur = ceph_clock_now() - enter_time;
6931 pl->get_peering_perf().tinc(rs_getmissing_latency, dur);
6932 ps->blocked_by.clear();
6933 }
6934
6935 /*------WaitUpThru--------*/
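// WaitUpThru: before going active we need the osdmap to reflect a new
// up_thru for this OSD (see the QueryState comment below). Roughly,
// up_thru records that this OSD was alive through the current interval,
// which later peering relies on when computing past intervals.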
6936 PeeringState::WaitUpThru::WaitUpThru(my_context ctx)
6937 : my_base(ctx),
6938 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering/WaitUpThru")
6939 {
6940 context< PeeringMachine >().log_enter(state_name);
6941 }
6942
6943 boost::statechart::result PeeringState::WaitUpThru::react(const ActMap& am)
6944 {
6945 DECLARE_LOCALS;
6946 if (!ps->need_up_thru) {
6947 post_event(Activate(ps->get_osdmap_epoch()));
6948 }
6949 return forward_event();
6950 }
6951
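// Log messages can still arrive here, presumably from queries sent
// during earlier peering states; record the peer's missing set and
// info so they are not lost, then drop the event.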
6952 boost::statechart::result PeeringState::WaitUpThru::react(const MLogRec& logevt)
6953 {
6954 DECLARE_LOCALS;
6955 psdout(10) << "Noting missing from osd." << logevt.from << dendl;
6956 ps->peer_missing[logevt.from].claim(logevt.msg->missing);
6957 ps->peer_info[logevt.from] = logevt.msg->info;
6958 return discard_event();
6959 }
6960
6961 boost::statechart::result PeeringState::WaitUpThru::react(const QueryState& q)
6962 {
6963 q.f->open_object_section("state");
6964 q.f->dump_string("name", state_name);
6965 q.f->dump_stream("enter_time") << enter_time;
6966 q.f->dump_string("comment", "waiting for osdmap to reflect a new up_thru for this osd");
6967 q.f->close_section();
6968 return forward_event();
6969 }
6970
6971 void PeeringState::WaitUpThru::exit()
6972 {
6973 context< PeeringMachine >().log_exit(state_name, enter_time);
6974 DECLARE_LOCALS;
6975 utime_t dur = ceph_clock_now() - enter_time;
6976 pl->get_peering_perf().tinc(rs_waitupthru_latency, dur);
6977 }
6978
6979 /*----PeeringState::PeeringMachine Methods-----*/
6980 #undef dout_prefix
6981 #define dout_prefix dpp->gen_prefix(*_dout)
6982
6983 void PeeringState::PeeringMachine::log_enter(const char *state_name)
6984 {
6985 DECLARE_LOCALS;
6986 psdout(5) << "enter " << state_name << dendl;
6987 pl->log_state_enter(state_name);
6988 }
6989
6990 void PeeringState::PeeringMachine::log_exit(const char *state_name, utime_t enter_time)
6991 {
6992 DECLARE_LOCALS;
6993 utime_t dur = ceph_clock_now() - enter_time;
6994 psdout(5) << "exit " << state_name << " " << dur << " " << event_count << " " << event_time << dendl;
6995 pl->log_state_exit(state_name, enter_time, event_count, event_time);
6996 event_count = 0;
6997 event_time = utime_t();
6998 }
6999
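// Debug-string legend for the operator<< below: r=role,
// lpr=last_peering_reset, pi=past_intervals, luod=last_update_ondisk,
// lua=last_update_applied, crt=can_rollback_to,
// lcod=last_complete_ondisk, mlcod=min_last_complete_ondisk,
// pruub=prior_readable_until_ub.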
7000 ostream &operator<<(ostream &out, const PeeringState &ps) {
7001 out << "pg[" << ps.info
7002 << " " << pg_vector_string(ps.up);
7003 if (ps.acting != ps.up)
7004 out << "/" << pg_vector_string(ps.acting);
7005 if (ps.is_ec_pg())
7006 out << "p" << ps.get_primary();
7007 if (!ps.async_recovery_targets.empty())
7008 out << " async=[" << ps.async_recovery_targets << "]";
7009 if (!ps.backfill_targets.empty())
7010 out << " backfill=[" << ps.backfill_targets << "]";
7011 out << " r=" << ps.get_role();
7012 out << " lpr=" << ps.get_last_peering_reset();
7013
7014 if (ps.deleting)
7015 out << " DELETING";
7016
7017 if (!ps.past_intervals.empty()) {
7018 out << " pi=[" << ps.past_intervals.get_bounds()
7019 << ")/" << ps.past_intervals.size();
7020 }
7021
7022 if (ps.is_peered()) {
7023 if (ps.last_update_ondisk != ps.info.last_update)
7024 out << " luod=" << ps.last_update_ondisk;
7025 if (ps.last_update_applied != ps.info.last_update)
7026 out << " lua=" << ps.last_update_applied;
7027 }
7028
7029 if (ps.pg_log.get_tail() != ps.info.log_tail ||
7030 ps.pg_log.get_head() != ps.info.last_update)
7031 out << " (info mismatch, " << ps.pg_log.get_log() << ")";
7032
7033 if (!ps.pg_log.get_log().empty()) {
7034 if ((ps.pg_log.get_log().log.begin()->version <= ps.pg_log.get_tail())) {
7035 out << " (log bound mismatch, actual=["
7036 << ps.pg_log.get_log().log.begin()->version << ","
7037 << ps.pg_log.get_log().log.rbegin()->version << "]";
7038 out << ")";
7039 }
7040 }
7041
7042 out << " crt=" << ps.pg_log.get_can_rollback_to();
7043
7044 if (ps.last_complete_ondisk != ps.info.last_complete)
7045 out << " lcod " << ps.last_complete_ondisk;
7046
7047 out << " mlcod " << ps.min_last_complete_ondisk;
7048
7049 out << " " << pg_state_string(ps.get_state());
7050 if (ps.should_send_notify())
7051 out << " NOTIFY";
7052
7053 if (ps.prior_readable_until_ub != ceph::signedspan::zero()) {
7054 out << " pruub " << ps.prior_readable_until_ub
7055 << "@" << ps.get_prior_readable_down_osds();
7056 }
7057 return out;
7058 }