// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab

#include "PGPeeringEvent.h"
#include "common/ceph_releases.h"
#include "common/dout.h"
#include "PeeringState.h"

#include "messages/MOSDPGRemove.h"
#include "messages/MBackfillReserve.h"
#include "messages/MRecoveryReserve.h"
#include "messages/MOSDScrubReserve.h"
#include "messages/MOSDPGInfo.h"
#include "messages/MOSDPGInfo2.h"
#include "messages/MOSDPGTrim.h"
#include "messages/MOSDPGLog.h"
#include "messages/MOSDPGNotify.h"
#include "messages/MOSDPGNotify2.h"
#include "messages/MOSDPGQuery.h"
#include "messages/MOSDPGQuery2.h"
#include "messages/MOSDPGLease.h"
#include "messages/MOSDPGLeaseAck.h"

#define dout_context cct
#define dout_subsys ceph_subsys_osd

BufferedRecoveryMessages::BufferedRecoveryMessages(
  ceph_release_t r,
  PeeringCtx &ctx)
  : require_osd_release(r) {
  // steal messages from ctx
  message_map.swap(ctx.message_map);
}

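// The send_* helpers below buffer outgoing peering messages and choose a
// wire format based on the cluster's minimum OSD release: octopus and
// later peers get the per-PG *2 message types (e.g. MOSDPGNotify2), while
// older peers get the legacy batched encodings.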
void BufferedRecoveryMessages::send_notify(int to, const pg_notify_t &n)
{
  if (require_osd_release >= ceph_release_t::octopus) {
    spg_t pgid(n.info.pgid.pgid, n.to);
    send_osd_message(to, make_message<MOSDPGNotify2>(pgid, n));
  } else {
    send_osd_message(to, make_message<MOSDPGNotify>(n.epoch_sent, vector{n}));
  }
}

void BufferedRecoveryMessages::send_query(
  int to,
  spg_t to_spgid,
  const pg_query_t &q)
{
  if (require_osd_release >= ceph_release_t::octopus) {
    send_osd_message(to,
                     make_message<MOSDPGQuery2>(to_spgid, q));
  } else {
    auto m = make_message<MOSDPGQuery>(
      q.epoch_sent,
      MOSDPGQuery::pg_list_t{{to_spgid, q}});
    send_osd_message(to, m);
  }
}

void BufferedRecoveryMessages::send_info(
  int to,
  spg_t to_spgid,
  epoch_t min_epoch,
  epoch_t cur_epoch,
  const pg_info_t &info,
  std::optional<pg_lease_t> lease,
  std::optional<pg_lease_ack_t> lease_ack)
{
  if (require_osd_release >= ceph_release_t::octopus) {
    send_osd_message(
      to,
      make_message<MOSDPGInfo2>(
        to_spgid,
        info,
        cur_epoch,
        min_epoch,
        lease,
        lease_ack)
      );
  } else {
    send_osd_message(
      to,
      make_message<MOSDPGInfo>(
        cur_epoch,
        vector{pg_notify_t{to_spgid.shard,
                           info.pgid.shard,
                           min_epoch, cur_epoch,
                           info, PastIntervals{}}})
      );
  }
}

void PGPool::update(CephContext *cct, OSDMapRef map)
{
  const pg_pool_t *pi = map->get_pg_pool(id);
  if (!pi) {
    return; // pool has been deleted
  }
  info = *pi;
  name = map->get_pool_name(id);

  bool updated = false;
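  // Refresh the cached snap context if we may have skipped an epoch
  // (cached_epoch is not the immediately preceding epoch) or the pool's
  // snapshot set changed in this very epoch.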
  if ((map->get_epoch() != cached_epoch + 1) ||
      (pi->get_snap_epoch() == map->get_epoch())) {
    updated = true;
  }

  if (info.is_pool_snaps_mode() && updated) {
    snapc = pi->get_snap_context();
  }
  cached_epoch = map->get_epoch();
}

/*-------------Peering State Helpers----------------*/
#undef dout_prefix
#define dout_prefix (dpp->gen_prefix(*_dout))
#undef psdout
#define psdout(x) ldout(cct, x)

PeeringState::PeeringState(
  CephContext *cct,
  pg_shard_t pg_whoami,
  spg_t spgid,
  const PGPool &_pool,
  OSDMapRef curmap,
  DoutPrefixProvider *dpp,
  PeeringListener *pl)
  : state_history(*pl),
    cct(cct),
    spgid(spgid),
    dpp(dpp),
    pl(pl),
    orig_ctx(nullptr),
    osdmap_ref(curmap),
    pool(_pool),
    pg_whoami(pg_whoami),
    info(spgid),
    pg_log(cct),
    missing_loc(spgid, this, dpp, cct),
    machine(this, cct, spgid, dpp, pl, &state_history)
{
  machine.initiate();
}

void PeeringState::start_handle(PeeringCtx *new_ctx) {
  ceph_assert(!rctx);
  ceph_assert(!orig_ctx);
  orig_ctx = new_ctx;
  if (new_ctx) {
    if (messages_pending_flush) {
      rctx.emplace(*messages_pending_flush, *new_ctx);
    } else {
      rctx.emplace(*new_ctx);
    }
    rctx->start_time = ceph_clock_now();
  }
}

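// begin_block_outgoing/clear_blocked_outgoing/end_block_outgoing manage a
// small buffering state machine: while a flush is in flight, outgoing
// messages are parked in messages_pending_flush and rctx is re-pointed at
// that buffer; end_block_outgoing folds the parked messages back into the
// original context once the flush completes.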
void PeeringState::begin_block_outgoing() {
  ceph_assert(!messages_pending_flush);
  ceph_assert(orig_ctx);
  ceph_assert(rctx);
  messages_pending_flush = BufferedRecoveryMessages(
    orig_ctx->require_osd_release);
  rctx.emplace(*messages_pending_flush, *orig_ctx);
}

void PeeringState::clear_blocked_outgoing() {
  ceph_assert(orig_ctx);
  ceph_assert(rctx);
  messages_pending_flush = std::optional<BufferedRecoveryMessages>();
}

void PeeringState::end_block_outgoing() {
  ceph_assert(messages_pending_flush);
  ceph_assert(orig_ctx);
  ceph_assert(rctx);

  orig_ctx->accept_buffered_messages(*messages_pending_flush);
  rctx.emplace(*orig_ctx);
  messages_pending_flush = std::optional<BufferedRecoveryMessages>();
}

void PeeringState::end_handle() {
  if (rctx) {
    utime_t dur = ceph_clock_now() - rctx->start_time;
    machine.event_time += dur;
  }

  machine.event_count++;
  rctx = std::nullopt;
  orig_ctx = nullptr;
}

void PeeringState::check_recovery_sources(const OSDMapRef& osdmap)
{
  /*
   * check that any peers we are planning to pull objects from (or are
   * currently pulling from) are dealt with.
   */
  missing_loc.check_recovery_sources(osdmap);
  pl->check_recovery_sources(osdmap);

  for (set<pg_shard_t>::iterator i = peer_log_requested.begin();
       i != peer_log_requested.end();
       ) {
    if (!osdmap->is_up(i->osd)) {
      psdout(10) << "peer_log_requested removing " << *i << dendl;
      peer_log_requested.erase(i++);
    } else {
      ++i;
    }
  }

  for (set<pg_shard_t>::iterator i = peer_missing_requested.begin();
       i != peer_missing_requested.end();
       ) {
    if (!osdmap->is_up(i->osd)) {
      psdout(10) << "peer_missing_requested removing " << *i << dendl;
      peer_missing_requested.erase(i++);
    } else {
      ++i;
    }
  }
}

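// update_history: pg_history_t stores prior_readable_until_ub as a delta
// relative to "now", so the bound is refreshed against the local monotonic
// clock (mnow) before the merge and re-derived from the merged history
// afterwards.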
void PeeringState::update_history(const pg_history_t& new_history)
{
  auto mnow = pl->get_mnow();
  info.history.refresh_prior_readable_until_ub(mnow, prior_readable_until_ub);
  if (info.history.merge(new_history)) {
    psdout(20) << __func__ << " advanced history from " << new_history << dendl;
    dirty_info = true;
    if (info.history.last_epoch_clean >= info.history.same_interval_since) {
      psdout(20) << __func__ << " clearing past_intervals" << dendl;
      past_intervals.clear();
      dirty_big_info = true;
    }
    prior_readable_until_ub = info.history.get_prior_readable_until_ub(mnow);
    if (prior_readable_until_ub != ceph::signedspan::zero()) {
      dout(20) << __func__
               << " prior_readable_until_ub " << prior_readable_until_ub
               << " (mnow " << mnow << " + "
               << info.history.prior_readable_until_ub << ")" << dendl;
    }
  }
  pl->on_info_history_change();
}

void PeeringState::purge_strays()
{
  if (is_premerge()) {
    psdout(10) << "purge_strays " << stray_set << " but premerge, doing nothing"
               << dendl;
    return;
  }
  if (cct->_conf.get_val<bool>("osd_debug_no_purge_strays")) {
    return;
  }
  psdout(10) << "purge_strays " << stray_set << dendl;

  bool removed = false;
  for (set<pg_shard_t>::iterator p = stray_set.begin();
       p != stray_set.end();
       ++p) {
    ceph_assert(!is_acting_recovery_backfill(*p));
    if (get_osdmap()->is_up(p->osd)) {
      psdout(10) << "sending PGRemove to osd." << *p << dendl;
      vector<spg_t> to_remove;
      to_remove.push_back(spg_t(info.pgid.pgid, p->shard));
      MOSDPGRemove *m = new MOSDPGRemove(
        get_osdmap_epoch(),
        to_remove);
      pl->send_cluster_message(p->osd, m, get_osdmap_epoch());
    } else {
      psdout(10) << "not sending PGRemove to down osd." << *p << dendl;
    }
    peer_missing.erase(*p);
    peer_info.erase(*p);
    missing_loc.remove_stray_recovery_sources(*p);
    peer_purged.insert(*p);
    removed = true;
  }

  // if we removed anyone, update peers (which includes peer_info)
  if (removed)
    update_heartbeat_peers();

  stray_set.clear();

  // clear _requested maps; we may have to peer() again if we discover
  // (more) stray content
  peer_log_requested.clear();
  peer_missing_requested.clear();
}


bool PeeringState::proc_replica_info(
  pg_shard_t from, const pg_info_t &oinfo, epoch_t send_epoch)
{
  map<pg_shard_t, pg_info_t>::iterator p = peer_info.find(from);
  if (p != peer_info.end() && p->second.last_update == oinfo.last_update) {
    psdout(10) << " got dup osd." << from << " info "
               << oinfo << ", identical to ours" << dendl;
    return false;
  }

  if (!get_osdmap()->has_been_up_since(from.osd, send_epoch)) {
    psdout(10) << " got info " << oinfo << " from down osd." << from
               << " discarding" << dendl;
    return false;
  }

  psdout(10) << " got osd." << from << " " << oinfo << dendl;
  ceph_assert(is_primary());
  peer_info[from] = oinfo;
  might_have_unfound.insert(from);

  update_history(oinfo.history);

  // stray?
  if (!is_up(from) && !is_acting(from)) {
    psdout(10) << " osd." << from << " has stray content: " << oinfo << dendl;
    stray_set.insert(from);
    if (is_clean()) {
      purge_strays();
    }
  }

  // was this a new info?  if so, update peers!
  if (p == peer_info.end())
    update_heartbeat_peers();

  return true;
}


void PeeringState::remove_down_peer_info(const OSDMapRef &osdmap)
{
  // Remove any downed osds from peer_info
  bool removed = false;
  map<pg_shard_t, pg_info_t>::iterator p = peer_info.begin();
  while (p != peer_info.end()) {
    if (!osdmap->is_up(p->first.osd)) {
      psdout(10) << " dropping down osd." << p->first << " info " << p->second << dendl;
      peer_missing.erase(p->first);
      peer_log_requested.erase(p->first);
      peer_missing_requested.erase(p->first);
      peer_info.erase(p++);
      removed = true;
    } else
      ++p;
  }

  // Remove any downed osds from peer_purged so we can re-purge if necessary
  auto it = peer_purged.begin();
  while (it != peer_purged.end()) {
    if (!osdmap->is_up(it->osd)) {
      psdout(10) << " dropping down osd." << *it << " from peer_purged" << dendl;
      peer_purged.erase(it++);
    } else {
      ++it;
    }
  }

  // if we removed anyone, update peers (which includes peer_info)
  if (removed)
    update_heartbeat_peers();

  check_recovery_sources(osdmap);
}

void PeeringState::update_heartbeat_peers()
{
  if (!is_primary())
    return;

  set<int> new_peers;
  for (unsigned i=0; i<acting.size(); i++) {
    if (acting[i] != CRUSH_ITEM_NONE)
      new_peers.insert(acting[i]);
  }
  for (unsigned i=0; i<up.size(); i++) {
    if (up[i] != CRUSH_ITEM_NONE)
      new_peers.insert(up[i]);
  }
  for (map<pg_shard_t,pg_info_t>::iterator p = peer_info.begin();
       p != peer_info.end();
       ++p) {
    new_peers.insert(p->first.osd);
  }
  pl->update_heartbeat_peers(std::move(new_peers));
}

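// dirty_info marks the frequently-updated pg_info_t for persistence, while
// dirty_big_info marks the larger, rarely-changing state (such as
// past_intervals) as well. prepare_write() is also told whether the
// current osdmap epoch is newer than the last one persisted.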
void PeeringState::write_if_dirty(ObjectStore::Transaction& t)
{
  pl->prepare_write(
    info,
    last_written_info,
    past_intervals,
    pg_log,
    dirty_info,
    dirty_big_info,
    last_persisted_osdmap < get_osdmap_epoch(),
    t);
  if (dirty_info || dirty_big_info) {
    last_persisted_osdmap = get_osdmap_epoch();
    last_written_info = info;
    dirty_info = false;
    dirty_big_info = false;
  }
}

void PeeringState::advance_map(
  OSDMapRef osdmap, OSDMapRef lastmap,
  vector<int>& newup, int up_primary,
  vector<int>& newacting, int acting_primary,
  PeeringCtx &rctx)
{
  ceph_assert(lastmap == osdmap_ref);
  psdout(10) << "handle_advance_map "
             << newup << "/" << newacting
             << " -- " << up_primary << "/" << acting_primary
             << dendl;

  update_osdmap_ref(osdmap);
  pool.update(cct, osdmap);

  AdvMap evt(
    osdmap, lastmap, newup, up_primary,
    newacting, acting_primary);
  handle_event(evt, &rctx);
  if (pool.info.last_change == osdmap_ref->get_epoch()) {
    pl->on_pool_change();
  }
  readable_interval = pool.get_readable_interval();
  last_require_osd_release = osdmap->require_osd_release;
}

void PeeringState::activate_map(PeeringCtx &rctx)
{
  psdout(10) << __func__ << dendl;
  ActMap evt;
  handle_event(evt, &rctx);
  if (osdmap_ref->get_epoch() - last_persisted_osdmap >
      cct->_conf->osd_pg_epoch_persisted_max_stale) {
    psdout(20) << __func__ << ": Dirtying info: last_persisted is "
               << last_persisted_osdmap
               << " while current is " << osdmap_ref->get_epoch() << dendl;
    dirty_info = true;
  } else {
    psdout(20) << __func__ << ": Not dirtying info: last_persisted is "
               << last_persisted_osdmap
               << " while current is " << osdmap_ref->get_epoch() << dendl;
  }
  write_if_dirty(rctx.transaction);

  if (get_osdmap()->check_new_blacklist_entries()) {
    pl->check_blacklisted_watchers();
  }
}

void PeeringState::set_last_peering_reset()
{
  psdout(20) << "set_last_peering_reset " << get_osdmap_epoch() << dendl;
  if (last_peering_reset != get_osdmap_epoch()) {
    last_peering_reset = get_osdmap_epoch();
    psdout(10) << "Clearing blocked outgoing recovery messages" << dendl;
    clear_blocked_outgoing();
    if (!pl->try_flush_or_schedule_async()) {
      psdout(10) << "Beginning to block outgoing recovery messages" << dendl;
      begin_block_outgoing();
    } else {
      psdout(10) << "Not blocking outgoing recovery messages" << dendl;
    }
  }
}

void PeeringState::complete_flush()
{
  flushes_in_progress--;
  if (flushes_in_progress == 0) {
    pl->on_flushed();
  }
}

void PeeringState::check_full_transition(OSDMapRef lastmap, OSDMapRef osdmap)
{
  const pg_pool_t *pi = osdmap->get_pg_pool(info.pgid.pool());
  if (!pi) {
    return; // pool deleted
  }
  bool changed = false;
  if (pi->has_flag(pg_pool_t::FLAG_FULL)) {
    const pg_pool_t *opi = lastmap->get_pg_pool(info.pgid.pool());
    if (!opi || !opi->has_flag(pg_pool_t::FLAG_FULL)) {
      psdout(10) << " pool was marked full in " << osdmap->get_epoch() << dendl;
      changed = true;
    }
  }
  if (changed) {
    info.history.last_epoch_marked_full = osdmap->get_epoch();
    dirty_info = true;
  }
}

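// A peering restart is required whenever the mapping changes in a way that
// PastIntervals::is_new_interval considers significant (up/acting
// membership or either primary), or when this OSD itself transitions from
// down to up.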
bool PeeringState::should_restart_peering(
  int newupprimary,
  int newactingprimary,
  const vector<int>& newup,
  const vector<int>& newacting,
  OSDMapRef lastmap,
  OSDMapRef osdmap)
{
  if (PastIntervals::is_new_interval(
        primary.osd,
        newactingprimary,
        acting,
        newacting,
        up_primary.osd,
        newupprimary,
        up,
        newup,
        osdmap.get(),
        lastmap.get(),
        info.pgid.pgid)) {
    psdout(20) << "new interval newup " << newup
               << " newacting " << newacting << dendl;
    return true;
  }
  if (!lastmap->is_up(pg_whoami.osd) && osdmap->is_up(pg_whoami.osd)) {
    psdout(10) << __func__ << " osd transitioned from down -> up"
               << dendl;
    return true;
  }
  return false;
}

/* Called before initializing peering during advance_map */
void PeeringState::start_peering_interval(
  const OSDMapRef lastmap,
  const vector<int>& newup, int new_up_primary,
  const vector<int>& newacting, int new_acting_primary,
  ObjectStore::Transaction &t)
{
  const OSDMapRef osdmap = get_osdmap();

  set_last_peering_reset();

  vector<int> oldacting, oldup;
  int oldrole = get_role();

  if (is_primary()) {
    pl->clear_ready_to_merge();
  }


  pg_shard_t old_acting_primary = get_primary();
  pg_shard_t old_up_primary = up_primary;
  bool was_old_primary = is_primary();
  bool was_old_nonprimary = is_nonprimary();

  acting.swap(oldacting);
  up.swap(oldup);
  init_primary_up_acting(
    newup,
    newacting,
    new_up_primary,
    new_acting_primary);

  if (info.stats.up != up ||
      info.stats.acting != acting ||
      info.stats.up_primary != new_up_primary ||
      info.stats.acting_primary != new_acting_primary) {
    info.stats.up = up;
    info.stats.up_primary = new_up_primary;
    info.stats.acting = acting;
    info.stats.acting_primary = new_acting_primary;
    info.stats.mapping_epoch = osdmap->get_epoch();
  }

  pl->clear_publish_stats();

  // This will now be remapped during a backfill in cases
  // where it would not have been before.
  if (up != acting)
    state_set(PG_STATE_REMAPPED);
  else
    state_clear(PG_STATE_REMAPPED);

  int role = osdmap->calc_pg_role(pg_whoami, acting);
  set_role(role);

  // did acting, up, primary|acker change?
  if (!lastmap) {
    psdout(10) << " no lastmap" << dendl;
    dirty_info = true;
    dirty_big_info = true;
    info.history.same_interval_since = osdmap->get_epoch();
  } else {
    std::stringstream debug;
    ceph_assert(info.history.same_interval_since != 0);
    bool new_interval = PastIntervals::check_new_interval(
      old_acting_primary.osd,
      new_acting_primary,
      oldacting, newacting,
      old_up_primary.osd,
      new_up_primary,
      oldup, newup,
      info.history.same_interval_since,
      info.history.last_epoch_clean,
      osdmap.get(),
      lastmap.get(),
      info.pgid.pgid,
      missing_loc.get_recoverable_predicate(),
      &past_intervals,
      &debug);
    psdout(10) << __func__ << ": check_new_interval output: "
               << debug.str() << dendl;
    if (new_interval) {
      if (osdmap->get_epoch() == pl->oldest_stored_osdmap() &&
          info.history.last_epoch_clean < osdmap->get_epoch()) {
        psdout(10) << " map gap, clearing past_intervals and faking" << dendl;
        // our information is incomplete and useless; someone else was
        // clean after everything we know about, if osdmaps have been
        // trimmed.
        past_intervals.clear();
      } else {
        psdout(10) << " noting past " << past_intervals << dendl;
      }
      dirty_info = true;
      dirty_big_info = true;
      info.history.same_interval_since = osdmap->get_epoch();
      if (osdmap->have_pg_pool(info.pgid.pgid.pool()) &&
          info.pgid.pgid.is_split(lastmap->get_pg_num(info.pgid.pgid.pool()),
                                  osdmap->get_pg_num(info.pgid.pgid.pool()),
                                  nullptr)) {
        info.history.last_epoch_split = osdmap->get_epoch();
      }
    }
  }

  if (old_up_primary != up_primary ||
      oldup != up) {
    info.history.same_up_since = osdmap->get_epoch();
  }
  // this comparison includes primary rank via pg_shard_t
  if (old_acting_primary != get_primary()) {
    info.history.same_primary_since = osdmap->get_epoch();
  }

  on_new_interval();
  pl->on_info_history_change();

  psdout(1) << __func__ << " up " << oldup << " -> " << up
            << ", acting " << oldacting << " -> " << acting
            << ", acting_primary " << old_acting_primary << " -> "
            << new_acting_primary
            << ", up_primary " << old_up_primary << " -> " << new_up_primary
            << ", role " << oldrole << " -> " << role
            << ", features acting " << acting_features
            << " upacting " << upacting_features
            << dendl;

  // deactivate.
  state_clear(PG_STATE_ACTIVE);
  state_clear(PG_STATE_PEERED);
  state_clear(PG_STATE_PREMERGE);
  state_clear(PG_STATE_DOWN);
  state_clear(PG_STATE_RECOVERY_WAIT);
  state_clear(PG_STATE_RECOVERY_TOOFULL);
  state_clear(PG_STATE_RECOVERING);

  peer_purged.clear();
  acting_recovery_backfill.clear();

  // reset primary/replica state?
  if (was_old_primary || is_primary()) {
    pl->clear_want_pg_temp();
  } else if (was_old_nonprimary || is_nonprimary()) {
    pl->clear_want_pg_temp();
  }
  clear_primary_state();

  pl->on_change(t);

  ceph_assert(!deleting);

  // should we tell the primary we are here?
  send_notify = !is_primary();

  if (role != oldrole ||
      was_old_primary != is_primary()) {
    // did primary change?
    if (was_old_primary != is_primary()) {
      state_clear(PG_STATE_CLEAN);
    }

    pl->on_role_change();
  } else {
    // no role change.
    // did primary change?
    if (get_primary() != old_acting_primary) {
      psdout(10) << oldacting << " -> " << acting
                 << ", acting primary "
                 << old_acting_primary << " -> " << get_primary()
                 << dendl;
    } else {
      // primary is the same.
      if (is_primary()) {
        // i am (still) primary. but my replica set changed.
        state_clear(PG_STATE_CLEAN);

        psdout(10) << oldacting << " -> " << acting
                   << ", replicas changed" << dendl;
      }
    }
  }

  if (acting.empty() && !up.empty() && up_primary == pg_whoami) {
    psdout(10) << " acting empty, but i am up[0], clearing pg_temp" << dendl;
    pl->queue_want_pg_temp(acting);
  }
}

void PeeringState::on_new_interval()
{
  dout(20) << __func__ << dendl;
  const OSDMapRef osdmap = get_osdmap();

  // initialize features
  acting_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
  upacting_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
  for (vector<int>::iterator p = acting.begin(); p != acting.end(); ++p) {
    if (*p == CRUSH_ITEM_NONE)
      continue;
    uint64_t f = osdmap->get_xinfo(*p).features;
    acting_features &= f;
    upacting_features &= f;
  }
  for (vector<int>::iterator p = up.begin(); p != up.end(); ++p) {
    if (*p == CRUSH_ITEM_NONE)
      continue;
    upacting_features &= osdmap->get_xinfo(*p).features;
  }
  psdout(20) << __func__ << " upacting_features 0x" << std::hex
             << upacting_features << std::dec
             << " from " << acting << "+" << up << dendl;

  psdout(20) << __func__ << " checking missing set deletes flag. missing = "
             << get_pg_log().get_missing() << dendl;

  if (!pg_log.get_missing().may_include_deletes &&
      !perform_deletes_during_peering()) {
    pl->rebuild_missing_set_with_deletes(pg_log);
  }
  ceph_assert(
    pg_log.get_missing().may_include_deletes ==
    !perform_deletes_during_peering());

  init_hb_stamps();

  // update lease bounds for a new interval
  auto mnow = pl->get_mnow();
  prior_readable_until_ub = std::max(prior_readable_until_ub,
                                     readable_until_ub);
  prior_readable_until_ub = info.history.refresh_prior_readable_until_ub(
    mnow, prior_readable_until_ub);
  psdout(10) << __func__ << " prior_readable_until_ub "
             << prior_readable_until_ub << " (mnow " << mnow << " + "
             << info.history.prior_readable_until_ub << ")" << dendl;
  prior_readable_down_osds.clear(); // we populate this when we build the priorset

  readable_until =
    readable_until_ub =
    readable_until_ub_sent =
    readable_until_ub_from_primary = ceph::signedspan::zero();

  acting_readable_until_ub.clear();
  if (is_primary()) {
    acting_readable_until_ub.resize(acting.size(), ceph::signedspan::zero());
  }

  pl->on_new_interval();
}

void PeeringState::init_primary_up_acting(
  const vector<int> &newup,
  const vector<int> &newacting,
  int new_up_primary,
  int new_acting_primary)
{
  actingset.clear();
  acting = newacting;
  for (uint8_t i = 0; i < acting.size(); ++i) {
    if (acting[i] != CRUSH_ITEM_NONE)
      actingset.insert(
        pg_shard_t(
          acting[i],
          pool.info.is_erasure() ? shard_id_t(i) : shard_id_t::NO_SHARD));
  }
  upset.clear();
  up = newup;
  for (uint8_t i = 0; i < up.size(); ++i) {
    if (up[i] != CRUSH_ITEM_NONE)
      upset.insert(
        pg_shard_t(
          up[i],
          pool.info.is_erasure() ? shard_id_t(i) : shard_id_t::NO_SHARD));
  }
  if (!pool.info.is_erasure()) {
    // replicated
    up_primary = pg_shard_t(new_up_primary, shard_id_t::NO_SHARD);
    primary = pg_shard_t(new_acting_primary, shard_id_t::NO_SHARD);
  } else {
    // erasure
    up_primary = pg_shard_t();
    primary = pg_shard_t();
    for (uint8_t i = 0; i < up.size(); ++i) {
      if (up[i] == new_up_primary) {
        up_primary = pg_shard_t(up[i], shard_id_t(i));
        break;
      }
    }
    for (uint8_t i = 0; i < acting.size(); ++i) {
      if (acting[i] == new_acting_primary) {
        primary = pg_shard_t(acting[i], shard_id_t(i));
        break;
      }
    }
    ceph_assert(up_primary.osd == new_up_primary);
    ceph_assert(primary.osd == new_acting_primary);
  }
}

void PeeringState::init_hb_stamps()
{
  if (is_primary()) {
    // we care about all other osds in the acting set
    hb_stamps.resize(acting.size() - 1);
    unsigned i = 0;
    for (auto p : acting) {
      if (p == CRUSH_ITEM_NONE || p == get_primary().osd) {
        continue;
      }
      hb_stamps[i++] = pl->get_hb_stamps(p);
    }
    hb_stamps.resize(i);
  } else if (is_nonprimary()) {
    // we care about just the primary
    hb_stamps.resize(1);
    hb_stamps[0] = pl->get_hb_stamps(get_primary().osd);
  } else {
    hb_stamps.clear();
  }
  dout(10) << __func__ << " now " << hb_stamps << dendl;
}


void PeeringState::clear_recovery_state()
{
  async_recovery_targets.clear();
  backfill_targets.clear();
}

void PeeringState::clear_primary_state()
{
  psdout(10) << "clear_primary_state" << dendl;

  // clear peering state
  stray_set.clear();
  peer_log_requested.clear();
  peer_missing_requested.clear();
  peer_info.clear();
  peer_bytes.clear();
  peer_missing.clear();
  peer_last_complete_ondisk.clear();
  peer_activated.clear();
  min_last_complete_ondisk = eversion_t();
  pg_trim_to = eversion_t();
  might_have_unfound.clear();
  need_up_thru = false;
  missing_loc.clear();
  pg_log.reset_recovery_pointers();

  clear_recovery_state();

  last_update_ondisk = eversion_t();
  missing_loc.clear();
  pl->clear_primary_state();
}

/// return [start,end) bounds for required past_intervals
static pair<epoch_t, epoch_t> get_required_past_interval_bounds(
  const pg_info_t &info,
  epoch_t oldest_map) {
  epoch_t start = std::max(
    info.history.last_epoch_clean ? info.history.last_epoch_clean :
      info.history.epoch_pool_created,
    oldest_map);
  epoch_t end = std::max(
    info.history.same_interval_since,
    info.history.epoch_pool_created);
  return make_pair(start, end);
}


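// Example with illustrative numbers: last_epoch_clean = 550,
// same_interval_since = 600 and oldest_map = 500 give required bounds
// [550, 600): every interval from the last clean epoch up to (but not
// including) the current one must be accounted for.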
void PeeringState::check_past_interval_bounds() const
{
  auto oldest_epoch = pl->oldest_stored_osdmap();
  auto rpib = get_required_past_interval_bounds(
    info,
    oldest_epoch);
  if (rpib.first >= rpib.second) {
    // do not warn if the start bound is dictated by oldest_map; the
    // past intervals are presumably appropriate given the pg info.
    if (!past_intervals.empty() &&
        rpib.first > oldest_epoch) {
      pl->get_clog_error() << info.pgid << " required past_interval bounds are"
                           << " empty [" << rpib << ") but past_intervals is not: "
                           << past_intervals;
      derr << info.pgid << " required past_interval bounds are"
           << " empty [" << rpib << ") but past_intervals is not: "
           << past_intervals << dendl;
    }
  } else {
    if (past_intervals.empty()) {
      pl->get_clog_error() << info.pgid << " required past_interval bounds are"
                           << " not empty [" << rpib << ") but past_intervals "
                           << past_intervals << " is empty";
      derr << info.pgid << " required past_interval bounds are"
           << " not empty [" << rpib << ") but past_intervals "
           << past_intervals << " is empty" << dendl;
      ceph_assert(!past_intervals.empty());
    }

    auto apib = past_intervals.get_bounds();
    if (apib.first > rpib.first) {
      pl->get_clog_error() << info.pgid << " past_intervals [" << apib
                           << ") start interval does not contain the required"
                           << " bound [" << rpib << ") start";
      derr << info.pgid << " past_intervals [" << apib
           << ") start interval does not contain the required"
           << " bound [" << rpib << ") start" << dendl;
      ceph_abort_msg("past_interval start interval mismatch");
    }
    if (apib.second != rpib.second) {
      pl->get_clog_error() << info.pgid << " past_interval bound [" << apib
                           << ") end does not match required [" << rpib
                           << ") end";
      derr << info.pgid << " past_interval bound [" << apib
           << ") end does not match required [" << rpib
           << ") end" << dendl;
      ceph_abort_msg("past_interval end mismatch");
    }
  }
}

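// Worked example for clamp_recovery_priority, assuming the usual pool
// priority bounds OSD_POOL_PRIORITY_MIN = -10 and OSD_POOL_PRIORITY_MAX =
// 10: a pool priority of -3 is clamped to [-10, 10], shifted by +10 to 7,
// and added to a base priority of 5 to give 12, which is finally clamped
// to [OSD_RECOVERY_PRIORITY_MIN, max].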
int PeeringState::clamp_recovery_priority(int priority, int pool_recovery_priority, int max)
{
  static_assert(OSD_RECOVERY_PRIORITY_MIN < OSD_RECOVERY_PRIORITY_MAX, "Invalid priority range");
  static_assert(OSD_RECOVERY_PRIORITY_MIN >= 0, "Priority range must match unsigned type");

  ceph_assert(max <= OSD_RECOVERY_PRIORITY_MAX);

  // User can't set this too high anymore, but might be a legacy value
  if (pool_recovery_priority > OSD_POOL_PRIORITY_MAX)
    pool_recovery_priority = OSD_POOL_PRIORITY_MAX;
  if (pool_recovery_priority < OSD_POOL_PRIORITY_MIN)
    pool_recovery_priority = OSD_POOL_PRIORITY_MIN;
  // Shift range from min to max to 0 to max - min
  pool_recovery_priority += (0 - OSD_POOL_PRIORITY_MIN);
  ceph_assert(pool_recovery_priority >= 0 && pool_recovery_priority <= (OSD_POOL_PRIORITY_MAX - OSD_POOL_PRIORITY_MIN));

  priority += pool_recovery_priority;

  // Clamp to valid range
  if (priority > max) {
    return max;
  } else if (priority < OSD_RECOVERY_PRIORITY_MIN) {
    return OSD_RECOVERY_PRIORITY_MIN;
  } else {
    return priority;
  }
}

unsigned PeeringState::get_recovery_priority()
{
  // a higher value -> a higher priority
  int ret = OSD_RECOVERY_PRIORITY_BASE;
  int base = ret;

  if (state & PG_STATE_FORCED_RECOVERY) {
    ret = OSD_RECOVERY_PRIORITY_FORCED;
  } else {
    // XXX: This priority boost isn't so much about inactive, but about data-at-risk
    if (is_degraded() && info.stats.avail_no_missing.size() < pool.info.min_size) {
      base = OSD_RECOVERY_INACTIVE_PRIORITY_BASE;
      // inactive: no. of replicas < min_size, highest priority since it blocks IO
      ret = base + (pool.info.min_size - info.stats.avail_no_missing.size());
    }

    int64_t pool_recovery_priority = 0;
    pool.info.opts.get(pool_opts_t::RECOVERY_PRIORITY, &pool_recovery_priority);

    ret = clamp_recovery_priority(ret, pool_recovery_priority, max_prio_map[base]);
  }
  psdout(20) << __func__ << " recovery priority is " << ret << dendl;
  return static_cast<unsigned>(ret);
}

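// Backfill priority tiers with the default priority constants, highest
// first: forced > below-min_size (inactive, blocks I/O) > undersized >
// degraded > base. The below-min_size and undersized tiers are
// additionally boosted by the number of missing replicas.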
unsigned PeeringState::get_backfill_priority()
{
  // a higher value -> a higher priority
  int ret = OSD_BACKFILL_PRIORITY_BASE;
  int base = ret;

  if (state & PG_STATE_FORCED_BACKFILL) {
    ret = OSD_BACKFILL_PRIORITY_FORCED;
  } else {
    if (actingset.size() < pool.info.min_size) {
      base = OSD_BACKFILL_INACTIVE_PRIORITY_BASE;
      // inactive: no. of replicas < min_size, highest priority since it blocks IO
      ret = base + (pool.info.min_size - actingset.size());

    } else if (is_undersized()) {
      // undersized: OSD_BACKFILL_DEGRADED_PRIORITY_BASE + num missing replicas
      ceph_assert(pool.info.size > actingset.size());
      base = OSD_BACKFILL_DEGRADED_PRIORITY_BASE;
      ret = base + (pool.info.size - actingset.size());

    } else if (is_degraded()) {
      // degraded: baseline degraded
      base = ret = OSD_BACKFILL_DEGRADED_PRIORITY_BASE;
    }

    // Adjust with pool's recovery priority
    int64_t pool_recovery_priority = 0;
    pool.info.opts.get(pool_opts_t::RECOVERY_PRIORITY, &pool_recovery_priority);

    ret = clamp_recovery_priority(ret, pool_recovery_priority, max_prio_map[base]);
  }

  psdout(20) << __func__ << " backfill priority is " << ret << dendl;
  return static_cast<unsigned>(ret);
}

unsigned PeeringState::get_delete_priority()
{
  auto state = get_osdmap()->get_state(pg_whoami.osd);
  if (state & (CEPH_OSD_BACKFILLFULL |
               CEPH_OSD_FULL)) {
    return OSD_DELETE_PRIORITY_FULL;
  } else if (state & CEPH_OSD_NEARFULL) {
    return OSD_DELETE_PRIORITY_FULLISH;
  } else {
    return OSD_DELETE_PRIORITY_NORMAL;
  }
}

bool PeeringState::set_force_recovery(bool b)
{
  bool did = false;
  if (b) {
    if (!(state & PG_STATE_FORCED_RECOVERY) &&
        (state & (PG_STATE_DEGRADED |
                  PG_STATE_RECOVERY_WAIT |
                  PG_STATE_RECOVERING))) {
      psdout(20) << __func__ << " set" << dendl;
      state_set(PG_STATE_FORCED_RECOVERY);
      pl->publish_stats_to_osd();
      did = true;
    }
  } else if (state & PG_STATE_FORCED_RECOVERY) {
    psdout(20) << __func__ << " clear" << dendl;
    state_clear(PG_STATE_FORCED_RECOVERY);
    pl->publish_stats_to_osd();
    did = true;
  }
  if (did) {
    psdout(20) << __func__ << " state " << get_current_state()
               << dendl;
    pl->update_local_background_io_priority(get_recovery_priority());
  }
  return did;
}

bool PeeringState::set_force_backfill(bool b)
{
  bool did = false;
  if (b) {
    if (!(state & PG_STATE_FORCED_BACKFILL) &&
        (state & (PG_STATE_DEGRADED |
                  PG_STATE_BACKFILL_WAIT |
                  PG_STATE_BACKFILLING))) {
      psdout(10) << __func__ << " set" << dendl;
      state_set(PG_STATE_FORCED_BACKFILL);
      pl->publish_stats_to_osd();
      did = true;
    }
  } else if (state & PG_STATE_FORCED_BACKFILL) {
    psdout(10) << __func__ << " clear" << dendl;
    state_clear(PG_STATE_FORCED_BACKFILL);
    pl->publish_stats_to_osd();
    did = true;
  }
  if (did) {
    psdout(20) << __func__ << " state " << get_current_state()
               << dendl;
    pl->update_local_background_io_priority(get_backfill_priority());
  }
  return did;
}

void PeeringState::schedule_renew_lease()
{
  pl->schedule_renew_lease(
    last_peering_reset,
    readable_interval / 2);
}

void PeeringState::send_lease()
{
  epoch_t epoch = pl->get_osdmap_epoch();
  for (auto peer : actingset) {
    if (peer == pg_whoami) {
      continue;
    }
    pl->send_cluster_message(
      peer.osd,
      new MOSDPGLease(epoch,
                      spg_t(spgid.pgid, peer.shard),
                      get_lease()),
      epoch);
  }
}

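// Lease times from the primary are expressed in the primary's monotonic
// clock. A replica translates them into its own clock using the peer
// clock delta bounds tracked by the heartbeat stamps: subtracting the
// delta upper bound yields a conservative (earlier) readable_until, while
// subtracting the delta lower bound yields a conservative (later)
// readable_until_ub. Example with made-up numbers: readable_until = 100s
// and peer_clock_delta_ub = 2s give a local readable_until of 98s.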
void PeeringState::proc_lease(const pg_lease_t& l)
{
  if (!HAVE_FEATURE(upacting_features, SERVER_OCTOPUS)) {
    psdout(20) << __func__ << " no-op, upacting_features 0x" << std::hex
               << upacting_features << std::dec
               << " does not include SERVER_OCTOPUS" << dendl;
    return;
  }
  if (!is_nonprimary()) {
    psdout(20) << __func__ << " no-op, !nonprimary" << dendl;
    return;
  }
  psdout(10) << __func__ << " " << l << dendl;
  if (l.readable_until_ub > readable_until_ub_from_primary) {
    readable_until_ub_from_primary = l.readable_until_ub;
  }

  ceph::signedspan ru = ceph::signedspan::zero();
  if (l.readable_until != ceph::signedspan::zero() &&
      hb_stamps[0]->peer_clock_delta_ub) {
    ru = l.readable_until - *hb_stamps[0]->peer_clock_delta_ub;
    psdout(20) << " peer_clock_delta_ub " << *hb_stamps[0]->peer_clock_delta_ub
               << " -> ru " << ru << dendl;
  }
  if (ru > readable_until) {
    readable_until = ru;
    psdout(20) << __func__ << " readable_until now " << readable_until << dendl;
    // NOTE: if we ever decide to block/queue ops on the replica,
    // we'll need to wake them up here.
  }

  ceph::signedspan ruub;
  if (hb_stamps[0]->peer_clock_delta_lb) {
    ruub = l.readable_until_ub - *hb_stamps[0]->peer_clock_delta_lb;
    psdout(20) << " peer_clock_delta_lb " << *hb_stamps[0]->peer_clock_delta_lb
               << " -> ruub " << ruub << dendl;
  } else {
    ruub = pl->get_mnow() + l.interval;
    psdout(20) << " no peer_clock_delta_lb -> ruub " << ruub << dendl;
  }
  if (ruub > readable_until_ub) {
    readable_until_ub = ruub;
    psdout(20) << __func__ << " readable_until_ub now " << readable_until_ub
               << dendl;
  }
}

void PeeringState::proc_lease_ack(int from, const pg_lease_ack_t& a)
{
  if (!HAVE_FEATURE(upacting_features, SERVER_OCTOPUS)) {
    return;
  }
  auto now = pl->get_mnow();
  bool was_min = false;
  for (unsigned i = 0; i < acting.size(); ++i) {
    if (from == acting[i]) {
      // the lease_ack value is based on the primary's clock
      if (a.readable_until_ub > acting_readable_until_ub[i]) {
        if (acting_readable_until_ub[i] == readable_until) {
          was_min = true;
        }
        acting_readable_until_ub[i] = a.readable_until_ub;
        break;
      }
    }
  }
  if (was_min) {
    auto old_ru = readable_until;
    recalc_readable_until();
    if (now < old_ru) {
      pl->recheck_readable();
    }
  }
}

void PeeringState::proc_renew_lease()
{
  if (!HAVE_FEATURE(upacting_features, SERVER_OCTOPUS)) {
    return;
  }
  renew_lease(pl->get_mnow());
  send_lease();
  schedule_renew_lease();
}

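// The PG is readable only while every acting replica holds the lease, so
// readable_until is the minimum acked upper bound across the acting set,
// capped by the value we last sent.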
void PeeringState::recalc_readable_until()
{
  assert(is_primary());
  ceph::signedspan min = readable_until_ub_sent;
  for (unsigned i = 0; i < acting.size(); ++i) {
    if (acting[i] == pg_whoami.osd || acting[i] == CRUSH_ITEM_NONE) {
      continue;
    }
    dout(20) << __func__ << " peer osd." << acting[i]
             << " ruub " << acting_readable_until_ub[i] << dendl;
    if (acting_readable_until_ub[i] < min) {
      min = acting_readable_until_ub[i];
    }
  }
  readable_until = min;
  readable_until_ub = min;
  dout(20) << __func__ << " readable_until[_ub] " << readable_until
           << " (sent " << readable_until_ub_sent << ")" << dendl;
}

bool PeeringState::check_prior_readable_down_osds(const OSDMapRef& map)
{
  if (!HAVE_FEATURE(upacting_features, SERVER_OCTOPUS)) {
    return false;
  }
  bool changed = false;
  auto p = prior_readable_down_osds.begin();
  while (p != prior_readable_down_osds.end()) {
    if (map->is_dead(*p)) {
      dout(10) << __func__ << " prior_readable_down_osds osd." << *p
               << " is dead as of epoch " << map->get_epoch()
               << dendl;
      p = prior_readable_down_osds.erase(p);
      changed = true;
    } else {
      ++p;
    }
  }
  if (changed && prior_readable_down_osds.empty()) {
    psdout(10) << " empty prior_readable_down_osds, clearing ub" << dendl;
    clear_prior_readable_until_ub();
    return true;
  }
  return false;
}

bool PeeringState::adjust_need_up_thru(const OSDMapRef osdmap)
{
  epoch_t up_thru = osdmap->get_up_thru(pg_whoami.osd);
  if (need_up_thru &&
      up_thru >= info.history.same_interval_since) {
    psdout(10) << "adjust_need_up_thru now "
               << up_thru << ", need_up_thru now false" << dendl;
    need_up_thru = false;
    return true;
  }
  return false;
}

PastIntervals::PriorSet PeeringState::build_prior()
{
  if (1) {
    // sanity check
    for (map<pg_shard_t,pg_info_t>::iterator it = peer_info.begin();
         it != peer_info.end();
         ++it) {
      ceph_assert(info.history.last_epoch_started >=
                  it->second.history.last_epoch_started);
    }
  }

  const OSDMap &osdmap = *get_osdmap();
  PastIntervals::PriorSet prior = past_intervals.get_prior_set(
    pool.info.is_erasure(),
    info.history.last_epoch_started,
    &missing_loc.get_recoverable_predicate(),
    [&](epoch_t start, int osd, epoch_t *lost_at) {
      const osd_info_t *pinfo = nullptr;
      if (osdmap.exists(osd)) {
        pinfo = &osdmap.get_info(osd);
        if (lost_at)
          *lost_at = pinfo->lost_at;
      }

      if (osdmap.is_up(osd)) {
        return PastIntervals::UP;
      } else if (!pinfo) {
        return PastIntervals::DNE;
      } else if (pinfo->lost_at > start) {
        return PastIntervals::LOST;
      } else {
        return PastIntervals::DOWN;
      }
    },
    up,
    acting,
    dpp);

  if (prior.pg_down) {
    state_set(PG_STATE_DOWN);
  }

  if (get_osdmap()->get_up_thru(pg_whoami.osd) <
      info.history.same_interval_since) {
    psdout(10) << "up_thru " << get_osdmap()->get_up_thru(pg_whoami.osd)
               << " < same_since " << info.history.same_interval_since
               << ", must notify monitor" << dendl;
    need_up_thru = true;
  } else {
    psdout(10) << "up_thru " << get_osdmap()->get_up_thru(pg_whoami.osd)
               << " >= same_since " << info.history.same_interval_since
               << ", all is well" << dendl;
    need_up_thru = false;
  }
  pl->set_probe_targets(prior.probe);
  return prior;
}

bool PeeringState::needs_recovery() const
{
  ceph_assert(is_primary());

  auto &missing = pg_log.get_missing();

  if (missing.num_missing()) {
    psdout(10) << __func__ << " primary has " << missing.num_missing()
               << " missing" << dendl;
    return true;
  }

  ceph_assert(!acting_recovery_backfill.empty());
  set<pg_shard_t>::const_iterator end = acting_recovery_backfill.end();
  set<pg_shard_t>::const_iterator a = acting_recovery_backfill.begin();
  for (; a != end; ++a) {
    if (*a == get_primary()) continue;
    pg_shard_t peer = *a;
    map<pg_shard_t, pg_missing_t>::const_iterator pm = peer_missing.find(peer);
    if (pm == peer_missing.end()) {
      psdout(10) << __func__ << " osd." << peer << " doesn't have missing set"
                 << dendl;
      continue;
    }
    if (pm->second.num_missing()) {
      psdout(10) << __func__ << " osd." << peer << " has "
                 << pm->second.num_missing() << " missing" << dendl;
      return true;
    }
  }

  psdout(10) << __func__ << " is recovered" << dendl;
  return false;
}

bool PeeringState::needs_backfill() const
{
  ceph_assert(is_primary());

  // We can assume that the only OSDs that might need backfill are those
  // in backfill_targets.
  set<pg_shard_t>::const_iterator end = backfill_targets.end();
  set<pg_shard_t>::const_iterator a = backfill_targets.begin();
  for (; a != end; ++a) {
    pg_shard_t peer = *a;
    map<pg_shard_t, pg_info_t>::const_iterator pi = peer_info.find(peer);
    if (!pi->second.last_backfill.is_max()) {
      psdout(10) << __func__ << " osd." << peer
                 << " has last_backfill " << pi->second.last_backfill << dendl;
      return true;
    }
  }

  psdout(10) << __func__ << " does not need backfill" << dendl;
  return false;
}

/*
 * Returns true unless there is a non-lost OSD in might_have_unfound.
 */
bool PeeringState::all_unfound_are_queried_or_lost(
  const OSDMapRef osdmap) const
{
  ceph_assert(is_primary());

  set<pg_shard_t>::const_iterator peer = might_have_unfound.begin();
  set<pg_shard_t>::const_iterator mend = might_have_unfound.end();
  for (; peer != mend; ++peer) {
    if (peer_missing.count(*peer))
      continue;
    map<pg_shard_t, pg_info_t>::const_iterator iter = peer_info.find(*peer);
    if (iter != peer_info.end() &&
        (iter->second.is_empty() || iter->second.dne()))
      continue;
    if (!osdmap->exists(peer->osd))
      continue;
    const osd_info_t &osd_info(osdmap->get_info(peer->osd));
    if (osd_info.lost_at <= osd_info.up_from) {
      // If there is even one OSD in might_have_unfound that isn't lost, we
      // still might retrieve our unfound.
      return false;
    }
  }
  psdout(10) << "all_unfound_are_queried_or_lost all of might_have_unfound "
             << might_have_unfound
             << " have been queried or are marked lost" << dendl;
  return true;
}


void PeeringState::reject_reservation()
{
  pl->unreserve_recovery_space();
  pl->send_cluster_message(
    primary.osd,
    new MBackfillReserve(
      MBackfillReserve::REJECT_TOOFULL,
      spg_t(info.pgid.pgid, primary.shard),
      get_osdmap_epoch()),
    get_osdmap_epoch());
}

/**
 * find_best_info
 *
 * Returns an iterator to the best info in infos sorted by:
 *  1) Prefer newer last_update
 *  2) Prefer longer tail if it brings another info into contiguity
 *  3) Prefer current primary
 */
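// Note: for pools that require rollback (i.e. erasure coded pools),
// "newer last_update" is inverted below: the oldest last_update is
// preferred, because divergent entries past it can be rolled back locally
// rather than recovered.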
map<pg_shard_t, pg_info_t>::const_iterator PeeringState::find_best_info(
  const map<pg_shard_t, pg_info_t> &infos,
  bool restrict_to_up_acting,
  bool *history_les_bound) const
{
  ceph_assert(history_les_bound);
  /* See doc/dev/osd_internals/last_epoch_started.rst before attempting
   * to make changes to this process.  Also, make sure to update it
   * when you find bugs! */
  eversion_t min_last_update_acceptable = eversion_t::max();
  epoch_t max_last_epoch_started_found = 0;
  for (map<pg_shard_t, pg_info_t>::const_iterator i = infos.begin();
       i != infos.end();
       ++i) {
    if (!cct->_conf->osd_find_best_info_ignore_history_les &&
        max_last_epoch_started_found < i->second.history.last_epoch_started) {
      *history_les_bound = true;
      max_last_epoch_started_found = i->second.history.last_epoch_started;
    }
    if (!i->second.is_incomplete() &&
        max_last_epoch_started_found < i->second.last_epoch_started) {
      *history_les_bound = false;
      max_last_epoch_started_found = i->second.last_epoch_started;
    }
  }
  for (map<pg_shard_t, pg_info_t>::const_iterator i = infos.begin();
       i != infos.end();
       ++i) {
    if (max_last_epoch_started_found <= i->second.last_epoch_started) {
      if (min_last_update_acceptable > i->second.last_update)
        min_last_update_acceptable = i->second.last_update;
    }
  }
  if (min_last_update_acceptable == eversion_t::max())
    return infos.end();

  map<pg_shard_t, pg_info_t>::const_iterator best = infos.end();
  // find osd with newest last_update (oldest for ec_pool).
  // if there are multiples, prefer
  //  - a longer tail, if it brings another peer into log contiguity
  //  - the current primary
  for (map<pg_shard_t, pg_info_t>::const_iterator p = infos.begin();
       p != infos.end();
       ++p) {
    if (restrict_to_up_acting && !is_up(p->first) &&
        !is_acting(p->first))
      continue;
    // Only consider peers with last_update >= min_last_update_acceptable
    if (p->second.last_update < min_last_update_acceptable)
      continue;
    // Disqualify anyone with a too old last_epoch_started
    if (p->second.last_epoch_started < max_last_epoch_started_found)
      continue;
    // Disqualify anyone who is incomplete (not fully backfilled)
    if (p->second.is_incomplete())
      continue;
    if (best == infos.end()) {
      best = p;
      continue;
    }
    // Prefer newer last_update
    if (pool.info.require_rollback()) {
      if (p->second.last_update > best->second.last_update)
        continue;
      if (p->second.last_update < best->second.last_update) {
        best = p;
        continue;
      }
    } else {
      if (p->second.last_update < best->second.last_update)
        continue;
      if (p->second.last_update > best->second.last_update) {
        best = p;
        continue;
      }
    }

    // Prefer longer tail
    if (p->second.log_tail > best->second.log_tail) {
      continue;
    } else if (p->second.log_tail < best->second.log_tail) {
      best = p;
      continue;
    }

    if (!p->second.has_missing() && best->second.has_missing()) {
      psdout(10) << __func__ << " prefer osd." << p->first
                 << " because it is complete while best has missing"
                 << dendl;
      best = p;
      continue;
    } else if (p->second.has_missing() && !best->second.has_missing()) {
      psdout(10) << __func__ << " skipping osd." << p->first
                 << " because it has missing while best is complete"
                 << dendl;
      continue;
    } else {
      // both are complete or have missing
      // fall through
    }

    // prefer current primary (usually the caller), all things being equal
    if (p->first == pg_whoami) {
      psdout(10) << "calc_acting prefer osd." << p->first
                 << " because it is current primary" << dendl;
      best = p;
      continue;
    }
  }
  return best;
}

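// For EC pools the acting set is positional: slot i must be filled by a
// shard with shard_id i. Candidates are therefore considered per slot,
// preferring up[i], then acting[i], then any complete stray whose log
// overlaps the authoritative log.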
1562
1563 void PeeringState::calc_ec_acting(
1564 map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
1565 unsigned size,
1566 const vector<int> &acting,
1567 const vector<int> &up,
1568 const map<pg_shard_t, pg_info_t> &all_info,
1569 bool restrict_to_up_acting,
1570 vector<int> *_want,
1571 set<pg_shard_t> *backfill,
1572 set<pg_shard_t> *acting_backfill,
1573 ostream &ss)
1574 {
1575 vector<int> want(size, CRUSH_ITEM_NONE);
1576 map<shard_id_t, set<pg_shard_t> > all_info_by_shard;
1577 for (map<pg_shard_t, pg_info_t>::const_iterator i = all_info.begin();
1578 i != all_info.end();
1579 ++i) {
1580 all_info_by_shard[i->first.shard].insert(i->first);
1581 }
1582 for (uint8_t i = 0; i < want.size(); ++i) {
1583 ss << "For position " << (unsigned)i << ": ";
1584 if (up.size() > (unsigned)i && up[i] != CRUSH_ITEM_NONE &&
1585 !all_info.find(pg_shard_t(up[i], shard_id_t(i)))->second.is_incomplete() &&
1586 all_info.find(pg_shard_t(up[i], shard_id_t(i)))->second.last_update >=
1587 auth_log_shard->second.log_tail) {
1588 ss << " selecting up[i]: " << pg_shard_t(up[i], shard_id_t(i)) << std::endl;
1589 want[i] = up[i];
1590 continue;
1591 }
1592 if (up.size() > (unsigned)i && up[i] != CRUSH_ITEM_NONE) {
1593 ss << " backfilling up[i]: " << pg_shard_t(up[i], shard_id_t(i))
1594 << " and ";
1595 backfill->insert(pg_shard_t(up[i], shard_id_t(i)));
1596 }
1597
1598 if (acting.size() > (unsigned)i && acting[i] != CRUSH_ITEM_NONE &&
1599 !all_info.find(pg_shard_t(acting[i], shard_id_t(i)))->second.is_incomplete() &&
1600 all_info.find(pg_shard_t(acting[i], shard_id_t(i)))->second.last_update >=
1601 auth_log_shard->second.log_tail) {
1602 ss << " selecting acting[i]: " << pg_shard_t(acting[i], shard_id_t(i)) << std::endl;
1603 want[i] = acting[i];
1604 } else if (!restrict_to_up_acting) {
1605 for (set<pg_shard_t>::iterator j = all_info_by_shard[shard_id_t(i)].begin();
1606 j != all_info_by_shard[shard_id_t(i)].end();
1607 ++j) {
1608 ceph_assert(j->shard == i);
1609 if (!all_info.find(*j)->second.is_incomplete() &&
1610 all_info.find(*j)->second.last_update >=
1611 auth_log_shard->second.log_tail) {
1612 ss << " selecting stray: " << *j << std::endl;
1613 want[i] = j->osd;
1614 break;
1615 }
1616 }
1617 if (want[i] == CRUSH_ITEM_NONE)
1618 ss << " failed to fill position " << (int)i << std::endl;
1619 }
1620 }
1621
1622 for (uint8_t i = 0; i < want.size(); ++i) {
1623 if (want[i] != CRUSH_ITEM_NONE) {
1624 acting_backfill->insert(pg_shard_t(want[i], shard_id_t(i)));
1625 }
1626 }
1627 acting_backfill->insert(backfill->begin(), backfill->end());
1628 _want->swap(want);
1629 }
1630
/**
 * calculate the desired acting set.
 *
 * Choose an appropriate acting set.  Prefer up[0], unless it is
 * incomplete, or another osd has a longer tail that allows us to
 * bring other up nodes up to date.
 */
void PeeringState::calc_replicated_acting(
  map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
  uint64_t force_auth_primary_missing_objects,
  unsigned size,
  const vector<int> &acting,
  const vector<int> &up,
  pg_shard_t up_primary,
  const map<pg_shard_t, pg_info_t> &all_info,
  bool restrict_to_up_acting,
  vector<int> *want,
  set<pg_shard_t> *backfill,
  set<pg_shard_t> *acting_backfill,
  const OSDMapRef osdmap,
  ostream &ss)
{
  pg_shard_t auth_log_shard_id = auth_log_shard->first;

  ss << __func__ << " newest update on osd." << auth_log_shard_id
     << " with " << auth_log_shard->second
     << (restrict_to_up_acting ? " restrict_to_up_acting" : "") << std::endl;

  // select primary
  auto primary = all_info.find(up_primary);
  if (up.size() &&
      !primary->second.is_incomplete() &&
      primary->second.last_update >=
        auth_log_shard->second.log_tail) {
    if (HAVE_FEATURE(osdmap->get_up_osd_features(), SERVER_NAUTILUS)) {
      auto approx_missing_objects =
        primary->second.stats.stats.sum.num_objects_missing;
      auto auth_version = auth_log_shard->second.last_update.version;
      auto primary_version = primary->second.last_update.version;
      if (auth_version > primary_version) {
        approx_missing_objects += auth_version - primary_version;
      } else {
        approx_missing_objects += primary_version - auth_version;
      }
      if ((uint64_t)approx_missing_objects >
          force_auth_primary_missing_objects) {
        primary = auth_log_shard;
        ss << "up_primary " << up_primary << " has approximately "
           << approx_missing_objects
           << " (>" << force_auth_primary_missing_objects << ") "
           << "missing objects, osd." << auth_log_shard_id
           << " selected as primary instead"
           << std::endl;
      } else {
        ss << "up_primary " << up_primary << " selected as primary"
           << std::endl;
      }
    } else {
      ss << "up_primary " << up_primary << " selected as primary" << std::endl;
    }
  } else {
    ceph_assert(!auth_log_shard->second.is_incomplete());
    ss << "up[0] needs backfill, osd." << auth_log_shard_id
       << " selected as primary instead" << std::endl;
    primary = auth_log_shard;
  }

  ss << __func__ << " primary is osd." << primary->first
     << " with " << primary->second << std::endl;
  want->push_back(primary->first.osd);
  acting_backfill->insert(primary->first);

  /* We include auth_log_shard->second.log_tail because in GetLog,
   * we will request logs back to the min last_update over our
   * acting_backfill set, which will result in our log being extended
   * as far backwards as necessary to pick up any peers which can
   * be log recovered by auth_log_shard's log */
  eversion_t oldest_auth_log_entry =
    std::min(primary->second.log_tail, auth_log_shard->second.log_tail);

  // select replicas that have log contiguity with primary.
  // prefer up, then acting, then any peer_info osds
  for (auto i : up) {
    pg_shard_t up_cand = pg_shard_t(i, shard_id_t::NO_SHARD);
    if (up_cand == primary->first)
      continue;
    const pg_info_t &cur_info = all_info.find(up_cand)->second;
    if (cur_info.is_incomplete() ||
        cur_info.last_update < oldest_auth_log_entry) {
      ss << " shard " << up_cand << " (up) backfill " << cur_info << std::endl;
      backfill->insert(up_cand);
      acting_backfill->insert(up_cand);
    } else {
      want->push_back(i);
      acting_backfill->insert(up_cand);
      ss << " osd." << i << " (up) accepted " << cur_info << std::endl;
    }
  }

  if (want->size() >= size) {
    return;
  }

  std::vector<std::pair<eversion_t, int>> candidate_by_last_update;
  candidate_by_last_update.reserve(acting.size());
  // This no longer has backfill OSDs, but they are covered above.
  for (auto i : acting) {
    pg_shard_t acting_cand(i, shard_id_t::NO_SHARD);
    // skip up osds we already considered above
    if (acting_cand == primary->first)
      continue;
    vector<int>::const_iterator up_it = find(up.begin(), up.end(), i);
    if (up_it != up.end())
      continue;

    const pg_info_t &cur_info = all_info.find(acting_cand)->second;
    if (cur_info.is_incomplete() ||
        cur_info.last_update < oldest_auth_log_entry) {
      ss << " shard " << acting_cand << " (acting) REJECTED "
         << cur_info << std::endl;
    } else {
      candidate_by_last_update.emplace_back(cur_info.last_update, i);
    }
  }

  auto sort_by_eversion = [](const std::pair<eversion_t, int> &lhs,
                             const std::pair<eversion_t, int> &rhs) {
    return lhs.first > rhs.first;
  };
  // sort by last_update, in descending order.
  std::sort(candidate_by_last_update.begin(),
            candidate_by_last_update.end(), sort_by_eversion);
  for (auto &p: candidate_by_last_update) {
    ceph_assert(want->size() < size);
    want->push_back(p.second);
    pg_shard_t s = pg_shard_t(p.second, shard_id_t::NO_SHARD);
    acting_backfill->insert(s);
    ss << " shard " << s << " (acting) accepted "
       << all_info.find(s)->second << std::endl;
    if (want->size() >= size) {
      return;
    }
  }

  if (restrict_to_up_acting) {
    return;
  }
  candidate_by_last_update.clear();
  candidate_by_last_update.reserve(all_info.size()); // overestimate but fine
  // continue to search stray to find more suitable peers
  for (auto &i : all_info) {
    // skip up osds we already considered above
    if (i.first == primary->first)
      continue;
    vector<int>::const_iterator up_it = find(up.begin(), up.end(), i.first.osd);
    if (up_it != up.end())
      continue;
    vector<int>::const_iterator acting_it = find(
      acting.begin(), acting.end(), i.first.osd);
    if (acting_it != acting.end())
      continue;

    if (i.second.is_incomplete() ||
        i.second.last_update < oldest_auth_log_entry) {
      ss << " shard " << i.first << " (stray) REJECTED " << i.second
         << std::endl;
    } else {
      candidate_by_last_update.emplace_back(
        i.second.last_update, i.first.osd);
    }
  }

  if (candidate_by_last_update.empty()) {
    // save us some effort
    return;
  }

  // sort by last_update, in descending order.
  std::sort(candidate_by_last_update.begin(),
            candidate_by_last_update.end(), sort_by_eversion);

  for (auto &p: candidate_by_last_update) {
    ceph_assert(want->size() < size);
    want->push_back(p.second);
    pg_shard_t s = pg_shard_t(p.second, shard_id_t::NO_SHARD);
    acting_backfill->insert(s);
    ss << " shard " << s << " (stray) accepted "
       << all_info.find(s)->second << std::endl;
    if (want->size() >= size) {
      return;
    }
  }
}

1824
1825 bool PeeringState::recoverable(const vector<int> &want) const
1826 {
1827 unsigned num_want_acting = 0;
1828 set<pg_shard_t> have;
1829 for (int i = 0; i < (int)want.size(); ++i) {
1830 if (want[i] != CRUSH_ITEM_NONE) {
1831 ++num_want_acting;
1832 have.insert(
1833 pg_shard_t(
1834 want[i],
1835 pool.info.is_erasure() ? shard_id_t(i) : shard_id_t::NO_SHARD));
1836 }
1837 }
1838
1839 if (num_want_acting < pool.info.min_size) {
1840 const bool recovery_ec_pool_below_min_size =
1841 HAVE_FEATURE(get_osdmap()->get_up_osd_features(), SERVER_OCTOPUS);
1842
1843 if (pool.info.is_erasure() && !recovery_ec_pool_below_min_size) {
1844 psdout(10) << __func__ << " failed, ec recovery below min size not supported by pre-octopus OSDs" << dendl;
1845 return false;
1846 } else if (!cct->_conf.get_val<bool>("osd_allow_recovery_below_min_size")) {
1847 psdout(10) << __func__ << " failed, recovery below min size not enabled" << dendl;
1848 return false;
1849 }
1850 }
1851 if (missing_loc.get_recoverable_predicate()(have)) {
1852 return true;
1853 } else {
1854 psdout(10) << __func__ << " failed, not recoverable " << dendl;
1855 return false;
1856 }
1857 }
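// A hedged sketch of the gate above, with made-up numbers: for a
// replicated pool with size=3 and min_size=2, want=[3, CRUSH_ITEM_NONE, 5]
// gives num_want_acting=2 >= min_size, so the answer reduces to whether
// the recoverable predicate accepts have={osd.3, osd.5}. With only one
// live entry in want we would additionally need
// osd_allow_recovery_below_min_size to be true (and, for an EC pool, an
// all-octopus cluster) before the predicate is even consulted.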
1858
1859 void PeeringState::choose_async_recovery_ec(
1860 const map<pg_shard_t, pg_info_t> &all_info,
1861 const pg_info_t &auth_info,
1862 vector<int> *want,
1863 set<pg_shard_t> *async_recovery,
1864 const OSDMapRef osdmap) const
1865 {
1866 set<pair<int, pg_shard_t> > candidates_by_cost;
1867 for (uint8_t i = 0; i < want->size(); ++i) {
1868 if ((*want)[i] == CRUSH_ITEM_NONE)
1869 continue;
1870
1871 // Considering log entries to recover is accurate enough for
1872 // now. We could use minimum_to_decode_with_cost() later if
1873 // necessary.
1874 pg_shard_t shard_i((*want)[i], shard_id_t(i));
1875 // do not include strays
1876 if (stray_set.find(shard_i) != stray_set.end())
1877 continue;
1878 // Do not include an osd that is not up, since choosing it as
1879 // an async_recovery_target will move it out of the acting set.
1880 // This results in it being identified as a stray during peering,
1881 // because it is no longer in the up or acting set.
1882 if (!is_up(shard_i))
1883 continue;
1884 auto shard_info = all_info.find(shard_i)->second;
1885 // for ec pools we rollback all entries past the authoritative
1886 // last_update *before* activation. This is relatively inexpensive
1887 // compared to recovery, since it is purely local, so treat shards
1888 // past the authoritative last_update the same as those equal to it.
1889 version_t auth_version = auth_info.last_update.version;
1890 version_t candidate_version = shard_info.last_update.version;
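// Worked example (hypothetical values): auth at 200'150 and this shard
// at 200'140 gives a version delta of 10; if the shard also reports
// num_objects_missing=95, the approximate cost is 105, which exceeds an
// osd_async_recovery_min_cost of 100 (its usual default), making this
// shard an async recovery candidate on a post-nautilus cluster.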
1891 if (HAVE_FEATURE(osdmap->get_up_osd_features(), SERVER_NAUTILUS)) {
1892 auto approx_missing_objects =
1893 shard_info.stats.stats.sum.num_objects_missing;
1894 if (auth_version > candidate_version) {
1895 approx_missing_objects += auth_version - candidate_version;
1896 }
1897 if (static_cast<uint64_t>(approx_missing_objects) >
1898 cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) {
1899 candidates_by_cost.emplace(approx_missing_objects, shard_i);
1900 }
1901 } else {
1902 if (auth_version > candidate_version &&
1903 (auth_version - candidate_version) > cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) {
1904 candidates_by_cost.insert(make_pair(auth_version - candidate_version, shard_i));
1905 }
1906 }
1907 }
1908
1909 psdout(20) << __func__ << " candidates by cost are: " << candidates_by_cost
1910 << dendl;
1911
1912 // take out as many osds as we can for async recovery, in order of cost
1913 for (auto rit = candidates_by_cost.rbegin();
1914 rit != candidates_by_cost.rend(); ++rit) {
1915 pg_shard_t cur_shard = rit->second;
1916 vector<int> candidate_want(*want);
1917 candidate_want[cur_shard.shard.id] = CRUSH_ITEM_NONE;
1918 if (recoverable(candidate_want)) {
1919 want->swap(candidate_want);
1920 async_recovery->insert(cur_shard);
1921 }
1922 }
1923 psdout(20) << __func__ << " result want=" << *want
1924 << " async_recovery=" << *async_recovery << dendl;
1925 }
1926
1927 void PeeringState::choose_async_recovery_replicated(
1928 const map<pg_shard_t, pg_info_t> &all_info,
1929 const pg_info_t &auth_info,
1930 vector<int> *want,
1931 set<pg_shard_t> *async_recovery,
1932 const OSDMapRef osdmap) const
1933 {
1934 set<pair<int, pg_shard_t> > candidates_by_cost;
1935 for (auto osd_num : *want) {
1936 pg_shard_t shard_i(osd_num, shard_id_t::NO_SHARD);
1937 // do not include strays
1938 if (stray_set.find(shard_i) != stray_set.end())
1939 continue;
1940 // Do not include an osd that is not up, since choosing it as
1941 // an async_recovery_target will move it out of the acting set.
1942 // This results in it being identified as a stray during peering,
1943 // because it is no longer in the up or acting set.
1944 if (!is_up(shard_i))
1945 continue;
1946 auto shard_info = all_info.find(shard_i)->second;
1947 // use the approximate magnitude of the difference in length of
1948 // logs plus historical missing objects as the cost of recovery
1949 version_t auth_version = auth_info.last_update.version;
1950 version_t candidate_version = shard_info.last_update.version;
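// Unlike the EC path above, replicated shards cannot cheaply roll back
// divergent entries, so the version delta counts in either direction
// here. Hypothetical example: auth at 200'100 and a shard at 200'130
// contributes 30 to the cost on top of any reported num_objects_missing.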
1951 if (HAVE_FEATURE(osdmap->get_up_osd_features(), SERVER_NAUTILUS)) {
1952 auto approx_missing_objects =
1953 shard_info.stats.stats.sum.num_objects_missing;
1954 if (auth_version > candidate_version) {
1955 approx_missing_objects += auth_version - candidate_version;
1956 } else {
1957 approx_missing_objects += candidate_version - auth_version;
1958 }
1959 if (static_cast<uint64_t>(approx_missing_objects) >
1960 cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) {
1961 candidates_by_cost.emplace(approx_missing_objects, shard_i);
1962 }
1963 } else {
1964 size_t approx_entries;
1965 if (auth_version > candidate_version) {
1966 approx_entries = auth_version - candidate_version;
1967 } else {
1968 approx_entries = candidate_version - auth_version;
1969 }
1970 if (approx_entries > cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) {
1971 candidates_by_cost.insert(make_pair(approx_entries, shard_i));
1972 }
1973 }
1974 }
1975
1976 psdout(20) << __func__ << " candidates by cost are: " << candidates_by_cost
1977 << dendl;
1978 // take out as many osds as we can for async recovery, in order of cost
1979 for (auto rit = candidates_by_cost.rbegin();
1980 rit != candidates_by_cost.rend(); ++rit) {
1981 if (want->size() <= pool.info.min_size) {
1982 break;
1983 }
1984 pg_shard_t cur_shard = rit->second;
1985 vector<int> candidate_want(*want);
1986 for (auto it = candidate_want.begin(); it != candidate_want.end(); ++it) {
1987 if (*it == cur_shard.osd) {
1988 candidate_want.erase(it);
1989 want->swap(candidate_want);
1990 async_recovery->insert(cur_shard);
1991 break;
1992 }
1993 }
1994 }
1995 psdout(20) << __func__ << " result want=" << *want
1996 << " async_recovery=" << *async_recovery << dendl;
1997 }
1998
1999
2000
2001 /**
2002 * choose acting
2003 *
2004 * calculate the desired acting, and request a change with the monitor
2005 * if it differs from the current acting.
2006 *
2007 * if restrict_to_up_acting=true, we filter out anything that's not in
2008 * up/acting. in order to lift this restriction, we need to
2009 * 1) check whether it's worth switching the acting set any time we get
2010 * a new pg info (not just here, when recovery finishes)
2011 * 2) check whether anything in want_acting went down on each new map
2012 * (and, if so, calculate a new want_acting)
2013 * 3) remove the assertion in PG::PeeringState::Active::react(const AdvMap)
2014 * TODO!
2015 */
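// A hedged sketch of the pg_temp round trip described above, with
// hypothetical sets: if up=acting=[0,1,2] but we compute want=[1,2,3],
// we record want_acting, call pl->queue_want_pg_temp([1,2,3]) and return
// false; once the monitor publishes a map with that pg_temp, acting
// becomes [1,2,3], a later call finds want == acting, and we proceed.
// Conversely, if want matches the raw up set, an empty pg_temp is queued
// to drop any existing temp mapping.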
2016 bool PeeringState::choose_acting(pg_shard_t &auth_log_shard_id,
2017 bool restrict_to_up_acting,
2018 bool *history_les_bound,
2019 bool request_pg_temp_change_only)
2020 {
2021 map<pg_shard_t, pg_info_t> all_info(peer_info.begin(), peer_info.end());
2022 all_info[pg_whoami] = info;
2023
2024 if (cct->_conf->subsys.should_gather<dout_subsys, 10>()) {
2025 for (map<pg_shard_t, pg_info_t>::iterator p = all_info.begin();
2026 p != all_info.end();
2027 ++p) {
2028 psdout(10) << __func__ << " all_info osd." << p->first << " "
2029 << p->second << dendl;
2030 }
2031 }
2032
2033 map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard =
2034 find_best_info(all_info, restrict_to_up_acting, history_les_bound);
2035
2036 if (auth_log_shard == all_info.end()) {
2037 if (up != acting) {
2038 psdout(10) << __func__ << " no suitable info found (incomplete backfills?),"
2039 << " reverting to up" << dendl;
2040 want_acting = up;
2041 vector<int> empty;
2042 pl->queue_want_pg_temp(empty);
2043 } else {
2044 psdout(10) << __func__ << " failed" << dendl;
2045 ceph_assert(want_acting.empty());
2046 }
2047 return false;
2048 }
2049
2050 ceph_assert(!auth_log_shard->second.is_incomplete());
2051 auth_log_shard_id = auth_log_shard->first;
2052
2053 set<pg_shard_t> want_backfill, want_acting_backfill;
2054 vector<int> want;
2055 stringstream ss;
2056 if (pool.info.is_replicated())
2057 calc_replicated_acting(
2058 auth_log_shard,
2059 cct->_conf.get_val<uint64_t>(
2060 "osd_force_auth_primary_missing_objects"),
2061 get_osdmap()->get_pg_size(info.pgid.pgid),
2062 acting,
2063 up,
2064 up_primary,
2065 all_info,
2066 restrict_to_up_acting,
2067 &want,
2068 &want_backfill,
2069 &want_acting_backfill,
2070 get_osdmap(),
2071 ss);
2072 else
2073 calc_ec_acting(
2074 auth_log_shard,
2075 get_osdmap()->get_pg_size(info.pgid.pgid),
2076 acting,
2077 up,
2078 all_info,
2079 restrict_to_up_acting,
2080 &want,
2081 &want_backfill,
2082 &want_acting_backfill,
2083 ss);
2084 psdout(10) << ss.str() << dendl;
2085
2086 if (!recoverable(want)) {
2087 want_acting.clear();
2088 return false;
2089 }
2090
2091 set<pg_shard_t> want_async_recovery;
2092 if (HAVE_FEATURE(get_osdmap()->get_up_osd_features(), SERVER_MIMIC)) {
2093 if (pool.info.is_erasure()) {
2094 choose_async_recovery_ec(
2095 all_info, auth_log_shard->second, &want, &want_async_recovery,
2096 get_osdmap());
2097 } else {
2098 choose_async_recovery_replicated(
2099 all_info, auth_log_shard->second, &want, &want_async_recovery,
2100 get_osdmap());
2101 }
2102 }
2103 while (want.size() > pool.info.size) {
2104 // async recovery should have taken out as many osds as it can.
2105 // if not, then always evict the last peer
2106 // (will get synchronously recovered later)
2107 psdout(10) << __func__ << " evicting osd." << want.back()
2108 << " from oversized want " << want << dendl;
2109 want.pop_back();
2110 }
2111 if (want != acting) {
2112 psdout(10) << __func__ << " want " << want << " != acting " << acting
2113 << ", requesting pg_temp change" << dendl;
2114 want_acting = want;
2115
2116 if (!cct->_conf->osd_debug_no_acting_change) {
2117 if (want_acting == up) {
2118 // There can't be any pending backfill if
2119 // want is the same as crush map up OSDs.
2120 ceph_assert(want_backfill.empty());
2121 vector<int> empty;
2122 pl->queue_want_pg_temp(empty);
2123 } else
2124 pl->queue_want_pg_temp(want);
2125 }
2126 return false;
2127 }
2128 if (request_pg_temp_change_only)
2129 return true;
2130 want_acting.clear();
2131 acting_recovery_backfill = want_acting_backfill;
2132 psdout(10) << "acting_recovery_backfill is "
2133 << acting_recovery_backfill << dendl;
2134 ceph_assert(
2135 backfill_targets.empty() ||
2136 backfill_targets == want_backfill);
2137 if (backfill_targets.empty()) {
2138 // Caller is GetInfo
2139 backfill_targets = want_backfill;
2140 }
2141 // Adding !needs_recovery() to let the async_recovery_targets reset after recovery is complete
2142 ceph_assert(
2143 async_recovery_targets.empty() ||
2144 async_recovery_targets == want_async_recovery ||
2145 !needs_recovery());
2146 if (async_recovery_targets.empty() || !needs_recovery()) {
2147 async_recovery_targets = want_async_recovery;
2148 }
2149 // Will not change if already set because up would have had to change
2150 // Verify that nothing in backfill is in stray_set
2151 for (set<pg_shard_t>::iterator i = want_backfill.begin();
2152 i != want_backfill.end();
2153 ++i) {
2154 ceph_assert(stray_set.find(*i) == stray_set.end());
2155 }
2156 psdout(10) << "choose_acting want=" << want << " backfill_targets="
2157 << want_backfill << " async_recovery_targets="
2158 << async_recovery_targets << dendl;
2159 return true;
2160 }
2161
2162 void PeeringState::log_weirdness()
2163 {
2164 if (pg_log.get_tail() != info.log_tail)
2165 pl->get_clog_error() << info.pgid
2166 << " info mismatch, log.tail " << pg_log.get_tail()
2167 << " != info.log_tail " << info.log_tail;
2168 if (pg_log.get_head() != info.last_update)
2169 pl->get_clog_error() << info.pgid
2170 << " info mismatch, log.head " << pg_log.get_head()
2171 << " != info.last_update " << info.last_update;
2172
2173 if (!pg_log.get_log().empty()) {
2174 // sloppy check
2175 if ((pg_log.get_log().log.begin()->version <= pg_log.get_tail()))
2176 pl->get_clog_error() << info.pgid
2177 << " log bound mismatch, info (tail,head] ("
2178 << pg_log.get_tail() << ","
2179 << pg_log.get_head() << "]"
2180 << " actual ["
2181 << pg_log.get_log().log.begin()->version << ","
2182 << pg_log.get_log().log.rbegin()->version << "]";
2183 }
2184
2185 if (pg_log.get_log().caller_ops.size() > pg_log.get_log().log.size()) {
2186 pl->get_clog_error() << info.pgid
2187 << " caller_ops.size "
2188 << pg_log.get_log().caller_ops.size()
2189 << " > log size " << pg_log.get_log().log.size();
2190 }
2191 }
2192
2193 /*
2194 * Process information from a replica to determine if it could have any
2195 * objects that I need.
2196 *
2197 * TODO: if the missing set becomes very large, this could get expensive.
2198 * Instead, we probably want to just iterate over our unfound set.
2199 */
2200 bool PeeringState::search_for_missing(
2201 const pg_info_t &oinfo, const pg_missing_t &omissing,
2202 pg_shard_t from,
2203 PeeringCtxWrapper &ctx)
2204 {
2205 uint64_t num_unfound_before = missing_loc.num_unfound();
2206 bool found_missing = missing_loc.add_source_info(
2207 from, oinfo, omissing, ctx.handle);
2208 if (found_missing && num_unfound_before != missing_loc.num_unfound())
2209 pl->publish_stats_to_osd();
2210 // avoid doing this if the peer is empty. This is a bit of paranoia
2211 // to avoid doing something rash if add_source_info() above
2212 // incorrectly decided we found something new. (if the peer has
2213 // last_update=0'0 that's impossible.)
2214 if (found_missing &&
2215 oinfo.last_update != eversion_t()) {
2216 pg_info_t tinfo(oinfo);
2217 tinfo.pgid.shard = pg_whoami.shard;
2218 ctx.send_info(
2219 from.osd,
2220 spg_t(info.pgid.pgid, from.shard),
2221 get_osdmap_epoch(), // fixme: use lower epoch?
2222 get_osdmap_epoch(),
2223 tinfo);
2224 }
2225 return found_missing;
2226 }
2227
2228 bool PeeringState::discover_all_missing(
2229 BufferedRecoveryMessages &rctx)
2230 {
2231 auto &missing = pg_log.get_missing();
2232 uint64_t unfound = get_num_unfound();
2233 bool any = false; // did we start any queries
2234
2235 psdout(10) << __func__ << " "
2236 << missing.num_missing() << " missing, "
2237 << unfound << " unfound"
2238 << dendl;
2239
2240 std::set<pg_shard_t>::const_iterator m = might_have_unfound.begin();
2241 std::set<pg_shard_t>::const_iterator mend = might_have_unfound.end();
2242 for (; m != mend; ++m) {
2243 pg_shard_t peer(*m);
2244
2245 if (!get_osdmap()->is_up(peer.osd)) {
2246 psdout(20) << __func__ << " skipping down osd." << peer << dendl;
2247 continue;
2248 }
2249
2250 if (peer_purged.count(peer)) {
2251 psdout(20) << __func__ << " skipping purged osd." << peer << dendl;
2252 continue;
2253 }
2254
2255 map<pg_shard_t, pg_info_t>::const_iterator iter = peer_info.find(peer);
2256 if (iter != peer_info.end() &&
2257 (iter->second.is_empty() || iter->second.dne())) {
2258 // ignore empty peers
2259 continue;
2260 }
2261
2262 // If we've requested any of this stuff, the pg_missing_t information
2263 // should be on its way.
2264 // TODO: coalesce requested_* into a single data structure
2265 if (peer_missing.find(peer) != peer_missing.end()) {
2266 psdout(20) << __func__ << ": osd." << peer
2267 << ": we already have pg_missing_t" << dendl;
2268 continue;
2269 }
2270 if (peer_log_requested.find(peer) != peer_log_requested.end()) {
2271 psdout(20) << __func__ << ": osd." << peer
2272 << ": in peer_log_requested" << dendl;
2273 continue;
2274 }
2275 if (peer_missing_requested.find(peer) != peer_missing_requested.end()) {
2276 psdout(20) << __func__ << ": osd." << peer
2277 << ": in peer_missing_requested" << dendl;
2278 continue;
2279 }
2280
2281 // Request missing
2282 psdout(10) << __func__ << ": osd." << peer << ": requesting pg_missing_t"
2283 << dendl;
2284 peer_missing_requested.insert(peer);
2285 rctx.send_query(
2286 peer.osd,
2287 spg_t(info.pgid.pgid, peer.shard),
2288 pg_query_t(
2289 pg_query_t::FULLLOG,
2290 peer.shard, pg_whoami.shard,
2291 info.history, get_osdmap_epoch()));
2292 any = true;
2293 }
2294 return any;
2295 }
2296
2297 /* Build the might_have_unfound set.
2298 *
2299 * This is used by the primary OSD during recovery.
2300 *
2301 * This set tracks the OSDs which might have unfound objects that the primary
2302 * OSD needs. As we receive pg_missing_t from each OSD in might_have_unfound, we
2303 * will remove the OSD from the set.
2304 */
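// Hypothetical example: if the PG mapped to [0,1,2] in a past interval
// and maps to [0,1,3] now, past_intervals contributes osd.2 (it may
// hold objects written back then), and any stray peers that have sent
// us their info are added below as well.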
2305 void PeeringState::build_might_have_unfound()
2306 {
2307 ceph_assert(might_have_unfound.empty());
2308 ceph_assert(is_primary());
2309
2310 psdout(10) << __func__ << dendl;
2311
2312 check_past_interval_bounds();
2313
2314 might_have_unfound = past_intervals.get_might_have_unfound(
2315 pg_whoami,
2316 pool.info.is_erasure());
2317
2318 // include any (stray) peers
2319 for (map<pg_shard_t, pg_info_t>::iterator p = peer_info.begin();
2320 p != peer_info.end();
2321 ++p)
2322 might_have_unfound.insert(p->first);
2323
2324 psdout(15) << __func__ << ": built " << might_have_unfound << dendl;
2325 }
2326
2327 void PeeringState::activate(
2328 ObjectStore::Transaction& t,
2329 epoch_t activation_epoch,
2330 PeeringCtxWrapper &ctx)
2331 {
2332 ceph_assert(!is_peered());
2333
2334 // twiddle pg state
2335 state_clear(PG_STATE_DOWN);
2336
2337 send_notify = false;
2338
2339 if (is_primary()) {
2340 // only update primary last_epoch_started if we will go active
2341 if (actingset.size() >= pool.info.min_size) {
2342 ceph_assert(cct->_conf->osd_find_best_info_ignore_history_les ||
2343 info.last_epoch_started <= activation_epoch);
2344 info.last_epoch_started = activation_epoch;
2345 info.last_interval_started = info.history.same_interval_since;
2346 }
2347 } else if (is_acting(pg_whoami)) {
2348 /* update last_epoch_started on acting replica to whatever the primary sent
2349 * unless it's smaller (could happen if we are going peered rather than
2350 * active, see doc/dev/osd_internals/last_epoch_started.rst) */
2351 if (info.last_epoch_started < activation_epoch) {
2352 info.last_epoch_started = activation_epoch;
2353 info.last_interval_started = info.history.same_interval_since;
2354 }
2355 }
2356
2357 auto &missing = pg_log.get_missing();
2358
2359 min_last_complete_ondisk = eversion_t(0,0); // we don't know (yet)!
2360 if (is_primary()) {
2361 last_update_ondisk = info.last_update;
2362 }
2363 last_update_applied = info.last_update;
2364 last_rollback_info_trimmed_to_applied = pg_log.get_can_rollback_to();
2365
2366 need_up_thru = false;
2367
2368 // write pg info, log
2369 dirty_info = true;
2370 dirty_big_info = true; // maybe
2371
2372 pl->schedule_event_on_commit(
2373 t,
2374 std::make_shared<PGPeeringEvent>(
2375 get_osdmap_epoch(),
2376 get_osdmap_epoch(),
2377 ActivateCommitted(
2378 get_osdmap_epoch(),
2379 activation_epoch)));
2380
2381 // init complete pointer
2382 if (missing.num_missing() == 0) {
2383 psdout(10) << "activate - no missing, moving last_complete " << info.last_complete
2384 << " -> " << info.last_update << dendl;
2385 info.last_complete = info.last_update;
2386 info.stats.stats.sum.num_objects_missing = 0;
2387 pg_log.reset_recovery_pointers();
2388 } else {
2389 psdout(10) << "activate - not complete, " << missing << dendl;
2390 info.stats.stats.sum.num_objects_missing = missing.num_missing();
2391 pg_log.activate_not_complete(info);
2392 }
2393
2394 log_weirdness();
2395
2396 if (is_primary()) {
2397 // initialize snap_trimq
2398 interval_set<snapid_t> to_trim;
2399 auto& removed_snaps_queue = get_osdmap()->get_removed_snaps_queue();
2400 auto p = removed_snaps_queue.find(info.pgid.pgid.pool());
2401 if (p != removed_snaps_queue.end()) {
2402 dout(20) << "activate - purged_snaps " << info.purged_snaps
2403 << " removed_snaps " << p->second
2404 << dendl;
2405 for (auto q : p->second) {
2406 to_trim.insert(q.first, q.second);
2407 }
2408 }
2409 interval_set<snapid_t> purged;
2410 purged.intersection_of(to_trim, info.purged_snaps);
2411 to_trim.subtract(purged);
2412
2413 if (HAVE_FEATURE(upacting_features, SERVER_OCTOPUS)) {
2414 renew_lease(pl->get_mnow());
2415 // do not schedule until we are actually activated
2416 }
2417
2418 // adjust purged_snaps: PG may have been inactive while snaps were pruned
2419 // from the removed_snaps_queue in the osdmap. update local purged_snaps to
2420 // reflect only those snaps that we thought were pruned and were still in
2421 // the queue.
2422 info.purged_snaps.swap(purged);
2423
2424 // start up replicas
2425 info.history.refresh_prior_readable_until_ub(pl->get_mnow(),
2426 prior_readable_until_ub);
2427
2428 ceph_assert(!acting_recovery_backfill.empty());
2429 for (set<pg_shard_t>::iterator i = acting_recovery_backfill.begin();
2430 i != acting_recovery_backfill.end();
2431 ++i) {
2432 if (*i == pg_whoami) continue;
2433 pg_shard_t peer = *i;
2434 ceph_assert(peer_info.count(peer));
2435 pg_info_t& pi = peer_info[peer];
2436
2437 psdout(10) << "activate peer osd." << peer << " " << pi << dendl;
2438
2439 MOSDPGLog *m = 0;
2440 ceph_assert(peer_missing.count(peer));
2441 pg_missing_t& pm = peer_missing[peer];
2442
2443 bool needs_past_intervals = pi.dne();
2444
2445 if (pi.last_update == info.last_update) {
2446 // empty log
2447 if (!pi.last_backfill.is_max())
2448 pl->get_clog_info() << info.pgid << " continuing backfill to osd."
2449 << peer
2450 << " from (" << pi.log_tail << "," << pi.last_update
2451 << "] " << pi.last_backfill
2452 << " to " << info.last_update;
2453 if (!pi.is_empty()) {
2454 psdout(10) << "activate peer osd." << peer
2455 << " is up to date, queueing in pending_activators" << dendl;
2456 ctx.send_info(
2457 peer.osd,
2458 spg_t(info.pgid.pgid, peer.shard),
2459 get_osdmap_epoch(), // fixme: use lower epoch?
2460 get_osdmap_epoch(),
2461 info,
2462 get_lease());
2463 } else {
2464 psdout(10) << "activate peer osd." << peer
2465 << " is up to date, but sending pg_log anyway" << dendl;
2466 m = new MOSDPGLog(
2467 i->shard, pg_whoami.shard,
2468 get_osdmap_epoch(), info,
2469 last_peering_reset);
2470 }
2471 } else if (
2472 pg_log.get_tail() > pi.last_update ||
2473 pi.last_backfill == hobject_t() ||
2474 (backfill_targets.count(*i) && pi.last_backfill.is_max())) {
2475 /* ^ This last case covers a situation where a replica is not contiguous
2476 * with the auth_log, but is contiguous with this replica. Reshuffling
2477 * the active set to handle this would be tricky, so instead we just go
2478 * ahead and backfill it anyway. This is probably preferable in any
2479 * case since the replica in question would have to be significantly
2480 * behind.
2481 */
2482 // backfill
2483 pl->get_clog_debug() << info.pgid << " starting backfill to osd." << peer
2484 << " from (" << pi.log_tail << "," << pi.last_update
2485 << "] " << pi.last_backfill
2486 << " to " << info.last_update;
2487
2488 pi.last_update = info.last_update;
2489 pi.last_complete = info.last_update;
2490 pi.set_last_backfill(hobject_t());
2491 pi.last_epoch_started = info.last_epoch_started;
2492 pi.last_interval_started = info.last_interval_started;
2493 pi.history = info.history;
2494 pi.hit_set = info.hit_set;
2495 // Save num_bytes for reservation request, can't be negative
2496 peer_bytes[peer] = std::max<int64_t>(0, pi.stats.stats.sum.num_bytes);
2497 pi.stats.stats.clear();
2498 pi.stats.stats.sum.num_bytes = peer_bytes[peer];
2499
2500 // initialize peer with our purged_snaps.
2501 pi.purged_snaps = info.purged_snaps;
2502
2503 m = new MOSDPGLog(
2504 i->shard, pg_whoami.shard,
2505 get_osdmap_epoch(), pi,
2506 last_peering_reset /* epoch to create pg at */);
2507
2508 // send some recent log, so that op dup detection works well.
2509 m->log.copy_up_to(cct, pg_log.get_log(),
2510 cct->_conf->osd_max_pg_log_entries);
2511 m->info.log_tail = m->log.tail;
2512 pi.log_tail = m->log.tail; // sigh...
2513
2514 pm.clear();
2515 } else {
2516 // catch up
2517 ceph_assert(pg_log.get_tail() <= pi.last_update);
2518 m = new MOSDPGLog(
2519 i->shard, pg_whoami.shard,
2520 get_osdmap_epoch(), info,
2521 last_peering_reset /* epoch to create pg at */);
2522 // send new stuff to append to replicas log
2523 m->log.copy_after(cct, pg_log.get_log(), pi.last_update);
2524 }
2525
2526 // share past_intervals if we are creating the pg on the replica
2527 // based on whether our info for that peer was dne() *before*
2528 // updating pi.history in the backfill block above.
2529 if (m && needs_past_intervals)
2530 m->past_intervals = past_intervals;
2531
2532 // update local version of peer's missing list!
2533 if (m && pi.last_backfill != hobject_t()) {
2534 for (list<pg_log_entry_t>::iterator p = m->log.log.begin();
2535 p != m->log.log.end();
2536 ++p) {
2537 if (p->soid <= pi.last_backfill &&
2538 !p->is_error()) {
2539 if (perform_deletes_during_peering() && p->is_delete()) {
2540 pm.rm(p->soid, p->version);
2541 } else {
2542 pm.add_next_event(*p);
2543 }
2544 }
2545 }
2546 }
2547
2548 if (m) {
2549 dout(10) << "activate peer osd." << peer << " sending " << m->log
2550 << dendl;
2551 m->lease = get_lease();
2552 pl->send_cluster_message(peer.osd, m, get_osdmap_epoch());
2553 }
2554
2555 // peer now has our last_update
2556 pi.last_update = info.last_update;
2557
2558 // update our missing
2559 if (pm.num_missing() == 0) {
2560 pi.last_complete = pi.last_update;
2561 psdout(10) << "activate peer osd." << peer << " " << pi
2562 << " uptodate" << dendl;
2563 } else {
2564 psdout(10) << "activate peer osd." << peer << " " << pi
2565 << " missing " << pm << dendl;
2566 }
2567 }
2568
2569 // Set up missing_loc
2570 set<pg_shard_t> complete_shards;
2571 for (set<pg_shard_t>::iterator i = acting_recovery_backfill.begin();
2572 i != acting_recovery_backfill.end();
2573 ++i) {
2574 psdout(20) << __func__ << " setting up missing_loc from shard " << *i
2575 << " " << dendl;
2576 if (*i == get_primary()) {
2577 missing_loc.add_active_missing(missing);
2578 if (!missing.have_missing())
2579 complete_shards.insert(*i);
2580 } else {
2581 auto peer_missing_entry = peer_missing.find(*i);
2582 ceph_assert(peer_missing_entry != peer_missing.end());
2583 missing_loc.add_active_missing(peer_missing_entry->second);
2584 if (!peer_missing_entry->second.have_missing() &&
2585 peer_info[*i].last_backfill.is_max())
2586 complete_shards.insert(*i);
2587 }
2588 }
2589
2590 // If necessary, create might_have_unfound to help us find our unfound objects.
2591 // NOTE: It's important that we build might_have_unfound before trimming the
2592 // past intervals.
2593 might_have_unfound.clear();
2594 if (needs_recovery()) {
2595 // If only one shard has missing, we do a trick to add all others as recovery
2596 // sources; this is considered safe since the PGLogs have been merged locally,
2597 // and it covers the vast majority of use cases, such as one OSD/host being
2598 // down for a while for hardware repair.
2599 if (complete_shards.size() + 1 == acting_recovery_backfill.size()) {
2600 missing_loc.add_batch_sources_info(complete_shards, ctx.handle);
2601 } else {
2602 missing_loc.add_source_info(pg_whoami, info, pg_log.get_missing(),
2603 ctx.handle);
2604 for (set<pg_shard_t>::iterator i = acting_recovery_backfill.begin();
2605 i != acting_recovery_backfill.end();
2606 ++i) {
2607 if (*i == pg_whoami) continue;
2608 psdout(10) << __func__ << ": adding " << *i << " as a source" << dendl;
2609 ceph_assert(peer_missing.count(*i));
2610 ceph_assert(peer_info.count(*i));
2611 missing_loc.add_source_info(
2612 *i,
2613 peer_info[*i],
2614 peer_missing[*i],
2615 ctx.handle);
2616 }
2617 }
2618 for (map<pg_shard_t, pg_missing_t>::iterator i = peer_missing.begin();
2619 i != peer_missing.end();
2620 ++i) {
2621 if (is_acting_recovery_backfill(i->first))
2622 continue;
2623 ceph_assert(peer_info.count(i->first));
2624 search_for_missing(
2625 peer_info[i->first],
2626 i->second,
2627 i->first,
2628 ctx);
2629 }
2630
2631 build_might_have_unfound();
2632
2633 // Always call now so update_calc_stats() will be accurate
2634 discover_all_missing(ctx.msgs);
2635
2636 }
2637
2638 // num_objects_degraded, if calculated, should reflect this too, unless
2639 // nothing is missing and we are about to go clean.
2640 if (get_osdmap()->get_pg_size(info.pgid.pgid) > actingset.size()) {
2641 state_set(PG_STATE_UNDERSIZED);
2642 }
2643
2644 state_set(PG_STATE_ACTIVATING);
2645 pl->on_activate(std::move(to_trim));
2646 }
2647 if (actingset.size() >= pool.info.min_size) {
2648 PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
2649 pg_log.roll_forward(rollbacker.get());
2650 }
2651 }
2652
2653 void PeeringState::share_pg_info()
2654 {
2655 psdout(10) << "share_pg_info" << dendl;
2656
2657 info.history.refresh_prior_readable_until_ub(pl->get_mnow(),
2658 prior_readable_until_ub);
2659
2660 // share new pg_info_t with replicas
2661 ceph_assert(!acting_recovery_backfill.empty());
2662 for (auto pg_shard : acting_recovery_backfill) {
2663 if (pg_shard == pg_whoami) continue;
2664 if (auto peer = peer_info.find(pg_shard); peer != peer_info.end()) {
2665 peer->second.last_epoch_started = info.last_epoch_started;
2666 peer->second.last_interval_started = info.last_interval_started;
2667 peer->second.history.merge(info.history);
2668 }
2669 Message* m = nullptr;
2670 if (last_require_osd_release >= ceph_release_t::octopus) {
2671 m = new MOSDPGInfo2{spg_t{info.pgid.pgid, pg_shard.shard},
2672 info,
2673 get_osdmap_epoch(),
2674 get_osdmap_epoch(),
2675 get_lease(), {}};
2676 } else {
2677 m = new MOSDPGInfo{get_osdmap_epoch(),
2678 {pg_notify_t{pg_shard.shard,
2679 pg_whoami.shard,
2680 get_osdmap_epoch(),
2681 get_osdmap_epoch(),
2682 info,
2683 past_intervals}}};
2684 }
2685 pl->send_cluster_message(pg_shard.osd, m, get_osdmap_epoch());
2686 }
2687 }
2688
2689 void PeeringState::merge_log(
2690 ObjectStore::Transaction& t, pg_info_t &oinfo, pg_log_t &olog,
2691 pg_shard_t from)
2692 {
2693 PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
2694 pg_log.merge_log(
2695 oinfo, olog, from, info, rollbacker.get(), dirty_info, dirty_big_info);
2696 }
2697
2698 void PeeringState::rewind_divergent_log(
2699 ObjectStore::Transaction& t, eversion_t newhead)
2700 {
2701 PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
2702 pg_log.rewind_divergent_log(
2703 newhead, info, rollbacker.get(), dirty_info, dirty_big_info);
2704 }
2705
2706
2707 void PeeringState::proc_primary_info(
2708 ObjectStore::Transaction &t, const pg_info_t &oinfo)
2709 {
2710 ceph_assert(!is_primary());
2711
2712 update_history(oinfo.history);
2713 if (!info.stats.stats_invalid && info.stats.stats.sum.num_scrub_errors) {
2714 info.stats.stats.sum.num_scrub_errors = 0;
2715 info.stats.stats.sum.num_shallow_scrub_errors = 0;
2716 info.stats.stats.sum.num_deep_scrub_errors = 0;
2717 dirty_info = true;
2718 }
2719
2720 if (!(info.purged_snaps == oinfo.purged_snaps)) {
2721 psdout(10) << __func__ << " updating purged_snaps to "
2722 << oinfo.purged_snaps
2723 << dendl;
2724 info.purged_snaps = oinfo.purged_snaps;
2725 dirty_info = true;
2726 dirty_big_info = true;
2727 }
2728 }
2729
2730 void PeeringState::proc_master_log(
2731 ObjectStore::Transaction& t, pg_info_t &oinfo,
2732 pg_log_t &olog, pg_missing_t& omissing, pg_shard_t from)
2733 {
2734 psdout(10) << "proc_master_log for osd." << from << ": "
2735 << olog << " " << omissing << dendl;
2736 ceph_assert(!is_peered() && is_primary());
2737
2738 // merge log into our own log to build master log. no need to
2739 // make any adjustments to their missing map; we are taking their
2740 // log to be authoritative (i.e., their entries are by definition
2741 // non-divergent).
2742 merge_log(t, oinfo, olog, from);
2743 peer_info[from] = oinfo;
2744 psdout(10) << " peer osd." << from << " now " << oinfo
2745 << " " << omissing << dendl;
2746 might_have_unfound.insert(from);
2747
2748 // See doc/dev/osd_internals/last_epoch_started
2749 if (oinfo.last_epoch_started > info.last_epoch_started) {
2750 info.last_epoch_started = oinfo.last_epoch_started;
2751 dirty_info = true;
2752 }
2753 if (oinfo.last_interval_started > info.last_interval_started) {
2754 info.last_interval_started = oinfo.last_interval_started;
2755 dirty_info = true;
2756 }
2757 update_history(oinfo.history);
2758 ceph_assert(cct->_conf->osd_find_best_info_ignore_history_les ||
2759 info.last_epoch_started >= info.history.last_epoch_started);
2760
2761 peer_missing[from].claim(omissing);
2762 }
2763
2764 void PeeringState::proc_replica_log(
2765 pg_info_t &oinfo,
2766 const pg_log_t &olog,
2767 pg_missing_t& omissing,
2768 pg_shard_t from)
2769 {
2770 psdout(10) << "proc_replica_log for osd." << from << ": "
2771 << oinfo << " " << olog << " " << omissing << dendl;
2772
2773 pg_log.proc_replica_log(oinfo, olog, omissing, from);
2774
2775 peer_info[from] = oinfo;
2776 psdout(10) << " peer osd." << from << " now "
2777 << oinfo << " " << omissing << dendl;
2778 might_have_unfound.insert(from);
2779
2780 for (map<hobject_t, pg_missing_item>::const_iterator i =
2781 omissing.get_items().begin();
2782 i != omissing.get_items().end();
2783 ++i) {
2784 psdout(20) << " after missing " << i->first
2785 << " need " << i->second.need
2786 << " have " << i->second.have << dendl;
2787 }
2788 peer_missing[from].claim(omissing);
2789 }
2790
2791 void PeeringState::fulfill_info(
2792 pg_shard_t from, const pg_query_t &query,
2793 pair<pg_shard_t, pg_info_t> &notify_info)
2794 {
2795 ceph_assert(from == primary);
2796 ceph_assert(query.type == pg_query_t::INFO);
2797
2798 // info
2799 psdout(10) << "sending info" << dendl;
2800 notify_info = make_pair(from, info);
2801 }
2802
2803 void PeeringState::fulfill_log(
2804 pg_shard_t from, const pg_query_t &query, epoch_t query_epoch)
2805 {
2806 psdout(10) << "log request from " << from << dendl;
2807 ceph_assert(from == primary);
2808 ceph_assert(query.type != pg_query_t::INFO);
2809
2810 MOSDPGLog *mlog = new MOSDPGLog(
2811 from.shard, pg_whoami.shard,
2812 get_osdmap_epoch(),
2813 info, query_epoch);
2814 mlog->missing = pg_log.get_missing();
2815
2816 // primary -> other, when building master log
2817 if (query.type == pg_query_t::LOG) {
2818 psdout(10) << " sending info+missing+log since " << query.since
2819 << dendl;
2820 if (query.since != eversion_t() && query.since < pg_log.get_tail()) {
2821 pl->get_clog_error() << info.pgid << " got broken pg_query_t::LOG since "
2822 << query.since
2823 << " when my log.tail is " << pg_log.get_tail()
2824 << ", sending full log instead";
2825 mlog->log = pg_log.get_log(); // primary should not have requested this!!
2826 } else
2827 mlog->log.copy_after(cct, pg_log.get_log(), query.since);
2828 }
2829 else if (query.type == pg_query_t::FULLLOG) {
2830 psdout(10) << " sending info+missing+full log" << dendl;
2831 mlog->log = pg_log.get_log();
2832 }
2833
2834 psdout(10) << " sending " << mlog->log << " " << mlog->missing << dendl;
2835
2836 pl->send_cluster_message(from.osd, mlog, get_osdmap_epoch(), true);
2837 }
2838
2839 void PeeringState::fulfill_query(const MQuery& query, PeeringCtxWrapper &rctx)
2840 {
2841 if (query.query.type == pg_query_t::INFO) {
2842 pair<pg_shard_t, pg_info_t> notify_info;
2843 // note this refreshes our prior_readable_until_ub value
2844 update_history(query.query.history);
2845 fulfill_info(query.from, query.query, notify_info);
2846 rctx.send_notify(
2847 notify_info.first.osd,
2848 pg_notify_t(
2849 notify_info.first.shard, pg_whoami.shard,
2850 query.query_epoch,
2851 get_osdmap_epoch(),
2852 notify_info.second,
2853 past_intervals));
2854 } else {
2855 update_history(query.query.history);
2856 fulfill_log(query.from, query.query, query.query_epoch);
2857 }
2858 }
2859
2860 void PeeringState::try_mark_clean()
2861 {
2862 if (actingset.size() == get_osdmap()->get_pg_size(info.pgid.pgid)) {
2863 state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
2864 state_set(PG_STATE_CLEAN);
2865 info.history.last_epoch_clean = get_osdmap_epoch();
2866 info.history.last_interval_clean = info.history.same_interval_since;
2867 past_intervals.clear();
2868 dirty_big_info = true;
2869 dirty_info = true;
2870 }
2871
2872 if (!is_active() && is_peered()) {
2873 if (is_clean()) {
2874 bool target;
2875 if (pool.info.is_pending_merge(info.pgid.pgid, &target)) {
2876 if (target) {
2877 psdout(10) << "ready to merge (target)" << dendl;
2878 pl->set_ready_to_merge_target(
2879 info.last_update,
2880 info.history.last_epoch_started,
2881 info.history.last_epoch_clean);
2882 } else {
2883 psdout(10) << "ready to merge (source)" << dendl;
2884 pl->set_ready_to_merge_source(info.last_update);
2885 }
2886 }
2887 } else {
2888 psdout(10) << "not clean, not ready to merge" << dendl;
2889 // we should have notified OSD in Active state entry point
2890 }
2891 }
2892
2893 state_clear(PG_STATE_FORCED_RECOVERY | PG_STATE_FORCED_BACKFILL);
2894
2895 share_pg_info();
2896 pl->publish_stats_to_osd();
2897 clear_recovery_state();
2898 }
2899
2900 void PeeringState::split_into(
2901 pg_t child_pgid, PeeringState *child, unsigned split_bits)
2902 {
2903 child->update_osdmap_ref(get_osdmap());
2904 child->pool = pool;
2905
2906 // Log
2907 pg_log.split_into(child_pgid, split_bits, &(child->pg_log));
2908 child->info.last_complete = info.last_complete;
2909
2910 info.last_update = pg_log.get_head();
2911 child->info.last_update = child->pg_log.get_head();
2912
2913 child->info.last_user_version = info.last_user_version;
2914
2915 info.log_tail = pg_log.get_tail();
2916 child->info.log_tail = child->pg_log.get_tail();
2917
2918 // reset last_complete, we might have modified pg_log & missing above
2919 pg_log.reset_complete_to(&info);
2920 child->pg_log.reset_complete_to(&child->info);
2921
2922 // Info
2923 child->info.history = info.history;
2924 child->info.history.epoch_created = get_osdmap_epoch();
2925 child->info.purged_snaps = info.purged_snaps;
2926
2927 if (info.last_backfill.is_max()) {
2928 child->info.set_last_backfill(hobject_t::get_max());
2929 } else {
2930 // restart backfill on parent and child to be safe. we could
2931 // probably do better in the bitwise sort case, but it's more
2932 // fragile (there may be special work to do on backfill completion
2933 // in the future).
2934 info.set_last_backfill(hobject_t());
2935 child->info.set_last_backfill(hobject_t());
2936 // restarting backfill implies that the missing set is empty,
2937 // since it is only used for objects prior to last_backfill
2938 pg_log.reset_backfill();
2939 child->pg_log.reset_backfill();
2940 }
2941
2942 child->info.stats = info.stats;
2943 child->info.stats.parent_split_bits = split_bits;
2944 info.stats.stats_invalid = true;
2945 child->info.stats.stats_invalid = true;
2946 child->info.last_epoch_started = info.last_epoch_started;
2947 child->info.last_interval_started = info.last_interval_started;
2948
2949 // There can't be recovery/backfill going on now
2950 int primary, up_primary;
2951 vector<int> newup, newacting;
2952 get_osdmap()->pg_to_up_acting_osds(
2953 child->info.pgid.pgid, &newup, &up_primary, &newacting, &primary);
2954 child->init_primary_up_acting(
2955 newup,
2956 newacting,
2957 up_primary,
2958 primary);
2959 child->role = OSDMap::calc_pg_role(pg_whoami, child->acting);
2960
2961 // this comparison includes primary rank via pg_shard_t
2962 if (get_primary() != child->get_primary())
2963 child->info.history.same_primary_since = get_osdmap_epoch();
2964
2965 child->info.stats.up = up;
2966 child->info.stats.up_primary = up_primary;
2967 child->info.stats.acting = acting;
2968 child->info.stats.acting_primary = primary;
2969 child->info.stats.mapping_epoch = get_osdmap_epoch();
2970
2971 // History
2972 child->past_intervals = past_intervals;
2973
2974 child->on_new_interval();
2975
2976 child->send_notify = !child->is_primary();
2977
2978 child->dirty_info = true;
2979 child->dirty_big_info = true;
2980 dirty_info = true;
2981 dirty_big_info = true;
2982 }
2983
2984 void PeeringState::merge_from(
2985 map<spg_t,PeeringState *>& sources,
2986 PeeringCtx &rctx,
2987 unsigned split_bits,
2988 const pg_merge_meta_t& last_pg_merge_meta)
2989 {
2990 bool incomplete = false;
2991 if (info.last_complete != info.last_update ||
2992 info.is_incomplete() ||
2993 info.dne()) {
2994 psdout(10) << __func__ << " target incomplete" << dendl;
2995 incomplete = true;
2996 }
2997 if (last_pg_merge_meta.source_pgid != pg_t()) {
2998 if (info.pgid.pgid != last_pg_merge_meta.source_pgid.get_parent()) {
2999 psdout(10) << __func__ << " target doesn't match expected parent "
3000 << last_pg_merge_meta.source_pgid.get_parent()
3001 << " of source_pgid " << last_pg_merge_meta.source_pgid
3002 << dendl;
3003 incomplete = true;
3004 }
3005 if (info.last_update != last_pg_merge_meta.target_version) {
3006 psdout(10) << __func__ << " target version doesn't match expected "
3007 << last_pg_merge_meta.target_version << dendl;
3008 incomplete = true;
3009 }
3010 }
3011
3012 PGLog::LogEntryHandlerRef handler{pl->get_log_handler(rctx.transaction)};
3013 pg_log.roll_forward(handler.get());
3014
3015 info.last_complete = info.last_update; // to fake out trim()
3016 pg_log.reset_recovery_pointers();
3017 pg_log.trim(info.last_update, info);
3018
3019 vector<PGLog*> log_from;
3020 for (auto& i : sources) {
3021 auto& source = i.second;
3022 if (!source) {
3023 psdout(10) << __func__ << " source " << i.first << " missing" << dendl;
3024 incomplete = true;
3025 continue;
3026 }
3027 if (source->info.last_complete != source->info.last_update ||
3028 source->info.is_incomplete() ||
3029 source->info.dne()) {
3030 psdout(10) << __func__ << " source " << source->pg_whoami
3031 << " incomplete"
3032 << dendl;
3033 incomplete = true;
3034 }
3035 if (last_pg_merge_meta.source_pgid != pg_t()) {
3036 if (source->info.pgid.pgid != last_pg_merge_meta.source_pgid) {
3037 dout(10) << __func__ << " source " << source->info.pgid.pgid
3038 << " doesn't match expected source pgid "
3039 << last_pg_merge_meta.source_pgid << dendl;
3040 incomplete = true;
3041 }
3042 if (source->info.last_update != last_pg_merge_meta.source_version) {
3043 dout(10) << __func__ << " source version doesn't match expected "
3044 << last_pg_merge_meta.source_version << dendl;
3045 incomplete = true;
3046 }
3047 }
3048
3049 // prepare log
3050 PGLog::LogEntryHandlerRef handler{
3051 source->pl->get_log_handler(rctx.transaction)};
3052 source->pg_log.roll_forward(handler.get());
3053 source->info.last_complete = source->info.last_update; // to fake out trim()
3054 source->pg_log.reset_recovery_pointers();
3055 source->pg_log.trim(source->info.last_update, source->info);
3056 log_from.push_back(&source->pg_log);
3057
3058 // combine stats
3059 info.stats.add(source->info.stats);
3060
3061 // pull up last_update
3062 info.last_update = std::max(info.last_update, source->info.last_update);
3063
3064 // adopt source's PastIntervals if target has none. we can do this since
3065 // pgp_num has been reduced prior to the merge, so the OSD mappings for
3066 // the PGs are identical.
3067 if (past_intervals.empty() && !source->past_intervals.empty()) {
3068 psdout(10) << __func__ << " taking source's past_intervals" << dendl;
3069 past_intervals = source->past_intervals;
3070 }
3071 }
3072
3073 info.last_complete = info.last_update;
3074 info.log_tail = info.last_update;
3075 if (incomplete) {
3076 info.last_backfill = hobject_t();
3077 }
3078
3079 // merge logs
3080 pg_log.merge_from(log_from, info.last_update);
3081
3082 // make sure we have a meaningful last_epoch_started/clean (if we were a
3083 // placeholder)
3084 if (info.history.epoch_created == 0) {
3085 // start with (a) source's history, since these PGs *should* have been
3086 // remapped in concert with each other...
3087 info.history = sources.begin()->second->info.history;
3088
3089 // we use the last_epoch_{started,clean} we got from
3090 // the caller, which are the epochs that were reported when the PGs were
3091 // found to be ready for merge.
3092 info.history.last_epoch_clean = last_pg_merge_meta.last_epoch_clean;
3093 info.history.last_epoch_started = last_pg_merge_meta.last_epoch_started;
3094 info.last_epoch_started = last_pg_merge_meta.last_epoch_started;
3095 psdout(10) << __func__
3096 << " set les/c to " << last_pg_merge_meta.last_epoch_started << "/"
3097 << last_pg_merge_meta.last_epoch_clean
3098 << " from pool last_dec_*, source pg history was "
3099 << sources.begin()->second->info.history
3100 << dendl;
3101
3102 // above we have pulled down source's history and we need to check
3103 // history.epoch_created again to confirm that source is not a placeholder
3104 // too. (peering requires a sane history.same_interval_since value for any
3105 // non-newly created pg and below here we know we are basically iterating
3106 // back a series of past maps to fake a merge process, hence we need to
3107 // fix history.same_interval_since first so that start_peering_interval()
3108 // will not complain)
3109 if (info.history.epoch_created == 0) {
3110 dout(10) << __func__ << " both merge target and source are placeholders,"
3111 << " set sis to lec " << info.history.last_epoch_clean
3112 << dendl;
3113 info.history.same_interval_since = info.history.last_epoch_clean;
3114 }
3115
3116 // if the past_intervals start is later than last_epoch_clean, it
3117 // implies the source repeered again but the target didn't, or
3118 // that the source became clean in a later epoch than the target.
3119 // avoid the discrepancy by adjusting the interval start
3120 // backwards to match so that check_past_interval_bounds() will
3121 // not complain.
3122 auto pib = past_intervals.get_bounds();
3123 if (info.history.last_epoch_clean < pib.first) {
3124 psdout(10) << __func__ << " last_epoch_clean "
3125 << info.history.last_epoch_clean << " < past_interval start "
3126 << pib.first << ", adjusting start backwards" << dendl;
3127 past_intervals.adjust_start_backwards(info.history.last_epoch_clean);
3128 }
3129
3130 // Similarly, if the same_interval_since value is later than
3131 // last_epoch_clean, the next interval change will result in a
3132 // past_interval start that is later than last_epoch_clean. This
3133 // can happen if we use the pg_history values from the merge
3134 // source. Adjust the same_interval_since value backwards if that
3135 // happens. (We trust the les and lec values more because they came from
3136 // the real target, whereas the history value we stole from the source.)
3137 if (info.history.last_epoch_started < info.history.same_interval_since) {
3138 psdout(10) << __func__ << " last_epoch_started "
3139 << info.history.last_epoch_started << " < same_interval_since "
3140 << info.history.same_interval_since
3141 << ", adjusting pg_history backwards" << dendl;
3142 info.history.same_interval_since = info.history.last_epoch_clean;
3143 // make sure same_{up,primary}_since are <= same_interval_since
3144 info.history.same_up_since = std::min(
3145 info.history.same_up_since, info.history.same_interval_since);
3146 info.history.same_primary_since = std::min(
3147 info.history.same_primary_since, info.history.same_interval_since);
3148 }
3149 }
3150
3151 dirty_info = true;
3152 dirty_big_info = true;
3153 }
3154
3155 void PeeringState::start_split_stats(
3156 const set<spg_t>& childpgs, vector<object_stat_sum_t> *out)
3157 {
3158 out->resize(childpgs.size() + 1);
3159 info.stats.stats.sum.split(*out);
3160 }
3161
3162 void PeeringState::finish_split_stats(
3163 const object_stat_sum_t& stats, ObjectStore::Transaction &t)
3164 {
3165 info.stats.stats.sum = stats;
3166 write_if_dirty(t);
3167 }
3168
3169 void PeeringState::update_blocked_by()
3170 {
3171 // set a max on the number of blocking peers we report. if we go
3172 // over, report a random subset. keep the result sorted.
3173 unsigned keep = std::min<unsigned>(
3174 blocked_by.size(), cct->_conf->osd_max_pg_blocked_by);
3175 unsigned skip = blocked_by.size() - keep;
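// This is selection sampling (Knuth's Algorithm S): each remaining
// element is kept with probability keep/(skip+keep), yielding a
// uniformly random subset while preserving the set's sorted order.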
3176 info.stats.blocked_by.clear();
3177 info.stats.blocked_by.resize(keep);
3178 unsigned pos = 0;
3179 for (set<int>::iterator p = blocked_by.begin();
3180 p != blocked_by.end() && keep > 0;
3181 ++p) {
3182 if (skip > 0 && (rand() % (skip + keep) < skip)) {
3183 --skip;
3184 } else {
3185 info.stats.blocked_by[pos++] = *p;
3186 --keep;
3187 }
3188 }
3189 }
3190
3191 static bool find_shard(const set<pg_shard_t> & pgs, shard_id_t shard)
3192 {
3193 for (auto&p : pgs)
3194 if (p.shard == shard)
3195 return true;
3196 return false;
3197 }
3198
3199 static pg_shard_t get_another_shard(const set<pg_shard_t> & pgs, pg_shard_t skip, shard_id_t shard)
3200 {
3201 for (auto&p : pgs) {
3202 if (p == skip)
3203 continue;
3204 if (p.shard == shard)
3205 return p;
3206 }
3207 return pg_shard_t();
3208 }
3209
3210 void PeeringState::update_calc_stats()
3211 {
3212 info.stats.version = info.last_update;
3213 info.stats.created = info.history.epoch_created;
3214 info.stats.last_scrub = info.history.last_scrub;
3215 info.stats.last_scrub_stamp = info.history.last_scrub_stamp;
3216 info.stats.last_deep_scrub = info.history.last_deep_scrub;
3217 info.stats.last_deep_scrub_stamp = info.history.last_deep_scrub_stamp;
3218 info.stats.last_clean_scrub_stamp = info.history.last_clean_scrub_stamp;
3219 info.stats.last_epoch_clean = info.history.last_epoch_clean;
3220
3221 info.stats.log_size = pg_log.get_head().version - pg_log.get_tail().version;
3222 info.stats.ondisk_log_size = info.stats.log_size;
3223 info.stats.log_start = pg_log.get_tail();
3224 info.stats.ondisk_log_start = pg_log.get_tail();
3225 info.stats.snaptrimq_len = pl->get_snap_trimq_size();
3226
3227 unsigned num_shards = get_osdmap()->get_pg_size(info.pgid.pgid);
3228
3229 // In the rare case that upset is too large (usually transient), use it as
3230 // the target for the calculations below.
3231 unsigned target = std::max(num_shards, (unsigned)upset.size());
3232 // When undersized, actingset may be larger than upset, with OSDs out
3233 unsigned nrep = std::max(actingset.size(), upset.size());
3234 // calc num_object_copies
3235 info.stats.stats.calc_copies(std::max(target, nrep));
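// E.g. (hypothetical): pool size 3 with upset=[0,1] and actingset=[0,1]
// gives target=3 (pool size dominates) and nrep=2, so copies are
// computed against max(3,2)=3 and the absent third replica will show
// up as degraded below.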
3236 info.stats.stats.sum.num_objects_degraded = 0;
3237 info.stats.stats.sum.num_objects_unfound = 0;
3238 info.stats.stats.sum.num_objects_misplaced = 0;
3239 info.stats.avail_no_missing.clear();
3240 info.stats.object_location_counts.clear();
3241
3242 // We should never hit this condition, but if we end up hitting it,
3243 // make sure to update num_objects and set PG_STATE_INCONSISTENT.
3244 if (info.stats.stats.sum.num_objects < 0) {
3245 psdout(0) << __func__ << " negative num_objects = "
3246 << info.stats.stats.sum.num_objects << " setting it to 0 "
3247 << dendl;
3248 info.stats.stats.sum.num_objects = 0;
3249 state_set(PG_STATE_INCONSISTENT);
3250 }
3251
3252 if ((is_remapped() || is_undersized() || !is_clean()) &&
3253 (is_peered()|| is_activating())) {
3254 psdout(20) << __func__ << " actingset " << actingset << " upset "
3255 << upset << " acting_recovery_backfill " << acting_recovery_backfill << dendl;
3256
3257 ceph_assert(!acting_recovery_backfill.empty());
3258
3259 bool estimate = false;
3260
3261 // NOTE: we only generate degraded, misplaced and unfound
3262 // values for the summation, not individual stat categories.
3263 int64_t num_objects = info.stats.stats.sum.num_objects;
3264
3265 // Objects missing from up nodes, sorted by # objects.
3266 boost::container::flat_set<pair<int64_t,pg_shard_t>> missing_target_objects;
3267 // Objects missing from nodes not in up, sorted by # objects
3268 boost::container::flat_set<pair<int64_t,pg_shard_t>> acting_source_objects;
3269
3270 // Fill missing_target_objects/acting_source_objects
3271
3272 {
3273 int64_t missing;
3274
3275 // Primary first
3276 missing = pg_log.get_missing().num_missing();
3277 ceph_assert(acting_recovery_backfill.count(pg_whoami));
3278 if (upset.count(pg_whoami)) {
3279 missing_target_objects.emplace(missing, pg_whoami);
3280 } else {
3281 acting_source_objects.emplace(missing, pg_whoami);
3282 }
3283 info.stats.stats.sum.num_objects_missing_on_primary = missing;
3284 if (missing == 0)
3285 info.stats.avail_no_missing.push_back(pg_whoami);
3286 psdout(20) << __func__ << " shard " << pg_whoami
3287 << " primary objects " << num_objects
3288 << " missing " << missing
3289 << dendl;
3290 }
3291
3292 // All other peers
3293 for (auto& peer : peer_info) {
3294 // Primary should not be in the peer_info, skip if it is.
3295 if (peer.first == pg_whoami) continue;
3296 int64_t missing = 0;
3297 int64_t peer_num_objects =
3298 std::max((int64_t)0, peer.second.stats.stats.sum.num_objects);
3299 // Backfill targets always track num_objects accurately;
3300 // all other peers track missing accurately.
3301 if (is_backfill_target(peer.first)) {
3302 missing = std::max((int64_t)0, num_objects - peer_num_objects);
3303 } else {
3304 if (peer_missing.count(peer.first)) {
3305 missing = peer_missing[peer.first].num_missing();
3306 } else {
3307 psdout(20) << __func__ << " no peer_missing found for "
3308 << peer.first << dendl;
3309 if (is_recovering()) {
3310 estimate = true;
3311 }
3312 missing = std::max((int64_t)0, num_objects - peer_num_objects);
3313 }
3314 }
3315 if (upset.count(peer.first)) {
3316 missing_target_objects.emplace(missing, peer.first);
3317 } else if (actingset.count(peer.first)) {
3318 acting_source_objects.emplace(missing, peer.first);
3319 }
3320 peer.second.stats.stats.sum.num_objects_missing = missing;
3321 if (missing == 0)
3322 info.stats.avail_no_missing.push_back(peer.first);
3323 psdout(20) << __func__ << " shard " << peer.first
3324 << " objects " << peer_num_objects
3325 << " missing " << missing
3326 << dendl;
3327 }
3328
3329 // Compute object_location_counts
3330 for (auto& ml: missing_loc.get_missing_locs()) {
3331 info.stats.object_location_counts[ml.second]++;
3332 psdout(30) << __func__ << " " << ml.first << " object_location_counts["
3333 << ml.second << "]=" << info.stats.object_location_counts[ml.second]
3334 << dendl;
3335 }
3336 int64_t not_missing = num_objects - missing_loc.get_missing_locs().size();
3337 if (not_missing) {
3338 // During recovery we know upset == actingset and it is being populated.
3339 // During backfill we know that all non-missing objects are in the actingset.
3340 info.stats.object_location_counts[actingset] = not_missing;
3341 }
3342 psdout(30) << __func__ << " object_location_counts["
3343 << upset << "]=" << info.stats.object_location_counts[upset]
3344 << dendl;
3345 psdout(20) << __func__ << " object_location_counts "
3346 << info.stats.object_location_counts << dendl;
3347
3348 // A misplaced object is not stored on the correct OSD
3349 int64_t misplaced = 0;
3350 // A degraded object has fewer replicas or EC shards than the pool specifies.
3351 int64_t degraded = 0;
3352
3353 if (is_recovering()) {
3354 for (auto& sml: missing_loc.get_missing_by_count()) {
3355 for (auto& ml: sml.second) {
3356 int missing_shards;
3357 if (sml.first == shard_id_t::NO_SHARD) {
3358 psdout(20) << __func__ << " ml " << ml.second
3359 << " upset size " << upset.size()
3360 << " up " << ml.first.up << dendl;
3361 missing_shards = (int)upset.size() - ml.first.up;
3362 } else {
3363 // Handle shards not even in upset below
3364 if (!find_shard(upset, sml.first))
3365 continue;
3366 missing_shards = std::max(0, 1 - ml.first.up);
3367 psdout(20) << __func__
3368 << " shard " << sml.first
3369 << " ml " << ml.second
3370 << " missing shards " << missing_shards << dendl;
3371 }
3372 int odegraded = ml.second * missing_shards;
3373 // Copies on other OSDs, but limited to the possible degraded count
3374 int more_osds = std::min(missing_shards, ml.first.other);
3375 int omisplaced = ml.second * more_osds;
3376 ceph_assert(omisplaced <= odegraded);
3377 odegraded -= omisplaced;
3378
3379 misplaced += omisplaced;
3380 degraded += odegraded;
3381 }
3382 }
3383
3384 psdout(20) << __func__ << " missing based degraded "
3385 << degraded << dendl;
3386 psdout(20) << __func__ << " missing based misplaced "
3387 << misplaced << dendl;
3388
3389 // Handle undersized case
3390 if (pool.info.is_replicated()) {
3391 // Add degraded for missing targets (num_objects missing)
3392 ceph_assert(target >= upset.size());
3393 unsigned needed = target - upset.size();
3394 degraded += num_objects * needed;
3395 } else {
3396 for (unsigned i = 0 ; i < num_shards; ++i) {
3397 shard_id_t shard(i);
3398
3399 if (!find_shard(upset, shard)) {
3400 pg_shard_t pgs = get_another_shard(actingset, pg_shard_t(), shard);
3401
3402 if (pgs != pg_shard_t()) {
3403 int64_t missing;
3404
3405 if (pgs == pg_whoami)
3406 missing = info.stats.stats.sum.num_objects_missing_on_primary;
3407 else
3408 missing = peer_info[pgs].stats.stats.sum.num_objects_missing;
3409
3410 degraded += missing;
3411 misplaced += std::max((int64_t)0, num_objects - missing);
3412 } else {
3413 // No shard anywhere
3414 degraded += num_objects;
3415 }
3416 }
3417 }
3418 }
3419 goto out;
3420 }
3421
3422 // Handle undersized case
3423 if (pool.info.is_replicated()) {
3424 // Add to missing_target_objects
3425 ceph_assert(target >= missing_target_objects.size());
3426 unsigned needed = target - missing_target_objects.size();
3427 if (needed)
3428 missing_target_objects.emplace(num_objects * needed, pg_shard_t(pg_shard_t::NO_OSD));
3429 } else {
3430 for (unsigned i = 0 ; i < num_shards; ++i) {
3431 shard_id_t shard(i);
3432 bool found = false;
3433 for (const auto& t : missing_target_objects) {
3434 if (std::get<1>(t).shard == shard) {
3435 found = true;
3436 break;
3437 }
3438 }
3439 if (!found)
3440 missing_target_objects.emplace(num_objects, pg_shard_t(pg_shard_t::NO_OSD,shard));
3441 }
3442 }
3443
3444 for (const auto& item : missing_target_objects)
3445 psdout(20) << __func__ << " missing shard " << std::get<1>(item)
3446 << " missing= " << std::get<0>(item) << dendl;
3447 for (const auto& item : acting_source_objects)
3448 psdout(20) << __func__ << " acting shard " << std::get<1>(item)
3449 << " missing= " << std::get<0>(item) << dendl;
3450
3451 // Handle all objects not in missing for remapped
3452 // or backfill
3453 for (auto m = missing_target_objects.rbegin();
3454 m != missing_target_objects.rend(); ++m) {
3455
3456 int64_t extra_missing = -1;
3457
3458 if (pool.info.is_replicated()) {
3459 if (!acting_source_objects.empty()) {
3460 auto extra_copy = acting_source_objects.begin();
3461 extra_missing = std::get<0>(*extra_copy);
3462 acting_source_objects.erase(extra_copy);
3463 }
3464 } else { // Erasure coded
3465 // Use corresponding shard
3466 for (const auto& a : acting_source_objects) {
3467 if (std::get<1>(a).shard == std::get<1>(*m).shard) {
3468 extra_missing = std::get<0>(a);
3469 acting_source_objects.erase(a);
3470 break;
3471 }
3472 }
3473 }
3474
3475 if (extra_missing >= 0 && std::get<0>(*m) >= extra_missing) {
3476 // We don't know which of the objects on the target
3477 // are part of extra_missing, so assume they are all degraded.
3478 misplaced += std::get<0>(*m) - extra_missing;
3479 degraded += extra_missing;
3480 } else {
3481 // 1. extra_missing == -1: more targets than sources, so degraded
3482 // 2. extra_missing > std::get<0>(*m): some of the extra_missing objects
3483 //    previously counted as degraded are now present on the target.
3484 degraded += std::get<0>(*m);
3485 }
3486 }
3487 // If there are still acting that haven't been accounted for
3488 // then they are misplaced
3489 for (const auto& a : acting_source_objects) {
3490 int64_t extra_misplaced = std::max((int64_t)0, num_objects - std::get<0>(a));
3491 psdout(20) << __func__ << " extra acting misplaced " << extra_misplaced
3492 << dendl;
3493 misplaced += extra_misplaced;
3494 }
3495 out:
3496 // NOTE: Tests use these messages to verify this code
3497 psdout(20) << __func__ << " degraded " << degraded
3498 << (estimate ? " (est)": "") << dendl;
3499 psdout(20) << __func__ << " misplaced " << misplaced
3500 << (estimate ? " (est)": "")<< dendl;
3501
3502 info.stats.stats.sum.num_objects_degraded = degraded;
3503 info.stats.stats.sum.num_objects_unfound = get_num_unfound();
3504 info.stats.stats.sum.num_objects_misplaced = misplaced;
3505 }
3506 }
3507
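// Assemble the pg_stat_t that should be published to the OSD/mgr. Returns
// std::nullopt when nothing has changed since the last publish and the
// last report is newer than osd_pg_stat_report_interval_max; otherwise
// bumps reported_epoch/reported_seq, refreshes the last_* timestamps, and
// returns the stats (including a bounded slice of purged_snaps) to send.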
3508 std::optional<pg_stat_t> PeeringState::prepare_stats_for_publish(
3509 bool pg_stats_publish_valid,
3510 const pg_stat_t &pg_stats_publish,
3511 const object_stat_collection_t &unstable_stats)
3512 {
3513 if (info.stats.stats.sum.num_scrub_errors) {
3514 state_set(PG_STATE_INCONSISTENT);
3515 } else {
3516 state_clear(PG_STATE_INCONSISTENT);
3517 state_clear(PG_STATE_FAILED_REPAIR);
3518 }
3519
3520 utime_t now = ceph_clock_now();
3521 if (info.stats.state != state) {
3522 info.stats.last_change = now;
3523 // Optimistic estimate: if we just found out the PG is inactive,
3524 // assume it was active until now.
3525 if (!(state & PG_STATE_ACTIVE) &&
3526 (info.stats.state & PG_STATE_ACTIVE))
3527 info.stats.last_active = now;
3528
3529 if ((state & PG_STATE_ACTIVE) &&
3530 !(info.stats.state & PG_STATE_ACTIVE))
3531 info.stats.last_became_active = now;
3532 if ((state & (PG_STATE_ACTIVE|PG_STATE_PEERED)) &&
3533 !(info.stats.state & (PG_STATE_ACTIVE|PG_STATE_PEERED)))
3534 info.stats.last_became_peered = now;
3535 info.stats.state = state;
3536 }
3537
3538 update_calc_stats();
3539 if (info.stats.stats.sum.num_objects_degraded) {
3540 state_set(PG_STATE_DEGRADED);
3541 } else {
3542 state_clear(PG_STATE_DEGRADED);
3543 }
3544 update_blocked_by();
3545
3546 pg_stat_t pre_publish = info.stats;
3547 pre_publish.stats.add(unstable_stats);
3548 utime_t cutoff = now;
3549 cutoff -= cct->_conf->osd_pg_stat_report_interval_max;
3550
3551 // share (some of) our purged_snaps via the pg_stats. limit # of intervals
3552 // because we don't want to make the pg_stat_t structures too expensive.
3553 unsigned max = cct->_conf->osd_max_snap_prune_intervals_per_epoch;
3554 unsigned num = 0;
3555 auto i = info.purged_snaps.begin();
3556 while (num < max && i != info.purged_snaps.end()) {
3557 pre_publish.purged_snaps.insert(i.get_start(), i.get_len());
3558 ++num;
3559 ++i;
3560 }
3561 psdout(20) << __func__ << " reporting purged_snaps "
3562 << pre_publish.purged_snaps << dendl;
3563
3564 if (pg_stats_publish_valid && pre_publish == pg_stats_publish &&
3565 info.stats.last_fresh > cutoff) {
3566 psdout(15) << "publish_stats_to_osd " << pg_stats_publish.reported_epoch
3567 << ": no change since " << info.stats.last_fresh << dendl;
3568 return std::nullopt;
3569 } else {
3570 // update our stat summary and timestamps
3571 info.stats.reported_epoch = get_osdmap_epoch();
3572 ++info.stats.reported_seq;
3573
3574 info.stats.last_fresh = now;
3575
3576 if (info.stats.state & PG_STATE_CLEAN)
3577 info.stats.last_clean = now;
3578 if (info.stats.state & PG_STATE_ACTIVE)
3579 info.stats.last_active = now;
3580 if (info.stats.state & (PG_STATE_ACTIVE|PG_STATE_PEERED))
3581 info.stats.last_peered = now;
3582 info.stats.last_unstale = now;
3583 if ((info.stats.state & PG_STATE_DEGRADED) == 0)
3584 info.stats.last_undegraded = now;
3585 if ((info.stats.state & PG_STATE_UNDERSIZED) == 0)
3586 info.stats.last_fullsized = now;
3587
3588 psdout(15) << "publish_stats_to_osd " << pg_stats_publish.reported_epoch
3589 << ":" << pg_stats_publish.reported_seq << dendl;
3590 return std::make_optional(std::move(pre_publish));
3591 }
3592 }
3593
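// One-time initialization of a newly created PG: set role, up/acting sets,
// history and past_intervals, and persist the result. If backfill is
// requested, reset last_backfill to the beginning so the PG's contents
// will be backfilled from scratch.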
3594 void PeeringState::init(
3595 int role,
3596 const vector<int>& newup, int new_up_primary,
3597 const vector<int>& newacting, int new_acting_primary,
3598 const pg_history_t& history,
3599 const PastIntervals& pi,
3600 bool backfill,
3601 ObjectStore::Transaction &t)
3602 {
3603 psdout(10) << "init role " << role << " up "
3604 << newup << " acting " << newacting
3605 << " history " << history
3606 << " past_intervals " << pi
3607 << dendl;
3608
3609 set_role(role);
3610 init_primary_up_acting(
3611 newup,
3612 newacting,
3613 new_up_primary,
3614 new_acting_primary);
3615
3616 info.history = history;
3617 past_intervals = pi;
3618
3619 info.stats.up = up;
3620 info.stats.up_primary = new_up_primary;
3621 info.stats.acting = acting;
3622 info.stats.acting_primary = new_acting_primary;
3623 info.stats.mapping_epoch = info.history.same_interval_since;
3624
3625 if (!perform_deletes_during_peering()) {
3626 pg_log.set_missing_may_contain_deletes();
3627 }
3628
3629 if (backfill) {
3630 psdout(10) << __func__ << ": Setting backfill" << dendl;
3631 info.set_last_backfill(hobject_t());
3632 info.last_complete = info.last_update;
3633 pg_log.mark_log_for_rewrite();
3634 }
3635
3636 on_new_interval();
3637
3638 dirty_info = true;
3639 dirty_big_info = true;
3640 write_if_dirty(t);
3641 }
3642
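// Dump the peering state (state string, epoch, up/acting sets, backfill /
// async-recovery / acting_recovery_backfill targets, our info, and each
// peer's info) to the given Formatter.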
3643 void PeeringState::dump_peering_state(Formatter *f)
3644 {
3645 f->dump_string("state", get_pg_state_string());
3646 f->dump_unsigned("epoch", get_osdmap_epoch());
3647 f->open_array_section("up");
3648 for (vector<int>::const_iterator p = up.begin(); p != up.end(); ++p)
3649 f->dump_unsigned("osd", *p);
3650 f->close_section();
3651 f->open_array_section("acting");
3652 for (vector<int>::const_iterator p = acting.begin(); p != acting.end(); ++p)
3653 f->dump_unsigned("osd", *p);
3654 f->close_section();
3655 if (!backfill_targets.empty()) {
3656 f->open_array_section("backfill_targets");
3657 for (set<pg_shard_t>::iterator p = backfill_targets.begin();
3658 p != backfill_targets.end();
3659 ++p)
3660 f->dump_stream("shard") << *p;
3661 f->close_section();
3662 }
3663 if (!async_recovery_targets.empty()) {
3664 f->open_array_section("async_recovery_targets");
3665 for (set<pg_shard_t>::iterator p = async_recovery_targets.begin();
3666 p != async_recovery_targets.end();
3667 ++p)
3668 f->dump_stream("shard") << *p;
3669 f->close_section();
3670 }
3671 if (!acting_recovery_backfill.empty()) {
3672 f->open_array_section("acting_recovery_backfill");
3673 for (set<pg_shard_t>::iterator p = acting_recovery_backfill.begin();
3674 p != acting_recovery_backfill.end();
3675 ++p)
3676 f->dump_stream("shard") << *p;
3677 f->close_section();
3678 }
3679 f->open_object_section("info");
3680 update_calc_stats();
3681 info.dump(f);
3682 f->close_section();
3683
3684 f->open_array_section("peer_info");
3685 for (map<pg_shard_t, pg_info_t>::const_iterator p = peer_info.begin();
3686 p != peer_info.end();
3687 ++p) {
3688 f->open_object_section("info");
3689 f->dump_stream("peer") << p->first;
3690 p->second.dump(f);
3691 f->close_section();
3692 }
3693 }
3694
3695 void PeeringState::update_stats(
3696 std::function<bool(pg_history_t &, pg_stat_t &)> f,
3697 ObjectStore::Transaction *t) {
3698 if (f(info.history, info.stats)) {
3699 pl->publish_stats_to_osd();
3700 }
3701 pl->on_info_history_change();
3702
3703 if (t) {
3704 dirty_info = true;
3705 write_if_dirty(*t);
3706 }
3707 }
3708
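// Append a batch of prepared log entries to the local log, updating the
// local missing set, optionally rolling forward, and trimming to trim_to.
// Returns true if the new entries invalidated the stats.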
3709 bool PeeringState::append_log_entries_update_missing(
3710 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
3711 ObjectStore::Transaction &t, std::optional<eversion_t> trim_to,
3712 std::optional<eversion_t> roll_forward_to)
3713 {
3714 ceph_assert(!entries.empty());
3715 ceph_assert(entries.begin()->version > info.last_update);
3716
3717 PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
3718 bool invalidate_stats =
3719 pg_log.append_new_log_entries(
3720 info.last_backfill,
3721 entries,
3722 rollbacker.get());
3723
3724 if (roll_forward_to && entries.rbegin()->soid > info.last_backfill) {
3725 pg_log.roll_forward(rollbacker.get());
3726 }
3727 if (roll_forward_to && *roll_forward_to > pg_log.get_can_rollback_to()) {
3728 pg_log.roll_forward_to(*roll_forward_to, rollbacker.get());
3729 last_rollback_info_trimmed_to_applied = *roll_forward_to;
3730 }
3731
3732 info.last_update = pg_log.get_head();
3733
3734 if (pg_log.get_missing().num_missing() == 0) {
3735 // advance last_complete since nothing else is missing!
3736 info.last_complete = info.last_update;
3737 }
3738 info.stats.stats_invalid = info.stats.stats_invalid || invalidate_stats;
3739
3740 psdout(20) << __func__ << " trim_to bool = " << bool(trim_to)
3741 << " trim_to = " << (trim_to ? *trim_to : eversion_t()) << dendl;
3742 if (trim_to)
3743 pg_log.trim(*trim_to, info);
3744 dirty_info = true;
3745 write_if_dirty(t);
3746 return invalidate_stats;
3747 }
3748
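// Primary-only: append the entries locally via
// append_log_entries_update_missing(), mirror them into every
// acting_recovery_backfill peer's missing set and info, and rebuild
// missing_loc for the touched objects when any missing set changed.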
3749 void PeeringState::merge_new_log_entries(
3750 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
3751 ObjectStore::Transaction &t,
3752 std::optional<eversion_t> trim_to,
3753 std::optional<eversion_t> roll_forward_to)
3754 {
3755 psdout(10) << __func__ << " " << entries << dendl;
3756 ceph_assert(is_primary());
3757
3758 bool rebuild_missing = append_log_entries_update_missing(entries, t, trim_to, roll_forward_to);
3759 for (set<pg_shard_t>::const_iterator i = acting_recovery_backfill.begin();
3760 i != acting_recovery_backfill.end();
3761 ++i) {
3762 pg_shard_t peer(*i);
3763 if (peer == pg_whoami) continue;
3764 ceph_assert(peer_missing.count(peer));
3765 ceph_assert(peer_info.count(peer));
3766 pg_missing_t& pmissing(peer_missing[peer]);
3767 psdout(20) << __func__ << " peer_missing for " << peer
3768 << " = " << pmissing << dendl;
3769 pg_info_t& pinfo(peer_info[peer]);
3770 bool invalidate_stats = PGLog::append_log_entries_update_missing(
3771 pinfo.last_backfill,
3772 entries,
3773 true,
3774 NULL,
3775 pmissing,
3776 NULL,
3777 dpp);
3778 pinfo.last_update = info.last_update;
3779 pinfo.stats.stats_invalid = pinfo.stats.stats_invalid || invalidate_stats;
3780 rebuild_missing = rebuild_missing || invalidate_stats;
3781 }
3782
3783 if (!rebuild_missing) {
3784 return;
3785 }
3786
3787 for (auto &&i: entries) {
3788 missing_loc.rebuild(
3789 i.soid,
3790 pg_whoami,
3791 acting_recovery_backfill,
3792 info,
3793 pg_log.get_missing(),
3794 peer_missing,
3795 peer_info);
3796 }
3797 }
3798
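// Record one new log entry, advancing last_update (and last_complete /
// last_user_version when appropriate) before handing it to the pg log.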
3799 void PeeringState::add_log_entry(const pg_log_entry_t& e, bool applied)
3800 {
3801 // raise last_complete only if we were previously up to date
3802 if (info.last_complete == info.last_update)
3803 info.last_complete = e.version;
3804
3805 // raise last_update.
3806 ceph_assert(e.version > info.last_update);
3807 info.last_update = e.version;
3808
3809 // raise user_version, if it increased (it may not have been bumped
3810 // by all logged updates)
3811 if (e.user_version > info.last_user_version)
3812 info.last_user_version = e.user_version;
3813
3814 // log mutation
3815 pg_log.add(e, applied);
3816 psdout(10) << "add_log_entry " << e << dendl;
3817 }
3818
3819
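// Append a vector of log entries as part of transaction t. When
// transaction_applied is false we are a backfill or async recovery peer
// applying out of turn, so the log is advanced via skip_rollforward()
// instead of a real roll-forward; afterwards the log is trimmed to trim_to
// and, on replicas, min_last_complete_ondisk is updated from mlcod.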
3820 void PeeringState::append_log(
3821 const vector<pg_log_entry_t>& logv,
3822 eversion_t trim_to,
3823 eversion_t roll_forward_to,
3824 eversion_t mlcod,
3825 ObjectStore::Transaction &t,
3826 bool transaction_applied,
3827 bool async)
3828 {
3829 /* The primary has sent an info updating the history, but it may not
3830 * have arrived yet. We want to make sure that we cannot remember this
3831 * write without remembering that it happened in an interval which went
3832 * active in epoch history.last_epoch_started.
3833 */
3834 if (info.last_epoch_started != info.history.last_epoch_started) {
3835 info.history.last_epoch_started = info.last_epoch_started;
3836 }
3837 if (info.last_interval_started != info.history.last_interval_started) {
3838 info.history.last_interval_started = info.last_interval_started;
3839 }
3840 psdout(10) << "append_log " << pg_log.get_log() << " " << logv << dendl;
3841
3842 PGLog::LogEntryHandlerRef handler{pl->get_log_handler(t)};
3843 if (!transaction_applied) {
3844 /* We must be a backfill or async recovery peer, so it's ok if we apply
3845 * out-of-turn since we won't be considered when
3846 * determining a min possible last_update.
3847 *
3848 * We skip_rollforward() here, which advances the crt, without
3849 * doing an actual rollforward. This avoids cleaning up entries
3850 * from the backend, so we do not end up in a situation where the
3851 * object is deleted before we can _merge_object_divergent_entries().
3852 */
3853 pg_log.skip_rollforward();
3854 }
3855
3856 for (vector<pg_log_entry_t>::const_iterator p = logv.begin();
3857 p != logv.end();
3858 ++p) {
3859 add_log_entry(*p, transaction_applied);
3860
3861 /* We don't want to leave the rollforward artifacts around
3862 * here past last_backfill. It's ok for the same reason as
3863 * above */
3864 if (transaction_applied &&
3865 p->soid > info.last_backfill) {
3866 pg_log.roll_forward(handler.get());
3867 }
3868 }
3869 if (transaction_applied && roll_forward_to > pg_log.get_can_rollback_to()) {
3870 pg_log.roll_forward_to(
3871 roll_forward_to,
3872 handler.get());
3873 last_rollback_info_trimmed_to_applied = roll_forward_to;
3874 }
3875
3876 psdout(10) << __func__ << " approx pg log length = "
3877 << pg_log.get_log().approx_size() << dendl;
3878 psdout(10) << __func__ << " transaction_applied = "
3879 << transaction_applied << dendl;
3880 if (!transaction_applied || async)
3881 psdout(10) << __func__ << " " << pg_whoami
3882 << " is async_recovery or backfill target" << dendl;
3883 pg_log.trim(trim_to, info, transaction_applied, async);
3884
3885 // update the local pg, pg log
3886 dirty_info = true;
3887 write_if_dirty(t);
3888
3889 if (!is_primary())
3890 min_last_complete_ondisk = mlcod;
3891 }
3892
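// Note that a missing object (or its delete) has been recovered locally:
// roll the log forward past v if needed, update last_complete bookkeeping,
// and, on the primary, record ourselves as a location for the object.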
3893 void PeeringState::recover_got(
3894 const hobject_t &oid, eversion_t v,
3895 bool is_delete,
3896 ObjectStore::Transaction &t)
3897 {
3898 if (v > pg_log.get_can_rollback_to()) {
3899 /* This can only happen during a repair, and even then, it would
3900 * be one heck of a race. If we are repairing the object, the
3901 * write in question must be fully committed, so it's not valid
3902 * to roll it back anyway (and we'll be rolled forward shortly
3903 * anyway) */
3904 PGLog::LogEntryHandlerRef handler{pl->get_log_handler(t)};
3905 pg_log.roll_forward_to(v, handler.get());
3906 }
3907
3908 psdout(10) << "got missing " << oid << " v " << v << dendl;
3909 pg_log.recover_got(oid, v, info);
3910 if (pg_log.get_log().log.empty()) {
3911 psdout(10) << "last_complete now " << info.last_complete
3912 << " while log is empty" << dendl;
3913 } else if (pg_log.get_log().complete_to != pg_log.get_log().log.end()) {
3914 psdout(10) << "last_complete now " << info.last_complete
3915 << " log.complete_to " << pg_log.get_log().complete_to->version
3916 << dendl;
3917 } else {
3918 psdout(10) << "last_complete now " << info.last_complete
3919 << " log.complete_to at end" << dendl;
3920 // below is not true in the repair case.
3921 //assert(missing.num_missing() == 0); // otherwise, complete_to was wrong.
3922 ceph_assert(info.last_complete == info.last_update);
3923 }
3924
3925 if (is_primary()) {
3926 ceph_assert(missing_loc.needs_recovery(oid));
3927 if (!is_delete)
3928 missing_loc.add_location(oid, pg_whoami);
3929 }
3930
3931 // update pg
3932 dirty_info = true;
3933 write_if_dirty(t);
3934 }
3935
3936 void PeeringState::update_backfill_progress(
3937 const hobject_t &updated_backfill,
3938 const pg_stat_t &updated_stats,
3939 bool preserve_local_num_bytes,
3940 ObjectStore::Transaction &t) {
3941 info.set_last_backfill(updated_backfill);
3942 if (preserve_local_num_bytes) {
3943 psdout(25) << __func__ << " primary " << updated_stats.stats.sum.num_bytes
3944 << " local " << info.stats.stats.sum.num_bytes << dendl;
3945 int64_t bytes = info.stats.stats.sum.num_bytes;
3946 info.stats = updated_stats;
3947 info.stats.stats.sum.num_bytes = bytes;
3948 } else {
3949 psdout(20) << __func__ << " final " << updated_stats.stats.sum.num_bytes
3950 << " replaces local " << info.stats.stats.sum.num_bytes << dendl;
3951 info.stats = updated_stats;
3952 }
3953
3954 dirty_info = true;
3955 write_if_dirty(t);
3956 }
3957
3958 void PeeringState::adjust_purged_snaps(
3959 std::function<void(interval_set<snapid_t> &snaps)> f) {
3960 f(info.purged_snaps);
3961 dirty_info = true;
3962 dirty_big_info = true;
3963 }
3964
3965 void PeeringState::on_peer_recover(
3966 pg_shard_t peer,
3967 const hobject_t &soid,
3968 const eversion_t &version)
3969 {
3970 pl->publish_stats_to_osd();
3971 // done!
3972 peer_missing[peer].got(soid, version);
3973 missing_loc.add_location(soid, peer);
3974 }
3975
3976 void PeeringState::begin_peer_recover(
3977 pg_shard_t peer,
3978 const hobject_t soid)
3979 {
3980 peer_missing[peer].revise_have(soid, eversion_t());
3981 }
3982
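// Forcibly mark soid missing at 'version' on each of the given peers (the
// primary's own copy goes through pg_log.missing_add() instead), then
// rebuild missing_loc for the object from the updated missing sets.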
3983 void PeeringState::force_object_missing(
3984 const set<pg_shard_t> &peers,
3985 const hobject_t &soid,
3986 eversion_t version)
3987 {
3988 for (auto &&peer : peers) {
3989 if (peer != primary) {
3990 peer_missing[peer].add(soid, version, eversion_t(), false);
3991 } else {
3992 pg_log.missing_add(soid, version, eversion_t());
3993 pg_log.reset_complete_to(&info);
3994 pg_log.set_last_requested(0);
3995 }
3996 }
3997
3998 missing_loc.rebuild(
3999 soid,
4000 pg_whoami,
4001 acting_recovery_backfill,
4002 info,
4003 pg_log.get_missing(),
4004 peer_missing,
4005 peer_info);
4006 }
4007
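// Called before an op is submitted: optimistically advance last_update /
// last_complete for the other acting_recovery_backfill peers, and for any
// async recovery target still missing hoid, fold the new log entries into
// its missing set and recompute missing_loc for the affected objects.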
4008 void PeeringState::pre_submit_op(
4009 const hobject_t &hoid,
4010 const vector<pg_log_entry_t>& logv,
4011 eversion_t at_version)
4012 {
4013 if (at_version > eversion_t()) {
4014 for (auto &&i : get_acting_recovery_backfill()) {
4015 if (i == primary) continue;
4016 pg_info_t &pinfo = peer_info[i];
4017 // keep peer_info up to date
4018 if (pinfo.last_complete == pinfo.last_update)
4019 pinfo.last_complete = at_version;
4020 pinfo.last_update = at_version;
4021 }
4022 }
4023
4024 bool requires_missing_loc = false;
4025 for (auto &&i : get_async_recovery_targets()) {
4026 if (i == primary || !get_peer_missing(i).is_missing(hoid))
4027 continue;
4028 requires_missing_loc = true;
4029 for (auto &&entry: logv) {
4030 peer_missing[i].add_next_event(entry);
4031 }
4032 }
4033
4034 if (requires_missing_loc) {
4035 for (auto &&entry: logv) {
4036 psdout(30) << __func__ << " missing_loc before: "
4037 << missing_loc.get_locations(entry.soid) << dendl;
4038 missing_loc.add_missing(entry.soid, entry.version,
4039 eversion_t(), entry.is_delete());
4040 // clear out missing_loc
4041 missing_loc.clear_location(entry.soid);
4042 for (auto &i: get_actingset()) {
4043 if (!get_peer_missing(i).is_missing(entry.soid))
4044 missing_loc.add_location(entry.soid, i);
4045 }
4046 psdout(30) << __func__ << " missing_loc after: "
4047 << missing_loc.get_locations(entry.soid) << dendl;
4048 }
4049 }
4050 }
4051
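// A recovery write up to 'version' is now on disk. If that brings us fully
// up to date: replicas/backfill targets report it to the primary with
// MOSDPGTrim, while the primary recomputes min_last_complete_ondisk.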
4052 void PeeringState::recovery_committed_to(eversion_t version)
4053 {
4054 psdout(10) << __func__ << " version " << version
4055 << " now ondisk" << dendl;
4056 last_complete_ondisk = version;
4057
4058 if (last_complete_ondisk == info.last_update) {
4059 if (!is_primary()) {
4060 // Either we are a replica or a backfill target.
4061 // We are fully up to date; tell the primary!
4062 pl->send_cluster_message(
4063 get_primary().osd,
4064 new MOSDPGTrim(
4065 get_osdmap_epoch(),
4066 spg_t(info.pgid.pgid, primary.shard),
4067 last_complete_ondisk),
4068 get_osdmap_epoch());
4069 } else {
4070 calc_min_last_complete_ondisk();
4071 }
4072 }
4073 }
4074
4075 void PeeringState::complete_write(eversion_t v, eversion_t lc)
4076 {
4077 last_update_ondisk = v;
4078 last_complete_ondisk = lc;
4079 calc_min_last_complete_ondisk();
4080 }
4081
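// Decide how far the pg log can be trimmed (pg_trim_to). Trimming is
// bounded by min(min_last_complete_ondisk, can_rollback_to) and proceeds
// only once the log exceeds the listener's target length, removing at most
// osd_pg_log_trim_max (and at least osd_pg_log_trim_min) entries per call;
// e.g. with target=3000, a log of 4000 entries and osd_pg_log_trim_max=500,
// num_to_trim = min(4000 - 3000, 500) = 500.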
4082 void PeeringState::calc_trim_to()
4083 {
4084 size_t target = pl->get_target_pg_log_entries();
4085
4086 eversion_t limit = std::min(
4087 min_last_complete_ondisk,
4088 pg_log.get_can_rollback_to());
4089 if (limit != eversion_t() &&
4090 limit != pg_trim_to &&
4091 pg_log.get_log().approx_size() > target) {
4092 size_t num_to_trim = std::min(pg_log.get_log().approx_size() - target,
4093 cct->_conf->osd_pg_log_trim_max);
4094 if (num_to_trim < cct->_conf->osd_pg_log_trim_min &&
4095 cct->_conf->osd_pg_log_trim_max >= cct->_conf->osd_pg_log_trim_min) {
4096 return;
4097 }
4098 list<pg_log_entry_t>::const_iterator it = pg_log.get_log().log.begin();
4099 eversion_t new_trim_to;
4100 for (size_t i = 0; i < num_to_trim; ++i) {
4101 new_trim_to = it->version;
4102 ++it;
4103 if (new_trim_to > limit) {
4104 new_trim_to = limit;
4105 psdout(10) << "calc_trim_to trimming to min_last_complete_ondisk" << dendl;
4106 break;
4107 }
4108 }
4109 psdout(10) << "calc_trim_to " << pg_trim_to << " -> " << new_trim_to << dendl;
4110 pg_trim_to = new_trim_to;
4111 assert(pg_trim_to <= pg_log.get_head());
4112 assert(pg_trim_to <= min_last_complete_ondisk);
4113 }
4114 }
4115
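// Like calc_trim_to(), but trims more aggressively: the bound is
// min(head, can_rollback_to, last_update_ondisk), keeping at least
// 'target' of the newest entries while trimming at most num_to_trim of the
// oldest.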
4116 void PeeringState::calc_trim_to_aggressive()
4117 {
4118 size_t target = pl->get_target_pg_log_entries();
4119
4120 // limit pg log trimming up to the can_rollback_to value
4121 eversion_t limit = std::min({
4122 pg_log.get_head(),
4123 pg_log.get_can_rollback_to(),
4124 last_update_ondisk});
4125 psdout(10) << __func__ << " limit = " << limit << dendl;
4126
4127 if (limit != eversion_t() &&
4128 limit != pg_trim_to &&
4129 pg_log.get_log().approx_size() > target) {
4130 psdout(10) << __func__ << " approx pg log length = "
4131 << pg_log.get_log().approx_size() << dendl;
4132 uint64_t num_to_trim = std::min<uint64_t>(pg_log.get_log().approx_size() - target,
4133 cct->_conf->osd_pg_log_trim_max);
4134 psdout(10) << __func__ << " num_to_trim = " << num_to_trim << dendl;
4135 if (num_to_trim < cct->_conf->osd_pg_log_trim_min &&
4136 cct->_conf->osd_pg_log_trim_max >= cct->_conf->osd_pg_log_trim_min) {
4137 return;
4138 }
4139 auto it = pg_log.get_log().log.begin(); // oldest log entry
4140 auto rit = pg_log.get_log().log.rbegin();
4141 eversion_t by_n_to_keep; // found walking back from the head
4142 eversion_t by_n_to_trim = eversion_t::max(); // found walking forward from the tail
4143 for (size_t i = 0; it != pg_log.get_log().log.end(); ++it, ++rit) {
4144 i++;
4145 if (i > target && by_n_to_keep == eversion_t()) {
4146 by_n_to_keep = rit->version;
4147 }
4148 if (i >= num_to_trim && by_n_to_trim == eversion_t::max()) {
4149 by_n_to_trim = it->version;
4150 }
4151 if (by_n_to_keep != eversion_t() &&
4152 by_n_to_trim != eversion_t::max()) {
4153 break;
4154 }
4155 }
4156
4157 if (by_n_to_keep == eversion_t()) {
4158 return;
4159 }
4160
4161 pg_trim_to = std::min({by_n_to_keep, by_n_to_trim, limit});
4162 psdout(10) << __func__ << " pg_trim_to now " << pg_trim_to << dendl;
4163 ceph_assert(pg_trim_to <= pg_log.get_head());
4164 }
4165 }
4166
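// Fold an op's stat delta into our own stats, and into the stats of every
// backfill target whose last_backfill already covers the object.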
4167 void PeeringState::apply_op_stats(
4168 const hobject_t &soid,
4169 const object_stat_sum_t &delta_stats)
4170 {
4171 info.stats.stats.add(delta_stats);
4172 info.stats.stats.floor(0);
4173
4174 for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
4175 i != get_backfill_targets().end();
4176 ++i) {
4177 pg_shard_t bt = *i;
4178 pg_info_t& pinfo = peer_info[bt];
4179 if (soid <= pinfo.last_backfill)
4180 pinfo.stats.stats.add(delta_stats);
4181 }
4182 }
4183
4184 void PeeringState::update_complete_backfill_object_stats(
4185 const hobject_t &hoid,
4186 const pg_stat_t &stats)
4187 {
4188 for (auto &&bt: get_backfill_targets()) {
4189 pg_info_t& pinfo = peer_info[bt];
4190 // Add stats to all peers that were missing the object
4191 if (hoid > pinfo.last_backfill)
4192 pinfo.stats.add(stats);
4193 }
4194 }
4195
4196 void PeeringState::update_peer_last_backfill(
4197 pg_shard_t peer,
4198 const hobject_t &new_last_backfill)
4199 {
4200 pg_info_t &pinfo = peer_info[peer];
4201 pinfo.last_backfill = new_last_backfill;
4202 if (new_last_backfill.is_max()) {
4203 /* pinfo.stats might be wrong if we did log-based recovery on the
4204 * backfilled portion in addition to continuing backfill.
4205 */
4206 pinfo.stats = info.stats;
4207 }
4208 }
4209
4210 void PeeringState::set_revert_with_targets(
4211 const hobject_t &soid,
4212 const set<pg_shard_t> &good_peers)
4213 {
4214 for (auto &&peer: good_peers) {
4215 missing_loc.add_location(soid, peer);
4216 }
4217 }
4218
4219 void PeeringState::prepare_backfill_for_missing(
4220 const hobject_t &soid,
4221 const eversion_t &version,
4222 const vector<pg_shard_t> &targets) {
4223 for (auto &&peer: targets) {
4224 peer_missing[peer].add(soid, version, eversion_t(), false);
4225 }
4226 }
4227
4228 void PeeringState::update_hset(const pg_hit_set_history_t &hset_history)
4229 {
4230 info.hit_set = hset_history;
4231 }
4232
4233 /*------------ Peering State Machine----------------*/
4234 #undef dout_prefix
4235 #define dout_prefix (context< PeeringMachine >().dpp->gen_prefix(*_dout) \
4236 << "state<" << get_state_name() << ">: ")
4237 #undef psdout
4238 #define psdout(x) ldout(context< PeeringMachine >().cct, x)
4239
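// Convenience macro for the state handlers below: pulls the PeeringState
// and PeeringListener out of the enclosing PeeringMachine as 'ps' and 'pl';
// the std::ignore assignments silence unused-variable warnings in handlers
// that only need one of them.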
4240 #define DECLARE_LOCALS \
4241 PeeringState *ps = context< PeeringMachine >().state; \
4242 std::ignore = ps; \
4243 PeeringListener *pl = context< PeeringMachine >().pl; \
4244 std::ignore = pl
4245
4246
4247 /*------Crashed-------*/
4248 PeeringState::Crashed::Crashed(my_context ctx)
4249 : my_base(ctx),
4250 NamedState(context< PeeringMachine >().state_history, "Crashed")
4251 {
4252 context< PeeringMachine >().log_enter(state_name);
4253 ceph_abort_msg("we got a bad state machine event");
4254 }
4255
4256
4257 /*------Initial-------*/
4258 PeeringState::Initial::Initial(my_context ctx)
4259 : my_base(ctx),
4260 NamedState(context< PeeringMachine >().state_history, "Initial")
4261 {
4262 context< PeeringMachine >().log_enter(state_name);
4263 }
4264
4265 boost::statechart::result PeeringState::Initial::react(const MNotifyRec& notify)
4266 {
4267 DECLARE_LOCALS;
4268 ps->proc_replica_info(
4269 notify.from, notify.notify.info, notify.notify.epoch_sent);
4270 ps->set_last_peering_reset();
4271 return transit< Primary >();
4272 }
4273
4274 boost::statechart::result PeeringState::Initial::react(const MInfoRec& i)
4275 {
4276 DECLARE_LOCALS;
4277 ceph_assert(!ps->is_primary());
4278 post_event(i);
4279 return transit< Stray >();
4280 }
4281
4282 boost::statechart::result PeeringState::Initial::react(const MLogRec& i)
4283 {
4284 DECLARE_LOCALS;
4285 ceph_assert(!ps->is_primary());
4286 post_event(i);
4287 return transit< Stray >();
4288 }
4289
4290 void PeeringState::Initial::exit()
4291 {
4292 context< PeeringMachine >().log_exit(state_name, enter_time);
4293 DECLARE_LOCALS;
4294 utime_t dur = ceph_clock_now() - enter_time;
4295 pl->get_peering_perf().tinc(rs_initial_latency, dur);
4296 }
4297
4298 /*------Started-------*/
4299 PeeringState::Started::Started(my_context ctx)
4300 : my_base(ctx),
4301 NamedState(context< PeeringMachine >().state_history, "Started")
4302 {
4303 context< PeeringMachine >().log_enter(state_name);
4304 }
4305
4306 boost::statechart::result
4307 PeeringState::Started::react(const IntervalFlush&)
4308 {
4309 psdout(10) << "Ending blocked outgoing recovery messages" << dendl;
4310 context< PeeringMachine >().state->end_block_outgoing();
4311 return discard_event();
4312 }
4313
4314 boost::statechart::result PeeringState::Started::react(const AdvMap& advmap)
4315 {
4316 DECLARE_LOCALS;
4317 psdout(10) << "Started advmap" << dendl;
4318 ps->check_full_transition(advmap.lastmap, advmap.osdmap);
4319 if (ps->should_restart_peering(
4320 advmap.up_primary,
4321 advmap.acting_primary,
4322 advmap.newup,
4323 advmap.newacting,
4324 advmap.lastmap,
4325 advmap.osdmap)) {
4326 psdout(10) << "should_restart_peering, transitioning to Reset"
4327 << dendl;
4328 post_event(advmap);
4329 return transit< Reset >();
4330 }
4331 ps->remove_down_peer_info(advmap.osdmap);
4332 return discard_event();
4333 }
4334
4335 boost::statechart::result PeeringState::Started::react(const QueryState& q)
4336 {
4337 q.f->open_object_section("state");
4338 q.f->dump_string("name", state_name);
4339 q.f->dump_stream("enter_time") << enter_time;
4340 q.f->close_section();
4341 return discard_event();
4342 }
4343
4344 void PeeringState::Started::exit()
4345 {
4346 context< PeeringMachine >().log_exit(state_name, enter_time);
4347 DECLARE_LOCALS;
4348 utime_t dur = ceph_clock_now() - enter_time;
4349 pl->get_peering_perf().tinc(rs_started_latency, dur);
4350 ps->state_clear(PG_STATE_WAIT | PG_STATE_LAGGY);
4351 }
4352
4353 /*--------Reset---------*/
4354 PeeringState::Reset::Reset(my_context ctx)
4355 : my_base(ctx),
4356 NamedState(context< PeeringMachine >().state_history, "Reset")
4357 {
4358 context< PeeringMachine >().log_enter(state_name);
4359 DECLARE_LOCALS;
4360
4361 ps->flushes_in_progress = 0;
4362 ps->set_last_peering_reset();
4363 ps->log_weirdness();
4364 }
4365
4366 boost::statechart::result
4367 PeeringState::Reset::react(const IntervalFlush&)
4368 {
4369 psdout(10) << "Ending blocked outgoing recovery messages" << dendl;
4370 context< PeeringMachine >().state->end_block_outgoing();
4371 return discard_event();
4372 }
4373
4374 boost::statechart::result PeeringState::Reset::react(const AdvMap& advmap)
4375 {
4376 DECLARE_LOCALS;
4377 psdout(10) << "Reset advmap" << dendl;
4378
4379 ps->check_full_transition(advmap.lastmap, advmap.osdmap);
4380
4381 if (ps->should_restart_peering(
4382 advmap.up_primary,
4383 advmap.acting_primary,
4384 advmap.newup,
4385 advmap.newacting,
4386 advmap.lastmap,
4387 advmap.osdmap)) {
4388 psdout(10) << "should restart peering, calling start_peering_interval again"
4389 << dendl;
4390 ps->start_peering_interval(
4391 advmap.lastmap,
4392 advmap.newup, advmap.up_primary,
4393 advmap.newacting, advmap.acting_primary,
4394 context< PeeringMachine >().get_cur_transaction());
4395 }
4396 ps->remove_down_peer_info(advmap.osdmap);
4397 ps->check_past_interval_bounds();
4398 return discard_event();
4399 }
4400
4401 boost::statechart::result PeeringState::Reset::react(const ActMap&)
4402 {
4403 DECLARE_LOCALS;
4404 if (ps->should_send_notify() && ps->get_primary().osd >= 0) {
4405 ps->info.history.refresh_prior_readable_until_ub(
4406 pl->get_mnow(),
4407 ps->prior_readable_until_ub);
4408 context< PeeringMachine >().send_notify(
4409 ps->get_primary().osd,
4410 pg_notify_t(
4411 ps->get_primary().shard, ps->pg_whoami.shard,
4412 ps->get_osdmap_epoch(),
4413 ps->get_osdmap_epoch(),
4414 ps->info,
4415 ps->past_intervals));
4416 }
4417
4418 ps->update_heartbeat_peers();
4419
4420 return transit< Started >();
4421 }
4422
4423 boost::statechart::result PeeringState::Reset::react(const QueryState& q)
4424 {
4425 q.f->open_object_section("state");
4426 q.f->dump_string("name", state_name);
4427 q.f->dump_stream("enter_time") << enter_time;
4428 q.f->close_section();
4429 return discard_event();
4430 }
4431
4432 void PeeringState::Reset::exit()
4433 {
4434 context< PeeringMachine >().log_exit(state_name, enter_time);
4435 DECLARE_LOCALS;
4436 utime_t dur = ceph_clock_now() - enter_time;
4437 pl->get_peering_perf().tinc(rs_reset_latency, dur);
4438 }
4439
4440 /*-------Start---------*/
4441 PeeringState::Start::Start(my_context ctx)
4442 : my_base(ctx),
4443 NamedState(context< PeeringMachine >().state_history, "Start")
4444 {
4445 context< PeeringMachine >().log_enter(state_name);
4446
4447 DECLARE_LOCALS;
4448 if (ps->is_primary()) {
4449 psdout(1) << "transitioning to Primary" << dendl;
4450 post_event(MakePrimary());
4451 } else { //is_stray
4452 psdout(1) << "transitioning to Stray" << dendl;
4453 post_event(MakeStray());
4454 }
4455 }
4456
4457 void PeeringState::Start::exit()
4458 {
4459 context< PeeringMachine >().log_exit(state_name, enter_time);
4460 DECLARE_LOCALS;
4461 utime_t dur = ceph_clock_now() - enter_time;
4462 pl->get_peering_perf().tinc(rs_start_latency, dur);
4463 }
4464
4465 /*---------Primary--------*/
4466 PeeringState::Primary::Primary(my_context ctx)
4467 : my_base(ctx),
4468 NamedState(context< PeeringMachine >().state_history, "Started/Primary")
4469 {
4470 context< PeeringMachine >().log_enter(state_name);
4471 DECLARE_LOCALS;
4472 ceph_assert(ps->want_acting.empty());
4473
4474 // set CREATING bit until we have peered for the first time.
4475 if (ps->info.history.last_epoch_started == 0) {
4476 ps->state_set(PG_STATE_CREATING);
4477 // use the history timestamp, which ultimately comes from the
4478 // monitor in the create case.
4479 utime_t t = ps->info.history.last_scrub_stamp;
4480 ps->info.stats.last_fresh = t;
4481 ps->info.stats.last_active = t;
4482 ps->info.stats.last_change = t;
4483 ps->info.stats.last_peered = t;
4484 ps->info.stats.last_clean = t;
4485 ps->info.stats.last_unstale = t;
4486 ps->info.stats.last_undegraded = t;
4487 ps->info.stats.last_fullsized = t;
4488 ps->info.stats.last_scrub_stamp = t;
4489 ps->info.stats.last_deep_scrub_stamp = t;
4490 ps->info.stats.last_clean_scrub_stamp = t;
4491 }
4492 }
4493
4494 boost::statechart::result PeeringState::Primary::react(const MNotifyRec& notevt)
4495 {
4496 DECLARE_LOCALS;
4497 psdout(7) << "handle_pg_notify from osd." << notevt.from << dendl;
4498 ps->proc_replica_info(
4499 notevt.from, notevt.notify.info, notevt.notify.epoch_sent);
4500 return discard_event();
4501 }
4502
4503 boost::statechart::result PeeringState::Primary::react(const ActMap&)
4504 {
4505 DECLARE_LOCALS;
4506 psdout(7) << "handle ActMap primary" << dendl;
4507 pl->publish_stats_to_osd();
4508 return discard_event();
4509 }
4510
4511 boost::statechart::result PeeringState::Primary::react(
4512 const SetForceRecovery&)
4513 {
4514 DECLARE_LOCALS;
4515 ps->set_force_recovery(true);
4516 return discard_event();
4517 }
4518
4519 boost::statechart::result PeeringState::Primary::react(
4520 const UnsetForceRecovery&)
4521 {
4522 DECLARE_LOCALS;
4523 ps->set_force_recovery(false);
4524 return discard_event();
4525 }
4526
4527 boost::statechart::result PeeringState::Primary::react(
4528 const RequestScrub& evt)
4529 {
4530 DECLARE_LOCALS;
4531 if (ps->is_primary()) {
4532 pl->scrub_requested(evt.deep, evt.repair);
4533 psdout(10) << "marking for scrub" << dendl;
4534 }
4535 return discard_event();
4536 }
4537
4538 boost::statechart::result PeeringState::Primary::react(
4539 const SetForceBackfill&)
4540 {
4541 DECLARE_LOCALS;
4542 ps->set_force_backfill(true);
4543 return discard_event();
4544 }
4545
4546 boost::statechart::result PeeringState::Primary::react(
4547 const UnsetForceBackfill&)
4548 {
4549 DECLARE_LOCALS;
4550 ps->set_force_backfill(false);
4551 return discard_event();
4552 }
4553
4554 void PeeringState::Primary::exit()
4555 {
4556 context< PeeringMachine >().log_exit(state_name, enter_time);
4557 DECLARE_LOCALS;
4558 ps->want_acting.clear();
4559 utime_t dur = ceph_clock_now() - enter_time;
4560 pl->get_peering_perf().tinc(rs_primary_latency, dur);
4561 pl->clear_primary_state();
4562 ps->state_clear(PG_STATE_CREATING);
4563 }
4564
4565 /*---------Peering--------*/
4566 PeeringState::Peering::Peering(my_context ctx)
4567 : my_base(ctx),
4568 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering"),
4569 history_les_bound(false)
4570 {
4571 context< PeeringMachine >().log_enter(state_name);
4572 DECLARE_LOCALS;
4573
4574 ceph_assert(!ps->is_peered());
4575 ceph_assert(!ps->is_peering());
4576 ceph_assert(ps->is_primary());
4577 ps->state_set(PG_STATE_PEERING);
4578 }
4579
4580 boost::statechart::result PeeringState::Peering::react(const AdvMap& advmap)
4581 {
4582 DECLARE_LOCALS;
4583 psdout(10) << "Peering advmap" << dendl;
4584 if (prior_set.affected_by_map(*(advmap.osdmap), ps->dpp)) {
4585 psdout(1) << "Peering, affected_by_map, going to Reset" << dendl;
4586 post_event(advmap);
4587 return transit< Reset >();
4588 }
4589
4590 ps->adjust_need_up_thru(advmap.osdmap);
4591 ps->check_prior_readable_down_osds(advmap.osdmap);
4592
4593 return forward_event();
4594 }
4595
4596 boost::statechart::result PeeringState::Peering::react(const QueryState& q)
4597 {
4598 DECLARE_LOCALS;
4599
4600 q.f->open_object_section("state");
4601 q.f->dump_string("name", state_name);
4602 q.f->dump_stream("enter_time") << enter_time;
4603
4604 q.f->open_array_section("past_intervals");
4605 ps->past_intervals.dump(q.f);
4606 q.f->close_section();
4607
4608 q.f->open_array_section("probing_osds");
4609 for (set<pg_shard_t>::iterator p = prior_set.probe.begin();
4610 p != prior_set.probe.end();
4611 ++p)
4612 q.f->dump_stream("osd") << *p;
4613 q.f->close_section();
4614
4615 if (prior_set.pg_down)
4616 q.f->dump_string("blocked", "peering is blocked due to down osds");
4617
4618 q.f->open_array_section("down_osds_we_would_probe");
4619 for (set<int>::iterator p = prior_set.down.begin();
4620 p != prior_set.down.end();
4621 ++p)
4622 q.f->dump_int("osd", *p);
4623 q.f->close_section();
4624
4625 q.f->open_array_section("peering_blocked_by");
4626 for (map<int,epoch_t>::iterator p = prior_set.blocked_by.begin();
4627 p != prior_set.blocked_by.end();
4628 ++p) {
4629 q.f->open_object_section("osd");
4630 q.f->dump_int("osd", p->first);
4631 q.f->dump_int("current_lost_at", p->second);
4632 q.f->dump_string("comment", "starting or marking this osd lost may let us proceed");
4633 q.f->close_section();
4634 }
4635 q.f->close_section();
4636
4637 if (history_les_bound) {
4638 q.f->open_array_section("peering_blocked_by_detail");
4639 q.f->open_object_section("item");
4640 q.f->dump_string("detail","peering_blocked_by_history_les_bound");
4641 q.f->close_section();
4642 q.f->close_section();
4643 }
4644
4645 q.f->close_section();
4646 return forward_event();
4647 }
4648
4649 void PeeringState::Peering::exit()
4650 {
4651
4652 DECLARE_LOCALS;
4653 psdout(10) << "Leaving Peering" << dendl;
4654 context< PeeringMachine >().log_exit(state_name, enter_time);
4655 ps->state_clear(PG_STATE_PEERING);
4656 pl->clear_probe_targets();
4657
4658 utime_t dur = ceph_clock_now() - enter_time;
4659 pl->get_peering_perf().tinc(rs_peering_latency, dur);
4660 }
4661
4662
4663 /*------Backfilling-------*/
4664 PeeringState::Backfilling::Backfilling(my_context ctx)
4665 : my_base(ctx),
4666 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/Backfilling")
4667 {
4668 context< PeeringMachine >().log_enter(state_name);
4669
4670
4671 DECLARE_LOCALS;
4672 ps->backfill_reserved = true;
4673 pl->on_backfill_reserved();
4674 ps->state_clear(PG_STATE_BACKFILL_TOOFULL);
4675 ps->state_clear(PG_STATE_BACKFILL_WAIT);
4676 ps->state_set(PG_STATE_BACKFILLING);
4677 pl->publish_stats_to_osd();
4678 }
4679
4680 void PeeringState::Backfilling::backfill_release_reservations()
4681 {
4682 DECLARE_LOCALS;
4683 pl->cancel_local_background_io_reservation();
4684 for (set<pg_shard_t>::iterator it = ps->backfill_targets.begin();
4685 it != ps->backfill_targets.end();
4686 ++it) {
4687 ceph_assert(*it != ps->pg_whoami);
4688 pl->send_cluster_message(
4689 it->osd,
4690 new MBackfillReserve(
4691 MBackfillReserve::RELEASE,
4692 spg_t(ps->info.pgid.pgid, it->shard),
4693 ps->get_osdmap_epoch()),
4694 ps->get_osdmap_epoch());
4695 }
4696 }
4697
4698 void PeeringState::Backfilling::cancel_backfill()
4699 {
4700 DECLARE_LOCALS;
4701 backfill_release_reservations();
4702 pl->on_backfill_canceled();
4703 }
4704
4705 boost::statechart::result
4706 PeeringState::Backfilling::react(const Backfilled &c)
4707 {
4708 backfill_release_reservations();
4709 return transit<Recovered>();
4710 }
4711
4712 boost::statechart::result
4713 PeeringState::Backfilling::react(const DeferBackfill &c)
4714 {
4715 DECLARE_LOCALS;
4716
4717 psdout(10) << "defer backfill, retry delay " << c.delay << dendl;
4718 ps->state_set(PG_STATE_BACKFILL_WAIT);
4719 ps->state_clear(PG_STATE_BACKFILLING);
4720 cancel_backfill();
4721
4722 pl->schedule_event_after(
4723 std::make_shared<PGPeeringEvent>(
4724 ps->get_osdmap_epoch(),
4725 ps->get_osdmap_epoch(),
4726 RequestBackfill()),
4727 c.delay);
4728 return transit<NotBackfilling>();
4729 }
4730
4731 boost::statechart::result
4732 PeeringState::Backfilling::react(const UnfoundBackfill &c)
4733 {
4734 DECLARE_LOCALS;
4735 psdout(10) << "backfill has unfound, can't continue" << dendl;
4736 ps->state_set(PG_STATE_BACKFILL_UNFOUND);
4737 ps->state_clear(PG_STATE_BACKFILLING);
4738 cancel_backfill();
4739 return transit<NotBackfilling>();
4740 }
4741
4742 boost::statechart::result
4743 PeeringState::Backfilling::react(const RemoteReservationRevokedTooFull &)
4744 {
4745 DECLARE_LOCALS;
4746
4747 ps->state_set(PG_STATE_BACKFILL_TOOFULL);
4748 ps->state_clear(PG_STATE_BACKFILLING);
4749 cancel_backfill();
4750
4751 pl->schedule_event_after(
4752 std::make_shared<PGPeeringEvent>(
4753 ps->get_osdmap_epoch(),
4754 ps->get_osdmap_epoch(),
4755 RequestBackfill()),
4756 ps->cct->_conf->osd_backfill_retry_interval);
4757
4758 return transit<NotBackfilling>();
4759 }
4760
4761 boost::statechart::result
4762 PeeringState::Backfilling::react(const RemoteReservationRevoked &)
4763 {
4764 DECLARE_LOCALS;
4765 ps->state_set(PG_STATE_BACKFILL_WAIT);
4766 cancel_backfill();
4767 if (ps->needs_backfill()) {
4768 return transit<WaitLocalBackfillReserved>();
4769 } else {
4770 // raced with MOSDPGBackfill::OP_BACKFILL_FINISH, ignore
4771 return discard_event();
4772 }
4773 }
4774
4775 void PeeringState::Backfilling::exit()
4776 {
4777 context< PeeringMachine >().log_exit(state_name, enter_time);
4778 DECLARE_LOCALS;
4779 ps->backfill_reserved = false;
4780 ps->state_clear(PG_STATE_BACKFILLING);
4781 ps->state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
4782 utime_t dur = ceph_clock_now() - enter_time;
4783 pl->get_peering_perf().tinc(rs_backfilling_latency, dur);
4784 }
4785
4786 /*--WaitRemoteBackfillReserved--*/
4787
4788 PeeringState::WaitRemoteBackfillReserved::WaitRemoteBackfillReserved(my_context ctx)
4789 : my_base(ctx),
4790 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/WaitRemoteBackfillReserved"),
4791 backfill_osd_it(context< Active >().remote_shards_to_reserve_backfill.begin())
4792 {
4793 context< PeeringMachine >().log_enter(state_name);
4794 DECLARE_LOCALS;
4795
4796 ps->state_set(PG_STATE_BACKFILL_WAIT);
4797 pl->publish_stats_to_osd();
4798 post_event(RemoteBackfillReserved());
4799 }
4800
4801 boost::statechart::result
4802 PeeringState::WaitRemoteBackfillReserved::react(const RemoteBackfillReserved &evt)
4803 {
4804 DECLARE_LOCALS;
4805
4806 int64_t num_bytes = ps->info.stats.stats.sum.num_bytes;
4807 psdout(10) << __func__ << " num_bytes " << num_bytes << dendl;
4808 if (backfill_osd_it !=
4809 context< Active >().remote_shards_to_reserve_backfill.end()) {
4810 // The primary never backfills itself
4811 ceph_assert(*backfill_osd_it != ps->pg_whoami);
4812 pl->send_cluster_message(
4813 backfill_osd_it->osd,
4814 new MBackfillReserve(
4815 MBackfillReserve::REQUEST,
4816 spg_t(context< PeeringMachine >().spgid.pgid, backfill_osd_it->shard),
4817 ps->get_osdmap_epoch(),
4818 ps->get_backfill_priority(),
4819 num_bytes,
4820 ps->peer_bytes[*backfill_osd_it]),
4821 ps->get_osdmap_epoch());
4822 ++backfill_osd_it;
4823 } else {
4824 ps->peer_bytes.clear();
4825 post_event(AllBackfillsReserved());
4826 }
4827 return discard_event();
4828 }
4829
4830 void PeeringState::WaitRemoteBackfillReserved::exit()
4831 {
4832 context< PeeringMachine >().log_exit(state_name, enter_time);
4833 DECLARE_LOCALS;
4834
4835 utime_t dur = ceph_clock_now() - enter_time;
4836 pl->get_peering_perf().tinc(rs_waitremotebackfillreserved_latency, dur);
4837 }
4838
4839 void PeeringState::WaitRemoteBackfillReserved::retry()
4840 {
4841 DECLARE_LOCALS;
4842 pl->cancel_local_background_io_reservation();
4843
4844 // Send RELEASE to all previously acquired reservations
4845 set<pg_shard_t>::const_iterator it, begin, end;
4846 begin = context< Active >().remote_shards_to_reserve_backfill.begin();
4847 end = context< Active >().remote_shards_to_reserve_backfill.end();
4848 ceph_assert(begin != end);
4849 for (it = begin; it != backfill_osd_it; ++it) {
4850 // The primary never backfills itself
4851 ceph_assert(*it != ps->pg_whoami);
4852 pl->send_cluster_message(
4853 it->osd,
4854 new MBackfillReserve(
4855 MBackfillReserve::RELEASE,
4856 spg_t(context< PeeringMachine >().spgid.pgid, it->shard),
4857 ps->get_osdmap_epoch()),
4858 ps->get_osdmap_epoch());
4859 }
4860
4861 ps->state_clear(PG_STATE_BACKFILL_WAIT);
4862 pl->publish_stats_to_osd();
4863
4864 pl->schedule_event_after(
4865 std::make_shared<PGPeeringEvent>(
4866 ps->get_osdmap_epoch(),
4867 ps->get_osdmap_epoch(),
4868 RequestBackfill()),
4869 ps->cct->_conf->osd_backfill_retry_interval);
4870 }
4871
4872 boost::statechart::result
4873 PeeringState::WaitRemoteBackfillReserved::react(const RemoteReservationRejectedTooFull &evt)
4874 {
4875 DECLARE_LOCALS;
4876 ps->state_set(PG_STATE_BACKFILL_TOOFULL);
4877 retry();
4878 return transit<NotBackfilling>();
4879 }
4880
4881 boost::statechart::result
4882 PeeringState::WaitRemoteBackfillReserved::react(const RemoteReservationRevoked &evt)
4883 {
4884 retry();
4885 return transit<NotBackfilling>();
4886 }
4887
4888 /*--WaitLocalBackfillReserved--*/
4889 PeeringState::WaitLocalBackfillReserved::WaitLocalBackfillReserved(my_context ctx)
4890 : my_base(ctx),
4891 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/WaitLocalBackfillReserved")
4892 {
4893 context< PeeringMachine >().log_enter(state_name);
4894 DECLARE_LOCALS;
4895
4896 ps->state_set(PG_STATE_BACKFILL_WAIT);
4897 pl->request_local_background_io_reservation(
4898 ps->get_backfill_priority(),
4899 std::make_shared<PGPeeringEvent>(
4900 ps->get_osdmap_epoch(),
4901 ps->get_osdmap_epoch(),
4902 LocalBackfillReserved()),
4903 std::make_shared<PGPeeringEvent>(
4904 ps->get_osdmap_epoch(),
4905 ps->get_osdmap_epoch(),
4906 DeferBackfill(0.0)));
4907 pl->publish_stats_to_osd();
4908 }
4909
4910 void PeeringState::WaitLocalBackfillReserved::exit()
4911 {
4912 context< PeeringMachine >().log_exit(state_name, enter_time);
4913 DECLARE_LOCALS;
4914 utime_t dur = ceph_clock_now() - enter_time;
4915 pl->get_peering_perf().tinc(rs_waitlocalbackfillreserved_latency, dur);
4916 }
4917
4918 /*----NotBackfilling------*/
4919 PeeringState::NotBackfilling::NotBackfilling(my_context ctx)
4920 : my_base(ctx),
4921 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/NotBackfilling")
4922 {
4923 context< PeeringMachine >().log_enter(state_name);
4924 DECLARE_LOCALS;
4925 ps->state_clear(PG_STATE_REPAIR);
4926 pl->publish_stats_to_osd();
4927 }
4928
4929 boost::statechart::result
4930 PeeringState::NotBackfilling::react(const RemoteBackfillReserved &evt)
4931 {
4932 return discard_event();
4933 }
4934
4935 boost::statechart::result
4936 PeeringState::NotBackfilling::react(const RemoteReservationRejectedTooFull &evt)
4937 {
4938 return discard_event();
4939 }
4940
4941 void PeeringState::NotBackfilling::exit()
4942 {
4943 context< PeeringMachine >().log_exit(state_name, enter_time);
4944
4945 DECLARE_LOCALS;
4946 ps->state_clear(PG_STATE_BACKFILL_UNFOUND);
4947 utime_t dur = ceph_clock_now() - enter_time;
4948 pl->get_peering_perf().tinc(rs_notbackfilling_latency, dur);
4949 }
4950
4951 /*----NotRecovering------*/
4952 PeeringState::NotRecovering::NotRecovering(my_context ctx)
4953 : my_base(ctx),
4954 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/NotRecovering")
4955 {
4956 context< PeeringMachine >().log_enter(state_name);
4957 DECLARE_LOCALS;
4958 ps->state_clear(PG_STATE_REPAIR);
4959 pl->publish_stats_to_osd();
4960 }
4961
4962 void PeeringState::NotRecovering::exit()
4963 {
4964 context< PeeringMachine >().log_exit(state_name, enter_time);
4965
4966 DECLARE_LOCALS;
4967 ps->state_clear(PG_STATE_RECOVERY_UNFOUND);
4968 utime_t dur = ceph_clock_now() - enter_time;
4969 pl->get_peering_perf().tinc(rs_notrecovering_latency, dur);
4970 }
4971
4972 /*---RepNotRecovering----*/
4973 PeeringState::RepNotRecovering::RepNotRecovering(my_context ctx)
4974 : my_base(ctx),
4975 NamedState(context< PeeringMachine >().state_history, "Started/ReplicaActive/RepNotRecovering")
4976 {
4977 context< PeeringMachine >().log_enter(state_name);
4978 }
4979
4980 boost::statechart::result
4981 PeeringState::RepNotRecovering::react(const RejectTooFullRemoteReservation &evt)
4982 {
4983 DECLARE_LOCALS;
4984 ps->reject_reservation();
4985 post_event(RemoteReservationRejectedTooFull());
4986 return discard_event();
4987 }
4988
4989 void PeeringState::RepNotRecovering::exit()
4990 {
4991 context< PeeringMachine >().log_exit(state_name, enter_time);
4992 DECLARE_LOCALS;
4993 utime_t dur = ceph_clock_now() - enter_time;
4994 pl->get_peering_perf().tinc(rs_repnotrecovering_latency, dur);
4995 }
4996
4997 /*---RepWaitRecoveryReserved--*/
4998 PeeringState::RepWaitRecoveryReserved::RepWaitRecoveryReserved(my_context ctx)
4999 : my_base(ctx),
5000 NamedState(context< PeeringMachine >().state_history, "Started/ReplicaActive/RepWaitRecoveryReserved")
5001 {
5002 context< PeeringMachine >().log_enter(state_name);
5003 }
5004
5005 boost::statechart::result
5006 PeeringState::RepWaitRecoveryReserved::react(const RemoteRecoveryReserved &evt)
5007 {
5008 DECLARE_LOCALS;
5009 pl->send_cluster_message(
5010 ps->primary.osd,
5011 new MRecoveryReserve(
5012 MRecoveryReserve::GRANT,
5013 spg_t(ps->info.pgid.pgid, ps->primary.shard),
5014 ps->get_osdmap_epoch()),
5015 ps->get_osdmap_epoch());
5016 return transit<RepRecovering>();
5017 }
5018
5019 boost::statechart::result
5020 PeeringState::RepWaitRecoveryReserved::react(
5021 const RemoteReservationCanceled &evt)
5022 {
5023 DECLARE_LOCALS;
5024 pl->unreserve_recovery_space();
5025
5026 pl->cancel_remote_recovery_reservation();
5027 return transit<RepNotRecovering>();
5028 }
5029
5030 void PeeringState::RepWaitRecoveryReserved::exit()
5031 {
5032 context< PeeringMachine >().log_exit(state_name, enter_time);
5033 DECLARE_LOCALS;
5034 utime_t dur = ceph_clock_now() - enter_time;
5035 pl->get_peering_perf().tinc(rs_repwaitrecoveryreserved_latency, dur);
5036 }
5037
5038 /*-RepWaitBackfillReserved*/
5039 PeeringState::RepWaitBackfillReserved::RepWaitBackfillReserved(my_context ctx)
5040 : my_base(ctx),
5041 NamedState(context< PeeringMachine >().state_history, "Started/ReplicaActive/RepWaitBackfillReserved")
5042 {
5043 context< PeeringMachine >().log_enter(state_name);
5044 }
5045
5046 boost::statechart::result
5047 PeeringState::RepNotRecovering::react(const RequestBackfillPrio &evt)
5048 {
5049
5050 DECLARE_LOCALS;
5051
5052 if (!pl->try_reserve_recovery_space(
5053 evt.primary_num_bytes, evt.local_num_bytes)) {
5054 post_event(RejectTooFullRemoteReservation());
5055 } else {
5056 PGPeeringEventRef preempt;
5057 if (HAVE_FEATURE(ps->upacting_features, RECOVERY_RESERVATION_2)) {
5058 // older peers will interpret preemption as TOOFULL
5059 preempt = std::make_shared<PGPeeringEvent>(
5060 pl->get_osdmap_epoch(),
5061 pl->get_osdmap_epoch(),
5062 RemoteBackfillPreempted());
5063 }
5064 pl->request_remote_recovery_reservation(
5065 evt.priority,
5066 std::make_shared<PGPeeringEvent>(
5067 pl->get_osdmap_epoch(),
5068 pl->get_osdmap_epoch(),
5069 RemoteBackfillReserved()),
5070 preempt);
5071 }
5072 return transit<RepWaitBackfillReserved>();
5073 }
5074
5075 boost::statechart::result
5076 PeeringState::RepNotRecovering::react(const RequestRecoveryPrio &evt)
5077 {
5078 DECLARE_LOCALS;
5079
5080 // fall back to a local reckoning of priority if the primary doesn't pass one
5081 // (pre-mimic compat)
5082 int prio = evt.priority ? evt.priority : ps->get_recovery_priority();
5083
5084 PGPeeringEventRef preempt;
5085 if (HAVE_FEATURE(ps->upacting_features, RECOVERY_RESERVATION_2)) {
5086 // older peers can't handle this
5087 preempt = std::make_shared<PGPeeringEvent>(
5088 ps->get_osdmap_epoch(),
5089 ps->get_osdmap_epoch(),
5090 RemoteRecoveryPreempted());
5091 }
5092
5093 pl->request_remote_recovery_reservation(
5094 prio,
5095 std::make_shared<PGPeeringEvent>(
5096 ps->get_osdmap_epoch(),
5097 ps->get_osdmap_epoch(),
5098 RemoteRecoveryReserved()),
5099 preempt);
5100 return transit<RepWaitRecoveryReserved>();
5101 }
5102
5103 void PeeringState::RepWaitBackfillReserved::exit()
5104 {
5105 context< PeeringMachine >().log_exit(state_name, enter_time);
5106 DECLARE_LOCALS;
5107 utime_t dur = ceph_clock_now() - enter_time;
5108 pl->get_peering_perf().tinc(rs_repwaitbackfillreserved_latency, dur);
5109 }
5110
5111 boost::statechart::result
5112 PeeringState::RepWaitBackfillReserved::react(const RemoteBackfillReserved &evt)
5113 {
5114 DECLARE_LOCALS;
5115
5116
5117 pl->send_cluster_message(
5118 ps->primary.osd,
5119 new MBackfillReserve(
5120 MBackfillReserve::GRANT,
5121 spg_t(ps->info.pgid.pgid, ps->primary.shard),
5122 ps->get_osdmap_epoch()),
5123 ps->get_osdmap_epoch());
5124 return transit<RepRecovering>();
5125 }
5126
5127 boost::statechart::result
5128 PeeringState::RepWaitBackfillReserved::react(
5129 const RejectTooFullRemoteReservation &evt)
5130 {
5131 DECLARE_LOCALS;
5132 ps->reject_reservation();
5133 post_event(RemoteReservationRejectedTooFull());
5134 return discard_event();
5135 }
5136
5137 boost::statechart::result
5138 PeeringState::RepWaitBackfillReserved::react(
5139 const RemoteReservationRejectedTooFull &evt)
5140 {
5141 DECLARE_LOCALS;
5142 pl->unreserve_recovery_space();
5143
5144 pl->cancel_remote_recovery_reservation();
5145 return transit<RepNotRecovering>();
5146 }
5147
5148 boost::statechart::result
5149 PeeringState::RepWaitBackfillReserved::react(
5150 const RemoteReservationCanceled &evt)
5151 {
5152 DECLARE_LOCALS;
5153 pl->unreserve_recovery_space();
5154
5155 pl->cancel_remote_recovery_reservation();
5156 return transit<RepNotRecovering>();
5157 }
5158
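// While in RepRecovering the replica holds both the local space
// reservation and the remote reservation slot. On preemption (or if
// backfill would overfill us) the handlers below give the space back and
// send a REVOKE to the primary, which presumably re-requests the
// reservation later; note that they discard_event() rather than change state.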
5159 /*---RepRecovering-------*/
5160 PeeringState::RepRecovering::RepRecovering(my_context ctx)
5161 : my_base(ctx),
5162 NamedState(context< PeeringMachine >().state_history, "Started/ReplicaActive/RepRecovering")
5163 {
5164 context< PeeringMachine >().log_enter(state_name);
5165 }
5166
5167 boost::statechart::result
5168 PeeringState::RepRecovering::react(const RemoteRecoveryPreempted &)
5169 {
5170 DECLARE_LOCALS;
5171
5172
5173 pl->unreserve_recovery_space();
5174 pl->send_cluster_message(
5175 ps->primary.osd,
5176 new MRecoveryReserve(
5177 MRecoveryReserve::REVOKE,
5178 spg_t(ps->info.pgid.pgid, ps->primary.shard),
5179 ps->get_osdmap_epoch()),
5180 ps->get_osdmap_epoch());
5181 return discard_event();
5182 }
5183
5184 boost::statechart::result
5185 PeeringState::RepRecovering::react(const BackfillTooFull &)
5186 {
5187 DECLARE_LOCALS;
5188
5189
5190 pl->unreserve_recovery_space();
5191 pl->send_cluster_message(
5192 ps->primary.osd,
5193 new MBackfillReserve(
5194 MBackfillReserve::REVOKE_TOOFULL,
5195 spg_t(ps->info.pgid.pgid, ps->primary.shard),
5196 ps->get_osdmap_epoch()),
5197 ps->get_osdmap_epoch());
5198 return discard_event();
5199 }
5200
5201 boost::statechart::result
5202 PeeringState::RepRecovering::react(const RemoteBackfillPreempted &)
5203 {
5204 DECLARE_LOCALS;
5205
5206
5207 pl->unreserve_recovery_space();
5208 pl->send_cluster_message(
5209 ps->primary.osd,
5210 new MBackfillReserve(
5211 MBackfillReserve::REVOKE,
5212 spg_t(ps->info.pgid.pgid, ps->primary.shard),
5213 ps->get_osdmap_epoch()),
5214 ps->get_osdmap_epoch());
5215 return discard_event();
5216 }
5217
5218 void PeeringState::RepRecovering::exit()
5219 {
5220 context< PeeringMachine >().log_exit(state_name, enter_time);
5221 DECLARE_LOCALS;
5222 pl->unreserve_recovery_space();
5223
5224 pl->cancel_remote_recovery_reservation();
5225 utime_t dur = ceph_clock_now() - enter_time;
5226 pl->get_peering_perf().tinc(rs_reprecovering_latency, dur);
5227 }
5228
5229 /*------Activating--------*/
5230 PeeringState::Activating::Activating(my_context ctx)
5231 : my_base(ctx),
5232 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/Activating")
5233 {
5234 context< PeeringMachine >().log_enter(state_name);
5235 }
5236
5237 void PeeringState::Activating::exit()
5238 {
5239 context< PeeringMachine >().log_exit(state_name, enter_time);
5240 DECLARE_LOCALS;
5241 utime_t dur = ceph_clock_now() - enter_time;
5242 pl->get_peering_perf().tinc(rs_activating_latency, dur);
5243 }
5244
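// Primary-side recovery reservation pipeline, as far as this file shows:
// WaitLocalRecoveryReserved (grab a local background-io slot) ->
// WaitRemoteRecoveryReserved (reserve each remote shard in turn) ->
// Recovering. A full OSD in the acting set short-circuits this with
// RecoveryTooFull, which schedules a DoRecovery retry after
// osd_recovery_retry_interval.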
5245 PeeringState::WaitLocalRecoveryReserved::WaitLocalRecoveryReserved(my_context ctx)
5246 : my_base(ctx),
5247 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/WaitLocalRecoveryReserved")
5248 {
5249 context< PeeringMachine >().log_enter(state_name);
5250 DECLARE_LOCALS;
5251
5252 // Make sure all nodes that are part of the recovery aren't full
5253 if (!ps->cct->_conf->osd_debug_skip_full_check_in_recovery &&
5254 ps->get_osdmap()->check_full(ps->acting_recovery_backfill)) {
5255 post_event(RecoveryTooFull());
5256 return;
5257 }
5258
5259 ps->state_clear(PG_STATE_RECOVERY_TOOFULL);
5260 ps->state_set(PG_STATE_RECOVERY_WAIT);
5261 pl->request_local_background_io_reservation(
5262 ps->get_recovery_priority(),
5263 std::make_shared<PGPeeringEvent>(
5264 ps->get_osdmap_epoch(),
5265 ps->get_osdmap_epoch(),
5266 LocalRecoveryReserved()),
5267 std::make_shared<PGPeeringEvent>(
5268 ps->get_osdmap_epoch(),
5269 ps->get_osdmap_epoch(),
5270 DeferRecovery(0.0)));
5271 pl->publish_stats_to_osd();
5272 }
5273
5274 boost::statechart::result
5275 PeeringState::WaitLocalRecoveryReserved::react(const RecoveryTooFull &evt)
5276 {
5277 DECLARE_LOCALS;
5278 ps->state_set(PG_STATE_RECOVERY_TOOFULL);
5279 pl->schedule_event_after(
5280 std::make_shared<PGPeeringEvent>(
5281 ps->get_osdmap_epoch(),
5282 ps->get_osdmap_epoch(),
5283 DoRecovery()),
5284 ps->cct->_conf->osd_recovery_retry_interval);
5285 return transit<NotRecovering>();
5286 }
5287
5288 void PeeringState::WaitLocalRecoveryReserved::exit()
5289 {
5290 context< PeeringMachine >().log_exit(state_name, enter_time);
5291 DECLARE_LOCALS;
5292 utime_t dur = ceph_clock_now() - enter_time;
5293 pl->get_peering_perf().tinc(rs_waitlocalrecoveryreserved_latency, dur);
5294 }
5295
5296 PeeringState::WaitRemoteRecoveryReserved::WaitRemoteRecoveryReserved(my_context ctx)
5297 : my_base(ctx),
5298 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/WaitRemoteRecoveryReserved"),
5299 remote_recovery_reservation_it(context< Active >().remote_shards_to_reserve_recovery.begin())
5300 {
5301 context< PeeringMachine >().log_enter(state_name);
5302 post_event(RemoteRecoveryReserved());
5303 }
5304
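// Remote reservations are taken serially: the constructor above posts a
// synthetic RemoteRecoveryReserved to itself, and each grant (real or
// synthetic) advances the iterator and sends a REQUEST to the next shard.
// Once the iterator is exhausted, AllRemotesReserved moves us on.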
5305 boost::statechart::result
5306 PeeringState::WaitRemoteRecoveryReserved::react(const RemoteRecoveryReserved &evt) {
5307 DECLARE_LOCALS;
5308
5309 if (remote_recovery_reservation_it !=
5310 context< Active >().remote_shards_to_reserve_recovery.end()) {
5311 ceph_assert(*remote_recovery_reservation_it != ps->pg_whoami);
5312 pl->send_cluster_message(
5313 remote_recovery_reservation_it->osd,
5314 new MRecoveryReserve(
5315 MRecoveryReserve::REQUEST,
5316 spg_t(context< PeeringMachine >().spgid.pgid,
5317 remote_recovery_reservation_it->shard),
5318 ps->get_osdmap_epoch(),
5319 ps->get_recovery_priority()),
5320 ps->get_osdmap_epoch());
5321 ++remote_recovery_reservation_it;
5322 } else {
5323 post_event(AllRemotesReserved());
5324 }
5325 return discard_event();
5326 }
5327
5328 void PeeringState::WaitRemoteRecoveryReserved::exit()
5329 {
5330 context< PeeringMachine >().log_exit(state_name, enter_time);
5331 DECLARE_LOCALS;
5332 utime_t dur = ceph_clock_now() - enter_time;
5333 pl->get_peering_perf().tinc(rs_waitremoterecoveryreserved_latency, dur);
5334 }
5335
5336 PeeringState::Recovering::Recovering(my_context ctx)
5337 : my_base(ctx),
5338 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/Recovering")
5339 {
5340 context< PeeringMachine >().log_enter(state_name);
5341
5342 DECLARE_LOCALS;
5343 ps->state_clear(PG_STATE_RECOVERY_WAIT);
5344 ps->state_clear(PG_STATE_RECOVERY_TOOFULL);
5345 ps->state_set(PG_STATE_RECOVERING);
5346 pl->on_recovery_reserved();
5347 ceph_assert(!ps->state_test(PG_STATE_ACTIVATING));
5348 pl->publish_stats_to_osd();
5349 }
5350
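// Drop the remote reservation slots taken in WaitRemoteRecoveryReserved.
// cancel=true is used on the abort paths (DeferRecovery, UnfoundRecovery)
// where objects may still be missing; otherwise we assert that recovery
// really finished.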
5351 void PeeringState::Recovering::release_reservations(bool cancel)
5352 {
5353 DECLARE_LOCALS;
5354 ceph_assert(cancel || !ps->pg_log.get_missing().have_missing());
5355
5356 // release remote reservations
5357 for (set<pg_shard_t>::const_iterator i =
5358 context< Active >().remote_shards_to_reserve_recovery.begin();
5359 i != context< Active >().remote_shards_to_reserve_recovery.end();
5360 ++i) {
5361 if (*i == ps->pg_whoami) // skip myself
5362 continue;
5363 pl->send_cluster_message(
5364 i->osd,
5365 new MRecoveryReserve(
5366 MRecoveryReserve::RELEASE,
5367 spg_t(ps->info.pgid.pgid, i->shard),
5368 ps->get_osdmap_epoch()),
5369 ps->get_osdmap_epoch());
5370 }
5371 }
5372
5373 boost::statechart::result
5374 PeeringState::Recovering::react(const AllReplicasRecovered &evt)
5375 {
5376 DECLARE_LOCALS;
5377 ps->state_clear(PG_STATE_FORCED_RECOVERY);
5378 release_reservations();
5379 pl->cancel_local_background_io_reservation();
5380 return transit<Recovered>();
5381 }
5382
5383 boost::statechart::result
5384 PeeringState::Recovering::react(const RequestBackfill &evt)
5385 {
5386 DECLARE_LOCALS;
5387
5388 release_reservations();
5389
5390 ps->state_clear(PG_STATE_FORCED_RECOVERY);
5391 pl->cancel_local_background_io_reservation();
5392 pl->publish_stats_to_osd();
5393 // transition any async_recovery_targets back into acting
5394 // so the pg won't have to stay undersized for long,
5395 // as backfill might take a long time to complete.
5396 if (!ps->async_recovery_targets.empty()) {
5397 pg_shard_t auth_log_shard;
5398 bool history_les_bound = false;
5399 ps->choose_acting(auth_log_shard, true, &history_les_bound);
5400 }
5401 return transit<WaitLocalBackfillReserved>();
5402 }
5403
5404 boost::statechart::result
5405 PeeringState::Recovering::react(const DeferRecovery &evt)
5406 {
5407 DECLARE_LOCALS;
5408 if (!ps->state_test(PG_STATE_RECOVERING)) {
5409 // we may have finished recovery and have an AllReplicasRecovered
5410 // event queued to move us to the next state.
5411 psdout(10) << "got defer recovery but not recovering" << dendl;
5412 return discard_event();
5413 }
5414 psdout(10) << "defer recovery, retry delay " << evt.delay << dendl;
5415 ps->state_set(PG_STATE_RECOVERY_WAIT);
5416 pl->cancel_local_background_io_reservation();
5417 release_reservations(true);
5418 pl->schedule_event_after(
5419 std::make_shared<PGPeeringEvent>(
5420 ps->get_osdmap_epoch(),
5421 ps->get_osdmap_epoch(),
5422 DoRecovery()),
5423 evt.delay);
5424 return transit<NotRecovering>();
5425 }
5426
5427 boost::statechart::result
5428 PeeringState::Recovering::react(const UnfoundRecovery &evt)
5429 {
5430 DECLARE_LOCALS;
5431 psdout(10) << "recovery has unfound, can't continue" << dendl;
5432 ps->state_set(PG_STATE_RECOVERY_UNFOUND);
5433 pl->cancel_local_background_io_reservation();
5434 release_reservations(true);
5435 return transit<NotRecovering>();
5436 }
5437
5438 void PeeringState::Recovering::exit()
5439 {
5440 context< PeeringMachine >().log_exit(state_name, enter_time);
5441
5442 DECLARE_LOCALS;
5443 utime_t dur = ceph_clock_now() - enter_time;
5444 ps->state_clear(PG_STATE_RECOVERING);
5445 pl->get_peering_perf().tinc(rs_recovering_latency, dur);
5446 }
5447
5448 PeeringState::Recovered::Recovered(my_context ctx)
5449 : my_base(ctx),
5450 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/Recovered")
5451 {
5452 pg_shard_t auth_log_shard;
5453
5454 context< PeeringMachine >().log_enter(state_name);
5455
5456 DECLARE_LOCALS;
5457
5458 ceph_assert(!ps->needs_recovery());
5459
5460 // if we finished backfill, all acting are active; recheck if
5461 // DEGRADED | UNDERSIZED is appropriate.
5462 ceph_assert(!ps->acting_recovery_backfill.empty());
5463 if (ps->get_osdmap()->get_pg_size(context< PeeringMachine >().spgid.pgid) <=
5464 ps->acting_recovery_backfill.size()) {
5465 ps->state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
5466 pl->publish_stats_to_osd();
5467 }
5468
5469 // adjust acting set? (e.g. because backfill completed...)
5470 bool history_les_bound = false;
5471 if (ps->acting != ps->up && !ps->choose_acting(auth_log_shard,
5472 true, &history_les_bound)) {
5473 ceph_assert(ps->want_acting.size());
5474 } else if (!ps->async_recovery_targets.empty()) {
5475 ps->choose_acting(auth_log_shard, true, &history_les_bound);
5476 }
5477
5478 if (context< Active >().all_replicas_activated &&
5479 ps->async_recovery_targets.empty())
5480 post_event(GoClean());
5481 }
5482
5483 void PeeringState::Recovered::exit()
5484 {
5485 context< PeeringMachine >().log_exit(state_name, enter_time);
5486 DECLARE_LOCALS;
5487
5488 utime_t dur = ceph_clock_now() - enter_time;
5489 pl->get_peering_perf().tinc(rs_recovered_latency, dur);
5490 }
5491
5492 PeeringState::Clean::Clean(my_context ctx)
5493 : my_base(ctx),
5494 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/Clean")
5495 {
5496 context< PeeringMachine >().log_enter(state_name);
5497
5498 DECLARE_LOCALS;
5499
5500 if (ps->info.last_complete != ps->info.last_update) {
5501 ceph_abort();
5502 }
5503
5504
5505 ps->try_mark_clean();
5506
5507 context< PeeringMachine >().get_cur_transaction().register_on_commit(
5508 pl->on_clean());
5509 }
5510
5511 void PeeringState::Clean::exit()
5512 {
5513 context< PeeringMachine >().log_exit(state_name, enter_time);
5514
5515 DECLARE_LOCALS;
5516 ps->state_clear(PG_STATE_CLEAN);
5517 utime_t dur = ceph_clock_now() - enter_time;
5518 pl->get_peering_perf().tinc(rs_clean_latency, dur);
5519 }
5520
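// Reservations are per OSD, not per shard: a PG (e.g. an EC PG) may map
// several shards to one OSD, so the helper below collapses a shard set to
// at most one entry per OSD, skipping ourselves. It feeds
// remote_shards_to_reserve_recovery/backfill in Active's constructor.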
5521 template <typename T>
5522 set<pg_shard_t> unique_osd_shard_set(const pg_shard_t & skip, const T &in)
5523 {
5524 set<int> osds_found;
5525 set<pg_shard_t> out;
5526 for (typename T::const_iterator i = in.begin();
5527 i != in.end();
5528 ++i) {
5529 if (*i != skip && !osds_found.count(i->osd)) {
5530 osds_found.insert(i->osd);
5531 out.insert(*i);
5532 }
5533 }
5534 return out;
5535 }
5536
5537 /*---------Active---------*/
5538 PeeringState::Active::Active(my_context ctx)
5539 : my_base(ctx),
5540 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active"),
5541 remote_shards_to_reserve_recovery(
5542 unique_osd_shard_set(
5543 context< PeeringMachine >().state->pg_whoami,
5544 context< PeeringMachine >().state->acting_recovery_backfill)),
5545 remote_shards_to_reserve_backfill(
5546 unique_osd_shard_set(
5547 context< PeeringMachine >().state->pg_whoami,
5548 context< PeeringMachine >().state->backfill_targets)),
5549 all_replicas_activated(false)
5550 {
5551 context< PeeringMachine >().log_enter(state_name);
5552
5553
5554 DECLARE_LOCALS;
5555
5556 ceph_assert(!ps->backfill_reserved);
5557 ceph_assert(ps->is_primary());
5558 psdout(10) << "In Active, about to call activate" << dendl;
5559 ps->start_flush(context< PeeringMachine >().get_cur_transaction());
5560 ps->activate(context< PeeringMachine >().get_cur_transaction(),
5561 ps->get_osdmap_epoch(),
5562 context< PeeringMachine >().get_recovery_ctx());
5563
5564 // everyone has to commit/ack before we are truly active
5565 ps->blocked_by.clear();
5566 for (set<pg_shard_t>::iterator p = ps->acting_recovery_backfill.begin();
5567 p != ps->acting_recovery_backfill.end();
5568 ++p) {
5569 if (p->shard != ps->pg_whoami.shard) {
5570 ps->blocked_by.insert(p->shard);
5571 }
5572 }
5573 pl->publish_stats_to_osd();
5574 psdout(10) << "Activate Finished" << dendl;
5575 }
5576
5577 boost::statechart::result PeeringState::Active::react(const AdvMap& advmap)
5578 {
5579 DECLARE_LOCALS;
5580
5581 if (ps->should_restart_peering(
5582 advmap.up_primary,
5583 advmap.acting_primary,
5584 advmap.newup,
5585 advmap.newacting,
5586 advmap.lastmap,
5587 advmap.osdmap)) {
5588 psdout(10) << "Active advmap interval change, fast return" << dendl;
5589 return forward_event();
5590 }
5591 psdout(10) << "Active advmap" << dendl;
5592 bool need_publish = false;
5593
5594 pl->on_active_advmap(advmap.osdmap);
5595 if (ps->dirty_big_info) {
5596 // share updated purged_snaps to mgr/mon so that we (a) stop reporting
5597 // purged snaps and (b) perhaps share more snaps that we have purged
5598 // but that didn't fit in pg_stat_t.
5599 need_publish = true;
5600 ps->share_pg_info();
5601 }
5602
5603 bool need_acting_change = false;
5604 for (size_t i = 0; i < ps->want_acting.size(); i++) {
5605 int osd = ps->want_acting[i];
5606 if (!advmap.osdmap->is_up(osd)) {
5607 pg_shard_t osd_with_shard(osd, shard_id_t(i));
5608 if (!ps->is_acting(osd_with_shard) && !ps->is_up(osd_with_shard)) {
5609 psdout(10) << "Active stray osd." << osd << " in want_acting is down"
5610 << dendl;
5611 need_acting_change = true;
5612 }
5613 }
5614 }
5615 if (need_acting_change) {
5616 psdout(10) << "Active need acting change, call choose_acting again"
5617 << dendl;
5618 // possibly because we re-add some strays into the acting set and
5619 // some of them then go down in a subsequent map before we could see
5620 // the map changing the pg temp.
5621 // call choose_acting again to clear them out.
5622 // note that we leave restrict_to_up_acting false so that we don't
5623 // needlessly evict a chosen stray that is still alive.
5624 pg_shard_t auth_log_shard;
5625 bool history_les_bound = false;
5626 ps->remove_down_peer_info(advmap.osdmap);
5627 ps->choose_acting(auth_log_shard, false, &history_les_bound, true);
5628 }
5629
5630 /* Check for changes in pool size (if the acting set changed as a result,
5631 * this does not matter) */
5632 if (advmap.lastmap->get_pg_size(ps->info.pgid.pgid) !=
5633 ps->get_osdmap()->get_pg_size(ps->info.pgid.pgid)) {
5634 if (ps->get_osdmap()->get_pg_size(ps->info.pgid.pgid) <=
5635 ps->actingset.size()) {
5636 ps->state_clear(PG_STATE_UNDERSIZED);
5637 } else {
5638 ps->state_set(PG_STATE_UNDERSIZED);
5639 }
5640 // degraded changes will be detected by the call to publish_stats_to_osd()
5641 need_publish = true;
5642 }
5643
5644 // if we haven't reported our PG stats in a long time, do so now.
5645 if (ps->info.stats.reported_epoch + ps->cct->_conf->osd_pg_stat_report_interval_max < advmap.osdmap->get_epoch()) {
5646 psdout(20) << "reporting stats to osd after " << (advmap.osdmap->get_epoch() - ps->info.stats.reported_epoch)
5647 << " epochs" << dendl;
5648 need_publish = true;
5649 }
5650
5651 if (need_publish)
5652 pl->publish_stats_to_osd();
5653
5654 if (ps->check_prior_readable_down_osds(advmap.osdmap)) {
5655 pl->recheck_readable();
5656 }
5657
5658 return forward_event();
5659 }
5660
5661 boost::statechart::result PeeringState::Active::react(const ActMap&)
5662 {
5663 DECLARE_LOCALS;
5664 psdout(10) << "Active: handling ActMap" << dendl;
5665 ceph_assert(ps->is_primary());
5666
5667 pl->on_active_actmap();
5668
5669 if (ps->have_unfound()) {
5670 // object may have become unfound
5671 ps->discover_all_missing(context<PeeringMachine>().get_recovery_ctx().msgs);
5672 }
5673
5674 uint64_t unfound = ps->missing_loc.num_unfound();
5675 if (unfound > 0 &&
5676 ps->all_unfound_are_queried_or_lost(ps->get_osdmap())) {
5677 if (ps->cct->_conf->osd_auto_mark_unfound_lost) {
5678 pl->get_clog_error() << context< PeeringMachine >().spgid.pgid << " has " << unfound
5679 << " objects unfound and apparently lost, would automatically "
5680 << "mark these objects lost but this feature is not yet implemented "
5681 << "(osd_auto_mark_unfound_lost)";
5682 } else
5683 pl->get_clog_error() << context< PeeringMachine >().spgid.pgid << " has "
5684 << unfound << " objects unfound and apparently lost";
5685 }
5686
5687 return forward_event();
5688 }
5689
5690 boost::statechart::result PeeringState::Active::react(const MNotifyRec& notevt)
5691 {
5692
5693 DECLARE_LOCALS;
5694 ceph_assert(ps->is_primary());
5695 if (ps->peer_info.count(notevt.from)) {
5696 psdout(10) << "Active: got notify from " << notevt.from
5697 << ", already have info from that osd, ignoring"
5698 << dendl;
5699 } else if (ps->peer_purged.count(notevt.from)) {
5700 psdout(10) << "Active: got notify from " << notevt.from
5701 << ", already purged that peer, ignoring"
5702 << dendl;
5703 } else {
5704 psdout(10) << "Active: got notify from " << notevt.from
5705 << ", calling proc_replica_info and discover_all_missing"
5706 << dendl;
5707 ps->proc_replica_info(
5708 notevt.from, notevt.notify.info, notevt.notify.epoch_sent);
5709 if (ps->have_unfound() || (ps->is_degraded() && ps->might_have_unfound.count(notevt.from))) {
5710 ps->discover_all_missing(
5711 context<PeeringMachine>().get_recovery_ctx().msgs);
5712 }
5713 // check if it is a previously down acting member that's coming back.
5714 // if so, request pg_temp change to trigger a new interval transition
5715 pg_shard_t auth_log_shard;
5716 bool history_les_bound = false;
5717 ps->choose_acting(auth_log_shard, false, &history_les_bound, true);
5718 if (!ps->want_acting.empty() && ps->want_acting != ps->acting) {
5719 psdout(10) << "Active: got notify from previous acting member "
5720 << notevt.from << ", requesting pg_temp change"
5721 << dendl;
5722 }
5723 }
5724 return discard_event();
5725 }
5726
5727 boost::statechart::result PeeringState::Active::react(const MTrim& trim)
5728 {
5729 DECLARE_LOCALS;
5730 ceph_assert(ps->is_primary());
5731
5732 // peer is informing us of their last_complete_ondisk
5733 ldout(ps->cct,10) << " replica osd." << trim.from << " lcod " << trim.trim_to << dendl;
5734 ps->update_peer_last_complete_ondisk(pg_shard_t{trim.from, trim.shard},
5735 trim.trim_to);
5736 // trim log when the pg is recovered
5737 ps->calc_min_last_complete_ondisk();
5738 return discard_event();
5739 }
5740
5741 boost::statechart::result PeeringState::Active::react(const MInfoRec& infoevt)
5742 {
5743 DECLARE_LOCALS;
5744 ceph_assert(ps->is_primary());
5745
5746 ceph_assert(!ps->acting_recovery_backfill.empty());
5747 if (infoevt.lease_ack) {
5748 ps->proc_lease_ack(infoevt.from.osd, *infoevt.lease_ack);
5749 }
5750 // don't update history (yet) if we are active and primary; the replica
5751 // may be telling us they have activated (and committed) but we can't
5752 // share that until _everyone_ does the same.
5753 if (ps->is_acting_recovery_backfill(infoevt.from) &&
5754 ps->peer_activated.count(infoevt.from) == 0) {
5755 psdout(10) << " peer osd." << infoevt.from
5756 << " activated and committed" << dendl;
5757 ps->peer_activated.insert(infoevt.from);
5758 ps->blocked_by.erase(infoevt.from.shard);
5759 pl->publish_stats_to_osd();
5760 if (ps->peer_activated.size() == ps->acting_recovery_backfill.size()) {
5761 all_activated_and_committed();
5762 }
5763 }
5764 return discard_event();
5765 }
5766
5767 boost::statechart::result PeeringState::Active::react(const MLogRec& logevt)
5768 {
5769 DECLARE_LOCALS;
5770 psdout(10) << "searching osd." << logevt.from
5771 << " log for unfound items" << dendl;
5772 ps->proc_replica_log(
5773 logevt.msg->info, logevt.msg->log, logevt.msg->missing, logevt.from);
5774 bool got_missing = ps->search_for_missing(
5775 ps->peer_info[logevt.from],
5776 ps->peer_missing[logevt.from],
5777 logevt.from,
5778 context< PeeringMachine >().get_recovery_ctx());
5779 // If there are missing objects AND we are "fully" active, then start recovery now
5780 if (got_missing && ps->state_test(PG_STATE_ACTIVE)) {
5781 post_event(DoRecovery());
5782 }
5783 return discard_event();
5784 }
5785
5786 boost::statechart::result PeeringState::Active::react(const QueryState& q)
5787 {
5788 DECLARE_LOCALS;
5789
5790 q.f->open_object_section("state");
5791 q.f->dump_string("name", state_name);
5792 q.f->dump_stream("enter_time") << enter_time;
5793
5794 {
5795 q.f->open_array_section("might_have_unfound");
5796 for (set<pg_shard_t>::iterator p = ps->might_have_unfound.begin();
5797 p != ps->might_have_unfound.end();
5798 ++p) {
5799 q.f->open_object_section("osd");
5800 q.f->dump_stream("osd") << *p;
5801 if (ps->peer_missing.count(*p)) {
5802 q.f->dump_string("status", "already probed");
5803 } else if (ps->peer_missing_requested.count(*p)) {
5804 q.f->dump_string("status", "querying");
5805 } else if (!ps->get_osdmap()->is_up(p->osd)) {
5806 q.f->dump_string("status", "osd is down");
5807 } else {
5808 q.f->dump_string("status", "not queried");
5809 }
5810 q.f->close_section();
5811 }
5812 q.f->close_section();
5813 }
5814 {
5815 q.f->open_object_section("recovery_progress");
5816 q.f->open_array_section("backfill_targets");
5817 for (set<pg_shard_t>::const_iterator p = ps->backfill_targets.begin();
5818 p != ps->backfill_targets.end(); ++p)
5819 q.f->dump_stream("replica") << *p;
5820 q.f->close_section();
5821 pl->dump_recovery_info(q.f);
5822 q.f->close_section();
5823 }
5824
5825 q.f->close_section();
5826 return forward_event();
5827 }
5828
5829 boost::statechart::result PeeringState::Active::react(
5830 const ActivateCommitted &evt)
5831 {
5832 DECLARE_LOCALS;
5833 ceph_assert(!ps->peer_activated.count(ps->pg_whoami));
5834 ps->peer_activated.insert(ps->pg_whoami);
5835 psdout(10) << "_activate_committed " << evt.epoch
5836 << " peer_activated now " << ps->peer_activated
5837 << " last_interval_started "
5838 << ps->info.history.last_interval_started
5839 << " last_epoch_started "
5840 << ps->info.history.last_epoch_started
5841 << " same_interval_since "
5842 << ps->info.history.same_interval_since
5843 << dendl;
5844 ceph_assert(!ps->acting_recovery_backfill.empty());
5845 if (ps->peer_activated.size() == ps->acting_recovery_backfill.size())
5846 all_activated_and_committed();
5847 return discard_event();
5848 }
5849
5850 boost::statechart::result PeeringState::Active::react(const AllReplicasActivated &evt)
5851 {
5852
5853 DECLARE_LOCALS;
5854 pg_t pgid = context< PeeringMachine >().spgid.pgid;
5855
5856 all_replicas_activated = true;
5857
5858 ps->state_clear(PG_STATE_ACTIVATING);
5859 ps->state_clear(PG_STATE_CREATING);
5860 ps->state_clear(PG_STATE_PREMERGE);
5861
5862 bool merge_target;
5863 if (ps->pool.info.is_pending_merge(pgid, &merge_target)) {
5864 ps->state_set(PG_STATE_PEERED);
5865 ps->state_set(PG_STATE_PREMERGE);
5866
5867 if (ps->actingset.size() != ps->get_osdmap()->get_pg_size(pgid)) {
5868 if (merge_target) {
5869 pg_t src = pgid;
5870 src.set_ps(ps->pool.info.get_pg_num_pending());
5871 ceph_assert(src.get_parent() == pgid);
5872 pl->set_not_ready_to_merge_target(pgid, src);
5873 } else {
5874 pl->set_not_ready_to_merge_source(pgid);
5875 }
5876 }
5877 } else if (ps->actingset.size() < ps->pool.info.min_size) {
5878 ps->state_set(PG_STATE_PEERED);
5879 } else {
5880 ps->state_set(PG_STATE_ACTIVE);
5881 }
5882
5883 auto mnow = pl->get_mnow();
5884 if (ps->prior_readable_until_ub > mnow) {
5885 psdout(10) << " waiting for prior_readable_until_ub "
5886 << ps->prior_readable_until_ub << " > mnow " << mnow << dendl;
5887 ps->state_set(PG_STATE_WAIT);
5888 pl->queue_check_readable(
5889 ps->last_peering_reset,
5890 ps->prior_readable_until_ub - mnow);
5891 } else {
5892 psdout(10) << " mnow " << mnow << " >= prior_readable_until_ub "
5893 << ps->prior_readable_until_ub << dendl;
5894 }
5895
5896 if (ps->pool.info.has_flag(pg_pool_t::FLAG_CREATING)) {
5897 pl->send_pg_created(pgid);
5898 }
5899
5900 ps->info.history.last_epoch_started = ps->info.last_epoch_started;
5901 ps->info.history.last_interval_started = ps->info.last_interval_started;
5902 ps->dirty_info = true;
5903
5904 ps->share_pg_info();
5905 pl->publish_stats_to_osd();
5906
5907 pl->on_activate_complete();
5908
5909 return discard_event();
5910 }
5911
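// Octopus read leases: even once active, the PG is not readable until any
// prior interval's lease has expired. The handler above compares
// prior_readable_until_ub against the monotonic clock and, if needed, sets
// PG_STATE_WAIT and queues a CheckReadable for the remaining interval; the
// small reacts below keep the lease renewed and acknowledged.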
5912 boost::statechart::result PeeringState::Active::react(const RenewLease& rl)
5913 {
5914 DECLARE_LOCALS;
5915 ps->proc_renew_lease();
5916 return discard_event();
5917 }
5918
5919 boost::statechart::result PeeringState::Active::react(const MLeaseAck& la)
5920 {
5921 DECLARE_LOCALS;
5922 ps->proc_lease_ack(la.from, la.lease_ack);
5923 return discard_event();
5924 }
5925
5926
5927 boost::statechart::result PeeringState::Active::react(const CheckReadable &evt)
5928 {
5929 DECLARE_LOCALS;
5930 pl->recheck_readable();
5931 return discard_event();
5932 }
5933
5934 /*
5935 * update info.history.last_epoch_started ONLY after we and all
5936 * replicas have activated AND committed the activate transaction
5937 * (i.e. the peering results are stable on disk).
5938 */
5939 void PeeringState::Active::all_activated_and_committed()
5940 {
5941 DECLARE_LOCALS;
5942 psdout(10) << "all_activated_and_committed" << dendl;
5943 ceph_assert(ps->is_primary());
5944 ceph_assert(ps->peer_activated.size() == ps->acting_recovery_backfill.size());
5945 ceph_assert(!ps->acting_recovery_backfill.empty());
5946 ceph_assert(ps->blocked_by.empty());
5947
5948 if (HAVE_FEATURE(ps->upacting_features, SERVER_OCTOPUS)) {
5949 // this is overkill when the activation is quick, but when it is slow it
5950 // is important, because the lease was renewed by the activate itself but we
5951 // don't know how long ago that was, and simply scheduling now may leave
5952 // a gap in lease coverage. keep it simple and aggressively renew.
5953 ps->renew_lease(pl->get_mnow());
5954 ps->send_lease();
5955 ps->schedule_renew_lease();
5956 }
5957
5958 // Degraded?
5959 ps->update_calc_stats();
5960 if (ps->info.stats.stats.sum.num_objects_degraded) {
5961 ps->state_set(PG_STATE_DEGRADED);
5962 } else {
5963 ps->state_clear(PG_STATE_DEGRADED);
5964 }
5965
5966 post_event(PeeringState::AllReplicasActivated());
5967 }
5968
5969
5970 void PeeringState::Active::exit()
5971 {
5972 context< PeeringMachine >().log_exit(state_name, enter_time);
5973
5974
5975 DECLARE_LOCALS;
5976 pl->cancel_local_background_io_reservation();
5977
5978 ps->blocked_by.clear();
5979 ps->backfill_reserved = false;
5980 ps->state_clear(PG_STATE_ACTIVATING);
5981 ps->state_clear(PG_STATE_DEGRADED);
5982 ps->state_clear(PG_STATE_UNDERSIZED);
5983 ps->state_clear(PG_STATE_BACKFILL_TOOFULL);
5984 ps->state_clear(PG_STATE_BACKFILL_WAIT);
5985 ps->state_clear(PG_STATE_RECOVERY_WAIT);
5986 ps->state_clear(PG_STATE_RECOVERY_TOOFULL);
5987 utime_t dur = ceph_clock_now() - enter_time;
5988 pl->get_peering_perf().tinc(rs_active_latency, dur);
5989 pl->on_active_exit();
5990 }
5991
5992 /*------ReplicaActive-----*/
5993 PeeringState::ReplicaActive::ReplicaActive(my_context ctx)
5994 : my_base(ctx),
5995 NamedState(context< PeeringMachine >().state_history, "Started/ReplicaActive")
5996 {
5997 context< PeeringMachine >().log_enter(state_name);
5998
5999 DECLARE_LOCALS;
6000 ps->start_flush(context< PeeringMachine >().get_cur_transaction());
6001 }
6002
6003
6004 boost::statechart::result PeeringState::ReplicaActive::react(
6005 const Activate& actevt) {
6006 DECLARE_LOCALS;
6007 psdout(10) << "In ReplicaActive, about to call activate" << dendl;
6008 ps->activate(
6009 context< PeeringMachine >().get_cur_transaction(),
6010 actevt.activation_epoch,
6011 context< PeeringMachine >().get_recovery_ctx());
6012 psdout(10) << "Activate Finished" << dendl;
6013 return discard_event();
6014 }
6015
6016 boost::statechart::result PeeringState::ReplicaActive::react(
6017 const ActivateCommitted &evt)
6018 {
6019 DECLARE_LOCALS;
6020 psdout(10) << __func__ << " " << evt.epoch << " telling primary" << dendl;
6021
6022 auto &rctx = context<PeeringMachine>().get_recovery_ctx();
6023 auto epoch = ps->get_osdmap_epoch();
6024 pg_info_t i = ps->info;
6025 i.history.last_epoch_started = evt.activation_epoch;
6026 i.history.last_interval_started = i.history.same_interval_since;
6027 rctx.send_info(
6028 ps->get_primary().osd,
6029 spg_t(ps->info.pgid.pgid, ps->get_primary().shard),
6030 epoch,
6031 epoch,
6032 i,
6033 {}, /* lease */
6034 ps->get_lease_ack());
6035
6036 if (ps->actingset.size() >= ps->pool.info.min_size) {
6037 ps->state_set(PG_STATE_ACTIVE);
6038 } else {
6039 ps->state_set(PG_STATE_PEERED);
6040 }
6041 pl->on_activate_committed();
6042
6043 return discard_event();
6044 }
6045
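// A replica reports its own activation by sending the primary a pg_info_t
// whose history claims last_epoch_started = activation_epoch; the
// primary's Active::react(MInfoRec) counts these in peer_activated until
// everyone has committed.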
6046 boost::statechart::result PeeringState::ReplicaActive::react(const MLease& l)
6047 {
6048 DECLARE_LOCALS;
6049 spg_t spgid = context< PeeringMachine >().spgid;
6050 epoch_t epoch = pl->get_osdmap_epoch();
6051
6052 ps->proc_lease(l.lease);
6053 pl->send_cluster_message(
6054 ps->get_primary().osd,
6055 new MOSDPGLeaseAck(epoch,
6056 spg_t(spgid.pgid, ps->get_primary().shard),
6057 ps->get_lease_ack()),
6058 epoch);
6059 return discard_event();
6060 }
6061
6062 boost::statechart::result PeeringState::ReplicaActive::react(const MInfoRec& infoevt)
6063 {
6064 DECLARE_LOCALS;
6065 ps->proc_primary_info(context<PeeringMachine>().get_cur_transaction(),
6066 infoevt.info);
6067 return discard_event();
6068 }
6069
6070 boost::statechart::result PeeringState::ReplicaActive::react(const MLogRec& logevt)
6071 {
6072 DECLARE_LOCALS;
6073 psdout(10) << "received log from " << logevt.from << dendl;
6074 ObjectStore::Transaction &t = context<PeeringMachine>().get_cur_transaction();
6075 ps->merge_log(t, logevt.msg->info, logevt.msg->log, logevt.from);
6076 ceph_assert(ps->pg_log.get_head() == ps->info.last_update);
6077 if (logevt.msg->lease) {
6078 ps->proc_lease(*logevt.msg->lease);
6079 }
6080
6081 return discard_event();
6082 }
6083
6084 boost::statechart::result PeeringState::ReplicaActive::react(const MTrim& trim)
6085 {
6086 DECLARE_LOCALS;
6087 // primary is instructing us to trim
6088 ps->pg_log.trim(trim.trim_to, ps->info);
6089 ps->dirty_info = true;
6090 return discard_event();
6091 }
6092
6093 boost::statechart::result PeeringState::ReplicaActive::react(const ActMap&)
6094 {
6095 DECLARE_LOCALS;
6096 if (ps->should_send_notify() && ps->get_primary().osd >= 0) {
6097 ps->info.history.refresh_prior_readable_until_ub(
6098 pl->get_mnow(), ps->prior_readable_until_ub);
6099 context< PeeringMachine >().send_notify(
6100 ps->get_primary().osd,
6101 pg_notify_t(
6102 ps->get_primary().shard, ps->pg_whoami.shard,
6103 ps->get_osdmap_epoch(),
6104 ps->get_osdmap_epoch(),
6105 ps->info,
6106 ps->past_intervals));
6107 }
6108 return discard_event();
6109 }
6110
6111 boost::statechart::result PeeringState::ReplicaActive::react(
6112 const MQuery& query)
6113 {
6114 DECLARE_LOCALS;
6115 ps->fulfill_query(query, context<PeeringMachine>().get_recovery_ctx());
6116 return discard_event();
6117 }
6118
6119 boost::statechart::result PeeringState::ReplicaActive::react(const QueryState& q)
6120 {
6121 q.f->open_object_section("state");
6122 q.f->dump_string("name", state_name);
6123 q.f->dump_stream("enter_time") << enter_time;
6124 q.f->close_section();
6125 return forward_event();
6126 }
6127
6128 void PeeringState::ReplicaActive::exit()
6129 {
6130 context< PeeringMachine >().log_exit(state_name, enter_time);
6131 DECLARE_LOCALS;
6132 pl->unreserve_recovery_space();
6133
6134 pl->cancel_remote_recovery_reservation();
6135 utime_t dur = ceph_clock_now() - enter_time;
6136 pl->get_peering_perf().tinc(rs_replicaactive_latency, dur);
6137
6138 ps->min_last_complete_ondisk = eversion_t();
6139 }
6140
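// Stray: this OSD has the PG but is neither primary nor an active
// replica. It answers the primary's queries and, on receiving an
// authoritative log (MLogRec) or matching info (MInfoRec), activates and
// becomes ReplicaActive; if the pool is gone, it goes straight to deletion.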
6141 /*-------Stray---*/
6142 PeeringState::Stray::Stray(my_context ctx)
6143 : my_base(ctx),
6144 NamedState(context< PeeringMachine >().state_history, "Started/Stray")
6145 {
6146 context< PeeringMachine >().log_enter(state_name);
6147
6148
6149 DECLARE_LOCALS;
6150 ceph_assert(!ps->is_peered());
6151 ceph_assert(!ps->is_peering());
6152 ceph_assert(!ps->is_primary());
6153
6154 if (!ps->get_osdmap()->have_pg_pool(ps->info.pgid.pgid.pool())) {
6155 ldout(ps->cct,10) << __func__ << " pool is deleted" << dendl;
6156 post_event(DeleteStart());
6157 } else {
6158 ps->start_flush(context< PeeringMachine >().get_cur_transaction());
6159 }
6160 }
6161
6162 boost::statechart::result PeeringState::Stray::react(const MLogRec& logevt)
6163 {
6164 DECLARE_LOCALS;
6165 MOSDPGLog *msg = logevt.msg.get();
6166 psdout(10) << "got info+log from osd." << logevt.from << " " << msg->info << " " << msg->log << dendl;
6167
6168 ObjectStore::Transaction &t = context<PeeringMachine>().get_cur_transaction();
6169 if (msg->info.last_backfill == hobject_t()) {
6170 // restart backfill
6171 ps->info = msg->info;
6172 pl->on_info_history_change();
6173 ps->dirty_info = true;
6174 ps->dirty_big_info = true; // maybe.
6175
6176 PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
6177 ps->pg_log.reset_backfill_claim_log(msg->log, rollbacker.get());
6178
6179 ps->pg_log.reset_backfill();
6180 } else {
6181 ps->merge_log(t, msg->info, msg->log, logevt.from);
6182 }
6183 if (logevt.msg->lease) {
6184 ps->proc_lease(*logevt.msg->lease);
6185 }
6186
6187 ceph_assert(ps->pg_log.get_head() == ps->info.last_update);
6188
6189 post_event(Activate(logevt.msg->info.last_epoch_started));
6190 return transit<ReplicaActive>();
6191 }
6192
6193 boost::statechart::result PeeringState::Stray::react(const MInfoRec& infoevt)
6194 {
6195 DECLARE_LOCALS;
6196 psdout(10) << "got info from osd." << infoevt.from << " " << infoevt.info << dendl;
6197
6198 if (ps->info.last_update > infoevt.info.last_update) {
6199 // rewind divergent log entries
6200 ObjectStore::Transaction &t = context<PeeringMachine>().get_cur_transaction();
6201 ps->rewind_divergent_log(t, infoevt.info.last_update);
6202 ps->info.stats = infoevt.info.stats;
6203 ps->info.hit_set = infoevt.info.hit_set;
6204 }
6205
6206 if (infoevt.lease) {
6207 ps->proc_lease(*infoevt.lease);
6208 }
6209
6210 ceph_assert(infoevt.info.last_update == ps->info.last_update);
6211 ceph_assert(ps->pg_log.get_head() == ps->info.last_update);
6212
6213 post_event(Activate(infoevt.info.last_epoch_started));
6214 return transit<ReplicaActive>();
6215 }
6216
6217 boost::statechart::result PeeringState::Stray::react(const MQuery& query)
6218 {
6219 DECLARE_LOCALS;
6220 ps->fulfill_query(query, context<PeeringMachine>().get_recovery_ctx());
6221 return discard_event();
6222 }
6223
6224 boost::statechart::result PeeringState::Stray::react(const ActMap&)
6225 {
6226 DECLARE_LOCALS;
6227 if (ps->should_send_notify() && ps->get_primary().osd >= 0) {
6228 ps->info.history.refresh_prior_readable_until_ub(
6229 pl->get_mnow(), ps->prior_readable_until_ub);
6230 context< PeeringMachine >().send_notify(
6231 ps->get_primary().osd,
6232 pg_notify_t(
6233 ps->get_primary().shard, ps->pg_whoami.shard,
6234 ps->get_osdmap_epoch(),
6235 ps->get_osdmap_epoch(),
6236 ps->info,
6237 ps->past_intervals));
6238 }
6239 return discard_event();
6240 }
6241
6242 void PeeringState::Stray::exit()
6243 {
6244 context< PeeringMachine >().log_exit(state_name, enter_time);
6245 DECLARE_LOCALS;
6246 utime_t dur = ceph_clock_now() - enter_time;
6247 pl->get_peering_perf().tinc(rs_stray_latency, dur);
6248 }
6249
6250
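// Deletion pipeline: ToDelete -> WaitDeleteReserved -> Deleting. Removal
// competes with other background io via a local reservation taken at
// get_delete_priority(); if that priority changes on a new map, the ActMap
// handler below re-enters ToDelete to re-queue at the new priority.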
6251 /*--------ToDelete----------*/
6252 PeeringState::ToDelete::ToDelete(my_context ctx)
6253 : my_base(ctx),
6254 NamedState(context< PeeringMachine >().state_history, "Started/ToDelete")
6255 {
6256 context< PeeringMachine >().log_enter(state_name);
6257 DECLARE_LOCALS;
6258 pl->get_perf_logger().inc(l_osd_pg_removing);
6259 }
6260
6261 void PeeringState::ToDelete::exit()
6262 {
6263 context< PeeringMachine >().log_exit(state_name, enter_time);
6264 DECLARE_LOCALS;
6265 // note: on a successful removal, this path doesn't execute. see
6266 // _delete_some().
6267 pl->get_perf_logger().dec(l_osd_pg_removing);
6268
6269 pl->cancel_local_background_io_reservation();
6270 }
6271
6272 /*----WaitDeleteReserved----*/
6273 PeeringState::WaitDeleteReserved::WaitDeleteReserved(my_context ctx)
6274 : my_base(ctx),
6275 NamedState(context< PeeringMachine >().state_history,
6276 "Started/ToDelete/WaitDeleteReserved")
6277 {
6278 context< PeeringMachine >().log_enter(state_name);
6279 DECLARE_LOCALS;
6280 context< ToDelete >().priority = ps->get_delete_priority();
6281
6282 pl->cancel_local_background_io_reservation();
6283 pl->request_local_background_io_reservation(
6284 context<ToDelete>().priority,
6285 std::make_shared<PGPeeringEvent>(
6286 ps->get_osdmap_epoch(),
6287 ps->get_osdmap_epoch(),
6288 DeleteReserved()),
6289 std::make_shared<PGPeeringEvent>(
6290 ps->get_osdmap_epoch(),
6291 ps->get_osdmap_epoch(),
6292 DeleteInterrupted()));
6293 }
6294
6295 boost::statechart::result PeeringState::ToDelete::react(
6296 const ActMap& evt)
6297 {
6298 DECLARE_LOCALS;
6299 if (ps->get_delete_priority() != priority) {
6300 psdout(10) << __func__ << " delete priority changed, resetting"
6301 << dendl;
6302 return transit<ToDelete>();
6303 }
6304 return discard_event();
6305 }
6306
6307 void PeeringState::WaitDeleteReserved::exit()
6308 {
6309 context< PeeringMachine >().log_exit(state_name, enter_time);
6310 }
6311
6312 /*----Deleting-----*/
6313 PeeringState::Deleting::Deleting(my_context ctx)
6314 : my_base(ctx),
6315 NamedState(context< PeeringMachine >().state_history, "Started/ToDelete/Deleting")
6316 {
6317 start = ceph::mono_clock::now();
6318
6319 context< PeeringMachine >().log_enter(state_name);
6320
6321 DECLARE_LOCALS;
6322 ps->deleting = true;
6323 ObjectStore::Transaction &t = context<PeeringMachine>().get_cur_transaction();
6324
6325 // clear log
6326 PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
6327 ps->pg_log.roll_forward(rollbacker.get());
6328
6329 // adjust info to backfill
6330 ps->info.set_last_backfill(hobject_t());
6331 ps->pg_log.reset_backfill();
6332 ps->dirty_info = true;
6333
6334 pl->on_removal(t);
6335 }
6336
6337 boost::statechart::result PeeringState::Deleting::react(
6338 const DeleteSome& evt)
6339 {
6340 DECLARE_LOCALS;
6341 next = pl->do_delete_work(context<PeeringMachine>().get_cur_transaction(),
6342 next);
6343 return discard_event();
6344 }
6345
6346 void PeeringState::Deleting::exit()
6347 {
6348 context< PeeringMachine >().log_exit(state_name, enter_time);
6349 DECLARE_LOCALS;
6350 ps->deleting = false;
6351 pl->cancel_local_background_io_reservation();
6352 psdout(20) << "Deleting::" << __func__ << " " << this << " finished in "
6353 << ceph::mono_clock::now() - start
6354 << dendl;
6355 }
6356
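// GetInfo: first step of primary peering. Build the prior set from past
// intervals, send pg_query_t::INFO to every probe target we lack info
// from, and record the still-pending peers in blocked_by for reporting.
// Each new notify can move last_epoch_started forward, which rebuilds the
// prior set and may retire outstanding requests.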
6357 /*--------GetInfo---------*/
6358 PeeringState::GetInfo::GetInfo(my_context ctx)
6359 : my_base(ctx),
6360 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering/GetInfo")
6361 {
6362 context< PeeringMachine >().log_enter(state_name);
6363
6364
6365 DECLARE_LOCALS;
6366 ps->check_past_interval_bounds();
6367 ps->log_weirdness();
6368 PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
6369
6370 ceph_assert(ps->blocked_by.empty());
6371
6372 prior_set = ps->build_prior();
6373 ps->prior_readable_down_osds = prior_set.down;
6374 if (ps->prior_readable_down_osds.empty()) {
6375 psdout(10) << " no prior_set down osds, clearing prior_readable_until_ub"
6376 << dendl;
6377 ps->clear_prior_readable_until_ub();
6378 }
6379
6380 ps->reset_min_peer_features();
6381 get_infos();
6382 if (prior_set.pg_down) {
6383 post_event(IsDown());
6384 } else if (peer_info_requested.empty()) {
6385 post_event(GotInfo());
6386 }
6387 }
6388
6389 void PeeringState::GetInfo::get_infos()
6390 {
6391 DECLARE_LOCALS;
6392 PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
6393
6394 ps->blocked_by.clear();
6395 for (set<pg_shard_t>::const_iterator it = prior_set.probe.begin();
6396 it != prior_set.probe.end();
6397 ++it) {
6398 pg_shard_t peer = *it;
6399 if (peer == ps->pg_whoami) {
6400 continue;
6401 }
6402 if (ps->peer_info.count(peer)) {
6403 psdout(10) << " have osd." << peer << " info " << ps->peer_info[peer] << dendl;
6404 continue;
6405 }
6406 if (peer_info_requested.count(peer)) {
6407 psdout(10) << " already requested info from osd." << peer << dendl;
6408 ps->blocked_by.insert(peer.osd);
6409 } else if (!ps->get_osdmap()->is_up(peer.osd)) {
6410 psdout(10) << " not querying info from down osd." << peer << dendl;
6411 } else {
6412 psdout(10) << " querying info from osd." << peer << dendl;
6413 context< PeeringMachine >().send_query(
6414 peer.osd,
6415 pg_query_t(pg_query_t::INFO,
6416 it->shard, ps->pg_whoami.shard,
6417 ps->info.history,
6418 ps->get_osdmap_epoch()));
6419 peer_info_requested.insert(peer);
6420 ps->blocked_by.insert(peer.osd);
6421 }
6422 }
6423
6424 ps->check_prior_readable_down_osds(ps->get_osdmap());
6425
6426 pl->publish_stats_to_osd();
6427 }
6428
6429 boost::statechart::result PeeringState::GetInfo::react(const MNotifyRec& infoevt)
6430 {
6431
6432 DECLARE_LOCALS;
6433
6434 set<pg_shard_t>::iterator p = peer_info_requested.find(infoevt.from);
6435 if (p != peer_info_requested.end()) {
6436 peer_info_requested.erase(p);
6437 ps->blocked_by.erase(infoevt.from.osd);
6438 }
6439
6440 epoch_t old_start = ps->info.history.last_epoch_started;
6441 if (ps->proc_replica_info(
6442 infoevt.from, infoevt.notify.info, infoevt.notify.epoch_sent)) {
6443 // we got something new ...
6444 PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
6445 if (old_start < ps->info.history.last_epoch_started) {
6446 psdout(10) << " last_epoch_started moved forward, rebuilding prior" << dendl;
6447 prior_set = ps->build_prior();
6448 ps->prior_readable_down_osds = prior_set.down;
6449
6450 // filter out any osds that got dropped from the probe set from
6451 // peer_info_requested. this is less expensive than restarting
6452 // peering (which would re-probe everyone).
6453 set<pg_shard_t>::iterator p = peer_info_requested.begin();
6454 while (p != peer_info_requested.end()) {
6455 if (prior_set.probe.count(*p) == 0) {
6456 psdout(20) << " dropping osd." << *p << " from info_requested, no longer in probe set" << dendl;
6457 peer_info_requested.erase(p++);
6458 } else {
6459 ++p;
6460 }
6461 }
6462 get_infos();
6463 }
6464 psdout(20) << "Adding osd: " << infoevt.from.osd << " peer features: "
6465 << hex << infoevt.features << dec << dendl;
6466 ps->apply_peer_features(infoevt.features);
6467
6468 // are we done getting everything?
6469 if (peer_info_requested.empty() && !prior_set.pg_down) {
6470 psdout(20) << "Common peer features: " << hex << ps->get_min_peer_features() << dec << dendl;
6471 psdout(20) << "Common acting features: " << hex << ps->get_min_acting_features() << dec << dendl;
6472 psdout(20) << "Common upacting features: " << hex << ps->get_min_upacting_features() << dec << dendl;
6473 post_event(GotInfo());
6474 }
6475 }
6476 return discard_event();
6477 }
6478
6479 boost::statechart::result PeeringState::GetInfo::react(const QueryState& q)
6480 {
6481 DECLARE_LOCALS;
6482 q.f->open_object_section("state");
6483 q.f->dump_string("name", state_name);
6484 q.f->dump_stream("enter_time") << enter_time;
6485
6486 q.f->open_array_section("requested_info_from");
6487 for (set<pg_shard_t>::iterator p = peer_info_requested.begin();
6488 p != peer_info_requested.end();
6489 ++p) {
6490 q.f->open_object_section("osd");
6491 q.f->dump_stream("osd") << *p;
6492 if (ps->peer_info.count(*p)) {
6493 q.f->open_object_section("got_info");
6494 ps->peer_info[*p].dump(q.f);
6495 q.f->close_section();
6496 }
6497 q.f->close_section();
6498 }
6499 q.f->close_section();
6500
6501 q.f->close_section();
6502 return forward_event();
6503 }
6504
6505 void PeeringState::GetInfo::exit()
6506 {
6507 context< PeeringMachine >().log_exit(state_name, enter_time);
6508
6509 DECLARE_LOCALS;
6510 utime_t dur = ceph_clock_now() - enter_time;
6511 pl->get_peering_perf().tinc(rs_getinfo_latency, dur);
6512 ps->blocked_by.clear();
6513 }
6514
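// GetLog: choose_acting() has picked auth_log_shard, the peer holding the
// best authoritative log. We ask it for log entries starting from the
// oldest last_update among the acting_recovery_backfill peers that is
// still contiguous with its log (>= best.log_tail), so that the primary
// ends up with enough history to later catch those peers up.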
6515 /*------GetLog------------*/
6516 PeeringState::GetLog::GetLog(my_context ctx)
6517 : my_base(ctx),
6518 NamedState(
6519 context< PeeringMachine >().state_history,
6520 "Started/Primary/Peering/GetLog"),
6521 msg(0)
6522 {
6523 context< PeeringMachine >().log_enter(state_name);
6524
6525 DECLARE_LOCALS;
6526
6527 ps->log_weirdness();
6528
6529 // adjust acting?
6530 if (!ps->choose_acting(auth_log_shard, false,
6531 &context< Peering >().history_les_bound)) {
6532 if (!ps->want_acting.empty()) {
6533 post_event(NeedActingChange());
6534 } else {
6535 post_event(IsIncomplete());
6536 }
6537 return;
6538 }
6539
6540 // am i the best?
6541 if (auth_log_shard == ps->pg_whoami) {
6542 post_event(GotLog());
6543 return;
6544 }
6545
6546 const pg_info_t& best = ps->peer_info[auth_log_shard];
6547
6548 // am i broken?
6549 if (ps->info.last_update < best.log_tail) {
6550 psdout(10) << " not contiguous with osd." << auth_log_shard << ", down" << dendl;
6551 post_event(IsIncomplete());
6552 return;
6553 }
6554
6555 // how much log to request?
6556 eversion_t request_log_from = ps->info.last_update;
6557 ceph_assert(!ps->acting_recovery_backfill.empty());
6558 for (set<pg_shard_t>::iterator p = ps->acting_recovery_backfill.begin();
6559 p != ps->acting_recovery_backfill.end();
6560 ++p) {
6561 if (*p == ps->pg_whoami) continue;
6562 pg_info_t& ri = ps->peer_info[*p];
6563 if (ri.last_update < ps->info.log_tail && ri.last_update >= best.log_tail &&
6564 ri.last_update < request_log_from)
6565 request_log_from = ri.last_update;
6566 }
6567
6568 // how much?
6569 psdout(10) << " requesting log from osd." << auth_log_shard << dendl;
6570 context<PeeringMachine>().send_query(
6571 auth_log_shard.osd,
6572 pg_query_t(
6573 pg_query_t::LOG,
6574 auth_log_shard.shard, ps->pg_whoami.shard,
6575 request_log_from, ps->info.history,
6576 ps->get_osdmap_epoch()));
6577
6578 ceph_assert(ps->blocked_by.empty());
6579 ps->blocked_by.insert(auth_log_shard.osd);
6580 pl->publish_stats_to_osd();
6581 }
6582
6583 boost::statechart::result PeeringState::GetLog::react(const AdvMap& advmap)
6584 {
6585 // make sure our log source didn't go down. we need to check
6586 // explicitly because it may not be part of the prior set, which
6587 // means the Peering state check won't catch it going down.
6588 if (!advmap.osdmap->is_up(auth_log_shard.osd)) {
6589 psdout(10) << "GetLog: auth_log_shard osd."
6590 << auth_log_shard.osd << " went down" << dendl;
6591 post_event(advmap);
6592 return transit< Reset >();
6593 }
6594
6595 // let the Peering state do its checks.
6596 return forward_event();
6597 }
6598
6599 boost::statechart::result PeeringState::GetLog::react(const MLogRec& logevt)
6600 {
6601 ceph_assert(!msg);
6602 if (logevt.from != auth_log_shard) {
6603 psdout(10) << "GetLog: discarding log from "
6604 << "non-auth_log_shard osd." << logevt.from << dendl;
6605 return discard_event();
6606 }
6607 psdout(10) << "GetLog: received master log from osd."
6608 << logevt.from << dendl;
6609 msg = logevt.msg;
6610 post_event(GotLog());
6611 return discard_event();
6612 }
6613
6614 boost::statechart::result PeeringState::GetLog::react(const GotLog&)
6615 {
6616
6617 DECLARE_LOCALS;
6618 psdout(10) << "leaving GetLog" << dendl;
6619 if (msg) {
6620 psdout(10) << "processing master log" << dendl;
6621 ps->proc_master_log(context<PeeringMachine>().get_cur_transaction(),
6622 msg->info, msg->log, msg->missing,
6623 auth_log_shard);
6624 }
6625 ps->start_flush(context< PeeringMachine >().get_cur_transaction());
6626 return transit< GetMissing >();
6627 }
6628
6629 boost::statechart::result PeeringState::GetLog::react(const QueryState& q)
6630 {
6631 q.f->open_object_section("state");
6632 q.f->dump_string("name", state_name);
6633 q.f->dump_stream("enter_time") << enter_time;
6634 q.f->dump_stream("auth_log_shard") << auth_log_shard;
6635 q.f->close_section();
6636 return forward_event();
6637 }
6638
6639 void PeeringState::GetLog::exit()
6640 {
6641 context< PeeringMachine >().log_exit(state_name, enter_time);
6642
6643 DECLARE_LOCALS;
6644 utime_t dur = ceph_clock_now() - enter_time;
6645 pl->get_peering_perf().tinc(rs_getlog_latency, dur);
6646 ps->blocked_by.clear();
6647 }
6648
6649 /*------WaitActingChange--------*/
6650 PeeringState::WaitActingChange::WaitActingChange(my_context ctx)
6651 : my_base(ctx),
6652 NamedState(context< PeeringMachine >().state_history, "Started/Primary/WaitActingChange")
6653 {
6654 context< PeeringMachine >().log_enter(state_name);
6655 }
6656
6657 boost::statechart::result PeeringState::WaitActingChange::react(const AdvMap& advmap)
6658 {
6659 DECLARE_LOCALS;
6660 OSDMapRef osdmap = advmap.osdmap;
6661
6662 psdout(10) << "verifying want_acting " << ps->want_acting << " targets are still up" << dendl;
6663 for (vector<int>::iterator p = ps->want_acting.begin(); p != ps->want_acting.end(); ++p) {
6664 if (!osdmap->is_up(*p)) {
6665 psdout(10) << " want_acting target osd." << *p << " went down, resetting" << dendl;
6666 post_event(advmap);
6667 return transit< Reset >();
6668 }
6669 }
6670 return forward_event();
6671 }
6672
6673 boost::statechart::result PeeringState::WaitActingChange::react(const MLogRec& logevt)
6674 {
6675 psdout(10) << "In WaitActingChange, ignoring MLogRec" << dendl;
6676 return discard_event();
6677 }
6678
6679 boost::statechart::result PeeringState::WaitActingChange::react(const MInfoRec& evt)
6680 {
6681 psdout(10) << "In WaitActingChange, ignoring MInfoRec" << dendl;
6682 return discard_event();
6683 }
6684
6685 boost::statechart::result PeeringState::WaitActingChange::react(const MNotifyRec& evt)
6686 {
6687 psdout(10) << "In WaitActingChange, ignoring MNotifyRec" << dendl;
6688 return discard_event();
6689 }
6690
6691 boost::statechart::result PeeringState::WaitActingChange::react(const QueryState& q)
6692 {
6693 q.f->open_object_section("state");
6694 q.f->dump_string("name", state_name);
6695 q.f->dump_stream("enter_time") << enter_time;
6696 q.f->dump_string("comment", "waiting for pg acting set to change");
6697 q.f->close_section();
6698 return forward_event();
6699 }
6700
6701 void PeeringState::WaitActingChange::exit()
6702 {
6703 context< PeeringMachine >().log_exit(state_name, enter_time);
6704 DECLARE_LOCALS;
6705 utime_t dur = ceph_clock_now() - enter_time;
6706 pl->get_peering_perf().tinc(rs_waitactingchange_latency, dur);
6707 }
6708
6709 /*------Down--------*/
6710 PeeringState::Down::Down(my_context ctx)
6711 : my_base(ctx),
6712 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering/Down")
6713 {
6714 context< PeeringMachine >().log_enter(state_name);
6715 DECLARE_LOCALS;
6716
6717 ps->state_clear(PG_STATE_PEERING);
6718 ps->state_set(PG_STATE_DOWN);
6719
6720 auto &prior_set = context< Peering >().prior_set;
6721 ceph_assert(ps->blocked_by.empty());
6722 ps->blocked_by.insert(prior_set.down.begin(), prior_set.down.end());
6723 pl->publish_stats_to_osd();
6724 }
6725
6726 void PeeringState::Down::exit()
6727 {
6728 context< PeeringMachine >().log_exit(state_name, enter_time);
6729
6730 DECLARE_LOCALS;
6731
6732 ps->state_clear(PG_STATE_DOWN);
6733 utime_t dur = ceph_clock_now() - enter_time;
6734 pl->get_peering_perf().tinc(rs_down_latency, dur);
6735
6736 ps->blocked_by.clear();
6737 }
6738
6739 boost::statechart::result PeeringState::Down::react(const QueryState& q)
6740 {
6741 q.f->open_object_section("state");
6742 q.f->dump_string("name", state_name);
6743 q.f->dump_stream("enter_time") << enter_time;
6744 q.f->dump_string("comment",
6745 "not enough up instances of this PG to go active");
6746 q.f->close_section();
6747 return forward_event();
6748 }
6749
6750 boost::statechart::result PeeringState::Down::react(const MNotifyRec& infoevt)
6751 {
6752 DECLARE_LOCALS;
6753
6754 ceph_assert(ps->is_primary());
6755 epoch_t old_start = ps->info.history.last_epoch_started;
6756 if (!ps->peer_info.count(infoevt.from) &&
6757 ps->get_osdmap()->has_been_up_since(infoevt.from.osd, infoevt.notify.epoch_sent)) {
6758 ps->update_history(infoevt.notify.info.history);
6759 }
6760 // if we got something new to make pg escape down state
6761 if (ps->info.history.last_epoch_started > old_start) {
6762 psdout(10) << " last_epoch_started moved forward, re-enter getinfo" << dendl;
6763 ps->state_clear(PG_STATE_DOWN);
6764 ps->state_set(PG_STATE_PEERING);
6765 return transit< GetInfo >();
6766 }
6767
6768 return discard_event();
6769 }
6770
6771
6772 /*------Incomplete--------*/
6773 PeeringState::Incomplete::Incomplete(my_context ctx)
6774 : my_base(ctx),
6775 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering/Incomplete")
6776 {
6777 context< PeeringMachine >().log_enter(state_name);
6778 DECLARE_LOCALS;
6779
6780 ps->state_clear(PG_STATE_PEERING);
6781 ps->state_set(PG_STATE_INCOMPLETE);
6782
6783 PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
6784 ceph_assert(ps->blocked_by.empty());
6785 ps->blocked_by.insert(prior_set.down.begin(), prior_set.down.end());
6786 pl->publish_stats_to_osd();
6787 }
6788
6789 boost::statechart::result PeeringState::Incomplete::react(const AdvMap &advmap) {
6790 DECLARE_LOCALS;
6791 int64_t poolnum = ps->info.pgid.pool();
6792
6793 // Reset if min_size turned smaller than its previous value; the pg might now be able to go active
6794 if (!advmap.osdmap->have_pg_pool(poolnum) ||
6795 advmap.lastmap->get_pools().find(poolnum)->second.min_size >
6796 advmap.osdmap->get_pools().find(poolnum)->second.min_size) {
6797 post_event(advmap);
6798 return transit< Reset >();
6799 }
6800
6801 return forward_event();
6802 }
6803
6804 boost::statechart::result PeeringState::Incomplete::react(const MNotifyRec& notevt) {
6805 DECLARE_LOCALS;
6806 psdout(7) << "handle_pg_notify from osd." << notevt.from << dendl;
6807 if (ps->proc_replica_info(
6808 notevt.from, notevt.notify.info, notevt.notify.epoch_sent)) {
6809 // We got something new, try again!
6810 return transit< GetLog >();
6811 } else {
6812 return discard_event();
6813 }
6814 }
6815
6816 boost::statechart::result PeeringState::Incomplete::react(
6817 const QueryState& q)
6818 {
6819 q.f->open_object_section("state");
6820 q.f->dump_string("name", state_name);
6821 q.f->dump_stream("enter_time") << enter_time;
6822 q.f->dump_string("comment", "not enough complete instances of this PG");
6823 q.f->close_section();
6824 return forward_event();
6825 }
6826
6827 void PeeringState::Incomplete::exit()
6828 {
6829 context< PeeringMachine >().log_exit(state_name, enter_time);
6830
6831 DECLARE_LOCALS;
6832
6833 ps->state_clear(PG_STATE_INCOMPLETE);
6834 utime_t dur = ceph_clock_now() - enter_time;
6835 pl->get_peering_perf().tinc(rs_incomplete_latency, dur);
6836
6837 ps->blocked_by.clear();
6838 }
6839
6840 /*------GetMissing--------*/
6841 PeeringState::GetMissing::GetMissing(my_context ctx)
6842 : my_base(ctx),
6843 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering/GetMissing")
6844 {
6845 context< PeeringMachine >().log_enter(state_name);
6846
6847 DECLARE_LOCALS;
6848 ps->log_weirdness();
6849 ceph_assert(!ps->acting_recovery_backfill.empty());
6850 eversion_t since;
6851 for (set<pg_shard_t>::iterator i = ps->acting_recovery_backfill.begin();
6852 i != ps->acting_recovery_backfill.end();
6853 ++i) {
6854 if (*i == ps->get_primary()) continue;
6855 const pg_info_t& pi = ps->peer_info[*i];
6856 // reset this to make sure the pg_missing_t is initialized and
6857 // has the correct semantics even if we don't need to get a
6858 // missing set from a shard. This way later additions due to
6859 // lost+unfound deletes work properly.
6860 ps->peer_missing[*i].may_include_deletes = !ps->perform_deletes_during_peering();
6861
6862 if (pi.is_empty())
6863 continue; // no pg data, nothing divergent
6864
6865 if (pi.last_update < ps->pg_log.get_tail()) {
6866 psdout(10) << " osd." << *i << " is not contiguous, will restart backfill" << dendl;
6867 ps->peer_missing[*i].clear();
6868 continue;
6869 }
6870 if (pi.last_backfill == hobject_t()) {
6871 psdout(10) << " osd." << *i << " will fully backfill; can infer empty missing set" << dendl;
6872 ps->peer_missing[*i].clear();
6873 continue;
6874 }
6875
6876 if (pi.last_update == pi.last_complete && // peer has no missing
6877 pi.last_update == ps->info.last_update) { // peer is up to date
6878 // replica has no missing and a log identical to ours. no need to
6879 // pull anything.
6880 // FIXME: we can do better here. if last_update==last_complete we
6881 // can infer the rest!
6882 psdout(10) << " osd." << *i << " has no missing, identical log" << dendl;
6883 ps->peer_missing[*i].clear();
6884 continue;
6885 }
6886
6887 // We pull the log from the peer's last_epoch_started to ensure we
6888 // get enough log to detect divergent updates.
6889 since.epoch = pi.last_epoch_started;
6890 ceph_assert(pi.last_update >= ps->info.log_tail); // or else choose_acting() did a bad thing
6891 if (pi.log_tail <= since) {
6892 psdout(10) << " requesting log+missing since " << since << " from osd." << *i << dendl;
6893 context< PeeringMachine >().send_query(
6894 i->osd,
6895 pg_query_t(
6896 pg_query_t::LOG,
6897 i->shard, ps->pg_whoami.shard,
6898 since, ps->info.history,
6899 ps->get_osdmap_epoch()));
6900 } else {
6901 psdout(10) << " requesting fulllog+missing from osd." << *i
6902 << " (want since " << since << " < log.tail "
6903 << pi.log_tail << ")" << dendl;
6904 context< PeeringMachine >().send_query(
6905 i->osd, pg_query_t(
6906 pg_query_t::FULLLOG,
6907 i->shard, ps->pg_whoami.shard,
6908 ps->info.history, ps->get_osdmap_epoch()));
6909 }
6910 peer_missing_requested.insert(*i);
6911 ps->blocked_by.insert(i->osd);
6912 }
6913
6914 if (peer_missing_requested.empty()) {
6915 if (ps->need_up_thru) {
6916 psdout(10) << " still need up_thru update before going active"
6917 << dendl;
6918 post_event(NeedUpThru());
6919 return;
6920 }
6921
6922 // all good!
6923 post_event(Activate(ps->get_osdmap_epoch()));
6924 } else {
6925 pl->publish_stats_to_osd();
6926 }
6927 }
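// A hedged worked example of the log-request bound above: suppose a
// peer reports last_epoch_started=20 and log_tail=18'100. Then
// since=20'0, and log_tail <= since, so a pg_query_t::LOG request for
// entries since 20'0 is enough to detect any divergent updates. Had the
// peer's log_tail been 21'5 > 20'0, its log would not reach back to
// last_epoch_started, and pg_query_t::FULLLOG is requested instead.
// Every queried shard is also added to blocked_by until its reply
// arrives.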
6928
6929 boost::statechart::result PeeringState::GetMissing::react(const MLogRec& logevt)
6930 {
6931 DECLARE_LOCALS;
6932
6933 peer_missing_requested.erase(logevt.from);
6934 ps->proc_replica_log(logevt.msg->info, logevt.msg->log, logevt.msg->missing, logevt.from);
6935
6936 if (peer_missing_requested.empty()) {
6937 if (ps->need_up_thru) {
6938 psdout(10) << " still need up_thru update before going active"
6939 << dendl;
6940 post_event(NeedUpThru());
6941 } else {
6942 psdout(10) << "Got last missing, don't need missing "
6943 << "posting Activate" << dendl;
6944 post_event(Activate(ps->get_osdmap_epoch()));
6945 }
6946 }
6947 return discard_event();
6948 }
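// Each MLogRec drains one entry from peer_missing_requested; only once
// the last outstanding reply has been processed does the state decide
// between NeedUpThru (the osdmap must still record a new up_thru for
// this OSD) and Activate. Replies are always consumed here
// (discard_event) rather than deferred.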
6949
6950 boost::statechart::result PeeringState::GetMissing::react(const QueryState& q)
6951 {
6952 DECLARE_LOCALS;
6953 q.f->open_object_section("state");
6954 q.f->dump_string("name", state_name);
6955 q.f->dump_stream("enter_time") << enter_time;
6956
6957 q.f->open_array_section("peer_missing_requested");
6958 for (set<pg_shard_t>::iterator p = peer_missing_requested.begin();
6959 p != peer_missing_requested.end();
6960 ++p) {
6961 q.f->open_object_section("osd");
6962 q.f->dump_stream("osd") << *p;
6963 if (ps->peer_missing.count(*p)) {
6964 q.f->open_object_section("got_missing");
6965 ps->peer_missing[*p].dump(q.f);
6966 q.f->close_section();
6967 }
6968 q.f->close_section();
6969 }
6970 q.f->close_section();
6971
6972 q.f->close_section();
6973 return forward_event();
6974 }
6975
6976 void PeeringState::GetMissing::exit()
6977 {
6978 context< PeeringMachine >().log_exit(state_name, enter_time);
6979
6980 DECLARE_LOCALS;
6981 utime_t dur = ceph_clock_now() - enter_time;
6982 pl->get_peering_perf().tinc(rs_getmissing_latency, dur);
6983 ps->blocked_by.clear();
6984 }
6985
6986 /*------WaitUpThru--------*/
6987 PeeringState::WaitUpThru::WaitUpThru(my_context ctx)
6988 : my_base(ctx),
6989 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering/WaitUpThru")
6990 {
6991 context< PeeringMachine >().log_enter(state_name);
6992 }
6993
6994 boost::statechart::result PeeringState::WaitUpThru::react(const ActMap& am)
6995 {
6996 DECLARE_LOCALS;
6997 if (!ps->need_up_thru) {
6998 post_event(Activate(ps->get_osdmap_epoch()));
6999 }
7000 return forward_event();
7001 }
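// ActMap fires as new osdmaps are processed. Broadly, once a map shows
// that the monitors have recorded an up_thru for this OSD at or beyond
// the current interval's start, need_up_thru is cleared and the PG can
// finally activate. The event is still forwarded so outer states see
// it too.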
7002
7003 boost::statechart::result PeeringState::WaitUpThru::react(const MLogRec& logevt)
7004 {
7005 DECLARE_LOCALS;
7006 psdout(10) << "Noting missing from osd." << logevt.from << dendl;
7007 ps->peer_missing[logevt.from].claim(logevt.msg->missing);
7008 ps->peer_info[logevt.from] = logevt.msg->info;
7009 return discard_event();
7010 }
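// Logs that arrive while waiting on up_thru are not wasted: the missing
// set is claimed (contents moved, avoiding a copy) into peer_missing and
// the peer's info is recorded, so activation can proceed immediately
// once up_thru is reflected in the osdmap.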
7011
7012 boost::statechart::result PeeringState::WaitUpThru::react(const QueryState& q)
7013 {
7014 q.f->open_object_section("state");
7015 q.f->dump_string("name", state_name);
7016 q.f->dump_stream("enter_time") << enter_time;
7017 q.f->dump_string("comment", "waiting for osdmap to reflect a new up_thru for this osd");
7018 q.f->close_section();
7019 return forward_event();
7020 }
7021
7022 void PeeringState::WaitUpThru::exit()
7023 {
7024 context< PeeringMachine >().log_exit(state_name, enter_time);
7025 DECLARE_LOCALS;
7026 utime_t dur = ceph_clock_now() - enter_time;
7027 pl->get_peering_perf().tinc(rs_waitupthru_latency, dur);
7028 }
7029
7030 /*----PeeringState::PeeringMachine Methods-----*/
7031 #undef dout_prefix
7032 #define dout_prefix dpp->gen_prefix(*_dout)
7033
7034 void PeeringState::PeeringMachine::log_enter(const char *state_name)
7035 {
7036 DECLARE_LOCALS;
7037 psdout(5) << "enter " << state_name << dendl;
7038 pl->log_state_enter(state_name);
7039 }
7040
7041 void PeeringState::PeeringMachine::log_exit(const char *state_name, utime_t enter_time)
7042 {
7043 DECLARE_LOCALS;
7044 utime_t dur = ceph_clock_now() - enter_time;
7045 psdout(5) << "exit " << state_name << " " << dur << " " << event_count << " " << event_time << dendl;
7046 pl->log_state_exit(state_name, enter_time, event_count, event_time);
7047 event_count = 0;
7048 event_time = utime_t();
7049 }
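// log_enter/log_exit bracket every state of the machine: on exit, the
// time spent in the state, the number of events handled there
// (event_count) and the time spent handling them (event_time) are
// reported to the listener, then both counters reset for the next state.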
7050
7051 ostream &operator<<(ostream &out, const PeeringState &ps) {
7052 out << "pg[" << ps.info
7053 << " " << pg_vector_string(ps.up);
7054 if (ps.acting != ps.up)
7055 out << "/" << pg_vector_string(ps.acting);
7056 if (ps.is_ec_pg())
7057 out << "p" << ps.get_primary();
7058 if (!ps.async_recovery_targets.empty())
7059 out << " async=[" << ps.async_recovery_targets << "]";
7060 if (!ps.backfill_targets.empty())
7061 out << " backfill=[" << ps.backfill_targets << "]";
7062 out << " r=" << ps.get_role();
7063 out << " lpr=" << ps.get_last_peering_reset();
7064
7065 if (ps.deleting)
7066 out << " DELETING";
7067
7068 if (!ps.past_intervals.empty()) {
7069 out << " pi=[" << ps.past_intervals.get_bounds()
7070 << ")/" << ps.past_intervals.size();
7071 }
7072
7073 if (ps.is_peered()) {
7074 if (ps.last_update_ondisk != ps.info.last_update)
7075 out << " luod=" << ps.last_update_ondisk;
7076 if (ps.last_update_applied != ps.info.last_update)
7077 out << " lua=" << ps.last_update_applied;
7078 }
7079
7080 if (ps.pg_log.get_tail() != ps.info.log_tail ||
7081 ps.pg_log.get_head() != ps.info.last_update)
7082 out << " (info mismatch, " << ps.pg_log.get_log() << ")";
7083
7084 if (!ps.pg_log.get_log().empty()) {
7085 if ((ps.pg_log.get_log().log.begin()->version <= ps.pg_log.get_tail())) {
7086 out << " (log bound mismatch, actual=["
7087 << ps.pg_log.get_log().log.begin()->version << ","
7088 << ps.pg_log.get_log().log.rbegin()->version << "]";
7089 out << ")";
7090 }
7091 }
7092
7093 out << " crt=" << ps.pg_log.get_can_rollback_to();
7094
7095 if (ps.last_complete_ondisk != ps.info.last_complete)
7096 out << " lcod " << ps.last_complete_ondisk;
7097
7098 out << " mlcod " << ps.min_last_complete_ondisk;
7099
7100 out << " " << pg_state_string(ps.get_state());
7101 if (ps.should_send_notify())
7102 out << " NOTIFY";
7103
7104 if (ps.prior_readable_until_ub != ceph::signedspan::zero()) {
7105 out << " pruub " << ps.prior_readable_until_ub
7106 << "@" << ps.get_prior_readable_down_osds();
7107 }
7108 return out;
7109 }
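// A hedged sketch of what this operator renders for a healthy 3-OSD
// replicated PG (the leading "pg[1.4( ... )" part comes from
// pg_info_t's own operator<<, elided here):
//
//   pg[1.4( ... ) [3,1,2] r=0 lpr=40 pi=[30,40)/1 crt=40'12
//       mlcod 40'11 active+clean
//
// Optional fields such as /acting (when acting differs from up),
// DELETING, luod/lua, lcod, NOTIFY and pruub appear only when the
// corresponding condition in the code above holds.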