]>
Commit | Line | Data |
---|---|---|
9f95a23c TL |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | ||
4 | #include "PGPeeringEvent.h" | |
5 | #include "common/ceph_releases.h" | |
6 | #include "common/dout.h" | |
7 | #include "PeeringState.h" | |
8 | ||
9 | #include "messages/MOSDPGRemove.h" | |
10 | #include "messages/MBackfillReserve.h" | |
11 | #include "messages/MRecoveryReserve.h" | |
12 | #include "messages/MOSDScrubReserve.h" | |
13 | #include "messages/MOSDPGInfo.h" | |
14 | #include "messages/MOSDPGInfo2.h" | |
15 | #include "messages/MOSDPGTrim.h" | |
16 | #include "messages/MOSDPGLog.h" | |
17 | #include "messages/MOSDPGNotify.h" | |
18 | #include "messages/MOSDPGNotify2.h" | |
19 | #include "messages/MOSDPGQuery.h" | |
20 | #include "messages/MOSDPGQuery2.h" | |
21 | #include "messages/MOSDPGLease.h" | |
22 | #include "messages/MOSDPGLeaseAck.h" | |
23 | ||
24 | #define dout_context cct | |
25 | #define dout_subsys ceph_subsys_osd | |
26 | ||
f67539c2 TL |
27 | using std::dec; |
28 | using std::hex; | |
29 | using std::make_pair; | |
30 | using std::map; | |
31 | using std::ostream; | |
32 | using std::pair; | |
33 | using std::set; | |
34 | using std::stringstream; | |
35 | using std::vector; | |
36 | ||
37 | using ceph::Formatter; | |
38 | using ceph::make_message; | |
39 | ||
9f95a23c TL |
40 | BufferedRecoveryMessages::BufferedRecoveryMessages( |
41 | ceph_release_t r, | |
42 | PeeringCtx &ctx) | |
43 | : require_osd_release(r) { | |
44 | // steal messages from ctx | |
45 | message_map.swap(ctx.message_map); | |
46 | } | |
47 | ||
48 | void BufferedRecoveryMessages::send_notify(int to, const pg_notify_t &n) | |
49 | { | |
50 | if (require_osd_release >= ceph_release_t::octopus) { | |
51 | spg_t pgid(n.info.pgid.pgid, n.to); | |
52 | send_osd_message(to, make_message<MOSDPGNotify2>(pgid, n)); | |
53 | } else { | |
54 | send_osd_message(to, make_message<MOSDPGNotify>(n.epoch_sent, vector{n})); | |
55 | } | |
56 | } | |
57 | ||
58 | void BufferedRecoveryMessages::send_query( | |
59 | int to, | |
60 | spg_t to_spgid, | |
61 | const pg_query_t &q) | |
62 | { | |
63 | if (require_osd_release >= ceph_release_t::octopus) { | |
64 | send_osd_message(to, | |
65 | make_message<MOSDPGQuery2>(to_spgid, q)); | |
66 | } else { | |
67 | auto m = make_message<MOSDPGQuery>( | |
68 | q.epoch_sent, | |
69 | MOSDPGQuery::pg_list_t{{to_spgid, q}}); | |
70 | send_osd_message(to, m); | |
71 | } | |
72 | } | |
73 | ||
74 | void BufferedRecoveryMessages::send_info( | |
75 | int to, | |
76 | spg_t to_spgid, | |
77 | epoch_t min_epoch, | |
78 | epoch_t cur_epoch, | |
79 | const pg_info_t &info, | |
80 | std::optional<pg_lease_t> lease, | |
81 | std::optional<pg_lease_ack_t> lease_ack) | |
82 | { | |
83 | if (require_osd_release >= ceph_release_t::octopus) { | |
84 | send_osd_message( | |
85 | to, | |
86 | make_message<MOSDPGInfo2>( | |
87 | to_spgid, | |
88 | info, | |
89 | cur_epoch, | |
90 | min_epoch, | |
91 | lease, | |
92 | lease_ack) | |
93 | ); | |
94 | } else { | |
95 | send_osd_message( | |
96 | to, | |
97 | make_message<MOSDPGInfo>( | |
98 | cur_epoch, | |
99 | vector{pg_notify_t{to_spgid.shard, | |
100 | info.pgid.shard, | |
101 | min_epoch, cur_epoch, | |
102 | info, PastIntervals{}}}) | |
103 | ); | |
104 | } | |
105 | } | |
106 | ||
f67539c2 | 107 | void PGPool::update(OSDMapRef map) |
9f95a23c TL |
108 | { |
109 | const pg_pool_t *pi = map->get_pg_pool(id); | |
110 | if (!pi) { | |
111 | return; // pool has been deleted | |
112 | } | |
113 | info = *pi; | |
114 | name = map->get_pool_name(id); | |
115 | ||
116 | bool updated = false; | |
117 | if ((map->get_epoch() != cached_epoch + 1) || | |
118 | (pi->get_snap_epoch() == map->get_epoch())) { | |
119 | updated = true; | |
120 | } | |
121 | ||
9f95a23c TL |
122 | if (info.is_pool_snaps_mode() && updated) { |
123 | snapc = pi->get_snap_context(); | |
124 | } | |
125 | cached_epoch = map->get_epoch(); | |
126 | } | |
127 | ||
128 | /*-------------Peering State Helpers----------------*/ | |
129 | #undef dout_prefix | |
130 | #define dout_prefix (dpp->gen_prefix(*_dout)) | |
131 | #undef psdout | |
132 | #define psdout(x) ldout(cct, x) | |
133 | ||
134 | PeeringState::PeeringState( | |
135 | CephContext *cct, | |
136 | pg_shard_t pg_whoami, | |
137 | spg_t spgid, | |
138 | const PGPool &_pool, | |
139 | OSDMapRef curmap, | |
140 | DoutPrefixProvider *dpp, | |
141 | PeeringListener *pl) | |
142 | : state_history(*pl), | |
143 | cct(cct), | |
144 | spgid(spgid), | |
145 | dpp(dpp), | |
146 | pl(pl), | |
147 | orig_ctx(0), | |
148 | osdmap_ref(curmap), | |
149 | pool(_pool), | |
150 | pg_whoami(pg_whoami), | |
151 | info(spgid), | |
152 | pg_log(cct), | |
153 | missing_loc(spgid, this, dpp, cct), | |
154 | machine(this, cct, spgid, dpp, pl, &state_history) | |
155 | { | |
156 | machine.initiate(); | |
157 | } | |
158 | ||
159 | void PeeringState::start_handle(PeeringCtx *new_ctx) { | |
160 | ceph_assert(!rctx); | |
161 | ceph_assert(!orig_ctx); | |
162 | orig_ctx = new_ctx; | |
163 | if (new_ctx) { | |
164 | if (messages_pending_flush) { | |
165 | rctx.emplace(*messages_pending_flush, *new_ctx); | |
166 | } else { | |
167 | rctx.emplace(*new_ctx); | |
168 | } | |
169 | rctx->start_time = ceph_clock_now(); | |
170 | } | |
171 | } | |
172 | ||
173 | void PeeringState::begin_block_outgoing() { | |
174 | ceph_assert(!messages_pending_flush); | |
175 | ceph_assert(orig_ctx); | |
176 | ceph_assert(rctx); | |
177 | messages_pending_flush = BufferedRecoveryMessages( | |
178 | orig_ctx->require_osd_release); | |
179 | rctx.emplace(*messages_pending_flush, *orig_ctx); | |
180 | } | |
181 | ||
182 | void PeeringState::clear_blocked_outgoing() { | |
183 | ceph_assert(orig_ctx); | |
184 | ceph_assert(rctx); | |
185 | messages_pending_flush = std::optional<BufferedRecoveryMessages>(); | |
186 | } | |
187 | ||
188 | void PeeringState::end_block_outgoing() { | |
189 | ceph_assert(messages_pending_flush); | |
190 | ceph_assert(orig_ctx); | |
191 | ceph_assert(rctx); | |
192 | ||
193 | orig_ctx->accept_buffered_messages(*messages_pending_flush); | |
194 | rctx.emplace(*orig_ctx); | |
195 | messages_pending_flush = std::optional<BufferedRecoveryMessages>(); | |
196 | } | |
197 | ||
198 | void PeeringState::end_handle() { | |
199 | if (rctx) { | |
200 | utime_t dur = ceph_clock_now() - rctx->start_time; | |
201 | machine.event_time += dur; | |
202 | } | |
203 | ||
204 | machine.event_count++; | |
205 | rctx = std::nullopt; | |
206 | orig_ctx = NULL; | |
207 | } | |
208 | ||
209 | void PeeringState::check_recovery_sources(const OSDMapRef& osdmap) | |
210 | { | |
211 | /* | |
212 | * check that any peers we are planning to (or currently) pulling | |
213 | * objects from are dealt with. | |
214 | */ | |
215 | missing_loc.check_recovery_sources(osdmap); | |
216 | pl->check_recovery_sources(osdmap); | |
217 | ||
f67539c2 | 218 | for (auto i = peer_log_requested.begin(); i != peer_log_requested.end();) { |
9f95a23c TL |
219 | if (!osdmap->is_up(i->osd)) { |
220 | psdout(10) << "peer_log_requested removing " << *i << dendl; | |
221 | peer_log_requested.erase(i++); | |
222 | } else { | |
223 | ++i; | |
224 | } | |
225 | } | |
226 | ||
f67539c2 TL |
227 | for (auto i = peer_missing_requested.begin(); |
228 | i != peer_missing_requested.end();) { | |
9f95a23c TL |
229 | if (!osdmap->is_up(i->osd)) { |
230 | psdout(10) << "peer_missing_requested removing " << *i << dendl; | |
231 | peer_missing_requested.erase(i++); | |
232 | } else { | |
233 | ++i; | |
234 | } | |
235 | } | |
236 | } | |
237 | ||
238 | void PeeringState::update_history(const pg_history_t& new_history) | |
239 | { | |
240 | auto mnow = pl->get_mnow(); | |
241 | info.history.refresh_prior_readable_until_ub(mnow, prior_readable_until_ub); | |
242 | if (info.history.merge(new_history)) { | |
243 | psdout(20) << __func__ << " advanced history from " << new_history << dendl; | |
244 | dirty_info = true; | |
245 | if (info.history.last_epoch_clean >= info.history.same_interval_since) { | |
246 | psdout(20) << __func__ << " clearing past_intervals" << dendl; | |
247 | past_intervals.clear(); | |
248 | dirty_big_info = true; | |
249 | } | |
250 | prior_readable_until_ub = info.history.get_prior_readable_until_ub(mnow); | |
251 | if (prior_readable_until_ub != ceph::signedspan::zero()) { | |
252 | dout(20) << __func__ | |
253 | << " prior_readable_until_ub " << prior_readable_until_ub | |
254 | << " (mnow " << mnow << " + " | |
255 | << info.history.prior_readable_until_ub << ")" << dendl; | |
256 | } | |
257 | } | |
258 | pl->on_info_history_change(); | |
259 | } | |
260 | ||
f67539c2 TL |
261 | hobject_t PeeringState::earliest_backfill() const |
262 | { | |
263 | hobject_t e = hobject_t::get_max(); | |
264 | for (const pg_shard_t& bt : get_backfill_targets()) { | |
265 | const pg_info_t &pi = get_peer_info(bt); | |
266 | e = std::min(pi.last_backfill, e); | |
267 | } | |
268 | return e; | |
269 | } | |
270 | ||
9f95a23c TL |
271 | void PeeringState::purge_strays() |
272 | { | |
273 | if (is_premerge()) { | |
274 | psdout(10) << "purge_strays " << stray_set << " but premerge, doing nothing" | |
275 | << dendl; | |
276 | return; | |
277 | } | |
278 | if (cct->_conf.get_val<bool>("osd_debug_no_purge_strays")) { | |
279 | return; | |
280 | } | |
281 | psdout(10) << "purge_strays " << stray_set << dendl; | |
282 | ||
283 | bool removed = false; | |
f67539c2 | 284 | for (auto p = stray_set.begin(); p != stray_set.end(); ++p) { |
9f95a23c TL |
285 | ceph_assert(!is_acting_recovery_backfill(*p)); |
286 | if (get_osdmap()->is_up(p->osd)) { | |
287 | psdout(10) << "sending PGRemove to osd." << *p << dendl; | |
288 | vector<spg_t> to_remove; | |
289 | to_remove.push_back(spg_t(info.pgid.pgid, p->shard)); | |
f67539c2 | 290 | auto m = make_message<MOSDPGRemove>( |
9f95a23c TL |
291 | get_osdmap_epoch(), |
292 | to_remove); | |
293 | pl->send_cluster_message(p->osd, m, get_osdmap_epoch()); | |
294 | } else { | |
295 | psdout(10) << "not sending PGRemove to down osd." << *p << dendl; | |
296 | } | |
297 | peer_missing.erase(*p); | |
298 | peer_info.erase(*p); | |
299 | missing_loc.remove_stray_recovery_sources(*p); | |
300 | peer_purged.insert(*p); | |
301 | removed = true; | |
302 | } | |
303 | ||
304 | // if we removed anyone, update peers (which include peer_info) | |
305 | if (removed) | |
306 | update_heartbeat_peers(); | |
307 | ||
308 | stray_set.clear(); | |
309 | ||
310 | // clear _requested maps; we may have to peer() again if we discover | |
311 | // (more) stray content | |
312 | peer_log_requested.clear(); | |
313 | peer_missing_requested.clear(); | |
314 | } | |
315 | ||
f67539c2 TL |
316 | void PeeringState::query_unfound(Formatter *f, string state) |
317 | { | |
318 | psdout(20) << "Enter PeeringState common QueryUnfound" << dendl; | |
319 | { | |
320 | f->dump_string("state", state); | |
321 | f->dump_bool("available_might_have_unfound", true); | |
322 | f->open_array_section("might_have_unfound"); | |
323 | for (auto p = might_have_unfound.begin(); | |
324 | p != might_have_unfound.end(); | |
325 | ++p) { | |
326 | if (peer_missing.count(*p)) { | |
327 | ; // Ignore already probed OSDs | |
328 | } else { | |
329 | f->open_object_section("osd"); | |
330 | f->dump_stream("osd") << *p; | |
331 | if (peer_missing_requested.count(*p)) { | |
332 | f->dump_string("status", "querying"); | |
333 | } else if (!get_osdmap()->is_up(p->osd)) { | |
334 | f->dump_string("status", "osd is down"); | |
335 | } else { | |
336 | f->dump_string("status", "not queried"); | |
337 | } | |
338 | f->close_section(); | |
339 | } | |
340 | } | |
341 | f->close_section(); | |
342 | } | |
343 | psdout(20) << "Exit PeeringState common QueryUnfound" << dendl; | |
344 | return; | |
345 | } | |
9f95a23c TL |
346 | |
347 | bool PeeringState::proc_replica_info( | |
348 | pg_shard_t from, const pg_info_t &oinfo, epoch_t send_epoch) | |
349 | { | |
f67539c2 | 350 | auto p = peer_info.find(from); |
9f95a23c TL |
351 | if (p != peer_info.end() && p->second.last_update == oinfo.last_update) { |
352 | psdout(10) << " got dup osd." << from << " info " | |
353 | << oinfo << ", identical to ours" << dendl; | |
354 | return false; | |
355 | } | |
356 | ||
357 | if (!get_osdmap()->has_been_up_since(from.osd, send_epoch)) { | |
358 | psdout(10) << " got info " << oinfo << " from down osd." << from | |
359 | << " discarding" << dendl; | |
360 | return false; | |
361 | } | |
362 | ||
363 | psdout(10) << " got osd." << from << " " << oinfo << dendl; | |
364 | ceph_assert(is_primary()); | |
365 | peer_info[from] = oinfo; | |
366 | might_have_unfound.insert(from); | |
367 | ||
368 | update_history(oinfo.history); | |
369 | ||
370 | // stray? | |
371 | if (!is_up(from) && !is_acting(from)) { | |
372 | psdout(10) << " osd." << from << " has stray content: " << oinfo << dendl; | |
373 | stray_set.insert(from); | |
374 | if (is_clean()) { | |
375 | purge_strays(); | |
376 | } | |
377 | } | |
378 | ||
379 | // was this a new info? if so, update peers! | |
380 | if (p == peer_info.end()) | |
381 | update_heartbeat_peers(); | |
382 | ||
383 | return true; | |
384 | } | |
385 | ||
386 | ||
387 | void PeeringState::remove_down_peer_info(const OSDMapRef &osdmap) | |
388 | { | |
389 | // Remove any downed osds from peer_info | |
390 | bool removed = false; | |
f67539c2 | 391 | auto p = peer_info.begin(); |
9f95a23c TL |
392 | while (p != peer_info.end()) { |
393 | if (!osdmap->is_up(p->first.osd)) { | |
394 | psdout(10) << " dropping down osd." << p->first << " info " << p->second << dendl; | |
395 | peer_missing.erase(p->first); | |
396 | peer_log_requested.erase(p->first); | |
397 | peer_missing_requested.erase(p->first); | |
9f95a23c TL |
398 | peer_info.erase(p++); |
399 | removed = true; | |
400 | } else | |
401 | ++p; | |
402 | } | |
403 | ||
522d829b TL |
404 | // Remove any downed osds from peer_purged so we can re-purge if necessary |
405 | auto it = peer_purged.begin(); | |
406 | while (it != peer_purged.end()) { | |
407 | if (!osdmap->is_up(it->osd)) { | |
408 | psdout(10) << " dropping down osd." << *it << " from peer_purged" << dendl; | |
409 | peer_purged.erase(it++); | |
410 | } else { | |
411 | ++it; | |
412 | } | |
413 | } | |
414 | ||
9f95a23c TL |
415 | // if we removed anyone, update peers (which include peer_info) |
416 | if (removed) | |
417 | update_heartbeat_peers(); | |
418 | ||
419 | check_recovery_sources(osdmap); | |
420 | } | |
421 | ||
422 | void PeeringState::update_heartbeat_peers() | |
423 | { | |
424 | if (!is_primary()) | |
425 | return; | |
426 | ||
427 | set<int> new_peers; | |
428 | for (unsigned i=0; i<acting.size(); i++) { | |
429 | if (acting[i] != CRUSH_ITEM_NONE) | |
430 | new_peers.insert(acting[i]); | |
431 | } | |
432 | for (unsigned i=0; i<up.size(); i++) { | |
433 | if (up[i] != CRUSH_ITEM_NONE) | |
434 | new_peers.insert(up[i]); | |
435 | } | |
f67539c2 | 436 | for (auto p = peer_info.begin(); p != peer_info.end(); ++p) { |
9f95a23c TL |
437 | new_peers.insert(p->first.osd); |
438 | } | |
439 | pl->update_heartbeat_peers(std::move(new_peers)); | |
440 | } | |
441 | ||
442 | void PeeringState::write_if_dirty(ObjectStore::Transaction& t) | |
443 | { | |
444 | pl->prepare_write( | |
445 | info, | |
446 | last_written_info, | |
447 | past_intervals, | |
448 | pg_log, | |
449 | dirty_info, | |
450 | dirty_big_info, | |
451 | last_persisted_osdmap < get_osdmap_epoch(), | |
452 | t); | |
453 | if (dirty_info || dirty_big_info) { | |
454 | last_persisted_osdmap = get_osdmap_epoch(); | |
455 | last_written_info = info; | |
456 | dirty_info = false; | |
457 | dirty_big_info = false; | |
458 | } | |
459 | } | |
460 | ||
461 | void PeeringState::advance_map( | |
462 | OSDMapRef osdmap, OSDMapRef lastmap, | |
463 | vector<int>& newup, int up_primary, | |
464 | vector<int>& newacting, int acting_primary, | |
465 | PeeringCtx &rctx) | |
466 | { | |
467 | ceph_assert(lastmap == osdmap_ref); | |
468 | psdout(10) << "handle_advance_map " | |
469 | << newup << "/" << newacting | |
470 | << " -- " << up_primary << "/" << acting_primary | |
471 | << dendl; | |
472 | ||
473 | update_osdmap_ref(osdmap); | |
f67539c2 | 474 | pool.update(osdmap); |
9f95a23c TL |
475 | |
476 | AdvMap evt( | |
477 | osdmap, lastmap, newup, up_primary, | |
478 | newacting, acting_primary); | |
479 | handle_event(evt, &rctx); | |
480 | if (pool.info.last_change == osdmap_ref->get_epoch()) { | |
481 | pl->on_pool_change(); | |
482 | } | |
f67539c2 | 483 | readable_interval = pool.get_readable_interval(cct->_conf); |
9f95a23c TL |
484 | last_require_osd_release = osdmap->require_osd_release; |
485 | } | |
486 | ||
487 | void PeeringState::activate_map(PeeringCtx &rctx) | |
488 | { | |
489 | psdout(10) << __func__ << dendl; | |
490 | ActMap evt; | |
491 | handle_event(evt, &rctx); | |
492 | if (osdmap_ref->get_epoch() - last_persisted_osdmap > | |
493 | cct->_conf->osd_pg_epoch_persisted_max_stale) { | |
494 | psdout(20) << __func__ << ": Dirtying info: last_persisted is " | |
495 | << last_persisted_osdmap | |
496 | << " while current is " << osdmap_ref->get_epoch() << dendl; | |
497 | dirty_info = true; | |
498 | } else { | |
499 | psdout(20) << __func__ << ": Not dirtying info: last_persisted is " | |
500 | << last_persisted_osdmap | |
501 | << " while current is " << osdmap_ref->get_epoch() << dendl; | |
502 | } | |
503 | write_if_dirty(rctx.transaction); | |
504 | ||
f67539c2 TL |
505 | if (get_osdmap()->check_new_blocklist_entries()) { |
506 | pl->check_blocklisted_watchers(); | |
9f95a23c TL |
507 | } |
508 | } | |
509 | ||
510 | void PeeringState::set_last_peering_reset() | |
511 | { | |
512 | psdout(20) << "set_last_peering_reset " << get_osdmap_epoch() << dendl; | |
513 | if (last_peering_reset != get_osdmap_epoch()) { | |
514 | last_peering_reset = get_osdmap_epoch(); | |
515 | psdout(10) << "Clearing blocked outgoing recovery messages" << dendl; | |
516 | clear_blocked_outgoing(); | |
517 | if (!pl->try_flush_or_schedule_async()) { | |
518 | psdout(10) << "Beginning to block outgoing recovery messages" << dendl; | |
519 | begin_block_outgoing(); | |
520 | } else { | |
521 | psdout(10) << "Not blocking outgoing recovery messages" << dendl; | |
522 | } | |
523 | } | |
524 | } | |
525 | ||
526 | void PeeringState::complete_flush() | |
527 | { | |
528 | flushes_in_progress--; | |
529 | if (flushes_in_progress == 0) { | |
530 | pl->on_flushed(); | |
531 | } | |
532 | } | |
533 | ||
534 | void PeeringState::check_full_transition(OSDMapRef lastmap, OSDMapRef osdmap) | |
535 | { | |
536 | const pg_pool_t *pi = osdmap->get_pg_pool(info.pgid.pool()); | |
537 | if (!pi) { | |
538 | return; // pool deleted | |
539 | } | |
540 | bool changed = false; | |
541 | if (pi->has_flag(pg_pool_t::FLAG_FULL)) { | |
542 | const pg_pool_t *opi = lastmap->get_pg_pool(info.pgid.pool()); | |
543 | if (!opi || !opi->has_flag(pg_pool_t::FLAG_FULL)) { | |
544 | psdout(10) << " pool was marked full in " << osdmap->get_epoch() << dendl; | |
545 | changed = true; | |
546 | } | |
547 | } | |
548 | if (changed) { | |
549 | info.history.last_epoch_marked_full = osdmap->get_epoch(); | |
550 | dirty_info = true; | |
551 | } | |
552 | } | |
553 | ||
554 | bool PeeringState::should_restart_peering( | |
555 | int newupprimary, | |
556 | int newactingprimary, | |
557 | const vector<int>& newup, | |
558 | const vector<int>& newacting, | |
559 | OSDMapRef lastmap, | |
560 | OSDMapRef osdmap) | |
561 | { | |
562 | if (PastIntervals::is_new_interval( | |
563 | primary.osd, | |
564 | newactingprimary, | |
565 | acting, | |
566 | newacting, | |
567 | up_primary.osd, | |
568 | newupprimary, | |
569 | up, | |
570 | newup, | |
571 | osdmap.get(), | |
572 | lastmap.get(), | |
573 | info.pgid.pgid)) { | |
574 | psdout(20) << "new interval newup " << newup | |
575 | << " newacting " << newacting << dendl; | |
576 | return true; | |
577 | } | |
578 | if (!lastmap->is_up(pg_whoami.osd) && osdmap->is_up(pg_whoami.osd)) { | |
579 | psdout(10) << __func__ << " osd transitioned from down -> up" | |
580 | << dendl; | |
581 | return true; | |
582 | } | |
583 | return false; | |
584 | } | |
585 | ||
586 | /* Called before initializing peering during advance_map */ | |
// Switch this PG over to the up/acting sets given by the caller.
//
// In order, the function:
//   1. resets the last-peering-reset marker (set_last_peering_reset),
//   2. captures the old role, primaries and up/acting vectors,
//   3. installs the new sets via init_primary_up_acting and mirrors them
//      into info.stats,
//   4. updates the REMAPPED state bit and the role,
//   5. records interval bookkeeping in info.history / past_intervals,
//   6. calls on_new_interval and notifies the listener,
//   7. clears a number of PG state bits plus the primary-side peering state,
//   8. reports role / primary changes to the listener.
//
// lastmap may be null; in that case same_interval_since is simply set to
// the current epoch and no interval comparison is done.
// t is passed to the listener's on_change().
587 | void PeeringState::start_peering_interval( | |
588 | const OSDMapRef lastmap, | |
589 | const vector<int>& newup, int new_up_primary, | |
590 | const vector<int>& newacting, int new_acting_primary, | |
591 | ObjectStore::Transaction &t) | |
592 | { | |
593 | const OSDMapRef osdmap = get_osdmap(); | |
594 | ||
595 | set_last_peering_reset(); | |
596 | ||
597 | vector<int> oldacting, oldup; | |
598 | int oldrole = get_role(); | |
599 | ||
600 | if (is_primary()) { | |
601 | pl->clear_ready_to_merge(); | |
602 | } | |
603 | ||
604 | ||
// Snapshot the outgoing primaries and roles before anything is replaced;
// they are compared against the new values further down.
605 | pg_shard_t old_acting_primary = get_primary(); | |
606 | pg_shard_t old_up_primary = up_primary; | |
607 | bool was_old_primary = is_primary(); | |
608 | bool was_old_nonprimary = is_nonprimary(); | |
609 | ||
// oldacting/oldup start out empty, so these swaps move the current sets
// into them; init_primary_up_acting then assigns the new sets.
610 | acting.swap(oldacting); | |
611 | up.swap(oldup); | |
612 | init_primary_up_acting( | |
613 | newup, | |
614 | newacting, | |
615 | new_up_primary, | |
616 | new_acting_primary); | |
617 | ||
// Only touch info.stats (including mapping_epoch) when the mapping differs
// from what the stats already record.
618 | if (info.stats.up != up || | |
619 | info.stats.acting != acting || | |
620 | info.stats.up_primary != new_up_primary || | |
621 | info.stats.acting_primary != new_acting_primary) { | |
622 | info.stats.up = up; | |
623 | info.stats.up_primary = new_up_primary; | |
624 | info.stats.acting = acting; | |
625 | info.stats.acting_primary = new_acting_primary; | |
626 | info.stats.mapping_epoch = osdmap->get_epoch(); | |
627 | } | |
628 | ||
629 | pl->clear_publish_stats(); | |
630 | ||
631 | // This will now be remapped during a backfill in cases | |
632 | // that it would not have been before. | |
633 | if (up != acting) | |
634 | state_set(PG_STATE_REMAPPED); | |
635 | else | |
636 | state_clear(PG_STATE_REMAPPED); | |
637 | ||
638 | int role = osdmap->calc_pg_role(pg_whoami, acting); | |
639 | set_role(role); | |
640 | ||
641 | // did acting, up, primary|acker change? | |
642 | if (!lastmap) { | |
643 | psdout(10) << " no lastmap" << dendl; | |
644 | dirty_info = true; | |
645 | dirty_big_info = true; | |
646 | info.history.same_interval_since = osdmap->get_epoch(); | |
647 | } else { | |
// check_new_interval receives &past_intervals and a debug stream; whatever
// it writes to the stream is logged right after the call.
648 | std::stringstream debug; | |
649 | ceph_assert(info.history.same_interval_since != 0); | |
650 | bool new_interval = PastIntervals::check_new_interval( | |
651 | old_acting_primary.osd, | |
652 | new_acting_primary, | |
653 | oldacting, newacting, | |
654 | old_up_primary.osd, | |
655 | new_up_primary, | |
656 | oldup, newup, | |
657 | info.history.same_interval_since, | |
658 | info.history.last_epoch_clean, | |
659 | osdmap.get(), | |
660 | lastmap.get(), | |
661 | info.pgid.pgid, | |
662 | missing_loc.get_recoverable_predicate(), | |
663 | &past_intervals, | |
664 | &debug); | |
665 | psdout(10) << __func__ << ": check_new_interval output: " | |
666 | << debug.str() << dendl; | |
667 | if (new_interval) { | |
668 | if (osdmap->get_epoch() == pl->oldest_stored_osdmap() && | |
669 | info.history.last_epoch_clean < osdmap->get_epoch()) { | |
670 | psdout(10) << " map gap, clearing past_intervals and faking" << dendl; | |
671 | // our information is incomplete and useless; someone else was clean | |
672 | // after everything we know if osdmaps were trimmed. | |
673 | past_intervals.clear(); | |
674 | } else { | |
675 | psdout(10) << " noting past " << past_intervals << dendl; | |
676 | } | |
677 | dirty_info = true; | |
678 | dirty_big_info = true; | |
679 | info.history.same_interval_since = osdmap->get_epoch(); | |
// Record a split epoch when the pool still exists and this pgid is split
// by the pg_num change between lastmap and osdmap.
680 | if (osdmap->have_pg_pool(info.pgid.pgid.pool()) && | |
681 | info.pgid.pgid.is_split(lastmap->get_pg_num(info.pgid.pgid.pool()), | |
682 | osdmap->get_pg_num(info.pgid.pgid.pool()), | |
683 | nullptr)) { | |
684 | info.history.last_epoch_split = osdmap->get_epoch(); | |
685 | } | |
686 | } | |
687 | } | |
688 | ||
689 | if (old_up_primary != up_primary || | |
690 | oldup != up) { | |
691 | info.history.same_up_since = osdmap->get_epoch(); | |
692 | } | |
693 | // this comparison includes primary rank via pg_shard_t | |
694 | if (old_acting_primary != get_primary()) { | |
695 | info.history.same_primary_since = osdmap->get_epoch(); | |
696 | } | |
697 | ||
698 | on_new_interval(); | |
699 | pl->on_info_history_change(); | |
700 | ||
701 | psdout(1) << __func__ << " up " << oldup << " -> " << up | |
702 | << ", acting " << oldacting << " -> " << acting | |
703 | << ", acting_primary " << old_acting_primary << " -> " | |
704 | << new_acting_primary | |
705 | << ", up_primary " << old_up_primary << " -> " << new_up_primary | |
706 | << ", role " << oldrole << " -> " << role | |
707 | << ", features acting " << acting_features | |
708 | << " upacting " << upacting_features | |
709 | << dendl; | |
710 | ||
711 | // deactivate. | |
712 | state_clear(PG_STATE_ACTIVE); | |
713 | state_clear(PG_STATE_PEERED); | |
714 | state_clear(PG_STATE_PREMERGE); | |
715 | state_clear(PG_STATE_DOWN); | |
716 | state_clear(PG_STATE_RECOVERY_WAIT); | |
717 | state_clear(PG_STATE_RECOVERY_TOOFULL); | |
718 | state_clear(PG_STATE_RECOVERING); | |
719 | ||
720 | peer_purged.clear(); | |
721 | acting_recovery_backfill.clear(); | |
722 | ||
723 | // reset primary/replica state? | |
// NOTE(review): both branches below make the same call, so the effect is
// "clear want_pg_temp if we were or are primary or nonprimary".
724 | if (was_old_primary || is_primary()) { | |
725 | pl->clear_want_pg_temp(); | |
726 | } else if (was_old_nonprimary || is_nonprimary()) { | |
727 | pl->clear_want_pg_temp(); | |
728 | } | |
729 | clear_primary_state(); | |
730 | ||
731 | pl->on_change(t); | |
732 | ||
733 | ceph_assert(!deleting); | |
734 | ||
735 | // should we tell the primary we are here? | |
736 | send_notify = !is_primary(); | |
737 | ||
738 | if (role != oldrole || | |
739 | was_old_primary != is_primary()) { | |
740 | // did primary change? | |
741 | if (was_old_primary != is_primary()) { | |
742 | state_clear(PG_STATE_CLEAN); | |
743 | } | |
744 | ||
745 | pl->on_role_change(); | |
746 | } else { | |
747 | // no role change. | |
748 | // did primary change? | |
749 | if (get_primary() != old_acting_primary) { | |
750 | psdout(10) << oldacting << " -> " << acting | |
751 | << ", acting primary " | |
752 | << old_acting_primary << " -> " << get_primary() | |
753 | << dendl; | |
754 | } else { | |
755 | // primary is the same. | |
756 | if (is_primary()) { | |
757 | // i am (still) primary. but my replica set changed. | |
758 | state_clear(PG_STATE_CLEAN); | |
759 | ||
760 | psdout(10) << oldacting << " -> " << acting | |
761 | << ", replicas changed" << dendl; | |
762 | } | |
763 | } | |
764 | } | |
765 | ||
// acting is empty on this path, so the vector queued here is empty.
766 | if (acting.empty() && !up.empty() && up_primary == pg_whoami) { | |
767 | psdout(10) << " acting empty, but i am up[0], clearing pg_temp" << dendl; | |
768 | pl->queue_want_pg_temp(acting); | |
769 | } | |
770 | } | |
771 | ||
772 | void PeeringState::on_new_interval() | |
773 | { | |
774 | dout(20) << __func__ << dendl; | |
775 | const OSDMapRef osdmap = get_osdmap(); | |
776 | ||
777 | // initialize features | |
778 | acting_features = CEPH_FEATURES_SUPPORTED_DEFAULT; | |
779 | upacting_features = CEPH_FEATURES_SUPPORTED_DEFAULT; | |
f67539c2 | 780 | for (auto p = acting.begin(); p != acting.end(); ++p) { |
9f95a23c TL |
781 | if (*p == CRUSH_ITEM_NONE) |
782 | continue; | |
783 | uint64_t f = osdmap->get_xinfo(*p).features; | |
784 | acting_features &= f; | |
785 | upacting_features &= f; | |
786 | } | |
f67539c2 | 787 | for (auto p = up.begin(); p != up.end(); ++p) { |
9f95a23c TL |
788 | if (*p == CRUSH_ITEM_NONE) |
789 | continue; | |
790 | upacting_features &= osdmap->get_xinfo(*p).features; | |
791 | } | |
792 | psdout(20) << __func__ << " upacting_features 0x" << std::hex | |
793 | << upacting_features << std::dec | |
794 | << " from " << acting << "+" << up << dendl; | |
795 | ||
796 | psdout(20) << __func__ << " checking missing set deletes flag. missing = " | |
797 | << get_pg_log().get_missing() << dendl; | |
798 | ||
799 | if (!pg_log.get_missing().may_include_deletes && | |
800 | !perform_deletes_during_peering()) { | |
801 | pl->rebuild_missing_set_with_deletes(pg_log); | |
802 | } | |
803 | ceph_assert( | |
804 | pg_log.get_missing().may_include_deletes == | |
805 | !perform_deletes_during_peering()); | |
806 | ||
807 | init_hb_stamps(); | |
808 | ||
809 | // update lease bounds for a new interval | |
810 | auto mnow = pl->get_mnow(); | |
811 | prior_readable_until_ub = std::max(prior_readable_until_ub, | |
812 | readable_until_ub); | |
813 | prior_readable_until_ub = info.history.refresh_prior_readable_until_ub( | |
814 | mnow, prior_readable_until_ub); | |
815 | psdout(10) << __func__ << " prior_readable_until_ub " | |
816 | << prior_readable_until_ub << " (mnow " << mnow << " + " | |
817 | << info.history.prior_readable_until_ub << ")" << dendl; | |
818 | prior_readable_down_osds.clear(); // we populate this when we build the priorset | |
819 | ||
820 | readable_until = | |
821 | readable_until_ub = | |
822 | readable_until_ub_sent = | |
823 | readable_until_ub_from_primary = ceph::signedspan::zero(); | |
824 | ||
825 | acting_readable_until_ub.clear(); | |
826 | if (is_primary()) { | |
827 | acting_readable_until_ub.resize(acting.size(), ceph::signedspan::zero()); | |
828 | } | |
829 | ||
830 | pl->on_new_interval(); | |
831 | } | |
832 | ||
833 | void PeeringState::init_primary_up_acting( | |
834 | const vector<int> &newup, | |
835 | const vector<int> &newacting, | |
836 | int new_up_primary, | |
837 | int new_acting_primary) | |
838 | { | |
839 | actingset.clear(); | |
840 | acting = newacting; | |
841 | for (uint8_t i = 0; i < acting.size(); ++i) { | |
842 | if (acting[i] != CRUSH_ITEM_NONE) | |
843 | actingset.insert( | |
844 | pg_shard_t( | |
845 | acting[i], | |
846 | pool.info.is_erasure() ? shard_id_t(i) : shard_id_t::NO_SHARD)); | |
847 | } | |
848 | upset.clear(); | |
849 | up = newup; | |
850 | for (uint8_t i = 0; i < up.size(); ++i) { | |
851 | if (up[i] != CRUSH_ITEM_NONE) | |
852 | upset.insert( | |
853 | pg_shard_t( | |
854 | up[i], | |
855 | pool.info.is_erasure() ? shard_id_t(i) : shard_id_t::NO_SHARD)); | |
856 | } | |
857 | if (!pool.info.is_erasure()) { | |
858 | // replicated | |
859 | up_primary = pg_shard_t(new_up_primary, shard_id_t::NO_SHARD); | |
860 | primary = pg_shard_t(new_acting_primary, shard_id_t::NO_SHARD); | |
861 | } else { | |
862 | // erasure | |
863 | up_primary = pg_shard_t(); | |
864 | primary = pg_shard_t(); | |
865 | for (uint8_t i = 0; i < up.size(); ++i) { | |
866 | if (up[i] == new_up_primary) { | |
867 | up_primary = pg_shard_t(up[i], shard_id_t(i)); | |
868 | break; | |
869 | } | |
870 | } | |
871 | for (uint8_t i = 0; i < acting.size(); ++i) { | |
872 | if (acting[i] == new_acting_primary) { | |
873 | primary = pg_shard_t(acting[i], shard_id_t(i)); | |
874 | break; | |
875 | } | |
876 | } | |
877 | ceph_assert(up_primary.osd == new_up_primary); | |
878 | ceph_assert(primary.osd == new_acting_primary); | |
879 | } | |
880 | } | |
881 | ||
882 | void PeeringState::init_hb_stamps() | |
883 | { | |
884 | if (is_primary()) { | |
885 | // we care about all other osds in the acting set | |
886 | hb_stamps.resize(acting.size() - 1); | |
887 | unsigned i = 0; | |
888 | for (auto p : acting) { | |
889 | if (p == CRUSH_ITEM_NONE || p == get_primary().osd) { | |
890 | continue; | |
891 | } | |
892 | hb_stamps[i++] = pl->get_hb_stamps(p); | |
893 | } | |
894 | hb_stamps.resize(i); | |
895 | } else if (is_nonprimary()) { | |
896 | // we care about just the primary | |
897 | hb_stamps.resize(1); | |
898 | hb_stamps[0] = pl->get_hb_stamps(get_primary().osd); | |
899 | } else { | |
900 | hb_stamps.clear(); | |
901 | } | |
902 | dout(10) << __func__ << " now " << hb_stamps << dendl; | |
903 | } | |
904 | ||
905 | ||
906 | void PeeringState::clear_recovery_state() | |
907 | { | |
908 | async_recovery_targets.clear(); | |
909 | backfill_targets.clear(); | |
910 | } | |
911 | ||
912 | void PeeringState::clear_primary_state() | |
913 | { | |
914 | psdout(10) << "clear_primary_state" << dendl; | |
915 | ||
916 | // clear peering state | |
917 | stray_set.clear(); | |
918 | peer_log_requested.clear(); | |
919 | peer_missing_requested.clear(); | |
920 | peer_info.clear(); | |
921 | peer_bytes.clear(); | |
922 | peer_missing.clear(); | |
923 | peer_last_complete_ondisk.clear(); | |
924 | peer_activated.clear(); | |
925 | min_last_complete_ondisk = eversion_t(); | |
926 | pg_trim_to = eversion_t(); | |
927 | might_have_unfound.clear(); | |
928 | need_up_thru = false; | |
929 | missing_loc.clear(); | |
930 | pg_log.reset_recovery_pointers(); | |
931 | ||
932 | clear_recovery_state(); | |
933 | ||
934 | last_update_ondisk = eversion_t(); | |
935 | missing_loc.clear(); | |
936 | pl->clear_primary_state(); | |
937 | } | |
938 | ||
939 | /// return [start,end) bounds for required past_intervals | |
940 | static pair<epoch_t, epoch_t> get_required_past_interval_bounds( | |
941 | const pg_info_t &info, | |
942 | epoch_t oldest_map) { | |
943 | epoch_t start = std::max( | |
944 | info.history.last_epoch_clean ? info.history.last_epoch_clean : | |
945 | info.history.epoch_pool_created, | |
946 | oldest_map); | |
947 | epoch_t end = std::max( | |
948 | info.history.same_interval_since, | |
949 | info.history.epoch_pool_created); | |
950 | return make_pair(start, end); | |
951 | } | |
952 | ||
953 | ||
954 | void PeeringState::check_past_interval_bounds() const | |
955 | { | |
956 | auto oldest_epoch = pl->oldest_stored_osdmap(); | |
957 | auto rpib = get_required_past_interval_bounds( | |
958 | info, | |
959 | oldest_epoch); | |
960 | if (rpib.first >= rpib.second) { | |
961 | // do not warn if the start bound is dictated by oldest_map; the | |
962 | // past intervals are presumably appropriate given the pg info. | |
963 | if (!past_intervals.empty() && | |
964 | rpib.first > oldest_epoch) { | |
965 | pl->get_clog_error() << info.pgid << " required past_interval bounds are" | |
966 | << " empty [" << rpib << ") but past_intervals is not: " | |
967 | << past_intervals; | |
968 | derr << info.pgid << " required past_interval bounds are" | |
969 | << " empty [" << rpib << ") but past_intervals is not: " | |
970 | << past_intervals << dendl; | |
971 | } | |
972 | } else { | |
973 | if (past_intervals.empty()) { | |
974 | pl->get_clog_error() << info.pgid << " required past_interval bounds are" | |
975 | << " not empty [" << rpib << ") but past_intervals " | |
976 | << past_intervals << " is empty"; | |
977 | derr << info.pgid << " required past_interval bounds are" | |
978 | << " not empty [" << rpib << ") but past_intervals " | |
979 | << past_intervals << " is empty" << dendl; | |
980 | ceph_assert(!past_intervals.empty()); | |
981 | } | |
982 | ||
983 | auto apib = past_intervals.get_bounds(); | |
984 | if (apib.first > rpib.first) { | |
985 | pl->get_clog_error() << info.pgid << " past_intervals [" << apib | |
986 | << ") start interval does not contain the required" | |
987 | << " bound [" << rpib << ") start"; | |
988 | derr << info.pgid << " past_intervals [" << apib | |
989 | << ") start interval does not contain the required" | |
990 | << " bound [" << rpib << ") start" << dendl; | |
991 | ceph_abort_msg("past_interval start interval mismatch"); | |
992 | } | |
993 | if (apib.second != rpib.second) { | |
994 | pl->get_clog_error() << info.pgid << " past_interal bound [" << apib | |
995 | << ") end does not match required [" << rpib | |
996 | << ") end"; | |
997 | derr << info.pgid << " past_interal bound [" << apib | |
998 | << ") end does not match required [" << rpib | |
999 | << ") end" << dendl; | |
1000 | ceph_abort_msg("past_interval end mismatch"); | |
1001 | } | |
1002 | } | |
1003 | } | |
1004 | ||
1005 | int PeeringState::clamp_recovery_priority(int priority, int pool_recovery_priority, int max) | |
1006 | { | |
1007 | static_assert(OSD_RECOVERY_PRIORITY_MIN < OSD_RECOVERY_PRIORITY_MAX, "Invalid priority range"); | |
1008 | static_assert(OSD_RECOVERY_PRIORITY_MIN >= 0, "Priority range must match unsigned type"); | |
1009 | ||
1010 | ceph_assert(max <= OSD_RECOVERY_PRIORITY_MAX); | |
1011 | ||
1012 | // User can't set this too high anymore, but might be a legacy value | |
1013 | if (pool_recovery_priority > OSD_POOL_PRIORITY_MAX) | |
1014 | pool_recovery_priority = OSD_POOL_PRIORITY_MAX; | |
1015 | if (pool_recovery_priority < OSD_POOL_PRIORITY_MIN) | |
1016 | pool_recovery_priority = OSD_POOL_PRIORITY_MIN; | |
1017 | // Shift range from min to max to 0 to max - min | |
1018 | pool_recovery_priority += (0 - OSD_POOL_PRIORITY_MIN); | |
1019 | ceph_assert(pool_recovery_priority >= 0 && pool_recovery_priority <= (OSD_POOL_PRIORITY_MAX - OSD_POOL_PRIORITY_MIN)); | |
1020 | ||
1021 | priority += pool_recovery_priority; | |
1022 | ||
1023 | // Clamp to valid range | |
1024 | if (priority > max) { | |
1025 | return max; | |
1026 | } else if (priority < OSD_RECOVERY_PRIORITY_MIN) { | |
1027 | return OSD_RECOVERY_PRIORITY_MIN; | |
1028 | } else { | |
1029 | return priority; | |
1030 | } | |
1031 | } | |
1032 | ||
1033 | unsigned PeeringState::get_recovery_priority() | |
1034 | { | |
1035 | // a higher value -> a higher priority | |
1036 | int ret = OSD_RECOVERY_PRIORITY_BASE; | |
1037 | int base = ret; | |
1038 | ||
1039 | if (state & PG_STATE_FORCED_RECOVERY) { | |
1040 | ret = OSD_RECOVERY_PRIORITY_FORCED; | |
1041 | } else { | |
1042 | // XXX: This priority boost isn't so much about inactive, but about data-at-risk | |
1043 | if (is_degraded() && info.stats.avail_no_missing.size() < pool.info.min_size) { | |
1044 | base = OSD_RECOVERY_INACTIVE_PRIORITY_BASE; | |
1045 | // inactive: no. of replicas < min_size, highest priority since it blocks IO | |
1046 | ret = base + (pool.info.min_size - info.stats.avail_no_missing.size()); | |
1047 | } | |
1048 | ||
1049 | int64_t pool_recovery_priority = 0; | |
1050 | pool.info.opts.get(pool_opts_t::RECOVERY_PRIORITY, &pool_recovery_priority); | |
1051 | ||
1052 | ret = clamp_recovery_priority(ret, pool_recovery_priority, max_prio_map[base]); | |
1053 | } | |
1054 | psdout(20) << __func__ << " recovery priority is " << ret << dendl; | |
1055 | return static_cast<unsigned>(ret); | |
1056 | } | |
1057 | ||
1058 | unsigned PeeringState::get_backfill_priority() | |
1059 | { | |
1060 | // a higher value -> a higher priority | |
1061 | int ret = OSD_BACKFILL_PRIORITY_BASE; | |
1062 | int base = ret; | |
1063 | ||
1064 | if (state & PG_STATE_FORCED_BACKFILL) { | |
1065 | ret = OSD_BACKFILL_PRIORITY_FORCED; | |
1066 | } else { | |
f67539c2 | 1067 | if (actingset.size() < pool.info.min_size) { |
9f95a23c TL |
1068 | base = OSD_BACKFILL_INACTIVE_PRIORITY_BASE; |
1069 | // inactive: no. of replicas < min_size, highest priority since it blocks IO | |
f67539c2 | 1070 | ret = base + (pool.info.min_size - actingset.size()); |
9f95a23c TL |
1071 | |
1072 | } else if (is_undersized()) { | |
1073 | // undersized: OSD_BACKFILL_DEGRADED_PRIORITY_BASE + num missing replicas | |
1074 | ceph_assert(pool.info.size > actingset.size()); | |
1075 | base = OSD_BACKFILL_DEGRADED_PRIORITY_BASE; | |
1076 | ret = base + (pool.info.size - actingset.size()); | |
1077 | ||
1078 | } else if (is_degraded()) { | |
1079 | // degraded: baseline degraded | |
1080 | base = ret = OSD_BACKFILL_DEGRADED_PRIORITY_BASE; | |
1081 | } | |
1082 | ||
1083 | // Adjust with pool's recovery priority | |
1084 | int64_t pool_recovery_priority = 0; | |
1085 | pool.info.opts.get(pool_opts_t::RECOVERY_PRIORITY, &pool_recovery_priority); | |
1086 | ||
1087 | ret = clamp_recovery_priority(ret, pool_recovery_priority, max_prio_map[base]); | |
1088 | } | |
1089 | ||
1090 | psdout(20) << __func__ << " backfill priority is " << ret << dendl; | |
1091 | return static_cast<unsigned>(ret); | |
1092 | } | |
1093 | ||
1094 | unsigned PeeringState::get_delete_priority() | |
1095 | { | |
1096 | auto state = get_osdmap()->get_state(pg_whoami.osd); | |
1097 | if (state & (CEPH_OSD_BACKFILLFULL | | |
1098 | CEPH_OSD_FULL)) { | |
1099 | return OSD_DELETE_PRIORITY_FULL; | |
1100 | } else if (state & CEPH_OSD_NEARFULL) { | |
1101 | return OSD_DELETE_PRIORITY_FULLISH; | |
1102 | } else { | |
1103 | return OSD_DELETE_PRIORITY_NORMAL; | |
1104 | } | |
1105 | } | |
1106 | ||
1107 | bool PeeringState::set_force_recovery(bool b) | |
1108 | { | |
1109 | bool did = false; | |
1110 | if (b) { | |
1111 | if (!(state & PG_STATE_FORCED_RECOVERY) && | |
1112 | (state & (PG_STATE_DEGRADED | | |
1113 | PG_STATE_RECOVERY_WAIT | | |
1114 | PG_STATE_RECOVERING))) { | |
1115 | psdout(20) << __func__ << " set" << dendl; | |
1116 | state_set(PG_STATE_FORCED_RECOVERY); | |
1117 | pl->publish_stats_to_osd(); | |
1118 | did = true; | |
1119 | } | |
1120 | } else if (state & PG_STATE_FORCED_RECOVERY) { | |
1121 | psdout(20) << __func__ << " clear" << dendl; | |
1122 | state_clear(PG_STATE_FORCED_RECOVERY); | |
1123 | pl->publish_stats_to_osd(); | |
1124 | did = true; | |
1125 | } | |
1126 | if (did) { | |
1127 | psdout(20) << __func__ << " state " << get_current_state() | |
1128 | << dendl; | |
1129 | pl->update_local_background_io_priority(get_recovery_priority()); | |
1130 | } | |
1131 | return did; | |
1132 | } | |
1133 | ||
1134 | bool PeeringState::set_force_backfill(bool b) | |
1135 | { | |
1136 | bool did = false; | |
1137 | if (b) { | |
1138 | if (!(state & PG_STATE_FORCED_BACKFILL) && | |
1139 | (state & (PG_STATE_DEGRADED | | |
1140 | PG_STATE_BACKFILL_WAIT | | |
1141 | PG_STATE_BACKFILLING))) { | |
1142 | psdout(10) << __func__ << " set" << dendl; | |
1143 | state_set(PG_STATE_FORCED_BACKFILL); | |
1144 | pl->publish_stats_to_osd(); | |
1145 | did = true; | |
1146 | } | |
1147 | } else if (state & PG_STATE_FORCED_BACKFILL) { | |
1148 | psdout(10) << __func__ << " clear" << dendl; | |
1149 | state_clear(PG_STATE_FORCED_BACKFILL); | |
1150 | pl->publish_stats_to_osd(); | |
1151 | did = true; | |
1152 | } | |
1153 | if (did) { | |
1154 | psdout(20) << __func__ << " state " << get_current_state() | |
1155 | << dendl; | |
1156 | pl->update_local_background_io_priority(get_backfill_priority()); | |
1157 | } | |
1158 | return did; | |
1159 | } | |
1160 | ||
1161 | void PeeringState::schedule_renew_lease() | |
1162 | { | |
1163 | pl->schedule_renew_lease( | |
1164 | last_peering_reset, | |
1165 | readable_interval / 2); | |
1166 | } | |
1167 | ||
1168 | void PeeringState::send_lease() | |
1169 | { | |
1170 | epoch_t epoch = pl->get_osdmap_epoch(); | |
1171 | for (auto peer : actingset) { | |
1172 | if (peer == pg_whoami) { | |
1173 | continue; | |
1174 | } | |
1175 | pl->send_cluster_message( | |
1176 | peer.osd, | |
f67539c2 | 1177 | make_message<MOSDPGLease>(epoch, |
9f95a23c TL |
1178 | spg_t(spgid.pgid, peer.shard), |
1179 | get_lease()), | |
1180 | epoch); | |
1181 | } | |
1182 | } | |
1183 | ||
1184 | void PeeringState::proc_lease(const pg_lease_t& l) | |
1185 | { | |
1186 | if (!HAVE_FEATURE(upacting_features, SERVER_OCTOPUS)) { | |
1187 | psdout(20) << __func__ << " no-op, upacting_features 0x" << std::hex | |
1188 | << upacting_features << std::dec | |
1189 | << " does not include SERVER_OCTOPUS" << dendl; | |
1190 | return; | |
1191 | } | |
1192 | if (!is_nonprimary()) { | |
1193 | psdout(20) << __func__ << " no-op, !nonprimary" << dendl; | |
1194 | return; | |
1195 | } | |
1196 | psdout(10) << __func__ << " " << l << dendl; | |
1197 | if (l.readable_until_ub > readable_until_ub_from_primary) { | |
1198 | readable_until_ub_from_primary = l.readable_until_ub; | |
1199 | } | |
1200 | ||
1201 | ceph::signedspan ru = ceph::signedspan::zero(); | |
1202 | if (l.readable_until != ceph::signedspan::zero() && | |
1203 | hb_stamps[0]->peer_clock_delta_ub) { | |
1204 | ru = l.readable_until - *hb_stamps[0]->peer_clock_delta_ub; | |
1205 | psdout(20) << " peer_clock_delta_ub " << *hb_stamps[0]->peer_clock_delta_ub | |
1206 | << " -> ru " << ru << dendl; | |
1207 | } | |
1208 | if (ru > readable_until) { | |
1209 | readable_until = ru; | |
1210 | psdout(20) << __func__ << " readable_until now " << readable_until << dendl; | |
1211 | // NOTE: if we ever decide to block/queue ops on the replica, | |
1212 | // we'll need to wake them up here. | |
1213 | } | |
1214 | ||
1215 | ceph::signedspan ruub; | |
1216 | if (hb_stamps[0]->peer_clock_delta_lb) { | |
1217 | ruub = l.readable_until_ub - *hb_stamps[0]->peer_clock_delta_lb; | |
1218 | psdout(20) << " peer_clock_delta_lb " << *hb_stamps[0]->peer_clock_delta_lb | |
1219 | << " -> ruub " << ruub << dendl; | |
1220 | } else { | |
1221 | ruub = pl->get_mnow() + l.interval; | |
1222 | psdout(20) << " no peer_clock_delta_lb -> ruub " << ruub << dendl; | |
1223 | } | |
1224 | if (ruub > readable_until_ub) { | |
1225 | readable_until_ub = ruub; | |
1226 | psdout(20) << __func__ << " readable_until_ub now " << readable_until_ub | |
1227 | << dendl; | |
1228 | } | |
1229 | } | |
1230 | ||
1231 | void PeeringState::proc_lease_ack(int from, const pg_lease_ack_t& a) | |
1232 | { | |
1233 | if (!HAVE_FEATURE(upacting_features, SERVER_OCTOPUS)) { | |
1234 | return; | |
1235 | } | |
1236 | auto now = pl->get_mnow(); | |
1237 | bool was_min = false; | |
1238 | for (unsigned i = 0; i < acting.size(); ++i) { | |
1239 | if (from == acting[i]) { | |
1240 | // the lease_ack value is based on the primary's clock | |
1241 | if (a.readable_until_ub > acting_readable_until_ub[i]) { | |
1242 | if (acting_readable_until_ub[i] == readable_until) { | |
1243 | was_min = true; | |
1244 | } | |
1245 | acting_readable_until_ub[i] = a.readable_until_ub; | |
1246 | break; | |
1247 | } | |
1248 | } | |
1249 | } | |
1250 | if (was_min) { | |
1251 | auto old_ru = readable_until; | |
1252 | recalc_readable_until(); | |
1253 | if (now < old_ru) { | |
1254 | pl->recheck_readable(); | |
1255 | } | |
1256 | } | |
1257 | } | |
1258 | ||
1259 | void PeeringState::proc_renew_lease() | |
1260 | { | |
1261 | if (!HAVE_FEATURE(upacting_features, SERVER_OCTOPUS)) { | |
1262 | return; | |
1263 | } | |
1264 | renew_lease(pl->get_mnow()); | |
1265 | send_lease(); | |
1266 | schedule_renew_lease(); | |
1267 | } | |
1268 | ||
1269 | void PeeringState::recalc_readable_until() | |
1270 | { | |
1271 | assert(is_primary()); | |
1272 | ceph::signedspan min = readable_until_ub_sent; | |
1273 | for (unsigned i = 0; i < acting.size(); ++i) { | |
1274 | if (acting[i] == pg_whoami.osd || acting[i] == CRUSH_ITEM_NONE) { | |
1275 | continue; | |
1276 | } | |
1277 | dout(20) << __func__ << " peer osd." << acting[i] | |
1278 | << " ruub " << acting_readable_until_ub[i] << dendl; | |
1279 | if (acting_readable_until_ub[i] < min) { | |
1280 | min = acting_readable_until_ub[i]; | |
1281 | } | |
1282 | } | |
1283 | readable_until = min; | |
1284 | readable_until_ub = min; | |
1285 | dout(20) << __func__ << " readable_until[_ub] " << readable_until | |
1286 | << " (sent " << readable_until_ub_sent << ")" << dendl; | |
1287 | } | |
1288 | ||
1289 | bool PeeringState::check_prior_readable_down_osds(const OSDMapRef& map) | |
1290 | { | |
1291 | if (!HAVE_FEATURE(upacting_features, SERVER_OCTOPUS)) { | |
1292 | return false; | |
1293 | } | |
1294 | bool changed = false; | |
1295 | auto p = prior_readable_down_osds.begin(); | |
1296 | while (p != prior_readable_down_osds.end()) { | |
1297 | if (map->is_dead(*p)) { | |
1298 | dout(10) << __func__ << " prior_readable_down_osds osd." << *p | |
1299 | << " is dead as of epoch " << map->get_epoch() | |
1300 | << dendl; | |
1301 | p = prior_readable_down_osds.erase(p); | |
1302 | changed = true; | |
1303 | } else { | |
1304 | ++p; | |
1305 | } | |
1306 | } | |
1307 | if (changed && prior_readable_down_osds.empty()) { | |
1308 | psdout(10) << " empty prior_readable_down_osds, clearing ub" << dendl; | |
1309 | clear_prior_readable_until_ub(); | |
1310 | return true; | |
1311 | } | |
1312 | return false; | |
1313 | } | |
1314 | ||
1315 | bool PeeringState::adjust_need_up_thru(const OSDMapRef osdmap) | |
1316 | { | |
1317 | epoch_t up_thru = osdmap->get_up_thru(pg_whoami.osd); | |
1318 | if (need_up_thru && | |
1319 | up_thru >= info.history.same_interval_since) { | |
1320 | psdout(10) << "adjust_need_up_thru now " | |
1321 | << up_thru << ", need_up_thru now false" << dendl; | |
1322 | need_up_thru = false; | |
1323 | return true; | |
1324 | } | |
1325 | return false; | |
1326 | } | |
1327 | ||
1328 | PastIntervals::PriorSet PeeringState::build_prior() | |
1329 | { | |
1330 | if (1) { | |
1331 | // sanity check | |
f67539c2 | 1332 | for (auto it = peer_info.begin(); it != peer_info.end(); ++it) { |
9f95a23c TL |
1333 | ceph_assert(info.history.last_epoch_started >= |
1334 | it->second.history.last_epoch_started); | |
1335 | } | |
1336 | } | |
1337 | ||
1338 | const OSDMap &osdmap = *get_osdmap(); | |
1339 | PastIntervals::PriorSet prior = past_intervals.get_prior_set( | |
1340 | pool.info.is_erasure(), | |
1341 | info.history.last_epoch_started, | |
1342 | &missing_loc.get_recoverable_predicate(), | |
1343 | [&](epoch_t start, int osd, epoch_t *lost_at) { | |
1344 | const osd_info_t *pinfo = 0; | |
1345 | if (osdmap.exists(osd)) { | |
1346 | pinfo = &osdmap.get_info(osd); | |
1347 | if (lost_at) | |
1348 | *lost_at = pinfo->lost_at; | |
1349 | } | |
1350 | ||
1351 | if (osdmap.is_up(osd)) { | |
1352 | return PastIntervals::UP; | |
1353 | } else if (!pinfo) { | |
1354 | return PastIntervals::DNE; | |
1355 | } else if (pinfo->lost_at > start) { | |
1356 | return PastIntervals::LOST; | |
1357 | } else { | |
1358 | return PastIntervals::DOWN; | |
1359 | } | |
1360 | }, | |
1361 | up, | |
1362 | acting, | |
1363 | dpp); | |
1364 | ||
1365 | if (prior.pg_down) { | |
1366 | state_set(PG_STATE_DOWN); | |
1367 | } | |
1368 | ||
1369 | if (get_osdmap()->get_up_thru(pg_whoami.osd) < | |
1370 | info.history.same_interval_since) { | |
1371 | psdout(10) << "up_thru " << get_osdmap()->get_up_thru(pg_whoami.osd) | |
1372 | << " < same_since " << info.history.same_interval_since | |
1373 | << ", must notify monitor" << dendl; | |
1374 | need_up_thru = true; | |
1375 | } else { | |
1376 | psdout(10) << "up_thru " << get_osdmap()->get_up_thru(pg_whoami.osd) | |
1377 | << " >= same_since " << info.history.same_interval_since | |
1378 | << ", all is well" << dendl; | |
1379 | need_up_thru = false; | |
1380 | } | |
1381 | pl->set_probe_targets(prior.probe); | |
1382 | return prior; | |
1383 | } | |
1384 | ||
1385 | bool PeeringState::needs_recovery() const | |
1386 | { | |
1387 | ceph_assert(is_primary()); | |
1388 | ||
1389 | auto &missing = pg_log.get_missing(); | |
1390 | ||
1391 | if (missing.num_missing()) { | |
1392 | psdout(10) << __func__ << " primary has " << missing.num_missing() | |
1393 | << " missing" << dendl; | |
1394 | return true; | |
1395 | } | |
1396 | ||
1397 | ceph_assert(!acting_recovery_backfill.empty()); | |
f67539c2 TL |
1398 | for (const pg_shard_t& peer : acting_recovery_backfill) { |
1399 | if (peer == get_primary()) { | |
1400 | continue; | |
1401 | } | |
1402 | auto pm = peer_missing.find(peer); | |
9f95a23c TL |
1403 | if (pm == peer_missing.end()) { |
1404 | psdout(10) << __func__ << " osd." << peer << " doesn't have missing set" | |
1405 | << dendl; | |
1406 | continue; | |
1407 | } | |
1408 | if (pm->second.num_missing()) { | |
1409 | psdout(10) << __func__ << " osd." << peer << " has " | |
1410 | << pm->second.num_missing() << " missing" << dendl; | |
1411 | return true; | |
1412 | } | |
1413 | } | |
1414 | ||
1415 | psdout(10) << __func__ << " is recovered" << dendl; | |
1416 | return false; | |
1417 | } | |
1418 | ||
1419 | bool PeeringState::needs_backfill() const | |
1420 | { | |
1421 | ceph_assert(is_primary()); | |
1422 | ||
1423 | // We can assume that only possible osds that need backfill | |
1424 | // are on the backfill_targets vector nodes. | |
f67539c2 TL |
1425 | for (const pg_shard_t& peer : backfill_targets) { |
1426 | auto pi = peer_info.find(peer); | |
1427 | ceph_assert(pi != peer_info.end()); | |
9f95a23c TL |
1428 | if (!pi->second.last_backfill.is_max()) { |
1429 | psdout(10) << __func__ << " osd." << peer | |
1430 | << " has last_backfill " << pi->second.last_backfill << dendl; | |
1431 | return true; | |
1432 | } | |
1433 | } | |
1434 | ||
1435 | psdout(10) << __func__ << " does not need backfill" << dendl; | |
1436 | return false; | |
1437 | } | |
1438 | ||
1439 | /* | |
1440 | * Returns true unless there is a non-lost OSD in might_have_unfound. | |
1441 | */ | |
1442 | bool PeeringState::all_unfound_are_queried_or_lost( | |
1443 | const OSDMapRef osdmap) const | |
1444 | { | |
1445 | ceph_assert(is_primary()); | |
1446 | ||
f67539c2 TL |
1447 | auto peer = might_have_unfound.begin(); |
1448 | auto mend = might_have_unfound.end(); | |
9f95a23c TL |
1449 | for (; peer != mend; ++peer) { |
1450 | if (peer_missing.count(*peer)) | |
1451 | continue; | |
f67539c2 | 1452 | auto iter = peer_info.find(*peer); |
9f95a23c TL |
1453 | if (iter != peer_info.end() && |
1454 | (iter->second.is_empty() || iter->second.dne())) | |
1455 | continue; | |
1456 | if (!osdmap->exists(peer->osd)) | |
1457 | continue; | |
1458 | const osd_info_t &osd_info(osdmap->get_info(peer->osd)); | |
1459 | if (osd_info.lost_at <= osd_info.up_from) { | |
1460 | // If there is even one OSD in might_have_unfound that isn't lost, we | |
1461 | // still might retrieve our unfound. | |
1462 | return false; | |
1463 | } | |
1464 | } | |
1465 | psdout(10) << "all_unfound_are_queried_or_lost all of might_have_unfound " | |
1466 | << might_have_unfound | |
1467 | << " have been queried or are marked lost" << dendl; | |
1468 | return true; | |
1469 | } | |
1470 | ||
1471 | ||
1472 | void PeeringState::reject_reservation() | |
1473 | { | |
1474 | pl->unreserve_recovery_space(); | |
1475 | pl->send_cluster_message( | |
1476 | primary.osd, | |
f67539c2 | 1477 | make_message<MBackfillReserve>( |
9f95a23c TL |
1478 | MBackfillReserve::REJECT_TOOFULL, |
1479 | spg_t(info.pgid.pgid, primary.shard), | |
1480 | get_osdmap_epoch()), | |
1481 | get_osdmap_epoch()); | |
1482 | } | |
1483 | ||
1484 | /** | |
1485 | * find_best_info | |
1486 | * | |
1487 | * Returns an iterator to the best info in infos sorted by: | |
1488 | * 1) Prefer newer last_update | |
1489 | * 2) Prefer longer tail if it brings another info into contiguity | |
1490 | * 3) Prefer current primary | |
1491 | */ | |
// Implementation outline (three passes over infos):
//  1. find the largest last_epoch_started; history values are consulted
//     unless osd_find_best_info_ignore_history_les is set, and
//     *history_les_bound records whether the most recent raise of the
//     maximum came from a history value;
//  2. find the smallest last_update among infos whose last_epoch_started
//     reaches that maximum;
//  3. filter candidates and pick the best one.
// Returns infos.end() if the pass-2 minimum is still eversion_t::max() or
// if no candidate survives the pass-3 filters.
// history_les_bound must be non-null (asserted); it is only written when a
// pass-1 condition fires.
1492 | map<pg_shard_t, pg_info_t>::const_iterator PeeringState::find_best_info( | |
1493 | const map<pg_shard_t, pg_info_t> &infos, | |
1494 | bool restrict_to_up_acting, | |
1495 | bool *history_les_bound) const | |
1496 | { | |
1497 | ceph_assert(history_les_bound); | |
1498 | /* See doc/dev/osd_internals/last_epoch_started.rst before attempting | |
1499 | * to make changes to this process. Also, make sure to update it | |
1500 | * when you find bugs! */ | |
// Pass 1: largest last_epoch_started over all infos.
9f95a23c | 1501 | epoch_t max_last_epoch_started_found = 0; |
f67539c2 | 1502 | for (auto i = infos.begin(); i != infos.end(); ++i) { |
9f95a23c TL |
1503 | if (!cct->_conf->osd_find_best_info_ignore_history_les && | |
1504 | max_last_epoch_started_found < i->second.history.last_epoch_started) { | |
1505 | *history_les_bound = true; | |
1506 | max_last_epoch_started_found = i->second.history.last_epoch_started; | |
1507 | } | |
1508 | if (!i->second.is_incomplete() && | |
1509 | max_last_epoch_started_found < i->second.last_epoch_started) { | |
1510 | *history_les_bound = false; | |
1511 | max_last_epoch_started_found = i->second.last_epoch_started; | |
1512 | } | |
1513 | } | |
// Pass 2: smallest last_update among infos that reach the maximum.
f67539c2 TL |
1514 | eversion_t min_last_update_acceptable = eversion_t::max(); | |
1515 | for (auto i = infos.begin(); i != infos.end(); ++i) { | |
9f95a23c TL |
1516 | if (max_last_epoch_started_found <= i->second.last_epoch_started) { | |
1517 | if (min_last_update_acceptable > i->second.last_update) | |
1518 | min_last_update_acceptable = i->second.last_update; | |
1519 | } | |
1520 | } | |
1521 | if (min_last_update_acceptable == eversion_t::max()) | |
1522 | return infos.end(); | |
1523 | | |
// Pass 3: filter and rank. Each tie-breaker below only runs when the
// previous ones compared equal.
f67539c2 | 1524 | auto best = infos.end(); |
9f95a23c TL |
1525 | // find osd with newest last_update (oldest for ec_pool). | |
1526 | // if there are multiples, prefer | |
1527 | // - a longer tail, if it brings another peer into log contiguity | |
1528 | // - the current primary | |
f67539c2 | 1529 | for (auto p = infos.begin(); p != infos.end(); ++p) { |
9f95a23c TL |
1530 | if (restrict_to_up_acting && !is_up(p->first) && | |
1531 | !is_acting(p->first)) | |
1532 | continue; | |
1533 | // Only consider peers with last_update >= min_last_update_acceptable | |
1534 | if (p->second.last_update < min_last_update_acceptable) | |
1535 | continue; | |
1536 | // Disqualify anyone with a too old last_epoch_started | |
1537 | if (p->second.last_epoch_started < max_last_epoch_started_found) | |
1538 | continue; | |
1539 | // Disqualify anyone who is incomplete (not fully backfilled) | |
1540 | if (p->second.is_incomplete()) | |
1541 | continue; | |
1542 | if (best == infos.end()) { | |
1543 | best = p; | |
1544 | continue; | |
1545 | } | |
1546 | // Prefer newer last_update | |
// (the comparison is inverted when the pool requires rollback: the
// older last_update wins there)
1547 | if (pool.info.require_rollback()) { | |
1548 | if (p->second.last_update > best->second.last_update) | |
1549 | continue; | |
1550 | if (p->second.last_update < best->second.last_update) { | |
1551 | best = p; | |
1552 | continue; | |
1553 | } | |
1554 | } else { | |
1555 | if (p->second.last_update < best->second.last_update) | |
1556 | continue; | |
1557 | if (p->second.last_update > best->second.last_update) { | |
1558 | best = p; | |
1559 | continue; | |
1560 | } | |
1561 | } | |
1562 | | |
1563 | // Prefer longer tail | |
1564 | if (p->second.log_tail > best->second.log_tail) { | |
1565 | continue; | |
1566 | } else if (p->second.log_tail < best->second.log_tail) { | |
1567 | best = p; | |
1568 | continue; | |
1569 | } | |
1570 | | |
// Prefer an info that reports no missing objects over one that does.
1571 | if (!p->second.has_missing() && best->second.has_missing()) { | |
1572 | psdout(10) << __func__ << " prefer osd." << p->first | |
1573 | << " because it is complete while best has missing" | |
1574 | << dendl; | |
1575 | best = p; | |
1576 | continue; | |
1577 | } else if (p->second.has_missing() && !best->second.has_missing()) { | |
1578 | psdout(10) << __func__ << " skipping osd." << p->first | |
1579 | << " because it has missing while best is complete" | |
1580 | << dendl; | |
1581 | continue; | |
1582 | } else { | |
1583 | // both are complete or have missing | |
1584 | // fall through | |
1585 | } | |
1586 | | |
1587 | // prefer current primary (usually the caller), all things being equal | |
// (the test is against pg_whoami, i.e. this OSD's own shard)
1588 | if (p->first == pg_whoami) { | |
1589 | psdout(10) << "calc_acting prefer osd." << p->first | |
1590 | << " because it is current primary" << dendl; | |
1591 | best = p; | |
1592 | continue; | |
1593 | } | |
1594 | } | |
1595 | return best; | |
1596 | } | |
1597 | ||
// Compute the wanted acting vector for an erasure-coded PG, one position
// per shard index in [0, size).
//
// For each position i the first usable candidate is taken, in this order:
// up[i], acting[i], then (only when restrict_to_up_acting is false) any
// shard-i entry of all_info. "Usable" means the info is not incomplete
// and its last_update is >= auth_log_shard's log_tail. An up[i] that
// exists but is not usable is added to *backfill.
//
// Outputs:
//   *_want           - the chosen osd per position, CRUSH_ITEM_NONE where
//                      nothing usable was found (swapped in at the end)
//   *backfill        - up shards that need backfill (inserted into)
//   *acting_backfill - the chosen shards plus everything in *backfill
//   ss               - human-readable trace of each decision
//
// NOTE(review): the all_info.find(...) results for up[i] and acting[i] are
// dereferenced without an end() check; presumably callers guarantee those
// shards are present in all_info - confirm.
1598 | void PeeringState::calc_ec_acting( | |
1599 | map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard, | |
1600 | unsigned size, | |
1601 | const vector<int> &acting, | |
1602 | const vector<int> &up, | |
1603 | const map<pg_shard_t, pg_info_t> &all_info, | |
1604 | bool restrict_to_up_acting, | |
1605 | vector<int> *_want, | |
1606 | set<pg_shard_t> *backfill, | |
1607 | set<pg_shard_t> *acting_backfill, | |
1608 | ostream &ss) | |
1609 | { | |
1610 | vector<int> want(size, CRUSH_ITEM_NONE); | |
// Index every known shard by shard id, for the stray lookup below.
1611 | map<shard_id_t, set<pg_shard_t> > all_info_by_shard; | |
f67539c2 | 1612 | for (auto i = all_info.begin(); |
9f95a23c TL |
1613 | i != all_info.end(); | |
1614 | ++i) { | |
1615 | all_info_by_shard[i->first.shard].insert(i->first); | |
1616 | } | |
1617 | for (uint8_t i = 0; i < want.size(); ++i) { | |
1618 | ss << "For position " << (unsigned)i << ": "; | |
// First choice: the up OSD for this position, if usable.
1619 | if (up.size() > (unsigned)i && up[i] != CRUSH_ITEM_NONE && | |
1620 | !all_info.find(pg_shard_t(up[i], shard_id_t(i)))->second.is_incomplete() && | |
1621 | all_info.find(pg_shard_t(up[i], shard_id_t(i)))->second.last_update >= | |
1622 | auth_log_shard->second.log_tail) { | |
1623 | ss << " selecting up[i]: " << pg_shard_t(up[i], shard_id_t(i)) << std::endl; | |
1624 | want[i] = up[i]; | |
1625 | continue; | |
1626 | } | |
// up[i] exists but was not usable: schedule it for backfill and keep
// looking for someone to serve this position in the meantime.
1627 | if (up.size() > (unsigned)i && up[i] != CRUSH_ITEM_NONE) { | |
1628 | ss << " backfilling up[i]: " << pg_shard_t(up[i], shard_id_t(i)) | |
1629 | << " and "; | |
1630 | backfill->insert(pg_shard_t(up[i], shard_id_t(i))); | |
1631 | } | |
1632 | | |
// Second choice: the acting OSD for this position, if usable.
1633 | if (acting.size() > (unsigned)i && acting[i] != CRUSH_ITEM_NONE && | |
1634 | !all_info.find(pg_shard_t(acting[i], shard_id_t(i)))->second.is_incomplete() && | |
1635 | all_info.find(pg_shard_t(acting[i], shard_id_t(i)))->second.last_update >= | |
1636 | auth_log_shard->second.log_tail) { | |
1637 | ss << " selecting acting[i]: " << pg_shard_t(acting[i], shard_id_t(i)) << std::endl; | |
1638 | want[i] = acting[i]; | |
1639 | } else if (!restrict_to_up_acting) { | |
// Last resort: any other known holder of shard i (first usable wins).
f67539c2 | 1640 | for (auto j = all_info_by_shard[shard_id_t(i)].begin(); |
9f95a23c TL |
1641 | j != all_info_by_shard[shard_id_t(i)].end(); | |
1642 | ++j) { | |
1643 | ceph_assert(j->shard == i); | |
1644 | if (!all_info.find(*j)->second.is_incomplete() && | |
1645 | all_info.find(*j)->second.last_update >= | |
1646 | auth_log_shard->second.log_tail) { | |
1647 | ss << " selecting stray: " << *j << std::endl; | |
1648 | want[i] = j->osd; | |
1649 | break; | |
1650 | } | |
1651 | } | |
1652 | if (want[i] == CRUSH_ITEM_NONE) | |
1653 | ss << " failed to fill position " << (int)i << std::endl; | |
1654 | } | |
1655 | } | |
1656 | | |
// acting_backfill = every filled position plus the backfill shards.
1657 | for (uint8_t i = 0; i < want.size(); ++i) { | |
1658 | if (want[i] != CRUSH_ITEM_NONE) { | |
1659 | acting_backfill->insert(pg_shard_t(want[i], shard_id_t(i))); | |
1660 | } | |
1661 | } | |
1662 | acting_backfill->insert(backfill->begin(), backfill->end()); | |
1663 | _want->swap(want); | |
1664 | } | |
1665 | ||
f67539c2 TL |
// Choose the primary for a replicated PG.
//
// up_primary is kept when up is non-empty, its info is not incomplete and
// its last_update is >= auth_log_shard's log_tail. There is one exception:
// when osdmap->get_up_osd_features() includes SERVER_NAUTILUS and the
// approximate number of objects up_primary is missing exceeds
// force_auth_primary_missing_objects, auth_log_shard is chosen instead.
// In every other case auth_log_shard is chosen, and it is asserted not to
// be incomplete.
//
// Returns the iterator of the chosen primary together with the older of
// the chosen primary's log_tail and auth_log_shard's log_tail. A trace of
// the decision is written to ss.
//
// NOTE(review): `primary` comes from all_info.find(up_primary) and is not
// compared with all_info.end(); it is only dereferenced after the
// up.size() check - presumably up_primary is always present in all_info
// when up is non-empty; confirm.
1666 | std::pair<map<pg_shard_t, pg_info_t>::const_iterator, eversion_t> | |
1667 | PeeringState::select_replicated_primary( | |
9f95a23c TL |
1668 | map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard, | |
1669 | uint64_t force_auth_primary_missing_objects, | |
f67539c2 | 1670 | const std::vector<int> &up, |
9f95a23c TL |
1671 | pg_shard_t up_primary, | |
1672 | const map<pg_shard_t, pg_info_t> &all_info, | |
9f95a23c TL |
1673 | const OSDMapRef osdmap, | |
1674 | ostream &ss) | |
1675 | { | |
1676 | pg_shard_t auth_log_shard_id = auth_log_shard->first; | |
1677 | | |
1678 | ss << __func__ << " newest update on osd." << auth_log_shard_id | |
f67539c2 | 1679 | << " with " << auth_log_shard->second << std::endl; |
9f95a23c TL |
1680 | | |
1681 | // select primary | |
1682 | auto primary = all_info.find(up_primary); | |
1683 | if (up.size() && | |
1684 | !primary->second.is_incomplete() && | |
1685 | primary->second.last_update >= | |
1686 | auth_log_shard->second.log_tail) { | |
1687 | if (HAVE_FEATURE(osdmap->get_up_osd_features(), SERVER_NAUTILUS)) { | |
// Approximate missing count = objects the up_primary already reports
// missing + the absolute distance between its last_update version and
// the authoritative one.
1688 | auto approx_missing_objects = | |
1689 | primary->second.stats.stats.sum.num_objects_missing; | |
1690 | auto auth_version = auth_log_shard->second.last_update.version; | |
1691 | auto primary_version = primary->second.last_update.version; | |
1692 | if (auth_version > primary_version) { | |
1693 | approx_missing_objects += auth_version - primary_version; | |
1694 | } else { | |
1695 | approx_missing_objects += primary_version - auth_version; | |
1696 | } | |
1697 | if ((uint64_t)approx_missing_objects > | |
1698 | force_auth_primary_missing_objects) { | |
1699 | primary = auth_log_shard; | |
1700 | ss << "up_primary: " << up_primary << ") has approximate " | |
1701 | << approx_missing_objects | |
1702 | << "(>" << force_auth_primary_missing_objects <<") " | |
1703 | << "missing objects, osd." << auth_log_shard_id | |
1704 | << " selected as primary instead" | |
1705 | << std::endl; | |
1706 | } else { | |
1707 | ss << "up_primary: " << up_primary << ") selected as primary" | |
1708 | << std::endl; | |
1709 | } | |
1710 | } else { | |
1711 | ss << "up_primary: " << up_primary << ") selected as primary" << std::endl; | |
1712 | } | |
1713 | } else { | |
// up is empty, or up_primary is incomplete / too far behind the
// authoritative log: fall back to the authoritative log holder.
1714 | ceph_assert(!auth_log_shard->second.is_incomplete()); | |
1715 | ss << "up[0] needs backfill, osd." << auth_log_shard_id | |
1716 | << " selected as primary instead" << std::endl; | |
1717 | primary = auth_log_shard; | |
1718 | } | |
1719 | | |
1720 | ss << __func__ << " primary is osd." << primary->first | |
1721 | << " with " << primary->second << std::endl; | |
9f95a23c TL |
1722 | | |
1723 | /* We include auth_log_shard->second.log_tail because in GetLog, | |
1724 | * we will request logs back to the min last_update over our | |
1725 | * acting_backfill set, which will result in our log being extended | |
1726 | * as far backwards as necessary to pick up any peers which can | |
1727 | * be log recovered by auth_log_shard's log */ | |
1728 | eversion_t oldest_auth_log_entry = | |
1729 | std::min(primary->second.log_tail, auth_log_shard->second.log_tail); | |
1730 | | |
f67539c2 TL |
1731 | return std::make_pair(primary, oldest_auth_log_entry); | |
1732 | } | |
1733 | ||
1734 | ||
1735 | /** | |
1736 | * calculate the desired acting set. | |
1737 | * | |
1738 | * Choose an appropriate acting set. Prefer up[0], unless it is | |
1739 | * incomplete, or another osd has a longer tail that allows us to | |
1740 | * bring other up nodes up to date. | |
1741 | */ | |
1742 | void PeeringState::calc_replicated_acting( | |
1743 | map<pg_shard_t, pg_info_t>::const_iterator primary, | |
1744 | eversion_t oldest_auth_log_entry, | |
1745 | unsigned size, | |
1746 | const vector<int> &acting, | |
1747 | const vector<int> &up, | |
1748 | pg_shard_t up_primary, | |
1749 | const map<pg_shard_t, pg_info_t> &all_info, | |
1750 | bool restrict_to_up_acting, | |
1751 | vector<int> *want, | |
1752 | set<pg_shard_t> *backfill, | |
1753 | set<pg_shard_t> *acting_backfill, | |
1754 | const OSDMapRef osdmap, | |
1755 | const PGPool& pool, | |
1756 | ostream &ss) | |
1757 | { | |
1758 | ss << __func__ << (restrict_to_up_acting ? " restrict_to_up_acting" : "") | |
1759 | << std::endl; | |
1760 | ||
1761 | want->push_back(primary->first.osd); | |
1762 | acting_backfill->insert(primary->first); | |
1763 | ||
9f95a23c TL |
1764 | // select replicas that have log contiguity with primary. |
1765 | // prefer up, then acting, then any peer_info osds | |
1766 | for (auto i : up) { | |
1767 | pg_shard_t up_cand = pg_shard_t(i, shard_id_t::NO_SHARD); | |
1768 | if (up_cand == primary->first) | |
1769 | continue; | |
1770 | const pg_info_t &cur_info = all_info.find(up_cand)->second; | |
1771 | if (cur_info.is_incomplete() || | |
1772 | cur_info.last_update < oldest_auth_log_entry) { | |
1773 | ss << " shard " << up_cand << " (up) backfill " << cur_info << std::endl; | |
1774 | backfill->insert(up_cand); | |
1775 | acting_backfill->insert(up_cand); | |
1776 | } else { | |
1777 | want->push_back(i); | |
1778 | acting_backfill->insert(up_cand); | |
1779 | ss << " osd." << i << " (up) accepted " << cur_info << std::endl; | |
1780 | } | |
1781 | } | |
1782 | ||
1783 | if (want->size() >= size) { | |
1784 | return; | |
1785 | } | |
1786 | ||
1787 | std::vector<std::pair<eversion_t, int>> candidate_by_last_update; | |
1788 | candidate_by_last_update.reserve(acting.size()); | |
1789 | // This no longer has backfill OSDs, but they are covered above. | |
1790 | for (auto i : acting) { | |
1791 | pg_shard_t acting_cand(i, shard_id_t::NO_SHARD); | |
1792 | // skip up osds we already considered above | |
1793 | if (acting_cand == primary->first) | |
1794 | continue; | |
f67539c2 | 1795 | auto up_it = find(up.begin(), up.end(), i); |
9f95a23c TL |
1796 | if (up_it != up.end()) |
1797 | continue; | |
1798 | ||
1799 | const pg_info_t &cur_info = all_info.find(acting_cand)->second; | |
1800 | if (cur_info.is_incomplete() || | |
1801 | cur_info.last_update < oldest_auth_log_entry) { | |
1802 | ss << " shard " << acting_cand << " (acting) REJECTED " | |
1803 | << cur_info << std::endl; | |
1804 | } else { | |
1805 | candidate_by_last_update.emplace_back(cur_info.last_update, i); | |
1806 | } | |
1807 | } | |
1808 | ||
1809 | auto sort_by_eversion =[](const std::pair<eversion_t, int> &lhs, | |
1810 | const std::pair<eversion_t, int> &rhs) { | |
1811 | return lhs.first > rhs.first; | |
1812 | }; | |
1813 | // sort by last_update, in descending order. | |
1814 | std::sort(candidate_by_last_update.begin(), | |
1815 | candidate_by_last_update.end(), sort_by_eversion); | |
1816 | for (auto &p: candidate_by_last_update) { | |
1817 | ceph_assert(want->size() < size); | |
1818 | want->push_back(p.second); | |
1819 | pg_shard_t s = pg_shard_t(p.second, shard_id_t::NO_SHARD); | |
1820 | acting_backfill->insert(s); | |
1821 | ss << " shard " << s << " (acting) accepted " | |
1822 | << all_info.find(s)->second << std::endl; | |
1823 | if (want->size() >= size) { | |
1824 | return; | |
1825 | } | |
1826 | } | |
1827 | ||
1828 | if (restrict_to_up_acting) { | |
1829 | return; | |
1830 | } | |
1831 | candidate_by_last_update.clear(); | |
1832 | candidate_by_last_update.reserve(all_info.size()); // overestimate but fine | |
1833 | // continue to search stray to find more suitable peers | |
1834 | for (auto &i : all_info) { | |
1835 | // skip up osds we already considered above | |
1836 | if (i.first == primary->first) | |
1837 | continue; | |
f67539c2 | 1838 | auto up_it = find(up.begin(), up.end(), i.first.osd); |
9f95a23c TL |
1839 | if (up_it != up.end()) |
1840 | continue; | |
f67539c2 | 1841 | auto acting_it = find( |
9f95a23c TL |
1842 | acting.begin(), acting.end(), i.first.osd); |
1843 | if (acting_it != acting.end()) | |
1844 | continue; | |
1845 | ||
1846 | if (i.second.is_incomplete() || | |
1847 | i.second.last_update < oldest_auth_log_entry) { | |
1848 | ss << " shard " << i.first << " (stray) REJECTED " << i.second | |
1849 | << std::endl; | |
1850 | } else { | |
1851 | candidate_by_last_update.emplace_back( | |
1852 | i.second.last_update, i.first.osd); | |
1853 | } | |
1854 | } | |
1855 | ||
1856 | if (candidate_by_last_update.empty()) { | |
1857 | // save us some effort | |
1858 | return; | |
1859 | } | |
1860 | ||
1861 | // sort by last_update, in descending order. | |
1862 | std::sort(candidate_by_last_update.begin(), | |
1863 | candidate_by_last_update.end(), sort_by_eversion); | |
1864 | ||
1865 | for (auto &p: candidate_by_last_update) { | |
1866 | ceph_assert(want->size() < size); | |
1867 | want->push_back(p.second); | |
1868 | pg_shard_t s = pg_shard_t(p.second, shard_id_t::NO_SHARD); | |
1869 | acting_backfill->insert(s); | |
1870 | ss << " shard " << s << " (stray) accepted " | |
1871 | << all_info.find(s)->second << std::endl; | |
1872 | if (want->size() >= size) { | |
1873 | return; | |
1874 | } | |
1875 | } | |
1876 | } | |
1877 | ||
f67539c2 TL |
1878 | // Defines osd preference order: acting set, then larger last_update |
1879 | using osd_ord_t = std::tuple<bool, eversion_t>; // <acting, last_update> | |
1880 | using osd_id_t = int; | |
1881 | ||
1882 | class bucket_candidates_t { | |
1883 | std::deque<std::pair<osd_ord_t, osd_id_t>> osds; | |
1884 | int selected = 0; | |
1885 | ||
1886 | public: | |
1887 | void add_osd(osd_ord_t ord, osd_id_t osd) { | |
1888 | // osds will be added in smallest to largest order | |
1889 | assert(osds.empty() || osds.back().first <= ord); | |
1890 | osds.push_back(std::make_pair(ord, osd)); | |
1891 | } | |
1892 | osd_id_t pop_osd() { | |
1893 | ceph_assert(!is_empty()); | |
1894 | auto ret = osds.back(); | |
1895 | osds.pop_back(); | |
1896 | return ret.second; | |
1897 | } | |
1898 | ||
1899 | void inc_selected() { selected++; } | |
1900 | unsigned get_num_selected() const { return selected; } | |
1901 | ||
1902 | osd_ord_t get_ord() const { | |
1903 | return osds.empty() ? std::make_tuple(false, eversion_t()) | |
1904 | : osds.back().first; | |
1905 | } | |
1906 | ||
1907 | bool is_empty() const { return osds.empty(); } | |
1908 | ||
1909 | bool operator<(const bucket_candidates_t &rhs) const { | |
1910 | return std::make_tuple(-selected, get_ord()) < | |
1911 | std::make_tuple(-rhs.selected, rhs.get_ord()); | |
1912 | } | |
1913 | ||
1914 | friend std::ostream &operator<<(std::ostream &, const bucket_candidates_t &); | |
1915 | }; | |
1916 | ||
1917 | std::ostream &operator<<(std::ostream &lhs, const bucket_candidates_t &cand) | |
1918 | { | |
1919 | return lhs << "candidates[" << cand.osds << "]"; | |
1920 | } | |
1921 | ||
1922 | class bucket_heap_t { | |
1923 | using elem_t = std::reference_wrapper<bucket_candidates_t>; | |
1924 | std::vector<elem_t> heap; | |
1925 | ||
1926 | // Max heap -- should emit buckets in order of preference | |
1927 | struct comp { | |
1928 | bool operator()(const elem_t &lhs, const elem_t &rhs) { | |
1929 | return lhs.get() < rhs.get(); | |
1930 | } | |
1931 | }; | |
1932 | public: | |
1933 | void push_if_nonempty(elem_t e) { | |
1934 | if (!e.get().is_empty()) { | |
1935 | heap.push_back(e); | |
1936 | std::push_heap(heap.begin(), heap.end(), comp()); | |
1937 | } | |
1938 | } | |
1939 | elem_t pop() { | |
1940 | std::pop_heap(heap.begin(), heap.end(), comp()); | |
1941 | auto ret = heap.back(); | |
1942 | heap.pop_back(); | |
1943 | return ret; | |
1944 | } | |
1945 | ||
1946 | bool is_empty() const { return heap.empty(); } | |
1947 | }; | |
1948 | ||
1949 | /** | |
1950 | * calc_replicated_acting_stretch | |
1951 | * | |
1952 | * Choose an acting set using as much of the up set as possible; filling | |
1953 | * in the remaining slots so as to maximize the number of crush buckets at | |
1954 | * level pool.info.peering_crush_bucket_barrier represented. | |
1955 | * | |
1956 | * Stretch clusters are a bit special: while they have a "size" the | |
1957 | * same way as normal pools, if we happen to lose a data center | |
1958 | * (we call it a "stretch bucket", but really it'll be a data center or | |
1959 | * a cloud availability zone), we don't actually want to shove | |
1960 | * 2 DC's worth of replication into a single site -- it won't fit! | |
1961 | * So we locally calculate a bucket_max, based | |
1962 | * on the targeted number of stretch buckets for the pool and | |
1963 | * its size. Then we won't pull more than bucket_max from any | |
1964 | * given ancestor even if it leaves us undersized. | |
1965 | ||
1966 | * There are two distinct phases: (commented below) | |
1967 | */ | |
1968 | void PeeringState::calc_replicated_acting_stretch( | |
1969 | map<pg_shard_t, pg_info_t>::const_iterator primary, | |
1970 | eversion_t oldest_auth_log_entry, | |
1971 | unsigned size, | |
1972 | const vector<int> &acting, | |
1973 | const vector<int> &up, | |
1974 | pg_shard_t up_primary, | |
1975 | const map<pg_shard_t, pg_info_t> &all_info, | |
1976 | bool restrict_to_up_acting, | |
1977 | vector<int> *want, | |
1978 | set<pg_shard_t> *backfill, | |
1979 | set<pg_shard_t> *acting_backfill, | |
1980 | const OSDMapRef osdmap, | |
1981 | const PGPool& pool, | |
1982 | ostream &ss) | |
1983 | { | |
1984 | ceph_assert(want); | |
1985 | ceph_assert(acting_backfill); | |
1986 | ceph_assert(backfill); | |
1987 | ss << __func__ << (restrict_to_up_acting ? " restrict_to_up_acting" : "") | |
1988 | << std::endl; | |
1989 | ||
1990 | auto used = [want](int osd) { | |
1991 | return std::find(want->begin(), want->end(), osd) != want->end(); | |
1992 | }; | |
1993 | ||
1994 | auto usable_info = [&](const auto &cur_info) mutable { | |
1995 | return !(cur_info.is_incomplete() || | |
1996 | cur_info.last_update < oldest_auth_log_entry); | |
1997 | }; | |
1998 | ||
1999 | auto osd_info = [&](int osd) mutable -> const pg_info_t & { | |
2000 | pg_shard_t cand = pg_shard_t(osd, shard_id_t::NO_SHARD); | |
2001 | const pg_info_t &cur_info = all_info.find(cand)->second; | |
2002 | return cur_info; | |
2003 | }; | |
2004 | ||
2005 | auto usable_osd = [&](int osd) mutable { | |
2006 | return usable_info(osd_info(osd)); | |
2007 | }; | |
2008 | ||
2009 | std::map<int, bucket_candidates_t> ancestors; | |
2010 | auto get_ancestor = [&](int osd) mutable { | |
2011 | int ancestor = osdmap->crush->get_parent_of_type( | |
2012 | osd, | |
2013 | pool.info.peering_crush_bucket_barrier, | |
2014 | pool.info.crush_rule); | |
2015 | return &ancestors[ancestor]; | |
2016 | }; | |
2017 | ||
2018 | unsigned bucket_max = pool.info.size / pool.info.peering_crush_bucket_target; | |
2019 | if (bucket_max * pool.info.peering_crush_bucket_target < pool.info.size) { | |
2020 | ++bucket_max; | |
2021 | } | |
2022 | ||
2023 | /* 1) Select all usable osds from the up set as well as the primary | |
2024 | * | |
2025 | * We also stash any unusable osds from up into backfill. | |
2026 | */ | |
2027 | auto add_required = [&](int osd) { | |
2028 | if (!used(osd)) { | |
2029 | want->push_back(osd); | |
2030 | acting_backfill->insert( | |
2031 | pg_shard_t(osd, shard_id_t::NO_SHARD)); | |
2032 | get_ancestor(osd)->inc_selected(); | |
2033 | } | |
2034 | }; | |
2035 | add_required(primary->first.osd); | |
2036 | ss << " osd " << primary->first.osd << " primary accepted " | |
2037 | << osd_info(primary->first.osd) << std::endl; | |
2038 | for (auto upcand: up) { | |
2039 | auto upshard = pg_shard_t(upcand, shard_id_t::NO_SHARD); | |
2040 | auto &curinfo = osd_info(upcand); | |
2041 | if (usable_osd(upcand)) { | |
2042 | ss << " osd " << upcand << " (up) accepted " << curinfo << std::endl; | |
2043 | add_required(upcand); | |
2044 | } else { | |
2045 | ss << " osd " << upcand << " (up) backfill " << curinfo << std::endl; | |
2046 | backfill->insert(upshard); | |
2047 | acting_backfill->insert(upshard); | |
2048 | } | |
2049 | } | |
2050 | ||
2051 | if (want->size() >= pool.info.size) { // non-failed CRUSH mappings are valid | |
2052 | ss << " up set sufficient" << std::endl; | |
2053 | return; | |
2054 | } | |
2055 | ss << " up set insufficient, considering remaining osds" << std::endl; | |
2056 | ||
2057 | /* 2) Fill out remaining slots from usable osds in all_info | |
2058 | * while maximizing the number of ancestor nodes at the | |
2059 | * barrier_id crush level. | |
2060 | */ | |
2061 | { | |
2062 | std::vector<std::pair<osd_ord_t, osd_id_t>> candidates; | |
2063 | /* To do this, we first filter the set of usable osd into an ordered | |
2064 | * list of usable osds | |
2065 | */ | |
2066 | auto get_osd_ord = [&](bool is_acting, const pg_info_t &info) -> osd_ord_t { | |
2067 | return std::make_tuple( | |
2068 | !is_acting /* acting should sort first */, | |
2069 | info.last_update); | |
2070 | }; | |
2071 | for (auto &cand : acting) { | |
2072 | auto &cand_info = osd_info(cand); | |
2073 | if (!used(cand) && usable_info(cand_info)) { | |
2074 | ss << " acting candidate " << cand << " " << cand_info << std::endl; | |
2075 | candidates.push_back(std::make_pair(get_osd_ord(true, cand_info), cand)); | |
2076 | } | |
2077 | } | |
2078 | if (!restrict_to_up_acting) { | |
2079 | for (auto &[cand, info] : all_info) { | |
2080 | if (!used(cand.osd) && usable_info(info) && | |
2081 | (std::find(acting.begin(), acting.end(), cand.osd) | |
2082 | == acting.end())) { | |
2083 | ss << " other candidate " << cand << " " << info << std::endl; | |
2084 | candidates.push_back( | |
2085 | std::make_pair(get_osd_ord(false, info), cand.osd)); | |
2086 | } | |
2087 | } | |
2088 | } | |
2089 | std::sort(candidates.begin(), candidates.end()); | |
2090 | ||
2091 | // We then filter these candidates by ancestor | |
2092 | std::for_each(candidates.begin(), candidates.end(), [&](auto cand) { | |
2093 | get_ancestor(cand.second)->add_osd(cand.first, cand.second); | |
2094 | }); | |
2095 | } | |
2096 | ||
2097 | auto pop_ancestor = [&](auto &ancestor) { | |
2098 | ceph_assert(!ancestor.is_empty()); | |
2099 | auto osd = ancestor.pop_osd(); | |
2100 | ||
2101 | ss << " accepting candidate " << osd << std::endl; | |
2102 | ||
2103 | ceph_assert(!used(osd)); | |
2104 | ceph_assert(usable_osd(osd)); | |
2105 | ||
2106 | want->push_back(osd); | |
2107 | acting_backfill->insert( | |
2108 | pg_shard_t(osd, shard_id_t::NO_SHARD)); | |
2109 | ancestor.inc_selected(); | |
2110 | }; | |
2111 | ||
2112 | /* Next, we use the ancestors map to grab a descendant of the | |
2113 | * peering_crush_mandatory_member if not already represented. | |
2114 | * | |
2115 | * TODO: using 0 here to match other users. Prior to merge, I | |
2116 | * expect that this and other users should instead check against | |
2117 | * CRUSH_ITEM_NONE. | |
2118 | */ | |
2119 | if (pool.info.peering_crush_mandatory_member != CRUSH_ITEM_NONE) { | |
2120 | auto aiter = ancestors.find(pool.info.peering_crush_mandatory_member); | |
2121 | if (aiter != ancestors.end() && | |
2122 | !aiter->second.get_num_selected()) { | |
2123 | ss << " adding required ancestor " << aiter->first << std::endl; | |
2124 | ceph_assert(!aiter->second.is_empty()); // wouldn't exist otherwise | |
2125 | pop_ancestor(aiter->second); | |
2126 | } | |
2127 | } | |
2128 | ||
2129 | /* We then place the ancestors in a heap ordered by fewest selected | |
2130 | * and then by the ordering token of the next osd */ | |
2131 | bucket_heap_t aheap; | |
2132 | std::for_each(ancestors.begin(), ancestors.end(), [&](auto &anc) { | |
2133 | aheap.push_if_nonempty(anc.second); | |
2134 | }); | |
2135 | ||
2136 | /* and pull from this heap until it's empty or we have enough. | |
2137 | * "We have enough" is a sufficient check here for | |
2138 | * stretch_set_can_peer() because our heap sorting always | |
2139 | * pulls from ancestors with the least number of included OSDs, | |
2140 | * so if it is possible to satisfy the bucket_count constraints we | |
2141 | * will do so. | |
2142 | */ | |
2143 | while (!aheap.is_empty() && want->size() < pool.info.size) { | |
2144 | auto next = aheap.pop(); | |
2145 | pop_ancestor(next.get()); | |
2146 | if (next.get().get_num_selected() < bucket_max) { | |
2147 | aheap.push_if_nonempty(next); | |
2148 | } | |
2149 | } | |
2150 | ||
2151 | /* The end result is that we should have as many buckets covered as | |
2152 | * possible while respecting up, the primary selection, | |
2153 | * the pool size (given bucket count constraints), | |
2154 | * and the mandatory member. | |
2155 | */ | |
2156 | } | |
2157 | ||
2158 | ||
9f95a23c TL |
2159 | bool PeeringState::recoverable(const vector<int> &want) const |
2160 | { | |
2161 | unsigned num_want_acting = 0; | |
2162 | set<pg_shard_t> have; | |
2163 | for (int i = 0; i < (int)want.size(); ++i) { | |
2164 | if (want[i] != CRUSH_ITEM_NONE) { | |
2165 | ++num_want_acting; | |
2166 | have.insert( | |
2167 | pg_shard_t( | |
2168 | want[i], | |
2169 | pool.info.is_erasure() ? shard_id_t(i) : shard_id_t::NO_SHARD)); | |
2170 | } | |
2171 | } | |
2172 | ||
2173 | if (num_want_acting < pool.info.min_size) { | |
2174 | const bool recovery_ec_pool_below_min_size= | |
2175 | HAVE_FEATURE(get_osdmap()->get_up_osd_features(), SERVER_OCTOPUS); | |
2176 | ||
2177 | if (pool.info.is_erasure() && !recovery_ec_pool_below_min_size) { | |
2178 | psdout(10) << __func__ << " failed, ec recovery below min size not supported by pre-octopus" << dendl; | |
2179 | return false; | |
2180 | } else if (!cct->_conf.get_val<bool>("osd_allow_recovery_below_min_size")) { | |
2181 | psdout(10) << __func__ << " failed, recovery below min size not enabled" << dendl; | |
2182 | return false; | |
2183 | } | |
2184 | } | |
2185 | if (missing_loc.get_recoverable_predicate()(have)) { | |
2186 | return true; | |
2187 | } else { | |
2188 | psdout(10) << __func__ << " failed, not recoverable " << dendl; | |
2189 | return false; | |
2190 | } | |
2191 | } | |
2192 | ||
2193 | void PeeringState::choose_async_recovery_ec( | |
2194 | const map<pg_shard_t, pg_info_t> &all_info, | |
2195 | const pg_info_t &auth_info, | |
2196 | vector<int> *want, | |
2197 | set<pg_shard_t> *async_recovery, | |
2198 | const OSDMapRef osdmap) const | |
2199 | { | |
2200 | set<pair<int, pg_shard_t> > candidates_by_cost; | |
2201 | for (uint8_t i = 0; i < want->size(); ++i) { | |
2202 | if ((*want)[i] == CRUSH_ITEM_NONE) | |
2203 | continue; | |
2204 | ||
2205 | // Considering log entries to recover is accurate enough for | |
2206 | // now. We could use minimum_to_decode_with_cost() later if | |
2207 | // necessary. | |
2208 | pg_shard_t shard_i((*want)[i], shard_id_t(i)); | |
2209 | // do not include strays | |
2210 | if (stray_set.find(shard_i) != stray_set.end()) | |
2211 | continue; | |
2212 | // Do not include an osd that is not up, since choosing it as | |
2213 | // an async_recovery_target will move it out of the acting set. | |
2214 | // This results in it being identified as a stray during peering, | |
2215 | // because it is no longer in the up or acting set. | |
2216 | if (!is_up(shard_i)) | |
2217 | continue; | |
2218 | auto shard_info = all_info.find(shard_i)->second; | |
2219 | // for ec pools we rollback all entries past the authoritative | |
2220 | // last_update *before* activation. This is relatively inexpensive | |
2221 | // compared to recovery, since it is purely local, so treat shards | |
2222 | // past the authoritative last_update the same as those equal to it. | |
2223 | version_t auth_version = auth_info.last_update.version; | |
2224 | version_t candidate_version = shard_info.last_update.version; | |
2225 | if (HAVE_FEATURE(osdmap->get_up_osd_features(), SERVER_NAUTILUS)) { | |
2226 | auto approx_missing_objects = | |
2227 | shard_info.stats.stats.sum.num_objects_missing; | |
2228 | if (auth_version > candidate_version) { | |
2229 | approx_missing_objects += auth_version - candidate_version; | |
2230 | } | |
2231 | if (static_cast<uint64_t>(approx_missing_objects) > | |
2232 | cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) { | |
2233 | candidates_by_cost.emplace(approx_missing_objects, shard_i); | |
2234 | } | |
2235 | } else { | |
2236 | if (auth_version > candidate_version && | |
2237 | (auth_version - candidate_version) > cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) { | |
2238 | candidates_by_cost.insert(make_pair(auth_version - candidate_version, shard_i)); | |
2239 | } | |
2240 | } | |
2241 | } | |
2242 | ||
2243 | psdout(20) << __func__ << " candidates by cost are: " << candidates_by_cost | |
2244 | << dendl; | |
2245 | ||
2246 | // take out as many osds as we can for async recovery, in order of cost | |
2247 | for (auto rit = candidates_by_cost.rbegin(); | |
2248 | rit != candidates_by_cost.rend(); ++rit) { | |
2249 | pg_shard_t cur_shard = rit->second; | |
2250 | vector<int> candidate_want(*want); | |
2251 | candidate_want[cur_shard.shard.id] = CRUSH_ITEM_NONE; | |
2252 | if (recoverable(candidate_want)) { | |
2253 | want->swap(candidate_want); | |
2254 | async_recovery->insert(cur_shard); | |
2255 | } | |
2256 | } | |
2257 | psdout(20) << __func__ << " result want=" << *want | |
2258 | << " async_recovery=" << *async_recovery << dendl; | |
2259 | } | |
2260 | ||
2261 | void PeeringState::choose_async_recovery_replicated( | |
2262 | const map<pg_shard_t, pg_info_t> &all_info, | |
2263 | const pg_info_t &auth_info, | |
2264 | vector<int> *want, | |
2265 | set<pg_shard_t> *async_recovery, | |
2266 | const OSDMapRef osdmap) const | |
2267 | { | |
2268 | set<pair<int, pg_shard_t> > candidates_by_cost; | |
2269 | for (auto osd_num : *want) { | |
2270 | pg_shard_t shard_i(osd_num, shard_id_t::NO_SHARD); | |
2271 | // do not include strays | |
2272 | if (stray_set.find(shard_i) != stray_set.end()) | |
2273 | continue; | |
2274 | // Do not include an osd that is not up, since choosing it as | |
2275 | // an async_recovery_target will move it out of the acting set. | |
2276 | // This results in it being identified as a stray during peering, | |
2277 | // because it is no longer in the up or acting set. | |
2278 | if (!is_up(shard_i)) | |
2279 | continue; | |
2280 | auto shard_info = all_info.find(shard_i)->second; | |
2281 | // use the approximate magnitude of the difference in length of | |
2282 | // logs plus historical missing objects as the cost of recovery | |
2283 | version_t auth_version = auth_info.last_update.version; | |
2284 | version_t candidate_version = shard_info.last_update.version; | |
2285 | if (HAVE_FEATURE(osdmap->get_up_osd_features(), SERVER_NAUTILUS)) { | |
2286 | auto approx_missing_objects = | |
2287 | shard_info.stats.stats.sum.num_objects_missing; | |
2288 | if (auth_version > candidate_version) { | |
2289 | approx_missing_objects += auth_version - candidate_version; | |
2290 | } else { | |
2291 | approx_missing_objects += candidate_version - auth_version; | |
2292 | } | |
2293 | if (static_cast<uint64_t>(approx_missing_objects) > | |
2294 | cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) { | |
2295 | candidates_by_cost.emplace(approx_missing_objects, shard_i); | |
2296 | } | |
2297 | } else { | |
2298 | size_t approx_entries; | |
2299 | if (auth_version > candidate_version) { | |
2300 | approx_entries = auth_version - candidate_version; | |
2301 | } else { | |
2302 | approx_entries = candidate_version - auth_version; | |
2303 | } | |
2304 | if (approx_entries > cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) { | |
2305 | candidates_by_cost.insert(make_pair(approx_entries, shard_i)); | |
2306 | } | |
2307 | } | |
2308 | } | |
2309 | ||
2310 | psdout(20) << __func__ << " candidates by cost are: " << candidates_by_cost | |
2311 | << dendl; | |
2312 | // take out as many osds as we can for async recovery, in order of cost | |
2313 | for (auto rit = candidates_by_cost.rbegin(); | |
2314 | rit != candidates_by_cost.rend(); ++rit) { | |
2315 | if (want->size() <= pool.info.min_size) { | |
2316 | break; | |
2317 | } | |
2318 | pg_shard_t cur_shard = rit->second; | |
2319 | vector<int> candidate_want(*want); | |
2320 | for (auto it = candidate_want.begin(); it != candidate_want.end(); ++it) { | |
2321 | if (*it == cur_shard.osd) { | |
f67539c2 TL |
2322 | candidate_want.erase(it); |
2323 | if (pool.info.stretch_set_can_peer(candidate_want, *osdmap, NULL)) { | |
2324 | // if we're in stretch mode, we can only remove the osd if it doesn't | |
2325 | // break peering limits. | |
2326 | want->swap(candidate_want); | |
2327 | async_recovery->insert(cur_shard); | |
2328 | } | |
2329 | break; | |
9f95a23c TL |
2330 | } |
2331 | } | |
2332 | } | |
f67539c2 | 2333 | |
9f95a23c TL |
2334 | psdout(20) << __func__ << " result want=" << *want |
2335 | << " async_recovery=" << *async_recovery << dendl; | |
2336 | } | |
2337 | ||
9f95a23c TL |
2338 | /** |
2339 | * choose acting | |
2340 | * | |
2341 | * calculate the desired acting, and request a change with the monitor | |
2342 | * if it differs from the current acting. | |
2343 | * | |
2344 | * if restrict_to_up_acting=true, we filter out anything that's not in | |
2345 | * up/acting. in order to lift this restriction, we need to | |
2346 | * 1) check whether it's worth switching the acting set any time we get | |
2347 | * a new pg info (not just here, when recovery finishes) | |
2348 | * 2) check whether anything in want_acting went down on each new map | |
2349 | * (and, if so, calculate a new want_acting) | |
2350 | * 3) remove the assertion in PG::PeeringState::Active::react(const AdvMap) | |
2351 | * TODO! | |
2352 | */ | |
2353 | bool PeeringState::choose_acting(pg_shard_t &auth_log_shard_id, | |
2354 | bool restrict_to_up_acting, | |
2355 | bool *history_les_bound, | |
2356 | bool request_pg_temp_change_only) | |
2357 | { | |
2358 | map<pg_shard_t, pg_info_t> all_info(peer_info.begin(), peer_info.end()); | |
2359 | all_info[pg_whoami] = info; | |
2360 | ||
2361 | if (cct->_conf->subsys.should_gather<dout_subsys, 10>()) { | |
f67539c2 | 2362 | for (auto p = all_info.begin(); p != all_info.end(); ++p) { |
9f95a23c TL |
2363 | psdout(10) << __func__ << " all_info osd." << p->first << " " |
2364 | << p->second << dendl; | |
2365 | } | |
2366 | } | |
2367 | ||
f67539c2 TL |
2368 | auto auth_log_shard = find_best_info(all_info, restrict_to_up_acting, |
2369 | history_les_bound); | |
9f95a23c TL |
2370 | |
2371 | if (auth_log_shard == all_info.end()) { | |
2372 | if (up != acting) { | |
2373 | psdout(10) << __func__ << " no suitable info found (incomplete backfills?)," | |
2374 | << " reverting to up" << dendl; | |
2375 | want_acting = up; | |
2376 | vector<int> empty; | |
2377 | pl->queue_want_pg_temp(empty); | |
2378 | } else { | |
2379 | psdout(10) << __func__ << " failed" << dendl; | |
2380 | ceph_assert(want_acting.empty()); | |
2381 | } | |
2382 | return false; | |
2383 | } | |
2384 | ||
2385 | ceph_assert(!auth_log_shard->second.is_incomplete()); | |
2386 | auth_log_shard_id = auth_log_shard->first; | |
2387 | ||
2388 | set<pg_shard_t> want_backfill, want_acting_backfill; | |
2389 | vector<int> want; | |
2390 | stringstream ss; | |
f67539c2 TL |
2391 | if (pool.info.is_replicated()) { |
2392 | auto [primary_shard, oldest_log] = select_replicated_primary( | |
9f95a23c TL |
2393 | auth_log_shard, |
2394 | cct->_conf.get_val<uint64_t>( | |
f67539c2 | 2395 | "osd_force_auth_primary_missing_objects"), |
9f95a23c TL |
2396 | up, |
2397 | up_primary, | |
2398 | all_info, | |
9f95a23c TL |
2399 | get_osdmap(), |
2400 | ss); | |
f67539c2 TL |
2401 | if (pool.info.is_stretch_pool()) { |
2402 | calc_replicated_acting_stretch( | |
2403 | primary_shard, | |
2404 | oldest_log, | |
2405 | get_osdmap()->get_pg_size(info.pgid.pgid), | |
2406 | acting, | |
2407 | up, | |
2408 | up_primary, | |
2409 | all_info, | |
2410 | restrict_to_up_acting, | |
2411 | &want, | |
2412 | &want_backfill, | |
2413 | &want_acting_backfill, | |
2414 | get_osdmap(), | |
2415 | pool, | |
2416 | ss); | |
2417 | } else { | |
2418 | calc_replicated_acting( | |
2419 | primary_shard, | |
2420 | oldest_log, | |
2421 | get_osdmap()->get_pg_size(info.pgid.pgid), | |
2422 | acting, | |
2423 | up, | |
2424 | up_primary, | |
2425 | all_info, | |
2426 | restrict_to_up_acting, | |
2427 | &want, | |
2428 | &want_backfill, | |
2429 | &want_acting_backfill, | |
2430 | get_osdmap(), | |
2431 | pool, | |
2432 | ss); | |
2433 | } | |
2434 | } else { | |
9f95a23c TL |
2435 | calc_ec_acting( |
2436 | auth_log_shard, | |
2437 | get_osdmap()->get_pg_size(info.pgid.pgid), | |
2438 | acting, | |
2439 | up, | |
2440 | all_info, | |
2441 | restrict_to_up_acting, | |
2442 | &want, | |
2443 | &want_backfill, | |
2444 | &want_acting_backfill, | |
2445 | ss); | |
f67539c2 | 2446 | } |
9f95a23c TL |
2447 | psdout(10) << ss.str() << dendl; |
2448 | ||
2449 | if (!recoverable(want)) { | |
2450 | want_acting.clear(); | |
2451 | return false; | |
2452 | } | |
2453 | ||
2454 | set<pg_shard_t> want_async_recovery; | |
2455 | if (HAVE_FEATURE(get_osdmap()->get_up_osd_features(), SERVER_MIMIC)) { | |
2456 | if (pool.info.is_erasure()) { | |
2457 | choose_async_recovery_ec( | |
2458 | all_info, auth_log_shard->second, &want, &want_async_recovery, | |
2459 | get_osdmap()); | |
2460 | } else { | |
2461 | choose_async_recovery_replicated( | |
2462 | all_info, auth_log_shard->second, &want, &want_async_recovery, | |
2463 | get_osdmap()); | |
2464 | } | |
2465 | } | |
2466 | while (want.size() > pool.info.size) { | |
2467 | // async recovery should have taken out as many osds as it can. | |
2468 | // if not, then always evict the last peer | |
2469 | // (will get synchronously recovered later) | |
2470 | psdout(10) << __func__ << " evicting osd." << want.back() | |
2471 | << " from oversized want " << want << dendl; | |
2472 | want.pop_back(); | |
2473 | } | |
2474 | if (want != acting) { | |
2475 | psdout(10) << __func__ << " want " << want << " != acting " << acting | |
2476 | << ", requesting pg_temp change" << dendl; | |
2477 | want_acting = want; | |
2478 | ||
2479 | if (!cct->_conf->osd_debug_no_acting_change) { | |
2480 | if (want_acting == up) { | |
2481 | // There can't be any pending backfill if | |
2482 | // want is the same as crush map up OSDs. | |
2483 | ceph_assert(want_backfill.empty()); | |
2484 | vector<int> empty; | |
2485 | pl->queue_want_pg_temp(empty); | |
2486 | } else | |
2487 | pl->queue_want_pg_temp(want); | |
2488 | } | |
2489 | return false; | |
2490 | } | |
f67539c2 | 2491 | |
9f95a23c TL |
2492 | if (request_pg_temp_change_only) |
2493 | return true; | |
2494 | want_acting.clear(); | |
2495 | acting_recovery_backfill = want_acting_backfill; | |
2496 | psdout(10) << "acting_recovery_backfill is " | |
2497 | << acting_recovery_backfill << dendl; | |
2498 | ceph_assert( | |
2499 | backfill_targets.empty() || | |
2500 | backfill_targets == want_backfill); | |
2501 | if (backfill_targets.empty()) { | |
2502 | // Caller is GetInfo | |
2503 | backfill_targets = want_backfill; | |
2504 | } | |
2505 | // Adding !needs_recovery() to let the async_recovery_targets reset after recovery is complete | |
2506 | ceph_assert( | |
2507 | async_recovery_targets.empty() || | |
2508 | async_recovery_targets == want_async_recovery || | |
2509 | !needs_recovery()); | |
2510 | if (async_recovery_targets.empty() || !needs_recovery()) { | |
2511 | async_recovery_targets = want_async_recovery; | |
2512 | } | |
2513 | // Will not change if already set because up would have had to change | |
2514 | // Verify that nothing in backfill is in stray_set | |
f67539c2 | 2515 | for (auto i = want_backfill.begin(); i != want_backfill.end(); ++i) { |
9f95a23c TL |
2516 | ceph_assert(stray_set.find(*i) == stray_set.end()); |
2517 | } | |
2518 | psdout(10) << "choose_acting want=" << want << " backfill_targets=" | |
2519 | << want_backfill << " async_recovery_targets=" | |
2520 | << async_recovery_targets << dendl; | |
2521 | return true; | |
2522 | } | |
2523 | ||
2524 | void PeeringState::log_weirdness() | |
2525 | { | |
2526 | if (pg_log.get_tail() != info.log_tail) | |
2527 | pl->get_clog_error() << info.pgid | |
2528 | << " info mismatch, log.tail " << pg_log.get_tail() | |
2529 | << " != info.log_tail " << info.log_tail; | |
2530 | if (pg_log.get_head() != info.last_update) | |
2531 | pl->get_clog_error() << info.pgid | |
2532 | << " info mismatch, log.head " << pg_log.get_head() | |
2533 | << " != info.last_update " << info.last_update; | |
2534 | ||
2535 | if (!pg_log.get_log().empty()) { | |
2536 | // sloppy check | |
2537 | if ((pg_log.get_log().log.begin()->version <= pg_log.get_tail())) | |
2538 | pl->get_clog_error() << info.pgid | |
2539 | << " log bound mismatch, info (tail,head] (" | |
2540 | << pg_log.get_tail() << "," | |
2541 | << pg_log.get_head() << "]" | |
2542 | << " actual [" | |
2543 | << pg_log.get_log().log.begin()->version << "," | |
2544 | << pg_log.get_log().log.rbegin()->version << "]"; | |
2545 | } | |
2546 | ||
2547 | if (pg_log.get_log().caller_ops.size() > pg_log.get_log().log.size()) { | |
2548 | pl->get_clog_error() << info.pgid | |
2549 | << " caller_ops.size " | |
2550 | << pg_log.get_log().caller_ops.size() | |
2551 | << " > log size " << pg_log.get_log().log.size(); | |
2552 | } | |
2553 | } | |
2554 | ||
2555 | /* | |
2556 | * Process information from a replica to determine if it could have any | |
2557 | * objects that i need. | |
2558 | * | |
2559 | * TODO: if the missing set becomes very large, this could get expensive. | |
2560 | * Instead, we probably want to just iterate over our unfound set. | |
2561 | */ | |
2562 | bool PeeringState::search_for_missing( | |
2563 | const pg_info_t &oinfo, const pg_missing_t &omissing, | |
2564 | pg_shard_t from, | |
2565 | PeeringCtxWrapper &ctx) | |
2566 | { | |
2567 | uint64_t num_unfound_before = missing_loc.num_unfound(); | |
2568 | bool found_missing = missing_loc.add_source_info( | |
2569 | from, oinfo, omissing, ctx.handle); | |
2570 | if (found_missing && num_unfound_before != missing_loc.num_unfound()) | |
2571 | pl->publish_stats_to_osd(); | |
2572 | // avoid doing this if the peer is empty. This is abit of paranoia | |
2573 | // to avoid doing something rash if add_source_info() above | |
2574 | // incorrectly decided we found something new. (if the peer has | |
2575 | // last_update=0'0 that's impossible.) | |
2576 | if (found_missing && | |
2577 | oinfo.last_update != eversion_t()) { | |
2578 | pg_info_t tinfo(oinfo); | |
2579 | tinfo.pgid.shard = pg_whoami.shard; | |
2580 | ctx.send_info( | |
2581 | from.osd, | |
2582 | spg_t(info.pgid.pgid, from.shard), | |
2583 | get_osdmap_epoch(), // fixme: use lower epoch? | |
2584 | get_osdmap_epoch(), | |
2585 | tinfo); | |
2586 | } | |
2587 | return found_missing; | |
2588 | } | |
2589 | ||
2590 | bool PeeringState::discover_all_missing( | |
2591 | BufferedRecoveryMessages &rctx) | |
2592 | { | |
2593 | auto &missing = pg_log.get_missing(); | |
2594 | uint64_t unfound = get_num_unfound(); | |
2595 | bool any = false; // did we start any queries | |
2596 | ||
2597 | psdout(10) << __func__ << " " | |
2598 | << missing.num_missing() << " missing, " | |
2599 | << unfound << " unfound" | |
2600 | << dendl; | |
2601 | ||
f67539c2 TL |
2602 | auto m = might_have_unfound.begin(); |
2603 | auto mend = might_have_unfound.end(); | |
9f95a23c TL |
2604 | for (; m != mend; ++m) { |
2605 | pg_shard_t peer(*m); | |
2606 | ||
2607 | if (!get_osdmap()->is_up(peer.osd)) { | |
2608 | psdout(20) << __func__ << " skipping down osd." << peer << dendl; | |
2609 | continue; | |
2610 | } | |
2611 | ||
2612 | if (peer_purged.count(peer)) { | |
2613 | psdout(20) << __func__ << " skipping purged osd." << peer << dendl; | |
2614 | continue; | |
2615 | } | |
2616 | ||
f67539c2 | 2617 | auto iter = peer_info.find(peer); |
9f95a23c TL |
2618 | if (iter != peer_info.end() && |
2619 | (iter->second.is_empty() || iter->second.dne())) { | |
2620 | // ignore empty peers | |
2621 | continue; | |
2622 | } | |
2623 | ||
2624 | // If we've requested any of this stuff, the pg_missing_t information | |
2625 | // should be on its way. | |
2626 | // TODO: coalsce requested_* into a single data structure | |
2627 | if (peer_missing.find(peer) != peer_missing.end()) { | |
2628 | psdout(20) << __func__ << ": osd." << peer | |
2629 | << ": we already have pg_missing_t" << dendl; | |
2630 | continue; | |
2631 | } | |
2632 | if (peer_log_requested.find(peer) != peer_log_requested.end()) { | |
2633 | psdout(20) << __func__ << ": osd." << peer | |
2634 | << ": in peer_log_requested" << dendl; | |
2635 | continue; | |
2636 | } | |
2637 | if (peer_missing_requested.find(peer) != peer_missing_requested.end()) { | |
2638 | psdout(20) << __func__ << ": osd." << peer | |
2639 | << ": in peer_missing_requested" << dendl; | |
2640 | continue; | |
2641 | } | |
2642 | ||
2643 | // Request missing | |
2644 | psdout(10) << __func__ << ": osd." << peer << ": requesting pg_missing_t" | |
2645 | << dendl; | |
2646 | peer_missing_requested.insert(peer); | |
2647 | rctx.send_query( | |
2648 | peer.osd, | |
2649 | spg_t(info.pgid.pgid, peer.shard), | |
2650 | pg_query_t( | |
2651 | pg_query_t::FULLLOG, | |
2652 | peer.shard, pg_whoami.shard, | |
2653 | info.history, get_osdmap_epoch())); | |
2654 | any = true; | |
2655 | } | |
2656 | return any; | |
2657 | } | |
2658 | ||
2659 | /* Build the might_have_unfound set. | |
2660 | * | |
2661 | * This is used by the primary OSD during recovery. | |
2662 | * | |
2663 | * This set tracks the OSDs which might have unfound objects that the primary | |
2664 | * OSD needs. As we receive pg_missing_t from each OSD in might_have_unfound, we | |
2665 | * will remove the OSD from the set. | |
2666 | */ | |
2667 | void PeeringState::build_might_have_unfound() | |
2668 | { | |
2669 | ceph_assert(might_have_unfound.empty()); | |
2670 | ceph_assert(is_primary()); | |
2671 | ||
2672 | psdout(10) << __func__ << dendl; | |
2673 | ||
2674 | check_past_interval_bounds(); | |
2675 | ||
2676 | might_have_unfound = past_intervals.get_might_have_unfound( | |
2677 | pg_whoami, | |
2678 | pool.info.is_erasure()); | |
2679 | ||
2680 | // include any (stray) peers | |
f67539c2 | 2681 | for (auto p = peer_info.begin(); p != peer_info.end(); ++p) |
9f95a23c TL |
2682 | might_have_unfound.insert(p->first); |
2683 | ||
2684 | psdout(15) << __func__ << ": built " << might_have_unfound << dendl; | |
2685 | } | |
2686 | ||
// PeeringState::activate -- bring this PG shard into service for the
// interval identified by activation_epoch.
//
// Parameters:
//   t                 transaction handed to schedule_event_on_commit() and to
//                     get_log_handler() for the final roll_forward
//   activation_epoch  epoch recorded as last_epoch_started (subject to the
//                     role checks below) and carried in ActivateCommitted
//   ctx               peering context; used for send_info(), ctx.handle and
//                     ctx.msgs
//
// Must be called while !is_peered() (asserted on entry).  Every caller path
// clears PG_STATE_DOWN, resets the applied/ondisk version trackers, marks
// info dirty and queues an ActivateCommitted event.  The primary additionally
// computes the snap trim set, sends a pg_info or MOSDPGLog to every other
// shard in acting_recovery_backfill, seeds missing_loc, and sets
// PG_STATE_ACTIVATING (plus PG_STATE_UNDERSIZED when actingset is smaller
// than the pool's pg size).
2687 | void PeeringState::activate( | |
2688 | ObjectStore::Transaction& t, | |
2689 | epoch_t activation_epoch, | |
2690 | PeeringCtxWrapper &ctx) | |
2691 | { | |
2692 | ceph_assert(!is_peered()); | |
2693 | ||
2694 | // twiddle pg state | |
2695 | state_clear(PG_STATE_DOWN); | |
2696 | ||
2697 | send_notify = false; | |
2698 | ||
2699 | if (is_primary()) { | |
2700 | // only update primary last_epoch_started if we will go active | |
f67539c2 | 2701 | if (acting_set_writeable()) {
9f95a23c TL |
2702 | ceph_assert(cct->_conf->osd_find_best_info_ignore_history_les || | |
2703 | info.last_epoch_started <= activation_epoch); | |
2704 | info.last_epoch_started = activation_epoch; | |
2705 | info.last_interval_started = info.history.same_interval_since; | |
2706 | } | |
2707 | } else if (is_acting(pg_whoami)) { | |
2708 | /* update last_epoch_started on acting replica to whatever the primary sent | |
2709 | * unless it's smaller (could happen if we are going peered rather than | |
2710 | * active, see doc/dev/osd_internals/last_epoch_started.rst) */ | |
2711 | if (info.last_epoch_started < activation_epoch) { | |
2712 | info.last_epoch_started = activation_epoch; | |
2713 | info.last_interval_started = info.history.same_interval_since; | |
2714 | } | |
2715 | } | |
2716 | ||
2717 | auto &missing = pg_log.get_missing(); | |
2718 | ||
2719 | min_last_complete_ondisk = eversion_t(0,0); // we don't know (yet)! | |
2720 | if (is_primary()) { | |
2721 | last_update_ondisk = info.last_update; | |
2722 | } | |
2723 | last_update_applied = info.last_update; | |
2724 | last_rollback_info_trimmed_to_applied = pg_log.get_can_rollback_to(); | |
2725 | ||
2726 | need_up_thru = false; | |
2727 | ||
2728 | // write pg info, log | |
2729 | dirty_info = true; | |
2730 | dirty_big_info = true; // maybe | |
2731 | ||
// Queue an ActivateCommitted peering event tied to transaction t
// (presumably delivered once t commits -- confirm in the listener).
2732 | pl->schedule_event_on_commit( | |
2733 | t, | |
2734 | std::make_shared<PGPeeringEvent>( | |
2735 | get_osdmap_epoch(), | |
2736 | get_osdmap_epoch(), | |
2737 | ActivateCommitted( | |
2738 | get_osdmap_epoch(), | |
2739 | activation_epoch))); | |
2740 | ||
2741 | // init complete pointer | |
2742 | if (missing.num_missing() == 0) { | |
2743 | psdout(10) << "activate - no missing, moving last_complete " << info.last_complete | |
2744 | << " -> " << info.last_update << dendl; | |
2745 | info.last_complete = info.last_update; | |
2746 | info.stats.stats.sum.num_objects_missing = 0; | |
2747 | pg_log.reset_recovery_pointers(); | |
2748 | } else { | |
2749 | psdout(10) << "activate - not complete, " << missing << dendl; | |
2750 | info.stats.stats.sum.num_objects_missing = missing.num_missing(); | |
2751 | pg_log.activate_not_complete(info); | |
2752 | } | |
2753 | ||
2754 | log_weirdness(); | |
2755 | ||
2756 | if (is_primary()) { | |
2757 | // initialize snap_trimq | |
2758 | interval_set<snapid_t> to_trim; | |
2759 | auto& removed_snaps_queue = get_osdmap()->get_removed_snaps_queue(); | |
2760 | auto p = removed_snaps_queue.find(info.pgid.pgid.pool()); | |
2761 | if (p != removed_snaps_queue.end()) { | |
2762 | dout(20) << "activate - purged_snaps " << info.purged_snaps | |
2763 | << " removed_snaps " << p->second | |
2764 | << dendl; | |
2765 | for (auto q : p->second) { | |
2766 | to_trim.insert(q.first, q.second); | |
2767 | } | |
2768 | } | |
// purged = intersection of to_trim and info.purged_snaps; that overlap is
// removed from to_trim and later becomes the new info.purged_snaps.
2769 | interval_set<snapid_t> purged; | |
2770 | purged.intersection_of(to_trim, info.purged_snaps); | |
2771 | to_trim.subtract(purged); | |
2772 | ||
2773 | if (HAVE_FEATURE(upacting_features, SERVER_OCTOPUS)) { | |
2774 | renew_lease(pl->get_mnow()); | |
2775 | // do not schedule until we are actually activated | |
2776 | } | |
2777 | ||
2778 | // adjust purged_snaps: PG may have been inactive while snaps were pruned | |
2779 | // from the removed_snaps_queue in the osdmap. update local purged_snaps | |
2780 | // reflect only those snaps that we thought were pruned and were still in | |
2781 | // the queue. | |
2782 | info.purged_snaps.swap(purged); | |
2783 | ||
2784 | // start up replicas | |
2785 | info.history.refresh_prior_readable_until_ub(pl->get_mnow(), | |
2786 | prior_readable_until_ub); | |
2787 | ||
2788 | ceph_assert(!acting_recovery_backfill.empty()); | |
f67539c2 | 2789 | for (auto i = acting_recovery_backfill.begin();
9f95a23c TL |
2790 | i != acting_recovery_backfill.end(); | |
2791 | ++i) { | |
2792 | if (*i == pg_whoami) continue; | |
2793 | pg_shard_t peer = *i; | |
2794 | ceph_assert(peer_info.count(peer)); | |
2795 | pg_info_t& pi = peer_info[peer]; | |
2796 | ||
2797 | psdout(10) << "activate peer osd." << peer << " " << pi << dendl; | |
2798 | ||
f67539c2 | 2799 | MRef<MOSDPGLog> m;
9f95a23c TL |
2800 | ceph_assert(peer_missing.count(peer)); | |
2801 | pg_missing_t& pm = peer_missing[peer]; | |
2802 | ||
2803 | bool needs_past_intervals = pi.dne(); | |
2804 | ||
a4b75251 TL |
2805 | // Save num_bytes for backfill reservation request, can't be negative | |
2806 | peer_bytes[peer] = std::max<int64_t>(0, pi.stats.stats.sum.num_bytes); | |
2807 | ||
// Three cases follow, chosen by how the peer's log relates to ours:
// (1) same last_update, (2) restart backfill from scratch, (3) log catch-up.
// m stays null only in case (1) when the peer is non-empty; that peer gets a
// plain info through ctx.send_info() instead of an MOSDPGLog.
9f95a23c TL |
2808 | if (pi.last_update == info.last_update) { | |
2809 | // empty log | |
2810 | if (!pi.last_backfill.is_max()) | |
2811 | pl->get_clog_info() << info.pgid << " continuing backfill to osd." | |
2812 | << peer | |
2813 | << " from (" << pi.log_tail << "," << pi.last_update | |
2814 | << "] " << pi.last_backfill | |
2815 | << " to " << info.last_update; | |
2816 | if (!pi.is_empty()) { | |
2817 | psdout(10) << "activate peer osd." << peer | |
2818 | << " is up to date, queueing in pending_activators" << dendl; | |
2819 | ctx.send_info( | |
2820 | peer.osd, | |
2821 | spg_t(info.pgid.pgid, peer.shard), | |
2822 | get_osdmap_epoch(), // fixme: use lower epoch? | |
2823 | get_osdmap_epoch(), | |
2824 | info, | |
2825 | get_lease()); | |
2826 | } else { | |
2827 | psdout(10) << "activate peer osd." << peer | |
2828 | << " is up to date, but sending pg_log anyway" << dendl; | |
f67539c2 | 2829 | m = make_message<MOSDPGLog>(
9f95a23c TL |
2830 | i->shard, pg_whoami.shard, | |
2831 | get_osdmap_epoch(), info, | |
2832 | last_peering_reset); | |
2833 | } | |
2834 | } else if ( | |
2835 | pg_log.get_tail() > pi.last_update || | |
2836 | pi.last_backfill == hobject_t() || | |
2837 | (backfill_targets.count(*i) && pi.last_backfill.is_max())) { | |
2838 | /* ^ This last case covers a situation where a replica is not contiguous | |
2839 | * with the auth_log, but is contiguous with this replica. Reshuffling | |
2840 | * the active set to handle this would be tricky, so instead we just go | |
2841 | * ahead and backfill it anyway. This is probably preferrable in any | |
2842 | * case since the replica in question would have to be significantly | |
2843 | * behind. | |
2844 | */ | |
2845 | // backfill | |
2846 | pl->get_clog_debug() << info.pgid << " starting backfill to osd." << peer | |
2847 | << " from (" << pi.log_tail << "," << pi.last_update | |
2848 | << "] " << pi.last_backfill | |
2849 | << " to " << info.last_update; | |
2850 | ||
2851 | pi.last_update = info.last_update; | |
2852 | pi.last_complete = info.last_update; | |
2853 | pi.set_last_backfill(hobject_t()); | |
2854 | pi.last_epoch_started = info.last_epoch_started; | |
2855 | pi.last_interval_started = info.last_interval_started; | |
2856 | pi.history = info.history; | |
2857 | pi.hit_set = info.hit_set; | |
9f95a23c TL |
2858 | pi.stats.stats.clear(); | |
2859 | pi.stats.stats.sum.num_bytes = peer_bytes[peer]; | |
2860 | ||
2861 | // initialize peer with our purged_snaps. | |
2862 | pi.purged_snaps = info.purged_snaps; | |
2863 | ||
f67539c2 | 2864 | m = make_message<MOSDPGLog>(
9f95a23c TL |
2865 | i->shard, pg_whoami.shard, | |
2866 | get_osdmap_epoch(), pi, | |
2867 | last_peering_reset /* epoch to create pg at */); | |
2868 | ||
2869 | // send some recent log, so that op dup detection works well. | |
2870 | m->log.copy_up_to(cct, pg_log.get_log(), | |
2871 | cct->_conf->osd_max_pg_log_entries); | |
2872 | m->info.log_tail = m->log.tail; | |
2873 | pi.log_tail = m->log.tail; // sigh... | |
2874 | ||
2875 | pm.clear(); | |
2876 | } else { | |
2877 | // catch up | |
2878 | ceph_assert(pg_log.get_tail() <= pi.last_update); | |
f67539c2 | 2879 | m = make_message<MOSDPGLog>(
9f95a23c TL |
2880 | i->shard, pg_whoami.shard, | |
2881 | get_osdmap_epoch(), info, | |
2882 | last_peering_reset /* epoch to create pg at */); | |
2883 | // send new stuff to append to replicas log | |
2884 | m->log.copy_after(cct, pg_log.get_log(), pi.last_update); | |
2885 | } | |
2886 | ||
2887 | // share past_intervals if we are creating the pg on the replica | |
2888 | // based on whether our info for that peer was dne() *before* | |
2889 | // updating pi.history in the backfill block above. | |
2890 | if (m && needs_past_intervals) | |
2891 | m->past_intervals = past_intervals; | |
2892 | ||
2893 | // update local version of peer's missing list! | |
2894 | if (m && pi.last_backfill != hobject_t()) { | |
f67539c2 | 2895 | for (auto p = m->log.log.begin(); p != m->log.log.end(); ++p) {
9f95a23c TL |
2896 | if (p->soid <= pi.last_backfill && | |
2897 | !p->is_error()) { | |
2898 | if (perform_deletes_during_peering() && p->is_delete()) { | |
2899 | pm.rm(p->soid, p->version); | |
2900 | } else { | |
2901 | pm.add_next_event(*p); | |
2902 | } | |
2903 | } | |
2904 | } | |
2905 | } | |
2906 | ||
2907 | if (m) { | |
2908 | dout(10) << "activate peer osd." << peer << " sending " << m->log | |
2909 | << dendl; | |
2910 | m->lease = get_lease(); | |
2911 | pl->send_cluster_message(peer.osd, m, get_osdmap_epoch()); | |
2912 | } | |
2913 | ||
2914 | // peer now has | |
2915 | pi.last_update = info.last_update; | |
2916 | ||
2917 | // update our missing | |
2918 | if (pm.num_missing() == 0) { | |
2919 | pi.last_complete = pi.last_update; | |
2920 | psdout(10) << "activate peer osd." << peer << " " << pi | |
2921 | << " uptodate" << dendl; | |
2922 | } else { | |
2923 | psdout(10) << "activate peer osd." << peer << " " << pi | |
2924 | << " missing " << pm << dendl; | |
2925 | } | |
2926 | } | |
2927 | ||
2928 | // Set up missing_loc | |
// complete_shards collects the shards with nothing missing; a peer must
// additionally have last_backfill at max to qualify.
2929 | set<pg_shard_t> complete_shards; | |
f67539c2 | 2930 | for (auto i = acting_recovery_backfill.begin();
9f95a23c TL |
2931 | i != acting_recovery_backfill.end(); | |
2932 | ++i) { | |
2933 | psdout(20) << __func__ << " setting up missing_loc from shard " << *i | |
2934 | << " " << dendl; | |
2935 | if (*i == get_primary()) { | |
2936 | missing_loc.add_active_missing(missing); | |
2937 | if (!missing.have_missing()) | |
2938 | complete_shards.insert(*i); | |
2939 | } else { | |
2940 | auto peer_missing_entry = peer_missing.find(*i); | |
2941 | ceph_assert(peer_missing_entry != peer_missing.end()); | |
2942 | missing_loc.add_active_missing(peer_missing_entry->second); | |
2943 | if (!peer_missing_entry->second.have_missing() && | |
2944 | peer_info[*i].last_backfill.is_max()) | |
2945 | complete_shards.insert(*i); | |
2946 | } | |
2947 | } | |
2948 | ||
2949 | // If necessary, create might_have_unfound to help us find our unfound objects. | |
2950 | // NOTE: It's important that we build might_have_unfound before trimming the | |
2951 | // past intervals. | |
2952 | might_have_unfound.clear(); | |
2953 | if (needs_recovery()) { | |
2954 | // If only one shard has missing, we do a trick to add all others as recovery | |
2955 | // source, this is considered safe since the PGLogs have been merged locally, | |
2956 | // and covers vast majority of the use cases, like one OSD/host is down for | |
2957 | // a while for hardware repairing | |
2958 | if (complete_shards.size() + 1 == acting_recovery_backfill.size()) { | |
2959 | missing_loc.add_batch_sources_info(complete_shards, ctx.handle); | |
2960 | } else { | |
2961 | missing_loc.add_source_info(pg_whoami, info, pg_log.get_missing(), | |
2962 | ctx.handle); | |
f67539c2 | 2963 | for (auto i = acting_recovery_backfill.begin();
9f95a23c TL |
2964 | i != acting_recovery_backfill.end(); | |
2965 | ++i) { | |
2966 | if (*i == pg_whoami) continue; | |
2967 | psdout(10) << __func__ << ": adding " << *i << " as a source" << dendl; | |
2968 | ceph_assert(peer_missing.count(*i)); | |
2969 | ceph_assert(peer_info.count(*i)); | |
2970 | missing_loc.add_source_info( | |
2971 | *i, | |
2972 | peer_info[*i], | |
2973 | peer_missing[*i], | |
2974 | ctx.handle); | |
2975 | } | |
2976 | } | |
// Peers outside acting_recovery_backfill whose missing set we hold are
// also checked as possible sources.
f67539c2 | 2977 | for (auto i = peer_missing.begin(); i != peer_missing.end(); ++i) {
9f95a23c TL |
2978 | if (is_acting_recovery_backfill(i->first)) | |
2979 | continue; | |
2980 | ceph_assert(peer_info.count(i->first)); | |
2981 | search_for_missing( | |
2982 | peer_info[i->first], | |
2983 | i->second, | |
2984 | i->first, | |
2985 | ctx); | |
2986 | } | |
2987 | ||
2988 | build_might_have_unfound(); | |
2989 | ||
2990 | // Always call now so update_calc_stats() will be accurate | |
2991 | discover_all_missing(ctx.msgs); | |
2992 | ||
2993 | } | |
2994 | ||
2995 | // num_objects_degraded if calculated should reflect this too, unless no | |
2996 | // missing and we are about to go clean. | |
2997 | if (get_osdmap()->get_pg_size(info.pgid.pgid) > actingset.size()) { | |
2998 | state_set(PG_STATE_UNDERSIZED); | |
2999 | } | |
3000 | ||
3001 | state_set(PG_STATE_ACTIVATING); | |
// Hand the trim set computed above to the listener.
3002 | pl->on_activate(std::move(to_trim)); | |
3003 | } | |
// With a writeable acting set, roll the log forward using the listener's
// log handler for transaction t.
f67539c2 | 3004 | if (acting_set_writeable()) {
9f95a23c TL |
3005 | PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)}; | |
3006 | pg_log.roll_forward(rollbacker.get()); | |
3007 | } | |
3008 | } | |
3009 | ||
3010 | void PeeringState::share_pg_info() | |
3011 | { | |
3012 | psdout(10) << "share_pg_info" << dendl; | |
3013 | ||
3014 | info.history.refresh_prior_readable_until_ub(pl->get_mnow(), | |
3015 | prior_readable_until_ub); | |
3016 | ||
3017 | // share new pg_info_t with replicas | |
3018 | ceph_assert(!acting_recovery_backfill.empty()); | |
3019 | for (auto pg_shard : acting_recovery_backfill) { | |
3020 | if (pg_shard == pg_whoami) continue; | |
3021 | if (auto peer = peer_info.find(pg_shard); peer != peer_info.end()) { | |
3022 | peer->second.last_epoch_started = info.last_epoch_started; | |
3023 | peer->second.last_interval_started = info.last_interval_started; | |
3024 | peer->second.history.merge(info.history); | |
3025 | } | |
f67539c2 | 3026 | MessageRef m; |
9f95a23c | 3027 | if (last_require_osd_release >= ceph_release_t::octopus) { |
f67539c2 | 3028 | m = make_message<MOSDPGInfo2>(spg_t{info.pgid.pgid, pg_shard.shard}, |
9f95a23c TL |
3029 | info, |
3030 | get_osdmap_epoch(), | |
3031 | get_osdmap_epoch(), | |
f67539c2 TL |
3032 | std::optional<pg_lease_t>{get_lease()}, |
3033 | std::nullopt); | |
9f95a23c | 3034 | } else { |
f67539c2 TL |
3035 | m = make_message<MOSDPGInfo>(get_osdmap_epoch(), |
3036 | MOSDPGInfo::pg_list_t{ | |
3037 | pg_notify_t{pg_shard.shard, | |
3038 | pg_whoami.shard, | |
3039 | get_osdmap_epoch(), | |
3040 | get_osdmap_epoch(), | |
3041 | info, | |
3042 | past_intervals}}); | |
9f95a23c TL |
3043 | } |
3044 | pl->send_cluster_message(pg_shard.osd, m, get_osdmap_epoch()); | |
3045 | } | |
3046 | } | |
3047 | ||
3048 | void PeeringState::merge_log( | |
f67539c2 | 3049 | ObjectStore::Transaction& t, pg_info_t &oinfo, pg_log_t&& olog, |
9f95a23c TL |
3050 | pg_shard_t from) |
3051 | { | |
3052 | PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)}; | |
3053 | pg_log.merge_log( | |
f67539c2 TL |
3054 | oinfo, std::move(olog), from, info, rollbacker.get(), |
3055 | dirty_info, dirty_big_info); | |
9f95a23c TL |
3056 | } |
3057 | ||
3058 | void PeeringState::rewind_divergent_log( | |
3059 | ObjectStore::Transaction& t, eversion_t newhead) | |
3060 | { | |
3061 | PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)}; | |
3062 | pg_log.rewind_divergent_log( | |
3063 | newhead, info, rollbacker.get(), dirty_info, dirty_big_info); | |
3064 | } | |
3065 | ||
3066 | ||
3067 | void PeeringState::proc_primary_info( | |
3068 | ObjectStore::Transaction &t, const pg_info_t &oinfo) | |
3069 | { | |
3070 | ceph_assert(!is_primary()); | |
3071 | ||
3072 | update_history(oinfo.history); | |
3073 | if (!info.stats.stats_invalid && info.stats.stats.sum.num_scrub_errors) { | |
3074 | info.stats.stats.sum.num_scrub_errors = 0; | |
3075 | info.stats.stats.sum.num_shallow_scrub_errors = 0; | |
3076 | info.stats.stats.sum.num_deep_scrub_errors = 0; | |
3077 | dirty_info = true; | |
3078 | } | |
3079 | ||
3080 | if (!(info.purged_snaps == oinfo.purged_snaps)) { | |
3081 | psdout(10) << __func__ << " updating purged_snaps to " | |
3082 | << oinfo.purged_snaps | |
3083 | << dendl; | |
3084 | info.purged_snaps = oinfo.purged_snaps; | |
3085 | dirty_info = true; | |
3086 | dirty_big_info = true; | |
3087 | } | |
3088 | } | |
3089 | ||
3090 | void PeeringState::proc_master_log( | |
3091 | ObjectStore::Transaction& t, pg_info_t &oinfo, | |
f67539c2 | 3092 | pg_log_t&& olog, pg_missing_t&& omissing, pg_shard_t from) |
9f95a23c TL |
3093 | { |
3094 | psdout(10) << "proc_master_log for osd." << from << ": " | |
3095 | << olog << " " << omissing << dendl; | |
3096 | ceph_assert(!is_peered() && is_primary()); | |
3097 | ||
3098 | // merge log into our own log to build master log. no need to | |
3099 | // make any adjustments to their missing map; we are taking their | |
3100 | // log to be authoritative (i.e., their entries are by definitely | |
3101 | // non-divergent). | |
f67539c2 | 3102 | merge_log(t, oinfo, std::move(olog), from); |
9f95a23c TL |
3103 | peer_info[from] = oinfo; |
3104 | psdout(10) << " peer osd." << from << " now " << oinfo | |
3105 | << " " << omissing << dendl; | |
3106 | might_have_unfound.insert(from); | |
3107 | ||
3108 | // See doc/dev/osd_internals/last_epoch_started | |
3109 | if (oinfo.last_epoch_started > info.last_epoch_started) { | |
3110 | info.last_epoch_started = oinfo.last_epoch_started; | |
3111 | dirty_info = true; | |
3112 | } | |
3113 | if (oinfo.last_interval_started > info.last_interval_started) { | |
3114 | info.last_interval_started = oinfo.last_interval_started; | |
3115 | dirty_info = true; | |
3116 | } | |
3117 | update_history(oinfo.history); | |
3118 | ceph_assert(cct->_conf->osd_find_best_info_ignore_history_les || | |
3119 | info.last_epoch_started >= info.history.last_epoch_started); | |
3120 | ||
f67539c2 | 3121 | peer_missing[from].claim(std::move(omissing)); |
9f95a23c TL |
3122 | } |
3123 | ||
3124 | void PeeringState::proc_replica_log( | |
3125 | pg_info_t &oinfo, | |
3126 | const pg_log_t &olog, | |
f67539c2 | 3127 | pg_missing_t&& omissing, |
9f95a23c TL |
3128 | pg_shard_t from) |
3129 | { | |
3130 | psdout(10) << "proc_replica_log for osd." << from << ": " | |
3131 | << oinfo << " " << olog << " " << omissing << dendl; | |
3132 | ||
3133 | pg_log.proc_replica_log(oinfo, olog, omissing, from); | |
3134 | ||
3135 | peer_info[from] = oinfo; | |
3136 | psdout(10) << " peer osd." << from << " now " | |
3137 | << oinfo << " " << omissing << dendl; | |
3138 | might_have_unfound.insert(from); | |
3139 | ||
f67539c2 | 3140 | for (auto i = omissing.get_items().begin(); |
9f95a23c TL |
3141 | i != omissing.get_items().end(); |
3142 | ++i) { | |
3143 | psdout(20) << " after missing " << i->first | |
3144 | << " need " << i->second.need | |
3145 | << " have " << i->second.have << dendl; | |
3146 | } | |
f67539c2 | 3147 | peer_missing[from].claim(std::move(omissing)); |
9f95a23c TL |
3148 | } |
3149 | ||
3150 | void PeeringState::fulfill_info( | |
3151 | pg_shard_t from, const pg_query_t &query, | |
3152 | pair<pg_shard_t, pg_info_t> ¬ify_info) | |
3153 | { | |
3154 | ceph_assert(from == primary); | |
3155 | ceph_assert(query.type == pg_query_t::INFO); | |
3156 | ||
3157 | // info | |
3158 | psdout(10) << "sending info" << dendl; | |
3159 | notify_info = make_pair(from, info); | |
3160 | } | |
3161 | ||
3162 | void PeeringState::fulfill_log( | |
3163 | pg_shard_t from, const pg_query_t &query, epoch_t query_epoch) | |
3164 | { | |
3165 | psdout(10) << "log request from " << from << dendl; | |
3166 | ceph_assert(from == primary); | |
3167 | ceph_assert(query.type != pg_query_t::INFO); | |
3168 | ||
f67539c2 | 3169 | auto mlog = make_message<MOSDPGLog>( |
9f95a23c TL |
3170 | from.shard, pg_whoami.shard, |
3171 | get_osdmap_epoch(), | |
3172 | info, query_epoch); | |
3173 | mlog->missing = pg_log.get_missing(); | |
3174 | ||
3175 | // primary -> other, when building master log | |
3176 | if (query.type == pg_query_t::LOG) { | |
3177 | psdout(10) << " sending info+missing+log since " << query.since | |
3178 | << dendl; | |
3179 | if (query.since != eversion_t() && query.since < pg_log.get_tail()) { | |
3180 | pl->get_clog_error() << info.pgid << " got broken pg_query_t::LOG since " | |
3181 | << query.since | |
3182 | << " when my log.tail is " << pg_log.get_tail() | |
3183 | << ", sending full log instead"; | |
3184 | mlog->log = pg_log.get_log(); // primary should not have requested this!! | |
3185 | } else | |
3186 | mlog->log.copy_after(cct, pg_log.get_log(), query.since); | |
3187 | } | |
3188 | else if (query.type == pg_query_t::FULLLOG) { | |
3189 | psdout(10) << " sending info+missing+full log" << dendl; | |
3190 | mlog->log = pg_log.get_log(); | |
3191 | } | |
3192 | ||
3193 | psdout(10) << " sending " << mlog->log << " " << mlog->missing << dendl; | |
3194 | ||
3195 | pl->send_cluster_message(from.osd, mlog, get_osdmap_epoch(), true); | |
3196 | } | |
3197 | ||
3198 | void PeeringState::fulfill_query(const MQuery& query, PeeringCtxWrapper &rctx) | |
3199 | { | |
3200 | if (query.query.type == pg_query_t::INFO) { | |
3201 | pair<pg_shard_t, pg_info_t> notify_info; | |
3202 | // note this refreshes our prior_readable_until_ub value | |
3203 | update_history(query.query.history); | |
3204 | fulfill_info(query.from, query.query, notify_info); | |
3205 | rctx.send_notify( | |
3206 | notify_info.first.osd, | |
3207 | pg_notify_t( | |
3208 | notify_info.first.shard, pg_whoami.shard, | |
3209 | query.query_epoch, | |
3210 | get_osdmap_epoch(), | |
3211 | notify_info.second, | |
3212 | past_intervals)); | |
3213 | } else { | |
3214 | update_history(query.query.history); | |
3215 | fulfill_log(query.from, query.query, query.query_epoch); | |
3216 | } | |
3217 | } | |
3218 | ||
3219 | void PeeringState::try_mark_clean() | |
3220 | { | |
3221 | if (actingset.size() == get_osdmap()->get_pg_size(info.pgid.pgid)) { | |
3222 | state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY); | |
3223 | state_set(PG_STATE_CLEAN); | |
3224 | info.history.last_epoch_clean = get_osdmap_epoch(); | |
3225 | info.history.last_interval_clean = info.history.same_interval_since; | |
3226 | past_intervals.clear(); | |
3227 | dirty_big_info = true; | |
3228 | dirty_info = true; | |
3229 | } | |
3230 | ||
3231 | if (!is_active() && is_peered()) { | |
3232 | if (is_clean()) { | |
3233 | bool target; | |
3234 | if (pool.info.is_pending_merge(info.pgid.pgid, &target)) { | |
3235 | if (target) { | |
3236 | psdout(10) << "ready to merge (target)" << dendl; | |
3237 | pl->set_ready_to_merge_target( | |
3238 | info.last_update, | |
3239 | info.history.last_epoch_started, | |
3240 | info.history.last_epoch_clean); | |
3241 | } else { | |
3242 | psdout(10) << "ready to merge (source)" << dendl; | |
3243 | pl->set_ready_to_merge_source(info.last_update); | |
3244 | } | |
3245 | } | |
3246 | } else { | |
3247 | psdout(10) << "not clean, not ready to merge" << dendl; | |
3248 | // we should have notified OSD in Active state entry point | |
3249 | } | |
3250 | } | |
3251 | ||
3252 | state_clear(PG_STATE_FORCED_RECOVERY | PG_STATE_FORCED_BACKFILL); | |
3253 | ||
3254 | share_pg_info(); | |
3255 | pl->publish_stats_to_osd(); | |
3256 | clear_recovery_state(); | |
3257 | } | |
3258 | ||
3259 | void PeeringState::split_into( | |
3260 | pg_t child_pgid, PeeringState *child, unsigned split_bits) | |
3261 | { | |
3262 | child->update_osdmap_ref(get_osdmap()); | |
3263 | child->pool = pool; | |
3264 | ||
3265 | // Log | |
3266 | pg_log.split_into(child_pgid, split_bits, &(child->pg_log)); | |
3267 | child->info.last_complete = info.last_complete; | |
3268 | ||
3269 | info.last_update = pg_log.get_head(); | |
3270 | child->info.last_update = child->pg_log.get_head(); | |
3271 | ||
3272 | child->info.last_user_version = info.last_user_version; | |
3273 | ||
3274 | info.log_tail = pg_log.get_tail(); | |
3275 | child->info.log_tail = child->pg_log.get_tail(); | |
3276 | ||
3277 | // reset last_complete, we might have modified pg_log & missing above | |
3278 | pg_log.reset_complete_to(&info); | |
3279 | child->pg_log.reset_complete_to(&child->info); | |
3280 | ||
3281 | // Info | |
3282 | child->info.history = info.history; | |
3283 | child->info.history.epoch_created = get_osdmap_epoch(); | |
3284 | child->info.purged_snaps = info.purged_snaps; | |
3285 | ||
3286 | if (info.last_backfill.is_max()) { | |
3287 | child->info.set_last_backfill(hobject_t::get_max()); | |
3288 | } else { | |
3289 | // restart backfill on parent and child to be safe. we could | |
3290 | // probably do better in the bitwise sort case, but it's more | |
3291 | // fragile (there may be special work to do on backfill completion | |
3292 | // in the future). | |
3293 | info.set_last_backfill(hobject_t()); | |
3294 | child->info.set_last_backfill(hobject_t()); | |
3295 | // restarting backfill implies that the missing set is empty, | |
3296 | // since it is only used for objects prior to last_backfill | |
3297 | pg_log.reset_backfill(); | |
3298 | child->pg_log.reset_backfill(); | |
3299 | } | |
3300 | ||
3301 | child->info.stats = info.stats; | |
3302 | child->info.stats.parent_split_bits = split_bits; | |
3303 | info.stats.stats_invalid = true; | |
3304 | child->info.stats.stats_invalid = true; | |
3305 | child->info.last_epoch_started = info.last_epoch_started; | |
3306 | child->info.last_interval_started = info.last_interval_started; | |
3307 | ||
3308 | // There can't be recovery/backfill going on now | |
3309 | int primary, up_primary; | |
3310 | vector<int> newup, newacting; | |
3311 | get_osdmap()->pg_to_up_acting_osds( | |
3312 | child->info.pgid.pgid, &newup, &up_primary, &newacting, &primary); | |
3313 | child->init_primary_up_acting( | |
3314 | newup, | |
3315 | newacting, | |
3316 | up_primary, | |
3317 | primary); | |
3318 | child->role = OSDMap::calc_pg_role(pg_whoami, child->acting); | |
3319 | ||
3320 | // this comparison includes primary rank via pg_shard_t | |
3321 | if (get_primary() != child->get_primary()) | |
3322 | child->info.history.same_primary_since = get_osdmap_epoch(); | |
3323 | ||
f67539c2 | 3324 | child->info.stats.up = newup; |
9f95a23c | 3325 | child->info.stats.up_primary = up_primary; |
f67539c2 | 3326 | child->info.stats.acting = newacting; |
9f95a23c TL |
3327 | child->info.stats.acting_primary = primary; |
3328 | child->info.stats.mapping_epoch = get_osdmap_epoch(); | |
3329 | ||
3330 | // History | |
3331 | child->past_intervals = past_intervals; | |
3332 | ||
3333 | child->on_new_interval(); | |
3334 | ||
3335 | child->send_notify = !child->is_primary(); | |
3336 | ||
3337 | child->dirty_info = true; | |
3338 | child->dirty_big_info = true; | |
3339 | dirty_info = true; | |
3340 | dirty_big_info = true; | |
3341 | } | |
3342 | ||
3343 | void PeeringState::merge_from( | |
3344 | map<spg_t,PeeringState *>& sources, | |
3345 | PeeringCtx &rctx, | |
3346 | unsigned split_bits, | |
3347 | const pg_merge_meta_t& last_pg_merge_meta) | |
3348 | { | |
3349 | bool incomplete = false; | |
3350 | if (info.last_complete != info.last_update || | |
3351 | info.is_incomplete() || | |
3352 | info.dne()) { | |
3353 | psdout(10) << __func__ << " target incomplete" << dendl; | |
3354 | incomplete = true; | |
3355 | } | |
3356 | if (last_pg_merge_meta.source_pgid != pg_t()) { | |
3357 | if (info.pgid.pgid != last_pg_merge_meta.source_pgid.get_parent()) { | |
3358 | psdout(10) << __func__ << " target doesn't match expected parent " | |
3359 | << last_pg_merge_meta.source_pgid.get_parent() | |
3360 | << " of source_pgid " << last_pg_merge_meta.source_pgid | |
3361 | << dendl; | |
3362 | incomplete = true; | |
3363 | } | |
3364 | if (info.last_update != last_pg_merge_meta.target_version) { | |
3365 | psdout(10) << __func__ << " target version doesn't match expected " | |
3366 | << last_pg_merge_meta.target_version << dendl; | |
3367 | incomplete = true; | |
3368 | } | |
3369 | } | |
3370 | ||
3371 | PGLog::LogEntryHandlerRef handler{pl->get_log_handler(rctx.transaction)}; | |
3372 | pg_log.roll_forward(handler.get()); | |
3373 | ||
3374 | info.last_complete = info.last_update; // to fake out trim() | |
3375 | pg_log.reset_recovery_pointers(); | |
3376 | pg_log.trim(info.last_update, info); | |
3377 | ||
3378 | vector<PGLog*> log_from; | |
3379 | for (auto& i : sources) { | |
3380 | auto& source = i.second; | |
3381 | if (!source) { | |
3382 | psdout(10) << __func__ << " source " << i.first << " missing" << dendl; | |
3383 | incomplete = true; | |
3384 | continue; | |
3385 | } | |
3386 | if (source->info.last_complete != source->info.last_update || | |
3387 | source->info.is_incomplete() || | |
3388 | source->info.dne()) { | |
3389 | psdout(10) << __func__ << " source " << source->pg_whoami | |
3390 | << " incomplete" | |
3391 | << dendl; | |
3392 | incomplete = true; | |
3393 | } | |
3394 | if (last_pg_merge_meta.source_pgid != pg_t()) { | |
3395 | if (source->info.pgid.pgid != last_pg_merge_meta.source_pgid) { | |
3396 | dout(10) << __func__ << " source " << source->info.pgid.pgid | |
3397 | << " doesn't match expected source pgid " | |
3398 | << last_pg_merge_meta.source_pgid << dendl; | |
3399 | incomplete = true; | |
3400 | } | |
3401 | if (source->info.last_update != last_pg_merge_meta.source_version) { | |
3402 | dout(10) << __func__ << " source version doesn't match expected " | |
3403 | << last_pg_merge_meta.target_version << dendl; | |
3404 | incomplete = true; | |
3405 | } | |
3406 | } | |
3407 | ||
3408 | // prepare log | |
3409 | PGLog::LogEntryHandlerRef handler{ | |
3410 | source->pl->get_log_handler(rctx.transaction)}; | |
3411 | source->pg_log.roll_forward(handler.get()); | |
3412 | source->info.last_complete = source->info.last_update; // to fake out trim() | |
3413 | source->pg_log.reset_recovery_pointers(); | |
3414 | source->pg_log.trim(source->info.last_update, source->info); | |
3415 | log_from.push_back(&source->pg_log); | |
3416 | ||
3417 | // combine stats | |
3418 | info.stats.add(source->info.stats); | |
3419 | ||
3420 | // pull up last_update | |
3421 | info.last_update = std::max(info.last_update, source->info.last_update); | |
3422 | ||
3423 | // adopt source's PastIntervals if target has none. we can do this since | |
3424 | // pgp_num has been reduced prior to the merge, so the OSD mappings for | |
3425 | // the PGs are identical. | |
3426 | if (past_intervals.empty() && !source->past_intervals.empty()) { | |
3427 | psdout(10) << __func__ << " taking source's past_intervals" << dendl; | |
3428 | past_intervals = source->past_intervals; | |
3429 | } | |
3430 | } | |
3431 | ||
3432 | info.last_complete = info.last_update; | |
3433 | info.log_tail = info.last_update; | |
3434 | if (incomplete) { | |
3435 | info.last_backfill = hobject_t(); | |
3436 | } | |
3437 | ||
3438 | // merge logs | |
3439 | pg_log.merge_from(log_from, info.last_update); | |
3440 | ||
3441 | // make sure we have a meaningful last_epoch_started/clean (if we were a | |
3442 | // placeholder) | |
3443 | if (info.history.epoch_created == 0) { | |
3444 | // start with (a) source's history, since these PGs *should* have been | |
3445 | // remapped in concert with each other... | |
3446 | info.history = sources.begin()->second->info.history; | |
3447 | ||
3448 | // we use the last_epoch_{started,clean} we got from | |
3449 | // the caller, which are the epochs that were reported by the PGs were | |
3450 | // found to be ready for merge. | |
3451 | info.history.last_epoch_clean = last_pg_merge_meta.last_epoch_clean; | |
3452 | info.history.last_epoch_started = last_pg_merge_meta.last_epoch_started; | |
3453 | info.last_epoch_started = last_pg_merge_meta.last_epoch_started; | |
3454 | psdout(10) << __func__ | |
3455 | << " set les/c to " << last_pg_merge_meta.last_epoch_started << "/" | |
3456 | << last_pg_merge_meta.last_epoch_clean | |
3457 | << " from pool last_dec_*, source pg history was " | |
3458 | << sources.begin()->second->info.history | |
3459 | << dendl; | |
3460 | ||
f6b5b4d7 TL |
3461 | // above we have pulled down source's history and we need to check |
3462 | // history.epoch_created again to confirm that source is not a placeholder | |
3463 | // too. (peering requires a sane history.same_interval_since value for any | |
3464 | // non-newly created pg and below here we know we are basically iterating | |
3465 | // back a series of past maps to fake a merge process, hence we need to | |
3466 | // fix history.same_interval_since first so that start_peering_interval() | |
3467 | // will not complain) | |
3468 | if (info.history.epoch_created == 0) { | |
3469 | dout(10) << __func__ << " both merge target and source are placeholders," | |
3470 | << " set sis to lec " << info.history.last_epoch_clean | |
3471 | << dendl; | |
3472 | info.history.same_interval_since = info.history.last_epoch_clean; | |
3473 | } | |
3474 | ||
9f95a23c TL |
3475 | // if the past_intervals start is later than last_epoch_clean, it |
3476 | // implies the source repeered again but the target didn't, or | |
3477 | // that the source became clean in a later epoch than the target. | |
3478 | // avoid the discrepancy but adjusting the interval start | |
3479 | // backwards to match so that check_past_interval_bounds() will | |
3480 | // not complain. | |
3481 | auto pib = past_intervals.get_bounds(); | |
3482 | if (info.history.last_epoch_clean < pib.first) { | |
3483 | psdout(10) << __func__ << " last_epoch_clean " | |
3484 | << info.history.last_epoch_clean << " < past_interval start " | |
3485 | << pib.first << ", adjusting start backwards" << dendl; | |
3486 | past_intervals.adjust_start_backwards(info.history.last_epoch_clean); | |
3487 | } | |
3488 | ||
3489 | // Similarly, if the same_interval_since value is later than | |
3490 | // last_epoch_clean, the next interval change will result in a | |
3491 | // past_interval start that is later than last_epoch_clean. This | |
3492 | // can happen if we use the pg_history values from the merge | |
3493 | // source. Adjust the same_interval_since value backwards if that | |
3494 | // happens. (We trust the les and lec values more because they came from | |
3495 | // the real target, whereas the history value we stole from the source.) | |
3496 | if (info.history.last_epoch_started < info.history.same_interval_since) { | |
3497 | psdout(10) << __func__ << " last_epoch_started " | |
3498 | << info.history.last_epoch_started << " < same_interval_since " | |
3499 | << info.history.same_interval_since | |
3500 | << ", adjusting pg_history backwards" << dendl; | |
3501 | info.history.same_interval_since = info.history.last_epoch_clean; | |
3502 | // make sure same_{up,primary}_since are <= same_interval_since | |
3503 | info.history.same_up_since = std::min( | |
3504 | info.history.same_up_since, info.history.same_interval_since); | |
3505 | info.history.same_primary_since = std::min( | |
3506 | info.history.same_primary_since, info.history.same_interval_since); | |
3507 | } | |
3508 | } | |
3509 | ||
3510 | dirty_info = true; | |
3511 | dirty_big_info = true; | |
3512 | } | |
3513 | ||
3514 | void PeeringState::start_split_stats( | |
3515 | const set<spg_t>& childpgs, vector<object_stat_sum_t> *out) | |
3516 | { | |
3517 | out->resize(childpgs.size() + 1); | |
3518 | info.stats.stats.sum.split(*out); | |
3519 | } | |
3520 | ||
3521 | void PeeringState::finish_split_stats( | |
3522 | const object_stat_sum_t& stats, ObjectStore::Transaction &t) | |
3523 | { | |
3524 | info.stats.stats.sum = stats; | |
3525 | write_if_dirty(t); | |
3526 | } | |
3527 | ||
3528 | void PeeringState::update_blocked_by() | |
3529 | { | |
3530 | // set a max on the number of blocking peers we report. if we go | |
3531 | // over, report a random subset. keep the result sorted. | |
3532 | unsigned keep = std::min<unsigned>( | |
3533 | blocked_by.size(), cct->_conf->osd_max_pg_blocked_by); | |
3534 | unsigned skip = blocked_by.size() - keep; | |
3535 | info.stats.blocked_by.clear(); | |
3536 | info.stats.blocked_by.resize(keep); | |
3537 | unsigned pos = 0; | |
f67539c2 | 3538 | for (auto p = blocked_by.begin(); p != blocked_by.end() && keep > 0; ++p) { |
9f95a23c TL |
3539 | if (skip > 0 && (rand() % (skip + keep) < skip)) { |
3540 | --skip; | |
3541 | } else { | |
3542 | info.stats.blocked_by[pos++] = *p; | |
3543 | --keep; | |
3544 | } | |
3545 | } | |
3546 | } | |
3547 | ||
3548 | static bool find_shard(const set<pg_shard_t> & pgs, shard_id_t shard) | |
3549 | { | |
3550 | for (auto&p : pgs) | |
3551 | if (p.shard == shard) | |
3552 | return true; | |
3553 | return false; | |
3554 | } | |
3555 | ||
3556 | static pg_shard_t get_another_shard(const set<pg_shard_t> & pgs, pg_shard_t skip, shard_id_t shard) | |
3557 | { | |
3558 | for (auto&p : pgs) { | |
3559 | if (p == skip) | |
3560 | continue; | |
3561 | if (p.shard == shard) | |
3562 | return p; | |
3563 | } | |
3564 | return pg_shard_t(); | |
3565 | } | |
3566 | ||
3567 | void PeeringState::update_calc_stats() | |
3568 | { | |
3569 | info.stats.version = info.last_update; | |
3570 | info.stats.created = info.history.epoch_created; | |
3571 | info.stats.last_scrub = info.history.last_scrub; | |
3572 | info.stats.last_scrub_stamp = info.history.last_scrub_stamp; | |
3573 | info.stats.last_deep_scrub = info.history.last_deep_scrub; | |
3574 | info.stats.last_deep_scrub_stamp = info.history.last_deep_scrub_stamp; | |
3575 | info.stats.last_clean_scrub_stamp = info.history.last_clean_scrub_stamp; | |
3576 | info.stats.last_epoch_clean = info.history.last_epoch_clean; | |
3577 | ||
3578 | info.stats.log_size = pg_log.get_head().version - pg_log.get_tail().version; | |
3579 | info.stats.ondisk_log_size = info.stats.log_size; | |
3580 | info.stats.log_start = pg_log.get_tail(); | |
3581 | info.stats.ondisk_log_start = pg_log.get_tail(); | |
3582 | info.stats.snaptrimq_len = pl->get_snap_trimq_size(); | |
3583 | ||
3584 | unsigned num_shards = get_osdmap()->get_pg_size(info.pgid.pgid); | |
3585 | ||
3586 | // In rare case that upset is too large (usually transient), use as target | |
3587 | // for calculations below. | |
3588 | unsigned target = std::max(num_shards, (unsigned)upset.size()); | |
3589 | // For undersized actingset may be larger with OSDs out | |
3590 | unsigned nrep = std::max(actingset.size(), upset.size()); | |
3591 | // calc num_object_copies | |
3592 | info.stats.stats.calc_copies(std::max(target, nrep)); | |
3593 | info.stats.stats.sum.num_objects_degraded = 0; | |
3594 | info.stats.stats.sum.num_objects_unfound = 0; | |
3595 | info.stats.stats.sum.num_objects_misplaced = 0; | |
3596 | info.stats.avail_no_missing.clear(); | |
3597 | info.stats.object_location_counts.clear(); | |
3598 | ||
3599 | // We should never hit this condition, but if end up hitting it, | |
3600 | // make sure to update num_objects and set PG_STATE_INCONSISTENT. | |
3601 | if (info.stats.stats.sum.num_objects < 0) { | |
3602 | psdout(0) << __func__ << " negative num_objects = " | |
3603 | << info.stats.stats.sum.num_objects << " setting it to 0 " | |
3604 | << dendl; | |
3605 | info.stats.stats.sum.num_objects = 0; | |
3606 | state_set(PG_STATE_INCONSISTENT); | |
3607 | } | |
3608 | ||
3609 | if ((is_remapped() || is_undersized() || !is_clean()) && | |
3610 | (is_peered()|| is_activating())) { | |
3611 | psdout(20) << __func__ << " actingset " << actingset << " upset " | |
3612 | << upset << " acting_recovery_backfill " << acting_recovery_backfill << dendl; | |
3613 | ||
3614 | ceph_assert(!acting_recovery_backfill.empty()); | |
3615 | ||
3616 | bool estimate = false; | |
3617 | ||
3618 | // NOTE: we only generate degraded, misplaced and unfound | |
3619 | // values for the summation, not individual stat categories. | |
3620 | int64_t num_objects = info.stats.stats.sum.num_objects; | |
3621 | ||
3622 | // Objects missing from up nodes, sorted by # objects. | |
3623 | boost::container::flat_set<pair<int64_t,pg_shard_t>> missing_target_objects; | |
3624 | // Objects missing from nodes not in up, sort by # objects | |
3625 | boost::container::flat_set<pair<int64_t,pg_shard_t>> acting_source_objects; | |
3626 | ||
3627 | // Fill missing_target_objects/acting_source_objects | |
3628 | ||
3629 | { | |
3630 | int64_t missing; | |
3631 | ||
3632 | // Primary first | |
3633 | missing = pg_log.get_missing().num_missing(); | |
3634 | ceph_assert(acting_recovery_backfill.count(pg_whoami)); | |
3635 | if (upset.count(pg_whoami)) { | |
3636 | missing_target_objects.emplace(missing, pg_whoami); | |
3637 | } else { | |
3638 | acting_source_objects.emplace(missing, pg_whoami); | |
3639 | } | |
3640 | info.stats.stats.sum.num_objects_missing_on_primary = missing; | |
3641 | if (missing == 0) | |
3642 | info.stats.avail_no_missing.push_back(pg_whoami); | |
3643 | psdout(20) << __func__ << " shard " << pg_whoami | |
3644 | << " primary objects " << num_objects | |
3645 | << " missing " << missing | |
3646 | << dendl; | |
3647 | } | |
3648 | ||
3649 | // All other peers | |
3650 | for (auto& peer : peer_info) { | |
3651 | // Primary should not be in the peer_info, skip if it is. | |
3652 | if (peer.first == pg_whoami) continue; | |
3653 | int64_t missing = 0; | |
f6b5b4d7 TL |
3654 | int64_t peer_num_objects = |
3655 | std::max((int64_t)0, peer.second.stats.stats.sum.num_objects); | |
9f95a23c TL |
3656 | // Backfill targets always track num_objects accurately |
3657 | // all other peers track missing accurately. | |
3658 | if (is_backfill_target(peer.first)) { | |
3659 | missing = std::max((int64_t)0, num_objects - peer_num_objects); | |
3660 | } else { | |
3661 | if (peer_missing.count(peer.first)) { | |
3662 | missing = peer_missing[peer.first].num_missing(); | |
3663 | } else { | |
3664 | psdout(20) << __func__ << " no peer_missing found for " | |
3665 | << peer.first << dendl; | |
3666 | if (is_recovering()) { | |
3667 | estimate = true; | |
3668 | } | |
3669 | missing = std::max((int64_t)0, num_objects - peer_num_objects); | |
3670 | } | |
3671 | } | |
3672 | if (upset.count(peer.first)) { | |
3673 | missing_target_objects.emplace(missing, peer.first); | |
3674 | } else if (actingset.count(peer.first)) { | |
3675 | acting_source_objects.emplace(missing, peer.first); | |
3676 | } | |
3677 | peer.second.stats.stats.sum.num_objects_missing = missing; | |
3678 | if (missing == 0) | |
3679 | info.stats.avail_no_missing.push_back(peer.first); | |
3680 | psdout(20) << __func__ << " shard " << peer.first | |
3681 | << " objects " << peer_num_objects | |
3682 | << " missing " << missing | |
3683 | << dendl; | |
3684 | } | |
3685 | ||
3686 | // Compute object_location_counts | |
3687 | for (auto& ml: missing_loc.get_missing_locs()) { | |
3688 | info.stats.object_location_counts[ml.second]++; | |
3689 | psdout(30) << __func__ << " " << ml.first << " object_location_counts[" | |
3690 | << ml.second << "]=" << info.stats.object_location_counts[ml.second] | |
3691 | << dendl; | |
3692 | } | |
3693 | int64_t not_missing = num_objects - missing_loc.get_missing_locs().size(); | |
3694 | if (not_missing) { | |
3695 | // During recovery we know upset == actingset and is being populated | |
3696 | // During backfill we know that all non-missing objects are in the actingset | |
3697 | info.stats.object_location_counts[actingset] = not_missing; | |
3698 | } | |
3699 | psdout(30) << __func__ << " object_location_counts[" | |
3700 | << upset << "]=" << info.stats.object_location_counts[upset] | |
3701 | << dendl; | |
3702 | psdout(20) << __func__ << " object_location_counts " | |
3703 | << info.stats.object_location_counts << dendl; | |
3704 | ||
3705 | // A misplaced object is not stored on the correct OSD | |
3706 | int64_t misplaced = 0; | |
3707 | // a degraded objects has fewer replicas or EC shards than the pool specifies. | |
3708 | int64_t degraded = 0; | |
3709 | ||
3710 | if (is_recovering()) { | |
3711 | for (auto& sml: missing_loc.get_missing_by_count()) { | |
3712 | for (auto& ml: sml.second) { | |
3713 | int missing_shards; | |
3714 | if (sml.first == shard_id_t::NO_SHARD) { | |
3715 | psdout(20) << __func__ << " ml " << ml.second | |
3716 | << " upset size " << upset.size() | |
3717 | << " up " << ml.first.up << dendl; | |
3718 | missing_shards = (int)upset.size() - ml.first.up; | |
3719 | } else { | |
3720 | // Handle shards not even in upset below | |
3721 | if (!find_shard(upset, sml.first)) | |
3722 | continue; | |
3723 | missing_shards = std::max(0, 1 - ml.first.up); | |
3724 | psdout(20) << __func__ | |
3725 | << " shard " << sml.first | |
3726 | << " ml " << ml.second | |
3727 | << " missing shards " << missing_shards << dendl; | |
3728 | } | |
3729 | int odegraded = ml.second * missing_shards; | |
3730 | // Copies on other osds but limited to the possible degraded | |
3731 | int more_osds = std::min(missing_shards, ml.first.other); | |
3732 | int omisplaced = ml.second * more_osds; | |
3733 | ceph_assert(omisplaced <= odegraded); | |
3734 | odegraded -= omisplaced; | |
3735 | ||
3736 | misplaced += omisplaced; | |
3737 | degraded += odegraded; | |
3738 | } | |
3739 | } | |
3740 | ||
3741 | psdout(20) << __func__ << " missing based degraded " | |
3742 | << degraded << dendl; | |
3743 | psdout(20) << __func__ << " missing based misplaced " | |
3744 | << misplaced << dendl; | |
3745 | ||
3746 | // Handle undersized case | |
3747 | if (pool.info.is_replicated()) { | |
3748 | // Add degraded for missing targets (num_objects missing) | |
3749 | ceph_assert(target >= upset.size()); | |
3750 | unsigned needed = target - upset.size(); | |
3751 | degraded += num_objects * needed; | |
3752 | } else { | |
3753 | for (unsigned i = 0 ; i < num_shards; ++i) { | |
3754 | shard_id_t shard(i); | |
3755 | ||
3756 | if (!find_shard(upset, shard)) { | |
3757 | pg_shard_t pgs = get_another_shard(actingset, pg_shard_t(), shard); | |
3758 | ||
3759 | if (pgs != pg_shard_t()) { | |
3760 | int64_t missing; | |
3761 | ||
3762 | if (pgs == pg_whoami) | |
3763 | missing = info.stats.stats.sum.num_objects_missing_on_primary; | |
3764 | else | |
3765 | missing = peer_info[pgs].stats.stats.sum.num_objects_missing; | |
3766 | ||
3767 | degraded += missing; | |
3768 | misplaced += std::max((int64_t)0, num_objects - missing); | |
3769 | } else { | |
3770 | // No shard anywhere | |
3771 | degraded += num_objects; | |
3772 | } | |
3773 | } | |
3774 | } | |
3775 | } | |
3776 | goto out; | |
3777 | } | |
3778 | ||
3779 | // Handle undersized case | |
3780 | if (pool.info.is_replicated()) { | |
3781 | // Add to missing_target_objects | |
3782 | ceph_assert(target >= missing_target_objects.size()); | |
3783 | unsigned needed = target - missing_target_objects.size(); | |
3784 | if (needed) | |
3785 | missing_target_objects.emplace(num_objects * needed, pg_shard_t(pg_shard_t::NO_OSD)); | |
3786 | } else { | |
3787 | for (unsigned i = 0 ; i < num_shards; ++i) { | |
3788 | shard_id_t shard(i); | |
3789 | bool found = false; | |
3790 | for (const auto& t : missing_target_objects) { | |
3791 | if (std::get<1>(t).shard == shard) { | |
3792 | found = true; | |
3793 | break; | |
3794 | } | |
3795 | } | |
3796 | if (!found) | |
3797 | missing_target_objects.emplace(num_objects, pg_shard_t(pg_shard_t::NO_OSD,shard)); | |
3798 | } | |
3799 | } | |
3800 | ||
3801 | for (const auto& item : missing_target_objects) | |
3802 | psdout(20) << __func__ << " missing shard " << std::get<1>(item) | |
3803 | << " missing= " << std::get<0>(item) << dendl; | |
3804 | for (const auto& item : acting_source_objects) | |
3805 | psdout(20) << __func__ << " acting shard " << std::get<1>(item) | |
3806 | << " missing= " << std::get<0>(item) << dendl; | |
3807 | ||
3808 | // Handle all objects not in missing for remapped | |
3809 | // or backfill | |
3810 | for (auto m = missing_target_objects.rbegin(); | |
3811 | m != missing_target_objects.rend(); ++m) { | |
3812 | ||
3813 | int64_t extra_missing = -1; | |
3814 | ||
3815 | if (pool.info.is_replicated()) { | |
3816 | if (!acting_source_objects.empty()) { | |
3817 | auto extra_copy = acting_source_objects.begin(); | |
3818 | extra_missing = std::get<0>(*extra_copy); | |
3819 | acting_source_objects.erase(extra_copy); | |
3820 | } | |
3821 | } else { // Erasure coded | |
3822 | // Use corresponding shard | |
3823 | for (const auto& a : acting_source_objects) { | |
3824 | if (std::get<1>(a).shard == std::get<1>(*m).shard) { | |
3825 | extra_missing = std::get<0>(a); | |
3826 | acting_source_objects.erase(a); | |
3827 | break; | |
3828 | } | |
3829 | } | |
3830 | } | |
3831 | ||
3832 | if (extra_missing >= 0 && std::get<0>(*m) >= extra_missing) { | |
3833 | // We don't know which of the objects on the target | |
3834 | // are part of extra_missing so assume are all degraded. | |
3835 | misplaced += std::get<0>(*m) - extra_missing; | |
3836 | degraded += extra_missing; | |
3837 | } else { | |
3838 | // 1. extra_missing == -1, more targets than sources so degraded | |
3839 | // 2. extra_missing > std::get<0>(m), so that we know that some extra_missing | |
3840 | // previously degraded are now present on the target. | |
3841 | degraded += std::get<0>(*m); | |
3842 | } | |
3843 | } | |
3844 | // If there are still acting that haven't been accounted for | |
3845 | // then they are misplaced | |
3846 | for (const auto& a : acting_source_objects) { | |
3847 | int64_t extra_misplaced = std::max((int64_t)0, num_objects - std::get<0>(a)); | |
3848 | psdout(20) << __func__ << " extra acting misplaced " << extra_misplaced | |
3849 | << dendl; | |
3850 | misplaced += extra_misplaced; | |
3851 | } | |
3852 | out: | |
3853 | // NOTE: Tests use these messages to verify this code | |
3854 | psdout(20) << __func__ << " degraded " << degraded | |
3855 | << (estimate ? " (est)": "") << dendl; | |
3856 | psdout(20) << __func__ << " misplaced " << misplaced | |
3857 | << (estimate ? " (est)": "")<< dendl; | |
3858 | ||
3859 | info.stats.stats.sum.num_objects_degraded = degraded; | |
3860 | info.stats.stats.sum.num_objects_unfound = get_num_unfound(); | |
3861 | info.stats.stats.sum.num_objects_misplaced = misplaced; | |
3862 | } | |
3863 | } | |
3864 | ||
3865 | std::optional<pg_stat_t> PeeringState::prepare_stats_for_publish( | |
3866 | bool pg_stats_publish_valid, | |
3867 | const pg_stat_t &pg_stats_publish, | |
3868 | const object_stat_collection_t &unstable_stats) | |
3869 | { | |
3870 | if (info.stats.stats.sum.num_scrub_errors) { | |
3871 | state_set(PG_STATE_INCONSISTENT); | |
3872 | } else { | |
3873 | state_clear(PG_STATE_INCONSISTENT); | |
3874 | state_clear(PG_STATE_FAILED_REPAIR); | |
3875 | } | |
3876 | ||
3877 | utime_t now = ceph_clock_now(); | |
3878 | if (info.stats.state != state) { | |
3879 | info.stats.last_change = now; | |
3880 | // Optimistic estimation, if we just find out an inactive PG, | |
3881 | // assumt it is active till now. | |
3882 | if (!(state & PG_STATE_ACTIVE) && | |
3883 | (info.stats.state & PG_STATE_ACTIVE)) | |
3884 | info.stats.last_active = now; | |
3885 | ||
3886 | if ((state & PG_STATE_ACTIVE) && | |
3887 | !(info.stats.state & PG_STATE_ACTIVE)) | |
3888 | info.stats.last_became_active = now; | |
3889 | if ((state & (PG_STATE_ACTIVE|PG_STATE_PEERED)) && | |
3890 | !(info.stats.state & (PG_STATE_ACTIVE|PG_STATE_PEERED))) | |
3891 | info.stats.last_became_peered = now; | |
3892 | info.stats.state = state; | |
3893 | } | |
3894 | ||
3895 | update_calc_stats(); | |
3896 | if (info.stats.stats.sum.num_objects_degraded) { | |
3897 | state_set(PG_STATE_DEGRADED); | |
3898 | } else { | |
3899 | state_clear(PG_STATE_DEGRADED); | |
3900 | } | |
3901 | update_blocked_by(); | |
3902 | ||
3903 | pg_stat_t pre_publish = info.stats; | |
3904 | pre_publish.stats.add(unstable_stats); | |
3905 | utime_t cutoff = now; | |
3906 | cutoff -= cct->_conf->osd_pg_stat_report_interval_max; | |
3907 | ||
3908 | // share (some of) our purged_snaps via the pg_stats. limit # of intervals | |
3909 | // because we don't want to make the pg_stat_t structures too expensive. | |
3910 | unsigned max = cct->_conf->osd_max_snap_prune_intervals_per_epoch; | |
3911 | unsigned num = 0; | |
3912 | auto i = info.purged_snaps.begin(); | |
3913 | while (num < max && i != info.purged_snaps.end()) { | |
3914 | pre_publish.purged_snaps.insert(i.get_start(), i.get_len()); | |
3915 | ++num; | |
3916 | ++i; | |
3917 | } | |
3918 | psdout(20) << __func__ << " reporting purged_snaps " | |
3919 | << pre_publish.purged_snaps << dendl; | |
3920 | ||
3921 | if (pg_stats_publish_valid && pre_publish == pg_stats_publish && | |
3922 | info.stats.last_fresh > cutoff) { | |
3923 | psdout(15) << "publish_stats_to_osd " << pg_stats_publish.reported_epoch | |
3924 | << ": no change since " << info.stats.last_fresh << dendl; | |
3925 | return std::nullopt; | |
3926 | } else { | |
3927 | // update our stat summary and timestamps | |
3928 | info.stats.reported_epoch = get_osdmap_epoch(); | |
3929 | ++info.stats.reported_seq; | |
3930 | ||
3931 | info.stats.last_fresh = now; | |
3932 | ||
3933 | if (info.stats.state & PG_STATE_CLEAN) | |
3934 | info.stats.last_clean = now; | |
3935 | if (info.stats.state & PG_STATE_ACTIVE) | |
3936 | info.stats.last_active = now; | |
3937 | if (info.stats.state & (PG_STATE_ACTIVE|PG_STATE_PEERED)) | |
3938 | info.stats.last_peered = now; | |
3939 | info.stats.last_unstale = now; | |
3940 | if ((info.stats.state & PG_STATE_DEGRADED) == 0) | |
3941 | info.stats.last_undegraded = now; | |
3942 | if ((info.stats.state & PG_STATE_UNDERSIZED) == 0) | |
3943 | info.stats.last_fullsized = now; | |
3944 | ||
3945 | psdout(15) << "publish_stats_to_osd " << pg_stats_publish.reported_epoch | |
3946 | << ":" << pg_stats_publish.reported_seq << dendl; | |
3947 | return std::make_optional(std::move(pre_publish)); | |
3948 | } | |
3949 | } | |
3950 | ||
3951 | void PeeringState::init( | |
3952 | int role, | |
3953 | const vector<int>& newup, int new_up_primary, | |
3954 | const vector<int>& newacting, int new_acting_primary, | |
3955 | const pg_history_t& history, | |
3956 | const PastIntervals& pi, | |
3957 | bool backfill, | |
3958 | ObjectStore::Transaction &t) | |
3959 | { | |
3960 | psdout(10) << "init role " << role << " up " | |
3961 | << newup << " acting " << newacting | |
3962 | << " history " << history | |
3963 | << " past_intervals " << pi | |
3964 | << dendl; | |
3965 | ||
3966 | set_role(role); | |
3967 | init_primary_up_acting( | |
3968 | newup, | |
3969 | newacting, | |
3970 | new_up_primary, | |
3971 | new_acting_primary); | |
3972 | ||
3973 | info.history = history; | |
3974 | past_intervals = pi; | |
3975 | ||
3976 | info.stats.up = up; | |
3977 | info.stats.up_primary = new_up_primary; | |
3978 | info.stats.acting = acting; | |
3979 | info.stats.acting_primary = new_acting_primary; | |
3980 | info.stats.mapping_epoch = info.history.same_interval_since; | |
3981 | ||
3982 | if (!perform_deletes_during_peering()) { | |
3983 | pg_log.set_missing_may_contain_deletes(); | |
3984 | } | |
3985 | ||
3986 | if (backfill) { | |
3987 | psdout(10) << __func__ << ": Setting backfill" << dendl; | |
3988 | info.set_last_backfill(hobject_t()); | |
3989 | info.last_complete = info.last_update; | |
3990 | pg_log.mark_log_for_rewrite(); | |
3991 | } | |
3992 | ||
3993 | on_new_interval(); | |
3994 | ||
3995 | dirty_info = true; | |
3996 | dirty_big_info = true; | |
3997 | write_if_dirty(t); | |
3998 | } | |
3999 | ||
4000 | void PeeringState::dump_peering_state(Formatter *f) | |
4001 | { | |
4002 | f->dump_string("state", get_pg_state_string()); | |
4003 | f->dump_unsigned("epoch", get_osdmap_epoch()); | |
4004 | f->open_array_section("up"); | |
f67539c2 | 4005 | for (auto p = up.begin(); p != up.end(); ++p) |
9f95a23c TL |
4006 | f->dump_unsigned("osd", *p); |
4007 | f->close_section(); | |
4008 | f->open_array_section("acting"); | |
f67539c2 | 4009 | for (auto p = acting.begin(); p != acting.end(); ++p) |
9f95a23c TL |
4010 | f->dump_unsigned("osd", *p); |
4011 | f->close_section(); | |
4012 | if (!backfill_targets.empty()) { | |
4013 | f->open_array_section("backfill_targets"); | |
f67539c2 | 4014 | for (auto p = backfill_targets.begin(); p != backfill_targets.end(); ++p) |
9f95a23c TL |
4015 | f->dump_stream("shard") << *p; |
4016 | f->close_section(); | |
4017 | } | |
4018 | if (!async_recovery_targets.empty()) { | |
4019 | f->open_array_section("async_recovery_targets"); | |
f67539c2 | 4020 | for (auto p = async_recovery_targets.begin(); |
9f95a23c TL |
4021 | p != async_recovery_targets.end(); |
4022 | ++p) | |
4023 | f->dump_stream("shard") << *p; | |
4024 | f->close_section(); | |
4025 | } | |
4026 | if (!acting_recovery_backfill.empty()) { | |
4027 | f->open_array_section("acting_recovery_backfill"); | |
f67539c2 | 4028 | for (auto p = acting_recovery_backfill.begin(); |
9f95a23c TL |
4029 | p != acting_recovery_backfill.end(); |
4030 | ++p) | |
4031 | f->dump_stream("shard") << *p; | |
4032 | f->close_section(); | |
4033 | } | |
4034 | f->open_object_section("info"); | |
4035 | update_calc_stats(); | |
4036 | info.dump(f); | |
4037 | f->close_section(); | |
4038 | ||
4039 | f->open_array_section("peer_info"); | |
f67539c2 | 4040 | for (auto p = peer_info.begin(); p != peer_info.end(); ++p) { |
9f95a23c TL |
4041 | f->open_object_section("info"); |
4042 | f->dump_stream("peer") << p->first; | |
4043 | p->second.dump(f); | |
4044 | f->close_section(); | |
4045 | } | |
f67539c2 | 4046 | f->close_section(); |
9f95a23c TL |
4047 | } |
4048 | ||
4049 | void PeeringState::update_stats( | |
4050 | std::function<bool(pg_history_t &, pg_stat_t &)> f, | |
4051 | ObjectStore::Transaction *t) { | |
4052 | if (f(info.history, info.stats)) { | |
4053 | pl->publish_stats_to_osd(); | |
4054 | } | |
4055 | pl->on_info_history_change(); | |
4056 | ||
4057 | if (t) { | |
4058 | dirty_info = true; | |
4059 | write_if_dirty(*t); | |
4060 | } | |
4061 | } | |
4062 | ||
4063 | bool PeeringState::append_log_entries_update_missing( | |
4064 | const mempool::osd_pglog::list<pg_log_entry_t> &entries, | |
4065 | ObjectStore::Transaction &t, std::optional<eversion_t> trim_to, | |
4066 | std::optional<eversion_t> roll_forward_to) | |
4067 | { | |
4068 | ceph_assert(!entries.empty()); | |
4069 | ceph_assert(entries.begin()->version > info.last_update); | |
4070 | ||
4071 | PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)}; | |
4072 | bool invalidate_stats = | |
4073 | pg_log.append_new_log_entries( | |
4074 | info.last_backfill, | |
4075 | entries, | |
4076 | rollbacker.get()); | |
4077 | ||
4078 | if (roll_forward_to && entries.rbegin()->soid > info.last_backfill) { | |
4079 | pg_log.roll_forward(rollbacker.get()); | |
4080 | } | |
4081 | if (roll_forward_to && *roll_forward_to > pg_log.get_can_rollback_to()) { | |
4082 | pg_log.roll_forward_to(*roll_forward_to, rollbacker.get()); | |
4083 | last_rollback_info_trimmed_to_applied = *roll_forward_to; | |
4084 | } | |
4085 | ||
4086 | info.last_update = pg_log.get_head(); | |
4087 | ||
4088 | if (pg_log.get_missing().num_missing() == 0) { | |
4089 | // advance last_complete since nothing else is missing! | |
4090 | info.last_complete = info.last_update; | |
4091 | } | |
4092 | info.stats.stats_invalid = info.stats.stats_invalid || invalidate_stats; | |
4093 | ||
4094 | psdout(20) << __func__ << " trim_to bool = " << bool(trim_to) | |
4095 | << " trim_to = " << (trim_to ? *trim_to : eversion_t()) << dendl; | |
4096 | if (trim_to) | |
4097 | pg_log.trim(*trim_to, info); | |
4098 | dirty_info = true; | |
4099 | write_if_dirty(t); | |
4100 | return invalidate_stats; | |
4101 | } | |
4102 | ||
4103 | void PeeringState::merge_new_log_entries( | |
4104 | const mempool::osd_pglog::list<pg_log_entry_t> &entries, | |
4105 | ObjectStore::Transaction &t, | |
4106 | std::optional<eversion_t> trim_to, | |
4107 | std::optional<eversion_t> roll_forward_to) | |
4108 | { | |
4109 | psdout(10) << __func__ << " " << entries << dendl; | |
4110 | ceph_assert(is_primary()); | |
4111 | ||
4112 | bool rebuild_missing = append_log_entries_update_missing(entries, t, trim_to, roll_forward_to); | |
f67539c2 | 4113 | for (auto i = acting_recovery_backfill.begin(); |
9f95a23c TL |
4114 | i != acting_recovery_backfill.end(); |
4115 | ++i) { | |
4116 | pg_shard_t peer(*i); | |
4117 | if (peer == pg_whoami) continue; | |
4118 | ceph_assert(peer_missing.count(peer)); | |
4119 | ceph_assert(peer_info.count(peer)); | |
4120 | pg_missing_t& pmissing(peer_missing[peer]); | |
4121 | psdout(20) << __func__ << " peer_missing for " << peer | |
4122 | << " = " << pmissing << dendl; | |
4123 | pg_info_t& pinfo(peer_info[peer]); | |
4124 | bool invalidate_stats = PGLog::append_log_entries_update_missing( | |
4125 | pinfo.last_backfill, | |
4126 | entries, | |
4127 | true, | |
4128 | NULL, | |
4129 | pmissing, | |
4130 | NULL, | |
4131 | dpp); | |
4132 | pinfo.last_update = info.last_update; | |
4133 | pinfo.stats.stats_invalid = pinfo.stats.stats_invalid || invalidate_stats; | |
4134 | rebuild_missing = rebuild_missing || invalidate_stats; | |
4135 | } | |
4136 | ||
4137 | if (!rebuild_missing) { | |
4138 | return; | |
4139 | } | |
4140 | ||
4141 | for (auto &&i: entries) { | |
4142 | missing_loc.rebuild( | |
4143 | i.soid, | |
4144 | pg_whoami, | |
4145 | acting_recovery_backfill, | |
4146 | info, | |
4147 | pg_log.get_missing(), | |
4148 | peer_missing, | |
4149 | peer_info); | |
4150 | } | |
4151 | } | |
4152 | ||
4153 | void PeeringState::add_log_entry(const pg_log_entry_t& e, bool applied) | |
4154 | { | |
4155 | // raise last_complete only if we were previously up to date | |
4156 | if (info.last_complete == info.last_update) | |
4157 | info.last_complete = e.version; | |
4158 | ||
4159 | // raise last_update. | |
4160 | ceph_assert(e.version > info.last_update); | |
4161 | info.last_update = e.version; | |
4162 | ||
4163 | // raise user_version, if it increased (it may have not get bumped | |
4164 | // by all logged updates) | |
4165 | if (e.user_version > info.last_user_version) | |
4166 | info.last_user_version = e.user_version; | |
4167 | ||
4168 | // log mutation | |
4169 | pg_log.add(e, applied); | |
4170 | psdout(10) << "add_log_entry " << e << dendl; | |
4171 | } | |
4172 | ||
4173 | ||
4174 | void PeeringState::append_log( | |
f67539c2 | 4175 | vector<pg_log_entry_t>&& logv, |
9f95a23c TL |
4176 | eversion_t trim_to, |
4177 | eversion_t roll_forward_to, | |
4178 | eversion_t mlcod, | |
4179 | ObjectStore::Transaction &t, | |
4180 | bool transaction_applied, | |
4181 | bool async) | |
4182 | { | |
4183 | /* The primary has sent an info updating the history, but it may not | |
4184 | * have arrived yet. We want to make sure that we cannot remember this | |
4185 | * write without remembering that it happened in an interval which went | |
4186 | * active in epoch history.last_epoch_started. | |
4187 | */ | |
4188 | if (info.last_epoch_started != info.history.last_epoch_started) { | |
4189 | info.history.last_epoch_started = info.last_epoch_started; | |
4190 | } | |
4191 | if (info.last_interval_started != info.history.last_interval_started) { | |
4192 | info.history.last_interval_started = info.last_interval_started; | |
4193 | } | |
4194 | psdout(10) << "append_log " << pg_log.get_log() << " " << logv << dendl; | |
4195 | ||
4196 | PGLog::LogEntryHandlerRef handler{pl->get_log_handler(t)}; | |
4197 | if (!transaction_applied) { | |
4198 | /* We must be a backfill or async recovery peer, so it's ok if we apply | |
4199 | * out-of-turn since we won't be considered when | |
4200 | * determining a min possible last_update. | |
4201 | * | |
4202 | * We skip_rollforward() here, which advances the crt, without | |
4203 | * doing an actual rollforward. This avoids cleaning up entries | |
4204 | * from the backend and we do not end up in a situation, where the | |
4205 | * object is deleted before we can _merge_object_divergent_entries(). | |
4206 | */ | |
4207 | pg_log.skip_rollforward(); | |
4208 | } | |
4209 | ||
f67539c2 | 4210 | for (auto p = logv.begin(); p != logv.end(); ++p) { |
9f95a23c TL |
4211 | add_log_entry(*p, transaction_applied); |
4212 | ||
4213 | /* We don't want to leave the rollforward artifacts around | |
4214 | * here past last_backfill. It's ok for the same reason as | |
4215 | * above */ | |
4216 | if (transaction_applied && | |
4217 | p->soid > info.last_backfill) { | |
4218 | pg_log.roll_forward(handler.get()); | |
4219 | } | |
4220 | } | |
4221 | if (transaction_applied && roll_forward_to > pg_log.get_can_rollback_to()) { | |
4222 | pg_log.roll_forward_to( | |
4223 | roll_forward_to, | |
4224 | handler.get()); | |
4225 | last_rollback_info_trimmed_to_applied = roll_forward_to; | |
4226 | } | |
4227 | ||
4228 | psdout(10) << __func__ << " approx pg log length = " | |
4229 | << pg_log.get_log().approx_size() << dendl; | |
4230 | psdout(10) << __func__ << " transaction_applied = " | |
4231 | << transaction_applied << dendl; | |
4232 | if (!transaction_applied || async) | |
4233 | psdout(10) << __func__ << " " << pg_whoami | |
4234 | << " is async_recovery or backfill target" << dendl; | |
4235 | pg_log.trim(trim_to, info, transaction_applied, async); | |
4236 | ||
4237 | // update the local pg, pg log | |
4238 | dirty_info = true; | |
4239 | write_if_dirty(t); | |
4240 | ||
4241 | if (!is_primary()) | |
4242 | min_last_complete_ondisk = mlcod; | |
4243 | } | |
4244 | ||
4245 | void PeeringState::recover_got( | |
4246 | const hobject_t &oid, eversion_t v, | |
4247 | bool is_delete, | |
4248 | ObjectStore::Transaction &t) | |
4249 | { | |
4250 | if (v > pg_log.get_can_rollback_to()) { | |
4251 | /* This can only happen during a repair, and even then, it would | |
4252 | * be one heck of a race. If we are repairing the object, the | |
4253 | * write in question must be fully committed, so it's not valid | |
4254 | * to roll it back anyway (and we'll be rolled forward shortly | |
4255 | * anyway) */ | |
4256 | PGLog::LogEntryHandlerRef handler{pl->get_log_handler(t)}; | |
4257 | pg_log.roll_forward_to(v, handler.get()); | |
4258 | } | |
4259 | ||
4260 | psdout(10) << "got missing " << oid << " v " << v << dendl; | |
4261 | pg_log.recover_got(oid, v, info); | |
4262 | if (pg_log.get_log().log.empty()) { | |
4263 | psdout(10) << "last_complete now " << info.last_complete | |
4264 | << " while log is empty" << dendl; | |
4265 | } else if (pg_log.get_log().complete_to != pg_log.get_log().log.end()) { | |
4266 | psdout(10) << "last_complete now " << info.last_complete | |
4267 | << " log.complete_to " << pg_log.get_log().complete_to->version | |
4268 | << dendl; | |
4269 | } else { | |
4270 | psdout(10) << "last_complete now " << info.last_complete | |
4271 | << " log.complete_to at end" << dendl; | |
4272 | //below is not true in the repair case. | |
4273 | //assert(missing.num_missing() == 0); // otherwise, complete_to was wrong. | |
4274 | ceph_assert(info.last_complete == info.last_update); | |
4275 | } | |
4276 | ||
4277 | if (is_primary()) { | |
4278 | ceph_assert(missing_loc.needs_recovery(oid)); | |
4279 | if (!is_delete) | |
4280 | missing_loc.add_location(oid, pg_whoami); | |
4281 | } | |
4282 | ||
4283 | // update pg | |
4284 | dirty_info = true; | |
4285 | write_if_dirty(t); | |
4286 | } | |
4287 | ||
4288 | void PeeringState::update_backfill_progress( | |
4289 | const hobject_t &updated_backfill, | |
4290 | const pg_stat_t &updated_stats, | |
4291 | bool preserve_local_num_bytes, | |
4292 | ObjectStore::Transaction &t) { | |
4293 | info.set_last_backfill(updated_backfill); | |
4294 | if (preserve_local_num_bytes) { | |
4295 | psdout(25) << __func__ << " primary " << updated_stats.stats.sum.num_bytes | |
4296 | << " local " << info.stats.stats.sum.num_bytes << dendl; | |
4297 | int64_t bytes = info.stats.stats.sum.num_bytes; | |
4298 | info.stats = updated_stats; | |
4299 | info.stats.stats.sum.num_bytes = bytes; | |
4300 | } else { | |
4301 | psdout(20) << __func__ << " final " << updated_stats.stats.sum.num_bytes | |
4302 | << " replaces local " << info.stats.stats.sum.num_bytes << dendl; | |
4303 | info.stats = updated_stats; | |
4304 | } | |
4305 | ||
4306 | dirty_info = true; | |
4307 | write_if_dirty(t); | |
4308 | } | |
4309 | ||
4310 | void PeeringState::adjust_purged_snaps( | |
4311 | std::function<void(interval_set<snapid_t> &snaps)> f) { | |
4312 | f(info.purged_snaps); | |
4313 | dirty_info = true; | |
4314 | dirty_big_info = true; | |
4315 | } | |
4316 | ||
4317 | void PeeringState::on_peer_recover( | |
4318 | pg_shard_t peer, | |
4319 | const hobject_t &soid, | |
4320 | const eversion_t &version) | |
4321 | { | |
4322 | pl->publish_stats_to_osd(); | |
4323 | // done! | |
4324 | peer_missing[peer].got(soid, version); | |
4325 | missing_loc.add_location(soid, peer); | |
4326 | } | |
4327 | ||
4328 | void PeeringState::begin_peer_recover( | |
4329 | pg_shard_t peer, | |
4330 | const hobject_t soid) | |
4331 | { | |
4332 | peer_missing[peer].revise_have(soid, eversion_t()); | |
4333 | } | |
4334 | ||
4335 | void PeeringState::force_object_missing( | |
4336 | const set<pg_shard_t> &peers, | |
4337 | const hobject_t &soid, | |
4338 | eversion_t version) | |
4339 | { | |
4340 | for (auto &&peer : peers) { | |
4341 | if (peer != primary) { | |
4342 | peer_missing[peer].add(soid, version, eversion_t(), false); | |
4343 | } else { | |
4344 | pg_log.missing_add(soid, version, eversion_t()); | |
4345 | pg_log.reset_complete_to(&info); | |
4346 | pg_log.set_last_requested(0); | |
4347 | } | |
4348 | } | |
4349 | ||
4350 | missing_loc.rebuild( | |
4351 | soid, | |
4352 | pg_whoami, | |
4353 | acting_recovery_backfill, | |
4354 | info, | |
4355 | pg_log.get_missing(), | |
4356 | peer_missing, | |
4357 | peer_info); | |
4358 | } | |
4359 | ||
4360 | void PeeringState::pre_submit_op( | |
4361 | const hobject_t &hoid, | |
4362 | const vector<pg_log_entry_t>& logv, | |
4363 | eversion_t at_version) | |
4364 | { | |
4365 | if (at_version > eversion_t()) { | |
4366 | for (auto &&i : get_acting_recovery_backfill()) { | |
4367 | if (i == primary) continue; | |
4368 | pg_info_t &pinfo = peer_info[i]; | |
4369 | // keep peer_info up to date | |
4370 | if (pinfo.last_complete == pinfo.last_update) | |
4371 | pinfo.last_complete = at_version; | |
4372 | pinfo.last_update = at_version; | |
4373 | } | |
4374 | } | |
4375 | ||
4376 | bool requires_missing_loc = false; | |
4377 | for (auto &&i : get_async_recovery_targets()) { | |
4378 | if (i == primary || !get_peer_missing(i).is_missing(hoid)) | |
4379 | continue; | |
4380 | requires_missing_loc = true; | |
4381 | for (auto &&entry: logv) { | |
4382 | peer_missing[i].add_next_event(entry); | |
4383 | } | |
4384 | } | |
4385 | ||
4386 | if (requires_missing_loc) { | |
4387 | for (auto &&entry: logv) { | |
4388 | psdout(30) << __func__ << " missing_loc before: " | |
4389 | << missing_loc.get_locations(entry.soid) << dendl; | |
4390 | missing_loc.add_missing(entry.soid, entry.version, | |
4391 | eversion_t(), entry.is_delete()); | |
4392 | // clear out missing_loc | |
4393 | missing_loc.clear_location(entry.soid); | |
4394 | for (auto &i: get_actingset()) { | |
4395 | if (!get_peer_missing(i).is_missing(entry.soid)) | |
4396 | missing_loc.add_location(entry.soid, i); | |
4397 | } | |
4398 | psdout(30) << __func__ << " missing_loc after: " | |
4399 | << missing_loc.get_locations(entry.soid) << dendl; | |
4400 | } | |
4401 | } | |
4402 | } | |
4403 | ||
4404 | void PeeringState::recovery_committed_to(eversion_t version) | |
4405 | { | |
4406 | psdout(10) << __func__ << " version " << version | |
4407 | << " now ondisk" << dendl; | |
4408 | last_complete_ondisk = version; | |
4409 | ||
4410 | if (last_complete_ondisk == info.last_update) { | |
4411 | if (!is_primary()) { | |
4412 | // Either we are a replica or backfill target. | |
4413 | // we are fully up to date. tell the primary! | |
4414 | pl->send_cluster_message( | |
4415 | get_primary().osd, | |
f67539c2 | 4416 | make_message<MOSDPGTrim>( |
9f95a23c TL |
4417 | get_osdmap_epoch(), |
4418 | spg_t(info.pgid.pgid, primary.shard), | |
4419 | last_complete_ondisk), | |
4420 | get_osdmap_epoch()); | |
4421 | } else { | |
4422 | calc_min_last_complete_ondisk(); | |
4423 | } | |
4424 | } | |
4425 | } | |
4426 | ||
4427 | void PeeringState::complete_write(eversion_t v, eversion_t lc) | |
4428 | { | |
4429 | last_update_ondisk = v; | |
4430 | last_complete_ondisk = lc; | |
4431 | calc_min_last_complete_ondisk(); | |
4432 | } | |
4433 | ||
4434 | void PeeringState::calc_trim_to() | |
4435 | { | |
4436 | size_t target = pl->get_target_pg_log_entries(); | |
4437 | ||
4438 | eversion_t limit = std::min( | |
4439 | min_last_complete_ondisk, | |
4440 | pg_log.get_can_rollback_to()); | |
4441 | if (limit != eversion_t() && | |
4442 | limit != pg_trim_to && | |
4443 | pg_log.get_log().approx_size() > target) { | |
4444 | size_t num_to_trim = std::min(pg_log.get_log().approx_size() - target, | |
4445 | cct->_conf->osd_pg_log_trim_max); | |
4446 | if (num_to_trim < cct->_conf->osd_pg_log_trim_min && | |
4447 | cct->_conf->osd_pg_log_trim_max >= cct->_conf->osd_pg_log_trim_min) { | |
4448 | return; | |
4449 | } | |
f67539c2 | 4450 | auto it = pg_log.get_log().log.begin(); |
9f95a23c TL |
4451 | eversion_t new_trim_to; |
4452 | for (size_t i = 0; i < num_to_trim; ++i) { | |
4453 | new_trim_to = it->version; | |
4454 | ++it; | |
4455 | if (new_trim_to > limit) { | |
4456 | new_trim_to = limit; | |
4457 | psdout(10) << "calc_trim_to trimming to min_last_complete_ondisk" << dendl; | |
4458 | break; | |
4459 | } | |
4460 | } | |
4461 | psdout(10) << "calc_trim_to " << pg_trim_to << " -> " << new_trim_to << dendl; | |
4462 | pg_trim_to = new_trim_to; | |
4463 | assert(pg_trim_to <= pg_log.get_head()); | |
4464 | assert(pg_trim_to <= min_last_complete_ondisk); | |
4465 | } | |
4466 | } | |
4467 | ||
4468 | void PeeringState::calc_trim_to_aggressive() | |
4469 | { | |
4470 | size_t target = pl->get_target_pg_log_entries(); | |
4471 | ||
4472 | // limit pg log trimming up to the can_rollback_to value | |
1911f103 | 4473 | eversion_t limit = std::min({ |
9f95a23c | 4474 | pg_log.get_head(), |
1911f103 TL |
4475 | pg_log.get_can_rollback_to(), |
4476 | last_update_ondisk}); | |
9f95a23c TL |
4477 | psdout(10) << __func__ << " limit = " << limit << dendl; |
4478 | ||
4479 | if (limit != eversion_t() && | |
4480 | limit != pg_trim_to && | |
4481 | pg_log.get_log().approx_size() > target) { | |
4482 | psdout(10) << __func__ << " approx pg log length = " | |
4483 | << pg_log.get_log().approx_size() << dendl; | |
4484 | uint64_t num_to_trim = std::min<uint64_t>(pg_log.get_log().approx_size() - target, | |
4485 | cct->_conf->osd_pg_log_trim_max); | |
4486 | psdout(10) << __func__ << " num_to_trim = " << num_to_trim << dendl; | |
4487 | if (num_to_trim < cct->_conf->osd_pg_log_trim_min && | |
4488 | cct->_conf->osd_pg_log_trim_max >= cct->_conf->osd_pg_log_trim_min) { | |
4489 | return; | |
4490 | } | |
4491 | auto it = pg_log.get_log().log.begin(); // oldest log entry | |
4492 | auto rit = pg_log.get_log().log.rbegin(); | |
4493 | eversion_t by_n_to_keep; // start from tail | |
4494 | eversion_t by_n_to_trim = eversion_t::max(); // start from head | |
4495 | for (size_t i = 0; it != pg_log.get_log().log.end(); ++it, ++rit) { | |
4496 | i++; | |
4497 | if (i > target && by_n_to_keep == eversion_t()) { | |
4498 | by_n_to_keep = rit->version; | |
4499 | } | |
4500 | if (i >= num_to_trim && by_n_to_trim == eversion_t::max()) { | |
4501 | by_n_to_trim = it->version; | |
4502 | } | |
4503 | if (by_n_to_keep != eversion_t() && | |
4504 | by_n_to_trim != eversion_t::max()) { | |
4505 | break; | |
4506 | } | |
4507 | } | |
4508 | ||
4509 | if (by_n_to_keep == eversion_t()) { | |
4510 | return; | |
4511 | } | |
4512 | ||
4513 | pg_trim_to = std::min({by_n_to_keep, by_n_to_trim, limit}); | |
4514 | psdout(10) << __func__ << " pg_trim_to now " << pg_trim_to << dendl; | |
4515 | ceph_assert(pg_trim_to <= pg_log.get_head()); | |
4516 | } | |
4517 | } | |
4518 | ||
4519 | void PeeringState::apply_op_stats( | |
4520 | const hobject_t &soid, | |
4521 | const object_stat_sum_t &delta_stats) | |
4522 | { | |
4523 | info.stats.stats.add(delta_stats); | |
4524 | info.stats.stats.floor(0); | |
4525 | ||
f67539c2 | 4526 | for (auto i = get_backfill_targets().begin(); |
9f95a23c TL |
4527 | i != get_backfill_targets().end(); |
4528 | ++i) { | |
4529 | pg_shard_t bt = *i; | |
4530 | pg_info_t& pinfo = peer_info[bt]; | |
4531 | if (soid <= pinfo.last_backfill) | |
4532 | pinfo.stats.stats.add(delta_stats); | |
4533 | } | |
4534 | } | |
4535 | ||
4536 | void PeeringState::update_complete_backfill_object_stats( | |
4537 | const hobject_t &hoid, | |
4538 | const pg_stat_t &stats) | |
4539 | { | |
4540 | for (auto &&bt: get_backfill_targets()) { | |
4541 | pg_info_t& pinfo = peer_info[bt]; | |
4542 | //Add stats to all peers that were missing object | |
4543 | if (hoid > pinfo.last_backfill) | |
4544 | pinfo.stats.add(stats); | |
4545 | } | |
4546 | } | |
4547 | ||
4548 | void PeeringState::update_peer_last_backfill( | |
4549 | pg_shard_t peer, | |
4550 | const hobject_t &new_last_backfill) | |
4551 | { | |
4552 | pg_info_t &pinfo = peer_info[peer]; | |
4553 | pinfo.last_backfill = new_last_backfill; | |
4554 | if (new_last_backfill.is_max()) { | |
4555 | /* pinfo.stats might be wrong if we did log-based recovery on the | |
4556 | * backfilled portion in addition to continuing backfill. | |
4557 | */ | |
4558 | pinfo.stats = info.stats; | |
4559 | } | |
4560 | } | |
4561 | ||
4562 | void PeeringState::set_revert_with_targets( | |
4563 | const hobject_t &soid, | |
4564 | const set<pg_shard_t> &good_peers) | |
4565 | { | |
4566 | for (auto &&peer: good_peers) { | |
4567 | missing_loc.add_location(soid, peer); | |
4568 | } | |
4569 | } | |
4570 | ||
4571 | void PeeringState::prepare_backfill_for_missing( | |
4572 | const hobject_t &soid, | |
4573 | const eversion_t &version, | |
4574 | const vector<pg_shard_t> &targets) { | |
4575 | for (auto &&peer: targets) { | |
4576 | peer_missing[peer].add(soid, version, eversion_t(), false); | |
4577 | } | |
4578 | } | |
4579 | ||
4580 | void PeeringState::update_hset(const pg_hit_set_history_t &hset_history) | |
4581 | { | |
4582 | info.hit_set = hset_history; | |
4583 | } | |
4584 | ||
4585 | /*------------ Peering State Machine----------------*/ | |
// Logging inside state classes: the prefix comes from the machine's dpp
// and is followed by the current state's name.
#undef dout_prefix
#define dout_prefix (context<PeeringMachine>().dpp->gen_prefix(*_dout) \
                     << "state<" << get_state_name() << ">: ")
#undef psdout
#define psdout(x) ldout(context<PeeringMachine>().cct, x)

// Bring `ps` (PeeringState) and `pl` (PeeringListener) into scope inside a
// state member function; the std::ignore assignments keep unused-variable
// warnings away when only one of them is needed.
#define DECLARE_LOCALS \
  PeeringState *ps = context<PeeringMachine>().state; \
  std::ignore = ps; \
  PeeringListener *pl = context<PeeringMachine>().pl; \
  std::ignore = pl
4597 | ||
4598 | ||
4599 | /*------Crashed-------*/ | |
4600 | PeeringState::Crashed::Crashed(my_context ctx) | |
4601 | : my_base(ctx), | |
4602 | NamedState(context< PeeringMachine >().state_history, "Crashed") | |
4603 | { | |
4604 | context< PeeringMachine >().log_enter(state_name); | |
4605 | ceph_abort_msg("we got a bad state machine event"); | |
4606 | } | |
4607 | ||
4608 | ||
4609 | /*------Initial-------*/ | |
4610 | PeeringState::Initial::Initial(my_context ctx) | |
4611 | : my_base(ctx), | |
4612 | NamedState(context< PeeringMachine >().state_history, "Initial") | |
4613 | { | |
4614 | context< PeeringMachine >().log_enter(state_name); | |
4615 | } | |
4616 | ||
4617 | boost::statechart::result PeeringState::Initial::react(const MNotifyRec& notify) | |
4618 | { | |
4619 | DECLARE_LOCALS; | |
4620 | ps->proc_replica_info( | |
4621 | notify.from, notify.notify.info, notify.notify.epoch_sent); | |
4622 | ps->set_last_peering_reset(); | |
4623 | return transit< Primary >(); | |
4624 | } | |
4625 | ||
4626 | boost::statechart::result PeeringState::Initial::react(const MInfoRec& i) | |
4627 | { | |
4628 | DECLARE_LOCALS; | |
4629 | ceph_assert(!ps->is_primary()); | |
4630 | post_event(i); | |
4631 | return transit< Stray >(); | |
4632 | } | |
4633 | ||
4634 | boost::statechart::result PeeringState::Initial::react(const MLogRec& i) | |
4635 | { | |
4636 | DECLARE_LOCALS; | |
4637 | ceph_assert(!ps->is_primary()); | |
4638 | post_event(i); | |
4639 | return transit< Stray >(); | |
4640 | } | |
4641 | ||
4642 | void PeeringState::Initial::exit() | |
4643 | { | |
4644 | context< PeeringMachine >().log_exit(state_name, enter_time); | |
4645 | DECLARE_LOCALS; | |
4646 | utime_t dur = ceph_clock_now() - enter_time; | |
4647 | pl->get_peering_perf().tinc(rs_initial_latency, dur); | |
4648 | } | |
4649 | ||
4650 | /*------Started-------*/ | |
4651 | PeeringState::Started::Started(my_context ctx) | |
4652 | : my_base(ctx), | |
4653 | NamedState(context< PeeringMachine >().state_history, "Started") | |
4654 | { | |
4655 | context< PeeringMachine >().log_enter(state_name); | |
4656 | } | |
4657 | ||
4658 | boost::statechart::result | |
4659 | PeeringState::Started::react(const IntervalFlush&) | |
4660 | { | |
4661 | psdout(10) << "Ending blocked outgoing recovery messages" << dendl; | |
4662 | context< PeeringMachine >().state->end_block_outgoing(); | |
4663 | return discard_event(); | |
4664 | } | |
4665 | ||
4666 | boost::statechart::result PeeringState::Started::react(const AdvMap& advmap) | |
4667 | { | |
4668 | DECLARE_LOCALS; | |
4669 | psdout(10) << "Started advmap" << dendl; | |
4670 | ps->check_full_transition(advmap.lastmap, advmap.osdmap); | |
4671 | if (ps->should_restart_peering( | |
4672 | advmap.up_primary, | |
4673 | advmap.acting_primary, | |
4674 | advmap.newup, | |
4675 | advmap.newacting, | |
4676 | advmap.lastmap, | |
4677 | advmap.osdmap)) { | |
4678 | psdout(10) << "should_restart_peering, transitioning to Reset" | |
4679 | << dendl; | |
4680 | post_event(advmap); | |
4681 | return transit< Reset >(); | |
4682 | } | |
4683 | ps->remove_down_peer_info(advmap.osdmap); | |
4684 | return discard_event(); | |
4685 | } | |
4686 | ||
4687 | boost::statechart::result PeeringState::Started::react(const QueryState& q) | |
4688 | { | |
4689 | q.f->open_object_section("state"); | |
4690 | q.f->dump_string("name", state_name); | |
4691 | q.f->dump_stream("enter_time") << enter_time; | |
4692 | q.f->close_section(); | |
4693 | return discard_event(); | |
4694 | } | |
4695 | ||
f67539c2 TL |
4696 | boost::statechart::result PeeringState::Started::react(const QueryUnfound& q) |
4697 | { | |
4698 | q.f->dump_string("state", "Started"); | |
4699 | q.f->dump_bool("available_might_have_unfound", false); | |
4700 | return discard_event(); | |
4701 | } | |
4702 | ||
9f95a23c TL |
4703 | void PeeringState::Started::exit() |
4704 | { | |
4705 | context< PeeringMachine >().log_exit(state_name, enter_time); | |
4706 | DECLARE_LOCALS; | |
4707 | utime_t dur = ceph_clock_now() - enter_time; | |
4708 | pl->get_peering_perf().tinc(rs_started_latency, dur); | |
4709 | ps->state_clear(PG_STATE_WAIT | PG_STATE_LAGGY); | |
4710 | } | |
4711 | ||
4712 | /*--------Reset---------*/ | |
4713 | PeeringState::Reset::Reset(my_context ctx) | |
4714 | : my_base(ctx), | |
4715 | NamedState(context< PeeringMachine >().state_history, "Reset") | |
4716 | { | |
4717 | context< PeeringMachine >().log_enter(state_name); | |
4718 | DECLARE_LOCALS; | |
4719 | ||
4720 | ps->flushes_in_progress = 0; | |
4721 | ps->set_last_peering_reset(); | |
4722 | ps->log_weirdness(); | |
4723 | } | |
4724 | ||
4725 | boost::statechart::result | |
4726 | PeeringState::Reset::react(const IntervalFlush&) | |
4727 | { | |
4728 | psdout(10) << "Ending blocked outgoing recovery messages" << dendl; | |
4729 | context< PeeringMachine >().state->end_block_outgoing(); | |
4730 | return discard_event(); | |
4731 | } | |
4732 | ||
4733 | boost::statechart::result PeeringState::Reset::react(const AdvMap& advmap) | |
4734 | { | |
4735 | DECLARE_LOCALS; | |
4736 | psdout(10) << "Reset advmap" << dendl; | |
4737 | ||
4738 | ps->check_full_transition(advmap.lastmap, advmap.osdmap); | |
4739 | ||
4740 | if (ps->should_restart_peering( | |
4741 | advmap.up_primary, | |
4742 | advmap.acting_primary, | |
4743 | advmap.newup, | |
4744 | advmap.newacting, | |
4745 | advmap.lastmap, | |
4746 | advmap.osdmap)) { | |
4747 | psdout(10) << "should restart peering, calling start_peering_interval again" | |
4748 | << dendl; | |
4749 | ps->start_peering_interval( | |
4750 | advmap.lastmap, | |
4751 | advmap.newup, advmap.up_primary, | |
4752 | advmap.newacting, advmap.acting_primary, | |
4753 | context< PeeringMachine >().get_cur_transaction()); | |
4754 | } | |
4755 | ps->remove_down_peer_info(advmap.osdmap); | |
4756 | ps->check_past_interval_bounds(); | |
4757 | return discard_event(); | |
4758 | } | |
4759 | ||
4760 | boost::statechart::result PeeringState::Reset::react(const ActMap&) | |
4761 | { | |
4762 | DECLARE_LOCALS; | |
4763 | if (ps->should_send_notify() && ps->get_primary().osd >= 0) { | |
4764 | ps->info.history.refresh_prior_readable_until_ub( | |
4765 | pl->get_mnow(), | |
4766 | ps->prior_readable_until_ub); | |
4767 | context< PeeringMachine >().send_notify( | |
4768 | ps->get_primary().osd, | |
4769 | pg_notify_t( | |
4770 | ps->get_primary().shard, ps->pg_whoami.shard, | |
4771 | ps->get_osdmap_epoch(), | |
4772 | ps->get_osdmap_epoch(), | |
4773 | ps->info, | |
4774 | ps->past_intervals)); | |
4775 | } | |
4776 | ||
4777 | ps->update_heartbeat_peers(); | |
4778 | ||
4779 | return transit< Started >(); | |
4780 | } | |
4781 | ||
4782 | boost::statechart::result PeeringState::Reset::react(const QueryState& q) | |
4783 | { | |
4784 | q.f->open_object_section("state"); | |
4785 | q.f->dump_string("name", state_name); | |
4786 | q.f->dump_stream("enter_time") << enter_time; | |
4787 | q.f->close_section(); | |
4788 | return discard_event(); | |
4789 | } | |
4790 | ||
f67539c2 TL |
4791 | boost::statechart::result PeeringState::Reset::react(const QueryUnfound& q) |
4792 | { | |
4793 | q.f->dump_string("state", "Reset"); | |
4794 | q.f->dump_bool("available_might_have_unfound", false); | |
4795 | return discard_event(); | |
4796 | } | |
4797 | ||
9f95a23c TL |
4798 | void PeeringState::Reset::exit() |
4799 | { | |
4800 | context< PeeringMachine >().log_exit(state_name, enter_time); | |
4801 | DECLARE_LOCALS; | |
4802 | utime_t dur = ceph_clock_now() - enter_time; | |
4803 | pl->get_peering_perf().tinc(rs_reset_latency, dur); | |
4804 | } | |
4805 | ||
4806 | /*-------Start---------*/ | |
4807 | PeeringState::Start::Start(my_context ctx) | |
4808 | : my_base(ctx), | |
4809 | NamedState(context< PeeringMachine >().state_history, "Start") | |
4810 | { | |
4811 | context< PeeringMachine >().log_enter(state_name); | |
4812 | ||
4813 | DECLARE_LOCALS; | |
4814 | if (ps->is_primary()) { | |
4815 | psdout(1) << "transitioning to Primary" << dendl; | |
4816 | post_event(MakePrimary()); | |
4817 | } else { //is_stray | |
4818 | psdout(1) << "transitioning to Stray" << dendl; | |
4819 | post_event(MakeStray()); | |
4820 | } | |
4821 | } | |
4822 | ||
4823 | void PeeringState::Start::exit() | |
4824 | { | |
4825 | context< PeeringMachine >().log_exit(state_name, enter_time); | |
4826 | DECLARE_LOCALS; | |
4827 | utime_t dur = ceph_clock_now() - enter_time; | |
4828 | pl->get_peering_perf().tinc(rs_start_latency, dur); | |
4829 | } | |
4830 | ||
4831 | /*---------Primary--------*/ | |
4832 | PeeringState::Primary::Primary(my_context ctx) | |
4833 | : my_base(ctx), | |
4834 | NamedState(context< PeeringMachine >().state_history, "Started/Primary") | |
4835 | { | |
4836 | context< PeeringMachine >().log_enter(state_name); | |
4837 | DECLARE_LOCALS; | |
4838 | ceph_assert(ps->want_acting.empty()); | |
4839 | ||
4840 | // set CREATING bit until we have peered for the first time. | |
4841 | if (ps->info.history.last_epoch_started == 0) { | |
4842 | ps->state_set(PG_STATE_CREATING); | |
4843 | // use the history timestamp, which ultimately comes from the | |
4844 | // monitor in the create case. | |
4845 | utime_t t = ps->info.history.last_scrub_stamp; | |
4846 | ps->info.stats.last_fresh = t; | |
4847 | ps->info.stats.last_active = t; | |
4848 | ps->info.stats.last_change = t; | |
4849 | ps->info.stats.last_peered = t; | |
4850 | ps->info.stats.last_clean = t; | |
4851 | ps->info.stats.last_unstale = t; | |
4852 | ps->info.stats.last_undegraded = t; | |
4853 | ps->info.stats.last_fullsized = t; | |
4854 | ps->info.stats.last_scrub_stamp = t; | |
4855 | ps->info.stats.last_deep_scrub_stamp = t; | |
4856 | ps->info.stats.last_clean_scrub_stamp = t; | |
4857 | } | |
4858 | } | |
4859 | ||
4860 | boost::statechart::result PeeringState::Primary::react(const MNotifyRec& notevt) | |
4861 | { | |
4862 | DECLARE_LOCALS; | |
4863 | psdout(7) << "handle_pg_notify from osd." << notevt.from << dendl; | |
4864 | ps->proc_replica_info( | |
4865 | notevt.from, notevt.notify.info, notevt.notify.epoch_sent); | |
4866 | return discard_event(); | |
4867 | } | |
4868 | ||
4869 | boost::statechart::result PeeringState::Primary::react(const ActMap&) | |
4870 | { | |
4871 | DECLARE_LOCALS; | |
4872 | psdout(7) << "handle ActMap primary" << dendl; | |
4873 | pl->publish_stats_to_osd(); | |
4874 | return discard_event(); | |
4875 | } | |
4876 | ||
4877 | boost::statechart::result PeeringState::Primary::react( | |
4878 | const SetForceRecovery&) | |
4879 | { | |
4880 | DECLARE_LOCALS; | |
4881 | ps->set_force_recovery(true); | |
4882 | return discard_event(); | |
4883 | } | |
4884 | ||
4885 | boost::statechart::result PeeringState::Primary::react( | |
4886 | const UnsetForceRecovery&) | |
4887 | { | |
4888 | DECLARE_LOCALS; | |
4889 | ps->set_force_recovery(false); | |
4890 | return discard_event(); | |
4891 | } | |
4892 | ||
4893 | boost::statechart::result PeeringState::Primary::react( | |
4894 | const RequestScrub& evt) | |
4895 | { | |
4896 | DECLARE_LOCALS; | |
4897 | if (ps->is_primary()) { | |
4898 | pl->scrub_requested(evt.deep, evt.repair); | |
4899 | psdout(10) << "marking for scrub" << dendl; | |
4900 | } | |
4901 | return discard_event(); | |
4902 | } | |
4903 | ||
4904 | boost::statechart::result PeeringState::Primary::react( | |
4905 | const SetForceBackfill&) | |
4906 | { | |
4907 | DECLARE_LOCALS; | |
4908 | ps->set_force_backfill(true); | |
4909 | return discard_event(); | |
4910 | } | |
4911 | ||
4912 | boost::statechart::result PeeringState::Primary::react( | |
4913 | const UnsetForceBackfill&) | |
4914 | { | |
4915 | DECLARE_LOCALS; | |
4916 | ps->set_force_backfill(false); | |
4917 | return discard_event(); | |
4918 | } | |
4919 | ||
4920 | void PeeringState::Primary::exit() | |
4921 | { | |
4922 | context< PeeringMachine >().log_exit(state_name, enter_time); | |
4923 | DECLARE_LOCALS; | |
4924 | ps->want_acting.clear(); | |
4925 | utime_t dur = ceph_clock_now() - enter_time; | |
4926 | pl->get_peering_perf().tinc(rs_primary_latency, dur); | |
4927 | pl->clear_primary_state(); | |
4928 | ps->state_clear(PG_STATE_CREATING); | |
4929 | } | |
4930 | ||
4931 | /*---------Peering--------*/ | |
4932 | PeeringState::Peering::Peering(my_context ctx) | |
4933 | : my_base(ctx), | |
4934 | NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering"), | |
4935 | history_les_bound(false) | |
4936 | { | |
4937 | context< PeeringMachine >().log_enter(state_name); | |
4938 | DECLARE_LOCALS; | |
4939 | ||
4940 | ceph_assert(!ps->is_peered()); | |
4941 | ceph_assert(!ps->is_peering()); | |
4942 | ceph_assert(ps->is_primary()); | |
4943 | ps->state_set(PG_STATE_PEERING); | |
4944 | } | |
4945 | ||
4946 | boost::statechart::result PeeringState::Peering::react(const AdvMap& advmap) | |
4947 | { | |
4948 | DECLARE_LOCALS; | |
4949 | psdout(10) << "Peering advmap" << dendl; | |
4950 | if (prior_set.affected_by_map(*(advmap.osdmap), ps->dpp)) { | |
4951 | psdout(1) << "Peering, affected_by_map, going to Reset" << dendl; | |
4952 | post_event(advmap); | |
4953 | return transit< Reset >(); | |
4954 | } | |
4955 | ||
4956 | ps->adjust_need_up_thru(advmap.osdmap); | |
4957 | ps->check_prior_readable_down_osds(advmap.osdmap); | |
4958 | ||
4959 | return forward_event(); | |
4960 | } | |
4961 | ||
4962 | boost::statechart::result PeeringState::Peering::react(const QueryState& q) | |
4963 | { | |
4964 | DECLARE_LOCALS; | |
4965 | ||
4966 | q.f->open_object_section("state"); | |
4967 | q.f->dump_string("name", state_name); | |
4968 | q.f->dump_stream("enter_time") << enter_time; | |
4969 | ||
4970 | q.f->open_array_section("past_intervals"); | |
4971 | ps->past_intervals.dump(q.f); | |
4972 | q.f->close_section(); | |
4973 | ||
4974 | q.f->open_array_section("probing_osds"); | |
f67539c2 | 4975 | for (auto p = prior_set.probe.begin(); p != prior_set.probe.end(); ++p) |
9f95a23c TL |
4976 | q.f->dump_stream("osd") << *p; |
4977 | q.f->close_section(); | |
4978 | ||
4979 | if (prior_set.pg_down) | |
4980 | q.f->dump_string("blocked", "peering is blocked due to down osds"); | |
4981 | ||
4982 | q.f->open_array_section("down_osds_we_would_probe"); | |
f67539c2 | 4983 | for (auto p = prior_set.down.begin(); p != prior_set.down.end(); ++p) |
9f95a23c TL |
4984 | q.f->dump_int("osd", *p); |
4985 | q.f->close_section(); | |
4986 | ||
4987 | q.f->open_array_section("peering_blocked_by"); | |
f67539c2 | 4988 | for (auto p = prior_set.blocked_by.begin(); |
9f95a23c TL |
4989 | p != prior_set.blocked_by.end(); |
4990 | ++p) { | |
4991 | q.f->open_object_section("osd"); | |
4992 | q.f->dump_int("osd", p->first); | |
4993 | q.f->dump_int("current_lost_at", p->second); | |
4994 | q.f->dump_string("comment", "starting or marking this osd lost may let us proceed"); | |
4995 | q.f->close_section(); | |
4996 | } | |
4997 | q.f->close_section(); | |
4998 | ||
4999 | if (history_les_bound) { | |
5000 | q.f->open_array_section("peering_blocked_by_detail"); | |
5001 | q.f->open_object_section("item"); | |
5002 | q.f->dump_string("detail","peering_blocked_by_history_les_bound"); | |
5003 | q.f->close_section(); | |
5004 | q.f->close_section(); | |
5005 | } | |
5006 | ||
5007 | q.f->close_section(); | |
5008 | return forward_event(); | |
5009 | } | |
5010 | ||
f67539c2 TL |
5011 | boost::statechart::result PeeringState::Peering::react(const QueryUnfound& q) |
5012 | { | |
5013 | q.f->dump_string("state", "Peering"); | |
5014 | q.f->dump_bool("available_might_have_unfound", false); | |
5015 | return discard_event(); | |
5016 | } | |
5017 | ||
9f95a23c TL |
5018 | void PeeringState::Peering::exit() |
5019 | { | |
5020 | ||
5021 | DECLARE_LOCALS; | |
5022 | psdout(10) << "Leaving Peering" << dendl; | |
5023 | context< PeeringMachine >().log_exit(state_name, enter_time); | |
5024 | ps->state_clear(PG_STATE_PEERING); | |
5025 | pl->clear_probe_targets(); | |
5026 | ||
5027 | utime_t dur = ceph_clock_now() - enter_time; | |
5028 | pl->get_peering_perf().tinc(rs_peering_latency, dur); | |
5029 | } | |
5030 | ||
5031 | ||
5032 | /*------Backfilling-------*/ | |
5033 | PeeringState::Backfilling::Backfilling(my_context ctx) | |
5034 | : my_base(ctx), | |
5035 | NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/Backfilling") | |
5036 | { | |
5037 | context< PeeringMachine >().log_enter(state_name); | |
5038 | ||
5039 | ||
5040 | DECLARE_LOCALS; | |
5041 | ps->backfill_reserved = true; | |
5042 | pl->on_backfill_reserved(); | |
5043 | ps->state_clear(PG_STATE_BACKFILL_TOOFULL); | |
5044 | ps->state_clear(PG_STATE_BACKFILL_WAIT); | |
5045 | ps->state_set(PG_STATE_BACKFILLING); | |
5046 | pl->publish_stats_to_osd(); | |
5047 | } | |
5048 | ||
5049 | void PeeringState::Backfilling::backfill_release_reservations() | |
5050 | { | |
5051 | DECLARE_LOCALS; | |
5052 | pl->cancel_local_background_io_reservation(); | |
f67539c2 | 5053 | for (auto it = ps->backfill_targets.begin(); |
9f95a23c TL |
5054 | it != ps->backfill_targets.end(); |
5055 | ++it) { | |
5056 | ceph_assert(*it != ps->pg_whoami); | |
5057 | pl->send_cluster_message( | |
5058 | it->osd, | |
f67539c2 | 5059 | make_message<MBackfillReserve>( |
9f95a23c TL |
5060 | MBackfillReserve::RELEASE, |
5061 | spg_t(ps->info.pgid.pgid, it->shard), | |
5062 | ps->get_osdmap_epoch()), | |
5063 | ps->get_osdmap_epoch()); | |
5064 | } | |
5065 | } | |
5066 | ||
5067 | void PeeringState::Backfilling::cancel_backfill() | |
5068 | { | |
5069 | DECLARE_LOCALS; | |
5070 | backfill_release_reservations(); | |
5071 | pl->on_backfill_canceled(); | |
5072 | } | |
5073 | ||
5074 | boost::statechart::result | |
5075 | PeeringState::Backfilling::react(const Backfilled &c) | |
5076 | { | |
5077 | backfill_release_reservations(); | |
5078 | return transit<Recovered>(); | |
5079 | } | |
5080 | ||
5081 | boost::statechart::result | |
5082 | PeeringState::Backfilling::react(const DeferBackfill &c) | |
5083 | { | |
5084 | DECLARE_LOCALS; | |
5085 | ||
5086 | psdout(10) << "defer backfill, retry delay " << c.delay << dendl; | |
5087 | ps->state_set(PG_STATE_BACKFILL_WAIT); | |
5088 | ps->state_clear(PG_STATE_BACKFILLING); | |
5089 | cancel_backfill(); | |
5090 | ||
5091 | pl->schedule_event_after( | |
5092 | std::make_shared<PGPeeringEvent>( | |
5093 | ps->get_osdmap_epoch(), | |
5094 | ps->get_osdmap_epoch(), | |
5095 | RequestBackfill()), | |
5096 | c.delay); | |
5097 | return transit<NotBackfilling>(); | |
5098 | } | |
5099 | ||
5100 | boost::statechart::result | |
5101 | PeeringState::Backfilling::react(const UnfoundBackfill &c) | |
5102 | { | |
5103 | DECLARE_LOCALS; | |
5104 | psdout(10) << "backfill has unfound, can't continue" << dendl; | |
5105 | ps->state_set(PG_STATE_BACKFILL_UNFOUND); | |
5106 | ps->state_clear(PG_STATE_BACKFILLING); | |
5107 | cancel_backfill(); | |
5108 | return transit<NotBackfilling>(); | |
5109 | } | |
5110 | ||
5111 | boost::statechart::result | |
5112 | PeeringState::Backfilling::react(const RemoteReservationRevokedTooFull &) | |
5113 | { | |
5114 | DECLARE_LOCALS; | |
5115 | ||
5116 | ps->state_set(PG_STATE_BACKFILL_TOOFULL); | |
5117 | ps->state_clear(PG_STATE_BACKFILLING); | |
5118 | cancel_backfill(); | |
5119 | ||
5120 | pl->schedule_event_after( | |
5121 | std::make_shared<PGPeeringEvent>( | |
5122 | ps->get_osdmap_epoch(), | |
5123 | ps->get_osdmap_epoch(), | |
5124 | RequestBackfill()), | |
5125 | ps->cct->_conf->osd_backfill_retry_interval); | |
5126 | ||
5127 | return transit<NotBackfilling>(); | |
5128 | } | |
5129 | ||
5130 | boost::statechart::result | |
5131 | PeeringState::Backfilling::react(const RemoteReservationRevoked &) | |
5132 | { | |
5133 | DECLARE_LOCALS; | |
5134 | ps->state_set(PG_STATE_BACKFILL_WAIT); | |
5135 | cancel_backfill(); | |
5136 | if (ps->needs_backfill()) { | |
5137 | return transit<WaitLocalBackfillReserved>(); | |
5138 | } else { | |
5139 | // raced with MOSDPGBackfill::OP_BACKFILL_FINISH, ignore | |
5140 | return discard_event(); | |
5141 | } | |
5142 | } | |
5143 | ||
5144 | void PeeringState::Backfilling::exit() | |
5145 | { | |
5146 | context< PeeringMachine >().log_exit(state_name, enter_time); | |
5147 | DECLARE_LOCALS; | |
5148 | ps->backfill_reserved = false; | |
5149 | ps->state_clear(PG_STATE_BACKFILLING); | |
5150 | ps->state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY); | |
5151 | utime_t dur = ceph_clock_now() - enter_time; | |
5152 | pl->get_peering_perf().tinc(rs_backfilling_latency, dur); | |
5153 | } | |
5154 | ||
5155 | /*--WaitRemoteBackfillReserved--*/ | |
5156 | ||
5157 | PeeringState::WaitRemoteBackfillReserved::WaitRemoteBackfillReserved(my_context ctx) | |
5158 | : my_base(ctx), | |
5159 | NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/WaitRemoteBackfillReserved"), | |
5160 | backfill_osd_it(context< Active >().remote_shards_to_reserve_backfill.begin()) | |
5161 | { | |
5162 | context< PeeringMachine >().log_enter(state_name); | |
5163 | DECLARE_LOCALS; | |
5164 | ||
5165 | ps->state_set(PG_STATE_BACKFILL_WAIT); | |
5166 | pl->publish_stats_to_osd(); | |
5167 | post_event(RemoteBackfillReserved()); | |
5168 | } | |
5169 | ||
5170 | boost::statechart::result | |
5171 | PeeringState::WaitRemoteBackfillReserved::react(const RemoteBackfillReserved &evt) | |
5172 | { | |
5173 | DECLARE_LOCALS; | |
5174 | ||
5175 | int64_t num_bytes = ps->info.stats.stats.sum.num_bytes; | |
5176 | psdout(10) << __func__ << " num_bytes " << num_bytes << dendl; | |
5177 | if (backfill_osd_it != | |
5178 | context< Active >().remote_shards_to_reserve_backfill.end()) { | |
5179 | // The primary never backfills itself | |
5180 | ceph_assert(*backfill_osd_it != ps->pg_whoami); | |
5181 | pl->send_cluster_message( | |
5182 | backfill_osd_it->osd, | |
f67539c2 | 5183 | make_message<MBackfillReserve>( |
9f95a23c TL |
5184 | MBackfillReserve::REQUEST, |
5185 | spg_t(context< PeeringMachine >().spgid.pgid, backfill_osd_it->shard), | |
5186 | ps->get_osdmap_epoch(), | |
5187 | ps->get_backfill_priority(), | |
5188 | num_bytes, | |
5189 | ps->peer_bytes[*backfill_osd_it]), | |
5190 | ps->get_osdmap_epoch()); | |
5191 | ++backfill_osd_it; | |
5192 | } else { | |
5193 | ps->peer_bytes.clear(); | |
5194 | post_event(AllBackfillsReserved()); | |
5195 | } | |
5196 | return discard_event(); | |
5197 | } | |
5198 | ||
5199 | void PeeringState::WaitRemoteBackfillReserved::exit() | |
5200 | { | |
5201 | context< PeeringMachine >().log_exit(state_name, enter_time); | |
5202 | DECLARE_LOCALS; | |
5203 | ||
5204 | utime_t dur = ceph_clock_now() - enter_time; | |
5205 | pl->get_peering_perf().tinc(rs_waitremotebackfillreserved_latency, dur); | |
5206 | } | |
5207 | ||
5208 | void PeeringState::WaitRemoteBackfillReserved::retry() | |
5209 | { | |
5210 | DECLARE_LOCALS; | |
5211 | pl->cancel_local_background_io_reservation(); | |
5212 | ||
5213 | // Send CANCEL to all previously acquired reservations | |
5214 | set<pg_shard_t>::const_iterator it, begin, end; | |
5215 | begin = context< Active >().remote_shards_to_reserve_backfill.begin(); | |
5216 | end = context< Active >().remote_shards_to_reserve_backfill.end(); | |
5217 | ceph_assert(begin != end); | |
5218 | for (it = begin; it != backfill_osd_it; ++it) { | |
5219 | // The primary never backfills itself | |
5220 | ceph_assert(*it != ps->pg_whoami); | |
5221 | pl->send_cluster_message( | |
5222 | it->osd, | |
f67539c2 | 5223 | make_message<MBackfillReserve>( |
9f95a23c TL |
5224 | MBackfillReserve::RELEASE, |
5225 | spg_t(context< PeeringMachine >().spgid.pgid, it->shard), | |
5226 | ps->get_osdmap_epoch()), | |
5227 | ps->get_osdmap_epoch()); | |
5228 | } | |
5229 | ||
5230 | ps->state_clear(PG_STATE_BACKFILL_WAIT); | |
5231 | pl->publish_stats_to_osd(); | |
5232 | ||
5233 | pl->schedule_event_after( | |
5234 | std::make_shared<PGPeeringEvent>( | |
5235 | ps->get_osdmap_epoch(), | |
5236 | ps->get_osdmap_epoch(), | |
5237 | RequestBackfill()), | |
5238 | ps->cct->_conf->osd_backfill_retry_interval); | |
5239 | } | |
5240 | ||
5241 | boost::statechart::result | |
5242 | PeeringState::WaitRemoteBackfillReserved::react(const RemoteReservationRejectedTooFull &evt) | |
5243 | { | |
5244 | DECLARE_LOCALS; | |
5245 | ps->state_set(PG_STATE_BACKFILL_TOOFULL); | |
5246 | retry(); | |
5247 | return transit<NotBackfilling>(); | |
5248 | } | |
5249 | ||
5250 | boost::statechart::result | |
5251 | PeeringState::WaitRemoteBackfillReserved::react(const RemoteReservationRevoked &evt) | |
5252 | { | |
5253 | retry(); | |
5254 | return transit<NotBackfilling>(); | |
5255 | } | |
5256 | ||
5257 | /*--WaitLocalBackfillReserved--*/ | |
5258 | PeeringState::WaitLocalBackfillReserved::WaitLocalBackfillReserved(my_context ctx) | |
5259 | : my_base(ctx), | |
5260 | NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/WaitLocalBackfillReserved") | |
5261 | { | |
5262 | context< PeeringMachine >().log_enter(state_name); | |
5263 | DECLARE_LOCALS; | |
5264 | ||
5265 | ps->state_set(PG_STATE_BACKFILL_WAIT); | |
5266 | pl->request_local_background_io_reservation( | |
5267 | ps->get_backfill_priority(), | |
f67539c2 | 5268 | std::make_unique<PGPeeringEvent>( |
9f95a23c TL |
5269 | ps->get_osdmap_epoch(), |
5270 | ps->get_osdmap_epoch(), | |
5271 | LocalBackfillReserved()), | |
f67539c2 | 5272 | std::make_unique<PGPeeringEvent>( |
9f95a23c TL |
5273 | ps->get_osdmap_epoch(), |
5274 | ps->get_osdmap_epoch(), | |
5275 | DeferBackfill(0.0))); | |
5276 | pl->publish_stats_to_osd(); | |
5277 | } | |
5278 | ||
5279 | void PeeringState::WaitLocalBackfillReserved::exit() | |
5280 | { | |
5281 | context< PeeringMachine >().log_exit(state_name, enter_time); | |
5282 | DECLARE_LOCALS; | |
5283 | utime_t dur = ceph_clock_now() - enter_time; | |
5284 | pl->get_peering_perf().tinc(rs_waitlocalbackfillreserved_latency, dur); | |
5285 | } | |
5286 | ||
5287 | /*----NotBackfilling------*/ | |
5288 | PeeringState::NotBackfilling::NotBackfilling(my_context ctx) | |
5289 | : my_base(ctx), | |
5290 | NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/NotBackfilling") | |
5291 | { | |
5292 | context< PeeringMachine >().log_enter(state_name); | |
5293 | DECLARE_LOCALS; | |
5294 | ps->state_clear(PG_STATE_REPAIR); | |
5295 | pl->publish_stats_to_osd(); | |
5296 | } | |
5297 | ||
f67539c2 TL |
5298 | boost::statechart::result PeeringState::NotBackfilling::react(const QueryUnfound& q) |
5299 | { | |
5300 | DECLARE_LOCALS; | |
5301 | ||
5302 | ps->query_unfound(q.f, "NotBackfilling"); | |
5303 | return discard_event(); | |
5304 | } | |
5305 | ||
9f95a23c TL |
5306 | boost::statechart::result |
5307 | PeeringState::NotBackfilling::react(const RemoteBackfillReserved &evt) | |
5308 | { | |
5309 | return discard_event(); | |
5310 | } | |
5311 | ||
5312 | boost::statechart::result | |
5313 | PeeringState::NotBackfilling::react(const RemoteReservationRejectedTooFull &evt) | |
5314 | { | |
5315 | return discard_event(); | |
5316 | } | |
5317 | ||
5318 | void PeeringState::NotBackfilling::exit() | |
5319 | { | |
5320 | context< PeeringMachine >().log_exit(state_name, enter_time); | |
5321 | ||
5322 | DECLARE_LOCALS; | |
5323 | ps->state_clear(PG_STATE_BACKFILL_UNFOUND); | |
5324 | utime_t dur = ceph_clock_now() - enter_time; | |
5325 | pl->get_peering_perf().tinc(rs_notbackfilling_latency, dur); | |
5326 | } | |
5327 | ||
5328 | /*----NotRecovering------*/ | |
5329 | PeeringState::NotRecovering::NotRecovering(my_context ctx) | |
5330 | : my_base(ctx), | |
5331 | NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/NotRecovering") | |
5332 | { | |
5333 | context< PeeringMachine >().log_enter(state_name); | |
5334 | DECLARE_LOCALS; | |
5335 | ps->state_clear(PG_STATE_REPAIR); | |
5336 | pl->publish_stats_to_osd(); | |
5337 | } | |
5338 | ||
f67539c2 TL |
5339 | boost::statechart::result PeeringState::NotRecovering::react(const QueryUnfound& q) |
5340 | { | |
5341 | DECLARE_LOCALS; | |
5342 | ||
5343 | ps->query_unfound(q.f, "NotRecovering"); | |
5344 | return discard_event(); | |
5345 | } | |
5346 | ||
9f95a23c TL |
5347 | void PeeringState::NotRecovering::exit() |
5348 | { | |
5349 | context< PeeringMachine >().log_exit(state_name, enter_time); | |
5350 | ||
5351 | DECLARE_LOCALS; | |
5352 | ps->state_clear(PG_STATE_RECOVERY_UNFOUND); | |
5353 | utime_t dur = ceph_clock_now() - enter_time; | |
5354 | pl->get_peering_perf().tinc(rs_notrecovering_latency, dur); | |
5355 | } | |
5356 | ||
5357 | /*---RepNotRecovering----*/ | |
5358 | PeeringState::RepNotRecovering::RepNotRecovering(my_context ctx) | |
5359 | : my_base(ctx), | |
5360 | NamedState(context< PeeringMachine >().state_history, "Started/ReplicaActive/RepNotRecovering") | |
5361 | { | |
5362 | context< PeeringMachine >().log_enter(state_name); | |
5363 | } | |
5364 | ||
5365 | boost::statechart::result | |
5366 | PeeringState::RepNotRecovering::react(const RejectTooFullRemoteReservation &evt) | |
5367 | { | |
5368 | DECLARE_LOCALS; | |
5369 | ps->reject_reservation(); | |
5370 | post_event(RemoteReservationRejectedTooFull()); | |
5371 | return discard_event(); | |
5372 | } | |
5373 | ||
5374 | void PeeringState::RepNotRecovering::exit() | |
5375 | { | |
5376 | context< PeeringMachine >().log_exit(state_name, enter_time); | |
5377 | DECLARE_LOCALS; | |
5378 | utime_t dur = ceph_clock_now() - enter_time; | |
5379 | pl->get_peering_perf().tinc(rs_repnotrecovering_latency, dur); | |
5380 | } | |
5381 | ||
5382 | /*---RepWaitRecoveryReserved--*/ | |
5383 | PeeringState::RepWaitRecoveryReserved::RepWaitRecoveryReserved(my_context ctx) | |
5384 | : my_base(ctx), | |
5385 | NamedState(context< PeeringMachine >().state_history, "Started/ReplicaActive/RepWaitRecoveryReserved") | |
5386 | { | |
5387 | context< PeeringMachine >().log_enter(state_name); | |
5388 | } | |
5389 | ||
5390 | boost::statechart::result | |
5391 | PeeringState::RepWaitRecoveryReserved::react(const RemoteRecoveryReserved &evt) | |
5392 | { | |
5393 | DECLARE_LOCALS; | |
5394 | pl->send_cluster_message( | |
5395 | ps->primary.osd, | |
f67539c2 | 5396 | make_message<MRecoveryReserve>( |
9f95a23c TL |
5397 | MRecoveryReserve::GRANT, |
5398 | spg_t(ps->info.pgid.pgid, ps->primary.shard), | |
5399 | ps->get_osdmap_epoch()), | |
5400 | ps->get_osdmap_epoch()); | |
5401 | return transit<RepRecovering>(); | |
5402 | } | |
5403 | ||
5404 | boost::statechart::result | |
5405 | PeeringState::RepWaitRecoveryReserved::react( | |
5406 | const RemoteReservationCanceled &evt) | |
5407 | { | |
5408 | DECLARE_LOCALS; | |
5409 | pl->unreserve_recovery_space(); | |
5410 | ||
5411 | pl->cancel_remote_recovery_reservation(); | |
5412 | return transit<RepNotRecovering>(); | |
5413 | } | |
5414 | ||
5415 | void PeeringState::RepWaitRecoveryReserved::exit() | |
5416 | { | |
5417 | context< PeeringMachine >().log_exit(state_name, enter_time); | |
5418 | DECLARE_LOCALS; | |
5419 | utime_t dur = ceph_clock_now() - enter_time; | |
5420 | pl->get_peering_perf().tinc(rs_repwaitrecoveryreserved_latency, dur); | |
5421 | } | |
5422 | ||
5423 | /*-RepWaitBackfillReserved*/ | |
5424 | PeeringState::RepWaitBackfillReserved::RepWaitBackfillReserved(my_context ctx) | |
5425 | : my_base(ctx), | |
5426 | NamedState(context< PeeringMachine >().state_history, "Started/ReplicaActive/RepWaitBackfillReserved") | |
5427 | { | |
5428 | context< PeeringMachine >().log_enter(state_name); | |
5429 | } | |
5430 | ||
5431 | boost::statechart::result | |
5432 | PeeringState::RepNotRecovering::react(const RequestBackfillPrio &evt) | |
5433 | { | |
5434 | ||
5435 | DECLARE_LOCALS; | |
5436 | ||
5437 | if (!pl->try_reserve_recovery_space( | |
5438 | evt.primary_num_bytes, evt.local_num_bytes)) { | |
5439 | post_event(RejectTooFullRemoteReservation()); | |
5440 | } else { | |
f67539c2 | 5441 | PGPeeringEventURef preempt; |
9f95a23c TL |
5442 | if (HAVE_FEATURE(ps->upacting_features, RECOVERY_RESERVATION_2)) { |
5443 | // older peers will interpret preemption as TOOFULL | |
f67539c2 | 5444 | preempt = std::make_unique<PGPeeringEvent>( |
9f95a23c TL |
5445 | pl->get_osdmap_epoch(), |
5446 | pl->get_osdmap_epoch(), | |
5447 | RemoteBackfillPreempted()); | |
5448 | } | |
5449 | pl->request_remote_recovery_reservation( | |
5450 | evt.priority, | |
f67539c2 | 5451 | std::make_unique<PGPeeringEvent>( |
9f95a23c TL |
5452 | pl->get_osdmap_epoch(), |
5453 | pl->get_osdmap_epoch(), | |
5454 | RemoteBackfillReserved()), | |
f67539c2 | 5455 | std::move(preempt)); |
9f95a23c TL |
5456 | } |
5457 | return transit<RepWaitBackfillReserved>(); | |
5458 | } | |
5459 | ||
5460 | boost::statechart::result | |
5461 | PeeringState::RepNotRecovering::react(const RequestRecoveryPrio &evt) | |
5462 | { | |
5463 | DECLARE_LOCALS; | |
5464 | ||
5465 | // fall back to a local reckoning of priority of primary doesn't pass one | |
5466 | // (pre-mimic compat) | |
5467 | int prio = evt.priority ? evt.priority : ps->get_recovery_priority(); | |
5468 | ||
f67539c2 | 5469 | PGPeeringEventURef preempt; |
9f95a23c TL |
5470 | if (HAVE_FEATURE(ps->upacting_features, RECOVERY_RESERVATION_2)) { |
5471 | // older peers can't handle this | |
f67539c2 | 5472 | preempt = std::make_unique<PGPeeringEvent>( |
9f95a23c TL |
5473 | ps->get_osdmap_epoch(), |
5474 | ps->get_osdmap_epoch(), | |
5475 | RemoteRecoveryPreempted()); | |
5476 | } | |
5477 | ||
5478 | pl->request_remote_recovery_reservation( | |
5479 | prio, | |
f67539c2 | 5480 | std::make_unique<PGPeeringEvent>( |
9f95a23c TL |
5481 | ps->get_osdmap_epoch(), |
5482 | ps->get_osdmap_epoch(), | |
5483 | RemoteRecoveryReserved()), | |
f67539c2 | 5484 | std::move(preempt)); |
9f95a23c TL |
5485 | return transit<RepWaitRecoveryReserved>(); |
5486 | } | |
5487 | ||
5488 | void PeeringState::RepWaitBackfillReserved::exit() | |
5489 | { | |
5490 | context< PeeringMachine >().log_exit(state_name, enter_time); | |
5491 | DECLARE_LOCALS; | |
5492 | utime_t dur = ceph_clock_now() - enter_time; | |
5493 | pl->get_peering_perf().tinc(rs_repwaitbackfillreserved_latency, dur); | |
5494 | } | |
5495 | ||
5496 | boost::statechart::result | |
5497 | PeeringState::RepWaitBackfillReserved::react(const RemoteBackfillReserved &evt) | |
5498 | { | |
5499 | DECLARE_LOCALS; | |
5500 | ||
5501 | ||
5502 | pl->send_cluster_message( | |
5503 | ps->primary.osd, | |
f67539c2 | 5504 | make_message<MBackfillReserve>( |
9f95a23c TL |
5505 | MBackfillReserve::GRANT, |
5506 | spg_t(ps->info.pgid.pgid, ps->primary.shard), | |
5507 | ps->get_osdmap_epoch()), | |
5508 | ps->get_osdmap_epoch()); | |
5509 | return transit<RepRecovering>(); | |
5510 | } | |
5511 | ||
5512 | boost::statechart::result | |
5513 | PeeringState::RepWaitBackfillReserved::react( | |
5514 | const RejectTooFullRemoteReservation &evt) | |
5515 | { | |
5516 | DECLARE_LOCALS; | |
5517 | ps->reject_reservation(); | |
5518 | post_event(RemoteReservationRejectedTooFull()); | |
5519 | return discard_event(); | |
5520 | } | |
5521 | ||
5522 | boost::statechart::result | |
5523 | PeeringState::RepWaitBackfillReserved::react( | |
5524 | const RemoteReservationRejectedTooFull &evt) | |
5525 | { | |
5526 | DECLARE_LOCALS; | |
5527 | pl->unreserve_recovery_space(); | |
5528 | ||
5529 | pl->cancel_remote_recovery_reservation(); | |
5530 | return transit<RepNotRecovering>(); | |
5531 | } | |
5532 | ||
5533 | boost::statechart::result | |
5534 | PeeringState::RepWaitBackfillReserved::react( | |
5535 | const RemoteReservationCanceled &evt) | |
5536 | { | |
5537 | DECLARE_LOCALS; | |
5538 | pl->unreserve_recovery_space(); | |
5539 | ||
5540 | pl->cancel_remote_recovery_reservation(); | |
5541 | return transit<RepNotRecovering>(); | |
5542 | } | |
5543 | ||
5544 | /*---RepRecovering-------*/ | |
5545 | PeeringState::RepRecovering::RepRecovering(my_context ctx) | |
5546 | : my_base(ctx), | |
5547 | NamedState(context< PeeringMachine >().state_history, "Started/ReplicaActive/RepRecovering") | |
5548 | { | |
5549 | context< PeeringMachine >().log_enter(state_name); | |
5550 | } | |
5551 | ||
5552 | boost::statechart::result | |
5553 | PeeringState::RepRecovering::react(const RemoteRecoveryPreempted &) | |
5554 | { | |
5555 | DECLARE_LOCALS; | |
5556 | ||
5557 | ||
5558 | pl->unreserve_recovery_space(); | |
5559 | pl->send_cluster_message( | |
5560 | ps->primary.osd, | |
f67539c2 | 5561 | make_message<MRecoveryReserve>( |
9f95a23c TL |
5562 | MRecoveryReserve::REVOKE, |
5563 | spg_t(ps->info.pgid.pgid, ps->primary.shard), | |
5564 | ps->get_osdmap_epoch()), | |
5565 | ps->get_osdmap_epoch()); | |
5566 | return discard_event(); | |
5567 | } | |
5568 | ||
5569 | boost::statechart::result | |
5570 | PeeringState::RepRecovering::react(const BackfillTooFull &) | |
5571 | { | |
5572 | DECLARE_LOCALS; | |
5573 | ||
5574 | ||
5575 | pl->unreserve_recovery_space(); | |
5576 | pl->send_cluster_message( | |
5577 | ps->primary.osd, | |
f67539c2 | 5578 | make_message<MBackfillReserve>( |
9f95a23c TL |
5579 | MBackfillReserve::REVOKE_TOOFULL, |
5580 | spg_t(ps->info.pgid.pgid, ps->primary.shard), | |
5581 | ps->get_osdmap_epoch()), | |
5582 | ps->get_osdmap_epoch()); | |
5583 | return discard_event(); | |
5584 | } | |
5585 | ||
5586 | boost::statechart::result | |
5587 | PeeringState::RepRecovering::react(const RemoteBackfillPreempted &) | |
5588 | { | |
5589 | DECLARE_LOCALS; | |
5590 | ||
5591 | ||
5592 | pl->unreserve_recovery_space(); | |
5593 | pl->send_cluster_message( | |
5594 | ps->primary.osd, | |
f67539c2 | 5595 | make_message<MBackfillReserve>( |
9f95a23c TL |
5596 | MBackfillReserve::REVOKE, |
5597 | spg_t(ps->info.pgid.pgid, ps->primary.shard), | |
5598 | ps->get_osdmap_epoch()), | |
5599 | ps->get_osdmap_epoch()); | |
5600 | return discard_event(); | |
5601 | } | |
5602 | ||
5603 | void PeeringState::RepRecovering::exit() | |
5604 | { | |
5605 | context< PeeringMachine >().log_exit(state_name, enter_time); | |
5606 | DECLARE_LOCALS; | |
5607 | pl->unreserve_recovery_space(); | |
5608 | ||
5609 | pl->cancel_remote_recovery_reservation(); | |
5610 | utime_t dur = ceph_clock_now() - enter_time; | |
5611 | pl->get_peering_perf().tinc(rs_reprecovering_latency, dur); | |
5612 | } | |
5613 | ||
5614 | /*------Activating--------*/ | |
5615 | PeeringState::Activating::Activating(my_context ctx) | |
5616 | : my_base(ctx), | |
5617 | NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/Activating") | |
5618 | { | |
5619 | context< PeeringMachine >().log_enter(state_name); | |
5620 | } | |
5621 | ||
5622 | void PeeringState::Activating::exit() | |
5623 | { | |
5624 | context< PeeringMachine >().log_exit(state_name, enter_time); | |
5625 | DECLARE_LOCALS; | |
5626 | utime_t dur = ceph_clock_now() - enter_time; | |
5627 | pl->get_peering_perf().tinc(rs_activating_latency, dur); | |
5628 | } | |
5629 | ||
5630 | PeeringState::WaitLocalRecoveryReserved::WaitLocalRecoveryReserved(my_context ctx) | |
5631 | : my_base(ctx), | |
5632 | NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/WaitLocalRecoveryReserved") | |
5633 | { | |
5634 | context< PeeringMachine >().log_enter(state_name); | |
5635 | DECLARE_LOCALS; | |
5636 | ||
5637 | // Make sure all nodes that part of the recovery aren't full | |
5638 | if (!ps->cct->_conf->osd_debug_skip_full_check_in_recovery && | |
5639 | ps->get_osdmap()->check_full(ps->acting_recovery_backfill)) { | |
5640 | post_event(RecoveryTooFull()); | |
5641 | return; | |
5642 | } | |
5643 | ||
5644 | ps->state_clear(PG_STATE_RECOVERY_TOOFULL); | |
5645 | ps->state_set(PG_STATE_RECOVERY_WAIT); | |
5646 | pl->request_local_background_io_reservation( | |
5647 | ps->get_recovery_priority(), | |
f67539c2 | 5648 | std::make_unique<PGPeeringEvent>( |
9f95a23c TL |
5649 | ps->get_osdmap_epoch(), |
5650 | ps->get_osdmap_epoch(), | |
5651 | LocalRecoveryReserved()), | |
f67539c2 | 5652 | std::make_unique<PGPeeringEvent>( |
9f95a23c TL |
5653 | ps->get_osdmap_epoch(), |
5654 | ps->get_osdmap_epoch(), | |
5655 | DeferRecovery(0.0))); | |
5656 | pl->publish_stats_to_osd(); | |
5657 | } | |
5658 | ||
5659 | boost::statechart::result | |
5660 | PeeringState::WaitLocalRecoveryReserved::react(const RecoveryTooFull &evt) | |
5661 | { | |
5662 | DECLARE_LOCALS; | |
5663 | ps->state_set(PG_STATE_RECOVERY_TOOFULL); | |
5664 | pl->schedule_event_after( | |
5665 | std::make_shared<PGPeeringEvent>( | |
5666 | ps->get_osdmap_epoch(), | |
5667 | ps->get_osdmap_epoch(), | |
5668 | DoRecovery()), | |
5669 | ps->cct->_conf->osd_recovery_retry_interval); | |
5670 | return transit<NotRecovering>(); | |
5671 | } | |
5672 | ||
5673 | void PeeringState::WaitLocalRecoveryReserved::exit() | |
5674 | { | |
5675 | context< PeeringMachine >().log_exit(state_name, enter_time); | |
5676 | DECLARE_LOCALS; | |
5677 | utime_t dur = ceph_clock_now() - enter_time; | |
5678 | pl->get_peering_perf().tinc(rs_waitlocalrecoveryreserved_latency, dur); | |
5679 | } | |
5680 | ||
5681 | PeeringState::WaitRemoteRecoveryReserved::WaitRemoteRecoveryReserved(my_context ctx) | |
5682 | : my_base(ctx), | |
5683 | NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/WaitRemoteRecoveryReserved"), | |
5684 | remote_recovery_reservation_it(context< Active >().remote_shards_to_reserve_recovery.begin()) | |
5685 | { | |
5686 | context< PeeringMachine >().log_enter(state_name); | |
5687 | post_event(RemoteRecoveryReserved()); | |
5688 | } | |
5689 | ||
5690 | boost::statechart::result | |
5691 | PeeringState::WaitRemoteRecoveryReserved::react(const RemoteRecoveryReserved &evt) { | |
5692 | DECLARE_LOCALS; | |
5693 | ||
5694 | if (remote_recovery_reservation_it != | |
5695 | context< Active >().remote_shards_to_reserve_recovery.end()) { | |
5696 | ceph_assert(*remote_recovery_reservation_it != ps->pg_whoami); | |
5697 | pl->send_cluster_message( | |
5698 | remote_recovery_reservation_it->osd, | |
f67539c2 | 5699 | make_message<MRecoveryReserve>( |
9f95a23c TL |
5700 | MRecoveryReserve::REQUEST, |
5701 | spg_t(context< PeeringMachine >().spgid.pgid, | |
5702 | remote_recovery_reservation_it->shard), | |
5703 | ps->get_osdmap_epoch(), | |
5704 | ps->get_recovery_priority()), | |
5705 | ps->get_osdmap_epoch()); | |
5706 | ++remote_recovery_reservation_it; | |
5707 | } else { | |
5708 | post_event(AllRemotesReserved()); | |
5709 | } | |
5710 | return discard_event(); | |
5711 | } | |
5712 | ||
5713 | void PeeringState::WaitRemoteRecoveryReserved::exit() | |
5714 | { | |
5715 | context< PeeringMachine >().log_exit(state_name, enter_time); | |
5716 | DECLARE_LOCALS; | |
5717 | utime_t dur = ceph_clock_now() - enter_time; | |
5718 | pl->get_peering_perf().tinc(rs_waitremoterecoveryreserved_latency, dur); | |
5719 | } | |
5720 | ||
5721 | PeeringState::Recovering::Recovering(my_context ctx) | |
5722 | : my_base(ctx), | |
5723 | NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/Recovering") | |
5724 | { | |
5725 | context< PeeringMachine >().log_enter(state_name); | |
5726 | ||
5727 | DECLARE_LOCALS; | |
5728 | ps->state_clear(PG_STATE_RECOVERY_WAIT); | |
5729 | ps->state_clear(PG_STATE_RECOVERY_TOOFULL); | |
5730 | ps->state_set(PG_STATE_RECOVERING); | |
5731 | pl->on_recovery_reserved(); | |
5732 | ceph_assert(!ps->state_test(PG_STATE_ACTIVATING)); | |
5733 | pl->publish_stats_to_osd(); | |
5734 | } | |
5735 | ||
5736 | void PeeringState::Recovering::release_reservations(bool cancel) | |
5737 | { | |
5738 | DECLARE_LOCALS; | |
5739 | ceph_assert(cancel || !ps->pg_log.get_missing().have_missing()); | |
5740 | ||
5741 | // release remote reservations | |
f67539c2 TL |
5742 | for (auto i = context< Active >().remote_shards_to_reserve_recovery.begin(); |
5743 | i != context< Active >().remote_shards_to_reserve_recovery.end(); | |
5744 | ++i) { | |
9f95a23c TL |
5745 | if (*i == ps->pg_whoami) // skip myself |
5746 | continue; | |
5747 | pl->send_cluster_message( | |
5748 | i->osd, | |
f67539c2 | 5749 | make_message<MRecoveryReserve>( |
9f95a23c TL |
5750 | MRecoveryReserve::RELEASE, |
5751 | spg_t(ps->info.pgid.pgid, i->shard), | |
5752 | ps->get_osdmap_epoch()), | |
5753 | ps->get_osdmap_epoch()); | |
5754 | } | |
5755 | } | |
5756 | ||
5757 | boost::statechart::result | |
5758 | PeeringState::Recovering::react(const AllReplicasRecovered &evt) | |
5759 | { | |
5760 | DECLARE_LOCALS; | |
5761 | ps->state_clear(PG_STATE_FORCED_RECOVERY); | |
5762 | release_reservations(); | |
5763 | pl->cancel_local_background_io_reservation(); | |
5764 | return transit<Recovered>(); | |
5765 | } | |
5766 | ||
5767 | boost::statechart::result | |
5768 | PeeringState::Recovering::react(const RequestBackfill &evt) | |
5769 | { | |
5770 | DECLARE_LOCALS; | |
5771 | ||
5772 | release_reservations(); | |
5773 | ||
5774 | ps->state_clear(PG_STATE_FORCED_RECOVERY); | |
5775 | pl->cancel_local_background_io_reservation(); | |
5776 | pl->publish_stats_to_osd(); | |
5777 | // transit any async_recovery_targets back into acting | |
5778 | // so pg won't have to stay undersized for long | |
5779 | // as backfill might take a long time to complete.. | |
5780 | if (!ps->async_recovery_targets.empty()) { | |
5781 | pg_shard_t auth_log_shard; | |
5782 | bool history_les_bound = false; | |
f67539c2 | 5783 | // FIXME: Uh-oh we have to check this return value; choose_acting can fail! |
9f95a23c TL |
5784 | ps->choose_acting(auth_log_shard, true, &history_les_bound); |
5785 | } | |
5786 | return transit<WaitLocalBackfillReserved>(); | |
5787 | } | |
5788 | ||
5789 | boost::statechart::result | |
5790 | PeeringState::Recovering::react(const DeferRecovery &evt) | |
5791 | { | |
5792 | DECLARE_LOCALS; | |
5793 | if (!ps->state_test(PG_STATE_RECOVERING)) { | |
5794 | // we may have finished recovery and have an AllReplicasRecovered | |
5795 | // event queued to move us to the next state. | |
5796 | psdout(10) << "got defer recovery but not recovering" << dendl; | |
5797 | return discard_event(); | |
5798 | } | |
5799 | psdout(10) << "defer recovery, retry delay " << evt.delay << dendl; | |
5800 | ps->state_set(PG_STATE_RECOVERY_WAIT); | |
5801 | pl->cancel_local_background_io_reservation(); | |
5802 | release_reservations(true); | |
5803 | pl->schedule_event_after( | |
5804 | std::make_shared<PGPeeringEvent>( | |
5805 | ps->get_osdmap_epoch(), | |
5806 | ps->get_osdmap_epoch(), | |
5807 | DoRecovery()), | |
5808 | evt.delay); | |
5809 | return transit<NotRecovering>(); | |
5810 | } | |
5811 | ||
5812 | boost::statechart::result | |
5813 | PeeringState::Recovering::react(const UnfoundRecovery &evt) | |
5814 | { | |
5815 | DECLARE_LOCALS; | |
5816 | psdout(10) << "recovery has unfound, can't continue" << dendl; | |
5817 | ps->state_set(PG_STATE_RECOVERY_UNFOUND); | |
5818 | pl->cancel_local_background_io_reservation(); | |
5819 | release_reservations(true); | |
5820 | return transit<NotRecovering>(); | |
5821 | } | |
5822 | ||
5823 | void PeeringState::Recovering::exit() | |
5824 | { | |
5825 | context< PeeringMachine >().log_exit(state_name, enter_time); | |
5826 | ||
5827 | DECLARE_LOCALS; | |
5828 | utime_t dur = ceph_clock_now() - enter_time; | |
5829 | ps->state_clear(PG_STATE_RECOVERING); | |
5830 | pl->get_peering_perf().tinc(rs_recovering_latency, dur); | |
5831 | } | |
5832 | ||
5833 | PeeringState::Recovered::Recovered(my_context ctx) | |
5834 | : my_base(ctx), | |
5835 | NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/Recovered") | |
5836 | { | |
5837 | pg_shard_t auth_log_shard; | |
5838 | ||
5839 | context< PeeringMachine >().log_enter(state_name); | |
5840 | ||
5841 | DECLARE_LOCALS; | |
5842 | ||
5843 | ceph_assert(!ps->needs_recovery()); | |
5844 | ||
5845 | // if we finished backfill, all acting are active; recheck if | |
5846 | // DEGRADED | UNDERSIZED is appropriate. | |
5847 | ceph_assert(!ps->acting_recovery_backfill.empty()); | |
5848 | if (ps->get_osdmap()->get_pg_size(context< PeeringMachine >().spgid.pgid) <= | |
5849 | ps->acting_recovery_backfill.size()) { | |
5850 | ps->state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY); | |
5851 | pl->publish_stats_to_osd(); | |
5852 | } | |
5853 | ||
5854 | // adjust acting set? (e.g. because backfill completed...) | |
5855 | bool history_les_bound = false; | |
5856 | if (ps->acting != ps->up && !ps->choose_acting(auth_log_shard, | |
5857 | true, &history_les_bound)) { | |
5858 | ceph_assert(ps->want_acting.size()); | |
5859 | } else if (!ps->async_recovery_targets.empty()) { | |
f67539c2 | 5860 | // FIXME: Uh-oh we have to check this return value; choose_acting can fail! |
9f95a23c TL |
5861 | ps->choose_acting(auth_log_shard, true, &history_les_bound); |
5862 | } | |
5863 | ||
5864 | if (context< Active >().all_replicas_activated && | |
5865 | ps->async_recovery_targets.empty()) | |
5866 | post_event(GoClean()); | |
5867 | } | |
5868 | ||
5869 | void PeeringState::Recovered::exit() | |
5870 | { | |
5871 | context< PeeringMachine >().log_exit(state_name, enter_time); | |
5872 | DECLARE_LOCALS; | |
5873 | ||
5874 | utime_t dur = ceph_clock_now() - enter_time; | |
5875 | pl->get_peering_perf().tinc(rs_recovered_latency, dur); | |
5876 | } | |
5877 | ||
5878 | PeeringState::Clean::Clean(my_context ctx) | |
5879 | : my_base(ctx), | |
5880 | NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/Clean") | |
5881 | { | |
5882 | context< PeeringMachine >().log_enter(state_name); | |
5883 | ||
5884 | DECLARE_LOCALS; | |
5885 | ||
5886 | if (ps->info.last_complete != ps->info.last_update) { | |
5887 | ceph_abort(); | |
5888 | } | |
5889 | ||
5890 | ||
5891 | ps->try_mark_clean(); | |
5892 | ||
5893 | context< PeeringMachine >().get_cur_transaction().register_on_commit( | |
5894 | pl->on_clean()); | |
5895 | } | |
5896 | ||
5897 | void PeeringState::Clean::exit() | |
5898 | { | |
5899 | context< PeeringMachine >().log_exit(state_name, enter_time); | |
5900 | ||
5901 | DECLARE_LOCALS; | |
5902 | ps->state_clear(PG_STATE_CLEAN); | |
5903 | utime_t dur = ceph_clock_now() - enter_time; | |
5904 | pl->get_peering_perf().tinc(rs_clean_latency, dur); | |
5905 | } | |
5906 | ||
5907 | template <typename T> | |
5908 | set<pg_shard_t> unique_osd_shard_set(const pg_shard_t & skip, const T &in) | |
5909 | { | |
5910 | set<int> osds_found; | |
5911 | set<pg_shard_t> out; | |
f67539c2 | 5912 | for (auto i = in.begin(); i != in.end(); ++i) { |
9f95a23c TL |
5913 | if (*i != skip && !osds_found.count(i->osd)) { |
5914 | osds_found.insert(i->osd); | |
5915 | out.insert(*i); | |
5916 | } | |
5917 | } | |
5918 | return out; | |
5919 | } | |
5920 | ||
5921 | /*---------Active---------*/ | |
5922 | PeeringState::Active::Active(my_context ctx) | |
5923 | : my_base(ctx), | |
5924 | NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active"), | |
5925 | remote_shards_to_reserve_recovery( | |
5926 | unique_osd_shard_set( | |
5927 | context< PeeringMachine >().state->pg_whoami, | |
5928 | context< PeeringMachine >().state->acting_recovery_backfill)), | |
5929 | remote_shards_to_reserve_backfill( | |
5930 | unique_osd_shard_set( | |
5931 | context< PeeringMachine >().state->pg_whoami, | |
5932 | context< PeeringMachine >().state->backfill_targets)), | |
5933 | all_replicas_activated(false) | |
5934 | { | |
5935 | context< PeeringMachine >().log_enter(state_name); | |
5936 | ||
5937 | ||
5938 | DECLARE_LOCALS; | |
5939 | ||
5940 | ceph_assert(!ps->backfill_reserved); | |
5941 | ceph_assert(ps->is_primary()); | |
5942 | psdout(10) << "In Active, about to call activate" << dendl; | |
5943 | ps->start_flush(context< PeeringMachine >().get_cur_transaction()); | |
5944 | ps->activate(context< PeeringMachine >().get_cur_transaction(), | |
5945 | ps->get_osdmap_epoch(), | |
5946 | context< PeeringMachine >().get_recovery_ctx()); | |
5947 | ||
5948 | // everyone has to commit/ack before we are truly active | |
5949 | ps->blocked_by.clear(); | |
f67539c2 | 5950 | for (auto p = ps->acting_recovery_backfill.begin(); |
9f95a23c TL |
5951 | p != ps->acting_recovery_backfill.end(); |
5952 | ++p) { | |
5953 | if (p->shard != ps->pg_whoami.shard) { | |
5954 | ps->blocked_by.insert(p->shard); | |
5955 | } | |
5956 | } | |
5957 | pl->publish_stats_to_osd(); | |
5958 | psdout(10) << "Activate Finished" << dendl; | |
5959 | } | |
5960 | ||
5961 | boost::statechart::result PeeringState::Active::react(const AdvMap& advmap) | |
5962 | { | |
5963 | DECLARE_LOCALS; | |
5964 | ||
5965 | if (ps->should_restart_peering( | |
5966 | advmap.up_primary, | |
5967 | advmap.acting_primary, | |
5968 | advmap.newup, | |
5969 | advmap.newacting, | |
5970 | advmap.lastmap, | |
5971 | advmap.osdmap)) { | |
5972 | psdout(10) << "Active advmap interval change, fast return" << dendl; | |
5973 | return forward_event(); | |
5974 | } | |
5975 | psdout(10) << "Active advmap" << dendl; | |
5976 | bool need_publish = false; | |
5977 | ||
5978 | pl->on_active_advmap(advmap.osdmap); | |
5979 | if (ps->dirty_big_info) { | |
5980 | // share updated purged_snaps to mgr/mon so that we (a) stop reporting | |
5981 | // purged snaps and (b) perhaps share more snaps that we have purged | |
5982 | // but didn't fit in pg_stat_t. | |
5983 | need_publish = true; | |
5984 | ps->share_pg_info(); | |
5985 | } | |
5986 | ||
801d1391 | 5987 | bool need_acting_change = false; |
9f95a23c TL |
5988 | for (size_t i = 0; i < ps->want_acting.size(); i++) { |
5989 | int osd = ps->want_acting[i]; | |
5990 | if (!advmap.osdmap->is_up(osd)) { | |
5991 | pg_shard_t osd_with_shard(osd, shard_id_t(i)); | |
801d1391 TL |
5992 | if (!ps->is_acting(osd_with_shard) && !ps->is_up(osd_with_shard)) { |
5993 | psdout(10) << "Active stray osd." << osd << " in want_acting is down" | |
5994 | << dendl; | |
5995 | need_acting_change = true; | |
5996 | } | |
9f95a23c TL |
5997 | } |
5998 | } | |
801d1391 TL |
5999 | if (need_acting_change) { |
6000 | psdout(10) << "Active need acting change, call choose_acting again" | |
6001 | << dendl; | |
6002 | // possibly because we re-add some strays into the acting set and | |
6003 | // some of them then go down in a subsequent map before we could see | |
6004 | // the map changing the pg temp. | |
6005 | // call choose_acting again to clear them out. | |
6006 | // note that we leave restrict_to_up_acting to false in order to | |
6007 | // not overkill any chosen stray that is still alive. | |
6008 | pg_shard_t auth_log_shard; | |
6009 | bool history_les_bound = false; | |
6010 | ps->remove_down_peer_info(advmap.osdmap); | |
6011 | ps->choose_acting(auth_log_shard, false, &history_les_bound, true); | |
6012 | } | |
9f95a23c TL |
6013 | |
6014 | /* Check for changes in pool size (if the acting set changed as a result, | |
6015 | * this does not matter) */ | |
6016 | if (advmap.lastmap->get_pg_size(ps->info.pgid.pgid) != | |
6017 | ps->get_osdmap()->get_pg_size(ps->info.pgid.pgid)) { | |
6018 | if (ps->get_osdmap()->get_pg_size(ps->info.pgid.pgid) <= | |
6019 | ps->actingset.size()) { | |
6020 | ps->state_clear(PG_STATE_UNDERSIZED); | |
6021 | } else { | |
6022 | ps->state_set(PG_STATE_UNDERSIZED); | |
6023 | } | |
6024 | // degraded changes will be detected by call from publish_stats_to_osd() | |
6025 | need_publish = true; | |
6026 | } | |
6027 | ||
6028 | // if we haven't reported our PG stats in a long time, do so now. | |
6029 | if (ps->info.stats.reported_epoch + ps->cct->_conf->osd_pg_stat_report_interval_max < advmap.osdmap->get_epoch()) { | |
6030 | psdout(20) << "reporting stats to osd after " << (advmap.osdmap->get_epoch() - ps->info.stats.reported_epoch) | |
6031 | << " epochs" << dendl; | |
6032 | need_publish = true; | |
6033 | } | |
6034 | ||
6035 | if (need_publish) | |
6036 | pl->publish_stats_to_osd(); | |
6037 | ||
6038 | if (ps->check_prior_readable_down_osds(advmap.osdmap)) { | |
6039 | pl->recheck_readable(); | |
6040 | } | |
6041 | ||
6042 | return forward_event(); | |
6043 | } | |
6044 | ||
6045 | boost::statechart::result PeeringState::Active::react(const ActMap&) | |
6046 | { | |
6047 | DECLARE_LOCALS; | |
6048 | psdout(10) << "Active: handling ActMap" << dendl; | |
6049 | ceph_assert(ps->is_primary()); | |
6050 | ||
6051 | pl->on_active_actmap(); | |
6052 | ||
6053 | if (ps->have_unfound()) { | |
6054 | // object may have become unfound | |
6055 | ps->discover_all_missing(context<PeeringMachine>().get_recovery_ctx().msgs); | |
6056 | } | |
6057 | ||
6058 | uint64_t unfound = ps->missing_loc.num_unfound(); | |
6059 | if (unfound > 0 && | |
6060 | ps->all_unfound_are_queried_or_lost(ps->get_osdmap())) { | |
6061 | if (ps->cct->_conf->osd_auto_mark_unfound_lost) { | |
6062 | pl->get_clog_error() << context< PeeringMachine >().spgid.pgid << " has " << unfound | |
6063 | << " objects unfound and apparently lost, would automatically " | |
6064 | << "mark these objects lost but this feature is not yet implemented " | |
6065 | << "(osd_auto_mark_unfound_lost)"; | |
6066 | } else | |
6067 | pl->get_clog_error() << context< PeeringMachine >().spgid.pgid << " has " | |
6068 | << unfound << " objects unfound and apparently lost"; | |
6069 | } | |
6070 | ||
6071 | return forward_event(); | |
6072 | } | |
6073 | ||
6074 | boost::statechart::result PeeringState::Active::react(const MNotifyRec& notevt) | |
6075 | { | |
6076 | ||
6077 | DECLARE_LOCALS; | |
6078 | ceph_assert(ps->is_primary()); | |
6079 | if (ps->peer_info.count(notevt.from)) { | |
6080 | psdout(10) << "Active: got notify from " << notevt.from | |
6081 | << ", already have info from that osd, ignoring" | |
6082 | << dendl; | |
6083 | } else if (ps->peer_purged.count(notevt.from)) { | |
6084 | psdout(10) << "Active: got notify from " << notevt.from | |
6085 | << ", already purged that peer, ignoring" | |
6086 | << dendl; | |
6087 | } else { | |
6088 | psdout(10) << "Active: got notify from " << notevt.from | |
6089 | << ", calling proc_replica_info and discover_all_missing" | |
6090 | << dendl; | |
6091 | ps->proc_replica_info( | |
6092 | notevt.from, notevt.notify.info, notevt.notify.epoch_sent); | |
6093 | if (ps->have_unfound() || (ps->is_degraded() && ps->might_have_unfound.count(notevt.from))) { | |
6094 | ps->discover_all_missing( | |
6095 | context<PeeringMachine>().get_recovery_ctx().msgs); | |
6096 | } | |
6097 | // check if it is a previous down acting member that's coming back. | |
6098 | // if so, request pg_temp change to trigger a new interval transition | |
6099 | pg_shard_t auth_log_shard; | |
6100 | bool history_les_bound = false; | |
f67539c2 | 6101 | // FIXME: Uh-oh we have to check this return value; choose_acting can fail! |
9f95a23c TL |
6102 | ps->choose_acting(auth_log_shard, false, &history_les_bound, true); |
6103 | if (!ps->want_acting.empty() && ps->want_acting != ps->acting) { | |
6104 | psdout(10) << "Active: got notify from previous acting member " | |
6105 | << notevt.from << ", requesting pg_temp change" | |
6106 | << dendl; | |
6107 | } | |
6108 | } | |
6109 | return discard_event(); | |
6110 | } | |
6111 | ||
6112 | boost::statechart::result PeeringState::Active::react(const MTrim& trim) | |
6113 | { | |
6114 | DECLARE_LOCALS; | |
6115 | ceph_assert(ps->is_primary()); | |
6116 | ||
6117 | // peer is informing us of their last_complete_ondisk | |
6118 | ldout(ps->cct,10) << " replica osd." << trim.from << " lcod " << trim.trim_to << dendl; | |
6119 | ps->update_peer_last_complete_ondisk(pg_shard_t{trim.from, trim.shard}, | |
6120 | trim.trim_to); | |
6121 | // trim log when the pg is recovered | |
6122 | ps->calc_min_last_complete_ondisk(); | |
6123 | return discard_event(); | |
6124 | } | |
6125 | ||
6126 | boost::statechart::result PeeringState::Active::react(const MInfoRec& infoevt) | |
6127 | { | |
6128 | DECLARE_LOCALS; | |
6129 | ceph_assert(ps->is_primary()); | |
6130 | ||
6131 | ceph_assert(!ps->acting_recovery_backfill.empty()); | |
6132 | if (infoevt.lease_ack) { | |
6133 | ps->proc_lease_ack(infoevt.from.osd, *infoevt.lease_ack); | |
6134 | } | |
6135 | // don't update history (yet) if we are active and primary; the replica | |
6136 | // may be telling us they have activated (and committed) but we can't | |
6137 | // share that until _everyone_ does the same. | |
6138 | if (ps->is_acting_recovery_backfill(infoevt.from) && | |
6139 | ps->peer_activated.count(infoevt.from) == 0) { | |
6140 | psdout(10) << " peer osd." << infoevt.from | |
6141 | << " activated and committed" << dendl; | |
6142 | ps->peer_activated.insert(infoevt.from); | |
6143 | ps->blocked_by.erase(infoevt.from.shard); | |
6144 | pl->publish_stats_to_osd(); | |
6145 | if (ps->peer_activated.size() == ps->acting_recovery_backfill.size()) { | |
6146 | all_activated_and_committed(); | |
6147 | } | |
6148 | } | |
6149 | return discard_event(); | |
6150 | } | |
6151 | ||
6152 | boost::statechart::result PeeringState::Active::react(const MLogRec& logevt) | |
6153 | { | |
6154 | DECLARE_LOCALS; | |
6155 | psdout(10) << "searching osd." << logevt.from | |
6156 | << " log for unfound items" << dendl; | |
6157 | ps->proc_replica_log( | |
f67539c2 | 6158 | logevt.msg->info, logevt.msg->log, std::move(logevt.msg->missing), logevt.from); |
9f95a23c TL |
6159 | bool got_missing = ps->search_for_missing( |
6160 | ps->peer_info[logevt.from], | |
6161 | ps->peer_missing[logevt.from], | |
6162 | logevt.from, | |
6163 | context< PeeringMachine >().get_recovery_ctx()); | |
6164 | // If there are missing AND we are "fully" active then start recovery now | |
6165 | if (got_missing && ps->state_test(PG_STATE_ACTIVE)) { | |
6166 | post_event(DoRecovery()); | |
6167 | } | |
6168 | return discard_event(); | |
6169 | } | |
6170 | ||
6171 | boost::statechart::result PeeringState::Active::react(const QueryState& q) | |
6172 | { | |
6173 | DECLARE_LOCALS; | |
6174 | ||
6175 | q.f->open_object_section("state"); | |
6176 | q.f->dump_string("name", state_name); | |
6177 | q.f->dump_stream("enter_time") << enter_time; | |
6178 | ||
6179 | { | |
6180 | q.f->open_array_section("might_have_unfound"); | |
f67539c2 | 6181 | for (auto p = ps->might_have_unfound.begin(); |
9f95a23c TL |
6182 | p != ps->might_have_unfound.end(); |
6183 | ++p) { | |
6184 | q.f->open_object_section("osd"); | |
6185 | q.f->dump_stream("osd") << *p; | |
6186 | if (ps->peer_missing.count(*p)) { | |
6187 | q.f->dump_string("status", "already probed"); | |
6188 | } else if (ps->peer_missing_requested.count(*p)) { | |
6189 | q.f->dump_string("status", "querying"); | |
6190 | } else if (!ps->get_osdmap()->is_up(p->osd)) { | |
6191 | q.f->dump_string("status", "osd is down"); | |
6192 | } else { | |
6193 | q.f->dump_string("status", "not queried"); | |
6194 | } | |
6195 | q.f->close_section(); | |
6196 | } | |
6197 | q.f->close_section(); | |
6198 | } | |
6199 | { | |
6200 | q.f->open_object_section("recovery_progress"); | |
6201 | q.f->open_array_section("backfill_targets"); | |
f67539c2 | 6202 | for (auto p = ps->backfill_targets.begin(); |
9f95a23c TL |
6203 | p != ps->backfill_targets.end(); ++p) |
6204 | q.f->dump_stream("replica") << *p; | |
6205 | q.f->close_section(); | |
6206 | pl->dump_recovery_info(q.f); | |
6207 | q.f->close_section(); | |
6208 | } | |
6209 | ||
6210 | q.f->close_section(); | |
6211 | return forward_event(); | |
6212 | } | |
6213 | ||
f67539c2 TL |
6214 | boost::statechart::result PeeringState::Active::react(const QueryUnfound& q) |
6215 | { | |
6216 | DECLARE_LOCALS; | |
6217 | ||
6218 | ps->query_unfound(q.f, "Active"); | |
6219 | return discard_event(); | |
6220 | } | |
6221 | ||
9f95a23c TL |
6222 | boost::statechart::result PeeringState::Active::react( |
6223 | const ActivateCommitted &evt) | |
6224 | { | |
6225 | DECLARE_LOCALS; | |
6226 | ceph_assert(!ps->peer_activated.count(ps->pg_whoami)); | |
6227 | ps->peer_activated.insert(ps->pg_whoami); | |
6228 | psdout(10) << "_activate_committed " << evt.epoch | |
6229 | << " peer_activated now " << ps->peer_activated | |
6230 | << " last_interval_started " | |
6231 | << ps->info.history.last_interval_started | |
6232 | << " last_epoch_started " | |
6233 | << ps->info.history.last_epoch_started | |
6234 | << " same_interval_since " | |
6235 | << ps->info.history.same_interval_since | |
6236 | << dendl; | |
6237 | ceph_assert(!ps->acting_recovery_backfill.empty()); | |
6238 | if (ps->peer_activated.size() == ps->acting_recovery_backfill.size()) | |
6239 | all_activated_and_committed(); | |
6240 | return discard_event(); | |
6241 | } | |
6242 | ||
6243 | boost::statechart::result PeeringState::Active::react(const AllReplicasActivated &evt) | |
6244 | { | |
6245 | ||
6246 | DECLARE_LOCALS; | |
6247 | pg_t pgid = context< PeeringMachine >().spgid.pgid; | |
6248 | ||
6249 | all_replicas_activated = true; | |
6250 | ||
6251 | ps->state_clear(PG_STATE_ACTIVATING); | |
6252 | ps->state_clear(PG_STATE_CREATING); | |
6253 | ps->state_clear(PG_STATE_PREMERGE); | |
6254 | ||
6255 | bool merge_target; | |
6256 | if (ps->pool.info.is_pending_merge(pgid, &merge_target)) { | |
6257 | ps->state_set(PG_STATE_PEERED); | |
6258 | ps->state_set(PG_STATE_PREMERGE); | |
6259 | ||
6260 | if (ps->actingset.size() != ps->get_osdmap()->get_pg_size(pgid)) { | |
6261 | if (merge_target) { | |
6262 | pg_t src = pgid; | |
6263 | src.set_ps(ps->pool.info.get_pg_num_pending()); | |
6264 | assert(src.get_parent() == pgid); | |
6265 | pl->set_not_ready_to_merge_target(pgid, src); | |
6266 | } else { | |
6267 | pl->set_not_ready_to_merge_source(pgid); | |
6268 | } | |
6269 | } | |
f67539c2 | 6270 | } else if (!ps->acting_set_writeable()) { |
9f95a23c TL |
6271 | ps->state_set(PG_STATE_PEERED); |
6272 | } else { | |
6273 | ps->state_set(PG_STATE_ACTIVE); | |
6274 | } | |
6275 | ||
6276 | auto mnow = pl->get_mnow(); | |
6277 | if (ps->prior_readable_until_ub > mnow) { | |
6278 | psdout(10) << " waiting for prior_readable_until_ub " | |
6279 | << ps->prior_readable_until_ub << " > mnow " << mnow << dendl; | |
6280 | ps->state_set(PG_STATE_WAIT); | |
6281 | pl->queue_check_readable( | |
6282 | ps->last_peering_reset, | |
6283 | ps->prior_readable_until_ub - mnow); | |
6284 | } else { | |
6285 | psdout(10) << " mnow " << mnow << " >= prior_readable_until_ub " | |
6286 | << ps->prior_readable_until_ub << dendl; | |
6287 | } | |
6288 | ||
6289 | if (ps->pool.info.has_flag(pg_pool_t::FLAG_CREATING)) { | |
6290 | pl->send_pg_created(pgid); | |
6291 | } | |
6292 | ||
6293 | ps->info.history.last_epoch_started = ps->info.last_epoch_started; | |
6294 | ps->info.history.last_interval_started = ps->info.last_interval_started; | |
6295 | ps->dirty_info = true; | |
6296 | ||
6297 | ps->share_pg_info(); | |
6298 | pl->publish_stats_to_osd(); | |
6299 | ||
6300 | pl->on_activate_complete(); | |
6301 | ||
6302 | return discard_event(); | |
6303 | } | |
6304 | ||
6305 | boost::statechart::result PeeringState::Active::react(const RenewLease& rl) | |
6306 | { | |
6307 | DECLARE_LOCALS; | |
6308 | ps->proc_renew_lease(); | |
6309 | return discard_event(); | |
6310 | } | |
6311 | ||
6312 | boost::statechart::result PeeringState::Active::react(const MLeaseAck& la) | |
6313 | { | |
6314 | DECLARE_LOCALS; | |
6315 | ps->proc_lease_ack(la.from, la.lease_ack); | |
6316 | return discard_event(); | |
6317 | } | |
6318 | ||
6319 | ||
6320 | boost::statechart::result PeeringState::Active::react(const CheckReadable &evt) | |
6321 | { | |
6322 | DECLARE_LOCALS; | |
6323 | pl->recheck_readable(); | |
6324 | return discard_event(); | |
6325 | } | |
6326 | ||
6327 | /* | |
6328 | * update info.history.last_epoch_started ONLY after we and all | |
6329 | * replicas have activated AND committed the activate transaction | |
6330 | * (i.e. the peering results are stable on disk). | |
6331 | */ | |
6332 | void PeeringState::Active::all_activated_and_committed() | |
6333 | { | |
6334 | DECLARE_LOCALS; | |
6335 | psdout(10) << "all_activated_and_committed" << dendl; | |
6336 | ceph_assert(ps->is_primary()); | |
6337 | ceph_assert(ps->peer_activated.size() == ps->acting_recovery_backfill.size()); | |
6338 | ceph_assert(!ps->acting_recovery_backfill.empty()); | |
6339 | ceph_assert(ps->blocked_by.empty()); | |
6340 | ||
6341 | if (HAVE_FEATURE(ps->upacting_features, SERVER_OCTOPUS)) { | |
6342 | // this is overkill when the activation is quick, but when it is slow it | |
6343 | // is important, because the lease was renewed by the activate itself but we | |
6344 | // don't know how long ago that was, and simply scheduling now may leave | |
6345 | // a gap in lease coverage. keep it simple and aggressively renew. | |
6346 | ps->renew_lease(pl->get_mnow()); | |
6347 | ps->send_lease(); | |
6348 | ps->schedule_renew_lease(); | |
6349 | } | |
6350 | ||
6351 | // Degraded? | |
6352 | ps->update_calc_stats(); | |
6353 | if (ps->info.stats.stats.sum.num_objects_degraded) { | |
6354 | ps->state_set(PG_STATE_DEGRADED); | |
6355 | } else { | |
6356 | ps->state_clear(PG_STATE_DEGRADED); | |
6357 | } | |
6358 | ||
6359 | post_event(PeeringState::AllReplicasActivated()); | |
6360 | } | |
6361 | ||
6362 | ||
6363 | void PeeringState::Active::exit() | |
6364 | { | |
6365 | context< PeeringMachine >().log_exit(state_name, enter_time); | |
6366 | ||
6367 | ||
6368 | DECLARE_LOCALS; | |
6369 | pl->cancel_local_background_io_reservation(); | |
6370 | ||
6371 | ps->blocked_by.clear(); | |
6372 | ps->backfill_reserved = false; | |
6373 | ps->state_clear(PG_STATE_ACTIVATING); | |
6374 | ps->state_clear(PG_STATE_DEGRADED); | |
6375 | ps->state_clear(PG_STATE_UNDERSIZED); | |
6376 | ps->state_clear(PG_STATE_BACKFILL_TOOFULL); | |
6377 | ps->state_clear(PG_STATE_BACKFILL_WAIT); | |
6378 | ps->state_clear(PG_STATE_RECOVERY_WAIT); | |
6379 | ps->state_clear(PG_STATE_RECOVERY_TOOFULL); | |
6380 | utime_t dur = ceph_clock_now() - enter_time; | |
6381 | pl->get_peering_perf().tinc(rs_active_latency, dur); | |
6382 | pl->on_active_exit(); | |
6383 | } | |
6384 | ||
6385 | /*------ReplicaActive-----*/ | |
6386 | PeeringState::ReplicaActive::ReplicaActive(my_context ctx) | |
6387 | : my_base(ctx), | |
6388 | NamedState(context< PeeringMachine >().state_history, "Started/ReplicaActive") | |
6389 | { | |
6390 | context< PeeringMachine >().log_enter(state_name); | |
6391 | ||
6392 | DECLARE_LOCALS; | |
6393 | ps->start_flush(context< PeeringMachine >().get_cur_transaction()); | |
6394 | } | |
6395 | ||
6396 | ||
6397 | boost::statechart::result PeeringState::ReplicaActive::react( | |
6398 | const Activate& actevt) { | |
6399 | DECLARE_LOCALS; | |
6400 | psdout(10) << "In ReplicaActive, about to call activate" << dendl; | |
6401 | ps->activate( | |
6402 | context< PeeringMachine >().get_cur_transaction(), | |
6403 | actevt.activation_epoch, | |
6404 | context< PeeringMachine >().get_recovery_ctx()); | |
6405 | psdout(10) << "Activate Finished" << dendl; | |
6406 | return discard_event(); | |
6407 | } | |
6408 | ||
6409 | boost::statechart::result PeeringState::ReplicaActive::react( | |
6410 | const ActivateCommitted &evt) | |
6411 | { | |
6412 | DECLARE_LOCALS; | |
6413 | psdout(10) << __func__ << " " << evt.epoch << " telling primary" << dendl; | |
6414 | ||
6415 | auto &rctx = context<PeeringMachine>().get_recovery_ctx(); | |
6416 | auto epoch = ps->get_osdmap_epoch(); | |
6417 | pg_info_t i = ps->info; | |
6418 | i.history.last_epoch_started = evt.activation_epoch; | |
6419 | i.history.last_interval_started = i.history.same_interval_since; | |
6420 | rctx.send_info( | |
6421 | ps->get_primary().osd, | |
6422 | spg_t(ps->info.pgid.pgid, ps->get_primary().shard), | |
6423 | epoch, | |
6424 | epoch, | |
6425 | i, | |
6426 | {}, /* lease */ | |
6427 | ps->get_lease_ack()); | |
6428 | ||
f67539c2 | 6429 | if (ps->acting_set_writeable()) { |
9f95a23c TL |
6430 | ps->state_set(PG_STATE_ACTIVE); |
6431 | } else { | |
6432 | ps->state_set(PG_STATE_PEERED); | |
6433 | } | |
6434 | pl->on_activate_committed(); | |
6435 | ||
6436 | return discard_event(); | |
6437 | } | |
6438 | ||
6439 | boost::statechart::result PeeringState::ReplicaActive::react(const MLease& l) | |
6440 | { | |
6441 | DECLARE_LOCALS; | |
6442 | spg_t spgid = context< PeeringMachine >().spgid; | |
6443 | epoch_t epoch = pl->get_osdmap_epoch(); | |
6444 | ||
6445 | ps->proc_lease(l.lease); | |
6446 | pl->send_cluster_message( | |
6447 | ps->get_primary().osd, | |
f67539c2 | 6448 | make_message<MOSDPGLeaseAck>(epoch, |
9f95a23c TL |
6449 | spg_t(spgid.pgid, ps->get_primary().shard), |
6450 | ps->get_lease_ack()), | |
6451 | epoch); | |
6452 | return discard_event(); | |
6453 | } | |
6454 | ||
6455 | boost::statechart::result PeeringState::ReplicaActive::react(const MInfoRec& infoevt) | |
6456 | { | |
6457 | DECLARE_LOCALS; | |
6458 | ps->proc_primary_info(context<PeeringMachine>().get_cur_transaction(), | |
6459 | infoevt.info); | |
6460 | return discard_event(); | |
6461 | } | |
6462 | ||
6463 | boost::statechart::result PeeringState::ReplicaActive::react(const MLogRec& logevt) | |
6464 | { | |
6465 | DECLARE_LOCALS; | |
6466 | psdout(10) << "received log from " << logevt.from << dendl; | |
6467 | ObjectStore::Transaction &t = context<PeeringMachine>().get_cur_transaction(); | |
f67539c2 | 6468 | ps->merge_log(t, logevt.msg->info, std::move(logevt.msg->log), logevt.from); |
9f95a23c TL |
6469 | ceph_assert(ps->pg_log.get_head() == ps->info.last_update); |
6470 | if (logevt.msg->lease) { | |
6471 | ps->proc_lease(*logevt.msg->lease); | |
6472 | } | |
6473 | ||
6474 | return discard_event(); | |
6475 | } | |
6476 | ||
6477 | boost::statechart::result PeeringState::ReplicaActive::react(const MTrim& trim) | |
6478 | { | |
6479 | DECLARE_LOCALS; | |
6480 | // primary is instructing us to trim | |
6481 | ps->pg_log.trim(trim.trim_to, ps->info); | |
6482 | ps->dirty_info = true; | |
6483 | return discard_event(); | |
6484 | } | |
6485 | ||
6486 | boost::statechart::result PeeringState::ReplicaActive::react(const ActMap&) | |
6487 | { | |
6488 | DECLARE_LOCALS; | |
6489 | if (ps->should_send_notify() && ps->get_primary().osd >= 0) { | |
6490 | ps->info.history.refresh_prior_readable_until_ub( | |
6491 | pl->get_mnow(), ps->prior_readable_until_ub); | |
6492 | context< PeeringMachine >().send_notify( | |
6493 | ps->get_primary().osd, | |
6494 | pg_notify_t( | |
6495 | ps->get_primary().shard, ps->pg_whoami.shard, | |
6496 | ps->get_osdmap_epoch(), | |
6497 | ps->get_osdmap_epoch(), | |
6498 | ps->info, | |
6499 | ps->past_intervals)); | |
6500 | } | |
6501 | return discard_event(); | |
6502 | } | |
6503 | ||
6504 | boost::statechart::result PeeringState::ReplicaActive::react( | |
6505 | const MQuery& query) | |
6506 | { | |
6507 | DECLARE_LOCALS; | |
6508 | ps->fulfill_query(query, context<PeeringMachine>().get_recovery_ctx()); | |
6509 | return discard_event(); | |
6510 | } | |
6511 | ||
6512 | boost::statechart::result PeeringState::ReplicaActive::react(const QueryState& q) | |
6513 | { | |
6514 | q.f->open_object_section("state"); | |
6515 | q.f->dump_string("name", state_name); | |
6516 | q.f->dump_stream("enter_time") << enter_time; | |
6517 | q.f->close_section(); | |
6518 | return forward_event(); | |
6519 | } | |
6520 | ||
f67539c2 TL |
6521 | boost::statechart::result PeeringState::ReplicaActive::react(const QueryUnfound& q) |
6522 | { | |
6523 | q.f->dump_string("state", "ReplicaActive"); | |
6524 | q.f->dump_bool("available_might_have_unfound", false); | |
6525 | return discard_event(); | |
6526 | } | |
6527 | ||
9f95a23c TL |
6528 | void PeeringState::ReplicaActive::exit() |
6529 | { | |
6530 | context< PeeringMachine >().log_exit(state_name, enter_time); | |
6531 | DECLARE_LOCALS; | |
6532 | pl->unreserve_recovery_space(); | |
6533 | ||
6534 | pl->cancel_remote_recovery_reservation(); | |
6535 | utime_t dur = ceph_clock_now() - enter_time; | |
6536 | pl->get_peering_perf().tinc(rs_replicaactive_latency, dur); | |
6537 | ||
6538 | ps->min_last_complete_ondisk = eversion_t(); | |
6539 | } | |
6540 | ||
6541 | /*-------Stray---*/ | |
6542 | PeeringState::Stray::Stray(my_context ctx) | |
6543 | : my_base(ctx), | |
6544 | NamedState(context< PeeringMachine >().state_history, "Started/Stray") | |
6545 | { | |
6546 | context< PeeringMachine >().log_enter(state_name); | |
6547 | ||
6548 | ||
6549 | DECLARE_LOCALS; | |
6550 | ceph_assert(!ps->is_peered()); | |
6551 | ceph_assert(!ps->is_peering()); | |
6552 | ceph_assert(!ps->is_primary()); | |
6553 | ||
6554 | if (!ps->get_osdmap()->have_pg_pool(ps->info.pgid.pgid.pool())) { | |
6555 | ldout(ps->cct,10) << __func__ << " pool is deleted" << dendl; | |
6556 | post_event(DeleteStart()); | |
6557 | } else { | |
6558 | ps->start_flush(context< PeeringMachine >().get_cur_transaction()); | |
6559 | } | |
6560 | } | |
6561 | ||
6562 | boost::statechart::result PeeringState::Stray::react(const MLogRec& logevt) | |
6563 | { | |
6564 | DECLARE_LOCALS; | |
6565 | MOSDPGLog *msg = logevt.msg.get(); | |
6566 | psdout(10) << "got info+log from osd." << logevt.from << " " << msg->info << " " << msg->log << dendl; | |
6567 | ||
6568 | ObjectStore::Transaction &t = context<PeeringMachine>().get_cur_transaction(); | |
6569 | if (msg->info.last_backfill == hobject_t()) { | |
6570 | // restart backfill | |
6571 | ps->info = msg->info; | |
6572 | pl->on_info_history_change(); | |
6573 | ps->dirty_info = true; | |
6574 | ps->dirty_big_info = true; // maybe. | |
6575 | ||
6576 | PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)}; | |
6577 | ps->pg_log.reset_backfill_claim_log(msg->log, rollbacker.get()); | |
6578 | ||
6579 | ps->pg_log.reset_backfill(); | |
6580 | } else { | |
f67539c2 | 6581 | ps->merge_log(t, msg->info, std::move(msg->log), logevt.from); |
9f95a23c TL |
6582 | } |
6583 | if (logevt.msg->lease) { | |
6584 | ps->proc_lease(*logevt.msg->lease); | |
6585 | } | |
6586 | ||
6587 | ceph_assert(ps->pg_log.get_head() == ps->info.last_update); | |
6588 | ||
6589 | post_event(Activate(logevt.msg->info.last_epoch_started)); | |
6590 | return transit<ReplicaActive>(); | |
6591 | } | |
6592 | ||
6593 | boost::statechart::result PeeringState::Stray::react(const MInfoRec& infoevt) | |
6594 | { | |
6595 | DECLARE_LOCALS; | |
6596 | psdout(10) << "got info from osd." << infoevt.from << " " << infoevt.info << dendl; | |
6597 | ||
6598 | if (ps->info.last_update > infoevt.info.last_update) { | |
6599 | // rewind divergent log entries | |
6600 | ObjectStore::Transaction &t = context<PeeringMachine>().get_cur_transaction(); | |
6601 | ps->rewind_divergent_log(t, infoevt.info.last_update); | |
6602 | ps->info.stats = infoevt.info.stats; | |
6603 | ps->info.hit_set = infoevt.info.hit_set; | |
6604 | } | |
6605 | ||
6606 | if (infoevt.lease) { | |
6607 | ps->proc_lease(*infoevt.lease); | |
6608 | } | |
6609 | ||
6610 | ceph_assert(infoevt.info.last_update == ps->info.last_update); | |
6611 | ceph_assert(ps->pg_log.get_head() == ps->info.last_update); | |
6612 | ||
6613 | post_event(Activate(infoevt.info.last_epoch_started)); | |
6614 | return transit<ReplicaActive>(); | |
6615 | } | |
6616 | ||
6617 | boost::statechart::result PeeringState::Stray::react(const MQuery& query) | |
6618 | { | |
6619 | DECLARE_LOCALS; | |
6620 | ps->fulfill_query(query, context<PeeringMachine>().get_recovery_ctx()); | |
6621 | return discard_event(); | |
6622 | } | |
6623 | ||
6624 | boost::statechart::result PeeringState::Stray::react(const ActMap&) | |
6625 | { | |
6626 | DECLARE_LOCALS; | |
6627 | if (ps->should_send_notify() && ps->get_primary().osd >= 0) { | |
6628 | ps->info.history.refresh_prior_readable_until_ub( | |
6629 | pl->get_mnow(), ps->prior_readable_until_ub); | |
6630 | context< PeeringMachine >().send_notify( | |
6631 | ps->get_primary().osd, | |
6632 | pg_notify_t( | |
6633 | ps->get_primary().shard, ps->pg_whoami.shard, | |
6634 | ps->get_osdmap_epoch(), | |
6635 | ps->get_osdmap_epoch(), | |
6636 | ps->info, | |
6637 | ps->past_intervals)); | |
6638 | } | |
6639 | return discard_event(); | |
6640 | } | |
6641 | ||
6642 | void PeeringState::Stray::exit() | |
6643 | { | |
6644 | context< PeeringMachine >().log_exit(state_name, enter_time); | |
6645 | DECLARE_LOCALS; | |
6646 | utime_t dur = ceph_clock_now() - enter_time; | |
6647 | pl->get_peering_perf().tinc(rs_stray_latency, dur); | |
6648 | } | |
6649 | ||
6650 | ||
6651 | /*--------ToDelete----------*/ | |
6652 | PeeringState::ToDelete::ToDelete(my_context ctx) | |
6653 | : my_base(ctx), | |
6654 | NamedState(context< PeeringMachine >().state_history, "Started/ToDelete") | |
6655 | { | |
6656 | context< PeeringMachine >().log_enter(state_name); | |
6657 | DECLARE_LOCALS; | |
6658 | pl->get_perf_logger().inc(l_osd_pg_removing); | |
6659 | } | |
6660 | ||
6661 | void PeeringState::ToDelete::exit() | |
6662 | { | |
6663 | context< PeeringMachine >().log_exit(state_name, enter_time); | |
6664 | DECLARE_LOCALS; | |
6665 | // note: on a successful removal, this path doesn't execute. see | |
6666 | // _delete_some(). | |
6667 | pl->get_perf_logger().dec(l_osd_pg_removing); | |
6668 | ||
6669 | pl->cancel_local_background_io_reservation(); | |
6670 | } | |
6671 | ||
6672 | /*----WaitDeleteReserved----*/ | |
6673 | PeeringState::WaitDeleteReserved::WaitDeleteReserved(my_context ctx) | |
6674 | : my_base(ctx), | |
6675 | NamedState(context< PeeringMachine >().state_history, | |
6676 | "Started/ToDelete/WaitDeleteReseved") | |
6677 | { | |
6678 | context< PeeringMachine >().log_enter(state_name); | |
6679 | DECLARE_LOCALS; | |
6680 | context< ToDelete >().priority = ps->get_delete_priority(); | |
6681 | ||
6682 | pl->cancel_local_background_io_reservation(); | |
6683 | pl->request_local_background_io_reservation( | |
6684 | context<ToDelete>().priority, | |
f67539c2 | 6685 | std::make_unique<PGPeeringEvent>( |
9f95a23c TL |
6686 | ps->get_osdmap_epoch(), |
6687 | ps->get_osdmap_epoch(), | |
6688 | DeleteReserved()), | |
f67539c2 | 6689 | std::make_unique<PGPeeringEvent>( |
9f95a23c TL |
6690 | ps->get_osdmap_epoch(), |
6691 | ps->get_osdmap_epoch(), | |
6692 | DeleteInterrupted())); | |
6693 | } | |
6694 | ||
6695 | boost::statechart::result PeeringState::ToDelete::react( | |
6696 | const ActMap& evt) | |
6697 | { | |
6698 | DECLARE_LOCALS; | |
6699 | if (ps->get_delete_priority() != priority) { | |
6700 | psdout(10) << __func__ << " delete priority changed, resetting" | |
6701 | << dendl; | |
6702 | return transit<ToDelete>(); | |
6703 | } | |
6704 | return discard_event(); | |
6705 | } | |
6706 | ||
6707 | void PeeringState::WaitDeleteReserved::exit() | |
6708 | { | |
6709 | context< PeeringMachine >().log_exit(state_name, enter_time); | |
6710 | } | |
6711 | ||
6712 | /*----Deleting-----*/ | |
6713 | PeeringState::Deleting::Deleting(my_context ctx) | |
6714 | : my_base(ctx), | |
6715 | NamedState(context< PeeringMachine >().state_history, "Started/ToDelete/Deleting") | |
6716 | { | |
6717 | context< PeeringMachine >().log_enter(state_name); | |
adb31ebb | 6718 | |
9f95a23c TL |
6719 | DECLARE_LOCALS; |
6720 | ps->deleting = true; | |
6721 | ObjectStore::Transaction &t = context<PeeringMachine>().get_cur_transaction(); | |
6722 | ||
6723 | // clear log | |
6724 | PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)}; | |
6725 | ps->pg_log.roll_forward(rollbacker.get()); | |
6726 | ||
6727 | // adjust info to backfill | |
6728 | ps->info.set_last_backfill(hobject_t()); | |
6729 | ps->pg_log.reset_backfill(); | |
6730 | ps->dirty_info = true; | |
6731 | ||
6732 | pl->on_removal(t); | |
6733 | } | |
6734 | ||
6735 | boost::statechart::result PeeringState::Deleting::react( | |
6736 | const DeleteSome& evt) | |
6737 | { | |
6738 | DECLARE_LOCALS; | |
f67539c2 TL |
6739 | std::pair<ghobject_t, bool> p; |
6740 | p = pl->do_delete_work(context<PeeringMachine>().get_cur_transaction(), | |
adb31ebb | 6741 | next); |
f67539c2 TL |
6742 | next = p.first; |
6743 | return p.second ? discard_event() : terminate(); | |
9f95a23c TL |
6744 | } |
6745 | ||
6746 | void PeeringState::Deleting::exit() | |
6747 | { | |
6748 | context< PeeringMachine >().log_exit(state_name, enter_time); | |
6749 | DECLARE_LOCALS; | |
6750 | ps->deleting = false; | |
6751 | pl->cancel_local_background_io_reservation(); | |
6752 | } | |
6753 | ||
6754 | /*--------GetInfo---------*/ | |
6755 | PeeringState::GetInfo::GetInfo(my_context ctx) | |
6756 | : my_base(ctx), | |
6757 | NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering/GetInfo") | |
6758 | { | |
6759 | context< PeeringMachine >().log_enter(state_name); | |
6760 | ||
6761 | ||
6762 | DECLARE_LOCALS; | |
6763 | ps->check_past_interval_bounds(); | |
6764 | ps->log_weirdness(); | |
6765 | PastIntervals::PriorSet &prior_set = context< Peering >().prior_set; | |
6766 | ||
6767 | ceph_assert(ps->blocked_by.empty()); | |
6768 | ||
6769 | prior_set = ps->build_prior(); | |
6770 | ps->prior_readable_down_osds = prior_set.down; | |
6771 | if (ps->prior_readable_down_osds.empty()) { | |
6772 | psdout(10) << " no prior_set down osds, clearing prior_readable_until_ub" | |
6773 | << dendl; | |
6774 | ps->clear_prior_readable_until_ub(); | |
6775 | } | |
6776 | ||
6777 | ps->reset_min_peer_features(); | |
6778 | get_infos(); | |
6779 | if (prior_set.pg_down) { | |
6780 | post_event(IsDown()); | |
6781 | } else if (peer_info_requested.empty()) { | |
6782 | post_event(GotInfo()); | |
6783 | } | |
6784 | } | |
6785 | ||
6786 | void PeeringState::GetInfo::get_infos() | |
6787 | { | |
6788 | DECLARE_LOCALS; | |
6789 | PastIntervals::PriorSet &prior_set = context< Peering >().prior_set; | |
6790 | ||
6791 | ps->blocked_by.clear(); | |
f67539c2 | 6792 | for (auto it = prior_set.probe.begin(); it != prior_set.probe.end(); ++it) { |
9f95a23c TL |
6793 | pg_shard_t peer = *it; |
6794 | if (peer == ps->pg_whoami) { | |
6795 | continue; | |
6796 | } | |
6797 | if (ps->peer_info.count(peer)) { | |
6798 | psdout(10) << " have osd." << peer << " info " << ps->peer_info[peer] << dendl; | |
6799 | continue; | |
6800 | } | |
6801 | if (peer_info_requested.count(peer)) { | |
6802 | psdout(10) << " already requested info from osd." << peer << dendl; | |
6803 | ps->blocked_by.insert(peer.osd); | |
6804 | } else if (!ps->get_osdmap()->is_up(peer.osd)) { | |
6805 | psdout(10) << " not querying info from down osd." << peer << dendl; | |
6806 | } else { | |
6807 | psdout(10) << " querying info from osd." << peer << dendl; | |
6808 | context< PeeringMachine >().send_query( | |
6809 | peer.osd, | |
6810 | pg_query_t(pg_query_t::INFO, | |
6811 | it->shard, ps->pg_whoami.shard, | |
6812 | ps->info.history, | |
6813 | ps->get_osdmap_epoch())); | |
6814 | peer_info_requested.insert(peer); | |
6815 | ps->blocked_by.insert(peer.osd); | |
6816 | } | |
6817 | } | |
6818 | ||
6819 | ps->check_prior_readable_down_osds(ps->get_osdmap()); | |
6820 | ||
6821 | pl->publish_stats_to_osd(); | |
6822 | } | |
6823 | ||
6824 | boost::statechart::result PeeringState::GetInfo::react(const MNotifyRec& infoevt) | |
6825 | { | |
6826 | ||
6827 | DECLARE_LOCALS; | |
6828 | ||
f67539c2 | 6829 | auto p = peer_info_requested.find(infoevt.from); |
9f95a23c TL |
6830 | if (p != peer_info_requested.end()) { |
6831 | peer_info_requested.erase(p); | |
6832 | ps->blocked_by.erase(infoevt.from.osd); | |
6833 | } | |
6834 | ||
6835 | epoch_t old_start = ps->info.history.last_epoch_started; | |
6836 | if (ps->proc_replica_info( | |
6837 | infoevt.from, infoevt.notify.info, infoevt.notify.epoch_sent)) { | |
6838 | // we got something new ... | |
6839 | PastIntervals::PriorSet &prior_set = context< Peering >().prior_set; | |
6840 | if (old_start < ps->info.history.last_epoch_started) { | |
6841 | psdout(10) << " last_epoch_started moved forward, rebuilding prior" << dendl; | |
6842 | prior_set = ps->build_prior(); | |
6843 | ps->prior_readable_down_osds = prior_set.down; | |
6844 | ||
6845 | // filter out any osds that got dropped from the probe set from | |
6846 | // peer_info_requested. this is less expensive than restarting | |
6847 | // peering (which would re-probe everyone). | |
f67539c2 | 6848 | auto p = peer_info_requested.begin(); |
9f95a23c TL |
6849 | while (p != peer_info_requested.end()) { |
6850 | if (prior_set.probe.count(*p) == 0) { | |
6851 | psdout(20) << " dropping osd." << *p << " from info_requested, no longer in probe set" << dendl; | |
6852 | peer_info_requested.erase(p++); | |
6853 | } else { | |
6854 | ++p; | |
6855 | } | |
6856 | } | |
6857 | get_infos(); | |
6858 | } | |
6859 | psdout(20) << "Adding osd: " << infoevt.from.osd << " peer features: " | |
6860 | << hex << infoevt.features << dec << dendl; | |
6861 | ps->apply_peer_features(infoevt.features); | |
6862 | ||
6863 | // are we done getting everything? | |
6864 | if (peer_info_requested.empty() && !prior_set.pg_down) { | |
6865 | psdout(20) << "Common peer features: " << hex << ps->get_min_peer_features() << dec << dendl; | |
6866 | psdout(20) << "Common acting features: " << hex << ps->get_min_acting_features() << dec << dendl; | |
6867 | psdout(20) << "Common upacting features: " << hex << ps->get_min_upacting_features() << dec << dendl; | |
6868 | post_event(GotInfo()); | |
6869 | } | |
6870 | } | |
6871 | return discard_event(); | |
6872 | } | |
6873 | ||
6874 | boost::statechart::result PeeringState::GetInfo::react(const QueryState& q) | |
6875 | { | |
6876 | DECLARE_LOCALS; | |
6877 | q.f->open_object_section("state"); | |
6878 | q.f->dump_string("name", state_name); | |
6879 | q.f->dump_stream("enter_time") << enter_time; | |
6880 | ||
6881 | q.f->open_array_section("requested_info_from"); | |
f67539c2 | 6882 | for (auto p = peer_info_requested.begin(); |
9f95a23c TL |
6883 | p != peer_info_requested.end(); |
6884 | ++p) { | |
6885 | q.f->open_object_section("osd"); | |
6886 | q.f->dump_stream("osd") << *p; | |
6887 | if (ps->peer_info.count(*p)) { | |
6888 | q.f->open_object_section("got_info"); | |
6889 | ps->peer_info[*p].dump(q.f); | |
6890 | q.f->close_section(); | |
6891 | } | |
6892 | q.f->close_section(); | |
6893 | } | |
6894 | q.f->close_section(); | |
6895 | ||
6896 | q.f->close_section(); | |
6897 | return forward_event(); | |
6898 | } | |
6899 | ||
f67539c2 TL |
6900 | boost::statechart::result PeeringState::GetInfo::react(const QueryUnfound& q) |
6901 | { | |
6902 | q.f->dump_string("state", "GetInfo"); | |
6903 | q.f->dump_bool("available_might_have_unfound", false); | |
6904 | return discard_event(); | |
6905 | } | |
6906 | ||
9f95a23c TL |
6907 | void PeeringState::GetInfo::exit() |
6908 | { | |
6909 | context< PeeringMachine >().log_exit(state_name, enter_time); | |
6910 | ||
6911 | DECLARE_LOCALS; | |
6912 | utime_t dur = ceph_clock_now() - enter_time; | |
6913 | pl->get_peering_perf().tinc(rs_getinfo_latency, dur); | |
6914 | ps->blocked_by.clear(); | |
6915 | } | |
6916 | ||
6917 | /*------GetLog------------*/ | |
6918 | PeeringState::GetLog::GetLog(my_context ctx) | |
6919 | : my_base(ctx), | |
6920 | NamedState( | |
6921 | context< PeeringMachine >().state_history, | |
6922 | "Started/Primary/Peering/GetLog"), | |
6923 | msg(0) | |
6924 | { | |
6925 | context< PeeringMachine >().log_enter(state_name); | |
6926 | ||
6927 | DECLARE_LOCALS; | |
6928 | ||
6929 | ps->log_weirdness(); | |
6930 | ||
6931 | // adjust acting? | |
6932 | if (!ps->choose_acting(auth_log_shard, false, | |
6933 | &context< Peering >().history_les_bound)) { | |
6934 | if (!ps->want_acting.empty()) { | |
6935 | post_event(NeedActingChange()); | |
6936 | } else { | |
6937 | post_event(IsIncomplete()); | |
6938 | } | |
6939 | return; | |
6940 | } | |
6941 | ||
6942 | // am i the best? | |
6943 | if (auth_log_shard == ps->pg_whoami) { | |
6944 | post_event(GotLog()); | |
6945 | return; | |
6946 | } | |
6947 | ||
6948 | const pg_info_t& best = ps->peer_info[auth_log_shard]; | |
6949 | ||
6950 | // am i broken? | |
6951 | if (ps->info.last_update < best.log_tail) { | |
6952 | psdout(10) << " not contiguous with osd." << auth_log_shard << ", down" << dendl; | |
6953 | post_event(IsIncomplete()); | |
6954 | return; | |
6955 | } | |
6956 | ||
6957 | // how much log to request? | |
6958 | eversion_t request_log_from = ps->info.last_update; | |
6959 | ceph_assert(!ps->acting_recovery_backfill.empty()); | |
f67539c2 | 6960 | for (auto p = ps->acting_recovery_backfill.begin(); |
9f95a23c TL |
6961 | p != ps->acting_recovery_backfill.end(); |
6962 | ++p) { | |
6963 | if (*p == ps->pg_whoami) continue; | |
6964 | pg_info_t& ri = ps->peer_info[*p]; | |
6965 | if (ri.last_update < ps->info.log_tail && ri.last_update >= best.log_tail && | |
6966 | ri.last_update < request_log_from) | |
6967 | request_log_from = ri.last_update; | |
6968 | } | |
6969 | ||
6970 | // how much? | |
6971 | psdout(10) << " requesting log from osd." << auth_log_shard << dendl; | |
6972 | context<PeeringMachine>().send_query( | |
6973 | auth_log_shard.osd, | |
6974 | pg_query_t( | |
6975 | pg_query_t::LOG, | |
6976 | auth_log_shard.shard, ps->pg_whoami.shard, | |
6977 | request_log_from, ps->info.history, | |
6978 | ps->get_osdmap_epoch())); | |
6979 | ||
6980 | ceph_assert(ps->blocked_by.empty()); | |
6981 | ps->blocked_by.insert(auth_log_shard.osd); | |
6982 | pl->publish_stats_to_osd(); | |
6983 | } | |
6984 | ||
6985 | boost::statechart::result PeeringState::GetLog::react(const AdvMap& advmap) | |
6986 | { | |
6987 | // make sure our log source didn't go down. we need to check | |
6988 | // explicitly because it may not be part of the prior set, which | |
6989 | // means the Peering state check won't catch it going down. | |
6990 | if (!advmap.osdmap->is_up(auth_log_shard.osd)) { | |
6991 | psdout(10) << "GetLog: auth_log_shard osd." | |
6992 | << auth_log_shard.osd << " went down" << dendl; | |
6993 | post_event(advmap); | |
6994 | return transit< Reset >(); | |
6995 | } | |
6996 | ||
6997 | // let the Peering state do its checks. | |
6998 | return forward_event(); | |
6999 | } | |
7000 | ||
7001 | boost::statechart::result PeeringState::GetLog::react(const MLogRec& logevt) | |
7002 | { | |
7003 | ceph_assert(!msg); | |
7004 | if (logevt.from != auth_log_shard) { | |
7005 | psdout(10) << "GetLog: discarding log from " | |
7006 | << "non-auth_log_shard osd." << logevt.from << dendl; | |
7007 | return discard_event(); | |
7008 | } | |
7009 | psdout(10) << "GetLog: received master log from osd." | |
7010 | << logevt.from << dendl; | |
7011 | msg = logevt.msg; | |
7012 | post_event(GotLog()); | |
7013 | return discard_event(); | |
7014 | } | |
7015 | ||
7016 | boost::statechart::result PeeringState::GetLog::react(const GotLog&) | |
7017 | { | |
7018 | ||
7019 | DECLARE_LOCALS; | |
7020 | psdout(10) << "leaving GetLog" << dendl; | |
7021 | if (msg) { | |
7022 | psdout(10) << "processing master log" << dendl; | |
7023 | ps->proc_master_log(context<PeeringMachine>().get_cur_transaction(), | |
f67539c2 | 7024 | msg->info, std::move(msg->log), std::move(msg->missing), |
9f95a23c TL |
7025 | auth_log_shard); |
7026 | } | |
7027 | ps->start_flush(context< PeeringMachine >().get_cur_transaction()); | |
7028 | return transit< GetMissing >(); | |
7029 | } | |
7030 | ||
7031 | boost::statechart::result PeeringState::GetLog::react(const QueryState& q) | |
7032 | { | |
7033 | q.f->open_object_section("state"); | |
7034 | q.f->dump_string("name", state_name); | |
7035 | q.f->dump_stream("enter_time") << enter_time; | |
7036 | q.f->dump_stream("auth_log_shard") << auth_log_shard; | |
7037 | q.f->close_section(); | |
7038 | return forward_event(); | |
7039 | } | |
7040 | ||
f67539c2 TL |
7041 | boost::statechart::result PeeringState::GetLog::react(const QueryUnfound& q) |
7042 | { | |
7043 | q.f->dump_string("state", "GetLog"); | |
7044 | q.f->dump_bool("available_might_have_unfound", false); | |
7045 | return discard_event(); | |
7046 | } | |
7047 | ||
9f95a23c TL |
7048 | void PeeringState::GetLog::exit() |
7049 | { | |
7050 | context< PeeringMachine >().log_exit(state_name, enter_time); | |
7051 | ||
7052 | DECLARE_LOCALS; | |
7053 | utime_t dur = ceph_clock_now() - enter_time; | |
7054 | pl->get_peering_perf().tinc(rs_getlog_latency, dur); | |
7055 | ps->blocked_by.clear(); | |
7056 | } | |
7057 | ||
7058 | /*------WaitActingChange--------*/ | |
7059 | PeeringState::WaitActingChange::WaitActingChange(my_context ctx) | |
7060 | : my_base(ctx), | |
7061 | NamedState(context< PeeringMachine >().state_history, "Started/Primary/WaitActingChange") | |
7062 | { | |
7063 | context< PeeringMachine >().log_enter(state_name); | |
7064 | } | |
7065 | ||
7066 | boost::statechart::result PeeringState::WaitActingChange::react(const AdvMap& advmap) | |
7067 | { | |
7068 | DECLARE_LOCALS; | |
7069 | OSDMapRef osdmap = advmap.osdmap; | |
7070 | ||
7071 | psdout(10) << "verifying no want_acting " << ps->want_acting << " targets didn't go down" << dendl; | |
f67539c2 | 7072 | for (auto p = ps->want_acting.begin(); p != ps->want_acting.end(); ++p) { |
9f95a23c TL |
7073 | if (!osdmap->is_up(*p)) { |
7074 | psdout(10) << " want_acting target osd." << *p << " went down, resetting" << dendl; | |
7075 | post_event(advmap); | |
7076 | return transit< Reset >(); | |
7077 | } | |
7078 | } | |
7079 | return forward_event(); | |
7080 | } | |
7081 | ||
7082 | boost::statechart::result PeeringState::WaitActingChange::react(const MLogRec& logevt) | |
7083 | { | |
7084 | psdout(10) << "In WaitActingChange, ignoring MLocRec" << dendl; | |
7085 | return discard_event(); | |
7086 | } | |
7087 | ||
7088 | boost::statechart::result PeeringState::WaitActingChange::react(const MInfoRec& evt) | |
7089 | { | |
7090 | psdout(10) << "In WaitActingChange, ignoring MInfoRec" << dendl; | |
7091 | return discard_event(); | |
7092 | } | |
7093 | ||
7094 | boost::statechart::result PeeringState::WaitActingChange::react(const MNotifyRec& evt) | |
7095 | { | |
7096 | psdout(10) << "In WaitActingChange, ignoring MNotifyRec" << dendl; | |
7097 | return discard_event(); | |
7098 | } | |
7099 | ||
7100 | boost::statechart::result PeeringState::WaitActingChange::react(const QueryState& q) | |
7101 | { | |
7102 | q.f->open_object_section("state"); | |
7103 | q.f->dump_string("name", state_name); | |
7104 | q.f->dump_stream("enter_time") << enter_time; | |
7105 | q.f->dump_string("comment", "waiting for pg acting set to change"); | |
7106 | q.f->close_section(); | |
7107 | return forward_event(); | |
7108 | } | |
7109 | ||
f67539c2 TL |
7110 | boost::statechart::result PeeringState::WaitActingChange::react(const QueryUnfound& q) |
7111 | { | |
7112 | q.f->dump_string("state", "WaitActingChange"); | |
7113 | q.f->dump_bool("available_might_have_unfound", false); | |
7114 | return discard_event(); | |
7115 | } | |
7116 | ||
9f95a23c TL |
7117 | void PeeringState::WaitActingChange::exit() |
7118 | { | |
7119 | context< PeeringMachine >().log_exit(state_name, enter_time); | |
7120 | DECLARE_LOCALS; | |
7121 | utime_t dur = ceph_clock_now() - enter_time; | |
7122 | pl->get_peering_perf().tinc(rs_waitactingchange_latency, dur); | |
7123 | } | |
7124 | ||
7125 | /*------Down--------*/ | |
7126 | PeeringState::Down::Down(my_context ctx) | |
7127 | : my_base(ctx), | |
7128 | NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering/Down") | |
7129 | { | |
7130 | context< PeeringMachine >().log_enter(state_name); | |
7131 | DECLARE_LOCALS; | |
7132 | ||
7133 | ps->state_clear(PG_STATE_PEERING); | |
7134 | ps->state_set(PG_STATE_DOWN); | |
7135 | ||
7136 | auto &prior_set = context< Peering >().prior_set; | |
7137 | ceph_assert(ps->blocked_by.empty()); | |
7138 | ps->blocked_by.insert(prior_set.down.begin(), prior_set.down.end()); | |
7139 | pl->publish_stats_to_osd(); | |
7140 | } | |
7141 | ||
7142 | void PeeringState::Down::exit() | |
7143 | { | |
7144 | context< PeeringMachine >().log_exit(state_name, enter_time); | |
7145 | ||
7146 | DECLARE_LOCALS; | |
7147 | ||
7148 | ps->state_clear(PG_STATE_DOWN); | |
7149 | utime_t dur = ceph_clock_now() - enter_time; | |
7150 | pl->get_peering_perf().tinc(rs_down_latency, dur); | |
7151 | ||
7152 | ps->blocked_by.clear(); | |
7153 | } | |
7154 | ||
7155 | boost::statechart::result PeeringState::Down::react(const QueryState& q) | |
7156 | { | |
7157 | q.f->open_object_section("state"); | |
7158 | q.f->dump_string("name", state_name); | |
7159 | q.f->dump_stream("enter_time") << enter_time; | |
7160 | q.f->dump_string("comment", | |
7161 | "not enough up instances of this PG to go active"); | |
7162 | q.f->close_section(); | |
7163 | return forward_event(); | |
7164 | } | |
7165 | ||
f67539c2 TL |
7166 | boost::statechart::result PeeringState::Down::react(const QueryUnfound& q) |
7167 | { | |
7168 | q.f->dump_string("state", "Down"); | |
7169 | q.f->dump_bool("available_might_have_unfound", false); | |
7170 | return discard_event(); | |
7171 | } | |
7172 | ||
9f95a23c TL |
7173 | boost::statechart::result PeeringState::Down::react(const MNotifyRec& infoevt) |
7174 | { | |
7175 | DECLARE_LOCALS; | |
7176 | ||
7177 | ceph_assert(ps->is_primary()); | |
7178 | epoch_t old_start = ps->info.history.last_epoch_started; | |
7179 | if (!ps->peer_info.count(infoevt.from) && | |
7180 | ps->get_osdmap()->has_been_up_since(infoevt.from.osd, infoevt.notify.epoch_sent)) { | |
7181 | ps->update_history(infoevt.notify.info.history); | |
7182 | } | |
7183 | // if we got something new to make pg escape down state | |
7184 | if (ps->info.history.last_epoch_started > old_start) { | |
7185 | psdout(10) << " last_epoch_started moved forward, re-enter getinfo" << dendl; | |
7186 | ps->state_clear(PG_STATE_DOWN); | |
7187 | ps->state_set(PG_STATE_PEERING); | |
7188 | return transit< GetInfo >(); | |
7189 | } | |
7190 | ||
7191 | return discard_event(); | |
7192 | } | |
7193 | ||
7194 | ||
7195 | /*------Incomplete--------*/ | |
7196 | PeeringState::Incomplete::Incomplete(my_context ctx) | |
7197 | : my_base(ctx), | |
7198 | NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering/Incomplete") | |
7199 | { | |
7200 | context< PeeringMachine >().log_enter(state_name); | |
7201 | DECLARE_LOCALS; | |
7202 | ||
7203 | ps->state_clear(PG_STATE_PEERING); | |
7204 | ps->state_set(PG_STATE_INCOMPLETE); | |
7205 | ||
7206 | PastIntervals::PriorSet &prior_set = context< Peering >().prior_set; | |
7207 | ceph_assert(ps->blocked_by.empty()); | |
7208 | ps->blocked_by.insert(prior_set.down.begin(), prior_set.down.end()); | |
7209 | pl->publish_stats_to_osd(); | |
7210 | } | |
7211 | ||
7212 | boost::statechart::result PeeringState::Incomplete::react(const AdvMap &advmap) { | |
7213 | DECLARE_LOCALS; | |
7214 | int64_t poolnum = ps->info.pgid.pool(); | |
7215 | ||
7216 | // Reset if min_size turn smaller than previous value, pg might now be able to go active | |
7217 | if (!advmap.osdmap->have_pg_pool(poolnum) || | |
7218 | advmap.lastmap->get_pools().find(poolnum)->second.min_size > | |
7219 | advmap.osdmap->get_pools().find(poolnum)->second.min_size) { | |
7220 | post_event(advmap); | |
7221 | return transit< Reset >(); | |
7222 | } | |
7223 | ||
7224 | return forward_event(); | |
7225 | } | |
7226 | ||
7227 | boost::statechart::result PeeringState::Incomplete::react(const MNotifyRec& notevt) { | |
7228 | DECLARE_LOCALS; | |
7229 | psdout(7) << "handle_pg_notify from osd." << notevt.from << dendl; | |
7230 | if (ps->proc_replica_info( | |
7231 | notevt.from, notevt.notify.info, notevt.notify.epoch_sent)) { | |
7232 | // We got something new, try again! | |
7233 | return transit< GetLog >(); | |
7234 | } else { | |
7235 | return discard_event(); | |
7236 | } | |
7237 | } | |
7238 | ||
7239 | boost::statechart::result PeeringState::Incomplete::react( | |
7240 | const QueryState& q) | |
7241 | { | |
7242 | q.f->open_object_section("state"); | |
7243 | q.f->dump_string("name", state_name); | |
7244 | q.f->dump_stream("enter_time") << enter_time; | |
7245 | q.f->dump_string("comment", "not enough complete instances of this PG"); | |
7246 | q.f->close_section(); | |
7247 | return forward_event(); | |
7248 | } | |
7249 | ||
f67539c2 TL |
7250 | boost::statechart::result PeeringState::Incomplete::react(const QueryUnfound& q) |
7251 | { | |
7252 | q.f->dump_string("state", "Incomplete"); | |
7253 | q.f->dump_bool("available_might_have_unfound", false); | |
7254 | return discard_event(); | |
7255 | } | |
7256 | ||
9f95a23c TL |
7257 | void PeeringState::Incomplete::exit() |
7258 | { | |
7259 | context< PeeringMachine >().log_exit(state_name, enter_time); | |
7260 | ||
7261 | DECLARE_LOCALS; | |
7262 | ||
7263 | ps->state_clear(PG_STATE_INCOMPLETE); | |
7264 | utime_t dur = ceph_clock_now() - enter_time; | |
7265 | pl->get_peering_perf().tinc(rs_incomplete_latency, dur); | |
7266 | ||
7267 | ps->blocked_by.clear(); | |
7268 | } | |
7269 | ||
7270 | /*------GetMissing--------*/ | |
7271 | PeeringState::GetMissing::GetMissing(my_context ctx) | |
7272 | : my_base(ctx), | |
7273 | NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering/GetMissing") | |
7274 | { | |
7275 | context< PeeringMachine >().log_enter(state_name); | |
7276 | ||
7277 | DECLARE_LOCALS; | |
7278 | ps->log_weirdness(); | |
7279 | ceph_assert(!ps->acting_recovery_backfill.empty()); | |
7280 | eversion_t since; | |
f67539c2 | 7281 | for (auto i = ps->acting_recovery_backfill.begin(); |
9f95a23c TL |
7282 | i != ps->acting_recovery_backfill.end(); |
7283 | ++i) { | |
7284 | if (*i == ps->get_primary()) continue; | |
7285 | const pg_info_t& pi = ps->peer_info[*i]; | |
7286 | // reset this so to make sure the pg_missing_t is initialized and | |
7287 | // has the correct semantics even if we don't need to get a | |
7288 | // missing set from a shard. This way later additions due to | |
7289 | // lost+unfound delete work properly. | |
7290 | ps->peer_missing[*i].may_include_deletes = !ps->perform_deletes_during_peering(); | |
7291 | ||
7292 | if (pi.is_empty()) | |
7293 | continue; // no pg data, nothing divergent | |
7294 | ||
7295 | if (pi.last_update < ps->pg_log.get_tail()) { | |
7296 | psdout(10) << " osd." << *i << " is not contiguous, will restart backfill" << dendl; | |
7297 | ps->peer_missing[*i].clear(); | |
7298 | continue; | |
7299 | } | |
7300 | if (pi.last_backfill == hobject_t()) { | |
7301 | psdout(10) << " osd." << *i << " will fully backfill; can infer empty missing set" << dendl; | |
7302 | ps->peer_missing[*i].clear(); | |
7303 | continue; | |
7304 | } | |
7305 | ||
7306 | if (pi.last_update == pi.last_complete && // peer has no missing | |
7307 | pi.last_update == ps->info.last_update) { // peer is up to date | |
7308 | // replica has no missing and identical log as us. no need to | |
7309 | // pull anything. | |
7310 | // FIXME: we can do better here. if last_update==last_complete we | |
7311 | // can infer the rest! | |
7312 | psdout(10) << " osd." << *i << " has no missing, identical log" << dendl; | |
7313 | ps->peer_missing[*i].clear(); | |
7314 | continue; | |
7315 | } | |
7316 | ||
7317 | // We pull the log from the peer's last_epoch_started to ensure we | |
7318 | // get enough log to detect divergent updates. | |
7319 | since.epoch = pi.last_epoch_started; | |
7320 | ceph_assert(pi.last_update >= ps->info.log_tail); // or else choose_acting() did a bad thing | |
7321 | if (pi.log_tail <= since) { | |
7322 | psdout(10) << " requesting log+missing since " << since << " from osd." << *i << dendl; | |
7323 | context< PeeringMachine >().send_query( | |
7324 | i->osd, | |
7325 | pg_query_t( | |
7326 | pg_query_t::LOG, | |
7327 | i->shard, ps->pg_whoami.shard, | |
7328 | since, ps->info.history, | |
7329 | ps->get_osdmap_epoch())); | |
7330 | } else { | |
7331 | psdout(10) << " requesting fulllog+missing from osd." << *i | |
7332 | << " (want since " << since << " < log.tail " | |
7333 | << pi.log_tail << ")" << dendl; | |
7334 | context< PeeringMachine >().send_query( | |
7335 | i->osd, pg_query_t( | |
7336 | pg_query_t::FULLLOG, | |
7337 | i->shard, ps->pg_whoami.shard, | |
7338 | ps->info.history, ps->get_osdmap_epoch())); | |
7339 | } | |
7340 | peer_missing_requested.insert(*i); | |
7341 | ps->blocked_by.insert(i->osd); | |
7342 | } | |
7343 | ||
7344 | if (peer_missing_requested.empty()) { | |
7345 | if (ps->need_up_thru) { | |
7346 | psdout(10) << " still need up_thru update before going active" | |
7347 | << dendl; | |
7348 | post_event(NeedUpThru()); | |
7349 | return; | |
7350 | } | |
7351 | ||
7352 | // all good! | |
7353 | post_event(Activate(ps->get_osdmap_epoch())); | |
7354 | } else { | |
7355 | pl->publish_stats_to_osd(); | |
7356 | } | |
7357 | } | |
7358 | ||
7359 | boost::statechart::result PeeringState::GetMissing::react(const MLogRec& logevt) | |
7360 | { | |
7361 | DECLARE_LOCALS; | |
7362 | ||
7363 | peer_missing_requested.erase(logevt.from); | |
f67539c2 TL |
7364 | ps->proc_replica_log(logevt.msg->info, |
7365 | logevt.msg->log, | |
7366 | std::move(logevt.msg->missing), | |
7367 | logevt.from); | |
9f95a23c TL |
7368 | |
7369 | if (peer_missing_requested.empty()) { | |
7370 | if (ps->need_up_thru) { | |
7371 | psdout(10) << " still need up_thru update before going active" | |
7372 | << dendl; | |
7373 | post_event(NeedUpThru()); | |
7374 | } else { | |
7375 | psdout(10) << "Got last missing, don't need missing " | |
7376 | << "posting Activate" << dendl; | |
7377 | post_event(Activate(ps->get_osdmap_epoch())); | |
7378 | } | |
7379 | } | |
7380 | return discard_event(); | |
7381 | } | |
7382 | ||
7383 | boost::statechart::result PeeringState::GetMissing::react(const QueryState& q) | |
7384 | { | |
7385 | DECLARE_LOCALS; | |
7386 | q.f->open_object_section("state"); | |
7387 | q.f->dump_string("name", state_name); | |
7388 | q.f->dump_stream("enter_time") << enter_time; | |
7389 | ||
7390 | q.f->open_array_section("peer_missing_requested"); | |
f67539c2 | 7391 | for (auto p = peer_missing_requested.begin(); |
9f95a23c TL |
7392 | p != peer_missing_requested.end(); |
7393 | ++p) { | |
7394 | q.f->open_object_section("osd"); | |
7395 | q.f->dump_stream("osd") << *p; | |
7396 | if (ps->peer_missing.count(*p)) { | |
7397 | q.f->open_object_section("got_missing"); | |
7398 | ps->peer_missing[*p].dump(q.f); | |
7399 | q.f->close_section(); | |
7400 | } | |
7401 | q.f->close_section(); | |
7402 | } | |
7403 | q.f->close_section(); | |
7404 | ||
7405 | q.f->close_section(); | |
7406 | return forward_event(); | |
7407 | } | |
7408 | ||
f67539c2 TL |
7409 | boost::statechart::result PeeringState::GetMissing::react(const QueryUnfound& q) |
7410 | { | |
7411 | q.f->dump_string("state", "GetMising"); | |
7412 | q.f->dump_bool("available_might_have_unfound", false); | |
7413 | return discard_event(); | |
7414 | } | |
7415 | ||
9f95a23c TL |
7416 | void PeeringState::GetMissing::exit() |
7417 | { | |
7418 | context< PeeringMachine >().log_exit(state_name, enter_time); | |
7419 | ||
7420 | DECLARE_LOCALS; | |
7421 | utime_t dur = ceph_clock_now() - enter_time; | |
7422 | pl->get_peering_perf().tinc(rs_getmissing_latency, dur); | |
7423 | ps->blocked_by.clear(); | |
7424 | } | |
7425 | ||
7426 | /*------WaitUpThru--------*/ | |
7427 | PeeringState::WaitUpThru::WaitUpThru(my_context ctx) | |
7428 | : my_base(ctx), | |
7429 | NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering/WaitUpThru") | |
7430 | { | |
7431 | context< PeeringMachine >().log_enter(state_name); | |
7432 | } | |
7433 | ||
7434 | boost::statechart::result PeeringState::WaitUpThru::react(const ActMap& am) | |
7435 | { | |
7436 | DECLARE_LOCALS; | |
7437 | if (!ps->need_up_thru) { | |
7438 | post_event(Activate(ps->get_osdmap_epoch())); | |
7439 | } | |
7440 | return forward_event(); | |
7441 | } | |
7442 | ||
7443 | boost::statechart::result PeeringState::WaitUpThru::react(const MLogRec& logevt) | |
7444 | { | |
7445 | DECLARE_LOCALS; | |
7446 | psdout(10) << "Noting missing from osd." << logevt.from << dendl; | |
f67539c2 | 7447 | ps->peer_missing[logevt.from].claim(std::move(logevt.msg->missing)); |
9f95a23c TL |
7448 | ps->peer_info[logevt.from] = logevt.msg->info; |
7449 | return discard_event(); | |
7450 | } | |
7451 | ||
7452 | boost::statechart::result PeeringState::WaitUpThru::react(const QueryState& q) | |
7453 | { | |
7454 | q.f->open_object_section("state"); | |
7455 | q.f->dump_string("name", state_name); | |
7456 | q.f->dump_stream("enter_time") << enter_time; | |
7457 | q.f->dump_string("comment", "waiting for osdmap to reflect a new up_thru for this osd"); | |
7458 | q.f->close_section(); | |
7459 | return forward_event(); | |
7460 | } | |
7461 | ||
f67539c2 TL |
7462 | boost::statechart::result PeeringState::WaitUpThru::react(const QueryUnfound& q) |
7463 | { | |
7464 | q.f->dump_string("state", "WaitUpThru"); | |
7465 | q.f->dump_bool("available_might_have_unfound", false); | |
7466 | return discard_event(); | |
7467 | } | |
7468 | ||
9f95a23c TL |
7469 | void PeeringState::WaitUpThru::exit() |
7470 | { | |
7471 | context< PeeringMachine >().log_exit(state_name, enter_time); | |
7472 | DECLARE_LOCALS; | |
7473 | utime_t dur = ceph_clock_now() - enter_time; | |
7474 | pl->get_peering_perf().tinc(rs_waitupthru_latency, dur); | |
7475 | } | |
7476 | ||
7477 | /*----PeeringState::PeeringMachine Methods-----*/ | |
7478 | #undef dout_prefix | |
7479 | #define dout_prefix dpp->gen_prefix(*_dout) | |
7480 | ||
7481 | void PeeringState::PeeringMachine::log_enter(const char *state_name) | |
7482 | { | |
7483 | DECLARE_LOCALS; | |
7484 | psdout(5) << "enter " << state_name << dendl; | |
7485 | pl->log_state_enter(state_name); | |
7486 | } | |
7487 | ||
7488 | void PeeringState::PeeringMachine::log_exit(const char *state_name, utime_t enter_time) | |
7489 | { | |
7490 | DECLARE_LOCALS; | |
7491 | utime_t dur = ceph_clock_now() - enter_time; | |
7492 | psdout(5) << "exit " << state_name << " " << dur << " " << event_count << " " << event_time << dendl; | |
7493 | pl->log_state_exit(state_name, enter_time, event_count, event_time); | |
7494 | event_count = 0; | |
7495 | event_time = utime_t(); | |
7496 | } | |
7497 | ||
7498 | ostream &operator<<(ostream &out, const PeeringState &ps) { | |
7499 | out << "pg[" << ps.info | |
7500 | << " " << pg_vector_string(ps.up); | |
7501 | if (ps.acting != ps.up) | |
7502 | out << "/" << pg_vector_string(ps.acting); | |
7503 | if (ps.is_ec_pg()) | |
7504 | out << "p" << ps.get_primary(); | |
7505 | if (!ps.async_recovery_targets.empty()) | |
7506 | out << " async=[" << ps.async_recovery_targets << "]"; | |
7507 | if (!ps.backfill_targets.empty()) | |
7508 | out << " backfill=[" << ps.backfill_targets << "]"; | |
7509 | out << " r=" << ps.get_role(); | |
7510 | out << " lpr=" << ps.get_last_peering_reset(); | |
7511 | ||
7512 | if (ps.deleting) | |
7513 | out << " DELETING"; | |
7514 | ||
7515 | if (!ps.past_intervals.empty()) { | |
7516 | out << " pi=[" << ps.past_intervals.get_bounds() | |
7517 | << ")/" << ps.past_intervals.size(); | |
7518 | } | |
7519 | ||
7520 | if (ps.is_peered()) { | |
7521 | if (ps.last_update_ondisk != ps.info.last_update) | |
7522 | out << " luod=" << ps.last_update_ondisk; | |
7523 | if (ps.last_update_applied != ps.info.last_update) | |
7524 | out << " lua=" << ps.last_update_applied; | |
7525 | } | |
7526 | ||
7527 | if (ps.pg_log.get_tail() != ps.info.log_tail || | |
7528 | ps.pg_log.get_head() != ps.info.last_update) | |
7529 | out << " (info mismatch, " << ps.pg_log.get_log() << ")"; | |
7530 | ||
7531 | if (!ps.pg_log.get_log().empty()) { | |
7532 | if ((ps.pg_log.get_log().log.begin()->version <= ps.pg_log.get_tail())) { | |
7533 | out << " (log bound mismatch, actual=[" | |
7534 | << ps.pg_log.get_log().log.begin()->version << "," | |
7535 | << ps.pg_log.get_log().log.rbegin()->version << "]"; | |
7536 | out << ")"; | |
7537 | } | |
7538 | } | |
7539 | ||
7540 | out << " crt=" << ps.pg_log.get_can_rollback_to(); | |
7541 | ||
7542 | if (ps.last_complete_ondisk != ps.info.last_complete) | |
7543 | out << " lcod " << ps.last_complete_ondisk; | |
7544 | ||
7545 | out << " mlcod " << ps.min_last_complete_ondisk; | |
7546 | ||
7547 | out << " " << pg_state_string(ps.get_state()); | |
7548 | if (ps.should_send_notify()) | |
7549 | out << " NOTIFY"; | |
7550 | ||
7551 | if (ps.prior_readable_until_ub != ceph::signedspan::zero()) { | |
7552 | out << " pruub " << ps.prior_readable_until_ub | |
7553 | << "@" << ps.get_prior_readable_down_osds(); | |
7554 | } | |
7555 | return out; | |
7556 | } | |
f67539c2 TL |
7557 | |
7558 | std::vector<pg_shard_t> PeeringState::get_replica_recovery_order() const | |
7559 | { | |
7560 | std::vector<std::pair<unsigned int, pg_shard_t>> replicas_by_num_missing, | |
7561 | async_by_num_missing; | |
7562 | replicas_by_num_missing.reserve(get_acting_recovery_backfill().size() - 1); | |
7563 | for (auto &p : get_acting_recovery_backfill()) { | |
7564 | if (p == get_primary()) { | |
7565 | continue; | |
7566 | } | |
7567 | auto pm = get_peer_missing().find(p); | |
7568 | assert(pm != get_peer_missing().end()); | |
7569 | auto nm = pm->second.num_missing(); | |
7570 | if (nm != 0) { | |
7571 | if (is_async_recovery_target(p)) { | |
7572 | async_by_num_missing.push_back(make_pair(nm, p)); | |
7573 | } else { | |
7574 | replicas_by_num_missing.push_back(make_pair(nm, p)); | |
7575 | } | |
7576 | } | |
7577 | } | |
7578 | // sort by number of missing objects, in ascending order. | |
7579 | auto func = [](const std::pair<unsigned int, pg_shard_t> &lhs, | |
7580 | const std::pair<unsigned int, pg_shard_t> &rhs) { | |
7581 | return lhs.first < rhs.first; | |
7582 | }; | |
7583 | // acting goes first | |
7584 | std::sort(replicas_by_num_missing.begin(), replicas_by_num_missing.end(), func); | |
7585 | // then async_recovery_targets | |
7586 | std::sort(async_by_num_missing.begin(), async_by_num_missing.end(), func); | |
7587 | replicas_by_num_missing.insert(replicas_by_num_missing.end(), | |
7588 | async_by_num_missing.begin(), async_by_num_missing.end()); | |
7589 | ||
7590 | std::vector<pg_shard_t> ret; | |
7591 | ret.reserve(replicas_by_num_missing.size()); | |
7592 | for (auto p : replicas_by_num_missing) { | |
7593 | ret.push_back(p.second); | |
7594 | } | |
7595 | return ret; | |
7596 | } | |
7597 | ||
7598 |