]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | /* | |
4 | * Ceph - scalable distributed file system | |
5 | * | |
6 | * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> | |
7 | * | |
8 | * This is free software; you can redistribute it and/or | |
9 | * modify it under the terms of the GNU Lesser General Public | |
10 | * License version 2.1, as published by the Free Software | |
11 | * Foundation. See file COPYING. | |
12 | * | |
13 | */ | |
14 | ||
15 | #include "PG.h" | |
7c673cae | 16 | #include "messages/MOSDRepScrub.h" |
7c673cae FG |
17 | |
18 | #include "common/errno.h" | |
9f95a23c | 19 | #include "common/ceph_releases.h" |
7c673cae FG |
20 | #include "common/config.h" |
21 | #include "OSD.h" | |
22 | #include "OpRequest.h" | |
20effc67 TL |
23 | #include "osd/scrubber/ScrubStore.h" |
24 | #include "osd/scrubber/pg_scrubber.h" | |
9f95a23c | 25 | #include "osd/scheduler/OpSchedulerItem.h" |
20effc67 | 26 | #include "Session.h" |
7c673cae FG |
27 | |
28 | #include "common/Timer.h" | |
29 | #include "common/perf_counters.h" | |
30 | ||
31 | #include "messages/MOSDOp.h" | |
7c673cae FG |
32 | #include "messages/MOSDPGScan.h" |
33 | #include "messages/MOSDPGBackfill.h" | |
34 | #include "messages/MOSDPGBackfillRemove.h" | |
35 | #include "messages/MBackfillReserve.h" | |
36 | #include "messages/MRecoveryReserve.h" | |
37 | #include "messages/MOSDPGPush.h" | |
38 | #include "messages/MOSDPGPushReply.h" | |
39 | #include "messages/MOSDPGPull.h" | |
40 | #include "messages/MOSDECSubOpWrite.h" | |
41 | #include "messages/MOSDECSubOpWriteReply.h" | |
42 | #include "messages/MOSDECSubOpRead.h" | |
43 | #include "messages/MOSDECSubOpReadReply.h" | |
44 | #include "messages/MOSDPGUpdateLogMissing.h" | |
45 | #include "messages/MOSDPGUpdateLogMissingReply.h" | |
46 | #include "messages/MOSDBackoff.h" | |
47 | #include "messages/MOSDScrubReserve.h" | |
7c673cae | 48 | #include "messages/MOSDRepOp.h" |
7c673cae FG |
49 | #include "messages/MOSDRepOpReply.h" |
50 | #include "messages/MOSDRepScrubMap.h" | |
c07f9fc5 FG |
51 | #include "messages/MOSDPGRecoveryDelete.h" |
52 | #include "messages/MOSDPGRecoveryDeleteReply.h" | |
7c673cae FG |
53 | |
54 | #include "common/BackTrace.h" | |
55 | #include "common/EventTrace.h" | |
56 | ||
57 | #ifdef WITH_LTTNG | |
58 | #define TRACEPOINT_DEFINE | |
59 | #define TRACEPOINT_PROBE_DYNAMIC_LINKAGE | |
60 | #include "tracing/pg.h" | |
61 | #undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE | |
62 | #undef TRACEPOINT_DEFINE | |
63 | #else | |
64 | #define tracepoint(...) | |
65 | #endif | |
66 | ||
67 | #include <sstream> | |
68 | ||
69 | #define dout_context cct | |
70 | #define dout_subsys ceph_subsys_osd | |
71 | #undef dout_prefix | |
72 | #define dout_prefix _prefix(_dout, this) | |
73 | ||
f67539c2 TL |
74 | using std::list; |
75 | using std::map; | |
76 | using std::ostringstream; | |
77 | using std::pair; | |
78 | using std::set; | |
79 | using std::string; | |
80 | using std::stringstream; | |
81 | using std::unique_ptr; | |
82 | using std::vector; | |
83 | ||
84 | using ceph::bufferlist; | |
85 | using ceph::bufferptr; | |
86 | using ceph::decode; | |
87 | using ceph::encode; | |
88 | using ceph::Formatter; | |
89 | ||
9f95a23c | 90 | using namespace ceph::osd::scheduler; |
7c673cae FG |
91 | |
92 | template <class T> | |
93 | static ostream& _prefix(std::ostream *_dout, T *t) | |
94 | { | |
11fdf7f2 | 95 | return t->gen_prefix(*_dout); |
7c673cae FG |
96 | } |
97 | ||
7c673cae FG |
98 | void PG::get(const char* tag) |
99 | { | |
11fdf7f2 TL |
100 | int after = ++ref; |
101 | lgeneric_subdout(cct, refs, 5) << "PG::get " << this << " " | |
102 | << "tag " << (tag ? tag : "(none") << " " | |
103 | << (after - 1) << " -> " << after << dendl; | |
7c673cae | 104 | #ifdef PG_DEBUG_REFS |
11fdf7f2 | 105 | std::lock_guard l(_ref_id_lock); |
7c673cae FG |
106 | _tag_counts[tag]++; |
107 | #endif | |
108 | } | |
109 | ||
110 | void PG::put(const char* tag) | |
111 | { | |
112 | #ifdef PG_DEBUG_REFS | |
113 | { | |
11fdf7f2 | 114 | std::lock_guard l(_ref_id_lock); |
7c673cae | 115 | auto tag_counts_entry = _tag_counts.find(tag); |
11fdf7f2 | 116 | ceph_assert(tag_counts_entry != _tag_counts.end()); |
7c673cae FG |
117 | --tag_counts_entry->second; |
118 | if (tag_counts_entry->second == 0) { | |
119 | _tag_counts.erase(tag_counts_entry); | |
120 | } | |
121 | } | |
122 | #endif | |
11fdf7f2 TL |
123 | auto local_cct = cct; |
124 | int after = --ref; | |
125 | lgeneric_subdout(local_cct, refs, 5) << "PG::put " << this << " " | |
126 | << "tag " << (tag ? tag : "(none") << " " | |
127 | << (after + 1) << " -> " << after | |
128 | << dendl; | |
129 | if (after == 0) | |
7c673cae FG |
130 | delete this; |
131 | } | |
132 | ||
133 | #ifdef PG_DEBUG_REFS | |
134 | uint64_t PG::get_with_id() | |
135 | { | |
136 | ref++; | |
11fdf7f2 | 137 | std::lock_guard l(_ref_id_lock); |
7c673cae | 138 | uint64_t id = ++_ref_id; |
20effc67 | 139 | ClibBackTrace bt(0); |
7c673cae FG |
140 | stringstream ss; |
141 | bt.print(ss); | |
11fdf7f2 TL |
142 | lgeneric_subdout(cct, refs, 5) << "PG::get " << this << " " << info.pgid |
143 | << " got id " << id << " " | |
144 | << (ref - 1) << " -> " << ref | |
145 | << dendl; | |
146 | ceph_assert(!_live_ids.count(id)); | |
7c673cae FG |
147 | _live_ids.insert(make_pair(id, ss.str())); |
148 | return id; | |
149 | } | |
150 | ||
151 | void PG::put_with_id(uint64_t id) | |
152 | { | |
11fdf7f2 TL |
153 | int newref = --ref; |
154 | lgeneric_subdout(cct, refs, 5) << "PG::put " << this << " " << info.pgid | |
155 | << " put id " << id << " " | |
156 | << (newref + 1) << " -> " << newref | |
157 | << dendl; | |
7c673cae | 158 | { |
11fdf7f2 TL |
159 | std::lock_guard l(_ref_id_lock); |
160 | ceph_assert(_live_ids.count(id)); | |
7c673cae FG |
161 | _live_ids.erase(id); |
162 | } | |
11fdf7f2 | 163 | if (newref) |
7c673cae FG |
164 | delete this; |
165 | } | |
166 | ||
167 | void PG::dump_live_ids() | |
168 | { | |
11fdf7f2 | 169 | std::lock_guard l(_ref_id_lock); |
7c673cae FG |
170 | dout(0) << "\t" << __func__ << ": " << info.pgid << " live ids:" << dendl; |
171 | for (map<uint64_t, string>::iterator i = _live_ids.begin(); | |
172 | i != _live_ids.end(); | |
173 | ++i) { | |
174 | dout(0) << "\t\tid: " << *i << dendl; | |
175 | } | |
176 | dout(0) << "\t" << __func__ << ": " << info.pgid << " live tags:" << dendl; | |
177 | for (map<string, uint64_t>::iterator i = _tag_counts.begin(); | |
178 | i != _tag_counts.end(); | |
179 | ++i) { | |
180 | dout(0) << "\t\tid: " << *i << dendl; | |
181 | } | |
182 | } | |
183 | #endif | |
184 | ||
7c673cae FG |
185 | PG::PG(OSDService *o, OSDMapRef curmap, |
186 | const PGPool &_pool, spg_t p) : | |
9f95a23c | 187 | pg_whoami(o->whoami, p.shard), |
11fdf7f2 TL |
188 | pg_id(p), |
189 | coll(p), | |
7c673cae FG |
190 | osd(o), |
191 | cct(o->cct), | |
192 | osdriver(osd->store, coll_t(), OSD::make_snapmapper_oid()), | |
193 | snap_mapper( | |
194 | cct, | |
195 | &osdriver, | |
196 | p.ps(), | |
11fdf7f2 | 197 | p.get_split_bits(_pool.info.get_pg_num()), |
7c673cae FG |
198 | _pool.id, |
199 | p.shard), | |
7c673cae | 200 | trace_endpoint("0.0.0.0", 0, "PG"), |
7c673cae | 201 | info_struct_v(0), |
7c673cae | 202 | pgmeta_oid(p.make_pgmeta_oid()), |
7c673cae | 203 | stat_queue_item(this), |
7c673cae FG |
204 | recovery_queued(false), |
205 | recovery_ops_active(0), | |
7c673cae | 206 | backfill_reserving(false), |
7c673cae | 207 | finish_sync_event(NULL), |
7c673cae FG |
208 | scrub_after_recovery(false), |
209 | active_pushes(0), | |
9f95a23c TL |
210 | recovery_state( |
211 | o->cct, | |
212 | pg_whoami, | |
213 | p, | |
214 | _pool, | |
215 | curmap, | |
216 | this, | |
217 | this), | |
218 | pool(recovery_state.get_pool()), | |
219 | info(recovery_state.get_info()) | |
7c673cae FG |
220 | { |
221 | #ifdef PG_DEBUG_REFS | |
222 | osd->add_pgid(p, this); | |
223 | #endif | |
224 | #ifdef WITH_BLKIN | |
225 | std::stringstream ss; | |
226 | ss << "PG " << info.pgid; | |
227 | trace_endpoint.copy_name(ss.str()); | |
228 | #endif | |
7c673cae FG |
229 | } |
230 | ||
231 | PG::~PG() | |
232 | { | |
7c673cae FG |
233 | #ifdef PG_DEBUG_REFS |
234 | osd->remove_pgid(info.pgid, this); | |
235 | #endif | |
236 | } | |
237 | ||
7c673cae FG |
238 | void PG::lock(bool no_lockdep) const |
239 | { | |
9f95a23c TL |
240 | #ifdef CEPH_DEBUG_MUTEX |
241 | _lock.lock(no_lockdep); | |
242 | #else | |
243 | _lock.lock(); | |
244 | locked_by = std::this_thread::get_id(); | |
245 | #endif | |
7c673cae | 246 | // if we have unrecorded dirty state with the lock dropped, there is a bug |
9f95a23c | 247 | ceph_assert(!recovery_state.debug_has_dirty_state()); |
7c673cae FG |
248 | |
249 | dout(30) << "lock" << dendl; | |
250 | } | |
251 | ||
9f95a23c TL |
252 | bool PG::is_locked() const |
253 | { | |
254 | return ceph_mutex_is_locked(_lock); | |
255 | } | |
256 | ||
257 | void PG::unlock() const | |
258 | { | |
259 | //generic_dout(0) << this << " " << info.pgid << " unlock" << dendl; | |
260 | ceph_assert(!recovery_state.debug_has_dirty_state()); | |
261 | #ifndef CEPH_DEBUG_MUTEX | |
262 | locked_by = {}; | |
263 | #endif | |
264 | _lock.unlock(); | |
265 | } | |
266 | ||
11fdf7f2 | 267 | std::ostream& PG::gen_prefix(std::ostream& out) const |
7c673cae | 268 | { |
9f95a23c TL |
269 | OSDMapRef mapref = recovery_state.get_osdmap(); |
270 | #ifdef CEPH_DEBUG_MUTEX | |
7c673cae | 271 | if (_lock.is_locked_by_me()) { |
9f95a23c TL |
272 | #else |
273 | if (locked_by == std::this_thread::get_id()) { | |
274 | #endif | |
7c673cae FG |
275 | out << "osd." << osd->whoami |
276 | << " pg_epoch: " << (mapref ? mapref->get_epoch():0) | |
277 | << " " << *this << " "; | |
278 | } else { | |
279 | out << "osd." << osd->whoami | |
280 | << " pg_epoch: " << (mapref ? mapref->get_epoch():0) | |
9f95a23c | 281 | << " pg[" << pg_id.pgid << "(unlocked)] "; |
7c673cae | 282 | } |
11fdf7f2 | 283 | return out; |
7c673cae | 284 | } |
7c673cae | 285 | |
9f95a23c TL |
286 | PerfCounters &PG::get_peering_perf() { |
287 | return *(osd->recoverystate_perf); | |
7c673cae FG |
288 | } |
289 | ||
9f95a23c TL |
290 | PerfCounters &PG::get_perf_logger() { |
291 | return *(osd->logger); | |
292 | } | |
7c673cae | 293 | |
9f95a23c TL |
294 | void PG::log_state_enter(const char *state) { |
295 | osd->pg_recovery_stats.log_enter(state); | |
296 | } | |
7c673cae | 297 | |
9f95a23c TL |
298 | void PG::log_state_exit( |
299 | const char *state_name, utime_t enter_time, | |
300 | uint64_t events, utime_t event_dur) { | |
301 | osd->pg_recovery_stats.log_exit( | |
302 | state_name, ceph_clock_now() - enter_time, events, event_dur); | |
7c673cae | 303 | } |
f67539c2 | 304 | |
9f95a23c | 305 | /********* PG **********/ |
7c673cae FG |
306 | |
307 | void PG::remove_snap_mapped_object( | |
308 | ObjectStore::Transaction &t, const hobject_t &soid) | |
309 | { | |
310 | t.remove( | |
311 | coll, | |
312 | ghobject_t(soid, ghobject_t::NO_GEN, pg_whoami.shard)); | |
313 | clear_object_snap_mapping(&t, soid); | |
314 | } | |
315 | ||
316 | void PG::clear_object_snap_mapping( | |
317 | ObjectStore::Transaction *t, const hobject_t &soid) | |
318 | { | |
319 | OSDriver::OSTransaction _t(osdriver.get_transaction(t)); | |
320 | if (soid.snap < CEPH_MAXSNAP) { | |
321 | int r = snap_mapper.remove_oid( | |
322 | soid, | |
323 | &_t); | |
324 | if (!(r == 0 || r == -ENOENT)) { | |
325 | derr << __func__ << ": remove_oid returned " << cpp_strerror(r) << dendl; | |
326 | ceph_abort(); | |
327 | } | |
328 | } | |
329 | } | |
330 | ||
331 | void PG::update_object_snap_mapping( | |
332 | ObjectStore::Transaction *t, const hobject_t &soid, const set<snapid_t> &snaps) | |
333 | { | |
334 | OSDriver::OSTransaction _t(osdriver.get_transaction(t)); | |
11fdf7f2 | 335 | ceph_assert(soid.snap < CEPH_MAXSNAP); |
7c673cae FG |
336 | int r = snap_mapper.remove_oid( |
337 | soid, | |
338 | &_t); | |
339 | if (!(r == 0 || r == -ENOENT)) { | |
340 | derr << __func__ << ": remove_oid returned " << cpp_strerror(r) << dendl; | |
341 | ceph_abort(); | |
342 | } | |
343 | snap_mapper.add_oid( | |
344 | soid, | |
345 | snaps, | |
346 | &_t); | |
347 | } | |
348 | ||
9f95a23c TL |
349 | /******* PG ***********/ |
350 | void PG::clear_primary_state() | |
7c673cae | 351 | { |
f67539c2 TL |
352 | dout(20) << __func__ << dendl; |
353 | ||
9f95a23c | 354 | projected_log = PGLog::IndexedLog(); |
7c673cae | 355 | |
9f95a23c TL |
356 | snap_trimq.clear(); |
357 | snap_trimq_repeat.clear(); | |
358 | finish_sync_event = 0; // so that _finish_recovery doesn't go off in another thread | |
359 | release_pg_backoffs(); | |
7c673cae | 360 | |
f67539c2 TL |
361 | if (m_scrubber) { |
362 | m_scrubber->discard_replica_reservations(); | |
363 | } | |
9f95a23c TL |
364 | scrub_after_recovery = false; |
365 | ||
366 | agent_clear(); | |
7c673cae FG |
367 | } |
368 | ||
91327a77 | 369 | |
9f95a23c TL |
370 | bool PG::op_has_sufficient_caps(OpRequestRef& op) |
371 | { | |
372 | // only check MOSDOp | |
373 | if (op->get_req()->get_type() != CEPH_MSG_OSD_OP) | |
c07f9fc5 | 374 | return true; |
9f95a23c TL |
375 | |
376 | auto req = op->get_req<MOSDOp>(); | |
377 | auto priv = req->get_connection()->get_priv(); | |
378 | auto session = static_cast<Session*>(priv.get()); | |
379 | if (!session) { | |
380 | dout(0) << "op_has_sufficient_caps: no session for op " << *req << dendl; | |
c07f9fc5 | 381 | return false; |
7c673cae | 382 | } |
9f95a23c TL |
383 | OSDCap& caps = session->caps; |
384 | priv.reset(); | |
7c673cae | 385 | |
9f95a23c TL |
386 | const string &key = req->get_hobj().get_key().empty() ? |
387 | req->get_oid().name : | |
388 | req->get_hobj().get_key(); | |
91327a77 | 389 | |
9f95a23c TL |
390 | bool cap = caps.is_capable(pool.name, req->get_hobj().nspace, |
391 | pool.info.application_metadata, | |
392 | key, | |
393 | op->need_read_cap(), | |
394 | op->need_write_cap(), | |
395 | op->classes(), | |
396 | session->get_peer_socket_addr()); | |
91327a77 | 397 | |
9f95a23c TL |
398 | dout(20) << "op_has_sufficient_caps " |
399 | << "session=" << session | |
400 | << " pool=" << pool.id << " (" << pool.name | |
401 | << " " << req->get_hobj().nspace | |
402 | << ")" | |
403 | << " pool_app_metadata=" << pool.info.application_metadata | |
404 | << " need_read_cap=" << op->need_read_cap() | |
405 | << " need_write_cap=" << op->need_write_cap() | |
406 | << " classes=" << op->classes() | |
407 | << " -> " << (cap ? "yes" : "NO") | |
408 | << dendl; | |
409 | return cap; | |
7c673cae FG |
410 | } |
411 | ||
9f95a23c | 412 | void PG::queue_recovery() |
91327a77 | 413 | { |
9f95a23c TL |
414 | if (!is_primary() || !is_peered()) { |
415 | dout(10) << "queue_recovery -- not primary or not peered " << dendl; | |
416 | ceph_assert(!recovery_queued); | |
417 | } else if (recovery_queued) { | |
418 | dout(10) << "queue_recovery -- already queued" << dendl; | |
91327a77 | 419 | } else { |
9f95a23c TL |
420 | dout(10) << "queue_recovery -- queuing" << dendl; |
421 | recovery_queued = true; | |
422 | osd->queue_for_recovery(this); | |
91327a77 AA |
423 | } |
424 | } | |
9f95a23c | 425 | |
f67539c2 | 426 | void PG::queue_scrub_after_repair() |
7c673cae | 427 | { |
f67539c2 | 428 | dout(10) << __func__ << dendl; |
9f95a23c | 429 | ceph_assert(ceph_mutex_is_locked(_lock)); |
f67539c2 TL |
430 | |
431 | m_planned_scrub.must_deep_scrub = true; | |
432 | m_planned_scrub.check_repair = true; | |
433 | m_planned_scrub.must_scrub = true; | |
434 | ||
20effc67 TL |
435 | if (is_scrub_queued_or_active()) { |
436 | dout(10) << __func__ << ": scrubbing already (" | |
437 | << (is_scrubbing() ? "active)" : "queued)") << dendl; | |
f67539c2 TL |
438 | return; |
439 | } | |
440 | ||
441 | m_scrubber->set_op_parameters(m_planned_scrub); | |
442 | dout(15) << __func__ << ": queueing" << dendl; | |
443 | ||
f67539c2 | 444 | osd->queue_scrub_after_repair(this, Scrub::scrub_prio_t::high_priority); |
9f95a23c | 445 | } |
7c673cae | 446 | |
9f95a23c TL |
447 | unsigned PG::get_scrub_priority() |
448 | { | |
449 | // a higher value -> a higher priority | |
f67539c2 TL |
450 | int64_t pool_scrub_priority = |
451 | pool.info.opts.value_or(pool_opts_t::SCRUB_PRIORITY, (int64_t)0); | |
9f95a23c TL |
452 | return pool_scrub_priority > 0 ? pool_scrub_priority : cct->_conf->osd_scrub_priority; |
453 | } | |
7c673cae | 454 | |
9f95a23c TL |
455 | Context *PG::finish_recovery() |
456 | { | |
457 | dout(10) << "finish_recovery" << dendl; | |
458 | ceph_assert(info.last_complete == info.last_update); | |
7c673cae | 459 | |
9f95a23c | 460 | clear_recovery_state(); |
92f5a8d4 | 461 | |
9f95a23c TL |
462 | /* |
463 | * sync all this before purging strays. but don't block! | |
464 | */ | |
465 | finish_sync_event = new C_PG_FinishRecovery(this); | |
466 | return finish_sync_event; | |
7c673cae FG |
467 | } |
468 | ||
f67539c2 | 469 | void PG::_finish_recovery(Context* c) |
7c673cae | 470 | { |
f67539c2 TL |
471 | dout(15) << __func__ << " finish_sync_event? " << finish_sync_event << " clean? " |
472 | << is_clean() << dendl; | |
473 | ||
9f95a23c TL |
474 | std::scoped_lock locker{*this}; |
475 | if (recovery_state.is_deleting() || !is_clean()) { | |
476 | dout(10) << __func__ << " raced with delete or repair" << dendl; | |
477 | return; | |
7c673cae | 478 | } |
9f95a23c TL |
479 | // When recovery is initiated by a repair, that flag is left on |
480 | state_clear(PG_STATE_REPAIR); | |
481 | if (c == finish_sync_event) { | |
f67539c2 | 482 | dout(15) << __func__ << " scrub_after_recovery? " << scrub_after_recovery << dendl; |
9f95a23c TL |
483 | finish_sync_event = 0; |
484 | recovery_state.purge_strays(); | |
7c673cae | 485 | |
9f95a23c TL |
486 | publish_stats_to_osd(); |
487 | ||
488 | if (scrub_after_recovery) { | |
489 | dout(10) << "_finish_recovery requeueing for scrub" << dendl; | |
490 | scrub_after_recovery = false; | |
f67539c2 | 491 | queue_scrub_after_repair(); |
7c673cae | 492 | } |
9f95a23c TL |
493 | } else { |
494 | dout(10) << "_finish_recovery -- stale" << dendl; | |
7c673cae | 495 | } |
9f95a23c | 496 | } |
7c673cae | 497 | |
9f95a23c TL |
498 | void PG::start_recovery_op(const hobject_t& soid) |
499 | { | |
500 | dout(10) << "start_recovery_op " << soid | |
501 | #ifdef DEBUG_RECOVERY_OIDS | |
502 | << " (" << recovering_oids << ")" | |
503 | #endif | |
504 | << dendl; | |
505 | ceph_assert(recovery_ops_active >= 0); | |
506 | recovery_ops_active++; | |
507 | #ifdef DEBUG_RECOVERY_OIDS | |
508 | recovering_oids.insert(soid); | |
509 | #endif | |
510 | osd->start_recovery_op(this, soid); | |
7c673cae FG |
511 | } |
512 | ||
9f95a23c | 513 | void PG::finish_recovery_op(const hobject_t& soid, bool dequeue) |
7c673cae | 514 | { |
9f95a23c TL |
515 | dout(10) << "finish_recovery_op " << soid |
516 | #ifdef DEBUG_RECOVERY_OIDS | |
f67539c2 | 517 | << " (" << recovering_oids << ")" |
9f95a23c TL |
518 | #endif |
519 | << dendl; | |
520 | ceph_assert(recovery_ops_active > 0); | |
521 | recovery_ops_active--; | |
522 | #ifdef DEBUG_RECOVERY_OIDS | |
523 | ceph_assert(recovering_oids.count(soid)); | |
524 | recovering_oids.erase(recovering_oids.find(soid)); | |
525 | #endif | |
526 | osd->finish_recovery_op(this, soid, dequeue); | |
7c673cae | 527 | |
9f95a23c TL |
528 | if (!dequeue) { |
529 | queue_recovery(); | |
7c673cae | 530 | } |
7c673cae FG |
531 | } |
532 | ||
9f95a23c TL |
533 | void PG::split_into(pg_t child_pgid, PG *child, unsigned split_bits) |
534 | { | |
535 | recovery_state.split_into(child_pgid, &child->recovery_state, split_bits); | |
7c673cae | 536 | |
9f95a23c | 537 | child->update_snap_mapper_bits(split_bits); |
7c673cae | 538 | |
9f95a23c TL |
539 | child->snap_trimq = snap_trimq; |
540 | child->snap_trimq_repeat = snap_trimq_repeat; | |
541 | ||
542 | _split_into(child_pgid, child, split_bits); | |
543 | ||
544 | // release all backoffs for simplicity | |
545 | release_backoffs(hobject_t(), hobject_t::get_max()); | |
7c673cae FG |
546 | } |
547 | ||
9f95a23c | 548 | void PG::start_split_stats(const set<spg_t>& childpgs, vector<object_stat_sum_t> *out) |
7c673cae | 549 | { |
9f95a23c | 550 | recovery_state.start_split_stats(childpgs, out); |
7c673cae FG |
551 | } |
552 | ||
9f95a23c TL |
553 | void PG::finish_split_stats(const object_stat_sum_t& stats, ObjectStore::Transaction &t) |
554 | { | |
555 | recovery_state.finish_split_stats(stats, t); | |
556 | } | |
557 | ||
558 | void PG::merge_from(map<spg_t,PGRef>& sources, PeeringCtx &rctx, | |
559 | unsigned split_bits, | |
560 | const pg_merge_meta_t& last_pg_merge_meta) | |
561 | { | |
562 | dout(10) << __func__ << " from " << sources << " split_bits " << split_bits | |
563 | << dendl; | |
564 | map<spg_t, PeeringState*> source_ps; | |
565 | for (auto &&source : sources) { | |
566 | source_ps.emplace(source.first, &source.second->recovery_state); | |
567 | } | |
568 | recovery_state.merge_from(source_ps, rctx, split_bits, last_pg_merge_meta); | |
569 | ||
570 | for (auto& i : sources) { | |
571 | auto& source = i.second; | |
572 | // wipe out source's pgmeta | |
573 | rctx.transaction.remove(source->coll, source->pgmeta_oid); | |
574 | ||
575 | // merge (and destroy source collection) | |
576 | rctx.transaction.merge_collection(source->coll, coll, split_bits); | |
7c673cae FG |
577 | } |
578 | ||
9f95a23c TL |
579 | // merge_collection does this, but maybe all of our sources were missing. |
580 | rctx.transaction.collection_set_bits(coll, split_bits); | |
581 | ||
582 | snap_mapper.update_bits(split_bits); | |
7c673cae FG |
583 | } |
584 | ||
9f95a23c | 585 | void PG::add_backoff(const ceph::ref_t<Session>& s, const hobject_t& begin, const hobject_t& end) |
7c673cae | 586 | { |
9f95a23c TL |
587 | auto con = s->con; |
588 | if (!con) // OSD::ms_handle_reset clears s->con without a lock | |
589 | return; | |
590 | auto b = s->have_backoff(info.pgid, begin); | |
591 | if (b) { | |
592 | derr << __func__ << " already have backoff for " << s << " begin " << begin | |
593 | << " " << *b << dendl; | |
594 | ceph_abort(); | |
7c673cae | 595 | } |
9f95a23c TL |
596 | std::lock_guard l(backoff_lock); |
597 | b = ceph::make_ref<Backoff>(info.pgid, this, s, ++s->backoff_seq, begin, end); | |
598 | backoffs[begin].insert(b); | |
599 | s->add_backoff(b); | |
600 | dout(10) << __func__ << " session " << s << " added " << *b << dendl; | |
601 | con->send_message( | |
602 | new MOSDBackoff( | |
603 | info.pgid, | |
604 | get_osdmap_epoch(), | |
605 | CEPH_OSD_BACKOFF_OP_BLOCK, | |
606 | b->id, | |
607 | begin, | |
608 | end)); | |
7c673cae FG |
609 | } |
610 | ||
9f95a23c | 611 | void PG::release_backoffs(const hobject_t& begin, const hobject_t& end) |
7c673cae | 612 | { |
9f95a23c TL |
613 | dout(10) << __func__ << " [" << begin << "," << end << ")" << dendl; |
614 | vector<ceph::ref_t<Backoff>> bv; | |
615 | { | |
616 | std::lock_guard l(backoff_lock); | |
617 | auto p = backoffs.lower_bound(begin); | |
618 | while (p != backoffs.end()) { | |
619 | int r = cmp(p->first, end); | |
620 | dout(20) << __func__ << " ? " << r << " " << p->first | |
621 | << " " << p->second << dendl; | |
622 | // note: must still examine begin=end=p->first case | |
623 | if (r > 0 || (r == 0 && begin < end)) { | |
624 | break; | |
625 | } | |
626 | dout(20) << __func__ << " checking " << p->first | |
627 | << " " << p->second << dendl; | |
628 | auto q = p->second.begin(); | |
629 | while (q != p->second.end()) { | |
630 | dout(20) << __func__ << " checking " << *q << dendl; | |
20effc67 TL |
631 | int rr = cmp((*q)->begin, begin); |
632 | if (rr == 0 || (rr > 0 && (*q)->end < end)) { | |
9f95a23c TL |
633 | bv.push_back(*q); |
634 | q = p->second.erase(q); | |
635 | } else { | |
636 | ++q; | |
637 | } | |
638 | } | |
639 | if (p->second.empty()) { | |
640 | p = backoffs.erase(p); | |
641 | } else { | |
642 | ++p; | |
643 | } | |
7c673cae FG |
644 | } |
645 | } | |
9f95a23c TL |
646 | for (auto b : bv) { |
647 | std::lock_guard l(b->lock); | |
648 | dout(10) << __func__ << " " << *b << dendl; | |
649 | if (b->session) { | |
650 | ceph_assert(b->pg == this); | |
651 | ConnectionRef con = b->session->con; | |
652 | if (con) { // OSD::ms_handle_reset clears s->con without a lock | |
653 | con->send_message( | |
654 | new MOSDBackoff( | |
655 | info.pgid, | |
656 | get_osdmap_epoch(), | |
657 | CEPH_OSD_BACKOFF_OP_UNBLOCK, | |
658 | b->id, | |
659 | b->begin, | |
660 | b->end)); | |
7c673cae | 661 | } |
9f95a23c TL |
662 | if (b->is_new()) { |
663 | b->state = Backoff::STATE_DELETING; | |
7c673cae | 664 | } else { |
9f95a23c TL |
665 | b->session->rm_backoff(b); |
666 | b->session.reset(); | |
7c673cae | 667 | } |
9f95a23c TL |
668 | b->pg.reset(); |
669 | } | |
7c673cae | 670 | } |
7c673cae FG |
671 | } |
672 | ||
9f95a23c | 673 | void PG::clear_backoffs() |
7c673cae | 674 | { |
9f95a23c TL |
675 | dout(10) << __func__ << " " << dendl; |
676 | map<hobject_t,set<ceph::ref_t<Backoff>>> ls; | |
677 | { | |
678 | std::lock_guard l(backoff_lock); | |
679 | ls.swap(backoffs); | |
680 | } | |
681 | for (auto& p : ls) { | |
682 | for (auto& b : p.second) { | |
683 | std::lock_guard l(b->lock); | |
684 | dout(10) << __func__ << " " << *b << dendl; | |
685 | if (b->session) { | |
686 | ceph_assert(b->pg == this); | |
687 | if (b->is_new()) { | |
688 | b->state = Backoff::STATE_DELETING; | |
689 | } else { | |
690 | b->session->rm_backoff(b); | |
691 | b->session.reset(); | |
692 | } | |
693 | b->pg.reset(); | |
694 | } | |
695 | } | |
696 | } | |
697 | } | |
7c673cae | 698 | |
9f95a23c TL |
699 | // called by Session::clear_backoffs() |
700 | void PG::rm_backoff(const ceph::ref_t<Backoff>& b) | |
701 | { | |
702 | dout(10) << __func__ << " " << *b << dendl; | |
703 | std::lock_guard l(backoff_lock); | |
704 | ceph_assert(ceph_mutex_is_locked_by_me(b->lock)); | |
705 | ceph_assert(b->pg == this); | |
706 | auto p = backoffs.find(b->begin); | |
707 | // may race with release_backoffs() | |
708 | if (p != backoffs.end()) { | |
709 | auto q = p->second.find(b); | |
710 | if (q != p->second.end()) { | |
711 | p->second.erase(q); | |
712 | if (p->second.empty()) { | |
713 | backoffs.erase(p); | |
714 | } | |
715 | } | |
716 | } | |
717 | } | |
7c673cae | 718 | |
f67539c2 | 719 | void PG::clear_recovery_state() |
9f95a23c TL |
720 | { |
721 | dout(10) << "clear_recovery_state" << dendl; | |
7c673cae | 722 | |
9f95a23c | 723 | finish_sync_event = 0; |
7c673cae | 724 | |
9f95a23c TL |
725 | hobject_t soid; |
726 | while (recovery_ops_active > 0) { | |
727 | #ifdef DEBUG_RECOVERY_OIDS | |
728 | soid = *recovering_oids.begin(); | |
729 | #endif | |
730 | finish_recovery_op(soid, true); | |
731 | } | |
7c673cae | 732 | |
9f95a23c TL |
733 | backfill_info.clear(); |
734 | peer_backfill_info.clear(); | |
735 | waiting_on_backfill.clear(); | |
736 | _clear_recovery_state(); // pg impl specific hook | |
737 | } | |
7c673cae | 738 | |
9f95a23c TL |
739 | void PG::cancel_recovery() |
740 | { | |
741 | dout(10) << "cancel_recovery" << dendl; | |
742 | clear_recovery_state(); | |
7c673cae FG |
743 | } |
744 | ||
9f95a23c TL |
745 | void PG::set_probe_targets(const set<pg_shard_t> &probe_set) |
746 | { | |
747 | std::lock_guard l(heartbeat_peer_lock); | |
748 | probe_targets.clear(); | |
749 | for (set<pg_shard_t>::iterator i = probe_set.begin(); | |
750 | i != probe_set.end(); | |
7c673cae | 751 | ++i) { |
9f95a23c | 752 | probe_targets.insert(i->osd); |
7c673cae | 753 | } |
7c673cae FG |
754 | } |
755 | ||
9f95a23c | 756 | void PG::send_cluster_message( |
f67539c2 | 757 | int target, MessageRef m, |
20effc67 | 758 | epoch_t epoch, bool share_map_update) |
9f95a23c TL |
759 | { |
760 | ConnectionRef con = osd->get_con_osd_cluster( | |
761 | target, get_osdmap_epoch()); | |
762 | if (!con) { | |
11fdf7f2 TL |
763 | return; |
764 | } | |
7c673cae | 765 | |
9f95a23c TL |
766 | if (share_map_update) { |
767 | osd->maybe_share_map(con.get(), get_osdmap()); | |
7c673cae | 768 | } |
9f95a23c TL |
769 | osd->send_message_osd_cluster(m, con.get()); |
770 | } | |
7c673cae | 771 | |
9f95a23c TL |
772 | void PG::clear_probe_targets() |
773 | { | |
774 | std::lock_guard l(heartbeat_peer_lock); | |
775 | probe_targets.clear(); | |
776 | } | |
7c673cae | 777 | |
9f95a23c TL |
778 | void PG::update_heartbeat_peers(set<int> new_peers) |
779 | { | |
780 | bool need_update = false; | |
781 | heartbeat_peer_lock.lock(); | |
782 | if (new_peers == heartbeat_peers) { | |
783 | dout(10) << "update_heartbeat_peers " << heartbeat_peers << " unchanged" << dendl; | |
784 | } else { | |
785 | dout(10) << "update_heartbeat_peers " << heartbeat_peers << " -> " << new_peers << dendl; | |
786 | heartbeat_peers.swap(new_peers); | |
787 | need_update = true; | |
11fdf7f2 | 788 | } |
9f95a23c | 789 | heartbeat_peer_lock.unlock(); |
11fdf7f2 | 790 | |
9f95a23c TL |
791 | if (need_update) |
792 | osd->need_heartbeat_peer_update(); | |
793 | } | |
11fdf7f2 | 794 | |
11fdf7f2 | 795 | |
9f95a23c TL |
796 | bool PG::check_in_progress_op( |
797 | const osd_reqid_t &r, | |
798 | eversion_t *version, | |
799 | version_t *user_version, | |
800 | int *return_code, | |
801 | vector<pg_log_op_return_item_t> *op_returns | |
802 | ) const | |
803 | { | |
804 | return ( | |
805 | projected_log.get_request(r, version, user_version, return_code, | |
806 | op_returns) || | |
807 | recovery_state.get_pg_log().get_log().get_request( | |
808 | r, version, user_version, return_code, op_returns)); | |
11fdf7f2 TL |
809 | } |
810 | ||
9f95a23c | 811 | void PG::publish_stats_to_osd() |
11fdf7f2 | 812 | { |
9f95a23c TL |
813 | if (!is_primary()) |
814 | return; | |
11fdf7f2 | 815 | |
20effc67 TL |
816 | ceph_assert(m_scrubber); |
817 | recovery_state.update_stats_wo_resched( | |
818 | [scrubber = m_scrubber.get()](pg_history_t& hist, | |
819 | pg_stat_t& info) mutable -> void { | |
820 | info.scrub_sched_status = scrubber->get_schedule(); | |
821 | }); | |
822 | ||
9f95a23c | 823 | std::lock_guard l{pg_stats_publish_lock}; |
20effc67 TL |
824 | auto stats = |
825 | recovery_state.prepare_stats_for_publish(pg_stats_publish, unstable_stats); | |
9f95a23c | 826 | if (stats) { |
20effc67 | 827 | pg_stats_publish = std::move(stats); |
11fdf7f2 | 828 | } |
11fdf7f2 TL |
829 | } |
830 | ||
9f95a23c | 831 | unsigned PG::get_target_pg_log_entries() const |
11fdf7f2 | 832 | { |
9f95a23c | 833 | return osd->get_target_pg_log_entries(); |
11fdf7f2 TL |
834 | } |
835 | ||
9f95a23c | 836 | void PG::clear_publish_stats() |
11fdf7f2 | 837 | { |
9f95a23c TL |
838 | dout(15) << "clear_stats" << dendl; |
839 | std::lock_guard l{pg_stats_publish_lock}; | |
20effc67 | 840 | pg_stats_publish.reset(); |
7c673cae FG |
841 | } |
842 | ||
843 | /** | |
9f95a23c | 844 | * initialize a newly instantiated pg |
7c673cae | 845 | * |
9f95a23c TL |
846 | * Initialize PG state, as when a PG is initially created, or when it |
847 | * is first instantiated on the current node. | |
7c673cae | 848 | * |
9f95a23c TL |
849 | * @param role our role/rank |
850 | * @param newup up set | |
851 | * @param newacting acting set | |
852 | * @param history pg history | |
853 | * @param pi past_intervals | |
854 | * @param backfill true if info should be marked as backfill | |
855 | * @param t transaction to write out our new state in | |
7c673cae | 856 | */ |
9f95a23c TL |
857 | void PG::init( |
858 | int role, | |
859 | const vector<int>& newup, int new_up_primary, | |
860 | const vector<int>& newacting, int new_acting_primary, | |
861 | const pg_history_t& history, | |
862 | const PastIntervals& pi, | |
9f95a23c TL |
863 | ObjectStore::Transaction &t) |
864 | { | |
865 | recovery_state.init( | |
866 | role, newup, new_up_primary, newacting, | |
20effc67 | 867 | new_acting_primary, history, pi, t); |
9f95a23c | 868 | } |
7c673cae | 869 | |
9f95a23c TL |
870 | void PG::shutdown() |
871 | { | |
872 | ch->flush(); | |
873 | std::scoped_lock l{*this}; | |
874 | recovery_state.shutdown(); | |
875 | on_shutdown(); | |
876 | } | |
7c673cae | 877 | |
9f95a23c TL |
878 | #pragma GCC diagnostic ignored "-Wpragmas" |
879 | #pragma GCC diagnostic push | |
880 | #pragma GCC diagnostic ignored "-Wdeprecated-declarations" | |
7c673cae | 881 | |
9f95a23c TL |
882 | void PG::upgrade(ObjectStore *store) |
883 | { | |
884 | dout(0) << __func__ << " " << info_struct_v << " -> " << pg_latest_struct_v | |
885 | << dendl; | |
886 | ceph_assert(info_struct_v <= 10); | |
887 | ObjectStore::Transaction t; | |
7c673cae | 888 | |
9f95a23c | 889 | // <do upgrade steps here> |
7c673cae | 890 | |
9f95a23c TL |
891 | // finished upgrade! |
892 | ceph_assert(info_struct_v == 10); | |
7c673cae | 893 | |
9f95a23c TL |
894 | // update infover_key |
895 | if (info_struct_v < pg_latest_struct_v) { | |
896 | map<string,bufferlist> v; | |
897 | __u8 ver = pg_latest_struct_v; | |
898 | encode(ver, v[string(infover_key)]); | |
899 | t.omap_setkeys(coll, pgmeta_oid, v); | |
7c673cae | 900 | } |
7c673cae | 901 | |
9f95a23c | 902 | recovery_state.force_write_state(t); |
7c673cae | 903 | |
9f95a23c TL |
904 | ObjectStore::CollectionHandle ch = store->open_collection(coll); |
905 | int r = store->queue_transaction(ch, std::move(t)); | |
906 | if (r != 0) { | |
907 | derr << __func__ << ": queue_transaction returned " | |
908 | << cpp_strerror(r) << dendl; | |
909 | ceph_abort(); | |
7c673cae | 910 | } |
9f95a23c | 911 | ceph_assert(r == 0); |
7c673cae | 912 | |
9f95a23c TL |
913 | C_SaferCond waiter; |
914 | if (!ch->flush_commit(&waiter)) { | |
915 | waiter.wait(); | |
7c673cae | 916 | } |
9f95a23c | 917 | } |
7c673cae | 918 | |
9f95a23c TL |
919 | #pragma GCC diagnostic pop |
920 | #pragma GCC diagnostic warning "-Wpragmas" | |
7c673cae | 921 | |
9f95a23c TL |
922 | void PG::prepare_write( |
923 | pg_info_t &info, | |
924 | pg_info_t &last_written_info, | |
925 | PastIntervals &past_intervals, | |
926 | PGLog &pglog, | |
927 | bool dirty_info, | |
928 | bool dirty_big_info, | |
929 | bool need_write_epoch, | |
930 | ObjectStore::Transaction &t) | |
931 | { | |
932 | info.stats.stats.add(unstable_stats); | |
933 | unstable_stats.clear(); | |
934 | map<string,bufferlist> km; | |
935 | string key_to_remove; | |
936 | if (dirty_big_info || dirty_info) { | |
937 | int ret = prepare_info_keymap( | |
938 | cct, | |
939 | &km, | |
940 | &key_to_remove, | |
11fdf7f2 | 941 | get_osdmap_epoch(), |
9f95a23c TL |
942 | info, |
943 | last_written_info, | |
944 | past_intervals, | |
945 | dirty_big_info, | |
946 | need_write_epoch, | |
947 | cct->_conf->osd_fast_info, | |
948 | osd->logger, | |
949 | this); | |
950 | ceph_assert(ret == 0); | |
7c673cae | 951 | } |
9f95a23c TL |
952 | pglog.write_log_and_missing( |
953 | t, &km, coll, pgmeta_oid, pool.info.require_rollback()); | |
954 | if (!km.empty()) | |
955 | t.omap_setkeys(coll, pgmeta_oid, km); | |
956 | if (!key_to_remove.empty()) | |
957 | t.omap_rmkey(coll, pgmeta_oid, key_to_remove); | |
958 | } | |
7c673cae | 959 | |
9f95a23c TL |
960 | #pragma GCC diagnostic ignored "-Wpragmas" |
961 | #pragma GCC diagnostic push | |
962 | #pragma GCC diagnostic ignored "-Wdeprecated-declarations" | |
7c673cae | 963 | |
9f95a23c TL |
964 | bool PG::_has_removal_flag(ObjectStore *store, |
965 | spg_t pgid) | |
966 | { | |
967 | coll_t coll(pgid); | |
968 | ghobject_t pgmeta_oid(pgid.make_pgmeta_oid()); | |
7c673cae | 969 | |
9f95a23c TL |
970 | // first try new way |
971 | set<string> keys; | |
972 | keys.insert("_remove"); | |
973 | map<string,bufferlist> values; | |
974 | auto ch = store->open_collection(coll); | |
975 | ceph_assert(ch); | |
976 | if (store->omap_get_values(ch, pgmeta_oid, keys, &values) == 0 && | |
977 | values.size() == 1) | |
978 | return true; | |
7c673cae | 979 | |
9f95a23c TL |
980 | return false; |
981 | } | |
c07f9fc5 | 982 | |
9f95a23c TL |
983 | int PG::peek_map_epoch(ObjectStore *store, |
984 | spg_t pgid, | |
985 | epoch_t *pepoch) | |
986 | { | |
987 | coll_t coll(pgid); | |
9f95a23c TL |
988 | ghobject_t pgmeta_oid(pgid.make_pgmeta_oid()); |
989 | epoch_t cur_epoch = 0; | |
7c673cae | 990 | |
9f95a23c TL |
991 | // validate collection name |
992 | ceph_assert(coll.is_pg()); | |
7c673cae | 993 | |
9f95a23c TL |
994 | // try for v8 |
995 | set<string> keys; | |
996 | keys.insert(string(infover_key)); | |
997 | keys.insert(string(epoch_key)); | |
998 | map<string,bufferlist> values; | |
999 | auto ch = store->open_collection(coll); | |
1000 | ceph_assert(ch); | |
1001 | int r = store->omap_get_values(ch, pgmeta_oid, keys, &values); | |
1002 | if (r == 0) { | |
1003 | ceph_assert(values.size() == 2); | |
7c673cae | 1004 | |
9f95a23c TL |
1005 | // sanity check version |
1006 | auto bp = values[string(infover_key)].cbegin(); | |
1007 | __u8 struct_v = 0; | |
1008 | decode(struct_v, bp); | |
1009 | ceph_assert(struct_v >= 8); | |
91327a77 | 1010 | |
9f95a23c TL |
1011 | // get epoch |
1012 | bp = values[string(epoch_key)].begin(); | |
1013 | decode(cur_epoch, bp); | |
1014 | } else { | |
1015 | // probably bug 10617; see OSD::load_pgs() | |
1016 | return -1; | |
1017 | } | |
7c673cae | 1018 | |
9f95a23c TL |
1019 | *pepoch = cur_epoch; |
1020 | return 0; | |
1021 | } | |
7c673cae | 1022 | |
9f95a23c TL |
1023 | #pragma GCC diagnostic pop |
1024 | #pragma GCC diagnostic warning "-Wpragmas" | |
7c673cae | 1025 | |
9f95a23c TL |
1026 | bool PG::check_log_for_corruption(ObjectStore *store) |
1027 | { | |
1028 | /// TODO: this method needs to work with the omap log | |
1029 | return true; | |
1030 | } | |
7c673cae | 1031 | |
9f95a23c TL |
1032 | //! Get the name we're going to save our corrupt page log as |
1033 | std::string PG::get_corrupt_pg_log_name() const | |
1034 | { | |
1035 | const int MAX_BUF = 512; | |
1036 | char buf[MAX_BUF]; | |
1037 | struct tm tm_buf; | |
1038 | time_t my_time(time(NULL)); | |
1039 | const struct tm *t = localtime_r(&my_time, &tm_buf); | |
1040 | int ret = strftime(buf, sizeof(buf), "corrupt_log_%Y-%m-%d_%k:%M_", t); | |
1041 | if (ret == 0) { | |
1042 | dout(0) << "strftime failed" << dendl; | |
1043 | return "corrupt_log_unknown_time"; | |
7c673cae | 1044 | } |
9f95a23c TL |
1045 | string out(buf); |
1046 | out += stringify(info.pgid); | |
1047 | return out; | |
7c673cae FG |
1048 | } |
1049 | ||
9f95a23c TL |
1050 | int PG::read_info( |
1051 | ObjectStore *store, spg_t pgid, const coll_t &coll, | |
1052 | pg_info_t &info, PastIntervals &past_intervals, | |
1053 | __u8 &struct_v) | |
7c673cae | 1054 | { |
9f95a23c TL |
1055 | set<string> keys; |
1056 | keys.insert(string(infover_key)); | |
1057 | keys.insert(string(info_key)); | |
1058 | keys.insert(string(biginfo_key)); | |
1059 | keys.insert(string(fastinfo_key)); | |
1060 | ghobject_t pgmeta_oid(pgid.make_pgmeta_oid()); | |
1061 | map<string,bufferlist> values; | |
1062 | auto ch = store->open_collection(coll); | |
1063 | ceph_assert(ch); | |
1064 | int r = store->omap_get_values(ch, pgmeta_oid, keys, &values); | |
1065 | ceph_assert(r == 0); | |
1066 | ceph_assert(values.size() == 3 || | |
1067 | values.size() == 4); | |
7c673cae | 1068 | |
9f95a23c TL |
1069 | auto p = values[string(infover_key)].cbegin(); |
1070 | decode(struct_v, p); | |
1071 | ceph_assert(struct_v >= 10); | |
7c673cae | 1072 | |
9f95a23c TL |
1073 | p = values[string(info_key)].begin(); |
1074 | decode(info, p); | |
7c673cae | 1075 | |
9f95a23c TL |
1076 | p = values[string(biginfo_key)].begin(); |
1077 | decode(past_intervals, p); | |
1078 | decode(info.purged_snaps, p); | |
7c673cae | 1079 | |
9f95a23c TL |
1080 | p = values[string(fastinfo_key)].begin(); |
1081 | if (!p.end()) { | |
1082 | pg_fast_info_t fast; | |
1083 | decode(fast, p); | |
1084 | fast.try_apply_to(&info); | |
1085 | } | |
1086 | return 0; | |
7c673cae FG |
1087 | } |
1088 | ||
9f95a23c TL |
1089 | void PG::read_state(ObjectStore *store) |
1090 | { | |
1091 | PastIntervals past_intervals_from_disk; | |
1092 | pg_info_t info_from_disk; | |
1093 | int r = read_info( | |
1094 | store, | |
1095 | pg_id, | |
1096 | coll, | |
1097 | info_from_disk, | |
1098 | past_intervals_from_disk, | |
1099 | info_struct_v); | |
1100 | ceph_assert(r >= 0); | |
7c673cae | 1101 | |
9f95a23c TL |
1102 | if (info_struct_v < pg_compat_struct_v) { |
1103 | derr << "PG needs upgrade, but on-disk data is too old; upgrade to" | |
1104 | << " an older version first." << dendl; | |
1105 | ceph_abort_msg("PG too old to upgrade"); | |
1106 | } | |
7c673cae | 1107 | |
9f95a23c TL |
1108 | recovery_state.init_from_disk_state( |
1109 | std::move(info_from_disk), | |
1110 | std::move(past_intervals_from_disk), | |
1111 | [this, store] (PGLog &pglog) { | |
1112 | ostringstream oss; | |
1113 | pglog.read_log_and_missing( | |
1114 | store, | |
1115 | ch, | |
1116 | pgmeta_oid, | |
1117 | info, | |
1118 | oss, | |
1119 | cct->_conf->osd_ignore_stale_divergent_priors, | |
1120 | cct->_conf->osd_debug_verify_missing_on_start); | |
1121 | ||
1122 | if (oss.tellp()) | |
1123 | osd->clog->error() << oss.str(); | |
1124 | return 0; | |
1125 | }); | |
7c673cae | 1126 | |
9f95a23c TL |
1127 | if (info_struct_v < pg_latest_struct_v) { |
1128 | upgrade(store); | |
7c673cae FG |
1129 | } |
1130 | ||
9f95a23c TL |
1131 | // initialize current mapping |
1132 | { | |
1133 | int primary, up_primary; | |
1134 | vector<int> acting, up; | |
1135 | get_osdmap()->pg_to_up_acting_osds( | |
1136 | pg_id.pgid, &up, &up_primary, &acting, &primary); | |
1137 | recovery_state.init_primary_up_acting( | |
1138 | up, | |
1139 | acting, | |
1140 | up_primary, | |
1141 | primary); | |
1142 | recovery_state.set_role(OSDMap::calc_pg_role(pg_whoami, acting)); | |
1143 | } | |
1144 | ||
1145 | // init pool options | |
1146 | store->set_collection_opts(ch, pool.info.opts); | |
7c673cae | 1147 | |
20effc67 | 1148 | PeeringCtx rctx; |
9f95a23c TL |
1149 | handle_initialize(rctx); |
1150 | // note: we don't activate here because we know the OSD will advance maps | |
1151 | // during boot. | |
1152 | write_if_dirty(rctx.transaction); | |
1153 | store->queue_transaction(ch, std::move(rctx.transaction)); | |
7c673cae FG |
1154 | } |
1155 | ||
9f95a23c TL |
1156 | void PG::update_snap_map( |
1157 | const vector<pg_log_entry_t> &log_entries, | |
1158 | ObjectStore::Transaction &t) | |
7c673cae | 1159 | { |
f67539c2 | 1160 | for (auto i = log_entries.cbegin(); i != log_entries.cend(); ++i) { |
9f95a23c TL |
1161 | OSDriver::OSTransaction _t(osdriver.get_transaction(&t)); |
1162 | if (i->soid.snap < CEPH_MAXSNAP) { | |
1163 | if (i->is_delete()) { | |
1164 | int r = snap_mapper.remove_oid( | |
1165 | i->soid, | |
1166 | &_t); | |
f67539c2 | 1167 | if (r) |
9f95a23c TL |
1168 | derr << __func__ << " remove_oid " << i->soid << " failed with " << r << dendl; |
1169 | // On removal tolerate missing key corruption | |
1170 | ceph_assert(r == 0 || r == -ENOENT); | |
1171 | } else if (i->is_update()) { | |
1172 | ceph_assert(i->snaps.length() > 0); | |
1173 | vector<snapid_t> snaps; | |
1174 | bufferlist snapbl = i->snaps; | |
1175 | auto p = snapbl.cbegin(); | |
1176 | try { | |
1177 | decode(snaps, p); | |
1178 | } catch (...) { | |
1179 | derr << __func__ << " decode snaps failure on " << *i << dendl; | |
1180 | snaps.clear(); | |
1181 | } | |
1182 | set<snapid_t> _snaps(snaps.begin(), snaps.end()); | |
7c673cae | 1183 | |
9f95a23c TL |
1184 | if (i->is_clone() || i->is_promote()) { |
1185 | snap_mapper.add_oid( | |
1186 | i->soid, | |
1187 | _snaps, | |
1188 | &_t); | |
1189 | } else if (i->is_modify()) { | |
1190 | int r = snap_mapper.update_snaps( | |
1191 | i->soid, | |
1192 | _snaps, | |
1193 | 0, | |
1194 | &_t); | |
1195 | ceph_assert(r == 0); | |
11fdf7f2 | 1196 | } else { |
9f95a23c | 1197 | ceph_assert(i->is_clean()); |
11fdf7f2 TL |
1198 | } |
1199 | } | |
11fdf7f2 TL |
1200 | } |
1201 | } | |
7c673cae FG |
1202 | } |
1203 | ||
9f95a23c TL |
1204 | /** |
1205 | * filter trimming|trimmed snaps out of snapcontext | |
1206 | */ | |
1207 | void PG::filter_snapc(vector<snapid_t> &snaps) | |
7c673cae | 1208 | { |
9f95a23c TL |
1209 | // nothing needs to trim, we can return immediately |
1210 | if (snap_trimq.empty() && info.purged_snaps.empty()) | |
1211 | return; | |
1212 | ||
1213 | bool filtering = false; | |
1214 | vector<snapid_t> newsnaps; | |
1215 | for (vector<snapid_t>::iterator p = snaps.begin(); | |
1216 | p != snaps.end(); | |
1217 | ++p) { | |
1218 | if (snap_trimq.contains(*p) || info.purged_snaps.contains(*p)) { | |
1219 | if (!filtering) { | |
1220 | // start building a new vector with what we've seen so far | |
1221 | dout(10) << "filter_snapc filtering " << snaps << dendl; | |
1222 | newsnaps.insert(newsnaps.begin(), snaps.begin(), p); | |
1223 | filtering = true; | |
1224 | } | |
1225 | dout(20) << "filter_snapc removing trimq|purged snap " << *p << dendl; | |
1226 | } else { | |
1227 | if (filtering) | |
1228 | newsnaps.push_back(*p); // continue building new vector | |
d2e6a577 | 1229 | } |
c07f9fc5 | 1230 | } |
9f95a23c TL |
1231 | if (filtering) { |
1232 | snaps.swap(newsnaps); | |
1233 | dout(10) << "filter_snapc result " << snaps << dendl; | |
a8e16298 | 1234 | } |
a8e16298 TL |
1235 | } |
1236 | ||
9f95a23c | 1237 | void PG::requeue_object_waiters(map<hobject_t, list<OpRequestRef>>& m) |
a8e16298 | 1238 | { |
f67539c2 | 1239 | for (auto it = m.begin(); it != m.end(); ++it) |
9f95a23c TL |
1240 | requeue_ops(it->second); |
1241 | m.clear(); | |
c07f9fc5 | 1242 | } |
7c673cae | 1243 | |
9f95a23c | 1244 | void PG::requeue_op(OpRequestRef op) |
c07f9fc5 | 1245 | { |
9f95a23c TL |
1246 | auto p = waiting_for_map.find(op->get_source()); |
1247 | if (p != waiting_for_map.end()) { | |
1248 | dout(20) << __func__ << " " << op << " (waiting_for_map " << p->first << ")" | |
1249 | << dendl; | |
1250 | p->second.push_front(op); | |
c07f9fc5 | 1251 | } else { |
9f95a23c TL |
1252 | dout(20) << __func__ << " " << op << dendl; |
1253 | osd->enqueue_front( | |
1254 | OpSchedulerItem( | |
1255 | unique_ptr<OpSchedulerItem::OpQueueable>(new PGOpItem(info.pgid, op)), | |
1256 | op->get_req()->get_cost(), | |
1257 | op->get_req()->get_priority(), | |
1258 | op->get_req()->get_recv_stamp(), | |
1259 | op->get_req()->get_source().num(), | |
1260 | get_osdmap_epoch())); | |
7c673cae | 1261 | } |
c07f9fc5 | 1262 | } |
7c673cae | 1263 | |
9f95a23c | 1264 | void PG::requeue_ops(list<OpRequestRef> &ls) |
c07f9fc5 | 1265 | { |
9f95a23c TL |
1266 | for (list<OpRequestRef>::reverse_iterator i = ls.rbegin(); |
1267 | i != ls.rend(); | |
1268 | ++i) { | |
1269 | requeue_op(*i); | |
c07f9fc5 | 1270 | } |
9f95a23c | 1271 | ls.clear(); |
7c673cae FG |
1272 | } |
1273 | ||
9f95a23c | 1274 | void PG::requeue_map_waiters() |
7c673cae | 1275 | { |
9f95a23c TL |
1276 | epoch_t epoch = get_osdmap_epoch(); |
1277 | auto p = waiting_for_map.begin(); | |
1278 | while (p != waiting_for_map.end()) { | |
1279 | if (epoch < p->second.front()->min_epoch) { | |
1280 | dout(20) << __func__ << " " << p->first << " front op " | |
1281 | << p->second.front() << " must still wait, doing nothing" | |
1282 | << dendl; | |
1283 | ++p; | |
1284 | } else { | |
1285 | dout(20) << __func__ << " " << p->first << " " << p->second << dendl; | |
1286 | for (auto q = p->second.rbegin(); q != p->second.rend(); ++q) { | |
1287 | auto req = *q; | |
1288 | osd->enqueue_front(OpSchedulerItem( | |
1289 | unique_ptr<OpSchedulerItem::OpQueueable>(new PGOpItem(info.pgid, req)), | |
1290 | req->get_req()->get_cost(), | |
1291 | req->get_req()->get_priority(), | |
1292 | req->get_req()->get_recv_stamp(), | |
1293 | req->get_req()->get_source().num(), | |
1294 | epoch)); | |
1295 | } | |
1296 | p = waiting_for_map.erase(p); | |
c07f9fc5 | 1297 | } |
9f95a23c TL |
1298 | } |
1299 | } | |
7c673cae | 1300 | |
f67539c2 TL |
1301 | bool PG::get_must_scrub() const |
1302 | { | |
1303 | dout(20) << __func__ << " must_scrub? " << (m_planned_scrub.must_scrub ? "true" : "false") << dendl; | |
1304 | return m_planned_scrub.must_scrub; | |
1305 | } | |
1306 | ||
1307 | unsigned int PG::scrub_requeue_priority(Scrub::scrub_prio_t with_priority) const | |
1308 | { | |
1309 | return m_scrubber->scrub_requeue_priority(with_priority); | |
1310 | } | |
1311 | ||
1312 | unsigned int PG::scrub_requeue_priority(Scrub::scrub_prio_t with_priority, unsigned int suggested_priority) const | |
1313 | { | |
1314 | return m_scrubber->scrub_requeue_priority(with_priority, suggested_priority); | |
1315 | } | |
7c673cae | 1316 | |
9f95a23c TL |
1317 | // ========================================================================================== |
1318 | // SCRUB | |
7c673cae | 1319 | |
9f95a23c | 1320 | /* |
f67539c2 TL |
1321 | * implementation note: |
1322 | * PG::sched_scrub() is called only once per a specific scrub session. | |
1323 | * That call commits us to the whatever choices are made (deep/shallow, etc'). | |
1324 | * Unless failing to start scrubbing, the 'planned scrub' flag-set is 'frozen' into | |
1325 | * PgScrubber's m_flags, then cleared. | |
9f95a23c | 1326 | */ |
20effc67 | 1327 | Scrub::schedule_result_t PG::sched_scrub() |
11fdf7f2 | 1328 | { |
f67539c2 TL |
1329 | dout(15) << __func__ << " pg(" << info.pgid |
1330 | << (is_active() ? ") <active>" : ") <not-active>") | |
1331 | << (is_clean() ? " <clean>" : " <not-clean>") << dendl; | |
9f95a23c | 1332 | ceph_assert(ceph_mutex_is_locked(_lock)); |
20effc67 | 1333 | ceph_assert(m_scrubber); |
f67539c2 | 1334 | |
20effc67 TL |
1335 | if (is_scrub_queued_or_active()) { |
1336 | return Scrub::schedule_result_t::already_started; | |
11fdf7f2 | 1337 | } |
11fdf7f2 | 1338 | |
20effc67 TL |
1339 | if (!is_primary() || !is_active() || !is_clean()) { |
1340 | return Scrub::schedule_result_t::bad_pg_state; | |
f67539c2 | 1341 | } |
7c673cae | 1342 | |
f67539c2 TL |
1343 | // analyse the combination of the requested scrub flags, the osd/pool configuration |
1344 | // and the PG status to determine whether we should scrub now, and what type of scrub | |
1345 | // should that be. | |
1346 | auto updated_flags = verify_scrub_mode(); | |
1347 | if (!updated_flags) { | |
1348 | // the stars do not align for starting a scrub for this PG at this time | |
1349 | // (due to configuration or priority issues) | |
1350 | // The reason was already reported by the callee. | |
1351 | dout(10) << __func__ << ": failed to initiate a scrub" << dendl; | |
20effc67 | 1352 | return Scrub::schedule_result_t::preconditions; |
f67539c2 | 1353 | } |
7c673cae | 1354 | |
f67539c2 TL |
1355 | // try to reserve the local OSD resources. If failing: no harm. We will |
1356 | // be retried by the OSD later on. | |
1357 | if (!m_scrubber->reserve_local()) { | |
1358 | dout(10) << __func__ << ": failed to reserve locally" << dendl; | |
20effc67 | 1359 | return Scrub::schedule_result_t::no_local_resources; |
f67539c2 | 1360 | } |
7c673cae | 1361 | |
f67539c2 TL |
1362 | // can commit to the updated flags now, as nothing will stop the scrub |
1363 | m_planned_scrub = *updated_flags; | |
9f95a23c | 1364 | |
f67539c2 TL |
1365 | // An interrupted recovery repair could leave this set. |
1366 | state_clear(PG_STATE_REPAIR); | |
9f95a23c | 1367 | |
f67539c2 TL |
1368 | // Pass control to the scrubber. It is the scrubber that handles the replicas' |
1369 | // resources reservations. | |
1370 | m_scrubber->set_op_parameters(m_planned_scrub); | |
9f95a23c | 1371 | |
f67539c2 | 1372 | dout(10) << __func__ << ": queueing" << dendl; |
f67539c2 | 1373 | osd->queue_for_scrub(this, Scrub::scrub_prio_t::low_priority); |
20effc67 | 1374 | return Scrub::schedule_result_t::scrub_initiated; |
f67539c2 TL |
1375 | } |
1376 | ||
1377 | double PG::next_deepscrub_interval() const | |
1378 | { | |
1379 | double deep_scrub_interval = | |
1380 | pool.info.opts.value_or(pool_opts_t::DEEP_SCRUB_INTERVAL, 0.0); | |
1381 | if (deep_scrub_interval <= 0.0) | |
1382 | deep_scrub_interval = cct->_conf->osd_deep_scrub_interval; | |
1383 | return info.history.last_deep_scrub_stamp + deep_scrub_interval; | |
1384 | } | |
1385 | ||
1386 | bool PG::is_time_for_deep(bool allow_deep_scrub, | |
1387 | bool allow_scrub, | |
1388 | bool has_deep_errors, | |
1389 | const requested_scrub_t& planned) const | |
1390 | { | |
20effc67 TL |
1391 | dout(10) << __func__ << ": need_auto?" << planned.need_auto << " allow_deep_scrub? " |
1392 | << allow_deep_scrub << dendl; | |
f67539c2 TL |
1393 | |
1394 | if (!allow_deep_scrub) | |
1395 | return false; | |
1396 | ||
1397 | if (planned.need_auto) { | |
1398 | dout(10) << __func__ << ": need repair after scrub errors" << dendl; | |
1399 | return true; | |
1400 | } | |
1401 | ||
20effc67 TL |
1402 | if (ceph_clock_now() >= next_deepscrub_interval()) { |
1403 | dout(20) << __func__ << ": now (" << ceph_clock_now() << ") >= time for deep (" | |
1404 | << next_deepscrub_interval() << ")" << dendl; | |
f67539c2 | 1405 | return true; |
20effc67 | 1406 | } |
f67539c2 TL |
1407 | |
1408 | if (has_deep_errors) { | |
1409 | osd->clog->info() << "osd." << osd->whoami << " pg " << info.pgid | |
1410 | << " Deep scrub errors, upgrading scrub to deep-scrub"; | |
1411 | return true; | |
9f95a23c TL |
1412 | } |
1413 | ||
f67539c2 TL |
1414 | // we only flip coins if 'allow_scrub' is asserted. Otherwise - as this function is |
1415 | // called often, we will probably be deep-scrubbing most of the time. | |
1416 | if (allow_scrub) { | |
1417 | bool deep_coin_flip = | |
1418 | (rand() % 100) < cct->_conf->osd_deep_scrub_randomize_ratio * 100; | |
1419 | ||
1420 | dout(15) << __func__ << ": time_for_deep=" << planned.time_for_deep | |
1421 | << " deep_coin_flip=" << deep_coin_flip << dendl; | |
1422 | ||
1423 | if (deep_coin_flip) | |
1424 | return true; | |
1425 | } | |
1426 | ||
1427 | return false; | |
1428 | } | |
1429 | ||
1430 | bool PG::verify_periodic_scrub_mode(bool allow_deep_scrub, | |
1431 | bool try_to_auto_repair, | |
1432 | bool allow_regular_scrub, | |
1433 | bool has_deep_errors, | |
1434 | requested_scrub_t& planned) const | |
1435 | ||
1436 | { | |
1437 | ceph_assert(!planned.must_deep_scrub && !planned.must_repair); | |
1438 | ||
1439 | if (!allow_deep_scrub && has_deep_errors) { | |
1440 | osd->clog->error() | |
1441 | << "osd." << osd->whoami << " pg " << info.pgid | |
1442 | << " Regular scrub skipped due to deep-scrub errors and nodeep-scrub set"; | |
9f95a23c | 1443 | return false; |
f67539c2 TL |
1444 | } |
1445 | ||
1446 | if (allow_deep_scrub) { | |
1447 | // Initial entry and scheduled scrubs without nodeep_scrub set get here | |
1448 | ||
1449 | planned.time_for_deep = | |
1450 | is_time_for_deep(allow_deep_scrub, allow_regular_scrub, has_deep_errors, planned); | |
1451 | ||
1452 | if (try_to_auto_repair) { | |
1453 | if (planned.time_for_deep) { | |
1454 | dout(20) << __func__ << ": auto repair with deep scrubbing" << dendl; | |
1455 | planned.auto_repair = true; | |
1456 | } else if (allow_regular_scrub) { | |
1457 | dout(20) << __func__ << ": auto repair with scrubbing, rescrub if errors found" | |
1458 | << dendl; | |
1459 | planned.deep_scrub_on_error = true; | |
9f95a23c | 1460 | } |
7c673cae | 1461 | } |
7c673cae | 1462 | } |
f67539c2 TL |
1463 | |
1464 | dout(20) << __func__ << " updated flags: " << planned | |
1465 | << " allow_regular_scrub: " << allow_regular_scrub << dendl; | |
1466 | ||
1467 | // NOSCRUB so skip regular scrubs | |
1468 | if (!allow_regular_scrub && !planned.time_for_deep) { | |
1469 | return false; | |
1470 | } | |
1471 | ||
9f95a23c | 1472 | return true; |
7c673cae FG |
1473 | } |
1474 | ||
f67539c2 | 1475 | std::optional<requested_scrub_t> PG::verify_scrub_mode() const |
7c673cae | 1476 | { |
20effc67 TL |
1477 | const bool allow_regular_scrub = |
1478 | !(get_osdmap()->test_flag(CEPH_OSDMAP_NOSCRUB) || | |
1479 | pool.info.has_flag(pg_pool_t::FLAG_NOSCRUB)); | |
1480 | const bool allow_deep_scrub = | |
1481 | allow_regular_scrub && | |
1482 | !(get_osdmap()->test_flag(CEPH_OSDMAP_NODEEP_SCRUB) || | |
1483 | pool.info.has_flag(pg_pool_t::FLAG_NODEEP_SCRUB)); | |
1484 | const bool has_deep_errors = (info.stats.stats.sum.num_deep_scrub_errors > 0); | |
1485 | const bool try_to_auto_repair = (cct->_conf->osd_scrub_auto_repair && | |
1486 | get_pgbackend()->auto_repair_supported()); | |
1487 | ||
1488 | dout(10) << __func__ << " pg: " << info.pgid | |
1489 | << " allow: " << allow_regular_scrub << "/" << allow_deep_scrub | |
1490 | << " deep errs: " << has_deep_errors | |
1491 | << " auto-repair: " << try_to_auto_repair << " (" | |
1492 | << cct->_conf->osd_scrub_auto_repair << ")" << dendl; | |
7c673cae | 1493 | |
f67539c2 TL |
1494 | auto upd_flags = m_planned_scrub; |
1495 | ||
1496 | upd_flags.time_for_deep = false; | |
1497 | // Clear these in case user issues the scrub/repair command during | |
1498 | // the scheduling of the scrub/repair (e.g. request reservation) | |
1499 | upd_flags.deep_scrub_on_error = false; | |
1500 | upd_flags.auto_repair = false; | |
1501 | ||
1502 | if (upd_flags.must_scrub && !upd_flags.must_deep_scrub && has_deep_errors) { | |
20effc67 TL |
1503 | osd->clog->error() |
1504 | << "osd." << osd->whoami << " pg " << info.pgid | |
1505 | << " Regular scrub request, deep-scrub details will be lost"; | |
f67539c2 TL |
1506 | } |
1507 | ||
1508 | if (!upd_flags.must_scrub) { | |
1509 | // All periodic scrub handling goes here because must_scrub is | |
1510 | // always set for must_deep_scrub and must_repair. | |
1511 | ||
20effc67 TL |
1512 | const bool can_start_periodic = verify_periodic_scrub_mode( |
1513 | allow_deep_scrub, try_to_auto_repair, allow_regular_scrub, | |
1514 | has_deep_errors, upd_flags); | |
f67539c2 | 1515 | if (!can_start_periodic) { |
20effc67 TL |
1516 | // "I don't want no scrub" |
1517 | dout(20) << __func__ << ": no periodic scrubs allowed" << dendl; | |
f67539c2 TL |
1518 | return std::nullopt; |
1519 | } | |
1520 | } | |
1521 | ||
1522 | // scrubbing while recovering? | |
1523 | ||
1524 | bool prevented_by_recovery = | |
1525 | osd->is_recovery_active() && !cct->_conf->osd_scrub_during_recovery && | |
1526 | (!cct->_conf->osd_repair_during_recovery || !upd_flags.must_repair); | |
1527 | ||
1528 | if (prevented_by_recovery) { | |
1529 | dout(20) << __func__ << ": scrubbing prevented during recovery" << dendl; | |
1530 | return std::nullopt; | |
7c673cae | 1531 | } |
f67539c2 TL |
1532 | |
1533 | upd_flags.need_auto = false; | |
1534 | return upd_flags; | |
7c673cae FG |
1535 | } |
1536 | ||
20effc67 TL |
1537 | /* |
1538 | * Note: on_info_history_change() is used in those two cases where we're not sure | |
1539 | * whether the role of the PG was changed, and if so - was this change relayed to the | |
1540 | * scrub-queue. | |
1541 | */ | |
1542 | void PG::on_info_history_change() | |
7c673cae | 1543 | { |
20effc67 TL |
1544 | dout(20) << __func__ << " for a " << (is_primary() ? "Primary" : "non-primary") <<dendl; |
1545 | ||
1546 | ceph_assert(m_scrubber); | |
1547 | m_scrubber->on_maybe_registration_change(m_planned_scrub); | |
9f95a23c | 1548 | } |
7c673cae | 1549 | |
20effc67 | 1550 | void PG::reschedule_scrub() |
9f95a23c | 1551 | { |
20effc67 TL |
1552 | dout(20) << __func__ << " for a " << (is_primary() ? "Primary" : "non-primary") <<dendl; |
1553 | ||
1554 | // we are assuming no change in primary status | |
1555 | if (is_primary()) { | |
1556 | ceph_assert(m_scrubber); | |
1557 | m_scrubber->update_scrub_job(m_planned_scrub); | |
1558 | } | |
1559 | } | |
1560 | ||
1561 | void PG::on_primary_status_change(bool was_primary, bool now_primary) | |
1562 | { | |
1563 | // make sure we have a working scrubber when becoming a primary | |
1564 | ||
1565 | if (was_primary != now_primary) { | |
1566 | ceph_assert(m_scrubber); | |
1567 | m_scrubber->on_primary_change(m_planned_scrub); | |
f67539c2 | 1568 | } |
9f95a23c | 1569 | } |
7c673cae | 1570 | |
f67539c2 | 1571 | void PG::scrub_requested(scrub_level_t scrub_level, scrub_type_t scrub_type) |
9f95a23c | 1572 | { |
20effc67 | 1573 | ceph_assert(m_scrubber); |
f67539c2 | 1574 | m_scrubber->scrub_requested(scrub_level, scrub_type, m_planned_scrub); |
9f95a23c | 1575 | } |
7c673cae | 1576 | |
9f95a23c TL |
1577 | void PG::clear_ready_to_merge() { |
1578 | osd->clear_ready_to_merge(this); | |
1579 | } | |
7c673cae | 1580 | |
9f95a23c TL |
1581 | void PG::queue_want_pg_temp(const vector<int> &wanted) { |
1582 | osd->queue_want_pg_temp(get_pgid().pgid, wanted); | |
1583 | } | |
7c673cae | 1584 | |
9f95a23c TL |
1585 | void PG::clear_want_pg_temp() { |
1586 | osd->remove_want_pg_temp(get_pgid().pgid); | |
1587 | } | |
7c673cae | 1588 | |
9f95a23c TL |
1589 | void PG::on_role_change() { |
1590 | requeue_ops(waiting_for_peered); | |
1591 | plpg_on_role_change(); | |
1592 | } | |
7c673cae | 1593 | |
20effc67 TL |
1594 | void PG::on_new_interval() |
1595 | { | |
9f95a23c TL |
1596 | projected_last_update = eversion_t(); |
1597 | cancel_recovery(); | |
20effc67 TL |
1598 | |
1599 | assert(m_scrubber); | |
1600 | // log some scrub data before we react to the interval | |
1601 | dout(20) << __func__ << (is_scrub_queued_or_active() ? " scrubbing " : " ") | |
1602 | << "flags: " << m_planned_scrub << dendl; | |
1603 | ||
1604 | m_scrubber->on_maybe_registration_change(m_planned_scrub); | |
9f95a23c | 1605 | } |
7c673cae | 1606 | |
9f95a23c TL |
1607 | epoch_t PG::oldest_stored_osdmap() { |
1608 | return osd->get_superblock().oldest_map; | |
1609 | } | |
7c673cae | 1610 | |
9f95a23c TL |
1611 | OstreamTemp PG::get_clog_info() { |
1612 | return osd->clog->info(); | |
1613 | } | |
7c673cae | 1614 | |
9f95a23c TL |
1615 | OstreamTemp PG::get_clog_debug() { |
1616 | return osd->clog->debug(); | |
1617 | } | |
7c673cae | 1618 | |
9f95a23c TL |
1619 | OstreamTemp PG::get_clog_error() { |
1620 | return osd->clog->error(); | |
1621 | } | |
7c673cae | 1622 | |
9f95a23c TL |
1623 | void PG::schedule_event_after( |
1624 | PGPeeringEventRef event, | |
1625 | float delay) { | |
1626 | std::lock_guard lock(osd->recovery_request_lock); | |
1627 | osd->recovery_request_timer.add_event_after( | |
1628 | delay, | |
1629 | new QueuePeeringEvt( | |
1630 | this, | |
1631 | std::move(event))); | |
1632 | } | |
7c673cae | 1633 | |
9f95a23c TL |
1634 | void PG::request_local_background_io_reservation( |
1635 | unsigned priority, | |
f67539c2 TL |
1636 | PGPeeringEventURef on_grant, |
1637 | PGPeeringEventURef on_preempt) { | |
9f95a23c TL |
1638 | osd->local_reserver.request_reservation( |
1639 | pg_id, | |
1640 | on_grant ? new QueuePeeringEvt( | |
f67539c2 | 1641 | this, std::move(on_grant)) : nullptr, |
9f95a23c TL |
1642 | priority, |
1643 | on_preempt ? new QueuePeeringEvt( | |
f67539c2 | 1644 | this, std::move(on_preempt)) : nullptr); |
9f95a23c | 1645 | } |
7c673cae | 1646 | |
9f95a23c TL |
1647 | void PG::update_local_background_io_priority( |
1648 | unsigned priority) { | |
1649 | osd->local_reserver.update_priority( | |
1650 | pg_id, | |
1651 | priority); | |
1652 | } | |
7c673cae | 1653 | |
9f95a23c TL |
1654 | void PG::cancel_local_background_io_reservation() { |
1655 | osd->local_reserver.cancel_reservation( | |
1656 | pg_id); | |
1657 | } | |
7c673cae | 1658 | |
9f95a23c TL |
1659 | void PG::request_remote_recovery_reservation( |
1660 | unsigned priority, | |
f67539c2 TL |
1661 | PGPeeringEventURef on_grant, |
1662 | PGPeeringEventURef on_preempt) { | |
9f95a23c TL |
1663 | osd->remote_reserver.request_reservation( |
1664 | pg_id, | |
1665 | on_grant ? new QueuePeeringEvt( | |
f67539c2 | 1666 | this, std::move(on_grant)) : nullptr, |
9f95a23c TL |
1667 | priority, |
1668 | on_preempt ? new QueuePeeringEvt( | |
f67539c2 | 1669 | this, std::move(on_preempt)) : nullptr); |
9f95a23c | 1670 | } |
11fdf7f2 | 1671 | |
9f95a23c TL |
1672 | void PG::cancel_remote_recovery_reservation() { |
1673 | osd->remote_reserver.cancel_reservation( | |
1674 | pg_id); | |
7c673cae FG |
1675 | } |
1676 | ||
9f95a23c TL |
1677 | void PG::schedule_event_on_commit( |
1678 | ObjectStore::Transaction &t, | |
1679 | PGPeeringEventRef on_commit) | |
11fdf7f2 | 1680 | { |
9f95a23c | 1681 | t.register_on_commit(new QueuePeeringEvt(this, on_commit)); |
11fdf7f2 TL |
1682 | } |
1683 | ||
f67539c2 TL |
1684 | void PG::on_activate(interval_set<snapid_t> snaps) |
1685 | { | |
1686 | ceph_assert(!m_scrubber->are_callbacks_pending()); | |
1687 | ceph_assert(callbacks_for_degraded_object.empty()); | |
1688 | snap_trimq = snaps; | |
1689 | release_pg_backoffs(); | |
1690 | projected_last_update = info.last_update; | |
1691 | } | |
1692 | ||
9f95a23c | 1693 | void PG::on_active_exit() |
11fdf7f2 | 1694 | { |
9f95a23c TL |
1695 | backfill_reserving = false; |
1696 | agent_stop(); | |
11fdf7f2 TL |
1697 | } |
1698 | ||
9f95a23c | 1699 | void PG::on_active_advmap(const OSDMapRef &osdmap) |
11fdf7f2 | 1700 | { |
9f95a23c TL |
1701 | const auto& new_removed_snaps = osdmap->get_new_removed_snaps(); |
1702 | auto i = new_removed_snaps.find(get_pgid().pool()); | |
1703 | if (i != new_removed_snaps.end()) { | |
1704 | bool bad = false; | |
1705 | for (auto j : i->second) { | |
1706 | if (snap_trimq.intersects(j.first, j.second)) { | |
1707 | decltype(snap_trimq) added, overlap; | |
1708 | added.insert(j.first, j.second); | |
1709 | overlap.intersection_of(snap_trimq, added); | |
1710 | derr << __func__ << " removed_snaps already contains " | |
1711 | << overlap << dendl; | |
1712 | bad = true; | |
1713 | snap_trimq.union_of(added); | |
1714 | } else { | |
1715 | snap_trimq.insert(j.first, j.second); | |
1716 | } | |
1717 | } | |
1718 | dout(10) << __func__ << " new removed_snaps " << i->second | |
1719 | << ", snap_trimq now " << snap_trimq << dendl; | |
1720 | ceph_assert(!bad || !cct->_conf->osd_debug_verify_cached_snaps); | |
1721 | } | |
1722 | ||
1723 | const auto& new_purged_snaps = osdmap->get_new_purged_snaps(); | |
1724 | auto j = new_purged_snaps.find(get_pgid().pgid.pool()); | |
1725 | if (j != new_purged_snaps.end()) { | |
1726 | bool bad = false; | |
1727 | for (auto k : j->second) { | |
1728 | if (!recovery_state.get_info().purged_snaps.contains(k.first, k.second)) { | |
1729 | interval_set<snapid_t> rm, overlap; | |
1730 | rm.insert(k.first, k.second); | |
1731 | overlap.intersection_of(recovery_state.get_info().purged_snaps, rm); | |
1732 | derr << __func__ << " purged_snaps does not contain " | |
1733 | << rm << ", only " << overlap << dendl; | |
1734 | recovery_state.adjust_purged_snaps( | |
1735 | [&overlap](auto &purged_snaps) { | |
1736 | purged_snaps.subtract(overlap); | |
1737 | }); | |
1738 | // This can currently happen in the normal (if unlikely) course of | |
1739 | // events. Because adding snaps to purged_snaps does not increase | |
1740 | // the pg version or add a pg log entry, we don't reliably propagate | |
1741 | // purged_snaps additions to other OSDs. | |
1742 | // One example: | |
1743 | // - purge S | |
1744 | // - primary and replicas update purged_snaps | |
1745 | // - no object updates | |
1746 | // - pg mapping changes, new primary on different node | |
1747 | // - new primary pg version == eversion_t(), so info is not | |
1748 | // propagated. | |
1749 | //bad = true; | |
1750 | } else { | |
1751 | recovery_state.adjust_purged_snaps( | |
1752 | [&k](auto &purged_snaps) { | |
1753 | purged_snaps.erase(k.first, k.second); | |
1754 | }); | |
11fdf7f2 TL |
1755 | } |
1756 | } | |
9f95a23c TL |
1757 | dout(10) << __func__ << " new purged_snaps " << j->second |
1758 | << ", now " << recovery_state.get_info().purged_snaps << dendl; | |
1759 | ceph_assert(!bad || !cct->_conf->osd_debug_verify_cached_snaps); | |
11fdf7f2 | 1760 | } |
11fdf7f2 TL |
1761 | } |
1762 | ||
9f95a23c | 1763 | void PG::queue_snap_retrim(snapid_t snap) |
7c673cae | 1764 | { |
9f95a23c TL |
1765 | if (!is_active() || |
1766 | !is_primary()) { | |
1767 | dout(10) << __func__ << " snap " << snap << " - not active and primary" | |
1768 | << dendl; | |
7c673cae | 1769 | return; |
7c673cae | 1770 | } |
9f95a23c TL |
1771 | if (!snap_trimq.contains(snap)) { |
1772 | snap_trimq.insert(snap); | |
1773 | snap_trimq_repeat.insert(snap); | |
1774 | dout(20) << __func__ << " snap " << snap | |
1775 | << ", trimq now " << snap_trimq | |
1776 | << ", repeat " << snap_trimq_repeat << dendl; | |
1777 | kick_snap_trim(); | |
1778 | } else { | |
1779 | dout(20) << __func__ << " snap " << snap | |
1780 | << " already in trimq " << snap_trimq << dendl; | |
7c673cae | 1781 | } |
7c673cae FG |
1782 | } |
1783 | ||
9f95a23c | 1784 | void PG::on_active_actmap() |
7c673cae | 1785 | { |
9f95a23c TL |
1786 | if (cct->_conf->osd_check_for_log_corruption) |
1787 | check_log_for_corruption(osd->store); | |
1788 | ||
1789 | ||
1790 | if (recovery_state.is_active()) { | |
1791 | dout(10) << "Active: kicking snap trim" << dendl; | |
1792 | kick_snap_trim(); | |
7c673cae | 1793 | } |
9f95a23c TL |
1794 | |
1795 | if (recovery_state.is_peered() && | |
1796 | !recovery_state.is_clean() && | |
1797 | !recovery_state.get_osdmap()->test_flag(CEPH_OSDMAP_NOBACKFILL) && | |
1798 | (!recovery_state.get_osdmap()->test_flag(CEPH_OSDMAP_NOREBALANCE) || | |
1799 | recovery_state.is_degraded())) { | |
1800 | queue_recovery(); | |
7c673cae FG |
1801 | } |
1802 | } | |
1803 | ||
9f95a23c | 1804 | void PG::on_backfill_reserved() |
7c673cae | 1805 | { |
9f95a23c TL |
1806 | backfill_reserving = false; |
1807 | queue_recovery(); | |
7c673cae FG |
1808 | } |
1809 | ||
9f95a23c | 1810 | void PG::on_backfill_canceled() |
7c673cae | 1811 | { |
9f95a23c TL |
1812 | if (!waiting_on_backfill.empty()) { |
1813 | waiting_on_backfill.clear(); | |
1814 | finish_recovery_op(hobject_t::get_max()); | |
7c673cae FG |
1815 | } |
1816 | } | |
1817 | ||
9f95a23c | 1818 | void PG::on_recovery_reserved() |
7c673cae | 1819 | { |
9f95a23c | 1820 | queue_recovery(); |
7c673cae FG |
1821 | } |
1822 | ||
9f95a23c | 1823 | void PG::set_not_ready_to_merge_target(pg_t pgid, pg_t src) |
7c673cae | 1824 | { |
9f95a23c | 1825 | osd->set_not_ready_to_merge_target(pgid, src); |
7c673cae FG |
1826 | } |
1827 | ||
9f95a23c | 1828 | void PG::set_not_ready_to_merge_source(pg_t pgid) |
7c673cae | 1829 | { |
9f95a23c | 1830 | osd->set_not_ready_to_merge_source(pgid); |
7c673cae FG |
1831 | } |
1832 | ||
9f95a23c | 1833 | void PG::set_ready_to_merge_target(eversion_t lu, epoch_t les, epoch_t lec) |
7c673cae | 1834 | { |
9f95a23c | 1835 | osd->set_ready_to_merge_target(this, lu, les, lec); |
7c673cae FG |
1836 | } |
1837 | ||
9f95a23c | 1838 | void PG::set_ready_to_merge_source(eversion_t lu) |
7c673cae | 1839 | { |
9f95a23c | 1840 | osd->set_ready_to_merge_source(this, lu); |
7c673cae FG |
1841 | } |
1842 | ||
9f95a23c | 1843 | void PG::send_pg_created(pg_t pgid) |
7c673cae | 1844 | { |
9f95a23c TL |
1845 | osd->send_pg_created(pgid); |
1846 | } | |
7c673cae | 1847 | |
9f95a23c TL |
1848 | ceph::signedspan PG::get_mnow() |
1849 | { | |
1850 | return osd->get_mnow(); | |
1851 | } | |
7c673cae | 1852 | |
9f95a23c TL |
1853 | HeartbeatStampsRef PG::get_hb_stamps(int peer) |
1854 | { | |
1855 | return osd->get_hb_stamps(peer); | |
7c673cae FG |
1856 | } |
1857 | ||
9f95a23c TL |
1858 | void PG::schedule_renew_lease(epoch_t lpr, ceph::timespan delay) |
1859 | { | |
1860 | auto spgid = info.pgid; | |
1861 | auto o = osd; | |
1862 | osd->mono_timer.add_event( | |
1863 | delay, | |
1864 | [o, lpr, spgid]() { | |
1865 | o->queue_renew_lease(lpr, spgid); | |
1866 | }); | |
1867 | } | |
7c673cae | 1868 | |
9f95a23c | 1869 | void PG::queue_check_readable(epoch_t lpr, ceph::timespan delay) |
7c673cae | 1870 | { |
9f95a23c | 1871 | osd->queue_check_readable(info.pgid, lpr, delay); |
7c673cae FG |
1872 | } |
1873 | ||
9f95a23c | 1874 | void PG::rebuild_missing_set_with_deletes(PGLog &pglog) |
91327a77 | 1875 | { |
9f95a23c TL |
1876 | pglog.rebuild_missing_set_with_deletes( |
1877 | osd->store, | |
1878 | ch, | |
1879 | recovery_state.get_info()); | |
91327a77 AA |
1880 | } |
1881 | ||
9f95a23c | 1882 | void PG::on_activate_committed() |
91327a77 | 1883 | { |
9f95a23c TL |
1884 | if (!is_primary()) { |
1885 | // waiters | |
1886 | if (recovery_state.needs_flush() == 0) { | |
1887 | requeue_ops(waiting_for_peered); | |
1888 | } else if (!waiting_for_peered.empty()) { | |
1889 | dout(10) << __func__ << " flushes in progress, moving " | |
1890 | << waiting_for_peered.size() << " items to waiting_for_flush" | |
1891 | << dendl; | |
1892 | ceph_assert(waiting_for_flush.empty()); | |
1893 | waiting_for_flush.swap(waiting_for_peered); | |
91327a77 | 1894 | } |
9f95a23c TL |
1895 | } |
1896 | } | |
91327a77 | 1897 | |
9f95a23c TL |
1898 | // Compute pending backfill data |
1899 | static int64_t pending_backfill(CephContext *cct, int64_t bf_bytes, int64_t local_bytes) | |
11fdf7f2 | 1900 | { |
9f95a23c TL |
1901 | lgeneric_dout(cct, 20) << __func__ << " Adjust local usage " |
1902 | << (local_bytes >> 10) << "KiB" | |
1903 | << " primary usage " << (bf_bytes >> 10) | |
1904 | << "KiB" << dendl; | |
11fdf7f2 | 1905 | |
9f95a23c TL |
1906 | return std::max((int64_t)0, bf_bytes - local_bytes); |
1907 | } | |
7c673cae | 1908 | |
7c673cae | 1909 | |
9f95a23c TL |
1910 | // We can zero the value of primary num_bytes as just an atomic. |
1911 | // However, setting above zero reserves space for backfill and requires | |
1912 | // the OSDService::stat_lock which protects all OSD usage | |
1913 | bool PG::try_reserve_recovery_space( | |
1914 | int64_t primary_bytes, int64_t local_bytes) { | |
1915 | // Use tentative_bacfill_full() to make sure enough | |
1916 | // space is available to handle target bytes from primary. | |
7c673cae | 1917 | |
9f95a23c TL |
1918 | // TODO: If we passed num_objects from primary we could account for |
1919 | // an estimate of the metadata overhead. | |
7c673cae | 1920 | |
9f95a23c TL |
1921 | // TODO: If we had compressed_allocated and compressed_original from primary |
1922 | // we could compute compression ratio and adjust accordingly. | |
7c673cae | 1923 | |
9f95a23c TL |
1924 | // XXX: There is no way to get omap overhead and this would only apply |
1925 | // to whatever possibly different partition that is storing the database. | |
7c673cae | 1926 | |
9f95a23c TL |
1927 | // update_osd_stat() from heartbeat will do this on a new |
1928 | // statfs using ps->primary_bytes. | |
1929 | uint64_t pending_adjustment = 0; | |
1930 | if (primary_bytes) { | |
1931 | // For erasure coded pool overestimate by a full stripe per object | |
1932 | // because we don't know how each objected rounded to the nearest stripe | |
1933 | if (pool.info.is_erasure()) { | |
1934 | primary_bytes /= (int)get_pgbackend()->get_ec_data_chunk_count(); | |
1935 | primary_bytes += get_pgbackend()->get_ec_stripe_chunk_size() * | |
1936 | info.stats.stats.sum.num_objects; | |
1937 | local_bytes /= (int)get_pgbackend()->get_ec_data_chunk_count(); | |
1938 | local_bytes += get_pgbackend()->get_ec_stripe_chunk_size() * | |
1939 | info.stats.stats.sum.num_objects; | |
1940 | } | |
1941 | pending_adjustment = pending_backfill( | |
1942 | cct, | |
1943 | primary_bytes, | |
1944 | local_bytes); | |
1945 | dout(10) << __func__ << " primary_bytes " << (primary_bytes >> 10) | |
1946 | << "KiB" | |
1947 | << " local " << (local_bytes >> 10) << "KiB" | |
1948 | << " pending_adjustments " << (pending_adjustment >> 10) << "KiB" | |
1949 | << dendl; | |
7c673cae | 1950 | } |
7c673cae | 1951 | |
9f95a23c TL |
1952 | // This lock protects not only the stats OSDService but also setting the |
1953 | // pg primary_bytes. That's why we don't immediately unlock | |
1954 | std::lock_guard l{osd->stat_lock}; | |
1955 | osd_stat_t cur_stat = osd->osd_stat; | |
1956 | if (cct->_conf->osd_debug_reject_backfill_probability > 0 && | |
1957 | (rand()%1000 < (cct->_conf->osd_debug_reject_backfill_probability*1000.0))) { | |
1958 | dout(10) << "backfill reservation rejected: failure injection" | |
1959 | << dendl; | |
1960 | return false; | |
1961 | } else if (!cct->_conf->osd_debug_skip_full_check_in_backfill_reservation && | |
1962 | osd->tentative_backfill_full(this, pending_adjustment, cur_stat)) { | |
1963 | dout(10) << "backfill reservation rejected: backfill full" | |
1964 | << dendl; | |
1965 | return false; | |
1966 | } else { | |
1967 | // Don't reserve space if skipped reservation check, this is used | |
1968 | // to test the other backfill full check AND in case a corruption | |
1969 | // of num_bytes requires ignoring that value and trying the | |
1970 | // backfill anyway. | |
1971 | if (primary_bytes && | |
1972 | !cct->_conf->osd_debug_skip_full_check_in_backfill_reservation) { | |
1973 | primary_num_bytes.store(primary_bytes); | |
1974 | local_num_bytes.store(local_bytes); | |
1975 | } else { | |
1976 | unreserve_recovery_space(); | |
1977 | } | |
1978 | return true; | |
7c673cae FG |
1979 | } |
1980 | } | |
1981 | ||
9f95a23c TL |
1982 | void PG::unreserve_recovery_space() { |
1983 | primary_num_bytes.store(0); | |
1984 | local_num_bytes.store(0); | |
7c673cae FG |
1985 | } |
1986 | ||
9f95a23c | 1987 | void PG::_scan_rollback_obs(const vector<ghobject_t> &rollback_obs) |
7c673cae | 1988 | { |
9f95a23c TL |
1989 | ObjectStore::Transaction t; |
1990 | eversion_t trimmed_to = recovery_state.get_last_rollback_info_trimmed_to_applied(); | |
1991 | for (vector<ghobject_t>::const_iterator i = rollback_obs.begin(); | |
1992 | i != rollback_obs.end(); | |
1993 | ++i) { | |
1994 | if (i->generation < trimmed_to.version) { | |
1995 | dout(10) << __func__ << "osd." << osd->whoami | |
1996 | << " pg " << info.pgid | |
1997 | << " found obsolete rollback obj " | |
1998 | << *i << " generation < trimmed_to " | |
1999 | << trimmed_to | |
2000 | << "...repaired" << dendl; | |
2001 | t.remove(coll, *i); | |
2002 | } | |
2003 | } | |
2004 | if (!t.empty()) { | |
2005 | derr << __func__ << ": queueing trans to clean up obsolete rollback objs" | |
2006 | << dendl; | |
2007 | osd->store->queue_transaction(ch, std::move(t), NULL); | |
2008 | } | |
7c673cae FG |
2009 | } |
2010 | ||
7c673cae | 2011 | |
9f95a23c | 2012 | void PG::_repair_oinfo_oid(ScrubMap &smap) |
7c673cae | 2013 | { |
9f95a23c TL |
2014 | for (map<hobject_t, ScrubMap::object>::reverse_iterator i = smap.objects.rbegin(); |
2015 | i != smap.objects.rend(); | |
2016 | ++i) { | |
2017 | const hobject_t &hoid = i->first; | |
2018 | ScrubMap::object &o = i->second; | |
7c673cae | 2019 | |
9f95a23c TL |
2020 | bufferlist bl; |
2021 | if (o.attrs.find(OI_ATTR) == o.attrs.end()) { | |
2022 | continue; | |
2023 | } | |
2024 | bl.push_back(o.attrs[OI_ATTR]); | |
2025 | object_info_t oi; | |
2026 | try { | |
2027 | oi.decode(bl); | |
2028 | } catch(...) { | |
2029 | continue; | |
2030 | } | |
2031 | if (oi.soid != hoid) { | |
2032 | ObjectStore::Transaction t; | |
2033 | OSDriver::OSTransaction _t(osdriver.get_transaction(&t)); | |
2034 | osd->clog->error() << "osd." << osd->whoami | |
2035 | << " found object info error on pg " | |
2036 | << info.pgid | |
2037 | << " oid " << hoid << " oid in object info: " | |
2038 | << oi.soid | |
2039 | << "...repaired"; | |
2040 | // Fix object info | |
2041 | oi.soid = hoid; | |
2042 | bl.clear(); | |
2043 | encode(oi, bl, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr)); | |
7c673cae | 2044 | |
9f95a23c TL |
2045 | bufferptr bp(bl.c_str(), bl.length()); |
2046 | o.attrs[OI_ATTR] = bp; | |
7c673cae | 2047 | |
9f95a23c TL |
2048 | t.setattr(coll, ghobject_t(hoid), OI_ATTR, bl); |
2049 | int r = osd->store->queue_transaction(ch, std::move(t)); | |
2050 | if (r != 0) { | |
2051 | derr << __func__ << ": queue_transaction got " << cpp_strerror(r) | |
2052 | << dendl; | |
2053 | } | |
7c673cae FG |
2054 | } |
2055 | } | |
7c673cae | 2056 | } |
7c673cae | 2057 | |
9f95a23c TL |
2058 | void PG::repair_object( |
2059 | const hobject_t &soid, | |
2060 | const list<pair<ScrubMap::object, pg_shard_t> > &ok_peers, | |
2061 | const set<pg_shard_t> &bad_peers) | |
2062 | { | |
2063 | set<pg_shard_t> ok_shards; | |
2064 | for (auto &&peer: ok_peers) ok_shards.insert(peer.second); | |
d2e6a577 | 2065 | |
9f95a23c TL |
2066 | dout(10) << "repair_object " << soid |
2067 | << " bad_peers osd.{" << bad_peers << "}," | |
2068 | << " ok_peers osd.{" << ok_shards << "}" << dendl; | |
11fdf7f2 | 2069 | |
9f95a23c TL |
2070 | const ScrubMap::object &po = ok_peers.back().first; |
2071 | eversion_t v; | |
2072 | object_info_t oi; | |
2073 | try { | |
2074 | bufferlist bv; | |
2075 | if (po.attrs.count(OI_ATTR)) { | |
2076 | bv.push_back(po.attrs.find(OI_ATTR)->second); | |
2077 | } | |
2078 | auto bliter = bv.cbegin(); | |
2079 | decode(oi, bliter); | |
2080 | } catch (...) { | |
2081 | dout(0) << __func__ << ": Need version of replica, bad object_info_t: " | |
2082 | << soid << dendl; | |
2083 | ceph_abort(); | |
11fdf7f2 TL |
2084 | } |
2085 | ||
9f95a23c TL |
2086 | if (bad_peers.count(get_primary())) { |
2087 | // We should only be scrubbing if the PG is clean. | |
2088 | ceph_assert(waiting_for_unreadable_object.empty()); | |
2089 | dout(10) << __func__ << ": primary = " << get_primary() << dendl; | |
11fdf7f2 TL |
2090 | } |
2091 | ||
9f95a23c TL |
2092 | /* No need to pass ok_peers, they must not be missing the object, so |
2093 | * force_object_missing will add them to missing_loc anyway */ | |
2094 | recovery_state.force_object_missing(bad_peers, soid, oi.version); | |
7c673cae FG |
2095 | } |
2096 | ||
20effc67 | 2097 | void PG::forward_scrub_event(ScrubAPI fn, epoch_t epoch_queued, std::string_view desc) |
7c673cae | 2098 | { |
20effc67 TL |
2099 | dout(20) << __func__ << ": " << desc << " queued at: " << epoch_queued << dendl; |
2100 | ceph_assert(m_scrubber); | |
2101 | if (is_active()) { | |
f67539c2 | 2102 | ((*m_scrubber).*fn)(epoch_queued); |
7c673cae | 2103 | } else { |
f67539c2 TL |
2104 | // pg might be in the process of being deleted |
2105 | dout(5) << __func__ << " refusing to forward. " << (is_clean() ? "(clean) " : "(not clean) ") << | |
20effc67 | 2106 | (is_active() ? "(active) " : "(not active) ") << dendl; |
7c673cae | 2107 | } |
f67539c2 | 2108 | } |
7c673cae | 2109 | |
20effc67 TL |
2110 | void PG::forward_scrub_event(ScrubSafeAPI fn, |
2111 | epoch_t epoch_queued, | |
2112 | Scrub::act_token_t act_token, | |
2113 | std::string_view desc) | |
f67539c2 | 2114 | { |
20effc67 TL |
2115 | dout(20) << __func__ << ": " << desc << " queued: " << epoch_queued |
2116 | << " token: " << act_token << dendl; | |
2117 | ceph_assert(m_scrubber); | |
2118 | if (is_active()) { | |
2119 | ((*m_scrubber).*fn)(epoch_queued, act_token); | |
2120 | } else { | |
2121 | // pg might be in the process of being deleted | |
2122 | dout(5) << __func__ << " refusing to forward. " | |
2123 | << (is_clean() ? "(clean) " : "(not clean) ") | |
2124 | << (is_active() ? "(active) " : "(not active) ") << dendl; | |
2125 | } | |
7c673cae FG |
2126 | } |
2127 | ||
20effc67 | 2128 | void PG::replica_scrub(OpRequestRef op, ThreadPool::TPHandle& handle) |
f6b5b4d7 | 2129 | { |
20effc67 TL |
2130 | dout(10) << __func__ << " (op)" << dendl; |
2131 | ceph_assert(m_scrubber); | |
2132 | m_scrubber->replica_scrub_op(op); | |
f6b5b4d7 TL |
2133 | } |
2134 | ||
f67539c2 | 2135 | void PG::replica_scrub(epoch_t epoch_queued, |
20effc67 | 2136 | Scrub::act_token_t act_token, |
f67539c2 TL |
2137 | [[maybe_unused]] ThreadPool::TPHandle& handle) |
2138 | { | |
2139 | dout(10) << __func__ << " queued at: " << epoch_queued | |
2140 | << (is_primary() ? " (primary)" : " (replica)") << dendl; | |
20effc67 TL |
2141 | forward_scrub_event(&ScrubPgIF::send_start_replica, epoch_queued, act_token, |
2142 | "StartReplica/nw"); | |
f67539c2 | 2143 | } |
7c673cae | 2144 | |
f67539c2 TL |
2145 | bool PG::ops_blocked_by_scrub() const |
2146 | { | |
2147 | return !waiting_for_scrub.empty(); | |
2148 | } | |
11fdf7f2 | 2149 | |
f67539c2 TL |
2150 | Scrub::scrub_prio_t PG::is_scrub_blocking_ops() const |
2151 | { | |
2152 | return waiting_for_scrub.empty() ? Scrub::scrub_prio_t::low_priority | |
2153 | : Scrub::scrub_prio_t::high_priority; | |
7c673cae FG |
2154 | } |
2155 | ||
9f95a23c | 2156 | bool PG::old_peering_msg(epoch_t reply_epoch, epoch_t query_epoch) |
7c673cae | 2157 | { |
f67539c2 TL |
2158 | if (auto last_reset = get_last_peering_reset(); |
2159 | last_reset > reply_epoch || last_reset > query_epoch) { | |
2160 | dout(10) << "old_peering_msg reply_epoch " << reply_epoch << " query_epoch " | |
2161 | << query_epoch << " last_peering_reset " << last_reset << dendl; | |
9f95a23c TL |
2162 | return true; |
2163 | } | |
2164 | return false; | |
7c673cae FG |
2165 | } |
2166 | ||
9f95a23c TL |
2167 | struct FlushState { |
2168 | PGRef pg; | |
2169 | epoch_t epoch; | |
2170 | FlushState(PG *pg, epoch_t epoch) : pg(pg), epoch(epoch) {} | |
2171 | ~FlushState() { | |
2172 | std::scoped_lock l{*pg}; | |
2173 | if (!pg->pg_has_reset_since(epoch)) { | |
2174 | pg->recovery_state.complete_flush(); | |
2175 | } | |
7c673cae | 2176 | } |
9f95a23c TL |
2177 | }; |
2178 | typedef std::shared_ptr<FlushState> FlushStateRef; | |
7c673cae | 2179 | |
9f95a23c | 2180 | void PG::start_flush_on_transaction(ObjectStore::Transaction &t) |
7c673cae | 2181 | { |
9f95a23c TL |
2182 | // flush in progress ops |
2183 | FlushStateRef flush_trigger (std::make_shared<FlushState>( | |
2184 | this, get_osdmap_epoch())); | |
2185 | t.register_on_applied(new ContainerContext<FlushStateRef>(flush_trigger)); | |
2186 | t.register_on_commit(new ContainerContext<FlushStateRef>(flush_trigger)); | |
7c673cae FG |
2187 | } |
2188 | ||
9f95a23c | 2189 | bool PG::try_flush_or_schedule_async() |
11fdf7f2 | 2190 | { |
9f95a23c TL |
2191 | Context *c = new QueuePeeringEvt( |
2192 | this, get_osdmap_epoch(), PeeringState::IntervalFlush()); | |
2193 | if (!ch->flush_commit(c)) { | |
2194 | return false; | |
2195 | } else { | |
2196 | delete c; | |
2197 | return true; | |
2198 | } | |
11fdf7f2 TL |
2199 | } |
2200 | ||
9f95a23c | 2201 | ostream& operator<<(ostream& out, const PG& pg) |
11fdf7f2 | 2202 | { |
9f95a23c | 2203 | out << pg.recovery_state; |
f67539c2 TL |
2204 | |
2205 | // listing all scrub-related flags - both current and "planned next scrub" | |
2206 | if (pg.is_scrubbing()) { | |
2207 | out << *pg.m_scrubber; | |
2208 | } | |
2209 | out << pg.m_planned_scrub; | |
11fdf7f2 | 2210 | |
9f95a23c TL |
2211 | if (pg.recovery_ops_active) |
2212 | out << " rops=" << pg.recovery_ops_active; | |
11fdf7f2 | 2213 | |
9f95a23c TL |
2214 | //out << " (" << pg.pg_log.get_tail() << "," << pg.pg_log.get_head() << "]"; |
2215 | if (pg.recovery_state.have_missing()) { | |
2216 | out << " m=" << pg.recovery_state.get_num_missing(); | |
2217 | if (pg.is_primary()) { | |
2218 | uint64_t unfound = pg.recovery_state.get_num_unfound(); | |
2219 | if (unfound) | |
2220 | out << " u=" << unfound; | |
2221 | } | |
2222 | } | |
2223 | if (!pg.is_clean()) { | |
2224 | out << " mbc=" << pg.recovery_state.get_missing_by_count(); | |
2225 | } | |
2226 | if (!pg.snap_trimq.empty()) { | |
2227 | out << " trimq="; | |
2228 | // only show a count if the set is large | |
2229 | if (pg.snap_trimq.num_intervals() > 16) { | |
2230 | out << pg.snap_trimq.size(); | |
2231 | if (!pg.snap_trimq_repeat.empty()) { | |
2232 | out << "(" << pg.snap_trimq_repeat.size() << ")"; | |
2233 | } | |
2234 | } else { | |
2235 | out << pg.snap_trimq; | |
2236 | if (!pg.snap_trimq_repeat.empty()) { | |
2237 | out << "(" << pg.snap_trimq_repeat << ")"; | |
2238 | } | |
2239 | } | |
2240 | } | |
2241 | if (!pg.recovery_state.get_info().purged_snaps.empty()) { | |
2242 | out << " ps="; // snap trim queue / purged snaps | |
2243 | if (pg.recovery_state.get_info().purged_snaps.num_intervals() > 16) { | |
2244 | out << pg.recovery_state.get_info().purged_snaps.size(); | |
2245 | } else { | |
2246 | out << pg.recovery_state.get_info().purged_snaps; | |
2247 | } | |
11fdf7f2 | 2248 | } |
11fdf7f2 | 2249 | |
9f95a23c | 2250 | out << "]"; |
9f95a23c | 2251 | return out; |
11fdf7f2 TL |
2252 | } |
2253 | ||
9f95a23c | 2254 | bool PG::can_discard_op(OpRequestRef& op) |
7c673cae | 2255 | { |
9f95a23c TL |
2256 | auto m = op->get_req<MOSDOp>(); |
2257 | if (cct->_conf->osd_discard_disconnected_ops && OSD::op_is_discardable(m)) { | |
2258 | dout(20) << " discard " << *m << dendl; | |
2259 | return true; | |
2260 | } | |
7c673cae | 2261 | |
9f95a23c TL |
2262 | if (m->get_map_epoch() < info.history.same_primary_since) { |
2263 | dout(7) << " changed after " << m->get_map_epoch() | |
2264 | << ", dropping " << *m << dendl; | |
2265 | return true; | |
2266 | } | |
7c673cae | 2267 | |
9f95a23c TL |
2268 | if ((m->get_flags() & (CEPH_OSD_FLAG_BALANCE_READS | |
2269 | CEPH_OSD_FLAG_LOCALIZE_READS)) && | |
2270 | !is_primary() && | |
2271 | m->get_map_epoch() < info.history.same_interval_since) { | |
2272 | // Note: the Objecter will resend on interval change without the primary | |
2273 | // changing if it actually sent to a replica. If the primary hasn't | |
2274 | // changed since the send epoch, we got it, and we're primary, it won't | |
2275 | // have resent even if the interval did change as it sent it to the primary | |
2276 | // (us). | |
2277 | return true; | |
7c673cae | 2278 | } |
7c673cae | 2279 | |
7c673cae | 2280 | |
9f95a23c TL |
2281 | if (m->get_connection()->has_feature(CEPH_FEATURE_RESEND_ON_SPLIT)) { |
2282 | // >= luminous client | |
2283 | if (m->get_connection()->has_feature(CEPH_FEATURE_SERVER_NAUTILUS)) { | |
2284 | // >= nautilus client | |
2285 | if (m->get_map_epoch() < pool.info.get_last_force_op_resend()) { | |
2286 | dout(7) << __func__ << " sent before last_force_op_resend " | |
2287 | << pool.info.last_force_op_resend | |
2288 | << ", dropping" << *m << dendl; | |
2289 | return true; | |
2290 | } | |
2291 | } else { | |
2292 | // == < nautilus client (luminous or mimic) | |
2293 | if (m->get_map_epoch() < pool.info.get_last_force_op_resend_prenautilus()) { | |
2294 | dout(7) << __func__ << " sent before last_force_op_resend_prenautilus " | |
2295 | << pool.info.last_force_op_resend_prenautilus | |
2296 | << ", dropping" << *m << dendl; | |
2297 | return true; | |
2298 | } | |
7c673cae | 2299 | } |
9f95a23c TL |
2300 | if (m->get_map_epoch() < info.history.last_epoch_split) { |
2301 | dout(7) << __func__ << " pg split in " | |
2302 | << info.history.last_epoch_split << ", dropping" << dendl; | |
2303 | return true; | |
7c673cae | 2304 | } |
9f95a23c TL |
2305 | } else if (m->get_connection()->has_feature(CEPH_FEATURE_OSD_POOLRESEND)) { |
2306 | // < luminous client | |
2307 | if (m->get_map_epoch() < pool.info.get_last_force_op_resend_preluminous()) { | |
2308 | dout(7) << __func__ << " sent before last_force_op_resend_preluminous " | |
2309 | << pool.info.last_force_op_resend_preluminous | |
2310 | << ", dropping" << *m << dendl; | |
2311 | return true; | |
7c673cae FG |
2312 | } |
2313 | } | |
2314 | ||
9f95a23c | 2315 | return false; |
7c673cae FG |
2316 | } |
2317 | ||
9f95a23c TL |
2318 | template<typename T, int MSGTYPE> |
2319 | bool PG::can_discard_replica_op(OpRequestRef& op) | |
7c673cae | 2320 | { |
9f95a23c TL |
2321 | auto m = op->get_req<T>(); |
2322 | ceph_assert(m->get_type() == MSGTYPE); | |
7c673cae | 2323 | |
9f95a23c | 2324 | int from = m->get_source().num(); |
7c673cae | 2325 | |
9f95a23c TL |
2326 | // if a repop is replied after a replica goes down in a new osdmap, and |
2327 | // before the pg advances to this new osdmap, the repop replies before this | |
2328 | // repop can be discarded by that replica OSD, because the primary resets the | |
2329 | // connection to it when handling the new osdmap marking it down, and also | |
2330 | // resets the messenger sesssion when the replica reconnects. to avoid the | |
2331 | // out-of-order replies, the messages from that replica should be discarded. | |
2332 | OSDMapRef next_map = osd->get_next_osdmap(); | |
f67539c2 TL |
2333 | if (next_map->is_down(from)) { |
2334 | dout(20) << " " << __func__ << " dead for nextmap is down " << from << dendl; | |
9f95a23c | 2335 | return true; |
f67539c2 | 2336 | } |
9f95a23c TL |
2337 | /* Mostly, this overlaps with the old_peering_msg |
2338 | * condition. An important exception is pushes | |
2339 | * sent by replicas not in the acting set, since | |
2340 | * if such a replica goes down it does not cause | |
2341 | * a new interval. */ | |
f67539c2 TL |
2342 | if (next_map->get_down_at(from) >= m->map_epoch) { |
2343 | dout(20) << " " << __func__ << " dead for 'get_down_at' " << from << dendl; | |
9f95a23c | 2344 | return true; |
f67539c2 | 2345 | } |
7c673cae | 2346 | |
9f95a23c TL |
2347 | // same pg? |
2348 | // if pg changes _at all_, we reset and repeer! | |
2349 | if (old_peering_msg(m->map_epoch, m->map_epoch)) { | |
2350 | dout(10) << "can_discard_replica_op pg changed " << info.history | |
2351 | << " after " << m->map_epoch | |
2352 | << ", dropping" << dendl; | |
2353 | return true; | |
7c673cae | 2354 | } |
9f95a23c | 2355 | return false; |
7c673cae FG |
2356 | } |
2357 | ||
9f95a23c | 2358 | bool PG::can_discard_scan(OpRequestRef op) |
7c673cae | 2359 | { |
9f95a23c TL |
2360 | auto m = op->get_req<MOSDPGScan>(); |
2361 | ceph_assert(m->get_type() == MSG_OSD_PG_SCAN); | |
7c673cae | 2362 | |
9f95a23c TL |
2363 | if (old_peering_msg(m->map_epoch, m->query_epoch)) { |
2364 | dout(10) << " got old scan, ignoring" << dendl; | |
2365 | return true; | |
7c673cae | 2366 | } |
9f95a23c | 2367 | return false; |
7c673cae FG |
2368 | } |
2369 | ||
9f95a23c | 2370 | bool PG::can_discard_backfill(OpRequestRef op) |
7c673cae | 2371 | { |
9f95a23c TL |
2372 | auto m = op->get_req<MOSDPGBackfill>(); |
2373 | ceph_assert(m->get_type() == MSG_OSD_PG_BACKFILL); | |
7c673cae | 2374 | |
9f95a23c TL |
2375 | if (old_peering_msg(m->map_epoch, m->query_epoch)) { |
2376 | dout(10) << " got old backfill, ignoring" << dendl; | |
2377 | return true; | |
7c673cae FG |
2378 | } |
2379 | ||
9f95a23c | 2380 | return false; |
7c673cae | 2381 | |
7c673cae FG |
2382 | } |
2383 | ||
9f95a23c | 2384 | bool PG::can_discard_request(OpRequestRef& op) |
7c673cae | 2385 | { |
9f95a23c TL |
2386 | switch (op->get_req()->get_type()) { |
2387 | case CEPH_MSG_OSD_OP: | |
2388 | return can_discard_op(op); | |
2389 | case CEPH_MSG_OSD_BACKOFF: | |
2390 | return false; // never discard | |
2391 | case MSG_OSD_REPOP: | |
2392 | return can_discard_replica_op<MOSDRepOp, MSG_OSD_REPOP>(op); | |
2393 | case MSG_OSD_PG_PUSH: | |
2394 | return can_discard_replica_op<MOSDPGPush, MSG_OSD_PG_PUSH>(op); | |
2395 | case MSG_OSD_PG_PULL: | |
2396 | return can_discard_replica_op<MOSDPGPull, MSG_OSD_PG_PULL>(op); | |
2397 | case MSG_OSD_PG_PUSH_REPLY: | |
2398 | return can_discard_replica_op<MOSDPGPushReply, MSG_OSD_PG_PUSH_REPLY>(op); | |
2399 | case MSG_OSD_REPOPREPLY: | |
2400 | return can_discard_replica_op<MOSDRepOpReply, MSG_OSD_REPOPREPLY>(op); | |
2401 | case MSG_OSD_PG_RECOVERY_DELETE: | |
2402 | return can_discard_replica_op<MOSDPGRecoveryDelete, MSG_OSD_PG_RECOVERY_DELETE>(op); | |
7c673cae | 2403 | |
9f95a23c TL |
2404 | case MSG_OSD_PG_RECOVERY_DELETE_REPLY: |
2405 | return can_discard_replica_op<MOSDPGRecoveryDeleteReply, MSG_OSD_PG_RECOVERY_DELETE_REPLY>(op); | |
7c673cae | 2406 | |
9f95a23c TL |
2407 | case MSG_OSD_EC_WRITE: |
2408 | return can_discard_replica_op<MOSDECSubOpWrite, MSG_OSD_EC_WRITE>(op); | |
2409 | case MSG_OSD_EC_WRITE_REPLY: | |
2410 | return can_discard_replica_op<MOSDECSubOpWriteReply, MSG_OSD_EC_WRITE_REPLY>(op); | |
2411 | case MSG_OSD_EC_READ: | |
2412 | return can_discard_replica_op<MOSDECSubOpRead, MSG_OSD_EC_READ>(op); | |
2413 | case MSG_OSD_EC_READ_REPLY: | |
2414 | return can_discard_replica_op<MOSDECSubOpReadReply, MSG_OSD_EC_READ_REPLY>(op); | |
2415 | case MSG_OSD_REP_SCRUB: | |
2416 | return can_discard_replica_op<MOSDRepScrub, MSG_OSD_REP_SCRUB>(op); | |
2417 | case MSG_OSD_SCRUB_RESERVE: | |
2418 | return can_discard_replica_op<MOSDScrubReserve, MSG_OSD_SCRUB_RESERVE>(op); | |
2419 | case MSG_OSD_REP_SCRUBMAP: | |
2420 | return can_discard_replica_op<MOSDRepScrubMap, MSG_OSD_REP_SCRUBMAP>(op); | |
2421 | case MSG_OSD_PG_UPDATE_LOG_MISSING: | |
2422 | return can_discard_replica_op< | |
2423 | MOSDPGUpdateLogMissing, MSG_OSD_PG_UPDATE_LOG_MISSING>(op); | |
2424 | case MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY: | |
2425 | return can_discard_replica_op< | |
2426 | MOSDPGUpdateLogMissingReply, MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY>(op); | |
2427 | ||
2428 | case MSG_OSD_PG_SCAN: | |
2429 | return can_discard_scan(op); | |
2430 | case MSG_OSD_PG_BACKFILL: | |
2431 | return can_discard_backfill(op); | |
2432 | case MSG_OSD_PG_BACKFILL_REMOVE: | |
2433 | return can_discard_replica_op<MOSDPGBackfillRemove, | |
2434 | MSG_OSD_PG_BACKFILL_REMOVE>(op); | |
7c673cae | 2435 | } |
9f95a23c | 2436 | return true; |
7c673cae FG |
2437 | } |
2438 | ||
9f95a23c | 2439 | void PG::do_peering_event(PGPeeringEventRef evt, PeeringCtx &rctx) |
7c673cae | 2440 | { |
9f95a23c TL |
2441 | dout(10) << __func__ << ": " << evt->get_desc() << dendl; |
2442 | ceph_assert(have_same_or_newer_map(evt->get_epoch_sent())); | |
2443 | if (old_peering_evt(evt)) { | |
2444 | dout(10) << "discard old " << evt->get_desc() << dendl; | |
2445 | } else { | |
2446 | recovery_state.handle_event(evt, &rctx); | |
7c673cae | 2447 | } |
9f95a23c TL |
2448 | // write_if_dirty regardless of path above to ensure we capture any work |
2449 | // done by OSD::advance_pg(). | |
2450 | write_if_dirty(rctx.transaction); | |
7c673cae FG |
2451 | } |
2452 | ||
9f95a23c | 2453 | void PG::queue_peering_event(PGPeeringEventRef evt) |
7c673cae | 2454 | { |
9f95a23c TL |
2455 | if (old_peering_evt(evt)) |
2456 | return; | |
2457 | osd->osd->enqueue_peering_evt(info.pgid, evt); | |
7c673cae FG |
2458 | } |
2459 | ||
9f95a23c TL |
2460 | void PG::queue_null(epoch_t msg_epoch, |
2461 | epoch_t query_epoch) | |
7c673cae | 2462 | { |
9f95a23c TL |
2463 | dout(10) << "null" << dendl; |
2464 | queue_peering_event( | |
2465 | PGPeeringEventRef(std::make_shared<PGPeeringEvent>(msg_epoch, query_epoch, | |
2466 | NullEvt()))); | |
7c673cae FG |
2467 | } |
2468 | ||
9f95a23c | 2469 | void PG::find_unfound(epoch_t queued, PeeringCtx &rctx) |
7c673cae | 2470 | { |
9f95a23c TL |
2471 | /* |
2472 | * if we couldn't start any recovery ops and things are still | |
2473 | * unfound, see if we can discover more missing object locations. | |
2474 | * It may be that our initial locations were bad and we errored | |
2475 | * out while trying to pull. | |
2476 | */ | |
2477 | if (!recovery_state.discover_all_missing(rctx)) { | |
2478 | string action; | |
2479 | if (state_test(PG_STATE_BACKFILLING)) { | |
2480 | auto evt = PGPeeringEventRef( | |
2481 | new PGPeeringEvent( | |
2482 | queued, | |
2483 | queued, | |
2484 | PeeringState::UnfoundBackfill())); | |
2485 | queue_peering_event(evt); | |
2486 | action = "in backfill"; | |
2487 | } else if (state_test(PG_STATE_RECOVERING)) { | |
2488 | auto evt = PGPeeringEventRef( | |
2489 | new PGPeeringEvent( | |
2490 | queued, | |
2491 | queued, | |
2492 | PeeringState::UnfoundRecovery())); | |
2493 | queue_peering_event(evt); | |
2494 | action = "in recovery"; | |
2495 | } else { | |
2496 | action = "already out of recovery/backfill"; | |
7c673cae | 2497 | } |
9f95a23c TL |
2498 | dout(10) << __func__ << ": no luck, giving up on this pg for now (" << action << ")" << dendl; |
2499 | } else { | |
2500 | dout(10) << __func__ << ": no luck, giving up on this pg for now (queue_recovery)" << dendl; | |
2501 | queue_recovery(); | |
7c673cae | 2502 | } |
7c673cae FG |
2503 | } |
2504 | ||
9f95a23c TL |
2505 | void PG::handle_advance_map( |
2506 | OSDMapRef osdmap, OSDMapRef lastmap, | |
2507 | vector<int>& newup, int up_primary, | |
2508 | vector<int>& newacting, int acting_primary, | |
2509 | PeeringCtx &rctx) | |
7c673cae | 2510 | { |
9f95a23c TL |
2511 | dout(10) << __func__ << ": " << osdmap->get_epoch() << dendl; |
2512 | osd_shard->update_pg_epoch(pg_slot, osdmap->get_epoch()); | |
2513 | recovery_state.advance_map( | |
2514 | osdmap, | |
2515 | lastmap, | |
2516 | newup, | |
2517 | up_primary, | |
2518 | newacting, | |
2519 | acting_primary, | |
2520 | rctx); | |
7c673cae FG |
2521 | } |
2522 | ||
9f95a23c | 2523 | void PG::handle_activate_map(PeeringCtx &rctx) |
7c673cae | 2524 | { |
9f95a23c TL |
2525 | dout(10) << __func__ << ": " << get_osdmap()->get_epoch() |
2526 | << dendl; | |
2527 | recovery_state.activate_map(rctx); | |
7c673cae | 2528 | |
9f95a23c | 2529 | requeue_map_waiters(); |
7c673cae FG |
2530 | } |
2531 | ||
9f95a23c | 2532 | void PG::handle_initialize(PeeringCtx &rctx) |
7c673cae | 2533 | { |
9f95a23c TL |
2534 | dout(10) << __func__ << dendl; |
2535 | PeeringState::Initialize evt; | |
2536 | recovery_state.handle_event(evt, &rctx); | |
7c673cae FG |
2537 | } |
2538 | ||
f67539c2 | 2539 | |
9f95a23c | 2540 | void PG::handle_query_state(Formatter *f) |
7c673cae | 2541 | { |
9f95a23c TL |
2542 | dout(10) << "handle_query_state" << dendl; |
2543 | PeeringState::QueryState q(f); | |
2544 | recovery_state.handle_event(q, 0); | |
7c673cae FG |
2545 | } |
2546 | ||
9f95a23c | 2547 | void PG::init_collection_pool_opts() |
11fdf7f2 | 2548 | { |
9f95a23c TL |
2549 | auto r = osd->store->set_collection_opts(ch, pool.info.opts); |
2550 | if (r < 0 && r != -EOPNOTSUPP) { | |
2551 | derr << __func__ << " set_collection_opts returns error:" << r << dendl; | |
11fdf7f2 | 2552 | } |
11fdf7f2 TL |
2553 | } |
2554 | ||
9f95a23c | 2555 | void PG::on_pool_change() |
7c673cae | 2556 | { |
9f95a23c TL |
2557 | init_collection_pool_opts(); |
2558 | plpg_on_pool_change(); | |
7c673cae FG |
2559 | } |
2560 | ||
9f95a23c TL |
2561 | void PG::C_DeleteMore::complete(int r) { |
2562 | ceph_assert(r == 0); | |
2563 | pg->lock(); | |
2564 | if (!pg->pg_has_reset_since(epoch)) { | |
2565 | pg->osd->queue_for_pg_delete(pg->get_pgid(), epoch); | |
7c673cae | 2566 | } |
9f95a23c TL |
2567 | pg->unlock(); |
2568 | delete this; | |
7c673cae FG |
2569 | } |
2570 | ||
f67539c2 TL |
2571 | std::pair<ghobject_t, bool> PG::do_delete_work( |
2572 | ObjectStore::Transaction &t, | |
2573 | ghobject_t _next) | |
7c673cae | 2574 | { |
9f95a23c | 2575 | dout(10) << __func__ << dendl; |
7c673cae | 2576 | |
9f95a23c TL |
2577 | { |
2578 | float osd_delete_sleep = osd->osd->get_osd_delete_sleep(); | |
2579 | if (osd_delete_sleep > 0 && delete_needs_sleep) { | |
2580 | epoch_t e = get_osdmap()->get_epoch(); | |
2581 | PGRef pgref(this); | |
2582 | auto delete_requeue_callback = new LambdaContext([this, pgref, e](int r) { | |
20effc67 | 2583 | dout(20) << "do_delete_work() [cb] wake up at " |
9f95a23c TL |
2584 | << ceph_clock_now() |
2585 | << ", re-queuing delete" << dendl; | |
2586 | std::scoped_lock locker{*this}; | |
2587 | delete_needs_sleep = false; | |
2588 | if (!pg_has_reset_since(e)) { | |
2589 | osd->queue_for_pg_delete(get_pgid(), e); | |
2590 | } | |
2591 | }); | |
7c673cae | 2592 | |
9f95a23c TL |
2593 | auto delete_schedule_time = ceph::real_clock::now(); |
2594 | delete_schedule_time += ceph::make_timespan(osd_delete_sleep); | |
2595 | std::lock_guard l{osd->sleep_lock}; | |
2596 | osd->sleep_timer.add_event_at(delete_schedule_time, | |
2597 | delete_requeue_callback); | |
2598 | dout(20) << __func__ << " Delete scheduled at " << delete_schedule_time << dendl; | |
f67539c2 | 2599 | return std::make_pair(_next, true); |
9f95a23c TL |
2600 | } |
2601 | } | |
7c673cae | 2602 | |
9f95a23c | 2603 | delete_needs_sleep = true; |
7c673cae | 2604 | |
adb31ebb TL |
2605 | ghobject_t next; |
2606 | ||
9f95a23c TL |
2607 | vector<ghobject_t> olist; |
2608 | int max = std::min(osd->store->get_ideal_list_max(), | |
2609 | (int)cct->_conf->osd_target_transaction_size); | |
adb31ebb | 2610 | |
9f95a23c TL |
2611 | osd->store->collection_list( |
2612 | ch, | |
adb31ebb | 2613 | _next, |
9f95a23c TL |
2614 | ghobject_t::get_max(), |
2615 | max, | |
2616 | &olist, | |
2617 | &next); | |
2618 | dout(20) << __func__ << " " << olist << dendl; | |
7c673cae | 2619 | |
adb31ebb TL |
2620 | // make sure we've removed everything |
2621 | // by one more listing from the beginning | |
2622 | if (_next != ghobject_t() && olist.empty()) { | |
2623 | next = ghobject_t(); | |
2624 | osd->store->collection_list( | |
2625 | ch, | |
2626 | next, | |
2627 | ghobject_t::get_max(), | |
2628 | max, | |
2629 | &olist, | |
2630 | &next); | |
20effc67 TL |
2631 | for (auto& oid : olist) { |
2632 | if (oid == pgmeta_oid) { | |
2633 | dout(20) << __func__ << " removing pgmeta object " << oid << dendl; | |
2634 | } else { | |
2635 | dout(0) << __func__ << " additional unexpected onode" | |
2636 | <<" new onode has appeared since PG removal started" | |
2637 | << oid << dendl; | |
b3b6e05e | 2638 | } |
adb31ebb TL |
2639 | } |
2640 | } | |
2641 | ||
9f95a23c TL |
2642 | OSDriver::OSTransaction _t(osdriver.get_transaction(&t)); |
2643 | int64_t num = 0; | |
2644 | for (auto& oid : olist) { | |
2645 | if (oid == pgmeta_oid) { | |
7c673cae FG |
2646 | continue; |
2647 | } | |
9f95a23c TL |
2648 | if (oid.is_pgmeta()) { |
2649 | osd->clog->warn() << info.pgid << " found stray pgmeta-like " << oid | |
2650 | << " during PG removal"; | |
7c673cae | 2651 | } |
9f95a23c TL |
2652 | int r = snap_mapper.remove_oid(oid.hobj, &_t); |
2653 | if (r != 0 && r != -ENOENT) { | |
2654 | ceph_abort(); | |
7c673cae | 2655 | } |
9f95a23c TL |
2656 | t.remove(coll, oid); |
2657 | ++num; | |
7c673cae | 2658 | } |
f67539c2 | 2659 | bool running = true; |
9f95a23c TL |
2660 | if (num) { |
2661 | dout(20) << __func__ << " deleting " << num << " objects" << dendl; | |
2662 | Context *fin = new C_DeleteMore(this, get_osdmap_epoch()); | |
2663 | t.register_on_commit(fin); | |
7c673cae | 2664 | } else { |
9f95a23c TL |
2665 | if (cct->_conf->osd_inject_failure_on_pg_removal) { |
2666 | _exit(1); | |
7c673cae | 2667 | } |
7c673cae | 2668 | |
9f95a23c TL |
2669 | // final flush here to ensure completions drop refs. Of particular concern |
2670 | // are the SnapMapper ContainerContexts. | |
2671 | { | |
2672 | PGRef pgref(this); | |
2673 | PGLog::clear_info_log(info.pgid, &t); | |
2674 | t.remove_collection(coll); | |
2675 | t.register_on_commit(new ContainerContext<PGRef>(pgref)); | |
2676 | t.register_on_applied(new ContainerContext<PGRef>(pgref)); | |
2677 | osd->store->queue_transaction(ch, std::move(t)); | |
7c673cae | 2678 | } |
9f95a23c | 2679 | ch->flush(); |
7c673cae | 2680 | |
9f95a23c TL |
2681 | if (!osd->try_finish_pg_delete(this, pool.info.get_pg_num())) { |
2682 | dout(1) << __func__ << " raced with merge, reinstantiating" << dendl; | |
2683 | ch = osd->store->create_new_collection(coll); | |
2684 | create_pg_collection(t, | |
2685 | info.pgid, | |
2686 | info.pgid.get_split_bits(pool.info.get_pg_num())); | |
2687 | init_pg_ondisk(t, info.pgid, &pool.info); | |
2688 | recovery_state.reset_last_persisted(); | |
2689 | } else { | |
2690 | recovery_state.set_delete_complete(); | |
7c673cae | 2691 | |
9f95a23c TL |
2692 | // cancel reserver here, since the PG is about to get deleted and the |
2693 | // exit() methods don't run when that happens. | |
2694 | osd->local_reserver.cancel_reservation(info.pgid); | |
7c673cae | 2695 | |
f67539c2 | 2696 | running = false; |
9f95a23c | 2697 | } |
7c673cae | 2698 | } |
f67539c2 | 2699 | return {next, running}; |
7c673cae FG |
2700 | } |
2701 | ||
9f95a23c | 2702 | int PG::pg_stat_adjust(osd_stat_t *ns) |
7c673cae | 2703 | { |
9f95a23c TL |
2704 | osd_stat_t &new_stat = *ns; |
2705 | if (is_primary()) { | |
2706 | return 0; | |
7c673cae | 2707 | } |
9f95a23c TL |
2708 | // Adjust the kb_used by adding pending backfill data |
2709 | uint64_t reserved_num_bytes = get_reserved_num_bytes(); | |
7c673cae | 2710 | |
9f95a23c TL |
2711 | // For now we don't consider projected space gains here |
2712 | // I suggest we have an optional 2 pass backfill that frees up | |
2713 | // space in a first pass. This could be triggered when at nearfull | |
2714 | // or near to backfillfull. | |
2715 | if (reserved_num_bytes > 0) { | |
2716 | // TODO: Handle compression by adjusting by the PGs average | |
2717 | // compression precentage. | |
2718 | dout(20) << __func__ << " reserved_num_bytes " << (reserved_num_bytes >> 10) << "KiB" | |
2719 | << " Before kb_used " << new_stat.statfs.kb_used() << "KiB" << dendl; | |
2720 | if (new_stat.statfs.available > reserved_num_bytes) | |
2721 | new_stat.statfs.available -= reserved_num_bytes; | |
2722 | else | |
2723 | new_stat.statfs.available = 0; | |
2724 | dout(20) << __func__ << " After kb_used " << new_stat.statfs.kb_used() << "KiB" << dendl; | |
2725 | return 1; | |
7c673cae | 2726 | } |
9f95a23c | 2727 | return 0; |
7c673cae FG |
2728 | } |
2729 | ||
11fdf7f2 TL |
2730 | void PG::dump_pgstate_history(Formatter *f) |
2731 | { | |
9f95a23c TL |
2732 | std::scoped_lock l{*this}; |
2733 | recovery_state.dump_history(f); | |
11fdf7f2 | 2734 | } |
7c673cae | 2735 | |
11fdf7f2 TL |
2736 | void PG::dump_missing(Formatter *f) |
2737 | { | |
9f95a23c | 2738 | for (auto& i : recovery_state.get_pg_log().get_missing().get_items()) { |
11fdf7f2 TL |
2739 | f->open_object_section("object"); |
2740 | f->dump_object("oid", i.first); | |
2741 | f->dump_object("missing_info", i.second); | |
9f95a23c TL |
2742 | if (recovery_state.get_missing_loc().needs_recovery(i.first)) { |
2743 | f->dump_bool( | |
2744 | "unfound", | |
2745 | recovery_state.get_missing_loc().is_unfound(i.first)); | |
11fdf7f2 | 2746 | f->open_array_section("locations"); |
9f95a23c | 2747 | for (auto l : recovery_state.get_missing_loc().get_locations(i.first)) { |
11fdf7f2 TL |
2748 | f->dump_object("shard", l); |
2749 | } | |
2750 | f->close_section(); | |
2751 | } | |
2752 | f->close_section(); | |
2753 | } | |
2754 | } | |
2755 | ||
20effc67 | 2756 | void PG::with_pg_stats(std::function<void(const pg_stat_t&, epoch_t lec)>&& f) |
11fdf7f2 | 2757 | { |
9f95a23c | 2758 | std::lock_guard l{pg_stats_publish_lock}; |
20effc67 TL |
2759 | if (pg_stats_publish) { |
2760 | f(*pg_stats_publish, pg_stats_publish->get_effective_last_epoch_clean()); | |
11fdf7f2 | 2761 | } |
11fdf7f2 TL |
2762 | } |
2763 | ||
20effc67 | 2764 | void PG::with_heartbeat_peers(std::function<void(int)>&& f) |
11fdf7f2 | 2765 | { |
9f95a23c | 2766 | std::lock_guard l{heartbeat_peer_lock}; |
11fdf7f2 TL |
2767 | for (auto p : heartbeat_peers) { |
2768 | f(p); | |
2769 | } | |
2770 | for (auto p : probe_targets) { | |
2771 | f(p); | |
2772 | } | |
9f95a23c TL |
2773 | } |
2774 | ||
2775 | uint64_t PG::get_min_alloc_size() const { | |
2776 | return osd->store->get_min_alloc_size(); | |
11fdf7f2 | 2777 | } |