]> git.proxmox.com Git - ceph.git/blame - ceph/src/osd/OSD.cc
import ceph pacific 16.2.5
[ceph.git] / ceph / src / osd / OSD.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2017 OVH
8 *
9 * This is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License version 2.1, as published by the Free Software
12 * Foundation. See file COPYING.
13 *
14 */
11fdf7f2 15
7c673cae 16#include "acconfig.h"
11fdf7f2
TL
17
18#include <cctype>
7c673cae
FG
19#include <fstream>
20#include <iostream>
11fdf7f2
TL
21#include <iterator>
22
23#include <unistd.h>
7c673cae
FG
24#include <sys/stat.h>
25#include <signal.h>
eafe8130 26#include <time.h>
7c673cae 27#include <boost/scoped_ptr.hpp>
eafe8130 28#include <boost/range/adaptor/reversed.hpp>
7c673cae
FG
29
30#ifdef HAVE_SYS_PARAM_H
31#include <sys/param.h>
32#endif
33
34#ifdef HAVE_SYS_MOUNT_H
35#include <sys/mount.h>
36#endif
37
38#include "osd/PG.h"
f67539c2
TL
39#include "osd/scrub_machine.h"
40#include "osd/pg_scrubber.h"
7c673cae
FG
41
42#include "include/types.h"
43#include "include/compat.h"
11fdf7f2 44#include "include/random.h"
7c673cae
FG
45
46#include "OSD.h"
47#include "OSDMap.h"
48#include "Watch.h"
49#include "osdc/Objecter.h"
50
51#include "common/errno.h"
52#include "common/ceph_argparse.h"
9f95a23c 53#include "common/ceph_releases.h"
224ce89b 54#include "common/ceph_time.h"
7c673cae 55#include "common/version.h"
f67539c2 56#include "common/async/blocked_completion.h"
b5b8bbf5 57#include "common/pick_address.h"
11fdf7f2
TL
58#include "common/blkdev.h"
59#include "common/numa.h"
7c673cae
FG
60
61#include "os/ObjectStore.h"
62#ifdef HAVE_LIBFUSE
63#include "os/FuseStore.h"
64#endif
65
66#include "PrimaryLogPG.h"
67
7c673cae
FG
68#include "msg/Messenger.h"
69#include "msg/Message.h"
70
71#include "mon/MonClient.h"
72
73#include "messages/MLog.h"
74
75#include "messages/MGenericMessage.h"
7c673cae
FG
76#include "messages/MOSDPing.h"
77#include "messages/MOSDFailure.h"
78#include "messages/MOSDMarkMeDown.h"
9f95a23c 79#include "messages/MOSDMarkMeDead.h"
7c673cae
FG
80#include "messages/MOSDFull.h"
81#include "messages/MOSDOp.h"
82#include "messages/MOSDOpReply.h"
83#include "messages/MOSDBackoff.h"
84#include "messages/MOSDBeacon.h"
85#include "messages/MOSDRepOp.h"
86#include "messages/MOSDRepOpReply.h"
87#include "messages/MOSDBoot.h"
88#include "messages/MOSDPGTemp.h"
11fdf7f2 89#include "messages/MOSDPGReadyToMerge.h"
7c673cae
FG
90
91#include "messages/MOSDMap.h"
92#include "messages/MMonGetOSDMap.h"
93#include "messages/MOSDPGNotify.h"
9f95a23c 94#include "messages/MOSDPGNotify2.h"
7c673cae 95#include "messages/MOSDPGQuery.h"
9f95a23c 96#include "messages/MOSDPGQuery2.h"
7c673cae
FG
97#include "messages/MOSDPGLog.h"
98#include "messages/MOSDPGRemove.h"
99#include "messages/MOSDPGInfo.h"
9f95a23c 100#include "messages/MOSDPGInfo2.h"
7c673cae 101#include "messages/MOSDPGCreate.h"
11fdf7f2 102#include "messages/MOSDPGCreate2.h"
7c673cae
FG
103#include "messages/MBackfillReserve.h"
104#include "messages/MRecoveryReserve.h"
c07f9fc5 105#include "messages/MOSDForceRecovery.h"
7c673cae
FG
106#include "messages/MOSDECSubOpWrite.h"
107#include "messages/MOSDECSubOpWriteReply.h"
108#include "messages/MOSDECSubOpRead.h"
109#include "messages/MOSDECSubOpReadReply.h"
110#include "messages/MOSDPGCreated.h"
111#include "messages/MOSDPGUpdateLogMissing.h"
112#include "messages/MOSDPGUpdateLogMissingReply.h"
113
11fdf7f2
TL
114#include "messages/MOSDPeeringOp.h"
115
7c673cae
FG
116#include "messages/MOSDAlive.h"
117
118#include "messages/MOSDScrub.h"
11fdf7f2 119#include "messages/MOSDScrub2.h"
7c673cae
FG
120#include "messages/MOSDRepScrub.h"
121
7c673cae
FG
122#include "messages/MCommand.h"
123#include "messages/MCommandReply.h"
124
125#include "messages/MPGStats.h"
7c673cae
FG
126
127#include "messages/MWatchNotify.h"
128#include "messages/MOSDPGPush.h"
129#include "messages/MOSDPGPushReply.h"
130#include "messages/MOSDPGPull.h"
131
9f95a23c
TL
132#include "messages/MMonGetPurgedSnaps.h"
133#include "messages/MMonGetPurgedSnapsReply.h"
134
7c673cae
FG
135#include "common/perf_counters.h"
136#include "common/Timer.h"
137#include "common/LogClient.h"
138#include "common/AsyncReserver.h"
139#include "common/HeartbeatMap.h"
140#include "common/admin_socket.h"
141#include "common/ceph_context.h"
142
143#include "global/signal_handler.h"
144#include "global/pidfile.h"
145
146#include "include/color.h"
147#include "perfglue/cpu_profiler.h"
148#include "perfglue/heap_profiler.h"
149
f67539c2 150#include "osd/ClassHandler.h"
7c673cae
FG
151#include "osd/OpRequest.h"
152
153#include "auth/AuthAuthorizeHandler.h"
154#include "auth/RotatingKeyRing.h"
7c673cae
FG
155
156#include "objclass/objclass.h"
157
158#include "common/cmdparse.h"
159#include "include/str_list.h"
160#include "include/util.h"
161
11fdf7f2 162#include "include/ceph_assert.h"
7c673cae
FG
163#include "common/config.h"
164#include "common/EventTrace.h"
165
11fdf7f2
TL
166#include "json_spirit/json_spirit_reader.h"
167#include "json_spirit/json_spirit_writer.h"
168
7c673cae
FG
169#ifdef WITH_LTTNG
170#define TRACEPOINT_DEFINE
171#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
172#include "tracing/osd.h"
173#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
174#undef TRACEPOINT_DEFINE
175#else
176#define tracepoint(...)
177#endif
f67539c2
TL
178#ifdef HAVE_JAEGER
179#include "common/tracer.h"
180#endif
7c673cae
FG
181
182#define dout_context cct
183#define dout_subsys ceph_subsys_osd
184#undef dout_prefix
185#define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
186
f67539c2
TL
187using std::deque;
188using std::list;
189using std::lock_guard;
190using std::make_pair;
191using std::make_tuple;
192using std::make_unique;
193using std::map;
194using std::ostream;
195using std::ostringstream;
196using std::pair;
197using std::set;
198using std::string;
199using std::stringstream;
200using std::to_string;
201using std::unique_ptr;
202using std::vector;
203
204using ceph::bufferlist;
205using ceph::bufferptr;
206using ceph::decode;
207using ceph::encode;
208using ceph::fixed_u_to_string;
209using ceph::Formatter;
210using ceph::heartbeat_handle_d;
211using ceph::make_mutex;
212
9f95a23c
TL
213using namespace ceph::osd::scheduler;
214using TOPNSPC::common::cmd_getval;
224ce89b 215
7c673cae
FG
216static ostream& _prefix(std::ostream* _dout, int whoami, epoch_t epoch) {
217 return *_dout << "osd." << whoami << " " << epoch << " ";
218}
219
7c673cae
FG
220//Initial features in new superblock.
221//Features here are also automatically upgraded
222CompatSet OSD::get_osd_initial_compat_set() {
223 CompatSet::FeatureSet ceph_osd_feature_compat;
224 CompatSet::FeatureSet ceph_osd_feature_ro_compat;
225 CompatSet::FeatureSet ceph_osd_feature_incompat;
226 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
227 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGINFO);
228 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_OLOC);
229 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEC);
230 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES);
231 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL);
232 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BIGINFO);
233 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO);
234 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG);
235 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER);
236 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HINTS);
237 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGMETA);
238 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_MISSING);
239 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_FASTINFO);
c07f9fc5 240 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_RECOVERY_DELETES);
9f95a23c 241 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER2);
7c673cae
FG
242 return CompatSet(ceph_osd_feature_compat, ceph_osd_feature_ro_compat,
243 ceph_osd_feature_incompat);
244}
245
246//Features are added here that this OSD supports.
247CompatSet OSD::get_osd_compat_set() {
248 CompatSet compat = get_osd_initial_compat_set();
249 //Any features here can be set in code, but not in initial superblock
250 compat.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
251 return compat;
252}
253
f67539c2 254OSDService::OSDService(OSD *osd, ceph::async::io_context_pool& poolctx) :
7c673cae
FG
255 osd(osd),
256 cct(osd->cct),
7c673cae
FG
257 whoami(osd->whoami), store(osd->store),
258 log_client(osd->log_client), clog(osd->clog),
259 pg_recovery_stats(osd->pg_recovery_stats),
260 cluster_messenger(osd->cluster_messenger),
261 client_messenger(osd->client_messenger),
262 logger(osd->logger),
263 recoverystate_perf(osd->recoverystate_perf),
264 monc(osd->monc),
11fdf7f2
TL
265 osd_max_object_size(cct->_conf, "osd_max_object_size"),
266 osd_skip_data_digest(cct->_conf, "osd_skip_data_digest"),
267 publish_lock{ceph::make_mutex("OSDService::publish_lock")},
268 pre_publish_lock{ceph::make_mutex("OSDService::pre_publish_lock")},
7c673cae 269 max_oldest_map(0),
eafe8130
TL
270 scrubs_local(0),
271 scrubs_remote(0),
7c673cae
FG
272 agent_valid_iterator(false),
273 agent_ops(0),
274 flush_mode_high_count(0),
275 agent_active(true),
276 agent_thread(this),
277 agent_stop_flag(false),
7c673cae
FG
278 agent_timer(osd->client_messenger->cct, agent_timer_lock),
279 last_recalibrate(ceph_clock_now()),
280 promote_max_objects(0),
281 promote_max_bytes(0),
f67539c2 282 poolctx(poolctx),
9f95a23c
TL
283 objecter(make_unique<Objecter>(osd->client_messenger->cct,
284 osd->objecter_messenger,
f67539c2 285 osd->monc, poolctx)),
11fdf7f2 286 m_objecter_finishers(cct->_conf->osd_objecter_finishers),
7c673cae
FG
287 watch_timer(osd->client_messenger->cct, watch_lock),
288 next_notif_id(0),
7c673cae 289 recovery_request_timer(cct, recovery_request_lock, false),
11fdf7f2 290 sleep_timer(cct, sleep_lock, false),
7c673cae 291 reserver_finisher(cct),
3efd9988 292 local_reserver(cct, &reserver_finisher, cct->_conf->osd_max_backfills,
7c673cae 293 cct->_conf->osd_min_recovery_priority),
3efd9988 294 remote_reserver(cct, &reserver_finisher, cct->_conf->osd_max_backfills,
7c673cae 295 cct->_conf->osd_min_recovery_priority),
3efd9988 296 snap_reserver(cct, &reserver_finisher,
7c673cae 297 cct->_conf->osd_max_trimming_pgs),
7c673cae
FG
298 recovery_ops_active(0),
299 recovery_ops_reserved(0),
300 recovery_paused(false),
7c673cae
FG
301 map_cache(cct, cct->_conf->osd_map_cache_size),
302 map_bl_cache(cct->_conf->osd_map_cache_size),
303 map_bl_inc_cache(cct->_conf->osd_map_cache_size),
7c673cae 304 cur_state(NONE),
11fdf7f2 305 cur_ratio(0), physical_ratio(0),
9f95a23c 306 boot_epoch(0), up_epoch(0), bind_epoch(0)
7c673cae
FG
307{
308 objecter->init();
11fdf7f2
TL
309
310 for (int i = 0; i < m_objecter_finishers; i++) {
311 ostringstream str;
312 str << "objecter-finisher-" << i;
9f95a23c
TL
313 auto fin = make_unique<Finisher>(osd->client_messenger->cct, str.str(), "finisher");
314 objecter_finishers.push_back(std::move(fin));
11fdf7f2 315 }
7c673cae
FG
316}
317
#ifdef PG_DEBUG_REFS
// Debug-only PG reference tracking: count outstanding refs per pgid and
// remember a live PG* so leaked references can be dumped at shutdown.
void OSDService::add_pgid(spg_t pgid, PG *pg) {
  std::lock_guard l(pgid_lock);
  if (!pgid_tracker.count(pgid)) {
    // first reference for this pgid; remember the PG instance
    live_pgs[pgid] = pg;
  }
  ++pgid_tracker[pgid];
}
void OSDService::remove_pgid(spg_t pgid, PG *pg)
{
  std::lock_guard l(pgid_lock);
  ceph_assert(pgid_tracker.count(pgid));
  ceph_assert(pgid_tracker[pgid] > 0);
  // drop one reference; forget the pgid entirely once the count hits zero
  if (--pgid_tracker[pgid] == 0) {
    pgid_tracker.erase(pgid);
    live_pgs.erase(pgid);
  }
}
void OSDService::dump_live_pgids()
{
  std::lock_guard l(pgid_lock);
  derr << "live pgids:" << dendl;
  for (const auto& entry : pgid_tracker) {
    derr << "\t" << entry << dendl;
    live_pgs[entry.first]->dump_live_ids();
  }
}
#endif
349
350
9f95a23c
TL
351ceph::signedspan OSDService::get_mnow()
352{
353 return ceph::mono_clock::now() - osd->startup_time;
354}
7c673cae 355
11fdf7f2
TL
356void OSDService::identify_splits_and_merges(
357 OSDMapRef old_map,
358 OSDMapRef new_map,
359 spg_t pgid,
360 set<pair<spg_t,epoch_t>> *split_children,
361 set<pair<spg_t,epoch_t>> *merge_pgs)
7c673cae 362{
11fdf7f2 363 if (!old_map->have_pg_pool(pgid.pool())) {
7c673cae 364 return;
7c673cae 365 }
7c673cae 366 int old_pgnum = old_map->get_pg_num(pgid.pool());
11fdf7f2
TL
367 auto p = osd->pg_num_history.pg_nums.find(pgid.pool());
368 if (p == osd->pg_num_history.pg_nums.end()) {
369 return;
370 }
371 dout(20) << __func__ << " " << pgid << " e" << old_map->get_epoch()
372 << " to e" << new_map->get_epoch()
373 << " pg_nums " << p->second << dendl;
374 deque<spg_t> queue;
375 queue.push_back(pgid);
eafe8130 376 set<spg_t> did;
11fdf7f2
TL
377 while (!queue.empty()) {
378 auto cur = queue.front();
379 queue.pop_front();
eafe8130 380 did.insert(cur);
11fdf7f2
TL
381 unsigned pgnum = old_pgnum;
382 for (auto q = p->second.lower_bound(old_map->get_epoch());
383 q != p->second.end() &&
384 q->first <= new_map->get_epoch();
385 ++q) {
386 if (pgnum < q->second) {
387 // split?
388 if (cur.ps() < pgnum) {
389 set<spg_t> children;
390 if (cur.is_split(pgnum, q->second, &children)) {
391 dout(20) << __func__ << " " << cur << " e" << q->first
392 << " pg_num " << pgnum << " -> " << q->second
393 << " children " << children << dendl;
394 for (auto i : children) {
395 split_children->insert(make_pair(i, q->first));
eafe8130
TL
396 if (!did.count(i))
397 queue.push_back(i);
11fdf7f2
TL
398 }
399 }
400 } else if (cur.ps() < q->second) {
401 dout(20) << __func__ << " " << cur << " e" << q->first
402 << " pg_num " << pgnum << " -> " << q->second
403 << " is a child" << dendl;
404 // normally we'd capture this from the parent, but it's
405 // possible the parent doesn't exist yet (it will be
406 // fabricated to allow an intervening merge). note this PG
407 // as a split child here to be sure we catch it.
408 split_children->insert(make_pair(cur, q->first));
409 } else {
410 dout(20) << __func__ << " " << cur << " e" << q->first
411 << " pg_num " << pgnum << " -> " << q->second
412 << " is post-split, skipping" << dendl;
413 }
414 } else if (merge_pgs) {
415 // merge?
416 if (cur.ps() >= q->second) {
417 if (cur.ps() < pgnum) {
418 spg_t parent;
419 if (cur.is_merge_source(pgnum, q->second, &parent)) {
420 set<spg_t> children;
421 parent.is_split(q->second, pgnum, &children);
422 dout(20) << __func__ << " " << cur << " e" << q->first
423 << " pg_num " << pgnum << " -> " << q->second
424 << " is merge source, target " << parent
425 << ", source(s) " << children << dendl;
426 merge_pgs->insert(make_pair(parent, q->first));
eafe8130
TL
427 if (!did.count(parent)) {
428 // queue (and re-scan) parent in case it might not exist yet
429 // and there are some future splits pending on it
430 queue.push_back(parent);
431 }
11fdf7f2
TL
432 for (auto c : children) {
433 merge_pgs->insert(make_pair(c, q->first));
eafe8130
TL
434 if (!did.count(c))
435 queue.push_back(c);
11fdf7f2
TL
436 }
437 }
438 } else {
439 dout(20) << __func__ << " " << cur << " e" << q->first
440 << " pg_num " << pgnum << " -> " << q->second
441 << " is beyond old pgnum, skipping" << dendl;
442 }
443 } else {
444 set<spg_t> children;
445 if (cur.is_split(q->second, pgnum, &children)) {
446 dout(20) << __func__ << " " << cur << " e" << q->first
447 << " pg_num " << pgnum << " -> " << q->second
448 << " is merge target, source " << children << dendl;
449 for (auto c : children) {
450 merge_pgs->insert(make_pair(c, q->first));
eafe8130
TL
451 if (!did.count(c))
452 queue.push_back(c);
11fdf7f2
TL
453 }
454 merge_pgs->insert(make_pair(cur, q->first));
455 }
7c673cae
FG
456 }
457 }
11fdf7f2 458 pgnum = q->second;
7c673cae
FG
459 }
460 }
461}
462
7c673cae
FG
463void OSDService::need_heartbeat_peer_update()
464{
465 osd->need_heartbeat_peer_update();
466}
467
9f95a23c
TL
468HeartbeatStampsRef OSDService::get_hb_stamps(unsigned peer)
469{
470 std::lock_guard l(hb_stamp_lock);
471 if (peer >= hb_stamps.size()) {
472 hb_stamps.resize(peer + 1);
473 }
474 if (!hb_stamps[peer]) {
475 hb_stamps[peer] = ceph::make_ref<HeartbeatStamps>(peer);
476 }
477 return hb_stamps[peer];
478}
479
480void OSDService::queue_renew_lease(epoch_t epoch, spg_t spgid)
481{
482 osd->enqueue_peering_evt(
483 spgid,
484 PGPeeringEventRef(
485 std::make_shared<PGPeeringEvent>(
486 epoch, epoch,
487 RenewLease())));
488}
489
7c673cae
FG
490void OSDService::start_shutdown()
491{
492 {
11fdf7f2 493 std::lock_guard l(agent_timer_lock);
7c673cae
FG
494 agent_timer.shutdown();
495 }
31f18b77
FG
496
497 {
11fdf7f2
TL
498 std::lock_guard l(sleep_lock);
499 sleep_timer.shutdown();
31f18b77 500 }
81eedcae
TL
501
502 {
503 std::lock_guard l(recovery_request_lock);
504 recovery_request_timer.shutdown();
505 }
7c673cae
FG
506}
507
31f18b77 508void OSDService::shutdown_reserver()
7c673cae
FG
509{
510 reserver_finisher.wait_for_empty();
511 reserver_finisher.stop();
31f18b77
FG
512}
513
514void OSDService::shutdown()
515{
9f95a23c
TL
516 mono_timer.suspend();
517
7c673cae 518 {
11fdf7f2 519 std::lock_guard l(watch_lock);
7c673cae
FG
520 watch_timer.shutdown();
521 }
522
523 objecter->shutdown();
9f95a23c 524 for (auto& f : objecter_finishers) {
11fdf7f2
TL
525 f->wait_for_empty();
526 f->stop();
7c673cae
FG
527 }
528
11fdf7f2 529 publish_map(OSDMapRef());
7c673cae
FG
530 next_osdmap = OSDMapRef();
531}
532
533void OSDService::init()
534{
535 reserver_finisher.start();
9f95a23c 536 for (auto& f : objecter_finishers) {
11fdf7f2
TL
537 f->start();
538 }
7c673cae
FG
539 objecter->set_client_incarnation(0);
540
541 // deprioritize objecter in daemonperf output
542 objecter->get_logger()->set_prio_adjust(-3);
543
544 watch_timer.init();
545 agent_timer.init();
9f95a23c 546 mono_timer.resume();
7c673cae
FG
547
548 agent_thread.create("osd_srv_agent");
549
550 if (cct->_conf->osd_recovery_delay_start)
551 defer_recovery(cct->_conf->osd_recovery_delay_start);
552}
553
554void OSDService::final_init()
555{
556 objecter->start(osdmap.get());
557}
558
559void OSDService::activate_map()
560{
561 // wake/unwake the tiering agent
9f95a23c 562 std::lock_guard l{agent_lock};
7c673cae
FG
563 agent_active =
564 !osdmap->test_flag(CEPH_OSDMAP_NOTIERAGENT) &&
565 osd->is_active();
9f95a23c 566 agent_cond.notify_all();
7c673cae
FG
567}
568
181888fb
FG
569void OSDService::request_osdmap_update(epoch_t e)
570{
571 osd->osdmap_subscribe(e, false);
572}
573
9f95a23c 574
7c673cae
FG
575class AgentTimeoutCB : public Context {
576 PGRef pg;
577public:
578 explicit AgentTimeoutCB(PGRef _pg) : pg(_pg) {}
579 void finish(int) override {
580 pg->agent_choose_mode_restart();
581 }
582};
583
584void OSDService::agent_entry()
585{
586 dout(10) << __func__ << " start" << dendl;
9f95a23c 587 std::unique_lock agent_locker{agent_lock};
7c673cae
FG
588
589 while (!agent_stop_flag) {
590 if (agent_queue.empty()) {
591 dout(20) << __func__ << " empty queue" << dendl;
9f95a23c 592 agent_cond.wait(agent_locker);
7c673cae
FG
593 continue;
594 }
595 uint64_t level = agent_queue.rbegin()->first;
596 set<PGRef>& top = agent_queue.rbegin()->second;
597 dout(10) << __func__
598 << " tiers " << agent_queue.size()
599 << ", top is " << level
600 << " with pgs " << top.size()
601 << ", ops " << agent_ops << "/"
602 << cct->_conf->osd_agent_max_ops
603 << (agent_active ? " active" : " NOT ACTIVE")
604 << dendl;
605 dout(20) << __func__ << " oids " << agent_oids << dendl;
606 int max = cct->_conf->osd_agent_max_ops - agent_ops;
607 int agent_flush_quota = max;
608 if (!flush_mode_high_count)
609 agent_flush_quota = cct->_conf->osd_agent_max_low_ops - agent_ops;
610 if (agent_flush_quota <= 0 || top.empty() || !agent_active) {
9f95a23c 611 agent_cond.wait(agent_locker);
7c673cae
FG
612 continue;
613 }
614
615 if (!agent_valid_iterator || agent_queue_pos == top.end()) {
616 agent_queue_pos = top.begin();
617 agent_valid_iterator = true;
618 }
619 PGRef pg = *agent_queue_pos;
620 dout(10) << "high_count " << flush_mode_high_count
621 << " agent_ops " << agent_ops
622 << " flush_quota " << agent_flush_quota << dendl;
9f95a23c 623 agent_locker.unlock();
7c673cae 624 if (!pg->agent_work(max, agent_flush_quota)) {
11fdf7f2 625 dout(10) << __func__ << " " << pg->pg_id
7c673cae
FG
626 << " no agent_work, delay for " << cct->_conf->osd_agent_delay_time
627 << " seconds" << dendl;
628
f67539c2 629 logger->inc(l_osd_tier_delay);
7c673cae 630 // Queue a timer to call agent_choose_mode for this pg in 5 seconds
9f95a23c 631 std::lock_guard timer_locker{agent_timer_lock};
7c673cae
FG
632 Context *cb = new AgentTimeoutCB(pg);
633 agent_timer.add_event_after(cct->_conf->osd_agent_delay_time, cb);
7c673cae 634 }
9f95a23c 635 agent_locker.lock();
7c673cae 636 }
7c673cae
FG
637 dout(10) << __func__ << " finish" << dendl;
638}
639
640void OSDService::agent_stop()
641{
642 {
11fdf7f2 643 std::lock_guard l(agent_lock);
7c673cae
FG
644
645 // By this time all ops should be cancelled
11fdf7f2 646 ceph_assert(agent_ops == 0);
7c673cae
FG
647 // By this time all PGs are shutdown and dequeued
648 if (!agent_queue.empty()) {
649 set<PGRef>& top = agent_queue.rbegin()->second;
11fdf7f2
TL
650 derr << "agent queue not empty, for example " << (*top.begin())->get_pgid() << dendl;
651 ceph_abort_msg("agent queue not empty");
7c673cae
FG
652 }
653
654 agent_stop_flag = true;
9f95a23c 655 agent_cond.notify_all();
7c673cae
FG
656 }
657 agent_thread.join();
658}
659
660// -------------------------------------
661
662void OSDService::promote_throttle_recalibrate()
663{
664 utime_t now = ceph_clock_now();
665 double dur = now - last_recalibrate;
666 last_recalibrate = now;
667 unsigned prob = promote_probability_millis;
668
669 uint64_t target_obj_sec = cct->_conf->osd_tier_promote_max_objects_sec;
670 uint64_t target_bytes_sec = cct->_conf->osd_tier_promote_max_bytes_sec;
671
672 unsigned min_prob = 1;
673
674 uint64_t attempts, obj, bytes;
675 promote_counter.sample_and_attenuate(&attempts, &obj, &bytes);
676 dout(10) << __func__ << " " << attempts << " attempts, promoted "
1adf2230 677 << obj << " objects and " << byte_u_t(bytes) << "; target "
7c673cae 678 << target_obj_sec << " obj/sec or "
1adf2230 679 << byte_u_t(target_bytes_sec) << "/sec"
7c673cae
FG
680 << dendl;
681
682 // calculate what the probability *should* be, given the targets
683 unsigned new_prob;
684 if (attempts && dur > 0) {
685 uint64_t avg_size = 1;
686 if (obj)
11fdf7f2 687 avg_size = std::max<uint64_t>(bytes / obj, 1);
7c673cae
FG
688 unsigned po = (double)target_obj_sec * dur * 1000.0 / (double)attempts;
689 unsigned pb = (double)target_bytes_sec / (double)avg_size * dur * 1000.0
690 / (double)attempts;
691 dout(20) << __func__ << " po " << po << " pb " << pb << " avg_size "
692 << avg_size << dendl;
693 if (target_obj_sec && target_bytes_sec)
11fdf7f2 694 new_prob = std::min(po, pb);
7c673cae
FG
695 else if (target_obj_sec)
696 new_prob = po;
697 else if (target_bytes_sec)
698 new_prob = pb;
699 else
700 new_prob = 1000;
701 } else {
702 new_prob = 1000;
703 }
704 dout(20) << __func__ << " new_prob " << new_prob << dendl;
705
706 // correct for persistent skew between target rate and actual rate, adjust
707 double ratio = 1.0;
708 unsigned actual = 0;
709 if (attempts && obj) {
710 actual = obj * 1000 / attempts;
711 ratio = (double)actual / (double)prob;
712 new_prob = (double)new_prob / ratio;
713 }
11fdf7f2
TL
714 new_prob = std::max(new_prob, min_prob);
715 new_prob = std::min(new_prob, 1000u);
7c673cae
FG
716
717 // adjust
718 prob = (prob + new_prob) / 2;
11fdf7f2
TL
719 prob = std::max(prob, min_prob);
720 prob = std::min(prob, 1000u);
7c673cae
FG
721 dout(10) << __func__ << " actual " << actual
722 << ", actual/prob ratio " << ratio
723 << ", adjusted new_prob " << new_prob
724 << ", prob " << promote_probability_millis << " -> " << prob
725 << dendl;
726 promote_probability_millis = prob;
727
728 // set hard limits for this interval to mitigate stampedes
91327a77
AA
729 promote_max_objects = target_obj_sec * osd->OSD_TICK_INTERVAL * 2;
730 promote_max_bytes = target_bytes_sec * osd->OSD_TICK_INTERVAL * 2;
7c673cae
FG
731}
732
733// -------------------------------------
734
735float OSDService::get_failsafe_full_ratio()
736{
737 float full_ratio = cct->_conf->osd_failsafe_full_ratio;
738 if (full_ratio > 1.0) full_ratio /= 100.0;
739 return full_ratio;
740}
741
11fdf7f2 742OSDService::s_names OSDService::recalc_full_state(float ratio, float pratio, string &inject)
7c673cae 743{
7c673cae
FG
744 // The OSDMap ratios take precendence. So if the failsafe is .95 and
745 // the admin sets the cluster full to .96, the failsafe moves up to .96
746 // too. (Not that having failsafe == full is ideal, but it's better than
747 // dropping writes before the clusters appears full.)
748 OSDMapRef osdmap = get_osdmap();
749 if (!osdmap || osdmap->get_epoch() == 0) {
11fdf7f2 750 return NONE;
7c673cae
FG
751 }
752 float nearfull_ratio = osdmap->get_nearfull_ratio();
753 float backfillfull_ratio = std::max(osdmap->get_backfillfull_ratio(), nearfull_ratio);
754 float full_ratio = std::max(osdmap->get_full_ratio(), backfillfull_ratio);
755 float failsafe_ratio = std::max(get_failsafe_full_ratio(), full_ratio);
756
9f95a23c 757 if (osdmap->require_osd_release < ceph_release_t::luminous) {
7c673cae
FG
758 // use the failsafe for nearfull and full; the mon isn't using the
759 // flags anyway because we're mid-upgrade.
760 full_ratio = failsafe_ratio;
761 backfillfull_ratio = failsafe_ratio;
762 nearfull_ratio = failsafe_ratio;
763 } else if (full_ratio <= 0 ||
764 backfillfull_ratio <= 0 ||
765 nearfull_ratio <= 0) {
766 derr << __func__ << " full_ratio, backfillfull_ratio or nearfull_ratio is <= 0" << dendl;
767 // use failsafe flag. ick. the monitor did something wrong or the user
768 // did something stupid.
769 full_ratio = failsafe_ratio;
770 backfillfull_ratio = failsafe_ratio;
771 nearfull_ratio = failsafe_ratio;
772 }
773
7c673cae 774 if (injectfull_state > NONE && injectfull) {
7c673cae 775 inject = "(Injected)";
11fdf7f2
TL
776 return injectfull_state;
777 } else if (pratio > failsafe_ratio) {
778 return FAILSAFE;
7c673cae 779 } else if (ratio > full_ratio) {
11fdf7f2 780 return FULL;
7c673cae 781 } else if (ratio > backfillfull_ratio) {
11fdf7f2 782 return BACKFILLFULL;
92f5a8d4 783 } else if (pratio > nearfull_ratio) {
11fdf7f2 784 return NEARFULL;
7c673cae 785 }
11fdf7f2
TL
786 return NONE;
787}
788
789void OSDService::check_full_status(float ratio, float pratio)
790{
791 std::lock_guard l(full_status_lock);
792
793 cur_ratio = ratio;
794 physical_ratio = pratio;
795
796 string inject;
797 s_names new_state;
798 new_state = recalc_full_state(ratio, pratio, inject);
799
7c673cae 800 dout(20) << __func__ << " cur ratio " << ratio
11fdf7f2 801 << ", physical ratio " << pratio
7c673cae
FG
802 << ", new state " << get_full_state_name(new_state)
803 << " " << inject
804 << dendl;
805
806 // warn
807 if (cur_state != new_state) {
808 dout(10) << __func__ << " " << get_full_state_name(cur_state)
809 << " -> " << get_full_state_name(new_state) << dendl;
810 if (new_state == FAILSAFE) {
c07f9fc5 811 clog->error() << "full status failsafe engaged, dropping updates, now "
7c673cae
FG
812 << (int)roundf(ratio * 100) << "% full";
813 } else if (cur_state == FAILSAFE) {
c07f9fc5
FG
814 clog->error() << "full status failsafe disengaged, no longer dropping "
815 << "updates, now " << (int)roundf(ratio * 100) << "% full";
7c673cae
FG
816 }
817 cur_state = new_state;
818 }
819}
820
821bool OSDService::need_fullness_update()
822{
823 OSDMapRef osdmap = get_osdmap();
824 s_names cur = NONE;
825 if (osdmap->exists(whoami)) {
826 if (osdmap->get_state(whoami) & CEPH_OSD_FULL) {
827 cur = FULL;
828 } else if (osdmap->get_state(whoami) & CEPH_OSD_BACKFILLFULL) {
829 cur = BACKFILLFULL;
830 } else if (osdmap->get_state(whoami) & CEPH_OSD_NEARFULL) {
831 cur = NEARFULL;
832 }
833 }
834 s_names want = NONE;
835 if (is_full())
836 want = FULL;
837 else if (is_backfillfull())
838 want = BACKFILLFULL;
839 else if (is_nearfull())
840 want = NEARFULL;
841 return want != cur;
842}
843
11fdf7f2 844bool OSDService::_check_inject_full(DoutPrefixProvider *dpp, s_names type) const
7c673cae 845{
7c673cae
FG
846 if (injectfull && injectfull_state >= type) {
847 // injectfull is either a count of the number of times to return failsafe full
848 // or if -1 then always return full
849 if (injectfull > 0)
850 --injectfull;
11fdf7f2
TL
851 ldpp_dout(dpp, 10) << __func__ << " Injected " << get_full_state_name(type) << " OSD ("
852 << (injectfull < 0 ? "set" : std::to_string(injectfull)) << ")"
853 << dendl;
7c673cae
FG
854 return true;
855 }
11fdf7f2
TL
856 return false;
857}
858
859bool OSDService::_check_full(DoutPrefixProvider *dpp, s_names type) const
860{
861 std::lock_guard l(full_status_lock);
862
863 if (_check_inject_full(dpp, type))
864 return true;
865
866 if (cur_state >= type)
867 ldpp_dout(dpp, 10) << __func__ << " current usage is " << cur_ratio
868 << " physical " << physical_ratio << dendl;
7c673cae 869
7c673cae
FG
870 return cur_state >= type;
871}
872
11fdf7f2
TL
873bool OSDService::_tentative_full(DoutPrefixProvider *dpp, s_names type, uint64_t adjust_used, osd_stat_t adjusted_stat)
874{
875 ldpp_dout(dpp, 20) << __func__ << " type " << get_full_state_name(type) << " adjust_used " << (adjust_used >> 10) << "KiB" << dendl;
876 {
877 std::lock_guard l(full_status_lock);
878 if (_check_inject_full(dpp, type)) {
879 return true;
880 }
881 }
882
883 float pratio;
884 float ratio = compute_adjusted_ratio(adjusted_stat, &pratio, adjust_used);
885
886 string notused;
887 s_names tentative_state = recalc_full_state(ratio, pratio, notused);
888
889 if (tentative_state >= type)
890 ldpp_dout(dpp, 10) << __func__ << " tentative usage is " << ratio << dendl;
891
892 return tentative_state >= type;
893}
894
895bool OSDService::check_failsafe_full(DoutPrefixProvider *dpp) const
896{
897 return _check_full(dpp, FAILSAFE);
898}
899
900bool OSDService::check_full(DoutPrefixProvider *dpp) const
7c673cae 901{
11fdf7f2 902 return _check_full(dpp, FULL);
7c673cae
FG
903}
904
11fdf7f2 905bool OSDService::tentative_backfill_full(DoutPrefixProvider *dpp, uint64_t adjust_used, osd_stat_t stats)
7c673cae 906{
11fdf7f2 907 return _tentative_full(dpp, BACKFILLFULL, adjust_used, stats);
7c673cae
FG
908}
909
11fdf7f2 910bool OSDService::check_backfill_full(DoutPrefixProvider *dpp) const
7c673cae 911{
11fdf7f2 912 return _check_full(dpp, BACKFILLFULL);
7c673cae
FG
913}
914
11fdf7f2 915bool OSDService::check_nearfull(DoutPrefixProvider *dpp) const
7c673cae 916{
11fdf7f2 917 return _check_full(dpp, NEARFULL);
7c673cae
FG
918}
919
920bool OSDService::is_failsafe_full() const
921{
11fdf7f2 922 std::lock_guard l(full_status_lock);
7c673cae
FG
923 return cur_state == FAILSAFE;
924}
925
926bool OSDService::is_full() const
927{
11fdf7f2 928 std::lock_guard l(full_status_lock);
7c673cae
FG
929 return cur_state >= FULL;
930}
931
932bool OSDService::is_backfillfull() const
933{
11fdf7f2 934 std::lock_guard l(full_status_lock);
7c673cae
FG
935 return cur_state >= BACKFILLFULL;
936}
937
938bool OSDService::is_nearfull() const
939{
11fdf7f2 940 std::lock_guard l(full_status_lock);
7c673cae
FG
941 return cur_state >= NEARFULL;
942}
943
944void OSDService::set_injectfull(s_names type, int64_t count)
945{
11fdf7f2 946 std::lock_guard l(full_status_lock);
7c673cae
FG
947 injectfull_state = type;
948 injectfull = count;
949}
950
// Record the latest store statfs result (plus any ObjectStore health
// alerts) into osd_stat and the perf counters.  When
// fake_statfs_for_testing is set, total/available are overridden so many
// OSDs can share one partition in test clusters.
void OSDService::set_statfs(const struct store_statfs_t &stbuf,
			    osd_alert_list_t& alerts)
{
  uint64_t bytes = stbuf.total;
  uint64_t avail = stbuf.available;
  uint64_t used = stbuf.get_used_raw();

  // For testing fake statfs values so it doesn't matter if all
  // OSDs are using the same partition.
  if (cct->_conf->fake_statfs_for_testing) {
    uint64_t total_num_bytes = 0;
    vector<PGRef> pgs;
    osd->_get_pgs(&pgs);
    // "Used" for the fake view is the sum of all PG payload bytes.
    for (auto p : pgs) {
      total_num_bytes += p->get_stats_num_bytes();
    }
    bytes = cct->_conf->fake_statfs_for_testing;
    if (total_num_bytes < bytes)
      avail = bytes - total_num_bytes;
    else
      avail = 0;  // clamp: never report negative availability
    dout(0) << __func__ << " fake total " << cct->_conf->fake_statfs_for_testing
	    << " adjust available " << avail
	    << dendl;
    used = bytes - avail;
  }

  logger->set(l_osd_stat_bytes, bytes);
  logger->set(l_osd_stat_bytes_used, used);
  logger->set(l_osd_stat_bytes_avail, avail);

  std::lock_guard l(stat_lock);
  osd_stat.statfs = stbuf;
  osd_stat.os_alerts.clear();
  // swap() takes ownership of the caller's alert list for this OSD id.
  osd_stat.os_alerts[whoami].swap(alerts);
  if (cct->_conf->fake_statfs_for_testing) {
    osd_stat.statfs.total = bytes;
    osd_stat.statfs.available = avail;
    // For testing don't want used to go negative, so clear reserved
    osd_stat.statfs.internally_reserved = 0;
  }
}
7c673cae 993
11fdf7f2
TL
994osd_stat_t OSDService::set_osd_stat(vector<int>& hb_peers,
995 int num_pgs)
224ce89b 996{
eafe8130
TL
997 utime_t now = ceph_clock_now();
998 auto stale_time = g_conf().get_val<int64_t>("osd_mon_heartbeat_stat_stale");
11fdf7f2
TL
999 std::lock_guard l(stat_lock);
1000 osd_stat.hb_peers.swap(hb_peers);
1001 osd->op_tracker.get_age_ms_histogram(&osd_stat.op_queue_age_hist);
1002 osd_stat.num_pgs = num_pgs;
eafe8130
TL
1003 // Clean entries that aren't updated
1004 // This is called often enough that we can just remove 1 at a time
1005 for (auto i: osd_stat.hb_pingtime) {
1006 if (i.second.last_update == 0)
1007 continue;
1008 if (stale_time && now.sec() - i.second.last_update > stale_time) {
1009 dout(20) << __func__ << " time out heartbeat for osd " << i.first
1010 << " last_update " << i.second.last_update << dendl;
1011 osd_stat.hb_pingtime.erase(i.first);
1012 break;
1013 }
1014 }
11fdf7f2
TL
1015 return osd_stat;
1016}
1017
1018void OSDService::inc_osd_stat_repaired()
1019{
1020 std::lock_guard l(stat_lock);
1021 osd_stat.num_shards_repaired++;
1022 return;
1023}
1024
// Compute the usage ratio after (a) charging 'adjust_used' extra bytes
// against availability and (b) letting each PG add its pending backfill
// data via pg_stat_adjust().  Works on a by-value copy of the stats so the
// caller's snapshot is untouched.  *pratio receives the raw (unadjusted)
// used/total ratio; the return value is the adjusted ratio.
float OSDService::compute_adjusted_ratio(osd_stat_t new_stat, float *pratio,
					 uint64_t adjust_used)
{
  *pratio =
   ((float)new_stat.statfs.get_used_raw()) / ((float)new_stat.statfs.total);

  if (adjust_used) {
    dout(20) << __func__ << " Before kb_used() " << new_stat.statfs.kb_used() << dendl;
    if (new_stat.statfs.available > adjust_used)
      new_stat.statfs.available -= adjust_used;
    else
      new_stat.statfs.available = 0;  // clamp at zero
    dout(20) << __func__ << " After kb_used() " << new_stat.statfs.kb_used() << dendl;
  }

  // Check all pgs and adjust kb_used to include all pending backfill data
  int backfill_adjusted = 0;
  vector<PGRef> pgs;
  osd->_get_pgs(&pgs);
  for (auto p : pgs) {
    backfill_adjusted += p->pg_stat_adjust(&new_stat);
  }
  if (backfill_adjusted) {
    dout(20) << __func__ << " backfill adjusted " << new_stat << dendl;
  }
  return ((float)new_stat.statfs.get_used_raw()) / ((float)new_stat.statfs.total);
}
1052
// Send 'm' to 'peer' over the cluster messenger, but only if the peer is
// still up and has not restarted since 'from_epoch' (otherwise the message
// is dropped via put()).  Uses the reserved "next" map so the check is
// against the newest map the OSD knows about.
void OSDService::send_message_osd_cluster(int peer, Message *m, epoch_t from_epoch)
{
  OSDMapRef next_map = get_nextmap_reserved();
  // service map is always newer/newest
  ceph_assert(from_epoch <= next_map->get_epoch());

  if (next_map->is_down(peer) ||
      next_map->get_info(peer).up_from > from_epoch) {
    m->put();  // peer gone or rebooted: drop the message
    release_map(next_map);
    return;
  }
  ConnectionRef peer_con;
  if (peer == whoami) {
    // messages to ourselves short-circuit through the loopback connection
    peer_con = osd->cluster_messenger->get_loopback_connection();
  } else {
    peer_con = osd->cluster_messenger->connect_to_osd(
	next_map->get_cluster_addrs(peer), false, true);
  }
  // opportunistically push our newer map to the peer first
  maybe_share_map(peer_con.get(), next_map);
  peer_con->send_message(m);
  release_map(next_map);
}
1076
// Batch variant of send_message_osd_cluster(): one reserved-map
// acquire/release for a whole list of (peer, message) pairs.  Each message
// is individually dropped if its peer is down or restarted after
// 'from_epoch'.
void OSDService::send_message_osd_cluster(std::vector<std::pair<int, Message*>>& messages, epoch_t from_epoch)
{
  OSDMapRef next_map = get_nextmap_reserved();
  // service map is always newer/newest
  ceph_assert(from_epoch <= next_map->get_epoch());

  for (auto& iter : messages) {
    if (next_map->is_down(iter.first) ||
	next_map->get_info(iter.first).up_from > from_epoch) {
      iter.second->put();  // drop this one; keep processing the rest
      continue;
    }
    ConnectionRef peer_con;
    if (iter.first == whoami) {
      peer_con = osd->cluster_messenger->get_loopback_connection();
    } else {
      peer_con = osd->cluster_messenger->connect_to_osd(
	  next_map->get_cluster_addrs(iter.first), false, true);
    }
    maybe_share_map(peer_con.get(), next_map);
    peer_con->send_message(iter.second);
  }
  release_map(next_map);
}
// Get a cluster-messenger connection to 'peer', or NULL if the peer is
// down or has restarted since 'from_epoch'.
ConnectionRef OSDService::get_con_osd_cluster(int peer, epoch_t from_epoch)
{
  OSDMapRef next_map = get_nextmap_reserved();
  // service map is always newer/newest
  ceph_assert(from_epoch <= next_map->get_epoch());

  if (next_map->is_down(peer) ||
      next_map->get_info(peer).up_from > from_epoch) {
    release_map(next_map);
    return NULL;
  }
  ConnectionRef con;
  if (peer == whoami) {
    con = osd->cluster_messenger->get_loopback_connection();
  } else {
    con = osd->cluster_messenger->connect_to_osd(
	next_map->get_cluster_addrs(peer), false, true);
  }
  release_map(next_map);
  return con;
}
1122
1123pair<ConnectionRef,ConnectionRef> OSDService::get_con_osd_hb(int peer, epoch_t from_epoch)
1124{
1125 OSDMapRef next_map = get_nextmap_reserved();
1126 // service map is always newer/newest
11fdf7f2 1127 ceph_assert(from_epoch <= next_map->get_epoch());
7c673cae
FG
1128
1129 pair<ConnectionRef,ConnectionRef> ret;
1130 if (next_map->is_down(peer) ||
1131 next_map->get_info(peer).up_from > from_epoch) {
1132 release_map(next_map);
1133 return ret;
1134 }
11fdf7f2
TL
1135 ret.first = osd->hb_back_client_messenger->connect_to_osd(
1136 next_map->get_hb_back_addrs(peer));
1137 ret.second = osd->hb_front_client_messenger->connect_to_osd(
1138 next_map->get_hb_front_addrs(peer));
7c673cae
FG
1139 release_map(next_map);
1140 return ret;
1141}
1142
// Entity name this OSD uses on the cluster messenger.
entity_name_t OSDService::get_cluster_msgr_name() const
{
  return cluster_messenger->get_myname();
}
7c673cae 1147
// Record that we want to ask the mon for this pg_temp mapping.  Skips the
// update only when an identical, non-forced request is already pending
// (i.e. already sent and awaiting acknowledgement).
void OSDService::queue_want_pg_temp(pg_t pgid,
				    const vector<int>& want,
				    bool forced)
{
  std::lock_guard l(pg_temp_lock);
  auto p = pg_temp_pending.find(pgid);
  if (p == pg_temp_pending.end() ||
      p->second.acting != want ||
      forced) {
    pg_temp_wanted[pgid] = {want, forced};
  }
}
1160
1161void OSDService::remove_want_pg_temp(pg_t pgid)
1162{
11fdf7f2 1163 std::lock_guard l(pg_temp_lock);
7c673cae
FG
1164 pg_temp_wanted.erase(pgid);
1165 pg_temp_pending.erase(pgid);
1166}
1167
// Move everything we just sent from "wanted" into "pending"
// (awaiting mon ack).  Caller holds pg_temp_lock.
void OSDService::_sent_pg_temp()
{
#ifdef HAVE_STDLIB_MAP_SPLICING
  // C++17 node splicing: no element copies or reallocations.
  pg_temp_pending.merge(pg_temp_wanted);
#else
  pg_temp_pending.insert(make_move_iterator(begin(pg_temp_wanted)),
			 make_move_iterator(end(pg_temp_wanted)));
#endif
  pg_temp_wanted.clear();
}
1178
// Re-queue all pg_temp requests (e.g. after a mon reconnect): merge
// wanted into pending, then swap so everything is "wanted" again.
void OSDService::requeue_pg_temp()
{
  std::lock_guard l(pg_temp_lock);
  // wanted overrides pending. note that remove_want_pg_temp
  // clears the item out of both.
  unsigned old_wanted = pg_temp_wanted.size();
  unsigned old_pending = pg_temp_pending.size();
  _sent_pg_temp();
  pg_temp_wanted.swap(pg_temp_pending);
  dout(10) << __func__ << " " << old_wanted << " + " << old_pending << " -> "
	   << pg_temp_wanted.size() << dendl;
}
1191
// Pretty-print a pg_temp request: the acting set, plus a "(forced)" tag.
std::ostream& operator<<(std::ostream& out,
			 const OSDService::pg_temp_t& pg_temp)
{
  out << pg_temp.acting;
  if (pg_temp.forced) {
    out << " (forced)";
  }
  return out;
}
1201
// Flush all wanted pg_temp requests to the monitor.  Forced and non-forced
// requests go in two separate MOSDPGTemp messages (indexed by the bool),
// since 'forced' is a per-message flag.
void OSDService::send_pg_temp()
{
  std::lock_guard l(pg_temp_lock);
  if (pg_temp_wanted.empty())
    return;
  dout(10) << "send_pg_temp " << pg_temp_wanted << dendl;
  MOSDPGTemp *ms[2] = {nullptr, nullptr};  // [0]=normal, [1]=forced
  for (auto& [pgid, pg_temp] : pg_temp_wanted) {
    auto& m = ms[pg_temp.forced];
    if (!m) {
      // lazily create the message the first time we see this flavor
      m = new MOSDPGTemp(osdmap->get_epoch());
      m->forced = pg_temp.forced;
    }
    m->pg_temp.emplace(pgid, pg_temp.acting);
  }
  for (auto m : ms) {
    if (m) {
      monc->send_mon_message(m);
    }
  }
  // mark everything as sent / awaiting ack
  _sent_pg_temp();
}
1224
// Remember that 'pgid' finished creating and tell the mon (luminous+ only,
// since older mons don't understand MOSDPGCreated).
void OSDService::send_pg_created(pg_t pgid)
{
  std::lock_guard l(pg_created_lock);
  dout(20) << __func__ << dendl;
  auto o = get_osdmap();
  if (o->require_osd_release >= ceph_release_t::luminous) {
    pg_created.insert(pgid);
    monc->send_mon_message(new MOSDPGCreated(pgid));
  }
}
1235
// Re-announce every still-tracked created PG to the mon (e.g. after a
// mon session reset); entries are pruned separately by prune_pg_created().
void OSDService::send_pg_created()
{
  std::lock_guard l(pg_created_lock);
  dout(20) << __func__ << dendl;
  auto o = get_osdmap();
  if (o->require_osd_release >= ceph_release_t::luminous) {
    for (auto pgid : pg_created) {
      monc->send_mon_message(new MOSDPGCreated(pgid));
    }
  }
}
1247
// Drop pg_created entries whose pool is gone or no longer has the
// CREATING flag (the mon has acknowledged creation).
void OSDService::prune_pg_created()
{
  std::lock_guard l(pg_created_lock);
  dout(20) << __func__ << dendl;
  auto o = get_osdmap();
  auto i = pg_created.begin();
  while (i != pg_created.end()) {
    auto p = o->get_pg_pool(i->pool());
    if (!p || !p->has_flag(pg_pool_t::FLAG_CREATING)) {
      dout(20) << __func__ << " pruning " << *i << dendl;
      i = pg_created.erase(i);  // erase returns the next valid iterator
    } else {
      dout(20) << __func__ << " keeping " << *i << dendl;
      ++i;
    }
  }
}
1265
1266
7c673cae
FG
1267// --------------------------------------
1268// dispatch
1269
eafe8130 1270bool OSDService::can_inc_scrubs()
7c673cae
FG
1271{
1272 bool can_inc = false;
11fdf7f2 1273 std::lock_guard l(sched_scrub_lock);
7c673cae 1274
eafe8130
TL
1275 if (scrubs_local + scrubs_remote < cct->_conf->osd_max_scrubs) {
1276 dout(20) << __func__ << " == true " << scrubs_local << " local + " << scrubs_remote
1277 << " remote < max " << cct->_conf->osd_max_scrubs << dendl;
7c673cae
FG
1278 can_inc = true;
1279 } else {
eafe8130
TL
1280 dout(20) << __func__ << " == false " << scrubs_local << " local + " << scrubs_remote
1281 << " remote >= max " << cct->_conf->osd_max_scrubs << dendl;
7c673cae
FG
1282 }
1283
1284 return can_inc;
1285}
1286
eafe8130 1287bool OSDService::inc_scrubs_local()
7c673cae
FG
1288{
1289 bool result = false;
eafe8130
TL
1290 std::lock_guard l{sched_scrub_lock};
1291 if (scrubs_local + scrubs_remote < cct->_conf->osd_max_scrubs) {
1292 dout(20) << __func__ << " " << scrubs_local << " -> " << (scrubs_local+1)
1293 << " (max " << cct->_conf->osd_max_scrubs << ", remote " << scrubs_remote << ")" << dendl;
7c673cae 1294 result = true;
eafe8130 1295 ++scrubs_local;
7c673cae 1296 } else {
eafe8130 1297 dout(20) << __func__ << " " << scrubs_local << " local + " << scrubs_remote << " remote >= max " << cct->_conf->osd_max_scrubs << dendl;
7c673cae 1298 }
7c673cae
FG
1299 return result;
1300}
1301
eafe8130 1302void OSDService::dec_scrubs_local()
7c673cae 1303{
eafe8130
TL
1304 std::lock_guard l{sched_scrub_lock};
1305 dout(20) << __func__ << " " << scrubs_local << " -> " << (scrubs_local-1)
1306 << " (max " << cct->_conf->osd_max_scrubs << ", remote " << scrubs_remote << ")" << dendl;
1307 --scrubs_local;
1308 ceph_assert(scrubs_local >= 0);
7c673cae
FG
1309}
1310
eafe8130 1311bool OSDService::inc_scrubs_remote()
7c673cae 1312{
eafe8130
TL
1313 bool result = false;
1314 std::lock_guard l{sched_scrub_lock};
1315 if (scrubs_local + scrubs_remote < cct->_conf->osd_max_scrubs) {
1316 dout(20) << __func__ << " " << scrubs_remote << " -> " << (scrubs_remote+1)
1317 << " (max " << cct->_conf->osd_max_scrubs << ", local " << scrubs_local << ")" << dendl;
1318 result = true;
1319 ++scrubs_remote;
7c673cae 1320 } else {
eafe8130 1321 dout(20) << __func__ << " " << scrubs_local << " local + " << scrubs_remote << " remote >= max " << cct->_conf->osd_max_scrubs << dendl;
7c673cae 1322 }
eafe8130
TL
1323 return result;
1324}
1325
// Release a remote scrub slot taken by inc_scrubs_remote().
void OSDService::dec_scrubs_remote()
{
  std::lock_guard l{sched_scrub_lock};
  dout(20) << __func__ << " " << scrubs_remote << " -> " << (scrubs_remote-1)
	   << " (max " << cct->_conf->osd_max_scrubs << ", local " << scrubs_local << ")" << dendl;
  --scrubs_remote;
  ceph_assert(scrubs_remote >= 0);  // catches unbalanced inc/dec
}
1334
eafe8130 1335void OSDService::dump_scrub_reservations(Formatter *f)
7c673cae 1336{
eafe8130
TL
1337 std::lock_guard l{sched_scrub_lock};
1338 f->dump_int("scrubs_local", scrubs_local);
1339 f->dump_int("scrubs_remote", scrubs_remote);
1340 f->dump_int("osd_max_scrubs", cct->_conf->osd_max_scrubs);
7c673cae
FG
1341}
1342
// Atomically read any subset of {boot, up, bind} epochs; pass nullptr
// for values the caller doesn't need.
void OSDService::retrieve_epochs(epoch_t *_boot_epoch, epoch_t *_up_epoch,
				 epoch_t *_bind_epoch) const
{
  std::lock_guard l(epoch_lock);
  if (_boot_epoch)
    *_boot_epoch = boot_epoch;
  if (_up_epoch)
    *_up_epoch = up_epoch;
  if (_bind_epoch)
    *_bind_epoch = bind_epoch;
}
1354
// Atomically update any subset of {boot, up, bind} epochs.  Epochs may
// only move forward (or be reset to 0), which the asserts enforce.
void OSDService::set_epochs(const epoch_t *_boot_epoch, const epoch_t *_up_epoch,
			    const epoch_t *_bind_epoch)
{
  std::lock_guard l(epoch_lock);
  if (_boot_epoch) {
    ceph_assert(*_boot_epoch == 0 || *_boot_epoch >= boot_epoch);
    boot_epoch = *_boot_epoch;
  }
  if (_up_epoch) {
    ceph_assert(*_up_epoch == 0 || *_up_epoch >= up_epoch);
    up_epoch = *_up_epoch;
  }
  if (_bind_epoch) {
    ceph_assert(*_bind_epoch == 0 || *_bind_epoch >= bind_epoch);
    bind_epoch = *_bind_epoch;
  }
}
1372
// Begin an orderly shutdown.  If we are up in the map, send MOSDMarkMeDown
// (requesting an ack) and wait up to osd_mon_shutdown_timeout for the mon
// to acknowledge via got_stop_ack(); then enter STOPPING regardless.
// Returns false if a stop is already in progress.
bool OSDService::prepare_to_stop()
{
  std::unique_lock l(is_stopping_lock);
  if (get_state() != NOT_STOPPING)
    return false;

  OSDMapRef osdmap = get_osdmap();
  if (osdmap && osdmap->is_up(whoami)) {
    dout(0) << __func__ << " telling mon we are shutting down" << dendl;
    set_state(PREPARING_TO_STOP);
    monc->send_mon_message(
      new MOSDMarkMeDown(
	monc->get_fsid(),
	whoami,
	osdmap->get_addrs(whoami),
	osdmap->get_epoch(),
	true // request ack
	));
    // Wait for got_stop_ack() to flip the state; times out so shutdown
    // cannot hang on an unresponsive mon.
    const auto timeout = ceph::make_timespan(cct->_conf->osd_mon_shutdown_timeout);
    is_stopping_cond.wait_for(l, timeout,
      [this] { return get_state() == STOPPING; });
  }
  dout(0) << __func__ << " starting shutdown" << dendl;
  set_state(STOPPING);
  return true;
}
1399
// Mon acknowledged our mark-me-down: move to STOPPING and wake the
// waiter in prepare_to_stop().  Ignored if we weren't waiting.
void OSDService::got_stop_ack()
{
  std::scoped_lock l(is_stopping_lock);
  if (get_state() == PREPARING_TO_STOP) {
    dout(0) << __func__ << " starting shutdown" << dendl;
    set_state(STOPPING);
    is_stopping_cond.notify_all();
  } else {
    dout(10) << __func__ << " ignoring msg" << dendl;
  }
}
1411
// Build an MOSDMap carrying maps (since, to], bounded by
// osd_map_message_max count and osd_map_message_max_bytes.  Starts with a
// full map if we no longer have the epoch the target wants; prefers
// incrementals, falling back to full maps per epoch.  On a load failure
// ("panic"), sends whatever was gathered, or as a last resort the newest
// map alone (aborting only if even that cannot be read).
MOSDMap *OSDService::build_incremental_map_msg(epoch_t since, epoch_t to,
					       OSDSuperblock& sblock)
{
  MOSDMap *m = new MOSDMap(monc->get_fsid(),
			   osdmap->get_encoding_features());
  m->oldest_map = max_oldest_map;
  m->newest_map = sblock.newest_map;

  int max = cct->_conf->osd_map_message_max;
  ssize_t max_bytes = cct->_conf->osd_map_message_max_bytes;

  if (since < m->oldest_map) {
    // we don't have the next map the target wants, so start with a
    // full map.
    bufferlist bl;
    dout(10) << __func__ << " oldest map " << max_oldest_map << " > since "
	     << since << ", starting with full map" << dendl;
    since = m->oldest_map;
    if (!get_map_bl(since, bl)) {
      derr << __func__ << " missing full map " << since << dendl;
      goto panic;
    }
    max--;
    max_bytes -= bl.length();
    m->maps[since] = std::move(bl);
  }
  for (epoch_t e = since + 1; e <= to; ++e) {
    bufferlist bl;
    if (get_inc_map_bl(e, bl)) {
      m->incremental_maps[e] = std::move(bl);
    } else {
      // no incremental on disk for this epoch; fall back to the full map
      dout(10) << __func__ << " missing incremental map " << e << dendl;
      if (!get_map_bl(e, bl)) {
	derr << __func__ << " also missing full map " << e << dendl;
	goto panic;
      }
      m->maps[e] = std::move(bl);
    }
    max--;
    max_bytes -= bl.length();
    if (max <= 0 || max_bytes <= 0) {
      break;  // message budget exhausted; recipient will ask for more
    }
  }
  return m;

 panic:
  if (!m->maps.empty() ||
      !m->incremental_maps.empty()) {
    // send what we have so far
    return m;
  }
  // send something
  bufferlist bl;
  if (get_inc_map_bl(m->newest_map, bl)) {
    m->incremental_maps[m->newest_map] = std::move(bl);
  } else {
    derr << __func__ << " unable to load latest map " << m->newest_map << dendl;
    if (!get_map_bl(m->newest_map, bl)) {
      derr << __func__ << " unable to load latest full map " << m->newest_map
	   << dendl;
      ceph_abort();  // cannot even read our newest map: unrecoverable
    }
    m->maps[m->newest_map] = std::move(bl);
  }
  return m;
}
1479
// Send an already-built MOSDMap over the given connection.
void OSDService::send_map(MOSDMap *m, Connection *con)
{
  con->send_message(m);
}
1484
1485void OSDService::send_incremental_map(epoch_t since, Connection *con,
9f95a23c 1486 const OSDMapRef& osdmap)
7c673cae
FG
1487{
1488 epoch_t to = osdmap->get_epoch();
1489 dout(10) << "send_incremental_map " << since << " -> " << to
1490 << " to " << con << " " << con->get_peer_addr() << dendl;
1491
1492 MOSDMap *m = NULL;
1493 while (!m) {
1494 OSDSuperblock sblock(get_superblock());
1495 if (since < sblock.oldest_map) {
1496 // just send latest full map
28e407b8
AA
1497 MOSDMap *m = new MOSDMap(monc->get_fsid(),
1498 osdmap->get_encoding_features());
7c673cae
FG
1499 m->oldest_map = max_oldest_map;
1500 m->newest_map = sblock.newest_map;
1501 get_map_bl(to, m->maps[to]);
1502 send_map(m, con);
1503 return;
1504 }
1505
1506 if (to > since && (int64_t)(to - since) > cct->_conf->osd_map_share_max_epochs) {
1507 dout(10) << " " << (to - since) << " > max " << cct->_conf->osd_map_share_max_epochs
1508 << ", only sending most recent" << dendl;
1509 since = to - cct->_conf->osd_map_share_max_epochs;
1510 }
1511
7c673cae
FG
1512 m = build_incremental_map_msg(since, to, sblock);
1513 }
1514 send_map(m, con);
1515}
1516
// Fetch the encoded full map for epoch 'e': try the in-memory cache, then
// the meta collection on disk (populating the cache on a hit).  Caller
// holds map_cache_lock (leading underscore = lock-held helper).
bool OSDService::_get_map_bl(epoch_t e, bufferlist& bl)
{
  bool found = map_bl_cache.lookup(e, &bl);
  if (found) {
    logger->inc(l_osd_map_bl_cache_hit);
    return true;
  }
  logger->inc(l_osd_map_bl_cache_miss);
  found = store->read(meta_ch,
		      OSD::get_osdmap_pobject_name(e), 0, 0, bl,
		      CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) >= 0;
  if (found) {
    _add_map_bl(e, bl);  // cache for next time
  }
  return found;
}
1533
// Fetch the encoded incremental map for epoch 'e' (cache, then disk).
// Takes map_cache_lock itself, unlike _get_map_bl().
bool OSDService::get_inc_map_bl(epoch_t e, bufferlist& bl)
{
  std::lock_guard l(map_cache_lock);
  bool found = map_bl_inc_cache.lookup(e, &bl);
  if (found) {
    logger->inc(l_osd_map_bl_cache_hit);
    return true;
  }
  logger->inc(l_osd_map_bl_cache_miss);
  found = store->read(meta_ch,
		      OSD::get_inc_osdmap_pobject_name(e), 0, 0, bl,
		      CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) >= 0;
  if (found) {
    _add_map_inc_bl(e, bl);  // cache for next time
  }
  return found;
}
1551
// Insert a full-map buffer into the cache, first making it contiguous and
// accounting it to the osd_mapbl mempool.
void OSDService::_add_map_bl(epoch_t e, bufferlist& bl)
{
  dout(10) << "add_map_bl " << e << " " << bl.length() << " bytes" << dendl;
  // cache a contiguous buffer
  if (bl.get_num_buffers() > 1) {
    bl.rebuild();
  }
  bl.try_assign_to_mempool(mempool::mempool_osd_mapbl);
  map_bl_cache.add(e, bl);
}
1562
// Insert an incremental-map buffer into the cache (same contiguity and
// mempool treatment as _add_map_bl()).
void OSDService::_add_map_inc_bl(epoch_t e, bufferlist& bl)
{
  dout(10) << "add_map_inc_bl " << e << " " << bl.length() << " bytes" << dendl;
  // cache a contiguous buffer
  if (bl.get_num_buffers() > 1) {
    bl.rebuild();
  }
  bl.try_assign_to_mempool(mempool::mempool_osd_mapbl);
  map_bl_inc_cache.add(e, bl);
}
1573
// Register a decoded OSDMap in the map cache, optionally deduping shared
// structures against a nearby cached epoch.  Takes ownership of 'o'
// (deleted if an entry for that epoch already existed).
OSDMapRef OSDService::_add_map(OSDMap *o)
{
  epoch_t e = o->get_epoch();

  if (cct->_conf->osd_map_dedup) {
    // Dedup against an existing map at a nearby epoch
    OSDMapRef for_dedup = map_cache.lower_bound(e);
    if (for_dedup) {
      OSDMap::dedup(for_dedup.get(), o);
    }
  }
  bool existed;
  OSDMapRef l = map_cache.add(e, o, &existed);
  if (existed) {
    delete o;  // cache kept its existing copy; ours is redundant
  }
  return l;
}
1592
// Return the OSDMap for 'epoch', from the decoded-map cache or by loading
// and decoding it from disk.  Returns an empty ref if the epoch's buffer
// cannot be read; epoch 0 yields a fresh blank map.
OSDMapRef OSDService::try_get_map(epoch_t epoch)
{
  std::lock_guard l(map_cache_lock);
  OSDMapRef retval = map_cache.lookup(epoch);
  if (retval) {
    dout(30) << "get_map " << epoch << " -cached" << dendl;
    logger->inc(l_osd_map_cache_hit);
    return retval;
  }
  {
    logger->inc(l_osd_map_cache_miss);
    epoch_t lb = map_cache.cached_key_lower_bound();
    if (epoch < lb) {
      // requested epoch predates everything cached; track how far below
      dout(30) << "get_map " << epoch << " - miss, below lower bound" << dendl;
      logger->inc(l_osd_map_cache_miss_low);
      logger->inc(l_osd_map_cache_miss_low_avg, lb - epoch);
    }
  }

  OSDMap *map = new OSDMap;
  if (epoch > 0) {
    dout(20) << "get_map " << epoch << " - loading and decoding " << map << dendl;
    bufferlist bl;
    if (!_get_map_bl(epoch, bl) || bl.length() == 0) {
      derr << "failed to load OSD map for epoch " << epoch << ", got " << bl.length() << " bytes" << dendl;
      delete map;
      return OSDMapRef();
    }
    map->decode(bl);
  } else {
    dout(20) << "get_map " << epoch << " - return initial " << map << dendl;
  }
  return _add_map(map);  // cache takes ownership
}
1627
1628// ops
1629
1630
// Convenience overload: error reply with no versions or op returns.
void OSDService::reply_op_error(OpRequestRef op, int err)
{
  reply_op_error(op, err, eversion_t(), 0, {});
}
1635
// Build and send an MOSDOpReply for a failed op, preserving the client's
// ACK/ONDISK flags and attaching version info and per-op return values.
void OSDService::reply_op_error(OpRequestRef op, int err, eversion_t v,
				version_t uv,
				vector<pg_log_op_return_item_t> op_returns)
{
  auto m = op->get_req<MOSDOp>();
  ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);
  int flags;
  flags = m->get_flags() & (CEPH_OSD_FLAG_ACK|CEPH_OSD_FLAG_ONDISK);

  // final arg: only include op data payload if the client asked for
  // per-op return vectors (RETURNVEC)
  MOSDOpReply *reply = new MOSDOpReply(m, err, osdmap->get_epoch(), flags,
				       !m->has_flag(CEPH_OSD_FLAG_RETURNVEC));
  reply->set_reply_versions(v, uv);
  reply->set_op_returns(op_returns);
  m->get_connection()->send_message(reply);
}
1651
// Debug-only diagnostics (gated on osd_debug_misdirected_ops) for an op
// that arrived at a PG which is not its primary.  For EC pools a benign
// race (see inline comment) is detected and silently ignored; otherwise a
// cluster-log warning is emitted.
void OSDService::handle_misdirected_op(PG *pg, OpRequestRef op)
{
  if (!cct->_conf->osd_debug_misdirected_ops) {
    return;
  }

  auto m = op->get_req<MOSDOp>();
  ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);

  ceph_assert(m->get_map_epoch() >= pg->get_history().same_primary_since);

  if (pg->is_ec_pg()) {
    /**
     * OSD recomputes op target based on current OSDMap. With an EC pg, we
     * can get this result:
     * 1) client at map 512 sends an op to osd 3, pg_t 3.9 based on mapping
     *    [CRUSH_ITEM_NONE, 2, 3]/3
     * 2) OSD 3 at map 513 remaps op to osd 3, spg_t 3.9s0 based on mapping
     *    [3, 2, 3]/3
     * 3) PG 3.9s0 dequeues the op at epoch 512 and notices that it isn't primary
     *    -- misdirected op
     * 4) client resends and this time PG 3.9s0 having caught up to 513 gets
     *    it and fulfils it
     *
     * We can't compute the op target based on the sending map epoch due to
     * splitting.  The simplest thing is to detect such cases here and drop
     * them without an error (the client will resend anyway).
     */
    ceph_assert(m->get_map_epoch() <= superblock.newest_map);
    OSDMapRef opmap = try_get_map(m->get_map_epoch());
    if (!opmap) {
      dout(7) << __func__ << ": " << *pg << " no longer have map for "
	      << m->get_map_epoch() << ", dropping" << dendl;
      return;
    }
    pg_t _pgid = m->get_raw_pg();
    spg_t pgid;
    if ((m->get_flags() & CEPH_OSD_FLAG_PGOP) == 0)
      _pgid = opmap->raw_pg_to_pg(_pgid);
    if (opmap->get_primary_shard(_pgid, &pgid) &&
	pgid.shard != pg->pg_id.shard) {
      // shard moved between epochs: the benign race described above
      dout(7) << __func__ << ": " << *pg << " primary changed since "
	      << m->get_map_epoch() << ", dropping" << dendl;
      return;
    }
  }

  dout(7) << *pg << " misdirected op in " << m->get_map_epoch() << dendl;
  clog->warn() << m->get_source_inst() << " misdirected " << m->get_reqid()
	       << " pg " << m->get_raw_pg()
	       << " to osd." << whoami
	       << " not " << pg->get_acting()
	       << " in e" << m->get_map_epoch() << "/" << osdmap->get_epoch();
}
1706
9f95a23c 1707void OSDService::enqueue_back(OpSchedulerItem&& qi)
7c673cae 1708{
11fdf7f2 1709 osd->op_shardedwq.queue(std::move(qi));
7c673cae
FG
1710}
1711
9f95a23c 1712void OSDService::enqueue_front(OpSchedulerItem&& qi)
7c673cae 1713{
11fdf7f2 1714 osd->op_shardedwq.queue_front(std::move(qi));
7c673cae
FG
1715}
1716
// Queue a generic recovery continuation 'c' for 'pg' at the current map
// epoch, with configured recovery cost/priority.
void OSDService::queue_recovery_context(
  PG *pg,
  GenContext<ThreadPool::TPHandle&> *c)
{
  epoch_t e = get_osdmap_epoch();
  enqueue_back(
    OpSchedulerItem(
      unique_ptr<OpSchedulerItem::OpQueueable>(
	new PGRecoveryContext(pg->get_pgid(), c, e)),
      cct->_conf->osd_recovery_cost,
      cct->_conf->osd_recovery_priority,
      ceph_clock_now(),
      0,
      e));
}
1732
// Queue a snapshot-trim work item for 'pg' with configured snap-trim
// cost/priority.
void OSDService::queue_for_snap_trim(PG *pg)
{
  dout(10) << "queueing " << *pg << " for snaptrim" << dendl;
  enqueue_back(
    OpSchedulerItem(
      unique_ptr<OpSchedulerItem::OpQueueable>(
	new PGSnapTrim(pg->get_pgid(), pg->get_osdmap_epoch())),
      cct->_conf->osd_snap_trim_cost,
      cct->_conf->osd_snap_trim_priority,
      ceph_clock_now(),
      0,
      pg->get_osdmap_epoch()));
}
1746
// Queue a scrub-FSM event message of type MSG_TYPE for 'pg'.  This
// overload lets the caller supply the queue priority explicitly.
template <class MSG_TYPE>
void OSDService::queue_scrub_event_msg(PG* pg,
				       Scrub::scrub_prio_t with_priority,
				       unsigned int qu_priority)
{
  const auto epoch = pg->get_osdmap_epoch();
  auto msg = new MSG_TYPE(pg->get_pgid(), epoch);
  dout(15) << "queue a scrub event (" << *msg << ") for " << *pg << ". Epoch: " << epoch << dendl;

  enqueue_back(OpSchedulerItem(
    unique_ptr<OpSchedulerItem::OpQueueable>(msg), cct->_conf->osd_scrub_cost,
    pg->scrub_requeue_priority(with_priority, qu_priority), ceph_clock_now(), 0, epoch));
}

// As above, but the queue priority is derived from 'with_priority' alone.
template <class MSG_TYPE>
void OSDService::queue_scrub_event_msg(PG* pg, Scrub::scrub_prio_t with_priority)
{
  const auto epoch = pg->get_osdmap_epoch();
  auto msg = new MSG_TYPE(pg->get_pgid(), epoch);
  dout(15) << "queue a scrub event (" << *msg << ") for " << *pg << ". Epoch: " << epoch << dendl;

  enqueue_back(OpSchedulerItem(
    unique_ptr<OpSchedulerItem::OpQueueable>(msg), cct->_conf->osd_scrub_cost,
    pg->scrub_requeue_priority(with_priority), ceph_clock_now(), 0, epoch));
}
1772
// Thin typed wrappers over queue_scrub_event_msg<>: each queues one
// specific scrub-FSM event type for the PG.

// Queues a PGScrub item (start a regular scrub).
void OSDService::queue_for_scrub(PG* pg, Scrub::scrub_prio_t with_priority)
{
  queue_scrub_event_msg<PGScrub>(pg, with_priority);
}

// Queues a PGScrubAfterRepair item.
void OSDService::queue_scrub_after_repair(PG* pg, Scrub::scrub_prio_t with_priority)
{
  queue_scrub_event_msg<PGScrubAfterRepair>(pg, with_priority);
}

// Queues a PGRepScrub item (replica-side scrub) at the given priority.
void OSDService::queue_for_rep_scrub(PG* pg,
				     Scrub::scrub_prio_t with_priority,
				     unsigned int qu_priority)
{
  queue_scrub_event_msg<PGRepScrub>(pg, with_priority, qu_priority);
}

void OSDService::queue_for_rep_scrub_resched(PG* pg,
					     Scrub::scrub_prio_t with_priority,
					     unsigned int qu_priority)
{
  // Resulting scrub event: 'SchedReplica'
  queue_scrub_event_msg<PGRepScrubResched>(pg, with_priority, qu_priority);
}

void OSDService::queue_for_scrub_granted(PG* pg, Scrub::scrub_prio_t with_priority)
{
  // Resulting scrub event: 'RemotesReserved'
  queue_scrub_event_msg<PGScrubResourcesOK>(pg, with_priority);
}

void OSDService::queue_for_scrub_denied(PG* pg, Scrub::scrub_prio_t with_priority)
{
  // Resulting scrub event: 'ReservationFailure'
  queue_scrub_event_msg<PGScrubDenied>(pg, with_priority);
}

void OSDService::queue_for_scrub_resched(PG* pg, Scrub::scrub_prio_t with_priority)
{
  // Resulting scrub event: 'InternalSchedScrub'
  queue_scrub_event_msg<PGScrubResched>(pg, with_priority);
}

void OSDService::queue_scrub_pushes_update(PG* pg, Scrub::scrub_prio_t with_priority)
{
  // Resulting scrub event: 'ActivePushesUpd'
  queue_scrub_event_msg<PGScrubPushesUpdate>(pg, with_priority);
}

// Queues a PGScrubAppliedUpdate item.
void OSDService::queue_scrub_applied_update(PG* pg, Scrub::scrub_prio_t with_priority)
{
  queue_scrub_event_msg<PGScrubAppliedUpdate>(pg, with_priority);
}

void OSDService::queue_scrub_unblocking(PG* pg, Scrub::scrub_prio_t with_priority)
{
  // Resulting scrub event: 'Unblocked'
  queue_scrub_event_msg<PGScrubUnblocked>(pg, with_priority);
}

void OSDService::queue_scrub_digest_update(PG* pg, Scrub::scrub_prio_t with_priority)
{
  // Resulting scrub event: 'DigestUpdate'
  queue_scrub_event_msg<PGScrubDigestUpdate>(pg, with_priority);
}

void OSDService::queue_scrub_got_repl_maps(PG* pg, Scrub::scrub_prio_t with_priority)
{
  // Resulting scrub event: 'GotReplicas'
  queue_scrub_event_msg<PGScrubGotReplMaps>(pg, with_priority);
}

void OSDService::queue_scrub_replica_pushes(PG *pg, Scrub::scrub_prio_t with_priority)
{
  // Resulting scrub event: 'ReplicaPushesUpd'
  queue_scrub_event_msg<PGScrubReplicaPushes>(pg, with_priority);
}
1850
// Queue asynchronous deletion work for 'pgid' at epoch 'e', with
// configured pg-delete cost/priority.
void OSDService::queue_for_pg_delete(spg_t pgid, epoch_t e)
{
  dout(10) << __func__ << " on " << pgid << " e " << e  << dendl;
  enqueue_back(
    OpSchedulerItem(
      unique_ptr<OpSchedulerItem::OpQueueable>(
	new PGDelete(pgid, e)),
      cct->_conf->osd_pg_delete_cost,
      cct->_conf->osd_pg_delete_priority,
      ceph_clock_now(),
      0,
      e));
}
1864
// Forward to OSD::try_finish_pg_delete (final removal bookkeeping).
bool OSDService::try_finish_pg_delete(PG *pg, unsigned old_pg_num)
{
  return osd->try_finish_pg_delete(pg, old_pg_num);
}
1869
1870// ---
1871
1872void OSDService::set_ready_to_merge_source(PG *pg, eversion_t version)
1873{
1874 std::lock_guard l(merge_lock);
1875 dout(10) << __func__ << " " << pg->pg_id << dendl;
1876 ready_to_merge_source[pg->pg_id.pgid] = version;
1877 assert(not_ready_to_merge_source.count(pg->pg_id.pgid) == 0);
1878 _send_ready_to_merge();
1879}
1880
1881void OSDService::set_ready_to_merge_target(PG *pg,
1882 eversion_t version,
1883 epoch_t last_epoch_started,
1884 epoch_t last_epoch_clean)
1885{
1886 std::lock_guard l(merge_lock);
1887 dout(10) << __func__ << " " << pg->pg_id << dendl;
1888 ready_to_merge_target.insert(make_pair(pg->pg_id.pgid,
1889 make_tuple(version,
1890 last_epoch_started,
1891 last_epoch_clean)));
1892 assert(not_ready_to_merge_target.count(pg->pg_id.pgid) == 0);
1893 _send_ready_to_merge();
1894}
1895
1896void OSDService::set_not_ready_to_merge_source(pg_t source)
1897{
1898 std::lock_guard l(merge_lock);
1899 dout(10) << __func__ << " " << source << dendl;
1900 not_ready_to_merge_source.insert(source);
1901 assert(ready_to_merge_source.count(source) == 0);
1902 _send_ready_to_merge();
1903}
1904
1905void OSDService::set_not_ready_to_merge_target(pg_t target, pg_t source)
1906{
1907 std::lock_guard l(merge_lock);
1908 dout(10) << __func__ << " " << target << " source " << source << dendl;
1909 not_ready_to_merge_target[target] = source;
1910 assert(ready_to_merge_target.count(target) == 0);
1911 _send_ready_to_merge();
1912}
1913
// Public entry point: take merge_lock and (re)send merge-readiness
// messages to the mon.
void OSDService::send_ready_to_merge()
{
  std::lock_guard l(merge_lock);
  _send_ready_to_merge();
}
1919
1920void OSDService::_send_ready_to_merge()
1921{
1922 dout(20) << __func__
1923 << " ready_to_merge_source " << ready_to_merge_source
1924 << " not_ready_to_merge_source " << not_ready_to_merge_source
1925 << " ready_to_merge_target " << ready_to_merge_target
1926 << " not_ready_to_merge_target " << not_ready_to_merge_target
1927 << " sent_ready_to_merge_source " << sent_ready_to_merge_source
1928 << dendl;
1929 for (auto src : not_ready_to_merge_source) {
1930 if (sent_ready_to_merge_source.count(src) == 0) {
1931 monc->send_mon_message(new MOSDPGReadyToMerge(
1932 src,
1933 {}, {}, 0, 0,
1934 false,
1935 osdmap->get_epoch()));
1936 sent_ready_to_merge_source.insert(src);
1937 }
1938 }
1939 for (auto p : not_ready_to_merge_target) {
1940 if (sent_ready_to_merge_source.count(p.second) == 0) {
1941 monc->send_mon_message(new MOSDPGReadyToMerge(
1942 p.second,
1943 {}, {}, 0, 0,
1944 false,
1945 osdmap->get_epoch()));
1946 sent_ready_to_merge_source.insert(p.second);
1947 }
1948 }
1949 for (auto src : ready_to_merge_source) {
1950 if (not_ready_to_merge_source.count(src.first) ||
1951 not_ready_to_merge_target.count(src.first.get_parent())) {
1952 continue;
1953 }
1954 auto p = ready_to_merge_target.find(src.first.get_parent());
1955 if (p != ready_to_merge_target.end() &&
1956 sent_ready_to_merge_source.count(src.first) == 0) {
1957 monc->send_mon_message(new MOSDPGReadyToMerge(
1958 src.first, // source pgid
1959 src.second, // src version
1960 std::get<0>(p->second), // target version
1961 std::get<1>(p->second), // PG's last_epoch_started
1962 std::get<2>(p->second), // PG's last_epoch_clean
1963 true,
1964 osdmap->get_epoch()));
1965 sent_ready_to_merge_source.insert(src.first);
1966 }
1967 }
1968}
1969
1970void OSDService::clear_ready_to_merge(PG *pg)
1971{
1972 std::lock_guard l(merge_lock);
1973 dout(10) << __func__ << " " << pg->pg_id << dendl;
1974 ready_to_merge_source.erase(pg->pg_id.pgid);
1975 ready_to_merge_target.erase(pg->pg_id.pgid);
1976 not_ready_to_merge_source.erase(pg->pg_id.pgid);
1977 not_ready_to_merge_target.erase(pg->pg_id.pgid);
1978 sent_ready_to_merge_source.erase(pg->pg_id.pgid);
1979}
1980
// Forget which sources we already reported to the mon, so the next
// _send_ready_to_merge() re-sends everything.
void OSDService::clear_sent_ready_to_merge()
{
  std::lock_guard l(merge_lock);
  sent_ready_to_merge_source.clear();
}
1986
9f95a23c 1987void OSDService::prune_sent_ready_to_merge(const OSDMapRef& osdmap)
11fdf7f2
TL
1988{
1989 std::lock_guard l(merge_lock);
1990 auto i = sent_ready_to_merge_source.begin();
1991 while (i != sent_ready_to_merge_source.end()) {
1992 if (!osdmap->pg_exists(*i)) {
1993 dout(10) << __func__ << " " << *i << dendl;
1994 i = sent_ready_to_merge_source.erase(i);
1995 } else {
1996 ++i;
1997 }
1998 }
7c673cae
FG
1999}
2000
11fdf7f2
TL
2001// ---
2002
// Enqueue a PGRecovery work item for the given (epoch, PG) pair, carrying
// the number of pushes already reserved for it.  Caller must hold
// recovery_lock (enforced below).
void OSDService::_queue_for_recovery(
  std::pair<epoch_t, PGRef> p,
  uint64_t reserved_pushes)
{
  ceph_assert(ceph_mutex_is_locked_by_me(recovery_lock));
  enqueue_back(
    OpSchedulerItem(
      unique_ptr<OpSchedulerItem::OpQueueable>(
	new PGRecovery(
	  p.second->get_pgid(), p.first, reserved_pushes)),
      cct->_conf->osd_recovery_cost,
      cct->_conf->osd_recovery_priority,
      ceph_clock_now(),
      0,   // no owner
      p.first));
}
7c673cae
FG
2019
2020// ====================================================================
2021// OSD
2022
2023#undef dout_prefix
2024#define dout_prefix *_dout
2025
// Commands shared between OSD's console and admin console:
namespace ceph::osd_cmds {

// Handler for the "heap" admin command (defined later in this file);
// presumably drives the tcmalloc heap profiler -- confirm at definition.
int heap(CephContext& cct, const cmdmap_t& cmdmap, Formatter& f, std::ostream& os);

} // namespace ceph::osd_cmds
7c673cae 2032
// Create (or validate) the on-disk state for a new OSD: mkfs the
// objectstore, write/verify the OSD superblock in the meta collection,
// and persist the identity metadata via write_meta().
// Takes ownership of 'store' and deletes it on all paths.
int OSD::mkfs(CephContext *cct, ObjectStore *store, uuid_d fsid, int whoami, string osdspec_affinity)
{
  int ret;

  OSDSuperblock sb;
  bufferlist sbbl;
  ObjectStore::CollectionHandle ch;

  // if we are fed a uuid for this osd, use it.
  store->set_fsid(cct->_conf->osd_uuid);

  ret = store->mkfs();
  if (ret) {
    derr << "OSD::mkfs: ObjectStore::mkfs failed with error "
         << cpp_strerror(ret) << dendl;
    goto free_store;
  }

  store->set_cache_shards(1);  // doesn't matter for mkfs!

  ret = store->mount();
  if (ret) {
    derr << "OSD::mkfs: couldn't mount ObjectStore: error "
         << cpp_strerror(ret) << dendl;
    goto free_store;
  }

  ch = store->open_collection(coll_t::meta());
  if (ch) {
    ret = store->read(ch, OSD_SUPERBLOCK_GOBJECT, 0, 0, sbbl);
    if (ret < 0) {
      derr << "OSD::mkfs: have meta collection but no superblock" << dendl;
      // NOTE(review): this jumps to free_store, skipping umount even though
      // the store was mounted above -- confirm this is intentional
      goto free_store;
    }
    /* if we already have superblock, check content of superblock */
    dout(0) << " have superblock" << dendl;
    auto p = sbbl.cbegin();
    decode(sb, p);
    // an existing superblock must match the identity we were given
    if (whoami != sb.whoami) {
      derr << "provided osd id " << whoami << " != superblock's " << sb.whoami
	   << dendl;
      ret = -EINVAL;
      goto umount_store;
    }
    if (fsid != sb.cluster_fsid) {
      derr << "provided cluster fsid " << fsid
	   << " != superblock's " << sb.cluster_fsid << dendl;
      ret = -EINVAL;
      goto umount_store;
    }
  } else {
    // create superblock
    sb.cluster_fsid = fsid;
    sb.osd_fsid = store->get_fsid();
    sb.whoami = whoami;
    sb.compat_features = get_osd_initial_compat_set();

    bufferlist bl;
    encode(sb, bl);

    // note: this 'ch' intentionally shadows the (empty) outer handle and
    // is released when this scope ends, before umount
    ObjectStore::CollectionHandle ch = store->create_new_collection(
      coll_t::meta());
    ObjectStore::Transaction t;
    t.create_collection(coll_t::meta(), 0);
    t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
    ret = store->queue_transaction(ch, std::move(t));
    if (ret) {
      derr << "OSD::mkfs: error while writing OSD_SUPERBLOCK_GOBJECT: "
	   << "queue_transaction returned " << cpp_strerror(ret) << dendl;
      goto umount_store;
    }
  }

  ret = write_meta(cct, store, sb.cluster_fsid, sb.osd_fsid, whoami, osdspec_affinity);
  if (ret) {
    derr << "OSD::mkfs: failed to write fsid file: error "
         << cpp_strerror(ret) << dendl;
    goto umount_store;
  }

umount_store:
  // release the meta collection handle (if any) before unmounting
  if (ch) {
    ch.reset();
  }
  store->umount();
free_store:
  delete store;
  return ret;
}
2122
e306af50 2123int OSD::write_meta(CephContext *cct, ObjectStore *store, uuid_d& cluster_fsid, uuid_d& osd_fsid, int whoami, string& osdspec_affinity)
7c673cae
FG
2124{
2125 char val[80];
2126 int r;
2127
2128 snprintf(val, sizeof(val), "%s", CEPH_OSD_ONDISK_MAGIC);
2129 r = store->write_meta("magic", val);
2130 if (r < 0)
2131 return r;
2132
2133 snprintf(val, sizeof(val), "%d", whoami);
2134 r = store->write_meta("whoami", val);
2135 if (r < 0)
2136 return r;
2137
2138 cluster_fsid.print(val);
2139 r = store->write_meta("ceph_fsid", val);
2140 if (r < 0)
2141 return r;
2142
11fdf7f2 2143 string key = cct->_conf.get_val<string>("key");
3efd9988
FG
2144 if (key.size()) {
2145 r = store->write_meta("osd_key", key);
2146 if (r < 0)
2147 return r;
b32b8144 2148 } else {
11fdf7f2 2149 string keyfile = cct->_conf.get_val<string>("keyfile");
b32b8144
FG
2150 if (!keyfile.empty()) {
2151 bufferlist keybl;
2152 string err;
11fdf7f2 2153 r = keybl.read_file(keyfile.c_str(), &err);
b32b8144
FG
2154 if (r < 0) {
2155 derr << __func__ << " failed to read keyfile " << keyfile << ": "
2156 << err << ": " << cpp_strerror(r) << dendl;
2157 return r;
2158 }
2159 r = store->write_meta("osd_key", keybl.to_str());
2160 if (r < 0)
2161 return r;
2162 }
3efd9988 2163 }
e306af50
TL
2164 if (!osdspec_affinity.empty()) {
2165 r = store->write_meta("osdspec_affinity", osdspec_affinity.c_str());
2166 if (r < 0)
2167 return r;
2168 }
3efd9988 2169
7c673cae
FG
2170 r = store->write_meta("ready", "ready");
2171 if (r < 0)
2172 return r;
2173
2174 return 0;
2175}
2176
11fdf7f2
TL
// Read the identity metadata previously persisted by write_meta() (plus
// "fsid" and "require_osd_release", written elsewhere) without mounting
// the store.  Returns 0 on success or a negative error code; a missing
// "fsid" yields a zeroed osd_fsid, and require_osd_release is only set
// when the key is present.
int OSD::peek_meta(ObjectStore *store,
		   std::string *magic,
		   uuid_d *cluster_fsid,
		   uuid_d *osd_fsid,
		   int *whoami,
		   ceph_release_t *require_osd_release)
{
  string val;

  int r = store->read_meta("magic", &val);
  if (r < 0)
    return r;
  *magic = val;

  r = store->read_meta("whoami", &val);
  if (r < 0)
    return r;
  *whoami = atoi(val.c_str());

  r = store->read_meta("ceph_fsid", &val);
  if (r < 0)
    return r;
  // uuid_d::parse returns true on success, so !r means the stored
  // fsid string is malformed
  r = cluster_fsid->parse(val.c_str());
  if (!r)
    return -EINVAL;

  r = store->read_meta("fsid", &val);
  if (r < 0) {
    // no per-OSD fsid recorded; report a nil uuid rather than an error
    *osd_fsid = uuid_d();
  } else {
    r = osd_fsid->parse(val.c_str());
    if (!r)
      return -EINVAL;
  }

  r = store->read_meta("require_osd_release", &val);
  if (r >= 0) {
    *require_osd_release = ceph_release_from_name(val);
  }

  return 0;
}
2219
2220
2221#undef dout_prefix
2222#define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
2223
2224// cons/des
2225
// OSD constructor: wires up the messengers, perf counters, op tracker and
// sharded work queues.  Heavy initialization (store mount, map load,
// thread start) happens later in init(), not here.
OSD::OSD(CephContext *cct_, ObjectStore *store_,
	 int id,
	 Messenger *internal_messenger,
	 Messenger *external_messenger,
	 Messenger *hb_client_front,
	 Messenger *hb_client_back,
	 Messenger *hb_front_serverm,
	 Messenger *hb_back_serverm,
	 Messenger *osdc_messenger,
	 MonClient *mc,
	 const std::string &dev, const std::string &jdev,
	 ceph::async::io_context_pool& poolctx) :
  Dispatcher(cct_),
  tick_timer(cct, osd_lock),
  tick_timer_without_osd_lock(cct, tick_timer_lock),
  gss_ktfile_client(cct->_conf.get_val<std::string>("gss_ktab_client_file")),
  cluster_messenger(internal_messenger),
  client_messenger(external_messenger),
  objecter_messenger(osdc_messenger),
  monc(mc),
  mgrc(cct_, client_messenger, &mc->monmap),
  logger(create_logger()),
  recoverystate_perf(create_recoverystate_perf()),
  store(store_),
  log_client(cct, client_messenger, &mc->monmap, LogClient::NO_FLAGS),
  clog(log_client.create_channel()),
  whoami(id),
  dev_path(dev), journal_path(jdev),
  store_is_rotational(store->is_rotational()),
  trace_endpoint("0.0.0.0", 0, "osd"),
  asok_hook(NULL),
  m_osd_pg_epoch_max_lag_factor(cct->_conf.get_val<double>(
				  "osd_pg_epoch_max_lag_factor")),
  osd_compat(get_osd_compat_set()),
  osd_op_tp(cct, "OSD::osd_op_tp", "tp_osd_tp",
	    get_num_op_threads()),
  heartbeat_stop(false),
  heartbeat_need_update(true),
  hb_front_client_messenger(hb_client_front),
  hb_back_client_messenger(hb_client_back),
  hb_front_server_messenger(hb_front_serverm),
  hb_back_server_messenger(hb_back_serverm),
  daily_loadavg(0.0),
  heartbeat_thread(this),
  heartbeat_dispatcher(this),
  op_tracker(cct, cct->_conf->osd_enable_op_tracker,
	     cct->_conf->osd_num_op_tracker_shard),
  test_ops_hook(NULL),
  op_shardedwq(
    this,
    ceph::make_timespan(cct->_conf->osd_op_thread_timeout),
    ceph::make_timespan(cct->_conf->osd_op_thread_suicide_timeout),
    &osd_op_tp),
  last_pg_create_epoch(0),
  boot_finisher(cct),
  up_thru_wanted(0),
  requested_full_first(0),
  requested_full_last(0),
  service(this, poolctx)
{

  // point the Kerberos client library at the configured keytab, if any
  if (!gss_ktfile_client.empty()) {
    // Assert we can export environment variable
    /*
    The default client keytab is used, if it is present and readable,
    to automatically obtain initial credentials for GSSAPI client
    applications. The principal name of the first entry in the client
    keytab is used by default when obtaining initial credentials.
    1. The KRB5_CLIENT_KTNAME environment variable.
    2. The default_client_keytab_name profile variable in [libdefaults].
    3. The hardcoded default, DEFCKTNAME.
    */
    const int32_t set_result(setenv("KRB5_CLIENT_KTNAME",
                                    gss_ktfile_client.c_str(), 1));
    ceph_assert(set_result == 0);
  }

  monc->set_messenger(client_messenger);
  // configure op tracking thresholds/history from conf
  op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time,
                                         cct->_conf->osd_op_log_threshold);
  op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
                                           cct->_conf->osd_op_history_duration);
  op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size,
                                                    cct->_conf->osd_op_history_slow_op_threshold);
  ObjectCleanRegions::set_max_num_intervals(cct->_conf->osd_object_clean_region_max_num_intervals);
#ifdef WITH_BLKIN
  std::stringstream ss;
  ss << "osd." << whoami;
  trace_endpoint.copy_name(ss.str());
#endif

  // initialize shards
  num_shards = get_num_op_shards();
  for (uint32_t i = 0; i < num_shards; i++) {
    OSDShard *one_shard = new OSDShard(
      i,
      cct,
      this);
    shards.push_back(one_shard);
  }

  // override some config options if mclock is enabled on all the shards
  maybe_override_options_for_qos();
}
2330
2331OSD::~OSD()
2332{
11fdf7f2
TL
2333 while (!shards.empty()) {
2334 delete shards.back();
2335 shards.pop_back();
2336 }
7c673cae
FG
2337 cct->get_perfcounters_collection()->remove(recoverystate_perf);
2338 cct->get_perfcounters_collection()->remove(logger);
2339 delete recoverystate_perf;
2340 delete logger;
2341 delete store;
2342}
2343
91327a77
AA
2344double OSD::get_tick_interval() const
2345{
2346 // vary +/- 5% to avoid scrub scheduling livelocks
2347 constexpr auto delta = 0.05;
91327a77 2348 return (OSD_TICK_INTERVAL *
11fdf7f2 2349 ceph::util::generate_random_number(1.0 - delta, 1.0 + delta));
91327a77
AA
2350}
2351
7c673cae
FG
// Signal handler entry point: only SIGINT and SIGTERM are routed here,
// and both trigger an orderly shutdown.
void OSD::handle_signal(int signum)
{
  ceph_assert(signum == SIGINT || signum == SIGTERM);
  derr << "*** Got signal " << sig_str(signum) << " ***" << dendl;
  shutdown();
}
2358
// Early initialization, before init(): verify the objectstore device is
// not already mounted by another process and register this OSD as a
// config observer.  Returns 0 on success, -EBUSY if the store is in use.
int OSD::pre_init()
{
  std::lock_guard lock(osd_lock);
  if (is_stopping())
    return 0;

  if (store->test_mount_in_use()) {
    derr << "OSD::pre_init: object store '" << dev_path << "' is "
         << "currently in use. (Is ceph-osd already running?)" << dendl;
    return -EBUSY;
  }

  cct->_conf.add_observer(this);
  return 0;
}
2374
// Determine the NUMA node shared by the objectstore and both network
// interfaces and, when they agree (and osd_numa_auto_affinity is set) or
// when osd_numa_node is explicitly configured, pin all OSD threads to
// that node's CPUs.  Always returns 0; failures merely leave affinity
// unset (numa_node == -1).
int OSD::set_numa_affinity()
{
  // storage numa node
  int store_node = -1;
  store->get_numa_node(&store_node, nullptr, nullptr);
  if (store_node >= 0) {
    dout(1) << __func__ << " storage numa node " << store_node << dendl;
  }

  // check network numa node(s)
  int front_node = -1, back_node = -1;
  string front_iface = pick_iface(
    cct,
    client_messenger->get_myaddrs().front().get_sockaddr_storage());
  string back_iface = pick_iface(
    cct,
    cluster_messenger->get_myaddrs().front().get_sockaddr_storage());
  int r = get_iface_numa_node(front_iface, &front_node);
  if (r >= 0 && front_node >= 0) {
    dout(1) << __func__ << " public network " << front_iface << " numa node "
	    << front_node << dendl;
    r = get_iface_numa_node(back_iface, &back_node);
    if (r >= 0 && back_node >= 0) {
      dout(1) << __func__ << " cluster network " << back_iface << " numa node "
	      << back_node << dendl;
      if (front_node == back_node &&
	  front_node == store_node) {
	// all three agree; opt in automatically if configured to do so
	dout(1) << " objectstore and network numa nodes all match" << dendl;
	if (g_conf().get_val<bool>("osd_numa_auto_affinity")) {
	  numa_node = front_node;
	}
      } else if (front_node != back_node) {
	dout(1) << __func__ << " public and cluster network numa nodes do not match"
		<< dendl;
      } else {
	dout(1) << __func__ << " objectstore and network numa nodes do not match"
		<< dendl;
      }
    } else if (back_node == -2) {
      // -2: the interface's ports span multiple numa nodes (see message)
      dout(1) << __func__ << " cluster network " << back_iface
	      << " ports numa nodes do not match" << dendl;
    } else {
      derr << __func__ << " unable to identify cluster interface '" << back_iface
	   << "' numa node: " << cpp_strerror(r) << dendl;
    }
  } else if (front_node == -2) {
    dout(1) << __func__ << " public network " << front_iface
	    << " ports numa nodes do not match" << dendl;
  } else {
    derr << __func__ << " unable to identify public interface '" << front_iface
	 << "' numa node: " << cpp_strerror(r) << dendl;
  }
  if (int node = g_conf().get_val<int64_t>("osd_numa_node"); node >= 0) {
    // this takes precedence over the automagic logic above
    numa_node = node;
  }
  if (numa_node >= 0) {
    int r = get_numa_node_cpu_set(numa_node, &numa_cpu_set_size, &numa_cpu_set);
    if (r < 0) {
      dout(1) << __func__ << " unable to determine numa node " << numa_node
	      << " CPUs" << dendl;
      numa_node = -1;
    } else {
      dout(1) << __func__ << " setting numa affinity to node " << numa_node
	      << " cpus "
	      << cpu_set_to_str_list(numa_cpu_set_size, &numa_cpu_set)
	      << dendl;
      r = set_cpu_affinity_all_threads(numa_cpu_set_size, &numa_cpu_set);
      if (r < 0) {
	r = -errno;
	derr << __func__ << " failed to set numa affinity: " << cpp_strerror(r)
	     << dendl;
	numa_node = -1;
      }
    }
  } else {
    dout(1) << __func__ << " not setting numa affinity" << dendl;
  }
  return 0;
}
2455
2456// asok
2457
// Admin-socket ("asok") hook that forwards commands to OSD::asok_command.
class OSDSocketHook : public AdminSocketHook {
  OSD *osd;  // non-owning back-pointer to the OSD we dispatch into
public:
  explicit OSDSocketHook(OSD *o) : osd(o) {}
  // synchronous entry point is never used; all commands must arrive via
  // call_async() below
  int call(std::string_view prefix, const cmdmap_t& cmdmap,
	   Formatter *f,
	   std::ostream& ss,
	   bufferlist& out) override {
    ceph_abort("should use async hook");
  }
  void call_async(
    std::string_view prefix,
    const cmdmap_t& cmdmap,
    Formatter *f,
    const bufferlist& inbl,
    std::function<void(int,const std::string&,bufferlist&)> on_finish) override {
    try {
      osd->asok_command(prefix, cmdmap, f, inbl, on_finish);
    } catch (const TOPNSPC::common::bad_cmd_get& e) {
      // malformed command arguments: report EINVAL rather than crash
      bufferlist empty;
      on_finish(-EINVAL, e.what(), empty);
    }
  }
};
2482
11fdf7f2
TL
2483std::set<int64_t> OSD::get_mapped_pools()
2484{
2485 std::set<int64_t> pools;
2486 std::vector<spg_t> pgids;
2487 _get_pgids(&pgids);
2488 for (const auto &pgid : pgids) {
2489 pools.insert(pgid.pool());
2490 }
2491 return pools;
2492}
2493
9f95a23c
TL
2494void OSD::asok_command(
2495 std::string_view prefix, const cmdmap_t& cmdmap,
2496 Formatter *f,
2497 const bufferlist& inbl,
2498 std::function<void(int,const std::string&,bufferlist&)> on_finish)
7c673cae 2499{
9f95a23c
TL
2500 int ret = 0;
2501 stringstream ss; // stderr error message stream
2502 bufferlist outbl; // if empty at end, we'll dump formatter as output
2503
2504 // --- PG commands are routed here to PG::do_command ---
2505 if (prefix == "pg" ||
2506 prefix == "query" ||
2507 prefix == "mark_unfound_lost" ||
2508 prefix == "list_unfound" ||
2509 prefix == "scrub" ||
2510 prefix == "deep_scrub"
2511 ) {
2512 string pgidstr;
2513 pg_t pgid;
2514 if (!cmd_getval(cmdmap, "pgid", pgidstr)) {
2515 ss << "no pgid specified";
2516 ret = -EINVAL;
2517 goto out;
2518 }
2519 if (!pgid.parse(pgidstr.c_str())) {
2520 ss << "couldn't parse pgid '" << pgidstr << "'";
2521 ret = -EINVAL;
2522 goto out;
2523 }
2524 spg_t pcand;
2525 PGRef pg;
2526 if (get_osdmap()->get_primary_shard(pgid, &pcand) &&
2527 (pg = _lookup_lock_pg(pcand))) {
2528 if (pg->is_primary()) {
2529 cmdmap_t new_cmdmap = cmdmap;
2530 try {
2531 pg->do_command(prefix, new_cmdmap, inbl, on_finish);
2532 pg->unlock();
2533 return; // the pg handler calls on_finish directly
2534 } catch (const TOPNSPC::common::bad_cmd_get& e) {
2535 pg->unlock();
2536 ss << e.what();
2537 ret = -EINVAL;
2538 goto out;
2539 }
2540 } else {
2541 ss << "not primary for pgid " << pgid;
2542 // do not reply; they will get newer maps and realize they
2543 // need to resend.
2544 pg->unlock();
2545 ret = -EAGAIN;
2546 goto out;
2547 }
2548 } else {
2549 ss << "i don't have pgid " << pgid;
2550 ret = -ENOENT;
2551 }
2552 }
2553
2554 // --- OSD commands follow ---
2555
2556 else if (prefix == "status") {
2557 lock_guard l(osd_lock);
7c673cae
FG
2558 f->open_object_section("status");
2559 f->dump_stream("cluster_fsid") << superblock.cluster_fsid;
2560 f->dump_stream("osd_fsid") << superblock.osd_fsid;
2561 f->dump_unsigned("whoami", superblock.whoami);
2562 f->dump_string("state", get_state_name(get_state()));
2563 f->dump_unsigned("oldest_map", superblock.oldest_map);
2564 f->dump_unsigned("newest_map", superblock.newest_map);
11fdf7f2 2565 f->dump_unsigned("num_pgs", num_pgs);
7c673cae 2566 f->close_section();
9f95a23c 2567 } else if (prefix == "flush_journal") {
7c673cae 2568 store->flush_journal();
9f95a23c
TL
2569 } else if (prefix == "dump_ops_in_flight" ||
2570 prefix == "ops" ||
2571 prefix == "dump_blocked_ops" ||
2572 prefix == "dump_historic_ops" ||
2573 prefix == "dump_historic_ops_by_duration" ||
2574 prefix == "dump_historic_slow_ops") {
c07f9fc5
FG
2575
2576 const string error_str = "op_tracker tracking is not enabled now, so no ops are tracked currently, \
2577even those get stuck. Please enable \"osd_enable_op_tracker\", and the tracker \
2578will start to track new ops received afterwards.";
2579
2580 set<string> filters;
2581 vector<string> filter_str;
9f95a23c 2582 if (cmd_getval(cmdmap, "filterstr", filter_str)) {
c07f9fc5
FG
2583 copy(filter_str.begin(), filter_str.end(),
2584 inserter(filters, filters.end()));
2585 }
2586
9f95a23c
TL
2587 if (prefix == "dump_ops_in_flight" ||
2588 prefix == "ops") {
c07f9fc5
FG
2589 if (!op_tracker.dump_ops_in_flight(f, false, filters)) {
2590 ss << error_str;
9f95a23c
TL
2591 ret = -EINVAL;
2592 goto out;
c07f9fc5
FG
2593 }
2594 }
9f95a23c 2595 if (prefix == "dump_blocked_ops") {
c07f9fc5
FG
2596 if (!op_tracker.dump_ops_in_flight(f, true, filters)) {
2597 ss << error_str;
9f95a23c
TL
2598 ret = -EINVAL;
2599 goto out;
c07f9fc5
FG
2600 }
2601 }
9f95a23c 2602 if (prefix == "dump_historic_ops") {
c07f9fc5
FG
2603 if (!op_tracker.dump_historic_ops(f, false, filters)) {
2604 ss << error_str;
9f95a23c
TL
2605 ret = -EINVAL;
2606 goto out;
c07f9fc5
FG
2607 }
2608 }
9f95a23c 2609 if (prefix == "dump_historic_ops_by_duration") {
c07f9fc5
FG
2610 if (!op_tracker.dump_historic_ops(f, true, filters)) {
2611 ss << error_str;
9f95a23c
TL
2612 ret = -EINVAL;
2613 goto out;
c07f9fc5
FG
2614 }
2615 }
9f95a23c 2616 if (prefix == "dump_historic_slow_ops") {
c07f9fc5
FG
2617 if (!op_tracker.dump_historic_slow_ops(f, filters)) {
2618 ss << error_str;
9f95a23c
TL
2619 ret = -EINVAL;
2620 goto out;
c07f9fc5 2621 }
7c673cae 2622 }
9f95a23c 2623 } else if (prefix == "dump_op_pq_state") {
7c673cae
FG
2624 f->open_object_section("pq");
2625 op_shardedwq.dump(f);
2626 f->close_section();
f67539c2 2627 } else if (prefix == "dump_blocklist") {
7c673cae
FG
2628 list<pair<entity_addr_t,utime_t> > bl;
2629 OSDMapRef curmap = service.get_osdmap();
2630
f67539c2
TL
2631 f->open_array_section("blocklist");
2632 curmap->get_blocklist(&bl);
7c673cae
FG
2633 for (list<pair<entity_addr_t,utime_t> >::iterator it = bl.begin();
2634 it != bl.end(); ++it) {
224ce89b 2635 f->open_object_section("entry");
7c673cae
FG
2636 f->open_object_section("entity_addr_t");
2637 it->first.dump(f);
2638 f->close_section(); //entity_addr_t
2639 it->second.localtime(f->dump_stream("expire_time"));
2640 f->close_section(); //entry
2641 }
f67539c2 2642 f->close_section(); //blocklist
9f95a23c 2643 } else if (prefix == "dump_watchers") {
7c673cae
FG
2644 list<obj_watch_item_t> watchers;
2645 // scan pg's
11fdf7f2
TL
2646 vector<PGRef> pgs;
2647 _get_pgs(&pgs);
2648 for (auto& pg : pgs) {
2649 list<obj_watch_item_t> pg_watchers;
2650 pg->get_watchers(&pg_watchers);
2651 watchers.splice(watchers.end(), pg_watchers);
7c673cae
FG
2652 }
2653
2654 f->open_array_section("watchers");
2655 for (list<obj_watch_item_t>::iterator it = watchers.begin();
2656 it != watchers.end(); ++it) {
2657
224ce89b 2658 f->open_object_section("watch");
7c673cae
FG
2659
2660 f->dump_string("namespace", it->obj.nspace);
2661 f->dump_string("object", it->obj.oid.name);
2662
2663 f->open_object_section("entity_name");
2664 it->wi.name.dump(f);
2665 f->close_section(); //entity_name_t
2666
224ce89b
WB
2667 f->dump_unsigned("cookie", it->wi.cookie);
2668 f->dump_unsigned("timeout", it->wi.timeout_seconds);
7c673cae
FG
2669
2670 f->open_object_section("entity_addr_t");
2671 it->wi.addr.dump(f);
2672 f->close_section(); //entity_addr_t
2673
2674 f->close_section(); //watch
2675 }
2676
2677 f->close_section(); //watchers
9f95a23c 2678 } else if (prefix == "dump_recovery_reservations") {
7c673cae
FG
2679 f->open_object_section("reservations");
2680 f->open_object_section("local_reservations");
2681 service.local_reserver.dump(f);
2682 f->close_section();
2683 f->open_object_section("remote_reservations");
2684 service.remote_reserver.dump(f);
2685 f->close_section();
2686 f->close_section();
9f95a23c 2687 } else if (prefix == "dump_scrub_reservations") {
eafe8130
TL
2688 f->open_object_section("scrub_reservations");
2689 service.dump_scrub_reservations(f);
2690 f->close_section();
9f95a23c 2691 } else if (prefix == "get_latest_osdmap") {
7c673cae 2692 get_latest_osdmap();
9f95a23c 2693 } else if (prefix == "set_heap_property") {
7c673cae
FG
2694 string property;
2695 int64_t value = 0;
2696 string error;
2697 bool success = false;
9f95a23c 2698 if (!cmd_getval(cmdmap, "property", property)) {
7c673cae
FG
2699 error = "unable to get property";
2700 success = false;
9f95a23c 2701 } else if (!cmd_getval(cmdmap, "value", value)) {
7c673cae
FG
2702 error = "unable to get value";
2703 success = false;
2704 } else if (value < 0) {
2705 error = "negative value not allowed";
2706 success = false;
2707 } else if (!ceph_heap_set_numeric_property(property.c_str(), (size_t)value)) {
2708 error = "invalid property";
2709 success = false;
2710 } else {
2711 success = true;
2712 }
2713 f->open_object_section("result");
2714 f->dump_string("error", error);
2715 f->dump_bool("success", success);
2716 f->close_section();
9f95a23c 2717 } else if (prefix == "get_heap_property") {
7c673cae
FG
2718 string property;
2719 size_t value = 0;
2720 string error;
2721 bool success = false;
9f95a23c 2722 if (!cmd_getval(cmdmap, "property", property)) {
7c673cae
FG
2723 error = "unable to get property";
2724 success = false;
2725 } else if (!ceph_heap_get_numeric_property(property.c_str(), &value)) {
2726 error = "invalid property";
2727 success = false;
2728 } else {
2729 success = true;
2730 }
2731 f->open_object_section("result");
2732 f->dump_string("error", error);
2733 f->dump_bool("success", success);
2734 f->dump_int("value", value);
2735 f->close_section();
9f95a23c 2736 } else if (prefix == "dump_objectstore_kv_stats") {
7c673cae 2737 store->get_db_statistics(f);
9f95a23c 2738 } else if (prefix == "dump_scrubs") {
7c673cae 2739 service.dumps_scrub(f);
9f95a23c 2740 } else if (prefix == "calc_objectstore_db_histogram") {
7c673cae 2741 store->generate_db_histogram(f);
9f95a23c 2742 } else if (prefix == "flush_store_cache") {
11fdf7f2 2743 store->flush_cache(&ss);
9f95a23c 2744 } else if (prefix == "dump_pgstate_history") {
7c673cae 2745 f->open_object_section("pgstate_history");
9f95a23c 2746 f->open_array_section("pgs");
11fdf7f2
TL
2747 vector<PGRef> pgs;
2748 _get_pgs(&pgs);
2749 for (auto& pg : pgs) {
9f95a23c 2750 f->open_object_section("pg");
11fdf7f2 2751 f->dump_stream("pg") << pg->pg_id;
9f95a23c 2752 f->dump_string("currently", pg->get_current_state());
11fdf7f2 2753 pg->dump_pgstate_history(f);
9f95a23c 2754 f->close_section();
7c673cae
FG
2755 }
2756 f->close_section();
9f95a23c
TL
2757 f->close_section();
2758 } else if (prefix == "compact") {
224ce89b
WB
2759 dout(1) << "triggering manual compaction" << dendl;
2760 auto start = ceph::coarse_mono_clock::now();
2761 store->compact();
2762 auto end = ceph::coarse_mono_clock::now();
11fdf7f2 2763 double duration = std::chrono::duration<double>(end-start).count();
f67539c2 2764 dout(1) << "finished manual compaction in "
11fdf7f2 2765 << duration
224ce89b
WB
2766 << " seconds" << dendl;
2767 f->open_object_section("compact_result");
11fdf7f2
TL
2768 f->dump_float("elapsed_time", duration);
2769 f->close_section();
9f95a23c 2770 } else if (prefix == "get_mapped_pools") {
11fdf7f2
TL
2771 f->open_array_section("mapped_pools");
2772 set<int64_t> poollist = get_mapped_pools();
2773 for (auto pool : poollist) {
2774 f->dump_int("pool_id", pool);
2775 }
2776 f->close_section();
9f95a23c 2777 } else if (prefix == "smart") {
11fdf7f2 2778 string devid;
9f95a23c
TL
2779 cmd_getval(cmdmap, "devid", devid);
2780 ostringstream out;
2781 probe_smart(devid, out);
2782 outbl.append(out.str());
2783 } else if (prefix == "list_devices") {
11fdf7f2
TL
2784 set<string> devnames;
2785 store->get_devices(&devnames);
9f95a23c 2786 f->open_array_section("list_devices");
11fdf7f2
TL
2787 for (auto dev : devnames) {
2788 if (dev.find("dm-") == 0) {
2789 continue;
2790 }
9f95a23c
TL
2791 string err;
2792 f->open_object_section("device");
11fdf7f2 2793 f->dump_string("device", "/dev/" + dev);
9f95a23c
TL
2794 f->dump_string("device_id", get_device_id(dev, &err));
2795 f->close_section();
11fdf7f2 2796 }
224ce89b 2797 f->close_section();
9f95a23c
TL
2798 } else if (prefix == "send_beacon") {
2799 lock_guard l(osd_lock);
11fdf7f2
TL
2800 if (is_active()) {
2801 send_beacon(ceph::coarse_mono_clock::now());
2802 }
9f95a23c
TL
2803 }
2804
2805 else if (prefix == "cluster_log") {
2806 vector<string> msg;
2807 cmd_getval(cmdmap, "message", msg);
2808 if (msg.empty()) {
2809 ret = -EINVAL;
2810 ss << "ignoring empty log message";
2811 goto out;
2812 }
2813 string message = msg.front();
2814 for (vector<string>::iterator a = ++msg.begin(); a != msg.end(); ++a)
2815 message += " " + *a;
2816 string lvl;
2817 cmd_getval(cmdmap, "level", lvl);
2818 clog_type level = string_to_clog_type(lvl);
2819 if (level < 0) {
2820 ret = -EINVAL;
2821 ss << "unknown level '" << lvl << "'";
2822 goto out;
2823 }
2824 clog->do_log(level, message);
2825 }
2826
2827 else if (prefix == "bench") {
9f95a23c
TL
2828 int64_t count;
2829 int64_t bsize;
2830 int64_t osize, onum;
2831 // default count 1G, size 4MB
2832 cmd_getval(cmdmap, "count", count, (int64_t)1 << 30);
2833 cmd_getval(cmdmap, "size", bsize, (int64_t)4 << 20);
2834 cmd_getval(cmdmap, "object_size", osize, (int64_t)0);
2835 cmd_getval(cmdmap, "object_num", onum, (int64_t)0);
2836
2837 uint32_t duration = cct->_conf->osd_bench_duration;
2838
2839 if (bsize > (int64_t) cct->_conf->osd_bench_max_block_size) {
2840 // let us limit the block size because the next checks rely on it
2841 // having a sane value. If we allow any block size to be set things
2842 // can still go sideways.
2843 ss << "block 'size' values are capped at "
2844 << byte_u_t(cct->_conf->osd_bench_max_block_size) << ". If you wish to use"
2845 << " a higher value, please adjust 'osd_bench_max_block_size'";
2846 ret = -EINVAL;
2847 goto out;
2848 } else if (bsize < (int64_t) (1 << 20)) {
2849 // entering the realm of small block sizes.
2850 // limit the count to a sane value, assuming a configurable amount of
2851 // IOPS and duration, so that the OSD doesn't get hung up on this,
2852 // preventing timeouts from going off
2853 int64_t max_count =
2854 bsize * duration * cct->_conf->osd_bench_small_size_max_iops;
2855 if (count > max_count) {
2856 ss << "'count' values greater than " << max_count
2857 << " for a block size of " << byte_u_t(bsize) << ", assuming "
2858 << cct->_conf->osd_bench_small_size_max_iops << " IOPS,"
2859 << " for " << duration << " seconds,"
2860 << " can cause ill effects on osd. "
2861 << " Please adjust 'osd_bench_small_size_max_iops' with a higher"
2862 << " value if you wish to use a higher 'count'.";
2863 ret = -EINVAL;
2864 goto out;
eafe8130
TL
2865 }
2866 } else {
9f95a23c
TL
2867 // 1MB block sizes are big enough so that we get more stuff done.
2868 // However, to avoid the osd from getting hung on this and having
2869 // timers being triggered, we are going to limit the count assuming
2870 // a configurable throughput and duration.
2871 // NOTE: max_count is the total amount of bytes that we believe we
2872 // will be able to write during 'duration' for the given
2873 // throughput. The block size hardly impacts this unless it's
2874 // way too big. Given we already check how big the block size
2875 // is, it's safe to assume everything will check out.
2876 int64_t max_count =
2877 cct->_conf->osd_bench_large_size_max_throughput * duration;
2878 if (count > max_count) {
2879 ss << "'count' values greater than " << max_count
2880 << " for a block size of " << byte_u_t(bsize) << ", assuming "
2881 << byte_u_t(cct->_conf->osd_bench_large_size_max_throughput) << "/s,"
2882 << " for " << duration << " seconds,"
2883 << " can cause ill effects on osd. "
2884 << " Please adjust 'osd_bench_large_size_max_throughput'"
2885 << " with a higher value if you wish to use a higher 'count'.";
2886 ret = -EINVAL;
2887 goto out;
2888 }
eafe8130 2889 }
eafe8130 2890
9f95a23c
TL
2891 if (osize && bsize > osize)
2892 bsize = osize;
eafe8130 2893
9f95a23c
TL
2894 dout(1) << " bench count " << count
2895 << " bsize " << byte_u_t(bsize) << dendl;
eafe8130 2896
9f95a23c
TL
2897 ObjectStore::Transaction cleanupt;
2898
2899 if (osize && onum) {
2900 bufferlist bl;
2901 bufferptr bp(osize);
2902 bp.zero();
2903 bl.push_back(std::move(bp));
2904 bl.rebuild_page_aligned();
2905 for (int i=0; i<onum; ++i) {
2906 char nm[30];
2907 snprintf(nm, sizeof(nm), "disk_bw_test_%d", i);
2908 object_t oid(nm);
2909 hobject_t soid(sobject_t(oid, 0));
2910 ObjectStore::Transaction t;
2911 t.write(coll_t(), ghobject_t(soid), 0, osize, bl);
2912 store->queue_transaction(service.meta_ch, std::move(t), NULL);
2913 cleanupt.remove(coll_t(), ghobject_t(soid));
2914 }
2915 }
2916
2917 bufferlist bl;
2918 bufferptr bp(bsize);
2919 bp.zero();
2920 bl.push_back(std::move(bp));
2921 bl.rebuild_page_aligned();
2922
2923 {
2924 C_SaferCond waiter;
2925 if (!service.meta_ch->flush_commit(&waiter)) {
2926 waiter.wait();
2927 }
2928 }
2929
2930 utime_t start = ceph_clock_now();
2931 for (int64_t pos = 0; pos < count; pos += bsize) {
2932 char nm[30];
2933 unsigned offset = 0;
2934 if (onum && osize) {
2935 snprintf(nm, sizeof(nm), "disk_bw_test_%d", (int)(rand() % onum));
2936 offset = rand() % (osize / bsize) * bsize;
2937 } else {
2938 snprintf(nm, sizeof(nm), "disk_bw_test_%lld", (long long)pos);
2939 }
2940 object_t oid(nm);
2941 hobject_t soid(sobject_t(oid, 0));
2942 ObjectStore::Transaction t;
2943 t.write(coll_t::meta(), ghobject_t(soid), offset, bsize, bl);
2944 store->queue_transaction(service.meta_ch, std::move(t), NULL);
2945 if (!onum || !osize)
2946 cleanupt.remove(coll_t::meta(), ghobject_t(soid));
2947 }
2948
2949 {
2950 C_SaferCond waiter;
2951 if (!service.meta_ch->flush_commit(&waiter)) {
2952 waiter.wait();
2953 }
2954 }
2955 utime_t end = ceph_clock_now();
2956
2957 // clean up
2958 store->queue_transaction(service.meta_ch, std::move(cleanupt), NULL);
2959 {
2960 C_SaferCond waiter;
2961 if (!service.meta_ch->flush_commit(&waiter)) {
2962 waiter.wait();
2963 }
2964 }
2965
2966 double elapsed = end - start;
2967 double rate = count / elapsed;
2968 double iops = rate / bsize;
2969 f->open_object_section("osd_bench_results");
2970 f->dump_int("bytes_written", count);
2971 f->dump_int("blocksize", bsize);
2972 f->dump_float("elapsed_sec", elapsed);
2973 f->dump_float("bytes_per_sec", rate);
2974 f->dump_float("iops", iops);
2975 f->close_section();
2976 }
2977
2978 else if (prefix == "flush_pg_stats") {
2979 mgrc.send_pgstats();
2980 f->dump_unsigned("stat_seq", service.get_osd_stat_seq());
2981 }
2982
2983 else if (prefix == "heap") {
2984 ret = ceph::osd_cmds::heap(*cct, cmdmap, *f, ss);
2985 }
2986
2987 else if (prefix == "debug dump_missing") {
2988 f->open_array_section("pgs");
2989 vector<PGRef> pgs;
2990 _get_pgs(&pgs);
2991 for (auto& pg : pgs) {
2992 string s = stringify(pg->pg_id);
2993 f->open_array_section(s.c_str());
2994 pg->lock();
2995 pg->dump_missing(f);
2996 pg->unlock();
2997 f->close_section();
2998 }
2999 f->close_section();
3000 }
3001
3002 else if (prefix == "debug kick_recovery_wq") {
3003 int64_t delay;
3004 cmd_getval(cmdmap, "delay", delay);
3005 ostringstream oss;
3006 oss << delay;
3007 ret = cct->_conf.set_val("osd_recovery_delay_start", oss.str().c_str());
3008 if (ret != 0) {
3009 ss << "kick_recovery_wq: error setting "
3010 << "osd_recovery_delay_start to '" << delay << "': error "
3011 << ret;
3012 goto out;
3013 }
3014 cct->_conf.apply_changes(nullptr);
3015 ss << "kicking recovery queue. set osd_recovery_delay_start "
3016 << "to " << cct->_conf->osd_recovery_delay_start;
3017 }
3018
3019 else if (prefix == "cpu_profiler") {
3020 ostringstream ds;
3021 string arg;
3022 cmd_getval(cmdmap, "arg", arg);
3023 vector<string> argvec;
3024 get_str_vec(arg, argvec);
3025 cpu_profiler_handle_command(argvec, ds);
3026 outbl.append(ds.str());
3027 }
3028
3029 else if (prefix == "dump_pg_recovery_stats") {
3030 lock_guard l(osd_lock);
3031 pg_recovery_stats.dump_formatted(f);
3032 }
3033
3034 else if (prefix == "reset_pg_recovery_stats") {
3035 lock_guard l(osd_lock);
3036 pg_recovery_stats.reset();
3037 }
3038
3039 else if (prefix == "perf histogram dump") {
3040 std::string logger;
3041 std::string counter;
3042 cmd_getval(cmdmap, "logger", logger);
3043 cmd_getval(cmdmap, "counter", counter);
3044 cct->get_perfcounters_collection()->dump_formatted_histograms(
3045 f, false, logger, counter);
3046 }
3047
3048 else if (prefix == "cache drop") {
3049 lock_guard l(osd_lock);
3050 dout(20) << "clearing all caches" << dendl;
3051 // Clear the objectstore's cache - onode and buffer for Bluestore,
3052 // system's pagecache for Filestore
3053 ret = store->flush_cache(&ss);
3054 if (ret < 0) {
3055 ss << "Error flushing objectstore cache: " << cpp_strerror(ret);
3056 goto out;
3057 }
3058 // Clear the objectcontext cache (per PG)
3059 vector<PGRef> pgs;
3060 _get_pgs(&pgs);
3061 for (auto& pg: pgs) {
3062 pg->clear_cache();
3063 }
3064 }
3065
3066 else if (prefix == "cache status") {
3067 lock_guard l(osd_lock);
3068 int obj_ctx_count = 0;
3069 vector<PGRef> pgs;
3070 _get_pgs(&pgs);
3071 for (auto& pg: pgs) {
3072 obj_ctx_count += pg->get_cache_obj_count();
3073 }
3074 f->open_object_section("cache_status");
3075 f->dump_int("object_ctx", obj_ctx_count);
3076 store->dump_cache_stats(f);
3077 f->close_section();
3078 }
3079
3080 else if (prefix == "scrub_purged_snaps") {
3081 lock_guard l(osd_lock);
3082 scrub_purged_snaps();
3083 }
3084
3085 else if (prefix == "dump_osd_network") {
3086 lock_guard l(osd_lock);
3087 int64_t value = 0;
3088 if (!(cmd_getval(cmdmap, "value", value))) {
3089 // Convert milliseconds to microseconds
3090 value = static_cast<double>(g_conf().get_val<double>(
3091 "mon_warn_on_slow_ping_time")) * 1000;
3092 if (value == 0) {
3093 double ratio = g_conf().get_val<double>("mon_warn_on_slow_ping_ratio");
3094 value = g_conf().get_val<int64_t>("osd_heartbeat_grace");
3095 value *= 1000000 * ratio; // Seconds of grace to microseconds at ratio
3096 }
3097 } else {
3098 // Convert user input to microseconds
3099 value *= 1000;
3100 }
3101 if (value < 0) value = 0;
3102
3103 struct osd_ping_time_t {
3104 uint32_t pingtime;
3105 int to;
3106 bool back;
3107 std::array<uint32_t,3> times;
3108 std::array<uint32_t,3> min;
3109 std::array<uint32_t,3> max;
3110 uint32_t last;
3111 uint32_t last_update;
3112
3113 bool operator<(const osd_ping_time_t& rhs) const {
3114 if (pingtime < rhs.pingtime)
3115 return true;
3116 if (pingtime > rhs.pingtime)
3117 return false;
3118 if (to < rhs.to)
3119 return true;
3120 if (to > rhs.to)
3121 return false;
3122 return back;
3123 }
3124 };
3125
3126 set<osd_ping_time_t> sorted;
3127 // Get pingtimes under lock and not on the stack
eafe8130
TL
3128 map<int, osd_stat_t::Interfaces> *pingtimes = new map<int, osd_stat_t::Interfaces>;
3129 service.get_hb_pingtime(pingtimes);
3130 for (auto j : *pingtimes) {
3131 if (j.second.last_update == 0)
3132 continue;
3133 osd_ping_time_t item;
3134 item.pingtime = std::max(j.second.back_pingtime[0], j.second.back_pingtime[1]);
3135 item.pingtime = std::max(item.pingtime, j.second.back_pingtime[2]);
3136 if (item.pingtime >= value) {
3137 item.to = j.first;
3138 item.times[0] = j.second.back_pingtime[0];
3139 item.times[1] = j.second.back_pingtime[1];
3140 item.times[2] = j.second.back_pingtime[2];
3141 item.min[0] = j.second.back_min[0];
3142 item.min[1] = j.second.back_min[1];
3143 item.min[2] = j.second.back_min[2];
3144 item.max[0] = j.second.back_max[0];
3145 item.max[1] = j.second.back_max[1];
3146 item.max[2] = j.second.back_max[2];
3147 item.last = j.second.back_last;
3148 item.back = true;
3149 item.last_update = j.second.last_update;
3150 sorted.emplace(item);
3151 }
3152 if (j.second.front_last == 0)
3153 continue;
3154 item.pingtime = std::max(j.second.front_pingtime[0], j.second.front_pingtime[1]);
3155 item.pingtime = std::max(item.pingtime, j.second.front_pingtime[2]);
3156 if (item.pingtime >= value) {
3157 item.to = j.first;
3158 item.times[0] = j.second.front_pingtime[0];
3159 item.times[1] = j.second.front_pingtime[1];
3160 item.times[2] = j.second.front_pingtime[2];
3161 item.min[0] = j.second.front_min[0];
3162 item.min[1] = j.second.front_min[1];
3163 item.min[2] = j.second.front_min[2];
3164 item.max[0] = j.second.front_max[0];
3165 item.max[1] = j.second.front_max[1];
3166 item.max[2] = j.second.front_max[2];
3167 item.last = j.second.front_last;
3168 item.last_update = j.second.last_update;
3169 item.back = false;
3170 sorted.emplace(item);
3171 }
3172 }
3173 delete pingtimes;
3174 //
3175 // Network ping times (1min 5min 15min)
3176 f->open_object_section("network_ping_times");
3177 f->dump_int("threshold", value / 1000);
3178 f->open_array_section("entries");
3179 for (auto &sitem : boost::adaptors::reverse(sorted)) {
3180 ceph_assert(sitem.pingtime >= value);
3181 f->open_object_section("entry");
3182
3183 const time_t lu(sitem.last_update);
3184 char buffer[26];
3185 string lustr(ctime_r(&lu, buffer));
3186 lustr.pop_back(); // Remove trailing \n
3187 auto stale = cct->_conf.get_val<int64_t>("osd_heartbeat_stale");
3188 f->dump_string("last update", lustr);
3189 f->dump_bool("stale", ceph_clock_now().sec() - sitem.last_update > stale);
3190 f->dump_int("from osd", whoami);
3191 f->dump_int("to osd", sitem.to);
3192 f->dump_string("interface", (sitem.back ? "back" : "front"));
3193 f->open_object_section("average");
3194 f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.times[0],3).c_str());
3195 f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.times[1],3).c_str());
3196 f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.times[2],3).c_str());
3197 f->close_section(); // average
3198 f->open_object_section("min");
3199 f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.max[0],3).c_str());
3200 f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.max[1],3).c_str());
3201 f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.max[2],3).c_str());
3202 f->close_section(); // min
3203 f->open_object_section("max");
3204 f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.max[0],3).c_str());
3205 f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.max[1],3).c_str());
3206 f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.max[2],3).c_str());
3207 f->close_section(); // max
3208 f->dump_format_unquoted("last", "%s", fixed_u_to_string(sitem.last,3).c_str());
3209 f->close_section(); // entry
3210 }
3211 f->close_section(); // entries
3212 f->close_section(); // network_ping_times
7c673cae 3213 } else {
11fdf7f2 3214 ceph_abort_msg("broken asok registration");
7c673cae 3215 }
9f95a23c
TL
3216
3217 out:
3218 on_finish(ret, ss.str(), outbl);
7c673cae
FG
3219}
3220
3221class TestOpsSocketHook : public AdminSocketHook {
3222 OSDService *service;
3223 ObjectStore *store;
3224public:
3225 TestOpsSocketHook(OSDService *s, ObjectStore *st) : service(s), store(st) {}
9f95a23c
TL
3226 int call(std::string_view command, const cmdmap_t& cmdmap,
3227 Formatter *f,
3228 std::ostream& errss,
3229 bufferlist& out) override {
3230 int r = 0;
3231 stringstream outss;
11fdf7f2 3232 try {
9f95a23c
TL
3233 test_ops(service, store, command, cmdmap, outss);
3234 out.append(outss);
3235 } catch (const TOPNSPC::common::bad_cmd_get& e) {
3236 errss << e.what();
3237 r = -EINVAL;
11fdf7f2 3238 }
9f95a23c 3239 return r;
7c673cae
FG
3240 }
3241 void test_ops(OSDService *service, ObjectStore *store,
11fdf7f2 3242 std::string_view command, const cmdmap_t& cmdmap, ostream &ss);
7c673cae
FG
3243
3244};
3245
3246class OSD::C_Tick : public Context {
3247 OSD *osd;
3248 public:
3249 explicit C_Tick(OSD *o) : osd(o) {}
3250 void finish(int r) override {
3251 osd->tick();
3252 }
3253};
3254
3255class OSD::C_Tick_WithoutOSDLock : public Context {
3256 OSD *osd;
3257 public:
3258 explicit C_Tick_WithoutOSDLock(OSD *o) : osd(o) {}
3259 void finish(int r) override {
3260 osd->tick_without_osd_lock();
3261 }
3262};
3263
3264int OSD::enable_disable_fuse(bool stop)
3265{
3266#ifdef HAVE_LIBFUSE
3267 int r;
3268 string mntpath = cct->_conf->osd_data + "/fuse";
3269 if (fuse_store && (stop || !cct->_conf->osd_objectstore_fuse)) {
3270 dout(1) << __func__ << " disabling" << dendl;
3271 fuse_store->stop();
3272 delete fuse_store;
3273 fuse_store = NULL;
3274 r = ::rmdir(mntpath.c_str());
7c673cae 3275 if (r < 0) {
c07f9fc5
FG
3276 r = -errno;
3277 derr << __func__ << " failed to rmdir " << mntpath << ": "
3278 << cpp_strerror(r) << dendl;
7c673cae
FG
3279 return r;
3280 }
3281 return 0;
3282 }
3283 if (!fuse_store && cct->_conf->osd_objectstore_fuse) {
3284 dout(1) << __func__ << " enabling" << dendl;
3285 r = ::mkdir(mntpath.c_str(), 0700);
3286 if (r < 0)
3287 r = -errno;
3288 if (r < 0 && r != -EEXIST) {
3289 derr << __func__ << " unable to create " << mntpath << ": "
3290 << cpp_strerror(r) << dendl;
3291 return r;
3292 }
3293 fuse_store = new FuseStore(store, mntpath);
3294 r = fuse_store->start();
3295 if (r < 0) {
3296 derr << __func__ << " unable to start fuse: " << cpp_strerror(r) << dendl;
3297 delete fuse_store;
3298 fuse_store = NULL;
3299 return r;
3300 }
3301 }
3302#endif // HAVE_LIBFUSE
3303 return 0;
3304}
3305
9f95a23c
TL
3306size_t OSD::get_num_cache_shards()
3307{
3308 return cct->_conf.get_val<Option::size_t>("osd_num_cache_shards");
3309}
3310
31f18b77
FG
3311int OSD::get_num_op_shards()
3312{
3313 if (cct->_conf->osd_op_num_shards)
3314 return cct->_conf->osd_op_num_shards;
3315 if (store_is_rotational)
3316 return cct->_conf->osd_op_num_shards_hdd;
3317 else
3318 return cct->_conf->osd_op_num_shards_ssd;
3319}
3320
3321int OSD::get_num_op_threads()
3322{
3323 if (cct->_conf->osd_op_num_threads_per_shard)
3324 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard;
3325 if (store_is_rotational)
3326 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard_hdd;
3327 else
3328 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard_ssd;
3329}
3330
c07f9fc5
FG
3331float OSD::get_osd_recovery_sleep()
3332{
3333 if (cct->_conf->osd_recovery_sleep)
3334 return cct->_conf->osd_recovery_sleep;
d2e6a577 3335 if (!store_is_rotational && !journal_is_rotational)
c07f9fc5 3336 return cct->_conf->osd_recovery_sleep_ssd;
d2e6a577 3337 else if (store_is_rotational && !journal_is_rotational)
11fdf7f2 3338 return cct->_conf.get_val<double>("osd_recovery_sleep_hybrid");
d2e6a577
FG
3339 else
3340 return cct->_conf->osd_recovery_sleep_hdd;
c07f9fc5
FG
3341}
3342
11fdf7f2
TL
3343float OSD::get_osd_delete_sleep()
3344{
3345 float osd_delete_sleep = cct->_conf.get_val<double>("osd_delete_sleep");
3346 if (osd_delete_sleep > 0)
3347 return osd_delete_sleep;
3348 if (!store_is_rotational && !journal_is_rotational)
3349 return cct->_conf.get_val<double>("osd_delete_sleep_ssd");
3350 if (store_is_rotational && !journal_is_rotational)
3351 return cct->_conf.get_val<double>("osd_delete_sleep_hybrid");
3352 return cct->_conf.get_val<double>("osd_delete_sleep_hdd");
3353}
3354
9f95a23c
TL
3355int OSD::get_recovery_max_active()
3356{
3357 if (cct->_conf->osd_recovery_max_active)
3358 return cct->_conf->osd_recovery_max_active;
3359 if (store_is_rotational)
3360 return cct->_conf->osd_recovery_max_active_hdd;
3361 else
3362 return cct->_conf->osd_recovery_max_active_ssd;
3363}
3364
494da23a
TL
3365float OSD::get_osd_snap_trim_sleep()
3366{
3367 float osd_snap_trim_sleep = cct->_conf.get_val<double>("osd_snap_trim_sleep");
3368 if (osd_snap_trim_sleep > 0)
3369 return osd_snap_trim_sleep;
3370 if (!store_is_rotational && !journal_is_rotational)
3371 return cct->_conf.get_val<double>("osd_snap_trim_sleep_ssd");
3372 if (store_is_rotational && !journal_is_rotational)
3373 return cct->_conf.get_val<double>("osd_snap_trim_sleep_hybrid");
3374 return cct->_conf.get_val<double>("osd_snap_trim_sleep_hdd");
3375}
3376
7c673cae
FG
// Bring the OSD to the point where boot can begin: mount the ObjectStore,
// validate superblock/compat features, load PGs, wire up messengers and the
// mon/mgr clients, start worker threads and timers, then call start_boot().
// Returns 0 on success or a negative errno; on the error path ("out" label)
// the store is unmounted and released.
int OSD::init()
{
  OSDMapRef osdmap;
  CompatSet initial, diff;
  std::lock_guard lock(osd_lock);
  if (is_stopping())
    return 0;

  tick_timer.init();
  tick_timer_without_osd_lock.init();
  service.recovery_request_timer.init();
  service.sleep_timer.init();

  boot_finisher.start();

  // recover the release recorded on disk at last "require_osd_release" bump
  {
    string val;
    store->read_meta("require_osd_release", &val);
    last_require_osd_release = ceph_release_from_name(val);
  }

  // mount.
  dout(2) << "init " << dev_path
	  << " (looks like " << (store_is_rotational ? "hdd" : "ssd") << ")"
	  << dendl;
  dout(2) << "journal " << journal_path << dendl;
  ceph_assert(store);  // call pre_init() first!

  store->set_cache_shards(get_num_cache_shards());

  int r = store->mount();
  if (r < 0) {
    derr << "OSD:init: unable to mount object store" << dendl;
    return r;
  }
  journal_is_rotational = store->is_journal_rotational();
  dout(2) << "journal looks like " << (journal_is_rotational ? "hdd" : "ssd")
          << dendl;

  enable_disable_fuse(false);

  dout(2) << "boot" << dendl;

  service.meta_ch = store->open_collection(coll_t::meta());

  // initialize the daily loadavg with current 15min loadavg
  double loadavgs[3];
  if (getloadavg(loadavgs, 3) == 3) {
    daily_loadavg = loadavgs[2];
  } else {
    derr << "OSD::init() : couldn't read loadavgs\n" << dendl;
    daily_loadavg = 1.0;
  }

  int rotating_auth_attempts = 0;
  auto rotating_auth_timeout =
    g_conf().get_val<int64_t>("rotating_keys_bootstrap_timeout");

  // sanity check long object name handling
  {
    hobject_t l;
    l.oid.name = string(cct->_conf->osd_max_object_name_len, 'n');
    l.set_key(string(cct->_conf->osd_max_object_name_len, 'k'));
    l.nspace = string(cct->_conf->osd_max_object_namespace_len, 's');
    r = store->validate_hobject_key(l);
    if (r < 0) {
      derr << "backend (" << store->get_type() << ") is unable to support max "
	   << "object name[space] len" << dendl;
      derr << "   osd max object name len = "
	   << cct->_conf->osd_max_object_name_len << dendl;
      derr << "   osd max object namespace len = "
	   << cct->_conf->osd_max_object_namespace_len << dendl;
      derr << cpp_strerror(r) << dendl;
      // optionally tolerate the failure and keep starting up
      if (cct->_conf->osd_check_max_object_name_len_on_startup) {
	goto out;
      }
      derr << "osd_check_max_object_name_len_on_startup = false, starting anyway"
	   << dendl;
    } else {
      dout(20) << "configured osd_max_object_name[space]_len looks ok" << dendl;
    }
  }

  // read superblock
  r = read_superblock();
  if (r < 0) {
    derr << "OSD::init() : unable to read osd superblock" << dendl;
    r = -EINVAL;
    goto out;
  }

  // refuse to run against an on-disk format with features we don't support
  if (osd_compat.compare(superblock.compat_features) < 0) {
    derr << "The disk uses features unsupported by the executable." << dendl;
    derr << " ondisk features " << superblock.compat_features << dendl;
    derr << " daemon features " << osd_compat << dendl;

    if (osd_compat.writeable(superblock.compat_features)) {
      CompatSet diff = osd_compat.unsupported(superblock.compat_features);
      derr << "it is still writeable, though. Missing features: " << diff << dendl;
      r = -EOPNOTSUPP;
      goto out;
    }
    else {
      CompatSet diff = osd_compat.unsupported(superblock.compat_features);
      derr << "Cannot write to disk! Missing features: " << diff << dendl;
      r = -EOPNOTSUPP;
      goto out;
    }
  }

  assert_warn(whoami == superblock.whoami);
  if (whoami != superblock.whoami) {
    derr << "OSD::init: superblock says osd"
	 << superblock.whoami << " but I am osd." << whoami << dendl;
    r = -EINVAL;
    goto out;
  }

  startup_time = ceph::mono_clock::now();

  // load up "current" osdmap
  assert_warn(!get_osdmap());
  if (get_osdmap()) {
    derr << "OSD::init: unable to read current osdmap" << dendl;
    r = -EINVAL;
    goto out;
  }
  osdmap = get_map(superblock.current_epoch);
  set_osdmap(osdmap);

  // make sure we don't have legacy pgs deleting
  {
    vector<coll_t> ls;
    int r = store->list_collections(ls);  // NOTE: shadows the outer r
    ceph_assert(r >= 0);
    for (auto c : ls) {
      spg_t pgid;
      if (c.is_pg(&pgid) &&
	  !osdmap->have_pg_pool(pgid.pool())) {
	ghobject_t oid = make_final_pool_info_oid(pgid.pool());
	if (!store->exists(service.meta_ch, oid)) {
	  derr << __func__ << " missing pg_pool_t for deleted pool "
	       << pgid.pool() << " for pg " << pgid
	       << "; please downgrade to luminous and allow "
	       << "pg deletion to complete before upgrading" << dendl;
	  ceph_abort();
	}
      }
    }
  }

  initial = get_osd_initial_compat_set();
  diff = superblock.compat_features.unsupported(initial);
  if (superblock.compat_features.merge(initial)) {
    // Are we adding SNAPMAPPER2?
    if (diff.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER2)) {
      dout(1) << __func__ << " upgrade snap_mapper (first start as octopus)"
	      << dendl;
      auto ch = service.meta_ch;
      auto hoid = make_snapmapper_oid();
      unsigned max = cct->_conf->osd_target_transaction_size;
      r = SnapMapper::convert_legacy(cct, store, ch, hoid, max);
      if (r < 0)
	goto out;
    }
    // We need to persist the new compat_set before we
    // do anything else
    dout(5) << "Upgrading superblock adding: " << diff << dendl;
    ObjectStore::Transaction t;
    write_superblock(t);
    r = store->queue_transaction(service.meta_ch, std::move(t));
    if (r < 0)
      goto out;
  }

  // make sure snap mapper object exists
  if (!store->exists(service.meta_ch, OSD::make_snapmapper_oid())) {
    dout(10) << "init creating/touching snapmapper object" << dendl;
    ObjectStore::Transaction t;
    t.touch(coll_t::meta(), OSD::make_snapmapper_oid());
    r = store->queue_transaction(service.meta_ch, std::move(t));
    if (r < 0)
      goto out;
  }
  // ... and the purged_snaps object
  if (!store->exists(service.meta_ch, OSD::make_purged_snaps_oid())) {
    dout(10) << "init creating/touching purged_snaps object" << dendl;
    ObjectStore::Transaction t;
    t.touch(coll_t::meta(), OSD::make_purged_snaps_oid());
    r = store->queue_transaction(service.meta_ch, std::move(t));
    if (r < 0)
      goto out;
  }

  // class load failures are only a warning, not fatal
  if (cct->_conf->osd_open_classes_on_start) {
    int r = ClassHandler::get_instance().open_all_classes();
    if (r)
      dout(1) << "warning: got an error loading one or more classes: " << cpp_strerror(r) << dendl;
  }

  check_osdmap_features();

  {
    epoch_t bind_epoch = osdmap->get_epoch();
    service.set_epochs(NULL, NULL, &bind_epoch);
  }

  clear_temp_objects();

  // initialize osdmap references in sharded wq
  for (auto& shard : shards) {
    std::lock_guard l(shard->osdmap_lock);
    shard->shard_osdmap = osdmap;
  }

  // load up pgs (as they previously existed)
  load_pgs();

  dout(2) << "superblock: I am osd." << superblock.whoami << dendl;

  if (cct->_conf.get_val<bool>("osd_compact_on_start")) {
    dout(2) << "compacting object store's omap" << dendl;
    store->compact();
  }

  // prime osd stats
  {
    struct store_statfs_t stbuf;
    osd_alert_list_t alerts;
    int r = store->statfs(&stbuf, &alerts);  // NOTE: shadows the outer r
    ceph_assert(r == 0);
    service.set_statfs(stbuf, alerts);
  }

  // client_messenger's auth_client will be set up by monc->init() later.
  for (auto m : { cluster_messenger,
	objecter_messenger,
	hb_front_client_messenger,
	hb_back_client_messenger,
	hb_front_server_messenger,
	hb_back_server_messenger } ) {
    m->set_auth_client(monc);
  }
  for (auto m : { client_messenger,
	cluster_messenger,
	hb_front_server_messenger,
	hb_back_server_messenger }) {
    m->set_auth_server(monc);
  }
  monc->set_handle_authentication_dispatcher(this);

  monc->set_want_keys(CEPH_ENTITY_TYPE_MON | CEPH_ENTITY_TYPE_OSD
                      | CEPH_ENTITY_TYPE_MGR);
  r = monc->init();
  if (r < 0)
    goto out;

  // hook up mgr stats/perf-query callbacks before mgrc starts
  mgrc.set_pgstats_cb([this]() { return collect_pg_stats(); });
  mgrc.set_perf_metric_query_cb(
    [this](const ConfigPayload &config_payload) {
      set_perf_queries(config_payload);
    },
    [this] {
      return get_perf_reports();
    });
  mgrc.init();

  // tell monc about log_client so it will know about mon session resets
  monc->set_log_client(&log_client);
  update_log_config();

  // i'm ready!
  client_messenger->add_dispatcher_tail(&mgrc);
  client_messenger->add_dispatcher_tail(this);
  cluster_messenger->add_dispatcher_head(this);

  hb_front_client_messenger->add_dispatcher_head(&heartbeat_dispatcher);
  hb_back_client_messenger->add_dispatcher_head(&heartbeat_dispatcher);
  hb_front_server_messenger->add_dispatcher_head(&heartbeat_dispatcher);
  hb_back_server_messenger->add_dispatcher_head(&heartbeat_dispatcher);

  objecter_messenger->add_dispatcher_head(service.objecter.get());

  service.init();
  service.publish_map(osdmap);
  service.publish_superblock(superblock);
  service.max_oldest_map = superblock.oldest_map;

  // prime pending splits/merges for PGs loaded into the shards
  for (auto& shard : shards) {
    // put PGs in a temporary set because we may modify pg_slots
    // unordered_map below.
    set<PGRef> pgs;
    for (auto& i : shard->pg_slots) {
      PGRef pg = i.second->pg;
      if (!pg) {
	continue;
      }
      pgs.insert(pg);
    }
    for (auto pg : pgs) {
      std::scoped_lock l{*pg};
      set<pair<spg_t,epoch_t>> new_children;
      set<pair<spg_t,epoch_t>> merge_pgs;
      service.identify_splits_and_merges(pg->get_osdmap(), osdmap, pg->pg_id,
					 &new_children, &merge_pgs);
      if (!new_children.empty()) {
	for (auto shard : shards) {
	  shard->prime_splits(osdmap, &new_children);
	}
	assert(new_children.empty());  // every child must be claimed by a shard
      }
      if (!merge_pgs.empty()) {
	for (auto shard : shards) {
	  shard->prime_merges(osdmap, &merge_pgs);
	}
	assert(merge_pgs.empty());  // every merge must be claimed by a shard
      }
    }
  }

  osd_op_tp.start();

  // start the heartbeat
  heartbeat_thread.create("osd_srv_heartbt");

  // tick
  tick_timer.add_event_after(get_tick_interval(),
			     new C_Tick(this));
  {
    std::lock_guard l(tick_timer_lock);
    tick_timer_without_osd_lock.add_event_after(get_tick_interval(),
						new C_Tick_WithoutOSDLock(this));
  }

  // drop osd_lock while blocking on the monitor for auth/keys/crush updates
  osd_lock.unlock();

  r = monc->authenticate();
  if (r < 0) {
    derr << __func__ << " authentication failed: " << cpp_strerror(r)
         << dendl;
    exit(1);
  }

  while (monc->wait_auth_rotating(rotating_auth_timeout) < 0) {
    derr << "unable to obtain rotating service keys; retrying" << dendl;
    ++rotating_auth_attempts;
    if (rotating_auth_attempts > g_conf()->max_rotating_auth_attempts) {
      derr << __func__ << " wait_auth_rotating timed out" << dendl;
      exit(1);
    }
  }

  r = update_crush_device_class();
  if (r < 0) {
    derr << __func__ << " unable to update_crush_device_class: "
	 << cpp_strerror(r) << dendl;
    exit(1);
  }

  r = update_crush_location();
  if (r < 0) {
    derr << __func__ << " unable to update_crush_location: "
         << cpp_strerror(r) << dendl;
    exit(1);
  }

  osd_lock.lock();
  if (is_stopping())
    return 0;

  // start objecter *after* we have authenticated, so that we don't ignore
  // the OSDMaps it requests.
  service.final_init();

  check_config();

  dout(10) << "ensuring pgs have consumed prior maps" << dendl;
  consume_map();

  dout(0) << "done with init, starting boot process" << dendl;

  // subscribe to any pg creations
  monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0);

  // MgrClient needs this (it doesn't have MonClient reference itself)
  monc->sub_want("mgrmap", 0, 0);

  // we don't need to ask for an osdmap here; objecter will
  //monc->sub_want("osdmap", osdmap->get_epoch(), CEPH_SUBSCRIBE_ONETIME);

  monc->renew_subs();

  start_boot();

  return 0;

out:
  // error path: release the fuse mount (if any) and the object store
  enable_disable_fuse(true);
  store->umount();
  delete store;
  store = NULL;
  return r;
}
3779
3780void OSD::final_init()
3781{
3782 AdminSocket *admin_socket = cct->get_admin_socket();
3783 asok_hook = new OSDSocketHook(this);
9f95a23c 3784 int r = admin_socket->register_command("status", asok_hook,
7c673cae 3785 "high-level status of OSD");
11fdf7f2 3786 ceph_assert(r == 0);
9f95a23c 3787 r = admin_socket->register_command("flush_journal",
7c673cae
FG
3788 asok_hook,
3789 "flush the journal to permanent store");
11fdf7f2 3790 ceph_assert(r == 0);
9f95a23c 3791 r = admin_socket->register_command("dump_ops_in_flight " \
c07f9fc5
FG
3792 "name=filterstr,type=CephString,n=N,req=false",
3793 asok_hook,
7c673cae 3794 "show the ops currently in flight");
11fdf7f2 3795 ceph_assert(r == 0);
9f95a23c 3796 r = admin_socket->register_command("ops " \
c07f9fc5
FG
3797 "name=filterstr,type=CephString,n=N,req=false",
3798 asok_hook,
7c673cae 3799 "show the ops currently in flight");
11fdf7f2 3800 ceph_assert(r == 0);
9f95a23c 3801 r = admin_socket->register_command("dump_blocked_ops " \
c07f9fc5
FG
3802 "name=filterstr,type=CephString,n=N,req=false",
3803 asok_hook,
7c673cae 3804 "show the blocked ops currently in flight");
11fdf7f2 3805 ceph_assert(r == 0);
9f95a23c 3806 r = admin_socket->register_command("dump_historic_ops " \
c07f9fc5 3807 "name=filterstr,type=CephString,n=N,req=false",
7c673cae
FG
3808 asok_hook,
3809 "show recent ops");
11fdf7f2 3810 ceph_assert(r == 0);
9f95a23c 3811 r = admin_socket->register_command("dump_historic_slow_ops " \
c07f9fc5 3812 "name=filterstr,type=CephString,n=N,req=false",
7c673cae
FG
3813 asok_hook,
3814 "show slowest recent ops");
11fdf7f2 3815 ceph_assert(r == 0);
9f95a23c 3816 r = admin_socket->register_command("dump_historic_ops_by_duration " \
c07f9fc5 3817 "name=filterstr,type=CephString,n=N,req=false",
7c673cae
FG
3818 asok_hook,
3819 "show slowest recent ops, sorted by duration");
11fdf7f2 3820 ceph_assert(r == 0);
9f95a23c 3821 r = admin_socket->register_command("dump_op_pq_state",
7c673cae
FG
3822 asok_hook,
3823 "dump op priority queue state");
11fdf7f2 3824 ceph_assert(r == 0);
f67539c2 3825 r = admin_socket->register_command("dump_blocklist",
7c673cae 3826 asok_hook,
f67539c2 3827 "dump blocklisted clients and times");
11fdf7f2 3828 ceph_assert(r == 0);
9f95a23c 3829 r = admin_socket->register_command("dump_watchers",
7c673cae
FG
3830 asok_hook,
3831 "show clients which have active watches,"
3832 " and on which objects");
11fdf7f2 3833 ceph_assert(r == 0);
9f95a23c 3834 r = admin_socket->register_command("dump_recovery_reservations",
7c673cae
FG
3835 asok_hook,
3836 "show recovery reservations");
11fdf7f2 3837 ceph_assert(r == 0);
9f95a23c 3838 r = admin_socket->register_command("dump_scrub_reservations",
eafe8130 3839 asok_hook,
f6b5b4d7 3840 "show scrub reservations");
eafe8130 3841 ceph_assert(r == 0);
9f95a23c 3842 r = admin_socket->register_command("get_latest_osdmap",
7c673cae
FG
3843 asok_hook,
3844 "force osd to update the latest map from "
3845 "the mon");
11fdf7f2 3846 ceph_assert(r == 0);
7c673cae 3847
9f95a23c 3848 r = admin_socket->register_command("set_heap_property " \
7c673cae
FG
3849 "name=property,type=CephString " \
3850 "name=value,type=CephInt",
3851 asok_hook,
3852 "update malloc extension heap property");
11fdf7f2 3853 ceph_assert(r == 0);
7c673cae 3854
9f95a23c 3855 r = admin_socket->register_command("get_heap_property " \
7c673cae
FG
3856 "name=property,type=CephString",
3857 asok_hook,
3858 "get malloc extension heap property");
11fdf7f2 3859 ceph_assert(r == 0);
7c673cae
FG
3860
3861 r = admin_socket->register_command("dump_objectstore_kv_stats",
7c673cae
FG
3862 asok_hook,
3863 "print statistics of kvdb which used by bluestore");
11fdf7f2 3864 ceph_assert(r == 0);
7c673cae
FG
3865
3866 r = admin_socket->register_command("dump_scrubs",
7c673cae
FG
3867 asok_hook,
3868 "print scheduled scrubs");
11fdf7f2 3869 ceph_assert(r == 0);
7c673cae
FG
3870
3871 r = admin_socket->register_command("calc_objectstore_db_histogram",
7c673cae
FG
3872 asok_hook,
3873 "Generate key value histogram of kvdb(rocksdb) which used by bluestore");
11fdf7f2 3874 ceph_assert(r == 0);
7c673cae
FG
3875
3876 r = admin_socket->register_command("flush_store_cache",
7c673cae
FG
3877 asok_hook,
3878 "Flush bluestore internal cache");
11fdf7f2 3879 ceph_assert(r == 0);
9f95a23c 3880 r = admin_socket->register_command("dump_pgstate_history",
7c673cae
FG
3881 asok_hook,
3882 "show recent state history");
11fdf7f2 3883 ceph_assert(r == 0);
7c673cae 3884
9f95a23c 3885 r = admin_socket->register_command("compact",
224ce89b
WB
3886 asok_hook,
3887 "Commpact object store's omap."
3888 " WARNING: Compaction probably slows your requests");
11fdf7f2
TL
3889 ceph_assert(r == 0);
3890
9f95a23c 3891 r = admin_socket->register_command("get_mapped_pools",
11fdf7f2
TL
3892 asok_hook,
3893 "dump pools whose PG(s) are mapped to this OSD.");
3894
3895 ceph_assert(r == 0);
3896
9f95a23c 3897 r = admin_socket->register_command("smart name=devid,type=CephString,req=false",
11fdf7f2
TL
3898 asok_hook,
3899 "probe OSD devices for SMART data.");
3900
3901 ceph_assert(r == 0);
3902
9f95a23c 3903 r = admin_socket->register_command("list_devices",
11fdf7f2
TL
3904 asok_hook,
3905 "list OSD devices.");
9f95a23c 3906 r = admin_socket->register_command("send_beacon",
11fdf7f2
TL
3907 asok_hook,
3908 "send OSD beacon to mon immediately");
224ce89b 3909
9f95a23c
TL
3910 r = admin_socket->register_command(
3911 "dump_osd_network name=value,type=CephInt,req=false", asok_hook,
3912 "Dump osd heartbeat network ping times");
eafe8130
TL
3913 ceph_assert(r == 0);
3914
7c673cae
FG
3915 test_ops_hook = new TestOpsSocketHook(&(this->service), this->store);
3916 // Note: pools are CephString instead of CephPoolname because
3917 // these commands traditionally support both pool names and numbers
3918 r = admin_socket->register_command(
7c673cae
FG
3919 "setomapval " \
3920 "name=pool,type=CephString " \
3921 "name=objname,type=CephObjectname " \
3922 "name=key,type=CephString "\
3923 "name=val,type=CephString",
3924 test_ops_hook,
3925 "set omap key");
11fdf7f2 3926 ceph_assert(r == 0);
7c673cae 3927 r = admin_socket->register_command(
7c673cae
FG
3928 "rmomapkey " \
3929 "name=pool,type=CephString " \
3930 "name=objname,type=CephObjectname " \
3931 "name=key,type=CephString",
3932 test_ops_hook,
3933 "remove omap key");
11fdf7f2 3934 ceph_assert(r == 0);
7c673cae 3935 r = admin_socket->register_command(
7c673cae
FG
3936 "setomapheader " \
3937 "name=pool,type=CephString " \
3938 "name=objname,type=CephObjectname " \
3939 "name=header,type=CephString",
3940 test_ops_hook,
3941 "set omap header");
11fdf7f2 3942 ceph_assert(r == 0);
7c673cae
FG
3943
3944 r = admin_socket->register_command(
7c673cae
FG
3945 "getomap " \
3946 "name=pool,type=CephString " \
3947 "name=objname,type=CephObjectname",
3948 test_ops_hook,
3949 "output entire object map");
11fdf7f2 3950 ceph_assert(r == 0);
7c673cae
FG
3951
3952 r = admin_socket->register_command(
7c673cae
FG
3953 "truncobj " \
3954 "name=pool,type=CephString " \
3955 "name=objname,type=CephObjectname " \
3956 "name=len,type=CephInt",
3957 test_ops_hook,
3958 "truncate object to length");
11fdf7f2 3959 ceph_assert(r == 0);
7c673cae
FG
3960
3961 r = admin_socket->register_command(
7c673cae
FG
3962 "injectdataerr " \
3963 "name=pool,type=CephString " \
3964 "name=objname,type=CephObjectname " \
3965 "name=shardid,type=CephInt,req=false,range=0|255",
3966 test_ops_hook,
3967 "inject data error to an object");
11fdf7f2 3968 ceph_assert(r == 0);
7c673cae
FG
3969
3970 r = admin_socket->register_command(
7c673cae
FG
3971 "injectmdataerr " \
3972 "name=pool,type=CephString " \
3973 "name=objname,type=CephObjectname " \
3974 "name=shardid,type=CephInt,req=false,range=0|255",
3975 test_ops_hook,
3976 "inject metadata error to an object");
11fdf7f2 3977 ceph_assert(r == 0);
7c673cae 3978 r = admin_socket->register_command(
7c673cae
FG
3979 "set_recovery_delay " \
3980 "name=utime,type=CephInt,req=false",
3981 test_ops_hook,
3982 "Delay osd recovery by specified seconds");
11fdf7f2 3983 ceph_assert(r == 0);
7c673cae 3984 r = admin_socket->register_command(
7c673cae
FG
3985 "injectfull " \
3986 "name=type,type=CephString,req=false " \
3987 "name=count,type=CephInt,req=false ",
3988 test_ops_hook,
3989 "Inject a full disk (optional count times)");
11fdf7f2 3990 ceph_assert(r == 0);
9f95a23c
TL
3991 r = admin_socket->register_command(
3992 "bench " \
3993 "name=count,type=CephInt,req=false " \
3994 "name=size,type=CephInt,req=false " \
3995 "name=object_size,type=CephInt,req=false " \
3996 "name=object_num,type=CephInt,req=false ",
3997 asok_hook,
3998 "OSD benchmark: write <count> <size>-byte objects(with <obj_size> <obj_num>), " \
3999 "(default count=1G default size=4MB). Results in log.");
4000 ceph_assert(r == 0);
4001 r = admin_socket->register_command(
4002 "cluster_log " \
4003 "name=level,type=CephChoices,strings=error,warning,info,debug " \
4004 "name=message,type=CephString,n=N",
4005 asok_hook,
4006 "log a message to the cluster log");
4007 ceph_assert(r == 0);
4008 r = admin_socket->register_command(
4009 "flush_pg_stats",
4010 asok_hook,
4011 "flush pg stats");
4012 ceph_assert(r == 0);
4013 r = admin_socket->register_command(
4014 "heap " \
4015 "name=heapcmd,type=CephChoices,strings=" \
4016 "dump|start_profiler|stop_profiler|release|get_release_rate|set_release_rate|stats " \
4017 "name=value,type=CephString,req=false",
4018 asok_hook,
4019 "show heap usage info (available only if compiled with tcmalloc)");
4020 ceph_assert(r == 0);
4021 r = admin_socket->register_command(
4022 "debug dump_missing " \
4023 "name=filename,type=CephFilepath",
4024 asok_hook,
4025 "dump missing objects to a named file");
4026 ceph_assert(r == 0);
4027 r = admin_socket->register_command(
4028 "debug kick_recovery_wq " \
4029 "name=delay,type=CephInt,range=0",
4030 asok_hook,
4031 "set osd_recovery_delay_start to <val>");
4032 ceph_assert(r == 0);
4033 r = admin_socket->register_command(
4034 "cpu_profiler " \
4035 "name=arg,type=CephChoices,strings=status|flush",
4036 asok_hook,
4037 "run cpu profiling on daemon");
4038 ceph_assert(r == 0);
4039 r = admin_socket->register_command(
4040 "dump_pg_recovery_stats",
4041 asok_hook,
4042 "dump pg recovery statistics");
4043 ceph_assert(r == 0);
4044 r = admin_socket->register_command(
4045 "reset_pg_recovery_stats",
4046 asok_hook,
4047 "reset pg recovery statistics");
4048 ceph_assert(r == 0);
4049 r = admin_socket->register_command(
4050 "cache drop",
4051 asok_hook,
4052 "Drop all OSD caches");
4053 ceph_assert(r == 0);
4054 r = admin_socket->register_command(
4055 "cache status",
4056 asok_hook,
4057 "Get OSD caches statistics");
4058 ceph_assert(r == 0);
4059 r = admin_socket->register_command(
4060 "scrub_purged_snaps",
4061 asok_hook,
4062 "Scrub purged_snaps vs snapmapper index");
4063 ceph_assert(r == 0);
7c673cae 4064
9f95a23c
TL
4065 // -- pg commands --
4066 // old form: ceph pg <pgid> command ...
4067 r = admin_socket->register_command(
4068 "pg " \
4069 "name=pgid,type=CephPgid " \
4070 "name=cmd,type=CephChoices,strings=query",
4071 asok_hook,
4072 "");
4073 ceph_assert(r == 0);
4074 r = admin_socket->register_command(
4075 "pg " \
4076 "name=pgid,type=CephPgid " \
4077 "name=cmd,type=CephChoices,strings=mark_unfound_lost " \
4078 "name=mulcmd,type=CephChoices,strings=revert|delete",
4079 asok_hook,
4080 "");
4081 ceph_assert(r == 0);
4082 r = admin_socket->register_command(
4083 "pg " \
4084 "name=pgid,type=CephPgid " \
4085 "name=cmd,type=CephChoices,strings=list_unfound " \
4086 "name=offset,type=CephString,req=false",
4087 asok_hook,
4088 "");
4089 ceph_assert(r == 0);
4090 r = admin_socket->register_command(
4091 "pg " \
4092 "name=pgid,type=CephPgid " \
4093 "name=cmd,type=CephChoices,strings=scrub " \
4094 "name=time,type=CephInt,req=false",
4095 asok_hook,
4096 "");
4097 ceph_assert(r == 0);
4098 r = admin_socket->register_command(
4099 "pg " \
4100 "name=pgid,type=CephPgid " \
4101 "name=cmd,type=CephChoices,strings=deep_scrub " \
4102 "name=time,type=CephInt,req=false",
4103 asok_hook,
4104 "");
4105 ceph_assert(r == 0);
4106 // new form: tell <pgid> <cmd> for both cli and rest
4107 r = admin_socket->register_command(
4108 "query",
4109 asok_hook,
4110 "show details of a specific pg");
4111 ceph_assert(r == 0);
4112 r = admin_socket->register_command(
4113 "mark_unfound_lost " \
4114 "name=pgid,type=CephPgid,req=false " \
4115 "name=mulcmd,type=CephChoices,strings=revert|delete",
4116 asok_hook,
4117 "mark all unfound objects in this pg as lost, either removing or reverting to a prior version if one is available");
4118 ceph_assert(r == 0);
4119 r = admin_socket->register_command(
4120 "list_unfound " \
4121 "name=pgid,type=CephPgid,req=false " \
4122 "name=offset,type=CephString,req=false",
4123 asok_hook,
4124 "list unfound objects on this pg, perhaps starting at an offset given in JSON");
4125 ceph_assert(r == 0);
4126 r = admin_socket->register_command(
4127 "scrub " \
4128 "name=pgid,type=CephPgid,req=false " \
4129 "name=time,type=CephInt,req=false",
4130 asok_hook,
4131 "Trigger a scheduled scrub ");
4132 ceph_assert(r == 0);
4133 r = admin_socket->register_command(
4134 "deep_scrub " \
4135 "name=pgid,type=CephPgid,req=false " \
4136 "name=time,type=CephInt,req=false",
4137 asok_hook,
4138 "Trigger a scheduled deep scrub ");
4139 ceph_assert(r == 0);
4140}
7c673cae 4141
f67539c2 4142PerfCounters* OSD::create_logger()
9f95a23c 4143{
f67539c2 4144 PerfCounters* logger = build_osd_logger(cct);
7c673cae 4145 cct->get_perfcounters_collection()->add(logger);
f67539c2 4146 return logger;
7c673cae
FG
4147}
4148
f67539c2 4149PerfCounters* OSD::create_recoverystate_perf()
7c673cae 4150{
f67539c2 4151 PerfCounters* recoverystate_perf = build_recoverystate_perf(cct);
7c673cae 4152 cct->get_perfcounters_collection()->add(recoverystate_perf);
f67539c2 4153 return recoverystate_perf;
7c673cae
FG
4154}
4155
// Orderly shutdown of the OSD daemon.
//
// The teardown sequence is strictly ordered: work queues are drained
// before PGs are shut down, heartbeats stop before their messengers are
// marked down, and the store is unmounted last.  Do not reorder steps
// without understanding the locking (osd_lock is repeatedly dropped and
// re-taken to avoid deadlocks with threads being joined).
//
// Returns 0 on a clean (or already-in-progress) shutdown, or the error
// from writing the superblock.
int OSD::shutdown()
{
  // Fast path: skip the graceful teardown entirely and exit the process.
  if (cct->_conf->osd_fast_shutdown) {
    derr << "*** Immediate shutdown (osd_fast_shutdown=true) ***" << dendl;
    if (cct->_conf->osd_fast_shutdown_notify_mon)
      service.prepare_to_stop();
    cct->_log->flush();
    _exit(0);
  }

  if (!service.prepare_to_stop())
    return 0; // already shutting down
  osd_lock.lock();
  if (is_stopping()) {
    osd_lock.unlock();
    return 0;
  }
  dout(0) << "shutdown" << dendl;

  set_state(STATE_STOPPING);

  // Debugging: optionally crank up log levels so the shutdown path is
  // fully traced.
  if (cct->_conf.get_val<bool>("osd_debug_shutdown")) {
    cct->_conf.set_val("debug_osd", "100");
    cct->_conf.set_val("debug_journal", "100");
    cct->_conf.set_val("debug_filestore", "100");
    cct->_conf.set_val("debug_bluestore", "100");
    cct->_conf.set_val("debug_ms", "100");
    cct->_conf.apply_changes(nullptr);
  }

  // stop MgrClient earlier as it's more like an internal consumer of OSD
  mgrc.shutdown();

  service.start_shutdown();

  // stop sending work to pgs. this just prevents any new work in _process
  // from racing with on_shutdown and potentially entering the pg after.
  op_shardedwq.drain();

  // Shutdown PGs
  {
    vector<PGRef> pgs;
    _get_pgs(&pgs);
    for (auto pg : pgs) {
      pg->shutdown();
    }
  }

  // drain op queue again (in case PGs requeued something)
  op_shardedwq.drain();
  {
    finished.clear(); // zap waiters (bleh, this is messy)
    waiting_for_osdmap.clear();
  }

  // unregister commands
  cct->get_admin_socket()->unregister_commands(asok_hook);
  delete asok_hook;
  asok_hook = NULL;

  cct->get_admin_socket()->unregister_commands(test_ops_hook);
  delete test_ops_hook;
  test_ops_hook = NULL;

  // Drop osd_lock while stopping the heartbeat thread so it can make
  // progress and be joined.
  osd_lock.unlock();

  {
    std::lock_guard l{heartbeat_lock};
    heartbeat_stop = true;
    heartbeat_cond.notify_all();
    heartbeat_peers.clear();
  }
  heartbeat_thread.join();

  hb_back_server_messenger->mark_down_all();
  hb_front_server_messenger->mark_down_all();
  hb_front_client_messenger->mark_down_all();
  hb_back_client_messenger->mark_down_all();

  // Stop the op thread pool (queues were already drained above).
  osd_op_tp.drain();
  osd_op_tp.stop();
  dout(10) << "op sharded tp stopped" << dendl;

  dout(10) << "stopping agent" << dendl;
  service.agent_stop();

  boot_finisher.wait_for_empty();

  osd_lock.lock();

  boot_finisher.stop();
  reset_heartbeat_peers(true);

  tick_timer.shutdown();

  {
    std::lock_guard l(tick_timer_lock);
    tick_timer_without_osd_lock.shutdown();
  }

  // note unmount epoch
  dout(10) << "noting clean unmount in epoch " << get_osdmap_epoch() << dendl;
  superblock.mounted = service.get_boot_epoch();
  superblock.clean_thru = get_osdmap_epoch();
  ObjectStore::Transaction t;
  write_superblock(t);
  int r = store->queue_transaction(service.meta_ch, std::move(t));
  if (r) {
    derr << "OSD::shutdown: error writing superblock: "
	 << cpp_strerror(r) << dendl;
  }


  service.shutdown_reserver();

  // Remove PGs
#ifdef PG_DEBUG_REFS
  service.dump_live_pgids();
#endif
  // Detach every PG from its shard and release its collection handle.
  // Any PG still holding extra references here is a leak; optionally
  // abort so the leak can be debugged.
  while (true) {
    vector<PGRef> pgs;
    _get_pgs(&pgs, true);
    if (pgs.empty()) {
      break;
    }
    for (auto& pg : pgs) {
      if (pg->is_deleted()) {
	continue;
      }
      dout(20) << " kicking pg " << pg << dendl;
      pg->lock();
      if (pg->get_num_ref() != 1) {
	derr << "pgid " << pg->get_pgid() << " has ref count of "
	     << pg->get_num_ref() << dendl;
#ifdef PG_DEBUG_REFS
	pg->dump_live_ids();
#endif
	if (cct->_conf->osd_shutdown_pgref_assert) {
	  ceph_abort();
	}
      }
      pg->ch.reset();
      pg->unlock();
    }
  }
#ifdef PG_DEBUG_REFS
  service.dump_live_pgids();
#endif

  // remove_observer may block on config callbacks; drop osd_lock around it.
  osd_lock.unlock();
  cct->_conf.remove_observer(this);
  osd_lock.lock();

  service.meta_ch.reset();

  dout(10) << "syncing store" << dendl;
  enable_disable_fuse(true);

  if (cct->_conf->osd_journal_flush_on_shutdown) {
    dout(10) << "flushing journal" << dendl;
    store->flush_journal();
  }

  monc->shutdown();
  osd_lock.unlock();
  // Drop our references to the osdmap (global and per-shard copies).
  {
    std::unique_lock l{map_lock};
    set_osdmap(OSDMapRef());
  }
  for (auto s : shards) {
    std::lock_guard l(s->osdmap_lock);
    s->shard_osdmap = OSDMapRef();
  }
  service.shutdown();

  // Final store teardown under osd_lock (held to end of function).
  std::lock_guard lock(osd_lock);
  store->umount();
  delete store;
  store = nullptr;
  dout(10) << "Store synced" << dendl;

  op_tracker.on_shutdown();

  ClassHandler::get_instance().shutdown();
  client_messenger->shutdown();
  cluster_messenger->shutdown();
  hb_front_client_messenger->shutdown();
  hb_back_client_messenger->shutdown();
  objecter_messenger->shutdown();
  hb_front_server_messenger->shutdown();
  hb_back_server_messenger->shutdown();

  return r;
}
4351
4352int OSD::mon_cmd_maybe_osd_create(string &cmd)
4353{
4354 bool created = false;
4355 while (true) {
4356 dout(10) << __func__ << " cmd: " << cmd << dendl;
4357 vector<string> vcmd{cmd};
4358 bufferlist inbl;
4359 C_SaferCond w;
4360 string outs;
4361 monc->start_mon_command(vcmd, inbl, NULL, &outs, &w);
4362 int r = w.wait();
4363 if (r < 0) {
4364 if (r == -ENOENT && !created) {
4365 string newcmd = "{\"prefix\": \"osd create\", \"id\": " + stringify(whoami)
4366 + ", \"uuid\": \"" + stringify(superblock.osd_fsid) + "\"}";
4367 vector<string> vnewcmd{newcmd};
4368 bufferlist inbl;
4369 C_SaferCond w;
4370 string outs;
4371 monc->start_mon_command(vnewcmd, inbl, NULL, &outs, &w);
4372 int r = w.wait();
4373 if (r < 0) {
4374 derr << __func__ << " fail: osd does not exist and created failed: "
4375 << cpp_strerror(r) << dendl;
4376 return r;
4377 }
4378 created = true;
4379 continue;
4380 }
4381 derr << __func__ << " fail: '" << outs << "': " << cpp_strerror(r) << dendl;
4382 return r;
4383 }
4384 break;
4385 }
4386
4387 return 0;
4388}
4389
4390int OSD::update_crush_location()
4391{
4392 if (!cct->_conf->osd_crush_update_on_start) {
4393 dout(10) << __func__ << " osd_crush_update_on_start = false" << dendl;
4394 return 0;
4395 }
4396
4397 char weight[32];
4398 if (cct->_conf->osd_crush_initial_weight >= 0) {
4399 snprintf(weight, sizeof(weight), "%.4lf", cct->_conf->osd_crush_initial_weight);
4400 } else {
4401 struct store_statfs_t st;
11fdf7f2
TL
4402 osd_alert_list_t alerts;
4403 int r = store->statfs(&st, &alerts);
7c673cae
FG
4404 if (r < 0) {
4405 derr << "statfs: " << cpp_strerror(r) << dendl;
4406 return r;
4407 }
4408 snprintf(weight, sizeof(weight), "%.4lf",
11fdf7f2
TL
4409 std::max(.00001,
4410 double(st.total) /
4411 double(1ull << 40 /* TB */)));
7c673cae
FG
4412 }
4413
9f95a23c 4414 dout(10) << __func__ << " crush location is " << cct->crush_location << dendl;
7c673cae
FG
4415
4416 string cmd =
4417 string("{\"prefix\": \"osd crush create-or-move\", ") +
9f95a23c
TL
4418 string("\"id\": ") + stringify(whoami) + ", " +
4419 string("\"weight\":") + weight + ", " +
4420 string("\"args\": [") + stringify(cct->crush_location) + "]}";
7c673cae
FG
4421 return mon_cmd_maybe_osd_create(cmd);
4422}
4423
4424int OSD::update_crush_device_class()
4425{
224ce89b
WB
4426 if (!cct->_conf->osd_class_update_on_start) {
4427 dout(10) << __func__ << " osd_class_update_on_start = false" << dendl;
4428 return 0;
4429 }
4430
7c673cae
FG
4431 string device_class;
4432 int r = store->read_meta("crush_device_class", &device_class);
224ce89b
WB
4433 if (r < 0 || device_class.empty()) {
4434 device_class = store->get_default_device_class();
4435 }
4436
4437 if (device_class.empty()) {
d2e6a577 4438 dout(20) << __func__ << " no device class stored locally" << dendl;
7c673cae 4439 return 0;
224ce89b 4440 }
7c673cae
FG
4441
4442 string cmd =
4443 string("{\"prefix\": \"osd crush set-device-class\", ") +
224ce89b
WB
4444 string("\"class\": \"") + device_class + string("\", ") +
4445 string("\"ids\": [\"") + stringify(whoami) + string("\"]}");
7c673cae 4446
224ce89b 4447 r = mon_cmd_maybe_osd_create(cmd);
11fdf7f2
TL
4448 if (r == -EBUSY) {
4449 // good, already bound to a device-class
4450 return 0;
4451 } else {
4452 return r;
4453 }
7c673cae
FG
4454}
4455
4456void OSD::write_superblock(ObjectStore::Transaction& t)
4457{
4458 dout(10) << "write_superblock " << superblock << dendl;
4459
4460 //hack: at minimum it's using the baseline feature set
4461 if (!superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_BASE))
4462 superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
4463
4464 bufferlist bl;
11fdf7f2 4465 encode(superblock, bl);
7c673cae
FG
4466 t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
4467}
4468
4469int OSD::read_superblock()
4470{
4471 bufferlist bl;
11fdf7f2 4472 int r = store->read(service.meta_ch, OSD_SUPERBLOCK_GOBJECT, 0, 0, bl);
7c673cae
FG
4473 if (r < 0)
4474 return r;
4475
11fdf7f2
TL
4476 auto p = bl.cbegin();
4477 decode(superblock, p);
7c673cae
FG
4478
4479 dout(10) << "read_superblock " << superblock << dendl;
4480
4481 return 0;
4482}
4483
// Remove leftover temp objects from every PG collection.  Temp objects
// are transient recovery/backfill artifacts; any that survive a restart
// are garbage and must be cleaned up before the PG is used.
void OSD::clear_temp_objects()
{
  dout(10) << __func__ << dendl;
  vector<coll_t> ls;
  store->list_collections(ls);
  for (vector<coll_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
    spg_t pgid;
    // Only PG collections can contain temp objects; skip meta etc.
    if (!p->is_pg(&pgid))
      continue;

    // list temp objects
    dout(20) << " clearing temps in " << *p << " pgid " << pgid << dendl;

    // Collection listing sorts temp objects first, so we can stop
    // scanning at the first non-temp object.
    vector<ghobject_t> temps;
    ghobject_t next;
    while (1) {
      vector<ghobject_t> objects;
      auto ch = store->open_collection(*p);
      ceph_assert(ch);
      store->collection_list(ch, next, ghobject_t::get_max(),
			     store->get_ideal_list_max(),
			     &objects, &next);
      if (objects.empty())
	break;
      vector<ghobject_t>::iterator q;
      for (q = objects.begin(); q != objects.end(); ++q) {
	// Hammer set pool for temps to -1, so check for clean-up
	if (q->hobj.is_temp() || (q->hobj.pool == -1)) {
	  temps.push_back(*q);
	} else {
	  break;
	}
      }
      // If we saw a non-temp object and hit the break above we can
      // break out of the while loop too.
      if (q != objects.end())
	break;
    }
    if (!temps.empty()) {
      // Batch removals so each transaction stays below
      // osd_target_transaction_size entries.
      ObjectStore::Transaction t;
      int removed = 0;
      for (vector<ghobject_t>::iterator q = temps.begin(); q != temps.end(); ++q) {
	dout(20) << " removing " << *p << " object " << *q << dendl;
	t.remove(*p, *q);
	if (++removed > cct->_conf->osd_target_transaction_size) {
	  store->queue_transaction(service.meta_ch, std::move(t));
	  t = ObjectStore::Transaction();
	  removed = 0;
	}
      }
      if (removed) {
	store->queue_transaction(service.meta_ch, std::move(t));
      }
    }
  }
}
4540
// Delete every object in collection @tmp (batched into transactions of
// at most osd_target_transaction_size removals), removing each object's
// snap-mapper entries as well, then drop the collection itself and wait
// for the deletions to commit.  Used to reap defunct/temporary PG
// collections.
void OSD::recursive_remove_collection(CephContext* cct,
				      ObjectStore *store, spg_t pgid,
				      coll_t tmp)
{
  // Driver for updating the snapmapper index alongside object removal.
  OSDriver driver(
    store,
    coll_t(),
    make_snapmapper_oid());

  ObjectStore::CollectionHandle ch = store->open_collection(tmp);
  ObjectStore::Transaction t;
  SnapMapper mapper(cct, &driver, 0, 0, 0, pgid.shard);

  ghobject_t next;
  int max = cct->_conf->osd_target_transaction_size;
  vector<ghobject_t> objects;
  objects.reserve(max);
  while (true) {
    objects.clear();
    // List up to @max objects, resuming from @next each iteration.
    store->collection_list(ch, next, ghobject_t::get_max(),
			   max, &objects, &next);
    generic_dout(10) << __func__ << " " << objects << dendl;
    if (objects.empty())
      break;
    for (auto& p: objects) {
      // Remove the object's snap mapping (ENOENT is fine: no snaps).
      OSDriver::OSTransaction _t(driver.get_transaction(&t));
      int r = mapper.remove_oid(p.hobj, &_t);
      if (r != 0 && r != -ENOENT)
	ceph_abort();
      t.remove(tmp, p);
    }
    // Queue this batch and start a fresh transaction for the next one.
    int r = store->queue_transaction(ch, std::move(t));
    ceph_assert(r == 0);
    t = ObjectStore::Transaction();
  }
  // All objects gone; remove the now-empty collection.
  t.remove_collection(tmp);
  int r = store->queue_transaction(ch, std::move(t));
  ceph_assert(r == 0);

  // Block until everything above has committed.
  C_SaferCond waiter;
  if (!ch->flush_commit(&waiter)) {
    waiter.wait();
  }
}
4585
4586
4587// ======================================================
4588// PG's
4589
7c673cae
FG
// Construct (but do not register) the in-memory PG object for @pgid.
// Pool metadata comes from @createmap when the pool still exists, or
// from the on-disk "final pool info" tombstone when the pool has been
// deleted (so a deleting PG can still be instantiated).  Returns nullptr
// if the tombstone is missing or too old to decode fully.
PG* OSD::_make_pg(
  OSDMapRef createmap,
  spg_t pgid)
{
  dout(10) << __func__ << " " << pgid << dendl;
  pg_pool_t pi;
  map<string,string> ec_profile;
  string name;
  if (createmap->have_pg_pool(pgid.pool())) {
    pi = *createmap->get_pg_pool(pgid.pool());
    name = createmap->get_pool_name(pgid.pool());
    if (pi.is_erasure()) {
      ec_profile = createmap->get_erasure_code_profile(pi.erasure_code_profile);
    }
  } else {
    // pool was deleted; grab final pg_pool_t off disk.
    ghobject_t oid = make_final_pool_info_oid(pgid.pool());
    bufferlist bl;
    int r = store->read(service.meta_ch, oid, 0, 0, bl);
    if (r < 0) {
      derr << __func__ << " missing pool " << pgid.pool() << " tombstone"
	   << dendl;
      return nullptr;
    }
    ceph_assert(r >= 0);
    // NOTE: decode order (pi, name, ec_profile) matches the encode order
    // of the tombstone; do not reorder.
    auto p = bl.cbegin();
    decode(pi, p);
    decode(name, p);
    if (p.end()) { // dev release v13.0.2 did not include ec_profile
      derr << __func__ << " missing ec_profile from pool " << pgid.pool()
	   << " tombstone" << dendl;
      return nullptr;
    }
    decode(ec_profile, p);
  }
  PGPool pool(createmap, pgid.pool(), pi, name);
  PG *pg;
  // Both replicated and EC pools are backed by PrimaryLogPG; any other
  // pool type is a programming error.
  if (pi.type == pg_pool_t::TYPE_REPLICATED ||
      pi.type == pg_pool_t::TYPE_ERASURE)
    pg = new PrimaryLogPG(&service, createmap, pool, ec_profile, pgid);
  else
    ceph_abort();
  return pg;
}
4634
11fdf7f2 4635void OSD::_get_pgs(vector<PGRef> *v, bool clear_too)
7c673cae 4636{
11fdf7f2
TL
4637 v->clear();
4638 v->reserve(get_num_pgs());
4639 for (auto& s : shards) {
4640 std::lock_guard l(s->shard_lock);
4641 for (auto& j : s->pg_slots) {
4642 if (j.second->pg &&
4643 !j.second->pg->is_deleted()) {
4644 v->push_back(j.second->pg);
4645 if (clear_too) {
4646 s->_detach_pg(j.second.get());
4647 }
4648 }
7c673cae 4649 }
7c673cae 4650 }
7c673cae
FG
4651}
4652
11fdf7f2 4653void OSD::_get_pgids(vector<spg_t> *v)
7c673cae 4654{
11fdf7f2
TL
4655 v->clear();
4656 v->reserve(get_num_pgs());
4657 for (auto& s : shards) {
4658 std::lock_guard l(s->shard_lock);
4659 for (auto& j : s->pg_slots) {
4660 if (j.second->pg &&
4661 !j.second->pg->is_deleted()) {
4662 v->push_back(j.first);
4663 }
7c673cae
FG
4664 }
4665 }
7c673cae
FG
4666}
4667
11fdf7f2 4668void OSD::register_pg(PGRef pg)
7c673cae 4669{
11fdf7f2
TL
4670 spg_t pgid = pg->get_pgid();
4671 uint32_t shard_index = pgid.hash_to_shard(num_shards);
4672 auto sdata = shards[shard_index];
4673 std::lock_guard l(sdata->shard_lock);
4674 auto r = sdata->pg_slots.emplace(pgid, make_unique<OSDShardPGSlot>());
4675 ceph_assert(r.second);
4676 auto *slot = r.first->second.get();
4677 dout(20) << __func__ << " " << pgid << " " << pg << dendl;
4678 sdata->_attach_pg(slot, pg.get());
4679}
7c673cae 4680
11fdf7f2
TL
// Attempt to complete deletion of @pg by detaching it from its shard
// slot.  Fails (returns false) if the PG is no longer registered or is
// held up waiting for a merge epoch.  On success also unprimes any
// split children and decrements the per-role PG gauge.
bool OSD::try_finish_pg_delete(PG *pg, unsigned old_pg_num)
{
  auto sdata = pg->osd_shard;
  ceph_assert(sdata);
  {
    std::lock_guard l(sdata->shard_lock);
    auto p = sdata->pg_slots.find(pg->pg_id);
    if (p == sdata->pg_slots.end() ||
	!p->second->pg) {
      dout(20) << __func__ << " " << pg->pg_id << " not found" << dendl;
      return false;
    }
    if (p->second->waiting_for_merge_epoch) {
      dout(20) << __func__ << " " << pg->pg_id << " waiting for merge" << dendl;
      return false;
    }
    dout(20) << __func__ << " " << pg->pg_id << " " << pg << dendl;
    sdata->_detach_pg(p->second.get());
  }

  // Drop any pre-primed slots for children this PG would have split into.
  for (auto shard : shards) {
    shard->unprime_split_children(pg->pg_id, old_pg_num);
  }

  // update pg count now since we might not get an osdmap any time soon.
  if (pg->is_primary())
    service.logger->dec(l_osd_pg_primary);
  else if (pg->is_nonprimary())
    service.logger->dec(l_osd_pg_replica); // misnomer: counts all non-primaries
  else
    service.logger->dec(l_osd_pg_stray);

  return true;
}
4715
11fdf7f2 4716PGRef OSD::_lookup_pg(spg_t pgid)
7c673cae 4717{
11fdf7f2
TL
4718 uint32_t shard_index = pgid.hash_to_shard(num_shards);
4719 auto sdata = shards[shard_index];
4720 std::lock_guard l(sdata->shard_lock);
4721 auto p = sdata->pg_slots.find(pgid);
4722 if (p == sdata->pg_slots.end()) {
7c673cae 4723 return nullptr;
11fdf7f2
TL
4724 }
4725 return p->second->pg;
7c673cae
FG
4726}
4727
11fdf7f2 4728PGRef OSD::_lookup_lock_pg(spg_t pgid)
31f18b77 4729{
11fdf7f2
TL
4730 PGRef pg = _lookup_pg(pgid);
4731 if (!pg) {
4732 return nullptr;
4733 }
4734 pg->lock();
4735 if (!pg->is_deleted()) {
4736 return pg;
4737 }
4738 pg->unlock();
4739 return nullptr;
31f18b77
FG
4740}
4741
// Public entry point: look up a PG by id and return it with its lock
// held, or nullptr if it does not exist / is deleted.
PGRef OSD::lookup_lock_pg(spg_t pgid)
{
  return _lookup_lock_pg(pgid);
}
4746
4747void OSD::load_pgs()
4748{
9f95a23c 4749 ceph_assert(ceph_mutex_is_locked(osd_lock));
7c673cae 4750 dout(0) << "load_pgs" << dendl;
11fdf7f2 4751
7c673cae 4752 {
11fdf7f2
TL
4753 auto pghist = make_pg_num_history_oid();
4754 bufferlist bl;
4755 int r = store->read(service.meta_ch, pghist, 0, 0, bl, 0);
4756 if (r >= 0 && bl.length() > 0) {
4757 auto p = bl.cbegin();
4758 decode(pg_num_history, p);
4759 }
4760 dout(20) << __func__ << " pg_num_history " << pg_num_history << dendl;
7c673cae
FG
4761 }
4762
4763 vector<coll_t> ls;
4764 int r = store->list_collections(ls);
4765 if (r < 0) {
4766 derr << "failed to list pgs: " << cpp_strerror(-r) << dendl;
4767 }
4768
11fdf7f2 4769 int num = 0;
7c673cae
FG
4770 for (vector<coll_t>::iterator it = ls.begin();
4771 it != ls.end();
4772 ++it) {
4773 spg_t pgid;
4774 if (it->is_temp(&pgid) ||
4775 (it->is_pg(&pgid) && PG::_has_removal_flag(store, pgid))) {
11fdf7f2
TL
4776 dout(10) << "load_pgs " << *it
4777 << " removing, legacy or flagged for removal pg" << dendl;
7c673cae
FG
4778 recursive_remove_collection(cct, store, pgid, *it);
4779 continue;
4780 }
4781
4782 if (!it->is_pg(&pgid)) {
4783 dout(10) << "load_pgs ignoring unrecognized " << *it << dendl;
4784 continue;
4785 }
4786
7c673cae 4787 dout(10) << "pgid " << pgid << " coll " << coll_t(pgid) << dendl;
7c673cae 4788 epoch_t map_epoch = 0;
11fdf7f2 4789 int r = PG::peek_map_epoch(store, pgid, &map_epoch);
7c673cae
FG
4790 if (r < 0) {
4791 derr << __func__ << " unable to peek at " << pgid << " metadata, skipping"
4792 << dendl;
4793 continue;
4794 }
4795
11fdf7f2 4796 PGRef pg;
7c673cae
FG
4797 if (map_epoch > 0) {
4798 OSDMapRef pgosdmap = service.try_get_map(map_epoch);
4799 if (!pgosdmap) {
9f95a23c 4800 if (!get_osdmap()->have_pg_pool(pgid.pool())) {
7c673cae
FG
4801 derr << __func__ << ": could not find map for epoch " << map_epoch
4802 << " on pg " << pgid << ", but the pool is not present in the "
4803 << "current map, so this is probably a result of bug 10617. "
4804 << "Skipping the pg for now, you can use ceph-objectstore-tool "
4805 << "to clean it up later." << dendl;
4806 continue;
4807 } else {
4808 derr << __func__ << ": have pgid " << pgid << " at epoch "
4809 << map_epoch << ", but missing map. Crashing."
4810 << dendl;
11fdf7f2 4811 ceph_abort_msg("Missing map in load_pgs");
7c673cae
FG
4812 }
4813 }
11fdf7f2 4814 pg = _make_pg(pgosdmap, pgid);
7c673cae 4815 } else {
9f95a23c 4816 pg = _make_pg(get_osdmap(), pgid);
7c673cae 4817 }
11fdf7f2
TL
4818 if (!pg) {
4819 recursive_remove_collection(cct, store, pgid, *it);
4820 continue;
4821 }
4822
4823 // there can be no waiters here, so we don't call _wake_pg_slot
7c673cae 4824
11fdf7f2 4825 pg->lock();
7c673cae
FG
4826 pg->ch = store->open_collection(pg->coll);
4827
4828 // read pg state, log
11fdf7f2 4829 pg->read_state(store);
7c673cae 4830
94b18763
FG
4831 if (pg->dne()) {
4832 dout(10) << "load_pgs " << *it << " deleting dne" << dendl;
4833 pg->ch = nullptr;
94b18763 4834 pg->unlock();
94b18763
FG
4835 recursive_remove_collection(cct, store, pgid, *it);
4836 continue;
4837 }
11fdf7f2
TL
4838 {
4839 uint32_t shard_index = pgid.hash_to_shard(shards.size());
4840 assert(NULL != shards[shard_index]);
4841 store->set_collection_commit_queue(pg->coll, &(shards[shard_index]->context_queue));
4842 }
7c673cae
FG
4843
4844 pg->reg_next_scrub();
4845
11fdf7f2 4846 dout(10) << __func__ << " loaded " << *pg << dendl;
7c673cae 4847 pg->unlock();
7c673cae 4848
11fdf7f2
TL
4849 register_pg(pg);
4850 ++num;
7c673cae 4851 }
11fdf7f2 4852 dout(0) << __func__ << " opened " << num << " pgs" << dendl;
7c673cae
FG
4853}
4854
4855
11fdf7f2
TL
4856PGRef OSD::handle_pg_create_info(const OSDMapRef& osdmap,
4857 const PGCreateInfo *info)
4858{
4859 spg_t pgid = info->pgid;
7c673cae 4860
11fdf7f2
TL
4861 if (maybe_wait_for_max_pg(osdmap, pgid, info->by_mon)) {
4862 dout(10) << __func__ << " hit max pg, dropping" << dendl;
4863 return nullptr;
4864 }
3efd9988 4865
9f95a23c 4866 PeeringCtx rctx = create_context();
7c673cae 4867
11fdf7f2 4868 OSDMapRef startmap = get_map(info->epoch);
7c673cae 4869
11fdf7f2
TL
4870 if (info->by_mon) {
4871 int64_t pool_id = pgid.pgid.pool();
4872 const pg_pool_t *pool = osdmap->get_pg_pool(pool_id);
4873 if (!pool) {
4874 dout(10) << __func__ << " ignoring " << pgid << ", pool dne" << dendl;
4875 return nullptr;
4876 }
9f95a23c 4877 if (osdmap->require_osd_release >= ceph_release_t::nautilus &&
11fdf7f2
TL
4878 !pool->has_flag(pg_pool_t::FLAG_CREATING)) {
4879 // this ensures we do not process old creating messages after the
4880 // pool's initial pgs have been created (and pg are subsequently
4881 // allowed to split or merge).
4882 dout(20) << __func__ << " dropping " << pgid
4883 << "create, pool does not have CREATING flag set" << dendl;
4884 return nullptr;
7c673cae
FG
4885 }
4886 }
7c673cae 4887
11fdf7f2
TL
4888 int up_primary, acting_primary;
4889 vector<int> up, acting;
4890 startmap->pg_to_up_acting_osds(
4891 pgid.pgid, &up, &up_primary, &acting, &acting_primary);
7c673cae 4892
11fdf7f2
TL
4893 const pg_pool_t* pp = startmap->get_pg_pool(pgid.pool());
4894 if (pp->has_flag(pg_pool_t::FLAG_EC_OVERWRITES) &&
4895 store->get_type() != "bluestore") {
4896 clog->warn() << "pg " << pgid
4897 << " is at risk of silent data corruption: "
4898 << "the pool allows ec overwrites but is not stored in "
4899 << "bluestore, so deep scrubbing will not detect bitrot";
7c673cae 4900 }
9f95a23c
TL
4901 create_pg_collection(
4902 rctx.transaction, pgid, pgid.get_split_bits(pp->get_pg_num()));
4903 init_pg_ondisk(rctx.transaction, pgid, pp);
7c673cae 4904
9f95a23c 4905 int role = startmap->calc_pg_role(pg_shard_t(whoami, pgid.shard), acting);
7c673cae 4906
11fdf7f2
TL
4907 PGRef pg = _make_pg(startmap, pgid);
4908 pg->ch = store->create_new_collection(pg->coll);
7c673cae 4909
11fdf7f2
TL
4910 {
4911 uint32_t shard_index = pgid.hash_to_shard(shards.size());
4912 assert(NULL != shards[shard_index]);
4913 store->set_collection_commit_queue(pg->coll, &(shards[shard_index]->context_queue));
7c673cae 4914 }
7c673cae 4915
11fdf7f2 4916 pg->lock(true);
7c673cae 4917
11fdf7f2
TL
4918 // we are holding the shard lock
4919 ceph_assert(!pg->is_deleted());
4920
4921 pg->init(
4922 role,
4923 up,
4924 up_primary,
4925 acting,
4926 acting_primary,
4927 info->history,
4928 info->past_intervals,
4929 false,
4930 rctx.transaction);
7c673cae 4931
92f5a8d4
TL
4932 pg->init_collection_pool_opts();
4933
11fdf7f2 4934 if (pg->is_primary()) {
9f95a23c 4935 std::lock_guard locker{m_perf_queries_lock};
11fdf7f2
TL
4936 pg->set_dynamic_perf_stats_queries(m_perf_queries);
4937 }
7c673cae 4938
9f95a23c
TL
4939 pg->handle_initialize(rctx);
4940 pg->handle_activate_map(rctx);
7c673cae 4941
11fdf7f2 4942 dispatch_context(rctx, pg.get(), osdmap, nullptr);
7c673cae 4943
11fdf7f2
TL
4944 dout(10) << __func__ << " new pg " << *pg << dendl;
4945 return pg;
7c673cae
FG
4946}
4947
11fdf7f2
TL
4948bool OSD::maybe_wait_for_max_pg(const OSDMapRef& osdmap,
4949 spg_t pgid,
4950 bool is_mon_create)
3efd9988
FG
4951{
4952 const auto max_pgs_per_osd =
11fdf7f2
TL
4953 (cct->_conf.get_val<uint64_t>("mon_max_pg_per_osd") *
4954 cct->_conf.get_val<double>("osd_max_pg_per_osd_hard_ratio"));
3efd9988 4955
11fdf7f2 4956 if (num_pgs < max_pgs_per_osd) {
3efd9988
FG
4957 return false;
4958 }
11fdf7f2
TL
4959
4960 std::lock_guard l(pending_creates_lock);
3efd9988
FG
4961 if (is_mon_create) {
4962 pending_creates_from_mon++;
4963 } else {
9f95a23c
TL
4964 bool is_primary = osdmap->get_pg_acting_role(pgid, whoami) == 0;
4965 pending_creates_from_osd.emplace(pgid, is_primary);
3efd9988 4966 }
1adf2230 4967 dout(1) << __func__ << " withhold creation of pg " << pgid
11fdf7f2 4968 << ": " << num_pgs << " >= "<< max_pgs_per_osd << dendl;
3efd9988
FG
4969 return true;
4970}
4971
// To force a repeat peering we must perturb the pg -> osd mapping a bit
// (see PG::should_restart_peering()).  OSDMap::pg_to_up_acting_osds()
// falls back to the up set when pg_temp is empty, so an empty pg_temp
// would be a no-op: shrink a multi-osd acting set to its first member,
// or pad a short one with -1.
static vector<int32_t> twiddle(const vector<int>& acting) {
  vector<int32_t> twiddled;
  if (acting.size() > 1) {
    twiddled.push_back(acting.front());
  } else {
    twiddled.assign(acting.begin(), acting.end());
    twiddled.push_back(-1);
  }
  return twiddled;
}
4984
// Resume PG creations that were previously withheld by
// maybe_wait_for_max_pg(): spend any spare pg budget on pending
// creates, re-trigger peering for osd-originated creates via pg_temp
// twiddling, and (re)subscribe to mon feeds as needed.
void OSD::resume_creating_pg()
{
  bool do_sub_pg_creates = false;
  bool have_pending_creates = false;
  {
    const auto max_pgs_per_osd =
      (cct->_conf.get_val<uint64_t>("mon_max_pg_per_osd") *
       cct->_conf.get_val<double>("osd_max_pg_per_osd_hard_ratio"));
    if (max_pgs_per_osd <= num_pgs) {
      // this could happen if admin decreases this setting before a PG is removed
      return;
    }
    unsigned spare_pgs = max_pgs_per_osd - num_pgs;
    std::lock_guard l(pending_creates_lock);
    if (pending_creates_from_mon > 0) {
      dout(20) << __func__ << " pending_creates_from_mon "
	       << pending_creates_from_mon << dendl;
      do_sub_pg_creates = true;
      // mon-originated creates consume budget first
      if (pending_creates_from_mon >= spare_pgs) {
	spare_pgs = pending_creates_from_mon = 0;
      } else {
	spare_pgs -= pending_creates_from_mon;
	pending_creates_from_mon = 0;
      }
    }
    // spend remaining budget on osd-originated creates; for each one,
    // queue a twiddled pg_temp so the primary restarts peering
    auto pg = pending_creates_from_osd.cbegin();
    while (spare_pgs > 0 && pg != pending_creates_from_osd.cend()) {
      dout(20) << __func__ << " pg " << pg->first << dendl;
      vector<int> acting;
      get_osdmap()->pg_to_up_acting_osds(pg->first.pgid, nullptr, nullptr, &acting, nullptr);
      service.queue_want_pg_temp(pg->first.pgid, twiddle(acting), true);
      pg = pending_creates_from_osd.erase(pg);
      do_sub_pg_creates = true;
      spare_pgs--;
    }
    have_pending_creates = (pending_creates_from_mon > 0 ||
			    !pending_creates_from_osd.empty());
  }

  bool do_renew_subs = false;
  if (do_sub_pg_creates) {
    if (monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0)) {
      dout(4) << __func__ << ": resolicit pg creates from mon since "
	      << last_pg_create_epoch << dendl;
      do_renew_subs = true;
    }
  }
  version_t start = get_osdmap_epoch() + 1;
  if (have_pending_creates) {
    // don't miss any new osdmap deleting PGs
    if (monc->sub_want("osdmap", start, 0)) {
      dout(4) << __func__ << ": resolicit osdmap from mon since "
	      << start << dendl;
      do_renew_subs = true;
    }
  } else if (do_sub_pg_creates) {
    // no need to subscribe the osdmap continuously anymore
    // once the pgtemp and/or mon_subscribe(pg_creates) is sent
    if (monc->sub_want_increment("osdmap", start, CEPH_SUBSCRIBE_ONETIME)) {
      dout(4) << __func__ << ": re-subscribe osdmap(onetime) since "
	      << start << dendl;
      do_renew_subs = true;
    }
  }

  if (do_renew_subs) {
    monc->renew_subs();
  }

  // flush any pg_temp requests queued above
  service.send_pg_temp();
}
7c673cae
FG
5056
// Build the initial pg_history_t and PastIntervals for a pg created at
// epoch @created: walk every osdmap from creation to the current epoch,
// recording each new peering interval and updating the same_*_since and
// last_epoch_split markers.
void OSD::build_initial_pg_history(
  spg_t pgid,
  epoch_t created,
  utime_t created_stamp,
  pg_history_t *h,
  PastIntervals *pi)
{
  dout(10) << __func__ << " " << pgid << " created " << created << dendl;
  *h = pg_history_t(created, created_stamp);

  OSDMapRef lastmap = service.get_map(created);
  int up_primary, acting_primary;
  vector<int> up, acting;
  lastmap->pg_to_up_acting_osds(
    pgid.pgid, &up, &up_primary, &acting, &acting_primary);

  ostringstream debug;
  // walk each epoch after creation, comparing the mapping against the
  // previous epoch to detect interval boundaries
  for (epoch_t e = created + 1; e <= get_osdmap_epoch(); ++e) {
    OSDMapRef osdmap = service.get_map(e);
    int new_up_primary, new_acting_primary;
    vector<int> new_up, new_acting;
    osdmap->pg_to_up_acting_osds(
      pgid.pgid, &new_up, &new_up_primary, &new_acting, &new_acting_primary);

    // this is a bit imprecise, but sufficient?
    // recoverability proxy: an interval counts as recoverable if the
    // shards present meet the pool's min_size
    struct min_size_predicate_t : public IsPGRecoverablePredicate {
      const pg_pool_t *pi;
      bool operator()(const set<pg_shard_t> &have) const {
	return have.size() >= pi->min_size;
      }
      explicit min_size_predicate_t(const pg_pool_t *i) : pi(i) {}
    } min_size_predicate(osdmap->get_pg_pool(pgid.pgid.pool()));

    bool new_interval = PastIntervals::check_new_interval(
      acting_primary,
      new_acting_primary,
      acting, new_acting,
      up_primary,
      new_up_primary,
      up, new_up,
      h->same_interval_since,
      h->last_epoch_clean,
      osdmap.get(),
      lastmap.get(),
      pgid.pgid,
      min_size_predicate,
      pi,
      &debug);
    if (new_interval) {
      h->same_interval_since = e;
      if (up != new_up) {
	h->same_up_since = e;
      }
      if (acting_primary != new_acting_primary) {
	h->same_primary_since = e;
      }
      // detect pg_num growth across the boundary => a split happened here
      if (pgid.pgid.is_split(lastmap->get_pg_num(pgid.pgid.pool()),
			     osdmap->get_pg_num(pgid.pgid.pool()),
			     nullptr)) {
	h->last_epoch_split = e;
      }
      // roll the "previous" mapping forward for the next comparison
      up = new_up;
      acting = new_acting;
      up_primary = new_up_primary;
      acting_primary = new_acting_primary;
    }
    lastmap = osdmap;
  }
  dout(20) << __func__ << " " << debug.str() << dendl;
  dout(10) << __func__ << " " << *h << " " << *pi
	   << " [" << (pi->empty() ? pair<epoch_t,epoch_t>(0,0) :
		       pi->get_bounds()) << ")"
	   << dendl;
}
5131
7c673cae
FG
5132void OSD::_add_heartbeat_peer(int p)
5133{
5134 if (p == whoami)
5135 return;
5136 HeartbeatInfo *hi;
5137
5138 map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(p);
5139 if (i == heartbeat_peers.end()) {
9f95a23c 5140 pair<ConnectionRef,ConnectionRef> cons = service.get_con_osd_hb(p, get_osdmap_epoch());
7c673cae
FG
5141 if (!cons.first)
5142 return;
9f95a23c
TL
5143 assert(cons.second);
5144
7c673cae
FG
5145 hi = &heartbeat_peers[p];
5146 hi->peer = p;
9f95a23c
TL
5147
5148 auto stamps = service.get_hb_stamps(p);
5149
5150 auto sb = ceph::make_ref<Session>(cct, cons.first.get());
5151 sb->peer = p;
5152 sb->stamps = stamps;
eafe8130 5153 hi->hb_interval_start = ceph_clock_now();
7c673cae 5154 hi->con_back = cons.first.get();
9f95a23c
TL
5155 hi->con_back->set_priv(sb);
5156
5157 auto sf = ceph::make_ref<Session>(cct, cons.second.get());
5158 sf->peer = p;
5159 sf->stamps = stamps;
5160 hi->con_front = cons.second.get();
5161 hi->con_front->set_priv(sf);
5162
5163 dout(10) << "_add_heartbeat_peer: new peer osd." << p
5164 << " " << hi->con_back->get_peer_addr()
5165 << " " << hi->con_front->get_peer_addr()
5166 << dendl;
7c673cae
FG
5167 } else {
5168 hi = &i->second;
5169 }
9f95a23c 5170 hi->epoch = get_osdmap_epoch();
7c673cae
FG
5171}
5172
5173void OSD::_remove_heartbeat_peer(int n)
5174{
5175 map<int,HeartbeatInfo>::iterator q = heartbeat_peers.find(n);
11fdf7f2 5176 ceph_assert(q != heartbeat_peers.end());
7c673cae
FG
5177 dout(20) << " removing heartbeat peer osd." << n
5178 << " " << q->second.con_back->get_peer_addr()
5179 << " " << (q->second.con_front ? q->second.con_front->get_peer_addr() : entity_addr_t())
5180 << dendl;
9f95a23c 5181 q->second.clear_mark_down();
7c673cae
FG
5182 heartbeat_peers.erase(q);
5183}
5184
5185void OSD::need_heartbeat_peer_update()
5186{
5187 if (is_stopping())
5188 return;
5189 dout(20) << "need_heartbeat_peer_update" << dendl;
5190 heartbeat_set_peers_need_update();
5191}
5192
// Recompute the heartbeat peer set when flagged (or periodically force
// it): add peers from live PGs, the neighboring up OSDs and a random
// subtree-spread sample; drop down peers; then trim toward
// osd_heartbeat_min_peers.  Finally cancel pending failure reports for
// peers we no longer track.  Requires: osd_lock held.
void OSD::maybe_update_heartbeat_peers()
{
  ceph_assert(ceph_mutex_is_locked(osd_lock));

  if (is_waiting_for_healthy() || is_active()) {
    utime_t now = ceph_clock_now();
    if (last_heartbeat_resample == utime_t()) {
      // first pass: force an update
      last_heartbeat_resample = now;
      heartbeat_set_peers_need_update();
    } else if (!heartbeat_peers_need_update()) {
      // periodically force a resample even if nothing flagged one
      utime_t dur = now - last_heartbeat_resample;
      if (dur > cct->_conf->osd_heartbeat_grace) {
	dout(10) << "maybe_update_heartbeat_peers forcing update after " << dur << " seconds" << dendl;
	heartbeat_set_peers_need_update();
	last_heartbeat_resample = now;
	// automatically clean up any stale heartbeat peers
	// if we are unhealthy, then clean all
	reset_heartbeat_peers(is_waiting_for_healthy());
      }
    }
  }

  if (!heartbeat_peers_need_update())
    return;
  heartbeat_clear_peers_need_update();

  std::lock_guard l(heartbeat_lock);

  dout(10) << "maybe_update_heartbeat_peers updating" << dendl;


  // build heartbeat from set: every up peer our PGs talk to
  if (is_active()) {
    vector<PGRef> pgs;
    _get_pgs(&pgs);
    for (auto& pg : pgs) {
      pg->with_heartbeat_peers([&](int peer) {
	if (get_osdmap()->is_up(peer)) {
	  _add_heartbeat_peer(peer);
	}
      });
    }
  }

  // include next and previous up osds to ensure we have a fully-connected set
  set<int> want, extras;
  const int next = get_osdmap()->get_next_up_osd_after(whoami);
  if (next >= 0)
    want.insert(next);
  int prev = get_osdmap()->get_previous_up_osd_before(whoami);
  if (prev >= 0 && prev != next)
    want.insert(prev);

  // make sure we have at least **min_down** osds coming from different
  // subtree level (e.g., hosts) for fast failure detection.
  auto min_down = cct->_conf.get_val<uint64_t>("mon_osd_min_down_reporters");
  auto subtree = cct->_conf.get_val<string>("mon_osd_reporter_subtree_level");
  auto limit = std::max(min_down, (uint64_t)cct->_conf->osd_heartbeat_min_peers);
  get_osdmap()->get_random_up_osds_by_subtree(
    whoami, subtree, limit, want, &want);

  for (set<int>::iterator p = want.begin(); p != want.end(); ++p) {
    dout(10) << " adding neighbor peer osd." << *p << dendl;
    extras.insert(*p);
    _add_heartbeat_peer(*p);
  }

  // remove down peers; enumerate extras
  map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
  while (p != heartbeat_peers.end()) {
    if (!get_osdmap()->is_up(p->first)) {
      // advance before erasing so the iterator stays valid
      int o = p->first;
      ++p;
      _remove_heartbeat_peer(o);
      continue;
    }
    if (p->second.epoch < get_osdmap_epoch()) {
      // peer not re-added this round; candidate for trimming
      extras.insert(p->first);
    }
    ++p;
  }

  // too few?  walk the up-osd ring from `next` until we hit the minimum
  for (int n = next; n >= 0; ) {
    if ((int)heartbeat_peers.size() >= cct->_conf->osd_heartbeat_min_peers)
      break;
    if (!extras.count(n) && !want.count(n) && n != whoami) {
      dout(10) << " adding random peer osd." << n << dendl;
      extras.insert(n);
      _add_heartbeat_peer(n);
    }
    n = get_osdmap()->get_next_up_osd_after(n);
    if (n == next)
      break;  // came full circle; stop
  }

  // too many?  drop extras (but never wanted peers) down to the minimum
  for (set<int>::iterator p = extras.begin();
       (int)heartbeat_peers.size() > cct->_conf->osd_heartbeat_min_peers && p != extras.end();
       ++p) {
    if (want.count(*p))
      continue;
    _remove_heartbeat_peer(*p);
  }

  dout(10) << "maybe_update_heartbeat_peers " << heartbeat_peers.size() << " peers, extras " << extras << dendl;

  // clean up stale failure pending: a peer we stopped heartbeating can
  // no longer be confirmed failed, so retract the in-flight report
  for (auto it = failure_pending.begin(); it != failure_pending.end();) {
    if (heartbeat_peers.count(it->first) == 0) {
      send_still_alive(get_osdmap_epoch(), it->first, it->second.second);
      failure_pending.erase(it++);
    } else {
      it++;
    }
  }
}
5310
// Remove heartbeat peers: all of them when @all is set, otherwise only
// those considered stale (per HeartbeatInfo::is_stale against the
// osd_heartbeat_stale cutoff).  Also drops any queued or in-flight
// failure reports for removed peers.  Requires: osd_lock held.
void OSD::reset_heartbeat_peers(bool all)
{
  ceph_assert(ceph_mutex_is_locked(osd_lock));
  dout(10) << "reset_heartbeat_peers" << dendl;
  // cutoff: anything last heard from before (now - osd_heartbeat_stale)
  // counts as stale — see HeartbeatInfo::is_stale
  utime_t stale = ceph_clock_now();
  stale -= cct->_conf.get_val<int64_t>("osd_heartbeat_stale");
  std::lock_guard l(heartbeat_lock);
  for (auto it = heartbeat_peers.begin(); it != heartbeat_peers.end();) {
    auto& [peer, hi] = *it;
    if (all || hi.is_stale(stale)) {
      hi.clear_mark_down();
      // stop sending failure_report to mon too
      failure_queue.erase(peer);
      failure_pending.erase(peer);
      it = heartbeat_peers.erase(it);  // erase returns the next iterator
    } else {
      ++it;
    }
  }
}
5331
5332void OSD::handle_osd_ping(MOSDPing *m)
5333{
5334 if (superblock.cluster_fsid != m->fsid) {
5335 dout(20) << "handle_osd_ping from " << m->get_source_inst()
9f95a23c
TL
5336 << " bad fsid " << m->fsid << " != " << superblock.cluster_fsid
5337 << dendl;
7c673cae
FG
5338 m->put();
5339 return;
5340 }
5341
5342 int from = m->get_source().num();
5343
9f95a23c 5344 heartbeat_lock.lock();
7c673cae 5345 if (is_stopping()) {
9f95a23c 5346 heartbeat_lock.unlock();
7c673cae
FG
5347 m->put();
5348 return;
5349 }
5350
9f95a23c
TL
5351 utime_t now = ceph_clock_now();
5352 auto mnow = service.get_mnow();
5353 ConnectionRef con(m->get_connection());
7c673cae 5354 OSDMapRef curmap = service.get_osdmap();
c07f9fc5 5355 if (!curmap) {
9f95a23c 5356 heartbeat_lock.unlock();
c07f9fc5
FG
5357 m->put();
5358 return;
5359 }
7c673cae 5360
9f95a23c
TL
5361 auto sref = con->get_priv();
5362 Session *s = static_cast<Session*>(sref.get());
5363 if (!s) {
5364 heartbeat_lock.unlock();
5365 m->put();
5366 return;
5367 }
5368 if (!s->stamps) {
5369 s->peer = from;
5370 s->stamps = service.get_hb_stamps(from);
5371 }
5372
7c673cae
FG
5373 switch (m->op) {
5374
5375 case MOSDPing::PING:
5376 {
5377 if (cct->_conf->osd_debug_drop_ping_probability > 0) {
5378 auto heartbeat_drop = debug_heartbeat_drops_remaining.find(from);
5379 if (heartbeat_drop != debug_heartbeat_drops_remaining.end()) {
5380 if (heartbeat_drop->second == 0) {
5381 debug_heartbeat_drops_remaining.erase(heartbeat_drop);
5382 } else {
5383 --heartbeat_drop->second;
5384 dout(5) << "Dropping heartbeat from " << from
5385 << ", " << heartbeat_drop->second
5386 << " remaining to drop" << dendl;
5387 break;
5388 }
5389 } else if (cct->_conf->osd_debug_drop_ping_probability >
5390 ((((double)(rand()%100))/100.0))) {
5391 heartbeat_drop =
5392 debug_heartbeat_drops_remaining.insert(std::make_pair(from,
5393 cct->_conf->osd_debug_drop_ping_duration)).first;
5394 dout(5) << "Dropping heartbeat from " << from
5395 << ", " << heartbeat_drop->second
5396 << " remaining to drop" << dendl;
5397 break;
5398 }
5399 }
5400
9f95a23c
TL
5401 ceph::signedspan sender_delta_ub{};
5402 s->stamps->got_ping(
5403 m->up_from,
5404 mnow,
5405 m->mono_send_stamp,
5406 m->delta_ub,
5407 &sender_delta_ub);
5408 dout(20) << __func__ << " new stamps " << *s->stamps << dendl;
5409
7c673cae 5410 if (!cct->get_heartbeat_map()->is_healthy()) {
9f95a23c
TL
5411 dout(10) << "internal heartbeat not healthy, dropping ping request"
5412 << dendl;
7c673cae
FG
5413 break;
5414 }
5415
5416 Message *r = new MOSDPing(monc->get_fsid(),
5417 curmap->get_epoch(),
9f95a23c
TL
5418 MOSDPing::PING_REPLY,
5419 m->ping_stamp,
5420 m->mono_ping_stamp,
5421 mnow,
5422 service.get_up_epoch(),
5423 cct->_conf->osd_heartbeat_min_size,
5424 sender_delta_ub);
5425 con->send_message(r);
7c673cae
FG
5426
5427 if (curmap->is_up(from)) {
7c673cae 5428 if (is_active()) {
9f95a23c
TL
5429 ConnectionRef cluster_con = service.get_con_osd_cluster(
5430 from, curmap->get_epoch());
5431 if (cluster_con) {
5432 service.maybe_share_map(cluster_con.get(), curmap, m->map_epoch);
7c673cae
FG
5433 }
5434 }
5435 } else if (!curmap->exists(from) ||
5436 curmap->get_down_at(from) > m->map_epoch) {
5437 // tell them they have died
5438 Message *r = new MOSDPing(monc->get_fsid(),
5439 curmap->get_epoch(),
5440 MOSDPing::YOU_DIED,
9f95a23c
TL
5441 m->ping_stamp,
5442 m->mono_ping_stamp,
5443 mnow,
5444 service.get_up_epoch(),
31f18b77 5445 cct->_conf->osd_heartbeat_min_size);
9f95a23c 5446 con->send_message(r);
7c673cae
FG
5447 }
5448 }
5449 break;
5450
5451 case MOSDPing::PING_REPLY:
5452 {
5453 map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(from);
5454 if (i != heartbeat_peers.end()) {
9f95a23c 5455 auto acked = i->second.ping_history.find(m->ping_stamp);
11fdf7f2 5456 if (acked != i->second.ping_history.end()) {
11fdf7f2 5457 int &unacknowledged = acked->second.second;
9f95a23c 5458 if (con == i->second.con_back) {
11fdf7f2
TL
5459 dout(25) << "handle_osd_ping got reply from osd." << from
5460 << " first_tx " << i->second.first_tx
5461 << " last_tx " << i->second.last_tx
9f95a23c
TL
5462 << " last_rx_back " << i->second.last_rx_back
5463 << " -> " << now
11fdf7f2
TL
5464 << " last_rx_front " << i->second.last_rx_front
5465 << dendl;
5466 i->second.last_rx_back = now;
5467 ceph_assert(unacknowledged > 0);
5468 --unacknowledged;
5469 // if there is no front con, set both stamps.
5470 if (i->second.con_front == NULL) {
5471 i->second.last_rx_front = now;
5472 ceph_assert(unacknowledged > 0);
5473 --unacknowledged;
5474 }
9f95a23c 5475 } else if (con == i->second.con_front) {
11fdf7f2
TL
5476 dout(25) << "handle_osd_ping got reply from osd." << from
5477 << " first_tx " << i->second.first_tx
5478 << " last_tx " << i->second.last_tx
5479 << " last_rx_back " << i->second.last_rx_back
9f95a23c
TL
5480 << " last_rx_front " << i->second.last_rx_front
5481 << " -> " << now
11fdf7f2
TL
5482 << dendl;
5483 i->second.last_rx_front = now;
5484 ceph_assert(unacknowledged > 0);
5485 --unacknowledged;
5486 }
7c673cae 5487
11fdf7f2
TL
5488 if (unacknowledged == 0) {
5489 // succeeded in getting all replies
5490 dout(25) << "handle_osd_ping got all replies from osd." << from
9f95a23c 5491 << " , erase pending ping(sent at " << m->ping_stamp << ")"
11fdf7f2
TL
5492 << " and older pending ping(s)"
5493 << dendl;
eafe8130
TL
5494
5495#define ROUND_S_TO_USEC(sec) (uint32_t)((sec) * 1000 * 1000 + 0.5)
5496 ++i->second.hb_average_count;
9f95a23c 5497 uint32_t back_pingtime = ROUND_S_TO_USEC(i->second.last_rx_back - m->ping_stamp);
eafe8130
TL
5498 i->second.hb_total_back += back_pingtime;
5499 if (back_pingtime < i->second.hb_min_back)
5500 i->second.hb_min_back = back_pingtime;
5501 if (back_pingtime > i->second.hb_max_back)
5502 i->second.hb_max_back = back_pingtime;
9f95a23c 5503 uint32_t front_pingtime = ROUND_S_TO_USEC(i->second.last_rx_front - m->ping_stamp);
eafe8130
TL
5504 i->second.hb_total_front += front_pingtime;
5505 if (front_pingtime < i->second.hb_min_front)
5506 i->second.hb_min_front = front_pingtime;
5507 if (front_pingtime > i->second.hb_max_front)
5508 i->second.hb_max_front = front_pingtime;
5509
5510 ceph_assert(i->second.hb_interval_start != utime_t());
5511 if (i->second.hb_interval_start == utime_t())
5512 i->second.hb_interval_start = now;
5513 int64_t hb_avg_time_period = 60;
5514 if (cct->_conf.get_val<int64_t>("debug_heartbeat_testing_span")) {
5515 hb_avg_time_period = cct->_conf.get_val<int64_t>("debug_heartbeat_testing_span");
5516 }
5517 if (now - i->second.hb_interval_start >= utime_t(hb_avg_time_period, 0)) {
5518 uint32_t back_avg = i->second.hb_total_back / i->second.hb_average_count;
5519 uint32_t back_min = i->second.hb_min_back;
5520 uint32_t back_max = i->second.hb_max_back;
5521 uint32_t front_avg = i->second.hb_total_front / i->second.hb_average_count;
5522 uint32_t front_min = i->second.hb_min_front;
5523 uint32_t front_max = i->second.hb_max_front;
5524
5525 // Reset for new interval
5526 i->second.hb_average_count = 0;
5527 i->second.hb_interval_start = now;
5528 i->second.hb_total_back = i->second.hb_max_back = 0;
5529 i->second.hb_min_back = UINT_MAX;
5530 i->second.hb_total_front = i->second.hb_max_front = 0;
5531 i->second.hb_min_front = UINT_MAX;
5532
5533 // Record per osd interace ping times
5534 // Based on osd_heartbeat_interval ignoring that it is randomly short than this interval
5535 if (i->second.hb_back_pingtime.size() == 0) {
5536 ceph_assert(i->second.hb_front_pingtime.size() == 0);
5537 for (unsigned k = 0 ; k < hb_vector_size; ++k) {
5538 i->second.hb_back_pingtime.push_back(back_avg);
5539 i->second.hb_back_min.push_back(back_min);
5540 i->second.hb_back_max.push_back(back_max);
5541 i->second.hb_front_pingtime.push_back(front_avg);
5542 i->second.hb_front_min.push_back(front_min);
5543 i->second.hb_front_max.push_back(front_max);
5544 ++i->second.hb_index;
5545 }
5546 } else {
5547 int index = i->second.hb_index & (hb_vector_size - 1);
5548 i->second.hb_back_pingtime[index] = back_avg;
5549 i->second.hb_back_min[index] = back_min;
5550 i->second.hb_back_max[index] = back_max;
5551 i->second.hb_front_pingtime[index] = front_avg;
5552 i->second.hb_front_min[index] = front_min;
5553 i->second.hb_front_max[index] = front_max;
5554 ++i->second.hb_index;
5555 }
5556
5557 {
5558 std::lock_guard l(service.stat_lock);
5559 service.osd_stat.hb_pingtime[from].last_update = now.sec();
5560 service.osd_stat.hb_pingtime[from].back_last = back_pingtime;
5561
5562 uint32_t total = 0;
5563 uint32_t min = UINT_MAX;
5564 uint32_t max = 0;
5565 uint32_t count = 0;
5566 uint32_t which = 0;
5567 uint32_t size = (uint32_t)i->second.hb_back_pingtime.size();
5568 for (int32_t k = size - 1 ; k >= 0; --k) {
5569 ++count;
5570 int index = (i->second.hb_index + k) % size;
5571 total += i->second.hb_back_pingtime[index];
5572 if (i->second.hb_back_min[index] < min)
5573 min = i->second.hb_back_min[index];
5574 if (i->second.hb_back_max[index] > max)
5575 max = i->second.hb_back_max[index];
5576 if (count == 1 || count == 5 || count == 15) {
5577 service.osd_stat.hb_pingtime[from].back_pingtime[which] = total / count;
5578 service.osd_stat.hb_pingtime[from].back_min[which] = min;
5579 service.osd_stat.hb_pingtime[from].back_max[which] = max;
5580 which++;
5581 if (count == 15)
5582 break;
5583 }
5584 }
5585
5586 if (i->second.con_front != NULL) {
5587 service.osd_stat.hb_pingtime[from].front_last = front_pingtime;
5588
5589 total = 0;
5590 min = UINT_MAX;
5591 max = 0;
5592 count = 0;
5593 which = 0;
5594 for (int32_t k = size - 1 ; k >= 0; --k) {
5595 ++count;
5596 int index = (i->second.hb_index + k) % size;
5597 total += i->second.hb_front_pingtime[index];
5598 if (i->second.hb_front_min[index] < min)
5599 min = i->second.hb_front_min[index];
5600 if (i->second.hb_front_max[index] > max)
5601 max = i->second.hb_front_max[index];
5602 if (count == 1 || count == 5 || count == 15) {
5603 service.osd_stat.hb_pingtime[from].front_pingtime[which] = total / count;
5604 service.osd_stat.hb_pingtime[from].front_min[which] = min;
5605 service.osd_stat.hb_pingtime[from].front_max[which] = max;
5606 which++;
5607 if (count == 15)
5608 break;
5609 }
5610 }
5611 }
5612 }
5613 } else {
5614 std::lock_guard l(service.stat_lock);
5615 service.osd_stat.hb_pingtime[from].back_last = back_pingtime;
5616 if (i->second.con_front != NULL)
5617 service.osd_stat.hb_pingtime[from].front_last = front_pingtime;
5618 }
11fdf7f2 5619 i->second.ping_history.erase(i->second.ping_history.begin(), ++acked);
7c673cae
FG
5620 }
5621
11fdf7f2
TL
5622 if (i->second.is_healthy(now)) {
5623 // Cancel false reports
5624 auto failure_queue_entry = failure_queue.find(from);
5625 if (failure_queue_entry != failure_queue.end()) {
5626 dout(10) << "handle_osd_ping canceling queued "
5627 << "failure report for osd." << from << dendl;
5628 failure_queue.erase(failure_queue_entry);
5629 }
5630
5631 auto failure_pending_entry = failure_pending.find(from);
5632 if (failure_pending_entry != failure_pending.end()) {
5633 dout(10) << "handle_osd_ping canceling in-flight "
5634 << "failure report for osd." << from << dendl;
5635 send_still_alive(curmap->get_epoch(),
5636 from,
5637 failure_pending_entry->second.second);
5638 failure_pending.erase(failure_pending_entry);
5639 }
7c673cae 5640 }
11fdf7f2
TL
5641 } else {
5642 // old replies, deprecated by newly sent pings.
9f95a23c 5643 dout(10) << "handle_osd_ping no pending ping(sent at " << m->ping_stamp
11fdf7f2
TL
5644 << ") is found, treat as covered by newly sent pings "
5645 << "and ignore"
5646 << dendl;
7c673cae
FG
5647 }
5648 }
5649
5650 if (m->map_epoch &&
5651 curmap->is_up(from)) {
7c673cae 5652 if (is_active()) {
9f95a23c
TL
5653 ConnectionRef cluster_con = service.get_con_osd_cluster(
5654 from, curmap->get_epoch());
5655 if (cluster_con) {
5656 service.maybe_share_map(cluster_con.get(), curmap, m->map_epoch);
7c673cae
FG
5657 }
5658 }
5659 }
9f95a23c
TL
5660
5661 s->stamps->got_ping_reply(
5662 mnow,
5663 m->mono_send_stamp,
5664 m->delta_ub);
5665 dout(20) << __func__ << " new stamps " << *s->stamps << dendl;
7c673cae
FG
5666 }
5667 break;
5668
5669 case MOSDPing::YOU_DIED:
5670 dout(10) << "handle_osd_ping " << m->get_source_inst()
5671 << " says i am down in " << m->map_epoch << dendl;
5672 osdmap_subscribe(curmap->get_epoch()+1, false);
5673 break;
5674 }
5675
9f95a23c 5676 heartbeat_lock.unlock();
7c673cae
FG
5677 m->put();
5678}
5679
// Dedicated heartbeat thread body: repeatedly run one heartbeat round,
// then sleep for (by default) a randomized fraction of
// osd_heartbeat_interval so peer OSDs do not ping in lock-step.
// Holds heartbeat_lock for the whole loop; heartbeat_cond.wait_for()
// releases it while sleeping and also lets shutdown wake us early.
void OSD::heartbeat_entry()
{
  std::unique_lock l(heartbeat_lock);
  // check before first iteration: we may already be shutting down
  if (is_stopping())
    return;
  while (!heartbeat_stop) {
    heartbeat();

    double wait;
    if (cct->_conf.get_val<bool>("debug_disable_randomized_ping")) {
      // test hook: fixed interval for deterministic timing in tests
      wait = (float)cct->_conf->osd_heartbeat_interval;
    } else {
      // 0.5s floor plus a random 0..90% of the interval, to spread pings
      wait = .5 + ((float)(rand() % 10)/10.0) * (float)cct->_conf->osd_heartbeat_interval;
    }
    auto w = ceph::make_timespan(wait);
    dout(30) << "heartbeat_entry sleeping for " << wait << dendl;
    // sleeps with heartbeat_lock released; woken early on shutdown or kick
    heartbeat_cond.wait_for(l, w);
    if (is_stopping())
      return;
    dout(30) << "heartbeat_entry woke up" << dendl;
  }
}
5702
// Scan all heartbeat peers for overdue ping replies and queue failure
// reports (to be forwarded to the monitor by send_failures()) for any
// peer that has missed its deadlines.  Caller must hold heartbeat_lock.
void OSD::heartbeat_check()
{
  ceph_assert(ceph_mutex_is_locked(heartbeat_lock));
  utime_t now = ceph_clock_now();

  // check for incoming heartbeats (move me elsewhere?)
  for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
       p != heartbeat_peers.end();
       ++p) {

    // a peer we have never pinged cannot be judged unhealthy yet
    if (p->second.first_tx == utime_t()) {
      dout(25) << "heartbeat_check we haven't sent ping to osd." << p->first
	       << " yet, skipping" << dendl;
      continue;
    }

    dout(25) << "heartbeat_check osd." << p->first
	     << " first_tx " << p->second.first_tx
	     << " last_tx " << p->second.last_tx
	     << " last_rx_back " << p->second.last_rx_back
	     << " last_rx_front " << p->second.last_rx_front
	     << dendl;
    if (p->second.is_unhealthy(now)) {
      // ping_history is non-empty whenever is_unhealthy() is true; its
      // oldest entry carries the earliest missed deadline
      utime_t oldest_deadline = p->second.ping_history.begin()->second.first;
      // NOTE(review): con_front is dereferenced below for logging without a
      // null check; presumably a front connection always exists once we have
      // pinged this peer — confirm against maybe_update_heartbeat_peers().
      if (p->second.last_rx_back == utime_t() ||
	  p->second.last_rx_front == utime_t()) {
	// never heard back on at least one channel since the peer was added
	derr << "heartbeat_check: no reply from "
	     << p->second.con_front->get_peer_addr().get_sockaddr()
	     << " osd." << p->first
	     << " ever on either front or back, first ping sent "
	     << p->second.first_tx
	     << " (oldest deadline " << oldest_deadline << ")"
	     << dendl;
	// fail
	failure_queue[p->first] = p->second.first_tx;
      } else {
	// replies have stopped; report the older of the two last-seen times
	derr << "heartbeat_check: no reply from "
	     << p->second.con_front->get_peer_addr().get_sockaddr()
	     << " osd." << p->first << " since back " << p->second.last_rx_back
	     << " front " << p->second.last_rx_front
	     << " (oldest deadline " << oldest_deadline << ")"
	     << dendl;
	// fail
	failure_queue[p->first] = std::min(p->second.last_rx_back, p->second.last_rx_front);
      }
    }
  }
}
5751
// Run one heartbeat round: refresh load/fullness statistics, then send a
// PING to every configured peer on the back (and, when present, front)
// heartbeat connection, recording a reply deadline in ping_history for
// heartbeat_check() to enforce.  Caller must hold heartbeat_lock.
void OSD::heartbeat()
{
  ceph_assert(ceph_mutex_is_locked_by_me(heartbeat_lock));
  dout(30) << "heartbeat" << dendl;

  // get CPU load avg
  double loadavgs[1];
  int hb_interval = cct->_conf->osd_heartbeat_interval;
  // 86400 = seconds per day; n_samples sizes the exponential window so
  // daily_loadavg approximates a one-day moving average of the 1-min load
  int n_samples = 86400;
  if (hb_interval > 1) {
    n_samples /= hb_interval;
    if (n_samples < 1)
      n_samples = 1;
  }

  if (getloadavg(loadavgs, 1) == 1) {
    logger->set(l_osd_loadavg, 100 * loadavgs[0]);
    daily_loadavg = (daily_loadavg * (n_samples - 1) + loadavgs[0]) / n_samples;
    dout(30) << "heartbeat: daily_loadavg " << daily_loadavg << dendl;
  }

  dout(30) << "heartbeat checking stats" << dendl;

  // refresh peer list and osd stats
  vector<int> hb_peers;
  for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
       p != heartbeat_peers.end();
       ++p)
    hb_peers.push_back(p->first);

  auto new_stat = service.set_osd_stat(hb_peers, get_num_pgs());
  dout(5) << __func__ << " " << new_stat << dendl;
  // a zero-sized store would make fullness ratios meaningless
  ceph_assert(new_stat.statfs.total);

  float pratio;
  float ratio = service.compute_adjusted_ratio(new_stat, &pratio);

  service.check_full_status(ratio, pratio);

  utime_t now = ceph_clock_now();
  auto mnow = service.get_mnow();
  // all pings sent this round share one reply deadline
  utime_t deadline = now;
  deadline += cct->_conf->osd_heartbeat_grace;

  // send heartbeats
  for (map<int,HeartbeatInfo>::iterator i = heartbeat_peers.begin();
       i != heartbeat_peers.end();
       ++i) {
    int peer = i->first;
    // the Session holds the mono-clock stamp exchange state for this peer
    Session *s = static_cast<Session*>(i->second.con_back->get_priv().get());
    if (!s) {
      dout(30) << "heartbeat osd." << peer << " has no open con" << dendl;
      continue;
    }
    dout(30) << "heartbeat sending ping to osd." << peer << dendl;

    i->second.last_tx = now;
    if (i->second.first_tx == utime_t())
      i->second.first_tx = now;
    // expect replies on both channels (HEARTBEAT_MAX_CONN) by `deadline`
    i->second.ping_history[now] = make_pair(deadline,
      HeartbeatInfo::HEARTBEAT_MAX_CONN);
    if (i->second.hb_interval_start == utime_t())
      i->second.hb_interval_start = now;

    // delta_ub: upper bound on clock delta for mono-clock ping accounting
    std::optional<ceph::signedspan> delta_ub;
    s->stamps->sent_ping(&delta_ub);

    i->second.con_back->send_message(
      new MOSDPing(monc->get_fsid(),
		   service.get_osdmap_epoch(),
		   MOSDPing::PING,
		   now,
		   mnow,
		   mnow,
		   service.get_up_epoch(),
		   cct->_conf->osd_heartbeat_min_size,
		   delta_ub));

    // front connection is optional (only when a separate public network
    // heartbeat channel exists)
    if (i->second.con_front)
      i->second.con_front->send_message(
	new MOSDPing(monc->get_fsid(),
		     service.get_osdmap_epoch(),
		     MOSDPing::PING,
		     now,
		     mnow,
		     mnow,
		     service.get_up_epoch(),
		     cct->_conf->osd_heartbeat_min_size,
		     delta_ub));
  }

  logger->set(l_osd_hb_to, heartbeat_peers.size());

  // hmm.. am i all alone?
  // with no peers we can't detect map changes via heartbeats, so poll the mon
  dout(30) << "heartbeat lonely?" << dendl;
  if (heartbeat_peers.empty()) {
    if (now - last_mon_heartbeat > cct->_conf->osd_mon_heartbeat_interval && is_active()) {
      last_mon_heartbeat = now;
      dout(10) << "i have no heartbeat peers; checking mon for new map" << dendl;
      osdmap_subscribe(get_osdmap_epoch() + 1, false);
    }
  }

  dout(30) << "heartbeat done" << dendl;
}
5857
// Messenger callback: a heartbeat connection was reset.  If it belongs to a
// known peer, reopen fresh back/front connections (re-attaching the same
// Session as priv) or, if the osdmap no longer gives us addresses for the
// peer, drop the peer entirely.  Always returns true (reset handled).
bool OSD::heartbeat_reset(Connection *con)
{
  std::lock_guard l(heartbeat_lock);
  // take our ref to the Session and detach it from the dead connection
  auto s = con->get_priv();
  dout(20) << __func__ << " con " << con << " s " << s.get() << dendl;
  con->set_priv(nullptr);
  if (s) {
    if (is_stopping()) {
      // shutting down; no point reopening anything
      return true;
    }
    auto session = static_cast<Session*>(s.get());
    auto p = heartbeat_peers.find(session->peer);
    // only act if this con is still one of the peer's current hb connections;
    // otherwise it is a stale connection we already replaced
    if (p != heartbeat_peers.end() &&
	(p->second.con_back == con ||
	 p->second.con_front == con)) {
      dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer
	       << ", reopening" << dendl;
      p->second.clear_mark_down(con);
      pair<ConnectionRef,ConnectionRef> newcon = service.get_con_osd_hb(p->second.peer, p->second.epoch);
      if (newcon.first) {
	p->second.con_back = newcon.first.get();
	// re-attach the same Session so stamp state survives the reconnect
	p->second.con_back->set_priv(s);
	if (newcon.second) {
	  p->second.con_front = newcon.second.get();
	  p->second.con_front->set_priv(s);
	}
	// outstanding deadlines referred to the old connections; start fresh
	p->second.ping_history.clear();
      } else {
	// osdmap changed under us and no longer has hb addrs for this peer
	dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer
		 << ", raced with osdmap update, closing out peer" << dendl;
	heartbeat_peers.erase(p);
      }
    } else {
      dout(10) << "heartbeat_reset closing (old) failed hb con " << con << dendl;
    }
  }
  return true;
}
5896
5897
5898
5899// =========================================
5900
5901void OSD::tick()
5902{
9f95a23c 5903 ceph_assert(ceph_mutex_is_locked(osd_lock));
7c673cae
FG
5904 dout(10) << "tick" << dendl;
5905
9f95a23c
TL
5906 utime_t now = ceph_clock_now();
5907 // throw out any obsolete markdown log
5908 utime_t grace = utime_t(cct->_conf->osd_max_markdown_period, 0);
5909 while (!osd_markdown_log.empty() &&
5910 osd_markdown_log.front() + grace < now)
5911 osd_markdown_log.pop_front();
5912
7c673cae
FG
5913 if (is_active() || is_waiting_for_healthy()) {
5914 maybe_update_heartbeat_peers();
5915 }
5916
5917 if (is_waiting_for_healthy()) {
5918 start_boot();
494da23a
TL
5919 }
5920
5921 if (is_waiting_for_healthy() || is_booting()) {
5922 std::lock_guard l(heartbeat_lock);
494da23a
TL
5923 if (now - last_mon_heartbeat > cct->_conf->osd_mon_heartbeat_interval) {
5924 last_mon_heartbeat = now;
5925 dout(1) << __func__ << " checking mon for new map" << dendl;
9f95a23c 5926 osdmap_subscribe(get_osdmap_epoch() + 1, false);
11fdf7f2 5927 }
7c673cae
FG
5928 }
5929
5930 do_waiters();
5931
9f95a23c
TL
5932 // scrub purged_snaps every deep scrub interval
5933 {
5934 const utime_t last = superblock.last_purged_snaps_scrub;
5935 utime_t next = last;
5936 next += cct->_conf->osd_scrub_min_interval;
5937 std::mt19937 rng;
5938 // use a seed that is stable for each scrub interval, but varies
5939 // by OSD to avoid any herds.
5940 rng.seed(whoami + superblock.last_purged_snaps_scrub.sec());
5941 double r = (rng() % 1024) / 1024;
5942 next +=
5943 cct->_conf->osd_scrub_min_interval *
5944 cct->_conf->osd_scrub_interval_randomize_ratio * r;
5945 if (next < ceph_clock_now()) {
5946 dout(20) << __func__ << " last_purged_snaps_scrub " << last
5947 << " next " << next << " ... now" << dendl;
5948 scrub_purged_snaps();
5949 } else {
5950 dout(20) << __func__ << " last_purged_snaps_scrub " << last
5951 << " next " << next << dendl;
5952 }
5953 }
5954
91327a77 5955 tick_timer.add_event_after(get_tick_interval(), new C_Tick(this));
7c673cae
FG
5956}
5957
// Periodic housekeeping that deliberately runs WITHOUT osd_lock (under
// tick_timer_lock only): perf counters, statfs refresh, heartbeat checks,
// mon reports, scrub scheduling, beacon sending, and mgr health updates.
// Because osd_lock is not held, OSD state may change concurrently; each
// section re-checks state as needed.  Re-arms itself via
// tick_timer_without_osd_lock.
void OSD::tick_without_osd_lock()
{
  ceph_assert(ceph_mutex_is_locked(tick_timer_lock));
  dout(10) << "tick_without_osd_lock" << dendl;

  logger->set(l_osd_cached_crc, ceph::buffer::get_cached_crc());
  logger->set(l_osd_cached_crc_adjusted, ceph::buffer::get_cached_crc_adjusted());
  logger->set(l_osd_missed_crc, ceph::buffer::get_missed_crc());

  // refresh osd stats
  struct store_statfs_t stbuf;
  osd_alert_list_t alerts;
  int r = store->statfs(&stbuf, &alerts);
  ceph_assert(r == 0);
  service.set_statfs(stbuf, alerts);

  // osd_lock is not being held, which means the OSD state
  // might change when doing the monitor report
  if (is_active() || is_waiting_for_healthy()) {
    {
      std::lock_guard l{heartbeat_lock};
      heartbeat_check();
    }
    // lock order: map_lock (shared) before mon_report_lock
    map_lock.lock_shared();
    std::lock_guard l(mon_report_lock);

    // mon report?
    utime_t now = ceph_clock_now();
    if (service.need_fullness_update() ||
	now - last_mon_report > cct->_conf->osd_mon_report_interval) {
      last_mon_report = now;
      send_full_update();
      send_failures();
    }
    map_lock.unlock_shared();

    // if any shard queue is waiting on a newer map than we have, request it
    epoch_t max_waiting_epoch = 0;
    for (auto s : shards) {
      max_waiting_epoch = std::max(max_waiting_epoch,
				   s->get_max_waiting_epoch());
    }
    if (max_waiting_epoch > get_osdmap()->get_epoch()) {
      dout(20) << __func__ << " max_waiting_epoch " << max_waiting_epoch
	       << ", requesting new map" << dendl;
      osdmap_subscribe(superblock.newest_map + 1, false);
    }
  }

  if (is_active()) {
    if (!scrub_random_backoff()) {
      sched_scrub();
    }
    service.promote_throttle_recalibrate();
    resume_creating_pg();

    bool need_send_beacon = false;
    const auto now = ceph::coarse_mono_clock::now();
    {
      // borrow lec lock to protect last_sent_beacon from changing
      std::lock_guard l{min_last_epoch_clean_lock};
      const auto elapsed = now - last_sent_beacon;
      if (std::chrono::duration_cast<std::chrono::seconds>(elapsed).count() >
	cct->_conf->osd_beacon_report_interval) {
	need_send_beacon = true;
      }
    }
    // send outside the lock scope above to keep the critical section short
    if (need_send_beacon) {
      send_beacon(now);
    }
  }

  mgrc.update_daemon_health(get_health_metrics());
  service.kick_recovery_queue();
  tick_timer_without_osd_lock.add_event_after(get_tick_interval(),
					      new C_Tick_WithoutOSDLock(this));
}
6033
7c673cae
FG
6034// Usage:
6035// setomapval <pool-id> [namespace/]<obj-name> <key> <val>
6036// rmomapkey <pool-id> [namespace/]<obj-name> <key>
6037// setomapheader <pool-id> [namespace/]<obj-name> <header>
6038// getomap <pool> [namespace/]<obj-name>
6039// truncobj <pool-id> [namespace/]<obj-name> <newlen>
6040// injectmdataerr [namespace/]<obj-name> [shardid]
6041// injectdataerr [namespace/]<obj-name> [shardid]
6042//
6043// set_recovery_delay [utime]
// Admin-socket test hook: directly mutate or inspect a single object's
// omap/data on this OSD, or tweak test-only knobs (recovery delay,
// injected fullness).  Results and errors are reported through `ss`.
// FOR TESTING ONLY -- these operations bypass the normal PG write path.
void TestOpsSocketHook::test_ops(OSDService *service, ObjectStore *store,
				 std::string_view command,
				 const cmdmap_t& cmdmap, ostream &ss)
{
  //Test support
  //Support changing the omap on a single osd by using the Admin Socket to
  //directly request the osd make a change.
  if (command == "setomapval" || command == "rmomapkey" ||
      command == "setomapheader" || command == "getomap" ||
      command == "truncobj" || command == "injectmdataerr" ||
      command == "injectdataerr"
    ) {
    pg_t rawpg;
    int64_t pool;
    OSDMapRef curmap = service->get_osdmap();
    int r = -1;

    string poolstr;

    cmd_getval(cmdmap, "pool", poolstr);
    pool = curmap->lookup_pg_pool_name(poolstr);
    //If we can't find it by name then maybe id specified
    // NOTE(review): poolstr[0] is read without checking for an empty string;
    // presumably cmd_getval guarantees a non-empty "pool" arg -- confirm.
    if (pool < 0 && isdigit(poolstr[0]))
      pool = atoll(poolstr.c_str());
    if (pool < 0) {
      ss << "Invalid pool '" << poolstr << "''";
      return;
    }

    // objname may carry an optional "namespace/" prefix
    string objname, nspace;
    cmd_getval(cmdmap, "objname", objname);
    std::size_t found = objname.find_first_of('/');
    if (found != string::npos) {
      nspace = objname.substr(0, found);
      objname = objname.substr(found+1);
    }
    object_locator_t oloc(pool, nspace);
    r = curmap->object_locator_to_pg(object_t(objname), oloc, rawpg);

    if (r < 0) {
      ss << "Invalid namespace/objname";
      return;
    }

    int64_t shardid;
    cmd_getval(cmdmap, "shardid", shardid, int64_t(shard_id_t::NO_SHARD));
    hobject_t obj(object_t(objname), string(""), CEPH_NOSNAP, rawpg.ps(), pool, nspace);
    ghobject_t gobj(obj, ghobject_t::NO_GEN, shard_id_t(uint8_t(shardid)));
    spg_t pgid(curmap->raw_pg_to_pg(rawpg), shard_id_t(shardid));
    // EC pools only permit the error-injection commands
    if (curmap->pg_is_ec(rawpg)) {
      if ((command != "injectdataerr") && (command != "injectmdataerr")) {
	ss << "Must not call on ec pool, except injectdataerr or injectmdataerr";
	return;
      }
    }

    ObjectStore::Transaction t;

    if (command == "setomapval") {
      // write a single omap key/value pair
      map<string, bufferlist> newattrs;
      bufferlist val;
      string key, valstr;
      cmd_getval(cmdmap, "key", key);
      cmd_getval(cmdmap, "val", valstr);

      val.append(valstr);
      newattrs[key] = val;
      t.omap_setkeys(coll_t(pgid), ghobject_t(obj), newattrs);
      r = store->queue_transaction(service->meta_ch, std::move(t));
      if (r < 0)
	ss << "error=" << r;
      else
	ss << "ok";
    } else if (command == "rmomapkey") {
      // remove a single omap key
      string key;
      cmd_getval(cmdmap, "key", key);

      t.omap_rmkey(coll_t(pgid), ghobject_t(obj), key);
      r = store->queue_transaction(service->meta_ch, std::move(t));
      if (r < 0)
	ss << "error=" << r;
      else
	ss << "ok";
    } else if (command == "setomapheader") {
      // replace the omap header blob
      bufferlist newheader;
      string headerstr;

      cmd_getval(cmdmap, "header", headerstr);
      newheader.append(headerstr);
      t.omap_setheader(coll_t(pgid), ghobject_t(obj), newheader);
      r = store->queue_transaction(service->meta_ch, std::move(t));
      if (r < 0)
	ss << "error=" << r;
      else
	ss << "ok";
    } else if (command == "getomap") {
      //Debug: Output entire omap
      bufferlist hdrbl;
      map<string, bufferlist> keyvals;
      auto ch = store->open_collection(coll_t(pgid));
      if (!ch) {
	ss << "unable to open collection for " << pgid;
	r = -ENOENT;
      } else {
	r = store->omap_get(ch, ghobject_t(obj), &hdrbl, &keyvals);
	if (r >= 0) {
	  ss << "header=" << string(hdrbl.c_str(), hdrbl.length());
	  for (map<string, bufferlist>::iterator it = keyvals.begin();
	       it != keyvals.end(); ++it)
	    ss << " key=" << (*it).first << " val="
	       << string((*it).second.c_str(), (*it).second.length());
	} else {
	  ss << "error=" << r;
	}
      }
    } else if (command == "truncobj") {
      // truncate object data to the requested length
      int64_t trunclen;
      cmd_getval(cmdmap, "len", trunclen);
      t.truncate(coll_t(pgid), ghobject_t(obj), trunclen);
      r = store->queue_transaction(service->meta_ch, std::move(t));
      if (r < 0)
	ss << "error=" << r;
      else
	ss << "ok";
    } else if (command == "injectdataerr") {
      // arm a data read error for this object in the object store
      store->inject_data_error(gobj);
      ss << "ok";
    } else if (command == "injectmdataerr") {
      // arm a metadata read error for this object in the object store
      store->inject_mdata_error(gobj);
      ss << "ok";
    }
    return;
  }
  if (command == "set_recovery_delay") {
    // override osd_recovery_delay_start at runtime (testing knob)
    int64_t delay;
    cmd_getval(cmdmap, "utime", delay, (int64_t)0);
    ostringstream oss;
    oss << delay;
    int r = service->cct->_conf.set_val("osd_recovery_delay_start",
					oss.str().c_str());
    if (r != 0) {
      ss << "set_recovery_delay: error setting "
	 << "osd_recovery_delay_start to '" << delay << "': error "
	 << r;
      return;
    }
    service->cct->_conf.apply_changes(nullptr);
    ss << "set_recovery_delay: set osd_recovery_delay_start "
       << "to " << service->cct->_conf->osd_recovery_delay_start;
    return;
  }
  if (command == "injectfull") {
    // pretend to be in a fullness state for `count` checks (testing knob)
    int64_t count;
    string type;
    OSDService::s_names state;
    cmd_getval(cmdmap, "type", type, string("full"));
    cmd_getval(cmdmap, "count", count, (int64_t)-1);
    if (type == "none" || count == 0) {
      // normalize the "disable injection" spellings
      type = "none";
      count = 0;
    }
    state = service->get_full_state(type);
    if (state == OSDService::s_names::INVALID) {
      ss << "Invalid type use (none, nearfull, backfillfull, full, failsafe)";
      return;
    }
    service->set_injectfull(state, count);
    return;
  }
  ss << "Internal error - command=" << command;
}
6215
7c673cae
FG
6216// =========================================
6217
// Messenger callback: a connection was (re)established.  We only care about
// the monitor connection: a fresh mon session means everything previously
// reported (fullness, pg_temp, failures, ...) must be resent.
void OSD::ms_handle_connect(Connection *con)
{
  dout(10) << __func__ << " con " << con << dendl;
  if (con->get_peer_type() == CEPH_ENTITY_TYPE_MON) {
    std::lock_guard l(osd_lock);
    if (is_stopping())
      return;
    dout(10) << __func__ << " on mon" << dendl;

    if (is_preboot()) {
      // restart the boot handshake from the top
      start_boot();
    } else if (is_booting()) {
      _send_boot(); // resend boot message
    } else {
      // lock order: map_lock (shared) before mon_report_lock
      map_lock.lock_shared();
      std::lock_guard l2(mon_report_lock);

      utime_t now = ceph_clock_now();
      last_mon_report = now;

      // resend everything, it's a new session
      send_full_update();
      send_alive();
      service.requeue_pg_temp();
      service.clear_sent_ready_to_merge();
      service.send_pg_temp();
      service.send_ready_to_merge();
      service.send_pg_created();
      requeue_failures();
      send_failures();

      map_lock.unlock_shared();
      if (is_active()) {
	send_beacon(ceph::coarse_mono_clock::now());
      }
    }

    // full map requests may happen while active or pre-boot
    if (requested_full_first) {
      rerequest_full_maps();
    }
  }
}
6261
6262void OSD::ms_handle_fast_connect(Connection *con)
6263{
6264 if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON &&
6265 con->get_peer_type() != CEPH_ENTITY_TYPE_MGR) {
9f95a23c
TL
6266 if (auto s = ceph::ref_cast<Session>(con->get_priv()); !s) {
6267 s = ceph::make_ref<Session>(cct, con);
6268 con->set_priv(s);
7c673cae
FG
6269 dout(10) << " new session (outgoing) " << s << " con=" << s->con
6270 << " addr=" << s->con->get_peer_addr() << dendl;
6271 // we don't connect to clients
11fdf7f2 6272 ceph_assert(con->get_peer_type() == CEPH_ENTITY_TYPE_OSD);
7c673cae
FG
6273 s->entity_name.set_type(CEPH_ENTITY_TYPE_OSD);
6274 }
7c673cae
FG
6275 }
6276}
6277
6278void OSD::ms_handle_fast_accept(Connection *con)
6279{
6280 if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON &&
6281 con->get_peer_type() != CEPH_ENTITY_TYPE_MGR) {
9f95a23c
TL
6282 if (auto s = ceph::ref_cast<Session>(con->get_priv()); !s) {
6283 s = ceph::make_ref<Session>(cct, con);
6284 con->set_priv(s);
7c673cae
FG
6285 dout(10) << "new session (incoming)" << s << " con=" << con
6286 << " addr=" << con->get_peer_addr()
6287 << " must have raced with connect" << dendl;
11fdf7f2 6288 ceph_assert(con->get_peer_type() == CEPH_ENTITY_TYPE_OSD);
7c673cae
FG
6289 s->entity_name.set_type(CEPH_ENTITY_TYPE_OSD);
6290 }
7c673cae
FG
6291 }
6292}
6293
6294bool OSD::ms_handle_reset(Connection *con)
6295{
9f95a23c
TL
6296 auto session = ceph::ref_cast<Session>(con->get_priv());
6297 dout(2) << "ms_handle_reset con " << con << " session " << session.get() << dendl;
7c673cae
FG
6298 if (!session)
6299 return false;
6300 session->wstate.reset(con);
11fdf7f2
TL
6301 session->con->set_priv(nullptr);
6302 session->con.reset(); // break con <-> session ref cycle
7c673cae
FG
6303 // note that we break session->con *before* the session_handle_reset
6304 // cleanup below. this avoids a race between us and
6305 // PG::add_backoff, Session::check_backoff, etc.
9f95a23c 6306 session_handle_reset(session);
7c673cae
FG
6307 return true;
6308}
6309
6310bool OSD::ms_handle_refused(Connection *con)
6311{
6312 if (!cct->_conf->osd_fast_fail_on_connection_refused)
6313 return false;
6314
9f95a23c
TL
6315 auto session = ceph::ref_cast<Session>(con->get_priv());
6316 dout(2) << "ms_handle_refused con " << con << " session " << session.get() << dendl;
7c673cae
FG
6317 if (!session)
6318 return false;
6319 int type = con->get_peer_type();
6320 // handle only OSD failures here
6321 if (monc && (type == CEPH_ENTITY_TYPE_OSD)) {
6322 OSDMapRef osdmap = get_osdmap();
6323 if (osdmap) {
6324 int id = osdmap->identify_osd_on_all_channels(con->get_peer_addr());
6325 if (id >= 0 && osdmap->is_up(id)) {
6326 // I'm cheating mon heartbeat grace logic, because we know it's not going
6327 // to respawn alone. +1 so we won't hit any boundary case.
11fdf7f2
TL
6328 monc->send_mon_message(
6329 new MOSDFailure(
6330 monc->get_fsid(),
6331 id,
6332 osdmap->get_addrs(id),
6333 cct->_conf->osd_heartbeat_grace + 1,
6334 osdmap->get_epoch(),
6335 MOSDFailure::FLAG_IMMEDIATE | MOSDFailure::FLAG_FAILED
6336 ));
7c673cae
FG
6337 }
6338 }
6339 }
7c673cae
FG
6340 return true;
6341}
6342
f67539c2 6343struct CB_OSD_GetVersion {
7c673cae 6344 OSD *osd;
f67539c2
TL
6345 explicit CB_OSD_GetVersion(OSD *o) : osd(o) {}
6346 void operator ()(boost::system::error_code ec, version_t newest,
6347 version_t oldest) {
6348 if (!ec)
7c673cae
FG
6349 osd->_got_mon_epochs(oldest, newest);
6350 }
6351};
6352
6353void OSD::start_boot()
6354{
6355 if (!_is_healthy()) {
6356 // if we are not healthy, do not mark ourselves up (yet)
6357 dout(1) << "not healthy; waiting to boot" << dendl;
6358 if (!is_waiting_for_healthy())
6359 start_waiting_for_healthy();
6360 // send pings sooner rather than later
6361 heartbeat_kick();
6362 return;
6363 }
6364 dout(1) << __func__ << dendl;
6365 set_state(STATE_PREBOOT);
6366 dout(10) << "start_boot - have maps " << superblock.oldest_map
6367 << ".." << superblock.newest_map << dendl;
f67539c2 6368 monc->get_version("osdmap", CB_OSD_GetVersion(this));
7c673cae
FG
6369}
6370
6371void OSD::_got_mon_epochs(epoch_t oldest, epoch_t newest)
6372{
11fdf7f2 6373 std::lock_guard l(osd_lock);
7c673cae
FG
6374 if (is_preboot()) {
6375 _preboot(oldest, newest);
6376 }
6377}
6378
// Decide whether we are ready to send the boot message, given the range of
// osdmap epochs [oldest..newest] the monitor holds.  The if/else chain below
// checks blocking conditions in priority order; if none apply and our map is
// recent enough, queue the (potentially slow) boot on boot_finisher.  If we
// fall through, subscribe for the maps we are missing and wait to be called
// again.
void OSD::_preboot(epoch_t oldest, epoch_t newest)
{
  ceph_assert(is_preboot());
  dout(10) << __func__ << " _preboot mon has osdmaps "
	   << oldest << ".." << newest << dendl;

  // ensure our local fullness awareness is accurate
  {
    std::lock_guard l(heartbeat_lock);
    heartbeat();
  }

  const auto& monmap = monc->monmap;
  const auto osdmap = get_osdmap();
  // if our map within recent history, try to add ourselves to the osdmap.
  if (osdmap->get_epoch() == 0) {
    derr << "waiting for initial osdmap" << dendl;
  } else if (osdmap->is_destroyed(whoami)) {
    derr << "osdmap says I am destroyed" << dendl;
    // provide a small margin so we don't livelock seeing if we
    // un-destroyed ourselves.
    if (osdmap->get_epoch() > newest - 1) {
      exit(0);
    }
  } else if (osdmap->is_noup(whoami)) {
    derr << "osdmap NOUP flag is set, waiting for it to clear" << dendl;
  } else if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE)) {
    derr << "osdmap SORTBITWISE OSDMap flag is NOT set; please set it"
	 << dendl;
  } else if (service.need_fullness_update()) {
    derr << "osdmap fullness state needs update" << dendl;
    send_full_update();
  } else if (monmap.min_mon_release >= ceph_release_t::octopus &&
	     superblock.purged_snaps_last < superblock.current_epoch) {
    // catch up our local purged-snaps record before booting (octopus+ mons)
    dout(10) << __func__ << " purged_snaps_last " << superblock.purged_snaps_last
	     << " < newest_map " << superblock.current_epoch << dendl;
    _get_purged_snaps();
  } else if (osdmap->get_epoch() >= oldest - 1 &&
	     osdmap->get_epoch() + cct->_conf->osd_map_message_max > newest) {

    // wait for pgs to fully catch up in a different thread, since
    // this thread might be required for splitting and merging PGs to
    // make progress.
    boot_finisher.queue(
      new LambdaContext(
	[this](int r) {
	  std::unique_lock l(osd_lock);
	  if (is_preboot()) {
	    dout(10) << __func__ << " waiting for peering work to drain"
		     << dendl;
	    // drop osd_lock while waiting on the shards to avoid blocking
	    // the work we are waiting for
	    l.unlock();
	    for (auto shard : shards) {
	      shard->wait_min_pg_epoch(get_osdmap_epoch());
	    }
	    l.lock();
	  }
	  // re-check: state may have changed while osd_lock was dropped
	  if (is_preboot()) {
	    _send_boot();
	  }
	}));
    return;
  }

  // get all the latest maps
  if (osdmap->get_epoch() + 1 >= oldest)
    osdmap_subscribe(osdmap->get_epoch() + 1, false);
  else
    osdmap_subscribe(oldest - 1, true);
}
6448
9f95a23c
TL
6449void OSD::_get_purged_snaps()
6450{
6451 // NOTE: this is a naive, stateless implementaiton. it may send multiple
6452 // overlapping requests to the mon, which will be somewhat inefficient, but
6453 // it should be reliable.
6454 dout(10) << __func__ << " purged_snaps_last " << superblock.purged_snaps_last
6455 << ", newest_map " << superblock.current_epoch << dendl;
6456 MMonGetPurgedSnaps *m = new MMonGetPurgedSnaps(
6457 superblock.purged_snaps_last + 1,
6458 superblock.current_epoch + 1);
6459 monc->send_mon_message(m);
6460}
6461
// Handle the mon's purged-snaps reply during preboot: persist the received
// records and the new high-water mark, then either request the next chunk
// or resume the boot sequence.  Stale or out-of-state replies are dropped.
// Consumes (puts) the message in all paths.
void OSD::handle_get_purged_snaps_reply(MMonGetPurgedSnapsReply *m)
{
  dout(10) << __func__ << " " << *m << dendl;
  ObjectStore::Transaction t;
  // drop the reply if we left preboot, or if it is older than what we
  // have already recorded (overlapping requests are possible -- see
  // _get_purged_snaps)
  if (!is_preboot() ||
      m->last < superblock.purged_snaps_last) {
    goto out;
  }
  // record the snaps and advance the superblock watermark in one txn
  SnapMapper::record_purged_snaps(cct, store, service.meta_ch,
				  make_purged_snaps_oid(), &t,
				  m->purged_snaps);
  superblock.purged_snaps_last = m->last;
  write_superblock(t);
  store->queue_transaction(
    service.meta_ch,
    std::move(t));
  service.publish_superblock(superblock);
  if (m->last < superblock.current_epoch) {
    // still behind; fetch the next range
    _get_purged_snaps();
  } else {
    // caught up; resume booting
    start_boot();
  }
out:
  m->put();
}
6487
7c673cae
FG
6488void OSD::send_full_update()
6489{
6490 if (!service.need_fullness_update())
6491 return;
6492 unsigned state = 0;
6493 if (service.is_full()) {
6494 state = CEPH_OSD_FULL;
6495 } else if (service.is_backfillfull()) {
6496 state = CEPH_OSD_BACKFILLFULL;
6497 } else if (service.is_nearfull()) {
6498 state = CEPH_OSD_NEARFULL;
6499 }
6500 set<string> s;
6501 OSDMap::calc_state_set(state, s);
6502 dout(10) << __func__ << " want state " << s << dendl;
9f95a23c 6503 monc->send_mon_message(new MOSDFull(get_osdmap_epoch(), state));
7c673cae
FG
6504}
6505
// Enter the WAITING_FOR_HEALTHY state: boot is deferred until enough
// heartbeat peers look reachable (see _is_healthy / tick).
void OSD::start_waiting_for_healthy()
{
  dout(1) << "start_waiting_for_healthy" << dendl;
  set_state(STATE_WAITING_FOR_HEALTHY);
  // reset so the heartbeat peer set gets resampled from scratch
  last_heartbeat_resample = utime_t();

  // subscribe to osdmap updates, in case our peers really are known to be dead
  osdmap_subscribe(get_osdmap_epoch() + 1, false);
}
6515
6516bool OSD::_is_healthy()
6517{
6518 if (!cct->get_heartbeat_map()->is_healthy()) {
6519 dout(1) << "is_healthy false -- internal heartbeat failed" << dendl;
6520 return false;
6521 }
6522
6523 if (is_waiting_for_healthy()) {
11fdf7f2 6524 utime_t now = ceph_clock_now();
9f95a23c
TL
6525 if (osd_markdown_log.empty()) {
6526 dout(5) << __func__ << " force returning true since last markdown"
6527 << " was " << cct->_conf->osd_max_markdown_period
6528 << "s ago" << dendl;
11fdf7f2
TL
6529 return true;
6530 }
6531 std::lock_guard l(heartbeat_lock);
7c673cae
FG
6532 int num = 0, up = 0;
6533 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
6534 p != heartbeat_peers.end();
6535 ++p) {
11fdf7f2 6536 if (p->second.is_healthy(now))
7c673cae
FG
6537 ++up;
6538 ++num;
6539 }
6540 if ((float)up < (float)num * cct->_conf->osd_heartbeat_min_healthy_ratio) {
6541 dout(1) << "is_healthy false -- only " << up << "/" << num << " up peers (less than "
6542 << int(cct->_conf->osd_heartbeat_min_healthy_ratio * 100.0) << "%)" << dendl;
6543 return false;
6544 }
6545 }
6546
6547 return true;
6548}
6549
// Assemble and send the MOSDBoot message that asks the mon to mark us up.
// Before sending, finalize all four public address vectors (client,
// cluster, heartbeat back, heartbeat front): any messenger that bound to
// an unknown/wildcard address borrows the concrete address learned by
// another messenger, and each server messenger's loopback connection gets
// a Session attached via the fast-connect hook if it lacks one.
void OSD::_send_boot()
{
  dout(10) << "_send_boot" << dendl;
  Connection *local_connection =
    cluster_messenger->get_loopback_connection().get();
  entity_addrvec_t client_addrs = client_messenger->get_myaddrs();
  entity_addrvec_t cluster_addrs = cluster_messenger->get_myaddrs();
  entity_addrvec_t hb_back_addrs = hb_back_server_messenger->get_myaddrs();
  entity_addrvec_t hb_front_addrs = hb_front_server_messenger->get_myaddrs();

  dout(20) << " initial client_addrs " << client_addrs
	   << ", cluster_addrs " << cluster_addrs
	   << ", hb_back_addrs " << hb_back_addrs
	   << ", hb_front_addrs " << hb_front_addrs
	   << dendl;
  // cluster messenger with no concrete address falls back to the client one
  if (cluster_messenger->set_addr_unknowns(client_addrs)) {
    dout(10) << " assuming cluster_addrs match client_addrs "
	     << client_addrs << dendl;
    cluster_addrs = cluster_messenger->get_myaddrs();
  }
  // ensure the loopback connection has a Session attached
  if (auto session = local_connection->get_priv(); !session) {
    cluster_messenger->ms_deliver_handle_fast_connect(local_connection);
  }

  // hb back borrows the cluster address when unknown
  local_connection = hb_back_server_messenger->get_loopback_connection().get();
  if (hb_back_server_messenger->set_addr_unknowns(cluster_addrs)) {
    dout(10) << " assuming hb_back_addrs match cluster_addrs "
	     << cluster_addrs << dendl;
    hb_back_addrs = hb_back_server_messenger->get_myaddrs();
  }
  if (auto session = local_connection->get_priv(); !session) {
    hb_back_server_messenger->ms_deliver_handle_fast_connect(local_connection);
  }

  // hb front borrows the client (public) address when unknown
  local_connection = hb_front_server_messenger->get_loopback_connection().get();
  if (hb_front_server_messenger->set_addr_unknowns(client_addrs)) {
    dout(10) << " assuming hb_front_addrs match client_addrs "
	     << client_addrs << dendl;
    hb_front_addrs = hb_front_server_messenger->get_myaddrs();
  }
  if (auto session = local_connection->get_priv(); !session) {
    hb_front_server_messenger->ms_deliver_handle_fast_connect(local_connection);
  }

  // we now know what our front and back addrs will be, and we are
  // about to tell the mon what our metadata (including numa bindings)
  // are, so now is a good time!
  set_numa_affinity();

  MOSDBoot *mboot = new MOSDBoot(
    superblock, get_osdmap_epoch(), service.get_boot_epoch(),
    hb_back_addrs, hb_front_addrs, cluster_addrs,
    CEPH_FEATURES_ALL);
  dout(10) << " final client_addrs " << client_addrs
	   << ", cluster_addrs " << cluster_addrs
	   << ", hb_back_addrs " << hb_back_addrs
	   << ", hb_front_addrs " << hb_front_addrs
	   << dendl;
  _collect_metadata(&mboot->metadata);
  monc->send_mon_message(mboot);
  set_state(STATE_BOOTING);
}
6612
// Populate *pm with this OSD's metadata key/value pairs for the mon:
// config paths, messenger addresses, objectstore properties, host/system
// info, the front/back network interfaces and their NUMA placement, and
// per-device metadata.
void OSD::_collect_metadata(map<string,string> *pm)
{
  // config info
  (*pm)["osd_data"] = dev_path;
  if (store->get_type() == "filestore") {
    // not applicable for bluestore
    (*pm)["osd_journal"] = journal_path;
  }
  (*pm)["front_addr"] = stringify(client_messenger->get_myaddrs());
  (*pm)["back_addr"] = stringify(cluster_messenger->get_myaddrs());
  (*pm)["hb_front_addr"] = stringify(hb_front_server_messenger->get_myaddrs());
  (*pm)["hb_back_addr"] = stringify(hb_back_server_messenger->get_myaddrs());

  // backend
  (*pm)["osd_objectstore"] = store->get_type();
  (*pm)["rotational"] = store_is_rotational ? "1" : "0";
  (*pm)["journal_rotational"] = journal_is_rotational ? "1" : "0";
  (*pm)["default_device_class"] = store->get_default_device_class();
  // best-effort: a missing/unreadable osdspec_affinity becomes ""
  string osdspec_affinity;
  int r = store->read_meta("osdspec_affinity", &osdspec_affinity);
  if (r < 0 || osdspec_affinity.empty()) {
    osdspec_affinity = "";
  }
  (*pm)["osdspec_affinity"] = osdspec_affinity;
  store->collect_metadata(pm);

  collect_sys_info(pm, cct);

  // interfaces backing the public (front) and cluster (back) addresses
  (*pm)["front_iface"] = pick_iface(
    cct,
    client_messenger->get_myaddrs().front().get_sockaddr_storage());
  (*pm)["back_iface"] = pick_iface(
    cct,
    cluster_messenger->get_myaddrs().front().get_sockaddr_storage());

  // network numa
  {
    // resolve each interface to a NUMA node; report a single
    // "network_numa_node" only when both resolved and agree
    int node = -1;
    set<int> nodes;
    set<string> unknown;
    for (auto nm : { "front_iface", "back_iface" }) {
      if (!(*pm)[nm].size()) {
	unknown.insert(nm);
	continue;
      }
      int n = -1;
      int r = get_iface_numa_node((*pm)[nm], &n);
      if (r < 0) {
	unknown.insert((*pm)[nm]);
	continue;
      }
      nodes.insert(n);
      if (node < 0) {
	node = n;
      }
    }
    if (unknown.size()) {
      (*pm)["network_numa_unknown_ifaces"] = stringify(unknown);
    }
    if (!nodes.empty()) {
      (*pm)["network_numa_nodes"] = stringify(nodes);
    }
    if (node >= 0 && nodes.size() == 1 && unknown.empty()) {
      (*pm)["network_numa_node"] = stringify(node);
    }
  }

  // CPU numa binding, if we have one
  if (numa_node >= 0) {
    (*pm)["numa_node"] = stringify(numa_node);
    (*pm)["numa_node_cpus"] = cpu_set_to_str_list(numa_cpu_set_size,
						  &numa_cpu_set);
  }

  // per-device metadata (model, serial, etc.); errors are logged only
  set<string> devnames;
  store->get_devices(&devnames);
  map<string,string> errs;
  get_device_metadata(devnames, pm, &errs);
  for (auto& i : errs) {
    dout(1) << __func__ << " " << i.first << ": " << i.second << dendl;
  }
  dout(10) << __func__ << " " << *pm << dendl;
}
6695
6696void OSD::queue_want_up_thru(epoch_t want)
6697{
9f95a23c
TL
6698 std::shared_lock map_locker{map_lock};
6699 epoch_t cur = get_osdmap()->get_up_thru(whoami);
6700 std::lock_guard report_locker(mon_report_lock);
7c673cae
FG
6701 if (want > up_thru_wanted) {
6702 dout(10) << "queue_want_up_thru now " << want << " (was " << up_thru_wanted << ")"
6703 << ", currently " << cur
6704 << dendl;
6705 up_thru_wanted = want;
6706 send_alive();
6707 } else {
6708 dout(10) << "queue_want_up_thru want " << want << " <= queued " << up_thru_wanted
6709 << ", currently " << cur
6710 << dendl;
6711 }
7c673cae
FG
6712}
6713
6714void OSD::send_alive()
6715{
9f95a23c
TL
6716 ceph_assert(ceph_mutex_is_locked(mon_report_lock));
6717 const auto osdmap = get_osdmap();
7c673cae
FG
6718 if (!osdmap->exists(whoami))
6719 return;
6720 epoch_t up_thru = osdmap->get_up_thru(whoami);
6721 dout(10) << "send_alive up_thru currently " << up_thru << " want " << up_thru_wanted << dendl;
6722 if (up_thru_wanted > up_thru) {
6723 dout(10) << "send_alive want " << up_thru_wanted << dendl;
6724 monc->send_mon_message(new MOSDAlive(osdmap->get_epoch(), up_thru_wanted));
6725 }
6726}
6727
6728void OSD::request_full_map(epoch_t first, epoch_t last)
6729{
6730 dout(10) << __func__ << " " << first << ".." << last
6731 << ", previously requested "
6732 << requested_full_first << ".." << requested_full_last << dendl;
9f95a23c 6733 ceph_assert(ceph_mutex_is_locked(osd_lock));
11fdf7f2
TL
6734 ceph_assert(first > 0 && last > 0);
6735 ceph_assert(first <= last);
6736 ceph_assert(first >= requested_full_first); // we shouldn't ever ask for older maps
7c673cae
FG
6737 if (requested_full_first == 0) {
6738 // first request
6739 requested_full_first = first;
6740 requested_full_last = last;
6741 } else if (last <= requested_full_last) {
6742 // dup
6743 return;
6744 } else {
6745 // additional request
6746 first = requested_full_last + 1;
6747 requested_full_last = last;
6748 }
6749 MMonGetOSDMap *req = new MMonGetOSDMap;
6750 req->request_full(first, last);
6751 monc->send_mon_message(req);
6752}
6753
6754void OSD::got_full_map(epoch_t e)
6755{
11fdf7f2 6756 ceph_assert(requested_full_first <= requested_full_last);
9f95a23c 6757 ceph_assert(ceph_mutex_is_locked(osd_lock));
7c673cae
FG
6758 if (requested_full_first == 0) {
6759 dout(20) << __func__ << " " << e << ", nothing requested" << dendl;
6760 return;
6761 }
6762 if (e < requested_full_first) {
6763 dout(10) << __func__ << " " << e << ", requested " << requested_full_first
6764 << ".." << requested_full_last
6765 << ", ignoring" << dendl;
6766 return;
6767 }
6768 if (e >= requested_full_last) {
6769 dout(10) << __func__ << " " << e << ", requested " << requested_full_first
6770 << ".." << requested_full_last << ", resetting" << dendl;
6771 requested_full_first = requested_full_last = 0;
6772 return;
6773 }
f67539c2 6774
7c673cae
FG
6775 requested_full_first = e + 1;
6776
6777 dout(10) << __func__ << " " << e << ", requested " << requested_full_first
6778 << ".." << requested_full_last
6779 << ", still need more" << dendl;
6780}
6781
6782void OSD::requeue_failures()
6783{
11fdf7f2 6784 std::lock_guard l(heartbeat_lock);
7c673cae
FG
6785 unsigned old_queue = failure_queue.size();
6786 unsigned old_pending = failure_pending.size();
11fdf7f2 6787 for (auto p = failure_pending.begin(); p != failure_pending.end(); ) {
7c673cae
FG
6788 failure_queue[p->first] = p->second.first;
6789 failure_pending.erase(p++);
6790 }
6791 dout(10) << __func__ << " " << old_queue << " + " << old_pending << " -> "
6792 << failure_queue.size() << dendl;
6793}
6794
6795void OSD::send_failures()
6796{
9f95a23c
TL
6797 ceph_assert(ceph_mutex_is_locked(map_lock));
6798 ceph_assert(ceph_mutex_is_locked(mon_report_lock));
11fdf7f2 6799 std::lock_guard l(heartbeat_lock);
7c673cae 6800 utime_t now = ceph_clock_now();
9f95a23c 6801 const auto osdmap = get_osdmap();
7c673cae
FG
6802 while (!failure_queue.empty()) {
6803 int osd = failure_queue.begin()->first;
7c673cae
FG
6804 if (!failure_pending.count(osd)) {
6805 int failed_for = (int)(double)(now - failure_queue.begin()->second);
11fdf7f2
TL
6806 monc->send_mon_message(
6807 new MOSDFailure(
6808 monc->get_fsid(),
6809 osd,
6810 osdmap->get_addrs(osd),
6811 failed_for,
6812 osdmap->get_epoch()));
6813 failure_pending[osd] = make_pair(failure_queue.begin()->second,
6814 osdmap->get_addrs(osd));
7c673cae
FG
6815 }
6816 failure_queue.erase(osd);
6817 }
6818}
6819
11fdf7f2 6820void OSD::send_still_alive(epoch_t epoch, int osd, const entity_addrvec_t &addrs)
7c673cae 6821{
11fdf7f2
TL
6822 MOSDFailure *m = new MOSDFailure(monc->get_fsid(), osd, addrs, 0, epoch,
6823 MOSDFailure::FLAG_ALIVE);
7c673cae
FG
6824 monc->send_mon_message(m);
6825}
6826
11fdf7f2 6827void OSD::cancel_pending_failures()
7c673cae 6828{
11fdf7f2
TL
6829 std::lock_guard l(heartbeat_lock);
6830 auto it = failure_pending.begin();
6831 while (it != failure_pending.end()) {
6832 dout(10) << __func__ << " canceling in-flight failure report for osd."
6833 << it->first << dendl;
9f95a23c 6834 send_still_alive(get_osdmap_epoch(), it->first, it->second.second);
11fdf7f2 6835 failure_pending.erase(it++);
7c673cae 6836 }
7c673cae
FG
6837}
6838
6839void OSD::send_beacon(const ceph::coarse_mono_clock::time_point& now)
6840{
6841 const auto& monmap = monc->monmap;
6842 // send beacon to mon even if we are just connected, and the monmap is not
6843 // initialized yet by then.
6844 if (monmap.epoch > 0 &&
6845 monmap.get_required_features().contains_all(
6846 ceph::features::mon::FEATURE_LUMINOUS)) {
6847 dout(20) << __func__ << " sending" << dendl;
7c673cae
FG
6848 MOSDBeacon* beacon = nullptr;
6849 {
11fdf7f2 6850 std::lock_guard l{min_last_epoch_clean_lock};
9f95a23c
TL
6851 beacon = new MOSDBeacon(get_osdmap_epoch(),
6852 min_last_epoch_clean,
f67539c2
TL
6853 superblock.last_purged_snaps_scrub,
6854 cct->_conf->osd_beacon_report_interval);
494da23a 6855 beacon->pgs = min_last_epoch_clean_pgs;
224ce89b 6856 last_sent_beacon = now;
7c673cae
FG
6857 }
6858 monc->send_mon_message(beacon);
6859 } else {
6860 dout(20) << __func__ << " not sending" << dendl;
6861 }
6862}
6863
7c673cae
FG
6864void OSD::handle_command(MCommand *m)
6865{
6866 ConnectionRef con = m->get_connection();
9f95a23c 6867 auto session = ceph::ref_cast<Session>(con->get_priv());
7c673cae 6868 if (!session) {
9f95a23c 6869 con->send_message(new MCommandReply(m, -EACCES));
7c673cae
FG
6870 m->put();
6871 return;
6872 }
9f95a23c
TL
6873 if (!session->caps.allow_all()) {
6874 con->send_message(new MCommandReply(m, -EACCES));
7c673cae
FG
6875 m->put();
6876 return;
6877 }
9f95a23c 6878 cct->get_admin_socket()->queue_tell_command(m);
7c673cae
FG
6879 m->put();
6880}
6881
f64942e4
AA
6882namespace {
6883 class unlock_guard {
9f95a23c 6884 ceph::mutex& m;
f64942e4 6885 public:
9f95a23c 6886 explicit unlock_guard(ceph::mutex& mutex)
f64942e4
AA
6887 : m(mutex)
6888 {
11fdf7f2 6889 m.unlock();
f64942e4
AA
6890 }
6891 unlock_guard(unlock_guard&) = delete;
6892 ~unlock_guard() {
11fdf7f2 6893 m.lock();
f64942e4
AA
6894 }
6895 };
6896}
6897
// Consistency-check the purged-snaps record against the snap mapper and
// queue a re-trim for any "stray" snaps (purged but still mapped).
// Expects osd_lock held on entry; the lock is dropped around the (slow)
// scrub itself and re-taken before the superblock update.
void OSD::scrub_purged_snaps()
{
  dout(10) << __func__ << dendl;
  ceph_assert(ceph_mutex_is_locked(osd_lock));
  SnapMapper::Scrubber s(cct, store, service.meta_ch,
			 make_snapmapper_oid(),
			 make_purged_snaps_oid());
  clog->debug() << "purged_snaps scrub starts";
  osd_lock.unlock();
  s.run();
  if (s.stray.size()) {
    clog->debug() << "purged_snaps scrub found " << s.stray.size() << " strays";
  } else {
    clog->debug() << "purged_snaps scrub ok";
  }
  // de-duplicate: queue at most one retrim per (pg, snap) pair
  set<pair<spg_t,snapid_t>> queued;
  for (auto& [pool, snap, hash, shard] : s.stray) {
    const pg_pool_t *pi = get_osdmap()->get_pg_pool(pool);
    if (!pi) {
      dout(20) << __func__ << " pool " << pool << " dne" << dendl;
      continue;
    }
    // map the stray's hash back to a concrete pg/shard on this osd
    pg_t pgid(pi->raw_hash_to_pg(hash), pool);
    spg_t spgid(pgid, shard);
    pair<spg_t,snapid_t> p(spgid, snap);
    if (queued.count(p)) {
      dout(20) << __func__ << " pg " << spgid << " snap " << snap
	       << " already queued" << dendl;
      continue;
    }
    PGRef pg = lookup_lock_pg(spgid);
    if (!pg) {
      dout(20) << __func__ << " pg " << spgid << " not found" << dendl;
      continue;
    }
    queued.insert(p);
    dout(10) << __func__ << " requeue pg " << spgid << " " << pg << " snap "
	     << snap << dendl;
    pg->queue_snap_retrim(snap);
    pg->unlock();
  }
  osd_lock.lock();
  if (is_stopping()) {
    return;
  }
  // record the scrub completion time in the superblock
  dout(10) << __func__ << " done queueing pgs, updating superblock" << dendl;
  ObjectStore::Transaction t;
  superblock.last_purged_snaps_scrub = ceph_clock_now();
  write_superblock(t);
  int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
  ceph_assert(tr == 0);
  if (is_active()) {
    send_beacon(ceph::coarse_mono_clock::now());
  }
  dout(10) << __func__ << " done" << dendl;
}
6954
6955void OSD::probe_smart(const string& only_devid, ostream& ss)
6956{
6957 set<string> devnames;
6958 store->get_devices(&devnames);
6959 uint64_t smart_timeout = cct->_conf.get_val<uint64_t>(
6960 "osd_smart_report_timeout");
6961
6962 // == typedef std::map<std::string, mValue> mObject;
6963 json_spirit::mObject json_map;
6964
6965 for (auto dev : devnames) {
6966 // smartctl works only on physical devices; filter out any logical device
6967 if (dev.find("dm-") == 0) {
6968 continue;
6969 }
6970
6971 string err;
6972 string devid = get_device_id(dev, &err);
6973 if (devid.size() == 0) {
6974 dout(10) << __func__ << " no unique id for dev " << dev << " ("
6975 << err << "), skipping" << dendl;
6976 continue;
6977 }
6978 if (only_devid.size() && devid != only_devid) {
6979 continue;
6980 }
6981
6982 json_spirit::mValue smart_json;
6983 if (block_device_get_metrics(dev, smart_timeout,
6984 &smart_json)) {
6985 dout(10) << "block_device_get_metrics failed for /dev/" << dev << dendl;
6986 continue;
6987 }
6988 json_map[devid] = smart_json;
7c673cae 6989 }
11fdf7f2 6990 json_spirit::write(json_map, ss, json_spirit::pretty_print);
7c673cae
FG
6991}
6992
6993bool OSD::heartbeat_dispatch(Message *m)
6994{
6995 dout(30) << "heartbeat_dispatch " << m << dendl;
6996 switch (m->get_type()) {
6997
6998 case CEPH_MSG_PING:
6999 dout(10) << "ping from " << m->get_source_inst() << dendl;
7000 m->put();
7001 break;
7002
7003 case MSG_OSD_PING:
7004 handle_osd_ping(static_cast<MOSDPing*>(m));
7005 break;
7006
7007 default:
7008 dout(0) << "dropping unexpected message " << *m << " from " << m->get_source_inst() << dendl;
7009 m->put();
7010 }
7011
7012 return true;
7013}
7014
7015bool OSD::ms_dispatch(Message *m)
7016{
7017 dout(20) << "OSD::ms_dispatch: " << *m << dendl;
7018 if (m->get_type() == MSG_OSD_MARK_ME_DOWN) {
7019 service.got_stop_ack();
7020 m->put();
7021 return true;
7022 }
7023
7024 // lock!
7025
9f95a23c 7026 osd_lock.lock();
7c673cae 7027 if (is_stopping()) {
9f95a23c 7028 osd_lock.unlock();
7c673cae
FG
7029 m->put();
7030 return true;
7031 }
7032
7033 do_waiters();
7034 _dispatch(m);
7035
9f95a23c 7036 osd_lock.unlock();
7c673cae
FG
7037
7038 return true;
7039}
7040
9f95a23c
TL
// Share osdmap incrementals with the peer on @con if it appears to be
// behind @osdmap.  @peer_epoch_lb is a lower bound on the epoch the peer
// is known to already have (e.g. from an op's sent_epoch); it is used to
// raise session->last_sent_epoch and avoid redundant sends.
void OSDService::maybe_share_map(
  Connection *con,
  const OSDMapRef& osdmap,
  epoch_t peer_epoch_lb)
{
  // NOTE: we assume caller hold something that keeps the Connection itself
  // pinned (e.g., an OpRequest's MessageRef).
  auto session = ceph::ref_cast<Session>(con->get_priv());
  if (!session) {
    return;
  }

  // assume the peer has the newer of the op's sent_epoch and what
  // we think we sent them.
  session->sent_epoch_lock.lock();
  if (peer_epoch_lb > session->last_sent_epoch) {
    dout(10) << __func__ << " con " << con
	     << " " << con->get_peer_addr()
	     << " map epoch " << session->last_sent_epoch
	     << " -> " << peer_epoch_lb << " (as per caller)" << dendl;
    session->last_sent_epoch = peer_epoch_lb;
  }
  // take a local copy so the send happens outside sent_epoch_lock
  epoch_t last_sent_epoch = session->last_sent_epoch;
  session->sent_epoch_lock.unlock();

  // peer already caught up with our map?  nothing to share.
  if (osdmap->get_epoch() <= last_sent_epoch) {
    return;
  }

  send_incremental_map(last_sent_epoch, con, osdmap);
  last_sent_epoch = osdmap->get_epoch();

  // only move last_sent_epoch forward -- another thread may have shared
  // a newer map with this session concurrently
  session->sent_epoch_lock.lock();
  if (session->last_sent_epoch < last_sent_epoch) {
    dout(10) << __func__ << " con " << con
	     << " " << con->get_peer_addr()
	     << " map epoch " << session->last_sent_epoch
	     << " -> " << last_sent_epoch << " (shared)" << dendl;
    session->last_sent_epoch = last_sent_epoch;
  }
  session->sent_epoch_lock.unlock();
}
7083
// Drain ops on @session that were waiting for a new osdmap, stopping at
// the first op whose min_epoch is still ahead of @osdmap (the list is in
// delivery order, so everything after it stays queued too).  Caller must
// hold session->session_dispatch_lock.
void OSD::dispatch_session_waiting(const ceph::ref_t<Session>& session, OSDMapRef osdmap)
{
  ceph_assert(ceph_mutex_is_locked(session->session_dispatch_lock));

  auto i = session->waiting_on_map.begin();
  while (i != session->waiting_on_map.end()) {
    OpRequestRef op = &(*i);
    ceph_assert(ms_can_fast_dispatch(op->get_req()));
    auto m = op->get_req<MOSDFastDispatchOp>();
    if (m->get_min_epoch() > osdmap->get_epoch()) {
      // map not new enough yet; keep this op (and all later ones) queued
      break;
    }
    // remove from the intrusive list and drop the list's ref (taken in
    // ms_fast_dispatch before push_back); `op` still holds its own ref
    session->waiting_on_map.erase(i++);
    op->put();

    spg_t pgid;
    if (m->get_type() == CEPH_MSG_OSD_OP) {
      // legacy MOSDOp carries a raw pg; resolve it to the primary shard
      pg_t actual_pgid = osdmap->raw_pg_to_pg(
	static_cast<const MOSDOp*>(m)->get_pg());
      if (!osdmap->get_primary_shard(actual_pgid, &pgid)) {
	// no primary shard for this pg in the map; drop the op
	continue;
      }
    } else {
      pgid = m->get_spg();
    }
    enqueue_op(pgid, std::move(op), m->get_map_epoch());
  }

  // keep (or clear) this session's registration for map-wait wakeups
  if (session->waiting_on_map.empty()) {
    clear_session_waiting_on_map(session);
  } else {
    register_session_waiting_on_map(session);
  }
}
7118
7119void OSD::ms_fast_dispatch(Message *m)
7120{
f67539c2
TL
7121
7122#ifdef HAVE_JAEGER
7123 jaeger_tracing::init_tracer("osd-services-reinit");
7124 dout(10) << "jaeger tracer after " << opentracing::Tracer::Global() << dendl;
7125 auto dispatch_span = jaeger_tracing::new_span(__func__);
7126#endif
11fdf7f2 7127 FUNCTRACE(cct);
7c673cae
FG
7128 if (service.is_stopping()) {
7129 m->put();
7130 return;
7131 }
11fdf7f2
TL
7132
7133 // peering event?
7134 switch (m->get_type()) {
7135 case CEPH_MSG_PING:
7136 dout(10) << "ping from " << m->get_source() << dendl;
7137 m->put();
7138 return;
11fdf7f2
TL
7139 case MSG_OSD_FORCE_RECOVERY:
7140 handle_fast_force_recovery(static_cast<MOSDForceRecovery*>(m));
7141 return;
7142 case MSG_OSD_SCRUB2:
7143 handle_fast_scrub(static_cast<MOSDScrub2*>(m));
7144 return;
7145
7146 case MSG_OSD_PG_CREATE2:
7147 return handle_fast_pg_create(static_cast<MOSDPGCreate2*>(m));
7148 case MSG_OSD_PG_QUERY:
7149 return handle_fast_pg_query(static_cast<MOSDPGQuery*>(m));
7150 case MSG_OSD_PG_NOTIFY:
7151 return handle_fast_pg_notify(static_cast<MOSDPGNotify*>(m));
7152 case MSG_OSD_PG_INFO:
7153 return handle_fast_pg_info(static_cast<MOSDPGInfo*>(m));
7154 case MSG_OSD_PG_REMOVE:
7155 return handle_fast_pg_remove(static_cast<MOSDPGRemove*>(m));
7156
7157 // these are single-pg messages that handle themselves
7158 case MSG_OSD_PG_LOG:
7159 case MSG_OSD_PG_TRIM:
9f95a23c
TL
7160 case MSG_OSD_PG_NOTIFY2:
7161 case MSG_OSD_PG_QUERY2:
7162 case MSG_OSD_PG_INFO2:
11fdf7f2
TL
7163 case MSG_OSD_BACKFILL_RESERVE:
7164 case MSG_OSD_RECOVERY_RESERVE:
9f95a23c
TL
7165 case MSG_OSD_PG_LEASE:
7166 case MSG_OSD_PG_LEASE_ACK:
11fdf7f2
TL
7167 {
7168 MOSDPeeringOp *pm = static_cast<MOSDPeeringOp*>(m);
7169 if (require_osd_peer(pm)) {
7170 enqueue_peering_evt(
7171 pm->get_spg(),
7172 PGPeeringEventRef(pm->get_event()));
7173 }
7174 pm->put();
7175 return;
7176 }
7177 }
7178
7c673cae
FG
7179 OpRequestRef op = op_tracker.create_request<OpRequest, Message*>(m);
7180 {
7181#ifdef WITH_LTTNG
7182 osd_reqid_t reqid = op->get_reqid();
7183#endif
7184 tracepoint(osd, ms_fast_dispatch, reqid.name._type,
7185 reqid.name._num, reqid.tid, reqid.inc);
7186 }
f67539c2
TL
7187#ifdef HAVE_JAEGER
7188 op->set_osd_parent_span(dispatch_span);
7189 if (op->osd_parent_span) {
7190 auto op_req_span = jaeger_tracing::child_span("op-request-created", op->osd_parent_span);
7191 op->set_osd_parent_span(op_req_span);
7192 }
7193#endif
7c673cae
FG
7194 if (m->trace)
7195 op->osd_trace.init("osd op", &trace_endpoint, &m->trace);
7196
11fdf7f2 7197 // note sender epoch, min req's epoch
7c673cae
FG
7198 op->sent_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch();
7199 op->min_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_min_epoch();
11fdf7f2 7200 ceph_assert(op->min_epoch <= op->sent_epoch); // sanity check!
7c673cae
FG
7201
7202 service.maybe_inject_dispatch_delay();
7203
7204 if (m->get_connection()->has_features(CEPH_FEATUREMASK_RESEND_ON_SPLIT) ||
7205 m->get_type() != CEPH_MSG_OSD_OP) {
7206 // queue it directly
7207 enqueue_op(
7208 static_cast<MOSDFastDispatchOp*>(m)->get_spg(),
11fdf7f2 7209 std::move(op),
7c673cae
FG
7210 static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch());
7211 } else {
7212 // legacy client, and this is an MOSDOp (the *only* fast dispatch
7213 // message that didn't have an explicit spg_t); we need to map
7214 // them to an spg_t while preserving delivery order.
11fdf7f2
TL
7215 auto priv = m->get_connection()->get_priv();
7216 if (auto session = static_cast<Session*>(priv.get()); session) {
7217 std::lock_guard l{session->session_dispatch_lock};
7218 op->get();
7219 session->waiting_on_map.push_back(*op);
7220 OSDMapRef nextmap = service.get_nextmap_reserved();
7221 dispatch_session_waiting(session, nextmap);
7222 service.release_map(nextmap);
7c673cae
FG
7223 }
7224 }
f67539c2 7225 OID_EVENT_TRACE_WITH_MSG(m, "MS_FAST_DISPATCH_END", false);
7c673cae
FG
7226}
7227
// Messenger auth callback: ensure @con has a Session attached and fill in
// its caps from the peer's AuthCapsInfo.
// Returns 1 when a caps string was parsed successfully, 0 when the peer
// has allow-all caps or supplied no caps blob, and -EACCES when the caps
// failed to decode or parse.
int OSD::ms_handle_authentication(Connection *con)
{
  int ret = 0;
  auto s = ceph::ref_cast<Session>(con->get_priv());
  if (!s) {
    // first contact on this connection: create and attach a Session
    s = ceph::make_ref<Session>(cct, con);
    con->set_priv(s);
    s->entity_name = con->get_peer_entity_name();
    dout(10) << __func__ << " new session " << s << " con " << s->con
	     << " entity " << s->entity_name
	     << " addr " << con->get_peer_addrs() << dendl;
  } else {
    dout(10) << __func__ << " existing session " << s << " con " << s->con
	     << " entity " << s->entity_name
	     << " addr " << con->get_peer_addrs() << dendl;
  }

  AuthCapsInfo &caps_info = con->get_peer_caps_info();
  if (caps_info.allow_all) {
    s->caps.set_allow_all();
  } else if (caps_info.caps.length() > 0) {
    // caps arrive as an encoded string; decode then parse
    bufferlist::const_iterator p = caps_info.caps.cbegin();
    string str;
    try {
      decode(str, p);
    }
    catch (ceph::buffer::error& e) {
      dout(10) << __func__ << " session " << s << " " << s->entity_name
	       << " failed to decode caps string" << dendl;
      ret = -EACCES;
    }
    if (!ret) {
      bool success = s->caps.parse(str);
      if (success) {
	dout(10) << __func__ << " session " << s
		 << " " << s->entity_name
		 << " has caps " << s->caps << " '" << str << "'" << dendl;
	ret = 1;
      } else {
	dout(10) << __func__ << " session " << s << " " << s->entity_name
		 << " failed to parse caps '" << str << "'" << dendl;
	ret = -EACCES;
      }
    }
  }
  return ret;
}
7275
7276void OSD::do_waiters()
7277{
9f95a23c 7278 ceph_assert(ceph_mutex_is_locked(osd_lock));
7c673cae
FG
7279
7280 dout(10) << "do_waiters -- start" << dendl;
7281 while (!finished.empty()) {
7282 OpRequestRef next = finished.front();
7283 finished.pop_front();
7284 dispatch_op(next);
7285 }
7286 dout(10) << "do_waiters -- finish" << dendl;
7287}
7288
7289void OSD::dispatch_op(OpRequestRef op)
7290{
7291 switch (op->get_req()->get_type()) {
7292
7293 case MSG_OSD_PG_CREATE:
7294 handle_pg_create(op);
7295 break;
7c673cae
FG
7296 }
7297}
7298
// Slow-path dispatch for messages arriving via ms_dispatch (osd_lock
// held): map traffic, purged-snaps replies, legacy scrub requests, tell
// commands, and legacy pg-create.
void OSD::_dispatch(Message *m)
{
  ceph_assert(ceph_mutex_is_locked(osd_lock));
  dout(20) << "_dispatch " << m << " " << *m << dendl;

  switch (m->get_type()) {
  // -- don't need OSDMap --

  // map and replication
  case CEPH_MSG_OSD_MAP:
    handle_osd_map(static_cast<MOSDMap*>(m));
    break;
  case MSG_MON_GET_PURGED_SNAPS_REPLY:
    handle_get_purged_snaps_reply(static_cast<MMonGetPurgedSnapsReply*>(m));
    break;

  // osd
  case MSG_OSD_SCRUB:
    handle_scrub(static_cast<MOSDScrub*>(m));
    break;

  case MSG_COMMAND:
    handle_command(static_cast<MCommand*>(m));
    return;  // NOTE: return (not break); handle_command() consumes m

  // -- need OSDMap --

  case MSG_OSD_PG_CREATE:
    {
      // wrap in an OpRequest so it can be delayed until we have a map
      OpRequestRef op = op_tracker.create_request<OpRequest, Message*>(m);
      if (m->trace)
	op->osd_trace.init("osd op", &trace_endpoint, &m->trace);
      // no map? starting up?
      if (!get_osdmap()) {
	dout(7) << "no OSDMap, not booted" << dendl;
	logger->inc(l_osd_waiting_for_map);
	waiting_for_osdmap.push_back(op);
	op->mark_delayed("no osdmap");
	break;
      }

      // need OSDMap
      dispatch_op(op);
    }
  }
}
7345
// remove me post-nautilus
// Handle the legacy MOSDScrub request from a mon/mgr: queue a
// RequestScrub peering event for each requested pg we hold the primary
// shard for (or for all of our pgs when the request lists none).
void OSD::handle_scrub(MOSDScrub *m)
{
  dout(10) << "handle_scrub " << *m << dendl;
  if (!require_mon_or_mgr_peer(m)) {
    m->put();
    return;
  }
  if (m->fsid != monc->get_fsid()) {
    // message from a different cluster; ignore
    dout(0) << "handle_scrub fsid " << m->fsid << " != " << monc->get_fsid()
	    << dendl;
    m->put();
    return;
  }

  vector<spg_t> spgs;
  _get_pgids(&spgs);

  if (!m->scrub_pgs.empty()) {
    // filter down to the requested pgs for which we hold the primary shard
    vector<spg_t> v;
    for (auto pgid : m->scrub_pgs) {
      spg_t pcand;
      if (get_osdmap()->get_primary_shard(pgid, &pcand) &&
	  std::find(spgs.begin(), spgs.end(), pcand) != spgs.end()) {
	v.push_back(pcand);
      }
    }
    spgs.swap(v);
  }

  for (auto pgid : spgs) {
    enqueue_peering_evt(
      pgid,
      PGPeeringEventRef(
	std::make_shared<PGPeeringEvent>(
	  get_osdmap_epoch(),
	  get_osdmap_epoch(),
	  PeeringState::RequestScrub(m->deep, m->repair))));
  }

  m->put();
}
7388
11fdf7f2
TL
7389void OSD::handle_fast_scrub(MOSDScrub2 *m)
7390{
7391 dout(10) << __func__ << " " << *m << dendl;
7392 if (!require_mon_or_mgr_peer(m)) {
7393 m->put();
7394 return;
7395 }
7396 if (m->fsid != monc->get_fsid()) {
7397 dout(0) << __func__ << " fsid " << m->fsid << " != " << monc->get_fsid()
7398 << dendl;
7399 m->put();
7400 return;
7401 }
7402 for (auto pgid : m->scrub_pgs) {
7403 enqueue_peering_evt(
7404 pgid,
7405 PGPeeringEventRef(
7406 std::make_shared<PGPeeringEvent>(
7407 m->epoch,
7408 m->epoch,
9f95a23c 7409 PeeringState::RequestScrub(m->deep, m->repair))));
11fdf7f2
TL
7410 }
7411 m->put();
7412}
7413
7c673cae
FG
7414bool OSD::scrub_random_backoff()
7415{
7416 bool coin_flip = (rand() / (double)RAND_MAX >=
7417 cct->_conf->osd_scrub_backoff_ratio);
7418 if (!coin_flip) {
7419 dout(20) << "scrub_random_backoff lost coin flip, randomly backing off" << dendl;
7420 return true;
7421 }
7422 return false;
7423}
7424
// Build a scrub job for @pg.  @timestamp is the base scheduling time.
// Unless the scrub was explicitly requested (@must), the scheduled time
// is pushed out by the min interval plus a random slice of it, and the
// deadline by the max interval.  Pool-level intervals (when > 0) override
// the global osd_scrub_{min,max}_interval config.
OSDService::ScrubJob::ScrubJob(CephContext* cct,
			       const spg_t& pg, const utime_t& timestamp,
			       double pool_scrub_min_interval,
			       double pool_scrub_max_interval, bool must)
  : cct(cct),
    pgid(pg),
    sched_time(timestamp),
    deadline(timestamp)
{
  // if not explicitly requested, postpone the scrub with a random delay
  if (!must) {
    double scrub_min_interval = pool_scrub_min_interval > 0 ?
      pool_scrub_min_interval : cct->_conf->osd_scrub_min_interval;
    double scrub_max_interval = pool_scrub_max_interval > 0 ?
      pool_scrub_max_interval : cct->_conf->osd_scrub_max_interval;

    sched_time += scrub_min_interval;
    // spread scrubs out: add up to randomize_ratio * min_interval of jitter
    double r = rand() / (double)RAND_MAX;
    sched_time +=
      scrub_min_interval * cct->_conf->osd_scrub_interval_randomize_ratio * r;
    // a max interval of 0 means "no deadline"
    if (scrub_max_interval == 0) {
      deadline = utime_t();
    } else {
      deadline += scrub_max_interval;
    }

  }
}
7453
7454bool OSDService::ScrubJob::ScrubJob::operator<(const OSDService::ScrubJob& rhs) const {
7455 if (sched_time < rhs.sched_time)
7456 return true;
7457 if (sched_time > rhs.sched_time)
7458 return false;
7459 return pgid < rhs.pgid;
7460}
7461
f67539c2
TL
7462void OSDService::dumps_scrub(ceph::Formatter *f)
7463{
7464 ceph_assert(f != nullptr);
7465 std::lock_guard l(sched_scrub_lock);
7466
7467 f->open_array_section("scrubs");
7468 for (const auto &i: sched_scrub_pg) {
7469 f->open_object_section("scrub");
7470 f->dump_stream("pgid") << i.pgid;
7471 f->dump_stream("sched_time") << i.sched_time;
7472 f->dump_stream("deadline") << i.deadline;
7473 f->dump_bool("forced", i.sched_time == PgScrubber::scrub_must_stamp());
7474 f->close_section();
7475 }
7476 f->close_section();
7477}
7478
9f95a23c
TL
7479double OSD::scrub_sleep_time(bool must_scrub)
7480{
7481 if (must_scrub) {
7482 return cct->_conf->osd_scrub_sleep;
7483 }
7484 utime_t now = ceph_clock_now();
7485 if (scrub_time_permit(now)) {
7486 return cct->_conf->osd_scrub_sleep;
7487 }
7488 double normal_sleep = cct->_conf->osd_scrub_sleep;
7489 double extended_sleep = cct->_conf->osd_scrub_extended_sleep;
7490 return std::max(extended_sleep, normal_sleep);
7491}
7492
7c673cae
FG
7493bool OSD::scrub_time_permit(utime_t now)
7494{
7495 struct tm bdt;
7496 time_t tt = now.sec();
7497 localtime_r(&tt, &bdt);
28e407b8
AA
7498
7499 bool day_permit = false;
7500 if (cct->_conf->osd_scrub_begin_week_day < cct->_conf->osd_scrub_end_week_day) {
7501 if (bdt.tm_wday >= cct->_conf->osd_scrub_begin_week_day && bdt.tm_wday < cct->_conf->osd_scrub_end_week_day) {
7502 day_permit = true;
7503 }
7504 } else {
7505 if (bdt.tm_wday >= cct->_conf->osd_scrub_begin_week_day || bdt.tm_wday < cct->_conf->osd_scrub_end_week_day) {
7506 day_permit = true;
7507 }
7508 }
7509
7510 if (!day_permit) {
7511 dout(20) << __func__ << " should run between week day " << cct->_conf->osd_scrub_begin_week_day
7512 << " - " << cct->_conf->osd_scrub_end_week_day
7513 << " now " << bdt.tm_wday << " = no" << dendl;
7514 return false;
7515 }
7516
7c673cae
FG
7517 bool time_permit = false;
7518 if (cct->_conf->osd_scrub_begin_hour < cct->_conf->osd_scrub_end_hour) {
7519 if (bdt.tm_hour >= cct->_conf->osd_scrub_begin_hour && bdt.tm_hour < cct->_conf->osd_scrub_end_hour) {
7520 time_permit = true;
7521 }
7522 } else {
7523 if (bdt.tm_hour >= cct->_conf->osd_scrub_begin_hour || bdt.tm_hour < cct->_conf->osd_scrub_end_hour) {
7524 time_permit = true;
7525 }
7526 }
f67539c2 7527 if (time_permit) {
7c673cae
FG
7528 dout(20) << __func__ << " should run between " << cct->_conf->osd_scrub_begin_hour
7529 << " - " << cct->_conf->osd_scrub_end_hour
f67539c2 7530 << " now " << bdt.tm_hour << " = yes" << dendl;
7c673cae
FG
7531 } else {
7532 dout(20) << __func__ << " should run between " << cct->_conf->osd_scrub_begin_hour
7533 << " - " << cct->_conf->osd_scrub_end_hour
f67539c2 7534 << " now " << bdt.tm_hour << " = no" << dendl;
7c673cae
FG
7535 }
7536 return time_permit;
7537}
7538
7539bool OSD::scrub_load_below_threshold()
7540{
7541 double loadavgs[3];
7542 if (getloadavg(loadavgs, 3) != 3) {
7543 dout(10) << __func__ << " couldn't read loadavgs\n" << dendl;
7544 return false;
7545 }
7546
7547 // allow scrub if below configured threshold
91327a77
AA
7548 long cpus = sysconf(_SC_NPROCESSORS_ONLN);
7549 double loadavg_per_cpu = cpus > 0 ? loadavgs[0] / cpus : loadavgs[0];
7550 if (loadavg_per_cpu < cct->_conf->osd_scrub_load_threshold) {
7551 dout(20) << __func__ << " loadavg per cpu " << loadavg_per_cpu
7c673cae
FG
7552 << " < max " << cct->_conf->osd_scrub_load_threshold
7553 << " = yes" << dendl;
7554 return true;
7555 }
7556
7557 // allow scrub if below daily avg and currently decreasing
7558 if (loadavgs[0] < daily_loadavg && loadavgs[0] < loadavgs[2]) {
7559 dout(20) << __func__ << " loadavg " << loadavgs[0]
7560 << " < daily_loadavg " << daily_loadavg
7561 << " and < 15m avg " << loadavgs[2]
7562 << " = yes" << dendl;
7563 return true;
7564 }
7565
7566 dout(20) << __func__ << " loadavg " << loadavgs[0]
7567 << " >= max " << cct->_conf->osd_scrub_load_threshold
7568 << " and ( >= daily_loadavg " << daily_loadavg
7569 << " or >= 15m avg " << loadavgs[2]
7570 << ") = no" << dendl;
7571 return false;
7572}
7573
7574void OSD::sched_scrub()
7575{
f67539c2
TL
7576 dout(20) << __func__ << " sched_scrub starts" << dendl;
7577
7c673cae 7578 // if not permitted, fail fast
eafe8130 7579 if (!service.can_inc_scrubs()) {
f67539c2 7580 dout(20) << __func__ << ": OSD cannot inc scrubs" << dendl;
7c673cae
FG
7581 return;
7582 }
eafe8130 7583 bool allow_requested_repair_only = false;
f6b5b4d7
TL
7584 if (service.is_recovery_active() && !cct->_conf->osd_scrub_during_recovery) {
7585 if (!cct->_conf->osd_repair_during_recovery) {
f67539c2 7586 dout(15) << __func__ << ": not scheduling scrubs due to active recovery" << dendl;
eafe8130
TL
7587 return;
7588 }
f6b5b4d7
TL
7589 dout(10) << __func__
7590 << " will only schedule explicitly requested repair due to active recovery"
7591 << dendl;
7592 allow_requested_repair_only = true;
b5b8bbf5
FG
7593 }
7594
7c673cae
FG
7595 utime_t now = ceph_clock_now();
7596 bool time_permit = scrub_time_permit(now);
7597 bool load_is_low = scrub_load_below_threshold();
7598 dout(20) << "sched_scrub load_is_low=" << (int)load_is_low << dendl;
7599
f67539c2
TL
7600 OSDService::ScrubJob scrub_job;
7601 if (service.first_scrub_stamp(&scrub_job)) {
7c673cae 7602 do {
f67539c2 7603 dout(30) << "sched_scrub examine " << scrub_job.pgid << " at " << scrub_job.sched_time << dendl;
7c673cae 7604
f67539c2 7605 if (scrub_job.sched_time > now) {
7c673cae 7606 // save ourselves some effort
f67539c2 7607 dout(20) << "sched_scrub " << scrub_job.pgid << " scheduled at " << scrub_job.sched_time
7c673cae
FG
7608 << " > " << now << dendl;
7609 break;
7610 }
7611
f67539c2
TL
7612 if ((scrub_job.deadline.is_zero() || scrub_job.deadline >= now) && !(time_permit && load_is_low)) {
7613 dout(15) << __func__ << " not scheduling scrub for " << scrub_job.pgid << " due to "
7c673cae
FG
7614 << (!time_permit ? "time not permit" : "high load") << dendl;
7615 continue;
7616 }
7617
f67539c2
TL
7618 PGRef pg = _lookup_lock_pg(scrub_job.pgid);
7619 if (!pg) {
7620 dout(20) << __func__ << " pg " << scrub_job.pgid << " not found" << dendl;
7c673cae 7621 continue;
f67539c2
TL
7622 }
7623
494da23a 7624 // This has already started, so go on to the next scrub job
f67539c2 7625 if (pg->is_scrub_active()) {
494da23a 7626 pg->unlock();
f67539c2 7627 dout(20) << __func__ << ": already in progress pgid " << scrub_job.pgid << dendl;
494da23a
TL
7628 continue;
7629 }
f67539c2
TL
7630 // Skip other kinds of scrubbing if only explicitly requested repairing is allowed
7631 if (allow_requested_repair_only && !pg->m_planned_scrub.must_repair) {
eafe8130 7632 pg->unlock();
f67539c2 7633 dout(10) << __func__ << " skip " << scrub_job.pgid
eafe8130
TL
7634 << " because repairing is not explicitly requested on it"
7635 << dendl;
7636 continue;
7637 }
f67539c2 7638
494da23a 7639 // If it is reserving, let it resolve before going to the next scrub job
f67539c2 7640 if (pg->m_scrubber->is_reserving()) {
494da23a 7641 pg->unlock();
f67539c2 7642 dout(10) << __func__ << ": reserve in progress pgid " << scrub_job.pgid << dendl;
494da23a
TL
7643 break;
7644 }
f67539c2 7645 dout(15) << "sched_scrub scrubbing " << scrub_job.pgid << " at " << scrub_job.sched_time
11fdf7f2
TL
7646 << (pg->get_must_scrub() ? ", explicitly requested" :
7647 (load_is_low ? ", load_is_low" : " deadline < now"))
7648 << dendl;
7649 if (pg->sched_scrub()) {
7650 pg->unlock();
f67539c2 7651 dout(10) << __func__ << " scheduled a scrub!" << " (~" << scrub_job.pgid << "~)" << dendl;
11fdf7f2 7652 break;
7c673cae
FG
7653 }
7654 pg->unlock();
f67539c2 7655 } while (service.next_scrub_stamp(scrub_job, &scrub_job));
7c673cae
FG
7656 }
7657 dout(20) << "sched_scrub done" << dendl;
7658}
7659
494da23a
TL
7660void OSD::resched_all_scrubs()
7661{
7662 dout(10) << __func__ << ": start" << dendl;
b3b6e05e
TL
7663 const vector<spg_t> pgs = [this] {
7664 vector<spg_t> pgs;
7665 OSDService::ScrubJob job;
7666 if (service.first_scrub_stamp(&job)) {
7667 do {
7668 pgs.push_back(job.pgid);
7669 } while (service.next_scrub_stamp(job, &job));
7670 }
7671 return pgs;
7672 }();
7673 for (auto& pgid : pgs) {
7674 dout(20) << __func__ << ": examine " << pgid << dendl;
7675 PGRef pg = _lookup_lock_pg(pgid);
494da23a
TL
7676 if (!pg)
7677 continue;
f67539c2 7678 if (!pg->m_planned_scrub.must_scrub && !pg->m_planned_scrub.need_auto) {
b3b6e05e 7679 dout(15) << __func__ << ": reschedule " << pgid << dendl;
494da23a
TL
7680 pg->on_info_history_change();
7681 }
7682 pg->unlock();
494da23a
TL
7683 }
7684 dout(10) << __func__ << ": done" << dendl;
7685}
7686
11fdf7f2
TL
7687MPGStats* OSD::collect_pg_stats()
7688{
7689 // This implementation unconditionally sends every is_primary PG's
7690 // stats every time we're called. This has equivalent cost to the
7691 // previous implementation's worst case where all PGs are busy and
7692 // their stats are always enqueued for sending.
9f95a23c 7693 std::shared_lock l{map_lock};
11fdf7f2 7694
11fdf7f2
TL
7695 osd_stat_t cur_stat = service.get_osd_stat();
7696 cur_stat.os_perf_stat = store->get_cur_stats();
7697
9f95a23c 7698 auto m = new MPGStats(monc->get_fsid(), get_osdmap_epoch());
11fdf7f2
TL
7699 m->osd_stat = cur_stat;
7700
7701 std::lock_guard lec{min_last_epoch_clean_lock};
9f95a23c 7702 min_last_epoch_clean = get_osdmap_epoch();
11fdf7f2
TL
7703 min_last_epoch_clean_pgs.clear();
7704
7705 std::set<int64_t> pool_set;
7706 vector<PGRef> pgs;
7707 _get_pgs(&pgs);
7708 for (auto& pg : pgs) {
7709 auto pool = pg->pg_id.pgid.pool();
7710 pool_set.emplace((int64_t)pool);
7711 if (!pg->is_primary()) {
7712 continue;
7713 }
7714 pg->get_pg_stats([&](const pg_stat_t& s, epoch_t lec) {
7715 m->pg_stat[pg->pg_id.pgid] = s;
f67539c2 7716 min_last_epoch_clean = std::min(min_last_epoch_clean, lec);
11fdf7f2
TL
7717 min_last_epoch_clean_pgs.push_back(pg->pg_id.pgid);
7718 });
7719 }
7720 store_statfs_t st;
81eedcae 7721 bool per_pool_stats = false;
9f95a23c 7722 bool per_pool_omap_stats = false;
11fdf7f2 7723 for (auto p : pool_set) {
9f95a23c 7724 int r = store->pool_statfs(p, &st, &per_pool_omap_stats);
11fdf7f2
TL
7725 if (r == -ENOTSUP) {
7726 break;
7727 } else {
7728 assert(r >= 0);
7729 m->pool_stat[p] = st;
81eedcae 7730 per_pool_stats = true;
11fdf7f2
TL
7731 }
7732 }
7c673cae 7733
81eedcae
TL
7734 // indicate whether we are reporting per-pool stats
7735 m->osd_stat.num_osds = 1;
7736 m->osd_stat.num_per_pool_osds = per_pool_stats ? 1 : 0;
9f95a23c 7737 m->osd_stat.num_per_pool_omap_osds = per_pool_omap_stats ? 1 : 0;
81eedcae 7738
11fdf7f2
TL
7739 return m;
7740}
7c673cae 7741
11fdf7f2 7742vector<DaemonHealthMetric> OSD::get_health_metrics()
b32b8144 7743{
11fdf7f2
TL
7744 vector<DaemonHealthMetric> metrics;
7745 {
7746 utime_t oldest_secs;
7747 const utime_t now = ceph_clock_now();
7748 auto too_old = now;
7749 too_old -= cct->_conf.get_val<double>("osd_op_complaint_time");
7750 int slow = 0;
7751 TrackedOpRef oldest_op;
7752 auto count_slow_ops = [&](TrackedOp& op) {
7753 if (op.get_initiated() < too_old) {
9f95a23c
TL
7754 stringstream ss;
7755 ss << "slow request " << op.get_desc()
7756 << " initiated "
7757 << op.get_initiated()
7758 << " currently "
7759 << op.state_string();
7760 lgeneric_subdout(cct,osd,20) << ss.str() << dendl;
7761 clog->warn() << ss.str();
11fdf7f2
TL
7762 slow++;
7763 if (!oldest_op || op.get_initiated() < oldest_op->get_initiated()) {
7764 oldest_op = &op;
7765 }
7766 return true;
7767 } else {
7768 return false;
7769 }
7770 };
7771 if (op_tracker.visit_ops_in_flight(&oldest_secs, count_slow_ops)) {
7772 if (slow) {
7773 derr << __func__ << " reporting " << slow << " slow ops, oldest is "
7774 << oldest_op->get_desc() << dendl;
7775 }
7776 metrics.emplace_back(daemon_metric::SLOW_OPS, slow, oldest_secs);
7777 } else {
7778 // no news is not good news.
7779 metrics.emplace_back(daemon_metric::SLOW_OPS, 0, 0);
7780 }
7781 }
7782 {
7783 std::lock_guard l(pending_creates_lock);
7784 auto n_primaries = pending_creates_from_mon;
7785 for (const auto& create : pending_creates_from_osd) {
7786 if (create.second) {
7787 n_primaries++;
7788 }
b32b8144 7789 }
11fdf7f2 7790 metrics.emplace_back(daemon_metric::PENDING_CREATING_PGS, n_primaries);
b32b8144 7791 }
b32b8144
FG
7792 return metrics;
7793}
7794
7c673cae
FG
7795// =====================================================
7796// MAP
7797
7798void OSD::wait_for_new_map(OpRequestRef op)
7799{
7800 // ask?
7801 if (waiting_for_osdmap.empty()) {
9f95a23c 7802 osdmap_subscribe(get_osdmap_epoch() + 1, false);
7c673cae
FG
7803 }
7804
7805 logger->inc(l_osd_waiting_for_map);
7806 waiting_for_osdmap.push_back(op);
7807 op->mark_delayed("wait for new map");
7808}
7809
7810
7811/** update_map
7812 * assimilate new OSDMap(s). scan pgs, etc.
7813 */
7814
7815void OSD::note_down_osd(int peer)
7816{
9f95a23c
TL
7817 ceph_assert(ceph_mutex_is_locked(osd_lock));
7818 cluster_messenger->mark_down_addrs(get_osdmap()->get_cluster_addrs(peer));
7c673cae 7819
9f95a23c 7820 std::lock_guard l{heartbeat_lock};
7c673cae
FG
7821 failure_queue.erase(peer);
7822 failure_pending.erase(peer);
7823 map<int,HeartbeatInfo>::iterator p = heartbeat_peers.find(peer);
7824 if (p != heartbeat_peers.end()) {
9f95a23c 7825 p->second.clear_mark_down();
7c673cae
FG
7826 heartbeat_peers.erase(p);
7827 }
7c673cae
FG
7828}
7829
// note_up_osd: a peer OSD is now up in the map.  Just flags the heartbeat
// peer set as needing an update; the 'peer' argument itself is unused here.
7830void OSD::note_up_osd(int peer)
7831{
7c673cae
FG
7832 heartbeat_set_peers_need_update();
7833}
7834
7835struct C_OnMapCommit : public Context {
7836 OSD *osd;
7837 epoch_t first, last;
7838 MOSDMap *msg;
7839 C_OnMapCommit(OSD *o, epoch_t f, epoch_t l, MOSDMap *m)
7840 : osd(o), first(f), last(l), msg(m) {}
7841 void finish(int r) override {
7842 osd->_committed_osd_maps(first, last, msg);
7843 msg->put();
7844 }
7845};
7846
7c673cae
FG
7847void OSD::osdmap_subscribe(version_t epoch, bool force_request)
7848{
11fdf7f2 7849 std::lock_guard l(osdmap_subscribe_lock);
181888fb 7850 if (latest_subscribed_epoch >= epoch && !force_request)
7c673cae
FG
7851 return;
7852
11fdf7f2 7853 latest_subscribed_epoch = std::max<uint64_t>(epoch, latest_subscribed_epoch);
181888fb 7854
7c673cae
FG
7855 if (monc->sub_want_increment("osdmap", epoch, CEPH_SUBSCRIBE_ONETIME) ||
7856 force_request) {
7857 monc->renew_subs();
7858 }
7859}
7860
7861void OSD::trim_maps(epoch_t oldest, int nreceived, bool skip_maps)
7862{
7863 epoch_t min = std::min(oldest, service.map_cache.cached_key_lower_bound());
7864 if (min <= superblock.oldest_map)
7865 return;
7866
7867 int num = 0;
7868 ObjectStore::Transaction t;
7869 for (epoch_t e = superblock.oldest_map; e < min; ++e) {
7870 dout(20) << " removing old osdmap epoch " << e << dendl;
7871 t.remove(coll_t::meta(), get_osdmap_pobject_name(e));
7872 t.remove(coll_t::meta(), get_inc_osdmap_pobject_name(e));
7873 superblock.oldest_map = e + 1;
7874 num++;
7875 if (num >= cct->_conf->osd_target_transaction_size && num >= nreceived) {
7876 service.publish_superblock(superblock);
7877 write_superblock(t);
11fdf7f2
TL
7878 int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
7879 ceph_assert(tr == 0);
7c673cae
FG
7880 num = 0;
7881 if (!skip_maps) {
7882 // skip_maps leaves us with a range of old maps if we fail to remove all
7883 // of them before moving superblock.oldest_map forward to the first map
7884 // in the incoming MOSDMap msg. so we should continue removing them in
7885 // this case, even we could do huge series of delete transactions all at
7886 // once.
7887 break;
7888 }
7889 }
7890 }
7891 if (num > 0) {
7892 service.publish_superblock(superblock);
7893 write_superblock(t);
11fdf7f2
TL
7894 int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
7895 ceph_assert(tr == 0);
7c673cae
FG
7896 }
7897 // we should not remove the cached maps
11fdf7f2 7898 ceph_assert(min <= service.map_cache.cached_key_lower_bound());
7c673cae
FG
7899}
7900
// handle_osd_map: ingest a batch of osdmaps (full and/or incremental) from
// 'm': persist each epoch to the meta collection and the osdmap cache,
// derive pg_num history and purged-snaps records, then commit everything in
// one transaction whose completion runs _committed_osd_maps().  The message
// reference is consumed (m->put()) on every early-return path; otherwise
// ownership passes to the C_OnMapCommit context.
7901void OSD::handle_osd_map(MOSDMap *m)
7902{
11fdf7f2
TL
7903 // wait for pgs to catch up
7904 {
7905 // we extend the map cache pins to accomodate pgs slow to consume maps
7906 // for some period, until we hit the max_lag_factor bound, at which point
7907 // we block here to stop injesting more maps than they are able to keep
7908 // up with.
7909 epoch_t max_lag = cct->_conf->osd_map_cache_size *
7910 m_osd_pg_epoch_max_lag_factor;
7911 ceph_assert(max_lag > 0);
7912 epoch_t osd_min = 0;
7913 for (auto shard : shards) {
7914 epoch_t min = shard->get_min_pg_epoch();
7915 if (osd_min == 0 || min < osd_min) {
7916 osd_min = min;
7917 }
7918 }
9f95a23c 7919 epoch_t osdmap_epoch = get_osdmap_epoch();
11fdf7f2 7920 if (osd_min > 0 &&
9f95a23c
TL
7921 osdmap_epoch > max_lag &&
7922 osdmap_epoch - max_lag > osd_min) {
7923 epoch_t need = osdmap_epoch - max_lag;
11fdf7f2
TL
7924 dout(10) << __func__ << " waiting for pgs to catch up (need " << need
7925 << " max_lag " << max_lag << ")" << dendl;
7926 for (auto shard : shards) {
7927 epoch_t min = shard->get_min_pg_epoch();
7928 if (need > min) {
7929 dout(10) << __func__ << " waiting for pgs to consume " << need
7930 << " (shard " << shard->shard_id << " min " << min
7931 << ", map cache is " << cct->_conf->osd_map_cache_size
7932 << ", max_lag_factor " << m_osd_pg_epoch_max_lag_factor
7933 << ")" << dendl;
// osd_lock is dropped while blocking so PG work can make progress
7934 unlock_guard unlock{osd_lock};
7935 shard->wait_min_pg_epoch(need);
7936 }
7937 }
7938 }
7939 }
7940
9f95a23c 7941 ceph_assert(ceph_mutex_is_locked(osd_lock));
11fdf7f2
TL
7942 map<epoch_t,OSDMapRef> added_maps;
7943 map<epoch_t,bufferlist> added_maps_bl;
// refuse maps that are not for our cluster, or that arrive before init
7c673cae
FG
7944 if (m->fsid != monc->get_fsid()) {
7945 dout(0) << "handle_osd_map fsid " << m->fsid << " != "
7946 << monc->get_fsid() << dendl;
7947 m->put();
7948 return;
7949 }
7950 if (is_initializing()) {
7951 dout(0) << "ignoring osdmap until we have initialized" << dendl;
7952 m->put();
7953 return;
7954 }
7955
// only monitors and other OSDs are allowed to feed us maps
9f95a23c
TL
7956 auto session = ceph::ref_cast<Session>(m->get_connection()->get_priv());
7957 if (session && !(session->entity_name.is_mon() ||
7c673cae
FG
7958 session->entity_name.is_osd())) {
7959 //not enough perms!
7960 dout(10) << "got osd map from Session " << session
7961 << " which we can't take maps from (not a mon or osd)" << dendl;
7962 m->put();
7c673cae
FG
7963 return;
7964 }
7c673cae
FG
7965
7966 // share with the objecter
7967 if (!is_preboot())
7968 service.objecter->handle_osd_map(m);
7969
7970 epoch_t first = m->get_first();
7971 epoch_t last = m->get_last();
7972 dout(3) << "handle_osd_map epochs [" << first << "," << last << "], i have "
7973 << superblock.newest_map
7974 << ", src has [" << m->oldest_map << "," << m->newest_map << "]"
7975 << dendl;
7976
7977 logger->inc(l_osd_map);
7978 logger->inc(l_osd_mape, last - first + 1);
7979 if (first <= superblock.newest_map)
7980 logger->inc(l_osd_mape_dup, superblock.newest_map - first + 1);
7981 if (service.max_oldest_map < m->oldest_map) {
7982 service.max_oldest_map = m->oldest_map;
11fdf7f2 7983 ceph_assert(service.max_oldest_map >= superblock.oldest_map);
7c673cae
FG
7984 }
7985
7986 // make sure there is something new, here, before we bother flushing
7987 // the queues and such
7988 if (last <= superblock.newest_map) {
7989 dout(10) << " no new maps here, dropping" << dendl;
7990 m->put();
7991 return;
7992 }
7993
7994 // missing some?
7995 bool skip_maps = false;
7996 if (first > superblock.newest_map + 1) {
7997 dout(10) << "handle_osd_map message skips epochs "
7998 << superblock.newest_map + 1 << ".." << (first-1) << dendl;
7999 if (m->oldest_map <= superblock.newest_map + 1) {
8000 osdmap_subscribe(superblock.newest_map + 1, false);
8001 m->put();
8002 return;
8003 }
8004 // always try to get the full range of maps--as many as we can. this
8005 // 1- is good to have
8006 // 2- is at present the only way to ensure that we get a *full* map as
8007 // the first map!
8008 if (m->oldest_map < first) {
8009 osdmap_subscribe(m->oldest_map - 1, true);
8010 m->put();
8011 return;
8012 }
8013 skip_maps = true;
8014 }
8015
// one transaction accumulates the map writes, pg_num history, purged
// snaps and superblock update committed below
8016 ObjectStore::Transaction t;
8017 uint64_t txn_size = 0;
8018
9f95a23c
TL
8019 map<epoch_t,mempool::osdmap::map<int64_t,snap_interval_set_t>> purged_snaps;
8020
7c673cae 8021 // store new maps: queue for disk and put in the osdmap cache
11fdf7f2 8022 epoch_t start = std::max(superblock.newest_map + 1, first);
7c673cae
FG
8023 for (epoch_t e = start; e <= last; e++) {
8024 if (txn_size >= t.get_num_bytes()) {
8025 derr << __func__ << " transaction size overflowed" << dendl;
11fdf7f2 8026 ceph_assert(txn_size < t.get_num_bytes());
7c673cae
FG
8027 }
8028 txn_size = t.get_num_bytes();
8029 map<epoch_t,bufferlist>::iterator p;
8030 p = m->maps.find(e);
8031 if (p != m->maps.end()) {
8032 dout(10) << "handle_osd_map got full map for epoch " << e << dendl;
8033 OSDMap *o = new OSDMap;
8034 bufferlist& bl = p->second;
8035
8036 o->decode(bl);
8037
9f95a23c
TL
8038 purged_snaps[e] = o->get_new_purged_snaps();
8039
7c673cae
FG
8040 ghobject_t fulloid = get_osdmap_pobject_name(e);
8041 t.write(coll_t::meta(), fulloid, 0, bl.length(), bl);
11fdf7f2
TL
8042 added_maps[e] = add_map(o);
8043 added_maps_bl[e] = bl;
7c673cae
FG
8044 got_full_map(e);
8045 continue;
8046 }
8047
// no full map for this epoch: reconstruct it from the previous full map
// plus the incremental
8048 p = m->incremental_maps.find(e);
8049 if (p != m->incremental_maps.end()) {
8050 dout(10) << "handle_osd_map got inc map for epoch " << e << dendl;
8051 bufferlist& bl = p->second;
8052 ghobject_t oid = get_inc_osdmap_pobject_name(e);
8053 t.write(coll_t::meta(), oid, 0, bl.length(), bl);
7c673cae
FG
8054
8055 OSDMap *o = new OSDMap;
8056 if (e > 1) {
8057 bufferlist obl;
8058 bool got = get_map_bl(e - 1, obl);
11fdf7f2
TL
8059 if (!got) {
8060 auto p = added_maps_bl.find(e - 1);
8061 ceph_assert(p != added_maps_bl.end());
8062 obl = p->second;
8063 }
7c673cae
FG
8064 o->decode(obl);
8065 }
8066
8067 OSDMap::Incremental inc;
11fdf7f2 8068 auto p = bl.cbegin();
7c673cae 8069 inc.decode(p);
494da23a 8070
7c673cae 8071 if (o->apply_incremental(inc) < 0) {
9f95a23c 8072 derr << "ERROR: bad fsid? i have " << get_osdmap()->get_fsid() << " and inc has " << inc.fsid << dendl;
11fdf7f2 8073 ceph_abort_msg("bad fsid");
7c673cae
FG
8074 }
8075
8076 bufferlist fbl;
8077 o->encode(fbl, inc.encode_features | CEPH_FEATURE_RESERVED);
8078
8079 bool injected_failure = false;
8080 if (cct->_conf->osd_inject_bad_map_crc_probability > 0 &&
8081 (rand() % 10000) < cct->_conf->osd_inject_bad_map_crc_probability*10000.0) {
8082 derr << __func__ << " injecting map crc failure" << dendl;
8083 injected_failure = true;
8084 }
8085
// CRC mismatch (or injected failure): request full maps instead and
// commit only the epochs validated so far
8086 if ((inc.have_crc && o->get_crc() != inc.full_crc) || injected_failure) {
8087 dout(2) << "got incremental " << e
8088 << " but failed to encode full with correct crc; requesting"
8089 << dendl;
8090 clog->warn() << "failed to encode map e" << e << " with expected crc";
8091 dout(20) << "my encoded map was:\n";
8092 fbl.hexdump(*_dout);
8093 *_dout << dendl;
8094 delete o;
8095 request_full_map(e, last);
8096 last = e - 1;
f6b5b4d7
TL
8097
8098 // don't continue committing if we failed to enc the first inc map
8099 if (last < start) {
8100 dout(10) << __func__ << " bailing because last < start (" << last << "<" << start << ")" << dendl;
8101 m->put();
8102 return;
8103 }
7c673cae
FG
8104 break;
8105 }
8106 got_full_map(e);
9f95a23c 8107 purged_snaps[e] = o->get_new_purged_snaps();
7c673cae
FG
8108
8109 ghobject_t fulloid = get_osdmap_pobject_name(e);
8110 t.write(coll_t::meta(), fulloid, 0, fbl.length(), fbl);
11fdf7f2
TL
8111 added_maps[e] = add_map(o);
8112 added_maps_bl[e] = fbl;
7c673cae
FG
8113 continue;
8114 }
8115
11fdf7f2 8116 ceph_abort_msg("MOSDMap lied about what maps it had?");
7c673cae
FG
8117 }
8118
8119 // even if this map isn't from a mon, we may have satisfied our subscription
8120 monc->sub_got("osdmap", last);
8121
8122 if (!m->maps.empty() && requested_full_first) {
8123 dout(10) << __func__ << " still missing full maps " << requested_full_first
8124 << ".." << requested_full_last << dendl;
8125 rerequest_full_maps();
8126 }
8127
7c673cae
FG
8128 if (superblock.oldest_map) {
8129 // make sure we at least keep pace with incoming maps
8130 trim_maps(m->oldest_map, last - first + 1, skip_maps);
11fdf7f2 8131 pg_num_history.prune(superblock.oldest_map);
7c673cae
FG
8132 }
8133
8134 if (!superblock.oldest_map || skip_maps)
8135 superblock.oldest_map = first;
8136 superblock.newest_map = last;
8137 superblock.current_epoch = last;
8138
8139 // note in the superblock that we were clean thru the prior epoch
8140 epoch_t boot_epoch = service.get_boot_epoch();
8141 if (boot_epoch && boot_epoch >= superblock.mounted) {
8142 superblock.mounted = boot_epoch;
8143 superblock.clean_thru = last;
8144 }
8145
11fdf7f2
TL
8146 // check for pg_num changes and deleted pools
8147 OSDMapRef lastmap;
8148 for (auto& i : added_maps) {
8149 if (!lastmap) {
8150 if (!(lastmap = service.try_get_map(i.first - 1))) {
8151 dout(10) << __func__ << " can't get previous map " << i.first - 1
8152 << " probably first start of this osd" << dendl;
8153 continue;
8154 }
8155 }
8156 ceph_assert(lastmap->get_epoch() + 1 == i.second->get_epoch());
8157 for (auto& j : lastmap->get_pools()) {
8158 if (!i.second->have_pg_pool(j.first)) {
8159 pg_num_history.log_pool_delete(i.first, j.first);
8160 dout(10) << __func__ << " recording final pg_pool_t for pool "
8161 << j.first << dendl;
8162 // this information is needed by _make_pg() if have to restart before
8163 // the pool is deleted and need to instantiate a new (zombie) PG[Pool].
8164 ghobject_t obj = make_final_pool_info_oid(j.first);
8165 bufferlist bl;
8166 encode(j.second, bl, CEPH_FEATURES_ALL);
8167 string name = lastmap->get_pool_name(j.first);
8168 encode(name, bl);
8169 map<string,string> profile;
8170 if (lastmap->get_pg_pool(j.first)->is_erasure()) {
8171 profile = lastmap->get_erasure_code_profile(
8172 lastmap->get_pg_pool(j.first)->erasure_code_profile);
8173 }
8174 encode(profile, bl);
8175 t.write(coll_t::meta(), obj, 0, bl.length(), bl);
11fdf7f2
TL
8176 } else if (unsigned new_pg_num = i.second->get_pg_num(j.first);
8177 new_pg_num != j.second.get_pg_num()) {
8178 dout(10) << __func__ << " recording pool " << j.first << " pg_num "
8179 << j.second.get_pg_num() << " -> " << new_pg_num << dendl;
8180 pg_num_history.log_pg_num_change(i.first, j.first, new_pg_num);
8181 }
8182 }
8183 for (auto& j : i.second->get_pools()) {
8184 if (!lastmap->have_pg_pool(j.first)) {
8185 dout(10) << __func__ << " recording new pool " << j.first << " pg_num "
8186 << j.second.get_pg_num() << dendl;
8187 pg_num_history.log_pg_num_change(i.first, j.first,
8188 j.second.get_pg_num());
8189 }
8190 }
8191 lastmap = i.second;
8192 }
8193 pg_num_history.epoch = last;
8194 {
8195 bufferlist bl;
8196 ::encode(pg_num_history, bl);
8197 t.write(coll_t::meta(), make_pg_num_history_oid(), 0, bl.length(), bl);
8198 dout(20) << __func__ << " pg_num_history " << pg_num_history << dendl;
8199 }
8200
9f95a23c
TL
8201 // record new purged_snaps
8202 if (superblock.purged_snaps_last == start - 1) {
8203 SnapMapper::record_purged_snaps(cct, store, service.meta_ch,
8204 make_purged_snaps_oid(), &t,
8205 purged_snaps);
8206 superblock.purged_snaps_last = last;
8207 } else {
8208 dout(10) << __func__ << " superblock purged_snaps_last is "
8209 << superblock.purged_snaps_last
8210 << ", not recording new purged_snaps" << dendl;
8211 }
8212
7c673cae
FG
8213 // superblock and commit
8214 write_superblock(t);
8215 t.register_on_commit(new C_OnMapCommit(this, start, last, m));
7c673cae 8216 store->queue_transaction(
11fdf7f2
TL
8217 service.meta_ch,
8218 std::move(t));
7c673cae
FG
8219 service.publish_superblock(superblock);
8220}
8221
8222void OSD::_committed_osd_maps(epoch_t first, epoch_t last, MOSDMap *m)
8223{
8224 dout(10) << __func__ << " " << first << ".." << last << dendl;
8225 if (is_stopping()) {
8226 dout(10) << __func__ << " bailing, we are shutting down" << dendl;
8227 return;
8228 }
11fdf7f2 8229 std::lock_guard l(osd_lock);
31f18b77
FG
8230 if (is_stopping()) {
8231 dout(10) << __func__ << " bailing, we are shutting down" << dendl;
8232 return;
8233 }
9f95a23c 8234 map_lock.lock();
7c673cae 8235
f6b5b4d7
TL
8236 ceph_assert(first <= last);
8237
7c673cae
FG
8238 bool do_shutdown = false;
8239 bool do_restart = false;
8240 bool network_error = false;
f6b5b4d7 8241 OSDMapRef osdmap = get_osdmap();
7c673cae
FG
8242
8243 // advance through the new maps
8244 for (epoch_t cur = first; cur <= last; cur++) {
8245 dout(10) << " advance to epoch " << cur
8246 << " (<= last " << last
8247 << " <= newest_map " << superblock.newest_map
8248 << ")" << dendl;
8249
8250 OSDMapRef newmap = get_map(cur);
11fdf7f2 8251 ceph_assert(newmap); // we just cached it above!
7c673cae 8252
f67539c2 8253 // start blocklisting messages sent to peers that go down.
7c673cae
FG
8254 service.pre_publish_map(newmap);
8255
8256 // kill connections to newly down osds
8257 bool waited_for_reservations = false;
8258 set<int> old;
9f95a23c 8259 osdmap = get_osdmap();
7c673cae
FG
8260 osdmap->get_all_osds(old);
8261 for (set<int>::iterator p = old.begin(); p != old.end(); ++p) {
8262 if (*p != whoami &&
8263 osdmap->is_up(*p) && // in old map
8264 newmap->is_down(*p)) { // but not the new one
8265 if (!waited_for_reservations) {
8266 service.await_reserved_maps();
8267 waited_for_reservations = true;
8268 }
8269 note_down_osd(*p);
8270 } else if (*p != whoami &&
8271 osdmap->is_down(*p) &&
8272 newmap->is_up(*p)) {
8273 note_up_osd(*p);
8274 }
8275 }
8276
81eedcae 8277 if (osdmap->is_noup(whoami) != newmap->is_noup(whoami)) {
7c673cae
FG
8278 dout(10) << __func__ << " NOUP flag changed in " << newmap->get_epoch()
8279 << dendl;
8280 if (is_booting()) {
8281 // this captures the case where we sent the boot message while
8282 // NOUP was being set on the mon and our boot request was
8283 // dropped, and then later it is cleared. it imperfectly
8284 // handles the case where our original boot message was not
8285 // dropped and we restart even though we might have booted, but
8286 // that is harmless (boot will just take slightly longer).
8287 do_restart = true;
8288 }
8289 }
8290
9f95a23c
TL
8291 osdmap = std::move(newmap);
8292 set_osdmap(osdmap);
7c673cae
FG
8293 epoch_t up_epoch;
8294 epoch_t boot_epoch;
8295 service.retrieve_epochs(&boot_epoch, &up_epoch, NULL);
8296 if (!up_epoch &&
8297 osdmap->is_up(whoami) &&
11fdf7f2 8298 osdmap->get_addrs(whoami) == client_messenger->get_myaddrs()) {
7c673cae
FG
8299 up_epoch = osdmap->get_epoch();
8300 dout(10) << "up_epoch is " << up_epoch << dendl;
8301 if (!boot_epoch) {
8302 boot_epoch = osdmap->get_epoch();
8303 dout(10) << "boot_epoch is " << boot_epoch << dendl;
8304 }
8305 service.set_epochs(&boot_epoch, &up_epoch, NULL);
8306 }
8307 }
8308
7c673cae
FG
8309 epoch_t _bind_epoch = service.get_bind_epoch();
8310 if (osdmap->is_up(whoami) &&
11fdf7f2
TL
8311 osdmap->get_addrs(whoami).legacy_equals(
8312 client_messenger->get_myaddrs()) &&
7c673cae
FG
8313 _bind_epoch < osdmap->get_up_from(whoami)) {
8314
8315 if (is_booting()) {
8316 dout(1) << "state: booting -> active" << dendl;
8317 set_state(STATE_ACTIVE);
11fdf7f2 8318 do_restart = false;
7c673cae
FG
8319
8320 // set incarnation so that osd_reqid_t's we generate for our
8321 // objecter requests are unique across restarts.
8322 service.objecter->set_client_incarnation(osdmap->get_epoch());
11fdf7f2 8323 cancel_pending_failures();
7c673cae
FG
8324 }
8325 }
8326
8327 if (osdmap->get_epoch() > 0 &&
8328 is_active()) {
8329 if (!osdmap->exists(whoami)) {
9f95a23c 8330 derr << "map says i do not exist. shutting down." << dendl;
7c673cae
FG
8331 do_shutdown = true; // don't call shutdown() while we have
8332 // everything paused
9f95a23c
TL
8333 } else if (osdmap->is_stop(whoami)) {
8334 derr << "map says i am stopped by admin. shutting down." << dendl;
8335 do_shutdown = true;
7c673cae 8336 } else if (!osdmap->is_up(whoami) ||
11fdf7f2
TL
8337 !osdmap->get_addrs(whoami).legacy_equals(
8338 client_messenger->get_myaddrs()) ||
8339 !osdmap->get_cluster_addrs(whoami).legacy_equals(
8340 cluster_messenger->get_myaddrs()) ||
8341 !osdmap->get_hb_back_addrs(whoami).legacy_equals(
8342 hb_back_server_messenger->get_myaddrs()) ||
8343 !osdmap->get_hb_front_addrs(whoami).legacy_equals(
8344 hb_front_server_messenger->get_myaddrs())) {
7c673cae
FG
8345 if (!osdmap->is_up(whoami)) {
8346 if (service.is_preparing_to_stop() || service.is_stopping()) {
8347 service.got_stop_ack();
8348 } else {
c07f9fc5
FG
8349 clog->warn() << "Monitor daemon marked osd." << whoami << " down, "
8350 "but it is still running";
8351 clog->debug() << "map e" << osdmap->get_epoch()
8352 << " wrongly marked me down at e"
8353 << osdmap->get_down_at(whoami);
7c673cae 8354 }
9f95a23c
TL
8355 if (monc->monmap.min_mon_release >= ceph_release_t::octopus) {
8356 // note that this is best-effort...
8357 monc->send_mon_message(
8358 new MOSDMarkMeDead(
8359 monc->get_fsid(),
8360 whoami,
8361 osdmap->get_epoch()));
8362 }
11fdf7f2
TL
8363 } else if (!osdmap->get_addrs(whoami).legacy_equals(
8364 client_messenger->get_myaddrs())) {
7c673cae 8365 clog->error() << "map e" << osdmap->get_epoch()
11fdf7f2
TL
8366 << " had wrong client addr (" << osdmap->get_addrs(whoami)
8367 << " != my " << client_messenger->get_myaddrs() << ")";
8368 } else if (!osdmap->get_cluster_addrs(whoami).legacy_equals(
8369 cluster_messenger->get_myaddrs())) {
7c673cae
FG
8370 clog->error() << "map e" << osdmap->get_epoch()
8371 << " had wrong cluster addr ("
11fdf7f2
TL
8372 << osdmap->get_cluster_addrs(whoami)
8373 << " != my " << cluster_messenger->get_myaddrs() << ")";
8374 } else if (!osdmap->get_hb_back_addrs(whoami).legacy_equals(
8375 hb_back_server_messenger->get_myaddrs())) {
7c673cae 8376 clog->error() << "map e" << osdmap->get_epoch()
c07f9fc5 8377 << " had wrong heartbeat back addr ("
11fdf7f2
TL
8378 << osdmap->get_hb_back_addrs(whoami)
8379 << " != my " << hb_back_server_messenger->get_myaddrs()
7c673cae 8380 << ")";
11fdf7f2
TL
8381 } else if (!osdmap->get_hb_front_addrs(whoami).legacy_equals(
8382 hb_front_server_messenger->get_myaddrs())) {
7c673cae 8383 clog->error() << "map e" << osdmap->get_epoch()
c07f9fc5 8384 << " had wrong heartbeat front addr ("
11fdf7f2
TL
8385 << osdmap->get_hb_front_addrs(whoami)
8386 << " != my " << hb_front_server_messenger->get_myaddrs()
7c673cae
FG
8387 << ")";
8388 }
8389
8390 if (!service.is_stopping()) {
8391 epoch_t up_epoch = 0;
8392 epoch_t bind_epoch = osdmap->get_epoch();
8393 service.set_epochs(NULL,&up_epoch, &bind_epoch);
8394 do_restart = true;
8395
8396 //add markdown log
8397 utime_t now = ceph_clock_now();
8398 utime_t grace = utime_t(cct->_conf->osd_max_markdown_period, 0);
8399 osd_markdown_log.push_back(now);
7c673cae 8400 if ((int)osd_markdown_log.size() > cct->_conf->osd_max_markdown_count) {
9f95a23c
TL
8401 derr << __func__ << " marked down "
8402 << osd_markdown_log.size()
8403 << " > osd_max_markdown_count "
8404 << cct->_conf->osd_max_markdown_count
8405 << " in last " << grace << " seconds, shutting down"
8406 << dendl;
7c673cae
FG
8407 do_restart = false;
8408 do_shutdown = true;
8409 }
8410
8411 start_waiting_for_healthy();
8412
8413 set<int> avoid_ports;
8414#if defined(__FreeBSD__)
8415 // prevent FreeBSD from grabbing the client_messenger port during
f67539c2 8416 // rebinding. In which case a cluster_messenger will also connect
7c673cae 8417 // to the same port
11fdf7f2 8418 client_messenger->get_myaddrs().get_ports(&avoid_ports);
7c673cae 8419#endif
11fdf7f2 8420 cluster_messenger->get_myaddrs().get_ports(&avoid_ports);
7c673cae
FG
8421
8422 int r = cluster_messenger->rebind(avoid_ports);
8423 if (r != 0) {
8424 do_shutdown = true; // FIXME: do_restart?
8425 network_error = true;
9f95a23c
TL
8426 derr << __func__ << " marked down:"
8427 << " rebind cluster_messenger failed" << dendl;
7c673cae
FG
8428 }
8429
9f95a23c
TL
8430 hb_back_server_messenger->mark_down_all();
8431 hb_front_server_messenger->mark_down_all();
7c673cae
FG
8432 hb_front_client_messenger->mark_down_all();
8433 hb_back_client_messenger->mark_down_all();
8434
494da23a 8435 reset_heartbeat_peers(true);
7c673cae
FG
8436 }
8437 }
8438 }
8439
9f95a23c 8440 map_lock.unlock();
7c673cae 8441
11fdf7f2 8442 check_osdmap_features();
7c673cae
FG
8443
8444 // yay!
8445 consume_map();
8446
8447 if (is_active() || is_waiting_for_healthy())
8448 maybe_update_heartbeat_peers();
8449
11fdf7f2 8450 if (is_active()) {
7c673cae
FG
8451 activate_map();
8452 }
8453
31f18b77 8454 if (do_shutdown) {
7c673cae 8455 if (network_error) {
11fdf7f2 8456 cancel_pending_failures();
7c673cae
FG
8457 }
8458 // trigger shutdown in a different thread
8459 dout(0) << __func__ << " shutdown OSD via async signal" << dendl;
8460 queue_async_signal(SIGINT);
8461 }
31f18b77
FG
8462 else if (m->newest_map && m->newest_map > last) {
8463 dout(10) << " msg say newest map is " << m->newest_map
8464 << ", requesting more" << dendl;
8465 osdmap_subscribe(osdmap->get_epoch()+1, false);
8466 }
7c673cae
FG
8467 else if (is_preboot()) {
8468 if (m->get_source().is_mon())
8469 _preboot(m->oldest_map, m->newest_map);
8470 else
8471 start_boot();
8472 }
8473 else if (do_restart)
8474 start_boot();
8475
8476}
8477
11fdf7f2 8478void OSD::check_osdmap_features()
7c673cae
FG
8479{
8480 // adjust required feature bits?
8481
8482 // we have to be a bit careful here, because we are accessing the
8483 // Policy structures without taking any lock. in particular, only
8484 // modify integer values that can safely be read by a racing CPU.
8485 // since we are only accessing existing Policy structures a their
8486 // current memory location, and setting or clearing bits in integer
8487 // fields, and we are the only writer, this is not a problem.
8488
9f95a23c 8489 const auto osdmap = get_osdmap();
7c673cae
FG
8490 {
8491 Messenger::Policy p = client_messenger->get_default_policy();
8492 uint64_t mask;
8493 uint64_t features = osdmap->get_features(entity_name_t::TYPE_CLIENT, &mask);
8494 if ((p.features_required & mask) != features) {
8495 dout(0) << "crush map has features " << features
8496 << ", adjusting msgr requires for clients" << dendl;
8497 p.features_required = (p.features_required & ~mask) | features;
8498 client_messenger->set_default_policy(p);
8499 }
8500 }
8501 {
8502 Messenger::Policy p = client_messenger->get_policy(entity_name_t::TYPE_MON);
8503 uint64_t mask;
8504 uint64_t features = osdmap->get_features(entity_name_t::TYPE_MON, &mask);
8505 if ((p.features_required & mask) != features) {
8506 dout(0) << "crush map has features " << features
8507 << " was " << p.features_required
8508 << ", adjusting msgr requires for mons" << dendl;
8509 p.features_required = (p.features_required & ~mask) | features;
8510 client_messenger->set_policy(entity_name_t::TYPE_MON, p);
8511 }
8512 }
8513 {
8514 Messenger::Policy p = cluster_messenger->get_policy(entity_name_t::TYPE_OSD);
8515 uint64_t mask;
8516 uint64_t features = osdmap->get_features(entity_name_t::TYPE_OSD, &mask);
8517
8518 if ((p.features_required & mask) != features) {
8519 dout(0) << "crush map has features " << features
8520 << ", adjusting msgr requires for osds" << dendl;
8521 p.features_required = (p.features_required & ~mask) | features;
8522 cluster_messenger->set_policy(entity_name_t::TYPE_OSD, p);
8523 }
8524
11fdf7f2 8525 if (!superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_SHARDS)) {
7c673cae
FG
8526 dout(0) << __func__ << " enabling on-disk ERASURE CODES compat feature" << dendl;
8527 superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
8528 ObjectStore::Transaction t;
8529 write_superblock(t);
11fdf7f2
TL
8530 int err = store->queue_transaction(service.meta_ch, std::move(t), NULL);
8531 ceph_assert(err == 0);
7c673cae
FG
8532 }
8533 }
11fdf7f2 8534
9f95a23c
TL
8535 if (osdmap->require_osd_release < ceph_release_t::nautilus) {
8536 hb_front_server_messenger->set_require_authorizer(false);
8537 hb_back_server_messenger->set_require_authorizer(false);
8538 } else {
8539 hb_front_server_messenger->set_require_authorizer(true);
8540 hb_back_server_messenger->set_require_authorizer(true);
11fdf7f2
TL
8541 }
8542
8543 if (osdmap->require_osd_release != last_require_osd_release) {
8544 dout(1) << __func__ << " require_osd_release " << last_require_osd_release
8545 << " -> " << to_string(osdmap->require_osd_release) << dendl;
8546 store->write_meta("require_osd_release",
8547 stringify((int)osdmap->require_osd_release));
8548 last_require_osd_release = osdmap->require_osd_release;
8549 }
7c673cae
FG
8550}
8551
11fdf7f2
TL
8552struct C_FinishSplits : public Context {
8553 OSD *osd;
8554 set<PGRef> pgs;
8555 C_FinishSplits(OSD *osd, const set<PGRef> &in)
8556 : osd(osd), pgs(in) {}
8557 void finish(int r) override {
8558 osd->_finish_splits(pgs);
8559 }
8560};
8561
8562void OSD::_finish_splits(set<PGRef>& pgs)
7c673cae 8563{
11fdf7f2
TL
8564 dout(10) << __func__ << " " << pgs << dendl;
8565 if (is_stopping())
8566 return;
11fdf7f2
TL
8567 for (set<PGRef>::iterator i = pgs.begin();
8568 i != pgs.end();
8569 ++i) {
8570 PG *pg = i->get();
7c673cae 8571
9f95a23c 8572 PeeringCtx rctx = create_context();
11fdf7f2
TL
8573 pg->lock();
8574 dout(10) << __func__ << " " << *pg << dendl;
8575 epoch_t e = pg->get_osdmap_epoch();
9f95a23c 8576 pg->handle_initialize(rctx);
11fdf7f2 8577 pg->queue_null(e, e);
9f95a23c 8578 dispatch_context(rctx, pg, service.get_osdmap());
11fdf7f2 8579 pg->unlock();
7c673cae 8580
11fdf7f2
TL
8581 unsigned shard_index = pg->pg_id.hash_to_shard(num_shards);
8582 shards[shard_index]->register_and_wake_split_child(pg);
7c673cae 8583 }
11fdf7f2
TL
8584};
8585
8586bool OSD::add_merge_waiter(OSDMapRef nextmap, spg_t target, PGRef src,
8587 unsigned need)
8588{
8589 std::lock_guard l(merge_lock);
8590 auto& p = merge_waiters[nextmap->get_epoch()][target];
8591 p[src->pg_id] = src;
8592 dout(10) << __func__ << " added merge_waiter " << src->pg_id
8593 << " for " << target << ", have " << p.size() << "/" << need
8594 << dendl;
8595 return p.size() == need;
8596}
8597
/*
 * Advance a PG's osdmap, one epoch at a time, up to (at most) osd_epoch,
 * handling any pool pg_num changes (splits/merges) encountered on the way.
 *
 * Must be called with pg->lock held (asserted below).  Returns true when
 * the PG has been fully advanced (still locked on return).  Returns false
 * when the PG was consumed as a merge source (it has been shut down,
 * detached from its shard, and unlocked) or when a merge target must wait
 * for its sources; in both cases the caller must not keep using the PG.
 */
bool OSD::advance_pg(
  epoch_t osd_epoch,
  PG *pg,
  ThreadPool::TPHandle &handle,
  PeeringCtx &rctx)
{
  if (osd_epoch <= pg->get_osdmap_epoch()) {
    return true;
  }
  ceph_assert(pg->is_locked());
  OSDMapRef lastmap = pg->get_osdmap();
  set<PGRef> new_pgs;  // any split children
  bool ret = true;

  // pg_num in the map the PG currently has (0 if the pool is gone)
  unsigned old_pg_num = lastmap->have_pg_pool(pg->pg_id.pool()) ?
    lastmap->get_pg_num(pg->pg_id.pool()) : 0;
  for (epoch_t next_epoch = pg->get_osdmap_epoch() + 1;
       next_epoch <= osd_epoch;
       ++next_epoch) {
    OSDMapRef nextmap = service.try_get_map(next_epoch);
    if (!nextmap) {
      // map not cached/available; skip ahead to the next epoch we do have
      dout(20) << __func__ << " missing map " << next_epoch << dendl;
      continue;
    }

    unsigned new_pg_num =
      (old_pg_num && nextmap->have_pg_pool(pg->pg_id.pool())) ?
      nextmap->get_pg_num(pg->pg_id.pool()) : 0;
    if (old_pg_num && new_pg_num && old_pg_num != new_pg_num) {
      // check for merge
      if (nextmap->have_pg_pool(pg->pg_id.pool())) {
        spg_t parent;
        if (pg->pg_id.is_merge_source(
              old_pg_num,
              new_pg_num,
              &parent)) {
          // we are merge source: flush our state, shut the PG down, detach
          // it from its shard, and park it in merge_waiters for the target.
          PGRef spg = pg; // carry a ref
          dout(1) << __func__ << " " << pg->pg_id
                  << " is merge source, target is " << parent
                  << dendl;
          pg->write_if_dirty(rctx);
          if (!new_pgs.empty()) {
            rctx.transaction.register_on_applied(new C_FinishSplits(this,
                                                                    new_pgs));
            new_pgs.clear();
          }
          dispatch_context(rctx, pg, pg->get_osdmap(), &handle);
          pg->ch->flush();
          // release backoffs explicitly, since the on_shutdown path
          // aggressively tears down backoff state.
          if (pg->is_primary()) {
            pg->release_pg_backoffs();
          }
          pg->on_shutdown();
          OSDShard *sdata = pg->osd_shard;
          {
            std::lock_guard l(sdata->shard_lock);
            if (pg->pg_slot) {
              sdata->_detach_pg(pg->pg_slot);
              // update pg count now since we might not get an osdmap
              // any time soon.
              if (pg->is_primary())
                logger->dec(l_osd_pg_primary);
              else if (pg->is_nonprimary())
                logger->dec(l_osd_pg_replica); // misnomer
              else
                logger->dec(l_osd_pg_stray);
            }
          }
          pg->unlock();

          // the target's source set is exactly the split-children of the
          // parent going from new_pg_num back to old_pg_num
          set<spg_t> children;
          parent.is_split(new_pg_num, old_pg_num, &children);
          if (add_merge_waiter(nextmap, parent, pg, children.size())) {
            // last source has arrived; poke the target so it can merge
            enqueue_peering_evt(
              parent,
              PGPeeringEventRef(
                std::make_shared<PGPeeringEvent>(
                  nextmap->get_epoch(),
                  nextmap->get_epoch(),
                  NullEvt())));
          }
          ret = false;
          goto out;
        } else if (pg->pg_id.is_merge_target(old_pg_num, new_pg_num)) {
          // we are merge target
          set<spg_t> children;
          pg->pg_id.is_split(new_pg_num, old_pg_num, &children);
          dout(20) << __func__ << " " << pg->pg_id
                   << " is merge target, sources are " << children
                   << dendl;
          map<spg_t,PGRef> sources;
          {
            // claim the waiting sources only if they have all arrived
            std::lock_guard l(merge_lock);
            auto& s = merge_waiters[nextmap->get_epoch()][pg->pg_id];
            unsigned need = children.size();
            dout(20) << __func__ << " have " << s.size() << "/"
                     << need << dendl;
            if (s.size() == need) {
              sources.swap(s);
              merge_waiters[nextmap->get_epoch()].erase(pg->pg_id);
              if (merge_waiters[nextmap->get_epoch()].empty()) {
                merge_waiters.erase(nextmap->get_epoch());
              }
            }
          }
          if (!sources.empty()) {
            unsigned new_pg_num = nextmap->get_pg_num(pg->pg_id.pool());
            unsigned split_bits = pg->pg_id.get_split_bits(new_pg_num);
            dout(1) << __func__ << " merging " << pg->pg_id << dendl;
            pg->merge_from(
              sources, rctx, split_bits,
              nextmap->get_pg_pool(
                pg->pg_id.pool())->last_pg_merge_meta);
            pg->pg_slot->waiting_for_merge_epoch = 0;
          } else {
            // not all sources here yet: flush state, stop advancing, and
            // kick the sources so they notice they must merge into us
            dout(20) << __func__ << " not ready to merge yet" << dendl;
            pg->write_if_dirty(rctx);
            if (!new_pgs.empty()) {
              rctx.transaction.register_on_applied(new C_FinishSplits(this,
                                                                      new_pgs));
              new_pgs.clear();
            }
            dispatch_context(rctx, pg, pg->get_osdmap(), &handle);
            pg->unlock();
            // kick source(s) to get them ready
            for (auto& i : children) {
              dout(20) << __func__ << " kicking source " << i << dendl;
              enqueue_peering_evt(
                i,
                PGPeeringEventRef(
                  std::make_shared<PGPeeringEvent>(
                    nextmap->get_epoch(),
                    nextmap->get_epoch(),
                    NullEvt())));
            }
            ret = false;
            goto out;
          }
        }
      }
    }

    // normal per-epoch advance: recompute mappings and feed the map to
    // the PG's peering state machine
    vector<int> newup, newacting;
    int up_primary, acting_primary;
    nextmap->pg_to_up_acting_osds(
      pg->pg_id.pgid,
      &newup, &up_primary,
      &newacting, &acting_primary);
    pg->handle_advance_map(
      nextmap, lastmap, newup, up_primary,
      newacting, acting_primary, rctx);

    auto oldpool = lastmap->get_pools().find(pg->pg_id.pool());
    auto newpool = nextmap->get_pools().find(pg->pg_id.pool());
    if (oldpool != lastmap->get_pools().end()
        && newpool != nextmap->get_pools().end()) {
      dout(20) << __func__
               << " new pool opts " << newpool->second.opts
               << " old pool opts " << oldpool->second.opts
               << dendl;

      double old_min_interval = 0, new_min_interval = 0;
      oldpool->second.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &old_min_interval);
      newpool->second.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &new_min_interval);

      double old_max_interval = 0, new_max_interval = 0;
      oldpool->second.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &old_max_interval);
      newpool->second.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &new_max_interval);

      // Assume if an interval is changed from set to unset or vice versa the
      // actual config is different.  Keep it simple even if it is possible
      // to call resched_all_scrub() unnecessarily.
      if (old_min_interval != new_min_interval || old_max_interval != new_max_interval) {
        pg->on_info_history_change();
      }
    }

    if (new_pg_num && old_pg_num != new_pg_num) {
      // check for split
      set<spg_t> children;
      if (pg->pg_id.is_split(
            old_pg_num,
            new_pg_num,
            &children)) {
        split_pgs(
          pg, children, &new_pgs, lastmap, nextmap,
          rctx);
      }
    }

    lastmap = nextmap;
    old_pg_num = new_pg_num;
    handle.reset_tp_timeout();
  }
  pg->handle_activate_map(rctx);

  ret = true;
 out:
  // register any split children created above for post-commit activation
  if (!new_pgs.empty()) {
    rctx.transaction.register_on_applied(new C_FinishSplits(this, new_pgs));
  }
  return ret;
}
8803
/*
 * Publish the current osdmap to the rest of the OSD: prime shards for
 * splits/merges, push null peering events to every PG so they advance,
 * prune stale state, and refresh the PG perf counters.
 * Caller holds osd_lock (asserted).
 */
void OSD::consume_map()
{
  ceph_assert(ceph_mutex_is_locked(osd_lock));
  auto osdmap = get_osdmap();
  dout(7) << "consume_map version " << osdmap->get_epoch() << dendl;

  /** make sure the cluster is speaking in SORTBITWISE, because we don't
   * speak the older sorting version any more. Be careful not to force
   * a shutdown if we are merely processing old maps, though.
   */
  if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE) && is_active()) {
    derr << __func__ << " SORTBITWISE flag is not set" << dendl;
    ceph_abort();
  }

  service.pre_publish_map(osdmap);
  service.await_reserved_maps();
  service.publish_map(osdmap);

  // prime splits and merges
  set<pair<spg_t,epoch_t>> newly_split;  // splits, and when
  set<pair<spg_t,epoch_t>> merge_pgs;    // merge participants, and when
  for (auto& shard : shards) {
    shard->identify_splits_and_merges(osdmap, &newly_split, &merge_pgs);
  }
  if (!newly_split.empty()) {
    for (auto& shard : shards) {
      // prime_splits consumes the entries belonging to each shard
      shard->prime_splits(osdmap, &newly_split);
    }
    ceph_assert(newly_split.empty());
  }

  // prune sent_ready_to_merge
  service.prune_sent_ready_to_merge(osdmap);

  // FIXME, maybe: We could race against an incoming peering message
  // that instantiates a merge PG after identify_merges() below and
  // never set up its peer to complete the merge.  An OSD restart
  // would clear it up.  This is a hard race to resolve,
  // extraordinarily rare (we only merge PGs that are stable and
  // clean, so it'd have to be an imported PG to an OSD with a
  // slightly stale OSDMap...), so I'm ignoring it for now.  We plan to
  // replace all of this with a seastar-based code soon anyway.
  if (!merge_pgs.empty()) {
    // mark the pgs we already have, or create new and empty merge
    // participants for those we are missing.  do this all under the
    // shard lock so we don't have to worry about racing pg creates
    // via _process.
    for (auto& shard : shards) {
      shard->prime_merges(osdmap, &merge_pgs);
    }
    ceph_assert(merge_pgs.empty());
  }

  service.prune_pg_created();

  unsigned pushes_to_free = 0;
  for (auto& shard : shards) {
    shard->consume_map(osdmap, &pushes_to_free);
  }

  vector<spg_t> pgids;
  _get_pgids(&pgids);

  // count (FIXME, probably during seastar rewrite)
  int num_pg_primary = 0, num_pg_replica = 0, num_pg_stray = 0;
  vector<PGRef> pgs;
  _get_pgs(&pgs);
  for (auto& pg : pgs) {
    // FIXME (probably during seastar rewrite): this is lockless and
    // racy, but we don't want to take pg lock here.
    if (pg->is_primary())
      num_pg_primary++;
    else if (pg->is_nonprimary())
      num_pg_replica++;  // misnomer
    else
      num_pg_stray++;
  }

  {
    // drop pending peer-requested creates for PGs that no longer map to us
    // FIXME (as part of seastar rewrite): move to OSDShard
    std::lock_guard l(pending_creates_lock);
    for (auto pg = pending_creates_from_osd.begin();
         pg != pending_creates_from_osd.end();) {
      if (osdmap->get_pg_acting_role(pg->first, whoami) < 0) {
        dout(10) << __func__ << " pg " << pg->first << " doesn't map here, "
                 << "discarding pending_create_from_osd" << dendl;
        pg = pending_creates_from_osd.erase(pg);
      } else {
        ++pg;
      }
    }
  }

  service.maybe_inject_dispatch_delay();

  dispatch_sessions_waiting_on_map();

  service.maybe_inject_dispatch_delay();

  service.release_reserved_pushes(pushes_to_free);

  // queue null events to push maps down to individual PGs
  for (auto pgid : pgids) {
    enqueue_peering_evt(
      pgid,
      PGPeeringEventRef(
        std::make_shared<PGPeeringEvent>(
          osdmap->get_epoch(),
          osdmap->get_epoch(),
          NullEvt())));
  }
  logger->set(l_osd_pg, pgids.size());
  logger->set(l_osd_pg_primary, num_pg_primary);
  logger->set(l_osd_pg_replica, num_pg_replica);
  logger->set(l_osd_pg_stray, num_pg_stray);
}
8921
8922void OSD::activate_map()
8923{
9f95a23c
TL
8924 ceph_assert(ceph_mutex_is_locked(osd_lock));
8925 auto osdmap = get_osdmap();
7c673cae
FG
8926
8927 dout(7) << "activate_map version " << osdmap->get_epoch() << dendl;
8928
7c673cae
FG
8929 // norecover?
8930 if (osdmap->test_flag(CEPH_OSDMAP_NORECOVER)) {
8931 if (!service.recovery_is_paused()) {
8932 dout(1) << "pausing recovery (NORECOVER flag set)" << dendl;
8933 service.pause_recovery();
8934 }
8935 } else {
8936 if (service.recovery_is_paused()) {
8937 dout(1) << "unpausing recovery (NORECOVER flag unset)" << dendl;
8938 service.unpause_recovery();
8939 }
8940 }
8941
8942 service.activate_map();
8943
8944 // process waiters
8945 take_waiters(waiting_for_osdmap);
8946}
8947
8948bool OSD::require_mon_peer(const Message *m)
8949{
8950 if (!m->get_connection()->peer_is_mon()) {
8951 dout(0) << "require_mon_peer received from non-mon "
8952 << m->get_connection()->get_peer_addr()
8953 << " " << *m << dendl;
8954 return false;
8955 }
8956 return true;
8957}
8958
8959bool OSD::require_mon_or_mgr_peer(const Message *m)
8960{
8961 if (!m->get_connection()->peer_is_mon() &&
8962 !m->get_connection()->peer_is_mgr()) {
8963 dout(0) << "require_mon_or_mgr_peer received from non-mon, non-mgr "
8964 << m->get_connection()->get_peer_addr()
8965 << " " << *m << dendl;
8966 return false;
8967 }
8968 return true;
8969}
8970
8971bool OSD::require_osd_peer(const Message *m)
8972{
8973 if (!m->get_connection()->peer_is_osd()) {
8974 dout(0) << "require_osd_peer received from non-osd "
8975 << m->get_connection()->get_peer_addr()
8976 << " " << *m << dendl;
8977 return false;
8978 }
8979 return true;
8980}
8981
8982bool OSD::require_self_aliveness(const Message *m, epoch_t epoch)
8983{
8984 epoch_t up_epoch = service.get_up_epoch();
8985 if (epoch < up_epoch) {
8986 dout(7) << "from pre-up epoch " << epoch << " < " << up_epoch << dendl;
8987 return false;
8988 }
8989
8990 if (!is_active()) {
8991 dout(7) << "still in boot state, dropping message " << *m << dendl;
8992 return false;
8993 }
8994
8995 return true;
8996}
8997
9f95a23c 8998bool OSD::require_same_peer_instance(const Message *m, const OSDMapRef& map,
7c673cae
FG
8999 bool is_fast_dispatch)
9000{
9001 int from = m->get_source().num();
9002
9003 if (map->is_down(from) ||
11fdf7f2 9004 (map->get_cluster_addrs(from) != m->get_source_addrs())) {
7c673cae
FG
9005 dout(5) << "from dead osd." << from << ", marking down, "
9006 << " msg was " << m->get_source_inst().addr
11fdf7f2
TL
9007 << " expected "
9008 << (map->is_up(from) ?
9009 map->get_cluster_addrs(from) : entity_addrvec_t())
7c673cae
FG
9010 << dendl;
9011 ConnectionRef con = m->get_connection();
9012 con->mark_down();
9f95a23c 9013 if (auto s = ceph::ref_cast<Session>(con->get_priv()); s) {
7c673cae 9014 if (!is_fast_dispatch)
9f95a23c 9015 s->session_dispatch_lock.lock();
7c673cae 9016 clear_session_waiting_on_map(s);
11fdf7f2
TL
9017 con->set_priv(nullptr); // break ref <-> session cycle, if any
9018 s->con.reset();
7c673cae 9019 if (!is_fast_dispatch)
9f95a23c 9020 s->session_dispatch_lock.unlock();
7c673cae
FG
9021 }
9022 return false;
9023 }
9024 return true;
9025}
9026
9027
9028/*
9029 * require that we have same (or newer) map, and that
9030 * the source is the pg primary.
9031 */
9032bool OSD::require_same_or_newer_map(OpRequestRef& op, epoch_t epoch,
9033 bool is_fast_dispatch)
9034{
9035 const Message *m = op->get_req();
9f95a23c 9036 const auto osdmap = get_osdmap();
7c673cae
FG
9037 dout(15) << "require_same_or_newer_map " << epoch
9038 << " (i am " << osdmap->get_epoch() << ") " << m << dendl;
9039
9f95a23c 9040 ceph_assert(ceph_mutex_is_locked(osd_lock));
7c673cae
FG
9041
9042 // do they have a newer map?
9043 if (epoch > osdmap->get_epoch()) {
9044 dout(7) << "waiting for newer map epoch " << epoch
9045 << " > my " << osdmap->get_epoch() << " with " << m << dendl;
9046 wait_for_new_map(op);
9047 return false;
9048 }
9049
9050 if (!require_self_aliveness(op->get_req(), epoch)) {
9051 return false;
9052 }
9053
9054 // ok, our map is same or newer.. do they still exist?
9055 if (m->get_connection()->get_messenger() == cluster_messenger &&
9056 !require_same_peer_instance(op->get_req(), osdmap, is_fast_dispatch)) {
9057 return false;
9058 }
9059
9060 return true;
9061}
9062
9063
9064
9065
9066
9067// ----------------------------------------
9068// pg creation
9069
/*
 * Split 'parent' into the given child pgids: instantiate each child PG,
 * create its collection (wired to the commit queue of the shard it
 * hashes to), carve the parent's on-disk collection and in-memory state
 * into it, and apportion the parent's stats.  Children are inserted
 * (unlocked) into *out_pgs; all on-disk work is staged in rctx.transaction.
 */
void OSD::split_pgs(
  PG *parent,
  const set<spg_t> &childpgids, set<PGRef> *out_pgs,
  OSDMapRef curmap,
  OSDMapRef nextmap,
  PeeringCtx &rctx)
{
  unsigned pg_num = nextmap->get_pg_num(parent->pg_id.pool());
  parent->update_snap_mapper_bits(parent->get_pgid().get_split_bits(pg_num));

  // start_split_stats produces one entry per child plus a final entry
  // for the parent's remaining share (consumed after the loop)
  vector<object_stat_sum_t> updated_stats;
  parent->start_split_stats(childpgids, &updated_stats);

  vector<object_stat_sum_t>::iterator stat_iter = updated_stats.begin();
  for (set<spg_t>::const_iterator i = childpgids.begin();
       i != childpgids.end();
       ++i, ++stat_iter) {
    ceph_assert(stat_iter != updated_stats.end());
    dout(10) << __func__ << " splitting " << *parent << " into " << *i << dendl;
    PG* child = _make_pg(nextmap, *i);
    child->lock(true);
    out_pgs->insert(child);
    child->ch = store->create_new_collection(child->coll);

    {
      // route the child's collection commits to the shard it hashes to
      uint32_t shard_index = i->hash_to_shard(shards.size());
      assert(NULL != shards[shard_index]);
      store->set_collection_commit_queue(child->coll, &(shards[shard_index]->context_queue));
    }

    unsigned split_bits = i->get_split_bits(pg_num);
    dout(10) << " pg_num is " << pg_num
             << ", m_seed " << i->ps()
             << ", split_bits is " << split_bits << dendl;
    parent->split_colls(
      *i,
      split_bits,
      i->ps(),
      &child->get_pool().info,
      rctx.transaction);
    parent->split_into(
      i->pgid,
      child,
      split_bits);

    child->init_collection_pool_opts();

    child->finish_split_stats(*stat_iter, rctx.transaction);
    child->unlock();
  }
  // the remaining stats entry belongs to the parent
  ceph_assert(stat_iter != updated_stats.end());
  parent->finish_split_stats(*stat_iter, rctx.transaction);
}
9123
9124/*
9125 * holding osd_lock
9126 */
9127void OSD::handle_pg_create(OpRequestRef op)
9128{
9f95a23c
TL
9129 // NOTE: this can be removed in P release (mimic is the last version to
9130 // send MOSDPGCreate messages).
9131
9132 auto m = op->get_req<MOSDPGCreate>();
11fdf7f2 9133 ceph_assert(m->get_type() == MSG_OSD_PG_CREATE);
7c673cae
FG
9134
9135 dout(10) << "handle_pg_create " << *m << dendl;
9136
9137 if (!require_mon_peer(op->get_req())) {
9138 return;
9139 }
9140
9141 if (!require_same_or_newer_map(op, m->epoch, false))
9142 return;
9143
9144 op->mark_started();
9145
9f95a23c 9146 const auto osdmap = get_osdmap();
7c673cae
FG
9147 map<pg_t,utime_t>::const_iterator ci = m->ctimes.begin();
9148 for (map<pg_t,pg_create_t>::const_iterator p = m->mkpg.begin();
9149 p != m->mkpg.end();
9150 ++p, ++ci) {
11fdf7f2 9151 ceph_assert(ci != m->ctimes.end() && ci->first == p->first);
7c673cae
FG
9152 epoch_t created = p->second.created;
9153 if (p->second.split_bits) // Skip split pgs
9154 continue;
9155 pg_t on = p->first;
9156
7c673cae
FG
9157 if (!osdmap->have_pg_pool(on.pool())) {
9158 dout(20) << "ignoring pg on deleted pool " << on << dendl;
9159 continue;
9160 }
9161
9162 dout(20) << "mkpg " << on << " e" << created << "@" << ci->second << dendl;
9163
9f95a23c
TL
9164 spg_t pgid;
9165 bool mapped = osdmap->get_primary_shard(on, &pgid);
9166 ceph_assert(mapped);
9167
7c673cae
FG
9168 // is it still ours?
9169 vector<int> up, acting;
9170 int up_primary = -1;
9171 int acting_primary = -1;
9172 osdmap->pg_to_up_acting_osds(on, &up, &up_primary, &acting, &acting_primary);
9f95a23c 9173 int role = osdmap->calc_pg_role(pg_shard_t(whoami, pgid.shard), acting);
7c673cae
FG
9174
9175 if (acting_primary != whoami) {
9176 dout(10) << "mkpg " << on << " not acting_primary (" << acting_primary
9177 << "), my role=" << role << ", skipping" << dendl;
9178 continue;
9179 }
9180
7c673cae 9181
11fdf7f2 9182 PastIntervals pi;
7c673cae
FG
9183 pg_history_t history;
9184 build_initial_pg_history(pgid, created, ci->second, &history, &pi);
9185
11fdf7f2
TL
9186 // The mon won't resend unless the primary changed, so we ignore
9187 // same_interval_since. We'll pass this history with the current
9188 // epoch as the event.
7c673cae
FG
9189 if (history.same_primary_since > m->epoch) {
9190 dout(10) << __func__ << ": got obsolete pg create on pgid "
9191 << pgid << " from epoch " << m->epoch
9192 << ", primary changed in " << history.same_primary_since
9193 << dendl;
9194 continue;
9195 }
11fdf7f2
TL
9196 enqueue_peering_evt(
9197 pgid,
9198 PGPeeringEventRef(
9199 std::make_shared<PGPeeringEvent>(
9200 osdmap->get_epoch(),
9201 osdmap->get_epoch(),
9202 NullEvt(),
9203 true,
9204 new PGCreateInfo(
9205 pgid,
9206 osdmap->get_epoch(),
9207 history,
9208 pi,
9209 true)
9210 )));
7c673cae 9211 }
7c673cae 9212
3efd9988 9213 {
11fdf7f2 9214 std::lock_guard l(pending_creates_lock);
3efd9988
FG
9215 if (pending_creates_from_mon == 0) {
9216 last_pg_create_epoch = m->epoch;
9217 }
9218 }
11fdf7f2 9219
7c673cae
FG
9220 maybe_update_heartbeat_peers();
9221}
9222
9223
9224// ----------------------------------------
9225// peering and recovery
9226
9f95a23c 9227PeeringCtx OSD::create_context()
7c673cae 9228{
9f95a23c 9229 return PeeringCtx(get_osdmap()->require_osd_release);
7c673cae
FG
9230}
9231
9f95a23c 9232void OSD::dispatch_context(PeeringCtx &ctx, PG *pg, OSDMapRef curmap,
7c673cae
FG
9233 ThreadPool::TPHandle *handle)
9234{
11fdf7f2
TL
9235 if (!service.get_osdmap()->is_up(whoami)) {
9236 dout(20) << __func__ << " not up in osdmap" << dendl;
9237 } else if (!is_active()) {
9238 dout(20) << __func__ << " not active" << dendl;
9239 } else {
9f95a23c
TL
9240 for (auto& [osd, ls] : ctx.message_map) {
9241 if (!curmap->is_up(osd)) {
9242 dout(20) << __func__ << " skipping down osd." << osd << dendl;
9243 continue;
9244 }
9245 ConnectionRef con = service.get_con_osd_cluster(
9246 osd, curmap->get_epoch());
9247 if (!con) {
9248 dout(20) << __func__ << " skipping osd." << osd << " (NULL con)"
9249 << dendl;
9250 continue;
9251 }
9252 service.maybe_share_map(con.get(), curmap);
9253 for (auto m : ls) {
9254 con->send_message2(m);
9255 }
9256 ls.clear();
9257 }
7c673cae 9258 }
9f95a23c 9259 if ((!ctx.transaction.empty() || ctx.transaction.has_contexts()) && pg) {
7c673cae 9260 int tr = store->queue_transaction(
11fdf7f2 9261 pg->ch,
9f95a23c 9262 std::move(ctx.transaction), TrackedOpRef(),
7c673cae 9263 handle);
11fdf7f2 9264 ceph_assert(tr == 0);
7c673cae 9265 }
7c673cae
FG
9266}
9267
// Handle an MOSDPGCreate2 from a monitor: queue a peering NullEvt (with a
// PGCreateInfo) for each PG so it gets instantiated, then update
// last_pg_create_epoch. Consumes the message.
void OSD::handle_fast_pg_create(MOSDPGCreate2 *m)
{
  dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
  if (!require_mon_peer(m)) {
    m->put();
    return;
  }
  for (auto& p : m->pgs) {
    spg_t pgid = p.first;
    epoch_t created = p.second.first;
    utime_t created_stamp = p.second.second;
    auto q = m->pg_extra.find(pgid);
    if (q == m->pg_extra.end()) {
      dout(20) << __func__ << " " << pgid << " e" << created
               << "@" << created_stamp
               << " (no history or past_intervals)" << dendl;
      // pre-octopus ... no pg history. this can be removed in Q release.
      enqueue_peering_evt(
        pgid,
        PGPeeringEventRef(
          std::make_shared<PGPeeringEvent>(
            m->epoch,
            m->epoch,
            NullEvt(),
            true,
            new PGCreateInfo(
              pgid,
              created,
              pg_history_t(created, created_stamp),
              PastIntervals(),
              true)
            )));
    } else {
      dout(20) << __func__ << " " << pgid << " e" << created
               << "@" << created_stamp
               << " history " << q->second.first
               << " pi " << q->second.second << dendl;
      // sanity-check: the supplied past_intervals must not extend past the
      // message epoch; otherwise something is inconsistent on the mon side.
      if (!q->second.second.empty() &&
          m->epoch < q->second.second.get_bounds().second) {
        clog->error() << "got pg_create on " << pgid << " epoch " << m->epoch
                      << " and unmatched past_intervals " << q->second.second
                      << " (history " << q->second.first << ")";
      } else {
        enqueue_peering_evt(
          pgid,
          PGPeeringEventRef(
            std::make_shared<PGPeeringEvent>(
              m->epoch,
              m->epoch,
              NullEvt(),
              true,
              new PGCreateInfo(
                pgid,
                m->epoch,
                q->second.first,
                q->second.second,
                true)
              )));
      }
    }
  }

  {
    std::lock_guard l(pending_creates_lock);
    // only advance if no mon-driven creates are still pending
    if (pending_creates_from_mon == 0) {
      last_pg_create_epoch = m->epoch;
    }
  }

  m->put();
}
9339
11fdf7f2 9340void OSD::handle_fast_pg_query(MOSDPGQuery *m)
7c673cae 9341{
11fdf7f2
TL
9342 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9343 if (!require_osd_peer(m)) {
9344 m->put();
7c673cae 9345 return;
11fdf7f2 9346 }
7c673cae 9347 int from = m->get_source().num();
11fdf7f2
TL
9348 for (auto& p : m->pg_list) {
9349 enqueue_peering_evt(
9350 p.first,
9351 PGPeeringEventRef(
9352 std::make_shared<PGPeeringEvent>(
9353 p.second.epoch_sent, p.second.epoch_sent,
9354 MQuery(
9355 p.first,
9356 pg_shard_t(from, p.second.from),
9357 p.second,
9358 p.second.epoch_sent),
9359 false))
7c673cae
FG
9360 );
9361 }
11fdf7f2 9362 m->put();
7c673cae
FG
9363}
9364
// Handle an MOSDPGNotify from a peer OSD: queue an MNotifyRec peering event
// for each notified PG. The event carries a PGCreateInfo (with the sender's
// history/past_intervals) so a missing PG can be instantiated, but with
// create=false since this is a notify rather than a mon-driven create.
// Consumes the message.
void OSD::handle_fast_pg_notify(MOSDPGNotify* m)
{
  dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
  if (!require_osd_peer(m)) {
    m->put();
    return;
  }
  int from = m->get_source().num();
  for (auto& p : m->get_pg_list()) {
    // target shard on this OSD
    spg_t pgid(p.info.pgid.pgid, p.to);
    enqueue_peering_evt(
      pgid,
      PGPeeringEventRef(
        std::make_shared<PGPeeringEvent>(
          p.epoch_sent,
          p.query_epoch,
          MNotifyRec(
            pgid, pg_shard_t(from, p.from),
            p,
            m->get_connection()->get_features()),
          true,
          new PGCreateInfo(
            pgid,
            p.query_epoch,
            p.info.history,
            p.past_intervals,
            false)
          )));
  }
  m->put();
}
9396
11fdf7f2 9397void OSD::handle_fast_pg_info(MOSDPGInfo* m)
7c673cae 9398{
11fdf7f2
TL
9399 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9400 if (!require_osd_peer(m)) {
9401 m->put();
7c673cae
FG
9402 return;
9403 }
11fdf7f2
TL
9404 int from = m->get_source().num();
9405 for (auto& p : m->pg_list) {
9406 enqueue_peering_evt(
9f95a23c 9407 spg_t(p.info.pgid.pgid, p.to),
11fdf7f2
TL
9408 PGPeeringEventRef(
9409 std::make_shared<PGPeeringEvent>(
9f95a23c 9410 p.epoch_sent, p.query_epoch,
11fdf7f2 9411 MInfoRec(
9f95a23c
TL
9412 pg_shard_t(from, p.from),
9413 p.info,
9414 p.epoch_sent)))
11fdf7f2 9415 );
7c673cae 9416 }
11fdf7f2 9417 m->put();
7c673cae
FG
9418}
9419
11fdf7f2 9420void OSD::handle_fast_pg_remove(MOSDPGRemove *m)
7c673cae 9421{
11fdf7f2
TL
9422 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9423 if (!require_osd_peer(m)) {
9424 m->put();
7c673cae
FG
9425 return;
9426 }
11fdf7f2
TL
9427 for (auto& pgid : m->pg_list) {
9428 enqueue_peering_evt(
9429 pgid,
9430 PGPeeringEventRef(
9431 std::make_shared<PGPeeringEvent>(
9432 m->get_epoch(), m->get_epoch(),
9f95a23c 9433 PeeringState::DeleteStart())));
7c673cae 9434 }
11fdf7f2 9435 m->put();
7c673cae
FG
9436}
9437
11fdf7f2 9438void OSD::handle_fast_force_recovery(MOSDForceRecovery *m)
c07f9fc5 9439{
11fdf7f2
TL
9440 dout(10) << __func__ << " " << *m << dendl;
9441 if (!require_mon_or_mgr_peer(m)) {
9442 m->put();
9443 return;
9444 }
9445 epoch_t epoch = get_osdmap_epoch();
9446 for (auto pgid : m->forced_pgs) {
9447 if (m->options & OFR_BACKFILL) {
9448 if (m->options & OFR_CANCEL) {
9449 enqueue_peering_evt(
9450 pgid,
9451 PGPeeringEventRef(
9452 std::make_shared<PGPeeringEvent>(
9453 epoch, epoch,
9f95a23c 9454 PeeringState::UnsetForceBackfill())));
11fdf7f2
TL
9455 } else {
9456 enqueue_peering_evt(
9457 pgid,
9458 PGPeeringEventRef(
9459 std::make_shared<PGPeeringEvent>(
9460 epoch, epoch,
9f95a23c 9461 PeeringState::SetForceBackfill())));
11fdf7f2
TL
9462 }
9463 } else if (m->options & OFR_RECOVERY) {
9464 if (m->options & OFR_CANCEL) {
9465 enqueue_peering_evt(
9466 pgid,
9467 PGPeeringEventRef(
9468 std::make_shared<PGPeeringEvent>(
9469 epoch, epoch,
9f95a23c 9470 PeeringState::UnsetForceRecovery())));
11fdf7f2
TL
9471 } else {
9472 enqueue_peering_evt(
9473 pgid,
9474 PGPeeringEventRef(
9475 std::make_shared<PGPeeringEvent>(
9476 epoch, epoch,
9f95a23c 9477 PeeringState::SetForceRecovery())));
c07f9fc5
FG
9478 }
9479 }
9480 }
11fdf7f2 9481 m->put();
c07f9fc5 9482}
7c673cae 9483
// Answer a pg_query for a PG that does not exist on this OSD. If the pool
// still exists, reply to the querying OSD with an empty pg_info_t: a LOG /
// FULLLOG query gets an MOSDPGLog, anything else gets an MOSDPGNotify.
// If the pool is gone, the query is silently dropped.
void OSD::handle_pg_query_nopg(const MQuery& q)
{
  spg_t pgid = q.pgid;
  dout(10) << __func__ << " " << pgid << dendl;

  OSDMapRef osdmap = get_osdmap();
  if (!osdmap->have_pg_pool(pgid.pool()))
    return;

  dout(10) << " pg " << pgid << " dne" << dendl;
  // empty info stamped with the shard the querier asked about
  pg_info_t empty(spg_t(pgid.pgid, q.query.to));
  ConnectionRef con = service.get_con_osd_cluster(q.from.osd, osdmap->get_epoch());
  if (con) {
    Message *m;
    if (q.query.type == pg_query_t::LOG ||
        q.query.type == pg_query_t::FULLLOG) {
      m = new MOSDPGLog(
        q.query.from, q.query.to,
        osdmap->get_epoch(), empty,
        q.query.epoch_sent);
    } else {
      vector<pg_notify_t> ls;
      ls.push_back(
        pg_notify_t(
          q.query.from, q.query.to,
          q.query.epoch_sent,
          osdmap->get_epoch(),
          empty,
          PastIntervals()));
      m = new MOSDPGNotify(osdmap->get_epoch(), std::move(ls));
    }
    // share our map before the reply so the peer can interpret it
    service.maybe_share_map(con.get(), osdmap);
    con->send_message(m);
  }
}
9519
// Queue a CheckReadable peering event for the given PG, either immediately
// (delay == 0) or after `delay` via the mono_timer, in which case the timer
// callback re-enters this function with the default (zero) delay.
void OSDService::queue_check_readable(spg_t spgid,
                                      epoch_t lpr,
                                      ceph::signedspan delay)
{
  if (delay == ceph::signedspan::zero()) {
    osd->enqueue_peering_evt(
      spgid,
      PGPeeringEventRef(
        std::make_shared<PGPeeringEvent>(
          lpr, lpr,
          PeeringState::CheckReadable())));
  } else {
    // capture by value: the event may fire after the caller's frame is gone
    mono_timer.add_event(
      delay,
      [this, spgid, lpr]() {
        queue_check_readable(spgid, lpr);
      });
  }
}
9539
7c673cae 9540
7c673cae
FG
9541// =========================================================
9542// RECOVERY
9543
// Drain the awaiting_throttle queue while the recovery throttle allows it,
// reserving up to osd_recovery_max_single_start pushes per PG.
// Caller must hold recovery_lock.
void OSDService::_maybe_queue_recovery() {
  ceph_assert(ceph_mutex_is_locked_by_me(recovery_lock));
  uint64_t available_pushes;
  while (!awaiting_throttle.empty() &&
         _recover_now(&available_pushes)) {
    uint64_t to_start = std::min(
      available_pushes,
      cct->_conf->osd_recovery_max_single_start);
    _queue_for_recovery(awaiting_throttle.front(), to_start);
    awaiting_throttle.pop_front();
    dout(10) << __func__ << " starting " << to_start
             << ", recovery_ops_reserved " << recovery_ops_reserved
             << " -> " << (recovery_ops_reserved + to_start) << dendl;
    // reservation is released later via release_reserved_pushes()
    recovery_ops_reserved += to_start;
  }
}
9560
9561bool OSDService::_recover_now(uint64_t *available_pushes)
9562{
9563 if (available_pushes)
9564 *available_pushes = 0;
9565
9566 if (ceph_clock_now() < defer_recovery_until) {
9567 dout(15) << __func__ << " defer until " << defer_recovery_until << dendl;
9568 return false;
9569 }
9570
9571 if (recovery_paused) {
9572 dout(15) << __func__ << " paused" << dendl;
9573 return false;
9574 }
9575
9f95a23c 9576 uint64_t max = osd->get_recovery_max_active();
7c673cae
FG
9577 if (max <= recovery_ops_active + recovery_ops_reserved) {
9578 dout(15) << __func__ << " active " << recovery_ops_active
9579 << " + reserved " << recovery_ops_reserved
9580 << " >= max " << max << dendl;
9581 return false;
9582 }
9583
9584 if (available_pushes)
9585 *available_pushes = max - recovery_ops_active - recovery_ops_reserved;
9586
9587 return true;
9588}
9589
9f95a23c
TL
9590unsigned OSDService::get_target_pg_log_entries() const
9591{
9592 auto num_pgs = osd->get_num_pgs();
9593 auto target = cct->_conf->osd_target_pg_log_entries_per_osd;
9594 if (num_pgs > 0 && target > 0) {
9595 // target an even spread of our budgeted log entries across all
9596 // PGs. note that while we only get to control the entry count
9597 // for primary PGs, we'll normally be responsible for a mix of
9598 // primary and replica PGs (for the same pool(s) even), so this
9599 // will work out.
9600 return std::max<unsigned>(
9601 std::min<unsigned>(target / num_pgs,
9602 cct->_conf->osd_max_pg_log_entries),
9603 cct->_conf->osd_min_pg_log_entries);
9604 } else {
9605 // fall back to a per-pg value.
9606 return cct->_conf->osd_min_pg_log_entries;
9607 }
9608}
9609
// Run up to reserved_pushes recovery ops on a PG. If osd_recovery_sleep is
// in effect and a sleep is due, the work is instead re-queued via a timer
// callback and this invocation returns early WITHOUT releasing the
// reservation (the requeued event will). On the normal path the
// reservation is always released at the end.
void OSD::do_recovery(
  PG *pg, epoch_t queued, uint64_t reserved_pushes,
  ThreadPool::TPHandle &handle)
{
  uint64_t started = 0;

  /*
   * When the value of osd_recovery_sleep is set greater than zero, recovery
   * ops are scheduled after osd_recovery_sleep amount of time from the previous
   * recovery event's schedule time. This is done by adding a
   * recovery_requeue_callback event, which re-queues the recovery op using
   * queue_recovery_after_sleep.
   */
  float recovery_sleep = get_osd_recovery_sleep();
  {
    std::lock_guard l(service.sleep_lock);
    if (recovery_sleep > 0 && service.recovery_needs_sleep) {
      PGRef pgref(pg);
      // holds a PG ref until the timer fires so the PG cannot go away
      auto recovery_requeue_callback = new LambdaContext([this, pgref, queued, reserved_pushes](int r) {
        dout(20) << "do_recovery wake up at "
                 << ceph_clock_now()
                 << ", re-queuing recovery" << dendl;
        std::lock_guard l(service.sleep_lock);
        service.recovery_needs_sleep = false;
        service.queue_recovery_after_sleep(pgref.get(), queued, reserved_pushes);
      });

      // This is true for the first recovery op and when the previous recovery op
      // has been scheduled in the past. The next recovery op is scheduled after
      // completing the sleep from now.

      if (auto now = ceph::real_clock::now();
          service.recovery_schedule_time < now) {
        service.recovery_schedule_time = now;
      }
      service.recovery_schedule_time += ceph::make_timespan(recovery_sleep);
      service.sleep_timer.add_event_at(service.recovery_schedule_time,
                                       recovery_requeue_callback);
      dout(20) << "Recovery event scheduled at "
               << service.recovery_schedule_time << dendl;
      return;
    }
  }

  {
    {
      std::lock_guard l(service.sleep_lock);
      // the next do_recovery() pass must sleep again
      service.recovery_needs_sleep = true;
    }

    // PG interval changed since this work was queued; nothing to do
    if (pg->pg_has_reset_since(queued)) {
      goto out;
    }

    dout(10) << "do_recovery starting " << reserved_pushes << " " << *pg << dendl;
#ifdef DEBUG_RECOVERY_OIDS
    dout(20) << " active was " << service.recovery_oids[pg->pg_id] << dendl;
#endif

    bool do_unfound = pg->start_recovery_ops(reserved_pushes, handle, &started);
    dout(10) << "do_recovery started " << started << "/" << reserved_pushes
             << " on " << *pg << dendl;

    if (do_unfound) {
      // recovery hit unfound objects: run discovery through peering
      PeeringCtx rctx = create_context();
      rctx.handle = &handle;
      pg->find_unfound(queued, rctx);
      dispatch_context(rctx, pg, pg->get_osdmap());
    }
  }

 out:
  ceph_assert(started <= reserved_pushes);
  service.release_reserved_pushes(reserved_pushes);
}
9685
// Account the start of one recovery op for `soid` on `pg`, bumping
// recovery_ops_active under recovery_lock. With DEBUG_RECOVERY_OIDS the
// object is also tracked per-PG and double-starts are asserted against.
void OSDService::start_recovery_op(PG *pg, const hobject_t& soid)
{
  std::lock_guard l(recovery_lock);
  dout(10) << "start_recovery_op " << *pg << " " << soid
           << " (" << recovery_ops_active << "/"
           << osd->get_recovery_max_active() << " rops)"
           << dendl;
  recovery_ops_active++;

#ifdef DEBUG_RECOVERY_OIDS
  dout(20) << " active was " << recovery_oids[pg->pg_id] << dendl;
  ceph_assert(recovery_oids[pg->pg_id].count(soid) == 0);
  recovery_oids[pg->pg_id].insert(soid);
#endif
}
9701
// Account the completion of one recovery op for `soid` on `pg`, then see if
// throttled recovery work can now proceed. Mirrors start_recovery_op();
// the DEBUG_RECOVERY_OIDS bookkeeping asserts the op was actually started.
void OSDService::finish_recovery_op(PG *pg, const hobject_t& soid, bool dequeue)
{
  std::lock_guard l(recovery_lock);
  dout(10) << "finish_recovery_op " << *pg << " " << soid
           << " dequeue=" << dequeue
           << " (" << recovery_ops_active << "/"
           << osd->get_recovery_max_active() << " rops)"
           << dendl;

  // adjust count
  ceph_assert(recovery_ops_active > 0);
  recovery_ops_active--;

#ifdef DEBUG_RECOVERY_OIDS
  dout(20) << " active oids was " << recovery_oids[pg->pg_id] << dendl;
  ceph_assert(recovery_oids[pg->pg_id].count(soid));
  recovery_oids[pg->pg_id].erase(soid);
#endif

  // freed capacity may unblock queued recovery work
  _maybe_queue_recovery();
}
9723
9724bool OSDService::is_recovery_active()
9725{
eafe8130
TL
9726 if (cct->_conf->osd_debug_pretend_recovery_active) {
9727 return true;
9728 }
b5b8bbf5 9729 return local_reserver.has_reservation() || remote_reserver.has_reservation();
7c673cae
FG
9730}
9731
// Return `pushes` previously-reserved recovery pushes to the pool and see
// if throttled recovery work can now be queued.
void OSDService::release_reserved_pushes(uint64_t pushes)
{
  std::lock_guard l(recovery_lock);
  dout(10) << __func__ << "(" << pushes << "), recovery_ops_reserved "
           << recovery_ops_reserved << " -> " << (recovery_ops_reserved-pushes)
           << dendl;
  ceph_assert(recovery_ops_reserved >= pushes);
  recovery_ops_reserved -= pushes;
  _maybe_queue_recovery();
}
9742
7c673cae
FG
9743// =========================================================
9744// OPS
9745
9746bool OSD::op_is_discardable(const MOSDOp *op)
9747{
9748 // drop client request if they are not connected and can't get the
9749 // reply anyway.
9750 if (!op->get_connection()->is_connected()) {
9751 return true;
9752 }
9753 return false;
9754}
9755
// Queue a client/peer op on the sharded op queue for the given PG.
// Recovery push/push-reply messages are wrapped in PGRecoveryMsg items so
// the scheduler can classify them; everything else becomes a PGOpItem.
void OSD::enqueue_op(spg_t pg, OpRequestRef&& op, epoch_t epoch)
{
  const utime_t stamp = op->get_req()->get_recv_stamp();
  const utime_t latency = ceph_clock_now() - stamp;
  const unsigned priority = op->get_req()->get_priority();
  const int cost = op->get_req()->get_cost();
  const uint64_t owner = op->get_req()->get_source().num();
  const int type = op->get_req()->get_type();

  dout(15) << "enqueue_op " << op << " prio " << priority
           << " type " << type
           << " cost " << cost
           << " latency " << latency
           << " epoch " << epoch
           << " " << *(op->get_req()) << dendl;
  op->osd_trace.event("enqueue op");
  op->osd_trace.keyval("priority", priority);
  op->osd_trace.keyval("cost", cost);
#ifdef HAVE_JAEGER
  if (op->osd_parent_span) {
    auto enqueue_span = jaeger_tracing::child_span(__func__, op->osd_parent_span);
    enqueue_span->Log({
      {"priority", priority},
      {"cost", cost},
      {"epoch", epoch},
      {"owner", owner},
      {"type", type}
    });
  }
#endif
  op->mark_queued_for_pg();
  logger->tinc(l_osd_op_before_queue_op_lat, latency);
  if (type == MSG_OSD_PG_PUSH ||
      type == MSG_OSD_PG_PUSH_REPLY) {
    op_shardedwq.queue(
      OpSchedulerItem(
        unique_ptr<OpSchedulerItem::OpQueueable>(new PGRecoveryMsg(pg, std::move(op))),
        cost, priority, stamp, owner, epoch));
  } else {
    op_shardedwq.queue(
      OpSchedulerItem(
        unique_ptr<OpSchedulerItem::OpQueueable>(new PGOpItem(pg, std::move(op))),
        cost, priority, stamp, owner, epoch));
  }
}
9801
// Queue a peering event for a PG on the sharded op queue at
// osd_peering_op_priority, with a nominal cost of 10.
void OSD::enqueue_peering_evt(spg_t pgid, PGPeeringEventRef evt)
{
  dout(15) << __func__ << " " << pgid << " " << evt->get_desc() << dendl;
  op_shardedwq.queue(
    OpSchedulerItem(
      unique_ptr<OpSchedulerItem::OpQueueable>(new PGPeeringItem(pgid, evt)),
      10,   // nominal cost
      cct->_conf->osd_peering_op_priority,
      utime_t(),
      0,
      evt->get_epoch_sent()));
}
7c673cae
FG
9814
9815/*
9816 * NOTE: dequeue called in worker thread, with pg lock
9817 */
/*
 * NOTE: dequeue called in worker thread, with pg lock
 */
// Hand a dequeued op to its PG: record timing, share the map with the
// sender if needed, then run pg->do_request(). Ops for a deleting PG are
// dropped without processing.
void OSD::dequeue_op(
  PGRef pg, OpRequestRef op,
  ThreadPool::TPHandle &handle)
{
  const Message *m = op->get_req();

  FUNCTRACE(cct);
  OID_EVENT_TRACE_WITH_MSG(m, "DEQUEUE_OP_BEGIN", false);

  utime_t now = ceph_clock_now();
  op->set_dequeued_time(now);

  utime_t latency = now - m->get_recv_stamp();
  dout(10) << "dequeue_op " << op << " prio " << m->get_priority()
           << " cost " << m->get_cost()
           << " latency " << latency
           << " " << *m
           << " pg " << *pg << dendl;

  logger->tinc(l_osd_op_before_dequeue_op_lat, latency);

  service.maybe_share_map(m->get_connection().get(),
                          pg->get_osdmap(),
                          op->sent_epoch);

  if (pg->is_deleting())
    return;

  op->mark_reached_pg();
  op->osd_trace.event("dequeue_op");

  pg->do_request(op, handle);

  // finish
  dout(10) << "dequeue_op " << op << " finish" << dendl;
  OID_EVENT_TRACE_WITH_MSG(m, "DEQUEUE_OP_END", false);
}
9855
9856
// Process a dequeued peering event. When no PG exists, only MQuery events
// are serviceable (via handle_pg_query_nopg); anything else aborts. With a
// PG, the PG is first advanced to the shard's map epoch, the event is
// delivered, and the resulting context is dispatched. Called with the PG
// lock held (when pg != nullptr); the lock is released here.
void OSD::dequeue_peering_evt(
  OSDShard *sdata,
  PG *pg,
  PGPeeringEventRef evt,
  ThreadPool::TPHandle& handle)
{
  PeeringCtx rctx = create_context();
  auto curmap = sdata->get_osdmap();
  bool need_up_thru = false;
  epoch_t same_interval_since = 0;
  if (!pg) {
    if (const MQuery *q = dynamic_cast<const MQuery*>(evt->evt.get())) {
      handle_pg_query_nopg(*q);
    } else {
      derr << __func__ << " unrecognized pg-less event " << evt->get_desc() << dendl;
      ceph_abort();
    }
  } else if (advance_pg(curmap->get_epoch(), pg, handle, rctx)) {
    pg->do_peering_event(evt, rctx);
    if (pg->is_deleted()) {
      // PG went away while handling the event; nothing left to dispatch
      pg->unlock();
      return;
    }
    dispatch_context(rctx, pg, curmap, &handle);
    // capture these before dropping the PG lock
    need_up_thru = pg->get_need_up_thru();
    same_interval_since = pg->get_same_interval_since();
    pg->unlock();
  }

  if (need_up_thru) {
    queue_want_up_thru(same_interval_since);
  }

  service.send_pg_temp();
}
9892
// Continue deletion of a PG by feeding a DeleteSome peering event through
// the normal dequeue_peering_evt path.
void OSD::dequeue_delete(
  OSDShard *sdata,
  PG *pg,
  epoch_t e,
  ThreadPool::TPHandle& handle)
{
  dequeue_peering_evt(
    sdata,
    pg,
    PGPeeringEventRef(
      std::make_shared<PGPeeringEvent>(
        e, e,
        PeeringState::DeleteSome())),
    handle);
}
9908
9909
9910
7c673cae
FG
9911// --------------------------------
9912
9913const char** OSD::get_tracked_conf_keys() const
9914{
9915 static const char* KEYS[] = {
9916 "osd_max_backfills",
9917 "osd_min_recovery_priority",
224ce89b
WB
9918 "osd_max_trimming_pgs",
9919 "osd_op_complaint_time",
9920 "osd_op_log_threshold",
9921 "osd_op_history_size",
9922 "osd_op_history_duration",
9923 "osd_op_history_slow_op_size",
9924 "osd_op_history_slow_op_threshold",
7c673cae
FG
9925 "osd_enable_op_tracker",
9926 "osd_map_cache_size",
11fdf7f2 9927 "osd_pg_epoch_max_lag_factor",
7c673cae 9928 "osd_pg_epoch_persisted_max_stale",
f67539c2
TL
9929 "osd_recovery_sleep",
9930 "osd_recovery_sleep_hdd",
9931 "osd_recovery_sleep_ssd",
9932 "osd_recovery_sleep_hybrid",
b3b6e05e
TL
9933 "osd_delete_sleep",
9934 "osd_delete_sleep_hdd",
9935 "osd_delete_sleep_ssd",
9936 "osd_delete_sleep_hybrid",
9937 "osd_snap_trim_sleep",
9938 "osd_snap_trim_sleep_hdd",
9939 "osd_snap_trim_sleep_ssd",
9940 "osd_snap_trim_sleep_hybrid"
9941 "osd_scrub_sleep",
f67539c2
TL
9942 "osd_recovery_max_active",
9943 "osd_recovery_max_active_hdd",
9944 "osd_recovery_max_active_ssd",
7c673cae
FG
9945 // clog & admin clog
9946 "clog_to_monitors",
9947 "clog_to_syslog",
9948 "clog_to_syslog_facility",
9949 "clog_to_syslog_level",
9950 "osd_objectstore_fuse",
9951 "clog_to_graylog",
9952 "clog_to_graylog_host",
9953 "clog_to_graylog_port",
9954 "host",
9955 "fsid",
9956 "osd_recovery_delay_start",
9957 "osd_client_message_size_cap",
9958 "osd_client_message_cap",
31f18b77
FG
9959 "osd_heartbeat_min_size",
9960 "osd_heartbeat_interval",
9f95a23c 9961 "osd_object_clean_region_max_num_intervals",
494da23a
TL
9962 "osd_scrub_min_interval",
9963 "osd_scrub_max_interval",
7c673cae
FG
9964 NULL
9965 };
9966 return KEYS;
9967}
9968
// Config observer callback: apply runtime changes for every key listed in
// get_tracked_conf_keys(). Takes osd_lock for the duration.
void OSD::handle_conf_change(const ConfigProxy& conf,
                             const std::set <std::string> &changed)
{
  std::lock_guard l{osd_lock};

  // recovery/backfill/sleep knobs: with the mclock scheduler these are
  // overridden wholesale; otherwise only osd_max_backfills is applied here.
  if (changed.count("osd_max_backfills") ||
      changed.count("osd_delete_sleep") ||
      changed.count("osd_delete_sleep_hdd") ||
      changed.count("osd_delete_sleep_ssd") ||
      changed.count("osd_delete_sleep_hybrid") ||
      changed.count("osd_snap_trim_sleep") ||
      changed.count("osd_snap_trim_sleep_hdd") ||
      changed.count("osd_snap_trim_sleep_ssd") ||
      changed.count("osd_snap_trim_sleep_hybrid") ||
      changed.count("osd_scrub_sleep") ||
      changed.count("osd_recovery_sleep") ||
      changed.count("osd_recovery_sleep_hdd") ||
      changed.count("osd_recovery_sleep_ssd") ||
      changed.count("osd_recovery_sleep_hybrid") ||
      changed.count("osd_recovery_max_active") ||
      changed.count("osd_recovery_max_active_hdd") ||
      changed.count("osd_recovery_max_active_ssd")) {
    if (!maybe_override_options_for_qos() &&
        changed.count("osd_max_backfills")) {
      // Scheduler is not "mclock". Fallback to earlier behavior
      service.local_reserver.set_max(cct->_conf->osd_max_backfills);
      service.remote_reserver.set_max(cct->_conf->osd_max_backfills);
    }
  }
  if (changed.count("osd_min_recovery_priority")) {
    service.local_reserver.set_min_priority(cct->_conf->osd_min_recovery_priority);
    service.remote_reserver.set_min_priority(cct->_conf->osd_min_recovery_priority);
  }
  if (changed.count("osd_max_trimming_pgs")) {
    service.snap_reserver.set_max(cct->_conf->osd_max_trimming_pgs);
  }
  if (changed.count("osd_op_complaint_time") ||
      changed.count("osd_op_log_threshold")) {
    op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time,
                                           cct->_conf->osd_op_log_threshold);
  }
  if (changed.count("osd_op_history_size") ||
      changed.count("osd_op_history_duration")) {
    op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
                                             cct->_conf->osd_op_history_duration);
  }
  if (changed.count("osd_op_history_slow_op_size") ||
      changed.count("osd_op_history_slow_op_threshold")) {
    op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size,
                                                      cct->_conf->osd_op_history_slow_op_threshold);
  }
  if (changed.count("osd_enable_op_tracker")) {
    op_tracker.set_tracking(cct->_conf->osd_enable_op_tracker);
  }
  if (changed.count("osd_map_cache_size")) {
    service.map_cache.set_size(cct->_conf->osd_map_cache_size);
    service.map_bl_cache.set_size(cct->_conf->osd_map_cache_size);
    service.map_bl_inc_cache.set_size(cct->_conf->osd_map_cache_size);
  }
  if (changed.count("clog_to_monitors") ||
      changed.count("clog_to_syslog") ||
      changed.count("clog_to_syslog_level") ||
      changed.count("clog_to_syslog_facility") ||
      changed.count("clog_to_graylog") ||
      changed.count("clog_to_graylog_host") ||
      changed.count("clog_to_graylog_port") ||
      changed.count("host") ||
      changed.count("fsid")) {
    update_log_config();
  }
  if (changed.count("osd_pg_epoch_max_lag_factor")) {
    m_osd_pg_epoch_max_lag_factor = conf.get_val<double>(
      "osd_pg_epoch_max_lag_factor");
  }

#ifdef HAVE_LIBFUSE
  if (changed.count("osd_objectstore_fuse")) {
    if (store) {
      enable_disable_fuse(false);
    }
  }
#endif

  if (changed.count("osd_recovery_delay_start")) {
    service.defer_recovery(cct->_conf->osd_recovery_delay_start);
    service.kick_recovery_queue();
  }

  if (changed.count("osd_client_message_cap")) {
    uint64_t newval = cct->_conf->osd_client_message_cap;
    Messenger::Policy pol = client_messenger->get_policy(entity_name_t::TYPE_CLIENT);
    if (pol.throttler_messages && newval > 0) {
      pol.throttler_messages->reset_max(newval);
    }
  }
  if (changed.count("osd_client_message_size_cap")) {
    uint64_t newval = cct->_conf->osd_client_message_size_cap;
    Messenger::Policy pol = client_messenger->get_policy(entity_name_t::TYPE_CLIENT);
    if (pol.throttler_bytes && newval > 0) {
      pol.throttler_bytes->reset_max(newval);
    }
  }
  if (changed.count("osd_object_clean_region_max_num_intervals")) {
    ObjectCleanRegions::set_max_num_intervals(cct->_conf->osd_object_clean_region_max_num_intervals);
  }

  if (changed.count("osd_scrub_min_interval") ||
      changed.count("osd_scrub_max_interval")) {
    resched_all_scrubs();
    dout(0) << __func__ << ": scrub interval change" << dendl;
  }
  check_config();
  if (changed.count("osd_asio_thread_count")) {
    // NOTE(review): "osd_asio_thread_count" is checked here but does not
    // appear in get_tracked_conf_keys() above — confirm it is delivered.
    service.poolctx.stop();
    service.poolctx.start(conf.get_val<std::uint64_t>("osd_asio_thread_count"));
  }
}
10086
// If the mclock scheduler is configured, override the recovery/backfill
// limits and all *sleep options so mclock alone governs QoS: max-active
// and max-backfills are raised to effectively-unlimited values and every
// sleep is disabled. Returns true when the overrides were applied (i.e.
// osd_op_queue == "mclock_scheduler"), false otherwise.
bool OSD::maybe_override_options_for_qos()
{
  // If the scheduler enabled is mclock, override the recovery, backfill
  // and sleep options so that mclock can meet the QoS goals.
  if (cct->_conf.get_val<std::string>("osd_op_queue") == "mclock_scheduler") {
    dout(1) << __func__
            << ": Changing recovery/backfill/sleep settings for QoS" << dendl;

    // Set high value for recovery max active
    uint32_t rec_max_active = 1000;
    cct->_conf.set_val(
      "osd_recovery_max_active", std::to_string(rec_max_active));
    cct->_conf.set_val(
      "osd_recovery_max_active_hdd", std::to_string(rec_max_active));
    cct->_conf.set_val(
      "osd_recovery_max_active_ssd", std::to_string(rec_max_active));

    // Set high value for osd_max_backfill
    uint32_t max_backfills = 1000;
    cct->_conf.set_val("osd_max_backfills", std::to_string(max_backfills));
    service.local_reserver.set_max(max_backfills);
    service.remote_reserver.set_max(max_backfills);

    // Disable recovery sleep
    cct->_conf.set_val("osd_recovery_sleep", std::to_string(0));
    cct->_conf.set_val("osd_recovery_sleep_hdd", std::to_string(0));
    cct->_conf.set_val("osd_recovery_sleep_ssd", std::to_string(0));
    cct->_conf.set_val("osd_recovery_sleep_hybrid", std::to_string(0));

    // Disable delete sleep
    cct->_conf.set_val("osd_delete_sleep", std::to_string(0));
    cct->_conf.set_val("osd_delete_sleep_hdd", std::to_string(0));
    cct->_conf.set_val("osd_delete_sleep_ssd", std::to_string(0));
    cct->_conf.set_val("osd_delete_sleep_hybrid", std::to_string(0));

    // Disable snap trim sleep
    cct->_conf.set_val("osd_snap_trim_sleep", std::to_string(0));
    cct->_conf.set_val("osd_snap_trim_sleep_hdd", std::to_string(0));
    cct->_conf.set_val("osd_snap_trim_sleep_ssd", std::to_string(0));
    cct->_conf.set_val("osd_snap_trim_sleep_hybrid", std::to_string(0));

    // Disable scrub sleep
    cct->_conf.set_val("osd_scrub_sleep", std::to_string(0));
    return true;
  }
  return false;
}
10134
// Re-parse the clog-related options and push the new routing (monitors /
// syslog / graylog) into the cluster log client. Only applied when parsing
// succeeds; the resulting monitor routing is always logged.
void OSD::update_log_config()
{
  map<string,string> log_to_monitors;
  map<string,string> log_to_syslog;
  map<string,string> log_channel;
  map<string,string> log_prio;
  map<string,string> log_to_graylog;
  map<string,string> log_to_graylog_host;
  map<string,string> log_to_graylog_port;
  uuid_d fsid;
  string host;

  if (parse_log_client_options(cct, log_to_monitors, log_to_syslog,
                               log_channel, log_prio, log_to_graylog,
                               log_to_graylog_host, log_to_graylog_port,
                               fsid, host) == 0)
    clog->update_config(log_to_monitors, log_to_syslog,
                        log_channel, log_prio, log_to_graylog,
                        log_to_graylog_host, log_to_graylog_port,
                        fsid, host);
  derr << "log_to_monitors " << log_to_monitors << dendl;
}
10157
// Emit cluster-log warnings for config combinations that are legal but
// likely misconfigured. Does not change any settings.
void OSD::check_config()
{
  // some sanity checks
  if (cct->_conf->osd_map_cache_size <= (int)cct->_conf->osd_pg_epoch_persisted_max_stale + 2) {
    clog->warn() << "osd_map_cache_size (" << cct->_conf->osd_map_cache_size << ")"
                 << " is not > osd_pg_epoch_persisted_max_stale ("
                 << cct->_conf->osd_pg_epoch_persisted_max_stale << ")";
  }
  if (cct->_conf->osd_object_clean_region_max_num_intervals < 0) {
    clog->warn() << "osd_object_clean_region_max_num_intervals ("
                 << cct->_conf->osd_object_clean_region_max_num_intervals
                 << ") is < 0";
  }
}
10172
7c673cae
FG
10173// --------------------------------
10174
// Block until the objecter has fetched the latest osdmap from the monitors.
// Any error from the wait is captured in `ec` but deliberately ignored;
// this call is best-effort.
void OSD::get_latest_osdmap()
{
  dout(10) << __func__ << " -- start" << dendl;

  boost::system::error_code ec;
  service.objecter->wait_for_latest_osdmap(ceph::async::use_blocked[ec]);

  dout(10) << __func__ << " -- finish" << dendl;
}
10184
10185// --------------------------------
10186
// Install a new set of dynamic perf-metric queries from the mgr. Queries
// with an empty key descriptor are unsupported and dropped. The accepted
// set is stored under m_perf_queries_lock and then pushed to every PG.
void OSD::set_perf_queries(const ConfigPayload &config_payload) {
  const OSDConfigPayload &osd_config_payload = boost::get<OSDConfigPayload>(config_payload);
  const std::map<OSDPerfMetricQuery, OSDPerfMetricLimits> &queries = osd_config_payload.config;
  dout(10) << "setting " << queries.size() << " queries" << dendl;

  std::list<OSDPerfMetricQuery> supported_queries;
  for (auto &it : queries) {
    auto &query = it.first;
    if (!query.key_descriptor.empty()) {
      supported_queries.push_back(query);
    }
  }
  if (supported_queries.size() < queries.size()) {
    dout(1) << queries.size() - supported_queries.size()
            << " unsupported queries" << dendl;
  }
  {
    std::lock_guard locker{m_perf_queries_lock};
    m_perf_queries = supported_queries;
    m_perf_limits = queries;
  }
  std::vector<PGRef> pgs;
  _get_pgs(&pgs);
  for (auto& pg : pgs) {
    std::scoped_lock l{*pg};
    pg->set_dynamic_perf_stats_queries(supported_queries);
  }
}
10215
9f95a23c
TL
// Gather dynamic perf-stat reports from every PG for the currently
// installed queries and package them into an OSDMetricPayload for the mgr.
MetricPayload OSD::get_perf_reports() {
  OSDMetricPayload payload;
  std::map<OSDPerfMetricQuery, OSDPerfMetricReport> &reports = payload.report;

  std::vector<PGRef> pgs;
  _get_pgs(&pgs);
  DynamicPerfStats dps;
  for (auto& pg : pgs) {
    // m_perf_queries can be modified only in set_perf_queries by mgr client
    // request, and it is protected by the mgr client's lock, which is held
    // when set_perf_queries/get_perf_reports are called, so we may not hold
    // m_perf_queries_lock here.
    DynamicPerfStats pg_dps(m_perf_queries);
    pg->lock();
    pg->get_dynamic_perf_stats(&pg_dps);
    pg->unlock();
    // fold this PG's stats into the OSD-wide aggregate
    dps.merge(pg_dps);
  }
  // apply per-query limits and emit the final report map
  dps.add_to_reports(m_perf_limits, &reports);
  dout(20) << "reports for " << reports.size() << " queries" << dendl;

  return payload;
}
224ce89b 10239
7c673cae
FG
10240// =============================================================
10241
10242#undef dout_context
11fdf7f2 10243#define dout_context cct
7c673cae 10244#undef dout_prefix
11fdf7f2 10245#define dout_prefix *_dout << "osd." << osd->get_nodeid() << ":" << shard_id << "." << __func__ << " "
7c673cae 10246
// Bind 'pg' to 'slot' and index the slot by the PG's current osdmap epoch
// (pg_slots_by_epoch feeds get_min_pg_epoch()/wait_min_pg_epoch()).
// Caller must hold shard_lock.
void OSDShard::_attach_pg(OSDShardPGSlot *slot, PG *pg)
{
  dout(10) << pg->pg_id << " " << pg << dendl;
  slot->pg = pg;
  pg->osd_shard = this;
  pg->pg_slot = slot;
  osd->inc_num_pgs();

  // track the slot in the per-epoch intrusive index
  slot->epoch = pg->get_osdmap_epoch();
  pg_slots_by_epoch.insert(*slot);
}
10258
// Undo _attach_pg(): unbind the PG from 'slot' and drop the slot from the
// per-epoch index.  Waiters on wait_min_pg_epoch() are notified because
// removing a slot can raise the shard's minimum epoch.
// Caller must hold shard_lock.
void OSDShard::_detach_pg(OSDShardPGSlot *slot)
{
  dout(10) << slot->pg->pg_id << " " << slot->pg << dendl;
  slot->pg->osd_shard = nullptr;
  slot->pg->pg_slot = nullptr;
  slot->pg = nullptr;
  osd->dec_num_pgs();

  pg_slots_by_epoch.erase(pg_slots_by_epoch.iterator_to(*slot));
  slot->epoch = 0;
  if (waiting_for_min_pg_epoch) {
    min_pg_epoch_cond.notify_all();
  }
}
10273
// Re-index 'slot' in pg_slots_by_epoch after its PG advanced to epoch 'e'
// (erase + reinsert, since the intrusive set is keyed on slot->epoch),
// then wake wait_min_pg_epoch() waiters: the minimum may have moved up.
void OSDShard::update_pg_epoch(OSDShardPGSlot *slot, epoch_t e)
{
  std::lock_guard l(shard_lock);
  // begin() dereference is safe: 'slot' itself is a member, so the set
  // is non-empty here
  dout(30) << "min was " << pg_slots_by_epoch.begin()->epoch
	   << " on " << pg_slots_by_epoch.begin()->pg->pg_id << dendl;
  pg_slots_by_epoch.erase(pg_slots_by_epoch.iterator_to(*slot));
  dout(20) << slot->pg->pg_id << " " << slot->epoch << " -> " << e << dendl;
  slot->epoch = e;
  pg_slots_by_epoch.insert(*slot);
  dout(30) << "min is now " << pg_slots_by_epoch.begin()->epoch
	   << " on " << pg_slots_by_epoch.begin()->pg->pg_id << dendl;
  if (waiting_for_min_pg_epoch) {
    min_pg_epoch_cond.notify_all();
  }
}
10289
10290epoch_t OSDShard::get_min_pg_epoch()
10291{
10292 std::lock_guard l(shard_lock);
10293 auto p = pg_slots_by_epoch.begin();
10294 if (p == pg_slots_by_epoch.end()) {
10295 return 0;
10296 }
10297 return p->epoch;
10298}
10299
// Block until every PG on this shard has caught up to osdmap epoch
// 'need' (i.e. the minimum slot epoch is >= need), or until no PGs
// remain on the shard.
void OSDShard::wait_min_pg_epoch(epoch_t need)
{
  std::unique_lock l{shard_lock};
  // tell _detach_pg()/update_pg_epoch() to notify the condvar
  ++waiting_for_min_pg_epoch;
  min_pg_epoch_cond.wait(l, [need, this] {
    if (pg_slots_by_epoch.empty()) {
      return true;
    } else if (pg_slots_by_epoch.begin()->epoch >= need) {
      return true;
    } else {
      dout(10) << need << " waiting on "
	       << pg_slots_by_epoch.begin()->epoch << dendl;
      return false;
    }
  });
  --waiting_for_min_pg_epoch;
}
10317
10318epoch_t OSDShard::get_max_waiting_epoch()
10319{
10320 std::lock_guard l(shard_lock);
10321 epoch_t r = 0;
10322 for (auto& i : pg_slots) {
10323 if (!i.second->waiting_peering.empty()) {
10324 r = std::max(r, i.second->waiting_peering.rbegin()->first);
10325 }
10326 }
10327 return r;
10328}
10329
// Install a new osdmap on this shard and revisit every pg slot in light
// of it: requeue peering waiters whose awaited epoch has arrived, drop
// waiting ops the new map makes stale or misdirected, and prune slots
// nothing references any more.  'pushes_to_free' accumulates recovery-push
// reservations released by dropped items so the caller can return them
// to the global budget.
void OSDShard::consume_map(
  const OSDMapRef& new_osdmap,
  unsigned *pushes_to_free)
{
  std::lock_guard l(shard_lock);
  OSDMapRef old_osdmap;
  {
    // publish the new map under osdmap_lock so readers get a consistent ref
    std::lock_guard l(osdmap_lock);
    old_osdmap = std::move(shard_osdmap);
    shard_osdmap = new_osdmap;
  }
  dout(10) << new_osdmap->get_epoch()
           << " (was " << (old_osdmap ? old_osdmap->get_epoch() : 0) << ")"
           << dendl;
  bool queued = false;

  // check slots
  auto p = pg_slots.begin();
  while (p != pg_slots.end()) {
    OSDShardPGSlot *slot = p->second.get();
    const spg_t& pgid = p->first;
    dout(20) << __func__ << " " << pgid << dendl;
    if (!slot->waiting_for_split.empty()) {
      // a split is still pending; leave the slot untouched
      dout(20) << __func__ << " " << pgid
	       << " waiting for split " << slot->waiting_for_split << dendl;
      ++p;
      continue;
    }
    if (slot->waiting_for_merge_epoch > new_osdmap->get_epoch()) {
      // merge participant; don't disturb until the merge epoch passes
      dout(20) << __func__ << " " << pgid
	       << " waiting for merge by epoch " << slot->waiting_for_merge_epoch
	       << dendl;
      ++p;
      continue;
    }
    if (!slot->waiting_peering.empty()) {
      epoch_t first = slot->waiting_peering.begin()->first;
      if (first <= new_osdmap->get_epoch()) {
	// the earliest awaited epoch is now available: requeue the slot
	dout(20) << __func__ << " " << pgid
		 << " pending_peering first epoch " << first
		 << " <= " << new_osdmap->get_epoch() << ", requeueing" << dendl;
	_wake_pg_slot(pgid, slot);
	queued = true;
      }
      ++p;
      continue;
    }
    if (!slot->waiting.empty()) {
      if (new_osdmap->is_up_acting_osd_shard(pgid, osd->get_nodeid())) {
	dout(20) << __func__ << " " << pgid << " maps to us, keeping"
		 << dendl;
	++p;
	continue;
      }
      // pg no longer maps here: drop waiting items the new map obsoletes,
      // crediting back any recovery pushes they had reserved
      while (!slot->waiting.empty() &&
	     slot->waiting.front().get_map_epoch() <= new_osdmap->get_epoch()) {
	auto& qi = slot->waiting.front();
	dout(20) << __func__ << " " << pgid
		 << " waiting item " << qi
		 << " epoch " << qi.get_map_epoch()
		 << " <= " << new_osdmap->get_epoch()
		 << ", "
		 << (qi.get_map_epoch() < new_osdmap->get_epoch() ? "stale" :
		     "misdirected")
		 << ", dropping" << dendl;
        *pushes_to_free += qi.get_reserved_pushes();
	slot->waiting.pop_front();
      }
    }
    if (slot->waiting.empty() &&
	slot->num_running == 0 &&
	slot->waiting_for_split.empty() &&
	!slot->pg) {
      // nothing references this slot any more; reclaim it
      dout(20) << __func__ << " " << pgid << " empty, pruning" << dendl;
      p = pg_slots.erase(p);
      continue;
    }

    ++p;
  }
  if (queued) {
    // wake a worker thread to service the requeued items
    std::lock_guard l{sdata_wait_lock};
    sdata_cond.notify_one();
  }
}
10415
11fdf7f2
TL
// Requeue everything parked on 'slot' -- to_process, waiting, and all
// waiting_peering buckets -- at the front of the shard's op scheduler,
// and bump requeue_seq so a racing _process() can detect the slot was
// shuffled under it.  Caller must hold shard_lock.
void OSDShard::_wake_pg_slot(
  spg_t pgid,
  OSDShardPGSlot *slot)
{
  dout(20) << __func__ << " " << pgid
	   << " to_process " << slot->to_process
	   << " waiting " << slot->waiting
	   << " waiting_peering " << slot->waiting_peering << dendl;
  // reverse iteration + enqueue_front preserves the original FIFO order
  for (auto i = slot->to_process.rbegin();
       i != slot->to_process.rend();
       ++i) {
    scheduler->enqueue_front(std::move(*i));
  }
  slot->to_process.clear();
  for (auto i = slot->waiting.rbegin();
       i != slot->waiting.rend();
       ++i) {
    scheduler->enqueue_front(std::move(*i));
  }
  slot->waiting.clear();
  for (auto i = slot->waiting_peering.rbegin();
       i != slot->waiting_peering.rend();
       ++i) {
    // this is overkill; we requeue everything, even if some of these
    // items are waiting for maps we don't have yet.  FIXME, maybe,
    // someday, if we decide this inefficiency matters
    for (auto j = i->second.rbegin(); j != i->second.rend(); ++j) {
      scheduler->enqueue_front(std::move(*j));
    }
  }
  slot->waiting_peering.clear();
  ++slot->requeue_seq;
}
10449
10450void OSDShard::identify_splits_and_merges(
10451 const OSDMapRef& as_of_osdmap,
10452 set<pair<spg_t,epoch_t>> *split_pgs,
10453 set<pair<spg_t,epoch_t>> *merge_pgs)
10454{
10455 std::lock_guard l(shard_lock);
10456 if (shard_osdmap) {
10457 for (auto& i : pg_slots) {
10458 const spg_t& pgid = i.first;
10459 auto *slot = i.second.get();
10460 if (slot->pg) {
10461 osd->service.identify_splits_and_merges(
10462 shard_osdmap, as_of_osdmap, pgid,
10463 split_pgs, merge_pgs);
10464 } else if (!slot->waiting_for_split.empty()) {
10465 osd->service.identify_splits_and_merges(
10466 shard_osdmap, as_of_osdmap, pgid,
10467 split_pgs, nullptr);
10468 } else {
10469 dout(20) << __func__ << " slot " << pgid
9f95a23c 10470 << " has no pg and waiting_for_split " << dendl;
7c673cae 10471 }
11fdf7f2
TL
10472 }
10473 }
10474}
10475
// Pre-create (prime) slots for the split children in 'pgids' that hash
// to this shard; entries consumed here are erased from the set so the
// caller can distribute the rest.  If our shard map is already newer
// than 'as_of_osdmap', also compute and prime any grandchildren implied
// by the newer map.
void OSDShard::prime_splits(const OSDMapRef& as_of_osdmap,
			    set<pair<spg_t,epoch_t>> *pgids)
{
  std::lock_guard l(shard_lock);
  _prime_splits(pgids);
  if (shard_osdmap->get_epoch() > as_of_osdmap->get_epoch()) {
    set<pair<spg_t,epoch_t>> newer_children;
    for (auto i : *pgids) {
      osd->service.identify_splits_and_merges(
	as_of_osdmap, shard_osdmap, i.first,
	&newer_children, nullptr);
    }
    newer_children.insert(pgids->begin(), pgids->end());
    dout(10) << "as_of_osdmap " << as_of_osdmap->get_epoch() << " < shard "
	     << shard_osdmap->get_epoch() << ", new children " << newer_children
	     << dendl;
    _prime_splits(&newer_children);
    // note: we don't care what is left over here for other shards.
    // if this shard is ahead of us and one isn't, e.g., one thread is
    // calling into prime_splits via _process (due to a newly created
    // pg) and this shard has a newer map due to a racing consume_map,
    // then any grandchildren left here will be identified (or were
    // identified) when the slower shard's osdmap is advanced.
    // _prime_splits() will tolerate the case where the pgid is
    // already primed.
  }
}
10503
// Helper for prime_splits(): for each (pgid, epoch) pair that hashes to
// this shard, ensure a pg slot exists and record the split epoch in its
// waiting_for_split set.  Consumed pairs are erased from 'pgids' so the
// caller can hand the remainder to the other shards.
// Caller must hold shard_lock.
void OSDShard::_prime_splits(set<pair<spg_t,epoch_t>> *pgids)
{
  dout(10) << *pgids << dendl;
  auto p = pgids->begin();
  while (p != pgids->end()) {
    unsigned shard_index = p->first.hash_to_shard(osd->num_shards);
    if (shard_index == shard_id) {
      auto r = pg_slots.emplace(p->first, nullptr);
      if (r.second) {
	// new slot: create it and mark it waiting for this split epoch
	dout(10) << "priming slot " << p->first << " e" << p->second << dendl;
	r.first->second = make_unique<OSDShardPGSlot>();
	r.first->second->waiting_for_split.insert(p->second);
      } else {
	// slot already exists (possibly primed earlier); just add the epoch
	auto q = r.first;
	ceph_assert(q != pg_slots.end());
	dout(10) << "priming (existing) slot " << p->first << " e" << p->second
		 << dendl;
	q->second->waiting_for_split.insert(p->second);
      }
      p = pgids->erase(p);
    } else {
      // belongs to another shard; leave it for the caller to distribute
      ++p;
    }
  }
}
10529
10530void OSDShard::prime_merges(const OSDMapRef& as_of_osdmap,
10531 set<pair<spg_t,epoch_t>> *merge_pgs)
10532{
10533 std::lock_guard l(shard_lock);
10534 dout(20) << __func__ << " checking shard " << shard_id
10535 << " for remaining merge pgs " << merge_pgs << dendl;
10536 auto p = merge_pgs->begin();
10537 while (p != merge_pgs->end()) {
10538 spg_t pgid = p->first;
10539 epoch_t epoch = p->second;
10540 unsigned shard_index = pgid.hash_to_shard(osd->num_shards);
10541 if (shard_index != shard_id) {
10542 ++p;
10543 continue;
10544 }
10545 OSDShardPGSlot *slot;
10546 auto r = pg_slots.emplace(pgid, nullptr);
10547 if (r.second) {
10548 r.first->second = make_unique<OSDShardPGSlot>();
10549 }
10550 slot = r.first->second.get();
10551 if (slot->pg) {
10552 // already have pg
10553 dout(20) << __func__ << " have merge participant pg " << pgid
10554 << " " << slot->pg << dendl;
10555 } else if (!slot->waiting_for_split.empty() &&
10556 *slot->waiting_for_split.begin() < epoch) {
10557 dout(20) << __func__ << " pending split on merge participant pg " << pgid
10558 << " " << slot->waiting_for_split << dendl;
10559 } else {
10560 dout(20) << __func__ << " creating empty merge participant " << pgid
10561 << " for merge in " << epoch << dendl;
10562 // leave history zeroed; PG::merge_from() will fill it in.
10563 pg_history_t history;
10564 PGCreateInfo cinfo(pgid, epoch - 1,
10565 history, PastIntervals(), false);
10566 PGRef pg = osd->handle_pg_create_info(shard_osdmap, &cinfo);
10567 _attach_pg(r.first->second.get(), pg.get());
10568 _wake_pg_slot(pgid, slot);
10569 pg->unlock();
10570 }
10571 // mark slot for merge
10572 dout(20) << __func__ << " marking merge participant " << pgid << dendl;
10573 slot->waiting_for_merge_epoch = epoch;
10574 p = merge_pgs->erase(p);
7c673cae
FG
10575 }
10576}
10577
// Attach a freshly split child PG to its (previously primed) slot.  Once
// every expected split epoch on the slot has been satisfied, requeue the
// slot's parked work.  A null peering event is then queued so the child
// advances to the latest osdmap, and a worker thread is woken.
void OSDShard::register_and_wake_split_child(PG *pg)
{
  epoch_t epoch;
  {
    std::lock_guard l(shard_lock);
    dout(10) << pg->pg_id << " " << pg << dendl;
    auto p = pg_slots.find(pg->pg_id);
    ceph_assert(p != pg_slots.end());
    auto *slot = p->second.get();
    dout(20) << pg->pg_id << " waiting_for_split " << slot->waiting_for_split
	     << dendl;
    // slot must have been primed (no pg yet, at least one split pending)
    ceph_assert(!slot->pg);
    ceph_assert(!slot->waiting_for_split.empty());
    _attach_pg(slot, pg);

    epoch = pg->get_osdmap_epoch();
    ceph_assert(slot->waiting_for_split.count(epoch));
    slot->waiting_for_split.erase(epoch);
    if (slot->waiting_for_split.empty()) {
      _wake_pg_slot(pg->pg_id, slot);
    } else {
      dout(10) << __func__ << " still waiting for split on "
	       << slot->waiting_for_split << dendl;
    }
  }

  // kick child to ensure it pulls up to the latest osdmap
  osd->enqueue_peering_evt(
    pg->pg_id,
    PGPeeringEventRef(
      std::make_shared<PGPeeringEvent>(
	epoch,
	epoch,
	NullEvt())));

  // wake a worker to process the requeued/queued items
  std::lock_guard l{sdata_wait_lock};
  sdata_cond.notify_one();
}
10616
11fdf7f2 10617void OSDShard::unprime_split_children(spg_t parent, unsigned old_pg_num)
7c673cae 10618{
11fdf7f2
TL
10619 std::lock_guard l(shard_lock);
10620 vector<spg_t> to_delete;
10621 for (auto& i : pg_slots) {
10622 if (i.first != parent &&
10623 i.first.get_ancestor(old_pg_num) == parent) {
10624 dout(10) << __func__ << " parent " << parent << " clearing " << i.first
10625 << dendl;
10626 _wake_pg_slot(i.first, i.second.get());
10627 to_delete.push_back(i.first);
10628 }
10629 }
10630 for (auto pgid : to_delete) {
10631 pg_slots.erase(pgid);
10632 }
10633}
10634
9f95a23c
TL
// Construct one shard of the sharded op work queue: its named mutexes,
// op scheduler (rotational-aware), and oncommit context queue.
// 'id' selects this shard's slot within the OSD.
OSDShard::OSDShard(
  int id,
  CephContext *cct,
  OSD *osd)
  : shard_id(id),
    cct(cct),
    osd(osd),
    shard_name(string("OSDShard.") + stringify(id)),
    sdata_wait_lock_name(shard_name + "::sdata_wait_lock"),
    sdata_wait_lock{make_mutex(sdata_wait_lock_name)},
    osdmap_lock{make_mutex(shard_name + "::osdmap_lock")},
    shard_lock_name(shard_name + "::shard_lock"),
    shard_lock{make_mutex(shard_lock_name)},
    scheduler(ceph::osd::scheduler::make_scheduler(
      cct, osd->num_shards, osd->store->is_rotational())),
    context_queue(sdata_wait_lock, sdata_cond)
{
  dout(0) << "using op scheduler " << *scheduler << dendl;
}
10654
11fdf7f2
TL
10655
10656// =============================================================
10657
10658#undef dout_context
10659#define dout_context osd->cct
10660#undef dout_prefix
10661#define dout_prefix *_dout << "osd." << osd->whoami << " op_wq "
10662
10663void OSD::ShardedOpWQ::_add_slot_waiter(
10664 spg_t pgid,
10665 OSDShardPGSlot *slot,
9f95a23c 10666 OpSchedulerItem&& qi)
11fdf7f2
TL
10667{
10668 if (qi.is_peering()) {
10669 dout(20) << __func__ << " " << pgid
10670 << " peering, item epoch is "
10671 << qi.get_map_epoch()
10672 << ", will wait on " << qi << dendl;
10673 slot->waiting_peering[qi.get_map_epoch()].push_back(std::move(qi));
10674 } else {
10675 dout(20) << __func__ << " " << pgid
10676 << " item epoch is "
10677 << qi.get_map_epoch()
10678 << ", will wait on " << qi << dendl;
10679 slot->waiting.push_back(std::move(qi));
7c673cae
FG
10680 }
10681}
10682
10683#undef dout_prefix
10684#define dout_prefix *_dout << "osd." << osd->whoami << " op_wq(" << shard_index << ") "
10685
// Worker-thread main dispatch for the sharded op queue.  One invocation:
//   1. picks this thread's shard, optionally sleeps until work arrives;
//   2. drains the oncommit context queue (lowest-indexed thread only);
//   3. dequeues one OpSchedulerItem (looping past "future" placeholders);
//   4. stages the item on the pg slot, takes the pg lock if the slot has
//      a pg (re-validating the slot after reacquiring shard_lock, since
//      removal/requeue can race), handles the no-pg cases (wait, create,
//      drop), and finally runs the item under the pg lock.
// Lock order throughout: shard_lock -> (drop) -> pg lock -> shard_lock.
void OSD::ShardedOpWQ::_process(uint32_t thread_index, heartbeat_handle_d *hb)
{
  uint32_t shard_index = thread_index % osd->num_shards;
  auto& sdata = osd->shards[shard_index];
  ceph_assert(sdata);

  // If all threads of shards do oncommits, there is a out-of-order
  // problem.  So we choose the thread which has the smallest
  // thread_index(thread_index < num_shards) of shard to do oncommit
  // callback.
  bool is_smallest_thread_index = thread_index < osd->num_shards;

  // peek at spg_t
  sdata->shard_lock.lock();
  if (sdata->scheduler->empty() &&
      (!is_smallest_thread_index || sdata->context_queue.empty())) {
    // nothing to do right now; block on the shard's condvar
    std::unique_lock wait_lock{sdata->sdata_wait_lock};
    if (is_smallest_thread_index && !sdata->context_queue.empty()) {
      // we raced with a context_queue addition, don't wait
      wait_lock.unlock();
    } else if (!sdata->stop_waiting) {
      dout(20) << __func__ << " empty q, waiting" << dendl;
      // suspend the heartbeat timeout while we are legitimately idle
      osd->cct->get_heartbeat_map()->clear_timeout(hb);
      sdata->shard_lock.unlock();
      sdata->sdata_cond.wait(wait_lock);
      wait_lock.unlock();
      sdata->shard_lock.lock();
      if (sdata->scheduler->empty() &&
	  !(is_smallest_thread_index && !sdata->context_queue.empty())) {
	// spurious wakeup or another thread grabbed the work
	sdata->shard_lock.unlock();
	return;
      }
      // found a work item; reapply default wq timeouts
      osd->cct->get_heartbeat_map()->reset_timeout(hb,
	timeout_interval, suicide_interval);
    } else {
      // shard is shutting down (stop_waiting); bail out immediately
      dout(20) << __func__ << " need return immediately" << dendl;
      wait_lock.unlock();
      sdata->shard_lock.unlock();
      return;
    }
  }

  // only the designated lowest thread drains oncommit contexts (see above)
  list<Context *> oncommits;
  if (is_smallest_thread_index) {
    sdata->context_queue.move_to(oncommits);
  }

  WorkItem work_item;
  while (!std::get_if<OpSchedulerItem>(&work_item)) {
    if (sdata->scheduler->empty()) {
      if (osd->is_stopping()) {
	sdata->shard_lock.unlock();
	for (auto c : oncommits) {
	  dout(10) << __func__ << " discarding in-flight oncommit " << c << dendl;
	  delete c;
	}
	return;    // OSD shutdown, discard.
      }
      sdata->shard_lock.unlock();
      handle_oncommits(oncommits);
      return;
    }

    work_item = sdata->scheduler->dequeue();
    if (osd->is_stopping()) {
      sdata->shard_lock.unlock();
      for (auto c : oncommits) {
	dout(10) << __func__ << " discarding in-flight oncommit " << c << dendl;
	delete c;
      }
      return;    // OSD shutdown, discard.
    }

    // If the work item is scheduled in the future, wait until
    // the time returned in the dequeue response before retrying.
    if (auto when_ready = std::get_if<double>(&work_item)) {
      if (is_smallest_thread_index) {
	// the oncommit thread must not sleep; let it service contexts
	sdata->shard_lock.unlock();
	handle_oncommits(oncommits);
	return;
      }
      std::unique_lock wait_lock{sdata->sdata_wait_lock};
      auto future_time = ceph::real_clock::from_double(*when_ready);
      dout(10) << __func__ << " dequeue future request at " << future_time << dendl;
      sdata->shard_lock.unlock();
      // advertise ourselves as sleeping so _enqueue can notify_one us
      ++sdata->waiting_threads;
      sdata->sdata_cond.wait_until(wait_lock, future_time);
      --sdata->waiting_threads;
      wait_lock.unlock();
      sdata->shard_lock.lock();
    }
  } // while

  // Access the stored item
  auto item = std::move(std::get<OpSchedulerItem>(work_item));
  if (osd->is_stopping()) {
    sdata->shard_lock.unlock();
    for (auto c : oncommits) {
      dout(10) << __func__ << " discarding in-flight oncommit " << c << dendl;
      delete c;
    }
    return;    // OSD shutdown, discard.
  }

  // stage the item on its pg slot (creating the slot if needed)
  const auto token = item.get_ordering_token();
  auto r = sdata->pg_slots.emplace(token, nullptr);
  if (r.second) {
    r.first->second = make_unique<OSDShardPGSlot>();
  }
  OSDShardPGSlot *slot = r.first->second.get();
  dout(20) << __func__ << " " << token
	   << (r.second ? " (new)" : "")
	   << " to_process " << slot->to_process
	   << " waiting " << slot->waiting
	   << " waiting_peering " << slot->waiting_peering
	   << dendl;
  slot->to_process.push_back(std::move(item));
  dout(20) << __func__ << " " << slot->to_process.back()
	   << " queued" << dendl;

 retry_pg:
  PGRef pg = slot->pg;

  // lock pg (if we have it)
  if (pg) {
    // note the requeue seq now...
    uint64_t requeue_seq = slot->requeue_seq;
    ++slot->num_running;

    // take the pg lock without holding shard_lock (lock ordering), then
    // reacquire shard_lock and re-validate everything that may have raced
    sdata->shard_lock.unlock();
    osd->service.maybe_inject_dispatch_delay();
    pg->lock();
    osd->service.maybe_inject_dispatch_delay();
    sdata->shard_lock.lock();

    auto q = sdata->pg_slots.find(token);
    if (q == sdata->pg_slots.end()) {
      // this can happen if we race with pg removal.
      dout(20) << __func__ << " slot " << token << " no longer there" << dendl;
      pg->unlock();
      sdata->shard_lock.unlock();
      handle_oncommits(oncommits);
      return;
    }
    slot = q->second.get();
    --slot->num_running;

    if (slot->to_process.empty()) {
      // raced with _wake_pg_slot or consume_map
      dout(20) << __func__ << " " << token
	       << " nothing queued" << dendl;
      pg->unlock();
      sdata->shard_lock.unlock();
      handle_oncommits(oncommits);
      return;
    }
    if (requeue_seq != slot->requeue_seq) {
      // slot was requeued under us; our item went back to the scheduler
      dout(20) << __func__ << " " << token
	       << " requeue_seq " << slot->requeue_seq << " > our "
	       << requeue_seq << ", we raced with _wake_pg_slot"
	       << dendl;
      pg->unlock();
      sdata->shard_lock.unlock();
      handle_oncommits(oncommits);
      return;
    }
    if (slot->pg != pg) {
      // this can happen if we race with pg removal.
      dout(20) << __func__ << " slot " << token << " no longer attached to "
	       << pg << dendl;
      pg->unlock();
      goto retry_pg;
    }
  }

  dout(20) << __func__ << " " << token
	   << " to_process " << slot->to_process
	   << " waiting " << slot->waiting
	   << " waiting_peering " << slot->waiting_peering << dendl;

  ThreadPool::TPHandle tp_handle(osd->cct, hb, timeout_interval,
				 suicide_interval);

  // take next item
  auto qi = std::move(slot->to_process.front());
  slot->to_process.pop_front();
  dout(20) << __func__ << " " << qi << " pg " << pg << dendl;
  set<pair<spg_t,epoch_t>> new_children;
  OSDMapRef osdmap;

  // no pg on the slot: decide whether to wait, create, run pg-less, or drop
  while (!pg) {
    // should this pg shard exist on this osd in this (or a later) epoch?
    osdmap = sdata->shard_osdmap;
    const PGCreateInfo *create_info = qi.creates_pg();
    if (!slot->waiting_for_split.empty()) {
      // parent is mid-split; hold the item until the split completes
      dout(20) << __func__ << " " << token
	       << " splitting " << slot->waiting_for_split << dendl;
      _add_slot_waiter(token, slot, std::move(qi));
    } else if (qi.get_map_epoch() > osdmap->get_epoch()) {
      // item is from the future relative to our map; wait for the map
      dout(20) << __func__ << " " << token
	       << " map " << qi.get_map_epoch() << " > "
	       << osdmap->get_epoch() << dendl;
      _add_slot_waiter(token, slot, std::move(qi));
    } else if (qi.is_peering()) {
      if (!qi.peering_requires_pg()) {
	// for pg-less events, we run them under the ordering lock, since
	// we don't have the pg lock to keep them ordered.
	qi.run(osd, sdata, pg, tp_handle);
      } else if (osdmap->is_up_acting_osd_shard(token, osd->whoami)) {
	if (create_info) {
	  if (create_info->by_mon &&
	      osdmap->get_pg_acting_primary(token.pgid) != osd->whoami) {
	    dout(20) << __func__ << " " << token
		     << " no pg, no longer primary, ignoring mon create on "
		     << qi << dendl;
	  } else {
	    dout(20) << __func__ << " " << token
		     << " no pg, should create on " << qi << dendl;
	    pg = osd->handle_pg_create_info(osdmap, create_info);
	    if (pg) {
	      // we created the pg! drop out and continue "normally"!
	      sdata->_attach_pg(slot, pg.get());
	      sdata->_wake_pg_slot(token, slot);

	      // identify split children between create epoch and shard epoch.
	      osd->service.identify_splits_and_merges(
		pg->get_osdmap(), osdmap, pg->pg_id, &new_children, nullptr);
	      sdata->_prime_splits(&new_children);
	      // distribute remaining split children to other shards below!
	      break;
	    }
	    dout(20) << __func__ << " ignored create on " << qi << dendl;
	  }
	} else {
	  dout(20) << __func__ << " " << token
		   << " no pg, peering, !create, discarding " << qi << dendl;
	}
      } else {
	dout(20) << __func__ << " " << token
		 << " no pg, peering, doesn't map here e" << osdmap->get_epoch()
		 << ", discarding " << qi
		 << dendl;
      }
    } else if (osdmap->is_up_acting_osd_shard(token, osd->whoami)) {
      dout(20) << __func__ << " " << token
	       << " no pg, should exist e" << osdmap->get_epoch()
	       << ", will wait on " << qi << dendl;
      _add_slot_waiter(token, slot, std::move(qi));
    } else {
      dout(20) << __func__ << " " << token
	       << " no pg, shouldn't exist e" << osdmap->get_epoch()
	       << ", dropping " << qi << dendl;
      // share map with client?
      if (std::optional<OpRequestRef> _op = qi.maybe_get_op()) {
	osd->service.maybe_share_map((*_op)->get_req()->get_connection().get(),
				     sdata->shard_osdmap,
				     (*_op)->sent_epoch);
      }
      // give back any recovery-push reservations the dropped item held
      unsigned pushes_to_free = qi.get_reserved_pushes();
      if (pushes_to_free > 0) {
	sdata->shard_lock.unlock();
	osd->service.release_reserved_pushes(pushes_to_free);
	handle_oncommits(oncommits);
	return;
      }
    }
    sdata->shard_lock.unlock();
    handle_oncommits(oncommits);
    return;
  }
  if (qi.is_peering()) {
    // re-check: even with a pg, a peering item may need a newer map
    OSDMapRef osdmap = sdata->shard_osdmap;
    if (qi.get_map_epoch() > osdmap->get_epoch()) {
      _add_slot_waiter(token, slot, std::move(qi));
      sdata->shard_lock.unlock();
      pg->unlock();
      handle_oncommits(oncommits);
      return;
    }
  }
  sdata->shard_lock.unlock();

  // if we just created a pg, hand its remaining split children to the
  // shards they hash to
  if (!new_children.empty()) {
    for (auto shard : osd->shards) {
      shard->prime_splits(osdmap, &new_children);
    }
    ceph_assert(new_children.empty());
  }

  // osd_opwq_process marks the point at which an operation has been dequeued
  // and will begin to be handled by a worker thread.
  {
#ifdef WITH_LTTNG
    osd_reqid_t reqid;
    if (std::optional<OpRequestRef> _op = qi.maybe_get_op()) {
      reqid = (*_op)->get_reqid();
    }
#endif
    tracepoint(osd, opwq_process_start, reqid.name._type,
	reqid.name._num, reqid.tid, reqid.inc);
  }

  lgeneric_subdout(osd->cct, osd, 30) << "dequeue status: ";
  Formatter *f = Formatter::create("json");
  f->open_object_section("q");
  dump(f);
  f->close_section();
  f->flush(*_dout);
  delete f;
  *_dout << dendl;

  // run the item with the pg lock held (released inside run/unlock paths)
  qi.run(osd, sdata, pg, tp_handle);

  {
#ifdef WITH_LTTNG
    osd_reqid_t reqid;
    if (std::optional<OpRequestRef> _op = qi.maybe_get_op()) {
      reqid = (*_op)->get_reqid();
    }
#endif
    tracepoint(osd, opwq_process_finish, reqid.name._type,
	reqid.name._num, reqid.tid, reqid.inc);
  }

  handle_oncommits(oncommits);
}
11013
9f95a23c 11014void OSD::ShardedOpWQ::_enqueue(OpSchedulerItem&& item) {
7c673cae 11015 uint32_t shard_index =
11fdf7f2 11016 item.get_ordering_token().hash_to_shard(osd->shards.size());
7c673cae 11017
9f95a23c
TL
11018 dout(20) << __func__ << " " << item << dendl;
11019
11fdf7f2 11020 OSDShard* sdata = osd->shards[shard_index];
7c673cae 11021 assert (NULL != sdata);
7c673cae 11022
9f95a23c
TL
11023 bool empty = true;
11024 {
11025 std::lock_guard l{sdata->shard_lock};
11026 empty = sdata->scheduler->empty();
11027 sdata->scheduler->enqueue(std::move(item));
11028 }
7c673cae 11029
f67539c2 11030 {
9f95a23c 11031 std::lock_guard l{sdata->sdata_wait_lock};
f67539c2
TL
11032 if (empty) {
11033 sdata->sdata_cond.notify_all();
11034 } else if (sdata->waiting_threads) {
11035 sdata->sdata_cond.notify_one();
11036 }
9f95a23c 11037 }
7c673cae
FG
11038}
11039
// Requeue 'item' at the front of its shard's scheduler (used for retries
// and requeues that must run before newer work).  If the pg slot already
// has staged items, swap with the newest staged one so the requeued item
// keeps its place ahead of work a racing _process() just dequeued.
void OSD::ShardedOpWQ::_enqueue_front(OpSchedulerItem&& item)
{
  auto shard_index = item.get_ordering_token().hash_to_shard(osd->shards.size());
  auto& sdata = osd->shards[shard_index];
  ceph_assert(sdata);
  sdata->shard_lock.lock();
  auto p = sdata->pg_slots.find(item.get_ordering_token());
  if (p != sdata->pg_slots.end() &&
      !p->second->to_process.empty()) {
    // we may be racing with _process, which has dequeued a new item
    // from scheduler, put it on to_process, and is now busy taking the
    // pg lock.  ensure this old requeued item is ordered before any
    // such newer item in to_process.
    // (push our item to the front of to_process, then send the displaced
    // newest staged item back to the scheduler front instead)
    p->second->to_process.push_front(std::move(item));
    item = std::move(p->second->to_process.back());
    p->second->to_process.pop_back();
    dout(20) << __func__
	     << " " << p->second->to_process.front()
	     << " shuffled w/ " << item << dendl;
  } else {
    dout(20) << __func__ << " " << item << dendl;
  }
  sdata->scheduler->enqueue_front(std::move(item));
  sdata->shard_lock.unlock();
  // wake one worker to pick the item up
  std::lock_guard l{sdata->sdata_wait_lock};
  sdata->sdata_cond.notify_one();
}
11067
f67539c2 11068namespace ceph::osd_cmds {
7c673cae 11069
11fdf7f2
TL
11070int heap(CephContext& cct, const cmdmap_t& cmdmap, Formatter& f,
11071 std::ostream& os)
7c673cae
FG
11072{
11073 if (!ceph_using_tcmalloc()) {
11074 os << "could not issue heap profiler command -- not using tcmalloc!";
11075 return -EOPNOTSUPP;
11076 }
f67539c2 11077
7c673cae 11078 string cmd;
9f95a23c 11079 if (!cmd_getval(cmdmap, "heapcmd", cmd)) {
7c673cae
FG
11080 os << "unable to get value for command \"" << cmd << "\"";
11081 return -EINVAL;
11fdf7f2 11082 }
f67539c2 11083
7c673cae
FG
11084 std::vector<std::string> cmd_vec;
11085 get_str_vec(cmd, cmd_vec);
11fdf7f2
TL
11086
11087 string val;
9f95a23c 11088 if (cmd_getval(cmdmap, "value", val)) {
11fdf7f2
TL
11089 cmd_vec.push_back(val);
11090 }
f67539c2 11091
7c673cae 11092 ceph_heap_profiler_handle_command(cmd_vec, os);
f67539c2 11093
7c673cae
FG
11094 return 0;
11095}
f67539c2
TL
11096
11097} // namespace ceph::osd_cmds