]> git.proxmox.com Git - ceph.git/blame - ceph/src/osd/OSD.cc
import 15.2.4
[ceph.git] / ceph / src / osd / OSD.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2017 OVH
8 *
9 * This is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License version 2.1, as published by the Free Software
12 * Foundation. See file COPYING.
13 *
14 */
11fdf7f2 15
7c673cae 16#include "acconfig.h"
11fdf7f2
TL
17
18#include <cctype>
7c673cae
FG
19#include <fstream>
20#include <iostream>
11fdf7f2
TL
21#include <iterator>
22
23#include <unistd.h>
7c673cae
FG
24#include <sys/stat.h>
25#include <signal.h>
eafe8130 26#include <time.h>
7c673cae 27#include <boost/scoped_ptr.hpp>
eafe8130 28#include <boost/range/adaptor/reversed.hpp>
7c673cae
FG
29
30#ifdef HAVE_SYS_PARAM_H
31#include <sys/param.h>
32#endif
33
34#ifdef HAVE_SYS_MOUNT_H
35#include <sys/mount.h>
36#endif
37
38#include "osd/PG.h"
39
40#include "include/types.h"
41#include "include/compat.h"
11fdf7f2 42#include "include/random.h"
7c673cae
FG
43
44#include "OSD.h"
45#include "OSDMap.h"
46#include "Watch.h"
47#include "osdc/Objecter.h"
48
49#include "common/errno.h"
50#include "common/ceph_argparse.h"
9f95a23c 51#include "common/ceph_releases.h"
224ce89b 52#include "common/ceph_time.h"
7c673cae 53#include "common/version.h"
b5b8bbf5 54#include "common/pick_address.h"
11fdf7f2
TL
55#include "common/blkdev.h"
56#include "common/numa.h"
7c673cae
FG
57
58#include "os/ObjectStore.h"
59#ifdef HAVE_LIBFUSE
60#include "os/FuseStore.h"
61#endif
62
63#include "PrimaryLogPG.h"
64
7c673cae
FG
65#include "msg/Messenger.h"
66#include "msg/Message.h"
67
68#include "mon/MonClient.h"
69
70#include "messages/MLog.h"
71
72#include "messages/MGenericMessage.h"
7c673cae
FG
73#include "messages/MOSDPing.h"
74#include "messages/MOSDFailure.h"
75#include "messages/MOSDMarkMeDown.h"
9f95a23c 76#include "messages/MOSDMarkMeDead.h"
7c673cae
FG
77#include "messages/MOSDFull.h"
78#include "messages/MOSDOp.h"
79#include "messages/MOSDOpReply.h"
80#include "messages/MOSDBackoff.h"
81#include "messages/MOSDBeacon.h"
82#include "messages/MOSDRepOp.h"
83#include "messages/MOSDRepOpReply.h"
84#include "messages/MOSDBoot.h"
85#include "messages/MOSDPGTemp.h"
11fdf7f2 86#include "messages/MOSDPGReadyToMerge.h"
7c673cae
FG
87
88#include "messages/MOSDMap.h"
89#include "messages/MMonGetOSDMap.h"
90#include "messages/MOSDPGNotify.h"
9f95a23c 91#include "messages/MOSDPGNotify2.h"
7c673cae 92#include "messages/MOSDPGQuery.h"
9f95a23c 93#include "messages/MOSDPGQuery2.h"
7c673cae
FG
94#include "messages/MOSDPGLog.h"
95#include "messages/MOSDPGRemove.h"
96#include "messages/MOSDPGInfo.h"
9f95a23c 97#include "messages/MOSDPGInfo2.h"
7c673cae 98#include "messages/MOSDPGCreate.h"
11fdf7f2 99#include "messages/MOSDPGCreate2.h"
7c673cae 100#include "messages/MOSDPGScan.h"
7c673cae
FG
101#include "messages/MBackfillReserve.h"
102#include "messages/MRecoveryReserve.h"
c07f9fc5 103#include "messages/MOSDForceRecovery.h"
7c673cae
FG
104#include "messages/MOSDECSubOpWrite.h"
105#include "messages/MOSDECSubOpWriteReply.h"
106#include "messages/MOSDECSubOpRead.h"
107#include "messages/MOSDECSubOpReadReply.h"
108#include "messages/MOSDPGCreated.h"
109#include "messages/MOSDPGUpdateLogMissing.h"
110#include "messages/MOSDPGUpdateLogMissingReply.h"
111
11fdf7f2
TL
112#include "messages/MOSDPeeringOp.h"
113
7c673cae
FG
114#include "messages/MOSDAlive.h"
115
116#include "messages/MOSDScrub.h"
11fdf7f2 117#include "messages/MOSDScrub2.h"
7c673cae
FG
118#include "messages/MOSDRepScrub.h"
119
7c673cae
FG
120#include "messages/MCommand.h"
121#include "messages/MCommandReply.h"
122
123#include "messages/MPGStats.h"
124#include "messages/MPGStatsAck.h"
125
126#include "messages/MWatchNotify.h"
127#include "messages/MOSDPGPush.h"
128#include "messages/MOSDPGPushReply.h"
129#include "messages/MOSDPGPull.h"
130
9f95a23c
TL
131#include "messages/MMonGetPurgedSnaps.h"
132#include "messages/MMonGetPurgedSnapsReply.h"
133
7c673cae
FG
134#include "common/perf_counters.h"
135#include "common/Timer.h"
136#include "common/LogClient.h"
137#include "common/AsyncReserver.h"
138#include "common/HeartbeatMap.h"
139#include "common/admin_socket.h"
140#include "common/ceph_context.h"
141
142#include "global/signal_handler.h"
143#include "global/pidfile.h"
144
145#include "include/color.h"
146#include "perfglue/cpu_profiler.h"
147#include "perfglue/heap_profiler.h"
148
149#include "osd/OpRequest.h"
150
151#include "auth/AuthAuthorizeHandler.h"
152#include "auth/RotatingKeyRing.h"
7c673cae
FG
153
154#include "objclass/objclass.h"
155
156#include "common/cmdparse.h"
157#include "include/str_list.h"
158#include "include/util.h"
159
11fdf7f2 160#include "include/ceph_assert.h"
7c673cae
FG
161#include "common/config.h"
162#include "common/EventTrace.h"
163
11fdf7f2
TL
164#include "json_spirit/json_spirit_reader.h"
165#include "json_spirit/json_spirit_writer.h"
166
7c673cae
FG
167#ifdef WITH_LTTNG
168#define TRACEPOINT_DEFINE
169#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
170#include "tracing/osd.h"
171#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
172#undef TRACEPOINT_DEFINE
173#else
174#define tracepoint(...)
175#endif
176
177#define dout_context cct
178#define dout_subsys ceph_subsys_osd
179#undef dout_prefix
180#define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
181
9f95a23c
TL
182using namespace ceph::osd::scheduler;
183using TOPNSPC::common::cmd_getval;
224ce89b 184
7c673cae
FG
185static ostream& _prefix(std::ostream* _dout, int whoami, epoch_t epoch) {
186 return *_dout << "osd." << whoami << " " << epoch << " ";
187}
188
7c673cae
FG
189//Initial features in new superblock.
190//Features here are also automatically upgraded
191CompatSet OSD::get_osd_initial_compat_set() {
192 CompatSet::FeatureSet ceph_osd_feature_compat;
193 CompatSet::FeatureSet ceph_osd_feature_ro_compat;
194 CompatSet::FeatureSet ceph_osd_feature_incompat;
195 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
196 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGINFO);
197 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_OLOC);
198 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEC);
199 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES);
200 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL);
201 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BIGINFO);
202 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO);
203 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG);
204 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER);
205 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HINTS);
206 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGMETA);
207 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_MISSING);
208 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_FASTINFO);
c07f9fc5 209 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_RECOVERY_DELETES);
9f95a23c 210 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER2);
7c673cae
FG
211 return CompatSet(ceph_osd_feature_compat, ceph_osd_feature_ro_compat,
212 ceph_osd_feature_incompat);
213}
214
215//Features are added here that this OSD supports.
216CompatSet OSD::get_osd_compat_set() {
217 CompatSet compat = get_osd_initial_compat_set();
218 //Any features here can be set in code, but not in initial superblock
219 compat.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
220 return compat;
221}
222
223OSDService::OSDService(OSD *osd) :
224 osd(osd),
225 cct(osd->cct),
7c673cae
FG
226 whoami(osd->whoami), store(osd->store),
227 log_client(osd->log_client), clog(osd->clog),
228 pg_recovery_stats(osd->pg_recovery_stats),
229 cluster_messenger(osd->cluster_messenger),
230 client_messenger(osd->client_messenger),
231 logger(osd->logger),
232 recoverystate_perf(osd->recoverystate_perf),
233 monc(osd->monc),
11fdf7f2
TL
234 osd_max_object_size(cct->_conf, "osd_max_object_size"),
235 osd_skip_data_digest(cct->_conf, "osd_skip_data_digest"),
236 publish_lock{ceph::make_mutex("OSDService::publish_lock")},
237 pre_publish_lock{ceph::make_mutex("OSDService::pre_publish_lock")},
7c673cae 238 max_oldest_map(0),
eafe8130
TL
239 scrubs_local(0),
240 scrubs_remote(0),
7c673cae
FG
241 agent_valid_iterator(false),
242 agent_ops(0),
243 flush_mode_high_count(0),
244 agent_active(true),
245 agent_thread(this),
246 agent_stop_flag(false),
7c673cae
FG
247 agent_timer(osd->client_messenger->cct, agent_timer_lock),
248 last_recalibrate(ceph_clock_now()),
249 promote_max_objects(0),
250 promote_max_bytes(0),
9f95a23c
TL
251 objecter(make_unique<Objecter>(osd->client_messenger->cct,
252 osd->objecter_messenger,
253 osd->monc, nullptr, 0, 0)),
11fdf7f2 254 m_objecter_finishers(cct->_conf->osd_objecter_finishers),
7c673cae
FG
255 watch_timer(osd->client_messenger->cct, watch_lock),
256 next_notif_id(0),
7c673cae 257 recovery_request_timer(cct, recovery_request_lock, false),
11fdf7f2 258 sleep_timer(cct, sleep_lock, false),
7c673cae 259 reserver_finisher(cct),
3efd9988 260 local_reserver(cct, &reserver_finisher, cct->_conf->osd_max_backfills,
7c673cae 261 cct->_conf->osd_min_recovery_priority),
3efd9988 262 remote_reserver(cct, &reserver_finisher, cct->_conf->osd_max_backfills,
7c673cae 263 cct->_conf->osd_min_recovery_priority),
3efd9988 264 snap_reserver(cct, &reserver_finisher,
7c673cae 265 cct->_conf->osd_max_trimming_pgs),
7c673cae
FG
266 recovery_ops_active(0),
267 recovery_ops_reserved(0),
268 recovery_paused(false),
7c673cae
FG
269 map_cache(cct, cct->_conf->osd_map_cache_size),
270 map_bl_cache(cct->_conf->osd_map_cache_size),
271 map_bl_inc_cache(cct->_conf->osd_map_cache_size),
7c673cae 272 cur_state(NONE),
11fdf7f2 273 cur_ratio(0), physical_ratio(0),
9f95a23c 274 boot_epoch(0), up_epoch(0), bind_epoch(0)
7c673cae
FG
275{
276 objecter->init();
11fdf7f2
TL
277
278 for (int i = 0; i < m_objecter_finishers; i++) {
279 ostringstream str;
280 str << "objecter-finisher-" << i;
9f95a23c
TL
281 auto fin = make_unique<Finisher>(osd->client_messenger->cct, str.str(), "finisher");
282 objecter_finishers.push_back(std::move(fin));
11fdf7f2 283 }
7c673cae
FG
284}
285
31f18b77
FG
286#ifdef PG_DEBUG_REFS
287void OSDService::add_pgid(spg_t pgid, PG *pg){
11fdf7f2 288 std::lock_guard l(pgid_lock);
31f18b77
FG
289 if (!pgid_tracker.count(pgid)) {
290 live_pgs[pgid] = pg;
291 }
292 pgid_tracker[pgid]++;
293}
294void OSDService::remove_pgid(spg_t pgid, PG *pg)
295{
11fdf7f2
TL
296 std::lock_guard l(pgid_lock);
297 ceph_assert(pgid_tracker.count(pgid));
298 ceph_assert(pgid_tracker[pgid] > 0);
31f18b77
FG
299 pgid_tracker[pgid]--;
300 if (pgid_tracker[pgid] == 0) {
301 pgid_tracker.erase(pgid);
302 live_pgs.erase(pgid);
303 }
304}
305void OSDService::dump_live_pgids()
306{
11fdf7f2 307 std::lock_guard l(pgid_lock);
31f18b77
FG
308 derr << "live pgids:" << dendl;
309 for (map<spg_t, int>::const_iterator i = pgid_tracker.cbegin();
310 i != pgid_tracker.cend();
311 ++i) {
312 derr << "\t" << *i << dendl;
313 live_pgs[i->first]->dump_live_ids();
314 }
315}
316#endif
317
318
9f95a23c
TL
319ceph::signedspan OSDService::get_mnow()
320{
321 return ceph::mono_clock::now() - osd->startup_time;
322}
7c673cae 323
11fdf7f2
TL
324void OSDService::identify_splits_and_merges(
325 OSDMapRef old_map,
326 OSDMapRef new_map,
327 spg_t pgid,
328 set<pair<spg_t,epoch_t>> *split_children,
329 set<pair<spg_t,epoch_t>> *merge_pgs)
7c673cae 330{
11fdf7f2 331 if (!old_map->have_pg_pool(pgid.pool())) {
7c673cae 332 return;
7c673cae 333 }
7c673cae 334 int old_pgnum = old_map->get_pg_num(pgid.pool());
11fdf7f2
TL
335 auto p = osd->pg_num_history.pg_nums.find(pgid.pool());
336 if (p == osd->pg_num_history.pg_nums.end()) {
337 return;
338 }
339 dout(20) << __func__ << " " << pgid << " e" << old_map->get_epoch()
340 << " to e" << new_map->get_epoch()
341 << " pg_nums " << p->second << dendl;
342 deque<spg_t> queue;
343 queue.push_back(pgid);
eafe8130 344 set<spg_t> did;
11fdf7f2
TL
345 while (!queue.empty()) {
346 auto cur = queue.front();
347 queue.pop_front();
eafe8130 348 did.insert(cur);
11fdf7f2
TL
349 unsigned pgnum = old_pgnum;
350 for (auto q = p->second.lower_bound(old_map->get_epoch());
351 q != p->second.end() &&
352 q->first <= new_map->get_epoch();
353 ++q) {
354 if (pgnum < q->second) {
355 // split?
356 if (cur.ps() < pgnum) {
357 set<spg_t> children;
358 if (cur.is_split(pgnum, q->second, &children)) {
359 dout(20) << __func__ << " " << cur << " e" << q->first
360 << " pg_num " << pgnum << " -> " << q->second
361 << " children " << children << dendl;
362 for (auto i : children) {
363 split_children->insert(make_pair(i, q->first));
eafe8130
TL
364 if (!did.count(i))
365 queue.push_back(i);
11fdf7f2
TL
366 }
367 }
368 } else if (cur.ps() < q->second) {
369 dout(20) << __func__ << " " << cur << " e" << q->first
370 << " pg_num " << pgnum << " -> " << q->second
371 << " is a child" << dendl;
372 // normally we'd capture this from the parent, but it's
373 // possible the parent doesn't exist yet (it will be
374 // fabricated to allow an intervening merge). note this PG
375 // as a split child here to be sure we catch it.
376 split_children->insert(make_pair(cur, q->first));
377 } else {
378 dout(20) << __func__ << " " << cur << " e" << q->first
379 << " pg_num " << pgnum << " -> " << q->second
380 << " is post-split, skipping" << dendl;
381 }
382 } else if (merge_pgs) {
383 // merge?
384 if (cur.ps() >= q->second) {
385 if (cur.ps() < pgnum) {
386 spg_t parent;
387 if (cur.is_merge_source(pgnum, q->second, &parent)) {
388 set<spg_t> children;
389 parent.is_split(q->second, pgnum, &children);
390 dout(20) << __func__ << " " << cur << " e" << q->first
391 << " pg_num " << pgnum << " -> " << q->second
392 << " is merge source, target " << parent
393 << ", source(s) " << children << dendl;
394 merge_pgs->insert(make_pair(parent, q->first));
eafe8130
TL
395 if (!did.count(parent)) {
396 // queue (and re-scan) parent in case it might not exist yet
397 // and there are some future splits pending on it
398 queue.push_back(parent);
399 }
11fdf7f2
TL
400 for (auto c : children) {
401 merge_pgs->insert(make_pair(c, q->first));
eafe8130
TL
402 if (!did.count(c))
403 queue.push_back(c);
11fdf7f2
TL
404 }
405 }
406 } else {
407 dout(20) << __func__ << " " << cur << " e" << q->first
408 << " pg_num " << pgnum << " -> " << q->second
409 << " is beyond old pgnum, skipping" << dendl;
410 }
411 } else {
412 set<spg_t> children;
413 if (cur.is_split(q->second, pgnum, &children)) {
414 dout(20) << __func__ << " " << cur << " e" << q->first
415 << " pg_num " << pgnum << " -> " << q->second
416 << " is merge target, source " << children << dendl;
417 for (auto c : children) {
418 merge_pgs->insert(make_pair(c, q->first));
eafe8130
TL
419 if (!did.count(c))
420 queue.push_back(c);
11fdf7f2
TL
421 }
422 merge_pgs->insert(make_pair(cur, q->first));
423 }
7c673cae
FG
424 }
425 }
11fdf7f2 426 pgnum = q->second;
7c673cae
FG
427 }
428 }
429}
430
7c673cae
FG
431void OSDService::need_heartbeat_peer_update()
432{
433 osd->need_heartbeat_peer_update();
434}
435
9f95a23c
TL
436HeartbeatStampsRef OSDService::get_hb_stamps(unsigned peer)
437{
438 std::lock_guard l(hb_stamp_lock);
439 if (peer >= hb_stamps.size()) {
440 hb_stamps.resize(peer + 1);
441 }
442 if (!hb_stamps[peer]) {
443 hb_stamps[peer] = ceph::make_ref<HeartbeatStamps>(peer);
444 }
445 return hb_stamps[peer];
446}
447
448void OSDService::queue_renew_lease(epoch_t epoch, spg_t spgid)
449{
450 osd->enqueue_peering_evt(
451 spgid,
452 PGPeeringEventRef(
453 std::make_shared<PGPeeringEvent>(
454 epoch, epoch,
455 RenewLease())));
456}
457
7c673cae
FG
458void OSDService::start_shutdown()
459{
460 {
11fdf7f2 461 std::lock_guard l(agent_timer_lock);
7c673cae
FG
462 agent_timer.shutdown();
463 }
31f18b77
FG
464
465 {
11fdf7f2
TL
466 std::lock_guard l(sleep_lock);
467 sleep_timer.shutdown();
31f18b77 468 }
81eedcae
TL
469
470 {
471 std::lock_guard l(recovery_request_lock);
472 recovery_request_timer.shutdown();
473 }
7c673cae
FG
474}
475
31f18b77 476void OSDService::shutdown_reserver()
7c673cae
FG
477{
478 reserver_finisher.wait_for_empty();
479 reserver_finisher.stop();
31f18b77
FG
480}
481
482void OSDService::shutdown()
483{
9f95a23c
TL
484 mono_timer.suspend();
485
7c673cae 486 {
11fdf7f2 487 std::lock_guard l(watch_lock);
7c673cae
FG
488 watch_timer.shutdown();
489 }
490
491 objecter->shutdown();
9f95a23c 492 for (auto& f : objecter_finishers) {
11fdf7f2
TL
493 f->wait_for_empty();
494 f->stop();
7c673cae
FG
495 }
496
11fdf7f2 497 publish_map(OSDMapRef());
7c673cae
FG
498 next_osdmap = OSDMapRef();
499}
500
501void OSDService::init()
502{
503 reserver_finisher.start();
9f95a23c 504 for (auto& f : objecter_finishers) {
11fdf7f2
TL
505 f->start();
506 }
7c673cae
FG
507 objecter->set_client_incarnation(0);
508
509 // deprioritize objecter in daemonperf output
510 objecter->get_logger()->set_prio_adjust(-3);
511
512 watch_timer.init();
513 agent_timer.init();
9f95a23c 514 mono_timer.resume();
7c673cae
FG
515
516 agent_thread.create("osd_srv_agent");
517
518 if (cct->_conf->osd_recovery_delay_start)
519 defer_recovery(cct->_conf->osd_recovery_delay_start);
520}
521
522void OSDService::final_init()
523{
524 objecter->start(osdmap.get());
525}
526
527void OSDService::activate_map()
528{
529 // wake/unwake the tiering agent
9f95a23c 530 std::lock_guard l{agent_lock};
7c673cae
FG
531 agent_active =
532 !osdmap->test_flag(CEPH_OSDMAP_NOTIERAGENT) &&
533 osd->is_active();
9f95a23c 534 agent_cond.notify_all();
7c673cae
FG
535}
536
181888fb
FG
537void OSDService::request_osdmap_update(epoch_t e)
538{
539 osd->osdmap_subscribe(e, false);
540}
541
9f95a23c 542
7c673cae
FG
543class AgentTimeoutCB : public Context {
544 PGRef pg;
545public:
546 explicit AgentTimeoutCB(PGRef _pg) : pg(_pg) {}
547 void finish(int) override {
548 pg->agent_choose_mode_restart();
549 }
550};
551
552void OSDService::agent_entry()
553{
554 dout(10) << __func__ << " start" << dendl;
9f95a23c 555 std::unique_lock agent_locker{agent_lock};
7c673cae
FG
556
557 while (!agent_stop_flag) {
558 if (agent_queue.empty()) {
559 dout(20) << __func__ << " empty queue" << dendl;
9f95a23c 560 agent_cond.wait(agent_locker);
7c673cae
FG
561 continue;
562 }
563 uint64_t level = agent_queue.rbegin()->first;
564 set<PGRef>& top = agent_queue.rbegin()->second;
565 dout(10) << __func__
566 << " tiers " << agent_queue.size()
567 << ", top is " << level
568 << " with pgs " << top.size()
569 << ", ops " << agent_ops << "/"
570 << cct->_conf->osd_agent_max_ops
571 << (agent_active ? " active" : " NOT ACTIVE")
572 << dendl;
573 dout(20) << __func__ << " oids " << agent_oids << dendl;
574 int max = cct->_conf->osd_agent_max_ops - agent_ops;
575 int agent_flush_quota = max;
576 if (!flush_mode_high_count)
577 agent_flush_quota = cct->_conf->osd_agent_max_low_ops - agent_ops;
578 if (agent_flush_quota <= 0 || top.empty() || !agent_active) {
9f95a23c 579 agent_cond.wait(agent_locker);
7c673cae
FG
580 continue;
581 }
582
583 if (!agent_valid_iterator || agent_queue_pos == top.end()) {
584 agent_queue_pos = top.begin();
585 agent_valid_iterator = true;
586 }
587 PGRef pg = *agent_queue_pos;
588 dout(10) << "high_count " << flush_mode_high_count
589 << " agent_ops " << agent_ops
590 << " flush_quota " << agent_flush_quota << dendl;
9f95a23c 591 agent_locker.unlock();
7c673cae 592 if (!pg->agent_work(max, agent_flush_quota)) {
11fdf7f2 593 dout(10) << __func__ << " " << pg->pg_id
7c673cae
FG
594 << " no agent_work, delay for " << cct->_conf->osd_agent_delay_time
595 << " seconds" << dendl;
596
597 osd->logger->inc(l_osd_tier_delay);
598 // Queue a timer to call agent_choose_mode for this pg in 5 seconds
9f95a23c 599 std::lock_guard timer_locker{agent_timer_lock};
7c673cae
FG
600 Context *cb = new AgentTimeoutCB(pg);
601 agent_timer.add_event_after(cct->_conf->osd_agent_delay_time, cb);
7c673cae 602 }
9f95a23c 603 agent_locker.lock();
7c673cae 604 }
7c673cae
FG
605 dout(10) << __func__ << " finish" << dendl;
606}
607
608void OSDService::agent_stop()
609{
610 {
11fdf7f2 611 std::lock_guard l(agent_lock);
7c673cae
FG
612
613 // By this time all ops should be cancelled
11fdf7f2 614 ceph_assert(agent_ops == 0);
7c673cae
FG
615 // By this time all PGs are shutdown and dequeued
616 if (!agent_queue.empty()) {
617 set<PGRef>& top = agent_queue.rbegin()->second;
11fdf7f2
TL
618 derr << "agent queue not empty, for example " << (*top.begin())->get_pgid() << dendl;
619 ceph_abort_msg("agent queue not empty");
7c673cae
FG
620 }
621
622 agent_stop_flag = true;
9f95a23c 623 agent_cond.notify_all();
7c673cae
FG
624 }
625 agent_thread.join();
626}
627
628// -------------------------------------
629
630void OSDService::promote_throttle_recalibrate()
631{
632 utime_t now = ceph_clock_now();
633 double dur = now - last_recalibrate;
634 last_recalibrate = now;
635 unsigned prob = promote_probability_millis;
636
637 uint64_t target_obj_sec = cct->_conf->osd_tier_promote_max_objects_sec;
638 uint64_t target_bytes_sec = cct->_conf->osd_tier_promote_max_bytes_sec;
639
640 unsigned min_prob = 1;
641
642 uint64_t attempts, obj, bytes;
643 promote_counter.sample_and_attenuate(&attempts, &obj, &bytes);
644 dout(10) << __func__ << " " << attempts << " attempts, promoted "
1adf2230 645 << obj << " objects and " << byte_u_t(bytes) << "; target "
7c673cae 646 << target_obj_sec << " obj/sec or "
1adf2230 647 << byte_u_t(target_bytes_sec) << "/sec"
7c673cae
FG
648 << dendl;
649
650 // calculate what the probability *should* be, given the targets
651 unsigned new_prob;
652 if (attempts && dur > 0) {
653 uint64_t avg_size = 1;
654 if (obj)
11fdf7f2 655 avg_size = std::max<uint64_t>(bytes / obj, 1);
7c673cae
FG
656 unsigned po = (double)target_obj_sec * dur * 1000.0 / (double)attempts;
657 unsigned pb = (double)target_bytes_sec / (double)avg_size * dur * 1000.0
658 / (double)attempts;
659 dout(20) << __func__ << " po " << po << " pb " << pb << " avg_size "
660 << avg_size << dendl;
661 if (target_obj_sec && target_bytes_sec)
11fdf7f2 662 new_prob = std::min(po, pb);
7c673cae
FG
663 else if (target_obj_sec)
664 new_prob = po;
665 else if (target_bytes_sec)
666 new_prob = pb;
667 else
668 new_prob = 1000;
669 } else {
670 new_prob = 1000;
671 }
672 dout(20) << __func__ << " new_prob " << new_prob << dendl;
673
674 // correct for persistent skew between target rate and actual rate, adjust
675 double ratio = 1.0;
676 unsigned actual = 0;
677 if (attempts && obj) {
678 actual = obj * 1000 / attempts;
679 ratio = (double)actual / (double)prob;
680 new_prob = (double)new_prob / ratio;
681 }
11fdf7f2
TL
682 new_prob = std::max(new_prob, min_prob);
683 new_prob = std::min(new_prob, 1000u);
7c673cae
FG
684
685 // adjust
686 prob = (prob + new_prob) / 2;
11fdf7f2
TL
687 prob = std::max(prob, min_prob);
688 prob = std::min(prob, 1000u);
7c673cae
FG
689 dout(10) << __func__ << " actual " << actual
690 << ", actual/prob ratio " << ratio
691 << ", adjusted new_prob " << new_prob
692 << ", prob " << promote_probability_millis << " -> " << prob
693 << dendl;
694 promote_probability_millis = prob;
695
696 // set hard limits for this interval to mitigate stampedes
91327a77
AA
697 promote_max_objects = target_obj_sec * osd->OSD_TICK_INTERVAL * 2;
698 promote_max_bytes = target_bytes_sec * osd->OSD_TICK_INTERVAL * 2;
7c673cae
FG
699}
700
701// -------------------------------------
702
703float OSDService::get_failsafe_full_ratio()
704{
705 float full_ratio = cct->_conf->osd_failsafe_full_ratio;
706 if (full_ratio > 1.0) full_ratio /= 100.0;
707 return full_ratio;
708}
709
11fdf7f2 710OSDService::s_names OSDService::recalc_full_state(float ratio, float pratio, string &inject)
7c673cae 711{
7c673cae
FG
712 // The OSDMap ratios take precendence. So if the failsafe is .95 and
713 // the admin sets the cluster full to .96, the failsafe moves up to .96
714 // too. (Not that having failsafe == full is ideal, but it's better than
715 // dropping writes before the clusters appears full.)
716 OSDMapRef osdmap = get_osdmap();
717 if (!osdmap || osdmap->get_epoch() == 0) {
11fdf7f2 718 return NONE;
7c673cae
FG
719 }
720 float nearfull_ratio = osdmap->get_nearfull_ratio();
721 float backfillfull_ratio = std::max(osdmap->get_backfillfull_ratio(), nearfull_ratio);
722 float full_ratio = std::max(osdmap->get_full_ratio(), backfillfull_ratio);
723 float failsafe_ratio = std::max(get_failsafe_full_ratio(), full_ratio);
724
9f95a23c 725 if (osdmap->require_osd_release < ceph_release_t::luminous) {
7c673cae
FG
726 // use the failsafe for nearfull and full; the mon isn't using the
727 // flags anyway because we're mid-upgrade.
728 full_ratio = failsafe_ratio;
729 backfillfull_ratio = failsafe_ratio;
730 nearfull_ratio = failsafe_ratio;
731 } else if (full_ratio <= 0 ||
732 backfillfull_ratio <= 0 ||
733 nearfull_ratio <= 0) {
734 derr << __func__ << " full_ratio, backfillfull_ratio or nearfull_ratio is <= 0" << dendl;
735 // use failsafe flag. ick. the monitor did something wrong or the user
736 // did something stupid.
737 full_ratio = failsafe_ratio;
738 backfillfull_ratio = failsafe_ratio;
739 nearfull_ratio = failsafe_ratio;
740 }
741
7c673cae 742 if (injectfull_state > NONE && injectfull) {
7c673cae 743 inject = "(Injected)";
11fdf7f2
TL
744 return injectfull_state;
745 } else if (pratio > failsafe_ratio) {
746 return FAILSAFE;
7c673cae 747 } else if (ratio > full_ratio) {
11fdf7f2 748 return FULL;
7c673cae 749 } else if (ratio > backfillfull_ratio) {
11fdf7f2 750 return BACKFILLFULL;
92f5a8d4 751 } else if (pratio > nearfull_ratio) {
11fdf7f2 752 return NEARFULL;
7c673cae 753 }
11fdf7f2
TL
754 return NONE;
755}
756
757void OSDService::check_full_status(float ratio, float pratio)
758{
759 std::lock_guard l(full_status_lock);
760
761 cur_ratio = ratio;
762 physical_ratio = pratio;
763
764 string inject;
765 s_names new_state;
766 new_state = recalc_full_state(ratio, pratio, inject);
767
7c673cae 768 dout(20) << __func__ << " cur ratio " << ratio
11fdf7f2 769 << ", physical ratio " << pratio
7c673cae
FG
770 << ", new state " << get_full_state_name(new_state)
771 << " " << inject
772 << dendl;
773
774 // warn
775 if (cur_state != new_state) {
776 dout(10) << __func__ << " " << get_full_state_name(cur_state)
777 << " -> " << get_full_state_name(new_state) << dendl;
778 if (new_state == FAILSAFE) {
c07f9fc5 779 clog->error() << "full status failsafe engaged, dropping updates, now "
7c673cae
FG
780 << (int)roundf(ratio * 100) << "% full";
781 } else if (cur_state == FAILSAFE) {
c07f9fc5
FG
782 clog->error() << "full status failsafe disengaged, no longer dropping "
783 << "updates, now " << (int)roundf(ratio * 100) << "% full";
7c673cae
FG
784 }
785 cur_state = new_state;
786 }
787}
788
789bool OSDService::need_fullness_update()
790{
791 OSDMapRef osdmap = get_osdmap();
792 s_names cur = NONE;
793 if (osdmap->exists(whoami)) {
794 if (osdmap->get_state(whoami) & CEPH_OSD_FULL) {
795 cur = FULL;
796 } else if (osdmap->get_state(whoami) & CEPH_OSD_BACKFILLFULL) {
797 cur = BACKFILLFULL;
798 } else if (osdmap->get_state(whoami) & CEPH_OSD_NEARFULL) {
799 cur = NEARFULL;
800 }
801 }
802 s_names want = NONE;
803 if (is_full())
804 want = FULL;
805 else if (is_backfillfull())
806 want = BACKFILLFULL;
807 else if (is_nearfull())
808 want = NEARFULL;
809 return want != cur;
810}
811
11fdf7f2 812bool OSDService::_check_inject_full(DoutPrefixProvider *dpp, s_names type) const
7c673cae 813{
7c673cae
FG
814 if (injectfull && injectfull_state >= type) {
815 // injectfull is either a count of the number of times to return failsafe full
816 // or if -1 then always return full
817 if (injectfull > 0)
818 --injectfull;
11fdf7f2
TL
819 ldpp_dout(dpp, 10) << __func__ << " Injected " << get_full_state_name(type) << " OSD ("
820 << (injectfull < 0 ? "set" : std::to_string(injectfull)) << ")"
821 << dendl;
7c673cae
FG
822 return true;
823 }
11fdf7f2
TL
824 return false;
825}
826
827bool OSDService::_check_full(DoutPrefixProvider *dpp, s_names type) const
828{
829 std::lock_guard l(full_status_lock);
830
831 if (_check_inject_full(dpp, type))
832 return true;
833
834 if (cur_state >= type)
835 ldpp_dout(dpp, 10) << __func__ << " current usage is " << cur_ratio
836 << " physical " << physical_ratio << dendl;
7c673cae 837
7c673cae
FG
838 return cur_state >= type;
839}
840
11fdf7f2
TL
841bool OSDService::_tentative_full(DoutPrefixProvider *dpp, s_names type, uint64_t adjust_used, osd_stat_t adjusted_stat)
842{
843 ldpp_dout(dpp, 20) << __func__ << " type " << get_full_state_name(type) << " adjust_used " << (adjust_used >> 10) << "KiB" << dendl;
844 {
845 std::lock_guard l(full_status_lock);
846 if (_check_inject_full(dpp, type)) {
847 return true;
848 }
849 }
850
851 float pratio;
852 float ratio = compute_adjusted_ratio(adjusted_stat, &pratio, adjust_used);
853
854 string notused;
855 s_names tentative_state = recalc_full_state(ratio, pratio, notused);
856
857 if (tentative_state >= type)
858 ldpp_dout(dpp, 10) << __func__ << " tentative usage is " << ratio << dendl;
859
860 return tentative_state >= type;
861}
862
863bool OSDService::check_failsafe_full(DoutPrefixProvider *dpp) const
864{
865 return _check_full(dpp, FAILSAFE);
866}
867
868bool OSDService::check_full(DoutPrefixProvider *dpp) const
7c673cae 869{
11fdf7f2 870 return _check_full(dpp, FULL);
7c673cae
FG
871}
872
11fdf7f2 873bool OSDService::tentative_backfill_full(DoutPrefixProvider *dpp, uint64_t adjust_used, osd_stat_t stats)
7c673cae 874{
11fdf7f2 875 return _tentative_full(dpp, BACKFILLFULL, adjust_used, stats);
7c673cae
FG
876}
877
11fdf7f2 878bool OSDService::check_backfill_full(DoutPrefixProvider *dpp) const
7c673cae 879{
11fdf7f2 880 return _check_full(dpp, BACKFILLFULL);
7c673cae
FG
881}
882
11fdf7f2 883bool OSDService::check_nearfull(DoutPrefixProvider *dpp) const
7c673cae 884{
11fdf7f2 885 return _check_full(dpp, NEARFULL);
7c673cae
FG
886}
887
888bool OSDService::is_failsafe_full() const
889{
11fdf7f2 890 std::lock_guard l(full_status_lock);
7c673cae
FG
891 return cur_state == FAILSAFE;
892}
893
894bool OSDService::is_full() const
895{
11fdf7f2 896 std::lock_guard l(full_status_lock);
7c673cae
FG
897 return cur_state >= FULL;
898}
899
900bool OSDService::is_backfillfull() const
901{
11fdf7f2 902 std::lock_guard l(full_status_lock);
7c673cae
FG
903 return cur_state >= BACKFILLFULL;
904}
905
906bool OSDService::is_nearfull() const
907{
11fdf7f2 908 std::lock_guard l(full_status_lock);
7c673cae
FG
909 return cur_state >= NEARFULL;
910}
911
912void OSDService::set_injectfull(s_names type, int64_t count)
913{
11fdf7f2 914 std::lock_guard l(full_status_lock);
7c673cae
FG
915 injectfull_state = type;
916 injectfull = count;
917}
918
11fdf7f2
TL
919void OSDService::set_statfs(const struct store_statfs_t &stbuf,
920 osd_alert_list_t& alerts)
7c673cae 921{
224ce89b 922 uint64_t bytes = stbuf.total;
224ce89b 923 uint64_t avail = stbuf.available;
11fdf7f2
TL
924 uint64_t used = stbuf.get_used_raw();
925
926 // For testing fake statfs values so it doesn't matter if all
927 // OSDs are using the same partition.
928 if (cct->_conf->fake_statfs_for_testing) {
929 uint64_t total_num_bytes = 0;
930 vector<PGRef> pgs;
931 osd->_get_pgs(&pgs);
932 for (auto p : pgs) {
933 total_num_bytes += p->get_stats_num_bytes();
934 }
935 bytes = cct->_conf->fake_statfs_for_testing;
936 if (total_num_bytes < bytes)
937 avail = bytes - total_num_bytes;
938 else
939 avail = 0;
940 dout(0) << __func__ << " fake total " << cct->_conf->fake_statfs_for_testing
941 << " adjust available " << avail
942 << dendl;
943 used = bytes - avail;
944 }
7c673cae 945
224ce89b
WB
946 osd->logger->set(l_osd_stat_bytes, bytes);
947 osd->logger->set(l_osd_stat_bytes_used, used);
948 osd->logger->set(l_osd_stat_bytes_avail, avail);
7c673cae 949
11fdf7f2
TL
950 std::lock_guard l(stat_lock);
951 osd_stat.statfs = stbuf;
952 osd_stat.os_alerts.clear();
953 osd_stat.os_alerts[whoami].swap(alerts);
954 if (cct->_conf->fake_statfs_for_testing) {
955 osd_stat.statfs.total = bytes;
956 osd_stat.statfs.available = avail;
957 // For testing don't want used to go negative, so clear reserved
958 osd_stat.statfs.internally_reserved = 0;
224ce89b
WB
959 }
960}
7c673cae 961
11fdf7f2
TL
962osd_stat_t OSDService::set_osd_stat(vector<int>& hb_peers,
963 int num_pgs)
224ce89b 964{
eafe8130
TL
965 utime_t now = ceph_clock_now();
966 auto stale_time = g_conf().get_val<int64_t>("osd_mon_heartbeat_stat_stale");
11fdf7f2
TL
967 std::lock_guard l(stat_lock);
968 osd_stat.hb_peers.swap(hb_peers);
969 osd->op_tracker.get_age_ms_histogram(&osd_stat.op_queue_age_hist);
970 osd_stat.num_pgs = num_pgs;
eafe8130
TL
971 // Clean entries that aren't updated
972 // This is called often enough that we can just remove 1 at a time
973 for (auto i: osd_stat.hb_pingtime) {
974 if (i.second.last_update == 0)
975 continue;
976 if (stale_time && now.sec() - i.second.last_update > stale_time) {
977 dout(20) << __func__ << " time out heartbeat for osd " << i.first
978 << " last_update " << i.second.last_update << dendl;
979 osd_stat.hb_pingtime.erase(i.first);
980 break;
981 }
982 }
11fdf7f2
TL
983 return osd_stat;
984}
985
986void OSDService::inc_osd_stat_repaired()
987{
988 std::lock_guard l(stat_lock);
989 osd_stat.num_shards_repaired++;
990 return;
991}
992
993float OSDService::compute_adjusted_ratio(osd_stat_t new_stat, float *pratio,
994 uint64_t adjust_used)
995{
996 *pratio =
997 ((float)new_stat.statfs.get_used()) / ((float)new_stat.statfs.total);
998
999 if (adjust_used) {
1000 dout(20) << __func__ << " Before kb_used() " << new_stat.statfs.kb_used() << dendl;
1001 if (new_stat.statfs.available > adjust_used)
1002 new_stat.statfs.available -= adjust_used;
1003 else
1004 new_stat.statfs.available = 0;
1005 dout(20) << __func__ << " After kb_used() " << new_stat.statfs.kb_used() << dendl;
7c673cae
FG
1006 }
1007
11fdf7f2
TL
1008 // Check all pgs and adjust kb_used to include all pending backfill data
1009 int backfill_adjusted = 0;
1010 vector<PGRef> pgs;
1011 osd->_get_pgs(&pgs);
1012 for (auto p : pgs) {
1013 backfill_adjusted += p->pg_stat_adjust(&new_stat);
1014 }
1015 if (backfill_adjusted) {
1016 dout(20) << __func__ << " backfill adjusted " << new_stat << dendl;
1017 }
1018 return ((float)new_stat.statfs.get_used()) / ((float)new_stat.statfs.total);
7c673cae
FG
1019}
1020
7c673cae
FG
1021void OSDService::send_message_osd_cluster(int peer, Message *m, epoch_t from_epoch)
1022{
1023 OSDMapRef next_map = get_nextmap_reserved();
1024 // service map is always newer/newest
11fdf7f2 1025 ceph_assert(from_epoch <= next_map->get_epoch());
7c673cae
FG
1026
1027 if (next_map->is_down(peer) ||
1028 next_map->get_info(peer).up_from > from_epoch) {
1029 m->put();
1030 release_map(next_map);
1031 return;
1032 }
9f95a23c
TL
1033 ConnectionRef peer_con;
1034 if (peer == whoami) {
1035 peer_con = osd->cluster_messenger->get_loopback_connection();
1036 } else {
1037 peer_con = osd->cluster_messenger->connect_to_osd(
1038 next_map->get_cluster_addrs(peer), false, true);
1039 }
1040 maybe_share_map(peer_con.get(), next_map);
7c673cae
FG
1041 peer_con->send_message(m);
1042 release_map(next_map);
1043}
1044
9f95a23c
TL
1045void OSDService::send_message_osd_cluster(std::vector<std::pair<int, Message*>>& messages, epoch_t from_epoch)
1046{
1047 OSDMapRef next_map = get_nextmap_reserved();
1048 // service map is always newer/newest
1049 ceph_assert(from_epoch <= next_map->get_epoch());
1050
1051 for (auto& iter : messages) {
1052 if (next_map->is_down(iter.first) ||
1053 next_map->get_info(iter.first).up_from > from_epoch) {
1054 iter.second->put();
1055 continue;
1056 }
1057 ConnectionRef peer_con;
1058 if (iter.first == whoami) {
1059 peer_con = osd->cluster_messenger->get_loopback_connection();
1060 } else {
1061 peer_con = osd->cluster_messenger->connect_to_osd(
1062 next_map->get_cluster_addrs(iter.first), false, true);
1063 }
1064 maybe_share_map(peer_con.get(), next_map);
1065 peer_con->send_message(iter.second);
1066 }
1067 release_map(next_map);
1068}
7c673cae
FG
1069ConnectionRef OSDService::get_con_osd_cluster(int peer, epoch_t from_epoch)
1070{
1071 OSDMapRef next_map = get_nextmap_reserved();
1072 // service map is always newer/newest
11fdf7f2 1073 ceph_assert(from_epoch <= next_map->get_epoch());
7c673cae
FG
1074
1075 if (next_map->is_down(peer) ||
1076 next_map->get_info(peer).up_from > from_epoch) {
1077 release_map(next_map);
1078 return NULL;
1079 }
9f95a23c
TL
1080 ConnectionRef con;
1081 if (peer == whoami) {
1082 con = osd->cluster_messenger->get_loopback_connection();
1083 } else {
1084 con = osd->cluster_messenger->connect_to_osd(
1085 next_map->get_cluster_addrs(peer), false, true);
1086 }
7c673cae
FG
1087 release_map(next_map);
1088 return con;
1089}
1090
1091pair<ConnectionRef,ConnectionRef> OSDService::get_con_osd_hb(int peer, epoch_t from_epoch)
1092{
1093 OSDMapRef next_map = get_nextmap_reserved();
1094 // service map is always newer/newest
11fdf7f2 1095 ceph_assert(from_epoch <= next_map->get_epoch());
7c673cae
FG
1096
1097 pair<ConnectionRef,ConnectionRef> ret;
1098 if (next_map->is_down(peer) ||
1099 next_map->get_info(peer).up_from > from_epoch) {
1100 release_map(next_map);
1101 return ret;
1102 }
11fdf7f2
TL
1103 ret.first = osd->hb_back_client_messenger->connect_to_osd(
1104 next_map->get_hb_back_addrs(peer));
1105 ret.second = osd->hb_front_client_messenger->connect_to_osd(
1106 next_map->get_hb_front_addrs(peer));
7c673cae
FG
1107 release_map(next_map);
1108 return ret;
1109}
1110
11fdf7f2
TL
1111entity_name_t OSDService::get_cluster_msgr_name() const
1112{
1113 return cluster_messenger->get_myname();
1114}
7c673cae 1115
94b18763
FG
1116void OSDService::queue_want_pg_temp(pg_t pgid,
1117 const vector<int>& want,
1118 bool forced)
7c673cae 1119{
11fdf7f2 1120 std::lock_guard l(pg_temp_lock);
94b18763 1121 auto p = pg_temp_pending.find(pgid);
7c673cae 1122 if (p == pg_temp_pending.end() ||
94b18763
FG
1123 p->second.acting != want ||
1124 forced) {
11fdf7f2 1125 pg_temp_wanted[pgid] = {want, forced};
7c673cae
FG
1126 }
1127}
1128
1129void OSDService::remove_want_pg_temp(pg_t pgid)
1130{
11fdf7f2 1131 std::lock_guard l(pg_temp_lock);
7c673cae
FG
1132 pg_temp_wanted.erase(pgid);
1133 pg_temp_pending.erase(pgid);
1134}
1135
1136void OSDService::_sent_pg_temp()
1137{
11fdf7f2
TL
1138#ifdef HAVE_STDLIB_MAP_SPLICING
1139 pg_temp_pending.merge(pg_temp_wanted);
1140#else
94b18763
FG
1141 pg_temp_pending.insert(make_move_iterator(begin(pg_temp_wanted)),
1142 make_move_iterator(end(pg_temp_wanted)));
11fdf7f2 1143#endif
7c673cae
FG
1144 pg_temp_wanted.clear();
1145}
1146
1147void OSDService::requeue_pg_temp()
1148{
11fdf7f2 1149 std::lock_guard l(pg_temp_lock);
7c673cae
FG
1150 // wanted overrides pending. note that remove_want_pg_temp
1151 // clears the item out of both.
1152 unsigned old_wanted = pg_temp_wanted.size();
1153 unsigned old_pending = pg_temp_pending.size();
1154 _sent_pg_temp();
1155 pg_temp_wanted.swap(pg_temp_pending);
1156 dout(10) << __func__ << " " << old_wanted << " + " << old_pending << " -> "
1157 << pg_temp_wanted.size() << dendl;
1158}
1159
94b18763
FG
1160std::ostream& operator<<(std::ostream& out,
1161 const OSDService::pg_temp_t& pg_temp)
1162{
1163 out << pg_temp.acting;
1164 if (pg_temp.forced) {
1165 out << " (forced)";
1166 }
1167 return out;
1168}
1169
7c673cae
FG
1170void OSDService::send_pg_temp()
1171{
11fdf7f2 1172 std::lock_guard l(pg_temp_lock);
7c673cae
FG
1173 if (pg_temp_wanted.empty())
1174 return;
1175 dout(10) << "send_pg_temp " << pg_temp_wanted << dendl;
94b18763 1176 MOSDPGTemp *ms[2] = {nullptr, nullptr};
11fdf7f2
TL
1177 for (auto& [pgid, pg_temp] : pg_temp_wanted) {
1178 auto& m = ms[pg_temp.forced];
94b18763
FG
1179 if (!m) {
1180 m = new MOSDPGTemp(osdmap->get_epoch());
11fdf7f2 1181 m->forced = pg_temp.forced;
94b18763 1182 }
11fdf7f2 1183 m->pg_temp.emplace(pgid, pg_temp.acting);
94b18763
FG
1184 }
1185 for (auto m : ms) {
1186 if (m) {
1187 monc->send_mon_message(m);
1188 }
1189 }
7c673cae
FG
1190 _sent_pg_temp();
1191}
1192
1193void OSDService::send_pg_created(pg_t pgid)
1194{
11fdf7f2 1195 std::lock_guard l(pg_created_lock);
7c673cae 1196 dout(20) << __func__ << dendl;
11fdf7f2 1197 auto o = get_osdmap();
9f95a23c 1198 if (o->require_osd_release >= ceph_release_t::luminous) {
11fdf7f2 1199 pg_created.insert(pgid);
c07f9fc5
FG
1200 monc->send_mon_message(new MOSDPGCreated(pgid));
1201 }
7c673cae
FG
1202}
1203
11fdf7f2
TL
1204void OSDService::send_pg_created()
1205{
1206 std::lock_guard l(pg_created_lock);
1207 dout(20) << __func__ << dendl;
1208 auto o = get_osdmap();
9f95a23c 1209 if (o->require_osd_release >= ceph_release_t::luminous) {
11fdf7f2
TL
1210 for (auto pgid : pg_created) {
1211 monc->send_mon_message(new MOSDPGCreated(pgid));
1212 }
1213 }
1214}
1215
1216void OSDService::prune_pg_created()
1217{
1218 std::lock_guard l(pg_created_lock);
1219 dout(20) << __func__ << dendl;
1220 auto o = get_osdmap();
1221 auto i = pg_created.begin();
1222 while (i != pg_created.end()) {
1223 auto p = o->get_pg_pool(i->pool());
1224 if (!p || !p->has_flag(pg_pool_t::FLAG_CREATING)) {
1225 dout(20) << __func__ << " pruning " << *i << dendl;
1226 i = pg_created.erase(i);
1227 } else {
1228 dout(20) << __func__ << " keeping " << *i << dendl;
1229 ++i;
1230 }
1231 }
1232}
1233
1234
7c673cae
FG
1235// --------------------------------------
1236// dispatch
1237
eafe8130 1238bool OSDService::can_inc_scrubs()
7c673cae
FG
1239{
1240 bool can_inc = false;
11fdf7f2 1241 std::lock_guard l(sched_scrub_lock);
7c673cae 1242
eafe8130
TL
1243 if (scrubs_local + scrubs_remote < cct->_conf->osd_max_scrubs) {
1244 dout(20) << __func__ << " == true " << scrubs_local << " local + " << scrubs_remote
1245 << " remote < max " << cct->_conf->osd_max_scrubs << dendl;
7c673cae
FG
1246 can_inc = true;
1247 } else {
eafe8130
TL
1248 dout(20) << __func__ << " == false " << scrubs_local << " local + " << scrubs_remote
1249 << " remote >= max " << cct->_conf->osd_max_scrubs << dendl;
7c673cae
FG
1250 }
1251
1252 return can_inc;
1253}
1254
eafe8130 1255bool OSDService::inc_scrubs_local()
7c673cae
FG
1256{
1257 bool result = false;
eafe8130
TL
1258 std::lock_guard l{sched_scrub_lock};
1259 if (scrubs_local + scrubs_remote < cct->_conf->osd_max_scrubs) {
1260 dout(20) << __func__ << " " << scrubs_local << " -> " << (scrubs_local+1)
1261 << " (max " << cct->_conf->osd_max_scrubs << ", remote " << scrubs_remote << ")" << dendl;
7c673cae 1262 result = true;
eafe8130 1263 ++scrubs_local;
7c673cae 1264 } else {
eafe8130 1265 dout(20) << __func__ << " " << scrubs_local << " local + " << scrubs_remote << " remote >= max " << cct->_conf->osd_max_scrubs << dendl;
7c673cae 1266 }
7c673cae
FG
1267 return result;
1268}
1269
eafe8130 1270void OSDService::dec_scrubs_local()
7c673cae 1271{
eafe8130
TL
1272 std::lock_guard l{sched_scrub_lock};
1273 dout(20) << __func__ << " " << scrubs_local << " -> " << (scrubs_local-1)
1274 << " (max " << cct->_conf->osd_max_scrubs << ", remote " << scrubs_remote << ")" << dendl;
1275 --scrubs_local;
1276 ceph_assert(scrubs_local >= 0);
7c673cae
FG
1277}
1278
eafe8130 1279bool OSDService::inc_scrubs_remote()
7c673cae 1280{
eafe8130
TL
1281 bool result = false;
1282 std::lock_guard l{sched_scrub_lock};
1283 if (scrubs_local + scrubs_remote < cct->_conf->osd_max_scrubs) {
1284 dout(20) << __func__ << " " << scrubs_remote << " -> " << (scrubs_remote+1)
1285 << " (max " << cct->_conf->osd_max_scrubs << ", local " << scrubs_local << ")" << dendl;
1286 result = true;
1287 ++scrubs_remote;
7c673cae 1288 } else {
eafe8130 1289 dout(20) << __func__ << " " << scrubs_local << " local + " << scrubs_remote << " remote >= max " << cct->_conf->osd_max_scrubs << dendl;
7c673cae 1290 }
eafe8130
TL
1291 return result;
1292}
1293
1294void OSDService::dec_scrubs_remote()
1295{
1296 std::lock_guard l{sched_scrub_lock};
1297 dout(20) << __func__ << " " << scrubs_remote << " -> " << (scrubs_remote-1)
1298 << " (max " << cct->_conf->osd_max_scrubs << ", local " << scrubs_local << ")" << dendl;
1299 --scrubs_remote;
1300 ceph_assert(scrubs_remote >= 0);
7c673cae
FG
1301}
1302
eafe8130 1303void OSDService::dump_scrub_reservations(Formatter *f)
7c673cae 1304{
eafe8130
TL
1305 std::lock_guard l{sched_scrub_lock};
1306 f->dump_int("scrubs_local", scrubs_local);
1307 f->dump_int("scrubs_remote", scrubs_remote);
1308 f->dump_int("osd_max_scrubs", cct->_conf->osd_max_scrubs);
7c673cae
FG
1309}
1310
1311void OSDService::retrieve_epochs(epoch_t *_boot_epoch, epoch_t *_up_epoch,
1312 epoch_t *_bind_epoch) const
1313{
11fdf7f2 1314 std::lock_guard l(epoch_lock);
7c673cae
FG
1315 if (_boot_epoch)
1316 *_boot_epoch = boot_epoch;
1317 if (_up_epoch)
1318 *_up_epoch = up_epoch;
1319 if (_bind_epoch)
1320 *_bind_epoch = bind_epoch;
1321}
1322
1323void OSDService::set_epochs(const epoch_t *_boot_epoch, const epoch_t *_up_epoch,
1324 const epoch_t *_bind_epoch)
1325{
11fdf7f2 1326 std::lock_guard l(epoch_lock);
7c673cae 1327 if (_boot_epoch) {
11fdf7f2 1328 ceph_assert(*_boot_epoch == 0 || *_boot_epoch >= boot_epoch);
7c673cae
FG
1329 boot_epoch = *_boot_epoch;
1330 }
1331 if (_up_epoch) {
11fdf7f2 1332 ceph_assert(*_up_epoch == 0 || *_up_epoch >= up_epoch);
7c673cae
FG
1333 up_epoch = *_up_epoch;
1334 }
1335 if (_bind_epoch) {
11fdf7f2 1336 ceph_assert(*_bind_epoch == 0 || *_bind_epoch >= bind_epoch);
7c673cae
FG
1337 bind_epoch = *_bind_epoch;
1338 }
1339}
1340
1341bool OSDService::prepare_to_stop()
1342{
9f95a23c 1343 std::unique_lock l(is_stopping_lock);
7c673cae
FG
1344 if (get_state() != NOT_STOPPING)
1345 return false;
1346
1347 OSDMapRef osdmap = get_osdmap();
1348 if (osdmap && osdmap->is_up(whoami)) {
1349 dout(0) << __func__ << " telling mon we are shutting down" << dendl;
1350 set_state(PREPARING_TO_STOP);
11fdf7f2
TL
1351 monc->send_mon_message(
1352 new MOSDMarkMeDown(
1353 monc->get_fsid(),
1354 whoami,
1355 osdmap->get_addrs(whoami),
1356 osdmap->get_epoch(),
1357 true // request ack
1358 ));
9f95a23c
TL
1359 const auto timeout = ceph::make_timespan(cct->_conf->osd_mon_shutdown_timeout);
1360 is_stopping_cond.wait_for(l, timeout,
1361 [this] { return get_state() == STOPPING; });
7c673cae
FG
1362 }
1363 dout(0) << __func__ << " starting shutdown" << dendl;
1364 set_state(STOPPING);
1365 return true;
1366}
1367
1368void OSDService::got_stop_ack()
1369{
9f95a23c 1370 std::scoped_lock l(is_stopping_lock);
7c673cae
FG
1371 if (get_state() == PREPARING_TO_STOP) {
1372 dout(0) << __func__ << " starting shutdown" << dendl;
1373 set_state(STOPPING);
9f95a23c 1374 is_stopping_cond.notify_all();
7c673cae
FG
1375 } else {
1376 dout(10) << __func__ << " ignoring msg" << dendl;
1377 }
1378}
1379
1380MOSDMap *OSDService::build_incremental_map_msg(epoch_t since, epoch_t to,
1381 OSDSuperblock& sblock)
1382{
28e407b8
AA
1383 MOSDMap *m = new MOSDMap(monc->get_fsid(),
1384 osdmap->get_encoding_features());
7c673cae
FG
1385 m->oldest_map = max_oldest_map;
1386 m->newest_map = sblock.newest_map;
1387
11fdf7f2
TL
1388 int max = cct->_conf->osd_map_message_max;
1389 ssize_t max_bytes = cct->_conf->osd_map_message_max_bytes;
1390
1391 if (since < m->oldest_map) {
1392 // we don't have the next map the target wants, so start with a
1393 // full map.
1394 bufferlist bl;
1395 dout(10) << __func__ << " oldest map " << max_oldest_map << " > since "
1396 << since << ", starting with full map" << dendl;
1397 since = m->oldest_map;
1398 if (!get_map_bl(since, bl)) {
1399 derr << __func__ << " missing full map " << since << dendl;
1400 goto panic;
1401 }
1402 max--;
1403 max_bytes -= bl.length();
1404 m->maps[since].claim(bl);
1405 }
1406 for (epoch_t e = since + 1; e <= to; ++e) {
7c673cae 1407 bufferlist bl;
11fdf7f2 1408 if (get_inc_map_bl(e, bl)) {
7c673cae 1409 m->incremental_maps[e].claim(bl);
11fdf7f2 1410 } else {
e306af50 1411 dout(10) << __func__ << " missing incremental map " << e << dendl;
11fdf7f2
TL
1412 if (!get_map_bl(e, bl)) {
1413 derr << __func__ << " also missing full map " << e << dendl;
1414 goto panic;
1415 }
7c673cae 1416 m->maps[e].claim(bl);
11fdf7f2
TL
1417 }
1418 max--;
1419 max_bytes -= bl.length();
1420 if (max <= 0 || max_bytes <= 0) {
7c673cae 1421 break;
11fdf7f2
TL
1422 }
1423 }
1424 return m;
1425
1426 panic:
1427 if (!m->maps.empty() ||
1428 !m->incremental_maps.empty()) {
1429 // send what we have so far
1430 return m;
1431 }
1432 // send something
1433 bufferlist bl;
1434 if (get_inc_map_bl(m->newest_map, bl)) {
1435 m->incremental_maps[m->newest_map].claim(bl);
1436 } else {
1437 derr << __func__ << " unable to load latest map " << m->newest_map << dendl;
1438 if (!get_map_bl(m->newest_map, bl)) {
1439 derr << __func__ << " unable to load latest full map " << m->newest_map
7c673cae 1440 << dendl;
11fdf7f2 1441 ceph_abort();
7c673cae 1442 }
11fdf7f2 1443 m->maps[m->newest_map].claim(bl);
7c673cae
FG
1444 }
1445 return m;
1446}
1447
1448void OSDService::send_map(MOSDMap *m, Connection *con)
1449{
1450 con->send_message(m);
1451}
1452
1453void OSDService::send_incremental_map(epoch_t since, Connection *con,
9f95a23c 1454 const OSDMapRef& osdmap)
7c673cae
FG
1455{
1456 epoch_t to = osdmap->get_epoch();
1457 dout(10) << "send_incremental_map " << since << " -> " << to
1458 << " to " << con << " " << con->get_peer_addr() << dendl;
1459
1460 MOSDMap *m = NULL;
1461 while (!m) {
1462 OSDSuperblock sblock(get_superblock());
1463 if (since < sblock.oldest_map) {
1464 // just send latest full map
28e407b8
AA
1465 MOSDMap *m = new MOSDMap(monc->get_fsid(),
1466 osdmap->get_encoding_features());
7c673cae
FG
1467 m->oldest_map = max_oldest_map;
1468 m->newest_map = sblock.newest_map;
1469 get_map_bl(to, m->maps[to]);
1470 send_map(m, con);
1471 return;
1472 }
1473
1474 if (to > since && (int64_t)(to - since) > cct->_conf->osd_map_share_max_epochs) {
1475 dout(10) << " " << (to - since) << " > max " << cct->_conf->osd_map_share_max_epochs
1476 << ", only sending most recent" << dendl;
1477 since = to - cct->_conf->osd_map_share_max_epochs;
1478 }
1479
7c673cae
FG
1480 m = build_incremental_map_msg(since, to, sblock);
1481 }
1482 send_map(m, con);
1483}
1484
1485bool OSDService::_get_map_bl(epoch_t e, bufferlist& bl)
1486{
1487 bool found = map_bl_cache.lookup(e, &bl);
31f18b77
FG
1488 if (found) {
1489 if (logger)
1490 logger->inc(l_osd_map_bl_cache_hit);
7c673cae 1491 return true;
31f18b77
FG
1492 }
1493 if (logger)
1494 logger->inc(l_osd_map_bl_cache_miss);
11fdf7f2 1495 found = store->read(meta_ch,
31f18b77
FG
1496 OSD::get_osdmap_pobject_name(e), 0, 0, bl,
1497 CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) >= 0;
1498 if (found) {
7c673cae 1499 _add_map_bl(e, bl);
31f18b77 1500 }
7c673cae
FG
1501 return found;
1502}
1503
1504bool OSDService::get_inc_map_bl(epoch_t e, bufferlist& bl)
1505{
11fdf7f2 1506 std::lock_guard l(map_cache_lock);
7c673cae 1507 bool found = map_bl_inc_cache.lookup(e, &bl);
31f18b77
FG
1508 if (found) {
1509 if (logger)
1510 logger->inc(l_osd_map_bl_cache_hit);
7c673cae 1511 return true;
31f18b77
FG
1512 }
1513 if (logger)
1514 logger->inc(l_osd_map_bl_cache_miss);
11fdf7f2 1515 found = store->read(meta_ch,
31f18b77
FG
1516 OSD::get_inc_osdmap_pobject_name(e), 0, 0, bl,
1517 CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) >= 0;
1518 if (found) {
7c673cae 1519 _add_map_inc_bl(e, bl);
31f18b77 1520 }
7c673cae
FG
1521 return found;
1522}
1523
1524void OSDService::_add_map_bl(epoch_t e, bufferlist& bl)
1525{
1526 dout(10) << "add_map_bl " << e << " " << bl.length() << " bytes" << dendl;
31f18b77
FG
1527 // cache a contiguous buffer
1528 if (bl.get_num_buffers() > 1) {
1529 bl.rebuild();
1530 }
1531 bl.try_assign_to_mempool(mempool::mempool_osd_mapbl);
7c673cae
FG
1532 map_bl_cache.add(e, bl);
1533}
1534
1535void OSDService::_add_map_inc_bl(epoch_t e, bufferlist& bl)
1536{
1537 dout(10) << "add_map_inc_bl " << e << " " << bl.length() << " bytes" << dendl;
31f18b77
FG
1538 // cache a contiguous buffer
1539 if (bl.get_num_buffers() > 1) {
1540 bl.rebuild();
1541 }
1542 bl.try_assign_to_mempool(mempool::mempool_osd_mapbl);
7c673cae
FG
1543 map_bl_inc_cache.add(e, bl);
1544}
1545
7c673cae
FG
1546OSDMapRef OSDService::_add_map(OSDMap *o)
1547{
1548 epoch_t e = o->get_epoch();
1549
1550 if (cct->_conf->osd_map_dedup) {
1551 // Dedup against an existing map at a nearby epoch
1552 OSDMapRef for_dedup = map_cache.lower_bound(e);
1553 if (for_dedup) {
1554 OSDMap::dedup(for_dedup.get(), o);
1555 }
1556 }
1557 bool existed;
1558 OSDMapRef l = map_cache.add(e, o, &existed);
1559 if (existed) {
1560 delete o;
1561 }
1562 return l;
1563}
1564
1565OSDMapRef OSDService::try_get_map(epoch_t epoch)
1566{
11fdf7f2 1567 std::lock_guard l(map_cache_lock);
7c673cae
FG
1568 OSDMapRef retval = map_cache.lookup(epoch);
1569 if (retval) {
1570 dout(30) << "get_map " << epoch << " -cached" << dendl;
1571 if (logger) {
1572 logger->inc(l_osd_map_cache_hit);
1573 }
1574 return retval;
1575 }
1576 if (logger) {
1577 logger->inc(l_osd_map_cache_miss);
1578 epoch_t lb = map_cache.cached_key_lower_bound();
1579 if (epoch < lb) {
1580 dout(30) << "get_map " << epoch << " - miss, below lower bound" << dendl;
1581 logger->inc(l_osd_map_cache_miss_low);
1582 logger->inc(l_osd_map_cache_miss_low_avg, lb - epoch);
1583 }
1584 }
1585
1586 OSDMap *map = new OSDMap;
1587 if (epoch > 0) {
1588 dout(20) << "get_map " << epoch << " - loading and decoding " << map << dendl;
1589 bufferlist bl;
1590 if (!_get_map_bl(epoch, bl) || bl.length() == 0) {
1591 derr << "failed to load OSD map for epoch " << epoch << ", got " << bl.length() << " bytes" << dendl;
1592 delete map;
1593 return OSDMapRef();
1594 }
1595 map->decode(bl);
1596 } else {
1597 dout(20) << "get_map " << epoch << " - return initial " << map << dendl;
1598 }
1599 return _add_map(map);
1600}
1601
1602// ops
1603
1604
1605void OSDService::reply_op_error(OpRequestRef op, int err)
1606{
9f95a23c 1607 reply_op_error(op, err, eversion_t(), 0, {});
7c673cae
FG
1608}
1609
1610void OSDService::reply_op_error(OpRequestRef op, int err, eversion_t v,
9f95a23c
TL
1611 version_t uv,
1612 vector<pg_log_op_return_item_t> op_returns)
7c673cae 1613{
9f95a23c 1614 auto m = op->get_req<MOSDOp>();
11fdf7f2 1615 ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);
7c673cae
FG
1616 int flags;
1617 flags = m->get_flags() & (CEPH_OSD_FLAG_ACK|CEPH_OSD_FLAG_ONDISK);
1618
9f95a23c
TL
1619 MOSDOpReply *reply = new MOSDOpReply(m, err, osdmap->get_epoch(), flags,
1620 !m->has_flag(CEPH_OSD_FLAG_RETURNVEC));
7c673cae 1621 reply->set_reply_versions(v, uv);
9f95a23c 1622 reply->set_op_returns(op_returns);
7c673cae
FG
1623 m->get_connection()->send_message(reply);
1624}
1625
1626void OSDService::handle_misdirected_op(PG *pg, OpRequestRef op)
1627{
31f18b77
FG
1628 if (!cct->_conf->osd_debug_misdirected_ops) {
1629 return;
1630 }
1631
9f95a23c 1632 auto m = op->get_req<MOSDOp>();
11fdf7f2 1633 ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);
7c673cae 1634
11fdf7f2 1635 ceph_assert(m->get_map_epoch() >= pg->get_history().same_primary_since);
7c673cae
FG
1636
1637 if (pg->is_ec_pg()) {
1638 /**
1639 * OSD recomputes op target based on current OSDMap. With an EC pg, we
1640 * can get this result:
1641 * 1) client at map 512 sends an op to osd 3, pg_t 3.9 based on mapping
1642 * [CRUSH_ITEM_NONE, 2, 3]/3
1643 * 2) OSD 3 at map 513 remaps op to osd 3, spg_t 3.9s0 based on mapping
1644 * [3, 2, 3]/3
1645 * 3) PG 3.9s0 dequeues the op at epoch 512 and notices that it isn't primary
1646 * -- misdirected op
1647 * 4) client resends and this time PG 3.9s0 having caught up to 513 gets
1648 * it and fulfils it
1649 *
1650 * We can't compute the op target based on the sending map epoch due to
1651 * splitting. The simplest thing is to detect such cases here and drop
1652 * them without an error (the client will resend anyway).
1653 */
11fdf7f2 1654 ceph_assert(m->get_map_epoch() <= superblock.newest_map);
7c673cae
FG
1655 OSDMapRef opmap = try_get_map(m->get_map_epoch());
1656 if (!opmap) {
1657 dout(7) << __func__ << ": " << *pg << " no longer have map for "
1658 << m->get_map_epoch() << ", dropping" << dendl;
1659 return;
1660 }
1661 pg_t _pgid = m->get_raw_pg();
1662 spg_t pgid;
1663 if ((m->get_flags() & CEPH_OSD_FLAG_PGOP) == 0)
1664 _pgid = opmap->raw_pg_to_pg(_pgid);
1665 if (opmap->get_primary_shard(_pgid, &pgid) &&
11fdf7f2 1666 pgid.shard != pg->pg_id.shard) {
7c673cae
FG
1667 dout(7) << __func__ << ": " << *pg << " primary changed since "
1668 << m->get_map_epoch() << ", dropping" << dendl;
1669 return;
1670 }
1671 }
1672
1673 dout(7) << *pg << " misdirected op in " << m->get_map_epoch() << dendl;
1674 clog->warn() << m->get_source_inst() << " misdirected " << m->get_reqid()
1675 << " pg " << m->get_raw_pg()
1676 << " to osd." << whoami
11fdf7f2 1677 << " not " << pg->get_acting()
7c673cae 1678 << " in e" << m->get_map_epoch() << "/" << osdmap->get_epoch();
7c673cae
FG
1679}
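// Illustrative summary (not upstream code) of the check above: for an EC pg
// the op's original epoch is looked up with try_get_map(); if that map has
// already been trimmed, or the primary shard really has moved since that
// epoch, the op is dropped silently because the client will resend against a
// newer map. Only when neither of those holds (and osd_debug_misdirected_ops
// is enabled) is the "misdirected op" warning pushed to the cluster log.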
1680
9f95a23c 1681void OSDService::enqueue_back(OpSchedulerItem&& qi)
7c673cae 1682{
11fdf7f2 1683 osd->op_shardedwq.queue(std::move(qi));
7c673cae
FG
1684}
1685
9f95a23c 1686void OSDService::enqueue_front(OpSchedulerItem&& qi)
7c673cae 1687{
11fdf7f2 1688 osd->op_shardedwq.queue_front(std::move(qi));
7c673cae
FG
1689}
1690
11fdf7f2
TL
1691void OSDService::queue_recovery_context(
1692 PG *pg,
1693 GenContext<ThreadPool::TPHandle&> *c)
7c673cae 1694{
11fdf7f2
TL
1695 epoch_t e = get_osdmap_epoch();
1696 enqueue_back(
9f95a23c
TL
1697 OpSchedulerItem(
1698 unique_ptr<OpSchedulerItem::OpQueueable>(
11fdf7f2
TL
1699 new PGRecoveryContext(pg->get_pgid(), c, e)),
1700 cct->_conf->osd_recovery_cost,
1701 cct->_conf->osd_recovery_priority,
1702 ceph_clock_now(),
1703 0,
1704 e));
7c673cae
FG
1705}
1706
1707void OSDService::queue_for_snap_trim(PG *pg)
1708{
1709 dout(10) << "queueing " << *pg << " for snaptrim" << dendl;
11fdf7f2 1710 enqueue_back(
9f95a23c
TL
1711 OpSchedulerItem(
1712 unique_ptr<OpSchedulerItem::OpQueueable>(
11fdf7f2
TL
1713 new PGSnapTrim(pg->get_pgid(), pg->get_osdmap_epoch())),
1714 cct->_conf->osd_snap_trim_cost,
1715 cct->_conf->osd_snap_trim_priority,
1716 ceph_clock_now(),
1717 0,
1718 pg->get_osdmap_epoch()));
1719}
1720
1721void OSDService::queue_for_scrub(PG *pg, bool with_high_priority)
1722{
1723 unsigned scrub_queue_priority = pg->scrubber.priority;
1724 if (with_high_priority && scrub_queue_priority < cct->_conf->osd_client_op_priority) {
1725 scrub_queue_priority = cct->_conf->osd_client_op_priority;
1726 }
1727 const auto epoch = pg->get_osdmap_epoch();
1728 enqueue_back(
9f95a23c
TL
1729 OpSchedulerItem(
1730 unique_ptr<OpSchedulerItem::OpQueueable>(new PGScrub(pg->get_pgid(), epoch)),
11fdf7f2
TL
1731 cct->_conf->osd_scrub_cost,
1732 scrub_queue_priority,
1733 ceph_clock_now(),
1734 0,
1735 epoch));
1736}
1737
1738void OSDService::queue_for_pg_delete(spg_t pgid, epoch_t e)
1739{
1740 dout(10) << __func__ << " on " << pgid << " e " << e << dendl;
1741 enqueue_back(
9f95a23c
TL
1742 OpSchedulerItem(
1743 unique_ptr<OpSchedulerItem::OpQueueable>(
11fdf7f2
TL
1744 new PGDelete(pgid, e)),
1745 cct->_conf->osd_pg_delete_cost,
1746 cct->_conf->osd_pg_delete_priority,
1747 ceph_clock_now(),
1748 0,
1749 e));
1750}
1751
1752bool OSDService::try_finish_pg_delete(PG *pg, unsigned old_pg_num)
1753{
1754 return osd->try_finish_pg_delete(pg, old_pg_num);
1755}
1756
1757// ---
1758
1759void OSDService::set_ready_to_merge_source(PG *pg, eversion_t version)
1760{
1761 std::lock_guard l(merge_lock);
1762 dout(10) << __func__ << " " << pg->pg_id << dendl;
1763 ready_to_merge_source[pg->pg_id.pgid] = version;
1764 assert(not_ready_to_merge_source.count(pg->pg_id.pgid) == 0);
1765 _send_ready_to_merge();
1766}
1767
1768void OSDService::set_ready_to_merge_target(PG *pg,
1769 eversion_t version,
1770 epoch_t last_epoch_started,
1771 epoch_t last_epoch_clean)
1772{
1773 std::lock_guard l(merge_lock);
1774 dout(10) << __func__ << " " << pg->pg_id << dendl;
1775 ready_to_merge_target.insert(make_pair(pg->pg_id.pgid,
1776 make_tuple(version,
1777 last_epoch_started,
1778 last_epoch_clean)));
1779 assert(not_ready_to_merge_target.count(pg->pg_id.pgid) == 0);
1780 _send_ready_to_merge();
1781}
1782
1783void OSDService::set_not_ready_to_merge_source(pg_t source)
1784{
1785 std::lock_guard l(merge_lock);
1786 dout(10) << __func__ << " " << source << dendl;
1787 not_ready_to_merge_source.insert(source);
1788 assert(ready_to_merge_source.count(source) == 0);
1789 _send_ready_to_merge();
1790}
1791
1792void OSDService::set_not_ready_to_merge_target(pg_t target, pg_t source)
1793{
1794 std::lock_guard l(merge_lock);
1795 dout(10) << __func__ << " " << target << " source " << source << dendl;
1796 not_ready_to_merge_target[target] = source;
1797 assert(ready_to_merge_target.count(target) == 0);
1798 _send_ready_to_merge();
1799}
1800
1801void OSDService::send_ready_to_merge()
1802{
1803 std::lock_guard l(merge_lock);
1804 _send_ready_to_merge();
1805}
1806
1807void OSDService::_send_ready_to_merge()
1808{
1809 dout(20) << __func__
1810 << " ready_to_merge_source " << ready_to_merge_source
1811 << " not_ready_to_merge_source " << not_ready_to_merge_source
1812 << " ready_to_merge_target " << ready_to_merge_target
1813 << " not_ready_to_merge_target " << not_ready_to_merge_target
1814 << " sent_ready_to_merge_source " << sent_ready_to_merge_source
1815 << dendl;
1816 for (auto src : not_ready_to_merge_source) {
1817 if (sent_ready_to_merge_source.count(src) == 0) {
1818 monc->send_mon_message(new MOSDPGReadyToMerge(
1819 src,
1820 {}, {}, 0, 0,
1821 false,
1822 osdmap->get_epoch()));
1823 sent_ready_to_merge_source.insert(src);
1824 }
1825 }
1826 for (auto p : not_ready_to_merge_target) {
1827 if (sent_ready_to_merge_source.count(p.second) == 0) {
1828 monc->send_mon_message(new MOSDPGReadyToMerge(
1829 p.second,
1830 {}, {}, 0, 0,
1831 false,
1832 osdmap->get_epoch()));
1833 sent_ready_to_merge_source.insert(p.second);
1834 }
1835 }
1836 for (auto src : ready_to_merge_source) {
1837 if (not_ready_to_merge_source.count(src.first) ||
1838 not_ready_to_merge_target.count(src.first.get_parent())) {
1839 continue;
1840 }
1841 auto p = ready_to_merge_target.find(src.first.get_parent());
1842 if (p != ready_to_merge_target.end() &&
1843 sent_ready_to_merge_source.count(src.first) == 0) {
1844 monc->send_mon_message(new MOSDPGReadyToMerge(
1845 src.first, // source pgid
1846 src.second, // src version
1847 std::get<0>(p->second), // target version
1848 std::get<1>(p->second), // PG's last_epoch_started
1849 std::get<2>(p->second), // PG's last_epoch_clean
1850 true,
1851 osdmap->get_epoch()));
1852 sent_ready_to_merge_source.insert(src.first);
1853 }
1854 }
1855}
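// Illustrative summary (not upstream code) of the cases handled above:
//  - a source or target pg known to be NOT ready produces a single
//    MOSDPGReadyToMerge with ready=false, letting the mon cancel the merge;
//  - a ready source whose parent (the merge target) is also ready, with
//    neither side marked not-ready, produces a ready=true message carrying
//    the source/target versions and the target's last_epoch_started and
//    last_epoch_clean;
//  - anything else waits, and sent_ready_to_merge_source prevents the same
//    source pg from being reported twice.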
1856
1857void OSDService::clear_ready_to_merge(PG *pg)
1858{
1859 std::lock_guard l(merge_lock);
1860 dout(10) << __func__ << " " << pg->pg_id << dendl;
1861 ready_to_merge_source.erase(pg->pg_id.pgid);
1862 ready_to_merge_target.erase(pg->pg_id.pgid);
1863 not_ready_to_merge_source.erase(pg->pg_id.pgid);
1864 not_ready_to_merge_target.erase(pg->pg_id.pgid);
1865 sent_ready_to_merge_source.erase(pg->pg_id.pgid);
1866}
1867
1868void OSDService::clear_sent_ready_to_merge()
1869{
1870 std::lock_guard l(merge_lock);
1871 sent_ready_to_merge_source.clear();
1872}
1873
9f95a23c 1874void OSDService::prune_sent_ready_to_merge(const OSDMapRef& osdmap)
11fdf7f2
TL
1875{
1876 std::lock_guard l(merge_lock);
1877 auto i = sent_ready_to_merge_source.begin();
1878 while (i != sent_ready_to_merge_source.end()) {
1879 if (!osdmap->pg_exists(*i)) {
1880 dout(10) << __func__ << " " << *i << dendl;
1881 i = sent_ready_to_merge_source.erase(i);
1882 } else {
1883 ++i;
1884 }
1885 }
7c673cae
FG
1886}
1887
11fdf7f2
TL
1888// ---
1889
1890void OSDService::_queue_for_recovery(
1891 std::pair<epoch_t, PGRef> p,
1892 uint64_t reserved_pushes)
1893{
9f95a23c 1894 ceph_assert(ceph_mutex_is_locked_by_me(recovery_lock));
11fdf7f2 1895 enqueue_back(
9f95a23c
TL
1896 OpSchedulerItem(
1897 unique_ptr<OpSchedulerItem::OpQueueable>(
11fdf7f2
TL
1898 new PGRecovery(
1899 p.second->get_pgid(), p.first, reserved_pushes)),
1900 cct->_conf->osd_recovery_cost,
1901 cct->_conf->osd_recovery_priority,
1902 ceph_clock_now(),
1903 0,
1904 p.first));
1905}
7c673cae
FG
1906
1907// ====================================================================
1908// OSD
1909
1910#undef dout_prefix
1911#define dout_prefix *_dout
1912
1913// Commands shared between OSD's console and admin console:
1914namespace ceph {
1915namespace osd_cmds {
1916
11fdf7f2 1917int heap(CephContext& cct, const cmdmap_t& cmdmap, Formatter& f, std::ostream& os);
7c673cae
FG
1918
1919}} // namespace ceph::osd_cmds
1920
e306af50 1921int OSD::mkfs(CephContext *cct, ObjectStore *store, uuid_d fsid, int whoami, string osdspec_affinity)
7c673cae
FG
1922{
1923 int ret;
1924
7c673cae
FG
1925 OSDSuperblock sb;
1926 bufferlist sbbl;
11fdf7f2 1927 ObjectStore::CollectionHandle ch;
7c673cae
FG
1928
1929 // if we are fed a uuid for this osd, use it.
1930 store->set_fsid(cct->_conf->osd_uuid);
1931
1932 ret = store->mkfs();
1933 if (ret) {
224ce89b
WB
1934 derr << "OSD::mkfs: ObjectStore::mkfs failed with error "
1935 << cpp_strerror(ret) << dendl;
7c673cae
FG
1936 goto free_store;
1937 }
1938
31f18b77 1939 store->set_cache_shards(1); // doesn't matter for mkfs!
7c673cae
FG
1940
1941 ret = store->mount();
1942 if (ret) {
224ce89b
WB
1943 derr << "OSD::mkfs: couldn't mount ObjectStore: error "
1944 << cpp_strerror(ret) << dendl;
7c673cae
FG
1945 goto free_store;
1946 }
1947
11fdf7f2
TL
1948 ch = store->open_collection(coll_t::meta());
1949 if (ch) {
1950 ret = store->read(ch, OSD_SUPERBLOCK_GOBJECT, 0, 0, sbbl);
1951 if (ret < 0) {
1952 derr << "OSD::mkfs: have meta collection but no superblock" << dendl;
1953 goto free_store;
1954 }
7c673cae
FG
1955 /* if we already have superblock, check content of superblock */
1956 dout(0) << " have superblock" << dendl;
11fdf7f2
TL
1957 auto p = sbbl.cbegin();
1958 decode(sb, p);
7c673cae
FG
1959 if (whoami != sb.whoami) {
1960 derr << "provided osd id " << whoami << " != superblock's " << sb.whoami
1961 << dendl;
1962 ret = -EINVAL;
1963 goto umount_store;
1964 }
1965 if (fsid != sb.cluster_fsid) {
1966 derr << "provided cluster fsid " << fsid
1967 << " != superblock's " << sb.cluster_fsid << dendl;
1968 ret = -EINVAL;
1969 goto umount_store;
1970 }
1971 } else {
1972 // create superblock
1973 sb.cluster_fsid = fsid;
1974 sb.osd_fsid = store->get_fsid();
1975 sb.whoami = whoami;
1976 sb.compat_features = get_osd_initial_compat_set();
1977
1978 bufferlist bl;
11fdf7f2 1979 encode(sb, bl);
7c673cae 1980
11fdf7f2
TL
1981 ObjectStore::CollectionHandle ch = store->create_new_collection(
1982 coll_t::meta());
7c673cae
FG
1983 ObjectStore::Transaction t;
1984 t.create_collection(coll_t::meta(), 0);
1985 t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
11fdf7f2 1986 ret = store->queue_transaction(ch, std::move(t));
7c673cae
FG
1987 if (ret) {
1988 derr << "OSD::mkfs: error while writing OSD_SUPERBLOCK_GOBJECT: "
11fdf7f2 1989 << "queue_transaction returned " << cpp_strerror(ret) << dendl;
7c673cae
FG
1990 goto umount_store;
1991 }
1992 }
1993
e306af50 1994 ret = write_meta(cct, store, sb.cluster_fsid, sb.osd_fsid, whoami, osdspec_affinity);
7c673cae 1995 if (ret) {
224ce89b
WB
1996 derr << "OSD::mkfs: failed to write fsid file: error "
1997 << cpp_strerror(ret) << dendl;
7c673cae
FG
1998 goto umount_store;
1999 }
2000
2001umount_store:
11fdf7f2
TL
2002 if (ch) {
2003 ch.reset();
2004 }
7c673cae
FG
2005 store->umount();
2006free_store:
2007 delete store;
2008 return ret;
2009}
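// Illustrative summary (not upstream code) of the mkfs() flow above:
// set_fsid -> ObjectStore::mkfs -> mount -> if a meta collection with a
// superblock already exists, verify whoami and cluster_fsid against it,
// otherwise create the meta collection and persist a fresh OSDSuperblock ->
// write_meta (magic, whoami, ceph_fsid, key, ready) -> unmount and return.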
2010
e306af50 2011int OSD::write_meta(CephContext *cct, ObjectStore *store, uuid_d& cluster_fsid, uuid_d& osd_fsid, int whoami, string& osdspec_affinity)
7c673cae
FG
2012{
2013 char val[80];
2014 int r;
2015
2016 snprintf(val, sizeof(val), "%s", CEPH_OSD_ONDISK_MAGIC);
2017 r = store->write_meta("magic", val);
2018 if (r < 0)
2019 return r;
2020
2021 snprintf(val, sizeof(val), "%d", whoami);
2022 r = store->write_meta("whoami", val);
2023 if (r < 0)
2024 return r;
2025
2026 cluster_fsid.print(val);
2027 r = store->write_meta("ceph_fsid", val);
2028 if (r < 0)
2029 return r;
2030
11fdf7f2 2031 string key = cct->_conf.get_val<string>("key");
3efd9988
FG
2032 if (key.size()) {
2033 r = store->write_meta("osd_key", key);
2034 if (r < 0)
2035 return r;
b32b8144 2036 } else {
11fdf7f2 2037 string keyfile = cct->_conf.get_val<string>("keyfile");
b32b8144
FG
2038 if (!keyfile.empty()) {
2039 bufferlist keybl;
2040 string err;
11fdf7f2 2041 r = keybl.read_file(keyfile.c_str(), &err);
b32b8144
FG
2042 if (r < 0) {
2043 derr << __func__ << " failed to read keyfile " << keyfile << ": "
2044 << err << ": " << cpp_strerror(r) << dendl;
2045 return r;
2046 }
2047 r = store->write_meta("osd_key", keybl.to_str());
2048 if (r < 0)
2049 return r;
2050 }
3efd9988 2051 }
e306af50
TL
2052 if (!osdspec_affinity.empty()) {
2053 r = store->write_meta("osdspec_affinity", osdspec_affinity.c_str());
2054 if (r < 0)
2055 return r;
2056 }
3efd9988 2057
7c673cae
FG
2058 r = store->write_meta("ready", "ready");
2059 if (r < 0)
2060 return r;
2061
2062 return 0;
2063}
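// Illustrative sketch (not upstream code; the path below is an assumption
// about a typical deployment): ObjectStore::write_meta() persists each key
// as a small file in the OSD data directory, so after write_meta() succeeds
// a data dir such as /var/lib/ceph/osd/ceph-0/ would contain, e.g.:
//   magic              - CEPH_OSD_ONDISK_MAGIC
//   whoami             - the OSD id, e.g. "0"
//   ceph_fsid          - the cluster fsid
//   osd_key            - only when "key" or "keyfile" was configured
//   osdspec_affinity   - only when an affinity string was provided
//   ready              - "ready", written last as the completion marker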
2064
11fdf7f2
TL
2065int OSD::peek_meta(ObjectStore *store,
2066 std::string *magic,
2067 uuid_d *cluster_fsid,
2068 uuid_d *osd_fsid,
2069 int *whoami,
9f95a23c 2070 ceph_release_t *require_osd_release)
7c673cae
FG
2071{
2072 string val;
2073
2074 int r = store->read_meta("magic", &val);
2075 if (r < 0)
2076 return r;
11fdf7f2 2077 *magic = val;
7c673cae
FG
2078
2079 r = store->read_meta("whoami", &val);
2080 if (r < 0)
2081 return r;
11fdf7f2 2082 *whoami = atoi(val.c_str());
7c673cae
FG
2083
2084 r = store->read_meta("ceph_fsid", &val);
2085 if (r < 0)
2086 return r;
11fdf7f2 2087 r = cluster_fsid->parse(val.c_str());
7c673cae
FG
2088 if (!r)
2089 return -EINVAL;
2090
2091 r = store->read_meta("fsid", &val);
2092 if (r < 0) {
11fdf7f2 2093 *osd_fsid = uuid_d();
7c673cae 2094 } else {
11fdf7f2 2095 r = osd_fsid->parse(val.c_str());
7c673cae
FG
2096 if (!r)
2097 return -EINVAL;
2098 }
2099
11fdf7f2
TL
2100 r = store->read_meta("require_osd_release", &val);
2101 if (r >= 0) {
9f95a23c 2102 *require_osd_release = ceph_release_from_name(val);
11fdf7f2
TL
2103 }
2104
7c673cae
FG
2105 return 0;
2106}
2107
2108
2109#undef dout_prefix
2110#define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
2111
2112// cons/des
2113
2114OSD::OSD(CephContext *cct_, ObjectStore *store_,
2115 int id,
2116 Messenger *internal_messenger,
2117 Messenger *external_messenger,
2118 Messenger *hb_client_front,
2119 Messenger *hb_client_back,
2120 Messenger *hb_front_serverm,
2121 Messenger *hb_back_serverm,
2122 Messenger *osdc_messenger,
2123 MonClient *mc,
2124 const std::string &dev, const std::string &jdev) :
2125 Dispatcher(cct_),
7c673cae 2126 tick_timer(cct, osd_lock),
7c673cae 2127 tick_timer_without_osd_lock(cct, tick_timer_lock),
11fdf7f2 2128 gss_ktfile_client(cct->_conf.get_val<std::string>("gss_ktab_client_file")),
7c673cae
FG
2129 cluster_messenger(internal_messenger),
2130 client_messenger(external_messenger),
2131 objecter_messenger(osdc_messenger),
2132 monc(mc),
9f95a23c 2133 mgrc(cct_, client_messenger, &mc->monmap),
7c673cae
FG
2134 logger(NULL),
2135 recoverystate_perf(NULL),
2136 store(store_),
2137 log_client(cct, client_messenger, &mc->monmap, LogClient::NO_FLAGS),
2138 clog(log_client.create_channel()),
2139 whoami(id),
2140 dev_path(dev), journal_path(jdev),
31f18b77 2141 store_is_rotational(store->is_rotational()),
7c673cae
FG
2142 trace_endpoint("0.0.0.0", 0, "osd"),
2143 asok_hook(NULL),
11fdf7f2
TL
2144 m_osd_pg_epoch_max_lag_factor(cct->_conf.get_val<double>(
2145 "osd_pg_epoch_max_lag_factor")),
7c673cae 2146 osd_compat(get_osd_compat_set()),
7c673cae 2147 osd_op_tp(cct, "OSD::osd_op_tp", "tp_osd_tp",
31f18b77 2148 get_num_op_threads()),
7c673cae
FG
2149 heartbeat_stop(false),
2150 heartbeat_need_update(true),
2151 hb_front_client_messenger(hb_client_front),
2152 hb_back_client_messenger(hb_client_back),
2153 hb_front_server_messenger(hb_front_serverm),
2154 hb_back_server_messenger(hb_back_serverm),
2155 daily_loadavg(0.0),
2156 heartbeat_thread(this),
2157 heartbeat_dispatcher(this),
2158 op_tracker(cct, cct->_conf->osd_enable_op_tracker,
2159 cct->_conf->osd_num_op_tracker_shard),
2160 test_ops_hook(NULL),
7c673cae 2161 op_shardedwq(
7c673cae
FG
2162 this,
2163 cct->_conf->osd_op_thread_timeout,
2164 cct->_conf->osd_op_thread_suicide_timeout,
2165 &osd_op_tp),
7c673cae 2166 last_pg_create_epoch(0),
11fdf7f2 2167 boot_finisher(cct),
7c673cae
FG
2168 up_thru_wanted(0),
2169 requested_full_first(0),
2170 requested_full_last(0),
7c673cae
FG
2171 service(this)
2172{
11fdf7f2
TL
2173
2174 if (!gss_ktfile_client.empty()) {
2175 // Assert we can export environment variable
2176 /*
2177 The default client keytab is used, if it is present and readable,
2178 to automatically obtain initial credentials for GSSAPI client
2179 applications. The principal name of the first entry in the client
2180 keytab is used by default when obtaining initial credentials.
2181 1. The KRB5_CLIENT_KTNAME environment variable.
2182 2. The default_client_keytab_name profile variable in [libdefaults].
2183 3. The hardcoded default, DEFCKTNAME.
2184 */
2185 const int32_t set_result(setenv("KRB5_CLIENT_KTNAME",
2186 gss_ktfile_client.c_str(), 1));
2187 ceph_assert(set_result == 0);
2188 }
2189
7c673cae
FG
2190 monc->set_messenger(client_messenger);
2191 op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time,
2192 cct->_conf->osd_op_log_threshold);
2193 op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
2194 cct->_conf->osd_op_history_duration);
2195 op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size,
2196 cct->_conf->osd_op_history_slow_op_threshold);
9f95a23c 2197 ObjectCleanRegions::set_max_num_intervals(cct->_conf->osd_object_clean_region_max_num_intervals);
7c673cae
FG
2198#ifdef WITH_BLKIN
2199 std::stringstream ss;
2200 ss << "osd." << whoami;
2201 trace_endpoint.copy_name(ss.str());
2202#endif
11fdf7f2
TL
2203
2204 // initialize shards
2205 num_shards = get_num_op_shards();
2206 for (uint32_t i = 0; i < num_shards; i++) {
2207 OSDShard *one_shard = new OSDShard(
2208 i,
2209 cct,
9f95a23c 2210 this);
11fdf7f2
TL
2211 shards.push_back(one_shard);
2212 }
7c673cae
FG
2213}
2214
2215OSD::~OSD()
2216{
11fdf7f2
TL
2217 while (!shards.empty()) {
2218 delete shards.back();
2219 shards.pop_back();
2220 }
7c673cae
FG
2221 cct->get_perfcounters_collection()->remove(recoverystate_perf);
2222 cct->get_perfcounters_collection()->remove(logger);
2223 delete recoverystate_perf;
2224 delete logger;
2225 delete store;
2226}
2227
91327a77
AA
2228double OSD::get_tick_interval() const
2229{
2230 // vary +/- 5% to avoid scrub scheduling livelocks
2231 constexpr auto delta = 0.05;
91327a77 2232 return (OSD_TICK_INTERVAL *
11fdf7f2 2233 ceph::util::generate_random_number(1.0 - delta, 1.0 + delta));
91327a77
AA
2234}
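// Illustrative numbers (not upstream code): with delta = 0.05 the factor
// drawn above is uniform in [0.95, 1.05], so for a nominal 1-second
// OSD_TICK_INTERVAL the returned tick interval falls between roughly 950 ms
// and 1050 ms; enough jitter to keep periodic work such as scrub scheduling
// on different OSDs from falling into lock-step.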
2235
7c673cae
FG
2236void OSD::handle_signal(int signum)
2237{
11fdf7f2 2238 ceph_assert(signum == SIGINT || signum == SIGTERM);
7c673cae
FG
2239 derr << "*** Got signal " << sig_str(signum) << " ***" << dendl;
2240 shutdown();
2241}
2242
2243int OSD::pre_init()
2244{
11fdf7f2 2245 std::lock_guard lock(osd_lock);
7c673cae
FG
2246 if (is_stopping())
2247 return 0;
2248
2249 if (store->test_mount_in_use()) {
2250 derr << "OSD::pre_init: object store '" << dev_path << "' is "
2251 << "currently in use. (Is ceph-osd already running?)" << dendl;
2252 return -EBUSY;
2253 }
2254
11fdf7f2
TL
2255 cct->_conf.add_observer(this);
2256 return 0;
2257}
2258
2259int OSD::set_numa_affinity()
2260{
2261 // storage numa node
2262 int store_node = -1;
2263 store->get_numa_node(&store_node, nullptr, nullptr);
2264 if (store_node >= 0) {
2265 dout(1) << __func__ << " storage numa node " << store_node << dendl;
2266 }
2267
2268 // check network numa node(s)
2269 int front_node = -1, back_node = -1;
2270 string front_iface = pick_iface(
2271 cct,
2272 client_messenger->get_myaddrs().front().get_sockaddr_storage());
2273 string back_iface = pick_iface(
2274 cct,
2275 cluster_messenger->get_myaddrs().front().get_sockaddr_storage());
2276 int r = get_iface_numa_node(front_iface, &front_node);
92f5a8d4 2277 if (r >= 0 && front_node >= 0) {
11fdf7f2 2278 dout(1) << __func__ << " public network " << front_iface << " numa node "
92f5a8d4 2279 << front_node << dendl;
11fdf7f2 2280 r = get_iface_numa_node(back_iface, &back_node);
92f5a8d4 2281 if (r >= 0 && back_node >= 0) {
11fdf7f2
TL
2282 dout(1) << __func__ << " cluster network " << back_iface << " numa node "
2283 << back_node << dendl;
2284 if (front_node == back_node &&
2285 front_node == store_node) {
2286 dout(1) << " objectstore and network numa nodes all match" << dendl;
2287 if (g_conf().get_val<bool>("osd_numa_auto_affinity")) {
2288 numa_node = front_node;
2289 }
92f5a8d4
TL
2290 } else if (front_node != back_node) {
2291 dout(1) << __func__ << " public and cluster network numa nodes do not match"
2292 << dendl;
11fdf7f2
TL
2293 } else {
2294 dout(1) << __func__ << " objectstore and network numa nodes do not match"
2295 << dendl;
2296 }
92f5a8d4
TL
2297 } else if (back_node == -2) {
2298 dout(1) << __func__ << " cluster network " << back_iface
2299 << " ports numa nodes do not match" << dendl;
2300 } else {
2301 derr << __func__ << " unable to identify cluster interface '" << back_iface
2302 << "' numa node: " << cpp_strerror(r) << dendl;
11fdf7f2 2303 }
92f5a8d4
TL
2304 } else if (front_node == -2) {
2305 dout(1) << __func__ << " public network " << front_iface
2306 << " ports numa nodes do not match" << dendl;
11fdf7f2
TL
2307 } else {
2308 derr << __func__ << " unable to identify public interface '" << front_iface
2309 << "' numa node: " << cpp_strerror(r) << dendl;
2310 }
2311 if (int node = g_conf().get_val<int64_t>("osd_numa_node"); node >= 0) {
2312 // this takes precedence over the automagic logic above
2313 numa_node = node;
2314 }
2315 if (numa_node >= 0) {
2316 int r = get_numa_node_cpu_set(numa_node, &numa_cpu_set_size, &numa_cpu_set);
2317 if (r < 0) {
2318 dout(1) << __func__ << " unable to determine numa node " << numa_node
2319 << " CPUs" << dendl;
2320 numa_node = -1;
2321 } else {
2322 dout(1) << __func__ << " setting numa affinity to node " << numa_node
2323 << " cpus "
2324 << cpu_set_to_str_list(numa_cpu_set_size, &numa_cpu_set)
2325 << dendl;
92f5a8d4 2326 r = set_cpu_affinity_all_threads(numa_cpu_set_size, &numa_cpu_set);
11fdf7f2
TL
2327 if (r < 0) {
2328 r = -errno;
2329 derr << __func__ << " failed to set numa affinity: " << cpp_strerror(r)
2330 << dendl;
2331 numa_node = -1;
2332 }
2333 }
2334 } else {
2335 dout(1) << __func__ << " not setting numa affinity" << dendl;
2336 }
7c673cae
FG
2337 return 0;
2338}
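// Illustrative summary (not upstream code): set_numa_affinity() only pins
// threads when it ends up with numa_node >= 0, which happens in two cases:
//  1) osd_numa_auto_affinity is true and the objectstore, the public
//     interface and the cluster interface all report the same NUMA node; or
//  2) osd_numa_node is set explicitly (>= 0), which overrides the detection.
// The node's CPU set is then resolved with get_numa_node_cpu_set() and
// applied to all OSD threads; any failure simply falls back to no pinning.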
2339
2340// asok
2341
2342class OSDSocketHook : public AdminSocketHook {
2343 OSD *osd;
2344public:
2345 explicit OSDSocketHook(OSD *o) : osd(o) {}
9f95a23c
TL
2346 int call(std::string_view prefix, const cmdmap_t& cmdmap,
2347 Formatter *f,
2348 std::ostream& ss,
2349 bufferlist& out) override {
2350 ceph_abort("should use async hook");
2351 }
2352 void call_async(
2353 std::string_view prefix,
2354 const cmdmap_t& cmdmap,
2355 Formatter *f,
2356 const bufferlist& inbl,
2357 std::function<void(int,const std::string&,bufferlist&)> on_finish) override {
11fdf7f2 2358 try {
9f95a23c
TL
2359 osd->asok_command(prefix, cmdmap, f, inbl, on_finish);
2360 } catch (const TOPNSPC::common::bad_cmd_get& e) {
2361 bufferlist empty;
2362 on_finish(-EINVAL, e.what(), empty);
11fdf7f2 2363 }
7c673cae
FG
2364 }
2365};
2366
11fdf7f2
TL
2367std::set<int64_t> OSD::get_mapped_pools()
2368{
2369 std::set<int64_t> pools;
2370 std::vector<spg_t> pgids;
2371 _get_pgids(&pgids);
2372 for (const auto &pgid : pgids) {
2373 pools.insert(pgid.pool());
2374 }
2375 return pools;
2376}
2377
9f95a23c
TL
2378void OSD::asok_command(
2379 std::string_view prefix, const cmdmap_t& cmdmap,
2380 Formatter *f,
2381 const bufferlist& inbl,
2382 std::function<void(int,const std::string&,bufferlist&)> on_finish)
7c673cae 2383{
9f95a23c
TL
2384 int ret = 0;
2385 stringstream ss; // stderr error message stream
2386 bufferlist outbl; // if empty at end, we'll dump formatter as output
2387
2388 // --- PG commands are routed here to PG::do_command ---
2389 if (prefix == "pg" ||
2390 prefix == "query" ||
2391 prefix == "mark_unfound_lost" ||
2392 prefix == "list_unfound" ||
2393 prefix == "scrub" ||
2394 prefix == "deep_scrub"
2395 ) {
2396 string pgidstr;
2397 pg_t pgid;
2398 if (!cmd_getval(cmdmap, "pgid", pgidstr)) {
2399 ss << "no pgid specified";
2400 ret = -EINVAL;
2401 goto out;
2402 }
2403 if (!pgid.parse(pgidstr.c_str())) {
2404 ss << "couldn't parse pgid '" << pgidstr << "'";
2405 ret = -EINVAL;
2406 goto out;
2407 }
2408 spg_t pcand;
2409 PGRef pg;
2410 if (get_osdmap()->get_primary_shard(pgid, &pcand) &&
2411 (pg = _lookup_lock_pg(pcand))) {
2412 if (pg->is_primary()) {
2413 cmdmap_t new_cmdmap = cmdmap;
2414 try {
2415 pg->do_command(prefix, new_cmdmap, inbl, on_finish);
2416 pg->unlock();
2417 return; // the pg handler calls on_finish directly
2418 } catch (const TOPNSPC::common::bad_cmd_get& e) {
2419 pg->unlock();
2420 ss << e.what();
2421 ret = -EINVAL;
2422 goto out;
2423 }
2424 } else {
2425 ss << "not primary for pgid " << pgid;
2426 // do not reply; they will get newer maps and realize they
2427 // need to resend.
2428 pg->unlock();
2429 ret = -EAGAIN;
2430 goto out;
2431 }
2432 } else {
2433 ss << "i don't have pgid " << pgid;
2434 ret = -ENOENT;
2435 }
2436 }
2437
2438 // --- OSD commands follow ---
2439
2440 else if (prefix == "status") {
2441 lock_guard l(osd_lock);
7c673cae
FG
2442 f->open_object_section("status");
2443 f->dump_stream("cluster_fsid") << superblock.cluster_fsid;
2444 f->dump_stream("osd_fsid") << superblock.osd_fsid;
2445 f->dump_unsigned("whoami", superblock.whoami);
2446 f->dump_string("state", get_state_name(get_state()));
2447 f->dump_unsigned("oldest_map", superblock.oldest_map);
2448 f->dump_unsigned("newest_map", superblock.newest_map);
11fdf7f2 2449 f->dump_unsigned("num_pgs", num_pgs);
7c673cae 2450 f->close_section();
9f95a23c 2451 } else if (prefix == "flush_journal") {
7c673cae 2452 store->flush_journal();
9f95a23c
TL
2453 } else if (prefix == "dump_ops_in_flight" ||
2454 prefix == "ops" ||
2455 prefix == "dump_blocked_ops" ||
2456 prefix == "dump_historic_ops" ||
2457 prefix == "dump_historic_ops_by_duration" ||
2458 prefix == "dump_historic_slow_ops") {
c07f9fc5
FG
2459
2460 const string error_str = "op_tracker tracking is not enabled now, so no ops are tracked currently, \
2461even those that get stuck. Please enable \"osd_enable_op_tracker\", and the tracker \
2462will start to track new ops received afterwards.";
2463
2464 set<string> filters;
2465 vector<string> filter_str;
9f95a23c 2466 if (cmd_getval(cmdmap, "filterstr", filter_str)) {
c07f9fc5
FG
2467 copy(filter_str.begin(), filter_str.end(),
2468 inserter(filters, filters.end()));
2469 }
2470
9f95a23c
TL
2471 if (prefix == "dump_ops_in_flight" ||
2472 prefix == "ops") {
c07f9fc5
FG
2473 if (!op_tracker.dump_ops_in_flight(f, false, filters)) {
2474 ss << error_str;
9f95a23c
TL
2475 ret = -EINVAL;
2476 goto out;
c07f9fc5
FG
2477 }
2478 }
9f95a23c 2479 if (prefix == "dump_blocked_ops") {
c07f9fc5
FG
2480 if (!op_tracker.dump_ops_in_flight(f, true, filters)) {
2481 ss << error_str;
9f95a23c
TL
2482 ret = -EINVAL;
2483 goto out;
c07f9fc5
FG
2484 }
2485 }
9f95a23c 2486 if (prefix == "dump_historic_ops") {
c07f9fc5
FG
2487 if (!op_tracker.dump_historic_ops(f, false, filters)) {
2488 ss << error_str;
9f95a23c
TL
2489 ret = -EINVAL;
2490 goto out;
c07f9fc5
FG
2491 }
2492 }
9f95a23c 2493 if (prefix == "dump_historic_ops_by_duration") {
c07f9fc5
FG
2494 if (!op_tracker.dump_historic_ops(f, true, filters)) {
2495 ss << error_str;
9f95a23c
TL
2496 ret = -EINVAL;
2497 goto out;
c07f9fc5
FG
2498 }
2499 }
9f95a23c 2500 if (prefix == "dump_historic_slow_ops") {
c07f9fc5
FG
2501 if (!op_tracker.dump_historic_slow_ops(f, filters)) {
2502 ss << error_str;
9f95a23c
TL
2503 ret = -EINVAL;
2504 goto out;
c07f9fc5 2505 }
7c673cae 2506 }
9f95a23c 2507 } else if (prefix == "dump_op_pq_state") {
7c673cae
FG
2508 f->open_object_section("pq");
2509 op_shardedwq.dump(f);
2510 f->close_section();
9f95a23c 2511 } else if (prefix == "dump_blacklist") {
7c673cae
FG
2512 list<pair<entity_addr_t,utime_t> > bl;
2513 OSDMapRef curmap = service.get_osdmap();
2514
2515 f->open_array_section("blacklist");
2516 curmap->get_blacklist(&bl);
2517 for (list<pair<entity_addr_t,utime_t> >::iterator it = bl.begin();
2518 it != bl.end(); ++it) {
224ce89b 2519 f->open_object_section("entry");
7c673cae
FG
2520 f->open_object_section("entity_addr_t");
2521 it->first.dump(f);
2522 f->close_section(); //entity_addr_t
2523 it->second.localtime(f->dump_stream("expire_time"));
2524 f->close_section(); //entry
2525 }
2526 f->close_section(); //blacklist
9f95a23c 2527 } else if (prefix == "dump_watchers") {
7c673cae
FG
2528 list<obj_watch_item_t> watchers;
2529 // scan pg's
11fdf7f2
TL
2530 vector<PGRef> pgs;
2531 _get_pgs(&pgs);
2532 for (auto& pg : pgs) {
2533 list<obj_watch_item_t> pg_watchers;
2534 pg->get_watchers(&pg_watchers);
2535 watchers.splice(watchers.end(), pg_watchers);
7c673cae
FG
2536 }
2537
2538 f->open_array_section("watchers");
2539 for (list<obj_watch_item_t>::iterator it = watchers.begin();
2540 it != watchers.end(); ++it) {
2541
224ce89b 2542 f->open_object_section("watch");
7c673cae
FG
2543
2544 f->dump_string("namespace", it->obj.nspace);
2545 f->dump_string("object", it->obj.oid.name);
2546
2547 f->open_object_section("entity_name");
2548 it->wi.name.dump(f);
2549 f->close_section(); //entity_name_t
2550
224ce89b
WB
2551 f->dump_unsigned("cookie", it->wi.cookie);
2552 f->dump_unsigned("timeout", it->wi.timeout_seconds);
7c673cae
FG
2553
2554 f->open_object_section("entity_addr_t");
2555 it->wi.addr.dump(f);
2556 f->close_section(); //entity_addr_t
2557
2558 f->close_section(); //watch
2559 }
2560
2561 f->close_section(); //watchers
9f95a23c 2562 } else if (prefix == "dump_recovery_reservations") {
7c673cae
FG
2563 f->open_object_section("reservations");
2564 f->open_object_section("local_reservations");
2565 service.local_reserver.dump(f);
2566 f->close_section();
2567 f->open_object_section("remote_reservations");
2568 service.remote_reserver.dump(f);
2569 f->close_section();
2570 f->close_section();
9f95a23c 2571 } else if (prefix == "dump_scrub_reservations") {
eafe8130
TL
2572 f->open_object_section("scrub_reservations");
2573 service.dump_scrub_reservations(f);
2574 f->close_section();
9f95a23c 2575 } else if (prefix == "get_latest_osdmap") {
7c673cae 2576 get_latest_osdmap();
9f95a23c 2577 } else if (prefix == "set_heap_property") {
7c673cae
FG
2578 string property;
2579 int64_t value = 0;
2580 string error;
2581 bool success = false;
9f95a23c 2582 if (!cmd_getval(cmdmap, "property", property)) {
7c673cae
FG
2583 error = "unable to get property";
2584 success = false;
9f95a23c 2585 } else if (!cmd_getval(cmdmap, "value", value)) {
7c673cae
FG
2586 error = "unable to get value";
2587 success = false;
2588 } else if (value < 0) {
2589 error = "negative value not allowed";
2590 success = false;
2591 } else if (!ceph_heap_set_numeric_property(property.c_str(), (size_t)value)) {
2592 error = "invalid property";
2593 success = false;
2594 } else {
2595 success = true;
2596 }
2597 f->open_object_section("result");
2598 f->dump_string("error", error);
2599 f->dump_bool("success", success);
2600 f->close_section();
9f95a23c 2601 } else if (prefix == "get_heap_property") {
7c673cae
FG
2602 string property;
2603 size_t value = 0;
2604 string error;
2605 bool success = false;
9f95a23c 2606 if (!cmd_getval(cmdmap, "property", property)) {
7c673cae
FG
2607 error = "unable to get property";
2608 success = false;
2609 } else if (!ceph_heap_get_numeric_property(property.c_str(), &value)) {
2610 error = "invalid property";
2611 success = false;
2612 } else {
2613 success = true;
2614 }
2615 f->open_object_section("result");
2616 f->dump_string("error", error);
2617 f->dump_bool("success", success);
2618 f->dump_int("value", value);
2619 f->close_section();
9f95a23c 2620 } else if (prefix == "dump_objectstore_kv_stats") {
7c673cae 2621 store->get_db_statistics(f);
9f95a23c 2622 } else if (prefix == "dump_scrubs") {
7c673cae 2623 service.dumps_scrub(f);
9f95a23c 2624 } else if (prefix == "calc_objectstore_db_histogram") {
7c673cae 2625 store->generate_db_histogram(f);
9f95a23c 2626 } else if (prefix == "flush_store_cache") {
11fdf7f2 2627 store->flush_cache(&ss);
9f95a23c 2628 } else if (prefix == "dump_pgstate_history") {
7c673cae 2629 f->open_object_section("pgstate_history");
9f95a23c 2630 f->open_array_section("pgs");
11fdf7f2
TL
2631 vector<PGRef> pgs;
2632 _get_pgs(&pgs);
2633 for (auto& pg : pgs) {
9f95a23c 2634 f->open_object_section("pg");
11fdf7f2 2635 f->dump_stream("pg") << pg->pg_id;
9f95a23c 2636 f->dump_string("currently", pg->get_current_state());
11fdf7f2 2637 pg->dump_pgstate_history(f);
9f95a23c 2638 f->close_section();
7c673cae
FG
2639 }
2640 f->close_section();
9f95a23c
TL
2641 f->close_section();
2642 } else if (prefix == "compact") {
224ce89b
WB
2643 dout(1) << "triggering manual compaction" << dendl;
2644 auto start = ceph::coarse_mono_clock::now();
2645 store->compact();
2646 auto end = ceph::coarse_mono_clock::now();
11fdf7f2 2647 double duration = std::chrono::duration<double>(end-start).count();
224ce89b 2648 dout(1) << "finished manual compaction in "
11fdf7f2 2649 << duration
224ce89b
WB
2650 << " seconds" << dendl;
2651 f->open_object_section("compact_result");
11fdf7f2
TL
2652 f->dump_float("elapsed_time", duration);
2653 f->close_section();
9f95a23c 2654 } else if (prefix == "get_mapped_pools") {
11fdf7f2
TL
2655 f->open_array_section("mapped_pools");
2656 set<int64_t> poollist = get_mapped_pools();
2657 for (auto pool : poollist) {
2658 f->dump_int("pool_id", pool);
2659 }
2660 f->close_section();
9f95a23c 2661 } else if (prefix == "smart") {
11fdf7f2 2662 string devid;
9f95a23c
TL
2663 cmd_getval(cmdmap, "devid", devid);
2664 ostringstream out;
2665 probe_smart(devid, out);
2666 outbl.append(out.str());
2667 } else if (prefix == "list_devices") {
11fdf7f2
TL
2668 set<string> devnames;
2669 store->get_devices(&devnames);
9f95a23c 2670 f->open_array_section("list_devices");
11fdf7f2
TL
2671 for (auto dev : devnames) {
2672 if (dev.find("dm-") == 0) {
2673 continue;
2674 }
9f95a23c
TL
2675 string err;
2676 f->open_object_section("device");
11fdf7f2 2677 f->dump_string("device", "/dev/" + dev);
9f95a23c
TL
2678 f->dump_string("device_id", get_device_id(dev, &err));
2679 f->close_section();
11fdf7f2 2680 }
224ce89b 2681 f->close_section();
9f95a23c
TL
2682 } else if (prefix == "send_beacon") {
2683 lock_guard l(osd_lock);
11fdf7f2
TL
2684 if (is_active()) {
2685 send_beacon(ceph::coarse_mono_clock::now());
2686 }
9f95a23c
TL
2687 }
2688
2689 else if (prefix == "cluster_log") {
2690 vector<string> msg;
2691 cmd_getval(cmdmap, "message", msg);
2692 if (msg.empty()) {
2693 ret = -EINVAL;
2694 ss << "ignoring empty log message";
2695 goto out;
2696 }
2697 string message = msg.front();
2698 for (vector<string>::iterator a = ++msg.begin(); a != msg.end(); ++a)
2699 message += " " + *a;
2700 string lvl;
2701 cmd_getval(cmdmap, "level", lvl);
2702 clog_type level = string_to_clog_type(lvl);
2703 if (level < 0) {
2704 ret = -EINVAL;
2705 ss << "unknown level '" << lvl << "'";
2706 goto out;
2707 }
2708 clog->do_log(level, message);
2709 }
2710
2711 else if (prefix == "bench") {
2712 lock_guard l(osd_lock);
2713 int64_t count;
2714 int64_t bsize;
2715 int64_t osize, onum;
2716 // default count 1G, size 4MB
2717 cmd_getval(cmdmap, "count", count, (int64_t)1 << 30);
2718 cmd_getval(cmdmap, "size", bsize, (int64_t)4 << 20);
2719 cmd_getval(cmdmap, "object_size", osize, (int64_t)0);
2720 cmd_getval(cmdmap, "object_num", onum, (int64_t)0);
2721
2722 uint32_t duration = cct->_conf->osd_bench_duration;
2723
2724 if (bsize > (int64_t) cct->_conf->osd_bench_max_block_size) {
2725 // let us limit the block size because the next checks rely on it
2726 // having a sane value. If we allow any block size to be set things
2727 // can still go sideways.
2728 ss << "block 'size' values are capped at "
2729 << byte_u_t(cct->_conf->osd_bench_max_block_size) << ". If you wish to use"
2730 << " a higher value, please adjust 'osd_bench_max_block_size'";
2731 ret = -EINVAL;
2732 goto out;
2733 } else if (bsize < (int64_t) (1 << 20)) {
2734 // entering the realm of small block sizes.
2735 // limit the count to a sane value, assuming a configurable amount of
2736 // IOPS and duration, so that the OSD doesn't get hung up on this,
2737 // preventing timeouts from going off
2738 int64_t max_count =
2739 bsize * duration * cct->_conf->osd_bench_small_size_max_iops;
2740 if (count > max_count) {
2741 ss << "'count' values greater than " << max_count
2742 << " for a block size of " << byte_u_t(bsize) << ", assuming "
2743 << cct->_conf->osd_bench_small_size_max_iops << " IOPS,"
2744 << " for " << duration << " seconds,"
2745 << " can cause ill effects on osd. "
2746 << " Please adjust 'osd_bench_small_size_max_iops' with a higher"
2747 << " value if you wish to use a higher 'count'.";
2748 ret = -EINVAL;
2749 goto out;
eafe8130
TL
2750 }
2751 } else {
9f95a23c
TL
2752 // 1MB block sizes are big enough so that we get more stuff done.
2753 // However, to avoid the osd from getting hung on this and having
2754 // timers being triggered, we are going to limit the count assuming
2755 // a configurable throughput and duration.
2756 // NOTE: max_count is the total amount of bytes that we believe we
2757 // will be able to write during 'duration' for the given
2758 // throughput. The block size hardly impacts this unless it's
2759 // way too big. Given we already check how big the block size
2760 // is, it's safe to assume everything will check out.
2761 int64_t max_count =
2762 cct->_conf->osd_bench_large_size_max_throughput * duration;
2763 if (count > max_count) {
2764 ss << "'count' values greater than " << max_count
2765 << " for a block size of " << byte_u_t(bsize) << ", assuming "
2766 << byte_u_t(cct->_conf->osd_bench_large_size_max_throughput) << "/s,"
2767 << " for " << duration << " seconds,"
2768 << " can cause ill effects on osd. "
2769 << " Please adjust 'osd_bench_large_size_max_throughput'"
2770 << " with a higher value if you wish to use a higher 'count'.";
2771 ret = -EINVAL;
2772 goto out;
2773 }
eafe8130 2774 }
eafe8130 2775
9f95a23c
TL
2776 if (osize && bsize > osize)
2777 bsize = osize;
eafe8130 2778
9f95a23c
TL
2779 dout(1) << " bench count " << count
2780 << " bsize " << byte_u_t(bsize) << dendl;
eafe8130 2781
9f95a23c
TL
2782 ObjectStore::Transaction cleanupt;
2783
2784 if (osize && onum) {
2785 bufferlist bl;
2786 bufferptr bp(osize);
2787 bp.zero();
2788 bl.push_back(std::move(bp));
2789 bl.rebuild_page_aligned();
2790 for (int i=0; i<onum; ++i) {
2791 char nm[30];
2792 snprintf(nm, sizeof(nm), "disk_bw_test_%d", i);
2793 object_t oid(nm);
2794 hobject_t soid(sobject_t(oid, 0));
2795 ObjectStore::Transaction t;
2796 t.write(coll_t(), ghobject_t(soid), 0, osize, bl);
2797 store->queue_transaction(service.meta_ch, std::move(t), NULL);
2798 cleanupt.remove(coll_t(), ghobject_t(soid));
2799 }
2800 }
2801
2802 bufferlist bl;
2803 bufferptr bp(bsize);
2804 bp.zero();
2805 bl.push_back(std::move(bp));
2806 bl.rebuild_page_aligned();
2807
2808 {
2809 C_SaferCond waiter;
2810 if (!service.meta_ch->flush_commit(&waiter)) {
2811 waiter.wait();
2812 }
2813 }
2814
2815 utime_t start = ceph_clock_now();
2816 for (int64_t pos = 0; pos < count; pos += bsize) {
2817 char nm[30];
2818 unsigned offset = 0;
2819 if (onum && osize) {
2820 snprintf(nm, sizeof(nm), "disk_bw_test_%d", (int)(rand() % onum));
2821 offset = rand() % (osize / bsize) * bsize;
2822 } else {
2823 snprintf(nm, sizeof(nm), "disk_bw_test_%lld", (long long)pos);
2824 }
2825 object_t oid(nm);
2826 hobject_t soid(sobject_t(oid, 0));
2827 ObjectStore::Transaction t;
2828 t.write(coll_t::meta(), ghobject_t(soid), offset, bsize, bl);
2829 store->queue_transaction(service.meta_ch, std::move(t), NULL);
2830 if (!onum || !osize)
2831 cleanupt.remove(coll_t::meta(), ghobject_t(soid));
2832 }
2833
2834 {
2835 C_SaferCond waiter;
2836 if (!service.meta_ch->flush_commit(&waiter)) {
2837 waiter.wait();
2838 }
2839 }
2840 utime_t end = ceph_clock_now();
2841
2842 // clean up
2843 store->queue_transaction(service.meta_ch, std::move(cleanupt), NULL);
2844 {
2845 C_SaferCond waiter;
2846 if (!service.meta_ch->flush_commit(&waiter)) {
2847 waiter.wait();
2848 }
2849 }
2850
2851 double elapsed = end - start;
2852 double rate = count / elapsed;
2853 double iops = rate / bsize;
2854 f->open_object_section("osd_bench_results");
2855 f->dump_int("bytes_written", count);
2856 f->dump_int("blocksize", bsize);
2857 f->dump_float("elapsed_sec", elapsed);
2858 f->dump_float("bytes_per_sec", rate);
2859 f->dump_float("iops", iops);
2860 f->close_section();
2861 }
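  // Illustrative numbers (not upstream code; the config values used are
  // assumptions) for the two caps enforced above:
  //   small-block path (bsize < 1 MiB):
  //     max_count = bsize * duration * osd_bench_small_size_max_iops
  //               = 65536 B * 30 s * 100 IOPS ~= 187.5 MiB
  //   large-block path (bsize >= 1 MiB):
  //     max_count = osd_bench_large_size_max_throughput * duration
  //               = 100 MiB/s * 30 s = 3000 MiB
  // A bench request asking for more bytes than the applicable cap is
  // rejected with -EINVAL instead of risking op-thread timeouts.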
2862
2863 else if (prefix == "flush_pg_stats") {
2864 mgrc.send_pgstats();
2865 f->dump_unsigned("stat_seq", service.get_osd_stat_seq());
2866 }
2867
2868 else if (prefix == "heap") {
2869 ret = ceph::osd_cmds::heap(*cct, cmdmap, *f, ss);
2870 }
2871
2872 else if (prefix == "debug dump_missing") {
2873 f->open_array_section("pgs");
2874 vector<PGRef> pgs;
2875 _get_pgs(&pgs);
2876 for (auto& pg : pgs) {
2877 string s = stringify(pg->pg_id);
2878 f->open_array_section(s.c_str());
2879 pg->lock();
2880 pg->dump_missing(f);
2881 pg->unlock();
2882 f->close_section();
2883 }
2884 f->close_section();
2885 }
2886
2887 else if (prefix == "debug kick_recovery_wq") {
2888 int64_t delay;
2889 cmd_getval(cmdmap, "delay", delay);
2890 ostringstream oss;
2891 oss << delay;
2892 ret = cct->_conf.set_val("osd_recovery_delay_start", oss.str().c_str());
2893 if (ret != 0) {
2894 ss << "kick_recovery_wq: error setting "
2895 << "osd_recovery_delay_start to '" << delay << "': error "
2896 << ret;
2897 goto out;
2898 }
2899 cct->_conf.apply_changes(nullptr);
2900 ss << "kicking recovery queue. set osd_recovery_delay_start "
2901 << "to " << cct->_conf->osd_recovery_delay_start;
2902 }
2903
2904 else if (prefix == "cpu_profiler") {
2905 ostringstream ds;
2906 string arg;
2907 cmd_getval(cmdmap, "arg", arg);
2908 vector<string> argvec;
2909 get_str_vec(arg, argvec);
2910 cpu_profiler_handle_command(argvec, ds);
2911 outbl.append(ds.str());
2912 }
2913
2914 else if (prefix == "dump_pg_recovery_stats") {
2915 lock_guard l(osd_lock);
2916 pg_recovery_stats.dump_formatted(f);
2917 }
2918
2919 else if (prefix == "reset_pg_recovery_stats") {
2920 lock_guard l(osd_lock);
2921 pg_recovery_stats.reset();
2922 }
2923
2924 else if (prefix == "perf histogram dump") {
2925 std::string logger;
2926 std::string counter;
2927 cmd_getval(cmdmap, "logger", logger);
2928 cmd_getval(cmdmap, "counter", counter);
2929 cct->get_perfcounters_collection()->dump_formatted_histograms(
2930 f, false, logger, counter);
2931 }
2932
2933 else if (prefix == "cache drop") {
2934 lock_guard l(osd_lock);
2935 dout(20) << "clearing all caches" << dendl;
2936 // Clear the objectstore's cache - onode and buffer for Bluestore,
2937 // system's pagecache for Filestore
2938 ret = store->flush_cache(&ss);
2939 if (ret < 0) {
2940 ss << "Error flushing objectstore cache: " << cpp_strerror(ret);
2941 goto out;
2942 }
2943 // Clear the objectcontext cache (per PG)
2944 vector<PGRef> pgs;
2945 _get_pgs(&pgs);
2946 for (auto& pg: pgs) {
2947 pg->clear_cache();
2948 }
2949 }
2950
2951 else if (prefix == "cache status") {
2952 lock_guard l(osd_lock);
2953 int obj_ctx_count = 0;
2954 vector<PGRef> pgs;
2955 _get_pgs(&pgs);
2956 for (auto& pg: pgs) {
2957 obj_ctx_count += pg->get_cache_obj_count();
2958 }
2959 f->open_object_section("cache_status");
2960 f->dump_int("object_ctx", obj_ctx_count);
2961 store->dump_cache_stats(f);
2962 f->close_section();
2963 }
2964
2965 else if (prefix == "scrub_purged_snaps") {
2966 lock_guard l(osd_lock);
2967 scrub_purged_snaps();
2968 }
2969
2970 else if (prefix == "dump_osd_network") {
2971 lock_guard l(osd_lock);
2972 int64_t value = 0;
2973 if (!(cmd_getval(cmdmap, "value", value))) {
2974 // Convert milliseconds to microseconds
2975 value = static_cast<double>(g_conf().get_val<double>(
2976 "mon_warn_on_slow_ping_time")) * 1000;
2977 if (value == 0) {
2978 double ratio = g_conf().get_val<double>("mon_warn_on_slow_ping_ratio");
2979 value = g_conf().get_val<int64_t>("osd_heartbeat_grace");
2980 value *= 1000000 * ratio; // Seconds of grace to microseconds at ratio
2981 }
2982 } else {
2983 // Convert user input to microseconds
2984 value *= 1000;
2985 }
2986 if (value < 0) value = 0;
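    // Illustrative numbers (not upstream code; the defaults shown are
    // assumptions): with mon_warn_on_slow_ping_time unset (0), the threshold
    // becomes osd_heartbeat_grace * 1000000 * mon_warn_on_slow_ping_ratio,
    // e.g. 20 * 1000000 * 0.05 = 1000000 us (1 second). An explicit "value"
    // argument is taken as milliseconds, so value=500 yields 500000 us.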
2987
2988 struct osd_ping_time_t {
2989 uint32_t pingtime;
2990 int to;
2991 bool back;
2992 std::array<uint32_t,3> times;
2993 std::array<uint32_t,3> min;
2994 std::array<uint32_t,3> max;
2995 uint32_t last;
2996 uint32_t last_update;
2997
2998 bool operator<(const osd_ping_time_t& rhs) const {
2999 if (pingtime < rhs.pingtime)
3000 return true;
3001 if (pingtime > rhs.pingtime)
3002 return false;
3003 if (to < rhs.to)
3004 return true;
3005 if (to > rhs.to)
3006 return false;
3007 return back;
3008 }
3009 };
3010
3011 set<osd_ping_time_t> sorted;
3012 // Get pingtimes under lock and not on the stack
eafe8130
TL
3013 map<int, osd_stat_t::Interfaces> *pingtimes = new map<int, osd_stat_t::Interfaces>;
3014 service.get_hb_pingtime(pingtimes);
3015 for (auto j : *pingtimes) {
3016 if (j.second.last_update == 0)
3017 continue;
3018 osd_ping_time_t item;
3019 item.pingtime = std::max(j.second.back_pingtime[0], j.second.back_pingtime[1]);
3020 item.pingtime = std::max(item.pingtime, j.second.back_pingtime[2]);
3021 if (item.pingtime >= value) {
3022 item.to = j.first;
3023 item.times[0] = j.second.back_pingtime[0];
3024 item.times[1] = j.second.back_pingtime[1];
3025 item.times[2] = j.second.back_pingtime[2];
3026 item.min[0] = j.second.back_min[0];
3027 item.min[1] = j.second.back_min[1];
3028 item.min[2] = j.second.back_min[2];
3029 item.max[0] = j.second.back_max[0];
3030 item.max[1] = j.second.back_max[1];
3031 item.max[2] = j.second.back_max[2];
3032 item.last = j.second.back_last;
3033 item.back = true;
3034 item.last_update = j.second.last_update;
3035 sorted.emplace(item);
3036 }
3037 if (j.second.front_last == 0)
3038 continue;
3039 item.pingtime = std::max(j.second.front_pingtime[0], j.second.front_pingtime[1]);
3040 item.pingtime = std::max(item.pingtime, j.second.front_pingtime[2]);
3041 if (item.pingtime >= value) {
3042 item.to = j.first;
3043 item.times[0] = j.second.front_pingtime[0];
3044 item.times[1] = j.second.front_pingtime[1];
3045 item.times[2] = j.second.front_pingtime[2];
3046 item.min[0] = j.second.front_min[0];
3047 item.min[1] = j.second.front_min[1];
3048 item.min[2] = j.second.front_min[2];
3049 item.max[0] = j.second.front_max[0];
3050 item.max[1] = j.second.front_max[1];
3051 item.max[2] = j.second.front_max[2];
3052 item.last = j.second.front_last;
3053 item.last_update = j.second.last_update;
3054 item.back = false;
3055 sorted.emplace(item);
3056 }
3057 }
3058 delete pingtimes;
3059 //
3060 // Network ping times (1min 5min 15min)
3061 f->open_object_section("network_ping_times");
3062 f->dump_int("threshold", value / 1000);
3063 f->open_array_section("entries");
3064 for (auto &sitem : boost::adaptors::reverse(sorted)) {
3065 ceph_assert(sitem.pingtime >= value);
3066 f->open_object_section("entry");
3067
3068 const time_t lu(sitem.last_update);
3069 char buffer[26];
3070 string lustr(ctime_r(&lu, buffer));
3071 lustr.pop_back(); // Remove trailing \n
3072 auto stale = cct->_conf.get_val<int64_t>("osd_heartbeat_stale");
3073 f->dump_string("last update", lustr);
3074 f->dump_bool("stale", ceph_clock_now().sec() - sitem.last_update > stale);
3075 f->dump_int("from osd", whoami);
3076 f->dump_int("to osd", sitem.to);
3077 f->dump_string("interface", (sitem.back ? "back" : "front"));
3078 f->open_object_section("average");
3079 f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.times[0],3).c_str());
3080 f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.times[1],3).c_str());
3081 f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.times[2],3).c_str());
3082 f->close_section(); // average
3083 f->open_object_section("min");
3084 f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.min[0],3).c_str());
3085 f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.min[1],3).c_str());
3086 f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.min[2],3).c_str());
3087 f->close_section(); // min
3088 f->open_object_section("max");
3089 f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.max[0],3).c_str());
3090 f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.max[1],3).c_str());
3091 f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.max[2],3).c_str());
3092 f->close_section(); // max
3093 f->dump_format_unquoted("last", "%s", fixed_u_to_string(sitem.last,3).c_str());
3094 f->close_section(); // entry
3095 }
3096 f->close_section(); // entries
3097 f->close_section(); // network_ping_times
7c673cae 3098 } else {
11fdf7f2 3099 ceph_abort_msg("broken asok registration");
7c673cae 3100 }
9f95a23c
TL
3101
3102 out:
3103 on_finish(ret, ss.str(), outbl);
7c673cae
FG
3104}
3105
3106class TestOpsSocketHook : public AdminSocketHook {
3107 OSDService *service;
3108 ObjectStore *store;
3109public:
3110 TestOpsSocketHook(OSDService *s, ObjectStore *st) : service(s), store(st) {}
9f95a23c
TL
3111 int call(std::string_view command, const cmdmap_t& cmdmap,
3112 Formatter *f,
3113 std::ostream& errss,
3114 bufferlist& out) override {
3115 int r = 0;
3116 stringstream outss;
11fdf7f2 3117 try {
9f95a23c
TL
3118 test_ops(service, store, command, cmdmap, outss);
3119 out.append(outss);
3120 } catch (const TOPNSPC::common::bad_cmd_get& e) {
3121 errss << e.what();
3122 r = -EINVAL;
11fdf7f2 3123 }
9f95a23c 3124 return r;
7c673cae
FG
3125 }
3126 void test_ops(OSDService *service, ObjectStore *store,
11fdf7f2 3127 std::string_view command, const cmdmap_t& cmdmap, ostream &ss);
7c673cae
FG
3128
3129};
3130
3131class OSD::C_Tick : public Context {
3132 OSD *osd;
3133 public:
3134 explicit C_Tick(OSD *o) : osd(o) {}
3135 void finish(int r) override {
3136 osd->tick();
3137 }
3138};
3139
3140class OSD::C_Tick_WithoutOSDLock : public Context {
3141 OSD *osd;
3142 public:
3143 explicit C_Tick_WithoutOSDLock(OSD *o) : osd(o) {}
3144 void finish(int r) override {
3145 osd->tick_without_osd_lock();
3146 }
3147};
3148
3149int OSD::enable_disable_fuse(bool stop)
3150{
3151#ifdef HAVE_LIBFUSE
3152 int r;
3153 string mntpath = cct->_conf->osd_data + "/fuse";
3154 if (fuse_store && (stop || !cct->_conf->osd_objectstore_fuse)) {
3155 dout(1) << __func__ << " disabling" << dendl;
3156 fuse_store->stop();
3157 delete fuse_store;
3158 fuse_store = NULL;
3159 r = ::rmdir(mntpath.c_str());
7c673cae 3160 if (r < 0) {
c07f9fc5
FG
3161 r = -errno;
3162 derr << __func__ << " failed to rmdir " << mntpath << ": "
3163 << cpp_strerror(r) << dendl;
7c673cae
FG
3164 return r;
3165 }
3166 return 0;
3167 }
3168 if (!fuse_store && cct->_conf->osd_objectstore_fuse) {
3169 dout(1) << __func__ << " enabling" << dendl;
3170 r = ::mkdir(mntpath.c_str(), 0700);
3171 if (r < 0)
3172 r = -errno;
3173 if (r < 0 && r != -EEXIST) {
3174 derr << __func__ << " unable to create " << mntpath << ": "
3175 << cpp_strerror(r) << dendl;
3176 return r;
3177 }
3178 fuse_store = new FuseStore(store, mntpath);
3179 r = fuse_store->start();
3180 if (r < 0) {
3181 derr << __func__ << " unable to start fuse: " << cpp_strerror(r) << dendl;
3182 delete fuse_store;
3183 fuse_store = NULL;
3184 return r;
3185 }
3186 }
3187#endif // HAVE_LIBFUSE
3188 return 0;
3189}
3190
9f95a23c
TL
3191size_t OSD::get_num_cache_shards()
3192{
3193 return cct->_conf.get_val<Option::size_t>("osd_num_cache_shards");
3194}
3195
31f18b77
FG
3196int OSD::get_num_op_shards()
3197{
3198 if (cct->_conf->osd_op_num_shards)
3199 return cct->_conf->osd_op_num_shards;
3200 if (store_is_rotational)
3201 return cct->_conf->osd_op_num_shards_hdd;
3202 else
3203 return cct->_conf->osd_op_num_shards_ssd;
3204}
3205
3206int OSD::get_num_op_threads()
3207{
3208 if (cct->_conf->osd_op_num_threads_per_shard)
3209 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard;
3210 if (store_is_rotational)
3211 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard_hdd;
3212 else
3213 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard_ssd;
3214}
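// Illustrative numbers (not upstream code; the defaults quoted are
// assumptions): with osd_op_num_shards and osd_op_num_threads_per_shard
// left at 0, a rotational store using osd_op_num_shards_hdd = 5 and
// osd_op_num_threads_per_shard_hdd = 1 runs 5 shards and 5 op threads,
// while a non-rotational store using osd_op_num_shards_ssd = 8 and
// osd_op_num_threads_per_shard_ssd = 2 runs 8 shards and 16 op threads.
// Setting either generic option explicitly overrides the hdd/ssd variants.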
3215
c07f9fc5
FG
3216float OSD::get_osd_recovery_sleep()
3217{
3218 if (cct->_conf->osd_recovery_sleep)
3219 return cct->_conf->osd_recovery_sleep;
d2e6a577 3220 if (!store_is_rotational && !journal_is_rotational)
c07f9fc5 3221 return cct->_conf->osd_recovery_sleep_ssd;
d2e6a577 3222 else if (store_is_rotational && !journal_is_rotational)
11fdf7f2 3223 return cct->_conf.get_val<double>("osd_recovery_sleep_hybrid");
d2e6a577
FG
3224 else
3225 return cct->_conf->osd_recovery_sleep_hdd;
c07f9fc5
FG
3226}
3227
11fdf7f2
TL
3228float OSD::get_osd_delete_sleep()
3229{
3230 float osd_delete_sleep = cct->_conf.get_val<double>("osd_delete_sleep");
3231 if (osd_delete_sleep > 0)
3232 return osd_delete_sleep;
3233 if (!store_is_rotational && !journal_is_rotational)
3234 return cct->_conf.get_val<double>("osd_delete_sleep_ssd");
3235 if (store_is_rotational && !journal_is_rotational)
3236 return cct->_conf.get_val<double>("osd_delete_sleep_hybrid");
3237 return cct->_conf.get_val<double>("osd_delete_sleep_hdd");
3238}
3239
9f95a23c
TL
3240int OSD::get_recovery_max_active()
3241{
3242 if (cct->_conf->osd_recovery_max_active)
3243 return cct->_conf->osd_recovery_max_active;
3244 if (store_is_rotational)
3245 return cct->_conf->osd_recovery_max_active_hdd;
3246 else
3247 return cct->_conf->osd_recovery_max_active_ssd;
3248}
3249
494da23a
TL
3250float OSD::get_osd_snap_trim_sleep()
3251{
3252 float osd_snap_trim_sleep = cct->_conf.get_val<double>("osd_snap_trim_sleep");
3253 if (osd_snap_trim_sleep > 0)
3254 return osd_snap_trim_sleep;
3255 if (!store_is_rotational && !journal_is_rotational)
3256 return cct->_conf.get_val<double>("osd_snap_trim_sleep_ssd");
3257 if (store_is_rotational && !journal_is_rotational)
3258 return cct->_conf.get_val<double>("osd_snap_trim_sleep_hybrid");
3259 return cct->_conf.get_val<double>("osd_snap_trim_sleep_hdd");
3260}
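// Illustrative summary (not upstream code): get_osd_recovery_sleep(),
// get_osd_delete_sleep() and get_osd_snap_trim_sleep() all pick a value the
// same way: an explicitly configured non-zero sleep wins; otherwise the
// *_ssd variant applies when both the store and the journal are
// non-rotational, the *_hybrid variant when only the journal is
// non-rotational, and the *_hdd variant in every other case.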
3261
7c673cae
FG
3262int OSD::init()
3263{
9f95a23c 3264 OSDMapRef osdmap;
7c673cae 3265 CompatSet initial, diff;
11fdf7f2 3266 std::lock_guard lock(osd_lock);
7c673cae
FG
3267 if (is_stopping())
3268 return 0;
3269
3270 tick_timer.init();
3271 tick_timer_without_osd_lock.init();
3272 service.recovery_request_timer.init();
11fdf7f2
TL
3273 service.sleep_timer.init();
3274
3275 boot_finisher.start();
3276
3277 {
3278 string val;
3279 store->read_meta("require_osd_release", &val);
9f95a23c 3280 last_require_osd_release = ceph_release_from_name(val);
11fdf7f2 3281 }
7c673cae
FG
3282
3283 // mount.
31f18b77
FG
3284 dout(2) << "init " << dev_path
3285 << " (looks like " << (store_is_rotational ? "hdd" : "ssd") << ")"
3286 << dendl;
d2e6a577 3287 dout(2) << "journal " << journal_path << dendl;
11fdf7f2 3288 ceph_assert(store); // call pre_init() first!
7c673cae 3289
9f95a23c 3290 store->set_cache_shards(get_num_cache_shards());
7c673cae
FG
3291
3292 int r = store->mount();
3293 if (r < 0) {
3294 derr << "OSD:init: unable to mount object store" << dendl;
3295 return r;
3296 }
d2e6a577
FG
3297 journal_is_rotational = store->is_journal_rotational();
3298 dout(2) << "journal looks like " << (journal_is_rotational ? "hdd" : "ssd")
3299 << dendl;
7c673cae
FG
3300
3301 enable_disable_fuse(false);
3302
3303 dout(2) << "boot" << dendl;
3304
11fdf7f2
TL
3305 service.meta_ch = store->open_collection(coll_t::meta());
3306
7c673cae
FG
3307 // initialize the daily loadavg with current 15min loadavg
3308 double loadavgs[3];
3309 if (getloadavg(loadavgs, 3) == 3) {
3310 daily_loadavg = loadavgs[2];
3311 } else {
3312 derr << "OSD::init() : couldn't read loadavgs\n" << dendl;
3313 daily_loadavg = 1.0;
3314 }
3315
3316 int rotating_auth_attempts = 0;
11fdf7f2
TL
3317 auto rotating_auth_timeout =
3318 g_conf().get_val<int64_t>("rotating_keys_bootstrap_timeout");
7c673cae
FG
3319
3320 // sanity check long object name handling
3321 {
3322 hobject_t l;
3323 l.oid.name = string(cct->_conf->osd_max_object_name_len, 'n');
3324 l.set_key(string(cct->_conf->osd_max_object_name_len, 'k'));
3325 l.nspace = string(cct->_conf->osd_max_object_namespace_len, 's');
3326 r = store->validate_hobject_key(l);
3327 if (r < 0) {
3328 derr << "backend (" << store->get_type() << ") is unable to support max "
3329 << "object name[space] len" << dendl;
3330 derr << " osd max object name len = "
3331 << cct->_conf->osd_max_object_name_len << dendl;
3332 derr << " osd max object namespace len = "
3333 << cct->_conf->osd_max_object_namespace_len << dendl;
3334 derr << cpp_strerror(r) << dendl;
3335 if (cct->_conf->osd_check_max_object_name_len_on_startup) {
3336 goto out;
3337 }
3338 derr << "osd_check_max_object_name_len_on_startup = false, starting anyway"
3339 << dendl;
3340 } else {
3341 dout(20) << "configured osd_max_object_name[space]_len looks ok" << dendl;
3342 }
3343 }
3344
3345 // read superblock
3346 r = read_superblock();
3347 if (r < 0) {
3348 derr << "OSD::init() : unable to read osd superblock" << dendl;
3349 r = -EINVAL;
3350 goto out;
3351 }
3352
3353 if (osd_compat.compare(superblock.compat_features) < 0) {
3354 derr << "The disk uses features unsupported by the executable." << dendl;
3355 derr << " ondisk features " << superblock.compat_features << dendl;
3356 derr << " daemon features " << osd_compat << dendl;
3357
3358 if (osd_compat.writeable(superblock.compat_features)) {
3359 CompatSet diff = osd_compat.unsupported(superblock.compat_features);
3360 derr << "it is still writeable, though. Missing features: " << diff << dendl;
3361 r = -EOPNOTSUPP;
3362 goto out;
3363 }
3364 else {
3365 CompatSet diff = osd_compat.unsupported(superblock.compat_features);
3366 derr << "Cannot write to disk! Missing features: " << diff << dendl;
3367 r = -EOPNOTSUPP;
3368 goto out;
3369 }
3370 }
3371
3372 assert_warn(whoami == superblock.whoami);
3373 if (whoami != superblock.whoami) {
3374 derr << "OSD::init: superblock says osd"
3375 << superblock.whoami << " but I am osd." << whoami << dendl;
3376 r = -EINVAL;
3377 goto out;
3378 }
3379
9f95a23c
TL
3380 startup_time = ceph::mono_clock::now();
3381
11fdf7f2 3382 // load up "current" osdmap
9f95a23c
TL
3383 assert_warn(!get_osdmap());
3384 if (get_osdmap()) {
11fdf7f2
TL
3385 derr << "OSD::init: unable to read current osdmap" << dendl;
3386 r = -EINVAL;
3387 goto out;
3388 }
3389 osdmap = get_map(superblock.current_epoch);
9f95a23c 3390 set_osdmap(osdmap);
11fdf7f2
TL
3391
3392 // make sure we don't have legacy pgs deleting
3393 {
3394 vector<coll_t> ls;
3395 int r = store->list_collections(ls);
3396 ceph_assert(r >= 0);
3397 for (auto c : ls) {
3398 spg_t pgid;
3399 if (c.is_pg(&pgid) &&
3400 !osdmap->have_pg_pool(pgid.pool())) {
3401 ghobject_t oid = make_final_pool_info_oid(pgid.pool());
3402 if (!store->exists(service.meta_ch, oid)) {
3403 derr << __func__ << " missing pg_pool_t for deleted pool "
3404 << pgid.pool() << " for pg " << pgid
3405 << "; please downgrade to luminous and allow "
3406 << "pg deletion to complete before upgrading" << dendl;
3407 ceph_abort();
3408 }
3409 }
3410 }
3411 }
3412
7c673cae
FG
3413 initial = get_osd_initial_compat_set();
3414 diff = superblock.compat_features.unsupported(initial);
3415 if (superblock.compat_features.merge(initial)) {
9f95a23c
TL
3416 // Are we adding SNAPMAPPER2?
3417 if (diff.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER2)) {
3418 dout(1) << __func__ << " upgrade snap_mapper (first start as octopus)"
3419 << dendl;
3420 auto ch = service.meta_ch;
3421 auto hoid = make_snapmapper_oid();
3422 unsigned max = cct->_conf->osd_target_transaction_size;
3423 r = SnapMapper::convert_legacy(cct, store, ch, hoid, max);
3424 if (r < 0)
3425 goto out;
3426 }
7c673cae
FG
3427 // We need to persist the new compat_set before we
3428 // do anything else
3429 dout(5) << "Upgrading superblock adding: " << diff << dendl;
3430 ObjectStore::Transaction t;
3431 write_superblock(t);
11fdf7f2 3432 r = store->queue_transaction(service.meta_ch, std::move(t));
7c673cae
FG
3433 if (r < 0)
3434 goto out;
3435 }
3436
3437 // make sure snap mapper object exists
11fdf7f2 3438 if (!store->exists(service.meta_ch, OSD::make_snapmapper_oid())) {
7c673cae
FG
3439 dout(10) << "init creating/touching snapmapper object" << dendl;
3440 ObjectStore::Transaction t;
3441 t.touch(coll_t::meta(), OSD::make_snapmapper_oid());
11fdf7f2 3442 r = store->queue_transaction(service.meta_ch, std::move(t));
7c673cae
FG
3443 if (r < 0)
3444 goto out;
3445 }
9f95a23c
TL
3446 if (!store->exists(service.meta_ch, OSD::make_purged_snaps_oid())) {
3447 dout(10) << "init creating/touching purged_snaps object" << dendl;
3448 ObjectStore::Transaction t;
3449 t.touch(coll_t::meta(), OSD::make_purged_snaps_oid());
3450 r = store->queue_transaction(service.meta_ch, std::move(t));
3451 if (r < 0)
3452 goto out;
3453 }
7c673cae
FG
3454
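  // optionally pre-open all rados object classes now rather than lazily on
  // first use; a failure here is only logged as a warning.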
3455 if (cct->_conf->osd_open_classes_on_start) {
9f95a23c 3456 int r = ClassHandler::get_instance().open_all_classes();
7c673cae
FG
3457 if (r)
3458 dout(1) << "warning: got an error loading one or more classes: " << cpp_strerror(r) << dendl;
3459 }
3460
11fdf7f2 3461 check_osdmap_features();
7c673cae
FG
3462
3463 create_recoverystate_perf();
3464
3465 {
3466 epoch_t bind_epoch = osdmap->get_epoch();
3467 service.set_epochs(NULL, NULL, &bind_epoch);
3468 }
3469
3470 clear_temp_objects();
3471
d2e6a577 3472 // initialize osdmap references in sharded wq
11fdf7f2
TL
3473 for (auto& shard : shards) {
3474 std::lock_guard l(shard->osdmap_lock);
3475 shard->shard_osdmap = osdmap;
3476 }
d2e6a577 3477
7c673cae
FG
3478 // load up pgs (as they previously existed)
3479 load_pgs();
3480
3481 dout(2) << "superblock: I am osd." << superblock.whoami << dendl;
7c673cae
FG
3482
3483 create_logger();
3484
11fdf7f2
TL
3485 // prime osd stats
3486 {
3487 struct store_statfs_t stbuf;
3488 osd_alert_list_t alerts;
3489 int r = store->statfs(&stbuf, &alerts);
3490 ceph_assert(r == 0);
3491 service.set_statfs(stbuf, alerts);
3492 }
3493
3494 // client_messenger auth_client is already set up by monc.
3495 for (auto m : { cluster_messenger,
3496 objecter_messenger,
3497 hb_front_client_messenger,
3498 hb_back_client_messenger,
3499 hb_front_server_messenger,
3500 hb_back_server_messenger } ) {
3501 m->set_auth_client(monc);
3502 }
3503 for (auto m : { client_messenger,
3504 cluster_messenger,
3505 hb_front_server_messenger,
3506 hb_back_server_messenger }) {
3507 m->set_auth_server(monc);
3508 }
3509 monc->set_handle_authentication_dispatcher(this);
7c673cae
FG
3510
3511 monc->set_want_keys(CEPH_ENTITY_TYPE_MON | CEPH_ENTITY_TYPE_OSD
3512 | CEPH_ENTITY_TYPE_MGR);
3513 r = monc->init();
3514 if (r < 0)
3515 goto out;
3516
11fdf7f2
TL
3517 mgrc.set_pgstats_cb([this](){ return collect_pg_stats(); });
3518 mgrc.set_perf_metric_query_cb(
9f95a23c
TL
3519 [this](const ConfigPayload &config_payload) {
3520 set_perf_queries(config_payload);
11fdf7f2 3521 },
9f95a23c
TL
3522 [this] {
3523 return get_perf_reports();
11fdf7f2 3524 });
7c673cae 3525 mgrc.init();
7c673cae
FG
3526
3527 // tell monc about log_client so it will know about mon session resets
3528 monc->set_log_client(&log_client);
3529 update_log_config();
3530
11fdf7f2
TL
3531 // i'm ready!
3532 client_messenger->add_dispatcher_tail(&mgrc);
3533 client_messenger->add_dispatcher_tail(this);
3534 cluster_messenger->add_dispatcher_head(this);
3535
3536 hb_front_client_messenger->add_dispatcher_head(&heartbeat_dispatcher);
3537 hb_back_client_messenger->add_dispatcher_head(&heartbeat_dispatcher);
3538 hb_front_server_messenger->add_dispatcher_head(&heartbeat_dispatcher);
3539 hb_back_server_messenger->add_dispatcher_head(&heartbeat_dispatcher);
3540
9f95a23c 3541 objecter_messenger->add_dispatcher_head(service.objecter.get());
11fdf7f2 3542
28e407b8
AA
3543 service.init();
3544 service.publish_map(osdmap);
3545 service.publish_superblock(superblock);
3546 service.max_oldest_map = superblock.oldest_map;
3547
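  // catch each loaded PG up with any splits or merges between its last
  // persisted map and the current map, priming the shards so the affected
  // slots exist before new maps are consumed.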
11fdf7f2
TL
3548 for (auto& shard : shards) {
3549 // put PGs in a temporary set because we may modify pg_slots
3550 // unordered_map below.
3551 set<PGRef> pgs;
3552 for (auto& i : shard->pg_slots) {
3553 PGRef pg = i.second->pg;
3554 if (!pg) {
3555 continue;
3556 }
3557 pgs.insert(pg);
3558 }
3559 for (auto pg : pgs) {
9f95a23c 3560 std::scoped_lock l{*pg};
11fdf7f2
TL
3561 set<pair<spg_t,epoch_t>> new_children;
3562 set<pair<spg_t,epoch_t>> merge_pgs;
3563 service.identify_splits_and_merges(pg->get_osdmap(), osdmap, pg->pg_id,
3564 &new_children, &merge_pgs);
3565 if (!new_children.empty()) {
3566 for (auto shard : shards) {
3567 shard->prime_splits(osdmap, &new_children);
3568 }
3569 assert(new_children.empty());
3570 }
3571 if (!merge_pgs.empty()) {
3572 for (auto shard : shards) {
3573 shard->prime_merges(osdmap, &merge_pgs);
3574 }
3575 assert(merge_pgs.empty());
3576 }
11fdf7f2
TL
3577 }
3578 }
3579
7c673cae 3580 osd_op_tp.start();
7c673cae 3581
7c673cae
FG
3582 // start the heartbeat
3583 heartbeat_thread.create("osd_srv_heartbt");
3584
3585 // tick
91327a77
AA
3586 tick_timer.add_event_after(get_tick_interval(),
3587 new C_Tick(this));
7c673cae 3588 {
11fdf7f2 3589 std::lock_guard l(tick_timer_lock);
91327a77
AA
3590 tick_timer_without_osd_lock.add_event_after(get_tick_interval(),
3591 new C_Tick_WithoutOSDLock(this));
7c673cae
FG
3592 }
3593
9f95a23c 3594 osd_lock.unlock();
7c673cae
FG
3595
3596 r = monc->authenticate();
3597 if (r < 0) {
c07f9fc5
FG
3598 derr << __func__ << " authentication failed: " << cpp_strerror(r)
3599 << dendl;
11fdf7f2 3600 exit(1);
7c673cae
FG
3601 }
3602
11fdf7f2 3603 while (monc->wait_auth_rotating(rotating_auth_timeout) < 0) {
7c673cae
FG
3604 derr << "unable to obtain rotating service keys; retrying" << dendl;
3605 ++rotating_auth_attempts;
11fdf7f2 3606 if (rotating_auth_attempts > g_conf()->max_rotating_auth_attempts) {
c07f9fc5 3607 derr << __func__ << " wait_auth_rotating timed out" << dendl;
11fdf7f2 3608 exit(1);
7c673cae
FG
3609 }
3610 }
3611
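  // register this OSD's device class and CRUSH location with the monitors
  // before booting; both helpers go through mon_cmd_maybe_osd_create(),
  // which creates the osd id on the fly if the cluster does not know it yet.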
3612 r = update_crush_device_class();
3613 if (r < 0) {
d2e6a577
FG
3614 derr << __func__ << " unable to update_crush_device_class: "
3615 << cpp_strerror(r) << dendl;
11fdf7f2 3616 exit(1);
7c673cae
FG
3617 }
3618
3619 r = update_crush_location();
3620 if (r < 0) {
d2e6a577 3621 derr << __func__ << " unable to update_crush_location: "
c07f9fc5 3622 << cpp_strerror(r) << dendl;
11fdf7f2 3623 exit(1);
7c673cae
FG
3624 }
3625
9f95a23c 3626 osd_lock.lock();
7c673cae
FG
3627 if (is_stopping())
3628 return 0;
3629
3630 // start objecter *after* we have authenticated, so that we don't ignore
3631 // the OSDMaps it requests.
3632 service.final_init();
3633
3634 check_config();
3635
3636 dout(10) << "ensuring pgs have consumed prior maps" << dendl;
3637 consume_map();
7c673cae
FG
3638
3639 dout(0) << "done with init, starting boot process" << dendl;
3640
3641 // subscribe to any pg creations
3642 monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0);
3643
3644 // MgrClient needs this (it doesn't have MonClient reference itself)
3645 monc->sub_want("mgrmap", 0, 0);
3646
3647 // we don't need to ask for an osdmap here; objecter will do that for us
3648 //monc->sub_want("osdmap", osdmap->get_epoch(), CEPH_SUBSCRIBE_ONETIME);
3649
3650 monc->renew_subs();
3651
3652 start_boot();
3653
3654 return 0;
7c673cae
FG
3655
3656out:
3657 enable_disable_fuse(true);
3658 store->umount();
3659 delete store;
3660 store = NULL;
3661 return r;
3662}
3663
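// final_init() registers the admin socket and test-ops commands. At runtime
// they are reachable through the daemon socket, e.g.
//   ceph daemon osd.<id> dump_ops_in_flight
// (or via "ceph tell osd.<id> ..." for the commands also exported over the
// tell interface).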
3664void OSD::final_init()
3665{
3666 AdminSocket *admin_socket = cct->get_admin_socket();
3667 asok_hook = new OSDSocketHook(this);
9f95a23c 3668 int r = admin_socket->register_command("status", asok_hook,
7c673cae 3669 "high-level status of OSD");
11fdf7f2 3670 ceph_assert(r == 0);
9f95a23c 3671 r = admin_socket->register_command("flush_journal",
7c673cae
FG
3672 asok_hook,
3673 "flush the journal to permanent store");
11fdf7f2 3674 ceph_assert(r == 0);
9f95a23c 3675 r = admin_socket->register_command("dump_ops_in_flight " \
c07f9fc5
FG
3676 "name=filterstr,type=CephString,n=N,req=false",
3677 asok_hook,
7c673cae 3678 "show the ops currently in flight");
11fdf7f2 3679 ceph_assert(r == 0);
9f95a23c 3680 r = admin_socket->register_command("ops " \
c07f9fc5
FG
3681 "name=filterstr,type=CephString,n=N,req=false",
3682 asok_hook,
7c673cae 3683 "show the ops currently in flight");
11fdf7f2 3684 ceph_assert(r == 0);
9f95a23c 3685 r = admin_socket->register_command("dump_blocked_ops " \
c07f9fc5
FG
3686 "name=filterstr,type=CephString,n=N,req=false",
3687 asok_hook,
7c673cae 3688 "show the blocked ops currently in flight");
11fdf7f2 3689 ceph_assert(r == 0);
9f95a23c 3690 r = admin_socket->register_command("dump_historic_ops " \
c07f9fc5 3691 "name=filterstr,type=CephString,n=N,req=false",
7c673cae
FG
3692 asok_hook,
3693 "show recent ops");
11fdf7f2 3694 ceph_assert(r == 0);
9f95a23c 3695 r = admin_socket->register_command("dump_historic_slow_ops " \
c07f9fc5 3696 "name=filterstr,type=CephString,n=N,req=false",
7c673cae
FG
3697 asok_hook,
3698 "show slowest recent ops");
11fdf7f2 3699 ceph_assert(r == 0);
9f95a23c 3700 r = admin_socket->register_command("dump_historic_ops_by_duration " \
c07f9fc5 3701 "name=filterstr,type=CephString,n=N,req=false",
7c673cae
FG
3702 asok_hook,
3703 "show slowest recent ops, sorted by duration");
11fdf7f2 3704 ceph_assert(r == 0);
9f95a23c 3705 r = admin_socket->register_command("dump_op_pq_state",
7c673cae
FG
3706 asok_hook,
3707 "dump op priority queue state");
11fdf7f2 3708 ceph_assert(r == 0);
9f95a23c 3709 r = admin_socket->register_command("dump_blacklist",
7c673cae
FG
3710 asok_hook,
3711 "dump blacklisted clients and times");
11fdf7f2 3712 ceph_assert(r == 0);
9f95a23c 3713 r = admin_socket->register_command("dump_watchers",
7c673cae
FG
3714 asok_hook,
3715 "show clients which have active watches,"
3716 " and on which objects");
11fdf7f2 3717 ceph_assert(r == 0);
9f95a23c 3718 r = admin_socket->register_command("dump_recovery_reservations",
7c673cae
FG
3719 asok_hook,
3720 "show recovery reservations");
11fdf7f2 3721 ceph_assert(r == 0);
9f95a23c 3722 r = admin_socket->register_command("dump_scrub_reservations",
eafe8130 3723 asok_hook,
9f95a23c 3724 "show scrub reservations");
eafe8130 3725 ceph_assert(r == 0);
9f95a23c 3726 r = admin_socket->register_command("get_latest_osdmap",
7c673cae
FG
3727 asok_hook,
3728 "force osd to update the latest map from "
3729 "the mon");
11fdf7f2 3730 ceph_assert(r == 0);
7c673cae 3731
9f95a23c 3732 r = admin_socket->register_command("set_heap_property " \
7c673cae
FG
3733 "name=property,type=CephString " \
3734 "name=value,type=CephInt",
3735 asok_hook,
3736 "update malloc extension heap property");
11fdf7f2 3737 ceph_assert(r == 0);
7c673cae 3738
9f95a23c 3739 r = admin_socket->register_command("get_heap_property " \
7c673cae
FG
3740 "name=property,type=CephString",
3741 asok_hook,
3742 "get malloc extension heap property");
11fdf7f2 3743 ceph_assert(r == 0);
7c673cae
FG
3744
3745 r = admin_socket->register_command("dump_objectstore_kv_stats",
7c673cae
FG
3746 asok_hook,
3747 "print statistics of kvdb which used by bluestore");
11fdf7f2 3748 ceph_assert(r == 0);
7c673cae
FG
3749
3750 r = admin_socket->register_command("dump_scrubs",
7c673cae
FG
3751 asok_hook,
3752 "print scheduled scrubs");
11fdf7f2 3753 ceph_assert(r == 0);
7c673cae
FG
3754
3755 r = admin_socket->register_command("calc_objectstore_db_histogram",
7c673cae
FG
3756 asok_hook,
3757 "Generate key value histogram of kvdb(rocksdb) which used by bluestore");
11fdf7f2 3758 ceph_assert(r == 0);
7c673cae
FG
3759
3760 r = admin_socket->register_command("flush_store_cache",
7c673cae
FG
3761 asok_hook,
3762 "Flush bluestore internal cache");
11fdf7f2 3763 ceph_assert(r == 0);
9f95a23c 3764 r = admin_socket->register_command("dump_pgstate_history",
7c673cae
FG
3765 asok_hook,
3766 "show recent state history");
11fdf7f2 3767 ceph_assert(r == 0);
7c673cae 3768
9f95a23c 3769 r = admin_socket->register_command("compact",
224ce89b
WB
3770 asok_hook,
3771 "Commpact object store's omap."
3772 " WARNING: Compaction probably slows your requests");
11fdf7f2
TL
3773 ceph_assert(r == 0);
3774
9f95a23c 3775 r = admin_socket->register_command("get_mapped_pools",
11fdf7f2
TL
3776 asok_hook,
3777 "dump pools whose PG(s) are mapped to this OSD.");
3778
3779 ceph_assert(r == 0);
3780
9f95a23c 3781 r = admin_socket->register_command("smart name=devid,type=CephString,req=false",
11fdf7f2
TL
3782 asok_hook,
3783 "probe OSD devices for SMART data.");
3784
3785 ceph_assert(r == 0);
3786
9f95a23c 3787 r = admin_socket->register_command("list_devices",
11fdf7f2
TL
3788 asok_hook,
3789 "list OSD devices.");
9f95a23c 3790 r = admin_socket->register_command("send_beacon",
11fdf7f2
TL
3791 asok_hook,
3792 "send OSD beacon to mon immediately");
224ce89b 3793
9f95a23c
TL
3794 r = admin_socket->register_command(
3795 "dump_osd_network name=value,type=CephInt,req=false", asok_hook,
3796 "Dump osd heartbeat network ping times");
eafe8130
TL
3797 ceph_assert(r == 0);
3798
7c673cae
FG
3799 test_ops_hook = new TestOpsSocketHook(&(this->service), this->store);
3800 // Note: pools are CephString instead of CephPoolname because
3801 // these commands traditionally support both pool names and numbers
3802 r = admin_socket->register_command(
7c673cae
FG
3803 "setomapval " \
3804 "name=pool,type=CephString " \
3805 "name=objname,type=CephObjectname " \
3806 "name=key,type=CephString "\
3807 "name=val,type=CephString",
3808 test_ops_hook,
3809 "set omap key");
11fdf7f2 3810 ceph_assert(r == 0);
7c673cae 3811 r = admin_socket->register_command(
7c673cae
FG
3812 "rmomapkey " \
3813 "name=pool,type=CephString " \
3814 "name=objname,type=CephObjectname " \
3815 "name=key,type=CephString",
3816 test_ops_hook,
3817 "remove omap key");
11fdf7f2 3818 ceph_assert(r == 0);
7c673cae 3819 r = admin_socket->register_command(
7c673cae
FG
3820 "setomapheader " \
3821 "name=pool,type=CephString " \
3822 "name=objname,type=CephObjectname " \
3823 "name=header,type=CephString",
3824 test_ops_hook,
3825 "set omap header");
11fdf7f2 3826 ceph_assert(r == 0);
7c673cae
FG
3827
3828 r = admin_socket->register_command(
7c673cae
FG
3829 "getomap " \
3830 "name=pool,type=CephString " \
3831 "name=objname,type=CephObjectname",
3832 test_ops_hook,
3833 "output entire object map");
11fdf7f2 3834 ceph_assert(r == 0);
7c673cae
FG
3835
3836 r = admin_socket->register_command(
7c673cae
FG
3837 "truncobj " \
3838 "name=pool,type=CephString " \
3839 "name=objname,type=CephObjectname " \
3840 "name=len,type=CephInt",
3841 test_ops_hook,
3842 "truncate object to length");
11fdf7f2 3843 ceph_assert(r == 0);
7c673cae
FG
3844
3845 r = admin_socket->register_command(
7c673cae
FG
3846 "injectdataerr " \
3847 "name=pool,type=CephString " \
3848 "name=objname,type=CephObjectname " \
3849 "name=shardid,type=CephInt,req=false,range=0|255",
3850 test_ops_hook,
3851 "inject data error to an object");
11fdf7f2 3852 ceph_assert(r == 0);
7c673cae
FG
3853
3854 r = admin_socket->register_command(
7c673cae
FG
3855 "injectmdataerr " \
3856 "name=pool,type=CephString " \
3857 "name=objname,type=CephObjectname " \
3858 "name=shardid,type=CephInt,req=false,range=0|255",
3859 test_ops_hook,
3860 "inject metadata error to an object");
11fdf7f2 3861 ceph_assert(r == 0);
7c673cae 3862 r = admin_socket->register_command(
7c673cae
FG
3863 "set_recovery_delay " \
3864 "name=utime,type=CephInt,req=false",
3865 test_ops_hook,
3866 "Delay osd recovery by specified seconds");
11fdf7f2 3867 ceph_assert(r == 0);
7c673cae 3868 r = admin_socket->register_command(
7c673cae
FG
3869 "injectfull " \
3870 "name=type,type=CephString,req=false " \
3871 "name=count,type=CephInt,req=false ",
3872 test_ops_hook,
3873 "Inject a full disk (optional count times)");
11fdf7f2 3874 ceph_assert(r == 0);
9f95a23c
TL
3875 r = admin_socket->register_command(
3876 "bench " \
3877 "name=count,type=CephInt,req=false " \
3878 "name=size,type=CephInt,req=false " \
3879 "name=object_size,type=CephInt,req=false " \
3880 "name=object_num,type=CephInt,req=false ",
3881 asok_hook,
3882 "OSD benchmark: write <count> <size>-byte objects(with <obj_size> <obj_num>), " \
3883 "(default count=1G default size=4MB). Results in log.");
3884 ceph_assert(r == 0);
3885 r = admin_socket->register_command(
3886 "cluster_log " \
3887 "name=level,type=CephChoices,strings=error,warning,info,debug " \
3888 "name=message,type=CephString,n=N",
3889 asok_hook,
3890 "log a message to the cluster log");
3891 ceph_assert(r == 0);
3892 r = admin_socket->register_command(
3893 "flush_pg_stats",
3894 asok_hook,
3895 "flush pg stats");
3896 ceph_assert(r == 0);
3897 r = admin_socket->register_command(
3898 "heap " \
3899 "name=heapcmd,type=CephChoices,strings=" \
3900 "dump|start_profiler|stop_profiler|release|get_release_rate|set_release_rate|stats " \
3901 "name=value,type=CephString,req=false",
3902 asok_hook,
3903 "show heap usage info (available only if compiled with tcmalloc)");
3904 ceph_assert(r == 0);
3905 r = admin_socket->register_command(
3906 "debug dump_missing " \
3907 "name=filename,type=CephFilepath",
3908 asok_hook,
3909 "dump missing objects to a named file");
3910 ceph_assert(r == 0);
3911 r = admin_socket->register_command(
3912 "debug kick_recovery_wq " \
3913 "name=delay,type=CephInt,range=0",
3914 asok_hook,
3915 "set osd_recovery_delay_start to <val>");
3916 ceph_assert(r == 0);
3917 r = admin_socket->register_command(
3918 "cpu_profiler " \
3919 "name=arg,type=CephChoices,strings=status|flush",
3920 asok_hook,
3921 "run cpu profiling on daemon");
3922 ceph_assert(r == 0);
3923 r = admin_socket->register_command(
3924 "dump_pg_recovery_stats",
3925 asok_hook,
3926 "dump pg recovery statistics");
3927 ceph_assert(r == 0);
3928 r = admin_socket->register_command(
3929 "reset_pg_recovery_stats",
3930 asok_hook,
3931 "reset pg recovery statistics");
3932 ceph_assert(r == 0);
3933 r = admin_socket->register_command(
3934 "cache drop",
3935 asok_hook,
3936 "Drop all OSD caches");
3937 ceph_assert(r == 0);
3938 r = admin_socket->register_command(
3939 "cache status",
3940 asok_hook,
3941 "Get OSD caches statistics");
3942 ceph_assert(r == 0);
3943 r = admin_socket->register_command(
3944 "scrub_purged_snaps",
3945 asok_hook,
3946 "Scrub purged_snaps vs snapmapper index");
3947 ceph_assert(r == 0);
7c673cae 3948
9f95a23c
TL
3949 // -- pg commands --
3950 // old form: ceph pg <pgid> command ...
3951 r = admin_socket->register_command(
3952 "pg " \
3953 "name=pgid,type=CephPgid " \
3954 "name=cmd,type=CephChoices,strings=query",
3955 asok_hook,
3956 "");
3957 ceph_assert(r == 0);
3958 r = admin_socket->register_command(
3959 "pg " \
3960 "name=pgid,type=CephPgid " \
3961 "name=cmd,type=CephChoices,strings=mark_unfound_lost " \
3962 "name=mulcmd,type=CephChoices,strings=revert|delete",
3963 asok_hook,
3964 "");
3965 ceph_assert(r == 0);
3966 r = admin_socket->register_command(
3967 "pg " \
3968 "name=pgid,type=CephPgid " \
3969 "name=cmd,type=CephChoices,strings=list_unfound " \
3970 "name=offset,type=CephString,req=false",
3971 asok_hook,
3972 "");
3973 ceph_assert(r == 0);
3974 r = admin_socket->register_command(
3975 "pg " \
3976 "name=pgid,type=CephPgid " \
3977 "name=cmd,type=CephChoices,strings=scrub " \
3978 "name=time,type=CephInt,req=false",
3979 asok_hook,
3980 "");
3981 ceph_assert(r == 0);
3982 r = admin_socket->register_command(
3983 "pg " \
3984 "name=pgid,type=CephPgid " \
3985 "name=cmd,type=CephChoices,strings=deep_scrub " \
3986 "name=time,type=CephInt,req=false",
3987 asok_hook,
3988 "");
3989 ceph_assert(r == 0);
3990 // new form: tell <pgid> <cmd> for both cli and rest
3991 r = admin_socket->register_command(
3992 "query",
3993 asok_hook,
3994 "show details of a specific pg");
3995 ceph_assert(r == 0);
3996 r = admin_socket->register_command(
3997 "mark_unfound_lost " \
3998 "name=pgid,type=CephPgid,req=false " \
3999 "name=mulcmd,type=CephChoices,strings=revert|delete",
4000 asok_hook,
4001 "mark all unfound objects in this pg as lost, either removing or reverting to a prior version if one is available");
4002 ceph_assert(r == 0);
4003 r = admin_socket->register_command(
4004 "list_unfound " \
4005 "name=pgid,type=CephPgid,req=false " \
4006 "name=offset,type=CephString,req=false",
4007 asok_hook,
4008 "list unfound objects on this pg, perhaps starting at an offset given in JSON");
4009 ceph_assert(r == 0);
4010 r = admin_socket->register_command(
4011 "scrub " \
4012 "name=pgid,type=CephPgid,req=false " \
4013 "name=time,type=CephInt,req=false",
4014 asok_hook,
4015 "Trigger a scheduled scrub ");
4016 ceph_assert(r == 0);
4017 r = admin_socket->register_command(
4018 "deep_scrub " \
4019 "name=pgid,type=CephPgid,req=false " \
4020 "name=time,type=CephInt,req=false",
4021 asok_hook,
4022 "Trigger a scheduled deep scrub ");
4023 ceph_assert(r == 0);
4024}
7c673cae 4025
9f95a23c
TL
4026void OSD::create_logger()
4027{
4028 dout(10) << "create_logger" << dendl;
7c673cae 4029
9f95a23c 4030 logger = build_osd_logger(cct);
7c673cae
FG
4031 cct->get_perfcounters_collection()->add(logger);
4032}
4033
4034void OSD::create_recoverystate_perf()
4035{
4036 dout(10) << "create_recoverystate_perf" << dendl;
4037
9f95a23c 4038 recoverystate_perf = build_recoverystate_perf(cct);
7c673cae
FG
4039 cct->get_perfcounters_collection()->add(recoverystate_perf);
4040}
4041
4042int OSD::shutdown()
4043{
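  // fast shutdown: flush the log and _exit() immediately, skipping the
  // orderly teardown below.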
92f5a8d4
TL
4044 if (cct->_conf->osd_fast_shutdown) {
4045 derr << "*** Immediate shutdown (osd_fast_shutdown=true) ***" << dendl;
4046 cct->_log->flush();
4047 _exit(0);
4048 }
4049
7c673cae
FG
4050 if (!service.prepare_to_stop())
4051 return 0; // already shutting down
9f95a23c 4052 osd_lock.lock();
7c673cae 4053 if (is_stopping()) {
9f95a23c 4054 osd_lock.unlock();
7c673cae
FG
4055 return 0;
4056 }
11fdf7f2 4057 dout(0) << "shutdown" << dendl;
7c673cae
FG
4058
4059 set_state(STATE_STOPPING);
4060
4061 // Debugging
11fdf7f2
TL
4062 if (cct->_conf.get_val<bool>("osd_debug_shutdown")) {
4063 cct->_conf.set_val("debug_osd", "100");
4064 cct->_conf.set_val("debug_journal", "100");
4065 cct->_conf.set_val("debug_filestore", "100");
4066 cct->_conf.set_val("debug_bluestore", "100");
4067 cct->_conf.set_val("debug_ms", "100");
4068 cct->_conf.apply_changes(nullptr);
3efd9988 4069 }
7c673cae
FG
4070
4071 // stop MgrClient earlier as it's more like an internal consumer of OSD
4072 mgrc.shutdown();
4073
4074 service.start_shutdown();
4075
4076 // stop sending work to pgs. this just prevents any new work in _process
4077 // from racing with on_shutdown and potentially entering the pg after.
4078 op_shardedwq.drain();
4079
4080 // Shutdown PGs
4081 {
11fdf7f2
TL
4082 vector<PGRef> pgs;
4083 _get_pgs(&pgs);
4084 for (auto pg : pgs) {
4085 pg->shutdown();
7c673cae
FG
4086 }
4087 }
7c673cae
FG
4088
4089 // drain op queue again (in case PGs requeued something)
4090 op_shardedwq.drain();
4091 {
4092 finished.clear(); // zap waiters (bleh, this is messy)
11fdf7f2 4093 waiting_for_osdmap.clear();
7c673cae
FG
4094 }
4095
7c673cae 4096 // unregister commands
11fdf7f2 4097 cct->get_admin_socket()->unregister_commands(asok_hook);
7c673cae
FG
4098 delete asok_hook;
4099 asok_hook = NULL;
4100
11fdf7f2 4101 cct->get_admin_socket()->unregister_commands(test_ops_hook);
7c673cae
FG
4102 delete test_ops_hook;
4103 test_ops_hook = NULL;
4104
9f95a23c 4105 osd_lock.unlock();
7c673cae 4106
9f95a23c
TL
4107 {
4108 std::lock_guard l{heartbeat_lock};
4109 heartbeat_stop = true;
4110 heartbeat_cond.notify_all();
4111 heartbeat_peers.clear();
4112 }
7c673cae
FG
4113 heartbeat_thread.join();
4114
9f95a23c
TL
4115 hb_back_server_messenger->mark_down_all();
4116 hb_front_server_messenger->mark_down_all();
4117 hb_front_client_messenger->mark_down_all();
4118 hb_back_client_messenger->mark_down_all();
4119
7c673cae
FG
4120 osd_op_tp.drain();
4121 osd_op_tp.stop();
4122 dout(10) << "op sharded tp stopped" << dendl;
4123
7c673cae
FG
4124 dout(10) << "stopping agent" << dendl;
4125 service.agent_stop();
4126
11fdf7f2
TL
4127 boot_finisher.wait_for_empty();
4128
9f95a23c 4129 osd_lock.lock();
7c673cae 4130
11fdf7f2 4131 boot_finisher.stop();
494da23a 4132 reset_heartbeat_peers(true);
7c673cae
FG
4133
4134 tick_timer.shutdown();
4135
4136 {
11fdf7f2 4137 std::lock_guard l(tick_timer_lock);
7c673cae
FG
4138 tick_timer_without_osd_lock.shutdown();
4139 }
4140
4141 // note unmount epoch
9f95a23c 4142 dout(10) << "noting clean unmount in epoch " << get_osdmap_epoch() << dendl;
7c673cae 4143 superblock.mounted = service.get_boot_epoch();
9f95a23c 4144 superblock.clean_thru = get_osdmap_epoch();
7c673cae
FG
4145 ObjectStore::Transaction t;
4146 write_superblock(t);
11fdf7f2 4147 int r = store->queue_transaction(service.meta_ch, std::move(t));
7c673cae
FG
4148 if (r) {
4149 derr << "OSD::shutdown: error writing superblock: "
4150 << cpp_strerror(r) << dendl;
4151 }
4152
4153
31f18b77
FG
4154 service.shutdown_reserver();
4155
7c673cae
FG
4156 // Remove PGs
4157#ifdef PG_DEBUG_REFS
4158 service.dump_live_pgids();
4159#endif
11fdf7f2
TL
4160 while (true) {
4161 vector<PGRef> pgs;
4162 _get_pgs(&pgs, true);
4163 if (pgs.empty()) {
4164 break;
4165 }
4166 for (auto& pg : pgs) {
4167 if (pg->is_deleted()) {
4168 continue;
4169 }
4170 dout(20) << " kicking pg " << pg << dendl;
4171 pg->lock();
4172 if (pg->get_num_ref() != 1) {
4173 derr << "pgid " << pg->get_pgid() << " has ref count of "
4174 << pg->get_num_ref() << dendl;
7c673cae 4175#ifdef PG_DEBUG_REFS
11fdf7f2 4176 pg->dump_live_ids();
7c673cae 4177#endif
31f18b77
FG
4178 if (cct->_conf->osd_shutdown_pgref_assert) {
4179 ceph_abort();
4180 }
7c673cae 4181 }
11fdf7f2
TL
4182 pg->ch.reset();
4183 pg->unlock();
7c673cae 4184 }
7c673cae
FG
4185 }
4186#ifdef PG_DEBUG_REFS
4187 service.dump_live_pgids();
4188#endif
f64942e4 4189
9f95a23c 4190 osd_lock.unlock();
11fdf7f2 4191 cct->_conf.remove_observer(this);
9f95a23c 4192 osd_lock.lock();
7c673cae 4193
11fdf7f2
TL
4194 service.meta_ch.reset();
4195
7c673cae
FG
4196 dout(10) << "syncing store" << dendl;
4197 enable_disable_fuse(true);
4198
4199 if (cct->_conf->osd_journal_flush_on_shutdown) {
4200 dout(10) << "flushing journal" << dendl;
4201 store->flush_journal();
4202 }
4203
7c673cae 4204 monc->shutdown();
9f95a23c
TL
4205 osd_lock.unlock();
4206 {
4207 std::unique_lock l{map_lock};
4208 set_osdmap(OSDMapRef());
4209 }
11fdf7f2
TL
4210 for (auto s : shards) {
4211 std::lock_guard l(s->osdmap_lock);
4212 s->shard_osdmap = OSDMapRef();
4213 }
7c673cae 4214 service.shutdown();
11fdf7f2
TL
4215
4216 std::lock_guard lock(osd_lock);
4217 store->umount();
4218 delete store;
4219 store = nullptr;
4220 dout(10) << "Store synced" << dendl;
4221
7c673cae
FG
4222 op_tracker.on_shutdown();
4223
9f95a23c 4224 ClassHandler::get_instance().shutdown();
7c673cae
FG
4225 client_messenger->shutdown();
4226 cluster_messenger->shutdown();
4227 hb_front_client_messenger->shutdown();
4228 hb_back_client_messenger->shutdown();
4229 objecter_messenger->shutdown();
4230 hb_front_server_messenger->shutdown();
4231 hb_back_server_messenger->shutdown();
4232
7c673cae
FG
4233 return r;
4234}
4235
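// send a mon command; if the mon reports -ENOENT for this osd id, issue a
// one-time "osd create" for our id/fsid and retry the original command.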
4236int OSD::mon_cmd_maybe_osd_create(string &cmd)
4237{
4238 bool created = false;
4239 while (true) {
4240 dout(10) << __func__ << " cmd: " << cmd << dendl;
4241 vector<string> vcmd{cmd};
4242 bufferlist inbl;
4243 C_SaferCond w;
4244 string outs;
4245 monc->start_mon_command(vcmd, inbl, NULL, &outs, &w);
4246 int r = w.wait();
4247 if (r < 0) {
4248 if (r == -ENOENT && !created) {
4249 string newcmd = "{\"prefix\": \"osd create\", \"id\": " + stringify(whoami)
4250 + ", \"uuid\": \"" + stringify(superblock.osd_fsid) + "\"}";
4251 vector<string> vnewcmd{newcmd};
4252 bufferlist inbl;
4253 C_SaferCond w;
4254 string outs;
4255 monc->start_mon_command(vnewcmd, inbl, NULL, &outs, &w);
4256 int r = w.wait();
4257 if (r < 0) {
4258 derr << __func__ << " fail: osd does not exist and create failed: "
4259 << cpp_strerror(r) << dendl;
4260 return r;
4261 }
4262 created = true;
4263 continue;
4264 }
4265 derr << __func__ << " fail: '" << outs << "': " << cpp_strerror(r) << dendl;
4266 return r;
4267 }
4268 break;
4269 }
4270
4271 return 0;
4272}
4273
4274int OSD::update_crush_location()
4275{
4276 if (!cct->_conf->osd_crush_update_on_start) {
4277 dout(10) << __func__ << " osd_crush_update_on_start = false" << dendl;
4278 return 0;
4279 }
4280
4281 char weight[32];
4282 if (cct->_conf->osd_crush_initial_weight >= 0) {
4283 snprintf(weight, sizeof(weight), "%.4lf", cct->_conf->osd_crush_initial_weight);
4284 } else {
4285 struct store_statfs_t st;
11fdf7f2
TL
4286 osd_alert_list_t alerts;
4287 int r = store->statfs(&st, &alerts);
7c673cae
FG
4288 if (r < 0) {
4289 derr << "statfs: " << cpp_strerror(r) << dendl;
4290 return r;
4291 }
4292 snprintf(weight, sizeof(weight), "%.4lf",
11fdf7f2
TL
4293 std::max(.00001,
4294 double(st.total) /
4295 double(1ull << 40 /* TB */)));
7c673cae
FG
4296 }
4297
9f95a23c 4298 dout(10) << __func__ << " crush location is " << cct->crush_location << dendl;
7c673cae
FG
4299
4300 string cmd =
4301 string("{\"prefix\": \"osd crush create-or-move\", ") +
9f95a23c
TL
4302 string("\"id\": ") + stringify(whoami) + ", " +
4303 string("\"weight\":") + weight + ", " +
4304 string("\"args\": [") + stringify(cct->crush_location) + "]}";
7c673cae
FG
4305 return mon_cmd_maybe_osd_create(cmd);
4306}
4307
4308int OSD::update_crush_device_class()
4309{
224ce89b
WB
4310 if (!cct->_conf->osd_class_update_on_start) {
4311 dout(10) << __func__ << " osd_class_update_on_start = false" << dendl;
4312 return 0;
4313 }
4314
7c673cae
FG
4315 string device_class;
4316 int r = store->read_meta("crush_device_class", &device_class);
224ce89b
WB
4317 if (r < 0 || device_class.empty()) {
4318 device_class = store->get_default_device_class();
4319 }
4320
4321 if (device_class.empty()) {
d2e6a577 4322 dout(20) << __func__ << " no device class stored locally" << dendl;
7c673cae 4323 return 0;
224ce89b 4324 }
7c673cae
FG
4325
4326 string cmd =
4327 string("{\"prefix\": \"osd crush set-device-class\", ") +
224ce89b
WB
4328 string("\"class\": \"") + device_class + string("\", ") +
4329 string("\"ids\": [\"") + stringify(whoami) + string("\"]}");
7c673cae 4330
224ce89b 4331 r = mon_cmd_maybe_osd_create(cmd);
11fdf7f2
TL
4332 if (r == -EBUSY) {
4333 // good, already bound to a device-class
4334 return 0;
4335 } else {
4336 return r;
4337 }
7c673cae
FG
4338}
4339
4340void OSD::write_superblock(ObjectStore::Transaction& t)
4341{
4342 dout(10) << "write_superblock " << superblock << dendl;
4343
4344 //hack: at minimum it's using the baseline feature set
4345 if (!superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_BASE))
4346 superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
4347
4348 bufferlist bl;
11fdf7f2 4349 encode(superblock, bl);
7c673cae
FG
4350 t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
4351}
4352
4353int OSD::read_superblock()
4354{
4355 bufferlist bl;
11fdf7f2 4356 int r = store->read(service.meta_ch, OSD_SUPERBLOCK_GOBJECT, 0, 0, bl);
7c673cae
FG
4357 if (r < 0)
4358 return r;
4359
11fdf7f2
TL
4360 auto p = bl.cbegin();
4361 decode(superblock, p);
7c673cae
FG
4362
4363 dout(10) << "read_superblock " << superblock << dendl;
4364
4365 return 0;
4366}
4367
4368void OSD::clear_temp_objects()
4369{
4370 dout(10) << __func__ << dendl;
4371 vector<coll_t> ls;
4372 store->list_collections(ls);
4373 for (vector<coll_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
4374 spg_t pgid;
4375 if (!p->is_pg(&pgid))
4376 continue;
4377
4378 // list temp objects
4379 dout(20) << " clearing temps in " << *p << " pgid " << pgid << dendl;
4380
4381 vector<ghobject_t> temps;
4382 ghobject_t next;
4383 while (1) {
4384 vector<ghobject_t> objects;
11fdf7f2
TL
4385 auto ch = store->open_collection(*p);
4386 ceph_assert(ch);
4387 store->collection_list(ch, next, ghobject_t::get_max(),
7c673cae
FG
4388 store->get_ideal_list_max(),
4389 &objects, &next);
4390 if (objects.empty())
4391 break;
4392 vector<ghobject_t>::iterator q;
4393 for (q = objects.begin(); q != objects.end(); ++q) {
4394 // Hammer set pool for temps to -1, so check for clean-up
4395 if (q->hobj.is_temp() || (q->hobj.pool == -1)) {
4396 temps.push_back(*q);
4397 } else {
4398 break;
4399 }
4400 }
4401 // If we saw a non-temp object and hit the break above we can
4402 // break out of the while loop too.
4403 if (q != objects.end())
4404 break;
4405 }
4406 if (!temps.empty()) {
4407 ObjectStore::Transaction t;
4408 int removed = 0;
4409 for (vector<ghobject_t>::iterator q = temps.begin(); q != temps.end(); ++q) {
4410 dout(20) << " removing " << *p << " object " << *q << dendl;
4411 t.remove(*p, *q);
4412 if (++removed > cct->_conf->osd_target_transaction_size) {
11fdf7f2 4413 store->queue_transaction(service.meta_ch, std::move(t));
7c673cae
FG
4414 t = ObjectStore::Transaction();
4415 removed = 0;
4416 }
4417 }
4418 if (removed) {
11fdf7f2 4419 store->queue_transaction(service.meta_ch, std::move(t));
7c673cae
FG
4420 }
4421 }
4422 }
4423}
4424
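// remove every object in a (defunct) pg collection, including its snap
// mapper entries, in osd_target_transaction_size batches, then drop the
// collection itself and wait for the removal to commit.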
4425void OSD::recursive_remove_collection(CephContext* cct,
4426 ObjectStore *store, spg_t pgid,
4427 coll_t tmp)
4428{
4429 OSDriver driver(
4430 store,
4431 coll_t(),
4432 make_snapmapper_oid());
4433
11fdf7f2 4434 ObjectStore::CollectionHandle ch = store->open_collection(tmp);
7c673cae
FG
4435 ObjectStore::Transaction t;
4436 SnapMapper mapper(cct, &driver, 0, 0, 0, pgid.shard);
4437
11fdf7f2
TL
4438 ghobject_t next;
4439 int max = cct->_conf->osd_target_transaction_size;
7c673cae 4440 vector<ghobject_t> objects;
11fdf7f2
TL
4441 objects.reserve(max);
4442 while (true) {
4443 objects.clear();
4444 store->collection_list(ch, next, ghobject_t::get_max(),
4445 max, &objects, &next);
4446 generic_dout(10) << __func__ << " " << objects << dendl;
4447 if (objects.empty())
4448 break;
4449 for (auto& p: objects) {
4450 OSDriver::OSTransaction _t(driver.get_transaction(&t));
4451 int r = mapper.remove_oid(p.hobj, &_t);
4452 if (r != 0 && r != -ENOENT)
4453 ceph_abort();
4454 t.remove(tmp, p);
7c673cae 4455 }
11fdf7f2
TL
4456 int r = store->queue_transaction(ch, std::move(t));
4457 ceph_assert(r == 0);
4458 t = ObjectStore::Transaction();
7c673cae
FG
4459 }
4460 t.remove_collection(tmp);
11fdf7f2
TL
4461 int r = store->queue_transaction(ch, std::move(t));
4462 ceph_assert(r == 0);
7c673cae
FG
4463
4464 C_SaferCond waiter;
11fdf7f2 4465 if (!ch->flush_commit(&waiter)) {
7c673cae
FG
4466 waiter.wait();
4467 }
4468}
4469
4470
4471// ======================================================
4472// PG's
4473
7c673cae
FG
4474PG* OSD::_make_pg(
4475 OSDMapRef createmap,
4476 spg_t pgid)
4477{
11fdf7f2
TL
4478 dout(10) << __func__ << " " << pgid << dendl;
4479 pg_pool_t pi;
4480 map<string,string> ec_profile;
4481 string name;
4482 if (createmap->have_pg_pool(pgid.pool())) {
4483 pi = *createmap->get_pg_pool(pgid.pool());
4484 name = createmap->get_pool_name(pgid.pool());
4485 if (pi.is_erasure()) {
4486 ec_profile = createmap->get_erasure_code_profile(pi.erasure_code_profile);
4487 }
4488 } else {
4489 // pool was deleted; grab final pg_pool_t off disk.
4490 ghobject_t oid = make_final_pool_info_oid(pgid.pool());
4491 bufferlist bl;
4492 int r = store->read(service.meta_ch, oid, 0, 0, bl);
4493 if (r < 0) {
4494 derr << __func__ << " missing pool " << pgid.pool() << " tombstone"
4495 << dendl;
4496 return nullptr;
4497 }
4498 ceph_assert(r >= 0);
4499 auto p = bl.cbegin();
4500 decode(pi, p);
4501 decode(name, p);
4502 if (p.end()) { // dev release v13.0.2 did not include ec_profile
4503 derr << __func__ << " missing ec_profile from pool " << pgid.pool()
4504 << " tombstone" << dendl;
4505 return nullptr;
4506 }
4507 decode(ec_profile, p);
4508 }
4509 PGPool pool(cct, createmap, pgid.pool(), pi, name);
7c673cae 4510 PG *pg;
11fdf7f2
TL
4511 if (pi.type == pg_pool_t::TYPE_REPLICATED ||
4512 pi.type == pg_pool_t::TYPE_ERASURE)
4513 pg = new PrimaryLogPG(&service, createmap, pool, ec_profile, pgid);
7c673cae
FG
4514 else
4515 ceph_abort();
7c673cae
FG
4516 return pg;
4517}
4518
11fdf7f2 4519void OSD::_get_pgs(vector<PGRef> *v, bool clear_too)
7c673cae 4520{
11fdf7f2
TL
4521 v->clear();
4522 v->reserve(get_num_pgs());
4523 for (auto& s : shards) {
4524 std::lock_guard l(s->shard_lock);
4525 for (auto& j : s->pg_slots) {
4526 if (j.second->pg &&
4527 !j.second->pg->is_deleted()) {
4528 v->push_back(j.second->pg);
4529 if (clear_too) {
4530 s->_detach_pg(j.second.get());
4531 }
4532 }
7c673cae 4533 }
7c673cae 4534 }
7c673cae
FG
4535}
4536
11fdf7f2 4537void OSD::_get_pgids(vector<spg_t> *v)
7c673cae 4538{
11fdf7f2
TL
4539 v->clear();
4540 v->reserve(get_num_pgs());
4541 for (auto& s : shards) {
4542 std::lock_guard l(s->shard_lock);
4543 for (auto& j : s->pg_slots) {
4544 if (j.second->pg &&
4545 !j.second->pg->is_deleted()) {
4546 v->push_back(j.first);
4547 }
7c673cae
FG
4548 }
4549 }
7c673cae
FG
4550}
4551
11fdf7f2 4552void OSD::register_pg(PGRef pg)
7c673cae 4553{
11fdf7f2
TL
4554 spg_t pgid = pg->get_pgid();
4555 uint32_t shard_index = pgid.hash_to_shard(num_shards);
4556 auto sdata = shards[shard_index];
4557 std::lock_guard l(sdata->shard_lock);
4558 auto r = sdata->pg_slots.emplace(pgid, make_unique<OSDShardPGSlot>());
4559 ceph_assert(r.second);
4560 auto *slot = r.first->second.get();
4561 dout(20) << __func__ << " " << pgid << " " << pg << dendl;
4562 sdata->_attach_pg(slot, pg.get());
4563}
7c673cae 4564
11fdf7f2
TL
4565bool OSD::try_finish_pg_delete(PG *pg, unsigned old_pg_num)
4566{
4567 auto sdata = pg->osd_shard;
4568 ceph_assert(sdata);
4569 {
4570 std::lock_guard l(sdata->shard_lock);
4571 auto p = sdata->pg_slots.find(pg->pg_id);
4572 if (p == sdata->pg_slots.end() ||
4573 !p->second->pg) {
4574 dout(20) << __func__ << " " << pg->pg_id << " not found" << dendl;
4575 return false;
4576 }
4577 if (p->second->waiting_for_merge_epoch) {
4578 dout(20) << __func__ << " " << pg->pg_id << " waiting for merge" << dendl;
4579 return false;
4580 }
4581 dout(20) << __func__ << " " << pg->pg_id << " " << pg << dendl;
4582 sdata->_detach_pg(p->second.get());
4583 }
7c673cae 4584
11fdf7f2
TL
4585 for (auto shard : shards) {
4586 shard->unprime_split_children(pg->pg_id, old_pg_num);
4587 }
7c673cae 4588
11fdf7f2
TL
4589 // update pg count now since we might not get an osdmap any time soon.
4590 if (pg->is_primary())
4591 service.logger->dec(l_osd_pg_primary);
9f95a23c
TL
4592 else if (pg->is_nonprimary())
4593 service.logger->dec(l_osd_pg_replica); // misnomer
11fdf7f2
TL
4594 else
4595 service.logger->dec(l_osd_pg_stray);
7c673cae 4596
11fdf7f2 4597 return true;
7c673cae
FG
4598}
4599
11fdf7f2 4600PGRef OSD::_lookup_pg(spg_t pgid)
7c673cae 4601{
11fdf7f2
TL
4602 uint32_t shard_index = pgid.hash_to_shard(num_shards);
4603 auto sdata = shards[shard_index];
4604 std::lock_guard l(sdata->shard_lock);
4605 auto p = sdata->pg_slots.find(pgid);
4606 if (p == sdata->pg_slots.end()) {
7c673cae 4607 return nullptr;
11fdf7f2
TL
4608 }
4609 return p->second->pg;
7c673cae
FG
4610}
4611
11fdf7f2 4612PGRef OSD::_lookup_lock_pg(spg_t pgid)
31f18b77 4613{
11fdf7f2
TL
4614 PGRef pg = _lookup_pg(pgid);
4615 if (!pg) {
4616 return nullptr;
4617 }
4618 pg->lock();
4619 if (!pg->is_deleted()) {
4620 return pg;
4621 }
4622 pg->unlock();
4623 return nullptr;
31f18b77
FG
4624}
4625
11fdf7f2 4626PGRef OSD::lookup_lock_pg(spg_t pgid)
7c673cae 4627{
11fdf7f2 4628 return _lookup_lock_pg(pgid);
7c673cae
FG
4629}
4630
4631void OSD::load_pgs()
4632{
9f95a23c 4633 ceph_assert(ceph_mutex_is_locked(osd_lock));
7c673cae 4634 dout(0) << "load_pgs" << dendl;
11fdf7f2 4635
7c673cae 4636 {
11fdf7f2
TL
4637 auto pghist = make_pg_num_history_oid();
4638 bufferlist bl;
4639 int r = store->read(service.meta_ch, pghist, 0, 0, bl, 0);
4640 if (r >= 0 && bl.length() > 0) {
4641 auto p = bl.cbegin();
4642 decode(pg_num_history, p);
4643 }
4644 dout(20) << __func__ << " pg_num_history " << pg_num_history << dendl;
7c673cae
FG
4645 }
4646
4647 vector<coll_t> ls;
4648 int r = store->list_collections(ls);
4649 if (r < 0) {
4650 derr << "failed to list pgs: " << cpp_strerror(-r) << dendl;
4651 }
4652
11fdf7f2 4653 int num = 0;
7c673cae
FG
4654 for (vector<coll_t>::iterator it = ls.begin();
4655 it != ls.end();
4656 ++it) {
4657 spg_t pgid;
4658 if (it->is_temp(&pgid) ||
4659 (it->is_pg(&pgid) && PG::_has_removal_flag(store, pgid))) {
11fdf7f2
TL
4660 dout(10) << "load_pgs " << *it
4661 << " removing: legacy or flagged-for-removal pg" << dendl;
7c673cae
FG
4662 recursive_remove_collection(cct, store, pgid, *it);
4663 continue;
4664 }
4665
4666 if (!it->is_pg(&pgid)) {
4667 dout(10) << "load_pgs ignoring unrecognized " << *it << dendl;
4668 continue;
4669 }
4670
7c673cae 4671 dout(10) << "pgid " << pgid << " coll " << coll_t(pgid) << dendl;
7c673cae 4672 epoch_t map_epoch = 0;
11fdf7f2 4673 int r = PG::peek_map_epoch(store, pgid, &map_epoch);
7c673cae
FG
4674 if (r < 0) {
4675 derr << __func__ << " unable to peek at " << pgid << " metadata, skipping"
4676 << dendl;
4677 continue;
4678 }
4679
11fdf7f2 4680 PGRef pg;
7c673cae
FG
4681 if (map_epoch > 0) {
4682 OSDMapRef pgosdmap = service.try_get_map(map_epoch);
4683 if (!pgosdmap) {
9f95a23c 4684 if (!get_osdmap()->have_pg_pool(pgid.pool())) {
7c673cae
FG
4685 derr << __func__ << ": could not find map for epoch " << map_epoch
4686 << " on pg " << pgid << ", but the pool is not present in the "
4687 << "current map, so this is probably a result of bug 10617. "
4688 << "Skipping the pg for now, you can use ceph-objectstore-tool "
4689 << "to clean it up later." << dendl;
4690 continue;
4691 } else {
4692 derr << __func__ << ": have pgid " << pgid << " at epoch "
4693 << map_epoch << ", but missing map. Crashing."
4694 << dendl;
11fdf7f2 4695 ceph_abort_msg("Missing map in load_pgs");
7c673cae
FG
4696 }
4697 }
11fdf7f2 4698 pg = _make_pg(pgosdmap, pgid);
7c673cae 4699 } else {
9f95a23c 4700 pg = _make_pg(get_osdmap(), pgid);
7c673cae 4701 }
11fdf7f2
TL
4702 if (!pg) {
4703 recursive_remove_collection(cct, store, pgid, *it);
4704 continue;
4705 }
4706
4707 // there can be no waiters here, so we don't call _wake_pg_slot
7c673cae 4708
11fdf7f2 4709 pg->lock();
7c673cae
FG
4710 pg->ch = store->open_collection(pg->coll);
4711
4712 // read pg state, log
11fdf7f2 4713 pg->read_state(store);
7c673cae 4714
94b18763
FG
4715 if (pg->dne()) {
4716 dout(10) << "load_pgs " << *it << " deleting dne" << dendl;
4717 pg->ch = nullptr;
94b18763 4718 pg->unlock();
94b18763
FG
4719 recursive_remove_collection(cct, store, pgid, *it);
4720 continue;
4721 }
11fdf7f2
TL
4722 {
4723 uint32_t shard_index = pgid.hash_to_shard(shards.size());
4724 assert(NULL != shards[shard_index]);
4725 store->set_collection_commit_queue(pg->coll, &(shards[shard_index]->context_queue));
4726 }
7c673cae
FG
4727
4728 pg->reg_next_scrub();
4729
11fdf7f2 4730 dout(10) << __func__ << " loaded " << *pg << dendl;
7c673cae 4731 pg->unlock();
7c673cae 4732
11fdf7f2
TL
4733 register_pg(pg);
4734 ++num;
7c673cae 4735 }
11fdf7f2 4736 dout(0) << __func__ << " opened " << num << " pgs" << dendl;
7c673cae
FG
4737}
4738
4739
11fdf7f2
TL
4740PGRef OSD::handle_pg_create_info(const OSDMapRef& osdmap,
4741 const PGCreateInfo *info)
4742{
4743 spg_t pgid = info->pgid;
7c673cae 4744
11fdf7f2
TL
4745 if (maybe_wait_for_max_pg(osdmap, pgid, info->by_mon)) {
4746 dout(10) << __func__ << " hit max pg, dropping" << dendl;
4747 return nullptr;
4748 }
3efd9988 4749
9f95a23c 4750 PeeringCtx rctx = create_context();
7c673cae 4751
11fdf7f2 4752 OSDMapRef startmap = get_map(info->epoch);
7c673cae 4753
11fdf7f2
TL
4754 if (info->by_mon) {
4755 int64_t pool_id = pgid.pgid.pool();
4756 const pg_pool_t *pool = osdmap->get_pg_pool(pool_id);
4757 if (!pool) {
4758 dout(10) << __func__ << " ignoring " << pgid << ", pool dne" << dendl;
4759 return nullptr;
4760 }
9f95a23c 4761 if (osdmap->require_osd_release >= ceph_release_t::nautilus &&
11fdf7f2
TL
4762 !pool->has_flag(pg_pool_t::FLAG_CREATING)) {
4763 // this ensures we do not process old creating messages after the
4764 // pool's initial pgs have been created (and pg are subsequently
4765 // allowed to split or merge).
4766 dout(20) << __func__ << " dropping " << pgid
4767 << " create, pool does not have CREATING flag set" << dendl;
4768 return nullptr;
7c673cae
FG
4769 }
4770 }
7c673cae 4771
11fdf7f2
TL
4772 int up_primary, acting_primary;
4773 vector<int> up, acting;
4774 startmap->pg_to_up_acting_osds(
4775 pgid.pgid, &up, &up_primary, &acting, &acting_primary);
7c673cae 4776
11fdf7f2
TL
4777 const pg_pool_t* pp = startmap->get_pg_pool(pgid.pool());
4778 if (pp->has_flag(pg_pool_t::FLAG_EC_OVERWRITES) &&
4779 store->get_type() != "bluestore") {
4780 clog->warn() << "pg " << pgid
4781 << " is at risk of silent data corruption: "
4782 << "the pool allows ec overwrites but is not stored in "
4783 << "bluestore, so deep scrubbing will not detect bitrot";
7c673cae 4784 }
9f95a23c
TL
4785 create_pg_collection(
4786 rctx.transaction, pgid, pgid.get_split_bits(pp->get_pg_num()));
4787 init_pg_ondisk(rctx.transaction, pgid, pp);
7c673cae 4788
9f95a23c 4789 int role = startmap->calc_pg_role(pg_shard_t(whoami, pgid.shard), acting);
7c673cae 4790
11fdf7f2
TL
4791 PGRef pg = _make_pg(startmap, pgid);
4792 pg->ch = store->create_new_collection(pg->coll);
7c673cae 4793
11fdf7f2
TL
4794 {
4795 uint32_t shard_index = pgid.hash_to_shard(shards.size());
4796 assert(NULL != shards[shard_index]);
4797 store->set_collection_commit_queue(pg->coll, &(shards[shard_index]->context_queue));
7c673cae 4798 }
7c673cae 4799
11fdf7f2 4800 pg->lock(true);
7c673cae 4801
11fdf7f2
TL
4802 // we are holding the shard lock
4803 ceph_assert(!pg->is_deleted());
4804
4805 pg->init(
4806 role,
4807 up,
4808 up_primary,
4809 acting,
4810 acting_primary,
4811 info->history,
4812 info->past_intervals,
4813 false,
4814 rctx.transaction);
7c673cae 4815
92f5a8d4
TL
4816 pg->init_collection_pool_opts();
4817
11fdf7f2 4818 if (pg->is_primary()) {
9f95a23c 4819 std::lock_guard locker{m_perf_queries_lock};
11fdf7f2
TL
4820 pg->set_dynamic_perf_stats_queries(m_perf_queries);
4821 }
7c673cae 4822
9f95a23c
TL
4823 pg->handle_initialize(rctx);
4824 pg->handle_activate_map(rctx);
7c673cae 4825
11fdf7f2 4826 dispatch_context(rctx, pg.get(), osdmap, nullptr);
7c673cae 4827
11fdf7f2
TL
4828 dout(10) << __func__ << " new pg " << *pg << dendl;
4829 return pg;
7c673cae
FG
4830}
4831
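// refuse to instantiate another PG once we are at
// mon_max_pg_per_osd * osd_max_pg_per_osd_hard_ratio; the request is parked
// in pending_creates_* and retried later by resume_creating_pg().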
11fdf7f2
TL
4832bool OSD::maybe_wait_for_max_pg(const OSDMapRef& osdmap,
4833 spg_t pgid,
4834 bool is_mon_create)
3efd9988
FG
4835{
4836 const auto max_pgs_per_osd =
11fdf7f2
TL
4837 (cct->_conf.get_val<uint64_t>("mon_max_pg_per_osd") *
4838 cct->_conf.get_val<double>("osd_max_pg_per_osd_hard_ratio"));
3efd9988 4839
11fdf7f2 4840 if (num_pgs < max_pgs_per_osd) {
3efd9988
FG
4841 return false;
4842 }
11fdf7f2
TL
4843
4844 std::lock_guard l(pending_creates_lock);
3efd9988
FG
4845 if (is_mon_create) {
4846 pending_creates_from_mon++;
4847 } else {
9f95a23c
TL
4848 bool is_primary = osdmap->get_pg_acting_role(pgid, whoami) == 0;
4849 pending_creates_from_osd.emplace(pgid, is_primary);
3efd9988 4850 }
1adf2230 4851 dout(1) << __func__ << " withholding creation of pg " << pgid
11fdf7f2 4852 << ": " << num_pgs << " >= "<< max_pgs_per_osd << dendl;
3efd9988
FG
4853 return true;
4854}
4855
4856// to re-trigger a peering, we have to twiddle the pg mapping a little bit,
4857// see PG::should_restart_peering(). OSDMap::pg_to_up_acting_osds() will turn
4858// to up set if pg_temp is empty. so an empty pg_temp won't work.
4859static vector<int32_t> twiddle(const vector<int>& acting) {
4860 if (acting.size() > 1) {
4861 return {acting[0]};
4862 } else {
4863 vector<int32_t> twiddled(acting.begin(), acting.end());
4864 twiddled.push_back(-1);
4865 return twiddled;
4866 }
4867}
4868
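// called when there may be headroom again below the PG hard cap: re-solicit
// any withheld mon-driven creates and, for deferred osd-driven creates,
// twiddle pg_temp to force the affected PGs to re-peer.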
4869void OSD::resume_creating_pg()
4870{
4871 bool do_sub_pg_creates = false;
b32b8144 4872 bool have_pending_creates = false;
3efd9988
FG
4873 {
4874 const auto max_pgs_per_osd =
11fdf7f2
TL
4875 (cct->_conf.get_val<uint64_t>("mon_max_pg_per_osd") *
4876 cct->_conf.get_val<double>("osd_max_pg_per_osd_hard_ratio"));
4877 if (max_pgs_per_osd <= num_pgs) {
3efd9988
FG
4878 // this could happen if admin decreases this setting before a PG is removed
4879 return;
4880 }
11fdf7f2
TL
4881 unsigned spare_pgs = max_pgs_per_osd - num_pgs;
4882 std::lock_guard l(pending_creates_lock);
3efd9988 4883 if (pending_creates_from_mon > 0) {
11fdf7f2
TL
4884 dout(20) << __func__ << " pending_creates_from_mon "
4885 << pending_creates_from_mon << dendl;
3efd9988
FG
4886 do_sub_pg_creates = true;
4887 if (pending_creates_from_mon >= spare_pgs) {
4888 spare_pgs = pending_creates_from_mon = 0;
4889 } else {
4890 spare_pgs -= pending_creates_from_mon;
4891 pending_creates_from_mon = 0;
4892 }
4893 }
4894 auto pg = pending_creates_from_osd.cbegin();
4895 while (spare_pgs > 0 && pg != pending_creates_from_osd.cend()) {
94b18763 4896 dout(20) << __func__ << " pg " << pg->first << dendl;
3efd9988 4897 vector<int> acting;
9f95a23c
TL
4898 get_osdmap()->pg_to_up_acting_osds(pg->first.pgid, nullptr, nullptr, &acting, nullptr);
4899 service.queue_want_pg_temp(pg->first.pgid, twiddle(acting), true);
3efd9988 4900 pg = pending_creates_from_osd.erase(pg);
94b18763 4901 do_sub_pg_creates = true;
3efd9988
FG
4902 spare_pgs--;
4903 }
b32b8144
FG
4904 have_pending_creates = (pending_creates_from_mon > 0 ||
4905 !pending_creates_from_osd.empty());
3efd9988 4906 }
b32b8144
FG
4907
4908 bool do_renew_subs = false;
3efd9988
FG
4909 if (do_sub_pg_creates) {
4910 if (monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0)) {
4911 dout(4) << __func__ << ": resolicit pg creates from mon since "
4912 << last_pg_create_epoch << dendl;
b32b8144 4913 do_renew_subs = true;
3efd9988
FG
4914 }
4915 }
9f95a23c 4916 version_t start = get_osdmap_epoch() + 1;
b32b8144
FG
4917 if (have_pending_creates) {
4918 // don't miss any new osdmap deleting PGs
4919 if (monc->sub_want("osdmap", start, 0)) {
4920 dout(4) << __func__ << ": resolicit osdmap from mon since "
4921 << start << dendl;
4922 do_renew_subs = true;
4923 }
94b18763 4924 } else if (do_sub_pg_creates) {
b32b8144
FG
4925 // no need to subscribe the osdmap continuously anymore
4926 // once the pgtemp and/or mon_subscribe(pg_creates) is sent
4927 if (monc->sub_want_increment("osdmap", start, CEPH_SUBSCRIBE_ONETIME)) {
11fdf7f2 4928 dout(4) << __func__ << ": re-subscribe osdmap(onetime) since "
b32b8144
FG
4929 << start << dendl;
4930 do_renew_subs = true;
4931 }
4932 }
4933
4934 if (do_renew_subs) {
4935 monc->renew_subs();
4936 }
4937
94b18763 4938 service.send_pg_temp();
3efd9988 4939}
7c673cae
FG
4940
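// reconstruct a new PG's history by walking every osdmap from its creation
// epoch to the current one, updating same_interval/up/primary_since and
// last_epoch_split and accumulating its PastIntervals.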
4941void OSD::build_initial_pg_history(
4942 spg_t pgid,
4943 epoch_t created,
4944 utime_t created_stamp,
4945 pg_history_t *h,
4946 PastIntervals *pi)
4947{
4948 dout(10) << __func__ << " " << pgid << " created " << created << dendl;
9f95a23c 4949 *h = pg_history_t(created, created_stamp);
7c673cae
FG
4950
4951 OSDMapRef lastmap = service.get_map(created);
4952 int up_primary, acting_primary;
4953 vector<int> up, acting;
4954 lastmap->pg_to_up_acting_osds(
4955 pgid.pgid, &up, &up_primary, &acting, &acting_primary);
4956
4957 ostringstream debug;
9f95a23c 4958 for (epoch_t e = created + 1; e <= get_osdmap_epoch(); ++e) {
7c673cae
FG
4959 OSDMapRef osdmap = service.get_map(e);
4960 int new_up_primary, new_acting_primary;
4961 vector<int> new_up, new_acting;
4962 osdmap->pg_to_up_acting_osds(
4963 pgid.pgid, &new_up, &new_up_primary, &new_acting, &new_acting_primary);
4964
4965 // this is a bit imprecise, but sufficient?
4966 struct min_size_predicate_t : public IsPGRecoverablePredicate {
4967 const pg_pool_t *pi;
4968 bool operator()(const set<pg_shard_t> &have) const {
4969 return have.size() >= pi->min_size;
4970 }
11fdf7f2 4971 explicit min_size_predicate_t(const pg_pool_t *i) : pi(i) {}
7c673cae
FG
4972 } min_size_predicate(osdmap->get_pg_pool(pgid.pgid.pool()));
4973
4974 bool new_interval = PastIntervals::check_new_interval(
4975 acting_primary,
4976 new_acting_primary,
4977 acting, new_acting,
4978 up_primary,
4979 new_up_primary,
4980 up, new_up,
4981 h->same_interval_since,
4982 h->last_epoch_clean,
9f95a23c
TL
4983 osdmap.get(),
4984 lastmap.get(),
7c673cae 4985 pgid.pgid,
9f95a23c 4986 min_size_predicate,
7c673cae
FG
4987 pi,
4988 &debug);
4989 if (new_interval) {
4990 h->same_interval_since = e;
181888fb
FG
4991 if (up != new_up) {
4992 h->same_up_since = e;
4993 }
4994 if (acting_primary != new_acting_primary) {
4995 h->same_primary_since = e;
4996 }
4997 if (pgid.pgid.is_split(lastmap->get_pg_num(pgid.pgid.pool()),
4998 osdmap->get_pg_num(pgid.pgid.pool()),
4999 nullptr)) {
5000 h->last_epoch_split = e;
5001 }
5002 up = new_up;
5003 acting = new_acting;
5004 up_primary = new_up_primary;
5005 acting_primary = new_acting_primary;
c07f9fc5 5006 }
7c673cae
FG
5007 lastmap = osdmap;
5008 }
5009 dout(20) << __func__ << " " << debug.str() << dendl;
5010 dout(10) << __func__ << " " << *h << " " << *pi
5011 << " [" << (pi->empty() ? pair<epoch_t,epoch_t>(0,0) :
5012 pi->get_bounds()) << ")"
5013 << dendl;
5014}
5015
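// start (or refresh) heartbeating with osd.p: open back and front heartbeat
// connections, attach Session objects sharing this peer's ping stamps, and
// record the epoch at which the peer was (re)added.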
7c673cae
FG
5016void OSD::_add_heartbeat_peer(int p)
5017{
5018 if (p == whoami)
5019 return;
5020 HeartbeatInfo *hi;
5021
5022 map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(p);
5023 if (i == heartbeat_peers.end()) {
9f95a23c 5024 pair<ConnectionRef,ConnectionRef> cons = service.get_con_osd_hb(p, get_osdmap_epoch());
7c673cae
FG
5025 if (!cons.first)
5026 return;
9f95a23c
TL
5027 assert(cons.second);
5028
7c673cae
FG
5029 hi = &heartbeat_peers[p];
5030 hi->peer = p;
9f95a23c
TL
5031
5032 auto stamps = service.get_hb_stamps(p);
5033
5034 auto sb = ceph::make_ref<Session>(cct, cons.first.get());
5035 sb->peer = p;
5036 sb->stamps = stamps;
eafe8130 5037 hi->hb_interval_start = ceph_clock_now();
7c673cae 5038 hi->con_back = cons.first.get();
9f95a23c
TL
5039 hi->con_back->set_priv(sb);
5040
5041 auto sf = ceph::make_ref<Session>(cct, cons.second.get());
5042 sf->peer = p;
5043 sf->stamps = stamps;
5044 hi->con_front = cons.second.get();
5045 hi->con_front->set_priv(sf);
5046
5047 dout(10) << "_add_heartbeat_peer: new peer osd." << p
5048 << " " << hi->con_back->get_peer_addr()
5049 << " " << hi->con_front->get_peer_addr()
5050 << dendl;
7c673cae
FG
5051 } else {
5052 hi = &i->second;
5053 }
9f95a23c 5054 hi->epoch = get_osdmap_epoch();
7c673cae
FG
5055}
5056
5057void OSD::_remove_heartbeat_peer(int n)
5058{
5059 map<int,HeartbeatInfo>::iterator q = heartbeat_peers.find(n);
11fdf7f2 5060 ceph_assert(q != heartbeat_peers.end());
7c673cae
FG
5061 dout(20) << " removing heartbeat peer osd." << n
5062 << " " << q->second.con_back->get_peer_addr()
5063 << " " << (q->second.con_front ? q->second.con_front->get_peer_addr() : entity_addr_t())
5064 << dendl;
9f95a23c 5065 q->second.clear_mark_down();
7c673cae
FG
5066 heartbeat_peers.erase(q);
5067}
5068
5069void OSD::need_heartbeat_peer_update()
5070{
5071 if (is_stopping())
5072 return;
5073 dout(20) << "need_heartbeat_peer_update" << dendl;
5074 heartbeat_set_peers_need_update();
5075}
5076
5077void OSD::maybe_update_heartbeat_peers()
5078{
9f95a23c 5079 ceph_assert(ceph_mutex_is_locked(osd_lock));
7c673cae 5080
11fdf7f2 5081 if (is_waiting_for_healthy() || is_active()) {
7c673cae
FG
5082 utime_t now = ceph_clock_now();
5083 if (last_heartbeat_resample == utime_t()) {
5084 last_heartbeat_resample = now;
5085 heartbeat_set_peers_need_update();
5086 } else if (!heartbeat_peers_need_update()) {
5087 utime_t dur = now - last_heartbeat_resample;
5088 if (dur > cct->_conf->osd_heartbeat_grace) {
5089 dout(10) << "maybe_update_heartbeat_peers forcing update after " << dur << " seconds" << dendl;
5090 heartbeat_set_peers_need_update();
5091 last_heartbeat_resample = now;
494da23a
TL
5092 // automatically clean up any stale heartbeat peers
5093 // if we are unhealthy, then clean all
5094 reset_heartbeat_peers(is_waiting_for_healthy());
7c673cae
FG
5095 }
5096 }
5097 }
5098
5099 if (!heartbeat_peers_need_update())
5100 return;
5101 heartbeat_clear_peers_need_update();
5102
11fdf7f2 5103 std::lock_guard l(heartbeat_lock);
7c673cae
FG
5104
5105 dout(10) << "maybe_update_heartbeat_peers updating" << dendl;
5106
5107
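  // Peer selection: start from the heartbeat peers our PGs report, add the
  // next/previous up OSDs (so the up set stays connected), top up with random
  // up OSDs spread across failure-domain subtrees, then drop down or stale
  // entries and trim any extras back toward osd_heartbeat_min_peers.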
5108 // build heartbeat from set
5109 if (is_active()) {
11fdf7f2
TL
5110 vector<PGRef> pgs;
5111 _get_pgs(&pgs);
5112 for (auto& pg : pgs) {
5113 pg->with_heartbeat_peers([&](int peer) {
9f95a23c 5114 if (get_osdmap()->is_up(peer)) {
11fdf7f2
TL
5115 _add_heartbeat_peer(peer);
5116 }
5117 });
7c673cae
FG
5118 }
5119 }
5120
5121 // include next and previous up osds to ensure we have a fully-connected set
5122 set<int> want, extras;
9f95a23c 5123 const int next = get_osdmap()->get_next_up_osd_after(whoami);
7c673cae
FG
5124 if (next >= 0)
5125 want.insert(next);
9f95a23c 5126 int prev = get_osdmap()->get_previous_up_osd_before(whoami);
7c673cae
FG
5127 if (prev >= 0 && prev != next)
5128 want.insert(prev);
5129
11fdf7f2
TL
5130 // make sure we have at least **min_down** osds coming from different
 5131 // subtree levels (e.g., hosts) for fast failure detection.
5132 auto min_down = cct->_conf.get_val<uint64_t>("mon_osd_min_down_reporters");
5133 auto subtree = cct->_conf.get_val<string>("mon_osd_reporter_subtree_level");
9f95a23c
TL
5134 auto limit = std::max(min_down, (uint64_t)cct->_conf->osd_heartbeat_min_peers);
5135 get_osdmap()->get_random_up_osds_by_subtree(
5136 whoami, subtree, limit, want, &want);
11fdf7f2 5137
7c673cae
FG
5138 for (set<int>::iterator p = want.begin(); p != want.end(); ++p) {
5139 dout(10) << " adding neighbor peer osd." << *p << dendl;
5140 extras.insert(*p);
5141 _add_heartbeat_peer(*p);
5142 }
5143
5144 // remove down peers; enumerate extras
5145 map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
5146 while (p != heartbeat_peers.end()) {
9f95a23c 5147 if (!get_osdmap()->is_up(p->first)) {
7c673cae
FG
5148 int o = p->first;
5149 ++p;
5150 _remove_heartbeat_peer(o);
5151 continue;
5152 }
9f95a23c 5153 if (p->second.epoch < get_osdmap_epoch()) {
7c673cae
FG
5154 extras.insert(p->first);
5155 }
5156 ++p;
5157 }
5158
5159 // too few?
11fdf7f2 5160 for (int n = next; n >= 0; ) {
7c673cae
FG
5161 if ((int)heartbeat_peers.size() >= cct->_conf->osd_heartbeat_min_peers)
5162 break;
5163 if (!extras.count(n) && !want.count(n) && n != whoami) {
5164 dout(10) << " adding random peer osd." << n << dendl;
5165 extras.insert(n);
5166 _add_heartbeat_peer(n);
5167 }
9f95a23c 5168 n = get_osdmap()->get_next_up_osd_after(n);
11fdf7f2 5169 if (n == next)
7c673cae
FG
5170 break; // came full circle; stop
5171 }
5172
5173 // too many?
5174 for (set<int>::iterator p = extras.begin();
5175 (int)heartbeat_peers.size() > cct->_conf->osd_heartbeat_min_peers && p != extras.end();
5176 ++p) {
5177 if (want.count(*p))
5178 continue;
5179 _remove_heartbeat_peer(*p);
5180 }
5181
5182 dout(10) << "maybe_update_heartbeat_peers " << heartbeat_peers.size() << " peers, extras " << extras << dendl;
9f95a23c
TL
5183
5184 // clean up stale failure pending
5185 for (auto it = failure_pending.begin(); it != failure_pending.end();) {
5186 if (heartbeat_peers.count(it->first) == 0) {
5187 send_still_alive(get_osdmap_epoch(), it->first, it->second.second);
5188 failure_pending.erase(it++);
5189 } else {
5190 it++;
5191 }
5192 }
7c673cae
FG
5193}
5194
494da23a 5195void OSD::reset_heartbeat_peers(bool all)
7c673cae 5196{
9f95a23c 5197 ceph_assert(ceph_mutex_is_locked(osd_lock));
7c673cae 5198 dout(10) << "reset_heartbeat_peers" << dendl;
494da23a
TL
5199 utime_t stale = ceph_clock_now();
5200 stale -= cct->_conf.get_val<int64_t>("osd_heartbeat_stale");
11fdf7f2 5201 std::lock_guard l(heartbeat_lock);
494da23a
TL
5202 for (auto it = heartbeat_peers.begin(); it != heartbeat_peers.end();) {
5203 HeartbeatInfo& hi = it->second;
5204 if (all || hi.is_stale(stale)) {
9f95a23c 5205 hi.clear_mark_down();
494da23a
TL
5206 // stop sending failure_report to mon too
5207 failure_queue.erase(it->first);
5208 heartbeat_peers.erase(it++);
5209 } else {
5210 it++;
7c673cae 5211 }
7c673cae 5212 }
7c673cae
FG
5213}
5214
5215void OSD::handle_osd_ping(MOSDPing *m)
5216{
5217 if (superblock.cluster_fsid != m->fsid) {
5218 dout(20) << "handle_osd_ping from " << m->get_source_inst()
9f95a23c
TL
5219 << " bad fsid " << m->fsid << " != " << superblock.cluster_fsid
5220 << dendl;
7c673cae
FG
5221 m->put();
5222 return;
5223 }
5224
5225 int from = m->get_source().num();
5226
9f95a23c 5227 heartbeat_lock.lock();
7c673cae 5228 if (is_stopping()) {
9f95a23c 5229 heartbeat_lock.unlock();
7c673cae
FG
5230 m->put();
5231 return;
5232 }
5233
9f95a23c
TL
5234 utime_t now = ceph_clock_now();
5235 auto mnow = service.get_mnow();
5236 ConnectionRef con(m->get_connection());
7c673cae 5237 OSDMapRef curmap = service.get_osdmap();
c07f9fc5 5238 if (!curmap) {
9f95a23c 5239 heartbeat_lock.unlock();
c07f9fc5
FG
5240 m->put();
5241 return;
5242 }
7c673cae 5243
9f95a23c
TL
5244 auto sref = con->get_priv();
5245 Session *s = static_cast<Session*>(sref.get());
5246 if (!s) {
5247 heartbeat_lock.unlock();
5248 m->put();
5249 return;
5250 }
5251 if (!s->stamps) {
5252 s->peer = from;
5253 s->stamps = service.get_hb_stamps(from);
5254 }
5255
7c673cae
FG
5256 switch (m->op) {
5257
5258 case MOSDPing::PING:
5259 {
5260 if (cct->_conf->osd_debug_drop_ping_probability > 0) {
5261 auto heartbeat_drop = debug_heartbeat_drops_remaining.find(from);
5262 if (heartbeat_drop != debug_heartbeat_drops_remaining.end()) {
5263 if (heartbeat_drop->second == 0) {
5264 debug_heartbeat_drops_remaining.erase(heartbeat_drop);
5265 } else {
5266 --heartbeat_drop->second;
5267 dout(5) << "Dropping heartbeat from " << from
5268 << ", " << heartbeat_drop->second
5269 << " remaining to drop" << dendl;
5270 break;
5271 }
5272 } else if (cct->_conf->osd_debug_drop_ping_probability >
5273 ((((double)(rand()%100))/100.0))) {
5274 heartbeat_drop =
5275 debug_heartbeat_drops_remaining.insert(std::make_pair(from,
5276 cct->_conf->osd_debug_drop_ping_duration)).first;
5277 dout(5) << "Dropping heartbeat from " << from
5278 << ", " << heartbeat_drop->second
5279 << " remaining to drop" << dendl;
5280 break;
5281 }
5282 }
5283
9f95a23c
TL
5284 ceph::signedspan sender_delta_ub{};
5285 s->stamps->got_ping(
5286 m->up_from,
5287 mnow,
5288 m->mono_send_stamp,
5289 m->delta_ub,
5290 &sender_delta_ub);
5291 dout(20) << __func__ << " new stamps " << *s->stamps << dendl;
5292
7c673cae 5293 if (!cct->get_heartbeat_map()->is_healthy()) {
9f95a23c
TL
5294 dout(10) << "internal heartbeat not healthy, dropping ping request"
5295 << dendl;
7c673cae
FG
5296 break;
5297 }
5298
5299 Message *r = new MOSDPing(monc->get_fsid(),
5300 curmap->get_epoch(),
9f95a23c
TL
5301 MOSDPing::PING_REPLY,
5302 m->ping_stamp,
5303 m->mono_ping_stamp,
5304 mnow,
5305 service.get_up_epoch(),
5306 cct->_conf->osd_heartbeat_min_size,
5307 sender_delta_ub);
5308 con->send_message(r);
7c673cae
FG
5309
5310 if (curmap->is_up(from)) {
7c673cae 5311 if (is_active()) {
9f95a23c
TL
5312 ConnectionRef cluster_con = service.get_con_osd_cluster(
5313 from, curmap->get_epoch());
5314 if (cluster_con) {
5315 service.maybe_share_map(cluster_con.get(), curmap, m->map_epoch);
7c673cae
FG
5316 }
5317 }
5318 } else if (!curmap->exists(from) ||
5319 curmap->get_down_at(from) > m->map_epoch) {
5320 // tell them they have died
5321 Message *r = new MOSDPing(monc->get_fsid(),
5322 curmap->get_epoch(),
5323 MOSDPing::YOU_DIED,
9f95a23c
TL
5324 m->ping_stamp,
5325 m->mono_ping_stamp,
5326 mnow,
5327 service.get_up_epoch(),
31f18b77 5328 cct->_conf->osd_heartbeat_min_size);
9f95a23c 5329 con->send_message(r);
7c673cae
FG
5330 }
5331 }
5332 break;
5333
5334 case MOSDPing::PING_REPLY:
5335 {
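      // Every ping we sent is keyed by its send stamp in ping_history along
      // with a deadline and the number of replies still outstanding (back and
      // front). Match this reply to that entry and decrement the count.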
5336 map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(from);
5337 if (i != heartbeat_peers.end()) {
9f95a23c 5338 auto acked = i->second.ping_history.find(m->ping_stamp);
11fdf7f2 5339 if (acked != i->second.ping_history.end()) {
11fdf7f2 5340 int &unacknowledged = acked->second.second;
9f95a23c 5341 if (con == i->second.con_back) {
11fdf7f2
TL
5342 dout(25) << "handle_osd_ping got reply from osd." << from
5343 << " first_tx " << i->second.first_tx
5344 << " last_tx " << i->second.last_tx
9f95a23c
TL
5345 << " last_rx_back " << i->second.last_rx_back
5346 << " -> " << now
11fdf7f2
TL
5347 << " last_rx_front " << i->second.last_rx_front
5348 << dendl;
5349 i->second.last_rx_back = now;
5350 ceph_assert(unacknowledged > 0);
5351 --unacknowledged;
5352 // if there is no front con, set both stamps.
5353 if (i->second.con_front == NULL) {
5354 i->second.last_rx_front = now;
5355 ceph_assert(unacknowledged > 0);
5356 --unacknowledged;
5357 }
9f95a23c 5358 } else if (con == i->second.con_front) {
11fdf7f2
TL
5359 dout(25) << "handle_osd_ping got reply from osd." << from
5360 << " first_tx " << i->second.first_tx
5361 << " last_tx " << i->second.last_tx
5362 << " last_rx_back " << i->second.last_rx_back
9f95a23c
TL
5363 << " last_rx_front " << i->second.last_rx_front
5364 << " -> " << now
11fdf7f2
TL
5365 << dendl;
5366 i->second.last_rx_front = now;
5367 ceph_assert(unacknowledged > 0);
5368 --unacknowledged;
5369 }
7c673cae 5370
11fdf7f2
TL
5371 if (unacknowledged == 0) {
5372 // succeeded in getting all replies
5373 dout(25) << "handle_osd_ping got all replies from osd." << from
9f95a23c 5374 << " , erase pending ping(sent at " << m->ping_stamp << ")"
11fdf7f2
TL
5375 << " and older pending ping(s)"
5376 << dendl;
eafe8130
TL
5377
5378#define ROUND_S_TO_USEC(sec) (uint32_t)((sec) * 1000 * 1000 + 0.5)
5379 ++i->second.hb_average_count;
9f95a23c 5380 uint32_t back_pingtime = ROUND_S_TO_USEC(i->second.last_rx_back - m->ping_stamp);
eafe8130
TL
5381 i->second.hb_total_back += back_pingtime;
5382 if (back_pingtime < i->second.hb_min_back)
5383 i->second.hb_min_back = back_pingtime;
5384 if (back_pingtime > i->second.hb_max_back)
5385 i->second.hb_max_back = back_pingtime;
9f95a23c 5386 uint32_t front_pingtime = ROUND_S_TO_USEC(i->second.last_rx_front - m->ping_stamp);
eafe8130
TL
5387 i->second.hb_total_front += front_pingtime;
5388 if (front_pingtime < i->second.hb_min_front)
5389 i->second.hb_min_front = front_pingtime;
5390 if (front_pingtime > i->second.hb_max_front)
5391 i->second.hb_max_front = front_pingtime;
5392
5393 ceph_assert(i->second.hb_interval_start != utime_t());
5394 if (i->second.hb_interval_start == utime_t())
5395 i->second.hb_interval_start = now;
5396 int64_t hb_avg_time_period = 60;
5397 if (cct->_conf.get_val<int64_t>("debug_heartbeat_testing_span")) {
5398 hb_avg_time_period = cct->_conf.get_val<int64_t>("debug_heartbeat_testing_span");
5399 }
5400 if (now - i->second.hb_interval_start >= utime_t(hb_avg_time_period, 0)) {
5401 uint32_t back_avg = i->second.hb_total_back / i->second.hb_average_count;
5402 uint32_t back_min = i->second.hb_min_back;
5403 uint32_t back_max = i->second.hb_max_back;
5404 uint32_t front_avg = i->second.hb_total_front / i->second.hb_average_count;
5405 uint32_t front_min = i->second.hb_min_front;
5406 uint32_t front_max = i->second.hb_max_front;
5407
5408 // Reset for new interval
5409 i->second.hb_average_count = 0;
5410 i->second.hb_interval_start = now;
5411 i->second.hb_total_back = i->second.hb_max_back = 0;
5412 i->second.hb_min_back = UINT_MAX;
5413 i->second.hb_total_front = i->second.hb_max_front = 0;
5414 i->second.hb_min_front = UINT_MAX;
5415
 5416 // Record per-osd, per-interface ping times
 5417 // Based on osd_heartbeat_interval, ignoring that the actual interval is randomly shorter than this
5418 if (i->second.hb_back_pingtime.size() == 0) {
5419 ceph_assert(i->second.hb_front_pingtime.size() == 0);
5420 for (unsigned k = 0 ; k < hb_vector_size; ++k) {
5421 i->second.hb_back_pingtime.push_back(back_avg);
5422 i->second.hb_back_min.push_back(back_min);
5423 i->second.hb_back_max.push_back(back_max);
5424 i->second.hb_front_pingtime.push_back(front_avg);
5425 i->second.hb_front_min.push_back(front_min);
5426 i->second.hb_front_max.push_back(front_max);
5427 ++i->second.hb_index;
5428 }
5429 } else {
5430 int index = i->second.hb_index & (hb_vector_size - 1);
5431 i->second.hb_back_pingtime[index] = back_avg;
5432 i->second.hb_back_min[index] = back_min;
5433 i->second.hb_back_max[index] = back_max;
5434 i->second.hb_front_pingtime[index] = front_avg;
5435 i->second.hb_front_min[index] = front_min;
5436 i->second.hb_front_max[index] = front_max;
5437 ++i->second.hb_index;
5438 }
5439
5440 {
5441 std::lock_guard l(service.stat_lock);
5442 service.osd_stat.hb_pingtime[from].last_update = now.sec();
5443 service.osd_stat.hb_pingtime[from].back_last = back_pingtime;
5444
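            // Walk the circular buffer from the newest entry backwards and
            // publish rolling averages (plus min/max) over the last 1, 5 and
            // 15 intervals; with the default 60s interval these behave like
            // 1/5/15 minute figures.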
5445 uint32_t total = 0;
5446 uint32_t min = UINT_MAX;
5447 uint32_t max = 0;
5448 uint32_t count = 0;
5449 uint32_t which = 0;
5450 uint32_t size = (uint32_t)i->second.hb_back_pingtime.size();
5451 for (int32_t k = size - 1 ; k >= 0; --k) {
5452 ++count;
5453 int index = (i->second.hb_index + k) % size;
5454 total += i->second.hb_back_pingtime[index];
5455 if (i->second.hb_back_min[index] < min)
5456 min = i->second.hb_back_min[index];
5457 if (i->second.hb_back_max[index] > max)
5458 max = i->second.hb_back_max[index];
5459 if (count == 1 || count == 5 || count == 15) {
5460 service.osd_stat.hb_pingtime[from].back_pingtime[which] = total / count;
5461 service.osd_stat.hb_pingtime[from].back_min[which] = min;
5462 service.osd_stat.hb_pingtime[from].back_max[which] = max;
5463 which++;
5464 if (count == 15)
5465 break;
5466 }
5467 }
5468
5469 if (i->second.con_front != NULL) {
5470 service.osd_stat.hb_pingtime[from].front_last = front_pingtime;
5471
5472 total = 0;
5473 min = UINT_MAX;
5474 max = 0;
5475 count = 0;
5476 which = 0;
5477 for (int32_t k = size - 1 ; k >= 0; --k) {
5478 ++count;
5479 int index = (i->second.hb_index + k) % size;
5480 total += i->second.hb_front_pingtime[index];
5481 if (i->second.hb_front_min[index] < min)
5482 min = i->second.hb_front_min[index];
5483 if (i->second.hb_front_max[index] > max)
5484 max = i->second.hb_front_max[index];
5485 if (count == 1 || count == 5 || count == 15) {
5486 service.osd_stat.hb_pingtime[from].front_pingtime[which] = total / count;
5487 service.osd_stat.hb_pingtime[from].front_min[which] = min;
5488 service.osd_stat.hb_pingtime[from].front_max[which] = max;
5489 which++;
5490 if (count == 15)
5491 break;
5492 }
5493 }
5494 }
5495 }
5496 } else {
5497 std::lock_guard l(service.stat_lock);
5498 service.osd_stat.hb_pingtime[from].back_last = back_pingtime;
5499 if (i->second.con_front != NULL)
5500 service.osd_stat.hb_pingtime[from].front_last = front_pingtime;
5501 }
11fdf7f2 5502 i->second.ping_history.erase(i->second.ping_history.begin(), ++acked);
7c673cae
FG
5503 }
5504
11fdf7f2
TL
5505 if (i->second.is_healthy(now)) {
5506 // Cancel false reports
5507 auto failure_queue_entry = failure_queue.find(from);
5508 if (failure_queue_entry != failure_queue.end()) {
5509 dout(10) << "handle_osd_ping canceling queued "
5510 << "failure report for osd." << from << dendl;
5511 failure_queue.erase(failure_queue_entry);
5512 }
5513
5514 auto failure_pending_entry = failure_pending.find(from);
5515 if (failure_pending_entry != failure_pending.end()) {
5516 dout(10) << "handle_osd_ping canceling in-flight "
5517 << "failure report for osd." << from << dendl;
5518 send_still_alive(curmap->get_epoch(),
5519 from,
5520 failure_pending_entry->second.second);
5521 failure_pending.erase(failure_pending_entry);
5522 }
7c673cae 5523 }
11fdf7f2
TL
5524 } else {
5525 // old replies, deprecated by newly sent pings.
9f95a23c 5526 dout(10) << "handle_osd_ping no pending ping(sent at " << m->ping_stamp
11fdf7f2
TL
5527 << ") is found, treat as covered by newly sent pings "
5528 << "and ignore"
5529 << dendl;
7c673cae
FG
5530 }
5531 }
5532
5533 if (m->map_epoch &&
5534 curmap->is_up(from)) {
7c673cae 5535 if (is_active()) {
9f95a23c
TL
5536 ConnectionRef cluster_con = service.get_con_osd_cluster(
5537 from, curmap->get_epoch());
5538 if (cluster_con) {
5539 service.maybe_share_map(cluster_con.get(), curmap, m->map_epoch);
7c673cae
FG
5540 }
5541 }
5542 }
9f95a23c
TL
5543
5544 s->stamps->got_ping_reply(
5545 mnow,
5546 m->mono_send_stamp,
5547 m->delta_ub);
5548 dout(20) << __func__ << " new stamps " << *s->stamps << dendl;
7c673cae
FG
5549 }
5550 break;
5551
5552 case MOSDPing::YOU_DIED:
5553 dout(10) << "handle_osd_ping " << m->get_source_inst()
5554 << " says i am down in " << m->map_epoch << dendl;
5555 osdmap_subscribe(curmap->get_epoch()+1, false);
5556 break;
5557 }
5558
9f95a23c 5559 heartbeat_lock.unlock();
7c673cae
FG
5560 m->put();
5561}
5562
5563void OSD::heartbeat_entry()
5564{
9f95a23c 5565 std::unique_lock l(heartbeat_lock);
7c673cae
FG
5566 if (is_stopping())
5567 return;
5568 while (!heartbeat_stop) {
5569 heartbeat();
5570
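    // Sleep for a randomized fraction of osd_heartbeat_interval so peers do
    // not ping in lockstep (randomization can be disabled for testing with
    // debug_disable_randomized_ping).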
eafe8130
TL
5571 double wait;
5572 if (cct->_conf.get_val<bool>("debug_disable_randomized_ping")) {
5573 wait = (float)cct->_conf->osd_heartbeat_interval;
5574 } else {
5575 wait = .5 + ((float)(rand() % 10)/10.0) * (float)cct->_conf->osd_heartbeat_interval;
5576 }
9f95a23c 5577 auto w = ceph::make_timespan(wait);
7c673cae 5578 dout(30) << "heartbeat_entry sleeping for " << wait << dendl;
9f95a23c 5579 heartbeat_cond.wait_for(l, w);
7c673cae
FG
5580 if (is_stopping())
5581 return;
5582 dout(30) << "heartbeat_entry woke up" << dendl;
5583 }
5584}
5585
5586void OSD::heartbeat_check()
5587{
9f95a23c 5588 ceph_assert(ceph_mutex_is_locked(heartbeat_lock));
7c673cae
FG
5589 utime_t now = ceph_clock_now();
5590
11fdf7f2 5591 // check for incoming heartbeats (move me elsewhere?)
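  // Any peer whose oldest outstanding ping has passed its deadline is queued
  // in failure_queue; send_failures() later reports these to the monitor.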
7c673cae
FG
5592 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
5593 p != heartbeat_peers.end();
5594 ++p) {
5595
5596 if (p->second.first_tx == utime_t()) {
5597 dout(25) << "heartbeat_check we haven't sent ping to osd." << p->first
11fdf7f2 5598 << " yet, skipping" << dendl;
7c673cae
FG
5599 continue;
5600 }
5601
5602 dout(25) << "heartbeat_check osd." << p->first
5603 << " first_tx " << p->second.first_tx
5604 << " last_tx " << p->second.last_tx
5605 << " last_rx_back " << p->second.last_rx_back
5606 << " last_rx_front " << p->second.last_rx_front
5607 << dendl;
11fdf7f2
TL
5608 if (p->second.is_unhealthy(now)) {
5609 utime_t oldest_deadline = p->second.ping_history.begin()->second.first;
7c673cae
FG
5610 if (p->second.last_rx_back == utime_t() ||
5611 p->second.last_rx_front == utime_t()) {
11fdf7f2
TL
5612 derr << "heartbeat_check: no reply from "
5613 << p->second.con_front->get_peer_addr().get_sockaddr()
5614 << " osd." << p->first
5615 << " ever on either front or back, first ping sent "
5616 << p->second.first_tx
5617 << " (oldest deadline " << oldest_deadline << ")"
5618 << dendl;
7c673cae 5619 // fail
11fdf7f2 5620 failure_queue[p->first] = p->second.first_tx;
7c673cae 5621 } else {
11fdf7f2
TL
5622 derr << "heartbeat_check: no reply from "
5623 << p->second.con_front->get_peer_addr().get_sockaddr()
7c673cae
FG
5624 << " osd." << p->first << " since back " << p->second.last_rx_back
5625 << " front " << p->second.last_rx_front
11fdf7f2
TL
5626 << " (oldest deadline " << oldest_deadline << ")"
5627 << dendl;
7c673cae 5628 // fail
11fdf7f2 5629 failure_queue[p->first] = std::min(p->second.last_rx_back, p->second.last_rx_front);
7c673cae
FG
5630 }
5631 }
5632 }
5633}
5634
5635void OSD::heartbeat()
5636{
9f95a23c 5637 ceph_assert(ceph_mutex_is_locked_by_me(heartbeat_lock));
7c673cae
FG
5638 dout(30) << "heartbeat" << dendl;
5639
5640 // get CPU load avg
5641 double loadavgs[1];
11fdf7f2
TL
5642 int hb_interval = cct->_conf->osd_heartbeat_interval;
5643 int n_samples = 86400;
5644 if (hb_interval > 1) {
5645 n_samples /= hb_interval;
5646 if (n_samples < 1)
5647 n_samples = 1;
5648 }
5649
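  // daily_loadavg is a running average over roughly one day's worth of
  // heartbeat samples (86400s divided by the heartbeat interval).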
7c673cae
FG
5650 if (getloadavg(loadavgs, 1) == 1) {
5651 logger->set(l_osd_loadavg, 100 * loadavgs[0]);
5652 daily_loadavg = (daily_loadavg * (n_samples - 1) + loadavgs[0]) / n_samples;
5653 dout(30) << "heartbeat: daily_loadavg " << daily_loadavg << dendl;
5654 }
5655
5656 dout(30) << "heartbeat checking stats" << dendl;
5657
11fdf7f2 5658 // refresh peer list and osd stats
7c673cae
FG
5659 vector<int> hb_peers;
5660 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
5661 p != heartbeat_peers.end();
5662 ++p)
5663 hb_peers.push_back(p->first);
7c673cae 5664
11fdf7f2
TL
5665 auto new_stat = service.set_osd_stat(hb_peers, get_num_pgs());
5666 dout(5) << __func__ << " " << new_stat << dendl;
5667 ceph_assert(new_stat.statfs.total);
5668
5669 float pratio;
5670 float ratio = service.compute_adjusted_ratio(new_stat, &pratio);
5671
5672 service.check_full_status(ratio, pratio);
7c673cae
FG
5673
5674 utime_t now = ceph_clock_now();
9f95a23c 5675 auto mnow = service.get_mnow();
11fdf7f2
TL
5676 utime_t deadline = now;
5677 deadline += cct->_conf->osd_heartbeat_grace;
7c673cae
FG
5678
5679 // send heartbeats
5680 for (map<int,HeartbeatInfo>::iterator i = heartbeat_peers.begin();
5681 i != heartbeat_peers.end();
5682 ++i) {
5683 int peer = i->first;
9f95a23c
TL
5684 dout(30) << "heartbeat sending ping to osd." << peer << dendl;
5685
7c673cae
FG
5686 i->second.last_tx = now;
5687 if (i->second.first_tx == utime_t())
5688 i->second.first_tx = now;
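    // Record this ping in ping_history with its deadline and the number of
    // replies we expect (one per heartbeat connection, back and front).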
11fdf7f2
TL
5689 i->second.ping_history[now] = make_pair(deadline,
5690 HeartbeatInfo::HEARTBEAT_MAX_CONN);
eafe8130
TL
5691 if (i->second.hb_interval_start == utime_t())
5692 i->second.hb_interval_start = now;
9f95a23c
TL
5693
5694 Session *s = static_cast<Session*>(i->second.con_back->get_priv().get());
5695 std::optional<ceph::signedspan> delta_ub;
5696 s->stamps->sent_ping(&delta_ub);
5697
5698 i->second.con_back->send_message(
5699 new MOSDPing(monc->get_fsid(),
5700 service.get_osdmap_epoch(),
5701 MOSDPing::PING,
5702 now,
5703 mnow,
5704 mnow,
5705 service.get_up_epoch(),
5706 cct->_conf->osd_heartbeat_min_size,
5707 delta_ub));
7c673cae
FG
5708
5709 if (i->second.con_front)
9f95a23c
TL
5710 i->second.con_front->send_message(
5711 new MOSDPing(monc->get_fsid(),
5712 service.get_osdmap_epoch(),
5713 MOSDPing::PING,
5714 now,
5715 mnow,
5716 mnow,
5717 service.get_up_epoch(),
5718 cct->_conf->osd_heartbeat_min_size,
5719 delta_ub));
7c673cae
FG
5720 }
5721
5722 logger->set(l_osd_hb_to, heartbeat_peers.size());
5723
5724 // hmm.. am i all alone?
5725 dout(30) << "heartbeat lonely?" << dendl;
5726 if (heartbeat_peers.empty()) {
5727 if (now - last_mon_heartbeat > cct->_conf->osd_mon_heartbeat_interval && is_active()) {
5728 last_mon_heartbeat = now;
5729 dout(10) << "i have no heartbeat peers; checking mon for new map" << dendl;
9f95a23c 5730 osdmap_subscribe(get_osdmap_epoch() + 1, false);
7c673cae
FG
5731 }
5732 }
5733
5734 dout(30) << "heartbeat done" << dendl;
5735}
5736
5737bool OSD::heartbeat_reset(Connection *con)
5738{
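  // A heartbeat connection was reset: try to reopen connections to the same
  // peer at its recorded epoch and clear ping_history; if that fails (we
  // raced with an osdmap update), drop the peer entirely.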
11fdf7f2
TL
5739 std::lock_guard l(heartbeat_lock);
5740 auto s = con->get_priv();
9f95a23c 5741 dout(20) << __func__ << " con " << con << " s " << s.get() << dendl;
11fdf7f2 5742 con->set_priv(nullptr);
7c673cae 5743 if (s) {
7c673cae 5744 if (is_stopping()) {
7c673cae
FG
5745 return true;
5746 }
9f95a23c
TL
5747 auto session = static_cast<Session*>(s.get());
5748 auto p = heartbeat_peers.find(session->peer);
7c673cae
FG
5749 if (p != heartbeat_peers.end() &&
5750 (p->second.con_back == con ||
5751 p->second.con_front == con)) {
5752 dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer
5753 << ", reopening" << dendl;
9f95a23c 5754 p->second.clear_mark_down(con);
7c673cae
FG
5755 pair<ConnectionRef,ConnectionRef> newcon = service.get_con_osd_hb(p->second.peer, p->second.epoch);
5756 if (newcon.first) {
5757 p->second.con_back = newcon.first.get();
11fdf7f2 5758 p->second.con_back->set_priv(s);
7c673cae
FG
5759 if (newcon.second) {
5760 p->second.con_front = newcon.second.get();
11fdf7f2 5761 p->second.con_front->set_priv(s);
7c673cae 5762 }
11fdf7f2 5763 p->second.ping_history.clear();
7c673cae
FG
5764 } else {
5765 dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer
5766 << ", raced with osdmap update, closing out peer" << dendl;
5767 heartbeat_peers.erase(p);
5768 }
5769 } else {
5770 dout(10) << "heartbeat_reset closing (old) failed hb con " << con << dendl;
5771 }
7c673cae
FG
5772 }
5773 return true;
5774}
5775
5776
5777
5778// =========================================
5779
5780void OSD::tick()
5781{
9f95a23c 5782 ceph_assert(ceph_mutex_is_locked(osd_lock));
7c673cae
FG
5783 dout(10) << "tick" << dendl;
5784
9f95a23c
TL
5785 utime_t now = ceph_clock_now();
 5786 // throw out any obsolete osd_markdown_log entries
5787 utime_t grace = utime_t(cct->_conf->osd_max_markdown_period, 0);
5788 while (!osd_markdown_log.empty() &&
5789 osd_markdown_log.front() + grace < now)
5790 osd_markdown_log.pop_front();
5791
7c673cae
FG
5792 if (is_active() || is_waiting_for_healthy()) {
5793 maybe_update_heartbeat_peers();
5794 }
5795
5796 if (is_waiting_for_healthy()) {
5797 start_boot();
494da23a
TL
5798 }
5799
5800 if (is_waiting_for_healthy() || is_booting()) {
5801 std::lock_guard l(heartbeat_lock);
494da23a
TL
5802 if (now - last_mon_heartbeat > cct->_conf->osd_mon_heartbeat_interval) {
5803 last_mon_heartbeat = now;
5804 dout(1) << __func__ << " checking mon for new map" << dendl;
9f95a23c 5805 osdmap_subscribe(get_osdmap_epoch() + 1, false);
11fdf7f2 5806 }
7c673cae
FG
5807 }
5808
5809 do_waiters();
5810
9f95a23c
TL
 5811 // scrub purged_snaps periodically; the next run is scheduled from
 // osd_scrub_min_interval plus randomized jitter
5812 {
5813 const utime_t last = superblock.last_purged_snaps_scrub;
5814 utime_t next = last;
5815 next += cct->_conf->osd_scrub_min_interval;
5816 std::mt19937 rng;
5817 // use a seed that is stable for each scrub interval, but varies
5818 // by OSD to avoid any herds.
5819 rng.seed(whoami + superblock.last_purged_snaps_scrub.sec());
 5820 double r = (rng() % 1024) / 1024.0;  // divide by a double so the jitter isn't truncated to 0
5821 next +=
5822 cct->_conf->osd_scrub_min_interval *
5823 cct->_conf->osd_scrub_interval_randomize_ratio * r;
5824 if (next < ceph_clock_now()) {
5825 dout(20) << __func__ << " last_purged_snaps_scrub " << last
5826 << " next " << next << " ... now" << dendl;
5827 scrub_purged_snaps();
5828 } else {
5829 dout(20) << __func__ << " last_purged_snaps_scrub " << last
5830 << " next " << next << dendl;
5831 }
5832 }
5833
91327a77 5834 tick_timer.add_event_after(get_tick_interval(), new C_Tick(this));
7c673cae
FG
5835}
5836
5837void OSD::tick_without_osd_lock()
5838{
9f95a23c 5839 ceph_assert(ceph_mutex_is_locked(tick_timer_lock));
7c673cae
FG
5840 dout(10) << "tick_without_osd_lock" << dendl;
5841
7c673cae
FG
5842 logger->set(l_osd_cached_crc, buffer::get_cached_crc());
5843 logger->set(l_osd_cached_crc_adjusted, buffer::get_cached_crc_adjusted());
5844 logger->set(l_osd_missed_crc, buffer::get_missed_crc());
11fdf7f2
TL
5845
5846 // refresh osd stats
5847 struct store_statfs_t stbuf;
5848 osd_alert_list_t alerts;
5849 int r = store->statfs(&stbuf, &alerts);
5850 ceph_assert(r == 0);
5851 service.set_statfs(stbuf, alerts);
7c673cae
FG
5852
5853 // osd_lock is not being held, which means the OSD state
5854 // might change when doing the monitor report
5855 if (is_active() || is_waiting_for_healthy()) {
9f95a23c
TL
5856 {
5857 std::lock_guard l{heartbeat_lock};
5858 heartbeat_check();
5859 }
5860 map_lock.lock_shared();
11fdf7f2 5861 std::lock_guard l(mon_report_lock);
7c673cae
FG
5862
5863 // mon report?
7c673cae 5864 utime_t now = ceph_clock_now();
11fdf7f2
TL
5865 if (service.need_fullness_update() ||
5866 now - last_mon_report > cct->_conf->osd_mon_report_interval) {
7c673cae 5867 last_mon_report = now;
7c673cae
FG
5868 send_full_update();
5869 send_failures();
7c673cae 5870 }
9f95a23c 5871 map_lock.unlock_shared();
11fdf7f2
TL
5872
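    // If any op shard is waiting for a newer map than we currently have,
    // proactively subscribe for it so those queued ops can make progress.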
5873 epoch_t max_waiting_epoch = 0;
5874 for (auto s : shards) {
5875 max_waiting_epoch = std::max(max_waiting_epoch,
5876 s->get_max_waiting_epoch());
5877 }
5878 if (max_waiting_epoch > get_osdmap()->get_epoch()) {
5879 dout(20) << __func__ << " max_waiting_epoch " << max_waiting_epoch
5880 << ", requesting new map" << dendl;
5881 osdmap_subscribe(superblock.newest_map + 1, false);
5882 }
7c673cae
FG
5883 }
5884
5885 if (is_active()) {
5886 if (!scrub_random_backoff()) {
5887 sched_scrub();
5888 }
5889 service.promote_throttle_recalibrate();
3efd9988 5890 resume_creating_pg();
224ce89b
WB
5891 bool need_send_beacon = false;
5892 const auto now = ceph::coarse_mono_clock::now();
5893 {
 5894 // borrow the lec (min_last_epoch_clean) lock to protect last_sent_beacon from changing
11fdf7f2 5895 std::lock_guard l{min_last_epoch_clean_lock};
224ce89b
WB
5896 const auto elapsed = now - last_sent_beacon;
5897 if (chrono::duration_cast<chrono::seconds>(elapsed).count() >
5898 cct->_conf->osd_beacon_report_interval) {
5899 need_send_beacon = true;
5900 }
5901 }
5902 if (need_send_beacon) {
5903 send_beacon(now);
5904 }
7c673cae
FG
5905 }
5906
11fdf7f2 5907 mgrc.update_daemon_health(get_health_metrics());
7c673cae 5908 service.kick_recovery_queue();
91327a77
AA
5909 tick_timer_without_osd_lock.add_event_after(get_tick_interval(),
5910 new C_Tick_WithoutOSDLock(this));
7c673cae
FG
5911}
5912
7c673cae
FG
5913// Usage:
5914// setomapval <pool-id> [namespace/]<obj-name> <key> <val>
5915// rmomapkey <pool-id> [namespace/]<obj-name> <key>
5916// setomapheader <pool-id> [namespace/]<obj-name> <header>
5917// getomap <pool> [namespace/]<obj-name>
5918// truncobj <pool-id> [namespace/]<obj-name> <newlen>
5919// injectmdataerr [namespace/]<obj-name> [shardid]
5920// injectdataerr [namespace/]<obj-name> [shardid]
5921//
5922// set_recovery_delay [utime]
5923void TestOpsSocketHook::test_ops(OSDService *service, ObjectStore *store,
11fdf7f2
TL
5924 std::string_view command,
5925 const cmdmap_t& cmdmap, ostream &ss)
7c673cae
FG
5926{
 5927 // Test support
 5928 // Support changing the omap on a single osd by using the Admin Socket to
 5929 // directly request the osd make a change.
5930 if (command == "setomapval" || command == "rmomapkey" ||
5931 command == "setomapheader" || command == "getomap" ||
5932 command == "truncobj" || command == "injectmdataerr" ||
5933 command == "injectdataerr"
5934 ) {
5935 pg_t rawpg;
5936 int64_t pool;
5937 OSDMapRef curmap = service->get_osdmap();
5938 int r = -1;
5939
5940 string poolstr;
5941
9f95a23c 5942 cmd_getval(cmdmap, "pool", poolstr);
7c673cae
FG
5943 pool = curmap->lookup_pg_pool_name(poolstr);
5944 //If we can't find it by name then maybe id specified
5945 if (pool < 0 && isdigit(poolstr[0]))
5946 pool = atoll(poolstr.c_str());
5947 if (pool < 0) {
b5b8bbf5 5948 ss << "Invalid pool '" << poolstr << "''";
7c673cae
FG
5949 return;
5950 }
5951
5952 string objname, nspace;
9f95a23c 5953 cmd_getval(cmdmap, "objname", objname);
7c673cae
FG
5954 std::size_t found = objname.find_first_of('/');
5955 if (found != string::npos) {
5956 nspace = objname.substr(0, found);
5957 objname = objname.substr(found+1);
5958 }
5959 object_locator_t oloc(pool, nspace);
5960 r = curmap->object_locator_to_pg(object_t(objname), oloc, rawpg);
5961
5962 if (r < 0) {
5963 ss << "Invalid namespace/objname";
5964 return;
5965 }
5966
5967 int64_t shardid;
9f95a23c 5968 cmd_getval(cmdmap, "shardid", shardid, int64_t(shard_id_t::NO_SHARD));
7c673cae
FG
5969 hobject_t obj(object_t(objname), string(""), CEPH_NOSNAP, rawpg.ps(), pool, nspace);
5970 ghobject_t gobj(obj, ghobject_t::NO_GEN, shard_id_t(uint8_t(shardid)));
5971 spg_t pgid(curmap->raw_pg_to_pg(rawpg), shard_id_t(shardid));
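    // On erasure-coded pools an explicit shardid selects which shard of the
    // object to target; only injectdataerr/injectmdataerr are allowed there.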
5972 if (curmap->pg_is_ec(rawpg)) {
5973 if ((command != "injectdataerr") && (command != "injectmdataerr")) {
5974 ss << "Must not call on ec pool, except injectdataerr or injectmdataerr";
5975 return;
5976 }
5977 }
5978
5979 ObjectStore::Transaction t;
5980
5981 if (command == "setomapval") {
5982 map<string, bufferlist> newattrs;
5983 bufferlist val;
5984 string key, valstr;
9f95a23c
TL
5985 cmd_getval(cmdmap, "key", key);
5986 cmd_getval(cmdmap, "val", valstr);
7c673cae
FG
5987
5988 val.append(valstr);
5989 newattrs[key] = val;
5990 t.omap_setkeys(coll_t(pgid), ghobject_t(obj), newattrs);
11fdf7f2 5991 r = store->queue_transaction(service->meta_ch, std::move(t));
7c673cae
FG
5992 if (r < 0)
5993 ss << "error=" << r;
5994 else
5995 ss << "ok";
5996 } else if (command == "rmomapkey") {
5997 string key;
9f95a23c 5998 cmd_getval(cmdmap, "key", key);
7c673cae 5999
9f95a23c 6000 t.omap_rmkey(coll_t(pgid), ghobject_t(obj), key);
11fdf7f2 6001 r = store->queue_transaction(service->meta_ch, std::move(t));
7c673cae
FG
6002 if (r < 0)
6003 ss << "error=" << r;
6004 else
6005 ss << "ok";
6006 } else if (command == "setomapheader") {
6007 bufferlist newheader;
6008 string headerstr;
6009
9f95a23c 6010 cmd_getval(cmdmap, "header", headerstr);
7c673cae
FG
6011 newheader.append(headerstr);
6012 t.omap_setheader(coll_t(pgid), ghobject_t(obj), newheader);
11fdf7f2 6013 r = store->queue_transaction(service->meta_ch, std::move(t));
7c673cae
FG
6014 if (r < 0)
6015 ss << "error=" << r;
6016 else
6017 ss << "ok";
6018 } else if (command == "getomap") {
6019 //Debug: Output entire omap
6020 bufferlist hdrbl;
6021 map<string, bufferlist> keyvals;
11fdf7f2
TL
6022 auto ch = store->open_collection(coll_t(pgid));
6023 if (!ch) {
6024 ss << "unable to open collection for " << pgid;
6025 r = -ENOENT;
6026 } else {
6027 r = store->omap_get(ch, ghobject_t(obj), &hdrbl, &keyvals);
6028 if (r >= 0) {
7c673cae
FG
6029 ss << "header=" << string(hdrbl.c_str(), hdrbl.length());
6030 for (map<string, bufferlist>::iterator it = keyvals.begin();
11fdf7f2 6031 it != keyvals.end(); ++it)
7c673cae
FG
6032 ss << " key=" << (*it).first << " val="
6033 << string((*it).second.c_str(), (*it).second.length());
11fdf7f2 6034 } else {
7c673cae 6035 ss << "error=" << r;
11fdf7f2 6036 }
7c673cae
FG
6037 }
6038 } else if (command == "truncobj") {
6039 int64_t trunclen;
9f95a23c 6040 cmd_getval(cmdmap, "len", trunclen);
7c673cae 6041 t.truncate(coll_t(pgid), ghobject_t(obj), trunclen);
11fdf7f2 6042 r = store->queue_transaction(service->meta_ch, std::move(t));
7c673cae
FG
6043 if (r < 0)
6044 ss << "error=" << r;
6045 else
6046 ss << "ok";
6047 } else if (command == "injectdataerr") {
6048 store->inject_data_error(gobj);
6049 ss << "ok";
6050 } else if (command == "injectmdataerr") {
6051 store->inject_mdata_error(gobj);
6052 ss << "ok";
6053 }
6054 return;
6055 }
6056 if (command == "set_recovery_delay") {
6057 int64_t delay;
9f95a23c 6058 cmd_getval(cmdmap, "utime", delay, (int64_t)0);
7c673cae
FG
6059 ostringstream oss;
6060 oss << delay;
11fdf7f2 6061 int r = service->cct->_conf.set_val("osd_recovery_delay_start",
7c673cae
FG
6062 oss.str().c_str());
6063 if (r != 0) {
6064 ss << "set_recovery_delay: error setting "
6065 << "osd_recovery_delay_start to '" << delay << "': error "
6066 << r;
6067 return;
6068 }
11fdf7f2 6069 service->cct->_conf.apply_changes(nullptr);
7c673cae
FG
6070 ss << "set_recovery_delay: set osd_recovery_delay_start "
6071 << "to " << service->cct->_conf->osd_recovery_delay_start;
6072 return;
6073 }
7c673cae
FG
6074 if (command == "injectfull") {
6075 int64_t count;
6076 string type;
6077 OSDService::s_names state;
9f95a23c
TL
6078 cmd_getval(cmdmap, "type", type, string("full"));
6079 cmd_getval(cmdmap, "count", count, (int64_t)-1);
7c673cae
FG
6080 if (type == "none" || count == 0) {
6081 type = "none";
6082 count = 0;
6083 }
6084 state = service->get_full_state(type);
6085 if (state == OSDService::s_names::INVALID) {
6086 ss << "Invalid type use (none, nearfull, backfillfull, full, failsafe)";
6087 return;
6088 }
6089 service->set_injectfull(state, count);
6090 return;
6091 }
6092 ss << "Internal error - command=" << command;
6093}
6094
7c673cae
FG
6095// =========================================
6096
6097void OSD::ms_handle_connect(Connection *con)
6098{
6099 dout(10) << __func__ << " con " << con << dendl;
6100 if (con->get_peer_type() == CEPH_ENTITY_TYPE_MON) {
11fdf7f2 6101 std::lock_guard l(osd_lock);
7c673cae
FG
6102 if (is_stopping())
6103 return;
6104 dout(10) << __func__ << " on mon" << dendl;
6105
6106 if (is_preboot()) {
6107 start_boot();
6108 } else if (is_booting()) {
6109 _send_boot(); // resend boot message
6110 } else {
9f95a23c 6111 map_lock.lock_shared();
11fdf7f2 6112 std::lock_guard l2(mon_report_lock);
7c673cae
FG
6113
6114 utime_t now = ceph_clock_now();
6115 last_mon_report = now;
6116
6117 // resend everything, it's a new session
6118 send_full_update();
6119 send_alive();
6120 service.requeue_pg_temp();
11fdf7f2 6121 service.clear_sent_ready_to_merge();
7c673cae 6122 service.send_pg_temp();
11fdf7f2
TL
6123 service.send_ready_to_merge();
6124 service.send_pg_created();
7c673cae
FG
6125 requeue_failures();
6126 send_failures();
7c673cae 6127
9f95a23c 6128 map_lock.unlock_shared();
7c673cae
FG
6129 if (is_active()) {
6130 send_beacon(ceph::coarse_mono_clock::now());
6131 }
6132 }
6133
6134 // full map requests may happen while active or pre-boot
6135 if (requested_full_first) {
6136 rerequest_full_maps();
6137 }
6138 }
6139}
6140
6141void OSD::ms_handle_fast_connect(Connection *con)
6142{
6143 if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON &&
6144 con->get_peer_type() != CEPH_ENTITY_TYPE_MGR) {
9f95a23c
TL
6145 if (auto s = ceph::ref_cast<Session>(con->get_priv()); !s) {
6146 s = ceph::make_ref<Session>(cct, con);
6147 con->set_priv(s);
7c673cae
FG
6148 dout(10) << " new session (outgoing) " << s << " con=" << s->con
6149 << " addr=" << s->con->get_peer_addr() << dendl;
6150 // we don't connect to clients
11fdf7f2 6151 ceph_assert(con->get_peer_type() == CEPH_ENTITY_TYPE_OSD);
7c673cae
FG
6152 s->entity_name.set_type(CEPH_ENTITY_TYPE_OSD);
6153 }
7c673cae
FG
6154 }
6155}
6156
6157void OSD::ms_handle_fast_accept(Connection *con)
6158{
6159 if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON &&
6160 con->get_peer_type() != CEPH_ENTITY_TYPE_MGR) {
9f95a23c
TL
6161 if (auto s = ceph::ref_cast<Session>(con->get_priv()); !s) {
6162 s = ceph::make_ref<Session>(cct, con);
6163 con->set_priv(s);
7c673cae
FG
6164 dout(10) << "new session (incoming)" << s << " con=" << con
6165 << " addr=" << con->get_peer_addr()
6166 << " must have raced with connect" << dendl;
11fdf7f2 6167 ceph_assert(con->get_peer_type() == CEPH_ENTITY_TYPE_OSD);
7c673cae
FG
6168 s->entity_name.set_type(CEPH_ENTITY_TYPE_OSD);
6169 }
7c673cae
FG
6170 }
6171}
6172
6173bool OSD::ms_handle_reset(Connection *con)
6174{
9f95a23c
TL
6175 auto session = ceph::ref_cast<Session>(con->get_priv());
6176 dout(2) << "ms_handle_reset con " << con << " session " << session.get() << dendl;
7c673cae
FG
6177 if (!session)
6178 return false;
6179 session->wstate.reset(con);
11fdf7f2
TL
6180 session->con->set_priv(nullptr);
6181 session->con.reset(); // break con <-> session ref cycle
7c673cae
FG
6182 // note that we break session->con *before* the session_handle_reset
6183 // cleanup below. this avoids a race between us and
6184 // PG::add_backoff, Session::check_backoff, etc.
9f95a23c 6185 session_handle_reset(session);
7c673cae
FG
6186 return true;
6187}
6188
6189bool OSD::ms_handle_refused(Connection *con)
6190{
6191 if (!cct->_conf->osd_fast_fail_on_connection_refused)
6192 return false;
6193
9f95a23c
TL
6194 auto session = ceph::ref_cast<Session>(con->get_priv());
6195 dout(2) << "ms_handle_refused con " << con << " session " << session.get() << dendl;
7c673cae
FG
6196 if (!session)
6197 return false;
6198 int type = con->get_peer_type();
6199 // handle only OSD failures here
6200 if (monc && (type == CEPH_ENTITY_TYPE_OSD)) {
6201 OSDMapRef osdmap = get_osdmap();
6202 if (osdmap) {
6203 int id = osdmap->identify_osd_on_all_channels(con->get_peer_addr());
6204 if (id >= 0 && osdmap->is_up(id)) {
 6205 // We deliberately bypass the mon heartbeat grace logic here, because we know
 6206 // the peer is not going to respawn on its own. +1 so we won't hit any boundary case.
11fdf7f2
TL
6207 monc->send_mon_message(
6208 new MOSDFailure(
6209 monc->get_fsid(),
6210 id,
6211 osdmap->get_addrs(id),
6212 cct->_conf->osd_heartbeat_grace + 1,
6213 osdmap->get_epoch(),
6214 MOSDFailure::FLAG_IMMEDIATE | MOSDFailure::FLAG_FAILED
6215 ));
7c673cae
FG
6216 }
6217 }
6218 }
7c673cae
FG
6219 return true;
6220}
6221
6222struct C_OSD_GetVersion : public Context {
6223 OSD *osd;
6224 uint64_t oldest, newest;
6225 explicit C_OSD_GetVersion(OSD *o) : osd(o), oldest(0), newest(0) {}
6226 void finish(int r) override {
6227 if (r >= 0)
6228 osd->_got_mon_epochs(oldest, newest);
6229 }
6230};
6231
6232void OSD::start_boot()
6233{
6234 if (!_is_healthy()) {
6235 // if we are not healthy, do not mark ourselves up (yet)
6236 dout(1) << "not healthy; waiting to boot" << dendl;
6237 if (!is_waiting_for_healthy())
6238 start_waiting_for_healthy();
6239 // send pings sooner rather than later
6240 heartbeat_kick();
6241 return;
6242 }
6243 dout(1) << __func__ << dendl;
6244 set_state(STATE_PREBOOT);
6245 dout(10) << "start_boot - have maps " << superblock.oldest_map
6246 << ".." << superblock.newest_map << dendl;
6247 C_OSD_GetVersion *c = new C_OSD_GetVersion(this);
6248 monc->get_version("osdmap", &c->newest, &c->oldest, c);
6249}
6250
6251void OSD::_got_mon_epochs(epoch_t oldest, epoch_t newest)
6252{
11fdf7f2 6253 std::lock_guard l(osd_lock);
7c673cae
FG
6254 if (is_preboot()) {
6255 _preboot(oldest, newest);
6256 }
6257}
6258
6259void OSD::_preboot(epoch_t oldest, epoch_t newest)
6260{
11fdf7f2 6261 ceph_assert(is_preboot());
7c673cae
FG
6262 dout(10) << __func__ << " _preboot mon has osdmaps "
6263 << oldest << ".." << newest << dendl;
6264
6265 // ensure our local fullness awareness is accurate
81eedcae
TL
6266 {
6267 std::lock_guard l(heartbeat_lock);
6268 heartbeat();
6269 }
7c673cae 6270
9f95a23c
TL
6271 const auto& monmap = monc->monmap;
6272 const auto osdmap = get_osdmap();
7c673cae 6273 // if our map is within recent history, try to add ourselves to the osdmap.
31f18b77
FG
6274 if (osdmap->get_epoch() == 0) {
6275 derr << "waiting for initial osdmap" << dendl;
c07f9fc5 6276 } else if (osdmap->is_destroyed(whoami)) {
b32b8144
FG
6277 derr << "osdmap says I am destroyed" << dendl;
6278 // provide a small margin so we don't livelock seeing if we
6279 // un-destroyed ourselves.
6280 if (osdmap->get_epoch() > newest - 1) {
6281 exit(0);
6282 }
81eedcae 6283 } else if (osdmap->is_noup(whoami)) {
7c673cae
FG
6284 derr << "osdmap NOUP flag is set, waiting for it to clear" << dendl;
6285 } else if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE)) {
6286 derr << "osdmap SORTBITWISE OSDMap flag is NOT set; please set it"
6287 << dendl;
9f95a23c 6288 } else if (osdmap->require_osd_release < ceph_release_t::luminous) {
11fdf7f2 6289 derr << "osdmap require_osd_release < luminous; please upgrade to luminous"
7c673cae 6290 << dendl;
7c673cae
FG
6291 } else if (service.need_fullness_update()) {
6292 derr << "osdmap fullness state needs update" << dendl;
6293 send_full_update();
9f95a23c
TL
6294 } else if (monmap.min_mon_release >= ceph_release_t::octopus &&
6295 superblock.purged_snaps_last < superblock.current_epoch) {
6296 dout(10) << __func__ << " purged_snaps_last " << superblock.purged_snaps_last
6297 << " < newest_map " << superblock.current_epoch << dendl;
6298 _get_purged_snaps();
7c673cae
FG
6299 } else if (osdmap->get_epoch() >= oldest - 1 &&
6300 osdmap->get_epoch() + cct->_conf->osd_map_message_max > newest) {
11fdf7f2
TL
6301
6302 // wait for pgs to fully catch up in a different thread, since
6303 // this thread might be required for splitting and merging PGs to
6304 // make progress.
6305 boot_finisher.queue(
9f95a23c 6306 new LambdaContext(
11fdf7f2 6307 [this](int r) {
9f95a23c 6308 std::unique_lock l(osd_lock);
11fdf7f2
TL
6309 if (is_preboot()) {
6310 dout(10) << __func__ << " waiting for peering work to drain"
6311 << dendl;
9f95a23c 6312 l.unlock();
11fdf7f2 6313 for (auto shard : shards) {
9f95a23c 6314 shard->wait_min_pg_epoch(get_osdmap_epoch());
11fdf7f2 6315 }
9f95a23c 6316 l.lock();
11fdf7f2
TL
6317 }
6318 if (is_preboot()) {
6319 _send_boot();
6320 }
6321 }));
6322 return;
7c673cae
FG
6323 }
6324
6325 // get all the latest maps
6326 if (osdmap->get_epoch() + 1 >= oldest)
6327 osdmap_subscribe(osdmap->get_epoch() + 1, false);
6328 else
6329 osdmap_subscribe(oldest - 1, true);
6330}
6331
9f95a23c
TL
6332void OSD::_get_purged_snaps()
6333{
 6334 // NOTE: this is a naive, stateless implementation. it may send multiple
6335 // overlapping requests to the mon, which will be somewhat inefficient, but
6336 // it should be reliable.
6337 dout(10) << __func__ << " purged_snaps_last " << superblock.purged_snaps_last
6338 << ", newest_map " << superblock.current_epoch << dendl;
6339 MMonGetPurgedSnaps *m = new MMonGetPurgedSnaps(
6340 superblock.purged_snaps_last + 1,
6341 superblock.current_epoch + 1);
6342 monc->send_mon_message(m);
6343}
6344
6345void OSD::handle_get_purged_snaps_reply(MMonGetPurgedSnapsReply *m)
6346{
6347 dout(10) << __func__ << " " << *m << dendl;
6348 ObjectStore::Transaction t;
6349 if (!is_preboot() ||
6350 m->last < superblock.purged_snaps_last) {
6351 goto out;
6352 }
6353 SnapMapper::record_purged_snaps(cct, store, service.meta_ch,
6354 make_purged_snaps_oid(), &t,
6355 m->purged_snaps);
6356 superblock.purged_snaps_last = m->last;
6357 write_superblock(t);
6358 store->queue_transaction(
6359 service.meta_ch,
6360 std::move(t));
6361 service.publish_superblock(superblock);
6362 if (m->last < superblock.current_epoch) {
6363 _get_purged_snaps();
6364 } else {
6365 start_boot();
6366 }
6367out:
6368 m->put();
6369}
6370
7c673cae
FG
6371void OSD::send_full_update()
6372{
6373 if (!service.need_fullness_update())
6374 return;
6375 unsigned state = 0;
6376 if (service.is_full()) {
6377 state = CEPH_OSD_FULL;
6378 } else if (service.is_backfillfull()) {
6379 state = CEPH_OSD_BACKFILLFULL;
6380 } else if (service.is_nearfull()) {
6381 state = CEPH_OSD_NEARFULL;
6382 }
6383 set<string> s;
6384 OSDMap::calc_state_set(state, s);
6385 dout(10) << __func__ << " want state " << s << dendl;
9f95a23c 6386 monc->send_mon_message(new MOSDFull(get_osdmap_epoch(), state));
7c673cae
FG
6387}
6388
6389void OSD::start_waiting_for_healthy()
6390{
6391 dout(1) << "start_waiting_for_healthy" << dendl;
6392 set_state(STATE_WAITING_FOR_HEALTHY);
6393 last_heartbeat_resample = utime_t();
181888fb
FG
6394
6395 // subscribe to osdmap updates, in case our peers really are known to be dead
9f95a23c 6396 osdmap_subscribe(get_osdmap_epoch() + 1, false);
7c673cae
FG
6397}
6398
6399bool OSD::_is_healthy()
6400{
6401 if (!cct->get_heartbeat_map()->is_healthy()) {
6402 dout(1) << "is_healthy false -- internal heartbeat failed" << dendl;
6403 return false;
6404 }
6405
6406 if (is_waiting_for_healthy()) {
11fdf7f2 6407 utime_t now = ceph_clock_now();
9f95a23c
TL
6408 if (osd_markdown_log.empty()) {
6409 dout(5) << __func__ << " force returning true since last markdown"
6410 << " was " << cct->_conf->osd_max_markdown_period
6411 << "s ago" << dendl;
11fdf7f2
TL
6412 return true;
6413 }
6414 std::lock_guard l(heartbeat_lock);
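    // Require at least osd_heartbeat_min_healthy_ratio of our heartbeat peers
    // to be responding before we consider ourselves healthy enough to boot.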
7c673cae
FG
6415 int num = 0, up = 0;
6416 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
6417 p != heartbeat_peers.end();
6418 ++p) {
11fdf7f2 6419 if (p->second.is_healthy(now))
7c673cae
FG
6420 ++up;
6421 ++num;
6422 }
6423 if ((float)up < (float)num * cct->_conf->osd_heartbeat_min_healthy_ratio) {
6424 dout(1) << "is_healthy false -- only " << up << "/" << num << " up peers (less than "
6425 << int(cct->_conf->osd_heartbeat_min_healthy_ratio * 100.0) << "%)" << dendl;
6426 return false;
6427 }
6428 }
6429
6430 return true;
6431}
6432
6433void OSD::_send_boot()
6434{
6435 dout(10) << "_send_boot" << dendl;
11fdf7f2
TL
6436 Connection *local_connection =
6437 cluster_messenger->get_loopback_connection().get();
6438 entity_addrvec_t client_addrs = client_messenger->get_myaddrs();
6439 entity_addrvec_t cluster_addrs = cluster_messenger->get_myaddrs();
6440 entity_addrvec_t hb_back_addrs = hb_back_server_messenger->get_myaddrs();
6441 entity_addrvec_t hb_front_addrs = hb_front_server_messenger->get_myaddrs();
6442
6443 dout(20) << " initial client_addrs " << client_addrs
6444 << ", cluster_addrs " << cluster_addrs
6445 << ", hb_back_addrs " << hb_back_addrs
6446 << ", hb_front_addrs " << hb_front_addrs
6447 << dendl;
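  // For any messenger still bound to an unknown/wildcard address, fill it in
  // from an address we already know (client or cluster) so the boot message
  // advertises concrete endpoints on all four networks.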
6448 if (cluster_messenger->set_addr_unknowns(client_addrs)) {
6449 dout(10) << " assuming cluster_addrs match client_addrs "
6450 << client_addrs << dendl;
6451 cluster_addrs = cluster_messenger->get_myaddrs();
6452 }
6453 if (auto session = local_connection->get_priv(); !session) {
6454 cluster_messenger->ms_deliver_handle_fast_connect(local_connection);
7c673cae
FG
6455 }
6456
7c673cae 6457 local_connection = hb_back_server_messenger->get_loopback_connection().get();
11fdf7f2
TL
6458 if (hb_back_server_messenger->set_addr_unknowns(cluster_addrs)) {
6459 dout(10) << " assuming hb_back_addrs match cluster_addrs "
6460 << cluster_addrs << dendl;
6461 hb_back_addrs = hb_back_server_messenger->get_myaddrs();
7c673cae 6462 }
11fdf7f2
TL
6463 if (auto session = local_connection->get_priv(); !session) {
6464 hb_back_server_messenger->ms_deliver_handle_fast_connect(local_connection);
7c673cae
FG
6465 }
6466
11fdf7f2
TL
6467 local_connection = hb_front_server_messenger->get_loopback_connection().get();
6468 if (hb_front_server_messenger->set_addr_unknowns(client_addrs)) {
6469 dout(10) << " assuming hb_front_addrs match client_addrs "
6470 << client_addrs << dendl;
6471 hb_front_addrs = hb_front_server_messenger->get_myaddrs();
6472 }
6473 if (auto session = local_connection->get_priv(); !session) {
6474 hb_front_server_messenger->ms_deliver_handle_fast_connect(local_connection);
6475 }
6476
6477 // we now know what our front and back addrs will be, and we are
6478 // about to tell the mon what our metadata (including numa bindings)
6479 // are, so now is a good time!
6480 set_numa_affinity();
6481
6482 MOSDBoot *mboot = new MOSDBoot(
6483 superblock, get_osdmap_epoch(), service.get_boot_epoch(),
6484 hb_back_addrs, hb_front_addrs, cluster_addrs,
6485 CEPH_FEATURES_ALL);
6486 dout(10) << " final client_addrs " << client_addrs
6487 << ", cluster_addrs " << cluster_addrs
6488 << ", hb_back_addrs " << hb_back_addrs
6489 << ", hb_front_addrs " << hb_front_addrs
7c673cae
FG
6490 << dendl;
6491 _collect_metadata(&mboot->metadata);
6492 monc->send_mon_message(mboot);
6493 set_state(STATE_BOOTING);
6494}
6495
6496void OSD::_collect_metadata(map<string,string> *pm)
6497{
6498 // config info
6499 (*pm)["osd_data"] = dev_path;
c07f9fc5
FG
6500 if (store->get_type() == "filestore") {
6501 // not applicable for bluestore
6502 (*pm)["osd_journal"] = journal_path;
6503 }
11fdf7f2
TL
6504 (*pm)["front_addr"] = stringify(client_messenger->get_myaddrs());
6505 (*pm)["back_addr"] = stringify(cluster_messenger->get_myaddrs());
6506 (*pm)["hb_front_addr"] = stringify(hb_front_server_messenger->get_myaddrs());
6507 (*pm)["hb_back_addr"] = stringify(hb_back_server_messenger->get_myaddrs());
7c673cae
FG
6508
6509 // backend
6510 (*pm)["osd_objectstore"] = store->get_type();
31f18b77 6511 (*pm)["rotational"] = store_is_rotational ? "1" : "0";
d2e6a577 6512 (*pm)["journal_rotational"] = journal_is_rotational ? "1" : "0";
c07f9fc5 6513 (*pm)["default_device_class"] = store->get_default_device_class();
7c673cae
FG
6514 store->collect_metadata(pm);
6515
6516 collect_sys_info(pm, cct);
6517
11fdf7f2
TL
6518 (*pm)["front_iface"] = pick_iface(
6519 cct,
6520 client_messenger->get_myaddrs().front().get_sockaddr_storage());
6521 (*pm)["back_iface"] = pick_iface(
6522 cct,
6523 cluster_messenger->get_myaddrs().front().get_sockaddr_storage());
6524
6525 // network numa
6526 {
6527 int node = -1;
6528 set<int> nodes;
6529 set<string> unknown;
6530 for (auto nm : { "front_iface", "back_iface" }) {
6531 if (!(*pm)[nm].size()) {
6532 unknown.insert(nm);
6533 continue;
6534 }
6535 int n = -1;
6536 int r = get_iface_numa_node((*pm)[nm], &n);
6537 if (r < 0) {
6538 unknown.insert((*pm)[nm]);
6539 continue;
6540 }
6541 nodes.insert(n);
6542 if (node < 0) {
6543 node = n;
6544 }
6545 }
6546 if (unknown.size()) {
6547 (*pm)["network_numa_unknown_ifaces"] = stringify(unknown);
6548 }
6549 if (!nodes.empty()) {
6550 (*pm)["network_numa_nodes"] = stringify(nodes);
6551 }
6552 if (node >= 0 && nodes.size() == 1 && unknown.empty()) {
6553 (*pm)["network_numa_node"] = stringify(node);
6554 }
6555 }
6556
6557 if (numa_node >= 0) {
6558 (*pm)["numa_node"] = stringify(numa_node);
6559 (*pm)["numa_node_cpus"] = cpu_set_to_str_list(numa_cpu_set_size,
6560 &numa_cpu_set);
6561 }
6562
6563 set<string> devnames;
6564 store->get_devices(&devnames);
9f95a23c
TL
6565 map<string,string> errs;
6566 get_device_metadata(devnames, pm, &errs);
6567 for (auto& i : errs) {
6568 dout(1) << __func__ << " " << i.first << ": " << i.second << dendl;
11fdf7f2 6569 }
7c673cae
FG
6570 dout(10) << __func__ << " " << *pm << dendl;
6571}
6572
6573void OSD::queue_want_up_thru(epoch_t want)
6574{
9f95a23c
TL
6575 std::shared_lock map_locker{map_lock};
6576 epoch_t cur = get_osdmap()->get_up_thru(whoami);
6577 std::lock_guard report_locker(mon_report_lock);
7c673cae
FG
6578 if (want > up_thru_wanted) {
6579 dout(10) << "queue_want_up_thru now " << want << " (was " << up_thru_wanted << ")"
6580 << ", currently " << cur
6581 << dendl;
6582 up_thru_wanted = want;
6583 send_alive();
6584 } else {
6585 dout(10) << "queue_want_up_thru want " << want << " <= queued " << up_thru_wanted
6586 << ", currently " << cur
6587 << dendl;
6588 }
7c673cae
FG
6589}
6590
6591void OSD::send_alive()
6592{
9f95a23c
TL
6593 ceph_assert(ceph_mutex_is_locked(mon_report_lock));
6594 const auto osdmap = get_osdmap();
7c673cae
FG
6595 if (!osdmap->exists(whoami))
6596 return;
6597 epoch_t up_thru = osdmap->get_up_thru(whoami);
6598 dout(10) << "send_alive up_thru currently " << up_thru << " want " << up_thru_wanted << dendl;
6599 if (up_thru_wanted > up_thru) {
6600 dout(10) << "send_alive want " << up_thru_wanted << dendl;
6601 monc->send_mon_message(new MOSDAlive(osdmap->get_epoch(), up_thru_wanted));
6602 }
6603}
6604
6605void OSD::request_full_map(epoch_t first, epoch_t last)
6606{
6607 dout(10) << __func__ << " " << first << ".." << last
6608 << ", previously requested "
6609 << requested_full_first << ".." << requested_full_last << dendl;
9f95a23c 6610 ceph_assert(ceph_mutex_is_locked(osd_lock));
11fdf7f2
TL
6611 ceph_assert(first > 0 && last > 0);
6612 ceph_assert(first <= last);
6613 ceph_assert(first >= requested_full_first); // we shouldn't ever ask for older maps
7c673cae
FG
6614 if (requested_full_first == 0) {
6615 // first request
6616 requested_full_first = first;
6617 requested_full_last = last;
6618 } else if (last <= requested_full_last) {
6619 // dup
6620 return;
6621 } else {
6622 // additional request
6623 first = requested_full_last + 1;
6624 requested_full_last = last;
6625 }
6626 MMonGetOSDMap *req = new MMonGetOSDMap;
6627 req->request_full(first, last);
6628 monc->send_mon_message(req);
6629}
6630
6631void OSD::got_full_map(epoch_t e)
6632{
11fdf7f2 6633 ceph_assert(requested_full_first <= requested_full_last);
9f95a23c 6634 ceph_assert(ceph_mutex_is_locked(osd_lock));
7c673cae
FG
6635 if (requested_full_first == 0) {
6636 dout(20) << __func__ << " " << e << ", nothing requested" << dendl;
6637 return;
6638 }
6639 if (e < requested_full_first) {
6640 dout(10) << __func__ << " " << e << ", requested " << requested_full_first
6641 << ".." << requested_full_last
6642 << ", ignoring" << dendl;
6643 return;
6644 }
6645 if (e >= requested_full_last) {
6646 dout(10) << __func__ << " " << e << ", requested " << requested_full_first
6647 << ".." << requested_full_last << ", resetting" << dendl;
6648 requested_full_first = requested_full_last = 0;
6649 return;
6650 }
6651
6652 requested_full_first = e + 1;
6653
6654 dout(10) << __func__ << " " << e << ", requested " << requested_full_first
6655 << ".." << requested_full_last
6656 << ", still need more" << dendl;
6657}
6658
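// Move failure reports that were already sent to the mon (and are still
// pending) back onto the failure queue so that send_failures() will report
// them again, typically after the mon session is re-established.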
6659void OSD::requeue_failures()
6660{
11fdf7f2 6661 std::lock_guard l(heartbeat_lock);
7c673cae
FG
6662 unsigned old_queue = failure_queue.size();
6663 unsigned old_pending = failure_pending.size();
11fdf7f2 6664 for (auto p = failure_pending.begin(); p != failure_pending.end(); ) {
7c673cae
FG
6665 failure_queue[p->first] = p->second.first;
6666 failure_pending.erase(p++);
6667 }
6668 dout(10) << __func__ << " " << old_queue << " + " << old_pending << " -> "
6669 << failure_queue.size() << dendl;
6670}
6671
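// Drain the failure queue: report each failed osd to the mon, including how
// long it has been unresponsive, and park the entry in failure_pending so it
// can later be cancelled via send_still_alive() or re-queued.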
6672void OSD::send_failures()
6673{
9f95a23c
TL
6674 ceph_assert(ceph_mutex_is_locked(map_lock));
6675 ceph_assert(ceph_mutex_is_locked(mon_report_lock));
11fdf7f2 6676 std::lock_guard l(heartbeat_lock);
7c673cae 6677 utime_t now = ceph_clock_now();
9f95a23c 6678 const auto osdmap = get_osdmap();
7c673cae
FG
6679 while (!failure_queue.empty()) {
6680 int osd = failure_queue.begin()->first;
7c673cae
FG
6681 if (!failure_pending.count(osd)) {
6682 int failed_for = (int)(double)(now - failure_queue.begin()->second);
11fdf7f2
TL
6683 monc->send_mon_message(
6684 new MOSDFailure(
6685 monc->get_fsid(),
6686 osd,
6687 osdmap->get_addrs(osd),
6688 failed_for,
6689 osdmap->get_epoch()));
6690 failure_pending[osd] = make_pair(failure_queue.begin()->second,
6691 osdmap->get_addrs(osd));
7c673cae
FG
6692 }
6693 failure_queue.erase(osd);
6694 }
6695}
6696
11fdf7f2 6697void OSD::send_still_alive(epoch_t epoch, int osd, const entity_addrvec_t &addrs)
7c673cae 6698{
11fdf7f2
TL
6699 MOSDFailure *m = new MOSDFailure(monc->get_fsid(), osd, addrs, 0, epoch,
6700 MOSDFailure::FLAG_ALIVE);
7c673cae
FG
6701 monc->send_mon_message(m);
6702}
6703
11fdf7f2 6704void OSD::cancel_pending_failures()
7c673cae 6705{
11fdf7f2
TL
6706 std::lock_guard l(heartbeat_lock);
6707 auto it = failure_pending.begin();
6708 while (it != failure_pending.end()) {
6709 dout(10) << __func__ << " canceling in-flight failure report for osd."
6710 << it->first << dendl;
9f95a23c 6711 send_still_alive(get_osdmap_epoch(), it->first, it->second.second);
11fdf7f2 6712 failure_pending.erase(it++);
7c673cae 6713 }
7c673cae
FG
6714}
6715
6716void OSD::send_beacon(const ceph::coarse_mono_clock::time_point& now)
6717{
6718 const auto& monmap = monc->monmap;
6719 // this may be called right after we connect, before the monmap has been
6720 // initialized; only send the beacon once we have a monmap that requires luminous.

6721 if (monmap.epoch > 0 &&
6722 monmap.get_required_features().contains_all(
6723 ceph::features::mon::FEATURE_LUMINOUS)) {
6724 dout(20) << __func__ << " sending" << dendl;
7c673cae
FG
6725 MOSDBeacon* beacon = nullptr;
6726 {
11fdf7f2 6727 std::lock_guard l{min_last_epoch_clean_lock};
9f95a23c
TL
6728 beacon = new MOSDBeacon(get_osdmap_epoch(),
6729 min_last_epoch_clean,
6730 superblock.last_purged_snaps_scrub);
494da23a 6731 beacon->pgs = min_last_epoch_clean_pgs;
224ce89b 6732 last_sent_beacon = now;
7c673cae
FG
6733 }
6734 monc->send_mon_message(beacon);
6735 } else {
6736 dout(20) << __func__ << " not sending" << dendl;
6737 }
6738}
6739
7c673cae
FG
6740void OSD::handle_command(MCommand *m)
6741{
6742 ConnectionRef con = m->get_connection();
9f95a23c 6743 auto session = ceph::ref_cast<Session>(con->get_priv());
7c673cae 6744 if (!session) {
9f95a23c 6745 con->send_message(new MCommandReply(m, -EACCES));
7c673cae
FG
6746 m->put();
6747 return;
6748 }
9f95a23c
TL
6749 if (!session->caps.allow_all()) {
6750 con->send_message(new MCommandReply(m, -EACCES));
7c673cae
FG
6751 m->put();
6752 return;
6753 }
9f95a23c 6754 cct->get_admin_socket()->queue_tell_command(m);
7c673cae
FG
6755 m->put();
6756}
6757
f64942e4
AA
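// RAII helper that is the inverse of a lock_guard: it unlocks the given mutex
// on construction and re-locks it on destruction, so a caller can drop a held
// lock (e.g. osd_lock) across a blocking wait within a scope.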
6758namespace {
6759 class unlock_guard {
9f95a23c 6760 ceph::mutex& m;
f64942e4 6761 public:
9f95a23c 6762 explicit unlock_guard(ceph::mutex& mutex)
f64942e4
AA
6763 : m(mutex)
6764 {
11fdf7f2 6765 m.unlock();
f64942e4
AA
6766 }
6767 unlock_guard(unlock_guard&) = delete;
6768 ~unlock_guard() {
11fdf7f2 6769 m.lock();
f64942e4
AA
6770 }
6771 };
6772}
6773
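// Scan the purged_snaps and snap-mapper metadata for snaps that were purged
// but still have object mappings ("strays"), queue a snap re-trim on each
// owning PG, then record the scrub time in the superblock.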
9f95a23c 6774void OSD::scrub_purged_snaps()
7c673cae 6775{
9f95a23c
TL
6776 dout(10) << __func__ << dendl;
6777 ceph_assert(ceph_mutex_is_locked(osd_lock));
6778 SnapMapper::Scrubber s(cct, store, service.meta_ch,
6779 make_snapmapper_oid(),
6780 make_purged_snaps_oid());
6781 clog->debug() << "purged_snaps scrub starts";
6782 osd_lock.unlock();
6783 s.run();
6784 if (s.stray.size()) {
6785 clog->debug() << "purged_snaps scrub found " << s.stray.size() << " strays";
6786 } else {
6787 clog->debug() << "purged_snaps scrub ok";
224ce89b 6788 }
9f95a23c
TL
6789 set<pair<spg_t,snapid_t>> queued;
6790 for (auto& [pool, snap, hash, shard] : s.stray) {
6791 const pg_pool_t *pi = get_osdmap()->get_pg_pool(pool);
6792 if (!pi) {
6793 dout(20) << __func__ << " pool " << pool << " dne" << dendl;
6794 continue;
11fdf7f2 6795 }
9f95a23c
TL
6796 pg_t pgid(pi->raw_hash_to_pg(hash), pool);
6797 spg_t spgid(pgid, shard);
6798 pair<spg_t,snapid_t> p(spgid, snap);
6799 if (queued.count(p)) {
6800 dout(20) << __func__ << " pg " << spgid << " snap " << snap
6801 << " already queued" << dendl;
6802 continue;
11fdf7f2 6803 }
9f95a23c
TL
6804 PGRef pg = lookup_lock_pg(spgid);
6805 if (!pg) {
6806 dout(20) << __func__ << " pg " << spgid << " not found" << dendl;
6807 continue;
11fdf7f2 6808 }
9f95a23c
TL
6809 queued.insert(p);
6810 dout(10) << __func__ << " requeue pg " << spgid << " " << pg << " snap "
6811 << snap << dendl;
6812 pg->queue_snap_retrim(snap);
6813 pg->unlock();
7c673cae 6814 }
9f95a23c
TL
6815 osd_lock.lock();
6816 if (is_stopping()) {
6817 return;
6818 }
6819 dout(10) << __func__ << " done queueing pgs, updating superblock" << dendl;
6820 ObjectStore::Transaction t;
6821 superblock.last_purged_snaps_scrub = ceph_clock_now();
6822 write_superblock(t);
6823 int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
6824 ceph_assert(tr == 0);
6825 if (is_active()) {
6826 send_beacon(ceph::coarse_mono_clock::now());
6827 }
6828 dout(10) << __func__ << " done" << dendl;
11fdf7f2
TL
6829}
6830
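// Collect SMART/health metrics for every physical device backing this OSD
// (optionally restricted to a single device id) and emit them to the given
// stream as pretty-printed JSON keyed by device id.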
6831void OSD::probe_smart(const string& only_devid, ostream& ss)
6832{
6833 set<string> devnames;
6834 store->get_devices(&devnames);
6835 uint64_t smart_timeout = cct->_conf.get_val<uint64_t>(
6836 "osd_smart_report_timeout");
6837
6838 // == typedef std::map<std::string, mValue> mObject;
6839 json_spirit::mObject json_map;
6840
6841 for (auto dev : devnames) {
6842 // smartctl works only on physical devices; filter out any logical device
6843 if (dev.find("dm-") == 0) {
6844 continue;
6845 }
6846
6847 string err;
6848 string devid = get_device_id(dev, &err);
6849 if (devid.size() == 0) {
6850 dout(10) << __func__ << " no unique id for dev " << dev << " ("
6851 << err << "), skipping" << dendl;
6852 continue;
6853 }
6854 if (only_devid.size() && devid != only_devid) {
6855 continue;
6856 }
6857
6858 json_spirit::mValue smart_json;
6859 if (block_device_get_metrics(dev, smart_timeout,
6860 &smart_json)) {
6861 dout(10) << "block_device_get_metrics failed for /dev/" << dev << dendl;
6862 continue;
6863 }
6864 json_map[devid] = smart_json;
7c673cae 6865 }
11fdf7f2 6866 json_spirit::write(json_map, ss, json_spirit::pretty_print);
7c673cae
FG
6867}
6868
6869bool OSD::heartbeat_dispatch(Message *m)
6870{
6871 dout(30) << "heartbeat_dispatch " << m << dendl;
6872 switch (m->get_type()) {
6873
6874 case CEPH_MSG_PING:
6875 dout(10) << "ping from " << m->get_source_inst() << dendl;
6876 m->put();
6877 break;
6878
6879 case MSG_OSD_PING:
6880 handle_osd_ping(static_cast<MOSDPing*>(m));
6881 break;
6882
6883 default:
6884 dout(0) << "dropping unexpected message " << *m << " from " << m->get_source_inst() << dendl;
6885 m->put();
6886 }
6887
6888 return true;
6889}
6890
6891bool OSD::ms_dispatch(Message *m)
6892{
6893 dout(20) << "OSD::ms_dispatch: " << *m << dendl;
6894 if (m->get_type() == MSG_OSD_MARK_ME_DOWN) {
6895 service.got_stop_ack();
6896 m->put();
6897 return true;
6898 }
6899
6900 // lock!
6901
9f95a23c 6902 osd_lock.lock();
7c673cae 6903 if (is_stopping()) {
9f95a23c 6904 osd_lock.unlock();
7c673cae
FG
6905 m->put();
6906 return true;
6907 }
6908
6909 do_waiters();
6910 _dispatch(m);
6911
9f95a23c 6912 osd_lock.unlock();
7c673cae
FG
6913
6914 return true;
6915}
6916
9f95a23c
TL
6917void OSDService::maybe_share_map(
6918 Connection *con,
6919 const OSDMapRef& osdmap,
6920 epoch_t peer_epoch_lb)
7c673cae 6921{
9f95a23c
TL
6922 // NOTE: we assume the caller holds something that keeps the Connection itself
6923 // pinned (e.g., an OpRequest's MessageRef).
6924 auto session = ceph::ref_cast<Session>(con->get_priv());
6925 if (!session) {
7c673cae
FG
6926 return;
6927 }
7c673cae 6928
9f95a23c
TL
6929 // assume the peer has the newer of the op's sent_epoch and what
6930 // we think we sent them.
7c673cae 6931 session->sent_epoch_lock.lock();
9f95a23c
TL
6932 if (peer_epoch_lb > session->last_sent_epoch) {
6933 dout(10) << __func__ << " con " << con
6934 << " " << con->get_peer_addr()
6935 << " map epoch " << session->last_sent_epoch
6936 << " -> " << peer_epoch_lb << " (as per caller)" << dendl;
6937 session->last_sent_epoch = peer_epoch_lb;
6938 }
6939 epoch_t last_sent_epoch = session->last_sent_epoch;
7c673cae
FG
6940 session->sent_epoch_lock.unlock();
6941
9f95a23c
TL
6942 if (osdmap->get_epoch() <= last_sent_epoch) {
6943 return;
6944 }
11fdf7f2 6945
9f95a23c
TL
6946 send_incremental_map(last_sent_epoch, con, osdmap);
6947 last_sent_epoch = osdmap->get_epoch();
7c673cae
FG
6948
6949 session->sent_epoch_lock.lock();
6950 if (session->last_sent_epoch < last_sent_epoch) {
9f95a23c
TL
6951 dout(10) << __func__ << " con " << con
6952 << " " << con->get_peer_addr()
6953 << " map epoch " << session->last_sent_epoch
6954 << " -> " << last_sent_epoch << " (shared)" << dendl;
7c673cae
FG
6955 session->last_sent_epoch = last_sent_epoch;
6956 }
6957 session->sent_epoch_lock.unlock();
7c673cae
FG
6958}
6959
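// Requeue any ops this session parked while waiting for a newer osdmap: each
// op whose min_epoch is now satisfied is mapped to its spg_t and enqueued;
// the remainder stay on the session's waiting_on_map list.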
9f95a23c 6960void OSD::dispatch_session_waiting(const ceph::ref_t<Session>& session, OSDMapRef osdmap)
7c673cae 6961{
9f95a23c 6962 ceph_assert(ceph_mutex_is_locked(session->session_dispatch_lock));
7c673cae
FG
6963
6964 auto i = session->waiting_on_map.begin();
6965 while (i != session->waiting_on_map.end()) {
6966 OpRequestRef op = &(*i);
11fdf7f2 6967 ceph_assert(ms_can_fast_dispatch(op->get_req()));
9f95a23c 6968 auto m = op->get_req<MOSDFastDispatchOp>();
7c673cae
FG
6969 if (m->get_min_epoch() > osdmap->get_epoch()) {
6970 break;
6971 }
6972 session->waiting_on_map.erase(i++);
6973 op->put();
6974
6975 spg_t pgid;
6976 if (m->get_type() == CEPH_MSG_OSD_OP) {
6977 pg_t actual_pgid = osdmap->raw_pg_to_pg(
6978 static_cast<const MOSDOp*>(m)->get_pg());
6979 if (!osdmap->get_primary_shard(actual_pgid, &pgid)) {
6980 continue;
6981 }
6982 } else {
6983 pgid = m->get_spg();
6984 }
11fdf7f2 6985 enqueue_op(pgid, std::move(op), m->get_map_epoch());
7c673cae
FG
6986 }
6987
6988 if (session->waiting_on_map.empty()) {
6989 clear_session_waiting_on_map(session);
6990 } else {
6991 register_session_waiting_on_map(session);
6992 }
6993}
6994
6995void OSD::ms_fast_dispatch(Message *m)
6996{
11fdf7f2 6997 FUNCTRACE(cct);
7c673cae
FG
6998 if (service.is_stopping()) {
6999 m->put();
7000 return;
7001 }
11fdf7f2
TL
7002
7003 // peering event?
7004 switch (m->get_type()) {
7005 case CEPH_MSG_PING:
7006 dout(10) << "ping from " << m->get_source() << dendl;
7007 m->put();
7008 return;
11fdf7f2
TL
7009 case MSG_OSD_FORCE_RECOVERY:
7010 handle_fast_force_recovery(static_cast<MOSDForceRecovery*>(m));
7011 return;
7012 case MSG_OSD_SCRUB2:
7013 handle_fast_scrub(static_cast<MOSDScrub2*>(m));
7014 return;
7015
7016 case MSG_OSD_PG_CREATE2:
7017 return handle_fast_pg_create(static_cast<MOSDPGCreate2*>(m));
7018 case MSG_OSD_PG_QUERY:
7019 return handle_fast_pg_query(static_cast<MOSDPGQuery*>(m));
7020 case MSG_OSD_PG_NOTIFY:
7021 return handle_fast_pg_notify(static_cast<MOSDPGNotify*>(m));
7022 case MSG_OSD_PG_INFO:
7023 return handle_fast_pg_info(static_cast<MOSDPGInfo*>(m));
7024 case MSG_OSD_PG_REMOVE:
7025 return handle_fast_pg_remove(static_cast<MOSDPGRemove*>(m));
7026
7027 // these are single-pg messages that handle themselves
7028 case MSG_OSD_PG_LOG:
7029 case MSG_OSD_PG_TRIM:
9f95a23c
TL
7030 case MSG_OSD_PG_NOTIFY2:
7031 case MSG_OSD_PG_QUERY2:
7032 case MSG_OSD_PG_INFO2:
11fdf7f2
TL
7033 case MSG_OSD_BACKFILL_RESERVE:
7034 case MSG_OSD_RECOVERY_RESERVE:
9f95a23c
TL
7035 case MSG_OSD_PG_LEASE:
7036 case MSG_OSD_PG_LEASE_ACK:
11fdf7f2
TL
7037 {
7038 MOSDPeeringOp *pm = static_cast<MOSDPeeringOp*>(m);
7039 if (require_osd_peer(pm)) {
7040 enqueue_peering_evt(
7041 pm->get_spg(),
7042 PGPeeringEventRef(pm->get_event()));
7043 }
7044 pm->put();
7045 return;
7046 }
7047 }
7048
7c673cae
FG
7049 OpRequestRef op = op_tracker.create_request<OpRequest, Message*>(m);
7050 {
7051#ifdef WITH_LTTNG
7052 osd_reqid_t reqid = op->get_reqid();
7053#endif
7054 tracepoint(osd, ms_fast_dispatch, reqid.name._type,
7055 reqid.name._num, reqid.tid, reqid.inc);
7056 }
7057
7058 if (m->trace)
7059 op->osd_trace.init("osd op", &trace_endpoint, &m->trace);
7060
11fdf7f2 7061 // note sender epoch, min req's epoch
7c673cae
FG
7062 op->sent_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch();
7063 op->min_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_min_epoch();
11fdf7f2 7064 ceph_assert(op->min_epoch <= op->sent_epoch); // sanity check!
7c673cae
FG
7065
7066 service.maybe_inject_dispatch_delay();
7067
7068 if (m->get_connection()->has_features(CEPH_FEATUREMASK_RESEND_ON_SPLIT) ||
7069 m->get_type() != CEPH_MSG_OSD_OP) {
7070 // queue it directly
7071 enqueue_op(
7072 static_cast<MOSDFastDispatchOp*>(m)->get_spg(),
11fdf7f2 7073 std::move(op),
7c673cae
FG
7074 static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch());
7075 } else {
7076 // legacy client, and this is an MOSDOp (the *only* fast dispatch
7077 // message that didn't have an explicit spg_t); we need to map
7078 // it to an spg_t while preserving delivery order.
11fdf7f2
TL
7079 auto priv = m->get_connection()->get_priv();
7080 if (auto session = static_cast<Session*>(priv.get()); session) {
7081 std::lock_guard l{session->session_dispatch_lock};
7082 op->get();
7083 session->waiting_on_map.push_back(*op);
7084 OSDMapRef nextmap = service.get_nextmap_reserved();
7085 dispatch_session_waiting(session, nextmap);
7086 service.release_map(nextmap);
7c673cae
FG
7087 }
7088 }
7089 OID_EVENT_TRACE_WITH_MSG(m, "MS_FAST_DISPATCH_END", false);
7090}
7091
11fdf7f2 7092int OSD::ms_handle_authentication(Connection *con)
7c673cae 7093{
11fdf7f2 7094 int ret = 0;
9f95a23c 7095 auto s = ceph::ref_cast<Session>(con->get_priv());
11fdf7f2 7096 if (!s) {
9f95a23c
TL
7097 s = ceph::make_ref<Session>(cct, con);
7098 con->set_priv(s);
11fdf7f2
TL
7099 s->entity_name = con->get_peer_entity_name();
7100 dout(10) << __func__ << " new session " << s << " con " << s->con
7101 << " entity " << s->entity_name
7102 << " addr " << con->get_peer_addrs() << dendl;
7103 } else {
7104 dout(10) << __func__ << " existing session " << s << " con " << s->con
7105 << " entity " << s->entity_name
7106 << " addr " << con->get_peer_addrs() << dendl;
7c673cae
FG
7107 }
7108
11fdf7f2 7109 AuthCapsInfo &caps_info = con->get_peer_caps_info();
9f95a23c 7110 if (caps_info.allow_all) {
11fdf7f2 7111 s->caps.set_allow_all();
9f95a23c 7112 } else if (caps_info.caps.length() > 0) {
11fdf7f2
TL
7113 bufferlist::const_iterator p = caps_info.caps.cbegin();
7114 string str;
7115 try {
7116 decode(str, p);
7117 }
7118 catch (buffer::error& e) {
7119 dout(10) << __func__ << " session " << s << " " << s->entity_name
7120 << " failed to decode caps string" << dendl;
9f95a23c 7121 ret = -EACCES;
11fdf7f2
TL
7122 }
7123 if (!ret) {
7c673cae 7124 bool success = s->caps.parse(str);
11fdf7f2
TL
7125 if (success) {
7126 dout(10) << __func__ << " session " << s
7127 << " " << s->entity_name
7128 << " has caps " << s->caps << " '" << str << "'" << dendl;
7129 ret = 1;
7130 } else {
7131 dout(10) << __func__ << " session " << s << " " << s->entity_name
7132 << " failed to parse caps '" << str << "'" << dendl;
9f95a23c 7133 ret = -EACCES;
11fdf7f2 7134 }
7c673cae 7135 }
7c673cae 7136 }
11fdf7f2 7137 return ret;
7c673cae
FG
7138}
7139
7140void OSD::do_waiters()
7141{
9f95a23c 7142 ceph_assert(ceph_mutex_is_locked(osd_lock));
7c673cae
FG
7143
7144 dout(10) << "do_waiters -- start" << dendl;
7145 while (!finished.empty()) {
7146 OpRequestRef next = finished.front();
7147 finished.pop_front();
7148 dispatch_op(next);
7149 }
7150 dout(10) << "do_waiters -- finish" << dendl;
7151}
7152
7153void OSD::dispatch_op(OpRequestRef op)
7154{
7155 switch (op->get_req()->get_type()) {
7156
7157 case MSG_OSD_PG_CREATE:
7158 handle_pg_create(op);
7159 break;
7c673cae
FG
7160 }
7161}
7162
7163void OSD::_dispatch(Message *m)
7164{
9f95a23c 7165 ceph_assert(ceph_mutex_is_locked(osd_lock));
7c673cae
FG
7166 dout(20) << "_dispatch " << m << " " << *m << dendl;
7167
7168 switch (m->get_type()) {
7c673cae
FG
7169 // -- don't need OSDMap --
7170
7171 // map and replication
7172 case CEPH_MSG_OSD_MAP:
7173 handle_osd_map(static_cast<MOSDMap*>(m));
7174 break;
9f95a23c
TL
7175 case MSG_MON_GET_PURGED_SNAPS_REPLY:
7176 handle_get_purged_snaps_reply(static_cast<MMonGetPurgedSnapsReply*>(m));
7177 break;
7c673cae
FG
7178
7179 // osd
7c673cae
FG
7180 case MSG_OSD_SCRUB:
7181 handle_scrub(static_cast<MOSDScrub*>(m));
7182 break;
7183
11fdf7f2
TL
7184 case MSG_COMMAND:
7185 handle_command(static_cast<MCommand*>(m));
7186 return;
c07f9fc5 7187
7c673cae
FG
7188 // -- need OSDMap --
7189
7190 case MSG_OSD_PG_CREATE:
7c673cae
FG
7191 {
7192 OpRequestRef op = op_tracker.create_request<OpRequest, Message*>(m);
7193 if (m->trace)
7194 op->osd_trace.init("osd op", &trace_endpoint, &m->trace);
7195 // no map? starting up?
9f95a23c 7196 if (!get_osdmap()) {
7c673cae
FG
7197 dout(7) << "no OSDMap, not booted" << dendl;
7198 logger->inc(l_osd_waiting_for_map);
7199 waiting_for_osdmap.push_back(op);
7200 op->mark_delayed("no osdmap");
7201 break;
7202 }
7203
7204 // need OSDMap
7205 dispatch_op(op);
7206 }
7207 }
7208}
7209
11fdf7f2 7210// remove me post-nautilus
7c673cae
FG
7211void OSD::handle_scrub(MOSDScrub *m)
7212{
7213 dout(10) << "handle_scrub " << *m << dendl;
7214 if (!require_mon_or_mgr_peer(m)) {
7215 m->put();
7216 return;
7217 }
7218 if (m->fsid != monc->get_fsid()) {
11fdf7f2
TL
7219 dout(0) << "handle_scrub fsid " << m->fsid << " != " << monc->get_fsid()
7220 << dendl;
7c673cae
FG
7221 m->put();
7222 return;
7223 }
7224
11fdf7f2
TL
7225 vector<spg_t> spgs;
7226 _get_pgids(&spgs);
7227
7228 if (!m->scrub_pgs.empty()) {
7229 vector<spg_t> v;
7230 for (auto pgid : m->scrub_pgs) {
7c673cae 7231 spg_t pcand;
9f95a23c 7232 if (get_osdmap()->get_primary_shard(pgid, &pcand) &&
11fdf7f2
TL
7233 std::find(spgs.begin(), spgs.end(), pcand) != spgs.end()) {
7234 v.push_back(pcand);
7c673cae
FG
7235 }
7236 }
11fdf7f2
TL
7237 spgs.swap(v);
7238 }
7239
7240 for (auto pgid : spgs) {
7241 enqueue_peering_evt(
7242 pgid,
7243 PGPeeringEventRef(
7244 std::make_shared<PGPeeringEvent>(
7245 get_osdmap_epoch(),
7246 get_osdmap_epoch(),
9f95a23c 7247 PeeringState::RequestScrub(m->deep, m->repair))));
7c673cae
FG
7248 }
7249
7250 m->put();
7251}
7252
11fdf7f2
TL
7253void OSD::handle_fast_scrub(MOSDScrub2 *m)
7254{
7255 dout(10) << __func__ << " " << *m << dendl;
7256 if (!require_mon_or_mgr_peer(m)) {
7257 m->put();
7258 return;
7259 }
7260 if (m->fsid != monc->get_fsid()) {
7261 dout(0) << __func__ << " fsid " << m->fsid << " != " << monc->get_fsid()
7262 << dendl;
7263 m->put();
7264 return;
7265 }
7266 for (auto pgid : m->scrub_pgs) {
7267 enqueue_peering_evt(
7268 pgid,
7269 PGPeeringEventRef(
7270 std::make_shared<PGPeeringEvent>(
7271 m->epoch,
7272 m->epoch,
9f95a23c 7273 PeeringState::RequestScrub(m->deep, m->repair))));
11fdf7f2
TL
7274 }
7275 m->put();
7276}
7277
7c673cae
FG
7278bool OSD::scrub_random_backoff()
7279{
7280 bool coin_flip = (rand() / (double)RAND_MAX >=
7281 cct->_conf->osd_scrub_backoff_ratio);
7282 if (!coin_flip) {
7283 dout(20) << "scrub_random_backoff lost coin flip, randomly backing off" << dendl;
7284 return true;
7285 }
7286 return false;
7287}
7288
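// A ScrubJob records when a PG would next like to be scrubbed. Unless the
// scrub was explicitly requested, the schedule computed below is roughly:
//   sched_time = stamp + min_interval * (1 + r * osd_scrub_interval_randomize_ratio)
//   deadline   = stamp + max_interval   (no deadline if max_interval == 0)
// where r is uniform in [0, 1) and stamp is the timestamp passed in.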
7289OSDService::ScrubJob::ScrubJob(CephContext* cct,
7290 const spg_t& pg, const utime_t& timestamp,
7291 double pool_scrub_min_interval,
7292 double pool_scrub_max_interval, bool must)
7293 : cct(cct),
7294 pgid(pg),
7295 sched_time(timestamp),
7296 deadline(timestamp)
7297{
7298 // if not explicitly requested, postpone the scrub with a random delay
7299 if (!must) {
7300 double scrub_min_interval = pool_scrub_min_interval > 0 ?
7301 pool_scrub_min_interval : cct->_conf->osd_scrub_min_interval;
7302 double scrub_max_interval = pool_scrub_max_interval > 0 ?
7303 pool_scrub_max_interval : cct->_conf->osd_scrub_max_interval;
7304
7305 sched_time += scrub_min_interval;
7306 double r = rand() / (double)RAND_MAX;
7307 sched_time +=
7308 scrub_min_interval * cct->_conf->osd_scrub_interval_randomize_ratio * r;
11fdf7f2
TL
7309 if (scrub_max_interval == 0) {
7310 deadline = utime_t();
7311 } else {
7312 deadline += scrub_max_interval;
7313 }
7314
7c673cae
FG
7315 }
7316}
7317
7318bool OSDService::ScrubJob::ScrubJob::operator<(const OSDService::ScrubJob& rhs) const {
7319 if (sched_time < rhs.sched_time)
7320 return true;
7321 if (sched_time > rhs.sched_time)
7322 return false;
7323 return pgid < rhs.pgid;
7324}
7325
9f95a23c
TL
7326double OSD::scrub_sleep_time(bool must_scrub)
7327{
7328 if (must_scrub) {
7329 return cct->_conf->osd_scrub_sleep;
7330 }
7331 utime_t now = ceph_clock_now();
7332 if (scrub_time_permit(now)) {
7333 return cct->_conf->osd_scrub_sleep;
7334 }
7335 double normal_sleep = cct->_conf->osd_scrub_sleep;
7336 double extended_sleep = cct->_conf->osd_scrub_extended_sleep;
7337 return std::max(extended_sleep, normal_sleep);
7338}
7339
7c673cae
FG
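// The configured [begin, end) week-day and hour windows are allowed to wrap
// around; e.g. begin_hour=23, end_hour=6 permits scrubbing from 23:00 through
// 05:59 local time.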
7340bool OSD::scrub_time_permit(utime_t now)
7341{
7342 struct tm bdt;
7343 time_t tt = now.sec();
7344 localtime_r(&tt, &bdt);
28e407b8
AA
7345
7346 bool day_permit = false;
7347 if (cct->_conf->osd_scrub_begin_week_day < cct->_conf->osd_scrub_end_week_day) {
7348 if (bdt.tm_wday >= cct->_conf->osd_scrub_begin_week_day && bdt.tm_wday < cct->_conf->osd_scrub_end_week_day) {
7349 day_permit = true;
7350 }
7351 } else {
7352 if (bdt.tm_wday >= cct->_conf->osd_scrub_begin_week_day || bdt.tm_wday < cct->_conf->osd_scrub_end_week_day) {
7353 day_permit = true;
7354 }
7355 }
7356
7357 if (!day_permit) {
7358 dout(20) << __func__ << " should run between week day " << cct->_conf->osd_scrub_begin_week_day
7359 << " - " << cct->_conf->osd_scrub_end_week_day
7360 << " now " << bdt.tm_wday << " = no" << dendl;
7361 return false;
7362 }
7363
7c673cae
FG
7364 bool time_permit = false;
7365 if (cct->_conf->osd_scrub_begin_hour < cct->_conf->osd_scrub_end_hour) {
7366 if (bdt.tm_hour >= cct->_conf->osd_scrub_begin_hour && bdt.tm_hour < cct->_conf->osd_scrub_end_hour) {
7367 time_permit = true;
7368 }
7369 } else {
7370 if (bdt.tm_hour >= cct->_conf->osd_scrub_begin_hour || bdt.tm_hour < cct->_conf->osd_scrub_end_hour) {
7371 time_permit = true;
7372 }
7373 }
7374 if (!time_permit) {
7375 dout(20) << __func__ << " should run between " << cct->_conf->osd_scrub_begin_hour
7376 << " - " << cct->_conf->osd_scrub_end_hour
7377 << " now " << bdt.tm_hour << " = no" << dendl;
7378 } else {
7379 dout(20) << __func__ << " should run between " << cct->_conf->osd_scrub_begin_hour
7380 << " - " << cct->_conf->osd_scrub_end_hour
7381 << " now " << bdt.tm_hour << " = yes" << dendl;
7382 }
7383 return time_permit;
7384}
7385
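// Scrubbing is allowed when either the 1-minute load average per CPU is below
// osd_scrub_load_threshold, or the 1-minute load is below both the daily
// average and the 15-minute average (i.e. load is currently decreasing).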
7386bool OSD::scrub_load_below_threshold()
7387{
7388 double loadavgs[3];
7389 if (getloadavg(loadavgs, 3) != 3) {
7390 dout(10) << __func__ << " couldn't read loadavgs" << dendl;
7391 return false;
7392 }
7393
7394 // allow scrub if below configured threshold
91327a77
AA
7395 long cpus = sysconf(_SC_NPROCESSORS_ONLN);
7396 double loadavg_per_cpu = cpus > 0 ? loadavgs[0] / cpus : loadavgs[0];
7397 if (loadavg_per_cpu < cct->_conf->osd_scrub_load_threshold) {
7398 dout(20) << __func__ << " loadavg per cpu " << loadavg_per_cpu
7c673cae
FG
7399 << " < max " << cct->_conf->osd_scrub_load_threshold
7400 << " = yes" << dendl;
7401 return true;
7402 }
7403
7404 // allow scrub if below daily avg and currently decreasing
7405 if (loadavgs[0] < daily_loadavg && loadavgs[0] < loadavgs[2]) {
7406 dout(20) << __func__ << " loadavg " << loadavgs[0]
7407 << " < daily_loadavg " << daily_loadavg
7408 << " and < 15m avg " << loadavgs[2]
7409 << " = yes" << dendl;
7410 return true;
7411 }
7412
7413 dout(20) << __func__ << " loadavg " << loadavgs[0]
7414 << " >= max " << cct->_conf->osd_scrub_load_threshold
7415 << " and ( >= daily_loadavg " << daily_loadavg
7416 << " or >= 15m avg " << loadavgs[2]
7417 << ") = no" << dendl;
7418 return false;
7419}
7420
7421void OSD::sched_scrub()
7422{
7423 // if not permitted, fail fast
eafe8130 7424 if (!service.can_inc_scrubs()) {
7c673cae
FG
7425 return;
7426 }
eafe8130
TL
7427 bool allow_requested_repair_only = false;
7428 if (service.is_recovery_active()) {
7429 if (!cct->_conf->osd_scrub_during_recovery && cct->_conf->osd_repair_during_recovery) {
7430 dout(10) << __func__
7431 << " will only schedule explicitly requested repair due to active recovery"
7432 << dendl;
7433 allow_requested_repair_only = true;
7434 } else if (!cct->_conf->osd_scrub_during_recovery && !cct->_conf->osd_repair_during_recovery) {
7435 dout(20) << __func__ << " not scheduling scrubs due to active recovery" << dendl;
7436 return;
7437 }
b5b8bbf5
FG
7438 }
7439
7c673cae
FG
7440 utime_t now = ceph_clock_now();
7441 bool time_permit = scrub_time_permit(now);
7442 bool load_is_low = scrub_load_below_threshold();
7443 dout(20) << "sched_scrub load_is_low=" << (int)load_is_low << dendl;
7444
7445 OSDService::ScrubJob scrub;
7446 if (service.first_scrub_stamp(&scrub)) {
7447 do {
7448 dout(30) << "sched_scrub examine " << scrub.pgid << " at " << scrub.sched_time << dendl;
7449
7450 if (scrub.sched_time > now) {
7451 // save ourselves some effort
7452 dout(10) << "sched_scrub " << scrub.pgid << " scheduled at " << scrub.sched_time
7453 << " > " << now << dendl;
7454 break;
7455 }
7456
11fdf7f2 7457 if ((scrub.deadline.is_zero() || scrub.deadline >= now) && !(time_permit && load_is_low)) {
7c673cae
FG
7458 dout(10) << __func__ << " not scheduling scrub for " << scrub.pgid << " due to "
7459 << (!time_permit ? "time not permit" : "high load") << dendl;
7460 continue;
7461 }
7462
11fdf7f2 7463 PGRef pg = _lookup_lock_pg(scrub.pgid);
7c673cae
FG
7464 if (!pg)
7465 continue;
494da23a
TL
7466 // This has already started, so go on to the next scrub job
7467 if (pg->scrubber.active) {
7468 pg->unlock();
7469 dout(30) << __func__ << ": already in progress pgid " << scrub.pgid << dendl;
7470 continue;
7471 }
eafe8130
TL
7472 // Skip other kinds of scrubbing if only explicitly requested repair is allowed
7473 if (allow_requested_repair_only && !pg->scrubber.must_repair) {
7474 pg->unlock();
7475 dout(10) << __func__ << " skip " << scrub.pgid
7476 << " because repairing is not explicitly requested on it"
7477 << dendl;
7478 continue;
7479 }
494da23a 7480 // If it is reserving, let it resolve before going to the next scrub job
eafe8130 7481 if (pg->scrubber.local_reserved && !pg->scrubber.active) {
494da23a
TL
7482 pg->unlock();
7483 dout(30) << __func__ << ": reserve in progress pgid " << scrub.pgid << dendl;
7484 break;
7485 }
11fdf7f2
TL
7486 dout(10) << "sched_scrub scrubbing " << scrub.pgid << " at " << scrub.sched_time
7487 << (pg->get_must_scrub() ? ", explicitly requested" :
7488 (load_is_low ? ", load_is_low" : " deadline < now"))
7489 << dendl;
7490 if (pg->sched_scrub()) {
7491 pg->unlock();
7492 break;
7c673cae
FG
7493 }
7494 pg->unlock();
7495 } while (service.next_scrub_stamp(scrub, &scrub));
7496 }
7497 dout(20) << "sched_scrub done" << dendl;
7498}
7499
494da23a
TL
7500void OSD::resched_all_scrubs()
7501{
7502 dout(10) << __func__ << ": start" << dendl;
7503 OSDService::ScrubJob scrub;
7504 if (service.first_scrub_stamp(&scrub)) {
7505 do {
7506 dout(20) << __func__ << ": examine " << scrub.pgid << dendl;
7507
7508 PGRef pg = _lookup_lock_pg(scrub.pgid);
7509 if (!pg)
7510 continue;
7511 if (!pg->scrubber.must_scrub && !pg->scrubber.need_auto) {
7512 dout(20) << __func__ << ": reschedule " << scrub.pgid << dendl;
7513 pg->on_info_history_change();
7514 }
7515 pg->unlock();
7516 } while (service.next_scrub_stamp(scrub, &scrub));
7517 }
7518 dout(10) << __func__ << ": done" << dendl;
7519}
7520
11fdf7f2
TL
7521MPGStats* OSD::collect_pg_stats()
7522{
7523 // This implementation unconditionally sends every is_primary PG's
7524 // stats every time we're called. This has equivalent cost to the
7525 // previous implementation's worst case where all PGs are busy and
7526 // their stats are always enqueued for sending.
9f95a23c 7527 std::shared_lock l{map_lock};
11fdf7f2 7528
11fdf7f2
TL
7529 osd_stat_t cur_stat = service.get_osd_stat();
7530 cur_stat.os_perf_stat = store->get_cur_stats();
7531
9f95a23c 7532 auto m = new MPGStats(monc->get_fsid(), get_osdmap_epoch());
11fdf7f2
TL
7533 m->osd_stat = cur_stat;
7534
7535 std::lock_guard lec{min_last_epoch_clean_lock};
9f95a23c 7536 min_last_epoch_clean = get_osdmap_epoch();
11fdf7f2
TL
7537 min_last_epoch_clean_pgs.clear();
7538
7539 std::set<int64_t> pool_set;
7540 vector<PGRef> pgs;
7541 _get_pgs(&pgs);
7542 for (auto& pg : pgs) {
7543 auto pool = pg->pg_id.pgid.pool();
7544 pool_set.emplace((int64_t)pool);
7545 if (!pg->is_primary()) {
7546 continue;
7547 }
7548 pg->get_pg_stats([&](const pg_stat_t& s, epoch_t lec) {
7549 m->pg_stat[pg->pg_id.pgid] = s;
7550 min_last_epoch_clean = min(min_last_epoch_clean, lec);
7551 min_last_epoch_clean_pgs.push_back(pg->pg_id.pgid);
7552 });
7553 }
7554 store_statfs_t st;
81eedcae 7555 bool per_pool_stats = false;
9f95a23c 7556 bool per_pool_omap_stats = false;
11fdf7f2 7557 for (auto p : pool_set) {
9f95a23c 7558 int r = store->pool_statfs(p, &st, &per_pool_omap_stats);
11fdf7f2
TL
7559 if (r == -ENOTSUP) {
7560 break;
7561 } else {
7562 assert(r >= 0);
7563 m->pool_stat[p] = st;
81eedcae 7564 per_pool_stats = true;
11fdf7f2
TL
7565 }
7566 }
7c673cae 7567
81eedcae
TL
7568 // indicate whether we are reporting per-pool stats
7569 m->osd_stat.num_osds = 1;
7570 m->osd_stat.num_per_pool_osds = per_pool_stats ? 1 : 0;
9f95a23c 7571 m->osd_stat.num_per_pool_omap_osds = per_pool_omap_stats ? 1 : 0;
81eedcae 7572
11fdf7f2
TL
7573 return m;
7574}
7c673cae 7575
11fdf7f2 7576vector<DaemonHealthMetric> OSD::get_health_metrics()
b32b8144 7577{
11fdf7f2
TL
7578 vector<DaemonHealthMetric> metrics;
7579 {
7580 utime_t oldest_secs;
7581 const utime_t now = ceph_clock_now();
7582 auto too_old = now;
7583 too_old -= cct->_conf.get_val<double>("osd_op_complaint_time");
7584 int slow = 0;
7585 TrackedOpRef oldest_op;
7586 auto count_slow_ops = [&](TrackedOp& op) {
7587 if (op.get_initiated() < too_old) {
9f95a23c
TL
7588 stringstream ss;
7589 ss << "slow request " << op.get_desc()
7590 << " initiated "
7591 << op.get_initiated()
7592 << " currently "
7593 << op.state_string();
7594 lgeneric_subdout(cct,osd,20) << ss.str() << dendl;
7595 clog->warn() << ss.str();
11fdf7f2
TL
7596 slow++;
7597 if (!oldest_op || op.get_initiated() < oldest_op->get_initiated()) {
7598 oldest_op = &op;
7599 }
7600 return true;
7601 } else {
7602 return false;
7603 }
7604 };
7605 if (op_tracker.visit_ops_in_flight(&oldest_secs, count_slow_ops)) {
7606 if (slow) {
7607 derr << __func__ << " reporting " << slow << " slow ops, oldest is "
7608 << oldest_op->get_desc() << dendl;
7609 }
7610 metrics.emplace_back(daemon_metric::SLOW_OPS, slow, oldest_secs);
7611 } else {
7612 // no news is not good news.
7613 metrics.emplace_back(daemon_metric::SLOW_OPS, 0, 0);
7614 }
7615 }
7616 {
7617 std::lock_guard l(pending_creates_lock);
7618 auto n_primaries = pending_creates_from_mon;
7619 for (const auto& create : pending_creates_from_osd) {
7620 if (create.second) {
7621 n_primaries++;
7622 }
b32b8144 7623 }
11fdf7f2 7624 metrics.emplace_back(daemon_metric::PENDING_CREATING_PGS, n_primaries);
b32b8144 7625 }
b32b8144
FG
7626 return metrics;
7627}
7628
7c673cae
FG
7629// =====================================================
7630// MAP
7631
7632void OSD::wait_for_new_map(OpRequestRef op)
7633{
7634 // ask?
7635 if (waiting_for_osdmap.empty()) {
9f95a23c 7636 osdmap_subscribe(get_osdmap_epoch() + 1, false);
7c673cae
FG
7637 }
7638
7639 logger->inc(l_osd_waiting_for_map);
7640 waiting_for_osdmap.push_back(op);
7641 op->mark_delayed("wait for new map");
7642}
7643
7644
7645/** update_map
7646 * assimilate new OSDMap(s). scan pgs, etc.
7647 */
7648
7649void OSD::note_down_osd(int peer)
7650{
9f95a23c
TL
7651 ceph_assert(ceph_mutex_is_locked(osd_lock));
7652 cluster_messenger->mark_down_addrs(get_osdmap()->get_cluster_addrs(peer));
7c673cae 7653
9f95a23c 7654 std::lock_guard l{heartbeat_lock};
7c673cae
FG
7655 failure_queue.erase(peer);
7656 failure_pending.erase(peer);
7657 map<int,HeartbeatInfo>::iterator p = heartbeat_peers.find(peer);
7658 if (p != heartbeat_peers.end()) {
9f95a23c 7659 p->second.clear_mark_down();
7c673cae
FG
7660 heartbeat_peers.erase(p);
7661 }
7c673cae
FG
7662}
7663
7664void OSD::note_up_osd(int peer)
7665{
7c673cae
FG
7666 heartbeat_set_peers_need_update();
7667}
7668
7669struct C_OnMapCommit : public Context {
7670 OSD *osd;
7671 epoch_t first, last;
7672 MOSDMap *msg;
7673 C_OnMapCommit(OSD *o, epoch_t f, epoch_t l, MOSDMap *m)
7674 : osd(o), first(f), last(l), msg(m) {}
7675 void finish(int r) override {
7676 osd->_committed_osd_maps(first, last, msg);
7677 msg->put();
7678 }
7679};
7680
7c673cae
FG
7681void OSD::osdmap_subscribe(version_t epoch, bool force_request)
7682{
11fdf7f2 7683 std::lock_guard l(osdmap_subscribe_lock);
181888fb 7684 if (latest_subscribed_epoch >= epoch && !force_request)
7c673cae
FG
7685 return;
7686
11fdf7f2 7687 latest_subscribed_epoch = std::max<uint64_t>(epoch, latest_subscribed_epoch);
181888fb 7688
7c673cae
FG
7689 if (monc->sub_want_increment("osdmap", epoch, CEPH_SUBSCRIBE_ONETIME) ||
7690 force_request) {
7691 monc->renew_subs();
7692 }
7693}
7694
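// Remove stored full and incremental osdmaps older than both 'oldest' and the
// map cache's lower bound, committing in batches once at least
// osd_target_transaction_size (and at least nreceived) removals are queued so
// a large backlog does not become one enormous transaction.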
7695void OSD::trim_maps(epoch_t oldest, int nreceived, bool skip_maps)
7696{
7697 epoch_t min = std::min(oldest, service.map_cache.cached_key_lower_bound());
7698 if (min <= superblock.oldest_map)
7699 return;
7700
7701 int num = 0;
7702 ObjectStore::Transaction t;
7703 for (epoch_t e = superblock.oldest_map; e < min; ++e) {
7704 dout(20) << " removing old osdmap epoch " << e << dendl;
7705 t.remove(coll_t::meta(), get_osdmap_pobject_name(e));
7706 t.remove(coll_t::meta(), get_inc_osdmap_pobject_name(e));
7707 superblock.oldest_map = e + 1;
7708 num++;
7709 if (num >= cct->_conf->osd_target_transaction_size && num >= nreceived) {
7710 service.publish_superblock(superblock);
7711 write_superblock(t);
11fdf7f2
TL
7712 int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
7713 ceph_assert(tr == 0);
7c673cae
FG
7714 num = 0;
7715 if (!skip_maps) {
7716 // skip_maps leaves us with a range of old maps if we fail to remove all
7717 // of them before moving superblock.oldest_map forward to the first map
7718 // in the incoming MOSDMap msg. so we should continue removing them in
7719 // this case, even though it could mean a huge series of delete transactions
7720 // all at once.
7721 break;
7722 }
7723 }
7724 }
7725 if (num > 0) {
7726 service.publish_superblock(superblock);
7727 write_superblock(t);
11fdf7f2
TL
7728 int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
7729 ceph_assert(tr == 0);
7c673cae
FG
7730 }
7731 // we should not remove the cached maps
11fdf7f2 7732 ceph_assert(min <= service.map_cache.cached_key_lower_bound());
7c673cae
FG
7733}
7734
7735void OSD::handle_osd_map(MOSDMap *m)
7736{
11fdf7f2
TL
7737 // wait for pgs to catch up
7738 {
7739 // we extend the map cache pins to accommodate pgs that are slow to consume maps
7740 // for some period, until we hit the max_lag_factor bound, at which point
7741 // we block here to stop ingesting more maps than they are able to keep
7742 // up with.
7743 epoch_t max_lag = cct->_conf->osd_map_cache_size *
7744 m_osd_pg_epoch_max_lag_factor;
7745 ceph_assert(max_lag > 0);
7746 epoch_t osd_min = 0;
7747 for (auto shard : shards) {
7748 epoch_t min = shard->get_min_pg_epoch();
7749 if (osd_min == 0 || min < osd_min) {
7750 osd_min = min;
7751 }
7752 }
9f95a23c 7753 epoch_t osdmap_epoch = get_osdmap_epoch();
11fdf7f2 7754 if (osd_min > 0 &&
9f95a23c
TL
7755 osdmap_epoch > max_lag &&
7756 osdmap_epoch - max_lag > osd_min) {
7757 epoch_t need = osdmap_epoch - max_lag;
11fdf7f2
TL
7758 dout(10) << __func__ << " waiting for pgs to catch up (need " << need
7759 << " max_lag " << max_lag << ")" << dendl;
7760 for (auto shard : shards) {
7761 epoch_t min = shard->get_min_pg_epoch();
7762 if (need > min) {
7763 dout(10) << __func__ << " waiting for pgs to consume " << need
7764 << " (shard " << shard->shard_id << " min " << min
7765 << ", map cache is " << cct->_conf->osd_map_cache_size
7766 << ", max_lag_factor " << m_osd_pg_epoch_max_lag_factor
7767 << ")" << dendl;
7768 unlock_guard unlock{osd_lock};
7769 shard->wait_min_pg_epoch(need);
7770 }
7771 }
7772 }
7773 }
7774
9f95a23c 7775 ceph_assert(ceph_mutex_is_locked(osd_lock));
11fdf7f2
TL
7776 map<epoch_t,OSDMapRef> added_maps;
7777 map<epoch_t,bufferlist> added_maps_bl;
7c673cae
FG
7778 if (m->fsid != monc->get_fsid()) {
7779 dout(0) << "handle_osd_map fsid " << m->fsid << " != "
7780 << monc->get_fsid() << dendl;
7781 m->put();
7782 return;
7783 }
7784 if (is_initializing()) {
7785 dout(0) << "ignoring osdmap until we have initialized" << dendl;
7786 m->put();
7787 return;
7788 }
7789
9f95a23c
TL
7790 auto session = ceph::ref_cast<Session>(m->get_connection()->get_priv());
7791 if (session && !(session->entity_name.is_mon() ||
7c673cae
FG
7792 session->entity_name.is_osd())) {
7793 //not enough perms!
7794 dout(10) << "got osd map from Session " << session
7795 << " which we can't take maps from (not a mon or osd)" << dendl;
7796 m->put();
7c673cae
FG
7797 return;
7798 }
7c673cae
FG
7799
7800 // share with the objecter
7801 if (!is_preboot())
7802 service.objecter->handle_osd_map(m);
7803
7804 epoch_t first = m->get_first();
7805 epoch_t last = m->get_last();
7806 dout(3) << "handle_osd_map epochs [" << first << "," << last << "], i have "
7807 << superblock.newest_map
7808 << ", src has [" << m->oldest_map << "," << m->newest_map << "]"
7809 << dendl;
7810
7811 logger->inc(l_osd_map);
7812 logger->inc(l_osd_mape, last - first + 1);
7813 if (first <= superblock.newest_map)
7814 logger->inc(l_osd_mape_dup, superblock.newest_map - first + 1);
7815 if (service.max_oldest_map < m->oldest_map) {
7816 service.max_oldest_map = m->oldest_map;
11fdf7f2 7817 ceph_assert(service.max_oldest_map >= superblock.oldest_map);
7c673cae
FG
7818 }
7819
7820 // make sure there is something new, here, before we bother flushing
7821 // the queues and such
7822 if (last <= superblock.newest_map) {
7823 dout(10) << " no new maps here, dropping" << dendl;
7824 m->put();
7825 return;
7826 }
7827
7828 // missing some?
7829 bool skip_maps = false;
7830 if (first > superblock.newest_map + 1) {
7831 dout(10) << "handle_osd_map message skips epochs "
7832 << superblock.newest_map + 1 << ".." << (first-1) << dendl;
7833 if (m->oldest_map <= superblock.newest_map + 1) {
7834 osdmap_subscribe(superblock.newest_map + 1, false);
7835 m->put();
7836 return;
7837 }
7838 // always try to get the full range of maps--as many as we can. this
7839 // 1- is good to have
7840 // 2- is at present the only way to ensure that we get a *full* map as
7841 // the first map!
7842 if (m->oldest_map < first) {
7843 osdmap_subscribe(m->oldest_map - 1, true);
7844 m->put();
7845 return;
7846 }
7847 skip_maps = true;
7848 }
7849
7850 ObjectStore::Transaction t;
7851 uint64_t txn_size = 0;
7852
9f95a23c
TL
7853 map<epoch_t,mempool::osdmap::map<int64_t,snap_interval_set_t>> purged_snaps;
7854
7c673cae 7855 // store new maps: queue for disk and put in the osdmap cache
11fdf7f2 7856 epoch_t start = std::max(superblock.newest_map + 1, first);
7c673cae
FG
7857 for (epoch_t e = start; e <= last; e++) {
7858 if (txn_size >= t.get_num_bytes()) {
7859 derr << __func__ << " transaction size overflowed" << dendl;
11fdf7f2 7860 ceph_assert(txn_size < t.get_num_bytes());
7c673cae
FG
7861 }
7862 txn_size = t.get_num_bytes();
7863 map<epoch_t,bufferlist>::iterator p;
7864 p = m->maps.find(e);
7865 if (p != m->maps.end()) {
7866 dout(10) << "handle_osd_map got full map for epoch " << e << dendl;
7867 OSDMap *o = new OSDMap;
7868 bufferlist& bl = p->second;
7869
7870 o->decode(bl);
7871
9f95a23c
TL
7872 purged_snaps[e] = o->get_new_purged_snaps();
7873
7c673cae
FG
7874 ghobject_t fulloid = get_osdmap_pobject_name(e);
7875 t.write(coll_t::meta(), fulloid, 0, bl.length(), bl);
11fdf7f2
TL
7876 added_maps[e] = add_map(o);
7877 added_maps_bl[e] = bl;
7c673cae
FG
7878 got_full_map(e);
7879 continue;
7880 }
7881
7882 p = m->incremental_maps.find(e);
7883 if (p != m->incremental_maps.end()) {
7884 dout(10) << "handle_osd_map got inc map for epoch " << e << dendl;
7885 bufferlist& bl = p->second;
7886 ghobject_t oid = get_inc_osdmap_pobject_name(e);
7887 t.write(coll_t::meta(), oid, 0, bl.length(), bl);
7c673cae
FG
7888
7889 OSDMap *o = new OSDMap;
7890 if (e > 1) {
7891 bufferlist obl;
7892 bool got = get_map_bl(e - 1, obl);
11fdf7f2
TL
7893 if (!got) {
7894 auto p = added_maps_bl.find(e - 1);
7895 ceph_assert(p != added_maps_bl.end());
7896 obl = p->second;
7897 }
7c673cae
FG
7898 o->decode(obl);
7899 }
7900
7901 OSDMap::Incremental inc;
11fdf7f2 7902 auto p = bl.cbegin();
7c673cae 7903 inc.decode(p);
494da23a 7904
7c673cae 7905 if (o->apply_incremental(inc) < 0) {
9f95a23c 7906 derr << "ERROR: bad fsid? i have " << get_osdmap()->get_fsid() << " and inc has " << inc.fsid << dendl;
11fdf7f2 7907 ceph_abort_msg("bad fsid");
7c673cae
FG
7908 }
7909
7910 bufferlist fbl;
7911 o->encode(fbl, inc.encode_features | CEPH_FEATURE_RESERVED);
7912
7913 bool injected_failure = false;
7914 if (cct->_conf->osd_inject_bad_map_crc_probability > 0 &&
7915 (rand() % 10000) < cct->_conf->osd_inject_bad_map_crc_probability*10000.0) {
7916 derr << __func__ << " injecting map crc failure" << dendl;
7917 injected_failure = true;
7918 }
7919
7920 if ((inc.have_crc && o->get_crc() != inc.full_crc) || injected_failure) {
7921 dout(2) << "got incremental " << e
7922 << " but failed to encode full with correct crc; requesting"
7923 << dendl;
7924 clog->warn() << "failed to encode map e" << e << " with expected crc";
7925 dout(20) << "my encoded map was:\n";
7926 fbl.hexdump(*_dout);
7927 *_dout << dendl;
7928 delete o;
7929 request_full_map(e, last);
7930 last = e - 1;
7931 break;
7932 }
7933 got_full_map(e);
9f95a23c 7934 purged_snaps[e] = o->get_new_purged_snaps();
7c673cae
FG
7935
7936 ghobject_t fulloid = get_osdmap_pobject_name(e);
7937 t.write(coll_t::meta(), fulloid, 0, fbl.length(), fbl);
11fdf7f2
TL
7938 added_maps[e] = add_map(o);
7939 added_maps_bl[e] = fbl;
7c673cae
FG
7940 continue;
7941 }
7942
11fdf7f2 7943 ceph_abort_msg("MOSDMap lied about what maps it had?");
7c673cae
FG
7944 }
7945
7946 // even if this map isn't from a mon, we may have satisfied our subscription
7947 monc->sub_got("osdmap", last);
7948
7949 if (!m->maps.empty() && requested_full_first) {
7950 dout(10) << __func__ << " still missing full maps " << requested_full_first
7951 << ".." << requested_full_last << dendl;
7952 rerequest_full_maps();
7953 }
7954
7c673cae
FG
7955 if (superblock.oldest_map) {
7956 // make sure we at least keep pace with incoming maps
7957 trim_maps(m->oldest_map, last - first + 1, skip_maps);
11fdf7f2 7958 pg_num_history.prune(superblock.oldest_map);
7c673cae
FG
7959 }
7960
7961 if (!superblock.oldest_map || skip_maps)
7962 superblock.oldest_map = first;
7963 superblock.newest_map = last;
7964 superblock.current_epoch = last;
7965
7966 // note in the superblock that we were clean thru the prior epoch
7967 epoch_t boot_epoch = service.get_boot_epoch();
7968 if (boot_epoch && boot_epoch >= superblock.mounted) {
7969 superblock.mounted = boot_epoch;
7970 superblock.clean_thru = last;
7971 }
7972
11fdf7f2
TL
7973 // check for pg_num changes and deleted pools
7974 OSDMapRef lastmap;
7975 for (auto& i : added_maps) {
7976 if (!lastmap) {
7977 if (!(lastmap = service.try_get_map(i.first - 1))) {
7978 dout(10) << __func__ << " can't get previous map " << i.first - 1
7979 << " probably first start of this osd" << dendl;
7980 continue;
7981 }
7982 }
7983 ceph_assert(lastmap->get_epoch() + 1 == i.second->get_epoch());
7984 for (auto& j : lastmap->get_pools()) {
7985 if (!i.second->have_pg_pool(j.first)) {
7986 pg_num_history.log_pool_delete(i.first, j.first);
7987 dout(10) << __func__ << " recording final pg_pool_t for pool "
7988 << j.first << dendl;
7990 // this information is needed by _make_pg() if we have to restart before
7991 // the pool is deleted and need to instantiate a new (zombie) PG[Pool].
7991 ghobject_t obj = make_final_pool_info_oid(j.first);
7992 bufferlist bl;
7993 encode(j.second, bl, CEPH_FEATURES_ALL);
7994 string name = lastmap->get_pool_name(j.first);
7995 encode(name, bl);
7996 map<string,string> profile;
7997 if (lastmap->get_pg_pool(j.first)->is_erasure()) {
7998 profile = lastmap->get_erasure_code_profile(
7999 lastmap->get_pg_pool(j.first)->erasure_code_profile);
8000 }
8001 encode(profile, bl);
8002 t.write(coll_t::meta(), obj, 0, bl.length(), bl);
11fdf7f2
TL
8003 } else if (unsigned new_pg_num = i.second->get_pg_num(j.first);
8004 new_pg_num != j.second.get_pg_num()) {
8005 dout(10) << __func__ << " recording pool " << j.first << " pg_num "
8006 << j.second.get_pg_num() << " -> " << new_pg_num << dendl;
8007 pg_num_history.log_pg_num_change(i.first, j.first, new_pg_num);
8008 }
8009 }
8010 for (auto& j : i.second->get_pools()) {
8011 if (!lastmap->have_pg_pool(j.first)) {
8012 dout(10) << __func__ << " recording new pool " << j.first << " pg_num "
8013 << j.second.get_pg_num() << dendl;
8014 pg_num_history.log_pg_num_change(i.first, j.first,
8015 j.second.get_pg_num());
8016 }
8017 }
8018 lastmap = i.second;
8019 }
8020 pg_num_history.epoch = last;
8021 {
8022 bufferlist bl;
8023 ::encode(pg_num_history, bl);
8024 t.write(coll_t::meta(), make_pg_num_history_oid(), 0, bl.length(), bl);
8025 dout(20) << __func__ << " pg_num_history " << pg_num_history << dendl;
8026 }
8027
9f95a23c
TL
8028 // record new purged_snaps
8029 if (superblock.purged_snaps_last == start - 1) {
8030 SnapMapper::record_purged_snaps(cct, store, service.meta_ch,
8031 make_purged_snaps_oid(), &t,
8032 purged_snaps);
8033 superblock.purged_snaps_last = last;
8034 } else {
8035 dout(10) << __func__ << " superblock purged_snaps_last is "
8036 << superblock.purged_snaps_last
8037 << ", not recording new purged_snaps" << dendl;
8038 }
8039
7c673cae
FG
8040 // superblock and commit
8041 write_superblock(t);
11fdf7f2 8042 t.register_on_commit(new C_OnMapCommit(this, start, last, m));
7c673cae 8043 store->queue_transaction(
11fdf7f2
TL
8044 service.meta_ch,
8045 std::move(t));
7c673cae
FG
8046 service.publish_superblock(superblock);
8047}
8048
8049void OSD::_committed_osd_maps(epoch_t first, epoch_t last, MOSDMap *m)
8050{
8051 dout(10) << __func__ << " " << first << ".." << last << dendl;
8052 if (is_stopping()) {
8053 dout(10) << __func__ << " bailing, we are shutting down" << dendl;
8054 return;
8055 }
11fdf7f2 8056 std::lock_guard l(osd_lock);
31f18b77
FG
8057 if (is_stopping()) {
8058 dout(10) << __func__ << " bailing, we are shutting down" << dendl;
8059 return;
8060 }
9f95a23c 8061 map_lock.lock();
7c673cae
FG
8062
8063 bool do_shutdown = false;
8064 bool do_restart = false;
8065 bool network_error = false;
9f95a23c 8066 OSDMapRef osdmap;
7c673cae
FG
8067
8068 // advance through the new maps
8069 for (epoch_t cur = first; cur <= last; cur++) {
8070 dout(10) << " advance to epoch " << cur
8071 << " (<= last " << last
8072 << " <= newest_map " << superblock.newest_map
8073 << ")" << dendl;
8074
8075 OSDMapRef newmap = get_map(cur);
11fdf7f2 8076 ceph_assert(newmap); // we just cached it above!
7c673cae
FG
8077
8078 // start blacklisting messages sent to peers that go down.
8079 service.pre_publish_map(newmap);
8080
8081 // kill connections to newly down osds
8082 bool waited_for_reservations = false;
8083 set<int> old;
9f95a23c 8084 osdmap = get_osdmap();
7c673cae
FG
8085 osdmap->get_all_osds(old);
8086 for (set<int>::iterator p = old.begin(); p != old.end(); ++p) {
8087 if (*p != whoami &&
8088 osdmap->is_up(*p) && // in old map
8089 newmap->is_down(*p)) { // but not the new one
8090 if (!waited_for_reservations) {
8091 service.await_reserved_maps();
8092 waited_for_reservations = true;
8093 }
8094 note_down_osd(*p);
8095 } else if (*p != whoami &&
8096 osdmap->is_down(*p) &&
8097 newmap->is_up(*p)) {
8098 note_up_osd(*p);
8099 }
8100 }
8101
81eedcae 8102 if (osdmap->is_noup(whoami) != newmap->is_noup(whoami)) {
7c673cae
FG
8103 dout(10) << __func__ << " NOUP flag changed in " << newmap->get_epoch()
8104 << dendl;
8105 if (is_booting()) {
8106 // this captures the case where we sent the boot message while
8107 // NOUP was being set on the mon and our boot request was
8108 // dropped, and then later it is cleared. it imperfectly
8109 // handles the case where our original boot message was not
8110 // dropped and we restart even though we might have booted, but
8111 // that is harmless (boot will just take slightly longer).
8112 do_restart = true;
8113 }
8114 }
8115
9f95a23c
TL
8116 osdmap = std::move(newmap);
8117 set_osdmap(osdmap);
7c673cae
FG
8118 epoch_t up_epoch;
8119 epoch_t boot_epoch;
8120 service.retrieve_epochs(&boot_epoch, &up_epoch, NULL);
8121 if (!up_epoch &&
8122 osdmap->is_up(whoami) &&
11fdf7f2 8123 osdmap->get_addrs(whoami) == client_messenger->get_myaddrs()) {
7c673cae
FG
8124 up_epoch = osdmap->get_epoch();
8125 dout(10) << "up_epoch is " << up_epoch << dendl;
8126 if (!boot_epoch) {
8127 boot_epoch = osdmap->get_epoch();
8128 dout(10) << "boot_epoch is " << boot_epoch << dendl;
8129 }
8130 service.set_epochs(&boot_epoch, &up_epoch, NULL);
8131 }
8132 }
8133
7c673cae
FG
8134 epoch_t _bind_epoch = service.get_bind_epoch();
8135 if (osdmap->is_up(whoami) &&
11fdf7f2
TL
8136 osdmap->get_addrs(whoami).legacy_equals(
8137 client_messenger->get_myaddrs()) &&
7c673cae
FG
8138 _bind_epoch < osdmap->get_up_from(whoami)) {
8139
8140 if (is_booting()) {
8141 dout(1) << "state: booting -> active" << dendl;
8142 set_state(STATE_ACTIVE);
11fdf7f2 8143 do_restart = false;
7c673cae
FG
8144
8145 // set incarnation so that osd_reqid_t's we generate for our
8146 // objecter requests are unique across restarts.
8147 service.objecter->set_client_incarnation(osdmap->get_epoch());
11fdf7f2 8148 cancel_pending_failures();
7c673cae
FG
8149 }
8150 }
8151
8152 if (osdmap->get_epoch() > 0 &&
8153 is_active()) {
8154 if (!osdmap->exists(whoami)) {
9f95a23c 8155 derr << "map says i do not exist. shutting down." << dendl;
7c673cae
FG
8156 do_shutdown = true; // don't call shutdown() while we have
8157 // everything paused
9f95a23c
TL
8158 } else if (osdmap->is_stop(whoami)) {
8159 derr << "map says i am stopped by admin. shutting down." << dendl;
8160 do_shutdown = true;
7c673cae 8161 } else if (!osdmap->is_up(whoami) ||
11fdf7f2
TL
8162 !osdmap->get_addrs(whoami).legacy_equals(
8163 client_messenger->get_myaddrs()) ||
8164 !osdmap->get_cluster_addrs(whoami).legacy_equals(
8165 cluster_messenger->get_myaddrs()) ||
8166 !osdmap->get_hb_back_addrs(whoami).legacy_equals(
8167 hb_back_server_messenger->get_myaddrs()) ||
8168 !osdmap->get_hb_front_addrs(whoami).legacy_equals(
8169 hb_front_server_messenger->get_myaddrs())) {
7c673cae
FG
8170 if (!osdmap->is_up(whoami)) {
8171 if (service.is_preparing_to_stop() || service.is_stopping()) {
8172 service.got_stop_ack();
8173 } else {
c07f9fc5
FG
8174 clog->warn() << "Monitor daemon marked osd." << whoami << " down, "
8175 "but it is still running";
8176 clog->debug() << "map e" << osdmap->get_epoch()
8177 << " wrongly marked me down at e"
8178 << osdmap->get_down_at(whoami);
7c673cae 8179 }
9f95a23c
TL
8180 if (monc->monmap.min_mon_release >= ceph_release_t::octopus) {
8181 // note that this is best-effort...
8182 monc->send_mon_message(
8183 new MOSDMarkMeDead(
8184 monc->get_fsid(),
8185 whoami,
8186 osdmap->get_epoch()));
8187 }
11fdf7f2
TL
8188 } else if (!osdmap->get_addrs(whoami).legacy_equals(
8189 client_messenger->get_myaddrs())) {
7c673cae 8190 clog->error() << "map e" << osdmap->get_epoch()
11fdf7f2
TL
8191 << " had wrong client addr (" << osdmap->get_addrs(whoami)
8192 << " != my " << client_messenger->get_myaddrs() << ")";
8193 } else if (!osdmap->get_cluster_addrs(whoami).legacy_equals(
8194 cluster_messenger->get_myaddrs())) {
7c673cae
FG
8195 clog->error() << "map e" << osdmap->get_epoch()
8196 << " had wrong cluster addr ("
11fdf7f2
TL
8197 << osdmap->get_cluster_addrs(whoami)
8198 << " != my " << cluster_messenger->get_myaddrs() << ")";
8199 } else if (!osdmap->get_hb_back_addrs(whoami).legacy_equals(
8200 hb_back_server_messenger->get_myaddrs())) {
7c673cae 8201 clog->error() << "map e" << osdmap->get_epoch()
c07f9fc5 8202 << " had wrong heartbeat back addr ("
11fdf7f2
TL
8203 << osdmap->get_hb_back_addrs(whoami)
8204 << " != my " << hb_back_server_messenger->get_myaddrs()
7c673cae 8205 << ")";
11fdf7f2
TL
8206 } else if (!osdmap->get_hb_front_addrs(whoami).legacy_equals(
8207 hb_front_server_messenger->get_myaddrs())) {
7c673cae 8208 clog->error() << "map e" << osdmap->get_epoch()
c07f9fc5 8209 << " had wrong heartbeat front addr ("
11fdf7f2
TL
8210 << osdmap->get_hb_front_addrs(whoami)
8211 << " != my " << hb_front_server_messenger->get_myaddrs()
7c673cae
FG
8212 << ")";
8213 }
8214
8215 if (!service.is_stopping()) {
8216 epoch_t up_epoch = 0;
8217 epoch_t bind_epoch = osdmap->get_epoch();
8218 service.set_epochs(NULL,&up_epoch, &bind_epoch);
8219 do_restart = true;
8220
8221 //add markdown log
8222 utime_t now = ceph_clock_now();
8223 utime_t grace = utime_t(cct->_conf->osd_max_markdown_period, 0);
8224 osd_markdown_log.push_back(now);
7c673cae 8225 if ((int)osd_markdown_log.size() > cct->_conf->osd_max_markdown_count) {
9f95a23c
TL
8226 derr << __func__ << " marked down "
8227 << osd_markdown_log.size()
8228 << " > osd_max_markdown_count "
8229 << cct->_conf->osd_max_markdown_count
8230 << " in last " << grace << " seconds, shutting down"
8231 << dendl;
7c673cae
FG
8232 do_restart = false;
8233 do_shutdown = true;
8234 }
8235
8236 start_waiting_for_healthy();
8237
8238 set<int> avoid_ports;
8239#if defined(__FreeBSD__)
8240 // prevent FreeBSD from grabbing the client_messenger port during
8241 // rebinding; otherwise the cluster_messenger could also end up connecting
8242 // to the same port
11fdf7f2 8243 client_messenger->get_myaddrs().get_ports(&avoid_ports);
7c673cae 8244#endif
11fdf7f2 8245 cluster_messenger->get_myaddrs().get_ports(&avoid_ports);
7c673cae
FG
8246
8247 int r = cluster_messenger->rebind(avoid_ports);
8248 if (r != 0) {
8249 do_shutdown = true; // FIXME: do_restart?
8250 network_error = true;
9f95a23c
TL
8251 derr << __func__ << " marked down:"
8252 << " rebind cluster_messenger failed" << dendl;
7c673cae
FG
8253 }
8254
9f95a23c
TL
8255 hb_back_server_messenger->mark_down_all();
8256 hb_front_server_messenger->mark_down_all();
7c673cae
FG
8257 hb_front_client_messenger->mark_down_all();
8258 hb_back_client_messenger->mark_down_all();
8259
494da23a 8260 reset_heartbeat_peers(true);
7c673cae
FG
8261 }
8262 }
8263 }
8264
9f95a23c 8265 map_lock.unlock();
7c673cae 8266
11fdf7f2 8267 check_osdmap_features();
7c673cae
FG
8268
8269 // yay!
8270 consume_map();
8271
8272 if (is_active() || is_waiting_for_healthy())
8273 maybe_update_heartbeat_peers();
8274
11fdf7f2 8275 if (is_active()) {
7c673cae
FG
8276 activate_map();
8277 }
8278
31f18b77 8279 if (do_shutdown) {
7c673cae 8280 if (network_error) {
11fdf7f2 8281 cancel_pending_failures();
7c673cae
FG
8282 }
8283 // trigger shutdown in a different thread
8284 dout(0) << __func__ << " shutdown OSD via async signal" << dendl;
8285 queue_async_signal(SIGINT);
8286 }
31f18b77
FG
8287 else if (m->newest_map && m->newest_map > last) {
8288 dout(10) << " msg say newest map is " << m->newest_map
8289 << ", requesting more" << dendl;
8290 osdmap_subscribe(osdmap->get_epoch()+1, false);
8291 }
7c673cae
FG
8292 else if (is_preboot()) {
8293 if (m->get_source().is_mon())
8294 _preboot(m->oldest_map, m->newest_map);
8295 else
8296 start_boot();
8297 }
8298 else if (do_restart)
8299 start_boot();
8300
8301}
8302
11fdf7f2 8303void OSD::check_osdmap_features()
7c673cae
FG
8304{
8305 // adjust required feature bits?
8306
8307 // we have to be a bit careful here, because we are accessing the
8308 // Policy structures without taking any lock. in particular, only
8309 // modify integer values that can safely be read by a racing CPU.
 8310 // since we are only accessing existing Policy structures at their
8311 // current memory location, and setting or clearing bits in integer
8312 // fields, and we are the only writer, this is not a problem.
8313
9f95a23c 8314 const auto osdmap = get_osdmap();
7c673cae
FG
8315 {
8316 Messenger::Policy p = client_messenger->get_default_policy();
8317 uint64_t mask;
8318 uint64_t features = osdmap->get_features(entity_name_t::TYPE_CLIENT, &mask);
8319 if ((p.features_required & mask) != features) {
8320 dout(0) << "crush map has features " << features
8321 << ", adjusting msgr requires for clients" << dendl;
8322 p.features_required = (p.features_required & ~mask) | features;
8323 client_messenger->set_default_policy(p);
8324 }
8325 }
8326 {
8327 Messenger::Policy p = client_messenger->get_policy(entity_name_t::TYPE_MON);
8328 uint64_t mask;
8329 uint64_t features = osdmap->get_features(entity_name_t::TYPE_MON, &mask);
8330 if ((p.features_required & mask) != features) {
8331 dout(0) << "crush map has features " << features
8332 << " was " << p.features_required
8333 << ", adjusting msgr requires for mons" << dendl;
8334 p.features_required = (p.features_required & ~mask) | features;
8335 client_messenger->set_policy(entity_name_t::TYPE_MON, p);
8336 }
8337 }
8338 {
8339 Messenger::Policy p = cluster_messenger->get_policy(entity_name_t::TYPE_OSD);
8340 uint64_t mask;
8341 uint64_t features = osdmap->get_features(entity_name_t::TYPE_OSD, &mask);
8342
8343 if ((p.features_required & mask) != features) {
8344 dout(0) << "crush map has features " << features
8345 << ", adjusting msgr requires for osds" << dendl;
8346 p.features_required = (p.features_required & ~mask) | features;
8347 cluster_messenger->set_policy(entity_name_t::TYPE_OSD, p);
8348 }
8349
11fdf7f2 8350 if (!superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_SHARDS)) {
7c673cae
FG
8351 dout(0) << __func__ << " enabling on-disk ERASURE CODES compat feature" << dendl;
8352 superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
8353 ObjectStore::Transaction t;
8354 write_superblock(t);
11fdf7f2
TL
8355 int err = store->queue_transaction(service.meta_ch, std::move(t), NULL);
8356 ceph_assert(err == 0);
7c673cae
FG
8357 }
8358 }
11fdf7f2 8359
9f95a23c
TL
8360 if (osdmap->require_osd_release < ceph_release_t::nautilus) {
8361 hb_front_server_messenger->set_require_authorizer(false);
8362 hb_back_server_messenger->set_require_authorizer(false);
8363 } else {
8364 hb_front_server_messenger->set_require_authorizer(true);
8365 hb_back_server_messenger->set_require_authorizer(true);
11fdf7f2
TL
8366 }
8367
8368 if (osdmap->require_osd_release != last_require_osd_release) {
8369 dout(1) << __func__ << " require_osd_release " << last_require_osd_release
8370 << " -> " << to_string(osdmap->require_osd_release) << dendl;
8371 store->write_meta("require_osd_release",
8372 stringify((int)osdmap->require_osd_release));
8373 last_require_osd_release = osdmap->require_osd_release;
8374 }
7c673cae
FG
8375}
8376
11fdf7f2
TL
8377struct C_FinishSplits : public Context {
8378 OSD *osd;
8379 set<PGRef> pgs;
8380 C_FinishSplits(OSD *osd, const set<PGRef> &in)
8381 : osd(osd), pgs(in) {}
8382 void finish(int r) override {
8383 osd->_finish_splits(pgs);
8384 }
8385};
8386
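// Called via C_FinishSplits once a split transaction has been applied:
// initialize each child PG, queue a null peering event at its current
// epoch, and register it with its OSDShard so queued work can proceed.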
8387void OSD::_finish_splits(set<PGRef>& pgs)
7c673cae 8388{
11fdf7f2
TL
8389 dout(10) << __func__ << " " << pgs << dendl;
8390 if (is_stopping())
8391 return;
11fdf7f2
TL
8392 for (set<PGRef>::iterator i = pgs.begin();
8393 i != pgs.end();
8394 ++i) {
8395 PG *pg = i->get();
7c673cae 8396
9f95a23c 8397 PeeringCtx rctx = create_context();
11fdf7f2
TL
8398 pg->lock();
8399 dout(10) << __func__ << " " << *pg << dendl;
8400 epoch_t e = pg->get_osdmap_epoch();
9f95a23c 8401 pg->handle_initialize(rctx);
11fdf7f2 8402 pg->queue_null(e, e);
9f95a23c 8403 dispatch_context(rctx, pg, service.get_osdmap());
11fdf7f2 8404 pg->unlock();
7c673cae 8405
11fdf7f2
TL
8406 unsigned shard_index = pg->pg_id.hash_to_shard(num_shards);
8407 shards[shard_index]->register_and_wake_split_child(pg);
7c673cae 8408 }
11fdf7f2
TL
8409};
8410
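// Record a merge-source PG that is waiting for its merge target.  Returns
// true once all 'need' sources for 'target' have checked in and the merge
// can proceed.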
8411bool OSD::add_merge_waiter(OSDMapRef nextmap, spg_t target, PGRef src,
8412 unsigned need)
8413{
8414 std::lock_guard l(merge_lock);
8415 auto& p = merge_waiters[nextmap->get_epoch()][target];
8416 p[src->pg_id] = src;
8417 dout(10) << __func__ << " added merge_waiter " << src->pg_id
8418 << " for " << target << ", have " << p.size() << "/" << need
8419 << dendl;
8420 return p.size() == need;
8421}
8422
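// Advance a locked PG one osdmap epoch at a time until it reaches
// osd_epoch, handling pg_num changes along the way: splits spawn child
// PGs, merges either hand this PG off as a merge source or absorb waiting
// sources into the target.  Returns false when the PG was handed off or
// must still wait for its merge sources; true once it has caught up.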
8423bool OSD::advance_pg(
8424 epoch_t osd_epoch,
8425 PG *pg,
8426 ThreadPool::TPHandle &handle,
9f95a23c 8427 PeeringCtx &rctx)
11fdf7f2
TL
8428{
8429 if (osd_epoch <= pg->get_osdmap_epoch()) {
8430 return true;
8431 }
8432 ceph_assert(pg->is_locked());
8433 OSDMapRef lastmap = pg->get_osdmap();
8434 ceph_assert(lastmap->get_epoch() < osd_epoch);
8435 set<PGRef> new_pgs; // any split children
8436 bool ret = true;
8437
8438 unsigned old_pg_num = lastmap->have_pg_pool(pg->pg_id.pool()) ?
8439 lastmap->get_pg_num(pg->pg_id.pool()) : 0;
8440 for (epoch_t next_epoch = pg->get_osdmap_epoch() + 1;
8441 next_epoch <= osd_epoch;
7c673cae
FG
8442 ++next_epoch) {
8443 OSDMapRef nextmap = service.try_get_map(next_epoch);
8444 if (!nextmap) {
8445 dout(20) << __func__ << " missing map " << next_epoch << dendl;
7c673cae
FG
8446 continue;
8447 }
8448
11fdf7f2
TL
8449 unsigned new_pg_num =
8450 (old_pg_num && nextmap->have_pg_pool(pg->pg_id.pool())) ?
8451 nextmap->get_pg_num(pg->pg_id.pool()) : 0;
8452 if (old_pg_num && new_pg_num && old_pg_num != new_pg_num) {
8453 // check for merge
8454 if (nextmap->have_pg_pool(pg->pg_id.pool())) {
8455 spg_t parent;
8456 if (pg->pg_id.is_merge_source(
8457 old_pg_num,
8458 new_pg_num,
8459 &parent)) {
8460 // we are merge source
8461 PGRef spg = pg; // carry a ref
8462 dout(1) << __func__ << " " << pg->pg_id
8463 << " is merge source, target is " << parent
8464 << dendl;
8465 pg->write_if_dirty(rctx);
9f95a23c
TL
8466 if (!new_pgs.empty()) {
8467 rctx.transaction.register_on_applied(new C_FinishSplits(this,
8468 new_pgs));
8469 new_pgs.clear();
8470 }
8471 dispatch_context(rctx, pg, pg->get_osdmap(), &handle);
11fdf7f2 8472 pg->ch->flush();
eafe8130
TL
8473 // release backoffs explicitly, since the on_shutdown path
8474 // aggressively tears down backoff state.
8475 if (pg->is_primary()) {
8476 pg->release_pg_backoffs();
8477 }
11fdf7f2
TL
8478 pg->on_shutdown();
8479 OSDShard *sdata = pg->osd_shard;
8480 {
8481 std::lock_guard l(sdata->shard_lock);
8482 if (pg->pg_slot) {
8483 sdata->_detach_pg(pg->pg_slot);
8484 // update pg count now since we might not get an osdmap
8485 // any time soon.
8486 if (pg->is_primary())
8487 logger->dec(l_osd_pg_primary);
9f95a23c
TL
8488 else if (pg->is_nonprimary())
8489 logger->dec(l_osd_pg_replica); // misnomer
11fdf7f2
TL
8490 else
8491 logger->dec(l_osd_pg_stray);
8492 }
8493 }
8494 pg->unlock();
8495
8496 set<spg_t> children;
8497 parent.is_split(new_pg_num, old_pg_num, &children);
8498 if (add_merge_waiter(nextmap, parent, pg, children.size())) {
8499 enqueue_peering_evt(
8500 parent,
8501 PGPeeringEventRef(
8502 std::make_shared<PGPeeringEvent>(
8503 nextmap->get_epoch(),
8504 nextmap->get_epoch(),
8505 NullEvt())));
8506 }
8507 ret = false;
8508 goto out;
8509 } else if (pg->pg_id.is_merge_target(old_pg_num, new_pg_num)) {
8510 // we are merge target
8511 set<spg_t> children;
8512 pg->pg_id.is_split(new_pg_num, old_pg_num, &children);
8513 dout(20) << __func__ << " " << pg->pg_id
8514 << " is merge target, sources are " << children
8515 << dendl;
8516 map<spg_t,PGRef> sources;
8517 {
8518 std::lock_guard l(merge_lock);
8519 auto& s = merge_waiters[nextmap->get_epoch()][pg->pg_id];
8520 unsigned need = children.size();
8521 dout(20) << __func__ << " have " << s.size() << "/"
8522 << need << dendl;
8523 if (s.size() == need) {
8524 sources.swap(s);
8525 merge_waiters[nextmap->get_epoch()].erase(pg->pg_id);
8526 if (merge_waiters[nextmap->get_epoch()].empty()) {
8527 merge_waiters.erase(nextmap->get_epoch());
8528 }
8529 }
8530 }
8531 if (!sources.empty()) {
8532 unsigned new_pg_num = nextmap->get_pg_num(pg->pg_id.pool());
8533 unsigned split_bits = pg->pg_id.get_split_bits(new_pg_num);
8534 dout(1) << __func__ << " merging " << pg->pg_id << dendl;
8535 pg->merge_from(
8536 sources, rctx, split_bits,
8537 nextmap->get_pg_pool(
8538 pg->pg_id.pool())->last_pg_merge_meta);
8539 pg->pg_slot->waiting_for_merge_epoch = 0;
8540 } else {
8541 dout(20) << __func__ << " not ready to merge yet" << dendl;
8542 pg->write_if_dirty(rctx);
9f95a23c
TL
8543 if (!new_pgs.empty()) {
8544 rctx.transaction.register_on_applied(new C_FinishSplits(this,
8545 new_pgs));
8546 new_pgs.clear();
8547 }
8548 dispatch_context(rctx, pg, pg->get_osdmap(), &handle);
11fdf7f2
TL
8549 pg->unlock();
8550 // kick source(s) to get them ready
8551 for (auto& i : children) {
8552 dout(20) << __func__ << " kicking source " << i << dendl;
8553 enqueue_peering_evt(
8554 i,
8555 PGPeeringEventRef(
8556 std::make_shared<PGPeeringEvent>(
8557 nextmap->get_epoch(),
8558 nextmap->get_epoch(),
8559 NullEvt())));
8560 }
8561 ret = false;
8562 goto out;
8563 }
8564 }
8565 }
8566 }
8567
7c673cae
FG
8568 vector<int> newup, newacting;
8569 int up_primary, acting_primary;
8570 nextmap->pg_to_up_acting_osds(
11fdf7f2 8571 pg->pg_id.pgid,
7c673cae
FG
8572 &newup, &up_primary,
8573 &newacting, &acting_primary);
8574 pg->handle_advance_map(
8575 nextmap, lastmap, newup, up_primary,
8576 newacting, acting_primary, rctx);
8577
494da23a
TL
8578 auto oldpool = lastmap->get_pools().find(pg->pg_id.pool());
8579 auto newpool = nextmap->get_pools().find(pg->pg_id.pool());
8580 if (oldpool != lastmap->get_pools().end()
8581 && newpool != nextmap->get_pools().end()) {
8582 dout(20) << __func__
8583 << " new pool opts " << newpool->second.opts
8584 << " old pool opts " << oldpool->second.opts
8585 << dendl;
8586
8587 double old_min_interval = 0, new_min_interval = 0;
8588 oldpool->second.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &old_min_interval);
8589 newpool->second.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &new_min_interval);
8590
8591 double old_max_interval = 0, new_max_interval = 0;
8592 oldpool->second.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &old_max_interval);
8593 newpool->second.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &new_max_interval);
8594
 8595 // Assume that if an interval changes from set to unset or vice versa, the actual config
8596 // is different. Keep it simple even if it is possible to call resched_all_scrub()
8597 // unnecessarily.
8598 if (old_min_interval != new_min_interval || old_max_interval != new_max_interval) {
8599 pg->on_info_history_change();
8600 }
8601 }
8602
11fdf7f2
TL
8603 if (new_pg_num && old_pg_num != new_pg_num) {
8604 // check for split
8605 set<spg_t> children;
8606 if (pg->pg_id.is_split(
8607 old_pg_num,
8608 new_pg_num,
8609 &children)) {
8610 split_pgs(
8611 pg, children, &new_pgs, lastmap, nextmap,
8612 rctx);
8613 }
7c673cae
FG
8614 }
8615
8616 lastmap = nextmap;
11fdf7f2 8617 old_pg_num = new_pg_num;
7c673cae
FG
8618 handle.reset_tp_timeout();
8619 }
7c673cae 8620 pg->handle_activate_map(rctx);
11fdf7f2
TL
8621
8622 ret = true;
8623 out:
8624 if (!new_pgs.empty()) {
9f95a23c 8625 rctx.transaction.register_on_applied(new C_FinishSplits(this, new_pgs));
7c673cae 8626 }
11fdf7f2 8627 return ret;
7c673cae
FG
8628}
8629
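// Publish the current OSDMap to the rest of the OSD: prime pending splits
// and merges on every shard, push the map down to the shard queues,
// refresh the pg/primary/replica/stray counters, and queue a null peering
// event to each PG so it advances itself to the new epoch.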
8630void OSD::consume_map()
8631{
9f95a23c
TL
8632 ceph_assert(ceph_mutex_is_locked(osd_lock));
8633 auto osdmap = get_osdmap();
7c673cae
FG
8634 dout(7) << "consume_map version " << osdmap->get_epoch() << dendl;
8635
3efd9988
FG
8636 /** make sure the cluster is speaking in SORTBITWISE, because we don't
8637 * speak the older sorting version any more. Be careful not to force
8638 * a shutdown if we are merely processing old maps, though.
8639 */
8640 if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE) && is_active()) {
8641 derr << __func__ << " SORTBITWISE flag is not set" << dendl;
8642 ceph_abort();
8643 }
8644
11fdf7f2
TL
8645 service.pre_publish_map(osdmap);
8646 service.await_reserved_maps();
8647 service.publish_map(osdmap);
7c673cae 8648
11fdf7f2
TL
8649 // prime splits and merges
8650 set<pair<spg_t,epoch_t>> newly_split; // splits, and when
8651 set<pair<spg_t,epoch_t>> merge_pgs; // merge participants, and when
8652 for (auto& shard : shards) {
8653 shard->identify_splits_and_merges(osdmap, &newly_split, &merge_pgs);
8654 }
8655 if (!newly_split.empty()) {
8656 for (auto& shard : shards) {
8657 shard->prime_splits(osdmap, &newly_split);
8658 }
8659 ceph_assert(newly_split.empty());
8660 }
7c673cae 8661
11fdf7f2
TL
8662 // prune sent_ready_to_merge
8663 service.prune_sent_ready_to_merge(osdmap);
7c673cae 8664
11fdf7f2
TL
8665 // FIXME, maybe: We could race against an incoming peering message
8666 // that instantiates a merge PG after identify_merges() below and
8667 // never set up its peer to complete the merge. An OSD restart
8668 // would clear it up. This is a hard race to resolve,
8669 // extraordinarily rare (we only merge PGs that are stable and
8670 // clean, so it'd have to be an imported PG to an OSD with a
8671 // slightly stale OSDMap...), so I'm ignoring it for now. We plan to
 8672 // replace all of this with seastar-based code soon anyway.
8673 if (!merge_pgs.empty()) {
8674 // mark the pgs we already have, or create new and empty merge
8675 // participants for those we are missing. do this all under the
8676 // shard lock so we don't have to worry about racing pg creates
8677 // via _process.
8678 for (auto& shard : shards) {
8679 shard->prime_merges(osdmap, &merge_pgs);
7c673cae 8680 }
11fdf7f2
TL
8681 ceph_assert(merge_pgs.empty());
8682 }
8683
8684 service.prune_pg_created();
8685
8686 unsigned pushes_to_free = 0;
8687 for (auto& shard : shards) {
8688 shard->consume_map(osdmap, &pushes_to_free);
8689 }
8690
8691 vector<spg_t> pgids;
8692 _get_pgids(&pgids);
8693
8694 // count (FIXME, probably during seastar rewrite)
8695 int num_pg_primary = 0, num_pg_replica = 0, num_pg_stray = 0;
8696 vector<PGRef> pgs;
8697 _get_pgs(&pgs);
8698 for (auto& pg : pgs) {
8699 // FIXME (probably during seastar rewrite): this is lockless and
8700 // racy, but we don't want to take pg lock here.
8701 if (pg->is_primary())
8702 num_pg_primary++;
9f95a23c
TL
8703 else if (pg->is_nonprimary())
8704 num_pg_replica++; // misnomer
11fdf7f2
TL
8705 else
8706 num_pg_stray++;
8707 }
3efd9988 8708
11fdf7f2
TL
8709 {
8710 // FIXME (as part of seastar rewrite): move to OSDShard
8711 std::lock_guard l(pending_creates_lock);
8712 for (auto pg = pending_creates_from_osd.begin();
8713 pg != pending_creates_from_osd.end();) {
9f95a23c 8714 if (osdmap->get_pg_acting_role(pg->first, whoami) < 0) {
11fdf7f2
TL
8715 dout(10) << __func__ << " pg " << pg->first << " doesn't map here, "
8716 << "discarding pending_create_from_osd" << dendl;
3efd9988
FG
8717 pg = pending_creates_from_osd.erase(pg);
8718 } else {
8719 ++pg;
8720 }
8721 }
7c673cae
FG
8722 }
8723
7c673cae
FG
8724 service.maybe_inject_dispatch_delay();
8725
8726 dispatch_sessions_waiting_on_map();
8727
8728 service.maybe_inject_dispatch_delay();
8729
11fdf7f2 8730 service.release_reserved_pushes(pushes_to_free);
7c673cae 8731
11fdf7f2
TL
8732 // queue null events to push maps down to individual PGs
8733 for (auto pgid : pgids) {
8734 enqueue_peering_evt(
8735 pgid,
8736 PGPeeringEventRef(
8737 std::make_shared<PGPeeringEvent>(
8738 osdmap->get_epoch(),
8739 osdmap->get_epoch(),
8740 NullEvt())));
7c673cae 8741 }
11fdf7f2 8742 logger->set(l_osd_pg, pgids.size());
7c673cae
FG
8743 logger->set(l_osd_pg_primary, num_pg_primary);
8744 logger->set(l_osd_pg_replica, num_pg_replica);
8745 logger->set(l_osd_pg_stray, num_pg_stray);
8746}
8747
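// React to the newly published map while active: pause or unpause
// recovery to match the NORECOVER flag and wake anything waiting for a
// new osdmap.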
8748void OSD::activate_map()
8749{
9f95a23c
TL
8750 ceph_assert(ceph_mutex_is_locked(osd_lock));
8751 auto osdmap = get_osdmap();
7c673cae
FG
8752
8753 dout(7) << "activate_map version " << osdmap->get_epoch() << dendl;
8754
7c673cae
FG
8755 // norecover?
8756 if (osdmap->test_flag(CEPH_OSDMAP_NORECOVER)) {
8757 if (!service.recovery_is_paused()) {
8758 dout(1) << "pausing recovery (NORECOVER flag set)" << dendl;
8759 service.pause_recovery();
8760 }
8761 } else {
8762 if (service.recovery_is_paused()) {
8763 dout(1) << "unpausing recovery (NORECOVER flag unset)" << dendl;
8764 service.unpause_recovery();
8765 }
8766 }
8767
8768 service.activate_map();
8769
8770 // process waiters
8771 take_waiters(waiting_for_osdmap);
8772}
8773
8774bool OSD::require_mon_peer(const Message *m)
8775{
8776 if (!m->get_connection()->peer_is_mon()) {
8777 dout(0) << "require_mon_peer received from non-mon "
8778 << m->get_connection()->get_peer_addr()
8779 << " " << *m << dendl;
8780 return false;
8781 }
8782 return true;
8783}
8784
8785bool OSD::require_mon_or_mgr_peer(const Message *m)
8786{
8787 if (!m->get_connection()->peer_is_mon() &&
8788 !m->get_connection()->peer_is_mgr()) {
8789 dout(0) << "require_mon_or_mgr_peer received from non-mon, non-mgr "
8790 << m->get_connection()->get_peer_addr()
8791 << " " << *m << dendl;
8792 return false;
8793 }
8794 return true;
8795}
8796
8797bool OSD::require_osd_peer(const Message *m)
8798{
8799 if (!m->get_connection()->peer_is_osd()) {
8800 dout(0) << "require_osd_peer received from non-osd "
8801 << m->get_connection()->get_peer_addr()
8802 << " " << *m << dendl;
8803 return false;
8804 }
8805 return true;
8806}
8807
8808bool OSD::require_self_aliveness(const Message *m, epoch_t epoch)
8809{
8810 epoch_t up_epoch = service.get_up_epoch();
8811 if (epoch < up_epoch) {
8812 dout(7) << "from pre-up epoch " << epoch << " < " << up_epoch << dendl;
8813 return false;
8814 }
8815
8816 if (!is_active()) {
8817 dout(7) << "still in boot state, dropping message " << *m << dendl;
8818 return false;
8819 }
8820
8821 return true;
8822}
8823
9f95a23c 8824bool OSD::require_same_peer_instance(const Message *m, const OSDMapRef& map,
7c673cae
FG
8825 bool is_fast_dispatch)
8826{
8827 int from = m->get_source().num();
8828
8829 if (map->is_down(from) ||
11fdf7f2 8830 (map->get_cluster_addrs(from) != m->get_source_addrs())) {
7c673cae
FG
8831 dout(5) << "from dead osd." << from << ", marking down, "
8832 << " msg was " << m->get_source_inst().addr
11fdf7f2
TL
8833 << " expected "
8834 << (map->is_up(from) ?
8835 map->get_cluster_addrs(from) : entity_addrvec_t())
7c673cae
FG
8836 << dendl;
8837 ConnectionRef con = m->get_connection();
8838 con->mark_down();
9f95a23c 8839 if (auto s = ceph::ref_cast<Session>(con->get_priv()); s) {
7c673cae 8840 if (!is_fast_dispatch)
9f95a23c 8841 s->session_dispatch_lock.lock();
7c673cae 8842 clear_session_waiting_on_map(s);
11fdf7f2
TL
8843 con->set_priv(nullptr); // break ref <-> session cycle, if any
8844 s->con.reset();
7c673cae 8845 if (!is_fast_dispatch)
9f95a23c 8846 s->session_dispatch_lock.unlock();
7c673cae
FG
8847 }
8848 return false;
8849 }
8850 return true;
8851}
8852
8853
8854/*
8855 * require that we have same (or newer) map, and that
8856 * the source is the pg primary.
8857 */
8858bool OSD::require_same_or_newer_map(OpRequestRef& op, epoch_t epoch,
8859 bool is_fast_dispatch)
8860{
8861 const Message *m = op->get_req();
9f95a23c 8862 const auto osdmap = get_osdmap();
7c673cae
FG
8863 dout(15) << "require_same_or_newer_map " << epoch
8864 << " (i am " << osdmap->get_epoch() << ") " << m << dendl;
8865
9f95a23c 8866 ceph_assert(ceph_mutex_is_locked(osd_lock));
7c673cae
FG
8867
8868 // do they have a newer map?
8869 if (epoch > osdmap->get_epoch()) {
8870 dout(7) << "waiting for newer map epoch " << epoch
8871 << " > my " << osdmap->get_epoch() << " with " << m << dendl;
8872 wait_for_new_map(op);
8873 return false;
8874 }
8875
8876 if (!require_self_aliveness(op->get_req(), epoch)) {
8877 return false;
8878 }
8879
8880 // ok, our map is same or newer.. do they still exist?
8881 if (m->get_connection()->get_messenger() == cluster_messenger &&
8882 !require_same_peer_instance(op->get_req(), osdmap, is_fast_dispatch)) {
8883 return false;
8884 }
8885
8886 return true;
8887}
8888
8889
8890
8891
8892
8893// ----------------------------------------
8894// pg creation
8895
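// Materialize the child PGs produced by a pg_num increase: for each child
// pgid, create the PG object and its collection, split the parent's
// collection and in-memory state into it, and divide the parent's stats
// across the children.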
8896void OSD::split_pgs(
8897 PG *parent,
31f18b77 8898 const set<spg_t> &childpgids, set<PGRef> *out_pgs,
7c673cae
FG
8899 OSDMapRef curmap,
8900 OSDMapRef nextmap,
9f95a23c 8901 PeeringCtx &rctx)
7c673cae 8902{
11fdf7f2
TL
8903 unsigned pg_num = nextmap->get_pg_num(parent->pg_id.pool());
8904 parent->update_snap_mapper_bits(parent->get_pgid().get_split_bits(pg_num));
7c673cae 8905
11fdf7f2
TL
8906 vector<object_stat_sum_t> updated_stats;
8907 parent->start_split_stats(childpgids, &updated_stats);
7c673cae
FG
8908
8909 vector<object_stat_sum_t>::iterator stat_iter = updated_stats.begin();
8910 for (set<spg_t>::const_iterator i = childpgids.begin();
8911 i != childpgids.end();
8912 ++i, ++stat_iter) {
11fdf7f2
TL
8913 ceph_assert(stat_iter != updated_stats.end());
8914 dout(10) << __func__ << " splitting " << *parent << " into " << *i << dendl;
7c673cae
FG
8915 PG* child = _make_pg(nextmap, *i);
8916 child->lock(true);
8917 out_pgs->insert(child);
11fdf7f2 8918 child->ch = store->create_new_collection(child->coll);
7c673cae 8919
11fdf7f2
TL
8920 {
8921 uint32_t shard_index = i->hash_to_shard(shards.size());
8922 assert(NULL != shards[shard_index]);
8923 store->set_collection_commit_queue(child->coll, &(shards[shard_index]->context_queue));
8924 }
7c673cae 8925
11fdf7f2
TL
8926 unsigned split_bits = i->get_split_bits(pg_num);
8927 dout(10) << " pg_num is " << pg_num
8928 << ", m_seed " << i->ps()
8929 << ", split_bits is " << split_bits << dendl;
7c673cae
FG
8930 parent->split_colls(
8931 *i,
8932 split_bits,
8933 i->ps(),
11fdf7f2 8934 &child->get_pool().info,
9f95a23c 8935 rctx.transaction);
7c673cae
FG
8936 parent->split_into(
8937 i->pgid,
8938 child,
8939 split_bits);
7c673cae 8940
92f5a8d4
TL
8941 child->init_collection_pool_opts();
8942
9f95a23c 8943 child->finish_split_stats(*stat_iter, rctx.transaction);
7c673cae
FG
8944 child->unlock();
8945 }
11fdf7f2 8946 ceph_assert(stat_iter != updated_stats.end());
9f95a23c 8947 parent->finish_split_stats(*stat_iter, rctx.transaction);
7c673cae
FG
8948}
8949
8950/*
8951 * holding osd_lock
8952 */
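// Legacy pg-create path (MOSDPGCreate, sent by monitors up through mimic):
// for each pg this OSD is still the acting primary of, build its initial
// history and past_intervals and queue a creating null peering event.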
8953void OSD::handle_pg_create(OpRequestRef op)
8954{
9f95a23c
TL
8955 // NOTE: this can be removed in P release (mimic is the last version to
8956 // send MOSDPGCreate messages).
8957
8958 auto m = op->get_req<MOSDPGCreate>();
11fdf7f2 8959 ceph_assert(m->get_type() == MSG_OSD_PG_CREATE);
7c673cae
FG
8960
8961 dout(10) << "handle_pg_create " << *m << dendl;
8962
8963 if (!require_mon_peer(op->get_req())) {
8964 return;
8965 }
8966
8967 if (!require_same_or_newer_map(op, m->epoch, false))
8968 return;
8969
8970 op->mark_started();
8971
9f95a23c 8972 const auto osdmap = get_osdmap();
7c673cae
FG
8973 map<pg_t,utime_t>::const_iterator ci = m->ctimes.begin();
8974 for (map<pg_t,pg_create_t>::const_iterator p = m->mkpg.begin();
8975 p != m->mkpg.end();
8976 ++p, ++ci) {
11fdf7f2 8977 ceph_assert(ci != m->ctimes.end() && ci->first == p->first);
7c673cae
FG
8978 epoch_t created = p->second.created;
8979 if (p->second.split_bits) // Skip split pgs
8980 continue;
8981 pg_t on = p->first;
8982
7c673cae
FG
8983 if (!osdmap->have_pg_pool(on.pool())) {
8984 dout(20) << "ignoring pg on deleted pool " << on << dendl;
8985 continue;
8986 }
8987
8988 dout(20) << "mkpg " << on << " e" << created << "@" << ci->second << dendl;
8989
9f95a23c
TL
8990 spg_t pgid;
8991 bool mapped = osdmap->get_primary_shard(on, &pgid);
8992 ceph_assert(mapped);
8993
7c673cae
FG
8994 // is it still ours?
8995 vector<int> up, acting;
8996 int up_primary = -1;
8997 int acting_primary = -1;
8998 osdmap->pg_to_up_acting_osds(on, &up, &up_primary, &acting, &acting_primary);
9f95a23c 8999 int role = osdmap->calc_pg_role(pg_shard_t(whoami, pgid.shard), acting);
7c673cae
FG
9000
9001 if (acting_primary != whoami) {
9002 dout(10) << "mkpg " << on << " not acting_primary (" << acting_primary
9003 << "), my role=" << role << ", skipping" << dendl;
9004 continue;
9005 }
9006
7c673cae 9007
11fdf7f2 9008 PastIntervals pi;
7c673cae
FG
9009 pg_history_t history;
9010 build_initial_pg_history(pgid, created, ci->second, &history, &pi);
9011
11fdf7f2
TL
9012 // The mon won't resend unless the primary changed, so we ignore
9013 // same_interval_since. We'll pass this history with the current
9014 // epoch as the event.
7c673cae
FG
9015 if (history.same_primary_since > m->epoch) {
9016 dout(10) << __func__ << ": got obsolete pg create on pgid "
9017 << pgid << " from epoch " << m->epoch
9018 << ", primary changed in " << history.same_primary_since
9019 << dendl;
9020 continue;
9021 }
11fdf7f2
TL
9022 enqueue_peering_evt(
9023 pgid,
9024 PGPeeringEventRef(
9025 std::make_shared<PGPeeringEvent>(
9026 osdmap->get_epoch(),
9027 osdmap->get_epoch(),
9028 NullEvt(),
9029 true,
9030 new PGCreateInfo(
9031 pgid,
9032 osdmap->get_epoch(),
9033 history,
9034 pi,
9035 true)
9036 )));
7c673cae 9037 }
7c673cae 9038
3efd9988 9039 {
11fdf7f2 9040 std::lock_guard l(pending_creates_lock);
3efd9988
FG
9041 if (pending_creates_from_mon == 0) {
9042 last_pg_create_epoch = m->epoch;
9043 }
9044 }
11fdf7f2 9045
7c673cae
FG
9046 maybe_update_heartbeat_peers();
9047}
9048
9049
9050// ----------------------------------------
9051// peering and recovery
9052
9f95a23c 9053PeeringCtx OSD::create_context()
7c673cae 9054{
9f95a23c 9055 return PeeringCtx(get_osdmap()->require_osd_release);
7c673cae
FG
9056}
9057
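// Flush a PeeringCtx: send its queued peering messages to peers that are
// still up (sharing our osdmap with them first), then queue any
// accumulated transaction on the PG's collection.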
9f95a23c 9058void OSD::dispatch_context(PeeringCtx &ctx, PG *pg, OSDMapRef curmap,
7c673cae
FG
9059 ThreadPool::TPHandle *handle)
9060{
11fdf7f2
TL
9061 if (!service.get_osdmap()->is_up(whoami)) {
9062 dout(20) << __func__ << " not up in osdmap" << dendl;
9063 } else if (!is_active()) {
9064 dout(20) << __func__ << " not active" << dendl;
9065 } else {
9f95a23c
TL
9066 for (auto& [osd, ls] : ctx.message_map) {
9067 if (!curmap->is_up(osd)) {
9068 dout(20) << __func__ << " skipping down osd." << osd << dendl;
9069 continue;
9070 }
9071 ConnectionRef con = service.get_con_osd_cluster(
9072 osd, curmap->get_epoch());
9073 if (!con) {
9074 dout(20) << __func__ << " skipping osd." << osd << " (NULL con)"
9075 << dendl;
9076 continue;
9077 }
9078 service.maybe_share_map(con.get(), curmap);
9079 for (auto m : ls) {
9080 con->send_message2(m);
9081 }
9082 ls.clear();
9083 }
7c673cae 9084 }
9f95a23c 9085 if ((!ctx.transaction.empty() || ctx.transaction.has_contexts()) && pg) {
7c673cae 9086 int tr = store->queue_transaction(
11fdf7f2 9087 pg->ch,
9f95a23c 9088 std::move(ctx.transaction), TrackedOpRef(),
7c673cae 9089 handle);
11fdf7f2 9090 ceph_assert(tr == 0);
7c673cae 9091 }
7c673cae
FG
9092}
9093
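// Fast-dispatch handler for MOSDPGCreate2, the newer pg-create message.
// Entries that carry pg_extra include the initial history and
// past_intervals; entries without it fall back to a bare history stamped
// with the creation time.  past_intervals extending beyond the message
// epoch are rejected with a clog error.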
11fdf7f2 9094void OSD::handle_fast_pg_create(MOSDPGCreate2 *m)
7c673cae 9095{
11fdf7f2
TL
9096 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9097 if (!require_mon_peer(m)) {
9098 m->put();
7c673cae 9099 return;
7c673cae 9100 }
11fdf7f2
TL
9101 for (auto& p : m->pgs) {
9102 spg_t pgid = p.first;
9103 epoch_t created = p.second.first;
9104 utime_t created_stamp = p.second.second;
9f95a23c
TL
9105 auto q = m->pg_extra.find(pgid);
9106 if (q == m->pg_extra.end()) {
9107 dout(20) << __func__ << " " << pgid << " e" << created
9108 << "@" << created_stamp
9109 << " (no history or past_intervals)" << dendl;
9110 // pre-octopus ... no pg history. this can be removed in Q release.
9111 enqueue_peering_evt(
9112 pgid,
9113 PGPeeringEventRef(
9114 std::make_shared<PGPeeringEvent>(
9115 m->epoch,
9116 m->epoch,
9117 NullEvt(),
9118 true,
9119 new PGCreateInfo(
9120 pgid,
9121 created,
9122 pg_history_t(created, created_stamp),
9123 PastIntervals(),
9124 true)
9125 )));
9126 } else {
9127 dout(20) << __func__ << " " << pgid << " e" << created
9128 << "@" << created_stamp
9129 << " history " << q->second.first
9130 << " pi " << q->second.second << dendl;
9131 if (!q->second.second.empty() &&
9132 m->epoch < q->second.second.get_bounds().second) {
9133 clog->error() << "got pg_create on " << pgid << " epoch " << m->epoch
9134 << " and unmatched past_intervals " << q->second.second
9135 << " (history " << q->second.first << ")";
9136 } else {
9137 enqueue_peering_evt(
9138 pgid,
9139 PGPeeringEventRef(
9140 std::make_shared<PGPeeringEvent>(
9141 m->epoch,
9142 m->epoch,
9143 NullEvt(),
9144 true,
9145 new PGCreateInfo(
9146 pgid,
9147 m->epoch,
9148 q->second.first,
9149 q->second.second,
9150 true)
9151 )));
9152 }
9153 }
11fdf7f2 9154 }
7c673cae 9155
11fdf7f2
TL
9156 {
9157 std::lock_guard l(pending_creates_lock);
9158 if (pending_creates_from_mon == 0) {
9159 last_pg_create_epoch = m->epoch;
9160 }
7c673cae
FG
9161 }
9162
11fdf7f2 9163 m->put();
7c673cae
FG
9164}
9165
11fdf7f2 9166void OSD::handle_fast_pg_query(MOSDPGQuery *m)
7c673cae 9167{
11fdf7f2
TL
9168 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9169 if (!require_osd_peer(m)) {
9170 m->put();
7c673cae 9171 return;
11fdf7f2 9172 }
7c673cae 9173 int from = m->get_source().num();
11fdf7f2
TL
9174 for (auto& p : m->pg_list) {
9175 enqueue_peering_evt(
9176 p.first,
9177 PGPeeringEventRef(
9178 std::make_shared<PGPeeringEvent>(
9179 p.second.epoch_sent, p.second.epoch_sent,
9180 MQuery(
9181 p.first,
9182 pg_shard_t(from, p.second.from),
9183 p.second,
9184 p.second.epoch_sent),
9185 false))
7c673cae
FG
9186 );
9187 }
11fdf7f2 9188 m->put();
7c673cae
FG
9189}
9190
11fdf7f2 9191void OSD::handle_fast_pg_notify(MOSDPGNotify* m)
7c673cae 9192{
11fdf7f2
TL
9193 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9194 if (!require_osd_peer(m)) {
9195 m->put();
7c673cae
FG
9196 return;
9197 }
11fdf7f2
TL
9198 int from = m->get_source().num();
9199 for (auto& p : m->get_pg_list()) {
9f95a23c 9200 spg_t pgid(p.info.pgid.pgid, p.to);
11fdf7f2
TL
9201 enqueue_peering_evt(
9202 pgid,
9203 PGPeeringEventRef(
9204 std::make_shared<PGPeeringEvent>(
9f95a23c
TL
9205 p.epoch_sent,
9206 p.query_epoch,
11fdf7f2 9207 MNotifyRec(
9f95a23c
TL
9208 pgid, pg_shard_t(from, p.from),
9209 p,
9210 m->get_connection()->get_features()),
11fdf7f2
TL
9211 true,
9212 new PGCreateInfo(
9213 pgid,
9f95a23c
TL
9214 p.query_epoch,
9215 p.info.history,
9216 p.past_intervals,
11fdf7f2
TL
9217 false)
9218 )));
7c673cae 9219 }
11fdf7f2 9220 m->put();
7c673cae
FG
9221}
9222
11fdf7f2 9223void OSD::handle_fast_pg_info(MOSDPGInfo* m)
7c673cae 9224{
11fdf7f2
TL
9225 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9226 if (!require_osd_peer(m)) {
9227 m->put();
7c673cae
FG
9228 return;
9229 }
11fdf7f2
TL
9230 int from = m->get_source().num();
9231 for (auto& p : m->pg_list) {
9232 enqueue_peering_evt(
9f95a23c 9233 spg_t(p.info.pgid.pgid, p.to),
11fdf7f2
TL
9234 PGPeeringEventRef(
9235 std::make_shared<PGPeeringEvent>(
9f95a23c 9236 p.epoch_sent, p.query_epoch,
11fdf7f2 9237 MInfoRec(
9f95a23c
TL
9238 pg_shard_t(from, p.from),
9239 p.info,
9240 p.epoch_sent)))
11fdf7f2 9241 );
7c673cae 9242 }
11fdf7f2 9243 m->put();
7c673cae
FG
9244}
9245
11fdf7f2 9246void OSD::handle_fast_pg_remove(MOSDPGRemove *m)
7c673cae 9247{
11fdf7f2
TL
9248 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9249 if (!require_osd_peer(m)) {
9250 m->put();
7c673cae
FG
9251 return;
9252 }
11fdf7f2
TL
9253 for (auto& pgid : m->pg_list) {
9254 enqueue_peering_evt(
9255 pgid,
9256 PGPeeringEventRef(
9257 std::make_shared<PGPeeringEvent>(
9258 m->get_epoch(), m->get_epoch(),
9f95a23c 9259 PeeringState::DeleteStart())));
7c673cae 9260 }
11fdf7f2 9261 m->put();
7c673cae
FG
9262}
9263
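// Translate a force-recovery/backfill request from the mon or mgr into the
// corresponding Set/Unset ForceBackfill or ForceRecovery peering event for
// each listed PG.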
11fdf7f2 9264void OSD::handle_fast_force_recovery(MOSDForceRecovery *m)
c07f9fc5 9265{
11fdf7f2
TL
9266 dout(10) << __func__ << " " << *m << dendl;
9267 if (!require_mon_or_mgr_peer(m)) {
9268 m->put();
9269 return;
9270 }
9271 epoch_t epoch = get_osdmap_epoch();
9272 for (auto pgid : m->forced_pgs) {
9273 if (m->options & OFR_BACKFILL) {
9274 if (m->options & OFR_CANCEL) {
9275 enqueue_peering_evt(
9276 pgid,
9277 PGPeeringEventRef(
9278 std::make_shared<PGPeeringEvent>(
9279 epoch, epoch,
9f95a23c 9280 PeeringState::UnsetForceBackfill())));
11fdf7f2
TL
9281 } else {
9282 enqueue_peering_evt(
9283 pgid,
9284 PGPeeringEventRef(
9285 std::make_shared<PGPeeringEvent>(
9286 epoch, epoch,
9f95a23c 9287 PeeringState::SetForceBackfill())));
11fdf7f2
TL
9288 }
9289 } else if (m->options & OFR_RECOVERY) {
9290 if (m->options & OFR_CANCEL) {
9291 enqueue_peering_evt(
9292 pgid,
9293 PGPeeringEventRef(
9294 std::make_shared<PGPeeringEvent>(
9295 epoch, epoch,
9f95a23c 9296 PeeringState::UnsetForceRecovery())));
11fdf7f2
TL
9297 } else {
9298 enqueue_peering_evt(
9299 pgid,
9300 PGPeeringEventRef(
9301 std::make_shared<PGPeeringEvent>(
9302 epoch, epoch,
9f95a23c 9303 PeeringState::SetForceRecovery())));
c07f9fc5
FG
9304 }
9305 }
9306 }
11fdf7f2 9307 m->put();
c07f9fc5 9308}
7c673cae 9309
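// A peer queried a PG we do not have.  If the pool still exists, answer
// with an empty info: a full MOSDPGLog for LOG/FULLLOG queries, otherwise
// a notify, so the querying OSD can make progress.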
11fdf7f2 9310void OSD::handle_pg_query_nopg(const MQuery& q)
7c673cae 9311{
11fdf7f2
TL
9312 spg_t pgid = q.pgid;
9313 dout(10) << __func__ << " " << pgid << dendl;
7c673cae 9314
11fdf7f2
TL
9315 OSDMapRef osdmap = get_osdmap();
9316 if (!osdmap->have_pg_pool(pgid.pool()))
7c673cae
FG
9317 return;
9318
11fdf7f2
TL
9319 dout(10) << " pg " << pgid << " dne" << dendl;
9320 pg_info_t empty(spg_t(pgid.pgid, q.query.to));
9321 ConnectionRef con = service.get_con_osd_cluster(q.from.osd, osdmap->get_epoch());
9322 if (con) {
9323 Message *m;
9324 if (q.query.type == pg_query_t::LOG ||
9325 q.query.type == pg_query_t::FULLLOG) {
9326 m = new MOSDPGLog(
9327 q.query.from, q.query.to,
9328 osdmap->get_epoch(), empty,
9329 q.query.epoch_sent);
7c673cae 9330 } else {
9f95a23c 9331 vector<pg_notify_t> ls;
11fdf7f2 9332 ls.push_back(
9f95a23c
TL
9333 pg_notify_t(
9334 q.query.from, q.query.to,
9335 q.query.epoch_sent,
9336 osdmap->get_epoch(),
9337 empty,
11fdf7f2 9338 PastIntervals()));
9f95a23c 9339 m = new MOSDPGNotify(osdmap->get_epoch(), std::move(ls));
7c673cae 9340 }
9f95a23c 9341 service.maybe_share_map(con.get(), osdmap);
11fdf7f2 9342 con->send_message(m);
7c673cae
FG
9343 }
9344}
9345
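// Queue a CheckReadable peering event for the PG, either immediately or,
// when a delay is requested, via the monotonic timer, which re-queues the
// event once the interval has elapsed.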
9f95a23c
TL
9346void OSDService::queue_check_readable(spg_t spgid,
9347 epoch_t lpr,
9348 ceph::signedspan delay)
9349{
9350 if (delay == ceph::signedspan::zero()) {
9351 osd->enqueue_peering_evt(
9352 spgid,
9353 PGPeeringEventRef(
9354 std::make_shared<PGPeeringEvent>(
9355 lpr, lpr,
9356 PeeringState::CheckReadable())));
9357 } else {
9358 mono_timer.add_event(
9359 delay,
9360 [this, spgid, lpr]() {
9361 queue_check_readable(spgid, lpr);
9362 });
9363 }
9364}
9365
7c673cae 9366
7c673cae
FG
9367// =========================================================
9368// RECOVERY
9369
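// With recovery_lock held, drain the awaiting_throttle queue for as long
// as _recover_now() allows, reserving up to osd_recovery_max_single_start
// pushes for each PG started.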
9370void OSDService::_maybe_queue_recovery() {
9f95a23c 9371 ceph_assert(ceph_mutex_is_locked_by_me(recovery_lock));
7c673cae
FG
9372 uint64_t available_pushes;
9373 while (!awaiting_throttle.empty() &&
9374 _recover_now(&available_pushes)) {
11fdf7f2 9375 uint64_t to_start = std::min(
7c673cae
FG
9376 available_pushes,
9377 cct->_conf->osd_recovery_max_single_start);
9378 _queue_for_recovery(awaiting_throttle.front(), to_start);
9379 awaiting_throttle.pop_front();
11fdf7f2
TL
9380 dout(10) << __func__ << " starting " << to_start
9381 << ", recovery_ops_reserved " << recovery_ops_reserved
9382 << " -> " << (recovery_ops_reserved + to_start) << dendl;
7c673cae
FG
9383 recovery_ops_reserved += to_start;
9384 }
9385}
9386
9387bool OSDService::_recover_now(uint64_t *available_pushes)
9388{
9389 if (available_pushes)
9390 *available_pushes = 0;
9391
9392 if (ceph_clock_now() < defer_recovery_until) {
9393 dout(15) << __func__ << " defer until " << defer_recovery_until << dendl;
9394 return false;
9395 }
9396
9397 if (recovery_paused) {
9398 dout(15) << __func__ << " paused" << dendl;
9399 return false;
9400 }
9401
9f95a23c 9402 uint64_t max = osd->get_recovery_max_active();
7c673cae
FG
9403 if (max <= recovery_ops_active + recovery_ops_reserved) {
9404 dout(15) << __func__ << " active " << recovery_ops_active
9405 << " + reserved " << recovery_ops_reserved
9406 << " >= max " << max << dendl;
9407 return false;
9408 }
9409
9410 if (available_pushes)
9411 *available_pushes = max - recovery_ops_active - recovery_ops_reserved;
9412
9413 return true;
9414}
9415
9f95a23c
TL
9416unsigned OSDService::get_target_pg_log_entries() const
9417{
9418 auto num_pgs = osd->get_num_pgs();
9419 auto target = cct->_conf->osd_target_pg_log_entries_per_osd;
9420 if (num_pgs > 0 && target > 0) {
9421 // target an even spread of our budgeted log entries across all
9422 // PGs. note that while we only get to control the entry count
9423 // for primary PGs, we'll normally be responsible for a mix of
9424 // primary and replica PGs (for the same pool(s) even), so this
9425 // will work out.
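    // e.g. with osd_target_pg_log_entries_per_osd = 300000 and 100 PGs on
    // this OSD, each PG targets 3000 entries, clamped to the
    // [osd_min_pg_log_entries, osd_max_pg_log_entries] range.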
9426 return std::max<unsigned>(
9427 std::min<unsigned>(target / num_pgs,
9428 cct->_conf->osd_max_pg_log_entries),
9429 cct->_conf->osd_min_pg_log_entries);
9430 } else {
9431 // fall back to a per-pg value.
9432 return cct->_conf->osd_min_pg_log_entries;
9433 }
9434}
9435
7c673cae
FG
9436void OSD::do_recovery(
9437 PG *pg, epoch_t queued, uint64_t reserved_pushes,
9438 ThreadPool::TPHandle &handle)
9439{
9440 uint64_t started = 0;
31f18b77
FG
9441
9442 /*
9443 * When the value of osd_recovery_sleep is set greater than zero, recovery
9444 * ops are scheduled after osd_recovery_sleep amount of time from the previous
9445 * recovery event's schedule time. This is done by adding a
9446 * recovery_requeue_callback event, which re-queues the recovery op using
9447 * queue_recovery_after_sleep.
9448 */
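  // e.g. with osd_recovery_sleep = 0.1, consecutive recovery ops on this
  // OSD are spaced at least ~100ms apart.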
c07f9fc5 9449 float recovery_sleep = get_osd_recovery_sleep();
b32b8144 9450 {
11fdf7f2 9451 std::lock_guard l(service.sleep_lock);
b32b8144
FG
9452 if (recovery_sleep > 0 && service.recovery_needs_sleep) {
9453 PGRef pgref(pg);
9f95a23c 9454 auto recovery_requeue_callback = new LambdaContext([this, pgref, queued, reserved_pushes](int r) {
b32b8144
FG
9455 dout(20) << "do_recovery wake up at "
9456 << ceph_clock_now()
9457 << ", re-queuing recovery" << dendl;
11fdf7f2 9458 std::lock_guard l(service.sleep_lock);
b32b8144
FG
9459 service.recovery_needs_sleep = false;
9460 service.queue_recovery_after_sleep(pgref.get(), queued, reserved_pushes);
9461 });
9462
9463 // This is true for the first recovery op and when the previous recovery op
9464 // has been scheduled in the past. The next recovery op is scheduled after
9465 // completing the sleep from now.
9f95a23c
TL
9466
9467 if (auto now = ceph::real_clock::now();
9468 service.recovery_schedule_time < now) {
9469 service.recovery_schedule_time = now;
b32b8144 9470 }
9f95a23c 9471 service.recovery_schedule_time += ceph::make_timespan(recovery_sleep);
11fdf7f2 9472 service.sleep_timer.add_event_at(service.recovery_schedule_time,
9f95a23c 9473 recovery_requeue_callback);
b32b8144
FG
9474 dout(20) << "Recovery event scheduled at "
9475 << service.recovery_schedule_time << dendl;
9476 return;
9477 }
7c673cae
FG
9478 }
9479
9480 {
b32b8144 9481 {
11fdf7f2 9482 std::lock_guard l(service.sleep_lock);
b32b8144
FG
9483 service.recovery_needs_sleep = true;
9484 }
9485
7c673cae
FG
9486 if (pg->pg_has_reset_since(queued)) {
9487 goto out;
9488 }
9489
7c673cae
FG
9490 dout(10) << "do_recovery starting " << reserved_pushes << " " << *pg << dendl;
9491#ifdef DEBUG_RECOVERY_OIDS
11fdf7f2 9492 dout(20) << " active was " << service.recovery_oids[pg->pg_id] << dendl;
7c673cae
FG
9493#endif
9494
11fdf7f2 9495 bool do_unfound = pg->start_recovery_ops(reserved_pushes, handle, &started);
7c673cae
FG
9496 dout(10) << "do_recovery started " << started << "/" << reserved_pushes
9497 << " on " << *pg << dendl;
9498
11fdf7f2 9499 if (do_unfound) {
9f95a23c 9500 PeeringCtx rctx = create_context();
11fdf7f2 9501 rctx.handle = &handle;
9f95a23c 9502 pg->find_unfound(queued, rctx);
11fdf7f2 9503 dispatch_context(rctx, pg, pg->get_osdmap());
7c673cae 9504 }
7c673cae
FG
9505 }
9506
9507 out:
11fdf7f2 9508 ceph_assert(started <= reserved_pushes);
7c673cae
FG
9509 service.release_reserved_pushes(reserved_pushes);
9510}
9511
9512void OSDService::start_recovery_op(PG *pg, const hobject_t& soid)
9513{
11fdf7f2 9514 std::lock_guard l(recovery_lock);
7c673cae
FG
9515 dout(10) << "start_recovery_op " << *pg << " " << soid
9516 << " (" << recovery_ops_active << "/"
9f95a23c 9517 << osd->get_recovery_max_active() << " rops)"
7c673cae
FG
9518 << dendl;
9519 recovery_ops_active++;
9520
9521#ifdef DEBUG_RECOVERY_OIDS
11fdf7f2
TL
9522 dout(20) << " active was " << recovery_oids[pg->pg_id] << dendl;
9523 ceph_assert(recovery_oids[pg->pg_id].count(soid) == 0);
9524 recovery_oids[pg->pg_id].insert(soid);
7c673cae
FG
9525#endif
9526}
9527
9528void OSDService::finish_recovery_op(PG *pg, const hobject_t& soid, bool dequeue)
9529{
11fdf7f2 9530 std::lock_guard l(recovery_lock);
7c673cae
FG
9531 dout(10) << "finish_recovery_op " << *pg << " " << soid
9532 << " dequeue=" << dequeue
9f95a23c
TL
9533 << " (" << recovery_ops_active << "/"
9534 << osd->get_recovery_max_active() << " rops)"
7c673cae
FG
9535 << dendl;
9536
9537 // adjust count
11fdf7f2 9538 ceph_assert(recovery_ops_active > 0);
7c673cae
FG
9539 recovery_ops_active--;
9540
9541#ifdef DEBUG_RECOVERY_OIDS
11fdf7f2
TL
9542 dout(20) << " active oids was " << recovery_oids[pg->pg_id] << dendl;
9543 ceph_assert(recovery_oids[pg->pg_id].count(soid));
9544 recovery_oids[pg->pg_id].erase(soid);
7c673cae
FG
9545#endif
9546
9547 _maybe_queue_recovery();
9548}
9549
9550bool OSDService::is_recovery_active()
9551{
eafe8130
TL
9552 if (cct->_conf->osd_debug_pretend_recovery_active) {
9553 return true;
9554 }
b5b8bbf5 9555 return local_reserver.has_reservation() || remote_reserver.has_reservation();
7c673cae
FG
9556}
9557
11fdf7f2
TL
9558void OSDService::release_reserved_pushes(uint64_t pushes)
9559{
9560 std::lock_guard l(recovery_lock);
9561 dout(10) << __func__ << "(" << pushes << "), recovery_ops_reserved "
9562 << recovery_ops_reserved << " -> " << (recovery_ops_reserved-pushes)
9563 << dendl;
9564 ceph_assert(recovery_ops_reserved >= pushes);
9565 recovery_ops_reserved -= pushes;
9566 _maybe_queue_recovery();
9567}
9568
7c673cae
FG
9569// =========================================================
9570// OPS
9571
9572bool OSD::op_is_discardable(const MOSDOp *op)
9573{
9574 // drop client request if they are not connected and can't get the
9575 // reply anyway.
9576 if (!op->get_connection()->is_connected()) {
9577 return true;
9578 }
9579 return false;
9580}
9581
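// Queue an op for its PG on the sharded op queue, carrying the message's
// priority, cost and source as scheduling inputs and tagged with the epoch
// it was mapped under.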
11fdf7f2 9582void OSD::enqueue_op(spg_t pg, OpRequestRef&& op, epoch_t epoch)
7c673cae 9583{
11fdf7f2
TL
9584 const utime_t stamp = op->get_req()->get_recv_stamp();
9585 const utime_t latency = ceph_clock_now() - stamp;
9586 const unsigned priority = op->get_req()->get_priority();
9587 const int cost = op->get_req()->get_cost();
9588 const uint64_t owner = op->get_req()->get_source().num();
9589
9590 dout(15) << "enqueue_op " << op << " prio " << priority
9591 << " cost " << cost
7c673cae
FG
9592 << " latency " << latency
9593 << " epoch " << epoch
9594 << " " << *(op->get_req()) << dendl;
9595 op->osd_trace.event("enqueue op");
11fdf7f2
TL
9596 op->osd_trace.keyval("priority", priority);
9597 op->osd_trace.keyval("cost", cost);
7c673cae 9598 op->mark_queued_for_pg();
224ce89b 9599 logger->tinc(l_osd_op_before_queue_op_lat, latency);
11fdf7f2 9600 op_shardedwq.queue(
9f95a23c
TL
9601 OpSchedulerItem(
9602 unique_ptr<OpSchedulerItem::OpQueueable>(new PGOpItem(pg, std::move(op))),
11fdf7f2 9603 cost, priority, stamp, owner, epoch));
7c673cae
FG
9604}
9605
11fdf7f2
TL
9606void OSD::enqueue_peering_evt(spg_t pgid, PGPeeringEventRef evt)
9607{
9608 dout(15) << __func__ << " " << pgid << " " << evt->get_desc() << dendl;
9609 op_shardedwq.queue(
9f95a23c
TL
9610 OpSchedulerItem(
9611 unique_ptr<OpSchedulerItem::OpQueueable>(new PGPeeringItem(pgid, evt)),
11fdf7f2
TL
9612 10,
9613 cct->_conf->osd_peering_op_priority,
9614 utime_t(),
9615 0,
9616 evt->get_epoch_sent()));
9617}
7c673cae
FG
9618
9619/*
9620 * NOTE: dequeue called in worker thread, with pg lock
9621 */
9622void OSD::dequeue_op(
9623 PGRef pg, OpRequestRef op,
9624 ThreadPool::TPHandle &handle)
9625{
9f95a23c
TL
9626 const Message *m = op->get_req();
9627
11fdf7f2 9628 FUNCTRACE(cct);
9f95a23c 9629 OID_EVENT_TRACE_WITH_MSG(m, "DEQUEUE_OP_BEGIN", false);
7c673cae
FG
9630
9631 utime_t now = ceph_clock_now();
9632 op->set_dequeued_time(now);
9f95a23c
TL
9633
9634 utime_t latency = now - m->get_recv_stamp();
9635 dout(10) << "dequeue_op " << op << " prio " << m->get_priority()
9636 << " cost " << m->get_cost()
7c673cae 9637 << " latency " << latency
9f95a23c 9638 << " " << *m
7c673cae
FG
9639 << " pg " << *pg << dendl;
9640
224ce89b
WB
9641 logger->tinc(l_osd_op_before_dequeue_op_lat, latency);
9642
9f95a23c
TL
9643 service.maybe_share_map(m->get_connection().get(),
9644 pg->get_osdmap(),
9645 op->sent_epoch);
7c673cae 9646
11fdf7f2 9647 if (pg->is_deleting())
7c673cae
FG
9648 return;
9649
9650 op->mark_reached_pg();
9651 op->osd_trace.event("dequeue_op");
9652
9653 pg->do_request(op, handle);
9654
9655 // finish
9656 dout(10) << "dequeue_op " << op << " finish" << dendl;
9f95a23c 9657 OID_EVENT_TRACE_WITH_MSG(m, "DEQUEUE_OP_END", false);
7c673cae
FG
9658}
9659
9660
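// Deliver a queued peering event: catch the PG up to the shard's current
// osdmap (advance_pg), hand it the event, dispatch the resulting messages
// and transaction, and request up_thru from the mon if the PG needs it.
// PG-less events are only expected to be queries for PGs we do not have.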
11fdf7f2
TL
9661void OSD::dequeue_peering_evt(
9662 OSDShard *sdata,
9663 PG *pg,
9664 PGPeeringEventRef evt,
9665 ThreadPool::TPHandle& handle)
7c673cae 9666{
9f95a23c 9667 PeeringCtx rctx = create_context();
11fdf7f2 9668 auto curmap = sdata->get_osdmap();
9f95a23c
TL
9669 bool need_up_thru = false;
9670 epoch_t same_interval_since = 0;
11fdf7f2
TL
9671 if (!pg) {
9672 if (const MQuery *q = dynamic_cast<const MQuery*>(evt->evt.get())) {
9673 handle_pg_query_nopg(*q);
7c673cae 9674 } else {
11fdf7f2
TL
9675 derr << __func__ << " unrecognized pg-less event " << evt->get_desc() << dendl;
9676 ceph_abort();
9677 }
9f95a23c
TL
9678 } else if (advance_pg(curmap->get_epoch(), pg, handle, rctx)) {
9679 pg->do_peering_event(evt, rctx);
11fdf7f2 9680 if (pg->is_deleted()) {
11fdf7f2
TL
9681 pg->unlock();
9682 return;
7c673cae 9683 }
9f95a23c 9684 dispatch_context(rctx, pg, curmap, &handle);
11fdf7f2
TL
9685 need_up_thru = pg->get_need_up_thru();
9686 same_interval_since = pg->get_same_interval_since();
7c673cae
FG
9687 pg->unlock();
9688 }
11fdf7f2
TL
9689
9690 if (need_up_thru) {
7c673cae 9691 queue_want_up_thru(same_interval_since);
11fdf7f2 9692 }
7c673cae
FG
9693
9694 service.send_pg_temp();
9695}
9696
11fdf7f2
TL
9697void OSD::dequeue_delete(
9698 OSDShard *sdata,
9699 PG *pg,
9700 epoch_t e,
9701 ThreadPool::TPHandle& handle)
9702{
9703 dequeue_peering_evt(
9704 sdata,
9705 pg,
9706 PGPeeringEventRef(
9707 std::make_shared<PGPeeringEvent>(
9708 e, e,
9f95a23c 9709 PeeringState::DeleteSome())),
11fdf7f2
TL
9710 handle);
9711}
9712
9713
9714
7c673cae
FG
9715// --------------------------------
9716
9717const char** OSD::get_tracked_conf_keys() const
9718{
9719 static const char* KEYS[] = {
9720 "osd_max_backfills",
9721 "osd_min_recovery_priority",
224ce89b
WB
9722 "osd_max_trimming_pgs",
9723 "osd_op_complaint_time",
9724 "osd_op_log_threshold",
9725 "osd_op_history_size",
9726 "osd_op_history_duration",
9727 "osd_op_history_slow_op_size",
9728 "osd_op_history_slow_op_threshold",
7c673cae
FG
9729 "osd_enable_op_tracker",
9730 "osd_map_cache_size",
11fdf7f2 9731 "osd_pg_epoch_max_lag_factor",
7c673cae 9732 "osd_pg_epoch_persisted_max_stale",
7c673cae
FG
9733 // clog & admin clog
9734 "clog_to_monitors",
9735 "clog_to_syslog",
9736 "clog_to_syslog_facility",
9737 "clog_to_syslog_level",
9738 "osd_objectstore_fuse",
9739 "clog_to_graylog",
9740 "clog_to_graylog_host",
9741 "clog_to_graylog_port",
9742 "host",
9743 "fsid",
9744 "osd_recovery_delay_start",
9745 "osd_client_message_size_cap",
9746 "osd_client_message_cap",
31f18b77
FG
9747 "osd_heartbeat_min_size",
9748 "osd_heartbeat_interval",
9f95a23c 9749 "osd_object_clean_region_max_num_intervals",
494da23a
TL
9750 "osd_scrub_min_interval",
9751 "osd_scrub_max_interval",
7c673cae
FG
9752 NULL
9753 };
9754 return KEYS;
9755}
9756
11fdf7f2 9757void OSD::handle_conf_change(const ConfigProxy& conf,
7c673cae
FG
9758 const std::set <std::string> &changed)
9759{
9f95a23c 9760 std::lock_guard l{osd_lock};
7c673cae
FG
9761 if (changed.count("osd_max_backfills")) {
9762 service.local_reserver.set_max(cct->_conf->osd_max_backfills);
9763 service.remote_reserver.set_max(cct->_conf->osd_max_backfills);
9764 }
9765 if (changed.count("osd_min_recovery_priority")) {
9766 service.local_reserver.set_min_priority(cct->_conf->osd_min_recovery_priority);
9767 service.remote_reserver.set_min_priority(cct->_conf->osd_min_recovery_priority);
9768 }
9769 if (changed.count("osd_max_trimming_pgs")) {
9770 service.snap_reserver.set_max(cct->_conf->osd_max_trimming_pgs);
9771 }
9772 if (changed.count("osd_op_complaint_time") ||
9773 changed.count("osd_op_log_threshold")) {
9774 op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time,
9775 cct->_conf->osd_op_log_threshold);
9776 }
9777 if (changed.count("osd_op_history_size") ||
9778 changed.count("osd_op_history_duration")) {
9779 op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
9780 cct->_conf->osd_op_history_duration);
9781 }
9782 if (changed.count("osd_op_history_slow_op_size") ||
9783 changed.count("osd_op_history_slow_op_threshold")) {
9784 op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size,
9785 cct->_conf->osd_op_history_slow_op_threshold);
9786 }
9787 if (changed.count("osd_enable_op_tracker")) {
9788 op_tracker.set_tracking(cct->_conf->osd_enable_op_tracker);
9789 }
7c673cae
FG
9790 if (changed.count("osd_map_cache_size")) {
9791 service.map_cache.set_size(cct->_conf->osd_map_cache_size);
9792 service.map_bl_cache.set_size(cct->_conf->osd_map_cache_size);
9793 service.map_bl_inc_cache.set_size(cct->_conf->osd_map_cache_size);
9794 }
9795 if (changed.count("clog_to_monitors") ||
9796 changed.count("clog_to_syslog") ||
9797 changed.count("clog_to_syslog_level") ||
9798 changed.count("clog_to_syslog_facility") ||
9799 changed.count("clog_to_graylog") ||
9800 changed.count("clog_to_graylog_host") ||
9801 changed.count("clog_to_graylog_port") ||
9802 changed.count("host") ||
9803 changed.count("fsid")) {
9804 update_log_config();
9805 }
11fdf7f2
TL
9806 if (changed.count("osd_pg_epoch_max_lag_factor")) {
9807 m_osd_pg_epoch_max_lag_factor = conf.get_val<double>(
9808 "osd_pg_epoch_max_lag_factor");
9809 }
7c673cae
FG
9810
9811#ifdef HAVE_LIBFUSE
9812 if (changed.count("osd_objectstore_fuse")) {
9813 if (store) {
9814 enable_disable_fuse(false);
9815 }
9816 }
9817#endif
9818
9819 if (changed.count("osd_recovery_delay_start")) {
9820 service.defer_recovery(cct->_conf->osd_recovery_delay_start);
9821 service.kick_recovery_queue();
9822 }
9823
9824 if (changed.count("osd_client_message_cap")) {
9825 uint64_t newval = cct->_conf->osd_client_message_cap;
9826 Messenger::Policy pol = client_messenger->get_policy(entity_name_t::TYPE_CLIENT);
9827 if (pol.throttler_messages && newval > 0) {
9828 pol.throttler_messages->reset_max(newval);
9829 }
9830 }
9831 if (changed.count("osd_client_message_size_cap")) {
9832 uint64_t newval = cct->_conf->osd_client_message_size_cap;
9833 Messenger::Policy pol = client_messenger->get_policy(entity_name_t::TYPE_CLIENT);
9834 if (pol.throttler_bytes && newval > 0) {
9835 pol.throttler_bytes->reset_max(newval);
9836 }
9837 }
9f95a23c
TL
9838 if (changed.count("osd_object_clean_region_max_num_intervals")) {
9839 ObjectCleanRegions::set_max_num_intervals(cct->_conf->osd_object_clean_region_max_num_intervals);
9840 }
7c673cae 9841
494da23a
TL
9842 if (changed.count("osd_scrub_min_interval") ||
9843 changed.count("osd_scrub_max_interval")) {
9844 resched_all_scrubs();
9845 dout(0) << __func__ << ": scrub interval change" << dendl;
9846 }
7c673cae
FG
9847 check_config();
9848}
9849
9850void OSD::update_log_config()
9851{
9852 map<string,string> log_to_monitors;
9853 map<string,string> log_to_syslog;
9854 map<string,string> log_channel;
9855 map<string,string> log_prio;
9856 map<string,string> log_to_graylog;
9857 map<string,string> log_to_graylog_host;
9858 map<string,string> log_to_graylog_port;
9859 uuid_d fsid;
9860 string host;
9861
9862 if (parse_log_client_options(cct, log_to_monitors, log_to_syslog,
9863 log_channel, log_prio, log_to_graylog,
9864 log_to_graylog_host, log_to_graylog_port,
9865 fsid, host) == 0)
9866 clog->update_config(log_to_monitors, log_to_syslog,
9867 log_channel, log_prio, log_to_graylog,
9868 log_to_graylog_host, log_to_graylog_port,
9869 fsid, host);
9870 derr << "log_to_monitors " << log_to_monitors << dendl;
9871}
9872
9873void OSD::check_config()
9874{
9875 // some sanity checks
7c673cae
FG
9876 if (cct->_conf->osd_map_cache_size <= (int)cct->_conf->osd_pg_epoch_persisted_max_stale + 2) {
9877 clog->warn() << "osd_map_cache_size (" << cct->_conf->osd_map_cache_size << ")"
9878 << " is not > osd_pg_epoch_persisted_max_stale ("
9879 << cct->_conf->osd_pg_epoch_persisted_max_stale << ")";
9880 }
9f95a23c
TL
9881 if (cct->_conf->osd_object_clean_region_max_num_intervals < 0) {
9882 clog->warn() << "osd_object_clean_region_max_num_intervals ("
9883 << cct->_conf->osd_object_clean_region_max_num_intervals
9884 << ") is < 0";
9885 }
7c673cae
FG
9886}
9887
7c673cae
FG
9888// --------------------------------
9889
9890void OSD::get_latest_osdmap()
9891{
9892 dout(10) << __func__ << " -- start" << dendl;
9893
9894 C_SaferCond cond;
9895 service.objecter->wait_for_latest_osdmap(&cond);
9896 cond.wait();
9897
9898 dout(10) << __func__ << " -- finish" << dendl;
9899}
9900
9901// --------------------------------
9902
9f95a23c
TL
9903void OSD::set_perf_queries(const ConfigPayload &config_payload) {
9904 const OSDConfigPayload &osd_config_payload = boost::get<OSDConfigPayload>(config_payload);
9905 const std::map<OSDPerfMetricQuery, OSDPerfMetricLimits> &queries = osd_config_payload.config;
11fdf7f2
TL
9906 dout(10) << "setting " << queries.size() << " queries" << dendl;
9907
9908 std::list<OSDPerfMetricQuery> supported_queries;
9909 for (auto &it : queries) {
9910 auto &query = it.first;
9911 if (!query.key_descriptor.empty()) {
9912 supported_queries.push_back(query);
9913 }
9914 }
9915 if (supported_queries.size() < queries.size()) {
9916 dout(1) << queries.size() - supported_queries.size()
9917 << " unsupported queries" << dendl;
9918 }
11fdf7f2 9919 {
9f95a23c 9920 std::lock_guard locker{m_perf_queries_lock};
11fdf7f2
TL
9921 m_perf_queries = supported_queries;
9922 m_perf_limits = queries;
9923 }
11fdf7f2
TL
9924 std::vector<PGRef> pgs;
9925 _get_pgs(&pgs);
9926 for (auto& pg : pgs) {
9f95a23c 9927 std::scoped_lock l{*pg};
eafe8130 9928 pg->set_dynamic_perf_stats_queries(supported_queries);
7c673cae 9929 }
7c673cae
FG
9930}
9931
9f95a23c
TL
9932MetricPayload OSD::get_perf_reports() {
9933 OSDMetricPayload payload;
9934 std::map<OSDPerfMetricQuery, OSDPerfMetricReport> &reports = payload.report;
9935
11fdf7f2
TL
9936 std::vector<PGRef> pgs;
9937 _get_pgs(&pgs);
9938 DynamicPerfStats dps;
9939 for (auto& pg : pgs) {
eafe8130
TL
9940 // m_perf_queries can be modified only in set_perf_queries by mgr client
 9941 // request, and it is protected by the mgr client's lock, which is held
 9942 // when set_perf_queries/get_perf_reports are called, so we need not hold
9943 // m_perf_queries_lock here.
9944 DynamicPerfStats pg_dps(m_perf_queries);
9945 pg->lock();
9946 pg->get_dynamic_perf_stats(&pg_dps);
9947 pg->unlock();
9948 dps.merge(pg_dps);
11fdf7f2 9949 }
9f95a23c
TL
9950 dps.add_to_reports(m_perf_limits, &reports);
9951 dout(20) << "reports for " << reports.size() << " queries" << dendl;
9952
9953 return payload;
11fdf7f2 9954}
224ce89b 9955
7c673cae
FG
9956// =============================================================
9957
9958#undef dout_context
11fdf7f2 9959#define dout_context cct
7c673cae 9960#undef dout_prefix
11fdf7f2 9961#define dout_prefix *_dout << "osd." << osd->get_nodeid() << ":" << shard_id << "." << __func__ << " "
7c673cae 9962
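// Attach a PG to its slot and index it by OSDMap epoch; callers hold shard_lock.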
11fdf7f2 9963void OSDShard::_attach_pg(OSDShardPGSlot *slot, PG *pg)
7c673cae 9964{
11fdf7f2
TL
9965 dout(10) << pg->pg_id << " " << pg << dendl;
9966 slot->pg = pg;
9967 pg->osd_shard = this;
9968 pg->pg_slot = slot;
9969 osd->inc_num_pgs();
9970
9971 slot->epoch = pg->get_osdmap_epoch();
9972 pg_slots_by_epoch.insert(*slot);
9973}
9974
9975void OSDShard::_detach_pg(OSDShardPGSlot *slot)
9976{
9977 dout(10) << slot->pg->pg_id << " " << slot->pg << dendl;
9978 slot->pg->osd_shard = nullptr;
9979 slot->pg->pg_slot = nullptr;
9980 slot->pg = nullptr;
9981 osd->dec_num_pgs();
9982
9983 pg_slots_by_epoch.erase(pg_slots_by_epoch.iterator_to(*slot));
9984 slot->epoch = 0;
9985 if (waiting_for_min_pg_epoch) {
9986 min_pg_epoch_cond.notify_all();
9987 }
9988}
9989
9990void OSDShard::update_pg_epoch(OSDShardPGSlot *slot, epoch_t e)
9991{
9992 std::lock_guard l(shard_lock);
9993 dout(30) << "min was " << pg_slots_by_epoch.begin()->epoch
9994 << " on " << pg_slots_by_epoch.begin()->pg->pg_id << dendl;
9995 pg_slots_by_epoch.erase(pg_slots_by_epoch.iterator_to(*slot));
9996 dout(20) << slot->pg->pg_id << " " << slot->epoch << " -> " << e << dendl;
9997 slot->epoch = e;
9998 pg_slots_by_epoch.insert(*slot);
9999 dout(30) << "min is now " << pg_slots_by_epoch.begin()->epoch
10000 << " on " << pg_slots_by_epoch.begin()->pg->pg_id << dendl;
10001 if (waiting_for_min_pg_epoch) {
10002 min_pg_epoch_cond.notify_all();
10003 }
10004}
10005
10006epoch_t OSDShard::get_min_pg_epoch()
10007{
10008 std::lock_guard l(shard_lock);
10009 auto p = pg_slots_by_epoch.begin();
10010 if (p == pg_slots_by_epoch.end()) {
10011 return 0;
10012 }
10013 return p->epoch;
10014}
10015
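// Block until every PG on this shard has advanced to at least epoch `need`
// (or the shard has no PGs at all).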
10016void OSDShard::wait_min_pg_epoch(epoch_t need)
10017{
10018 std::unique_lock l{shard_lock};
10019 ++waiting_for_min_pg_epoch;
10020 min_pg_epoch_cond.wait(l, [need, this] {
10021 if (pg_slots_by_epoch.empty()) {
10022 return true;
10023 } else if (pg_slots_by_epoch.begin()->epoch >= need) {
10024 return true;
10025 } else {
10026 dout(10) << need << " waiting on "
10027 << pg_slots_by_epoch.begin()->epoch << dendl;
10028 return false;
10029 }
10030 });
10031 --waiting_for_min_pg_epoch;
10032}
10033
10034epoch_t OSDShard::get_max_waiting_epoch()
10035{
10036 std::lock_guard l(shard_lock);
10037 epoch_t r = 0;
10038 for (auto& i : pg_slots) {
10039 if (!i.second->waiting_peering.empty()) {
10040 r = std::max(r, i.second->waiting_peering.rbegin()->first);
10041 }
10042 }
10043 return r;
10044}
10045
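// Install a new OSDMap on this shard and walk the PG slots: requeue peering
// work that can now make progress, drop stale or misdirected items for slots
// that no longer map to this OSD, and prune slots that are completely idle.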
10046void OSDShard::consume_map(
9f95a23c 10047 const OSDMapRef& new_osdmap,
11fdf7f2
TL
10048 unsigned *pushes_to_free)
10049{
10050 std::lock_guard l(shard_lock);
10051 OSDMapRef old_osdmap;
7c673cae 10052 {
11fdf7f2
TL
10053 std::lock_guard l(osdmap_lock);
10054 old_osdmap = std::move(shard_osdmap);
10055 shard_osdmap = new_osdmap;
10056 }
10057 dout(10) << new_osdmap->get_epoch()
10058 << " (was " << (old_osdmap ? old_osdmap->get_epoch() : 0) << ")"
10059 << dendl;
10060 bool queued = false;
10061
10062 // check slots
10063 auto p = pg_slots.begin();
10064 while (p != pg_slots.end()) {
10065 OSDShardPGSlot *slot = p->second.get();
10066 const spg_t& pgid = p->first;
10067 dout(20) << __func__ << " " << pgid << dendl;
10068 if (!slot->waiting_for_split.empty()) {
10069 dout(20) << __func__ << " " << pgid
10070 << " waiting for split " << slot->waiting_for_split << dendl;
10071 ++p;
10072 continue;
10073 }
10074 if (slot->waiting_for_merge_epoch > new_osdmap->get_epoch()) {
10075 dout(20) << __func__ << " " << pgid
10076 << " waiting for merge by epoch " << slot->waiting_for_merge_epoch
10077 << dendl;
10078 ++p;
10079 continue;
10080 }
10081 if (!slot->waiting_peering.empty()) {
10082 epoch_t first = slot->waiting_peering.begin()->first;
10083 if (first <= new_osdmap->get_epoch()) {
10084 dout(20) << __func__ << " " << pgid
10085 << " pending_peering first epoch " << first
10086 << " <= " << new_osdmap->get_epoch() << ", requeueing" << dendl;
10087 _wake_pg_slot(pgid, slot);
10088 queued = true;
10089 }
10090 ++p;
10091 continue;
10092 }
10093 if (!slot->waiting.empty()) {
10094 if (new_osdmap->is_up_acting_osd_shard(pgid, osd->get_nodeid())) {
10095 dout(20) << __func__ << " " << pgid << " maps to us, keeping"
10096 << dendl;
10097 ++p;
10098 continue;
7c673cae 10099 }
11fdf7f2
TL
10100 while (!slot->waiting.empty() &&
10101 slot->waiting.front().get_map_epoch() <= new_osdmap->get_epoch()) {
10102 auto& qi = slot->waiting.front();
10103 dout(20) << __func__ << " " << pgid
10104 << " waiting item " << qi
10105 << " epoch " << qi.get_map_epoch()
10106 << " <= " << new_osdmap->get_epoch()
10107 << ", "
10108 << (qi.get_map_epoch() < new_osdmap->get_epoch() ? "stale" :
10109 "misdirected")
10110 << ", dropping" << dendl;
10111 *pushes_to_free += qi.get_reserved_pushes();
10112 slot->waiting.pop_front();
10113 }
10114 }
10115 if (slot->waiting.empty() &&
10116 slot->num_running == 0 &&
10117 slot->waiting_for_split.empty() &&
10118 !slot->pg) {
10119 dout(20) << __func__ << " " << pgid << " empty, pruning" << dendl;
10120 p = pg_slots.erase(p);
10121 continue;
7c673cae 10122 }
11fdf7f2
TL
10123
10124 ++p;
7c673cae 10125 }
7c673cae 10126 if (queued) {
11fdf7f2
TL
10127 std::lock_guard l{sdata_wait_lock};
10128 sdata_cond.notify_one();
7c673cae
FG
10129 }
10130}
10131
11fdf7f2
TL
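// Requeue everything parked on this slot (to_process, waiting, waiting_peering)
// at the front of the scheduler, and bump requeue_seq so racing dequeuers in
// _process can tell the slot was shuffled. Callers hold shard_lock.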
10132void OSDShard::_wake_pg_slot(
10133 spg_t pgid,
10134 OSDShardPGSlot *slot)
10135{
10136 dout(20) << __func__ << " " << pgid
10137 << " to_process " << slot->to_process
10138 << " waiting " << slot->waiting
10139 << " waiting_peering " << slot->waiting_peering << dendl;
10140 for (auto i = slot->to_process.rbegin();
10141 i != slot->to_process.rend();
10142 ++i) {
9f95a23c 10143 scheduler->enqueue_front(std::move(*i));
11fdf7f2
TL
10144 }
10145 slot->to_process.clear();
10146 for (auto i = slot->waiting.rbegin();
10147 i != slot->waiting.rend();
10148 ++i) {
9f95a23c 10149 scheduler->enqueue_front(std::move(*i));
11fdf7f2
TL
10150 }
10151 slot->waiting.clear();
10152 for (auto i = slot->waiting_peering.rbegin();
10153 i != slot->waiting_peering.rend();
10154 ++i) {
10155 // this is overkill; we requeue everything, even if some of these
10156 // items are waiting for maps we don't have yet. FIXME, maybe,
10157 // someday, if we decide this inefficiency matters
10158 for (auto j = i->second.rbegin(); j != i->second.rend(); ++j) {
9f95a23c 10159 scheduler->enqueue_front(std::move(*j));
11fdf7f2
TL
10160 }
10161 }
10162 slot->waiting_peering.clear();
10163 ++slot->requeue_seq;
10164}
10165
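// Collect the splits and merges implied for this shard's PGs by advancing from
// the shard's current osdmap to as_of_osdmap.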
10166void OSDShard::identify_splits_and_merges(
10167 const OSDMapRef& as_of_osdmap,
10168 set<pair<spg_t,epoch_t>> *split_pgs,
10169 set<pair<spg_t,epoch_t>> *merge_pgs)
10170{
10171 std::lock_guard l(shard_lock);
10172 if (shard_osdmap) {
10173 for (auto& i : pg_slots) {
10174 const spg_t& pgid = i.first;
10175 auto *slot = i.second.get();
10176 if (slot->pg) {
10177 osd->service.identify_splits_and_merges(
10178 shard_osdmap, as_of_osdmap, pgid,
10179 split_pgs, merge_pgs);
10180 } else if (!slot->waiting_for_split.empty()) {
10181 osd->service.identify_splits_and_merges(
10182 shard_osdmap, as_of_osdmap, pgid,
10183 split_pgs, nullptr);
10184 } else {
10185 dout(20) << __func__ << " slot " << pgid
9f95a23c 10186 << " has no pg and waiting_for_split is empty" << dendl;
7c673cae 10187 }
11fdf7f2
TL
10188 }
10189 }
10190}
10191
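// Prime slots for the split children in *pgids that hash to this shard
// (removing them from the set); if the shard's osdmap is already newer than
// as_of_osdmap, also prime any grandchildren implied by the newer map.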
10192void OSDShard::prime_splits(const OSDMapRef& as_of_osdmap,
10193 set<pair<spg_t,epoch_t>> *pgids)
10194{
10195 std::lock_guard l(shard_lock);
10196 _prime_splits(pgids);
10197 if (shard_osdmap->get_epoch() > as_of_osdmap->get_epoch()) {
10198 set<pair<spg_t,epoch_t>> newer_children;
10199 for (auto i : *pgids) {
10200 osd->service.identify_splits_and_merges(
10201 as_of_osdmap, shard_osdmap, i.first,
10202 &newer_children, nullptr);
10203 }
10204 newer_children.insert(pgids->begin(), pgids->end());
10205 dout(10) << "as_of_osdmap " << as_of_osdmap->get_epoch() << " < shard "
10206 << shard_osdmap->get_epoch() << ", new children " << newer_children
10207 << dendl;
10208 _prime_splits(&newer_children);
10209 // note: we don't care what is left over here for other shards.
10210 // if this shard is ahead of us and one isn't, e.g., one thread is
10211 // calling into prime_splits via _process (due to a newly created
10212 // pg) and this shard has a newer map due to a racing consume_map,
10213 // then any grandchildren left here will be identified (or were
10214 // identified) when the slower shard's osdmap is advanced.
10215 // _prime_splits() will tolerate the case where the pgid is
10216 // already primed.
10217 }
10218}
10219
10220void OSDShard::_prime_splits(set<pair<spg_t,epoch_t>> *pgids)
10221{
10222 dout(10) << *pgids << dendl;
10223 auto p = pgids->begin();
10224 while (p != pgids->end()) {
10225 unsigned shard_index = p->first.hash_to_shard(osd->num_shards);
10226 if (shard_index == shard_id) {
10227 auto r = pg_slots.emplace(p->first, nullptr);
10228 if (r.second) {
10229 dout(10) << "priming slot " << p->first << " e" << p->second << dendl;
10230 r.first->second = make_unique<OSDShardPGSlot>();
10231 r.first->second->waiting_for_split.insert(p->second);
7c673cae 10232 } else {
11fdf7f2
TL
10233 auto q = r.first;
10234 ceph_assert(q != pg_slots.end());
10235 dout(10) << "priming (existing) slot " << p->first << " e" << p->second
10236 << dendl;
10237 q->second->waiting_for_split.insert(p->second);
7c673cae 10238 }
11fdf7f2
TL
10239 p = pgids->erase(p);
10240 } else {
10241 ++p;
7c673cae
FG
10242 }
10243 }
11fdf7f2
TL
10244}
10245
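// For each merge participant that hashes to this shard, make sure a slot
// exists (creating an empty placeholder PG when the slot has neither a PG nor
// an earlier pending split) and record the merge epoch on the slot.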
10246void OSDShard::prime_merges(const OSDMapRef& as_of_osdmap,
10247 set<pair<spg_t,epoch_t>> *merge_pgs)
10248{
10249 std::lock_guard l(shard_lock);
10250 dout(20) << __func__ << " checking shard " << shard_id
 10251 << " for remaining merge pgs " << *merge_pgs << dendl;
10252 auto p = merge_pgs->begin();
10253 while (p != merge_pgs->end()) {
10254 spg_t pgid = p->first;
10255 epoch_t epoch = p->second;
10256 unsigned shard_index = pgid.hash_to_shard(osd->num_shards);
10257 if (shard_index != shard_id) {
10258 ++p;
10259 continue;
10260 }
10261 OSDShardPGSlot *slot;
10262 auto r = pg_slots.emplace(pgid, nullptr);
10263 if (r.second) {
10264 r.first->second = make_unique<OSDShardPGSlot>();
10265 }
10266 slot = r.first->second.get();
10267 if (slot->pg) {
10268 // already have pg
10269 dout(20) << __func__ << " have merge participant pg " << pgid
10270 << " " << slot->pg << dendl;
10271 } else if (!slot->waiting_for_split.empty() &&
10272 *slot->waiting_for_split.begin() < epoch) {
10273 dout(20) << __func__ << " pending split on merge participant pg " << pgid
10274 << " " << slot->waiting_for_split << dendl;
10275 } else {
10276 dout(20) << __func__ << " creating empty merge participant " << pgid
10277 << " for merge in " << epoch << dendl;
10278 // leave history zeroed; PG::merge_from() will fill it in.
10279 pg_history_t history;
10280 PGCreateInfo cinfo(pgid, epoch - 1,
10281 history, PastIntervals(), false);
10282 PGRef pg = osd->handle_pg_create_info(shard_osdmap, &cinfo);
10283 _attach_pg(r.first->second.get(), pg.get());
10284 _wake_pg_slot(pgid, slot);
10285 pg->unlock();
10286 }
10287 // mark slot for merge
10288 dout(20) << __func__ << " marking merge participant " << pgid << dendl;
10289 slot->waiting_for_merge_epoch = epoch;
10290 p = merge_pgs->erase(p);
7c673cae
FG
10291 }
10292}
10293
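// Called once a split child PG has been created: attach it to its primed slot,
// and, once no more splits are pending on that slot, requeue the work that was
// waiting; also queue a null peering event so the child catches up to the
// latest osdmap.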
11fdf7f2 10294void OSDShard::register_and_wake_split_child(PG *pg)
7c673cae 10295{
11fdf7f2
TL
10296 epoch_t epoch;
10297 {
10298 std::lock_guard l(shard_lock);
10299 dout(10) << pg->pg_id << " " << pg << dendl;
10300 auto p = pg_slots.find(pg->pg_id);
10301 ceph_assert(p != pg_slots.end());
10302 auto *slot = p->second.get();
10303 dout(20) << pg->pg_id << " waiting_for_split " << slot->waiting_for_split
10304 << dendl;
10305 ceph_assert(!slot->pg);
10306 ceph_assert(!slot->waiting_for_split.empty());
10307 _attach_pg(slot, pg);
10308
10309 epoch = pg->get_osdmap_epoch();
10310 ceph_assert(slot->waiting_for_split.count(epoch));
10311 slot->waiting_for_split.erase(epoch);
10312 if (slot->waiting_for_split.empty()) {
10313 _wake_pg_slot(pg->pg_id, slot);
10314 } else {
10315 dout(10) << __func__ << " still waiting for split on "
10316 << slot->waiting_for_split << dendl;
10317 }
7c673cae 10318 }
11fdf7f2
TL
10319
10320 // kick child to ensure it pulls up to the latest osdmap
10321 osd->enqueue_peering_evt(
10322 pg->pg_id,
10323 PGPeeringEventRef(
10324 std::make_shared<PGPeeringEvent>(
10325 epoch,
10326 epoch,
10327 NullEvt())));
10328
10329 std::lock_guard l{sdata_wait_lock};
10330 sdata_cond.notify_one();
7c673cae
FG
10331}
10332
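// Discard slots that were primed for children of `parent` under the old
// pg_num, requeueing any work parked on them (presumably because the pending
// split no longer applies).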
11fdf7f2 10333void OSDShard::unprime_split_children(spg_t parent, unsigned old_pg_num)
7c673cae 10334{
11fdf7f2
TL
10335 std::lock_guard l(shard_lock);
10336 vector<spg_t> to_delete;
10337 for (auto& i : pg_slots) {
10338 if (i.first != parent &&
10339 i.first.get_ancestor(old_pg_num) == parent) {
10340 dout(10) << __func__ << " parent " << parent << " clearing " << i.first
10341 << dendl;
10342 _wake_pg_slot(i.first, i.second.get());
10343 to_delete.push_back(i.first);
10344 }
10345 }
10346 for (auto pgid : to_delete) {
10347 pg_slots.erase(pgid);
10348 }
10349}
10350
9f95a23c
TL
10351OSDShard::OSDShard(
10352 int id,
10353 CephContext *cct,
10354 OSD *osd)
10355 : shard_id(id),
10356 cct(cct),
10357 osd(osd),
10358 shard_name(string("OSDShard.") + stringify(id)),
10359 sdata_wait_lock_name(shard_name + "::sdata_wait_lock"),
10360 sdata_wait_lock{make_mutex(sdata_wait_lock_name)},
10361 osdmap_lock{make_mutex(shard_name + "::osdmap_lock")},
10362 shard_lock_name(shard_name + "::shard_lock"),
10363 shard_lock{make_mutex(shard_lock_name)},
10364 scheduler(ceph::osd::scheduler::make_scheduler(cct)),
10365 context_queue(sdata_wait_lock, sdata_cond)
10366{
10367 dout(0) << "using op scheduler " << *scheduler << dendl;
10368}
10369
11fdf7f2
TL
10370
10371// =============================================================
10372
10373#undef dout_context
10374#define dout_context osd->cct
10375#undef dout_prefix
10376#define dout_prefix *_dout << "osd." << osd->whoami << " op_wq "
10377
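// Park an item on its PG slot until it can run: peering items are keyed by the
// map epoch they need, everything else goes on the plain waiting list.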
10378void OSD::ShardedOpWQ::_add_slot_waiter(
10379 spg_t pgid,
10380 OSDShardPGSlot *slot,
9f95a23c 10381 OpSchedulerItem&& qi)
11fdf7f2
TL
10382{
10383 if (qi.is_peering()) {
10384 dout(20) << __func__ << " " << pgid
10385 << " peering, item epoch is "
10386 << qi.get_map_epoch()
10387 << ", will wait on " << qi << dendl;
10388 slot->waiting_peering[qi.get_map_epoch()].push_back(std::move(qi));
10389 } else {
10390 dout(20) << __func__ << " " << pgid
10391 << " item epoch is "
10392 << qi.get_map_epoch()
10393 << ", will wait on " << qi << dendl;
10394 slot->waiting.push_back(std::move(qi));
7c673cae
FG
10395 }
10396}
10397
10398#undef dout_prefix
10399#define dout_prefix *_dout << "osd." << osd->whoami << " op_wq(" << shard_index << ") "
10400
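// Worker thread body: wait for work, dequeue one item from this shard's
// scheduler, find the slot for its ordering token and lock the slot's PG if
// present, handle the cases where the PG doesn't exist yet (create it, wait,
// or drop the item), and finally run the item (with the PG locked when one
// exists).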
10401void OSD::ShardedOpWQ::_process(uint32_t thread_index, heartbeat_handle_d *hb)
10402{
11fdf7f2
TL
10403 uint32_t shard_index = thread_index % osd->num_shards;
10404 auto& sdata = osd->shards[shard_index];
10405 ceph_assert(sdata);
10406
 10407 // If every thread of every shard ran oncommits, they could complete
 10408 // out of order. So we let only the thread with the smallest
 10409 // thread_index (thread_index < num_shards) of each shard run the
 10410 // oncommit callbacks.
10411 bool is_smallest_thread_index = thread_index < osd->num_shards;
7c673cae
FG
10412
10413 // peek at spg_t
11fdf7f2 10414 sdata->shard_lock.lock();
9f95a23c 10415 if (sdata->scheduler->empty() &&
11fdf7f2
TL
10416 (!is_smallest_thread_index || sdata->context_queue.empty())) {
10417 std::unique_lock wait_lock{sdata->sdata_wait_lock};
10418 if (is_smallest_thread_index && !sdata->context_queue.empty()) {
10419 // we raced with a context_queue addition, don't wait
10420 wait_lock.unlock();
10421 } else if (!sdata->stop_waiting) {
10422 dout(20) << __func__ << " empty q, waiting" << dendl;
10423 osd->cct->get_heartbeat_map()->clear_timeout(hb);
10424 sdata->shard_lock.unlock();
10425 sdata->sdata_cond.wait(wait_lock);
10426 wait_lock.unlock();
10427 sdata->shard_lock.lock();
9f95a23c 10428 if (sdata->scheduler->empty() &&
11fdf7f2
TL
10429 !(is_smallest_thread_index && !sdata->context_queue.empty())) {
10430 sdata->shard_lock.unlock();
10431 return;
10432 }
e306af50 10433 // found a work item; reapply default wq timeouts
11fdf7f2 10434 osd->cct->get_heartbeat_map()->reset_timeout(hb,
e306af50 10435 timeout_interval, suicide_interval);
11fdf7f2
TL
10436 } else {
 10437 dout(20) << __func__ << " need to return immediately" << dendl;
10438 wait_lock.unlock();
10439 sdata->shard_lock.unlock();
7c673cae
FG
10440 return;
10441 }
10442 }
11fdf7f2
TL
10443
10444 list<Context *> oncommits;
9f95a23c
TL
10445 if (is_smallest_thread_index) {
10446 sdata->context_queue.move_to(oncommits);
7c673cae 10447 }
11fdf7f2 10448
9f95a23c 10449 if (sdata->scheduler->empty()) {
11fdf7f2
TL
10450 if (osd->is_stopping()) {
10451 sdata->shard_lock.unlock();
10452 for (auto c : oncommits) {
10453 dout(10) << __func__ << " discarding in-flight oncommit " << c << dendl;
10454 delete c;
10455 }
10456 return; // OSD shutdown, discard.
7c673cae 10457 }
11fdf7f2
TL
10458 sdata->shard_lock.unlock();
10459 handle_oncommits(oncommits);
10460 return;
7c673cae 10461 }
7c673cae 10462
9f95a23c 10463 OpSchedulerItem item = sdata->scheduler->dequeue();
11fdf7f2
TL
10464 if (osd->is_stopping()) {
10465 sdata->shard_lock.unlock();
10466 for (auto c : oncommits) {
10467 dout(10) << __func__ << " discarding in-flight oncommit " << c << dendl;
10468 delete c;
10469 }
10470 return; // OSD shutdown, discard.
10471 }
7c673cae 10472
11fdf7f2
TL
10473 const auto token = item.get_ordering_token();
10474 auto r = sdata->pg_slots.emplace(token, nullptr);
10475 if (r.second) {
10476 r.first->second = make_unique<OSDShardPGSlot>();
7c673cae 10477 }
11fdf7f2
TL
10478 OSDShardPGSlot *slot = r.first->second.get();
10479 dout(20) << __func__ << " " << token
10480 << (r.second ? " (new)" : "")
10481 << " to_process " << slot->to_process
10482 << " waiting " << slot->waiting
10483 << " waiting_peering " << slot->waiting_peering
10484 << dendl;
10485 slot->to_process.push_back(std::move(item));
10486 dout(20) << __func__ << " " << slot->to_process.back()
10487 << " queued" << dendl;
7c673cae 10488
11fdf7f2
TL
10489 retry_pg:
10490 PGRef pg = slot->pg;
7c673cae 10491
11fdf7f2
TL
10492 // lock pg (if we have it)
10493 if (pg) {
10494 // note the requeue seq now...
10495 uint64_t requeue_seq = slot->requeue_seq;
10496 ++slot->num_running;
7c673cae 10497
11fdf7f2
TL
10498 sdata->shard_lock.unlock();
10499 osd->service.maybe_inject_dispatch_delay();
10500 pg->lock();
10501 osd->service.maybe_inject_dispatch_delay();
10502 sdata->shard_lock.lock();
7c673cae 10503
11fdf7f2
TL
10504 auto q = sdata->pg_slots.find(token);
10505 if (q == sdata->pg_slots.end()) {
10506 // this can happen if we race with pg removal.
10507 dout(20) << __func__ << " slot " << token << " no longer there" << dendl;
10508 pg->unlock();
10509 sdata->shard_lock.unlock();
10510 handle_oncommits(oncommits);
10511 return;
10512 }
10513 slot = q->second.get();
10514 --slot->num_running;
7c673cae 10515
11fdf7f2
TL
10516 if (slot->to_process.empty()) {
10517 // raced with _wake_pg_slot or consume_map
10518 dout(20) << __func__ << " " << token
10519 << " nothing queued" << dendl;
7c673cae 10520 pg->unlock();
11fdf7f2
TL
10521 sdata->shard_lock.unlock();
10522 handle_oncommits(oncommits);
10523 return;
7c673cae 10524 }
11fdf7f2
TL
10525 if (requeue_seq != slot->requeue_seq) {
10526 dout(20) << __func__ << " " << token
10527 << " requeue_seq " << slot->requeue_seq << " > our "
10528 << requeue_seq << ", we raced with _wake_pg_slot"
10529 << dendl;
7c673cae 10530 pg->unlock();
11fdf7f2
TL
10531 sdata->shard_lock.unlock();
10532 handle_oncommits(oncommits);
10533 return;
7c673cae 10534 }
11fdf7f2
TL
10535 if (slot->pg != pg) {
10536 // this can happen if we race with pg removal.
10537 dout(20) << __func__ << " slot " << token << " no longer attached to "
10538 << pg << dendl;
7c673cae 10539 pg->unlock();
11fdf7f2 10540 goto retry_pg;
7c673cae 10541 }
7c673cae
FG
10542 }
10543
11fdf7f2
TL
10544 dout(20) << __func__ << " " << token
10545 << " to_process " << slot->to_process
10546 << " waiting " << slot->waiting
10547 << " waiting_peering " << slot->waiting_peering << dendl;
10548
10549 ThreadPool::TPHandle tp_handle(osd->cct, hb, timeout_interval,
10550 suicide_interval);
10551
7c673cae 10552 // take next item
11fdf7f2
TL
10553 auto qi = std::move(slot->to_process.front());
10554 slot->to_process.pop_front();
10555 dout(20) << __func__ << " " << qi << " pg " << pg << dendl;
10556 set<pair<spg_t,epoch_t>> new_children;
10557 OSDMapRef osdmap;
7c673cae 10558
11fdf7f2 10559 while (!pg) {
7c673cae 10560 // should this pg shard exist on this osd in this (or a later) epoch?
11fdf7f2
TL
10561 osdmap = sdata->shard_osdmap;
10562 const PGCreateInfo *create_info = qi.creates_pg();
10563 if (!slot->waiting_for_split.empty()) {
10564 dout(20) << __func__ << " " << token
10565 << " splitting " << slot->waiting_for_split << dendl;
10566 _add_slot_waiter(token, slot, std::move(qi));
10567 } else if (qi.get_map_epoch() > osdmap->get_epoch()) {
10568 dout(20) << __func__ << " " << token
10569 << " map " << qi.get_map_epoch() << " > "
10570 << osdmap->get_epoch() << dendl;
10571 _add_slot_waiter(token, slot, std::move(qi));
10572 } else if (qi.is_peering()) {
10573 if (!qi.peering_requires_pg()) {
10574 // for pg-less events, we run them under the ordering lock, since
10575 // we don't have the pg lock to keep them ordered.
10576 qi.run(osd, sdata, pg, tp_handle);
10577 } else if (osdmap->is_up_acting_osd_shard(token, osd->whoami)) {
10578 if (create_info) {
10579 if (create_info->by_mon &&
10580 osdmap->get_pg_acting_primary(token.pgid) != osd->whoami) {
10581 dout(20) << __func__ << " " << token
10582 << " no pg, no longer primary, ignoring mon create on "
10583 << qi << dendl;
10584 } else {
10585 dout(20) << __func__ << " " << token
10586 << " no pg, should create on " << qi << dendl;
10587 pg = osd->handle_pg_create_info(osdmap, create_info);
10588 if (pg) {
10589 // we created the pg! drop out and continue "normally"!
10590 sdata->_attach_pg(slot, pg.get());
10591 sdata->_wake_pg_slot(token, slot);
10592
10593 // identify split children between create epoch and shard epoch.
10594 osd->service.identify_splits_and_merges(
10595 pg->get_osdmap(), osdmap, pg->pg_id, &new_children, nullptr);
10596 sdata->_prime_splits(&new_children);
10597 // distribute remaining split children to other shards below!
10598 break;
10599 }
10600 dout(20) << __func__ << " ignored create on " << qi << dendl;
10601 }
10602 } else {
10603 dout(20) << __func__ << " " << token
10604 << " no pg, peering, !create, discarding " << qi << dendl;
10605 }
10606 } else {
10607 dout(20) << __func__ << " " << token
10608 << " no pg, peering, doesn't map here e" << osdmap->get_epoch()
10609 << ", discarding " << qi
10610 << dendl;
10611 }
10612 } else if (osdmap->is_up_acting_osd_shard(token, osd->whoami)) {
10613 dout(20) << __func__ << " " << token
10614 << " no pg, should exist e" << osdmap->get_epoch()
10615 << ", will wait on " << qi << dendl;
10616 _add_slot_waiter(token, slot, std::move(qi));
7c673cae 10617 } else {
11fdf7f2
TL
10618 dout(20) << __func__ << " " << token
10619 << " no pg, shouldn't exist e" << osdmap->get_epoch()
10620 << ", dropping " << qi << dendl;
7c673cae 10621 // share map with client?
9f95a23c
TL
10622 if (std::optional<OpRequestRef> _op = qi.maybe_get_op()) {
10623 osd->service.maybe_share_map((*_op)->get_req()->get_connection().get(),
10624 sdata->shard_osdmap,
10625 (*_op)->sent_epoch);
7c673cae 10626 }
11fdf7f2 10627 unsigned pushes_to_free = qi.get_reserved_pushes();
7c673cae 10628 if (pushes_to_free > 0) {
11fdf7f2 10629 sdata->shard_lock.unlock();
7c673cae 10630 osd->service.release_reserved_pushes(pushes_to_free);
11fdf7f2 10631 handle_oncommits(oncommits);
7c673cae
FG
10632 return;
10633 }
10634 }
11fdf7f2
TL
10635 sdata->shard_lock.unlock();
10636 handle_oncommits(oncommits);
7c673cae
FG
10637 return;
10638 }
11fdf7f2
TL
10639 if (qi.is_peering()) {
10640 OSDMapRef osdmap = sdata->shard_osdmap;
10641 if (qi.get_map_epoch() > osdmap->get_epoch()) {
10642 _add_slot_waiter(token, slot, std::move(qi));
10643 sdata->shard_lock.unlock();
10644 pg->unlock();
10645 handle_oncommits(oncommits);
10646 return;
10647 }
10648 }
10649 sdata->shard_lock.unlock();
7c673cae 10650
11fdf7f2
TL
10651 if (!new_children.empty()) {
10652 for (auto shard : osd->shards) {
10653 shard->prime_splits(osdmap, &new_children);
10654 }
10655 ceph_assert(new_children.empty());
10656 }
7c673cae
FG
10657
10658 // osd_opwq_process marks the point at which an operation has been dequeued
10659 // and will begin to be handled by a worker thread.
10660 {
10661#ifdef WITH_LTTNG
10662 osd_reqid_t reqid;
9f95a23c 10663 if (std::optional<OpRequestRef> _op = qi.maybe_get_op()) {
7c673cae
FG
10664 reqid = (*_op)->get_reqid();
10665 }
10666#endif
10667 tracepoint(osd, opwq_process_start, reqid.name._type,
10668 reqid.name._num, reqid.tid, reqid.inc);
10669 }
10670
10671 lgeneric_subdout(osd->cct, osd, 30) << "dequeue status: ";
10672 Formatter *f = Formatter::create("json");
10673 f->open_object_section("q");
10674 dump(f);
10675 f->close_section();
10676 f->flush(*_dout);
10677 delete f;
10678 *_dout << dendl;
10679
11fdf7f2 10680 qi.run(osd, sdata, pg, tp_handle);
7c673cae
FG
10681
10682 {
10683#ifdef WITH_LTTNG
10684 osd_reqid_t reqid;
9f95a23c 10685 if (std::optional<OpRequestRef> _op = qi.maybe_get_op()) {
7c673cae
FG
10686 reqid = (*_op)->get_reqid();
10687 }
10688#endif
10689 tracepoint(osd, opwq_process_finish, reqid.name._type,
10690 reqid.name._num, reqid.tid, reqid.inc);
10691 }
10692
11fdf7f2 10693 handle_oncommits(oncommits);
7c673cae
FG
10694}
10695
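// Queue an item on the shard that owns its ordering token; only wake a worker
// if the shard's scheduler was empty, since otherwise a worker should already
// be draining it.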
9f95a23c 10696void OSD::ShardedOpWQ::_enqueue(OpSchedulerItem&& item) {
7c673cae 10697 uint32_t shard_index =
11fdf7f2 10698 item.get_ordering_token().hash_to_shard(osd->shards.size());
7c673cae 10699
9f95a23c
TL
10700 dout(20) << __func__ << " " << item << dendl;
10701
11fdf7f2 10702 OSDShard* sdata = osd->shards[shard_index];
7c673cae 10703 assert (NULL != sdata);
7c673cae 10704
9f95a23c
TL
10705 bool empty = true;
10706 {
10707 std::lock_guard l{sdata->shard_lock};
10708 empty = sdata->scheduler->empty();
10709 sdata->scheduler->enqueue(std::move(item));
10710 }
7c673cae 10711
9f95a23c
TL
10712 if (empty) {
10713 std::lock_guard l{sdata->sdata_wait_lock};
10714 sdata->sdata_cond.notify_one();
10715 }
7c673cae
FG
10716}
10717
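// Requeue an item at the front of its shard. If _process has already moved
// newer items onto the slot's to_process list, the requeued (older) item is
// placed at the front of that list and the newest to_process item is pushed
// back to the scheduler instead, preserving per-PG ordering.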
9f95a23c 10718void OSD::ShardedOpWQ::_enqueue_front(OpSchedulerItem&& item)
7c673cae 10719{
11fdf7f2
TL
10720 auto shard_index = item.get_ordering_token().hash_to_shard(osd->shards.size());
10721 auto& sdata = osd->shards[shard_index];
10722 ceph_assert(sdata);
10723 sdata->shard_lock.lock();
10724 auto p = sdata->pg_slots.find(item.get_ordering_token());
10725 if (p != sdata->pg_slots.end() &&
10726 !p->second->to_process.empty()) {
7c673cae 10727 // we may be racing with _process, which has dequeued a new item
9f95a23c 10728 // from scheduler, put it on to_process, and is now busy taking the
7c673cae
FG
10729 // pg lock. ensure this old requeued item is ordered before any
10730 // such newer item in to_process.
11fdf7f2
TL
10731 p->second->to_process.push_front(std::move(item));
10732 item = std::move(p->second->to_process.back());
10733 p->second->to_process.pop_back();
10734 dout(20) << __func__
10735 << " " << p->second->to_process.front()
10736 << " shuffled w/ " << item << dendl;
7c673cae 10737 } else {
11fdf7f2 10738 dout(20) << __func__ << " " << item << dendl;
7c673cae 10739 }
9f95a23c 10740 sdata->scheduler->enqueue_front(std::move(item));
11fdf7f2
TL
10741 sdata->shard_lock.unlock();
10742 std::lock_guard l{sdata->sdata_wait_lock};
10743 sdata->sdata_cond.notify_one();
7c673cae
FG
10744}
10745
10746namespace ceph {
10747namespace osd_cmds {
10748
11fdf7f2
TL
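// Handler for the OSD "heap" command: forwards the requested sub-command
// (plus optional value) to the tcmalloc heap profiler. Something like
// `ceph tell osd.0 heap stats` would be routed here.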
10749int heap(CephContext& cct, const cmdmap_t& cmdmap, Formatter& f,
10750 std::ostream& os)
7c673cae
FG
10751{
10752 if (!ceph_using_tcmalloc()) {
10753 os << "could not issue heap profiler command -- not using tcmalloc!";
10754 return -EOPNOTSUPP;
10755 }
10756
10757 string cmd;
9f95a23c 10758 if (!cmd_getval(cmdmap, "heapcmd", cmd)) {
7c673cae
FG
10759 os << "unable to get value for command \"" << cmd << "\"";
10760 return -EINVAL;
11fdf7f2 10761 }
7c673cae
FG
10762
10763 std::vector<std::string> cmd_vec;
10764 get_str_vec(cmd, cmd_vec);
11fdf7f2
TL
10765
10766 string val;
9f95a23c 10767 if (cmd_getval(cmdmap, "value", val)) {
11fdf7f2
TL
10768 cmd_vec.push_back(val);
10769 }
7c673cae
FG
10770
10771 ceph_heap_profiler_handle_command(cmd_vec, os);
10772
10773 return 0;
10774}
10775
10776}} // namespace ceph::osd_cmds