]> git.proxmox.com Git - ceph.git/blame - ceph/src/osd/OSD.cc
import ceph 15.2.14
[ceph.git] / ceph / src / osd / OSD.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2017 OVH
8 *
9 * This is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License version 2.1, as published by the Free Software
12 * Foundation. See file COPYING.
13 *
14 */
11fdf7f2 15
7c673cae 16#include "acconfig.h"
11fdf7f2
TL
17
18#include <cctype>
7c673cae
FG
19#include <fstream>
20#include <iostream>
11fdf7f2
TL
21#include <iterator>
22
23#include <unistd.h>
7c673cae
FG
24#include <sys/stat.h>
25#include <signal.h>
eafe8130 26#include <time.h>
7c673cae 27#include <boost/scoped_ptr.hpp>
eafe8130 28#include <boost/range/adaptor/reversed.hpp>
7c673cae
FG
29
30#ifdef HAVE_SYS_PARAM_H
31#include <sys/param.h>
32#endif
33
34#ifdef HAVE_SYS_MOUNT_H
35#include <sys/mount.h>
36#endif
37
38#include "osd/PG.h"
39
40#include "include/types.h"
41#include "include/compat.h"
11fdf7f2 42#include "include/random.h"
7c673cae
FG
43
44#include "OSD.h"
45#include "OSDMap.h"
46#include "Watch.h"
47#include "osdc/Objecter.h"
48
49#include "common/errno.h"
50#include "common/ceph_argparse.h"
9f95a23c 51#include "common/ceph_releases.h"
224ce89b 52#include "common/ceph_time.h"
7c673cae 53#include "common/version.h"
b5b8bbf5 54#include "common/pick_address.h"
11fdf7f2
TL
55#include "common/blkdev.h"
56#include "common/numa.h"
7c673cae
FG
57
58#include "os/ObjectStore.h"
59#ifdef HAVE_LIBFUSE
60#include "os/FuseStore.h"
61#endif
62
63#include "PrimaryLogPG.h"
64
7c673cae
FG
65#include "msg/Messenger.h"
66#include "msg/Message.h"
67
68#include "mon/MonClient.h"
69
70#include "messages/MLog.h"
71
72#include "messages/MGenericMessage.h"
7c673cae
FG
73#include "messages/MOSDPing.h"
74#include "messages/MOSDFailure.h"
75#include "messages/MOSDMarkMeDown.h"
9f95a23c 76#include "messages/MOSDMarkMeDead.h"
7c673cae
FG
77#include "messages/MOSDFull.h"
78#include "messages/MOSDOp.h"
79#include "messages/MOSDOpReply.h"
80#include "messages/MOSDBackoff.h"
81#include "messages/MOSDBeacon.h"
82#include "messages/MOSDRepOp.h"
83#include "messages/MOSDRepOpReply.h"
84#include "messages/MOSDBoot.h"
85#include "messages/MOSDPGTemp.h"
11fdf7f2 86#include "messages/MOSDPGReadyToMerge.h"
7c673cae
FG
87
88#include "messages/MOSDMap.h"
89#include "messages/MMonGetOSDMap.h"
90#include "messages/MOSDPGNotify.h"
9f95a23c 91#include "messages/MOSDPGNotify2.h"
7c673cae 92#include "messages/MOSDPGQuery.h"
9f95a23c 93#include "messages/MOSDPGQuery2.h"
7c673cae
FG
94#include "messages/MOSDPGLog.h"
95#include "messages/MOSDPGRemove.h"
96#include "messages/MOSDPGInfo.h"
9f95a23c 97#include "messages/MOSDPGInfo2.h"
7c673cae 98#include "messages/MOSDPGCreate.h"
11fdf7f2 99#include "messages/MOSDPGCreate2.h"
7c673cae 100#include "messages/MOSDPGScan.h"
7c673cae
FG
101#include "messages/MBackfillReserve.h"
102#include "messages/MRecoveryReserve.h"
c07f9fc5 103#include "messages/MOSDForceRecovery.h"
7c673cae
FG
104#include "messages/MOSDECSubOpWrite.h"
105#include "messages/MOSDECSubOpWriteReply.h"
106#include "messages/MOSDECSubOpRead.h"
107#include "messages/MOSDECSubOpReadReply.h"
108#include "messages/MOSDPGCreated.h"
109#include "messages/MOSDPGUpdateLogMissing.h"
110#include "messages/MOSDPGUpdateLogMissingReply.h"
111
11fdf7f2
TL
112#include "messages/MOSDPeeringOp.h"
113
7c673cae
FG
114#include "messages/MOSDAlive.h"
115
116#include "messages/MOSDScrub.h"
11fdf7f2 117#include "messages/MOSDScrub2.h"
7c673cae
FG
118#include "messages/MOSDRepScrub.h"
119
7c673cae
FG
120#include "messages/MCommand.h"
121#include "messages/MCommandReply.h"
122
123#include "messages/MPGStats.h"
124#include "messages/MPGStatsAck.h"
125
126#include "messages/MWatchNotify.h"
127#include "messages/MOSDPGPush.h"
128#include "messages/MOSDPGPushReply.h"
129#include "messages/MOSDPGPull.h"
130
9f95a23c
TL
131#include "messages/MMonGetPurgedSnaps.h"
132#include "messages/MMonGetPurgedSnapsReply.h"
133
7c673cae
FG
134#include "common/perf_counters.h"
135#include "common/Timer.h"
136#include "common/LogClient.h"
137#include "common/AsyncReserver.h"
138#include "common/HeartbeatMap.h"
139#include "common/admin_socket.h"
140#include "common/ceph_context.h"
141
142#include "global/signal_handler.h"
143#include "global/pidfile.h"
144
145#include "include/color.h"
146#include "perfglue/cpu_profiler.h"
147#include "perfglue/heap_profiler.h"
148
149#include "osd/OpRequest.h"
150
151#include "auth/AuthAuthorizeHandler.h"
152#include "auth/RotatingKeyRing.h"
7c673cae
FG
153
154#include "objclass/objclass.h"
155
156#include "common/cmdparse.h"
157#include "include/str_list.h"
158#include "include/util.h"
159
11fdf7f2 160#include "include/ceph_assert.h"
7c673cae
FG
161#include "common/config.h"
162#include "common/EventTrace.h"
163
11fdf7f2
TL
164#include "json_spirit/json_spirit_reader.h"
165#include "json_spirit/json_spirit_writer.h"
166
7c673cae
FG
167#ifdef WITH_LTTNG
168#define TRACEPOINT_DEFINE
169#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
170#include "tracing/osd.h"
171#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
172#undef TRACEPOINT_DEFINE
173#else
174#define tracepoint(...)
175#endif
176
177#define dout_context cct
178#define dout_subsys ceph_subsys_osd
179#undef dout_prefix
180#define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
181
9f95a23c
TL
182using namespace ceph::osd::scheduler;
183using TOPNSPC::common::cmd_getval;
224ce89b 184
7c673cae
FG
185static ostream& _prefix(std::ostream* _dout, int whoami, epoch_t epoch) {
186 return *_dout << "osd." << whoami << " " << epoch << " ";
187}
188
7c673cae
FG
189//Initial features in new superblock.
190//Features here are also automatically upgraded
191CompatSet OSD::get_osd_initial_compat_set() {
192 CompatSet::FeatureSet ceph_osd_feature_compat;
193 CompatSet::FeatureSet ceph_osd_feature_ro_compat;
194 CompatSet::FeatureSet ceph_osd_feature_incompat;
195 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
196 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGINFO);
197 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_OLOC);
198 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEC);
199 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES);
200 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL);
201 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BIGINFO);
202 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO);
203 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG);
204 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER);
205 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HINTS);
206 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGMETA);
207 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_MISSING);
208 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_FASTINFO);
c07f9fc5 209 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_RECOVERY_DELETES);
9f95a23c 210 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER2);
7c673cae
FG
211 return CompatSet(ceph_osd_feature_compat, ceph_osd_feature_ro_compat,
212 ceph_osd_feature_incompat);
213}
214
215//Features are added here that this OSD supports.
216CompatSet OSD::get_osd_compat_set() {
217 CompatSet compat = get_osd_initial_compat_set();
218 //Any features here can be set in code, but not in initial superblock
219 compat.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
220 return compat;
221}
222
223OSDService::OSDService(OSD *osd) :
224 osd(osd),
225 cct(osd->cct),
7c673cae
FG
226 whoami(osd->whoami), store(osd->store),
227 log_client(osd->log_client), clog(osd->clog),
228 pg_recovery_stats(osd->pg_recovery_stats),
229 cluster_messenger(osd->cluster_messenger),
230 client_messenger(osd->client_messenger),
231 logger(osd->logger),
232 recoverystate_perf(osd->recoverystate_perf),
233 monc(osd->monc),
11fdf7f2
TL
234 osd_max_object_size(cct->_conf, "osd_max_object_size"),
235 osd_skip_data_digest(cct->_conf, "osd_skip_data_digest"),
236 publish_lock{ceph::make_mutex("OSDService::publish_lock")},
237 pre_publish_lock{ceph::make_mutex("OSDService::pre_publish_lock")},
7c673cae 238 max_oldest_map(0),
eafe8130
TL
239 scrubs_local(0),
240 scrubs_remote(0),
7c673cae
FG
241 agent_valid_iterator(false),
242 agent_ops(0),
243 flush_mode_high_count(0),
244 agent_active(true),
245 agent_thread(this),
246 agent_stop_flag(false),
7c673cae
FG
247 agent_timer(osd->client_messenger->cct, agent_timer_lock),
248 last_recalibrate(ceph_clock_now()),
249 promote_max_objects(0),
250 promote_max_bytes(0),
9f95a23c
TL
251 objecter(make_unique<Objecter>(osd->client_messenger->cct,
252 osd->objecter_messenger,
f91f0fd5 253 osd->monc, nullptr)),
11fdf7f2 254 m_objecter_finishers(cct->_conf->osd_objecter_finishers),
7c673cae
FG
255 watch_timer(osd->client_messenger->cct, watch_lock),
256 next_notif_id(0),
7c673cae 257 recovery_request_timer(cct, recovery_request_lock, false),
11fdf7f2 258 sleep_timer(cct, sleep_lock, false),
7c673cae 259 reserver_finisher(cct),
3efd9988 260 local_reserver(cct, &reserver_finisher, cct->_conf->osd_max_backfills,
7c673cae 261 cct->_conf->osd_min_recovery_priority),
3efd9988 262 remote_reserver(cct, &reserver_finisher, cct->_conf->osd_max_backfills,
7c673cae 263 cct->_conf->osd_min_recovery_priority),
3efd9988 264 snap_reserver(cct, &reserver_finisher,
7c673cae 265 cct->_conf->osd_max_trimming_pgs),
7c673cae
FG
266 recovery_ops_active(0),
267 recovery_ops_reserved(0),
268 recovery_paused(false),
7c673cae
FG
269 map_cache(cct, cct->_conf->osd_map_cache_size),
270 map_bl_cache(cct->_conf->osd_map_cache_size),
271 map_bl_inc_cache(cct->_conf->osd_map_cache_size),
7c673cae 272 cur_state(NONE),
11fdf7f2 273 cur_ratio(0), physical_ratio(0),
9f95a23c 274 boot_epoch(0), up_epoch(0), bind_epoch(0)
7c673cae
FG
275{
276 objecter->init();
11fdf7f2
TL
277
278 for (int i = 0; i < m_objecter_finishers; i++) {
279 ostringstream str;
280 str << "objecter-finisher-" << i;
9f95a23c
TL
281 auto fin = make_unique<Finisher>(osd->client_messenger->cct, str.str(), "finisher");
282 objecter_finishers.push_back(std::move(fin));
11fdf7f2 283 }
7c673cae
FG
284}
285
31f18b77
FG
286#ifdef PG_DEBUG_REFS
287void OSDService::add_pgid(spg_t pgid, PG *pg){
11fdf7f2 288 std::lock_guard l(pgid_lock);
31f18b77
FG
289 if (!pgid_tracker.count(pgid)) {
290 live_pgs[pgid] = pg;
291 }
292 pgid_tracker[pgid]++;
293}
294void OSDService::remove_pgid(spg_t pgid, PG *pg)
295{
11fdf7f2
TL
296 std::lock_guard l(pgid_lock);
297 ceph_assert(pgid_tracker.count(pgid));
298 ceph_assert(pgid_tracker[pgid] > 0);
31f18b77
FG
299 pgid_tracker[pgid]--;
300 if (pgid_tracker[pgid] == 0) {
301 pgid_tracker.erase(pgid);
302 live_pgs.erase(pgid);
303 }
304}
305void OSDService::dump_live_pgids()
306{
11fdf7f2 307 std::lock_guard l(pgid_lock);
31f18b77
FG
308 derr << "live pgids:" << dendl;
309 for (map<spg_t, int>::const_iterator i = pgid_tracker.cbegin();
310 i != pgid_tracker.cend();
311 ++i) {
312 derr << "\t" << *i << dendl;
313 live_pgs[i->first]->dump_live_ids();
314 }
315}
316#endif
317
318
9f95a23c
TL
319ceph::signedspan OSDService::get_mnow()
320{
321 return ceph::mono_clock::now() - osd->startup_time;
322}
7c673cae 323
11fdf7f2
TL
324void OSDService::identify_splits_and_merges(
325 OSDMapRef old_map,
326 OSDMapRef new_map,
327 spg_t pgid,
328 set<pair<spg_t,epoch_t>> *split_children,
329 set<pair<spg_t,epoch_t>> *merge_pgs)
7c673cae 330{
11fdf7f2 331 if (!old_map->have_pg_pool(pgid.pool())) {
7c673cae 332 return;
7c673cae 333 }
7c673cae 334 int old_pgnum = old_map->get_pg_num(pgid.pool());
11fdf7f2
TL
335 auto p = osd->pg_num_history.pg_nums.find(pgid.pool());
336 if (p == osd->pg_num_history.pg_nums.end()) {
337 return;
338 }
339 dout(20) << __func__ << " " << pgid << " e" << old_map->get_epoch()
340 << " to e" << new_map->get_epoch()
341 << " pg_nums " << p->second << dendl;
342 deque<spg_t> queue;
343 queue.push_back(pgid);
eafe8130 344 set<spg_t> did;
11fdf7f2
TL
345 while (!queue.empty()) {
346 auto cur = queue.front();
347 queue.pop_front();
eafe8130 348 did.insert(cur);
11fdf7f2
TL
349 unsigned pgnum = old_pgnum;
350 for (auto q = p->second.lower_bound(old_map->get_epoch());
351 q != p->second.end() &&
352 q->first <= new_map->get_epoch();
353 ++q) {
354 if (pgnum < q->second) {
355 // split?
356 if (cur.ps() < pgnum) {
357 set<spg_t> children;
358 if (cur.is_split(pgnum, q->second, &children)) {
359 dout(20) << __func__ << " " << cur << " e" << q->first
360 << " pg_num " << pgnum << " -> " << q->second
361 << " children " << children << dendl;
362 for (auto i : children) {
363 split_children->insert(make_pair(i, q->first));
eafe8130
TL
364 if (!did.count(i))
365 queue.push_back(i);
11fdf7f2
TL
366 }
367 }
368 } else if (cur.ps() < q->second) {
369 dout(20) << __func__ << " " << cur << " e" << q->first
370 << " pg_num " << pgnum << " -> " << q->second
371 << " is a child" << dendl;
372 // normally we'd capture this from the parent, but it's
373 // possible the parent doesn't exist yet (it will be
374 // fabricated to allow an intervening merge). note this PG
375 // as a split child here to be sure we catch it.
376 split_children->insert(make_pair(cur, q->first));
377 } else {
378 dout(20) << __func__ << " " << cur << " e" << q->first
379 << " pg_num " << pgnum << " -> " << q->second
380 << " is post-split, skipping" << dendl;
381 }
382 } else if (merge_pgs) {
383 // merge?
384 if (cur.ps() >= q->second) {
385 if (cur.ps() < pgnum) {
386 spg_t parent;
387 if (cur.is_merge_source(pgnum, q->second, &parent)) {
388 set<spg_t> children;
389 parent.is_split(q->second, pgnum, &children);
390 dout(20) << __func__ << " " << cur << " e" << q->first
391 << " pg_num " << pgnum << " -> " << q->second
392 << " is merge source, target " << parent
393 << ", source(s) " << children << dendl;
394 merge_pgs->insert(make_pair(parent, q->first));
eafe8130
TL
395 if (!did.count(parent)) {
396 // queue (and re-scan) parent in case it might not exist yet
397 // and there are some future splits pending on it
398 queue.push_back(parent);
399 }
11fdf7f2
TL
400 for (auto c : children) {
401 merge_pgs->insert(make_pair(c, q->first));
eafe8130
TL
402 if (!did.count(c))
403 queue.push_back(c);
11fdf7f2
TL
404 }
405 }
406 } else {
407 dout(20) << __func__ << " " << cur << " e" << q->first
408 << " pg_num " << pgnum << " -> " << q->second
409 << " is beyond old pgnum, skipping" << dendl;
410 }
411 } else {
412 set<spg_t> children;
413 if (cur.is_split(q->second, pgnum, &children)) {
414 dout(20) << __func__ << " " << cur << " e" << q->first
415 << " pg_num " << pgnum << " -> " << q->second
416 << " is merge target, source " << children << dendl;
417 for (auto c : children) {
418 merge_pgs->insert(make_pair(c, q->first));
eafe8130
TL
419 if (!did.count(c))
420 queue.push_back(c);
11fdf7f2
TL
421 }
422 merge_pgs->insert(make_pair(cur, q->first));
423 }
7c673cae
FG
424 }
425 }
11fdf7f2 426 pgnum = q->second;
7c673cae
FG
427 }
428 }
429}
430
7c673cae
FG
431void OSDService::need_heartbeat_peer_update()
432{
433 osd->need_heartbeat_peer_update();
434}
435
9f95a23c
TL
436HeartbeatStampsRef OSDService::get_hb_stamps(unsigned peer)
437{
438 std::lock_guard l(hb_stamp_lock);
439 if (peer >= hb_stamps.size()) {
440 hb_stamps.resize(peer + 1);
441 }
442 if (!hb_stamps[peer]) {
443 hb_stamps[peer] = ceph::make_ref<HeartbeatStamps>(peer);
444 }
445 return hb_stamps[peer];
446}
447
448void OSDService::queue_renew_lease(epoch_t epoch, spg_t spgid)
449{
450 osd->enqueue_peering_evt(
451 spgid,
452 PGPeeringEventRef(
453 std::make_shared<PGPeeringEvent>(
454 epoch, epoch,
455 RenewLease())));
456}
457
7c673cae
FG
458void OSDService::start_shutdown()
459{
460 {
11fdf7f2 461 std::lock_guard l(agent_timer_lock);
7c673cae
FG
462 agent_timer.shutdown();
463 }
31f18b77
FG
464
465 {
11fdf7f2
TL
466 std::lock_guard l(sleep_lock);
467 sleep_timer.shutdown();
31f18b77 468 }
81eedcae
TL
469
470 {
471 std::lock_guard l(recovery_request_lock);
472 recovery_request_timer.shutdown();
473 }
7c673cae
FG
474}
475
31f18b77 476void OSDService::shutdown_reserver()
7c673cae
FG
477{
478 reserver_finisher.wait_for_empty();
479 reserver_finisher.stop();
31f18b77
FG
480}
481
482void OSDService::shutdown()
483{
9f95a23c
TL
484 mono_timer.suspend();
485
7c673cae 486 {
11fdf7f2 487 std::lock_guard l(watch_lock);
7c673cae
FG
488 watch_timer.shutdown();
489 }
490
491 objecter->shutdown();
9f95a23c 492 for (auto& f : objecter_finishers) {
11fdf7f2
TL
493 f->wait_for_empty();
494 f->stop();
7c673cae
FG
495 }
496
11fdf7f2 497 publish_map(OSDMapRef());
7c673cae
FG
498 next_osdmap = OSDMapRef();
499}
500
501void OSDService::init()
502{
503 reserver_finisher.start();
9f95a23c 504 for (auto& f : objecter_finishers) {
11fdf7f2
TL
505 f->start();
506 }
7c673cae
FG
507 objecter->set_client_incarnation(0);
508
509 // deprioritize objecter in daemonperf output
510 objecter->get_logger()->set_prio_adjust(-3);
511
512 watch_timer.init();
513 agent_timer.init();
9f95a23c 514 mono_timer.resume();
7c673cae
FG
515
516 agent_thread.create("osd_srv_agent");
517
518 if (cct->_conf->osd_recovery_delay_start)
519 defer_recovery(cct->_conf->osd_recovery_delay_start);
520}
521
522void OSDService::final_init()
523{
524 objecter->start(osdmap.get());
525}
526
527void OSDService::activate_map()
528{
529 // wake/unwake the tiering agent
9f95a23c 530 std::lock_guard l{agent_lock};
7c673cae
FG
531 agent_active =
532 !osdmap->test_flag(CEPH_OSDMAP_NOTIERAGENT) &&
533 osd->is_active();
9f95a23c 534 agent_cond.notify_all();
7c673cae
FG
535}
536
181888fb
FG
537void OSDService::request_osdmap_update(epoch_t e)
538{
539 osd->osdmap_subscribe(e, false);
540}
541
9f95a23c 542
7c673cae
FG
543class AgentTimeoutCB : public Context {
544 PGRef pg;
545public:
546 explicit AgentTimeoutCB(PGRef _pg) : pg(_pg) {}
547 void finish(int) override {
548 pg->agent_choose_mode_restart();
549 }
550};
551
552void OSDService::agent_entry()
553{
554 dout(10) << __func__ << " start" << dendl;
9f95a23c 555 std::unique_lock agent_locker{agent_lock};
7c673cae
FG
556
557 while (!agent_stop_flag) {
558 if (agent_queue.empty()) {
559 dout(20) << __func__ << " empty queue" << dendl;
9f95a23c 560 agent_cond.wait(agent_locker);
7c673cae
FG
561 continue;
562 }
563 uint64_t level = agent_queue.rbegin()->first;
564 set<PGRef>& top = agent_queue.rbegin()->second;
565 dout(10) << __func__
566 << " tiers " << agent_queue.size()
567 << ", top is " << level
568 << " with pgs " << top.size()
569 << ", ops " << agent_ops << "/"
570 << cct->_conf->osd_agent_max_ops
571 << (agent_active ? " active" : " NOT ACTIVE")
572 << dendl;
573 dout(20) << __func__ << " oids " << agent_oids << dendl;
574 int max = cct->_conf->osd_agent_max_ops - agent_ops;
575 int agent_flush_quota = max;
576 if (!flush_mode_high_count)
577 agent_flush_quota = cct->_conf->osd_agent_max_low_ops - agent_ops;
578 if (agent_flush_quota <= 0 || top.empty() || !agent_active) {
9f95a23c 579 agent_cond.wait(agent_locker);
7c673cae
FG
580 continue;
581 }
582
583 if (!agent_valid_iterator || agent_queue_pos == top.end()) {
584 agent_queue_pos = top.begin();
585 agent_valid_iterator = true;
586 }
587 PGRef pg = *agent_queue_pos;
588 dout(10) << "high_count " << flush_mode_high_count
589 << " agent_ops " << agent_ops
590 << " flush_quota " << agent_flush_quota << dendl;
9f95a23c 591 agent_locker.unlock();
7c673cae 592 if (!pg->agent_work(max, agent_flush_quota)) {
11fdf7f2 593 dout(10) << __func__ << " " << pg->pg_id
7c673cae
FG
594 << " no agent_work, delay for " << cct->_conf->osd_agent_delay_time
595 << " seconds" << dendl;
596
597 osd->logger->inc(l_osd_tier_delay);
598 // Queue a timer to call agent_choose_mode for this pg in 5 seconds
9f95a23c 599 std::lock_guard timer_locker{agent_timer_lock};
7c673cae
FG
600 Context *cb = new AgentTimeoutCB(pg);
601 agent_timer.add_event_after(cct->_conf->osd_agent_delay_time, cb);
7c673cae 602 }
9f95a23c 603 agent_locker.lock();
7c673cae 604 }
7c673cae
FG
605 dout(10) << __func__ << " finish" << dendl;
606}
607
608void OSDService::agent_stop()
609{
610 {
11fdf7f2 611 std::lock_guard l(agent_lock);
7c673cae
FG
612
613 // By this time all ops should be cancelled
11fdf7f2 614 ceph_assert(agent_ops == 0);
7c673cae
FG
615 // By this time all PGs are shutdown and dequeued
616 if (!agent_queue.empty()) {
617 set<PGRef>& top = agent_queue.rbegin()->second;
11fdf7f2
TL
618 derr << "agent queue not empty, for example " << (*top.begin())->get_pgid() << dendl;
619 ceph_abort_msg("agent queue not empty");
7c673cae
FG
620 }
621
622 agent_stop_flag = true;
9f95a23c 623 agent_cond.notify_all();
7c673cae
FG
624 }
625 agent_thread.join();
626}
627
628// -------------------------------------
629
630void OSDService::promote_throttle_recalibrate()
631{
632 utime_t now = ceph_clock_now();
633 double dur = now - last_recalibrate;
634 last_recalibrate = now;
635 unsigned prob = promote_probability_millis;
636
637 uint64_t target_obj_sec = cct->_conf->osd_tier_promote_max_objects_sec;
638 uint64_t target_bytes_sec = cct->_conf->osd_tier_promote_max_bytes_sec;
639
640 unsigned min_prob = 1;
641
642 uint64_t attempts, obj, bytes;
643 promote_counter.sample_and_attenuate(&attempts, &obj, &bytes);
644 dout(10) << __func__ << " " << attempts << " attempts, promoted "
1adf2230 645 << obj << " objects and " << byte_u_t(bytes) << "; target "
7c673cae 646 << target_obj_sec << " obj/sec or "
1adf2230 647 << byte_u_t(target_bytes_sec) << "/sec"
7c673cae
FG
648 << dendl;
649
650 // calculate what the probability *should* be, given the targets
651 unsigned new_prob;
652 if (attempts && dur > 0) {
653 uint64_t avg_size = 1;
654 if (obj)
11fdf7f2 655 avg_size = std::max<uint64_t>(bytes / obj, 1);
7c673cae
FG
656 unsigned po = (double)target_obj_sec * dur * 1000.0 / (double)attempts;
657 unsigned pb = (double)target_bytes_sec / (double)avg_size * dur * 1000.0
658 / (double)attempts;
659 dout(20) << __func__ << " po " << po << " pb " << pb << " avg_size "
660 << avg_size << dendl;
661 if (target_obj_sec && target_bytes_sec)
11fdf7f2 662 new_prob = std::min(po, pb);
7c673cae
FG
663 else if (target_obj_sec)
664 new_prob = po;
665 else if (target_bytes_sec)
666 new_prob = pb;
667 else
668 new_prob = 1000;
669 } else {
670 new_prob = 1000;
671 }
672 dout(20) << __func__ << " new_prob " << new_prob << dendl;
673
674 // correct for persistent skew between target rate and actual rate, adjust
675 double ratio = 1.0;
676 unsigned actual = 0;
677 if (attempts && obj) {
678 actual = obj * 1000 / attempts;
679 ratio = (double)actual / (double)prob;
680 new_prob = (double)new_prob / ratio;
681 }
11fdf7f2
TL
682 new_prob = std::max(new_prob, min_prob);
683 new_prob = std::min(new_prob, 1000u);
7c673cae
FG
684
685 // adjust
686 prob = (prob + new_prob) / 2;
11fdf7f2
TL
687 prob = std::max(prob, min_prob);
688 prob = std::min(prob, 1000u);
7c673cae
FG
689 dout(10) << __func__ << " actual " << actual
690 << ", actual/prob ratio " << ratio
691 << ", adjusted new_prob " << new_prob
692 << ", prob " << promote_probability_millis << " -> " << prob
693 << dendl;
694 promote_probability_millis = prob;
695
696 // set hard limits for this interval to mitigate stampedes
91327a77
AA
697 promote_max_objects = target_obj_sec * osd->OSD_TICK_INTERVAL * 2;
698 promote_max_bytes = target_bytes_sec * osd->OSD_TICK_INTERVAL * 2;
7c673cae
FG
699}
700
701// -------------------------------------
702
703float OSDService::get_failsafe_full_ratio()
704{
705 float full_ratio = cct->_conf->osd_failsafe_full_ratio;
706 if (full_ratio > 1.0) full_ratio /= 100.0;
707 return full_ratio;
708}
709
11fdf7f2 710OSDService::s_names OSDService::recalc_full_state(float ratio, float pratio, string &inject)
7c673cae 711{
7c673cae
FG
712 // The OSDMap ratios take precendence. So if the failsafe is .95 and
713 // the admin sets the cluster full to .96, the failsafe moves up to .96
714 // too. (Not that having failsafe == full is ideal, but it's better than
715 // dropping writes before the clusters appears full.)
716 OSDMapRef osdmap = get_osdmap();
717 if (!osdmap || osdmap->get_epoch() == 0) {
11fdf7f2 718 return NONE;
7c673cae
FG
719 }
720 float nearfull_ratio = osdmap->get_nearfull_ratio();
721 float backfillfull_ratio = std::max(osdmap->get_backfillfull_ratio(), nearfull_ratio);
722 float full_ratio = std::max(osdmap->get_full_ratio(), backfillfull_ratio);
723 float failsafe_ratio = std::max(get_failsafe_full_ratio(), full_ratio);
724
9f95a23c 725 if (osdmap->require_osd_release < ceph_release_t::luminous) {
7c673cae
FG
726 // use the failsafe for nearfull and full; the mon isn't using the
727 // flags anyway because we're mid-upgrade.
728 full_ratio = failsafe_ratio;
729 backfillfull_ratio = failsafe_ratio;
730 nearfull_ratio = failsafe_ratio;
731 } else if (full_ratio <= 0 ||
732 backfillfull_ratio <= 0 ||
733 nearfull_ratio <= 0) {
734 derr << __func__ << " full_ratio, backfillfull_ratio or nearfull_ratio is <= 0" << dendl;
735 // use failsafe flag. ick. the monitor did something wrong or the user
736 // did something stupid.
737 full_ratio = failsafe_ratio;
738 backfillfull_ratio = failsafe_ratio;
739 nearfull_ratio = failsafe_ratio;
740 }
741
7c673cae 742 if (injectfull_state > NONE && injectfull) {
7c673cae 743 inject = "(Injected)";
11fdf7f2
TL
744 return injectfull_state;
745 } else if (pratio > failsafe_ratio) {
746 return FAILSAFE;
7c673cae 747 } else if (ratio > full_ratio) {
11fdf7f2 748 return FULL;
7c673cae 749 } else if (ratio > backfillfull_ratio) {
11fdf7f2 750 return BACKFILLFULL;
92f5a8d4 751 } else if (pratio > nearfull_ratio) {
11fdf7f2 752 return NEARFULL;
7c673cae 753 }
11fdf7f2
TL
754 return NONE;
755}
756
757void OSDService::check_full_status(float ratio, float pratio)
758{
759 std::lock_guard l(full_status_lock);
760
761 cur_ratio = ratio;
762 physical_ratio = pratio;
763
764 string inject;
765 s_names new_state;
766 new_state = recalc_full_state(ratio, pratio, inject);
767
7c673cae 768 dout(20) << __func__ << " cur ratio " << ratio
11fdf7f2 769 << ", physical ratio " << pratio
7c673cae
FG
770 << ", new state " << get_full_state_name(new_state)
771 << " " << inject
772 << dendl;
773
774 // warn
775 if (cur_state != new_state) {
776 dout(10) << __func__ << " " << get_full_state_name(cur_state)
777 << " -> " << get_full_state_name(new_state) << dendl;
778 if (new_state == FAILSAFE) {
c07f9fc5 779 clog->error() << "full status failsafe engaged, dropping updates, now "
7c673cae
FG
780 << (int)roundf(ratio * 100) << "% full";
781 } else if (cur_state == FAILSAFE) {
c07f9fc5
FG
782 clog->error() << "full status failsafe disengaged, no longer dropping "
783 << "updates, now " << (int)roundf(ratio * 100) << "% full";
7c673cae
FG
784 }
785 cur_state = new_state;
786 }
787}
788
789bool OSDService::need_fullness_update()
790{
791 OSDMapRef osdmap = get_osdmap();
792 s_names cur = NONE;
793 if (osdmap->exists(whoami)) {
794 if (osdmap->get_state(whoami) & CEPH_OSD_FULL) {
795 cur = FULL;
796 } else if (osdmap->get_state(whoami) & CEPH_OSD_BACKFILLFULL) {
797 cur = BACKFILLFULL;
798 } else if (osdmap->get_state(whoami) & CEPH_OSD_NEARFULL) {
799 cur = NEARFULL;
800 }
801 }
802 s_names want = NONE;
803 if (is_full())
804 want = FULL;
805 else if (is_backfillfull())
806 want = BACKFILLFULL;
807 else if (is_nearfull())
808 want = NEARFULL;
809 return want != cur;
810}
811
11fdf7f2 812bool OSDService::_check_inject_full(DoutPrefixProvider *dpp, s_names type) const
7c673cae 813{
7c673cae
FG
814 if (injectfull && injectfull_state >= type) {
815 // injectfull is either a count of the number of times to return failsafe full
816 // or if -1 then always return full
817 if (injectfull > 0)
818 --injectfull;
11fdf7f2
TL
819 ldpp_dout(dpp, 10) << __func__ << " Injected " << get_full_state_name(type) << " OSD ("
820 << (injectfull < 0 ? "set" : std::to_string(injectfull)) << ")"
821 << dendl;
7c673cae
FG
822 return true;
823 }
11fdf7f2
TL
824 return false;
825}
826
827bool OSDService::_check_full(DoutPrefixProvider *dpp, s_names type) const
828{
829 std::lock_guard l(full_status_lock);
830
831 if (_check_inject_full(dpp, type))
832 return true;
833
834 if (cur_state >= type)
835 ldpp_dout(dpp, 10) << __func__ << " current usage is " << cur_ratio
836 << " physical " << physical_ratio << dendl;
7c673cae 837
7c673cae
FG
838 return cur_state >= type;
839}
840
11fdf7f2
TL
841bool OSDService::_tentative_full(DoutPrefixProvider *dpp, s_names type, uint64_t adjust_used, osd_stat_t adjusted_stat)
842{
843 ldpp_dout(dpp, 20) << __func__ << " type " << get_full_state_name(type) << " adjust_used " << (adjust_used >> 10) << "KiB" << dendl;
844 {
845 std::lock_guard l(full_status_lock);
846 if (_check_inject_full(dpp, type)) {
847 return true;
848 }
849 }
850
851 float pratio;
852 float ratio = compute_adjusted_ratio(adjusted_stat, &pratio, adjust_used);
853
854 string notused;
855 s_names tentative_state = recalc_full_state(ratio, pratio, notused);
856
857 if (tentative_state >= type)
858 ldpp_dout(dpp, 10) << __func__ << " tentative usage is " << ratio << dendl;
859
860 return tentative_state >= type;
861}
862
863bool OSDService::check_failsafe_full(DoutPrefixProvider *dpp) const
864{
865 return _check_full(dpp, FAILSAFE);
866}
867
868bool OSDService::check_full(DoutPrefixProvider *dpp) const
7c673cae 869{
11fdf7f2 870 return _check_full(dpp, FULL);
7c673cae
FG
871}
872
11fdf7f2 873bool OSDService::tentative_backfill_full(DoutPrefixProvider *dpp, uint64_t adjust_used, osd_stat_t stats)
7c673cae 874{
11fdf7f2 875 return _tentative_full(dpp, BACKFILLFULL, adjust_used, stats);
7c673cae
FG
876}
877
11fdf7f2 878bool OSDService::check_backfill_full(DoutPrefixProvider *dpp) const
7c673cae 879{
11fdf7f2 880 return _check_full(dpp, BACKFILLFULL);
7c673cae
FG
881}
882
11fdf7f2 883bool OSDService::check_nearfull(DoutPrefixProvider *dpp) const
7c673cae 884{
11fdf7f2 885 return _check_full(dpp, NEARFULL);
7c673cae
FG
886}
887
888bool OSDService::is_failsafe_full() const
889{
11fdf7f2 890 std::lock_guard l(full_status_lock);
7c673cae
FG
891 return cur_state == FAILSAFE;
892}
893
894bool OSDService::is_full() const
895{
11fdf7f2 896 std::lock_guard l(full_status_lock);
7c673cae
FG
897 return cur_state >= FULL;
898}
899
900bool OSDService::is_backfillfull() const
901{
11fdf7f2 902 std::lock_guard l(full_status_lock);
7c673cae
FG
903 return cur_state >= BACKFILLFULL;
904}
905
906bool OSDService::is_nearfull() const
907{
11fdf7f2 908 std::lock_guard l(full_status_lock);
7c673cae
FG
909 return cur_state >= NEARFULL;
910}
911
912void OSDService::set_injectfull(s_names type, int64_t count)
913{
11fdf7f2 914 std::lock_guard l(full_status_lock);
7c673cae
FG
915 injectfull_state = type;
916 injectfull = count;
917}
918
11fdf7f2
TL
919void OSDService::set_statfs(const struct store_statfs_t &stbuf,
920 osd_alert_list_t& alerts)
7c673cae 921{
224ce89b 922 uint64_t bytes = stbuf.total;
224ce89b 923 uint64_t avail = stbuf.available;
11fdf7f2
TL
924 uint64_t used = stbuf.get_used_raw();
925
926 // For testing fake statfs values so it doesn't matter if all
927 // OSDs are using the same partition.
928 if (cct->_conf->fake_statfs_for_testing) {
929 uint64_t total_num_bytes = 0;
930 vector<PGRef> pgs;
931 osd->_get_pgs(&pgs);
932 for (auto p : pgs) {
933 total_num_bytes += p->get_stats_num_bytes();
934 }
935 bytes = cct->_conf->fake_statfs_for_testing;
936 if (total_num_bytes < bytes)
937 avail = bytes - total_num_bytes;
938 else
939 avail = 0;
940 dout(0) << __func__ << " fake total " << cct->_conf->fake_statfs_for_testing
941 << " adjust available " << avail
942 << dendl;
943 used = bytes - avail;
944 }
7c673cae 945
224ce89b
WB
946 osd->logger->set(l_osd_stat_bytes, bytes);
947 osd->logger->set(l_osd_stat_bytes_used, used);
948 osd->logger->set(l_osd_stat_bytes_avail, avail);
7c673cae 949
11fdf7f2
TL
950 std::lock_guard l(stat_lock);
951 osd_stat.statfs = stbuf;
952 osd_stat.os_alerts.clear();
953 osd_stat.os_alerts[whoami].swap(alerts);
954 if (cct->_conf->fake_statfs_for_testing) {
955 osd_stat.statfs.total = bytes;
956 osd_stat.statfs.available = avail;
957 // For testing don't want used to go negative, so clear reserved
958 osd_stat.statfs.internally_reserved = 0;
224ce89b
WB
959 }
960}
7c673cae 961
11fdf7f2
TL
962osd_stat_t OSDService::set_osd_stat(vector<int>& hb_peers,
963 int num_pgs)
224ce89b 964{
eafe8130
TL
965 utime_t now = ceph_clock_now();
966 auto stale_time = g_conf().get_val<int64_t>("osd_mon_heartbeat_stat_stale");
11fdf7f2
TL
967 std::lock_guard l(stat_lock);
968 osd_stat.hb_peers.swap(hb_peers);
969 osd->op_tracker.get_age_ms_histogram(&osd_stat.op_queue_age_hist);
970 osd_stat.num_pgs = num_pgs;
eafe8130
TL
971 // Clean entries that aren't updated
972 // This is called often enough that we can just remove 1 at a time
973 for (auto i: osd_stat.hb_pingtime) {
974 if (i.second.last_update == 0)
975 continue;
976 if (stale_time && now.sec() - i.second.last_update > stale_time) {
977 dout(20) << __func__ << " time out heartbeat for osd " << i.first
978 << " last_update " << i.second.last_update << dendl;
979 osd_stat.hb_pingtime.erase(i.first);
980 break;
981 }
982 }
11fdf7f2
TL
983 return osd_stat;
984}
985
986void OSDService::inc_osd_stat_repaired()
987{
988 std::lock_guard l(stat_lock);
989 osd_stat.num_shards_repaired++;
990 return;
991}
992
993float OSDService::compute_adjusted_ratio(osd_stat_t new_stat, float *pratio,
994 uint64_t adjust_used)
995{
996 *pratio =
3fec8b72 997 ((float)new_stat.statfs.get_used_raw()) / ((float)new_stat.statfs.total);
11fdf7f2
TL
998
999 if (adjust_used) {
1000 dout(20) << __func__ << " Before kb_used() " << new_stat.statfs.kb_used() << dendl;
1001 if (new_stat.statfs.available > adjust_used)
1002 new_stat.statfs.available -= adjust_used;
1003 else
1004 new_stat.statfs.available = 0;
1005 dout(20) << __func__ << " After kb_used() " << new_stat.statfs.kb_used() << dendl;
7c673cae
FG
1006 }
1007
11fdf7f2
TL
1008 // Check all pgs and adjust kb_used to include all pending backfill data
1009 int backfill_adjusted = 0;
1010 vector<PGRef> pgs;
1011 osd->_get_pgs(&pgs);
1012 for (auto p : pgs) {
1013 backfill_adjusted += p->pg_stat_adjust(&new_stat);
1014 }
1015 if (backfill_adjusted) {
1016 dout(20) << __func__ << " backfill adjusted " << new_stat << dendl;
1017 }
3fec8b72 1018 return ((float)new_stat.statfs.get_used_raw()) / ((float)new_stat.statfs.total);
7c673cae
FG
1019}
1020
7c673cae
FG
1021void OSDService::send_message_osd_cluster(int peer, Message *m, epoch_t from_epoch)
1022{
1023 OSDMapRef next_map = get_nextmap_reserved();
1024 // service map is always newer/newest
11fdf7f2 1025 ceph_assert(from_epoch <= next_map->get_epoch());
7c673cae
FG
1026
1027 if (next_map->is_down(peer) ||
1028 next_map->get_info(peer).up_from > from_epoch) {
1029 m->put();
1030 release_map(next_map);
1031 return;
1032 }
9f95a23c
TL
1033 ConnectionRef peer_con;
1034 if (peer == whoami) {
1035 peer_con = osd->cluster_messenger->get_loopback_connection();
1036 } else {
1037 peer_con = osd->cluster_messenger->connect_to_osd(
1038 next_map->get_cluster_addrs(peer), false, true);
1039 }
1040 maybe_share_map(peer_con.get(), next_map);
7c673cae
FG
1041 peer_con->send_message(m);
1042 release_map(next_map);
1043}
1044
9f95a23c
TL
1045void OSDService::send_message_osd_cluster(std::vector<std::pair<int, Message*>>& messages, epoch_t from_epoch)
1046{
1047 OSDMapRef next_map = get_nextmap_reserved();
1048 // service map is always newer/newest
1049 ceph_assert(from_epoch <= next_map->get_epoch());
1050
1051 for (auto& iter : messages) {
1052 if (next_map->is_down(iter.first) ||
1053 next_map->get_info(iter.first).up_from > from_epoch) {
1054 iter.second->put();
1055 continue;
1056 }
1057 ConnectionRef peer_con;
1058 if (iter.first == whoami) {
1059 peer_con = osd->cluster_messenger->get_loopback_connection();
1060 } else {
1061 peer_con = osd->cluster_messenger->connect_to_osd(
1062 next_map->get_cluster_addrs(iter.first), false, true);
1063 }
1064 maybe_share_map(peer_con.get(), next_map);
1065 peer_con->send_message(iter.second);
1066 }
1067 release_map(next_map);
1068}
7c673cae
FG
1069ConnectionRef OSDService::get_con_osd_cluster(int peer, epoch_t from_epoch)
1070{
1071 OSDMapRef next_map = get_nextmap_reserved();
1072 // service map is always newer/newest
11fdf7f2 1073 ceph_assert(from_epoch <= next_map->get_epoch());
7c673cae
FG
1074
1075 if (next_map->is_down(peer) ||
1076 next_map->get_info(peer).up_from > from_epoch) {
1077 release_map(next_map);
1078 return NULL;
1079 }
9f95a23c
TL
1080 ConnectionRef con;
1081 if (peer == whoami) {
1082 con = osd->cluster_messenger->get_loopback_connection();
1083 } else {
1084 con = osd->cluster_messenger->connect_to_osd(
1085 next_map->get_cluster_addrs(peer), false, true);
1086 }
7c673cae
FG
1087 release_map(next_map);
1088 return con;
1089}
1090
1091pair<ConnectionRef,ConnectionRef> OSDService::get_con_osd_hb(int peer, epoch_t from_epoch)
1092{
1093 OSDMapRef next_map = get_nextmap_reserved();
1094 // service map is always newer/newest
11fdf7f2 1095 ceph_assert(from_epoch <= next_map->get_epoch());
7c673cae
FG
1096
1097 pair<ConnectionRef,ConnectionRef> ret;
1098 if (next_map->is_down(peer) ||
1099 next_map->get_info(peer).up_from > from_epoch) {
1100 release_map(next_map);
1101 return ret;
1102 }
11fdf7f2
TL
1103 ret.first = osd->hb_back_client_messenger->connect_to_osd(
1104 next_map->get_hb_back_addrs(peer));
1105 ret.second = osd->hb_front_client_messenger->connect_to_osd(
1106 next_map->get_hb_front_addrs(peer));
7c673cae
FG
1107 release_map(next_map);
1108 return ret;
1109}
1110
11fdf7f2
TL
1111entity_name_t OSDService::get_cluster_msgr_name() const
1112{
1113 return cluster_messenger->get_myname();
1114}
7c673cae 1115
94b18763
FG
1116void OSDService::queue_want_pg_temp(pg_t pgid,
1117 const vector<int>& want,
1118 bool forced)
7c673cae 1119{
11fdf7f2 1120 std::lock_guard l(pg_temp_lock);
94b18763 1121 auto p = pg_temp_pending.find(pgid);
7c673cae 1122 if (p == pg_temp_pending.end() ||
94b18763
FG
1123 p->second.acting != want ||
1124 forced) {
11fdf7f2 1125 pg_temp_wanted[pgid] = {want, forced};
7c673cae
FG
1126 }
1127}
1128
1129void OSDService::remove_want_pg_temp(pg_t pgid)
1130{
11fdf7f2 1131 std::lock_guard l(pg_temp_lock);
7c673cae
FG
1132 pg_temp_wanted.erase(pgid);
1133 pg_temp_pending.erase(pgid);
1134}
1135
1136void OSDService::_sent_pg_temp()
1137{
11fdf7f2
TL
1138#ifdef HAVE_STDLIB_MAP_SPLICING
1139 pg_temp_pending.merge(pg_temp_wanted);
1140#else
94b18763
FG
1141 pg_temp_pending.insert(make_move_iterator(begin(pg_temp_wanted)),
1142 make_move_iterator(end(pg_temp_wanted)));
11fdf7f2 1143#endif
7c673cae
FG
1144 pg_temp_wanted.clear();
1145}
1146
1147void OSDService::requeue_pg_temp()
1148{
11fdf7f2 1149 std::lock_guard l(pg_temp_lock);
7c673cae
FG
1150 // wanted overrides pending. note that remove_want_pg_temp
1151 // clears the item out of both.
1152 unsigned old_wanted = pg_temp_wanted.size();
1153 unsigned old_pending = pg_temp_pending.size();
1154 _sent_pg_temp();
1155 pg_temp_wanted.swap(pg_temp_pending);
1156 dout(10) << __func__ << " " << old_wanted << " + " << old_pending << " -> "
1157 << pg_temp_wanted.size() << dendl;
1158}
1159
94b18763
FG
1160std::ostream& operator<<(std::ostream& out,
1161 const OSDService::pg_temp_t& pg_temp)
1162{
1163 out << pg_temp.acting;
1164 if (pg_temp.forced) {
1165 out << " (forced)";
1166 }
1167 return out;
1168}
1169
7c673cae
FG
1170void OSDService::send_pg_temp()
1171{
11fdf7f2 1172 std::lock_guard l(pg_temp_lock);
7c673cae
FG
1173 if (pg_temp_wanted.empty())
1174 return;
1175 dout(10) << "send_pg_temp " << pg_temp_wanted << dendl;
94b18763 1176 MOSDPGTemp *ms[2] = {nullptr, nullptr};
11fdf7f2
TL
1177 for (auto& [pgid, pg_temp] : pg_temp_wanted) {
1178 auto& m = ms[pg_temp.forced];
94b18763
FG
1179 if (!m) {
1180 m = new MOSDPGTemp(osdmap->get_epoch());
11fdf7f2 1181 m->forced = pg_temp.forced;
94b18763 1182 }
11fdf7f2 1183 m->pg_temp.emplace(pgid, pg_temp.acting);
94b18763
FG
1184 }
1185 for (auto m : ms) {
1186 if (m) {
1187 monc->send_mon_message(m);
1188 }
1189 }
7c673cae
FG
1190 _sent_pg_temp();
1191}
1192
1193void OSDService::send_pg_created(pg_t pgid)
1194{
11fdf7f2 1195 std::lock_guard l(pg_created_lock);
7c673cae 1196 dout(20) << __func__ << dendl;
11fdf7f2 1197 auto o = get_osdmap();
9f95a23c 1198 if (o->require_osd_release >= ceph_release_t::luminous) {
11fdf7f2 1199 pg_created.insert(pgid);
c07f9fc5
FG
1200 monc->send_mon_message(new MOSDPGCreated(pgid));
1201 }
7c673cae
FG
1202}
1203
11fdf7f2
TL
1204void OSDService::send_pg_created()
1205{
1206 std::lock_guard l(pg_created_lock);
1207 dout(20) << __func__ << dendl;
1208 auto o = get_osdmap();
9f95a23c 1209 if (o->require_osd_release >= ceph_release_t::luminous) {
11fdf7f2
TL
1210 for (auto pgid : pg_created) {
1211 monc->send_mon_message(new MOSDPGCreated(pgid));
1212 }
1213 }
1214}
1215
1216void OSDService::prune_pg_created()
1217{
1218 std::lock_guard l(pg_created_lock);
1219 dout(20) << __func__ << dendl;
1220 auto o = get_osdmap();
1221 auto i = pg_created.begin();
1222 while (i != pg_created.end()) {
1223 auto p = o->get_pg_pool(i->pool());
1224 if (!p || !p->has_flag(pg_pool_t::FLAG_CREATING)) {
1225 dout(20) << __func__ << " pruning " << *i << dendl;
1226 i = pg_created.erase(i);
1227 } else {
1228 dout(20) << __func__ << " keeping " << *i << dendl;
1229 ++i;
1230 }
1231 }
1232}
1233
1234
7c673cae
FG
1235// --------------------------------------
1236// dispatch
1237
eafe8130 1238bool OSDService::can_inc_scrubs()
7c673cae
FG
1239{
1240 bool can_inc = false;
11fdf7f2 1241 std::lock_guard l(sched_scrub_lock);
7c673cae 1242
eafe8130
TL
1243 if (scrubs_local + scrubs_remote < cct->_conf->osd_max_scrubs) {
1244 dout(20) << __func__ << " == true " << scrubs_local << " local + " << scrubs_remote
1245 << " remote < max " << cct->_conf->osd_max_scrubs << dendl;
7c673cae
FG
1246 can_inc = true;
1247 } else {
eafe8130
TL
1248 dout(20) << __func__ << " == false " << scrubs_local << " local + " << scrubs_remote
1249 << " remote >= max " << cct->_conf->osd_max_scrubs << dendl;
7c673cae
FG
1250 }
1251
1252 return can_inc;
1253}
1254
eafe8130 1255bool OSDService::inc_scrubs_local()
7c673cae
FG
1256{
1257 bool result = false;
eafe8130
TL
1258 std::lock_guard l{sched_scrub_lock};
1259 if (scrubs_local + scrubs_remote < cct->_conf->osd_max_scrubs) {
1260 dout(20) << __func__ << " " << scrubs_local << " -> " << (scrubs_local+1)
1261 << " (max " << cct->_conf->osd_max_scrubs << ", remote " << scrubs_remote << ")" << dendl;
7c673cae 1262 result = true;
eafe8130 1263 ++scrubs_local;
7c673cae 1264 } else {
eafe8130 1265 dout(20) << __func__ << " " << scrubs_local << " local + " << scrubs_remote << " remote >= max " << cct->_conf->osd_max_scrubs << dendl;
7c673cae 1266 }
7c673cae
FG
1267 return result;
1268}
1269
eafe8130 1270void OSDService::dec_scrubs_local()
7c673cae 1271{
eafe8130
TL
1272 std::lock_guard l{sched_scrub_lock};
1273 dout(20) << __func__ << " " << scrubs_local << " -> " << (scrubs_local-1)
1274 << " (max " << cct->_conf->osd_max_scrubs << ", remote " << scrubs_remote << ")" << dendl;
1275 --scrubs_local;
1276 ceph_assert(scrubs_local >= 0);
7c673cae
FG
1277}
1278
eafe8130 1279bool OSDService::inc_scrubs_remote()
7c673cae 1280{
eafe8130
TL
1281 bool result = false;
1282 std::lock_guard l{sched_scrub_lock};
1283 if (scrubs_local + scrubs_remote < cct->_conf->osd_max_scrubs) {
1284 dout(20) << __func__ << " " << scrubs_remote << " -> " << (scrubs_remote+1)
1285 << " (max " << cct->_conf->osd_max_scrubs << ", local " << scrubs_local << ")" << dendl;
1286 result = true;
1287 ++scrubs_remote;
7c673cae 1288 } else {
eafe8130 1289 dout(20) << __func__ << " " << scrubs_local << " local + " << scrubs_remote << " remote >= max " << cct->_conf->osd_max_scrubs << dendl;
7c673cae 1290 }
eafe8130
TL
1291 return result;
1292}
1293
1294void OSDService::dec_scrubs_remote()
1295{
1296 std::lock_guard l{sched_scrub_lock};
1297 dout(20) << __func__ << " " << scrubs_remote << " -> " << (scrubs_remote-1)
1298 << " (max " << cct->_conf->osd_max_scrubs << ", local " << scrubs_local << ")" << dendl;
1299 --scrubs_remote;
1300 ceph_assert(scrubs_remote >= 0);
7c673cae
FG
1301}
1302
eafe8130 1303void OSDService::dump_scrub_reservations(Formatter *f)
7c673cae 1304{
eafe8130
TL
1305 std::lock_guard l{sched_scrub_lock};
1306 f->dump_int("scrubs_local", scrubs_local);
1307 f->dump_int("scrubs_remote", scrubs_remote);
1308 f->dump_int("osd_max_scrubs", cct->_conf->osd_max_scrubs);
7c673cae
FG
1309}
1310
1311void OSDService::retrieve_epochs(epoch_t *_boot_epoch, epoch_t *_up_epoch,
1312 epoch_t *_bind_epoch) const
1313{
11fdf7f2 1314 std::lock_guard l(epoch_lock);
7c673cae
FG
1315 if (_boot_epoch)
1316 *_boot_epoch = boot_epoch;
1317 if (_up_epoch)
1318 *_up_epoch = up_epoch;
1319 if (_bind_epoch)
1320 *_bind_epoch = bind_epoch;
1321}
1322
1323void OSDService::set_epochs(const epoch_t *_boot_epoch, const epoch_t *_up_epoch,
1324 const epoch_t *_bind_epoch)
1325{
11fdf7f2 1326 std::lock_guard l(epoch_lock);
7c673cae 1327 if (_boot_epoch) {
11fdf7f2 1328 ceph_assert(*_boot_epoch == 0 || *_boot_epoch >= boot_epoch);
7c673cae
FG
1329 boot_epoch = *_boot_epoch;
1330 }
1331 if (_up_epoch) {
11fdf7f2 1332 ceph_assert(*_up_epoch == 0 || *_up_epoch >= up_epoch);
7c673cae
FG
1333 up_epoch = *_up_epoch;
1334 }
1335 if (_bind_epoch) {
11fdf7f2 1336 ceph_assert(*_bind_epoch == 0 || *_bind_epoch >= bind_epoch);
7c673cae
FG
1337 bind_epoch = *_bind_epoch;
1338 }
1339}
1340
1341bool OSDService::prepare_to_stop()
1342{
9f95a23c 1343 std::unique_lock l(is_stopping_lock);
7c673cae
FG
1344 if (get_state() != NOT_STOPPING)
1345 return false;
1346
1347 OSDMapRef osdmap = get_osdmap();
1348 if (osdmap && osdmap->is_up(whoami)) {
1349 dout(0) << __func__ << " telling mon we are shutting down" << dendl;
1350 set_state(PREPARING_TO_STOP);
11fdf7f2
TL
1351 monc->send_mon_message(
1352 new MOSDMarkMeDown(
1353 monc->get_fsid(),
1354 whoami,
1355 osdmap->get_addrs(whoami),
1356 osdmap->get_epoch(),
1357 true // request ack
1358 ));
9f95a23c
TL
1359 const auto timeout = ceph::make_timespan(cct->_conf->osd_mon_shutdown_timeout);
1360 is_stopping_cond.wait_for(l, timeout,
1361 [this] { return get_state() == STOPPING; });
7c673cae
FG
1362 }
1363 dout(0) << __func__ << " starting shutdown" << dendl;
1364 set_state(STOPPING);
1365 return true;
1366}
1367
1368void OSDService::got_stop_ack()
1369{
9f95a23c 1370 std::scoped_lock l(is_stopping_lock);
7c673cae
FG
1371 if (get_state() == PREPARING_TO_STOP) {
1372 dout(0) << __func__ << " starting shutdown" << dendl;
1373 set_state(STOPPING);
9f95a23c 1374 is_stopping_cond.notify_all();
7c673cae
FG
1375 } else {
1376 dout(10) << __func__ << " ignoring msg" << dendl;
1377 }
1378}
1379
1380MOSDMap *OSDService::build_incremental_map_msg(epoch_t since, epoch_t to,
1381 OSDSuperblock& sblock)
1382{
28e407b8
AA
1383 MOSDMap *m = new MOSDMap(monc->get_fsid(),
1384 osdmap->get_encoding_features());
7c673cae
FG
1385 m->oldest_map = max_oldest_map;
1386 m->newest_map = sblock.newest_map;
1387
11fdf7f2
TL
1388 int max = cct->_conf->osd_map_message_max;
1389 ssize_t max_bytes = cct->_conf->osd_map_message_max_bytes;
1390
1391 if (since < m->oldest_map) {
1392 // we don't have the next map the target wants, so start with a
1393 // full map.
1394 bufferlist bl;
1395 dout(10) << __func__ << " oldest map " << max_oldest_map << " > since "
1396 << since << ", starting with full map" << dendl;
1397 since = m->oldest_map;
1398 if (!get_map_bl(since, bl)) {
1399 derr << __func__ << " missing full map " << since << dendl;
1400 goto panic;
1401 }
1402 max--;
1403 max_bytes -= bl.length();
1404 m->maps[since].claim(bl);
1405 }
1406 for (epoch_t e = since + 1; e <= to; ++e) {
7c673cae 1407 bufferlist bl;
11fdf7f2 1408 if (get_inc_map_bl(e, bl)) {
7c673cae 1409 m->incremental_maps[e].claim(bl);
11fdf7f2 1410 } else {
e306af50 1411 dout(10) << __func__ << " missing incremental map " << e << dendl;
11fdf7f2
TL
1412 if (!get_map_bl(e, bl)) {
1413 derr << __func__ << " also missing full map " << e << dendl;
1414 goto panic;
1415 }
7c673cae 1416 m->maps[e].claim(bl);
11fdf7f2
TL
1417 }
1418 max--;
1419 max_bytes -= bl.length();
1420 if (max <= 0 || max_bytes <= 0) {
7c673cae 1421 break;
11fdf7f2
TL
1422 }
1423 }
1424 return m;
1425
1426 panic:
1427 if (!m->maps.empty() ||
1428 !m->incremental_maps.empty()) {
1429 // send what we have so far
1430 return m;
1431 }
1432 // send something
1433 bufferlist bl;
1434 if (get_inc_map_bl(m->newest_map, bl)) {
1435 m->incremental_maps[m->newest_map].claim(bl);
1436 } else {
1437 derr << __func__ << " unable to load latest map " << m->newest_map << dendl;
1438 if (!get_map_bl(m->newest_map, bl)) {
1439 derr << __func__ << " unable to load latest full map " << m->newest_map
7c673cae 1440 << dendl;
11fdf7f2 1441 ceph_abort();
7c673cae 1442 }
11fdf7f2 1443 m->maps[m->newest_map].claim(bl);
7c673cae
FG
1444 }
1445 return m;
1446}
1447
1448void OSDService::send_map(MOSDMap *m, Connection *con)
1449{
1450 con->send_message(m);
1451}
1452
1453void OSDService::send_incremental_map(epoch_t since, Connection *con,
9f95a23c 1454 const OSDMapRef& osdmap)
7c673cae
FG
1455{
1456 epoch_t to = osdmap->get_epoch();
1457 dout(10) << "send_incremental_map " << since << " -> " << to
1458 << " to " << con << " " << con->get_peer_addr() << dendl;
1459
1460 MOSDMap *m = NULL;
1461 while (!m) {
1462 OSDSuperblock sblock(get_superblock());
1463 if (since < sblock.oldest_map) {
1464 // just send latest full map
28e407b8
AA
1465 MOSDMap *m = new MOSDMap(monc->get_fsid(),
1466 osdmap->get_encoding_features());
7c673cae
FG
1467 m->oldest_map = max_oldest_map;
1468 m->newest_map = sblock.newest_map;
1469 get_map_bl(to, m->maps[to]);
1470 send_map(m, con);
1471 return;
1472 }
1473
1474 if (to > since && (int64_t)(to - since) > cct->_conf->osd_map_share_max_epochs) {
1475 dout(10) << " " << (to - since) << " > max " << cct->_conf->osd_map_share_max_epochs
1476 << ", only sending most recent" << dendl;
1477 since = to - cct->_conf->osd_map_share_max_epochs;
1478 }
1479
7c673cae
FG
1480 m = build_incremental_map_msg(since, to, sblock);
1481 }
1482 send_map(m, con);
1483}
1484
1485bool OSDService::_get_map_bl(epoch_t e, bufferlist& bl)
1486{
1487 bool found = map_bl_cache.lookup(e, &bl);
31f18b77
FG
1488 if (found) {
1489 if (logger)
1490 logger->inc(l_osd_map_bl_cache_hit);
7c673cae 1491 return true;
31f18b77
FG
1492 }
1493 if (logger)
1494 logger->inc(l_osd_map_bl_cache_miss);
11fdf7f2 1495 found = store->read(meta_ch,
31f18b77
FG
1496 OSD::get_osdmap_pobject_name(e), 0, 0, bl,
1497 CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) >= 0;
1498 if (found) {
7c673cae 1499 _add_map_bl(e, bl);
31f18b77 1500 }
7c673cae
FG
1501 return found;
1502}
1503
1504bool OSDService::get_inc_map_bl(epoch_t e, bufferlist& bl)
1505{
11fdf7f2 1506 std::lock_guard l(map_cache_lock);
7c673cae 1507 bool found = map_bl_inc_cache.lookup(e, &bl);
31f18b77
FG
1508 if (found) {
1509 if (logger)
1510 logger->inc(l_osd_map_bl_cache_hit);
7c673cae 1511 return true;
31f18b77
FG
1512 }
1513 if (logger)
1514 logger->inc(l_osd_map_bl_cache_miss);
11fdf7f2 1515 found = store->read(meta_ch,
31f18b77
FG
1516 OSD::get_inc_osdmap_pobject_name(e), 0, 0, bl,
1517 CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) >= 0;
1518 if (found) {
7c673cae 1519 _add_map_inc_bl(e, bl);
31f18b77 1520 }
7c673cae
FG
1521 return found;
1522}
1523
1524void OSDService::_add_map_bl(epoch_t e, bufferlist& bl)
1525{
1526 dout(10) << "add_map_bl " << e << " " << bl.length() << " bytes" << dendl;
31f18b77
FG
1527 // cache a contiguous buffer
1528 if (bl.get_num_buffers() > 1) {
1529 bl.rebuild();
1530 }
1531 bl.try_assign_to_mempool(mempool::mempool_osd_mapbl);
7c673cae
FG
1532 map_bl_cache.add(e, bl);
1533}
1534
1535void OSDService::_add_map_inc_bl(epoch_t e, bufferlist& bl)
1536{
1537 dout(10) << "add_map_inc_bl " << e << " " << bl.length() << " bytes" << dendl;
31f18b77
FG
1538 // cache a contiguous buffer
1539 if (bl.get_num_buffers() > 1) {
1540 bl.rebuild();
1541 }
1542 bl.try_assign_to_mempool(mempool::mempool_osd_mapbl);
7c673cae
FG
1543 map_bl_inc_cache.add(e, bl);
1544}
1545
7c673cae
FG
1546OSDMapRef OSDService::_add_map(OSDMap *o)
1547{
1548 epoch_t e = o->get_epoch();
1549
1550 if (cct->_conf->osd_map_dedup) {
1551 // Dedup against an existing map at a nearby epoch
1552 OSDMapRef for_dedup = map_cache.lower_bound(e);
1553 if (for_dedup) {
1554 OSDMap::dedup(for_dedup.get(), o);
1555 }
1556 }
1557 bool existed;
1558 OSDMapRef l = map_cache.add(e, o, &existed);
1559 if (existed) {
1560 delete o;
1561 }
1562 return l;
1563}
1564
1565OSDMapRef OSDService::try_get_map(epoch_t epoch)
1566{
11fdf7f2 1567 std::lock_guard l(map_cache_lock);
7c673cae
FG
1568 OSDMapRef retval = map_cache.lookup(epoch);
1569 if (retval) {
1570 dout(30) << "get_map " << epoch << " -cached" << dendl;
1571 if (logger) {
1572 logger->inc(l_osd_map_cache_hit);
1573 }
1574 return retval;
1575 }
1576 if (logger) {
1577 logger->inc(l_osd_map_cache_miss);
1578 epoch_t lb = map_cache.cached_key_lower_bound();
1579 if (epoch < lb) {
1580 dout(30) << "get_map " << epoch << " - miss, below lower bound" << dendl;
1581 logger->inc(l_osd_map_cache_miss_low);
1582 logger->inc(l_osd_map_cache_miss_low_avg, lb - epoch);
1583 }
1584 }
1585
1586 OSDMap *map = new OSDMap;
1587 if (epoch > 0) {
1588 dout(20) << "get_map " << epoch << " - loading and decoding " << map << dendl;
1589 bufferlist bl;
1590 if (!_get_map_bl(epoch, bl) || bl.length() == 0) {
1591 derr << "failed to load OSD map for epoch " << epoch << ", got " << bl.length() << " bytes" << dendl;
1592 delete map;
1593 return OSDMapRef();
1594 }
1595 map->decode(bl);
1596 } else {
1597 dout(20) << "get_map " << epoch << " - return initial " << map << dendl;
1598 }
1599 return _add_map(map);
1600}
1601
1602// ops
1603
1604
1605void OSDService::reply_op_error(OpRequestRef op, int err)
1606{
9f95a23c 1607 reply_op_error(op, err, eversion_t(), 0, {});
7c673cae
FG
1608}
1609
1610void OSDService::reply_op_error(OpRequestRef op, int err, eversion_t v,
9f95a23c
TL
1611 version_t uv,
1612 vector<pg_log_op_return_item_t> op_returns)
7c673cae 1613{
9f95a23c 1614 auto m = op->get_req<MOSDOp>();
11fdf7f2 1615 ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);
7c673cae
FG
1616 int flags;
1617 flags = m->get_flags() & (CEPH_OSD_FLAG_ACK|CEPH_OSD_FLAG_ONDISK);
1618
9f95a23c
TL
1619 MOSDOpReply *reply = new MOSDOpReply(m, err, osdmap->get_epoch(), flags,
1620 !m->has_flag(CEPH_OSD_FLAG_RETURNVEC));
7c673cae 1621 reply->set_reply_versions(v, uv);
9f95a23c 1622 reply->set_op_returns(op_returns);
7c673cae
FG
1623 m->get_connection()->send_message(reply);
1624}
1625
1626void OSDService::handle_misdirected_op(PG *pg, OpRequestRef op)
1627{
31f18b77
FG
1628 if (!cct->_conf->osd_debug_misdirected_ops) {
1629 return;
1630 }
1631
9f95a23c 1632 auto m = op->get_req<MOSDOp>();
11fdf7f2 1633 ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);
7c673cae 1634
11fdf7f2 1635 ceph_assert(m->get_map_epoch() >= pg->get_history().same_primary_since);
7c673cae
FG
1636
1637 if (pg->is_ec_pg()) {
1638 /**
1639 * OSD recomputes op target based on current OSDMap. With an EC pg, we
1640 * can get this result:
1641 * 1) client at map 512 sends an op to osd 3, pg_t 3.9 based on mapping
1642 * [CRUSH_ITEM_NONE, 2, 3]/3
1643 * 2) OSD 3 at map 513 remaps op to osd 3, spg_t 3.9s0 based on mapping
1644 * [3, 2, 3]/3
1645 * 3) PG 3.9s0 dequeues the op at epoch 512 and notices that it isn't primary
1646 * -- misdirected op
1647 * 4) client resends and this time PG 3.9s0 having caught up to 513 gets
1648 * it and fulfils it
1649 *
1650 * We can't compute the op target based on the sending map epoch due to
1651 * splitting. The simplest thing is to detect such cases here and drop
1652 * them without an error (the client will resend anyway).
1653 */
11fdf7f2 1654 ceph_assert(m->get_map_epoch() <= superblock.newest_map);
7c673cae
FG
1655 OSDMapRef opmap = try_get_map(m->get_map_epoch());
1656 if (!opmap) {
1657 dout(7) << __func__ << ": " << *pg << " no longer have map for "
1658 << m->get_map_epoch() << ", dropping" << dendl;
1659 return;
1660 }
1661 pg_t _pgid = m->get_raw_pg();
1662 spg_t pgid;
1663 if ((m->get_flags() & CEPH_OSD_FLAG_PGOP) == 0)
1664 _pgid = opmap->raw_pg_to_pg(_pgid);
1665 if (opmap->get_primary_shard(_pgid, &pgid) &&
11fdf7f2 1666 pgid.shard != pg->pg_id.shard) {
7c673cae
FG
1667 dout(7) << __func__ << ": " << *pg << " primary changed since "
1668 << m->get_map_epoch() << ", dropping" << dendl;
1669 return;
1670 }
1671 }
1672
1673 dout(7) << *pg << " misdirected op in " << m->get_map_epoch() << dendl;
1674 clog->warn() << m->get_source_inst() << " misdirected " << m->get_reqid()
1675 << " pg " << m->get_raw_pg()
1676 << " to osd." << whoami
11fdf7f2 1677 << " not " << pg->get_acting()
7c673cae 1678 << " in e" << m->get_map_epoch() << "/" << osdmap->get_epoch();
7c673cae
FG
1679}
1680
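// Hand work items to the sharded op worker queue: enqueue_back appends
// to the owning shard's queue, enqueue_front puts the item at the head
// so it is dequeued ahead of already-queued work.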
9f95a23c 1681void OSDService::enqueue_back(OpSchedulerItem&& qi)
7c673cae 1682{
11fdf7f2 1683 osd->op_shardedwq.queue(std::move(qi));
7c673cae
FG
1684}
1685
9f95a23c 1686void OSDService::enqueue_front(OpSchedulerItem&& qi)
7c673cae 1687{
11fdf7f2 1688 osd->op_shardedwq.queue_front(std::move(qi));
7c673cae
FG
1689}
1690
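// The queue_* helpers below wrap background work (recovery contexts,
// snap trim, scrub, PG deletion) in an OpSchedulerItem carrying the
// configured cost/priority and the relevant map epoch, then push it
// onto the sharded op queue via enqueue_back().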
11fdf7f2
TL
1691void OSDService::queue_recovery_context(
1692 PG *pg,
1693 GenContext<ThreadPool::TPHandle&> *c)
7c673cae 1694{
11fdf7f2
TL
1695 epoch_t e = get_osdmap_epoch();
1696 enqueue_back(
9f95a23c
TL
1697 OpSchedulerItem(
1698 unique_ptr<OpSchedulerItem::OpQueueable>(
11fdf7f2
TL
1699 new PGRecoveryContext(pg->get_pgid(), c, e)),
1700 cct->_conf->osd_recovery_cost,
1701 cct->_conf->osd_recovery_priority,
1702 ceph_clock_now(),
1703 0,
1704 e));
7c673cae
FG
1705}
1706
1707void OSDService::queue_for_snap_trim(PG *pg)
1708{
1709 dout(10) << "queueing " << *pg << " for snaptrim" << dendl;
11fdf7f2 1710 enqueue_back(
9f95a23c
TL
1711 OpSchedulerItem(
1712 unique_ptr<OpSchedulerItem::OpQueueable>(
11fdf7f2
TL
1713 new PGSnapTrim(pg->get_pgid(), pg->get_osdmap_epoch())),
1714 cct->_conf->osd_snap_trim_cost,
1715 cct->_conf->osd_snap_trim_priority,
1716 ceph_clock_now(),
1717 0,
1718 pg->get_osdmap_epoch()));
1719}
1720
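// Queue a scrub for this PG.  When with_high_priority is set (e.g. an
// operator-requested scrub), the queue priority is raised to at least
// osd_client_op_priority so the scrub is not starved by client work.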
1721void OSDService::queue_for_scrub(PG *pg, bool with_high_priority)
1722{
1723 unsigned scrub_queue_priority = pg->scrubber.priority;
1724 if (with_high_priority && scrub_queue_priority < cct->_conf->osd_client_op_priority) {
1725 scrub_queue_priority = cct->_conf->osd_client_op_priority;
1726 }
1727 const auto epoch = pg->get_osdmap_epoch();
1728 enqueue_back(
9f95a23c
TL
1729 OpSchedulerItem(
1730 unique_ptr<OpSchedulerItem::OpQueueable>(new PGScrub(pg->get_pgid(), epoch)),
11fdf7f2
TL
1731 cct->_conf->osd_scrub_cost,
1732 scrub_queue_priority,
1733 ceph_clock_now(),
1734 0,
1735 epoch));
1736}
1737
1738void OSDService::queue_for_pg_delete(spg_t pgid, epoch_t e)
1739{
1740 dout(10) << __func__ << " on " << pgid << " e " << e << dendl;
1741 enqueue_back(
9f95a23c
TL
1742 OpSchedulerItem(
1743 unique_ptr<OpSchedulerItem::OpQueueable>(
11fdf7f2
TL
1744 new PGDelete(pgid, e)),
1745 cct->_conf->osd_pg_delete_cost,
1746 cct->_conf->osd_pg_delete_priority,
1747 ceph_clock_now(),
1748 0,
1749 e));
1750}
1751
1752bool OSDService::try_finish_pg_delete(PG *pg, unsigned old_pg_num)
1753{
1754 return osd->try_finish_pg_delete(pg, old_pg_num);
1755}
1756
1757// ---
1758
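// PG merge readiness tracking.  Source and target PGs record (under
// merge_lock) whether they are ready to merge; every state change
// re-runs _send_ready_to_merge(), which decides what to report to the
// monitor.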
1759void OSDService::set_ready_to_merge_source(PG *pg, eversion_t version)
1760{
1761 std::lock_guard l(merge_lock);
1762 dout(10) << __func__ << " " << pg->pg_id << dendl;
1763 ready_to_merge_source[pg->pg_id.pgid] = version;
1764 assert(not_ready_to_merge_source.count(pg->pg_id.pgid) == 0);
1765 _send_ready_to_merge();
1766}
1767
1768void OSDService::set_ready_to_merge_target(PG *pg,
1769 eversion_t version,
1770 epoch_t last_epoch_started,
1771 epoch_t last_epoch_clean)
1772{
1773 std::lock_guard l(merge_lock);
1774 dout(10) << __func__ << " " << pg->pg_id << dendl;
1775 ready_to_merge_target.insert(make_pair(pg->pg_id.pgid,
1776 make_tuple(version,
1777 last_epoch_started,
1778 last_epoch_clean)));
1779 assert(not_ready_to_merge_target.count(pg->pg_id.pgid) == 0);
1780 _send_ready_to_merge();
1781}
1782
1783void OSDService::set_not_ready_to_merge_source(pg_t source)
1784{
1785 std::lock_guard l(merge_lock);
1786 dout(10) << __func__ << " " << source << dendl;
1787 not_ready_to_merge_source.insert(source);
1788 assert(ready_to_merge_source.count(source) == 0);
1789 _send_ready_to_merge();
1790}
1791
1792void OSDService::set_not_ready_to_merge_target(pg_t target, pg_t source)
1793{
1794 std::lock_guard l(merge_lock);
1795 dout(10) << __func__ << " " << target << " source " << source << dendl;
1796 not_ready_to_merge_target[target] = source;
1797 assert(ready_to_merge_target.count(target) == 0);
1798 _send_ready_to_merge();
1799}
1800
1801void OSDService::send_ready_to_merge()
1802{
1803 std::lock_guard l(merge_lock);
1804 _send_ready_to_merge();
1805}
1806
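// Walk the readiness maps and notify the monitor, at most once per
// source PG (tracked in sent_ready_to_merge_source): sources that are
// not ready -- or whose target is not ready -- get a "not ready"
// MOSDPGReadyToMerge; a ready source whose parent target is also ready
// gets a "ready" message carrying the versions and epochs the monitor
// needs to drive the merge.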
1807void OSDService::_send_ready_to_merge()
1808{
1809 dout(20) << __func__
1810 << " ready_to_merge_source " << ready_to_merge_source
1811 << " not_ready_to_merge_source " << not_ready_to_merge_source
1812 << " ready_to_merge_target " << ready_to_merge_target
1813 << " not_ready_to_merge_target " << not_ready_to_merge_target
1814 << " sent_ready_to_merge_source " << sent_ready_to_merge_source
1815 << dendl;
1816 for (auto src : not_ready_to_merge_source) {
1817 if (sent_ready_to_merge_source.count(src) == 0) {
1818 monc->send_mon_message(new MOSDPGReadyToMerge(
1819 src,
1820 {}, {}, 0, 0,
1821 false,
1822 osdmap->get_epoch()));
1823 sent_ready_to_merge_source.insert(src);
1824 }
1825 }
1826 for (auto p : not_ready_to_merge_target) {
1827 if (sent_ready_to_merge_source.count(p.second) == 0) {
1828 monc->send_mon_message(new MOSDPGReadyToMerge(
1829 p.second,
1830 {}, {}, 0, 0,
1831 false,
1832 osdmap->get_epoch()));
1833 sent_ready_to_merge_source.insert(p.second);
1834 }
1835 }
1836 for (auto src : ready_to_merge_source) {
1837 if (not_ready_to_merge_source.count(src.first) ||
1838 not_ready_to_merge_target.count(src.first.get_parent())) {
1839 continue;
1840 }
1841 auto p = ready_to_merge_target.find(src.first.get_parent());
1842 if (p != ready_to_merge_target.end() &&
1843 sent_ready_to_merge_source.count(src.first) == 0) {
1844 monc->send_mon_message(new MOSDPGReadyToMerge(
1845 src.first, // source pgid
1846 src.second, // src version
1847 std::get<0>(p->second), // target version
1848 std::get<1>(p->second), // PG's last_epoch_started
1849 std::get<2>(p->second), // PG's last_epoch_clean
1850 true,
1851 osdmap->get_epoch()));
1852 sent_ready_to_merge_source.insert(src.first);
1853 }
1854 }
1855}
1856
1857void OSDService::clear_ready_to_merge(PG *pg)
1858{
1859 std::lock_guard l(merge_lock);
1860 dout(10) << __func__ << " " << pg->pg_id << dendl;
1861 ready_to_merge_source.erase(pg->pg_id.pgid);
1862 ready_to_merge_target.erase(pg->pg_id.pgid);
1863 not_ready_to_merge_source.erase(pg->pg_id.pgid);
1864 not_ready_to_merge_target.erase(pg->pg_id.pgid);
1865 sent_ready_to_merge_source.erase(pg->pg_id.pgid);
1866}
1867
1868void OSDService::clear_sent_ready_to_merge()
1869{
1870 std::lock_guard l(merge_lock);
1871 sent_ready_to_merge_source.clear();
1872}
1873
9f95a23c 1874void OSDService::prune_sent_ready_to_merge(const OSDMapRef& osdmap)
11fdf7f2
TL
1875{
1876 std::lock_guard l(merge_lock);
1877 auto i = sent_ready_to_merge_source.begin();
1878 while (i != sent_ready_to_merge_source.end()) {
1879 if (!osdmap->pg_exists(*i)) {
1880 dout(10) << __func__ << " " << *i << dendl;
1881 i = sent_ready_to_merge_source.erase(i);
1882 } else {
1883 ++i;
1884 }
1885 }
7c673cae
FG
1886}
1887
11fdf7f2
TL
1888// ---
1889
1890void OSDService::_queue_for_recovery(
1891 std::pair<epoch_t, PGRef> p,
1892 uint64_t reserved_pushes)
1893{
9f95a23c 1894 ceph_assert(ceph_mutex_is_locked_by_me(recovery_lock));
11fdf7f2 1895 enqueue_back(
9f95a23c
TL
1896 OpSchedulerItem(
1897 unique_ptr<OpSchedulerItem::OpQueueable>(
11fdf7f2
TL
1898 new PGRecovery(
1899 p.second->get_pgid(), p.first, reserved_pushes)),
1900 cct->_conf->osd_recovery_cost,
1901 cct->_conf->osd_recovery_priority,
1902 ceph_clock_now(),
1903 0,
1904 p.first));
1905}
7c673cae
FG
1906
1907// ====================================================================
1908// OSD
1909
1910#undef dout_prefix
1911#define dout_prefix *_dout
1912
1913// Commands shared between OSD's console and admin console:
1914namespace ceph {
1915namespace osd_cmds {
1916
11fdf7f2 1917int heap(CephContext& cct, const cmdmap_t& cmdmap, Formatter& f, std::ostream& os);
7c673cae
FG
1918
1919}} // namespace ceph::osd_cmds
1920
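// Create a fresh OSD data store: mkfs + mount the ObjectStore, then
// either validate an existing superblock (osd id and cluster fsid must
// match) or write a new one, and finally persist the identity meta
// files via write_meta().  The store is unmounted and freed before
// returning.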
e306af50 1921int OSD::mkfs(CephContext *cct, ObjectStore *store, uuid_d fsid, int whoami, string osdspec_affinity)
7c673cae
FG
1922{
1923 int ret;
1924
7c673cae
FG
1925 OSDSuperblock sb;
1926 bufferlist sbbl;
11fdf7f2 1927 ObjectStore::CollectionHandle ch;
7c673cae
FG
1928
1929 // if we are fed a uuid for this osd, use it.
1930 store->set_fsid(cct->_conf->osd_uuid);
1931
1932 ret = store->mkfs();
1933 if (ret) {
224ce89b
WB
1934 derr << "OSD::mkfs: ObjectStore::mkfs failed with error "
1935 << cpp_strerror(ret) << dendl;
7c673cae
FG
1936 goto free_store;
1937 }
1938
31f18b77 1939 store->set_cache_shards(1); // doesn't matter for mkfs!
7c673cae
FG
1940
1941 ret = store->mount();
1942 if (ret) {
224ce89b
WB
1943 derr << "OSD::mkfs: couldn't mount ObjectStore: error "
1944 << cpp_strerror(ret) << dendl;
7c673cae
FG
1945 goto free_store;
1946 }
1947
11fdf7f2
TL
1948 ch = store->open_collection(coll_t::meta());
1949 if (ch) {
1950 ret = store->read(ch, OSD_SUPERBLOCK_GOBJECT, 0, 0, sbbl);
1951 if (ret < 0) {
1952 derr << "OSD::mkfs: have meta collection but no superblock" << dendl;
1953 goto free_store;
1954 }
7c673cae
FG
1955 /* if we already have superblock, check content of superblock */
1956 dout(0) << " have superblock" << dendl;
11fdf7f2
TL
1957 auto p = sbbl.cbegin();
1958 decode(sb, p);
7c673cae
FG
1959 if (whoami != sb.whoami) {
1960 derr << "provided osd id " << whoami << " != superblock's " << sb.whoami
1961 << dendl;
1962 ret = -EINVAL;
1963 goto umount_store;
1964 }
1965 if (fsid != sb.cluster_fsid) {
1966 derr << "provided cluster fsid " << fsid
1967 << " != superblock's " << sb.cluster_fsid << dendl;
1968 ret = -EINVAL;
1969 goto umount_store;
1970 }
1971 } else {
1972 // create superblock
1973 sb.cluster_fsid = fsid;
1974 sb.osd_fsid = store->get_fsid();
1975 sb.whoami = whoami;
1976 sb.compat_features = get_osd_initial_compat_set();
1977
1978 bufferlist bl;
11fdf7f2 1979 encode(sb, bl);
7c673cae 1980
11fdf7f2
TL
1981 ObjectStore::CollectionHandle ch = store->create_new_collection(
1982 coll_t::meta());
7c673cae
FG
1983 ObjectStore::Transaction t;
1984 t.create_collection(coll_t::meta(), 0);
1985 t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
11fdf7f2 1986 ret = store->queue_transaction(ch, std::move(t));
7c673cae
FG
1987 if (ret) {
1988 derr << "OSD::mkfs: error while writing OSD_SUPERBLOCK_GOBJECT: "
11fdf7f2 1989 << "queue_transaction returned " << cpp_strerror(ret) << dendl;
7c673cae
FG
1990 goto umount_store;
1991 }
1992 }
1993
e306af50 1994 ret = write_meta(cct, store, sb.cluster_fsid, sb.osd_fsid, whoami, osdspec_affinity);
7c673cae 1995 if (ret) {
224ce89b
WB
1996 derr << "OSD::mkfs: failed to write fsid file: error "
1997 << cpp_strerror(ret) << dendl;
7c673cae
FG
1998 goto umount_store;
1999 }
2000
2001umount_store:
11fdf7f2
TL
2002 if (ch) {
2003 ch.reset();
2004 }
7c673cae
FG
2005 store->umount();
2006free_store:
2007 delete store;
2008 return ret;
2009}
2010
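// Persist the OSD's identity in the store's meta area: magic, whoami,
// ceph_fsid, the auth key (taken from 'key' or 'keyfile' when
// provided), an optional osdspec_affinity, and a final 'ready' marker.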
e306af50 2011int OSD::write_meta(CephContext *cct, ObjectStore *store, uuid_d& cluster_fsid, uuid_d& osd_fsid, int whoami, string& osdspec_affinity)
7c673cae
FG
2012{
2013 char val[80];
2014 int r;
2015
2016 snprintf(val, sizeof(val), "%s", CEPH_OSD_ONDISK_MAGIC);
2017 r = store->write_meta("magic", val);
2018 if (r < 0)
2019 return r;
2020
2021 snprintf(val, sizeof(val), "%d", whoami);
2022 r = store->write_meta("whoami", val);
2023 if (r < 0)
2024 return r;
2025
2026 cluster_fsid.print(val);
2027 r = store->write_meta("ceph_fsid", val);
2028 if (r < 0)
2029 return r;
2030
11fdf7f2 2031 string key = cct->_conf.get_val<string>("key");
3efd9988
FG
2032 if (key.size()) {
2033 r = store->write_meta("osd_key", key);
2034 if (r < 0)
2035 return r;
b32b8144 2036 } else {
11fdf7f2 2037 string keyfile = cct->_conf.get_val<string>("keyfile");
b32b8144
FG
2038 if (!keyfile.empty()) {
2039 bufferlist keybl;
2040 string err;
11fdf7f2 2041 r = keybl.read_file(keyfile.c_str(), &err);
b32b8144
FG
2042 if (r < 0) {
2043 derr << __func__ << " failed to read keyfile " << keyfile << ": "
2044 << err << ": " << cpp_strerror(r) << dendl;
2045 return r;
2046 }
2047 r = store->write_meta("osd_key", keybl.to_str());
2048 if (r < 0)
2049 return r;
2050 }
3efd9988 2051 }
e306af50
TL
2052 if (!osdspec_affinity.empty()) {
2053 r = store->write_meta("osdspec_affinity", osdspec_affinity.c_str());
2054 if (r < 0)
2055 return r;
2056 }
3efd9988 2057
7c673cae
FG
2058 r = store->write_meta("ready", "ready");
2059 if (r < 0)
2060 return r;
2061
2062 return 0;
2063}
2064
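// Read back the identity metadata recorded by write_meta()/mkfs:
// magic, whoami, cluster and OSD fsids, and the stored
// require_osd_release (when present).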
11fdf7f2
TL
2065int OSD::peek_meta(ObjectStore *store,
2066 std::string *magic,
2067 uuid_d *cluster_fsid,
2068 uuid_d *osd_fsid,
2069 int *whoami,
9f95a23c 2070 ceph_release_t *require_osd_release)
7c673cae
FG
2071{
2072 string val;
2073
2074 int r = store->read_meta("magic", &val);
2075 if (r < 0)
2076 return r;
11fdf7f2 2077 *magic = val;
7c673cae
FG
2078
2079 r = store->read_meta("whoami", &val);
2080 if (r < 0)
2081 return r;
11fdf7f2 2082 *whoami = atoi(val.c_str());
7c673cae
FG
2083
2084 r = store->read_meta("ceph_fsid", &val);
2085 if (r < 0)
2086 return r;
11fdf7f2 2087 r = cluster_fsid->parse(val.c_str());
7c673cae
FG
2088 if (!r)
2089 return -EINVAL;
2090
2091 r = store->read_meta("fsid", &val);
2092 if (r < 0) {
11fdf7f2 2093 *osd_fsid = uuid_d();
7c673cae 2094 } else {
11fdf7f2 2095 r = osd_fsid->parse(val.c_str());
7c673cae
FG
2096 if (!r)
2097 return -EINVAL;
2098 }
2099
11fdf7f2
TL
2100 r = store->read_meta("require_osd_release", &val);
2101 if (r >= 0) {
9f95a23c 2102 *require_osd_release = ceph_release_from_name(val);
11fdf7f2
TL
2103 }
2104
7c673cae
FG
2105 return 0;
2106}
2107
2108
2109#undef dout_prefix
2110#define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
2111
2112// cons/des
2113
2114OSD::OSD(CephContext *cct_, ObjectStore *store_,
2115 int id,
2116 Messenger *internal_messenger,
2117 Messenger *external_messenger,
2118 Messenger *hb_client_front,
2119 Messenger *hb_client_back,
2120 Messenger *hb_front_serverm,
2121 Messenger *hb_back_serverm,
2122 Messenger *osdc_messenger,
2123 MonClient *mc,
2124 const std::string &dev, const std::string &jdev) :
2125 Dispatcher(cct_),
7c673cae 2126 tick_timer(cct, osd_lock),
7c673cae 2127 tick_timer_without_osd_lock(cct, tick_timer_lock),
11fdf7f2 2128 gss_ktfile_client(cct->_conf.get_val<std::string>("gss_ktab_client_file")),
7c673cae
FG
2129 cluster_messenger(internal_messenger),
2130 client_messenger(external_messenger),
2131 objecter_messenger(osdc_messenger),
2132 monc(mc),
9f95a23c 2133 mgrc(cct_, client_messenger, &mc->monmap),
7c673cae
FG
2134 logger(NULL),
2135 recoverystate_perf(NULL),
2136 store(store_),
2137 log_client(cct, client_messenger, &mc->monmap, LogClient::NO_FLAGS),
2138 clog(log_client.create_channel()),
2139 whoami(id),
2140 dev_path(dev), journal_path(jdev),
31f18b77 2141 store_is_rotational(store->is_rotational()),
7c673cae
FG
2142 trace_endpoint("0.0.0.0", 0, "osd"),
2143 asok_hook(NULL),
11fdf7f2
TL
2144 m_osd_pg_epoch_max_lag_factor(cct->_conf.get_val<double>(
2145 "osd_pg_epoch_max_lag_factor")),
7c673cae 2146 osd_compat(get_osd_compat_set()),
7c673cae 2147 osd_op_tp(cct, "OSD::osd_op_tp", "tp_osd_tp",
31f18b77 2148 get_num_op_threads()),
7c673cae
FG
2149 heartbeat_stop(false),
2150 heartbeat_need_update(true),
2151 hb_front_client_messenger(hb_client_front),
2152 hb_back_client_messenger(hb_client_back),
2153 hb_front_server_messenger(hb_front_serverm),
2154 hb_back_server_messenger(hb_back_serverm),
2155 daily_loadavg(0.0),
2156 heartbeat_thread(this),
2157 heartbeat_dispatcher(this),
2158 op_tracker(cct, cct->_conf->osd_enable_op_tracker,
2159 cct->_conf->osd_num_op_tracker_shard),
2160 test_ops_hook(NULL),
7c673cae 2161 op_shardedwq(
7c673cae
FG
2162 this,
2163 cct->_conf->osd_op_thread_timeout,
2164 cct->_conf->osd_op_thread_suicide_timeout,
2165 &osd_op_tp),
7c673cae 2166 last_pg_create_epoch(0),
11fdf7f2 2167 boot_finisher(cct),
7c673cae
FG
2168 up_thru_wanted(0),
2169 requested_full_first(0),
2170 requested_full_last(0),
7c673cae
FG
2171 service(this)
2172{
11fdf7f2
TL
2173
2174 if (!gss_ktfile_client.empty()) {
2175 // Assert we can export environment variable
2176 /*
2177 The default client keytab is used, if it is present and readable,
2178 to automatically obtain initial credentials for GSSAPI client
2179 applications. The principal name of the first entry in the client
2180 keytab is used by default when obtaining initial credentials. The default client keytab name is determined from the following, in decreasing order of preference:
2181 1. The KRB5_CLIENT_KTNAME environment variable.
2182 2. The default_client_keytab_name profile variable in [libdefaults].
2183 3. The hardcoded default, DEFCKTNAME.
2184 */
2185 const int32_t set_result(setenv("KRB5_CLIENT_KTNAME",
2186 gss_ktfile_client.c_str(), 1));
2187 ceph_assert(set_result == 0);
2188 }
2189
7c673cae
FG
2190 monc->set_messenger(client_messenger);
2191 op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time,
2192 cct->_conf->osd_op_log_threshold);
2193 op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
2194 cct->_conf->osd_op_history_duration);
2195 op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size,
2196 cct->_conf->osd_op_history_slow_op_threshold);
9f95a23c 2197 ObjectCleanRegions::set_max_num_intervals(cct->_conf->osd_object_clean_region_max_num_intervals);
7c673cae
FG
2198#ifdef WITH_BLKIN
2199 std::stringstream ss;
2200 ss << "osd." << whoami;
2201 trace_endpoint.copy_name(ss.str());
2202#endif
11fdf7f2
TL
2203
2204 // initialize shards
2205 num_shards = get_num_op_shards();
2206 for (uint32_t i = 0; i < num_shards; i++) {
2207 OSDShard *one_shard = new OSDShard(
2208 i,
2209 cct,
9f95a23c 2210 this);
11fdf7f2
TL
2211 shards.push_back(one_shard);
2212 }
7c673cae
FG
2213}
2214
2215OSD::~OSD()
2216{
11fdf7f2
TL
2217 while (!shards.empty()) {
2218 delete shards.back();
2219 shards.pop_back();
2220 }
7c673cae
FG
2221 cct->get_perfcounters_collection()->remove(recoverystate_perf);
2222 cct->get_perfcounters_collection()->remove(logger);
2223 delete recoverystate_perf;
2224 delete logger;
2225 delete store;
2226}
2227
91327a77
AA
2228double OSD::get_tick_interval() const
2229{
2230 // vary +/- 5% to avoid scrub scheduling livelocks
2231 constexpr auto delta = 0.05;
91327a77 2232 return (OSD_TICK_INTERVAL *
11fdf7f2 2233 ceph::util::generate_random_number(1.0 - delta, 1.0 + delta));
91327a77
AA
2234}
2235
7c673cae
FG
2236void OSD::handle_signal(int signum)
2237{
11fdf7f2 2238 ceph_assert(signum == SIGINT || signum == SIGTERM);
7c673cae
FG
2239 derr << "*** Got signal " << sig_str(signum) << " ***" << dendl;
2240 shutdown();
2241}
2242
2243int OSD::pre_init()
2244{
11fdf7f2 2245 std::lock_guard lock(osd_lock);
7c673cae
FG
2246 if (is_stopping())
2247 return 0;
2248
2249 if (store->test_mount_in_use()) {
2250 derr << "OSD::pre_init: object store '" << dev_path << "' is "
2251 << "currently in use. (Is ceph-osd already running?)" << dendl;
2252 return -EBUSY;
2253 }
2254
11fdf7f2
TL
2255 cct->_conf.add_observer(this);
2256 return 0;
2257}
2258
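// Work out which NUMA node the objectstore and the public/cluster
// network interfaces sit on.  If they all agree and
// osd_numa_auto_affinity is enabled -- or if osd_numa_node is set
// explicitly -- pin all OSD threads to that node's CPUs.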
2259int OSD::set_numa_affinity()
2260{
2261 // storage numa node
2262 int store_node = -1;
2263 store->get_numa_node(&store_node, nullptr, nullptr);
2264 if (store_node >= 0) {
2265 dout(1) << __func__ << " storage numa node " << store_node << dendl;
2266 }
2267
2268 // check network numa node(s)
2269 int front_node = -1, back_node = -1;
2270 string front_iface = pick_iface(
2271 cct,
2272 client_messenger->get_myaddrs().front().get_sockaddr_storage());
2273 string back_iface = pick_iface(
2274 cct,
2275 cluster_messenger->get_myaddrs().front().get_sockaddr_storage());
2276 int r = get_iface_numa_node(front_iface, &front_node);
92f5a8d4 2277 if (r >= 0 && front_node >= 0) {
11fdf7f2 2278 dout(1) << __func__ << " public network " << front_iface << " numa node "
92f5a8d4 2279 << front_node << dendl;
11fdf7f2 2280 r = get_iface_numa_node(back_iface, &back_node);
92f5a8d4 2281 if (r >= 0 && back_node >= 0) {
11fdf7f2
TL
2282 dout(1) << __func__ << " cluster network " << back_iface << " numa node "
2283 << back_node << dendl;
2284 if (front_node == back_node &&
2285 front_node == store_node) {
2286 dout(1) << " objectstore and network numa nodes all match" << dendl;
2287 if (g_conf().get_val<bool>("osd_numa_auto_affinity")) {
2288 numa_node = front_node;
2289 }
92f5a8d4
TL
2290 } else if (front_node != back_node) {
2291 dout(1) << __func__ << " public and cluster network numa nodes do not match"
2292 << dendl;
11fdf7f2
TL
2293 } else {
2294 dout(1) << __func__ << " objectstore and network numa nodes do not match"
2295 << dendl;
2296 }
92f5a8d4
TL
2297 } else if (back_node == -2) {
2298 dout(1) << __func__ << " cluster network " << back_iface
2299 << " ports numa nodes do not match" << dendl;
2300 } else {
2301 derr << __func__ << " unable to identify cluster interface '" << back_iface
2302 << "' numa node: " << cpp_strerror(r) << dendl;
11fdf7f2 2303 }
92f5a8d4
TL
2304 } else if (front_node == -2) {
2305 dout(1) << __func__ << " public network " << front_iface
2306 << " ports numa nodes do not match" << dendl;
11fdf7f2
TL
2307 } else {
2308 derr << __func__ << " unable to identify public interface '" << front_iface
2309 << "' numa node: " << cpp_strerror(r) << dendl;
2310 }
2311 if (int node = g_conf().get_val<int64_t>("osd_numa_node"); node >= 0) {
2312 // this takes precedence over the automagic logic above
2313 numa_node = node;
2314 }
2315 if (numa_node >= 0) {
2316 int r = get_numa_node_cpu_set(numa_node, &numa_cpu_set_size, &numa_cpu_set);
2317 if (r < 0) {
2318 dout(1) << __func__ << " unable to determine numa node " << numa_node
2319 << " CPUs" << dendl;
2320 numa_node = -1;
2321 } else {
2322 dout(1) << __func__ << " setting numa affinity to node " << numa_node
2323 << " cpus "
2324 << cpu_set_to_str_list(numa_cpu_set_size, &numa_cpu_set)
2325 << dendl;
92f5a8d4 2326 r = set_cpu_affinity_all_threads(numa_cpu_set_size, &numa_cpu_set);
11fdf7f2
TL
2327 if (r < 0) {
2328 r = -errno;
2329 derr << __func__ << " failed to set numa affinity: " << cpp_strerror(r)
2330 << dendl;
2331 numa_node = -1;
2332 }
2333 }
2334 } else {
2335 dout(1) << __func__ << " not setting numa affinity" << dendl;
2336 }
7c673cae
FG
2337 return 0;
2338}
2339
2340// asok
2341
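// Admin socket plumbing: OSDSocketHook forwards every registered
// command to OSD::asok_command(), which runs the command and reports
// the result through on_finish.  Illustrative invocations (assuming
// the usual 'ceph daemon' admin socket path) look like:
//
//   ceph daemon osd.<id> status
//   ceph daemon osd.<id> dump_ops_in_flight
//   ceph daemon osd.<id> dump_osd_network
//
// where the command names match the prefixes handled in asok_command()
// below.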
2342class OSDSocketHook : public AdminSocketHook {
2343 OSD *osd;
2344public:
2345 explicit OSDSocketHook(OSD *o) : osd(o) {}
9f95a23c
TL
2346 int call(std::string_view prefix, const cmdmap_t& cmdmap,
2347 Formatter *f,
2348 std::ostream& ss,
2349 bufferlist& out) override {
2350 ceph_abort("should use async hook");
2351 }
2352 void call_async(
2353 std::string_view prefix,
2354 const cmdmap_t& cmdmap,
2355 Formatter *f,
2356 const bufferlist& inbl,
2357 std::function<void(int,const std::string&,bufferlist&)> on_finish) override {
11fdf7f2 2358 try {
9f95a23c
TL
2359 osd->asok_command(prefix, cmdmap, f, inbl, on_finish);
2360 } catch (const TOPNSPC::common::bad_cmd_get& e) {
2361 bufferlist empty;
2362 on_finish(-EINVAL, e.what(), empty);
11fdf7f2 2363 }
7c673cae
FG
2364 }
2365};
2366
11fdf7f2
TL
2367std::set<int64_t> OSD::get_mapped_pools()
2368{
2369 std::set<int64_t> pools;
2370 std::vector<spg_t> pgids;
2371 _get_pgids(&pgids);
2372 for (const auto &pgid : pgids) {
2373 pools.insert(pgid.pool());
2374 }
2375 return pools;
2376}
2377
9f95a23c
TL
2378void OSD::asok_command(
2379 std::string_view prefix, const cmdmap_t& cmdmap,
2380 Formatter *f,
2381 const bufferlist& inbl,
2382 std::function<void(int,const std::string&,bufferlist&)> on_finish)
7c673cae 2383{
9f95a23c
TL
2384 int ret = 0;
2385 stringstream ss; // stderr error message stream
2386 bufferlist outbl; // if empty at end, we'll dump formatter as output
2387
2388 // --- PG commands are routed here to PG::do_command ---
2389 if (prefix == "pg" ||
2390 prefix == "query" ||
2391 prefix == "mark_unfound_lost" ||
2392 prefix == "list_unfound" ||
2393 prefix == "scrub" ||
2394 prefix == "deep_scrub"
2395 ) {
2396 string pgidstr;
2397 pg_t pgid;
2398 if (!cmd_getval(cmdmap, "pgid", pgidstr)) {
2399 ss << "no pgid specified";
2400 ret = -EINVAL;
2401 goto out;
2402 }
2403 if (!pgid.parse(pgidstr.c_str())) {
2404 ss << "couldn't parse pgid '" << pgidstr << "'";
2405 ret = -EINVAL;
2406 goto out;
2407 }
2408 spg_t pcand;
2409 PGRef pg;
2410 if (get_osdmap()->get_primary_shard(pgid, &pcand) &&
2411 (pg = _lookup_lock_pg(pcand))) {
2412 if (pg->is_primary()) {
2413 cmdmap_t new_cmdmap = cmdmap;
2414 try {
2415 pg->do_command(prefix, new_cmdmap, inbl, on_finish);
2416 pg->unlock();
2417 return; // the pg handler calls on_finish directly
2418 } catch (const TOPNSPC::common::bad_cmd_get& e) {
2419 pg->unlock();
2420 ss << e.what();
2421 ret = -EINVAL;
2422 goto out;
2423 }
2424 } else {
2425 ss << "not primary for pgid " << pgid;
2426 // do not reply; they will get newer maps and realize they
2427 // need to resend.
2428 pg->unlock();
2429 ret = -EAGAIN;
2430 goto out;
2431 }
2432 } else {
2433 ss << "i don't have pgid " << pgid;
2434 ret = -ENOENT;
2435 }
2436 }
2437
2438 // --- OSD commands follow ---
2439
2440 else if (prefix == "status") {
2441 lock_guard l(osd_lock);
7c673cae
FG
2442 f->open_object_section("status");
2443 f->dump_stream("cluster_fsid") << superblock.cluster_fsid;
2444 f->dump_stream("osd_fsid") << superblock.osd_fsid;
2445 f->dump_unsigned("whoami", superblock.whoami);
2446 f->dump_string("state", get_state_name(get_state()));
2447 f->dump_unsigned("oldest_map", superblock.oldest_map);
2448 f->dump_unsigned("newest_map", superblock.newest_map);
11fdf7f2 2449 f->dump_unsigned("num_pgs", num_pgs);
7c673cae 2450 f->close_section();
9f95a23c 2451 } else if (prefix == "flush_journal") {
7c673cae 2452 store->flush_journal();
9f95a23c
TL
2453 } else if (prefix == "dump_ops_in_flight" ||
2454 prefix == "ops" ||
2455 prefix == "dump_blocked_ops" ||
2456 prefix == "dump_historic_ops" ||
2457 prefix == "dump_historic_ops_by_duration" ||
2458 prefix == "dump_historic_slow_ops") {
c07f9fc5
FG
2459
2460 const string error_str = "op_tracker tracking is not enabled now, so no ops are tracked currently, \
2461even those get stuck. Please enable \"osd_enable_op_tracker\", and the tracker \
2462will start to track new ops received afterwards.";
2463
2464 set<string> filters;
2465 vector<string> filter_str;
9f95a23c 2466 if (cmd_getval(cmdmap, "filterstr", filter_str)) {
c07f9fc5
FG
2467 copy(filter_str.begin(), filter_str.end(),
2468 inserter(filters, filters.end()));
2469 }
2470
9f95a23c
TL
2471 if (prefix == "dump_ops_in_flight" ||
2472 prefix == "ops") {
c07f9fc5
FG
2473 if (!op_tracker.dump_ops_in_flight(f, false, filters)) {
2474 ss << error_str;
9f95a23c
TL
2475 ret = -EINVAL;
2476 goto out;
c07f9fc5
FG
2477 }
2478 }
9f95a23c 2479 if (prefix == "dump_blocked_ops") {
c07f9fc5
FG
2480 if (!op_tracker.dump_ops_in_flight(f, true, filters)) {
2481 ss << error_str;
9f95a23c
TL
2482 ret = -EINVAL;
2483 goto out;
c07f9fc5
FG
2484 }
2485 }
9f95a23c 2486 if (prefix == "dump_historic_ops") {
c07f9fc5
FG
2487 if (!op_tracker.dump_historic_ops(f, false, filters)) {
2488 ss << error_str;
9f95a23c
TL
2489 ret = -EINVAL;
2490 goto out;
c07f9fc5
FG
2491 }
2492 }
9f95a23c 2493 if (prefix == "dump_historic_ops_by_duration") {
c07f9fc5
FG
2494 if (!op_tracker.dump_historic_ops(f, true, filters)) {
2495 ss << error_str;
9f95a23c
TL
2496 ret = -EINVAL;
2497 goto out;
c07f9fc5
FG
2498 }
2499 }
9f95a23c 2500 if (prefix == "dump_historic_slow_ops") {
c07f9fc5
FG
2501 if (!op_tracker.dump_historic_slow_ops(f, filters)) {
2502 ss << error_str;
9f95a23c
TL
2503 ret = -EINVAL;
2504 goto out;
c07f9fc5 2505 }
7c673cae 2506 }
9f95a23c 2507 } else if (prefix == "dump_op_pq_state") {
7c673cae
FG
2508 f->open_object_section("pq");
2509 op_shardedwq.dump(f);
2510 f->close_section();
9f95a23c 2511 } else if (prefix == "dump_blacklist") {
7c673cae
FG
2512 list<pair<entity_addr_t,utime_t> > bl;
2513 OSDMapRef curmap = service.get_osdmap();
2514
2515 f->open_array_section("blacklist");
2516 curmap->get_blacklist(&bl);
2517 for (list<pair<entity_addr_t,utime_t> >::iterator it = bl.begin();
2518 it != bl.end(); ++it) {
224ce89b 2519 f->open_object_section("entry");
7c673cae
FG
2520 f->open_object_section("entity_addr_t");
2521 it->first.dump(f);
2522 f->close_section(); //entity_addr_t
2523 it->second.localtime(f->dump_stream("expire_time"));
2524 f->close_section(); //entry
2525 }
2526 f->close_section(); //blacklist
9f95a23c 2527 } else if (prefix == "dump_watchers") {
7c673cae
FG
2528 list<obj_watch_item_t> watchers;
2529 // scan pg's
11fdf7f2
TL
2530 vector<PGRef> pgs;
2531 _get_pgs(&pgs);
2532 for (auto& pg : pgs) {
2533 list<obj_watch_item_t> pg_watchers;
2534 pg->get_watchers(&pg_watchers);
2535 watchers.splice(watchers.end(), pg_watchers);
7c673cae
FG
2536 }
2537
2538 f->open_array_section("watchers");
2539 for (list<obj_watch_item_t>::iterator it = watchers.begin();
2540 it != watchers.end(); ++it) {
2541
224ce89b 2542 f->open_object_section("watch");
7c673cae
FG
2543
2544 f->dump_string("namespace", it->obj.nspace);
2545 f->dump_string("object", it->obj.oid.name);
2546
2547 f->open_object_section("entity_name");
2548 it->wi.name.dump(f);
2549 f->close_section(); //entity_name_t
2550
224ce89b
WB
2551 f->dump_unsigned("cookie", it->wi.cookie);
2552 f->dump_unsigned("timeout", it->wi.timeout_seconds);
7c673cae
FG
2553
2554 f->open_object_section("entity_addr_t");
2555 it->wi.addr.dump(f);
2556 f->close_section(); //entity_addr_t
2557
2558 f->close_section(); //watch
2559 }
2560
2561 f->close_section(); //watchers
9f95a23c 2562 } else if (prefix == "dump_recovery_reservations") {
7c673cae
FG
2563 f->open_object_section("reservations");
2564 f->open_object_section("local_reservations");
2565 service.local_reserver.dump(f);
2566 f->close_section();
2567 f->open_object_section("remote_reservations");
2568 service.remote_reserver.dump(f);
2569 f->close_section();
2570 f->close_section();
9f95a23c 2571 } else if (prefix == "dump_scrub_reservations") {
eafe8130
TL
2572 f->open_object_section("scrub_reservations");
2573 service.dump_scrub_reservations(f);
2574 f->close_section();
9f95a23c 2575 } else if (prefix == "get_latest_osdmap") {
7c673cae 2576 get_latest_osdmap();
9f95a23c 2577 } else if (prefix == "set_heap_property") {
7c673cae
FG
2578 string property;
2579 int64_t value = 0;
2580 string error;
2581 bool success = false;
9f95a23c 2582 if (!cmd_getval(cmdmap, "property", property)) {
7c673cae
FG
2583 error = "unable to get property";
2584 success = false;
9f95a23c 2585 } else if (!cmd_getval(cmdmap, "value", value)) {
7c673cae
FG
2586 error = "unable to get value";
2587 success = false;
2588 } else if (value < 0) {
2589 error = "negative value not allowed";
2590 success = false;
2591 } else if (!ceph_heap_set_numeric_property(property.c_str(), (size_t)value)) {
2592 error = "invalid property";
2593 success = false;
2594 } else {
2595 success = true;
2596 }
2597 f->open_object_section("result");
2598 f->dump_string("error", error);
2599 f->dump_bool("success", success);
2600 f->close_section();
9f95a23c 2601 } else if (prefix == "get_heap_property") {
7c673cae
FG
2602 string property;
2603 size_t value = 0;
2604 string error;
2605 bool success = false;
9f95a23c 2606 if (!cmd_getval(cmdmap, "property", property)) {
7c673cae
FG
2607 error = "unable to get property";
2608 success = false;
2609 } else if (!ceph_heap_get_numeric_property(property.c_str(), &value)) {
2610 error = "invalid property";
2611 success = false;
2612 } else {
2613 success = true;
2614 }
2615 f->open_object_section("result");
2616 f->dump_string("error", error);
2617 f->dump_bool("success", success);
2618 f->dump_int("value", value);
2619 f->close_section();
9f95a23c 2620 } else if (prefix == "dump_objectstore_kv_stats") {
7c673cae 2621 store->get_db_statistics(f);
9f95a23c 2622 } else if (prefix == "dump_scrubs") {
7c673cae 2623 service.dumps_scrub(f);
9f95a23c 2624 } else if (prefix == "calc_objectstore_db_histogram") {
7c673cae 2625 store->generate_db_histogram(f);
9f95a23c 2626 } else if (prefix == "flush_store_cache") {
11fdf7f2 2627 store->flush_cache(&ss);
9f95a23c 2628 } else if (prefix == "dump_pgstate_history") {
7c673cae 2629 f->open_object_section("pgstate_history");
9f95a23c 2630 f->open_array_section("pgs");
11fdf7f2
TL
2631 vector<PGRef> pgs;
2632 _get_pgs(&pgs);
2633 for (auto& pg : pgs) {
9f95a23c 2634 f->open_object_section("pg");
11fdf7f2 2635 f->dump_stream("pg") << pg->pg_id;
9f95a23c 2636 f->dump_string("currently", pg->get_current_state());
11fdf7f2 2637 pg->dump_pgstate_history(f);
9f95a23c 2638 f->close_section();
7c673cae
FG
2639 }
2640 f->close_section();
9f95a23c
TL
2641 f->close_section();
2642 } else if (prefix == "compact") {
224ce89b
WB
2643 dout(1) << "triggering manual compaction" << dendl;
2644 auto start = ceph::coarse_mono_clock::now();
2645 store->compact();
2646 auto end = ceph::coarse_mono_clock::now();
11fdf7f2 2647 double duration = std::chrono::duration<double>(end-start).count();
224ce89b 2648 dout(1) << "finished manual compaction in "
11fdf7f2 2649 << duration
224ce89b
WB
2650 << " seconds" << dendl;
2651 f->open_object_section("compact_result");
11fdf7f2
TL
2652 f->dump_float("elapsed_time", duration);
2653 f->close_section();
9f95a23c 2654 } else if (prefix == "get_mapped_pools") {
11fdf7f2
TL
2655 f->open_array_section("mapped_pools");
2656 set<int64_t> poollist = get_mapped_pools();
2657 for (auto pool : poollist) {
2658 f->dump_int("pool_id", pool);
2659 }
2660 f->close_section();
9f95a23c 2661 } else if (prefix == "smart") {
11fdf7f2 2662 string devid;
9f95a23c
TL
2663 cmd_getval(cmdmap, "devid", devid);
2664 ostringstream out;
2665 probe_smart(devid, out);
2666 outbl.append(out.str());
2667 } else if (prefix == "list_devices") {
11fdf7f2
TL
2668 set<string> devnames;
2669 store->get_devices(&devnames);
9f95a23c 2670 f->open_array_section("list_devices");
11fdf7f2
TL
2671 for (auto dev : devnames) {
2672 if (dev.find("dm-") == 0) {
2673 continue;
2674 }
9f95a23c
TL
2675 string err;
2676 f->open_object_section("device");
11fdf7f2 2677 f->dump_string("device", "/dev/" + dev);
9f95a23c
TL
2678 f->dump_string("device_id", get_device_id(dev, &err));
2679 f->close_section();
11fdf7f2 2680 }
224ce89b 2681 f->close_section();
9f95a23c
TL
2682 } else if (prefix == "send_beacon") {
2683 lock_guard l(osd_lock);
11fdf7f2
TL
2684 if (is_active()) {
2685 send_beacon(ceph::coarse_mono_clock::now());
2686 }
9f95a23c
TL
2687 }
2688
2689 else if (prefix == "cluster_log") {
2690 vector<string> msg;
2691 cmd_getval(cmdmap, "message", msg);
2692 if (msg.empty()) {
2693 ret = -EINVAL;
2694 ss << "ignoring empty log message";
2695 goto out;
2696 }
2697 string message = msg.front();
2698 for (vector<string>::iterator a = ++msg.begin(); a != msg.end(); ++a)
2699 message += " " + *a;
2700 string lvl;
2701 cmd_getval(cmdmap, "level", lvl);
2702 clog_type level = string_to_clog_type(lvl);
2703 if (level < 0) {
2704 ret = -EINVAL;
2705 ss << "unknown level '" << lvl << "'";
2706 goto out;
2707 }
2708 clog->do_log(level, message);
2709 }
2710
2711 else if (prefix == "bench") {
9f95a23c
TL
2712 int64_t count;
2713 int64_t bsize;
2714 int64_t osize, onum;
2715 // default count 1G, size 4MB
2716 cmd_getval(cmdmap, "count", count, (int64_t)1 << 30);
2717 cmd_getval(cmdmap, "size", bsize, (int64_t)4 << 20);
2718 cmd_getval(cmdmap, "object_size", osize, (int64_t)0);
2719 cmd_getval(cmdmap, "object_num", onum, (int64_t)0);
2720
2721 uint32_t duration = cct->_conf->osd_bench_duration;
2722
2723 if (bsize > (int64_t) cct->_conf->osd_bench_max_block_size) {
2724 // let us limit the block size because the next checks rely on it
2725 // having a sane value. If we allow any block size to be set things
2726 // can still go sideways.
2727 ss << "block 'size' values are capped at "
2728 << byte_u_t(cct->_conf->osd_bench_max_block_size) << ". If you wish to use"
2729 << " a higher value, please adjust 'osd_bench_max_block_size'";
2730 ret = -EINVAL;
2731 goto out;
2732 } else if (bsize < (int64_t) (1 << 20)) {
2733 // entering the realm of small block sizes.
2734 // limit the count to a sane value, assuming a configurable amount of
2735 // IOPS and duration, so that the OSD doesn't get hung up on this,
2736 // preventing timeouts from going off
2737 int64_t max_count =
2738 bsize * duration * cct->_conf->osd_bench_small_size_max_iops;
2739 if (count > max_count) {
2740 ss << "'count' values greater than " << max_count
2741 << " for a block size of " << byte_u_t(bsize) << ", assuming "
2742 << cct->_conf->osd_bench_small_size_max_iops << " IOPS,"
2743 << " for " << duration << " seconds,"
2744 << " can cause ill effects on osd. "
2745 << " Please adjust 'osd_bench_small_size_max_iops' with a higher"
2746 << " value if you wish to use a higher 'count'.";
2747 ret = -EINVAL;
2748 goto out;
eafe8130
TL
2749 }
2750 } else {
9f95a23c
TL
2751 // 1MB block sizes are big enough so that we get more stuff done.
2752 // However, to avoid the osd from getting hung on this and having
2753 // timers being triggered, we are going to limit the count assuming
2754 // a configurable throughput and duration.
2755 // NOTE: max_count is the total amount of bytes that we believe we
2756 // will be able to write during 'duration' for the given
2757 // throughput. The block size hardly impacts this unless it's
2758 // way too big. Given we already check how big the block size
2759 // is, it's safe to assume everything will check out.
2760 int64_t max_count =
2761 cct->_conf->osd_bench_large_size_max_throughput * duration;
2762 if (count > max_count) {
2763 ss << "'count' values greater than " << max_count
2764 << " for a block size of " << byte_u_t(bsize) << ", assuming "
2765 << byte_u_t(cct->_conf->osd_bench_large_size_max_throughput) << "/s,"
2766 << " for " << duration << " seconds,"
2767 << " can cause ill effects on osd. "
2768 << " Please adjust 'osd_bench_large_size_max_throughput'"
2769 << " with a higher value if you wish to use a higher 'count'.";
2770 ret = -EINVAL;
2771 goto out;
2772 }
eafe8130 2773 }
eafe8130 2774
9f95a23c
TL
2775 if (osize && bsize > osize)
2776 bsize = osize;
eafe8130 2777
9f95a23c
TL
2778 dout(1) << " bench count " << count
2779 << " bsize " << byte_u_t(bsize) << dendl;
eafe8130 2780
9f95a23c
TL
2781 ObjectStore::Transaction cleanupt;
2782
2783 if (osize && onum) {
2784 bufferlist bl;
2785 bufferptr bp(osize);
2786 bp.zero();
2787 bl.push_back(std::move(bp));
2788 bl.rebuild_page_aligned();
2789 for (int i=0; i<onum; ++i) {
2790 char nm[30];
2791 snprintf(nm, sizeof(nm), "disk_bw_test_%d", i);
2792 object_t oid(nm);
2793 hobject_t soid(sobject_t(oid, 0));
2794 ObjectStore::Transaction t;
2795 t.write(coll_t(), ghobject_t(soid), 0, osize, bl);
2796 store->queue_transaction(service.meta_ch, std::move(t), NULL);
2797 cleanupt.remove(coll_t(), ghobject_t(soid));
2798 }
2799 }
2800
2801 bufferlist bl;
2802 bufferptr bp(bsize);
2803 bp.zero();
2804 bl.push_back(std::move(bp));
2805 bl.rebuild_page_aligned();
2806
2807 {
2808 C_SaferCond waiter;
2809 if (!service.meta_ch->flush_commit(&waiter)) {
2810 waiter.wait();
2811 }
2812 }
2813
2814 utime_t start = ceph_clock_now();
2815 for (int64_t pos = 0; pos < count; pos += bsize) {
2816 char nm[30];
2817 unsigned offset = 0;
2818 if (onum && osize) {
2819 snprintf(nm, sizeof(nm), "disk_bw_test_%d", (int)(rand() % onum));
2820 offset = rand() % (osize / bsize) * bsize;
2821 } else {
2822 snprintf(nm, sizeof(nm), "disk_bw_test_%lld", (long long)pos);
2823 }
2824 object_t oid(nm);
2825 hobject_t soid(sobject_t(oid, 0));
2826 ObjectStore::Transaction t;
2827 t.write(coll_t::meta(), ghobject_t(soid), offset, bsize, bl);
2828 store->queue_transaction(service.meta_ch, std::move(t), NULL);
2829 if (!onum || !osize)
2830 cleanupt.remove(coll_t::meta(), ghobject_t(soid));
2831 }
2832
2833 {
2834 C_SaferCond waiter;
2835 if (!service.meta_ch->flush_commit(&waiter)) {
2836 waiter.wait();
2837 }
2838 }
2839 utime_t end = ceph_clock_now();
2840
2841 // clean up
2842 store->queue_transaction(service.meta_ch, std::move(cleanupt), NULL);
2843 {
2844 C_SaferCond waiter;
2845 if (!service.meta_ch->flush_commit(&waiter)) {
2846 waiter.wait();
2847 }
2848 }
2849
2850 double elapsed = end - start;
2851 double rate = count / elapsed;
2852 double iops = rate / bsize;
2853 f->open_object_section("osd_bench_results");
2854 f->dump_int("bytes_written", count);
2855 f->dump_int("blocksize", bsize);
2856 f->dump_float("elapsed_sec", elapsed);
2857 f->dump_float("bytes_per_sec", rate);
2858 f->dump_float("iops", iops);
2859 f->close_section();
2860 }
2861
2862 else if (prefix == "flush_pg_stats") {
2863 mgrc.send_pgstats();
2864 f->dump_unsigned("stat_seq", service.get_osd_stat_seq());
2865 }
2866
2867 else if (prefix == "heap") {
2868 ret = ceph::osd_cmds::heap(*cct, cmdmap, *f, ss);
2869 }
2870
2871 else if (prefix == "debug dump_missing") {
2872 f->open_array_section("pgs");
2873 vector<PGRef> pgs;
2874 _get_pgs(&pgs);
2875 for (auto& pg : pgs) {
2876 string s = stringify(pg->pg_id);
2877 f->open_array_section(s.c_str());
2878 pg->lock();
2879 pg->dump_missing(f);
2880 pg->unlock();
2881 f->close_section();
2882 }
2883 f->close_section();
2884 }
2885
2886 else if (prefix == "debug kick_recovery_wq") {
2887 int64_t delay;
2888 cmd_getval(cmdmap, "delay", delay);
2889 ostringstream oss;
2890 oss << delay;
2891 ret = cct->_conf.set_val("osd_recovery_delay_start", oss.str().c_str());
2892 if (ret != 0) {
2893 ss << "kick_recovery_wq: error setting "
2894 << "osd_recovery_delay_start to '" << delay << "': error "
2895 << ret;
2896 goto out;
2897 }
2898 cct->_conf.apply_changes(nullptr);
2899 ss << "kicking recovery queue. set osd_recovery_delay_start "
2900 << "to " << cct->_conf->osd_recovery_delay_start;
2901 }
2902
2903 else if (prefix == "cpu_profiler") {
2904 ostringstream ds;
2905 string arg;
2906 cmd_getval(cmdmap, "arg", arg);
2907 vector<string> argvec;
2908 get_str_vec(arg, argvec);
2909 cpu_profiler_handle_command(argvec, ds);
2910 outbl.append(ds.str());
2911 }
2912
2913 else if (prefix == "dump_pg_recovery_stats") {
2914 lock_guard l(osd_lock);
2915 pg_recovery_stats.dump_formatted(f);
2916 }
2917
2918 else if (prefix == "reset_pg_recovery_stats") {
2919 lock_guard l(osd_lock);
2920 pg_recovery_stats.reset();
2921 }
2922
2923 else if (prefix == "perf histogram dump") {
2924 std::string logger;
2925 std::string counter;
2926 cmd_getval(cmdmap, "logger", logger);
2927 cmd_getval(cmdmap, "counter", counter);
2928 cct->get_perfcounters_collection()->dump_formatted_histograms(
2929 f, false, logger, counter);
2930 }
2931
2932 else if (prefix == "cache drop") {
2933 lock_guard l(osd_lock);
2934 dout(20) << "clearing all caches" << dendl;
2935 // Clear the objectstore's cache - onode and buffer for Bluestore,
2936 // system's pagecache for Filestore
2937 ret = store->flush_cache(&ss);
2938 if (ret < 0) {
2939 ss << "Error flushing objectstore cache: " << cpp_strerror(ret);
2940 goto out;
2941 }
2942 // Clear the objectcontext cache (per PG)
2943 vector<PGRef> pgs;
2944 _get_pgs(&pgs);
2945 for (auto& pg: pgs) {
2946 pg->clear_cache();
2947 }
2948 }
2949
2950 else if (prefix == "cache status") {
2951 lock_guard l(osd_lock);
2952 int obj_ctx_count = 0;
2953 vector<PGRef> pgs;
2954 _get_pgs(&pgs);
2955 for (auto& pg: pgs) {
2956 obj_ctx_count += pg->get_cache_obj_count();
2957 }
2958 f->open_object_section("cache_status");
2959 f->dump_int("object_ctx", obj_ctx_count);
2960 store->dump_cache_stats(f);
2961 f->close_section();
2962 }
2963
2964 else if (prefix == "scrub_purged_snaps") {
2965 lock_guard l(osd_lock);
2966 scrub_purged_snaps();
2967 }
2968
2969 else if (prefix == "dump_osd_network") {
2970 lock_guard l(osd_lock);
2971 int64_t value = 0;
2972 if (!(cmd_getval(cmdmap, "value", value))) {
2973 // Convert milliseconds to microseconds
2974 value = static_cast<double>(g_conf().get_val<double>(
2975 "mon_warn_on_slow_ping_time")) * 1000;
2976 if (value == 0) {
2977 double ratio = g_conf().get_val<double>("mon_warn_on_slow_ping_ratio");
2978 value = g_conf().get_val<int64_t>("osd_heartbeat_grace");
2979 value *= 1000000 * ratio; // Seconds of grace to microseconds at ratio
2980 }
2981 } else {
2982 // Convert user input to microseconds
2983 value *= 1000;
2984 }
2985 if (value < 0) value = 0;
2986
2987 struct osd_ping_time_t {
2988 uint32_t pingtime;
2989 int to;
2990 bool back;
2991 std::array<uint32_t,3> times;
2992 std::array<uint32_t,3> min;
2993 std::array<uint32_t,3> max;
2994 uint32_t last;
2995 uint32_t last_update;
2996
2997 bool operator<(const osd_ping_time_t& rhs) const {
2998 if (pingtime < rhs.pingtime)
2999 return true;
3000 if (pingtime > rhs.pingtime)
3001 return false;
3002 if (to < rhs.to)
3003 return true;
3004 if (to > rhs.to)
3005 return false;
3006 return back;
3007 }
3008 };
3009
3010 set<osd_ping_time_t> sorted;
3011 // Get pingtimes under lock and not on the stack
eafe8130
TL
3012 map<int, osd_stat_t::Interfaces> *pingtimes = new map<int, osd_stat_t::Interfaces>;
3013 service.get_hb_pingtime(pingtimes);
3014 for (auto j : *pingtimes) {
3015 if (j.second.last_update == 0)
3016 continue;
3017 osd_ping_time_t item;
3018 item.pingtime = std::max(j.second.back_pingtime[0], j.second.back_pingtime[1]);
3019 item.pingtime = std::max(item.pingtime, j.second.back_pingtime[2]);
3020 if (item.pingtime >= value) {
3021 item.to = j.first;
3022 item.times[0] = j.second.back_pingtime[0];
3023 item.times[1] = j.second.back_pingtime[1];
3024 item.times[2] = j.second.back_pingtime[2];
3025 item.min[0] = j.second.back_min[0];
3026 item.min[1] = j.second.back_min[1];
3027 item.min[2] = j.second.back_min[2];
3028 item.max[0] = j.second.back_max[0];
3029 item.max[1] = j.second.back_max[1];
3030 item.max[2] = j.second.back_max[2];
3031 item.last = j.second.back_last;
3032 item.back = true;
3033 item.last_update = j.second.last_update;
3034 sorted.emplace(item);
3035 }
3036 if (j.second.front_last == 0)
3037 continue;
3038 item.pingtime = std::max(j.second.front_pingtime[0], j.second.front_pingtime[1]);
3039 item.pingtime = std::max(item.pingtime, j.second.front_pingtime[2]);
3040 if (item.pingtime >= value) {
3041 item.to = j.first;
3042 item.times[0] = j.second.front_pingtime[0];
3043 item.times[1] = j.second.front_pingtime[1];
3044 item.times[2] = j.second.front_pingtime[2];
3045 item.min[0] = j.second.front_min[0];
3046 item.min[1] = j.second.front_min[1];
3047 item.min[2] = j.second.front_min[2];
3048 item.max[0] = j.second.front_max[0];
3049 item.max[1] = j.second.front_max[1];
3050 item.max[2] = j.second.front_max[2];
3051 item.last = j.second.front_last;
3052 item.last_update = j.second.last_update;
3053 item.back = false;
3054 sorted.emplace(item);
3055 }
3056 }
3057 delete pingtimes;
3058 //
3059 // Network ping times (1min 5min 15min)
3060 f->open_object_section("network_ping_times");
3061 f->dump_int("threshold", value / 1000);
3062 f->open_array_section("entries");
3063 for (auto &sitem : boost::adaptors::reverse(sorted)) {
3064 ceph_assert(sitem.pingtime >= value);
3065 f->open_object_section("entry");
3066
3067 const time_t lu(sitem.last_update);
3068 char buffer[26];
3069 string lustr(ctime_r(&lu, buffer));
3070 lustr.pop_back(); // Remove trailing \n
3071 auto stale = cct->_conf.get_val<int64_t>("osd_heartbeat_stale");
3072 f->dump_string("last update", lustr);
3073 f->dump_bool("stale", ceph_clock_now().sec() - sitem.last_update > stale);
3074 f->dump_int("from osd", whoami);
3075 f->dump_int("to osd", sitem.to);
3076 f->dump_string("interface", (sitem.back ? "back" : "front"));
3077 f->open_object_section("average");
3078 f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.times[0],3).c_str());
3079 f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.times[1],3).c_str());
3080 f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.times[2],3).c_str());
3081 f->close_section(); // average
3082 f->open_object_section("min");
3083 f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.max[0],3).c_str());
3084 f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.max[1],3).c_str());
3085 f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.max[2],3).c_str());
3086 f->close_section(); // min
3087 f->open_object_section("max");
3088 f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.max[0],3).c_str());
3089 f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.max[1],3).c_str());
3090 f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.max[2],3).c_str());
3091 f->close_section(); // max
3092 f->dump_format_unquoted("last", "%s", fixed_u_to_string(sitem.last,3).c_str());
3093 f->close_section(); // entry
3094 }
3095 f->close_section(); // entries
3096 f->close_section(); // network_ping_times
7c673cae 3097 } else {
11fdf7f2 3098 ceph_abort_msg("broken asok registration");
7c673cae 3099 }
9f95a23c
TL
3100
3101 out:
3102 on_finish(ret, ss.str(), outbl);
7c673cae
FG
3103}
3104
3105class TestOpsSocketHook : public AdminSocketHook {
3106 OSDService *service;
3107 ObjectStore *store;
3108public:
3109 TestOpsSocketHook(OSDService *s, ObjectStore *st) : service(s), store(st) {}
9f95a23c
TL
3110 int call(std::string_view command, const cmdmap_t& cmdmap,
3111 Formatter *f,
3112 std::ostream& errss,
3113 bufferlist& out) override {
3114 int r = 0;
3115 stringstream outss;
11fdf7f2 3116 try {
9f95a23c
TL
3117 test_ops(service, store, command, cmdmap, outss);
3118 out.append(outss);
3119 } catch (const TOPNSPC::common::bad_cmd_get& e) {
3120 errss << e.what();
3121 r = -EINVAL;
11fdf7f2 3122 }
9f95a23c 3123 return r;
7c673cae
FG
3124 }
3125 void test_ops(OSDService *service, ObjectStore *store,
11fdf7f2 3126 std::string_view command, const cmdmap_t& cmdmap, ostream &ss);
7c673cae
FG
3127
3128};
3129
3130class OSD::C_Tick : public Context {
3131 OSD *osd;
3132 public:
3133 explicit C_Tick(OSD *o) : osd(o) {}
3134 void finish(int r) override {
3135 osd->tick();
3136 }
3137};
3138
3139class OSD::C_Tick_WithoutOSDLock : public Context {
3140 OSD *osd;
3141 public:
3142 explicit C_Tick_WithoutOSDLock(OSD *o) : osd(o) {}
3143 void finish(int r) override {
3144 osd->tick_without_osd_lock();
3145 }
3146};
3147
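// Optionally expose the ObjectStore under <osd_data>/fuse via a
// FuseStore when osd_objectstore_fuse is set, and tear it down again
// when the option is cleared or on shutdown.  A no-op unless the OSD
// was built with libfuse.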
3148int OSD::enable_disable_fuse(bool stop)
3149{
3150#ifdef HAVE_LIBFUSE
3151 int r;
3152 string mntpath = cct->_conf->osd_data + "/fuse";
3153 if (fuse_store && (stop || !cct->_conf->osd_objectstore_fuse)) {
3154 dout(1) << __func__ << " disabling" << dendl;
3155 fuse_store->stop();
3156 delete fuse_store;
3157 fuse_store = NULL;
3158 r = ::rmdir(mntpath.c_str());
7c673cae 3159 if (r < 0) {
c07f9fc5
FG
3160 r = -errno;
3161 derr << __func__ << " failed to rmdir " << mntpath << ": "
3162 << cpp_strerror(r) << dendl;
7c673cae
FG
3163 return r;
3164 }
3165 return 0;
3166 }
3167 if (!fuse_store && cct->_conf->osd_objectstore_fuse) {
3168 dout(1) << __func__ << " enabling" << dendl;
3169 r = ::mkdir(mntpath.c_str(), 0700);
3170 if (r < 0)
3171 r = -errno;
3172 if (r < 0 && r != -EEXIST) {
3173 derr << __func__ << " unable to create " << mntpath << ": "
3174 << cpp_strerror(r) << dendl;
3175 return r;
3176 }
3177 fuse_store = new FuseStore(store, mntpath);
3178 r = fuse_store->start();
3179 if (r < 0) {
3180 derr << __func__ << " unable to start fuse: " << cpp_strerror(r) << dendl;
3181 delete fuse_store;
3182 fuse_store = NULL;
3183 return r;
3184 }
3185 }
3186#endif // HAVE_LIBFUSE
3187 return 0;
3188}
3189
9f95a23c
TL
3190size_t OSD::get_num_cache_shards()
3191{
3192 return cct->_conf.get_val<Option::size_t>("osd_num_cache_shards");
3193}
3194
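// Sizing of the sharded op queue: the shard count and the threads per
// shard come from explicit overrides when set, otherwise from the
// hdd/ssd defaults selected by whether the store is rotational.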
31f18b77
FG
3195int OSD::get_num_op_shards()
3196{
3197 if (cct->_conf->osd_op_num_shards)
3198 return cct->_conf->osd_op_num_shards;
3199 if (store_is_rotational)
3200 return cct->_conf->osd_op_num_shards_hdd;
3201 else
3202 return cct->_conf->osd_op_num_shards_ssd;
3203}
3204
3205int OSD::get_num_op_threads()
3206{
3207 if (cct->_conf->osd_op_num_threads_per_shard)
3208 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard;
3209 if (store_is_rotational)
3210 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard_hdd;
3211 else
3212 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard_ssd;
3213}
3214
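// Background-work throttles (the recovery/delete/snap-trim sleeps and
// recovery_max_active) follow the same pattern: an explicit setting
// wins, otherwise a default is chosen based on whether the store (and,
// for the sleeps, the journal) is rotational.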
c07f9fc5
FG
3215float OSD::get_osd_recovery_sleep()
3216{
3217 if (cct->_conf->osd_recovery_sleep)
3218 return cct->_conf->osd_recovery_sleep;
d2e6a577 3219 if (!store_is_rotational && !journal_is_rotational)
c07f9fc5 3220 return cct->_conf->osd_recovery_sleep_ssd;
d2e6a577 3221 else if (store_is_rotational && !journal_is_rotational)
11fdf7f2 3222 return cct->_conf.get_val<double>("osd_recovery_sleep_hybrid");
d2e6a577
FG
3223 else
3224 return cct->_conf->osd_recovery_sleep_hdd;
c07f9fc5
FG
3225}
3226
11fdf7f2
TL
3227float OSD::get_osd_delete_sleep()
3228{
3229 float osd_delete_sleep = cct->_conf.get_val<double>("osd_delete_sleep");
3230 if (osd_delete_sleep > 0)
3231 return osd_delete_sleep;
3232 if (!store_is_rotational && !journal_is_rotational)
3233 return cct->_conf.get_val<double>("osd_delete_sleep_ssd");
3234 if (store_is_rotational && !journal_is_rotational)
3235 return cct->_conf.get_val<double>("osd_delete_sleep_hybrid");
3236 return cct->_conf.get_val<double>("osd_delete_sleep_hdd");
3237}
3238
9f95a23c
TL
3239int OSD::get_recovery_max_active()
3240{
3241 if (cct->_conf->osd_recovery_max_active)
3242 return cct->_conf->osd_recovery_max_active;
3243 if (store_is_rotational)
3244 return cct->_conf->osd_recovery_max_active_hdd;
3245 else
3246 return cct->_conf->osd_recovery_max_active_ssd;
3247}
3248
494da23a
TL
3249float OSD::get_osd_snap_trim_sleep()
3250{
3251 float osd_snap_trim_sleep = cct->_conf.get_val<double>("osd_snap_trim_sleep");
3252 if (osd_snap_trim_sleep > 0)
3253 return osd_snap_trim_sleep;
3254 if (!store_is_rotational && !journal_is_rotational)
3255 return cct->_conf.get_val<double>("osd_snap_trim_sleep_ssd");
3256 if (store_is_rotational && !journal_is_rotational)
3257 return cct->_conf.get_val<double>("osd_snap_trim_sleep_hybrid");
3258 return cct->_conf.get_val<double>("osd_snap_trim_sleep_hdd");
3259}
3260
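// Bring the OSD up: mount the ObjectStore, sanity-check long object
// name handling, read and (if necessary) upgrade the superblock, load
// the current OSDMap, and continue with the rest of startup further
// down (heartbeats, messengers, mon authentication, boot).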
7c673cae
FG
3261int OSD::init()
3262{
9f95a23c 3263 OSDMapRef osdmap;
7c673cae 3264 CompatSet initial, diff;
11fdf7f2 3265 std::lock_guard lock(osd_lock);
7c673cae
FG
3266 if (is_stopping())
3267 return 0;
3268
3269 tick_timer.init();
3270 tick_timer_without_osd_lock.init();
3271 service.recovery_request_timer.init();
11fdf7f2
TL
3272 service.sleep_timer.init();
3273
3274 boot_finisher.start();
3275
3276 {
3277 string val;
3278 store->read_meta("require_osd_release", &val);
9f95a23c 3279 last_require_osd_release = ceph_release_from_name(val);
11fdf7f2 3280 }
7c673cae
FG
3281
3282 // mount.
31f18b77
FG
3283 dout(2) << "init " << dev_path
3284 << " (looks like " << (store_is_rotational ? "hdd" : "ssd") << ")"
3285 << dendl;
d2e6a577 3286 dout(2) << "journal " << journal_path << dendl;
11fdf7f2 3287 ceph_assert(store); // call pre_init() first!
7c673cae 3288
9f95a23c 3289 store->set_cache_shards(get_num_cache_shards());
7c673cae
FG
3290
3291 int r = store->mount();
3292 if (r < 0) {
3293 derr << "OSD:init: unable to mount object store" << dendl;
3294 return r;
3295 }
d2e6a577
FG
3296 journal_is_rotational = store->is_journal_rotational();
3297 dout(2) << "journal looks like " << (journal_is_rotational ? "hdd" : "ssd")
3298 << dendl;
7c673cae
FG
3299
3300 enable_disable_fuse(false);
3301
3302 dout(2) << "boot" << dendl;
3303
11fdf7f2
TL
3304 service.meta_ch = store->open_collection(coll_t::meta());
3305
7c673cae
FG
3306 // initialize the daily loadavg with current 15min loadavg
3307 double loadavgs[3];
3308 if (getloadavg(loadavgs, 3) == 3) {
3309 daily_loadavg = loadavgs[2];
3310 } else {
3311 derr << "OSD::init() : couldn't read loadavgs\n" << dendl;
3312 daily_loadavg = 1.0;
3313 }
3314
3315 int rotating_auth_attempts = 0;
11fdf7f2
TL
3316 auto rotating_auth_timeout =
3317 g_conf().get_val<int64_t>("rotating_keys_bootstrap_timeout");
7c673cae
FG
3318
3319 // sanity check long object name handling
3320 {
3321 hobject_t l;
3322 l.oid.name = string(cct->_conf->osd_max_object_name_len, 'n');
3323 l.set_key(string(cct->_conf->osd_max_object_name_len, 'k'));
3324 l.nspace = string(cct->_conf->osd_max_object_namespace_len, 's');
3325 r = store->validate_hobject_key(l);
3326 if (r < 0) {
3327 derr << "backend (" << store->get_type() << ") is unable to support max "
3328 << "object name[space] len" << dendl;
3329 derr << " osd max object name len = "
3330 << cct->_conf->osd_max_object_name_len << dendl;
3331 derr << " osd max object namespace len = "
3332 << cct->_conf->osd_max_object_namespace_len << dendl;
3333 derr << cpp_strerror(r) << dendl;
3334 if (cct->_conf->osd_check_max_object_name_len_on_startup) {
3335 goto out;
3336 }
3337 derr << "osd_check_max_object_name_len_on_startup = false, starting anyway"
3338 << dendl;
3339 } else {
3340 dout(20) << "configured osd_max_object_name[space]_len looks ok" << dendl;
3341 }
3342 }
3343
3344 // read superblock
3345 r = read_superblock();
3346 if (r < 0) {
3347 derr << "OSD::init() : unable to read osd superblock" << dendl;
3348 r = -EINVAL;
3349 goto out;
3350 }
3351
3352 if (osd_compat.compare(superblock.compat_features) < 0) {
3353 derr << "The disk uses features unsupported by the executable." << dendl;
3354 derr << " ondisk features " << superblock.compat_features << dendl;
3355 derr << " daemon features " << osd_compat << dendl;
3356
3357 if (osd_compat.writeable(superblock.compat_features)) {
3358 CompatSet diff = osd_compat.unsupported(superblock.compat_features);
3359 derr << "it is still writeable, though. Missing features: " << diff << dendl;
3360 r = -EOPNOTSUPP;
3361 goto out;
3362 }
3363 else {
3364 CompatSet diff = osd_compat.unsupported(superblock.compat_features);
3365 derr << "Cannot write to disk! Missing features: " << diff << dendl;
3366 r = -EOPNOTSUPP;
3367 goto out;
3368 }
3369 }
3370
3371 assert_warn(whoami == superblock.whoami);
3372 if (whoami != superblock.whoami) {
3373 derr << "OSD::init: superblock says osd"
3374 << superblock.whoami << " but I am osd." << whoami << dendl;
3375 r = -EINVAL;
3376 goto out;
3377 }
3378
9f95a23c
TL
3379 startup_time = ceph::mono_clock::now();
3380
11fdf7f2 3381 // load up "current" osdmap
9f95a23c
TL
3382 assert_warn(!get_osdmap());
3383 if (get_osdmap()) {
11fdf7f2
TL
3384 derr << "OSD::init: unable to read current osdmap" << dendl;
3385 r = -EINVAL;
3386 goto out;
3387 }
3388 osdmap = get_map(superblock.current_epoch);
9f95a23c 3389 set_osdmap(osdmap);
11fdf7f2
TL
3390
3391 // make sure we don't have legacy pgs deleting
3392 {
3393 vector<coll_t> ls;
3394 int r = store->list_collections(ls);
3395 ceph_assert(r >= 0);
3396 for (auto c : ls) {
3397 spg_t pgid;
3398 if (c.is_pg(&pgid) &&
3399 !osdmap->have_pg_pool(pgid.pool())) {
3400 ghobject_t oid = make_final_pool_info_oid(pgid.pool());
3401 if (!store->exists(service.meta_ch, oid)) {
3402 derr << __func__ << " missing pg_pool_t for deleted pool "
3403 << pgid.pool() << " for pg " << pgid
3404 << "; please downgrade to luminous and allow "
3405 << "pg deletion to complete before upgrading" << dendl;
3406 ceph_abort();
3407 }
3408 }
3409 }
3410 }
3411
7c673cae
FG
3412 initial = get_osd_initial_compat_set();
3413 diff = superblock.compat_features.unsupported(initial);
3414 if (superblock.compat_features.merge(initial)) {
9f95a23c
TL
3415 // Are we adding SNAPMAPPER2?
3416 if (diff.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER2)) {
3417 dout(1) << __func__ << " upgrade snap_mapper (first start as octopus)"
3418 << dendl;
3419 auto ch = service.meta_ch;
3420 auto hoid = make_snapmapper_oid();
3421 unsigned max = cct->_conf->osd_target_transaction_size;
3422 r = SnapMapper::convert_legacy(cct, store, ch, hoid, max);
3423 if (r < 0)
3424 goto out;
3425 }
7c673cae
FG
3426 // We need to persist the new compat_set before we
3427 // do anything else
3428 dout(5) << "Upgrading superblock adding: " << diff << dendl;
3429 ObjectStore::Transaction t;
3430 write_superblock(t);
11fdf7f2 3431 r = store->queue_transaction(service.meta_ch, std::move(t));
7c673cae
FG
3432 if (r < 0)
3433 goto out;
3434 }
3435
3436 // make sure snap mapper object exists
11fdf7f2 3437 if (!store->exists(service.meta_ch, OSD::make_snapmapper_oid())) {
7c673cae
FG
3438 dout(10) << "init creating/touching snapmapper object" << dendl;
3439 ObjectStore::Transaction t;
3440 t.touch(coll_t::meta(), OSD::make_snapmapper_oid());
11fdf7f2 3441 r = store->queue_transaction(service.meta_ch, std::move(t));
7c673cae
FG
3442 if (r < 0)
3443 goto out;
3444 }
9f95a23c
TL
3445 if (!store->exists(service.meta_ch, OSD::make_purged_snaps_oid())) {
3446 dout(10) << "init creating/touching purged_snaps object" << dendl;
3447 ObjectStore::Transaction t;
3448 t.touch(coll_t::meta(), OSD::make_purged_snaps_oid());
3449 r = store->queue_transaction(service.meta_ch, std::move(t));
3450 if (r < 0)
3451 goto out;
3452 }
7c673cae
FG
3453
3454 if (cct->_conf->osd_open_classes_on_start) {
9f95a23c 3455 int r = ClassHandler::get_instance().open_all_classes();
7c673cae
FG
3456 if (r)
3457 dout(1) << "warning: got an error loading one or more classes: " << cpp_strerror(r) << dendl;
3458 }
3459
11fdf7f2 3460 check_osdmap_features();
7c673cae
FG
3461
3462 create_recoverystate_perf();
3463
3464 {
3465 epoch_t bind_epoch = osdmap->get_epoch();
3466 service.set_epochs(NULL, NULL, &bind_epoch);
3467 }
3468
3469 clear_temp_objects();
3470
d2e6a577 3471 // initialize osdmap references in sharded wq
11fdf7f2
TL
3472 for (auto& shard : shards) {
3473 std::lock_guard l(shard->osdmap_lock);
3474 shard->shard_osdmap = osdmap;
3475 }
d2e6a577 3476
7c673cae
FG
3477 // load up pgs (as they previously existed)
3478 load_pgs();
3479
3480 dout(2) << "superblock: I am osd." << superblock.whoami << dendl;
7c673cae
FG
3481
3482 create_logger();
3483
11fdf7f2
TL
3484 // prime osd stats
3485 {
3486 struct store_statfs_t stbuf;
3487 osd_alert_list_t alerts;
3488 int r = store->statfs(&stbuf, &alerts);
3489 ceph_assert(r == 0);
3490 service.set_statfs(stbuf, alerts);
3491 }
3492
3493 // client_messenger auth_client is already set up by monc.
3494 for (auto m : { cluster_messenger,
3495 objecter_messenger,
3496 hb_front_client_messenger,
3497 hb_back_client_messenger,
3498 hb_front_server_messenger,
3499 hb_back_server_messenger } ) {
3500 m->set_auth_client(monc);
3501 }
3502 for (auto m : { client_messenger,
3503 cluster_messenger,
3504 hb_front_server_messenger,
3505 hb_back_server_messenger }) {
3506 m->set_auth_server(monc);
3507 }
3508 monc->set_handle_authentication_dispatcher(this);
7c673cae
FG
3509
3510 monc->set_want_keys(CEPH_ENTITY_TYPE_MON | CEPH_ENTITY_TYPE_OSD
3511 | CEPH_ENTITY_TYPE_MGR);
3512 r = monc->init();
3513 if (r < 0)
3514 goto out;
3515
11fdf7f2
TL
3516 mgrc.set_pgstats_cb([this](){ return collect_pg_stats(); });
3517 mgrc.set_perf_metric_query_cb(
9f95a23c
TL
3518 [this](const ConfigPayload &config_payload) {
3519 set_perf_queries(config_payload);
11fdf7f2 3520 },
9f95a23c
TL
3521 [this] {
3522 return get_perf_reports();
11fdf7f2 3523 });
7c673cae 3524 mgrc.init();
7c673cae
FG
3525
3526 // tell monc about log_client so it will know about mon session resets
3527 monc->set_log_client(&log_client);
3528 update_log_config();
3529
11fdf7f2
TL
3530 // i'm ready!
3531 client_messenger->add_dispatcher_tail(&mgrc);
3532 client_messenger->add_dispatcher_tail(this);
3533 cluster_messenger->add_dispatcher_head(this);
3534
3535 hb_front_client_messenger->add_dispatcher_head(&heartbeat_dispatcher);
3536 hb_back_client_messenger->add_dispatcher_head(&heartbeat_dispatcher);
3537 hb_front_server_messenger->add_dispatcher_head(&heartbeat_dispatcher);
3538 hb_back_server_messenger->add_dispatcher_head(&heartbeat_dispatcher);
3539
9f95a23c 3540 objecter_messenger->add_dispatcher_head(service.objecter.get());
11fdf7f2 3541
28e407b8
AA
3542 service.init();
3543 service.publish_map(osdmap);
3544 service.publish_superblock(superblock);
3545 service.max_oldest_map = superblock.oldest_map;
3546
11fdf7f2
TL
3547 for (auto& shard : shards) {
3548 // put PGs in a temporary set because we may modify pg_slots
3549 // unordered_map below.
3550 set<PGRef> pgs;
3551 for (auto& i : shard->pg_slots) {
3552 PGRef pg = i.second->pg;
3553 if (!pg) {
3554 continue;
3555 }
3556 pgs.insert(pg);
3557 }
3558 for (auto pg : pgs) {
9f95a23c 3559 std::scoped_lock l{*pg};
11fdf7f2
TL
3560 set<pair<spg_t,epoch_t>> new_children;
3561 set<pair<spg_t,epoch_t>> merge_pgs;
3562 service.identify_splits_and_merges(pg->get_osdmap(), osdmap, pg->pg_id,
3563 &new_children, &merge_pgs);
3564 if (!new_children.empty()) {
3565 for (auto shard : shards) {
3566 shard->prime_splits(osdmap, &new_children);
3567 }
3568 assert(new_children.empty());
3569 }
3570 if (!merge_pgs.empty()) {
3571 for (auto shard : shards) {
3572 shard->prime_merges(osdmap, &merge_pgs);
3573 }
3574 assert(merge_pgs.empty());
3575 }
11fdf7f2
TL
3576 }
3577 }
3578
7c673cae 3579 osd_op_tp.start();
7c673cae 3580
7c673cae
FG
3581 // start the heartbeat
3582 heartbeat_thread.create("osd_srv_heartbt");
3583
3584 // tick
91327a77
AA
3585 tick_timer.add_event_after(get_tick_interval(),
3586 new C_Tick(this));
7c673cae 3587 {
11fdf7f2 3588 std::lock_guard l(tick_timer_lock);
91327a77
AA
3589 tick_timer_without_osd_lock.add_event_after(get_tick_interval(),
3590 new C_Tick_WithoutOSDLock(this));
7c673cae
FG
3591 }
3592
9f95a23c 3593 osd_lock.unlock();
7c673cae
FG
3594
3595 r = monc->authenticate();
3596 if (r < 0) {
c07f9fc5
FG
3597 derr << __func__ << " authentication failed: " << cpp_strerror(r)
3598 << dendl;
11fdf7f2 3599 exit(1);
7c673cae
FG
3600 }
3601
11fdf7f2 3602 while (monc->wait_auth_rotating(rotating_auth_timeout) < 0) {
7c673cae
FG
3603 derr << "unable to obtain rotating service keys; retrying" << dendl;
3604 ++rotating_auth_attempts;
11fdf7f2 3605 if (rotating_auth_attempts > g_conf()->max_rotating_auth_attempts) {
c07f9fc5 3606 derr << __func__ << " wait_auth_rotating timed out" << dendl;
11fdf7f2 3607 exit(1);
7c673cae
FG
3608 }
3609 }
3610
3611 r = update_crush_device_class();
3612 if (r < 0) {
d2e6a577
FG
3613 derr << __func__ << " unable to update_crush_device_class: "
3614 << cpp_strerror(r) << dendl;
11fdf7f2 3615 exit(1);
7c673cae
FG
3616 }
3617
3618 r = update_crush_location();
3619 if (r < 0) {
d2e6a577 3620 derr << __func__ << " unable to update_crush_location: "
c07f9fc5 3621 << cpp_strerror(r) << dendl;
11fdf7f2 3622 exit(1);
7c673cae
FG
3623 }
3624
9f95a23c 3625 osd_lock.lock();
7c673cae
FG
3626 if (is_stopping())
3627 return 0;
3628
3629 // start objecter *after* we have authenticated, so that we don't ignore
3630 // the OSDMaps it requests.
3631 service.final_init();
3632
3633 check_config();
3634
3635 dout(10) << "ensuring pgs have consumed prior maps" << dendl;
3636 consume_map();
7c673cae
FG
3637
3638 dout(0) << "done with init, starting boot process" << dendl;
3639
3640 // subscribe to any pg creations
3641 monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0);
3642
3643 // MgrClient needs this (it doesn't have MonClient reference itself)
3644 monc->sub_want("mgrmap", 0, 0);
3645
3646 // we don't need to ask for an osdmap here; objecter will
3647 //monc->sub_want("osdmap", osdmap->get_epoch(), CEPH_SUBSCRIBE_ONETIME);
3648
3649 monc->renew_subs();
3650
3651 start_boot();
3652
3653 return 0;
7c673cae
FG
3654
3655out:
3656 enable_disable_fuse(true);
3657 store->umount();
3658 delete store;
3659 store = NULL;
3660 return r;
3661}
3662
3663void OSD::final_init()
3664{
3665 AdminSocket *admin_socket = cct->get_admin_socket();
3666 asok_hook = new OSDSocketHook(this);
9f95a23c 3667 int r = admin_socket->register_command("status", asok_hook,
7c673cae 3668 "high-level status of OSD");
11fdf7f2 3669 ceph_assert(r == 0);
9f95a23c 3670 r = admin_socket->register_command("flush_journal",
7c673cae
FG
3671 asok_hook,
3672 "flush the journal to permanent store");
11fdf7f2 3673 ceph_assert(r == 0);
9f95a23c 3674 r = admin_socket->register_command("dump_ops_in_flight " \
c07f9fc5
FG
3675 "name=filterstr,type=CephString,n=N,req=false",
3676 asok_hook,
7c673cae 3677 "show the ops currently in flight");
11fdf7f2 3678 ceph_assert(r == 0);
9f95a23c 3679 r = admin_socket->register_command("ops " \
c07f9fc5
FG
3680 "name=filterstr,type=CephString,n=N,req=false",
3681 asok_hook,
7c673cae 3682 "show the ops currently in flight");
11fdf7f2 3683 ceph_assert(r == 0);
9f95a23c 3684 r = admin_socket->register_command("dump_blocked_ops " \
c07f9fc5
FG
3685 "name=filterstr,type=CephString,n=N,req=false",
3686 asok_hook,
7c673cae 3687 "show the blocked ops currently in flight");
11fdf7f2 3688 ceph_assert(r == 0);
9f95a23c 3689 r = admin_socket->register_command("dump_historic_ops " \
c07f9fc5 3690 "name=filterstr,type=CephString,n=N,req=false",
7c673cae
FG
3691 asok_hook,
3692 "show recent ops");
11fdf7f2 3693 ceph_assert(r == 0);
9f95a23c 3694 r = admin_socket->register_command("dump_historic_slow_ops " \
c07f9fc5 3695 "name=filterstr,type=CephString,n=N,req=false",
7c673cae
FG
3696 asok_hook,
3697 "show slowest recent ops");
11fdf7f2 3698 ceph_assert(r == 0);
9f95a23c 3699 r = admin_socket->register_command("dump_historic_ops_by_duration " \
c07f9fc5 3700 "name=filterstr,type=CephString,n=N,req=false",
7c673cae
FG
3701 asok_hook,
3702 "show slowest recent ops, sorted by duration");
11fdf7f2 3703 ceph_assert(r == 0);
9f95a23c 3704 r = admin_socket->register_command("dump_op_pq_state",
7c673cae
FG
3705 asok_hook,
3706 "dump op priority queue state");
11fdf7f2 3707 ceph_assert(r == 0);
9f95a23c 3708 r = admin_socket->register_command("dump_blacklist",
7c673cae
FG
3709 asok_hook,
3710 "dump blacklisted clients and times");
11fdf7f2 3711 ceph_assert(r == 0);
9f95a23c 3712 r = admin_socket->register_command("dump_watchers",
7c673cae
FG
3713 asok_hook,
3714 "show clients which have active watches,"
3715 " and on which objects");
11fdf7f2 3716 ceph_assert(r == 0);
9f95a23c 3717 r = admin_socket->register_command("dump_recovery_reservations",
7c673cae
FG
3718 asok_hook,
3719 "show recovery reservations");
11fdf7f2 3720 ceph_assert(r == 0);
9f95a23c 3721 r = admin_socket->register_command("dump_scrub_reservations",
eafe8130 3722 asok_hook,
f6b5b4d7 3723 "show scrub reservations");
eafe8130 3724 ceph_assert(r == 0);
9f95a23c 3725 r = admin_socket->register_command("get_latest_osdmap",
7c673cae
FG
3726 asok_hook,
3727 "force osd to update the latest map from "
3728 "the mon");
11fdf7f2 3729 ceph_assert(r == 0);
7c673cae 3730
9f95a23c 3731 r = admin_socket->register_command("set_heap_property " \
7c673cae
FG
3732 "name=property,type=CephString " \
3733 "name=value,type=CephInt",
3734 asok_hook,
3735 "update malloc extension heap property");
11fdf7f2 3736 ceph_assert(r == 0);
7c673cae 3737
9f95a23c 3738 r = admin_socket->register_command("get_heap_property " \
7c673cae
FG
3739 "name=property,type=CephString",
3740 asok_hook,
3741 "get malloc extension heap property");
11fdf7f2 3742 ceph_assert(r == 0);
7c673cae
FG
3743
3744 r = admin_socket->register_command("dump_objectstore_kv_stats",
7c673cae
FG
3745 asok_hook,
3746 "print statistics of kvdb which used by bluestore");
11fdf7f2 3747 ceph_assert(r == 0);
7c673cae
FG
3748
3749 r = admin_socket->register_command("dump_scrubs",
7c673cae
FG
3750 asok_hook,
3751 "print scheduled scrubs");
11fdf7f2 3752 ceph_assert(r == 0);
7c673cae
FG
3753
3754 r = admin_socket->register_command("calc_objectstore_db_histogram",
7c673cae
FG
3755 asok_hook,
3756 "Generate key value histogram of kvdb(rocksdb) which used by bluestore");
11fdf7f2 3757 ceph_assert(r == 0);
7c673cae
FG
3758
3759 r = admin_socket->register_command("flush_store_cache",
7c673cae
FG
3760 asok_hook,
3761 "Flush bluestore internal cache");
11fdf7f2 3762 ceph_assert(r == 0);
9f95a23c 3763 r = admin_socket->register_command("dump_pgstate_history",
7c673cae
FG
3764 asok_hook,
3765 "show recent state history");
11fdf7f2 3766 ceph_assert(r == 0);
7c673cae 3767
9f95a23c 3768 r = admin_socket->register_command("compact",
224ce89b
WB
3769 asok_hook,
3770 "Commpact object store's omap."
3771 " WARNING: Compaction probably slows your requests");
11fdf7f2
TL
3772 ceph_assert(r == 0);
3773
9f95a23c 3774 r = admin_socket->register_command("get_mapped_pools",
11fdf7f2
TL
3775 asok_hook,
3776 "dump pools whose PG(s) are mapped to this OSD.");
3777
3778 ceph_assert(r == 0);
3779
9f95a23c 3780 r = admin_socket->register_command("smart name=devid,type=CephString,req=false",
11fdf7f2
TL
3781 asok_hook,
3782 "probe OSD devices for SMART data.");
3783
3784 ceph_assert(r == 0);
3785
9f95a23c 3786 r = admin_socket->register_command("list_devices",
11fdf7f2
TL
3787 asok_hook,
3788 "list OSD devices.");
9f95a23c 3789 r = admin_socket->register_command("send_beacon",
11fdf7f2
TL
3790 asok_hook,
3791 "send OSD beacon to mon immediately");
224ce89b 3792
9f95a23c
TL
3793 r = admin_socket->register_command(
3794 "dump_osd_network name=value,type=CephInt,req=false", asok_hook,
3795 "Dump osd heartbeat network ping times");
eafe8130
TL
3796 ceph_assert(r == 0);
3797
7c673cae
FG
3798 test_ops_hook = new TestOpsSocketHook(&(this->service), this->store);
3799 // Note: pools are CephString instead of CephPoolname because
3800 // these commands traditionally support both pool names and numbers
3801 r = admin_socket->register_command(
7c673cae
FG
3802 "setomapval " \
3803 "name=pool,type=CephString " \
3804 "name=objname,type=CephObjectname " \
3805 "name=key,type=CephString "\
3806 "name=val,type=CephString",
3807 test_ops_hook,
3808 "set omap key");
11fdf7f2 3809 ceph_assert(r == 0);
7c673cae 3810 r = admin_socket->register_command(
7c673cae
FG
3811 "rmomapkey " \
3812 "name=pool,type=CephString " \
3813 "name=objname,type=CephObjectname " \
3814 "name=key,type=CephString",
3815 test_ops_hook,
3816 "remove omap key");
11fdf7f2 3817 ceph_assert(r == 0);
7c673cae 3818 r = admin_socket->register_command(
7c673cae
FG
3819 "setomapheader " \
3820 "name=pool,type=CephString " \
3821 "name=objname,type=CephObjectname " \
3822 "name=header,type=CephString",
3823 test_ops_hook,
3824 "set omap header");
11fdf7f2 3825 ceph_assert(r == 0);
7c673cae
FG
3826
3827 r = admin_socket->register_command(
7c673cae
FG
3828 "getomap " \
3829 "name=pool,type=CephString " \
3830 "name=objname,type=CephObjectname",
3831 test_ops_hook,
3832 "output entire object map");
11fdf7f2 3833 ceph_assert(r == 0);
7c673cae
FG
3834
3835 r = admin_socket->register_command(
7c673cae
FG
3836 "truncobj " \
3837 "name=pool,type=CephString " \
3838 "name=objname,type=CephObjectname " \
3839 "name=len,type=CephInt",
3840 test_ops_hook,
3841 "truncate object to length");
11fdf7f2 3842 ceph_assert(r == 0);
7c673cae
FG
3843
3844 r = admin_socket->register_command(
7c673cae
FG
3845 "injectdataerr " \
3846 "name=pool,type=CephString " \
3847 "name=objname,type=CephObjectname " \
3848 "name=shardid,type=CephInt,req=false,range=0|255",
3849 test_ops_hook,
3850 "inject data error to an object");
11fdf7f2 3851 ceph_assert(r == 0);
7c673cae
FG
3852
3853 r = admin_socket->register_command(
7c673cae
FG
3854 "injectmdataerr " \
3855 "name=pool,type=CephString " \
3856 "name=objname,type=CephObjectname " \
3857 "name=shardid,type=CephInt,req=false,range=0|255",
3858 test_ops_hook,
3859 "inject metadata error to an object");
11fdf7f2 3860 ceph_assert(r == 0);
7c673cae 3861 r = admin_socket->register_command(
7c673cae
FG
3862 "set_recovery_delay " \
3863 "name=utime,type=CephInt,req=false",
3864 test_ops_hook,
3865 "Delay osd recovery by specified seconds");
11fdf7f2 3866 ceph_assert(r == 0);
7c673cae 3867 r = admin_socket->register_command(
7c673cae
FG
3868 "injectfull " \
3869 "name=type,type=CephString,req=false " \
3870 "name=count,type=CephInt,req=false ",
3871 test_ops_hook,
3872 "Inject a full disk (optional count times)");
11fdf7f2 3873 ceph_assert(r == 0);
9f95a23c
TL
3874 r = admin_socket->register_command(
3875 "bench " \
3876 "name=count,type=CephInt,req=false " \
3877 "name=size,type=CephInt,req=false " \
3878 "name=object_size,type=CephInt,req=false " \
3879 "name=object_num,type=CephInt,req=false ",
3880 asok_hook,
3881 "OSD benchmark: write <count> <size>-byte objects(with <obj_size> <obj_num>), " \
3882 "(default count=1G default size=4MB). Results in log.");
3883 ceph_assert(r == 0);
3884 r = admin_socket->register_command(
3885 "cluster_log " \
3886 "name=level,type=CephChoices,strings=error,warning,info,debug " \
3887 "name=message,type=CephString,n=N",
3888 asok_hook,
3889 "log a message to the cluster log");
3890 ceph_assert(r == 0);
3891 r = admin_socket->register_command(
3892 "flush_pg_stats",
3893 asok_hook,
3894 "flush pg stats");
3895 ceph_assert(r == 0);
3896 r = admin_socket->register_command(
3897 "heap " \
3898 "name=heapcmd,type=CephChoices,strings=" \
3899 "dump|start_profiler|stop_profiler|release|get_release_rate|set_release_rate|stats " \
3900 "name=value,type=CephString,req=false",
3901 asok_hook,
3902 "show heap usage info (available only if compiled with tcmalloc)");
3903 ceph_assert(r == 0);
3904 r = admin_socket->register_command(
3905 "debug dump_missing " \
3906 "name=filename,type=CephFilepath",
3907 asok_hook,
3908 "dump missing objects to a named file");
3909 ceph_assert(r == 0);
3910 r = admin_socket->register_command(
3911 "debug kick_recovery_wq " \
3912 "name=delay,type=CephInt,range=0",
3913 asok_hook,
3914 "set osd_recovery_delay_start to <val>");
3915 ceph_assert(r == 0);
3916 r = admin_socket->register_command(
3917 "cpu_profiler " \
3918 "name=arg,type=CephChoices,strings=status|flush",
3919 asok_hook,
3920 "run cpu profiling on daemon");
3921 ceph_assert(r == 0);
3922 r = admin_socket->register_command(
3923 "dump_pg_recovery_stats",
3924 asok_hook,
3925 "dump pg recovery statistics");
3926 ceph_assert(r == 0);
3927 r = admin_socket->register_command(
3928 "reset_pg_recovery_stats",
3929 asok_hook,
3930 "reset pg recovery statistics");
3931 ceph_assert(r == 0);
3932 r = admin_socket->register_command(
3933 "cache drop",
3934 asok_hook,
3935 "Drop all OSD caches");
3936 ceph_assert(r == 0);
3937 r = admin_socket->register_command(
3938 "cache status",
3939 asok_hook,
3940 "Get OSD caches statistics");
3941 ceph_assert(r == 0);
3942 r = admin_socket->register_command(
3943 "scrub_purged_snaps",
3944 asok_hook,
3945 "Scrub purged_snaps vs snapmapper index");
3946 ceph_assert(r == 0);
7c673cae 3947
9f95a23c
TL
3948 // -- pg commands --
3949 // old form: ceph pg <pgid> command ...
3950 r = admin_socket->register_command(
3951 "pg " \
3952 "name=pgid,type=CephPgid " \
3953 "name=cmd,type=CephChoices,strings=query",
3954 asok_hook,
3955 "");
3956 ceph_assert(r == 0);
3957 r = admin_socket->register_command(
3958 "pg " \
3959 "name=pgid,type=CephPgid " \
3960 "name=cmd,type=CephChoices,strings=mark_unfound_lost " \
3961 "name=mulcmd,type=CephChoices,strings=revert|delete",
3962 asok_hook,
3963 "");
3964 ceph_assert(r == 0);
3965 r = admin_socket->register_command(
3966 "pg " \
3967 "name=pgid,type=CephPgid " \
3968 "name=cmd,type=CephChoices,strings=list_unfound " \
3969 "name=offset,type=CephString,req=false",
3970 asok_hook,
3971 "");
3972 ceph_assert(r == 0);
3973 r = admin_socket->register_command(
3974 "pg " \
3975 "name=pgid,type=CephPgid " \
3976 "name=cmd,type=CephChoices,strings=scrub " \
3977 "name=time,type=CephInt,req=false",
3978 asok_hook,
3979 "");
3980 ceph_assert(r == 0);
3981 r = admin_socket->register_command(
3982 "pg " \
3983 "name=pgid,type=CephPgid " \
3984 "name=cmd,type=CephChoices,strings=deep_scrub " \
3985 "name=time,type=CephInt,req=false",
3986 asok_hook,
3987 "");
3988 ceph_assert(r == 0);
3989 // new form: tell <pgid> <cmd> for both cli and rest
3990 r = admin_socket->register_command(
3991 "query",
3992 asok_hook,
3993 "show details of a specific pg");
3994 ceph_assert(r == 0);
3995 r = admin_socket->register_command(
3996 "mark_unfound_lost " \
3997 "name=pgid,type=CephPgid,req=false " \
3998 "name=mulcmd,type=CephChoices,strings=revert|delete",
3999 asok_hook,
4000 "mark all unfound objects in this pg as lost, either removing or reverting to a prior version if one is available");
4001 ceph_assert(r == 0);
4002 r = admin_socket->register_command(
4003 "list_unfound " \
4004 "name=pgid,type=CephPgid,req=false " \
4005 "name=offset,type=CephString,req=false",
4006 asok_hook,
4007 "list unfound objects on this pg, perhaps starting at an offset given in JSON");
4008 ceph_assert(r == 0);
4009 r = admin_socket->register_command(
4010 "scrub " \
4011 "name=pgid,type=CephPgid,req=false " \
4012 "name=time,type=CephInt,req=false",
4013 asok_hook,
4014 "Trigger a scheduled scrub ");
4015 ceph_assert(r == 0);
4016 r = admin_socket->register_command(
4017 "deep_scrub " \
4018 "name=pgid,type=CephPgid,req=false " \
4019 "name=time,type=CephInt,req=false",
4020 asok_hook,
4021 "Trigger a scheduled deep scrub ");
4022 ceph_assert(r == 0);
4023}
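// The commands registered above are served over the OSD admin socket.  As a
// usage sketch (osd.0 and the default socket path are only examples), they
// can be invoked with:
//
//   ceph daemon osd.0 status
//   ceph daemon osd.0 dump_ops_in_flight
//   ceph daemon osd.0 dump_blocked_ops
//   ceph daemon osd.0 flush_pg_stats
//
// or, equivalently, with
//   ceph --admin-daemon /var/run/ceph/ceph-osd.0.asok <command>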
7c673cae 4024
9f95a23c
TL
4025void OSD::create_logger()
4026{
4027 dout(10) << "create_logger" << dendl;
7c673cae 4028
9f95a23c 4029 logger = build_osd_logger(cct);
7c673cae
FG
4030 cct->get_perfcounters_collection()->add(logger);
4031}
4032
4033void OSD::create_recoverystate_perf()
4034{
4035 dout(10) << "create_recoverystate_perf" << dendl;
4036
9f95a23c 4037 recoverystate_perf = build_recoverystate_perf(cct);
7c673cae
FG
4038 cct->get_perfcounters_collection()->add(recoverystate_perf);
4039}
4040
4041int OSD::shutdown()
4042{
92f5a8d4
TL
4043 if (cct->_conf->osd_fast_shutdown) {
4044 derr << "*** Immediate shutdown (osd_fast_shutdown=true) ***" << dendl;
3fec8b72
TL
4045 if (cct->_conf->osd_fast_shutdown_notify_mon)
4046 service.prepare_to_stop();
92f5a8d4
TL
4047 cct->_log->flush();
4048 _exit(0);
4049 }
4050
7c673cae
FG
4051 if (!service.prepare_to_stop())
4052 return 0; // already shutting down
9f95a23c 4053 osd_lock.lock();
7c673cae 4054 if (is_stopping()) {
9f95a23c 4055 osd_lock.unlock();
7c673cae
FG
4056 return 0;
4057 }
11fdf7f2 4058 dout(0) << "shutdown" << dendl;
7c673cae
FG
4059
4060 set_state(STATE_STOPPING);
4061
4062 // Debugging
11fdf7f2
TL
4063 if (cct->_conf.get_val<bool>("osd_debug_shutdown")) {
4064 cct->_conf.set_val("debug_osd", "100");
4065 cct->_conf.set_val("debug_journal", "100");
4066 cct->_conf.set_val("debug_filestore", "100");
4067 cct->_conf.set_val("debug_bluestore", "100");
4068 cct->_conf.set_val("debug_ms", "100");
4069 cct->_conf.apply_changes(nullptr);
3efd9988 4070 }
7c673cae
FG
4071
4072 // stop MgrClient earlier as it's more like an internal consumer of OSD
4073 mgrc.shutdown();
4074
4075 service.start_shutdown();
4076
4077 // stop sending work to pgs. this just prevents any new work in _process
4078 // from racing with on_shutdown and potentially entering the pg after.
4079 op_shardedwq.drain();
4080
4081 // Shutdown PGs
4082 {
11fdf7f2
TL
4083 vector<PGRef> pgs;
4084 _get_pgs(&pgs);
4085 for (auto pg : pgs) {
4086 pg->shutdown();
7c673cae
FG
4087 }
4088 }
7c673cae
FG
4089
4090 // drain op queue again (in case PGs requeued something)
4091 op_shardedwq.drain();
4092 {
4093 finished.clear(); // zap waiters (bleh, this is messy)
11fdf7f2 4094 waiting_for_osdmap.clear();
7c673cae
FG
4095 }
4096
7c673cae 4097 // unregister commands
11fdf7f2 4098 cct->get_admin_socket()->unregister_commands(asok_hook);
7c673cae
FG
4099 delete asok_hook;
4100 asok_hook = NULL;
4101
11fdf7f2 4102 cct->get_admin_socket()->unregister_commands(test_ops_hook);
7c673cae
FG
4103 delete test_ops_hook;
4104 test_ops_hook = NULL;
4105
9f95a23c 4106 osd_lock.unlock();
7c673cae 4107
9f95a23c
TL
4108 {
4109 std::lock_guard l{heartbeat_lock};
4110 heartbeat_stop = true;
4111 heartbeat_cond.notify_all();
4112 heartbeat_peers.clear();
4113 }
7c673cae
FG
4114 heartbeat_thread.join();
4115
9f95a23c
TL
4116 hb_back_server_messenger->mark_down_all();
4117 hb_front_server_messenger->mark_down_all();
4118 hb_front_client_messenger->mark_down_all();
4119 hb_back_client_messenger->mark_down_all();
4120
7c673cae
FG
4121 osd_op_tp.drain();
4122 osd_op_tp.stop();
4123 dout(10) << "op sharded tp stopped" << dendl;
4124
7c673cae
FG
4125 dout(10) << "stopping agent" << dendl;
4126 service.agent_stop();
4127
11fdf7f2
TL
4128 boot_finisher.wait_for_empty();
4129
9f95a23c 4130 osd_lock.lock();
7c673cae 4131
11fdf7f2 4132 boot_finisher.stop();
494da23a 4133 reset_heartbeat_peers(true);
7c673cae
FG
4134
4135 tick_timer.shutdown();
4136
4137 {
11fdf7f2 4138 std::lock_guard l(tick_timer_lock);
7c673cae
FG
4139 tick_timer_without_osd_lock.shutdown();
4140 }
4141
4142 // note unmount epoch
9f95a23c 4143 dout(10) << "noting clean unmount in epoch " << get_osdmap_epoch() << dendl;
7c673cae 4144 superblock.mounted = service.get_boot_epoch();
9f95a23c 4145 superblock.clean_thru = get_osdmap_epoch();
7c673cae
FG
4146 ObjectStore::Transaction t;
4147 write_superblock(t);
11fdf7f2 4148 int r = store->queue_transaction(service.meta_ch, std::move(t));
7c673cae
FG
4149 if (r) {
4150 derr << "OSD::shutdown: error writing superblock: "
4151 << cpp_strerror(r) << dendl;
4152 }
4153
4154
31f18b77
FG
4155 service.shutdown_reserver();
4156
7c673cae
FG
4157 // Remove PGs
4158#ifdef PG_DEBUG_REFS
4159 service.dump_live_pgids();
4160#endif
11fdf7f2
TL
4161 while (true) {
4162 vector<PGRef> pgs;
4163 _get_pgs(&pgs, true);
4164 if (pgs.empty()) {
4165 break;
4166 }
4167 for (auto& pg : pgs) {
4168 if (pg->is_deleted()) {
4169 continue;
4170 }
4171 dout(20) << " kicking pg " << pg << dendl;
4172 pg->lock();
4173 if (pg->get_num_ref() != 1) {
4174 derr << "pgid " << pg->get_pgid() << " has ref count of "
4175 << pg->get_num_ref() << dendl;
7c673cae 4176#ifdef PG_DEBUG_REFS
11fdf7f2 4177 pg->dump_live_ids();
7c673cae 4178#endif
31f18b77
FG
4179 if (cct->_conf->osd_shutdown_pgref_assert) {
4180 ceph_abort();
4181 }
7c673cae 4182 }
11fdf7f2
TL
4183 pg->ch.reset();
4184 pg->unlock();
7c673cae 4185 }
7c673cae
FG
4186 }
4187#ifdef PG_DEBUG_REFS
4188 service.dump_live_pgids();
4189#endif
f64942e4 4190
9f95a23c 4191 osd_lock.unlock();
11fdf7f2 4192 cct->_conf.remove_observer(this);
9f95a23c 4193 osd_lock.lock();
7c673cae 4194
11fdf7f2
TL
4195 service.meta_ch.reset();
4196
7c673cae
FG
4197 dout(10) << "syncing store" << dendl;
4198 enable_disable_fuse(true);
4199
4200 if (cct->_conf->osd_journal_flush_on_shutdown) {
4201 dout(10) << "flushing journal" << dendl;
4202 store->flush_journal();
4203 }
4204
7c673cae 4205 monc->shutdown();
9f95a23c
TL
4206 osd_lock.unlock();
4207 {
4208 std::unique_lock l{map_lock};
4209 set_osdmap(OSDMapRef());
4210 }
11fdf7f2
TL
4211 for (auto s : shards) {
4212 std::lock_guard l(s->osdmap_lock);
4213 s->shard_osdmap = OSDMapRef();
4214 }
7c673cae 4215 service.shutdown();
11fdf7f2
TL
4216
4217 std::lock_guard lock(osd_lock);
4218 store->umount();
4219 delete store;
4220 store = nullptr;
4221 dout(10) << "Store synced" << dendl;
4222
7c673cae
FG
4223 op_tracker.on_shutdown();
4224
9f95a23c 4225 ClassHandler::get_instance().shutdown();
7c673cae
FG
4226 client_messenger->shutdown();
4227 cluster_messenger->shutdown();
4228 hb_front_client_messenger->shutdown();
4229 hb_back_client_messenger->shutdown();
4230 objecter_messenger->shutdown();
4231 hb_front_server_messenger->shutdown();
4232 hb_back_server_messenger->shutdown();
4233
7c673cae
FG
4234 return r;
4235}
4236
4237int OSD::mon_cmd_maybe_osd_create(string &cmd)
4238{
4239 bool created = false;
4240 while (true) {
4241 dout(10) << __func__ << " cmd: " << cmd << dendl;
4242 vector<string> vcmd{cmd};
4243 bufferlist inbl;
4244 C_SaferCond w;
4245 string outs;
4246 monc->start_mon_command(vcmd, inbl, NULL, &outs, &w);
4247 int r = w.wait();
4248 if (r < 0) {
4249 if (r == -ENOENT && !created) {
4250 string newcmd = "{\"prefix\": \"osd create\", \"id\": " + stringify(whoami)
4251 + ", \"uuid\": \"" + stringify(superblock.osd_fsid) + "\"}";
4252 vector<string> vnewcmd{newcmd};
4253 bufferlist inbl;
4254 C_SaferCond w;
4255 string outs;
4256 monc->start_mon_command(vnewcmd, inbl, NULL, &outs, &w);
4257 int r = w.wait();
4258 if (r < 0) {
 4259	derr << __func__ << " fail: osd does not exist and create failed: "
4260 << cpp_strerror(r) << dendl;
4261 return r;
4262 }
4263 created = true;
4264 continue;
4265 }
4266 derr << __func__ << " fail: '" << outs << "': " << cpp_strerror(r) << dendl;
4267 return r;
4268 }
4269 break;
4270 }
4271
4272 return 0;
4273}
4274
4275int OSD::update_crush_location()
4276{
4277 if (!cct->_conf->osd_crush_update_on_start) {
4278 dout(10) << __func__ << " osd_crush_update_on_start = false" << dendl;
4279 return 0;
4280 }
4281
4282 char weight[32];
4283 if (cct->_conf->osd_crush_initial_weight >= 0) {
4284 snprintf(weight, sizeof(weight), "%.4lf", cct->_conf->osd_crush_initial_weight);
4285 } else {
4286 struct store_statfs_t st;
11fdf7f2
TL
4287 osd_alert_list_t alerts;
4288 int r = store->statfs(&st, &alerts);
7c673cae
FG
4289 if (r < 0) {
4290 derr << "statfs: " << cpp_strerror(r) << dendl;
4291 return r;
4292 }
4293 snprintf(weight, sizeof(weight), "%.4lf",
11fdf7f2
TL
4294 std::max(.00001,
4295 double(st.total) /
4296 double(1ull << 40 /* TB */)));
7c673cae
FG
4297 }
4298
9f95a23c 4299 dout(10) << __func__ << " crush location is " << cct->crush_location << dendl;
7c673cae
FG
4300
4301 string cmd =
4302 string("{\"prefix\": \"osd crush create-or-move\", ") +
9f95a23c
TL
4303 string("\"id\": ") + stringify(whoami) + ", " +
4304 string("\"weight\":") + weight + ", " +
4305 string("\"args\": [") + stringify(cct->crush_location) + "]}";
7c673cae
FG
4306 return mon_cmd_maybe_osd_create(cmd);
4307}
4308
4309int OSD::update_crush_device_class()
4310{
224ce89b
WB
4311 if (!cct->_conf->osd_class_update_on_start) {
4312 dout(10) << __func__ << " osd_class_update_on_start = false" << dendl;
4313 return 0;
4314 }
4315
7c673cae
FG
4316 string device_class;
4317 int r = store->read_meta("crush_device_class", &device_class);
224ce89b
WB
4318 if (r < 0 || device_class.empty()) {
4319 device_class = store->get_default_device_class();
4320 }
4321
4322 if (device_class.empty()) {
d2e6a577 4323 dout(20) << __func__ << " no device class stored locally" << dendl;
7c673cae 4324 return 0;
224ce89b 4325 }
7c673cae
FG
4326
4327 string cmd =
4328 string("{\"prefix\": \"osd crush set-device-class\", ") +
224ce89b
WB
4329 string("\"class\": \"") + device_class + string("\", ") +
4330 string("\"ids\": [\"") + stringify(whoami) + string("\"]}");
7c673cae 4331
224ce89b 4332 r = mon_cmd_maybe_osd_create(cmd);
11fdf7f2
TL
4333 if (r == -EBUSY) {
4334 // good, already bound to a device-class
4335 return 0;
4336 } else {
4337 return r;
4338 }
7c673cae
FG
4339}
4340
4341void OSD::write_superblock(ObjectStore::Transaction& t)
4342{
4343 dout(10) << "write_superblock " << superblock << dendl;
4344
4345 //hack: at minimum it's using the baseline feature set
4346 if (!superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_BASE))
4347 superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
4348
4349 bufferlist bl;
11fdf7f2 4350 encode(superblock, bl);
7c673cae
FG
4351 t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
4352}
4353
4354int OSD::read_superblock()
4355{
4356 bufferlist bl;
11fdf7f2 4357 int r = store->read(service.meta_ch, OSD_SUPERBLOCK_GOBJECT, 0, 0, bl);
7c673cae
FG
4358 if (r < 0)
4359 return r;
4360
11fdf7f2
TL
4361 auto p = bl.cbegin();
4362 decode(superblock, p);
7c673cae
FG
4363
4364 dout(10) << "read_superblock " << superblock << dendl;
4365
4366 return 0;
4367}
4368
4369void OSD::clear_temp_objects()
4370{
4371 dout(10) << __func__ << dendl;
4372 vector<coll_t> ls;
4373 store->list_collections(ls);
4374 for (vector<coll_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
4375 spg_t pgid;
4376 if (!p->is_pg(&pgid))
4377 continue;
4378
4379 // list temp objects
4380 dout(20) << " clearing temps in " << *p << " pgid " << pgid << dendl;
4381
4382 vector<ghobject_t> temps;
4383 ghobject_t next;
4384 while (1) {
4385 vector<ghobject_t> objects;
11fdf7f2
TL
4386 auto ch = store->open_collection(*p);
4387 ceph_assert(ch);
4388 store->collection_list(ch, next, ghobject_t::get_max(),
7c673cae
FG
4389 store->get_ideal_list_max(),
4390 &objects, &next);
4391 if (objects.empty())
4392 break;
4393 vector<ghobject_t>::iterator q;
4394 for (q = objects.begin(); q != objects.end(); ++q) {
4395 // Hammer set pool for temps to -1, so check for clean-up
4396 if (q->hobj.is_temp() || (q->hobj.pool == -1)) {
4397 temps.push_back(*q);
4398 } else {
4399 break;
4400 }
4401 }
4402 // If we saw a non-temp object and hit the break above we can
4403 // break out of the while loop too.
4404 if (q != objects.end())
4405 break;
4406 }
4407 if (!temps.empty()) {
4408 ObjectStore::Transaction t;
4409 int removed = 0;
4410 for (vector<ghobject_t>::iterator q = temps.begin(); q != temps.end(); ++q) {
4411 dout(20) << " removing " << *p << " object " << *q << dendl;
4412 t.remove(*p, *q);
4413 if (++removed > cct->_conf->osd_target_transaction_size) {
11fdf7f2 4414 store->queue_transaction(service.meta_ch, std::move(t));
7c673cae
FG
4415 t = ObjectStore::Transaction();
4416 removed = 0;
4417 }
4418 }
4419 if (removed) {
11fdf7f2 4420 store->queue_transaction(service.meta_ch, std::move(t));
7c673cae
FG
4421 }
4422 }
4423 }
4424}
4425
4426void OSD::recursive_remove_collection(CephContext* cct,
4427 ObjectStore *store, spg_t pgid,
4428 coll_t tmp)
4429{
4430 OSDriver driver(
4431 store,
4432 coll_t(),
4433 make_snapmapper_oid());
4434
11fdf7f2 4435 ObjectStore::CollectionHandle ch = store->open_collection(tmp);
7c673cae
FG
4436 ObjectStore::Transaction t;
4437 SnapMapper mapper(cct, &driver, 0, 0, 0, pgid.shard);
4438
11fdf7f2
TL
4439 ghobject_t next;
4440 int max = cct->_conf->osd_target_transaction_size;
7c673cae 4441 vector<ghobject_t> objects;
11fdf7f2
TL
4442 objects.reserve(max);
4443 while (true) {
4444 objects.clear();
4445 store->collection_list(ch, next, ghobject_t::get_max(),
4446 max, &objects, &next);
4447 generic_dout(10) << __func__ << " " << objects << dendl;
4448 if (objects.empty())
4449 break;
4450 for (auto& p: objects) {
4451 OSDriver::OSTransaction _t(driver.get_transaction(&t));
4452 int r = mapper.remove_oid(p.hobj, &_t);
4453 if (r != 0 && r != -ENOENT)
4454 ceph_abort();
4455 t.remove(tmp, p);
7c673cae 4456 }
11fdf7f2
TL
4457 int r = store->queue_transaction(ch, std::move(t));
4458 ceph_assert(r == 0);
4459 t = ObjectStore::Transaction();
7c673cae
FG
4460 }
4461 t.remove_collection(tmp);
11fdf7f2
TL
4462 int r = store->queue_transaction(ch, std::move(t));
4463 ceph_assert(r == 0);
7c673cae
FG
4464
4465 C_SaferCond waiter;
11fdf7f2 4466 if (!ch->flush_commit(&waiter)) {
7c673cae
FG
4467 waiter.wait();
4468 }
4469}
4470
4471
4472// ======================================================
4473// PG's
4474
7c673cae
FG
4475PG* OSD::_make_pg(
4476 OSDMapRef createmap,
4477 spg_t pgid)
4478{
11fdf7f2
TL
4479 dout(10) << __func__ << " " << pgid << dendl;
4480 pg_pool_t pi;
4481 map<string,string> ec_profile;
4482 string name;
4483 if (createmap->have_pg_pool(pgid.pool())) {
4484 pi = *createmap->get_pg_pool(pgid.pool());
4485 name = createmap->get_pool_name(pgid.pool());
4486 if (pi.is_erasure()) {
4487 ec_profile = createmap->get_erasure_code_profile(pi.erasure_code_profile);
4488 }
4489 } else {
4490 // pool was deleted; grab final pg_pool_t off disk.
4491 ghobject_t oid = make_final_pool_info_oid(pgid.pool());
4492 bufferlist bl;
4493 int r = store->read(service.meta_ch, oid, 0, 0, bl);
4494 if (r < 0) {
4495 derr << __func__ << " missing pool " << pgid.pool() << " tombstone"
4496 << dendl;
4497 return nullptr;
4498 }
4499 ceph_assert(r >= 0);
4500 auto p = bl.cbegin();
4501 decode(pi, p);
4502 decode(name, p);
4503 if (p.end()) { // dev release v13.0.2 did not include ec_profile
4504 derr << __func__ << " missing ec_profile from pool " << pgid.pool()
4505 << " tombstone" << dendl;
4506 return nullptr;
4507 }
4508 decode(ec_profile, p);
4509 }
4510 PGPool pool(cct, createmap, pgid.pool(), pi, name);
7c673cae 4511 PG *pg;
11fdf7f2
TL
4512 if (pi.type == pg_pool_t::TYPE_REPLICATED ||
4513 pi.type == pg_pool_t::TYPE_ERASURE)
4514 pg = new PrimaryLogPG(&service, createmap, pool, ec_profile, pgid);
7c673cae
FG
4515 else
4516 ceph_abort();
7c673cae
FG
4517 return pg;
4518}
4519
11fdf7f2 4520void OSD::_get_pgs(vector<PGRef> *v, bool clear_too)
7c673cae 4521{
11fdf7f2
TL
4522 v->clear();
4523 v->reserve(get_num_pgs());
4524 for (auto& s : shards) {
4525 std::lock_guard l(s->shard_lock);
4526 for (auto& j : s->pg_slots) {
4527 if (j.second->pg &&
4528 !j.second->pg->is_deleted()) {
4529 v->push_back(j.second->pg);
4530 if (clear_too) {
4531 s->_detach_pg(j.second.get());
4532 }
4533 }
7c673cae 4534 }
7c673cae 4535 }
7c673cae
FG
4536}
4537
11fdf7f2 4538void OSD::_get_pgids(vector<spg_t> *v)
7c673cae 4539{
11fdf7f2
TL
4540 v->clear();
4541 v->reserve(get_num_pgs());
4542 for (auto& s : shards) {
4543 std::lock_guard l(s->shard_lock);
4544 for (auto& j : s->pg_slots) {
4545 if (j.second->pg &&
4546 !j.second->pg->is_deleted()) {
4547 v->push_back(j.first);
4548 }
7c673cae
FG
4549 }
4550 }
7c673cae
FG
4551}
4552
11fdf7f2 4553void OSD::register_pg(PGRef pg)
7c673cae 4554{
11fdf7f2
TL
4555 spg_t pgid = pg->get_pgid();
4556 uint32_t shard_index = pgid.hash_to_shard(num_shards);
4557 auto sdata = shards[shard_index];
4558 std::lock_guard l(sdata->shard_lock);
4559 auto r = sdata->pg_slots.emplace(pgid, make_unique<OSDShardPGSlot>());
4560 ceph_assert(r.second);
4561 auto *slot = r.first->second.get();
4562 dout(20) << __func__ << " " << pgid << " " << pg << dendl;
4563 sdata->_attach_pg(slot, pg.get());
4564}
7c673cae 4565
11fdf7f2
TL
4566bool OSD::try_finish_pg_delete(PG *pg, unsigned old_pg_num)
4567{
4568 auto sdata = pg->osd_shard;
4569 ceph_assert(sdata);
4570 {
4571 std::lock_guard l(sdata->shard_lock);
4572 auto p = sdata->pg_slots.find(pg->pg_id);
4573 if (p == sdata->pg_slots.end() ||
4574 !p->second->pg) {
4575 dout(20) << __func__ << " " << pg->pg_id << " not found" << dendl;
4576 return false;
4577 }
4578 if (p->second->waiting_for_merge_epoch) {
4579 dout(20) << __func__ << " " << pg->pg_id << " waiting for merge" << dendl;
4580 return false;
4581 }
4582 dout(20) << __func__ << " " << pg->pg_id << " " << pg << dendl;
4583 sdata->_detach_pg(p->second.get());
4584 }
7c673cae 4585
11fdf7f2
TL
4586 for (auto shard : shards) {
4587 shard->unprime_split_children(pg->pg_id, old_pg_num);
4588 }
7c673cae 4589
11fdf7f2
TL
4590 // update pg count now since we might not get an osdmap any time soon.
4591 if (pg->is_primary())
4592 service.logger->dec(l_osd_pg_primary);
9f95a23c
TL
4593 else if (pg->is_nonprimary())
 4594    service.logger->dec(l_osd_pg_replica); // misnomer
11fdf7f2
TL
4595 else
4596 service.logger->dec(l_osd_pg_stray);
7c673cae 4597
11fdf7f2 4598 return true;
7c673cae
FG
4599}
4600
11fdf7f2 4601PGRef OSD::_lookup_pg(spg_t pgid)
7c673cae 4602{
11fdf7f2
TL
4603 uint32_t shard_index = pgid.hash_to_shard(num_shards);
4604 auto sdata = shards[shard_index];
4605 std::lock_guard l(sdata->shard_lock);
4606 auto p = sdata->pg_slots.find(pgid);
4607 if (p == sdata->pg_slots.end()) {
7c673cae 4608 return nullptr;
11fdf7f2
TL
4609 }
4610 return p->second->pg;
7c673cae
FG
4611}
4612
11fdf7f2 4613PGRef OSD::_lookup_lock_pg(spg_t pgid)
31f18b77 4614{
11fdf7f2
TL
4615 PGRef pg = _lookup_pg(pgid);
4616 if (!pg) {
4617 return nullptr;
4618 }
4619 pg->lock();
4620 if (!pg->is_deleted()) {
4621 return pg;
4622 }
4623 pg->unlock();
4624 return nullptr;
31f18b77
FG
4625}
4626
11fdf7f2 4627PGRef OSD::lookup_lock_pg(spg_t pgid)
7c673cae 4628{
11fdf7f2 4629 return _lookup_lock_pg(pgid);
7c673cae
FG
4630}
4631
4632void OSD::load_pgs()
4633{
9f95a23c 4634 ceph_assert(ceph_mutex_is_locked(osd_lock));
7c673cae 4635 dout(0) << "load_pgs" << dendl;
11fdf7f2 4636
7c673cae 4637 {
11fdf7f2
TL
4638 auto pghist = make_pg_num_history_oid();
4639 bufferlist bl;
4640 int r = store->read(service.meta_ch, pghist, 0, 0, bl, 0);
4641 if (r >= 0 && bl.length() > 0) {
4642 auto p = bl.cbegin();
4643 decode(pg_num_history, p);
4644 }
4645 dout(20) << __func__ << " pg_num_history " << pg_num_history << dendl;
7c673cae
FG
4646 }
4647
4648 vector<coll_t> ls;
4649 int r = store->list_collections(ls);
4650 if (r < 0) {
4651 derr << "failed to list pgs: " << cpp_strerror(-r) << dendl;
4652 }
4653
11fdf7f2 4654 int num = 0;
7c673cae
FG
4655 for (vector<coll_t>::iterator it = ls.begin();
4656 it != ls.end();
4657 ++it) {
4658 spg_t pgid;
4659 if (it->is_temp(&pgid) ||
4660 (it->is_pg(&pgid) && PG::_has_removal_flag(store, pgid))) {
11fdf7f2
TL
4661 dout(10) << "load_pgs " << *it
4662 << " removing, legacy or flagged for removal pg" << dendl;
7c673cae
FG
4663 recursive_remove_collection(cct, store, pgid, *it);
4664 continue;
4665 }
4666
4667 if (!it->is_pg(&pgid)) {
4668 dout(10) << "load_pgs ignoring unrecognized " << *it << dendl;
4669 continue;
4670 }
4671
7c673cae 4672 dout(10) << "pgid " << pgid << " coll " << coll_t(pgid) << dendl;
7c673cae 4673 epoch_t map_epoch = 0;
11fdf7f2 4674 int r = PG::peek_map_epoch(store, pgid, &map_epoch);
7c673cae
FG
4675 if (r < 0) {
4676 derr << __func__ << " unable to peek at " << pgid << " metadata, skipping"
4677 << dendl;
4678 continue;
4679 }
4680
11fdf7f2 4681 PGRef pg;
7c673cae
FG
4682 if (map_epoch > 0) {
4683 OSDMapRef pgosdmap = service.try_get_map(map_epoch);
4684 if (!pgosdmap) {
9f95a23c 4685 if (!get_osdmap()->have_pg_pool(pgid.pool())) {
7c673cae
FG
4686 derr << __func__ << ": could not find map for epoch " << map_epoch
4687 << " on pg " << pgid << ", but the pool is not present in the "
4688 << "current map, so this is probably a result of bug 10617. "
4689 << "Skipping the pg for now, you can use ceph-objectstore-tool "
4690 << "to clean it up later." << dendl;
4691 continue;
4692 } else {
4693 derr << __func__ << ": have pgid " << pgid << " at epoch "
4694 << map_epoch << ", but missing map. Crashing."
4695 << dendl;
11fdf7f2 4696 ceph_abort_msg("Missing map in load_pgs");
7c673cae
FG
4697 }
4698 }
11fdf7f2 4699 pg = _make_pg(pgosdmap, pgid);
7c673cae 4700 } else {
9f95a23c 4701 pg = _make_pg(get_osdmap(), pgid);
7c673cae 4702 }
11fdf7f2
TL
4703 if (!pg) {
4704 recursive_remove_collection(cct, store, pgid, *it);
4705 continue;
4706 }
4707
4708 // there can be no waiters here, so we don't call _wake_pg_slot
7c673cae 4709
11fdf7f2 4710 pg->lock();
7c673cae
FG
4711 pg->ch = store->open_collection(pg->coll);
4712
4713 // read pg state, log
11fdf7f2 4714 pg->read_state(store);
7c673cae 4715
94b18763
FG
4716 if (pg->dne()) {
4717 dout(10) << "load_pgs " << *it << " deleting dne" << dendl;
4718 pg->ch = nullptr;
94b18763 4719 pg->unlock();
94b18763
FG
4720 recursive_remove_collection(cct, store, pgid, *it);
4721 continue;
4722 }
11fdf7f2
TL
4723 {
4724 uint32_t shard_index = pgid.hash_to_shard(shards.size());
4725 assert(NULL != shards[shard_index]);
4726 store->set_collection_commit_queue(pg->coll, &(shards[shard_index]->context_queue));
4727 }
7c673cae
FG
4728
4729 pg->reg_next_scrub();
4730
11fdf7f2 4731 dout(10) << __func__ << " loaded " << *pg << dendl;
7c673cae 4732 pg->unlock();
7c673cae 4733
11fdf7f2
TL
4734 register_pg(pg);
4735 ++num;
7c673cae 4736 }
11fdf7f2 4737 dout(0) << __func__ << " opened " << num << " pgs" << dendl;
7c673cae
FG
4738}
4739
4740
11fdf7f2
TL
4741PGRef OSD::handle_pg_create_info(const OSDMapRef& osdmap,
4742 const PGCreateInfo *info)
4743{
4744 spg_t pgid = info->pgid;
7c673cae 4745
11fdf7f2
TL
4746 if (maybe_wait_for_max_pg(osdmap, pgid, info->by_mon)) {
4747 dout(10) << __func__ << " hit max pg, dropping" << dendl;
4748 return nullptr;
4749 }
3efd9988 4750
9f95a23c 4751 PeeringCtx rctx = create_context();
7c673cae 4752
11fdf7f2 4753 OSDMapRef startmap = get_map(info->epoch);
7c673cae 4754
11fdf7f2
TL
4755 if (info->by_mon) {
4756 int64_t pool_id = pgid.pgid.pool();
4757 const pg_pool_t *pool = osdmap->get_pg_pool(pool_id);
4758 if (!pool) {
4759 dout(10) << __func__ << " ignoring " << pgid << ", pool dne" << dendl;
4760 return nullptr;
4761 }
9f95a23c 4762 if (osdmap->require_osd_release >= ceph_release_t::nautilus &&
11fdf7f2
TL
4763 !pool->has_flag(pg_pool_t::FLAG_CREATING)) {
4764 // this ensures we do not process old creating messages after the
 4766      // pool's initial pgs have been created (and pgs are subsequently
4766 // allowed to split or merge).
4767 dout(20) << __func__ << " dropping " << pgid
4768 << "create, pool does not have CREATING flag set" << dendl;
4769 return nullptr;
7c673cae
FG
4770 }
4771 }
7c673cae 4772
11fdf7f2
TL
4773 int up_primary, acting_primary;
4774 vector<int> up, acting;
4775 startmap->pg_to_up_acting_osds(
4776 pgid.pgid, &up, &up_primary, &acting, &acting_primary);
7c673cae 4777
11fdf7f2
TL
4778 const pg_pool_t* pp = startmap->get_pg_pool(pgid.pool());
4779 if (pp->has_flag(pg_pool_t::FLAG_EC_OVERWRITES) &&
4780 store->get_type() != "bluestore") {
4781 clog->warn() << "pg " << pgid
4782 << " is at risk of silent data corruption: "
4783 << "the pool allows ec overwrites but is not stored in "
4784 << "bluestore, so deep scrubbing will not detect bitrot";
7c673cae 4785 }
9f95a23c
TL
4786 create_pg_collection(
4787 rctx.transaction, pgid, pgid.get_split_bits(pp->get_pg_num()));
4788 init_pg_ondisk(rctx.transaction, pgid, pp);
7c673cae 4789
9f95a23c 4790 int role = startmap->calc_pg_role(pg_shard_t(whoami, pgid.shard), acting);
7c673cae 4791
11fdf7f2
TL
4792 PGRef pg = _make_pg(startmap, pgid);
4793 pg->ch = store->create_new_collection(pg->coll);
7c673cae 4794
11fdf7f2
TL
4795 {
4796 uint32_t shard_index = pgid.hash_to_shard(shards.size());
4797 assert(NULL != shards[shard_index]);
4798 store->set_collection_commit_queue(pg->coll, &(shards[shard_index]->context_queue));
7c673cae 4799 }
7c673cae 4800
11fdf7f2 4801 pg->lock(true);
7c673cae 4802
11fdf7f2
TL
4803 // we are holding the shard lock
4804 ceph_assert(!pg->is_deleted());
4805
4806 pg->init(
4807 role,
4808 up,
4809 up_primary,
4810 acting,
4811 acting_primary,
4812 info->history,
4813 info->past_intervals,
4814 false,
4815 rctx.transaction);
7c673cae 4816
92f5a8d4
TL
4817 pg->init_collection_pool_opts();
4818
11fdf7f2 4819 if (pg->is_primary()) {
9f95a23c 4820 std::lock_guard locker{m_perf_queries_lock};
11fdf7f2
TL
4821 pg->set_dynamic_perf_stats_queries(m_perf_queries);
4822 }
7c673cae 4823
9f95a23c
TL
4824 pg->handle_initialize(rctx);
4825 pg->handle_activate_map(rctx);
7c673cae 4826
11fdf7f2 4827 dispatch_context(rctx, pg.get(), osdmap, nullptr);
7c673cae 4828
11fdf7f2
TL
4829 dout(10) << __func__ << " new pg " << *pg << dendl;
4830 return pg;
7c673cae
FG
4831}
4832
11fdf7f2
TL
4833bool OSD::maybe_wait_for_max_pg(const OSDMapRef& osdmap,
4834 spg_t pgid,
4835 bool is_mon_create)
3efd9988
FG
4836{
4837 const auto max_pgs_per_osd =
11fdf7f2
TL
4838 (cct->_conf.get_val<uint64_t>("mon_max_pg_per_osd") *
4839 cct->_conf.get_val<double>("osd_max_pg_per_osd_hard_ratio"));
3efd9988 4840
11fdf7f2 4841 if (num_pgs < max_pgs_per_osd) {
3efd9988
FG
4842 return false;
4843 }
11fdf7f2
TL
4844
4845 std::lock_guard l(pending_creates_lock);
3efd9988
FG
4846 if (is_mon_create) {
4847 pending_creates_from_mon++;
4848 } else {
9f95a23c
TL
4849 bool is_primary = osdmap->get_pg_acting_role(pgid, whoami) == 0;
4850 pending_creates_from_osd.emplace(pgid, is_primary);
3efd9988 4851 }
1adf2230 4852 dout(1) << __func__ << " withhold creation of pg " << pgid
11fdf7f2 4853 << ": " << num_pgs << " >= "<< max_pgs_per_osd << dendl;
3efd9988
FG
4854 return true;
4855}
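// Worked example for the limit above (the defaults are stated here as an
// assumption; check the cluster's actual configuration): with
// mon_max_pg_per_osd = 250 and osd_max_pg_per_osd_hard_ratio = 3, the hard
// cap is 250 * 3 = 750 pgs.  Once num_pgs reaches that cap, further creations
// are parked in pending_creates_from_mon / pending_creates_from_osd until
// resume_creating_pg() below finds spare slots again.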
4856
4857// to re-trigger peering, we have to twiddle the pg mapping a little bit,
4858// see PG::should_restart_peering(). OSDMap::pg_to_up_acting_osds() falls back
4859// to the up set if pg_temp is empty, so an empty pg_temp won't work.
4860static vector<int32_t> twiddle(const vector<int>& acting) {
4861 if (acting.size() > 1) {
4862 return {acting[0]};
4863 } else {
4864 vector<int32_t> twiddled(acting.begin(), acting.end());
4865 twiddled.push_back(-1);
4866 return twiddled;
4867 }
4868}
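// Concrete examples of the twiddle above: an acting set of [3,1,2] yields a
// pg_temp of [3]; an acting set of [5] yields [5,-1].  In both cases the
// requested pg_temp differs from the mapping the OSDMap would otherwise
// produce, which forces a new interval and re-triggers peering (see
// PG::should_restart_peering(), referenced in the comment above).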
4869
4870void OSD::resume_creating_pg()
4871{
4872 bool do_sub_pg_creates = false;
b32b8144 4873 bool have_pending_creates = false;
3efd9988
FG
4874 {
4875 const auto max_pgs_per_osd =
11fdf7f2
TL
4876 (cct->_conf.get_val<uint64_t>("mon_max_pg_per_osd") *
4877 cct->_conf.get_val<double>("osd_max_pg_per_osd_hard_ratio"));
4878 if (max_pgs_per_osd <= num_pgs) {
3efd9988
FG
4879 // this could happen if admin decreases this setting before a PG is removed
4880 return;
4881 }
11fdf7f2
TL
4882 unsigned spare_pgs = max_pgs_per_osd - num_pgs;
4883 std::lock_guard l(pending_creates_lock);
3efd9988 4884 if (pending_creates_from_mon > 0) {
11fdf7f2
TL
4885 dout(20) << __func__ << " pending_creates_from_mon "
4886 << pending_creates_from_mon << dendl;
3efd9988
FG
4887 do_sub_pg_creates = true;
4888 if (pending_creates_from_mon >= spare_pgs) {
4889 spare_pgs = pending_creates_from_mon = 0;
4890 } else {
4891 spare_pgs -= pending_creates_from_mon;
4892 pending_creates_from_mon = 0;
4893 }
4894 }
4895 auto pg = pending_creates_from_osd.cbegin();
4896 while (spare_pgs > 0 && pg != pending_creates_from_osd.cend()) {
94b18763 4897 dout(20) << __func__ << " pg " << pg->first << dendl;
3efd9988 4898 vector<int> acting;
9f95a23c
TL
4899 get_osdmap()->pg_to_up_acting_osds(pg->first.pgid, nullptr, nullptr, &acting, nullptr);
4900 service.queue_want_pg_temp(pg->first.pgid, twiddle(acting), true);
3efd9988 4901 pg = pending_creates_from_osd.erase(pg);
94b18763 4902 do_sub_pg_creates = true;
3efd9988
FG
4903 spare_pgs--;
4904 }
b32b8144
FG
4905 have_pending_creates = (pending_creates_from_mon > 0 ||
4906 !pending_creates_from_osd.empty());
3efd9988 4907 }
b32b8144
FG
4908
4909 bool do_renew_subs = false;
3efd9988
FG
4910 if (do_sub_pg_creates) {
4911 if (monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0)) {
4912 dout(4) << __func__ << ": resolicit pg creates from mon since "
4913 << last_pg_create_epoch << dendl;
b32b8144 4914 do_renew_subs = true;
3efd9988
FG
4915 }
4916 }
9f95a23c 4917 version_t start = get_osdmap_epoch() + 1;
b32b8144
FG
4918 if (have_pending_creates) {
4919 // don't miss any new osdmap deleting PGs
4920 if (monc->sub_want("osdmap", start, 0)) {
4921 dout(4) << __func__ << ": resolicit osdmap from mon since "
4922 << start << dendl;
4923 do_renew_subs = true;
4924 }
94b18763 4925 } else if (do_sub_pg_creates) {
b32b8144
FG
 4926      // no need to subscribe to the osdmap continuously anymore
4927 // once the pgtemp and/or mon_subscribe(pg_creates) is sent
4928 if (monc->sub_want_increment("osdmap", start, CEPH_SUBSCRIBE_ONETIME)) {
11fdf7f2 4929 dout(4) << __func__ << ": re-subscribe osdmap(onetime) since "
b32b8144
FG
4930 << start << dendl;
4931 do_renew_subs = true;
4932 }
4933 }
4934
4935 if (do_renew_subs) {
4936 monc->renew_subs();
4937 }
4938
94b18763 4939 service.send_pg_temp();
3efd9988 4940}
7c673cae
FG
4941
4942void OSD::build_initial_pg_history(
4943 spg_t pgid,
4944 epoch_t created,
4945 utime_t created_stamp,
4946 pg_history_t *h,
4947 PastIntervals *pi)
4948{
4949 dout(10) << __func__ << " " << pgid << " created " << created << dendl;
9f95a23c 4950 *h = pg_history_t(created, created_stamp);
7c673cae
FG
4951
4952 OSDMapRef lastmap = service.get_map(created);
4953 int up_primary, acting_primary;
4954 vector<int> up, acting;
4955 lastmap->pg_to_up_acting_osds(
4956 pgid.pgid, &up, &up_primary, &acting, &acting_primary);
4957
4958 ostringstream debug;
9f95a23c 4959 for (epoch_t e = created + 1; e <= get_osdmap_epoch(); ++e) {
7c673cae
FG
4960 OSDMapRef osdmap = service.get_map(e);
4961 int new_up_primary, new_acting_primary;
4962 vector<int> new_up, new_acting;
4963 osdmap->pg_to_up_acting_osds(
4964 pgid.pgid, &new_up, &new_up_primary, &new_acting, &new_acting_primary);
4965
4966 // this is a bit imprecise, but sufficient?
4967 struct min_size_predicate_t : public IsPGRecoverablePredicate {
4968 const pg_pool_t *pi;
4969 bool operator()(const set<pg_shard_t> &have) const {
4970 return have.size() >= pi->min_size;
4971 }
11fdf7f2 4972 explicit min_size_predicate_t(const pg_pool_t *i) : pi(i) {}
7c673cae
FG
4973 } min_size_predicate(osdmap->get_pg_pool(pgid.pgid.pool()));
4974
4975 bool new_interval = PastIntervals::check_new_interval(
4976 acting_primary,
4977 new_acting_primary,
4978 acting, new_acting,
4979 up_primary,
4980 new_up_primary,
4981 up, new_up,
4982 h->same_interval_since,
4983 h->last_epoch_clean,
9f95a23c
TL
4984 osdmap.get(),
4985 lastmap.get(),
7c673cae 4986 pgid.pgid,
9f95a23c 4987 min_size_predicate,
7c673cae
FG
4988 pi,
4989 &debug);
4990 if (new_interval) {
4991 h->same_interval_since = e;
181888fb
FG
4992 if (up != new_up) {
4993 h->same_up_since = e;
4994 }
4995 if (acting_primary != new_acting_primary) {
4996 h->same_primary_since = e;
4997 }
4998 if (pgid.pgid.is_split(lastmap->get_pg_num(pgid.pgid.pool()),
4999 osdmap->get_pg_num(pgid.pgid.pool()),
5000 nullptr)) {
5001 h->last_epoch_split = e;
5002 }
5003 up = new_up;
5004 acting = new_acting;
5005 up_primary = new_up_primary;
5006 acting_primary = new_acting_primary;
c07f9fc5 5007 }
7c673cae
FG
5008 lastmap = osdmap;
5009 }
5010 dout(20) << __func__ << " " << debug.str() << dendl;
5011 dout(10) << __func__ << " " << *h << " " << *pi
5012 << " [" << (pi->empty() ? pair<epoch_t,epoch_t>(0,0) :
5013 pi->get_bounds()) << ")"
5014 << dendl;
5015}
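// In short, build_initial_pg_history() replays every osdmap from the pg's
// creation epoch up to the current epoch and records the most recent epochs
// at which the interval, up set, primary, or split state last changed.  For a
// pg created at epoch 100 on an otherwise quiet cluster, same_interval_since,
// same_up_since, and same_primary_since all remain 100.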
5016
7c673cae
FG
5017void OSD::_add_heartbeat_peer(int p)
5018{
5019 if (p == whoami)
5020 return;
5021 HeartbeatInfo *hi;
5022
5023 map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(p);
5024 if (i == heartbeat_peers.end()) {
9f95a23c 5025 pair<ConnectionRef,ConnectionRef> cons = service.get_con_osd_hb(p, get_osdmap_epoch());
7c673cae
FG
5026 if (!cons.first)
5027 return;
9f95a23c
TL
5028 assert(cons.second);
5029
7c673cae
FG
5030 hi = &heartbeat_peers[p];
5031 hi->peer = p;
9f95a23c
TL
5032
5033 auto stamps = service.get_hb_stamps(p);
5034
5035 auto sb = ceph::make_ref<Session>(cct, cons.first.get());
5036 sb->peer = p;
5037 sb->stamps = stamps;
eafe8130 5038 hi->hb_interval_start = ceph_clock_now();
7c673cae 5039 hi->con_back = cons.first.get();
9f95a23c
TL
5040 hi->con_back->set_priv(sb);
5041
5042 auto sf = ceph::make_ref<Session>(cct, cons.second.get());
5043 sf->peer = p;
5044 sf->stamps = stamps;
5045 hi->con_front = cons.second.get();
5046 hi->con_front->set_priv(sf);
5047
5048 dout(10) << "_add_heartbeat_peer: new peer osd." << p
5049 << " " << hi->con_back->get_peer_addr()
5050 << " " << hi->con_front->get_peer_addr()
5051 << dendl;
7c673cae
FG
5052 } else {
5053 hi = &i->second;
5054 }
9f95a23c 5055 hi->epoch = get_osdmap_epoch();
7c673cae
FG
5056}
5057
5058void OSD::_remove_heartbeat_peer(int n)
5059{
5060 map<int,HeartbeatInfo>::iterator q = heartbeat_peers.find(n);
11fdf7f2 5061 ceph_assert(q != heartbeat_peers.end());
7c673cae
FG
5062 dout(20) << " removing heartbeat peer osd." << n
5063 << " " << q->second.con_back->get_peer_addr()
5064 << " " << (q->second.con_front ? q->second.con_front->get_peer_addr() : entity_addr_t())
5065 << dendl;
9f95a23c 5066 q->second.clear_mark_down();
7c673cae
FG
5067 heartbeat_peers.erase(q);
5068}
5069
5070void OSD::need_heartbeat_peer_update()
5071{
5072 if (is_stopping())
5073 return;
5074 dout(20) << "need_heartbeat_peer_update" << dendl;
5075 heartbeat_set_peers_need_update();
5076}
5077
5078void OSD::maybe_update_heartbeat_peers()
5079{
9f95a23c 5080 ceph_assert(ceph_mutex_is_locked(osd_lock));
7c673cae 5081
11fdf7f2 5082 if (is_waiting_for_healthy() || is_active()) {
7c673cae
FG
5083 utime_t now = ceph_clock_now();
5084 if (last_heartbeat_resample == utime_t()) {
5085 last_heartbeat_resample = now;
5086 heartbeat_set_peers_need_update();
5087 } else if (!heartbeat_peers_need_update()) {
5088 utime_t dur = now - last_heartbeat_resample;
5089 if (dur > cct->_conf->osd_heartbeat_grace) {
5090 dout(10) << "maybe_update_heartbeat_peers forcing update after " << dur << " seconds" << dendl;
5091 heartbeat_set_peers_need_update();
5092 last_heartbeat_resample = now;
494da23a
TL
5093 // automatically clean up any stale heartbeat peers
5094 // if we are unhealthy, then clean all
5095 reset_heartbeat_peers(is_waiting_for_healthy());
7c673cae
FG
5096 }
5097 }
5098 }
5099
5100 if (!heartbeat_peers_need_update())
5101 return;
5102 heartbeat_clear_peers_need_update();
5103
11fdf7f2 5104 std::lock_guard l(heartbeat_lock);
7c673cae
FG
5105
5106 dout(10) << "maybe_update_heartbeat_peers updating" << dendl;
5107
5108
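  // Peer selection below: start from the heartbeat peers reported by our
  // PGs, add our immediate up-set neighbours (next/previous up osd) and a
  // sample spread across failure-domain subtrees, then drop peers that have
  // gone down, top up toward osd_heartbeat_min_peers if we are short, and
  // finally trim surplus "extras" that nothing requires.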
5109 // build heartbeat from set
5110 if (is_active()) {
11fdf7f2
TL
5111 vector<PGRef> pgs;
5112 _get_pgs(&pgs);
5113 for (auto& pg : pgs) {
5114 pg->with_heartbeat_peers([&](int peer) {
9f95a23c 5115 if (get_osdmap()->is_up(peer)) {
11fdf7f2
TL
5116 _add_heartbeat_peer(peer);
5117 }
5118 });
7c673cae
FG
5119 }
5120 }
5121
5122 // include next and previous up osds to ensure we have a fully-connected set
5123 set<int> want, extras;
9f95a23c 5124 const int next = get_osdmap()->get_next_up_osd_after(whoami);
7c673cae
FG
5125 if (next >= 0)
5126 want.insert(next);
9f95a23c 5127 int prev = get_osdmap()->get_previous_up_osd_before(whoami);
7c673cae
FG
5128 if (prev >= 0 && prev != next)
5129 want.insert(prev);
5130
11fdf7f2
TL
5131 // make sure we have at least **min_down** osds coming from different
5132 // subtree levels (e.g., hosts) for fast failure detection.
5133 auto min_down = cct->_conf.get_val<uint64_t>("mon_osd_min_down_reporters");
5134 auto subtree = cct->_conf.get_val<string>("mon_osd_reporter_subtree_level");
9f95a23c
TL
5135 auto limit = std::max(min_down, (uint64_t)cct->_conf->osd_heartbeat_min_peers);
5136 get_osdmap()->get_random_up_osds_by_subtree(
5137 whoami, subtree, limit, want, &want);
11fdf7f2 5138
7c673cae
FG
5139 for (set<int>::iterator p = want.begin(); p != want.end(); ++p) {
5140 dout(10) << " adding neighbor peer osd." << *p << dendl;
5141 extras.insert(*p);
5142 _add_heartbeat_peer(*p);
5143 }
5144
5145 // remove down peers; enumerate extras
5146 map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
5147 while (p != heartbeat_peers.end()) {
9f95a23c 5148 if (!get_osdmap()->is_up(p->first)) {
7c673cae
FG
5149 int o = p->first;
5150 ++p;
5151 _remove_heartbeat_peer(o);
5152 continue;
5153 }
9f95a23c 5154 if (p->second.epoch < get_osdmap_epoch()) {
7c673cae
FG
5155 extras.insert(p->first);
5156 }
5157 ++p;
5158 }
5159
5160 // too few?
11fdf7f2 5161 for (int n = next; n >= 0; ) {
7c673cae
FG
5162 if ((int)heartbeat_peers.size() >= cct->_conf->osd_heartbeat_min_peers)
5163 break;
5164 if (!extras.count(n) && !want.count(n) && n != whoami) {
5165 dout(10) << " adding random peer osd." << n << dendl;
5166 extras.insert(n);
5167 _add_heartbeat_peer(n);
5168 }
9f95a23c 5169 n = get_osdmap()->get_next_up_osd_after(n);
11fdf7f2 5170 if (n == next)
7c673cae
FG
5171 break; // came full circle; stop
5172 }
5173
5174 // too many?
5175 for (set<int>::iterator p = extras.begin();
5176 (int)heartbeat_peers.size() > cct->_conf->osd_heartbeat_min_peers && p != extras.end();
5177 ++p) {
5178 if (want.count(*p))
5179 continue;
5180 _remove_heartbeat_peer(*p);
5181 }
5182
5183 dout(10) << "maybe_update_heartbeat_peers " << heartbeat_peers.size() << " peers, extras " << extras << dendl;
9f95a23c
TL
5184
5185 // clean up stale failure pending
5186 for (auto it = failure_pending.begin(); it != failure_pending.end();) {
5187 if (heartbeat_peers.count(it->first) == 0) {
5188 send_still_alive(get_osdmap_epoch(), it->first, it->second.second);
5189 failure_pending.erase(it++);
5190 } else {
5191 it++;
5192 }
5193 }
7c673cae
FG
5194}
5195
494da23a 5196void OSD::reset_heartbeat_peers(bool all)
7c673cae 5197{
9f95a23c 5198 ceph_assert(ceph_mutex_is_locked(osd_lock));
7c673cae 5199 dout(10) << "reset_heartbeat_peers" << dendl;
494da23a
TL
5200 utime_t stale = ceph_clock_now();
5201 stale -= cct->_conf.get_val<int64_t>("osd_heartbeat_stale");
11fdf7f2 5202 std::lock_guard l(heartbeat_lock);
494da23a 5203 for (auto it = heartbeat_peers.begin(); it != heartbeat_peers.end();) {
3fec8b72 5204 auto& [peer, hi] = *it;
494da23a 5205 if (all || hi.is_stale(stale)) {
9f95a23c 5206 hi.clear_mark_down();
494da23a 5207 // stop sending failure_report to mon too
3fec8b72
TL
5208 failure_queue.erase(peer);
5209 failure_pending.erase(peer);
5210 it = heartbeat_peers.erase(it);
494da23a 5211 } else {
3fec8b72 5212 ++it;
7c673cae 5213 }
7c673cae 5214 }
7c673cae
FG
5215}
5216
5217void OSD::handle_osd_ping(MOSDPing *m)
5218{
5219 if (superblock.cluster_fsid != m->fsid) {
5220 dout(20) << "handle_osd_ping from " << m->get_source_inst()
9f95a23c
TL
5221 << " bad fsid " << m->fsid << " != " << superblock.cluster_fsid
5222 << dendl;
7c673cae
FG
5223 m->put();
5224 return;
5225 }
5226
5227 int from = m->get_source().num();
5228
9f95a23c 5229 heartbeat_lock.lock();
7c673cae 5230 if (is_stopping()) {
9f95a23c 5231 heartbeat_lock.unlock();
7c673cae
FG
5232 m->put();
5233 return;
5234 }
5235
9f95a23c
TL
5236 utime_t now = ceph_clock_now();
5237 auto mnow = service.get_mnow();
5238 ConnectionRef con(m->get_connection());
7c673cae 5239 OSDMapRef curmap = service.get_osdmap();
c07f9fc5 5240 if (!curmap) {
9f95a23c 5241 heartbeat_lock.unlock();
c07f9fc5
FG
5242 m->put();
5243 return;
5244 }
7c673cae 5245
9f95a23c
TL
5246 auto sref = con->get_priv();
5247 Session *s = static_cast<Session*>(sref.get());
5248 if (!s) {
5249 heartbeat_lock.unlock();
5250 m->put();
5251 return;
5252 }
5253 if (!s->stamps) {
5254 s->peer = from;
5255 s->stamps = service.get_hb_stamps(from);
5256 }
5257
7c673cae
FG
5258 switch (m->op) {
5259
5260 case MOSDPing::PING:
5261 {
5262 if (cct->_conf->osd_debug_drop_ping_probability > 0) {
5263 auto heartbeat_drop = debug_heartbeat_drops_remaining.find(from);
5264 if (heartbeat_drop != debug_heartbeat_drops_remaining.end()) {
5265 if (heartbeat_drop->second == 0) {
5266 debug_heartbeat_drops_remaining.erase(heartbeat_drop);
5267 } else {
5268 --heartbeat_drop->second;
5269 dout(5) << "Dropping heartbeat from " << from
5270 << ", " << heartbeat_drop->second
5271 << " remaining to drop" << dendl;
5272 break;
5273 }
5274 } else if (cct->_conf->osd_debug_drop_ping_probability >
5275 ((((double)(rand()%100))/100.0))) {
5276 heartbeat_drop =
5277 debug_heartbeat_drops_remaining.insert(std::make_pair(from,
5278 cct->_conf->osd_debug_drop_ping_duration)).first;
5279 dout(5) << "Dropping heartbeat from " << from
5280 << ", " << heartbeat_drop->second
5281 << " remaining to drop" << dendl;
5282 break;
5283 }
5284 }
5285
9f95a23c
TL
5286 ceph::signedspan sender_delta_ub{};
5287 s->stamps->got_ping(
5288 m->up_from,
5289 mnow,
5290 m->mono_send_stamp,
5291 m->delta_ub,
5292 &sender_delta_ub);
5293 dout(20) << __func__ << " new stamps " << *s->stamps << dendl;
5294
7c673cae 5295 if (!cct->get_heartbeat_map()->is_healthy()) {
9f95a23c
TL
5296 dout(10) << "internal heartbeat not healthy, dropping ping request"
5297 << dendl;
7c673cae
FG
5298 break;
5299 }
5300
5301 Message *r = new MOSDPing(monc->get_fsid(),
5302 curmap->get_epoch(),
9f95a23c
TL
5303 MOSDPing::PING_REPLY,
5304 m->ping_stamp,
5305 m->mono_ping_stamp,
5306 mnow,
5307 service.get_up_epoch(),
5308 cct->_conf->osd_heartbeat_min_size,
5309 sender_delta_ub);
5310 con->send_message(r);
7c673cae
FG
5311
5312 if (curmap->is_up(from)) {
7c673cae 5313 if (is_active()) {
9f95a23c
TL
5314 ConnectionRef cluster_con = service.get_con_osd_cluster(
5315 from, curmap->get_epoch());
5316 if (cluster_con) {
5317 service.maybe_share_map(cluster_con.get(), curmap, m->map_epoch);
7c673cae
FG
5318 }
5319 }
5320 } else if (!curmap->exists(from) ||
5321 curmap->get_down_at(from) > m->map_epoch) {
5322 // tell them they have died
5323 Message *r = new MOSDPing(monc->get_fsid(),
5324 curmap->get_epoch(),
5325 MOSDPing::YOU_DIED,
9f95a23c
TL
5326 m->ping_stamp,
5327 m->mono_ping_stamp,
5328 mnow,
5329 service.get_up_epoch(),
31f18b77 5330 cct->_conf->osd_heartbeat_min_size);
9f95a23c 5331 con->send_message(r);
7c673cae
FG
5332 }
5333 }
5334 break;
5335
5336 case MOSDPing::PING_REPLY:
5337 {
5338 map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(from);
5339 if (i != heartbeat_peers.end()) {
9f95a23c 5340 auto acked = i->second.ping_history.find(m->ping_stamp);
11fdf7f2 5341 if (acked != i->second.ping_history.end()) {
11fdf7f2 5342 int &unacknowledged = acked->second.second;
9f95a23c 5343 if (con == i->second.con_back) {
11fdf7f2
TL
5344 dout(25) << "handle_osd_ping got reply from osd." << from
5345 << " first_tx " << i->second.first_tx
5346 << " last_tx " << i->second.last_tx
9f95a23c
TL
5347 << " last_rx_back " << i->second.last_rx_back
5348 << " -> " << now
11fdf7f2
TL
5349 << " last_rx_front " << i->second.last_rx_front
5350 << dendl;
5351 i->second.last_rx_back = now;
5352 ceph_assert(unacknowledged > 0);
5353 --unacknowledged;
5354 // if there is no front con, set both stamps.
5355 if (i->second.con_front == NULL) {
5356 i->second.last_rx_front = now;
5357 ceph_assert(unacknowledged > 0);
5358 --unacknowledged;
5359 }
9f95a23c 5360 } else if (con == i->second.con_front) {
11fdf7f2
TL
5361 dout(25) << "handle_osd_ping got reply from osd." << from
5362 << " first_tx " << i->second.first_tx
5363 << " last_tx " << i->second.last_tx
5364 << " last_rx_back " << i->second.last_rx_back
9f95a23c
TL
5365 << " last_rx_front " << i->second.last_rx_front
5366 << " -> " << now
11fdf7f2
TL
5367 << dendl;
5368 i->second.last_rx_front = now;
5369 ceph_assert(unacknowledged > 0);
5370 --unacknowledged;
5371 }
7c673cae 5372
11fdf7f2
TL
5373 if (unacknowledged == 0) {
5374 // succeeded in getting all replies
5375 dout(25) << "handle_osd_ping got all replies from osd." << from
9f95a23c 5376 << " , erase pending ping(sent at " << m->ping_stamp << ")"
11fdf7f2
TL
5377 << " and older pending ping(s)"
5378 << dendl;
eafe8130
TL
5379
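          // All outstanding replies for this ping have arrived: fold the
          // measured round-trip times (converted to microseconds) into the
          // running min/avg/max for the current averaging interval, and when
          // the interval (60s by default) rolls over, push the results into
          // the per-OSD ring buffers that back osd_stat's 1/5/15-interval
          // ping-time summaries.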
5380#define ROUND_S_TO_USEC(sec) (uint32_t)((sec) * 1000 * 1000 + 0.5)
5381 ++i->second.hb_average_count;
9f95a23c 5382 uint32_t back_pingtime = ROUND_S_TO_USEC(i->second.last_rx_back - m->ping_stamp);
eafe8130
TL
5383 i->second.hb_total_back += back_pingtime;
5384 if (back_pingtime < i->second.hb_min_back)
5385 i->second.hb_min_back = back_pingtime;
5386 if (back_pingtime > i->second.hb_max_back)
5387 i->second.hb_max_back = back_pingtime;
9f95a23c 5388 uint32_t front_pingtime = ROUND_S_TO_USEC(i->second.last_rx_front - m->ping_stamp);
eafe8130
TL
5389 i->second.hb_total_front += front_pingtime;
5390 if (front_pingtime < i->second.hb_min_front)
5391 i->second.hb_min_front = front_pingtime;
5392 if (front_pingtime > i->second.hb_max_front)
5393 i->second.hb_max_front = front_pingtime;
5394
5395 ceph_assert(i->second.hb_interval_start != utime_t());
5396 if (i->second.hb_interval_start == utime_t())
5397 i->second.hb_interval_start = now;
5398 int64_t hb_avg_time_period = 60;
5399 if (cct->_conf.get_val<int64_t>("debug_heartbeat_testing_span")) {
5400 hb_avg_time_period = cct->_conf.get_val<int64_t>("debug_heartbeat_testing_span");
5401 }
5402 if (now - i->second.hb_interval_start >= utime_t(hb_avg_time_period, 0)) {
5403 uint32_t back_avg = i->second.hb_total_back / i->second.hb_average_count;
5404 uint32_t back_min = i->second.hb_min_back;
5405 uint32_t back_max = i->second.hb_max_back;
5406 uint32_t front_avg = i->second.hb_total_front / i->second.hb_average_count;
5407 uint32_t front_min = i->second.hb_min_front;
5408 uint32_t front_max = i->second.hb_max_front;
5409
5410 // Reset for new interval
5411 i->second.hb_average_count = 0;
5412 i->second.hb_interval_start = now;
5413 i->second.hb_total_back = i->second.hb_max_back = 0;
5414 i->second.hb_min_back = UINT_MAX;
5415 i->second.hb_total_front = i->second.hb_max_front = 0;
5416 i->second.hb_min_front = UINT_MAX;
5417
5418 // Record per osd interface ping times
5419 // Based on osd_heartbeat_interval, ignoring that it is randomly shorter than this interval
5420 if (i->second.hb_back_pingtime.size() == 0) {
5421 ceph_assert(i->second.hb_front_pingtime.size() == 0);
5422 for (unsigned k = 0 ; k < hb_vector_size; ++k) {
5423 i->second.hb_back_pingtime.push_back(back_avg);
5424 i->second.hb_back_min.push_back(back_min);
5425 i->second.hb_back_max.push_back(back_max);
5426 i->second.hb_front_pingtime.push_back(front_avg);
5427 i->second.hb_front_min.push_back(front_min);
5428 i->second.hb_front_max.push_back(front_max);
5429 ++i->second.hb_index;
5430 }
5431 } else {
5432 int index = i->second.hb_index & (hb_vector_size - 1);
5433 i->second.hb_back_pingtime[index] = back_avg;
5434 i->second.hb_back_min[index] = back_min;
5435 i->second.hb_back_max[index] = back_max;
5436 i->second.hb_front_pingtime[index] = front_avg;
5437 i->second.hb_front_min[index] = front_min;
5438 i->second.hb_front_max[index] = front_max;
5439 ++i->second.hb_index;
5440 }
5441
5442 {
5443 std::lock_guard l(service.stat_lock);
5444 service.osd_stat.hb_pingtime[from].last_update = now.sec();
5445 service.osd_stat.hb_pingtime[from].back_last = back_pingtime;
5446
5447 uint32_t total = 0;
5448 uint32_t min = UINT_MAX;
5449 uint32_t max = 0;
5450 uint32_t count = 0;
5451 uint32_t which = 0;
5452 uint32_t size = (uint32_t)i->second.hb_back_pingtime.size();
5453 for (int32_t k = size - 1 ; k >= 0; --k) {
5454 ++count;
5455 int index = (i->second.hb_index + k) % size;
5456 total += i->second.hb_back_pingtime[index];
5457 if (i->second.hb_back_min[index] < min)
5458 min = i->second.hb_back_min[index];
5459 if (i->second.hb_back_max[index] > max)
5460 max = i->second.hb_back_max[index];
5461 if (count == 1 || count == 5 || count == 15) {
5462 service.osd_stat.hb_pingtime[from].back_pingtime[which] = total / count;
5463 service.osd_stat.hb_pingtime[from].back_min[which] = min;
5464 service.osd_stat.hb_pingtime[from].back_max[which] = max;
5465 which++;
5466 if (count == 15)
5467 break;
5468 }
5469 }
5470
5471 if (i->second.con_front != NULL) {
5472 service.osd_stat.hb_pingtime[from].front_last = front_pingtime;
5473
5474 total = 0;
5475 min = UINT_MAX;
5476 max = 0;
5477 count = 0;
5478 which = 0;
5479 for (int32_t k = size - 1 ; k >= 0; --k) {
5480 ++count;
5481 int index = (i->second.hb_index + k) % size;
5482 total += i->second.hb_front_pingtime[index];
5483 if (i->second.hb_front_min[index] < min)
5484 min = i->second.hb_front_min[index];
5485 if (i->second.hb_front_max[index] > max)
5486 max = i->second.hb_front_max[index];
5487 if (count == 1 || count == 5 || count == 15) {
5488 service.osd_stat.hb_pingtime[from].front_pingtime[which] = total / count;
5489 service.osd_stat.hb_pingtime[from].front_min[which] = min;
5490 service.osd_stat.hb_pingtime[from].front_max[which] = max;
5491 which++;
5492 if (count == 15)
5493 break;
5494 }
5495 }
5496 }
5497 }
5498 } else {
5499 std::lock_guard l(service.stat_lock);
5500 service.osd_stat.hb_pingtime[from].back_last = back_pingtime;
5501 if (i->second.con_front != NULL)
5502 service.osd_stat.hb_pingtime[from].front_last = front_pingtime;
5503 }
11fdf7f2 5504 i->second.ping_history.erase(i->second.ping_history.begin(), ++acked);
7c673cae
FG
5505 }
5506
11fdf7f2
TL
5507 if (i->second.is_healthy(now)) {
5508 // Cancel false reports
5509 auto failure_queue_entry = failure_queue.find(from);
5510 if (failure_queue_entry != failure_queue.end()) {
5511 dout(10) << "handle_osd_ping canceling queued "
5512 << "failure report for osd." << from << dendl;
5513 failure_queue.erase(failure_queue_entry);
5514 }
5515
5516 auto failure_pending_entry = failure_pending.find(from);
5517 if (failure_pending_entry != failure_pending.end()) {
5518 dout(10) << "handle_osd_ping canceling in-flight "
5519 << "failure report for osd." << from << dendl;
5520 send_still_alive(curmap->get_epoch(),
5521 from,
5522 failure_pending_entry->second.second);
5523 failure_pending.erase(failure_pending_entry);
5524 }
7c673cae 5525 }
11fdf7f2
TL
5526 } else {
5527 // old replies, deprecated by newly sent pings.
9f95a23c 5528 dout(10) << "handle_osd_ping no pending ping(sent at " << m->ping_stamp
11fdf7f2
TL
5529 << ") is found, treat as covered by newly sent pings "
5530 << "and ignore"
5531 << dendl;
7c673cae
FG
5532 }
5533 }
5534
5535 if (m->map_epoch &&
5536 curmap->is_up(from)) {
7c673cae 5537 if (is_active()) {
9f95a23c
TL
5538 ConnectionRef cluster_con = service.get_con_osd_cluster(
5539 from, curmap->get_epoch());
5540 if (cluster_con) {
5541 service.maybe_share_map(cluster_con.get(), curmap, m->map_epoch);
7c673cae
FG
5542 }
5543 }
5544 }
9f95a23c
TL
5545
5546 s->stamps->got_ping_reply(
5547 mnow,
5548 m->mono_send_stamp,
5549 m->delta_ub);
5550 dout(20) << __func__ << " new stamps " << *s->stamps << dendl;
7c673cae
FG
5551 }
5552 break;
5553
5554 case MOSDPing::YOU_DIED:
5555 dout(10) << "handle_osd_ping " << m->get_source_inst()
5556 << " says i am down in " << m->map_epoch << dendl;
5557 osdmap_subscribe(curmap->get_epoch()+1, false);
5558 break;
5559 }
5560
9f95a23c 5561 heartbeat_lock.unlock();
7c673cae
FG
5562 m->put();
5563}
5564
5565void OSD::heartbeat_entry()
5566{
9f95a23c 5567 std::unique_lock l(heartbeat_lock);
7c673cae
FG
5568 if (is_stopping())
5569 return;
5570 while (!heartbeat_stop) {
5571 heartbeat();
5572
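    // Sleep half a second plus a random multiple (0 to 0.9) of
    // osd_heartbeat_interval so peers do not ping in lock-step; the
    // randomization can be disabled for tests.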
eafe8130
TL
5573 double wait;
5574 if (cct->_conf.get_val<bool>("debug_disable_randomized_ping")) {
5575 wait = (float)cct->_conf->osd_heartbeat_interval;
5576 } else {
5577 wait = .5 + ((float)(rand() % 10)/10.0) * (float)cct->_conf->osd_heartbeat_interval;
5578 }
9f95a23c 5579 auto w = ceph::make_timespan(wait);
7c673cae 5580 dout(30) << "heartbeat_entry sleeping for " << wait << dendl;
9f95a23c 5581 heartbeat_cond.wait_for(l, w);
7c673cae
FG
5582 if (is_stopping())
5583 return;
5584 dout(30) << "heartbeat_entry woke up" << dendl;
5585 }
5586}
5587
5588void OSD::heartbeat_check()
5589{
9f95a23c 5590 ceph_assert(ceph_mutex_is_locked(heartbeat_lock));
7c673cae
FG
5591 utime_t now = ceph_clock_now();
5592
11fdf7f2 5593 // check for incoming heartbeats (move me elsewhere?)
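  // A peer counts as unhealthy once the oldest entry in its ping_history has
  // passed its deadline (send time + osd_heartbeat_grace).  Such peers are
  // queued in failure_queue, recording since when we have been without a
  // reply (or the first ping we ever sent, if we never got one).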
7c673cae
FG
5594 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
5595 p != heartbeat_peers.end();
5596 ++p) {
5597
5598 if (p->second.first_tx == utime_t()) {
5599 dout(25) << "heartbeat_check we haven't sent ping to osd." << p->first
11fdf7f2 5600 << " yet, skipping" << dendl;
7c673cae
FG
5601 continue;
5602 }
5603
5604 dout(25) << "heartbeat_check osd." << p->first
5605 << " first_tx " << p->second.first_tx
5606 << " last_tx " << p->second.last_tx
5607 << " last_rx_back " << p->second.last_rx_back
5608 << " last_rx_front " << p->second.last_rx_front
5609 << dendl;
11fdf7f2
TL
5610 if (p->second.is_unhealthy(now)) {
5611 utime_t oldest_deadline = p->second.ping_history.begin()->second.first;
7c673cae
FG
5612 if (p->second.last_rx_back == utime_t() ||
5613 p->second.last_rx_front == utime_t()) {
11fdf7f2
TL
5614 derr << "heartbeat_check: no reply from "
5615 << p->second.con_front->get_peer_addr().get_sockaddr()
5616 << " osd." << p->first
5617 << " ever on either front or back, first ping sent "
5618 << p->second.first_tx
5619 << " (oldest deadline " << oldest_deadline << ")"
5620 << dendl;
7c673cae 5621 // fail
11fdf7f2 5622 failure_queue[p->first] = p->second.first_tx;
7c673cae 5623 } else {
11fdf7f2
TL
5624 derr << "heartbeat_check: no reply from "
5625 << p->second.con_front->get_peer_addr().get_sockaddr()
7c673cae
FG
5626 << " osd." << p->first << " since back " << p->second.last_rx_back
5627 << " front " << p->second.last_rx_front
11fdf7f2
TL
5628 << " (oldest deadline " << oldest_deadline << ")"
5629 << dendl;
7c673cae 5630 // fail
11fdf7f2 5631 failure_queue[p->first] = std::min(p->second.last_rx_back, p->second.last_rx_front);
7c673cae
FG
5632 }
5633 }
5634 }
5635}
5636
5637void OSD::heartbeat()
5638{
9f95a23c 5639 ceph_assert(ceph_mutex_is_locked_by_me(heartbeat_lock));
7c673cae
FG
5640 dout(30) << "heartbeat" << dendl;
5641
5642 // get CPU load avg
5643 double loadavgs[1];
11fdf7f2
TL
5644 int hb_interval = cct->_conf->osd_heartbeat_interval;
5645 int n_samples = 86400;
5646 if (hb_interval > 1) {
5647 n_samples /= hb_interval;
5648 if (n_samples < 1)
5649 n_samples = 1;
5650 }
5651
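  // daily_loadavg is a running average whose window is roughly one day's
  // worth of heartbeat samples (86400s divided by the heartbeat interval);
  // each new 1-minute loadavg reading is blended in with weight 1/n_samples.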
7c673cae
FG
5652 if (getloadavg(loadavgs, 1) == 1) {
5653 logger->set(l_osd_loadavg, 100 * loadavgs[0]);
5654 daily_loadavg = (daily_loadavg * (n_samples - 1) + loadavgs[0]) / n_samples;
5655 dout(30) << "heartbeat: daily_loadavg " << daily_loadavg << dendl;
5656 }
5657
5658 dout(30) << "heartbeat checking stats" << dendl;
5659
11fdf7f2 5660 // refresh peer list and osd stats
7c673cae
FG
5661 vector<int> hb_peers;
5662 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
5663 p != heartbeat_peers.end();
5664 ++p)
5665 hb_peers.push_back(p->first);
7c673cae 5666
11fdf7f2
TL
5667 auto new_stat = service.set_osd_stat(hb_peers, get_num_pgs());
5668 dout(5) << __func__ << " " << new_stat << dendl;
5669 ceph_assert(new_stat.statfs.total);
5670
5671 float pratio;
5672 float ratio = service.compute_adjusted_ratio(new_stat, &pratio);
5673
5674 service.check_full_status(ratio, pratio);
7c673cae
FG
5675
5676 utime_t now = ceph_clock_now();
9f95a23c 5677 auto mnow = service.get_mnow();
11fdf7f2
TL
5678 utime_t deadline = now;
5679 deadline += cct->_conf->osd_heartbeat_grace;
7c673cae
FG
5680
5681 // send heartbeats
5682 for (map<int,HeartbeatInfo>::iterator i = heartbeat_peers.begin();
5683 i != heartbeat_peers.end();
5684 ++i) {
5685 int peer = i->first;
3fec8b72
TL
5686 Session *s = static_cast<Session*>(i->second.con_back->get_priv().get());
5687 if (!s) {
5688 dout(30) << "heartbeat osd." << peer << " has no open con" << dendl;
5689 continue;
5690 }
9f95a23c
TL
5691 dout(30) << "heartbeat sending ping to osd." << peer << dendl;
5692
7c673cae
FG
5693 i->second.last_tx = now;
5694 if (i->second.first_tx == utime_t())
5695 i->second.first_tx = now;
11fdf7f2
TL
5696 i->second.ping_history[now] = make_pair(deadline,
5697 HeartbeatInfo::HEARTBEAT_MAX_CONN);
eafe8130
TL
5698 if (i->second.hb_interval_start == utime_t())
5699 i->second.hb_interval_start = now;
9f95a23c 5700
9f95a23c
TL
5701 std::optional<ceph::signedspan> delta_ub;
5702 s->stamps->sent_ping(&delta_ub);
5703
5704 i->second.con_back->send_message(
5705 new MOSDPing(monc->get_fsid(),
5706 service.get_osdmap_epoch(),
5707 MOSDPing::PING,
5708 now,
5709 mnow,
5710 mnow,
5711 service.get_up_epoch(),
5712 cct->_conf->osd_heartbeat_min_size,
5713 delta_ub));
7c673cae
FG
5714
5715 if (i->second.con_front)
9f95a23c
TL
5716 i->second.con_front->send_message(
5717 new MOSDPing(monc->get_fsid(),
5718 service.get_osdmap_epoch(),
5719 MOSDPing::PING,
5720 now,
5721 mnow,
5722 mnow,
5723 service.get_up_epoch(),
5724 cct->_conf->osd_heartbeat_min_size,
5725 delta_ub));
7c673cae
FG
5726 }
5727
5728 logger->set(l_osd_hb_to, heartbeat_peers.size());
5729
5730 // hmm.. am i all alone?
5731 dout(30) << "heartbeat lonely?" << dendl;
5732 if (heartbeat_peers.empty()) {
5733 if (now - last_mon_heartbeat > cct->_conf->osd_mon_heartbeat_interval && is_active()) {
5734 last_mon_heartbeat = now;
5735 dout(10) << "i have no heartbeat peers; checking mon for new map" << dendl;
9f95a23c 5736 osdmap_subscribe(get_osdmap_epoch() + 1, false);
7c673cae
FG
5737 }
5738 }
5739
5740 dout(30) << "heartbeat done" << dendl;
5741}
5742
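// A heartbeat connection was reset: if it still belongs to a known peer,
// reopen the back/front connections (re-attaching the existing Session) and
// forget the outstanding ping history; if the peer has meanwhile vanished
// from the osdmap, drop it instead.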
5743bool OSD::heartbeat_reset(Connection *con)
5744{
11fdf7f2
TL
5745 std::lock_guard l(heartbeat_lock);
5746 auto s = con->get_priv();
9f95a23c 5747 dout(20) << __func__ << " con " << con << " s " << s.get() << dendl;
11fdf7f2 5748 con->set_priv(nullptr);
7c673cae 5749 if (s) {
7c673cae 5750 if (is_stopping()) {
7c673cae
FG
5751 return true;
5752 }
9f95a23c
TL
5753 auto session = static_cast<Session*>(s.get());
5754 auto p = heartbeat_peers.find(session->peer);
7c673cae
FG
5755 if (p != heartbeat_peers.end() &&
5756 (p->second.con_back == con ||
5757 p->second.con_front == con)) {
5758 dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer
5759 << ", reopening" << dendl;
9f95a23c 5760 p->second.clear_mark_down(con);
7c673cae
FG
5761 pair<ConnectionRef,ConnectionRef> newcon = service.get_con_osd_hb(p->second.peer, p->second.epoch);
5762 if (newcon.first) {
5763 p->second.con_back = newcon.first.get();
11fdf7f2 5764 p->second.con_back->set_priv(s);
7c673cae
FG
5765 if (newcon.second) {
5766 p->second.con_front = newcon.second.get();
11fdf7f2 5767 p->second.con_front->set_priv(s);
7c673cae 5768 }
11fdf7f2 5769 p->second.ping_history.clear();
7c673cae
FG
5770 } else {
5771 dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer
5772 << ", raced with osdmap update, closing out peer" << dendl;
5773 heartbeat_peers.erase(p);
5774 }
5775 } else {
5776 dout(10) << "heartbeat_reset closing (old) failed hb con " << con << dendl;
5777 }
7c673cae
FG
5778 }
5779 return true;
5780}
5781
5782
5783
5784// =========================================
5785
5786void OSD::tick()
5787{
9f95a23c 5788 ceph_assert(ceph_mutex_is_locked(osd_lock));
7c673cae
FG
5789 dout(10) << "tick" << dendl;
5790
9f95a23c
TL
5791 utime_t now = ceph_clock_now();
5792 // throw out any obsolete markdown log
5793 utime_t grace = utime_t(cct->_conf->osd_max_markdown_period, 0);
5794 while (!osd_markdown_log.empty() &&
5795 osd_markdown_log.front() + grace < now)
5796 osd_markdown_log.pop_front();
5797
7c673cae
FG
5798 if (is_active() || is_waiting_for_healthy()) {
5799 maybe_update_heartbeat_peers();
5800 }
5801
5802 if (is_waiting_for_healthy()) {
5803 start_boot();
494da23a
TL
5804 }
5805
5806 if (is_waiting_for_healthy() || is_booting()) {
5807 std::lock_guard l(heartbeat_lock);
494da23a
TL
5808 if (now - last_mon_heartbeat > cct->_conf->osd_mon_heartbeat_interval) {
5809 last_mon_heartbeat = now;
5810 dout(1) << __func__ << " checking mon for new map" << dendl;
9f95a23c 5811 osdmap_subscribe(get_osdmap_epoch() + 1, false);
11fdf7f2 5812 }
7c673cae
FG
5813 }
5814
5815 do_waiters();
5816
9f95a23c
TL
5817 // scrub purged_snaps every deep scrub interval
5818 {
5819 const utime_t last = superblock.last_purged_snaps_scrub;
5820 utime_t next = last;
5821 next += cct->_conf->osd_scrub_min_interval;
5822 std::mt19937 rng;
5823 // use a seed that is stable for each scrub interval, but varies
5824 // by OSD to avoid any herds.
5825 rng.seed(whoami + superblock.last_purged_snaps_scrub.sec());
5826 double r = (rng() % 1024) / 1024.0;  // floating-point division keeps r in [0, 1)
5827 next +=
5828 cct->_conf->osd_scrub_min_interval *
5829 cct->_conf->osd_scrub_interval_randomize_ratio * r;
5830 if (next < ceph_clock_now()) {
5831 dout(20) << __func__ << " last_purged_snaps_scrub " << last
5832 << " next " << next << " ... now" << dendl;
5833 scrub_purged_snaps();
5834 } else {
5835 dout(20) << __func__ << " last_purged_snaps_scrub " << last
5836 << " next " << next << dendl;
5837 }
5838 }
5839
91327a77 5840 tick_timer.add_event_after(get_tick_interval(), new C_Tick(this));
7c673cae
FG
5841}
5842
5843void OSD::tick_without_osd_lock()
5844{
9f95a23c 5845 ceph_assert(ceph_mutex_is_locked(tick_timer_lock));
7c673cae
FG
5846 dout(10) << "tick_without_osd_lock" << dendl;
5847
7c673cae
FG
5848 logger->set(l_osd_cached_crc, buffer::get_cached_crc());
5849 logger->set(l_osd_cached_crc_adjusted, buffer::get_cached_crc_adjusted());
5850 logger->set(l_osd_missed_crc, buffer::get_missed_crc());
11fdf7f2
TL
5851
5852 // refresh osd stats
5853 struct store_statfs_t stbuf;
5854 osd_alert_list_t alerts;
5855 int r = store->statfs(&stbuf, &alerts);
5856 ceph_assert(r == 0);
5857 service.set_statfs(stbuf, alerts);
7c673cae
FG
5858
5859 // osd_lock is not being held, which means the OSD state
5860 // might change when doing the monitor report
5861 if (is_active() || is_waiting_for_healthy()) {
9f95a23c
TL
5862 {
5863 std::lock_guard l{heartbeat_lock};
5864 heartbeat_check();
5865 }
5866 map_lock.lock_shared();
11fdf7f2 5867 std::lock_guard l(mon_report_lock);
7c673cae
FG
5868
5869 // mon report?
7c673cae 5870 utime_t now = ceph_clock_now();
11fdf7f2
TL
5871 if (service.need_fullness_update() ||
5872 now - last_mon_report > cct->_conf->osd_mon_report_interval) {
7c673cae 5873 last_mon_report = now;
7c673cae
FG
5874 send_full_update();
5875 send_failures();
7c673cae 5876 }
9f95a23c 5877 map_lock.unlock_shared();
11fdf7f2
TL
5878
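    // If any shard has work queued that is waiting on a newer map epoch than
    // the one we currently have, ask the mon for more maps so that work can
    // make progress.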
5879 epoch_t max_waiting_epoch = 0;
5880 for (auto s : shards) {
5881 max_waiting_epoch = std::max(max_waiting_epoch,
5882 s->get_max_waiting_epoch());
5883 }
5884 if (max_waiting_epoch > get_osdmap()->get_epoch()) {
5885 dout(20) << __func__ << " max_waiting_epoch " << max_waiting_epoch
5886 << ", requesting new map" << dendl;
5887 osdmap_subscribe(superblock.newest_map + 1, false);
5888 }
7c673cae
FG
5889 }
5890
5891 if (is_active()) {
5892 if (!scrub_random_backoff()) {
5893 sched_scrub();
5894 }
5895 service.promote_throttle_recalibrate();
3efd9988 5896 resume_creating_pg();
224ce89b
WB
5897 bool need_send_beacon = false;
5898 const auto now = ceph::coarse_mono_clock::now();
5899 {
5900 // borrow lec lock to protect last_sent_beacon from changing
11fdf7f2 5901 std::lock_guard l{min_last_epoch_clean_lock};
224ce89b
WB
5902 const auto elapsed = now - last_sent_beacon;
5903 if (chrono::duration_cast<chrono::seconds>(elapsed).count() >
5904 cct->_conf->osd_beacon_report_interval) {
5905 need_send_beacon = true;
5906 }
5907 }
5908 if (need_send_beacon) {
5909 send_beacon(now);
5910 }
7c673cae
FG
5911 }
5912
11fdf7f2 5913 mgrc.update_daemon_health(get_health_metrics());
7c673cae 5914 service.kick_recovery_queue();
91327a77
AA
5915 tick_timer_without_osd_lock.add_event_after(get_tick_interval(),
5916 new C_Tick_WithoutOSDLock(this));
7c673cae
FG
5917}
5918
7c673cae
FG
5919// Usage:
5920// setomapval <pool-id> [namespace/]<obj-name> <key> <val>
5921// rmomapkey <pool-id> [namespace/]<obj-name> <key>
5922// setomapheader <pool-id> [namespace/]<obj-name> <header>
5923// getomap <pool> [namespace/]<obj-name>
5924// truncobj <pool-id> [namespace/]<obj-name> <newlen>
5925// injectmdataerr [namespace/]<obj-name> [shardid]
5926// injectdataerr [namespace/]<obj-name> [shardid]
5927//
5928// set_recovery_delay [utime]
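// These commands are reached through the OSD admin socket (e.g. something
// like `ceph daemon osd.<id> setomapval <pool> <obj> <key> <val>`) and exist
// purely for test and failure-injection use.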
5929void TestOpsSocketHook::test_ops(OSDService *service, ObjectStore *store,
11fdf7f2
TL
5930 std::string_view command,
5931 const cmdmap_t& cmdmap, ostream &ss)
7c673cae
FG
5932{
5933 //Test support
5934 //Support changing the omap on a single osd by using the Admin Socket to
5935 //directly request the osd make a change.
5936 if (command == "setomapval" || command == "rmomapkey" ||
5937 command == "setomapheader" || command == "getomap" ||
5938 command == "truncobj" || command == "injectmdataerr" ||
5939 command == "injectdataerr"
5940 ) {
5941 pg_t rawpg;
5942 int64_t pool;
5943 OSDMapRef curmap = service->get_osdmap();
5944 int r = -1;
5945
5946 string poolstr;
5947
9f95a23c 5948 cmd_getval(cmdmap, "pool", poolstr);
7c673cae
FG
5949 pool = curmap->lookup_pg_pool_name(poolstr);
5950 //If we can't find it by name then maybe id specified
5951 if (pool < 0 && isdigit(poolstr[0]))
5952 pool = atoll(poolstr.c_str());
5953 if (pool < 0) {
b5b8bbf5 5954 ss << "Invalid pool '" << poolstr << "'";
7c673cae
FG
5955 return;
5956 }
5957
5958 string objname, nspace;
9f95a23c 5959 cmd_getval(cmdmap, "objname", objname);
7c673cae
FG
5960 std::size_t found = objname.find_first_of('/');
5961 if (found != string::npos) {
5962 nspace = objname.substr(0, found);
5963 objname = objname.substr(found+1);
5964 }
5965 object_locator_t oloc(pool, nspace);
5966 r = curmap->object_locator_to_pg(object_t(objname), oloc, rawpg);
5967
5968 if (r < 0) {
5969 ss << "Invalid namespace/objname";
5970 return;
5971 }
5972
5973 int64_t shardid;
9f95a23c 5974 cmd_getval(cmdmap, "shardid", shardid, int64_t(shard_id_t::NO_SHARD));
7c673cae
FG
5975 hobject_t obj(object_t(objname), string(""), CEPH_NOSNAP, rawpg.ps(), pool, nspace);
5976 ghobject_t gobj(obj, ghobject_t::NO_GEN, shard_id_t(uint8_t(shardid)));
5977 spg_t pgid(curmap->raw_pg_to_pg(rawpg), shard_id_t(shardid));
5978 if (curmap->pg_is_ec(rawpg)) {
5979 if ((command != "injectdataerr") && (command != "injectmdataerr")) {
5980 ss << "Must not call on ec pool, except injectdataerr or injectmdataerr";
5981 return;
5982 }
5983 }
5984
5985 ObjectStore::Transaction t;
5986
5987 if (command == "setomapval") {
5988 map<string, bufferlist> newattrs;
5989 bufferlist val;
5990 string key, valstr;
9f95a23c
TL
5991 cmd_getval(cmdmap, "key", key);
5992 cmd_getval(cmdmap, "val", valstr);
7c673cae
FG
5993
5994 val.append(valstr);
5995 newattrs[key] = val;
5996 t.omap_setkeys(coll_t(pgid), ghobject_t(obj), newattrs);
11fdf7f2 5997 r = store->queue_transaction(service->meta_ch, std::move(t));
7c673cae
FG
5998 if (r < 0)
5999 ss << "error=" << r;
6000 else
6001 ss << "ok";
6002 } else if (command == "rmomapkey") {
6003 string key;
9f95a23c 6004 cmd_getval(cmdmap, "key", key);
7c673cae 6005
9f95a23c 6006 t.omap_rmkey(coll_t(pgid), ghobject_t(obj), key);
11fdf7f2 6007 r = store->queue_transaction(service->meta_ch, std::move(t));
7c673cae
FG
6008 if (r < 0)
6009 ss << "error=" << r;
6010 else
6011 ss << "ok";
6012 } else if (command == "setomapheader") {
6013 bufferlist newheader;
6014 string headerstr;
6015
9f95a23c 6016 cmd_getval(cmdmap, "header", headerstr);
7c673cae
FG
6017 newheader.append(headerstr);
6018 t.omap_setheader(coll_t(pgid), ghobject_t(obj), newheader);
11fdf7f2 6019 r = store->queue_transaction(service->meta_ch, std::move(t));
7c673cae
FG
6020 if (r < 0)
6021 ss << "error=" << r;
6022 else
6023 ss << "ok";
6024 } else if (command == "getomap") {
6025 //Debug: Output entire omap
6026 bufferlist hdrbl;
6027 map<string, bufferlist> keyvals;
11fdf7f2
TL
6028 auto ch = store->open_collection(coll_t(pgid));
6029 if (!ch) {
6030 ss << "unable to open collection for " << pgid;
6031 r = -ENOENT;
6032 } else {
6033 r = store->omap_get(ch, ghobject_t(obj), &hdrbl, &keyvals);
6034 if (r >= 0) {
7c673cae
FG
6035 ss << "header=" << string(hdrbl.c_str(), hdrbl.length());
6036 for (map<string, bufferlist>::iterator it = keyvals.begin();
11fdf7f2 6037 it != keyvals.end(); ++it)
7c673cae
FG
6038 ss << " key=" << (*it).first << " val="
6039 << string((*it).second.c_str(), (*it).second.length());
11fdf7f2 6040 } else {
7c673cae 6041 ss << "error=" << r;
11fdf7f2 6042 }
7c673cae
FG
6043 }
6044 } else if (command == "truncobj") {
6045 int64_t trunclen;
9f95a23c 6046 cmd_getval(cmdmap, "len", trunclen);
7c673cae 6047 t.truncate(coll_t(pgid), ghobject_t(obj), trunclen);
11fdf7f2 6048 r = store->queue_transaction(service->meta_ch, std::move(t));
7c673cae
FG
6049 if (r < 0)
6050 ss << "error=" << r;
6051 else
6052 ss << "ok";
6053 } else if (command == "injectdataerr") {
6054 store->inject_data_error(gobj);
6055 ss << "ok";
6056 } else if (command == "injectmdataerr") {
6057 store->inject_mdata_error(gobj);
6058 ss << "ok";
6059 }
6060 return;
6061 }
6062 if (command == "set_recovery_delay") {
6063 int64_t delay;
9f95a23c 6064 cmd_getval(cmdmap, "utime", delay, (int64_t)0);
7c673cae
FG
6065 ostringstream oss;
6066 oss << delay;
11fdf7f2 6067 int r = service->cct->_conf.set_val("osd_recovery_delay_start",
7c673cae
FG
6068 oss.str().c_str());
6069 if (r != 0) {
6070 ss << "set_recovery_delay: error setting "
6071 << "osd_recovery_delay_start to '" << delay << "': error "
6072 << r;
6073 return;
6074 }
11fdf7f2 6075 service->cct->_conf.apply_changes(nullptr);
7c673cae
FG
6076 ss << "set_recovery_delay: set osd_recovery_delay_start "
6077 << "to " << service->cct->_conf->osd_recovery_delay_start;
6078 return;
6079 }
7c673cae
FG
6080 if (command == "injectfull") {
6081 int64_t count;
6082 string type;
6083 OSDService::s_names state;
9f95a23c
TL
6084 cmd_getval(cmdmap, "type", type, string("full"));
6085 cmd_getval(cmdmap, "count", count, (int64_t)-1);
7c673cae
FG
6086 if (type == "none" || count == 0) {
6087 type = "none";
6088 count = 0;
6089 }
6090 state = service->get_full_state(type);
6091 if (state == OSDService::s_names::INVALID) {
6092 ss << "Invalid type use (none, nearfull, backfillfull, full, failsafe)";
6093 return;
6094 }
6095 service->set_injectfull(state, count);
6096 return;
6097 }
6098 ss << "Internal error - command=" << command;
6099}
6100
7c673cae
FG
6101// =========================================
6102
6103void OSD::ms_handle_connect(Connection *con)
6104{
6105 dout(10) << __func__ << " con " << con << dendl;
6106 if (con->get_peer_type() == CEPH_ENTITY_TYPE_MON) {
11fdf7f2 6107 std::lock_guard l(osd_lock);
7c673cae
FG
6108 if (is_stopping())
6109 return;
6110 dout(10) << __func__ << " on mon" << dendl;
6111
6112 if (is_preboot()) {
6113 start_boot();
6114 } else if (is_booting()) {
6115 _send_boot(); // resend boot message
6116 } else {
9f95a23c 6117 map_lock.lock_shared();
11fdf7f2 6118 std::lock_guard l2(mon_report_lock);
7c673cae
FG
6119
6120 utime_t now = ceph_clock_now();
6121 last_mon_report = now;
6122
6123 // resend everything, it's a new session
6124 send_full_update();
6125 send_alive();
6126 service.requeue_pg_temp();
11fdf7f2 6127 service.clear_sent_ready_to_merge();
7c673cae 6128 service.send_pg_temp();
11fdf7f2
TL
6129 service.send_ready_to_merge();
6130 service.send_pg_created();
7c673cae
FG
6131 requeue_failures();
6132 send_failures();
7c673cae 6133
9f95a23c 6134 map_lock.unlock_shared();
7c673cae
FG
6135 if (is_active()) {
6136 send_beacon(ceph::coarse_mono_clock::now());
6137 }
6138 }
6139
6140 // full map requests may happen while active or pre-boot
6141 if (requested_full_first) {
6142 rerequest_full_maps();
6143 }
6144 }
6145}
6146
6147void OSD::ms_handle_fast_connect(Connection *con)
6148{
6149 if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON &&
6150 con->get_peer_type() != CEPH_ENTITY_TYPE_MGR) {
9f95a23c
TL
6151 if (auto s = ceph::ref_cast<Session>(con->get_priv()); !s) {
6152 s = ceph::make_ref<Session>(cct, con);
6153 con->set_priv(s);
7c673cae
FG
6154 dout(10) << " new session (outgoing) " << s << " con=" << s->con
6155 << " addr=" << s->con->get_peer_addr() << dendl;
6156 // we don't connect to clients
11fdf7f2 6157 ceph_assert(con->get_peer_type() == CEPH_ENTITY_TYPE_OSD);
7c673cae
FG
6158 s->entity_name.set_type(CEPH_ENTITY_TYPE_OSD);
6159 }
7c673cae
FG
6160 }
6161}
6162
6163void OSD::ms_handle_fast_accept(Connection *con)
6164{
6165 if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON &&
6166 con->get_peer_type() != CEPH_ENTITY_TYPE_MGR) {
9f95a23c
TL
6167 if (auto s = ceph::ref_cast<Session>(con->get_priv()); !s) {
6168 s = ceph::make_ref<Session>(cct, con);
6169 con->set_priv(s);
7c673cae
FG
6170 dout(10) << "new session (incoming)" << s << " con=" << con
6171 << " addr=" << con->get_peer_addr()
6172 << " must have raced with connect" << dendl;
11fdf7f2 6173 ceph_assert(con->get_peer_type() == CEPH_ENTITY_TYPE_OSD);
7c673cae
FG
6174 s->entity_name.set_type(CEPH_ENTITY_TYPE_OSD);
6175 }
7c673cae
FG
6176 }
6177}
6178
6179bool OSD::ms_handle_reset(Connection *con)
6180{
9f95a23c
TL
6181 auto session = ceph::ref_cast<Session>(con->get_priv());
6182 dout(2) << "ms_handle_reset con " << con << " session " << session.get() << dendl;
7c673cae
FG
6183 if (!session)
6184 return false;
6185 session->wstate.reset(con);
11fdf7f2
TL
6186 session->con->set_priv(nullptr);
6187 session->con.reset(); // break con <-> session ref cycle
7c673cae
FG
6188 // note that we break session->con *before* the session_handle_reset
6189 // cleanup below. this avoids a race between us and
6190 // PG::add_backoff, Session::check_backoff, etc.
9f95a23c 6191 session_handle_reset(session);
7c673cae
FG
6192 return true;
6193}
6194
6195bool OSD::ms_handle_refused(Connection *con)
6196{
6197 if (!cct->_conf->osd_fast_fail_on_connection_refused)
6198 return false;
6199
9f95a23c
TL
6200 auto session = ceph::ref_cast<Session>(con->get_priv());
6201 dout(2) << "ms_handle_refused con " << con << " session " << session.get() << dendl;
7c673cae
FG
6202 if (!session)
6203 return false;
6204 int type = con->get_peer_type();
6205 // handle only OSD failures here
6206 if (monc && (type == CEPH_ENTITY_TYPE_OSD)) {
6207 OSDMapRef osdmap = get_osdmap();
6208 if (osdmap) {
6209 int id = osdmap->identify_osd_on_all_channels(con->get_peer_addr());
6210 if (id >= 0 && osdmap->is_up(id)) {
6211 // We are cheating the mon heartbeat grace logic here, because we know the
6212 // peer is not going to respawn on its own. +1 so we won't hit any boundary case.
11fdf7f2
TL
6213 monc->send_mon_message(
6214 new MOSDFailure(
6215 monc->get_fsid(),
6216 id,
6217 osdmap->get_addrs(id),
6218 cct->_conf->osd_heartbeat_grace + 1,
6219 osdmap->get_epoch(),
6220 MOSDFailure::FLAG_IMMEDIATE | MOSDFailure::FLAG_FAILED
6221 ));
7c673cae
FG
6222 }
6223 }
6224 }
7c673cae
FG
6225 return true;
6226}
6227
6228struct C_OSD_GetVersion : public Context {
6229 OSD *osd;
6230 uint64_t oldest, newest;
6231 explicit C_OSD_GetVersion(OSD *o) : osd(o), oldest(0), newest(0) {}
6232 void finish(int r) override {
6233 if (r >= 0)
6234 osd->_got_mon_epochs(oldest, newest);
6235 }
6236};
6237
6238void OSD::start_boot()
6239{
6240 if (!_is_healthy()) {
6241 // if we are not healthy, do not mark ourselves up (yet)
6242 dout(1) << "not healthy; waiting to boot" << dendl;
6243 if (!is_waiting_for_healthy())
6244 start_waiting_for_healthy();
6245 // send pings sooner rather than later
6246 heartbeat_kick();
6247 return;
6248 }
6249 dout(1) << __func__ << dendl;
6250 set_state(STATE_PREBOOT);
6251 dout(10) << "start_boot - have maps " << superblock.oldest_map
6252 << ".." << superblock.newest_map << dendl;
6253 C_OSD_GetVersion *c = new C_OSD_GetVersion(this);
6254 monc->get_version("osdmap", &c->newest, &c->oldest, c);
6255}
6256
6257void OSD::_got_mon_epochs(epoch_t oldest, epoch_t newest)
6258{
11fdf7f2 6259 std::lock_guard l(osd_lock);
7c673cae
FG
6260 if (is_preboot()) {
6261 _preboot(oldest, newest);
6262 }
6263}
6264
6265void OSD::_preboot(epoch_t oldest, epoch_t newest)
6266{
11fdf7f2 6267 ceph_assert(is_preboot());
7c673cae
FG
6268 dout(10) << __func__ << " _preboot mon has osdmaps "
6269 << oldest << ".." << newest << dendl;
6270
6271 // ensure our local fullness awareness is accurate
81eedcae
TL
6272 {
6273 std::lock_guard l(heartbeat_lock);
6274 heartbeat();
6275 }
7c673cae 6276
9f95a23c
TL
6277 const auto& monmap = monc->monmap;
6278 const auto osdmap = get_osdmap();
7c673cae 6279 // if our map is within recent history, try to add ourselves to the osdmap.
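  // Each branch below delays booting until the cluster state allows it: no
  // osdmap yet, we are marked destroyed or noup, required flags or release
  // are missing, our fullness state is stale, or purged-snaps records still
  // need to be fetched.  Only when our local map is close enough to the
  // mon's newest do we queue _send_boot(); otherwise we fall through and
  // subscribe for more maps.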
31f18b77
FG
6280 if (osdmap->get_epoch() == 0) {
6281 derr << "waiting for initial osdmap" << dendl;
c07f9fc5 6282 } else if (osdmap->is_destroyed(whoami)) {
b32b8144
FG
6283 derr << "osdmap says I am destroyed" << dendl;
6284 // provide a small margin so we don't livelock seeing if we
6285 // un-destroyed ourselves.
6286 if (osdmap->get_epoch() > newest - 1) {
6287 exit(0);
6288 }
81eedcae 6289 } else if (osdmap->is_noup(whoami)) {
7c673cae
FG
6290 derr << "osdmap NOUP flag is set, waiting for it to clear" << dendl;
6291 } else if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE)) {
6292 derr << "osdmap SORTBITWISE OSDMap flag is NOT set; please set it"
6293 << dendl;
9f95a23c 6294 } else if (osdmap->require_osd_release < ceph_release_t::luminous) {
11fdf7f2 6295 derr << "osdmap require_osd_release < luminous; please upgrade to luminous"
7c673cae 6296 << dendl;
7c673cae
FG
6297 } else if (service.need_fullness_update()) {
6298 derr << "osdmap fullness state needs update" << dendl;
6299 send_full_update();
9f95a23c
TL
6300 } else if (monmap.min_mon_release >= ceph_release_t::octopus &&
6301 superblock.purged_snaps_last < superblock.current_epoch) {
6302 dout(10) << __func__ << " purged_snaps_last " << superblock.purged_snaps_last
6303 << " < newest_map " << superblock.current_epoch << dendl;
6304 _get_purged_snaps();
7c673cae
FG
6305 } else if (osdmap->get_epoch() >= oldest - 1 &&
6306 osdmap->get_epoch() + cct->_conf->osd_map_message_max > newest) {
11fdf7f2
TL
6307
6308 // wait for pgs to fully catch up in a different thread, since
6309 // this thread might be required for splitting and merging PGs to
6310 // make progress.
6311 boot_finisher.queue(
9f95a23c 6312 new LambdaContext(
11fdf7f2 6313 [this](int r) {
9f95a23c 6314 std::unique_lock l(osd_lock);
11fdf7f2
TL
6315 if (is_preboot()) {
6316 dout(10) << __func__ << " waiting for peering work to drain"
6317 << dendl;
9f95a23c 6318 l.unlock();
11fdf7f2 6319 for (auto shard : shards) {
9f95a23c 6320 shard->wait_min_pg_epoch(get_osdmap_epoch());
11fdf7f2 6321 }
9f95a23c 6322 l.lock();
11fdf7f2
TL
6323 }
6324 if (is_preboot()) {
6325 _send_boot();
6326 }
6327 }));
6328 return;
7c673cae
FG
6329 }
6330
6331 // get all the latest maps
6332 if (osdmap->get_epoch() + 1 >= oldest)
6333 osdmap_subscribe(osdmap->get_epoch() + 1, false);
6334 else
6335 osdmap_subscribe(oldest - 1, true);
6336}
6337
9f95a23c
TL
6338void OSD::_get_purged_snaps()
6339{
6340 // NOTE: this is a naive, stateless implementation. It may send multiple
6341 // overlapping requests to the mon, which will be somewhat inefficient, but
6342 // it should be reliable.
6343 dout(10) << __func__ << " purged_snaps_last " << superblock.purged_snaps_last
6344 << ", newest_map " << superblock.current_epoch << dendl;
6345 MMonGetPurgedSnaps *m = new MMonGetPurgedSnaps(
6346 superblock.purged_snaps_last + 1,
6347 superblock.current_epoch + 1);
6348 monc->send_mon_message(m);
6349}
6350
6351void OSD::handle_get_purged_snaps_reply(MMonGetPurgedSnapsReply *m)
6352{
6353 dout(10) << __func__ << " " << *m << dendl;
6354 ObjectStore::Transaction t;
6355 if (!is_preboot() ||
6356 m->last < superblock.purged_snaps_last) {
6357 goto out;
6358 }
6359 SnapMapper::record_purged_snaps(cct, store, service.meta_ch,
6360 make_purged_snaps_oid(), &t,
6361 m->purged_snaps);
6362 superblock.purged_snaps_last = m->last;
6363 write_superblock(t);
6364 store->queue_transaction(
6365 service.meta_ch,
6366 std::move(t));
6367 service.publish_superblock(superblock);
6368 if (m->last < superblock.current_epoch) {
6369 _get_purged_snaps();
6370 } else {
6371 start_boot();
6372 }
6373out:
6374 m->put();
6375}
6376
7c673cae
FG
6377void OSD::send_full_update()
6378{
6379 if (!service.need_fullness_update())
6380 return;
6381 unsigned state = 0;
6382 if (service.is_full()) {
6383 state = CEPH_OSD_FULL;
6384 } else if (service.is_backfillfull()) {
6385 state = CEPH_OSD_BACKFILLFULL;
6386 } else if (service.is_nearfull()) {
6387 state = CEPH_OSD_NEARFULL;
6388 }
6389 set<string> s;
6390 OSDMap::calc_state_set(state, s);
6391 dout(10) << __func__ << " want state " << s << dendl;
9f95a23c 6392 monc->send_mon_message(new MOSDFull(get_osdmap_epoch(), state));
7c673cae
FG
6393}
6394
6395void OSD::start_waiting_for_healthy()
6396{
6397 dout(1) << "start_waiting_for_healthy" << dendl;
6398 set_state(STATE_WAITING_FOR_HEALTHY);
6399 last_heartbeat_resample = utime_t();
181888fb
FG
6400
6401 // subscribe to osdmap updates, in case our peers really are known to be dead
9f95a23c 6402 osdmap_subscribe(get_osdmap_epoch() + 1, false);
7c673cae
FG
6403}
6404
6405bool OSD::_is_healthy()
6406{
6407 if (!cct->get_heartbeat_map()->is_healthy()) {
6408 dout(1) << "is_healthy false -- internal heartbeat failed" << dendl;
6409 return false;
6410 }
6411
6412 if (is_waiting_for_healthy()) {
11fdf7f2 6413 utime_t now = ceph_clock_now();
9f95a23c
TL
6414 if (osd_markdown_log.empty()) {
6415 dout(5) << __func__ << " force returning true since last markdown"
6416 << " was " << cct->_conf->osd_max_markdown_period
6417 << "s ago" << dendl;
11fdf7f2
TL
6418 return true;
6419 }
6420 std::lock_guard l(heartbeat_lock);
7c673cae
FG
6421 int num = 0, up = 0;
6422 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
6423 p != heartbeat_peers.end();
6424 ++p) {
11fdf7f2 6425 if (p->second.is_healthy(now))
7c673cae
FG
6426 ++up;
6427 ++num;
6428 }
6429 if ((float)up < (float)num * cct->_conf->osd_heartbeat_min_healthy_ratio) {
6430 dout(1) << "is_healthy false -- only " << up << "/" << num << " up peers (less than "
6431 << int(cct->_conf->osd_heartbeat_min_healthy_ratio * 100.0) << "%)" << dendl;
6432 return false;
6433 }
6434 }
6435
6436 return true;
6437}
6438
6439void OSD::_send_boot()
6440{
6441 dout(10) << "_send_boot" << dendl;
11fdf7f2
TL
6442 Connection *local_connection =
6443 cluster_messenger->get_loopback_connection().get();
6444 entity_addrvec_t client_addrs = client_messenger->get_myaddrs();
6445 entity_addrvec_t cluster_addrs = cluster_messenger->get_myaddrs();
6446 entity_addrvec_t hb_back_addrs = hb_back_server_messenger->get_myaddrs();
6447 entity_addrvec_t hb_front_addrs = hb_front_server_messenger->get_myaddrs();
6448
6449 dout(20) << " initial client_addrs " << client_addrs
6450 << ", cluster_addrs " << cluster_addrs
6451 << ", hb_back_addrs " << hb_back_addrs
6452 << ", hb_front_addrs " << hb_front_addrs
6453 << dendl;
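  // If a messenger has not learned its own addresses yet, fill in the
  // unknown pieces from addresses we do know (cluster from client, hb_back
  // from cluster, hb_front from client), and poke the loopback connection so
  // a Session exists for it before we advertise these addresses to the mon.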
6454 if (cluster_messenger->set_addr_unknowns(client_addrs)) {
6455 dout(10) << " assuming cluster_addrs match client_addrs "
6456 << client_addrs << dendl;
6457 cluster_addrs = cluster_messenger->get_myaddrs();
6458 }
6459 if (auto session = local_connection->get_priv(); !session) {
6460 cluster_messenger->ms_deliver_handle_fast_connect(local_connection);
7c673cae
FG
6461 }
6462
7c673cae 6463 local_connection = hb_back_server_messenger->get_loopback_connection().get();
11fdf7f2
TL
6464 if (hb_back_server_messenger->set_addr_unknowns(cluster_addrs)) {
6465 dout(10) << " assuming hb_back_addrs match cluster_addrs "
6466 << cluster_addrs << dendl;
6467 hb_back_addrs = hb_back_server_messenger->get_myaddrs();
7c673cae 6468 }
11fdf7f2
TL
6469 if (auto session = local_connection->get_priv(); !session) {
6470 hb_back_server_messenger->ms_deliver_handle_fast_connect(local_connection);
7c673cae
FG
6471 }
6472
11fdf7f2
TL
6473 local_connection = hb_front_server_messenger->get_loopback_connection().get();
6474 if (hb_front_server_messenger->set_addr_unknowns(client_addrs)) {
6475 dout(10) << " assuming hb_front_addrs match client_addrs "
6476 << client_addrs << dendl;
6477 hb_front_addrs = hb_front_server_messenger->get_myaddrs();
6478 }
6479 if (auto session = local_connection->get_priv(); !session) {
6480 hb_front_server_messenger->ms_deliver_handle_fast_connect(local_connection);
6481 }
6482
6483 // we now know what our front and back addrs will be, and we are
6484 // about to tell the mon what our metadata (including numa bindings)
6485 // are, so now is a good time!
6486 set_numa_affinity();
6487
6488 MOSDBoot *mboot = new MOSDBoot(
6489 superblock, get_osdmap_epoch(), service.get_boot_epoch(),
6490 hb_back_addrs, hb_front_addrs, cluster_addrs,
6491 CEPH_FEATURES_ALL);
6492 dout(10) << " final client_addrs " << client_addrs
6493 << ", cluster_addrs " << cluster_addrs
6494 << ", hb_back_addrs " << hb_back_addrs
6495 << ", hb_front_addrs " << hb_front_addrs
7c673cae
FG
6496 << dendl;
6497 _collect_metadata(&mboot->metadata);
6498 monc->send_mon_message(mboot);
6499 set_state(STATE_BOOTING);
6500}
6501
6502void OSD::_collect_metadata(map<string,string> *pm)
6503{
6504 // config info
6505 (*pm)["osd_data"] = dev_path;
c07f9fc5
FG
6506 if (store->get_type() == "filestore") {
6507 // not applicable for bluestore
6508 (*pm)["osd_journal"] = journal_path;
6509 }
11fdf7f2
TL
6510 (*pm)["front_addr"] = stringify(client_messenger->get_myaddrs());
6511 (*pm)["back_addr"] = stringify(cluster_messenger->get_myaddrs());
6512 (*pm)["hb_front_addr"] = stringify(hb_front_server_messenger->get_myaddrs());
6513 (*pm)["hb_back_addr"] = stringify(hb_back_server_messenger->get_myaddrs());
7c673cae
FG
6514
6515 // backend
6516 (*pm)["osd_objectstore"] = store->get_type();
31f18b77 6517 (*pm)["rotational"] = store_is_rotational ? "1" : "0";
d2e6a577 6518 (*pm)["journal_rotational"] = journal_is_rotational ? "1" : "0";
c07f9fc5 6519 (*pm)["default_device_class"] = store->get_default_device_class();
f6b5b4d7
TL
6520 string osdspec_affinity;
6521 int r = store->read_meta("osdspec_affinity", &osdspec_affinity);
6522 if (r < 0 || osdspec_affinity.empty()) {
6523 osdspec_affinity = "";
6524 }
6525 (*pm)["osdspec_affinity"] = osdspec_affinity;
7c673cae
FG
6526 store->collect_metadata(pm);
6527
6528 collect_sys_info(pm, cct);
6529
11fdf7f2
TL
6530 (*pm)["front_iface"] = pick_iface(
6531 cct,
6532 client_messenger->get_myaddrs().front().get_sockaddr_storage());
6533 (*pm)["back_iface"] = pick_iface(
6534 cct,
6535 cluster_messenger->get_myaddrs().front().get_sockaddr_storage());
6536
6537 // network numa
6538 {
6539 int node = -1;
6540 set<int> nodes;
6541 set<string> unknown;
6542 for (auto nm : { "front_iface", "back_iface" }) {
6543 if (!(*pm)[nm].size()) {
6544 unknown.insert(nm);
6545 continue;
6546 }
6547 int n = -1;
6548 int r = get_iface_numa_node((*pm)[nm], &n);
6549 if (r < 0) {
6550 unknown.insert((*pm)[nm]);
6551 continue;
6552 }
6553 nodes.insert(n);
6554 if (node < 0) {
6555 node = n;
6556 }
6557 }
6558 if (unknown.size()) {
6559 (*pm)["network_numa_unknown_ifaces"] = stringify(unknown);
6560 }
6561 if (!nodes.empty()) {
6562 (*pm)["network_numa_nodes"] = stringify(nodes);
6563 }
6564 if (node >= 0 && nodes.size() == 1 && unknown.empty()) {
6565 (*pm)["network_numa_node"] = stringify(node);
6566 }
6567 }
6568
6569 if (numa_node >= 0) {
6570 (*pm)["numa_node"] = stringify(numa_node);
6571 (*pm)["numa_node_cpus"] = cpu_set_to_str_list(numa_cpu_set_size,
6572 &numa_cpu_set);
6573 }
6574
6575 set<string> devnames;
6576 store->get_devices(&devnames);
9f95a23c
TL
6577 map<string,string> errs;
6578 get_device_metadata(devnames, pm, &errs);
6579 for (auto& i : errs) {
6580 dout(1) << __func__ << " " << i.first << ": " << i.second << dendl;
11fdf7f2 6581 }
7c673cae
FG
6582 dout(10) << __func__ << " " << *pm << dendl;
6583}
6584
6585void OSD::queue_want_up_thru(epoch_t want)
6586{
9f95a23c
TL
6587 std::shared_lock map_locker{map_lock};
6588 epoch_t cur = get_osdmap()->get_up_thru(whoami);
6589 std::lock_guard report_locker(mon_report_lock);
7c673cae
FG
6590 if (want > up_thru_wanted) {
6591 dout(10) << "queue_want_up_thru now " << want << " (was " << up_thru_wanted << ")"
6592 << ", currently " << cur
6593 << dendl;
6594 up_thru_wanted = want;
6595 send_alive();
6596 } else {
6597 dout(10) << "queue_want_up_thru want " << want << " <= queued " << up_thru_wanted
6598 << ", currently " << cur
6599 << dendl;
6600 }
7c673cae
FG
6601}
6602
6603void OSD::send_alive()
6604{
9f95a23c
TL
6605 ceph_assert(ceph_mutex_is_locked(mon_report_lock));
6606 const auto osdmap = get_osdmap();
7c673cae
FG
6607 if (!osdmap->exists(whoami))
6608 return;
6609 epoch_t up_thru = osdmap->get_up_thru(whoami);
6610 dout(10) << "send_alive up_thru currently " << up_thru << " want " << up_thru_wanted << dendl;
6611 if (up_thru_wanted > up_thru) {
6612 dout(10) << "send_alive want " << up_thru_wanted << dendl;
6613 monc->send_mon_message(new MOSDAlive(osdmap->get_epoch(), up_thru_wanted));
6614 }
6615}
6616
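// [requested_full_first, requested_full_last] tracks the window of full
// osdmaps we have asked the mon for: request_full_map() extends the window,
// got_full_map() advances its lower bound as maps arrive and clears it once
// the last requested epoch has been received.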
6617void OSD::request_full_map(epoch_t first, epoch_t last)
6618{
6619 dout(10) << __func__ << " " << first << ".." << last
6620 << ", previously requested "
6621 << requested_full_first << ".." << requested_full_last << dendl;
9f95a23c 6622 ceph_assert(ceph_mutex_is_locked(osd_lock));
11fdf7f2
TL
6623 ceph_assert(first > 0 && last > 0);
6624 ceph_assert(first <= last);
6625 ceph_assert(first >= requested_full_first); // we shouldn't ever ask for older maps
7c673cae
FG
6626 if (requested_full_first == 0) {
6627 // first request
6628 requested_full_first = first;
6629 requested_full_last = last;
6630 } else if (last <= requested_full_last) {
6631 // dup
6632 return;
6633 } else {
6634 // additional request
6635 first = requested_full_last + 1;
6636 requested_full_last = last;
6637 }
6638 MMonGetOSDMap *req = new MMonGetOSDMap;
6639 req->request_full(first, last);
6640 monc->send_mon_message(req);
6641}
6642
6643void OSD::got_full_map(epoch_t e)
6644{
11fdf7f2 6645 ceph_assert(requested_full_first <= requested_full_last);
9f95a23c 6646 ceph_assert(ceph_mutex_is_locked(osd_lock));
7c673cae
FG
6647 if (requested_full_first == 0) {
6648 dout(20) << __func__ << " " << e << ", nothing requested" << dendl;
6649 return;
6650 }
6651 if (e < requested_full_first) {
6652 dout(10) << __func__ << " " << e << ", requested " << requested_full_first
6653 << ".." << requested_full_last
6654 << ", ignoring" << dendl;
6655 return;
6656 }
6657 if (e >= requested_full_last) {
6658 dout(10) << __func__ << " " << e << ", requested " << requested_full_first
6659 << ".." << requested_full_last << ", resetting" << dendl;
6660 requested_full_first = requested_full_last = 0;
6661 return;
6662 }
6663
6664 requested_full_first = e + 1;
6665
6666 dout(10) << __func__ << " " << e << ", requested " << requested_full_first
6667 << ".." << requested_full_last
6668 << ", still need more" << dendl;
6669}
6670
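// Move any in-flight failure reports from failure_pending back into
// failure_queue so they will be sent to the mon again.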
6671void OSD::requeue_failures()
6672{
11fdf7f2 6673 std::lock_guard l(heartbeat_lock);
7c673cae
FG
6674 unsigned old_queue = failure_queue.size();
6675 unsigned old_pending = failure_pending.size();
11fdf7f2 6676 for (auto p = failure_pending.begin(); p != failure_pending.end(); ) {
7c673cae
FG
6677 failure_queue[p->first] = p->second.first;
6678 failure_pending.erase(p++);
6679 }
6680 dout(10) << __func__ << " " << old_queue << " + " << old_pending << " -> "
6681 << failure_queue.size() << dendl;
6682}
6683
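// Drain failure_queue, sending an MOSDFailure report to the mon for each
// newly failed osd and recording it in failure_pending.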
6684void OSD::send_failures()
6685{
9f95a23c
TL
6686 ceph_assert(ceph_mutex_is_locked(map_lock));
6687 ceph_assert(ceph_mutex_is_locked(mon_report_lock));
11fdf7f2 6688 std::lock_guard l(heartbeat_lock);
7c673cae 6689 utime_t now = ceph_clock_now();
9f95a23c 6690 const auto osdmap = get_osdmap();
7c673cae
FG
6691 while (!failure_queue.empty()) {
6692 int osd = failure_queue.begin()->first;
7c673cae
FG
6693 if (!failure_pending.count(osd)) {
6694 int failed_for = (int)(double)(now - failure_queue.begin()->second);
11fdf7f2
TL
6695 monc->send_mon_message(
6696 new MOSDFailure(
6697 monc->get_fsid(),
6698 osd,
6699 osdmap->get_addrs(osd),
6700 failed_for,
6701 osdmap->get_epoch()));
6702 failure_pending[osd] = make_pair(failure_queue.begin()->second,
6703 osdmap->get_addrs(osd));
7c673cae
FG
6704 }
6705 failure_queue.erase(osd);
6706 }
6707}
6708
11fdf7f2 6709void OSD::send_still_alive(epoch_t epoch, int osd, const entity_addrvec_t &addrs)
7c673cae 6710{
11fdf7f2
TL
6711 MOSDFailure *m = new MOSDFailure(monc->get_fsid(), osd, addrs, 0, epoch,
6712 MOSDFailure::FLAG_ALIVE);
7c673cae
FG
6713 monc->send_mon_message(m);
6714}
6715
11fdf7f2 6716void OSD::cancel_pending_failures()
7c673cae 6717{
11fdf7f2
TL
6718 std::lock_guard l(heartbeat_lock);
6719 auto it = failure_pending.begin();
6720 while (it != failure_pending.end()) {
6721 dout(10) << __func__ << " canceling in-flight failure report for osd."
6722 << it->first << dendl;
9f95a23c 6723 send_still_alive(get_osdmap_epoch(), it->first, it->second.second);
11fdf7f2 6724 failure_pending.erase(it++);
7c673cae 6725 }
7c673cae
FG
6726}
6727
6728void OSD::send_beacon(const ceph::coarse_mono_clock::time_point& now)
6729{
6730 const auto& monmap = monc->monmap;
6731 // send a beacon to the mon even if we have only just connected and the
6732 // monmap has not been initialized yet.
6733 if (monmap.epoch > 0 &&
6734 monmap.get_required_features().contains_all(
6735 ceph::features::mon::FEATURE_LUMINOUS)) {
6736 dout(20) << __func__ << " sending" << dendl;
7c673cae
FG
6737 MOSDBeacon* beacon = nullptr;
6738 {
11fdf7f2 6739 std::lock_guard l{min_last_epoch_clean_lock};
9f95a23c
TL
6740 beacon = new MOSDBeacon(get_osdmap_epoch(),
6741 min_last_epoch_clean,
6742 superblock.last_purged_snaps_scrub);
494da23a 6743 beacon->pgs = min_last_epoch_clean_pgs;
224ce89b 6744 last_sent_beacon = now;
7c673cae
FG
6745 }
6746 monc->send_mon_message(beacon);
6747 } else {
6748 dout(20) << __func__ << " not sending" << dendl;
6749 }
6750}
6751
7c673cae
FG
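// Handle a tell/command message: require a session with allow_all caps, then
// hand the command off to the admin socket's command queue.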
6752void OSD::handle_command(MCommand *m)
6753{
6754 ConnectionRef con = m->get_connection();
9f95a23c 6755 auto session = ceph::ref_cast<Session>(con->get_priv());
7c673cae 6756 if (!session) {
9f95a23c 6757 con->send_message(new MCommandReply(m, -EACCES));
7c673cae
FG
6758 m->put();
6759 return;
6760 }
9f95a23c
TL
6761 if (!session->caps.allow_all()) {
6762 con->send_message(new MCommandReply(m, -EACCES));
7c673cae
FG
6763 m->put();
6764 return;
6765 }
9f95a23c 6766 cct->get_admin_socket()->queue_tell_command(m);
7c673cae
FG
6767 m->put();
6768}
6769
f64942e4
AA
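// unlock_guard (below) is the inverse of lock_guard: it unlocks the given
// mutex on construction and re-locks it on destruction.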
6770namespace {
6771 class unlock_guard {
9f95a23c 6772 ceph::mutex& m;
f64942e4 6773 public:
9f95a23c 6774 explicit unlock_guard(ceph::mutex& mutex)
f64942e4
AA
6775 : m(mutex)
6776 {
11fdf7f2 6777 m.unlock();
f64942e4
AA
6778 }
6779 unlock_guard(unlock_guard&) = delete;
6780 ~unlock_guard() {
11fdf7f2 6781 m.lock();
f64942e4
AA
6782 }
6783 };
6784}
6785
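// Run the SnapMapper scrubber over the purged-snaps metadata; any stray
// mappings found are re-queued for snap trimming on their owning PGs, and the
// last scrub stamp is persisted in the superblock.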
9f95a23c 6786void OSD::scrub_purged_snaps()
7c673cae 6787{
9f95a23c
TL
6788 dout(10) << __func__ << dendl;
6789 ceph_assert(ceph_mutex_is_locked(osd_lock));
6790 SnapMapper::Scrubber s(cct, store, service.meta_ch,
6791 make_snapmapper_oid(),
6792 make_purged_snaps_oid());
6793 clog->debug() << "purged_snaps scrub starts";
6794 osd_lock.unlock();
6795 s.run();
6796 if (s.stray.size()) {
6797 clog->debug() << "purged_snaps scrub found " << s.stray.size() << " strays";
6798 } else {
6799 clog->debug() << "purged_snaps scrub ok";
224ce89b 6800 }
9f95a23c
TL
6801 set<pair<spg_t,snapid_t>> queued;
6802 for (auto& [pool, snap, hash, shard] : s.stray) {
6803 const pg_pool_t *pi = get_osdmap()->get_pg_pool(pool);
6804 if (!pi) {
6805 dout(20) << __func__ << " pool " << pool << " dne" << dendl;
6806 continue;
11fdf7f2 6807 }
9f95a23c
TL
6808 pg_t pgid(pi->raw_hash_to_pg(hash), pool);
6809 spg_t spgid(pgid, shard);
6810 pair<spg_t,snapid_t> p(spgid, snap);
6811 if (queued.count(p)) {
6812 dout(20) << __func__ << " pg " << spgid << " snap " << snap
6813 << " already queued" << dendl;
6814 continue;
11fdf7f2 6815 }
9f95a23c
TL
6816 PGRef pg = lookup_lock_pg(spgid);
6817 if (!pg) {
6818 dout(20) << __func__ << " pg " << spgid << " not found" << dendl;
6819 continue;
11fdf7f2 6820 }
9f95a23c
TL
6821 queued.insert(p);
6822 dout(10) << __func__ << " requeue pg " << spgid << " " << pg << " snap "
6823 << snap << dendl;
6824 pg->queue_snap_retrim(snap);
6825 pg->unlock();
7c673cae 6826 }
9f95a23c
TL
6827 osd_lock.lock();
6828 if (is_stopping()) {
6829 return;
6830 }
6831 dout(10) << __func__ << " done queueing pgs, updating superblock" << dendl;
6832 ObjectStore::Transaction t;
6833 superblock.last_purged_snaps_scrub = ceph_clock_now();
6834 write_superblock(t);
6835 int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
6836 ceph_assert(tr == 0);
6837 if (is_active()) {
6838 send_beacon(ceph::coarse_mono_clock::now());
6839 }
6840 dout(10) << __func__ << " done" << dendl;
11fdf7f2
TL
6841}
6842
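// Collect SMART metrics for the physical devices backing the store (or just
// `only_devid` if given) and write the results to `ss` as pretty-printed JSON
// keyed by device id.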
6843void OSD::probe_smart(const string& only_devid, ostream& ss)
6844{
6845 set<string> devnames;
6846 store->get_devices(&devnames);
6847 uint64_t smart_timeout = cct->_conf.get_val<uint64_t>(
6848 "osd_smart_report_timeout");
6849
6850 // == typedef std::map<std::string, mValue> mObject;
6851 json_spirit::mObject json_map;
6852
6853 for (auto dev : devnames) {
6854 // smartctl works only on physical devices; filter out any logical device
6855 if (dev.find("dm-") == 0) {
6856 continue;
6857 }
6858
6859 string err;
6860 string devid = get_device_id(dev, &err);
6861 if (devid.size() == 0) {
6862 dout(10) << __func__ << " no unique id for dev " << dev << " ("
6863 << err << "), skipping" << dendl;
6864 continue;
6865 }
6866 if (only_devid.size() && devid != only_devid) {
6867 continue;
6868 }
6869
6870 json_spirit::mValue smart_json;
6871 if (block_device_get_metrics(dev, smart_timeout,
6872 &smart_json)) {
6873 dout(10) << "block_device_get_metrics failed for /dev/" << dev << dendl;
6874 continue;
6875 }
6876 json_map[devid] = smart_json;
7c673cae 6877 }
11fdf7f2 6878 json_spirit::write(json_map, ss, json_spirit::pretty_print);
7c673cae
FG
6879}
6880
6881bool OSD::heartbeat_dispatch(Message *m)
6882{
6883 dout(30) << "heartbeat_dispatch " << m << dendl;
6884 switch (m->get_type()) {
6885
6886 case CEPH_MSG_PING:
6887 dout(10) << "ping from " << m->get_source_inst() << dendl;
6888 m->put();
6889 break;
6890
6891 case MSG_OSD_PING:
6892 handle_osd_ping(static_cast<MOSDPing*>(m));
6893 break;
6894
6895 default:
6896 dout(0) << "dropping unexpected message " << *m << " from " << m->get_source_inst() << dendl;
6897 m->put();
6898 }
6899
6900 return true;
6901}
6902
6903bool OSD::ms_dispatch(Message *m)
6904{
6905 dout(20) << "OSD::ms_dispatch: " << *m << dendl;
6906 if (m->get_type() == MSG_OSD_MARK_ME_DOWN) {
6907 service.got_stop_ack();
6908 m->put();
6909 return true;
6910 }
6911
6912 // lock!
6913
9f95a23c 6914 osd_lock.lock();
7c673cae 6915 if (is_stopping()) {
9f95a23c 6916 osd_lock.unlock();
7c673cae
FG
6917 m->put();
6918 return true;
6919 }
6920
6921 do_waiters();
6922 _dispatch(m);
6923
9f95a23c 6924 osd_lock.unlock();
7c673cae
FG
6925
6926 return true;
6927}
6928
9f95a23c
TL
6929void OSDService::maybe_share_map(
6930 Connection *con,
6931 const OSDMapRef& osdmap,
6932 epoch_t peer_epoch_lb)
7c673cae 6933{
9f95a23c
TL
6934 // NOTE: we assume the caller holds something that keeps the Connection itself
6935 // pinned (e.g., an OpRequest's MessageRef).
6936 auto session = ceph::ref_cast<Session>(con->get_priv());
6937 if (!session) {
7c673cae
FG
6938 return;
6939 }
7c673cae 6940
9f95a23c
TL
6941 // assume the peer has the newer of the op's sent_epoch and what
6942 // we think we sent them.
7c673cae 6943 session->sent_epoch_lock.lock();
9f95a23c
TL
6944 if (peer_epoch_lb > session->last_sent_epoch) {
6945 dout(10) << __func__ << " con " << con
6946 << " " << con->get_peer_addr()
6947 << " map epoch " << session->last_sent_epoch
6948 << " -> " << peer_epoch_lb << " (as per caller)" << dendl;
6949 session->last_sent_epoch = peer_epoch_lb;
6950 }
6951 epoch_t last_sent_epoch = session->last_sent_epoch;
7c673cae
FG
6952 session->sent_epoch_lock.unlock();
6953
9f95a23c
TL
6954 if (osdmap->get_epoch() <= last_sent_epoch) {
6955 return;
6956 }
11fdf7f2 6957
9f95a23c
TL
6958 send_incremental_map(last_sent_epoch, con, osdmap);
6959 last_sent_epoch = osdmap->get_epoch();
7c673cae
FG
6960
6961 session->sent_epoch_lock.lock();
6962 if (session->last_sent_epoch < last_sent_epoch) {
9f95a23c
TL
6963 dout(10) << __func__ << " con " << con
6964 << " " << con->get_peer_addr()
6965 << " map epoch " << session->last_sent_epoch
6966 << " -> " << last_sent_epoch << " (shared)" << dendl;
7c673cae
FG
6967 session->last_sent_epoch = last_sent_epoch;
6968 }
6969 session->sent_epoch_lock.unlock();
7c673cae
FG
6970}
6971
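// Re-examine the session's waiting_on_map ops and enqueue every op whose
// min_epoch is satisfied by `osdmap`; keep the session registered as waiting
// if anything still needs a newer map.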
9f95a23c 6972void OSD::dispatch_session_waiting(const ceph::ref_t<Session>& session, OSDMapRef osdmap)
7c673cae 6973{
9f95a23c 6974 ceph_assert(ceph_mutex_is_locked(session->session_dispatch_lock));
7c673cae
FG
6975
6976 auto i = session->waiting_on_map.begin();
6977 while (i != session->waiting_on_map.end()) {
6978 OpRequestRef op = &(*i);
11fdf7f2 6979 ceph_assert(ms_can_fast_dispatch(op->get_req()));
9f95a23c 6980 auto m = op->get_req<MOSDFastDispatchOp>();
7c673cae
FG
6981 if (m->get_min_epoch() > osdmap->get_epoch()) {
6982 break;
6983 }
6984 session->waiting_on_map.erase(i++);
6985 op->put();
6986
6987 spg_t pgid;
6988 if (m->get_type() == CEPH_MSG_OSD_OP) {
6989 pg_t actual_pgid = osdmap->raw_pg_to_pg(
6990 static_cast<const MOSDOp*>(m)->get_pg());
6991 if (!osdmap->get_primary_shard(actual_pgid, &pgid)) {
6992 continue;
6993 }
6994 } else {
6995 pgid = m->get_spg();
6996 }
11fdf7f2 6997 enqueue_op(pgid, std::move(op), m->get_map_epoch());
7c673cae
FG
6998 }
6999
7000 if (session->waiting_on_map.empty()) {
7001 clear_session_waiting_on_map(session);
7002 } else {
7003 register_session_waiting_on_map(session);
7004 }
7005}
7006
7007void OSD::ms_fast_dispatch(Message *m)
7008{
11fdf7f2 7009 FUNCTRACE(cct);
7c673cae
FG
7010 if (service.is_stopping()) {
7011 m->put();
7012 return;
7013 }
11fdf7f2
TL
7014
7015 // peering event?
7016 switch (m->get_type()) {
7017 case CEPH_MSG_PING:
7018 dout(10) << "ping from " << m->get_source() << dendl;
7019 m->put();
7020 return;
11fdf7f2
TL
7021 case MSG_OSD_FORCE_RECOVERY:
7022 handle_fast_force_recovery(static_cast<MOSDForceRecovery*>(m));
7023 return;
7024 case MSG_OSD_SCRUB2:
7025 handle_fast_scrub(static_cast<MOSDScrub2*>(m));
7026 return;
7027
7028 case MSG_OSD_PG_CREATE2:
7029 return handle_fast_pg_create(static_cast<MOSDPGCreate2*>(m));
7030 case MSG_OSD_PG_QUERY:
7031 return handle_fast_pg_query(static_cast<MOSDPGQuery*>(m));
7032 case MSG_OSD_PG_NOTIFY:
7033 return handle_fast_pg_notify(static_cast<MOSDPGNotify*>(m));
7034 case MSG_OSD_PG_INFO:
7035 return handle_fast_pg_info(static_cast<MOSDPGInfo*>(m));
7036 case MSG_OSD_PG_REMOVE:
7037 return handle_fast_pg_remove(static_cast<MOSDPGRemove*>(m));
7038
7039 // these are single-pg messages that handle themselves
7040 case MSG_OSD_PG_LOG:
7041 case MSG_OSD_PG_TRIM:
9f95a23c
TL
7042 case MSG_OSD_PG_NOTIFY2:
7043 case MSG_OSD_PG_QUERY2:
7044 case MSG_OSD_PG_INFO2:
11fdf7f2
TL
7045 case MSG_OSD_BACKFILL_RESERVE:
7046 case MSG_OSD_RECOVERY_RESERVE:
9f95a23c
TL
7047 case MSG_OSD_PG_LEASE:
7048 case MSG_OSD_PG_LEASE_ACK:
11fdf7f2
TL
7049 {
7050 MOSDPeeringOp *pm = static_cast<MOSDPeeringOp*>(m);
7051 if (require_osd_peer(pm)) {
7052 enqueue_peering_evt(
7053 pm->get_spg(),
7054 PGPeeringEventRef(pm->get_event()));
7055 }
7056 pm->put();
7057 return;
7058 }
7059 }
7060
7c673cae
FG
7061 OpRequestRef op = op_tracker.create_request<OpRequest, Message*>(m);
7062 {
7063#ifdef WITH_LTTNG
7064 osd_reqid_t reqid = op->get_reqid();
7065#endif
7066 tracepoint(osd, ms_fast_dispatch, reqid.name._type,
7067 reqid.name._num, reqid.tid, reqid.inc);
7068 }
7069
7070 if (m->trace)
7071 op->osd_trace.init("osd op", &trace_endpoint, &m->trace);
7072
11fdf7f2 7073 // note sender epoch, min req's epoch
7c673cae
FG
7074 op->sent_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch();
7075 op->min_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_min_epoch();
11fdf7f2 7076 ceph_assert(op->min_epoch <= op->sent_epoch); // sanity check!
7c673cae
FG
7077
7078 service.maybe_inject_dispatch_delay();
7079
7080 if (m->get_connection()->has_features(CEPH_FEATUREMASK_RESEND_ON_SPLIT) ||
7081 m->get_type() != CEPH_MSG_OSD_OP) {
7082 // queue it directly
7083 enqueue_op(
7084 static_cast<MOSDFastDispatchOp*>(m)->get_spg(),
11fdf7f2 7085 std::move(op),
7c673cae
FG
7086 static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch());
7087 } else {
7088 // legacy client, and this is an MOSDOp (the *only* fast dispatch
7089 // message that didn't have an explicit spg_t); we need to map
7090 // them to an spg_t while preserving delivery order.
11fdf7f2
TL
7091 auto priv = m->get_connection()->get_priv();
7092 if (auto session = static_cast<Session*>(priv.get()); session) {
7093 std::lock_guard l{session->session_dispatch_lock};
7094 op->get();
7095 session->waiting_on_map.push_back(*op);
7096 OSDMapRef nextmap = service.get_nextmap_reserved();
7097 dispatch_session_waiting(session, nextmap);
7098 service.release_map(nextmap);
7c673cae
FG
7099 }
7100 }
7101 OID_EVENT_TRACE_WITH_MSG(m, "MS_FAST_DISPATCH_END", false);
7102}
7103
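// Create (or reuse) the Session attached to this connection and populate its
// caps from the peer's AuthCapsInfo; returns 1 if explicit caps were parsed,
// -EACCES on a decode/parse failure, and 0 otherwise.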
11fdf7f2 7104int OSD::ms_handle_authentication(Connection *con)
7c673cae 7105{
11fdf7f2 7106 int ret = 0;
9f95a23c 7107 auto s = ceph::ref_cast<Session>(con->get_priv());
11fdf7f2 7108 if (!s) {
9f95a23c
TL
7109 s = ceph::make_ref<Session>(cct, con);
7110 con->set_priv(s);
11fdf7f2
TL
7111 s->entity_name = con->get_peer_entity_name();
7112 dout(10) << __func__ << " new session " << s << " con " << s->con
7113 << " entity " << s->entity_name
7114 << " addr " << con->get_peer_addrs() << dendl;
7115 } else {
7116 dout(10) << __func__ << " existing session " << s << " con " << s->con
7117 << " entity " << s->entity_name
7118 << " addr " << con->get_peer_addrs() << dendl;
7c673cae
FG
7119 }
7120
11fdf7f2 7121 AuthCapsInfo &caps_info = con->get_peer_caps_info();
9f95a23c 7122 if (caps_info.allow_all) {
11fdf7f2 7123 s->caps.set_allow_all();
9f95a23c 7124 } else if (caps_info.caps.length() > 0) {
11fdf7f2
TL
7125 bufferlist::const_iterator p = caps_info.caps.cbegin();
7126 string str;
7127 try {
7128 decode(str, p);
7129 }
7130 catch (buffer::error& e) {
7131 dout(10) << __func__ << " session " << s << " " << s->entity_name
7132 << " failed to decode caps string" << dendl;
9f95a23c 7133 ret = -EACCES;
11fdf7f2
TL
7134 }
7135 if (!ret) {
7c673cae 7136 bool success = s->caps.parse(str);
11fdf7f2
TL
7137 if (success) {
7138 dout(10) << __func__ << " session " << s
7139 << " " << s->entity_name
7140 << " has caps " << s->caps << " '" << str << "'" << dendl;
7141 ret = 1;
7142 } else {
7143 dout(10) << __func__ << " session " << s << " " << s->entity_name
7144 << " failed to parse caps '" << str << "'" << dendl;
9f95a23c 7145 ret = -EACCES;
11fdf7f2 7146 }
7c673cae 7147 }
7c673cae 7148 }
11fdf7f2 7149 return ret;
7c673cae
FG
7150}
7151
7152void OSD::do_waiters()
7153{
9f95a23c 7154 ceph_assert(ceph_mutex_is_locked(osd_lock));
7c673cae
FG
7155
7156 dout(10) << "do_waiters -- start" << dendl;
7157 while (!finished.empty()) {
7158 OpRequestRef next = finished.front();
7159 finished.pop_front();
7160 dispatch_op(next);
7161 }
7162 dout(10) << "do_waiters -- finish" << dendl;
7163}
7164
7165void OSD::dispatch_op(OpRequestRef op)
7166{
7167 switch (op->get_req()->get_type()) {
7168
7169 case MSG_OSD_PG_CREATE:
7170 handle_pg_create(op);
7171 break;
7c673cae
FG
7172 }
7173}
7174
7175void OSD::_dispatch(Message *m)
7176{
9f95a23c 7177 ceph_assert(ceph_mutex_is_locked(osd_lock));
7c673cae
FG
7178 dout(20) << "_dispatch " << m << " " << *m << dendl;
7179
7180 switch (m->get_type()) {
7c673cae
FG
7181 // -- don't need OSDMap --
7182
7183 // map and replication
7184 case CEPH_MSG_OSD_MAP:
7185 handle_osd_map(static_cast<MOSDMap*>(m));
7186 break;
9f95a23c
TL
7187 case MSG_MON_GET_PURGED_SNAPS_REPLY:
7188 handle_get_purged_snaps_reply(static_cast<MMonGetPurgedSnapsReply*>(m));
7189 break;
7c673cae
FG
7190
7191 // osd
7c673cae
FG
7192 case MSG_OSD_SCRUB:
7193 handle_scrub(static_cast<MOSDScrub*>(m));
7194 break;
7195
11fdf7f2
TL
7196 case MSG_COMMAND:
7197 handle_command(static_cast<MCommand*>(m));
7198 return;
c07f9fc5 7199
7c673cae
FG
7200 // -- need OSDMap --
7201
7202 case MSG_OSD_PG_CREATE:
7c673cae
FG
7203 {
7204 OpRequestRef op = op_tracker.create_request<OpRequest, Message*>(m);
7205 if (m->trace)
7206 op->osd_trace.init("osd op", &trace_endpoint, &m->trace);
7207 // no map? starting up?
9f95a23c 7208 if (!get_osdmap()) {
7c673cae
FG
7209 dout(7) << "no OSDMap, not booted" << dendl;
7210 logger->inc(l_osd_waiting_for_map);
7211 waiting_for_osdmap.push_back(op);
7212 op->mark_delayed("no osdmap");
7213 break;
7214 }
7215
7216 // need OSDMap
7217 dispatch_op(op);
7218 }
7219 }
7220}
7221
11fdf7f2 7222// remove me post-nautilus
7c673cae
FG
7223void OSD::handle_scrub(MOSDScrub *m)
7224{
7225 dout(10) << "handle_scrub " << *m << dendl;
7226 if (!require_mon_or_mgr_peer(m)) {
7227 m->put();
7228 return;
7229 }
7230 if (m->fsid != monc->get_fsid()) {
11fdf7f2
TL
7231 dout(0) << "handle_scrub fsid " << m->fsid << " != " << monc->get_fsid()
7232 << dendl;
7c673cae
FG
7233 m->put();
7234 return;
7235 }
7236
11fdf7f2
TL
7237 vector<spg_t> spgs;
7238 _get_pgids(&spgs);
7239
7240 if (!m->scrub_pgs.empty()) {
7241 vector<spg_t> v;
7242 for (auto pgid : m->scrub_pgs) {
7c673cae 7243 spg_t pcand;
9f95a23c 7244 if (get_osdmap()->get_primary_shard(pgid, &pcand) &&
11fdf7f2
TL
7245 std::find(spgs.begin(), spgs.end(), pcand) != spgs.end()) {
7246 v.push_back(pcand);
7c673cae
FG
7247 }
7248 }
11fdf7f2
TL
7249 spgs.swap(v);
7250 }
7251
7252 for (auto pgid : spgs) {
7253 enqueue_peering_evt(
7254 pgid,
7255 PGPeeringEventRef(
7256 std::make_shared<PGPeeringEvent>(
7257 get_osdmap_epoch(),
7258 get_osdmap_epoch(),
9f95a23c 7259 PeeringState::RequestScrub(m->deep, m->repair))));
7c673cae
FG
7260 }
7261
7262 m->put();
7263}
7264
11fdf7f2
TL
7265void OSD::handle_fast_scrub(MOSDScrub2 *m)
7266{
7267 dout(10) << __func__ << " " << *m << dendl;
7268 if (!require_mon_or_mgr_peer(m)) {
7269 m->put();
7270 return;
7271 }
7272 if (m->fsid != monc->get_fsid()) {
7273 dout(0) << __func__ << " fsid " << m->fsid << " != " << monc->get_fsid()
7274 << dendl;
7275 m->put();
7276 return;
7277 }
7278 for (auto pgid : m->scrub_pgs) {
7279 enqueue_peering_evt(
7280 pgid,
7281 PGPeeringEventRef(
7282 std::make_shared<PGPeeringEvent>(
7283 m->epoch,
7284 m->epoch,
9f95a23c 7285 PeeringState::RequestScrub(m->deep, m->repair))));
11fdf7f2
TL
7286 }
7287 m->put();
7288}
7289
7c673cae
FG
7290bool OSD::scrub_random_backoff()
7291{
7292 bool coin_flip = (rand() / (double)RAND_MAX >=
7293 cct->_conf->osd_scrub_backoff_ratio);
7294 if (!coin_flip) {
7295 dout(20) << "scrub_random_backoff lost coin flip, randomly backing off" << dendl;
7296 return true;
7297 }
7298 return false;
7299}
7300
7301OSDService::ScrubJob::ScrubJob(CephContext* cct,
7302 const spg_t& pg, const utime_t& timestamp,
7303 double pool_scrub_min_interval,
7304 double pool_scrub_max_interval, bool must)
7305 : cct(cct),
7306 pgid(pg),
7307 sched_time(timestamp),
7308 deadline(timestamp)
7309{
7310 // if not explicitly requested, postpone the scrub with a random delay
7311 if (!must) {
7312 double scrub_min_interval = pool_scrub_min_interval > 0 ?
7313 pool_scrub_min_interval : cct->_conf->osd_scrub_min_interval;
7314 double scrub_max_interval = pool_scrub_max_interval > 0 ?
7315 pool_scrub_max_interval : cct->_conf->osd_scrub_max_interval;
7316
7317 sched_time += scrub_min_interval;
7318 double r = rand() / (double)RAND_MAX;
7319 sched_time +=
7320 scrub_min_interval * cct->_conf->osd_scrub_interval_randomize_ratio * r;
11fdf7f2
TL
7321 if (scrub_max_interval == 0) {
7322 deadline = utime_t();
7323 } else {
7324 deadline += scrub_max_interval;
7325 }
7326
7c673cae
FG
7327 }
7328}
7329
7330bool OSDService::ScrubJob::ScrubJob::operator<(const OSDService::ScrubJob& rhs) const {
7331 if (sched_time < rhs.sched_time)
7332 return true;
7333 if (sched_time > rhs.sched_time)
7334 return false;
7335 return pgid < rhs.pgid;
7336}
7337
9f95a23c
TL
7338double OSD::scrub_sleep_time(bool must_scrub)
7339{
7340 if (must_scrub) {
7341 return cct->_conf->osd_scrub_sleep;
7342 }
7343 utime_t now = ceph_clock_now();
7344 if (scrub_time_permit(now)) {
7345 return cct->_conf->osd_scrub_sleep;
7346 }
7347 double normal_sleep = cct->_conf->osd_scrub_sleep;
7348 double extended_sleep = cct->_conf->osd_scrub_extended_sleep;
7349 return std::max(extended_sleep, normal_sleep);
7350}
7351
7c673cae
FG
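// Check whether the configured scrub windows allow scrubbing at `now`.  Both
// the day-of-week and hour windows treat begin > end as a range that wraps
// around (e.g. 23 -> 6).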
7352bool OSD::scrub_time_permit(utime_t now)
7353{
7354 struct tm bdt;
7355 time_t tt = now.sec();
7356 localtime_r(&tt, &bdt);
28e407b8
AA
7357
7358 bool day_permit = false;
7359 if (cct->_conf->osd_scrub_begin_week_day < cct->_conf->osd_scrub_end_week_day) {
7360 if (bdt.tm_wday >= cct->_conf->osd_scrub_begin_week_day && bdt.tm_wday < cct->_conf->osd_scrub_end_week_day) {
7361 day_permit = true;
7362 }
7363 } else {
7364 if (bdt.tm_wday >= cct->_conf->osd_scrub_begin_week_day || bdt.tm_wday < cct->_conf->osd_scrub_end_week_day) {
7365 day_permit = true;
7366 }
7367 }
7368
7369 if (!day_permit) {
7370 dout(20) << __func__ << " should run between week day " << cct->_conf->osd_scrub_begin_week_day
7371 << " - " << cct->_conf->osd_scrub_end_week_day
7372 << " now " << bdt.tm_wday << " = no" << dendl;
7373 return false;
7374 }
7375
7c673cae
FG
7376 bool time_permit = false;
7377 if (cct->_conf->osd_scrub_begin_hour < cct->_conf->osd_scrub_end_hour) {
7378 if (bdt.tm_hour >= cct->_conf->osd_scrub_begin_hour && bdt.tm_hour < cct->_conf->osd_scrub_end_hour) {
7379 time_permit = true;
7380 }
7381 } else {
7382 if (bdt.tm_hour >= cct->_conf->osd_scrub_begin_hour || bdt.tm_hour < cct->_conf->osd_scrub_end_hour) {
7383 time_permit = true;
7384 }
7385 }
7386 if (!time_permit) {
7387 dout(20) << __func__ << " should run between " << cct->_conf->osd_scrub_begin_hour
7388 << " - " << cct->_conf->osd_scrub_end_hour
7389 << " now " << bdt.tm_hour << " = no" << dendl;
7390 } else {
7391 dout(20) << __func__ << " should run between " << cct->_conf->osd_scrub_begin_hour
7392 << " - " << cct->_conf->osd_scrub_end_hour
7393 << " now " << bdt.tm_hour << " = yes" << dendl;
7394 }
7395 return time_permit;
7396}
7397
7398bool OSD::scrub_load_below_threshold()
7399{
7400 double loadavgs[3];
7401 if (getloadavg(loadavgs, 3) != 3) {
7402 dout(10) << __func__ << " couldn't read loadavgs\n" << dendl;
7403 return false;
7404 }
7405
7406 // allow scrub if below configured threshold
91327a77
AA
7407 long cpus = sysconf(_SC_NPROCESSORS_ONLN);
7408 double loadavg_per_cpu = cpus > 0 ? loadavgs[0] / cpus : loadavgs[0];
7409 if (loadavg_per_cpu < cct->_conf->osd_scrub_load_threshold) {
7410 dout(20) << __func__ << " loadavg per cpu " << loadavg_per_cpu
7c673cae
FG
7411 << " < max " << cct->_conf->osd_scrub_load_threshold
7412 << " = yes" << dendl;
7413 return true;
7414 }
7415
7416 // allow scrub if below daily avg and currently decreasing
7417 if (loadavgs[0] < daily_loadavg && loadavgs[0] < loadavgs[2]) {
7418 dout(20) << __func__ << " loadavg " << loadavgs[0]
7419 << " < daily_loadavg " << daily_loadavg
7420 << " and < 15m avg " << loadavgs[2]
7421 << " = yes" << dendl;
7422 return true;
7423 }
7424
7425 dout(20) << __func__ << " loadavg " << loadavgs[0]
7426 << " >= max " << cct->_conf->osd_scrub_load_threshold
7427 << " and ( >= daily_loadavg " << daily_loadavg
7428 << " or >= 15m avg " << loadavgs[2]
7429 << ") = no" << dendl;
7430 return false;
7431}
7432
7433void OSD::sched_scrub()
7434{
7435 // if not permitted, fail fast
eafe8130 7436 if (!service.can_inc_scrubs()) {
7c673cae
FG
7437 return;
7438 }
eafe8130 7439 bool allow_requested_repair_only = false;
f6b5b4d7
TL
7440 if (service.is_recovery_active() && !cct->_conf->osd_scrub_during_recovery) {
7441 if (!cct->_conf->osd_repair_during_recovery) {
eafe8130
TL
7442 dout(20) << __func__ << " not scheduling scrubs due to active recovery" << dendl;
7443 return;
7444 }
f6b5b4d7
TL
7445 dout(10) << __func__
7446 << " will only schedule explicitly requested repair due to active recovery"
7447 << dendl;
7448 allow_requested_repair_only = true;
b5b8bbf5
FG
7449 }
7450
7c673cae
FG
7451 utime_t now = ceph_clock_now();
7452 bool time_permit = scrub_time_permit(now);
7453 bool load_is_low = scrub_load_below_threshold();
7454 dout(20) << "sched_scrub load_is_low=" << (int)load_is_low << dendl;
7455
7456 OSDService::ScrubJob scrub;
7457 if (service.first_scrub_stamp(&scrub)) {
7458 do {
7459 dout(30) << "sched_scrub examine " << scrub.pgid << " at " << scrub.sched_time << dendl;
7460
7461 if (scrub.sched_time > now) {
7462 // save ourselves some effort
7463 dout(10) << "sched_scrub " << scrub.pgid << " scheduled at " << scrub.sched_time
7464 << " > " << now << dendl;
7465 break;
7466 }
7467
11fdf7f2 7468 if ((scrub.deadline.is_zero() || scrub.deadline >= now) && !(time_permit && load_is_low)) {
7c673cae
FG
7469 dout(10) << __func__ << " not scheduling scrub for " << scrub.pgid << " due to "
7470 << (!time_permit ? "time not permit" : "high load") << dendl;
7471 continue;
7472 }
7473
11fdf7f2 7474 PGRef pg = _lookup_lock_pg(scrub.pgid);
7c673cae
FG
7475 if (!pg)
7476 continue;
494da23a
TL
7477 // This has already started, so go on to the next scrub job
7478 if (pg->scrubber.active) {
7479 pg->unlock();
7480 dout(30) << __func__ << ": already in progress pgid " << scrub.pgid << dendl;
7481 continue;
7482 }
eafe8130
TL
7483 // Skip other kinds of scrubbing if only explicitly requested repairs are allowed
7484 if (allow_requested_repair_only && !pg->scrubber.must_repair) {
7485 pg->unlock();
7486 dout(10) << __func__ << " skip " << scrub.pgid
7487 << " because repairing is not explicitly requested on it"
7488 << dendl;
7489 continue;
7490 }
494da23a 7491 // If it is reserving, let it resolve before going to the next scrub job
eafe8130 7492 if (pg->scrubber.local_reserved && !pg->scrubber.active) {
494da23a
TL
7493 pg->unlock();
7494 dout(30) << __func__ << ": reserve in progress pgid " << scrub.pgid << dendl;
7495 break;
7496 }
11fdf7f2
TL
7497 dout(10) << "sched_scrub scrubbing " << scrub.pgid << " at " << scrub.sched_time
7498 << (pg->get_must_scrub() ? ", explicitly requested" :
7499 (load_is_low ? ", load_is_low" : " deadline < now"))
7500 << dendl;
7501 if (pg->sched_scrub()) {
7502 pg->unlock();
7503 break;
7c673cae
FG
7504 }
7505 pg->unlock();
7506 } while (service.next_scrub_stamp(scrub, &scrub));
7507 }
7508 dout(20) << "sched_scrub done" << dendl;
7509}
7510
494da23a
TL
7511void OSD::resched_all_scrubs()
7512{
7513 dout(10) << __func__ << ": start" << dendl;
6d8e3169
FG
7514 const vector<spg_t> pgs = [this] {
7515 vector<spg_t> pgs;
7516 OSDService::ScrubJob job;
7517 if (service.first_scrub_stamp(&job)) {
7518 do {
7519 pgs.push_back(job.pgid);
7520 } while (service.next_scrub_stamp(job, &job));
7521 }
7522 return pgs;
7523 }();
7524 for (auto& pgid : pgs) {
7525 dout(20) << __func__ << ": examine " << pgid << dendl;
7526 PGRef pg = _lookup_lock_pg(pgid);
494da23a
TL
7527 if (!pg)
7528 continue;
7529 if (!pg->scrubber.must_scrub && !pg->scrubber.need_auto) {
6d8e3169 7530 dout(15) << __func__ << ": reschedule " << pgid << dendl;
494da23a
TL
7531 pg->on_info_history_change();
7532 }
7533 pg->unlock();
494da23a
TL
7534 }
7535 dout(10) << __func__ << ": done" << dendl;
7536}
7537
11fdf7f2
TL
7538MPGStats* OSD::collect_pg_stats()
7539{
7540 // This implementation unconditionally sends every is_primary PG's
7541 // stats every time we're called. This has equivalent cost to the
7542 // previous implementation's worst case where all PGs are busy and
7543 // their stats are always enqueued for sending.
9f95a23c 7544 std::shared_lock l{map_lock};
11fdf7f2 7545
11fdf7f2
TL
7546 osd_stat_t cur_stat = service.get_osd_stat();
7547 cur_stat.os_perf_stat = store->get_cur_stats();
7548
9f95a23c 7549 auto m = new MPGStats(monc->get_fsid(), get_osdmap_epoch());
11fdf7f2
TL
7550 m->osd_stat = cur_stat;
7551
7552 std::lock_guard lec{min_last_epoch_clean_lock};
9f95a23c 7553 min_last_epoch_clean = get_osdmap_epoch();
11fdf7f2
TL
7554 min_last_epoch_clean_pgs.clear();
7555
7556 std::set<int64_t> pool_set;
7557 vector<PGRef> pgs;
7558 _get_pgs(&pgs);
7559 for (auto& pg : pgs) {
7560 auto pool = pg->pg_id.pgid.pool();
7561 pool_set.emplace((int64_t)pool);
7562 if (!pg->is_primary()) {
7563 continue;
7564 }
7565 pg->get_pg_stats([&](const pg_stat_t& s, epoch_t lec) {
7566 m->pg_stat[pg->pg_id.pgid] = s;
7567 min_last_epoch_clean = min(min_last_epoch_clean, lec);
7568 min_last_epoch_clean_pgs.push_back(pg->pg_id.pgid);
7569 });
7570 }
7571 store_statfs_t st;
81eedcae 7572 bool per_pool_stats = false;
9f95a23c 7573 bool per_pool_omap_stats = false;
11fdf7f2 7574 for (auto p : pool_set) {
9f95a23c 7575 int r = store->pool_statfs(p, &st, &per_pool_omap_stats);
11fdf7f2
TL
7576 if (r == -ENOTSUP) {
7577 break;
7578 } else {
7579 assert(r >= 0);
7580 m->pool_stat[p] = st;
81eedcae 7581 per_pool_stats = true;
11fdf7f2
TL
7582 }
7583 }
7c673cae 7584
81eedcae
TL
7585 // indicate whether we are reporting per-pool stats
7586 m->osd_stat.num_osds = 1;
7587 m->osd_stat.num_per_pool_osds = per_pool_stats ? 1 : 0;
9f95a23c 7588 m->osd_stat.num_per_pool_omap_osds = per_pool_omap_stats ? 1 : 0;
81eedcae 7589
11fdf7f2
TL
7590 return m;
7591}
7c673cae 7592
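// Report daemon health metrics to the mgr: the count (and age) of slow ops
// seen by the op tracker, and the number of PGs this OSD is still expected to
// create as primary.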
11fdf7f2 7593vector<DaemonHealthMetric> OSD::get_health_metrics()
b32b8144 7594{
11fdf7f2
TL
7595 vector<DaemonHealthMetric> metrics;
7596 {
7597 utime_t oldest_secs;
7598 const utime_t now = ceph_clock_now();
7599 auto too_old = now;
7600 too_old -= cct->_conf.get_val<double>("osd_op_complaint_time");
7601 int slow = 0;
7602 TrackedOpRef oldest_op;
7603 auto count_slow_ops = [&](TrackedOp& op) {
7604 if (op.get_initiated() < too_old) {
9f95a23c
TL
7605 stringstream ss;
7606 ss << "slow request " << op.get_desc()
7607 << " initiated "
7608 << op.get_initiated()
7609 << " currently "
7610 << op.state_string();
7611 lgeneric_subdout(cct,osd,20) << ss.str() << dendl;
7612 clog->warn() << ss.str();
11fdf7f2
TL
7613 slow++;
7614 if (!oldest_op || op.get_initiated() < oldest_op->get_initiated()) {
7615 oldest_op = &op;
7616 }
7617 return true;
7618 } else {
7619 return false;
7620 }
7621 };
7622 if (op_tracker.visit_ops_in_flight(&oldest_secs, count_slow_ops)) {
7623 if (slow) {
7624 derr << __func__ << " reporting " << slow << " slow ops, oldest is "
7625 << oldest_op->get_desc() << dendl;
7626 }
7627 metrics.emplace_back(daemon_metric::SLOW_OPS, slow, oldest_secs);
7628 } else {
7629 // no news is not good news.
7630 metrics.emplace_back(daemon_metric::SLOW_OPS, 0, 0);
7631 }
7632 }
7633 {
7634 std::lock_guard l(pending_creates_lock);
7635 auto n_primaries = pending_creates_from_mon;
7636 for (const auto& create : pending_creates_from_osd) {
7637 if (create.second) {
7638 n_primaries++;
7639 }
b32b8144 7640 }
11fdf7f2 7641 metrics.emplace_back(daemon_metric::PENDING_CREATING_PGS, n_primaries);
b32b8144 7642 }
b32b8144
FG
7643 return metrics;
7644}
7645
7c673cae
FG
7646// =====================================================
7647// MAP
7648
7649void OSD::wait_for_new_map(OpRequestRef op)
7650{
7651 // ask?
7652 if (waiting_for_osdmap.empty()) {
9f95a23c 7653 osdmap_subscribe(get_osdmap_epoch() + 1, false);
7c673cae
FG
7654 }
7655
7656 logger->inc(l_osd_waiting_for_map);
7657 waiting_for_osdmap.push_back(op);
7658 op->mark_delayed("wait for new map");
7659}
7660
7661
7662/** update_map
7663 * assimilate new OSDMap(s). scan pgs, etc.
7664 */
7665
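// A peer went down in the new map: drop our cluster connections to it and
// clear any queued failure reports and heartbeat state for it.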
7666void OSD::note_down_osd(int peer)
7667{
9f95a23c
TL
7668 ceph_assert(ceph_mutex_is_locked(osd_lock));
7669 cluster_messenger->mark_down_addrs(get_osdmap()->get_cluster_addrs(peer));
7c673cae 7670
9f95a23c 7671 std::lock_guard l{heartbeat_lock};
7c673cae
FG
7672 failure_queue.erase(peer);
7673 failure_pending.erase(peer);
7674 map<int,HeartbeatInfo>::iterator p = heartbeat_peers.find(peer);
7675 if (p != heartbeat_peers.end()) {
9f95a23c 7676 p->second.clear_mark_down();
7c673cae
FG
7677 heartbeat_peers.erase(p);
7678 }
7c673cae
FG
7679}
7680
7681void OSD::note_up_osd(int peer)
7682{
7c673cae
FG
7683 heartbeat_set_peers_need_update();
7684}
7685
7686struct C_OnMapCommit : public Context {
7687 OSD *osd;
7688 epoch_t first, last;
7689 MOSDMap *msg;
7690 C_OnMapCommit(OSD *o, epoch_t f, epoch_t l, MOSDMap *m)
7691 : osd(o), first(f), last(l), msg(m) {}
7692 void finish(int r) override {
7693 osd->_committed_osd_maps(first, last, msg);
7694 msg->put();
7695 }
7696};
7697
7c673cae
FG
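// Subscribe to osdmap updates starting at `epoch` unless we have already
// subscribed at that epoch or newer (force_request bypasses the check).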
7698void OSD::osdmap_subscribe(version_t epoch, bool force_request)
7699{
11fdf7f2 7700 std::lock_guard l(osdmap_subscribe_lock);
181888fb 7701 if (latest_subscribed_epoch >= epoch && !force_request)
7c673cae
FG
7702 return;
7703
11fdf7f2 7704 latest_subscribed_epoch = std::max<uint64_t>(epoch, latest_subscribed_epoch);
181888fb 7705
7c673cae
FG
7706 if (monc->sub_want_increment("osdmap", epoch, CEPH_SUBSCRIBE_ONETIME) ||
7707 force_request) {
7708 monc->renew_subs();
7709 }
7710}
7711
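// Remove stored osdmap epochs older than both `oldest` and the map cache's
// lower bound, advancing superblock.oldest_map and committing the deletions
// in batches.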
7712void OSD::trim_maps(epoch_t oldest, int nreceived, bool skip_maps)
7713{
7714 epoch_t min = std::min(oldest, service.map_cache.cached_key_lower_bound());
7715 if (min <= superblock.oldest_map)
7716 return;
7717
7718 int num = 0;
7719 ObjectStore::Transaction t;
7720 for (epoch_t e = superblock.oldest_map; e < min; ++e) {
7721 dout(20) << " removing old osdmap epoch " << e << dendl;
7722 t.remove(coll_t::meta(), get_osdmap_pobject_name(e));
7723 t.remove(coll_t::meta(), get_inc_osdmap_pobject_name(e));
7724 superblock.oldest_map = e + 1;
7725 num++;
7726 if (num >= cct->_conf->osd_target_transaction_size && num >= nreceived) {
7727 service.publish_superblock(superblock);
7728 write_superblock(t);
11fdf7f2
TL
7729 int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
7730 ceph_assert(tr == 0);
7c673cae
FG
7731 num = 0;
7732 if (!skip_maps) {
7733 // skip_maps leaves us with a range of old maps if we fail to remove all
7734 // of them before moving superblock.oldest_map forward to the first map
7735 // in the incoming MOSDMap msg, so we should continue removing them in
7736 // this case, even if it means doing a huge series of delete transactions
7737 // all at once.
7738 break;
7739 }
7740 }
7741 }
7742 if (num > 0) {
7743 service.publish_superblock(superblock);
7744 write_superblock(t);
11fdf7f2
TL
7745 int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
7746 ceph_assert(tr == 0);
7c673cae
FG
7747 }
7748 // we should not remove the cached maps
11fdf7f2 7749 ceph_assert(min <= service.map_cache.cached_key_lower_bound());
7c673cae
FG
7750}
7751
7752void OSD::handle_osd_map(MOSDMap *m)
7753{
11fdf7f2
TL
7754 // wait for pgs to catch up
7755 {
7756 // we extend the map cache pins to accommodate pgs slow to consume maps
7757 // for some period, until we hit the max_lag_factor bound, at which point
7758 // we block here to stop ingesting more maps than they are able to keep
7759 // up with.
7760 epoch_t max_lag = cct->_conf->osd_map_cache_size *
7761 m_osd_pg_epoch_max_lag_factor;
7762 ceph_assert(max_lag > 0);
7763 epoch_t osd_min = 0;
7764 for (auto shard : shards) {
7765 epoch_t min = shard->get_min_pg_epoch();
7766 if (osd_min == 0 || min < osd_min) {
7767 osd_min = min;
7768 }
7769 }
9f95a23c 7770 epoch_t osdmap_epoch = get_osdmap_epoch();
11fdf7f2 7771 if (osd_min > 0 &&
9f95a23c
TL
7772 osdmap_epoch > max_lag &&
7773 osdmap_epoch - max_lag > osd_min) {
7774 epoch_t need = osdmap_epoch - max_lag;
11fdf7f2
TL
7775 dout(10) << __func__ << " waiting for pgs to catch up (need " << need
7776 << " max_lag " << max_lag << ")" << dendl;
7777 for (auto shard : shards) {
7778 epoch_t min = shard->get_min_pg_epoch();
7779 if (need > min) {
7780 dout(10) << __func__ << " waiting for pgs to consume " << need
7781 << " (shard " << shard->shard_id << " min " << min
7782 << ", map cache is " << cct->_conf->osd_map_cache_size
7783 << ", max_lag_factor " << m_osd_pg_epoch_max_lag_factor
7784 << ")" << dendl;
7785 unlock_guard unlock{osd_lock};
7786 shard->wait_min_pg_epoch(need);
7787 }
7788 }
7789 }
7790 }
7791
9f95a23c 7792 ceph_assert(ceph_mutex_is_locked(osd_lock));
11fdf7f2
TL
7793 map<epoch_t,OSDMapRef> added_maps;
7794 map<epoch_t,bufferlist> added_maps_bl;
7c673cae
FG
7795 if (m->fsid != monc->get_fsid()) {
7796 dout(0) << "handle_osd_map fsid " << m->fsid << " != "
7797 << monc->get_fsid() << dendl;
7798 m->put();
7799 return;
7800 }
7801 if (is_initializing()) {
7802 dout(0) << "ignoring osdmap until we have initialized" << dendl;
7803 m->put();
7804 return;
7805 }
7806
9f95a23c
TL
7807 auto session = ceph::ref_cast<Session>(m->get_connection()->get_priv());
7808 if (session && !(session->entity_name.is_mon() ||
7c673cae
FG
7809 session->entity_name.is_osd())) {
7810 //not enough perms!
7811 dout(10) << "got osd map from Session " << session
7812 << " which we can't take maps from (not a mon or osd)" << dendl;
7813 m->put();
7c673cae
FG
7814 return;
7815 }
7c673cae
FG
7816
7817 // share with the objecter
7818 if (!is_preboot())
7819 service.objecter->handle_osd_map(m);
7820
7821 epoch_t first = m->get_first();
7822 epoch_t last = m->get_last();
7823 dout(3) << "handle_osd_map epochs [" << first << "," << last << "], i have "
7824 << superblock.newest_map
7825 << ", src has [" << m->oldest_map << "," << m->newest_map << "]"
7826 << dendl;
7827
7828 logger->inc(l_osd_map);
7829 logger->inc(l_osd_mape, last - first + 1);
7830 if (first <= superblock.newest_map)
7831 logger->inc(l_osd_mape_dup, superblock.newest_map - first + 1);
7832 if (service.max_oldest_map < m->oldest_map) {
7833 service.max_oldest_map = m->oldest_map;
11fdf7f2 7834 ceph_assert(service.max_oldest_map >= superblock.oldest_map);
7c673cae
FG
7835 }
7836
7837 // make sure there is something new, here, before we bother flushing
7838 // the queues and such
7839 if (last <= superblock.newest_map) {
7840 dout(10) << " no new maps here, dropping" << dendl;
7841 m->put();
7842 return;
7843 }
7844
7845 // missing some?
7846 bool skip_maps = false;
7847 if (first > superblock.newest_map + 1) {
7848 dout(10) << "handle_osd_map message skips epochs "
7849 << superblock.newest_map + 1 << ".." << (first-1) << dendl;
7850 if (m->oldest_map <= superblock.newest_map + 1) {
7851 osdmap_subscribe(superblock.newest_map + 1, false);
7852 m->put();
7853 return;
7854 }
7855 // always try to get the full range of maps--as many as we can. this
7856 // 1- is good to have
7857 // 2- is at present the only way to ensure that we get a *full* map as
7858 // the first map!
7859 if (m->oldest_map < first) {
7860 osdmap_subscribe(m->oldest_map - 1, true);
7861 m->put();
7862 return;
7863 }
7864 skip_maps = true;
7865 }
7866
7867 ObjectStore::Transaction t;
7868 uint64_t txn_size = 0;
7869
9f95a23c
TL
7870 map<epoch_t,mempool::osdmap::map<int64_t,snap_interval_set_t>> purged_snaps;
7871
7c673cae 7872 // store new maps: queue for disk and put in the osdmap cache
11fdf7f2 7873 epoch_t start = std::max(superblock.newest_map + 1, first);
7c673cae
FG
7874 for (epoch_t e = start; e <= last; e++) {
7875 if (txn_size >= t.get_num_bytes()) {
7876 derr << __func__ << " transaction size overflowed" << dendl;
11fdf7f2 7877 ceph_assert(txn_size < t.get_num_bytes());
7c673cae
FG
7878 }
7879 txn_size = t.get_num_bytes();
7880 map<epoch_t,bufferlist>::iterator p;
7881 p = m->maps.find(e);
7882 if (p != m->maps.end()) {
7883 dout(10) << "handle_osd_map got full map for epoch " << e << dendl;
7884 OSDMap *o = new OSDMap;
7885 bufferlist& bl = p->second;
7886
7887 o->decode(bl);
7888
9f95a23c
TL
7889 purged_snaps[e] = o->get_new_purged_snaps();
7890
7c673cae
FG
7891 ghobject_t fulloid = get_osdmap_pobject_name(e);
7892 t.write(coll_t::meta(), fulloid, 0, bl.length(), bl);
11fdf7f2
TL
7893 added_maps[e] = add_map(o);
7894 added_maps_bl[e] = bl;
7c673cae
FG
7895 got_full_map(e);
7896 continue;
7897 }
7898
7899 p = m->incremental_maps.find(e);
7900 if (p != m->incremental_maps.end()) {
7901 dout(10) << "handle_osd_map got inc map for epoch " << e << dendl;
7902 bufferlist& bl = p->second;
7903 ghobject_t oid = get_inc_osdmap_pobject_name(e);
7904 t.write(coll_t::meta(), oid, 0, bl.length(), bl);
7c673cae
FG
7905
7906 OSDMap *o = new OSDMap;
7907 if (e > 1) {
7908 bufferlist obl;
7909 bool got = get_map_bl(e - 1, obl);
11fdf7f2
TL
7910 if (!got) {
7911 auto p = added_maps_bl.find(e - 1);
7912 ceph_assert(p != added_maps_bl.end());
7913 obl = p->second;
7914 }
7c673cae
FG
7915 o->decode(obl);
7916 }
7917
7918 OSDMap::Incremental inc;
11fdf7f2 7919 auto p = bl.cbegin();
7c673cae 7920 inc.decode(p);
494da23a 7921
7c673cae 7922 if (o->apply_incremental(inc) < 0) {
9f95a23c 7923 derr << "ERROR: bad fsid? i have " << get_osdmap()->get_fsid() << " and inc has " << inc.fsid << dendl;
11fdf7f2 7924 ceph_abort_msg("bad fsid");
7c673cae
FG
7925 }
7926
7927 bufferlist fbl;
7928 o->encode(fbl, inc.encode_features | CEPH_FEATURE_RESERVED);
7929
7930 bool injected_failure = false;
7931 if (cct->_conf->osd_inject_bad_map_crc_probability > 0 &&
7932 (rand() % 10000) < cct->_conf->osd_inject_bad_map_crc_probability*10000.0) {
7933 derr << __func__ << " injecting map crc failure" << dendl;
7934 injected_failure = true;
7935 }
7936
7937 if ((inc.have_crc && o->get_crc() != inc.full_crc) || injected_failure) {
7938 dout(2) << "got incremental " << e
7939 << " but failed to encode full with correct crc; requesting"
7940 << dendl;
7941 clog->warn() << "failed to encode map e" << e << " with expected crc";
7942 dout(20) << "my encoded map was:\n";
7943 fbl.hexdump(*_dout);
7944 *_dout << dendl;
7945 delete o;
7946 request_full_map(e, last);
7947 last = e - 1;
f6b5b4d7
TL
7948
7949 // don't continue committing if we failed to encode the first inc map
7950 if (last < start) {
7951 dout(10) << __func__ << " bailing because last < start (" << last << "<" << start << ")" << dendl;
7952 m->put();
7953 return;
7954 }
7c673cae
FG
7955 break;
7956 }
7957 got_full_map(e);
9f95a23c 7958 purged_snaps[e] = o->get_new_purged_snaps();
7c673cae
FG
7959
7960 ghobject_t fulloid = get_osdmap_pobject_name(e);
7961 t.write(coll_t::meta(), fulloid, 0, fbl.length(), fbl);
11fdf7f2
TL
7962 added_maps[e] = add_map(o);
7963 added_maps_bl[e] = fbl;
7c673cae
FG
7964 continue;
7965 }
7966
11fdf7f2 7967 ceph_abort_msg("MOSDMap lied about what maps it had?");
7c673cae
FG
7968 }
7969
7970 // even if this map isn't from a mon, we may have satisfied our subscription
7971 monc->sub_got("osdmap", last);
7972
7973 if (!m->maps.empty() && requested_full_first) {
7974 dout(10) << __func__ << " still missing full maps " << requested_full_first
7975 << ".." << requested_full_last << dendl;
7976 rerequest_full_maps();
7977 }
7978
7c673cae
FG
7979 if (superblock.oldest_map) {
7980 // make sure we at least keep pace with incoming maps
7981 trim_maps(m->oldest_map, last - first + 1, skip_maps);
11fdf7f2 7982 pg_num_history.prune(superblock.oldest_map);
7c673cae
FG
7983 }
7984
7985 if (!superblock.oldest_map || skip_maps)
7986 superblock.oldest_map = first;
7987 superblock.newest_map = last;
7988 superblock.current_epoch = last;
7989
7990 // note in the superblock that we were clean thru the prior epoch
7991 epoch_t boot_epoch = service.get_boot_epoch();
7992 if (boot_epoch && boot_epoch >= superblock.mounted) {
7993 superblock.mounted = boot_epoch;
7994 superblock.clean_thru = last;
7995 }
7996
11fdf7f2
TL
7997 // check for pg_num changes and deleted pools
7998 OSDMapRef lastmap;
7999 for (auto& i : added_maps) {
8000 if (!lastmap) {
8001 if (!(lastmap = service.try_get_map(i.first - 1))) {
8002 dout(10) << __func__ << " can't get previous map " << i.first - 1
8003 << " probably first start of this osd" << dendl;
8004 continue;
8005 }
8006 }
8007 ceph_assert(lastmap->get_epoch() + 1 == i.second->get_epoch());
8008 for (auto& j : lastmap->get_pools()) {
8009 if (!i.second->have_pg_pool(j.first)) {
8010 pg_num_history.log_pool_delete(i.first, j.first);
8011 dout(10) << __func__ << " recording final pg_pool_t for pool "
8012 << j.first << dendl;
8013 // this information is needed by _make_pg() if we have to restart before
8014 // the pool is deleted and need to instantiate a new (zombie) PG[Pool].
8015 ghobject_t obj = make_final_pool_info_oid(j.first);
8016 bufferlist bl;
8017 encode(j.second, bl, CEPH_FEATURES_ALL);
8018 string name = lastmap->get_pool_name(j.first);
8019 encode(name, bl);
8020 map<string,string> profile;
8021 if (lastmap->get_pg_pool(j.first)->is_erasure()) {
8022 profile = lastmap->get_erasure_code_profile(
8023 lastmap->get_pg_pool(j.first)->erasure_code_profile);
8024 }
8025 encode(profile, bl);
8026 t.write(coll_t::meta(), obj, 0, bl.length(), bl);
11fdf7f2
TL
8027 } else if (unsigned new_pg_num = i.second->get_pg_num(j.first);
8028 new_pg_num != j.second.get_pg_num()) {
8029 dout(10) << __func__ << " recording pool " << j.first << " pg_num "
8030 << j.second.get_pg_num() << " -> " << new_pg_num << dendl;
8031 pg_num_history.log_pg_num_change(i.first, j.first, new_pg_num);
8032 }
8033 }
8034 for (auto& j : i.second->get_pools()) {
8035 if (!lastmap->have_pg_pool(j.first)) {
8036 dout(10) << __func__ << " recording new pool " << j.first << " pg_num "
8037 << j.second.get_pg_num() << dendl;
8038 pg_num_history.log_pg_num_change(i.first, j.first,
8039 j.second.get_pg_num());
8040 }
8041 }
8042 lastmap = i.second;
8043 }
8044 pg_num_history.epoch = last;
8045 {
8046 bufferlist bl;
8047 ::encode(pg_num_history, bl);
8048 t.write(coll_t::meta(), make_pg_num_history_oid(), 0, bl.length(), bl);
8049 dout(20) << __func__ << " pg_num_history " << pg_num_history << dendl;
8050 }
8051
9f95a23c
TL
8052 // record new purged_snaps
8053 if (superblock.purged_snaps_last == start - 1) {
8054 SnapMapper::record_purged_snaps(cct, store, service.meta_ch,
8055 make_purged_snaps_oid(), &t,
8056 purged_snaps);
8057 superblock.purged_snaps_last = last;
8058 } else {
8059 dout(10) << __func__ << " superblock purged_snaps_last is "
8060 << superblock.purged_snaps_last
8061 << ", not recording new purged_snaps" << dendl;
8062 }
8063
7c673cae
FG
8064 // superblock and commit
8065 write_superblock(t);
11fdf7f2 8066 t.register_on_commit(new C_OnMapCommit(this, start, last, m));
7c673cae 8067 store->queue_transaction(
11fdf7f2
TL
8068 service.meta_ch,
8069 std::move(t));
7c673cae
FG
8070 service.publish_superblock(superblock);
8071}
8072
8073void OSD::_committed_osd_maps(epoch_t first, epoch_t last, MOSDMap *m)
8074{
8075 dout(10) << __func__ << " " << first << ".." << last << dendl;
8076 if (is_stopping()) {
8077 dout(10) << __func__ << " bailing, we are shutting down" << dendl;
8078 return;
8079 }
11fdf7f2 8080 std::lock_guard l(osd_lock);
31f18b77
FG
8081 if (is_stopping()) {
8082 dout(10) << __func__ << " bailing, we are shutting down" << dendl;
8083 return;
8084 }
9f95a23c 8085 map_lock.lock();
7c673cae 8086
f6b5b4d7
TL
8087 ceph_assert(first <= last);
8088
7c673cae
FG
8089 bool do_shutdown = false;
8090 bool do_restart = false;
8091 bool network_error = false;
f6b5b4d7 8092 OSDMapRef osdmap = get_osdmap();
7c673cae
FG
8093
8094 // advance through the new maps
8095 for (epoch_t cur = first; cur <= last; cur++) {
8096 dout(10) << " advance to epoch " << cur
8097 << " (<= last " << last
8098 << " <= newest_map " << superblock.newest_map
8099 << ")" << dendl;
8100
8101 OSDMapRef newmap = get_map(cur);
11fdf7f2 8102 ceph_assert(newmap); // we just cached it above!
7c673cae
FG
8103
8104 // start blacklisting messages sent to peers that go down.
8105 service.pre_publish_map(newmap);
8106
8107 // kill connections to newly down osds
8108 bool waited_for_reservations = false;
8109 set<int> old;
9f95a23c 8110 osdmap = get_osdmap();
7c673cae
FG
8111 osdmap->get_all_osds(old);
8112 for (set<int>::iterator p = old.begin(); p != old.end(); ++p) {
8113 if (*p != whoami &&
8114 osdmap->is_up(*p) && // in old map
8115 newmap->is_down(*p)) { // but not the new one
8116 if (!waited_for_reservations) {
8117 service.await_reserved_maps();
8118 waited_for_reservations = true;
8119 }
8120 note_down_osd(*p);
8121 } else if (*p != whoami &&
8122 osdmap->is_down(*p) &&
8123 newmap->is_up(*p)) {
8124 note_up_osd(*p);
8125 }
8126 }
8127
81eedcae 8128 if (osdmap->is_noup(whoami) != newmap->is_noup(whoami)) {
7c673cae
FG
8129 dout(10) << __func__ << " NOUP flag changed in " << newmap->get_epoch()
8130 << dendl;
8131 if (is_booting()) {
8132 // this captures the case where we sent the boot message while
8133 // NOUP was being set on the mon and our boot request was
8134 // dropped, and then later it is cleared. it imperfectly
8135 // handles the case where our original boot message was not
8136 // dropped and we restart even though we might have booted, but
8137 // that is harmless (boot will just take slightly longer).
8138 do_restart = true;
8139 }
8140 }
8141
9f95a23c
TL
8142 osdmap = std::move(newmap);
8143 set_osdmap(osdmap);
7c673cae
FG
8144 epoch_t up_epoch;
8145 epoch_t boot_epoch;
8146 service.retrieve_epochs(&boot_epoch, &up_epoch, NULL);
8147 if (!up_epoch &&
8148 osdmap->is_up(whoami) &&
11fdf7f2 8149 osdmap->get_addrs(whoami) == client_messenger->get_myaddrs()) {
7c673cae
FG
8150 up_epoch = osdmap->get_epoch();
8151 dout(10) << "up_epoch is " << up_epoch << dendl;
8152 if (!boot_epoch) {
8153 boot_epoch = osdmap->get_epoch();
8154 dout(10) << "boot_epoch is " << boot_epoch << dendl;
8155 }
8156 service.set_epochs(&boot_epoch, &up_epoch, NULL);
8157 }
8158 }
8159
7c673cae
FG
8160 epoch_t _bind_epoch = service.get_bind_epoch();
8161 if (osdmap->is_up(whoami) &&
11fdf7f2
TL
8162 osdmap->get_addrs(whoami).legacy_equals(
8163 client_messenger->get_myaddrs()) &&
7c673cae
FG
8164 _bind_epoch < osdmap->get_up_from(whoami)) {
8165
8166 if (is_booting()) {
8167 dout(1) << "state: booting -> active" << dendl;
8168 set_state(STATE_ACTIVE);
11fdf7f2 8169 do_restart = false;
7c673cae
FG
8170
8171 // set incarnation so that osd_reqid_t's we generate for our
8172 // objecter requests are unique across restarts.
8173 service.objecter->set_client_incarnation(osdmap->get_epoch());
11fdf7f2 8174 cancel_pending_failures();
7c673cae
FG
8175 }
8176 }
8177
8178 if (osdmap->get_epoch() > 0 &&
8179 is_active()) {
8180 if (!osdmap->exists(whoami)) {
9f95a23c 8181 derr << "map says i do not exist. shutting down." << dendl;
7c673cae
FG
8182 do_shutdown = true; // don't call shutdown() while we have
8183 // everything paused
9f95a23c
TL
8184 } else if (osdmap->is_stop(whoami)) {
8185 derr << "map says i am stopped by admin. shutting down." << dendl;
8186 do_shutdown = true;
7c673cae 8187 } else if (!osdmap->is_up(whoami) ||
11fdf7f2
TL
8188 !osdmap->get_addrs(whoami).legacy_equals(
8189 client_messenger->get_myaddrs()) ||
8190 !osdmap->get_cluster_addrs(whoami).legacy_equals(
8191 cluster_messenger->get_myaddrs()) ||
8192 !osdmap->get_hb_back_addrs(whoami).legacy_equals(
8193 hb_back_server_messenger->get_myaddrs()) ||
8194 !osdmap->get_hb_front_addrs(whoami).legacy_equals(
8195 hb_front_server_messenger->get_myaddrs())) {
7c673cae
FG
8196 if (!osdmap->is_up(whoami)) {
8197 if (service.is_preparing_to_stop() || service.is_stopping()) {
8198 service.got_stop_ack();
8199 } else {
c07f9fc5
FG
8200 clog->warn() << "Monitor daemon marked osd." << whoami << " down, "
8201 "but it is still running";
8202 clog->debug() << "map e" << osdmap->get_epoch()
8203 << " wrongly marked me down at e"
8204 << osdmap->get_down_at(whoami);
7c673cae 8205 }
9f95a23c
TL
8206 if (monc->monmap.min_mon_release >= ceph_release_t::octopus) {
8207 // note that this is best-effort...
8208 monc->send_mon_message(
8209 new MOSDMarkMeDead(
8210 monc->get_fsid(),
8211 whoami,
8212 osdmap->get_epoch()));
8213 }
11fdf7f2
TL
8214 } else if (!osdmap->get_addrs(whoami).legacy_equals(
8215 client_messenger->get_myaddrs())) {
7c673cae 8216 clog->error() << "map e" << osdmap->get_epoch()
11fdf7f2
TL
8217 << " had wrong client addr (" << osdmap->get_addrs(whoami)
8218 << " != my " << client_messenger->get_myaddrs() << ")";
8219 } else if (!osdmap->get_cluster_addrs(whoami).legacy_equals(
8220 cluster_messenger->get_myaddrs())) {
7c673cae
FG
8221 clog->error() << "map e" << osdmap->get_epoch()
8222 << " had wrong cluster addr ("
11fdf7f2
TL
8223 << osdmap->get_cluster_addrs(whoami)
8224 << " != my " << cluster_messenger->get_myaddrs() << ")";
8225 } else if (!osdmap->get_hb_back_addrs(whoami).legacy_equals(
8226 hb_back_server_messenger->get_myaddrs())) {
7c673cae 8227 clog->error() << "map e" << osdmap->get_epoch()
c07f9fc5 8228 << " had wrong heartbeat back addr ("
11fdf7f2
TL
8229 << osdmap->get_hb_back_addrs(whoami)
8230 << " != my " << hb_back_server_messenger->get_myaddrs()
7c673cae 8231 << ")";
11fdf7f2
TL
8232 } else if (!osdmap->get_hb_front_addrs(whoami).legacy_equals(
8233 hb_front_server_messenger->get_myaddrs())) {
7c673cae 8234 clog->error() << "map e" << osdmap->get_epoch()
c07f9fc5 8235 << " had wrong heartbeat front addr ("
11fdf7f2
TL
8236 << osdmap->get_hb_front_addrs(whoami)
8237 << " != my " << hb_front_server_messenger->get_myaddrs()
7c673cae
FG
8238 << ")";
8239 }
8240
8241 if (!service.is_stopping()) {
8242 epoch_t up_epoch = 0;
8243 epoch_t bind_epoch = osdmap->get_epoch();
8244 service.set_epochs(NULL,&up_epoch, &bind_epoch);
8245 do_restart = true;
8246
8247 //add markdown log
8248 utime_t now = ceph_clock_now();
8249 utime_t grace = utime_t(cct->_conf->osd_max_markdown_period, 0);
8250 osd_markdown_log.push_back(now);
7c673cae 8251 if ((int)osd_markdown_log.size() > cct->_conf->osd_max_markdown_count) {
9f95a23c
TL
8252 derr << __func__ << " marked down "
8253 << osd_markdown_log.size()
8254 << " > osd_max_markdown_count "
8255 << cct->_conf->osd_max_markdown_count
8256 << " in last " << grace << " seconds, shutting down"
8257 << dendl;
7c673cae
FG
8258 do_restart = false;
8259 do_shutdown = true;
8260 }
8261
8262 start_waiting_for_healthy();
8263
8264 set<int> avoid_ports;
8265#if defined(__FreeBSD__)
8266 // prevent FreeBSD from grabbing the client_messenger port during
8267 // rebinding, in which case the cluster_messenger would also connect
8268 // to the same port
11fdf7f2 8269 client_messenger->get_myaddrs().get_ports(&avoid_ports);
7c673cae 8270#endif
11fdf7f2 8271 cluster_messenger->get_myaddrs().get_ports(&avoid_ports);
7c673cae
FG
8272
8273 int r = cluster_messenger->rebind(avoid_ports);
8274 if (r != 0) {
8275 do_shutdown = true; // FIXME: do_restart?
8276 network_error = true;
9f95a23c
TL
8277 derr << __func__ << " marked down:"
8278 << " rebind cluster_messenger failed" << dendl;
7c673cae
FG
8279 }
8280
9f95a23c
TL
8281 hb_back_server_messenger->mark_down_all();
8282 hb_front_server_messenger->mark_down_all();
7c673cae
FG
8283 hb_front_client_messenger->mark_down_all();
8284 hb_back_client_messenger->mark_down_all();
8285
494da23a 8286 reset_heartbeat_peers(true);
7c673cae
FG
8287 }
8288 }
8289 }
8290
9f95a23c 8291 map_lock.unlock();
7c673cae 8292
11fdf7f2 8293 check_osdmap_features();
7c673cae
FG
8294
8295 // yay!
8296 consume_map();
8297
8298 if (is_active() || is_waiting_for_healthy())
8299 maybe_update_heartbeat_peers();
8300
11fdf7f2 8301 if (is_active()) {
7c673cae
FG
8302 activate_map();
8303 }
8304
31f18b77 8305 if (do_shutdown) {
7c673cae 8306 if (network_error) {
11fdf7f2 8307 cancel_pending_failures();
7c673cae
FG
8308 }
8309 // trigger shutdown in a different thread
8310 dout(0) << __func__ << " shutdown OSD via async signal" << dendl;
8311 queue_async_signal(SIGINT);
8312 }
31f18b77
FG
8313 else if (m->newest_map && m->newest_map > last) {
8314 dout(10) << " msg say newest map is " << m->newest_map
8315 << ", requesting more" << dendl;
8316 osdmap_subscribe(osdmap->get_epoch()+1, false);
8317 }
7c673cae
FG
8318 else if (is_preboot()) {
8319 if (m->get_source().is_mon())
8320 _preboot(m->oldest_map, m->newest_map);
8321 else
8322 start_boot();
8323 }
8324 else if (do_restart)
8325 start_boot();
8326
8327}
8328
11fdf7f2 8329void OSD::check_osdmap_features()
7c673cae
FG
8330{
8331 // adjust required feature bits?
8332
8333 // we have to be a bit careful here, because we are accessing the
8334 // Policy structures without taking any lock. in particular, only
8335 // modify integer values that can safely be read by a racing CPU.
8336 // since we are only accessing existing Policy structures at their
8337 // current memory location, and setting or clearing bits in integer
8338 // fields, and we are the only writer, this is not a problem.
8339
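  // Illustrative sketch (values are hypothetical, not taken from a real map):
  // with mask = 0x0f and features = 0x05, the update below becomes
  //   p.features_required = (p.features_required & ~0x0f) | 0x05;
  // i.e. only the masked bits of one integer field are rewritten, which is
  // what makes the unlocked update safe for concurrent readers.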
9f95a23c 8340 const auto osdmap = get_osdmap();
7c673cae
FG
8341 {
8342 Messenger::Policy p = client_messenger->get_default_policy();
8343 uint64_t mask;
8344 uint64_t features = osdmap->get_features(entity_name_t::TYPE_CLIENT, &mask);
8345 if ((p.features_required & mask) != features) {
8346 dout(0) << "crush map has features " << features
8347 << ", adjusting msgr requires for clients" << dendl;
8348 p.features_required = (p.features_required & ~mask) | features;
8349 client_messenger->set_default_policy(p);
8350 }
8351 }
8352 {
8353 Messenger::Policy p = client_messenger->get_policy(entity_name_t::TYPE_MON);
8354 uint64_t mask;
8355 uint64_t features = osdmap->get_features(entity_name_t::TYPE_MON, &mask);
8356 if ((p.features_required & mask) != features) {
8357 dout(0) << "crush map has features " << features
8358 << " was " << p.features_required
8359 << ", adjusting msgr requires for mons" << dendl;
8360 p.features_required = (p.features_required & ~mask) | features;
8361 client_messenger->set_policy(entity_name_t::TYPE_MON, p);
8362 }
8363 }
8364 {
8365 Messenger::Policy p = cluster_messenger->get_policy(entity_name_t::TYPE_OSD);
8366 uint64_t mask;
8367 uint64_t features = osdmap->get_features(entity_name_t::TYPE_OSD, &mask);
8368
8369 if ((p.features_required & mask) != features) {
8370 dout(0) << "crush map has features " << features
8371 << ", adjusting msgr requires for osds" << dendl;
8372 p.features_required = (p.features_required & ~mask) | features;
8373 cluster_messenger->set_policy(entity_name_t::TYPE_OSD, p);
8374 }
8375
11fdf7f2 8376 if (!superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_SHARDS)) {
7c673cae
FG
8377 dout(0) << __func__ << " enabling on-disk ERASURE CODES compat feature" << dendl;
8378 superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
8379 ObjectStore::Transaction t;
8380 write_superblock(t);
11fdf7f2
TL
8381 int err = store->queue_transaction(service.meta_ch, std::move(t), NULL);
8382 ceph_assert(err == 0);
7c673cae
FG
8383 }
8384 }
11fdf7f2 8385
9f95a23c
TL
8386 if (osdmap->require_osd_release < ceph_release_t::nautilus) {
8387 hb_front_server_messenger->set_require_authorizer(false);
8388 hb_back_server_messenger->set_require_authorizer(false);
8389 } else {
8390 hb_front_server_messenger->set_require_authorizer(true);
8391 hb_back_server_messenger->set_require_authorizer(true);
11fdf7f2
TL
8392 }
8393
8394 if (osdmap->require_osd_release != last_require_osd_release) {
8395 dout(1) << __func__ << " require_osd_release " << last_require_osd_release
8396 << " -> " << to_string(osdmap->require_osd_release) << dendl;
8397 store->write_meta("require_osd_release",
8398 stringify((int)osdmap->require_osd_release));
8399 last_require_osd_release = osdmap->require_osd_release;
8400 }
7c673cae
FG
8401}
8402
11fdf7f2
TL
8403struct C_FinishSplits : public Context {
8404 OSD *osd;
8405 set<PGRef> pgs;
8406 C_FinishSplits(OSD *osd, const set<PGRef> &in)
8407 : osd(osd), pgs(in) {}
8408 void finish(int r) override {
8409 osd->_finish_splits(pgs);
8410 }
8411};
8412
8413void OSD::_finish_splits(set<PGRef>& pgs)
7c673cae 8414{
11fdf7f2
TL
8415 dout(10) << __func__ << " " << pgs << dendl;
8416 if (is_stopping())
8417 return;
11fdf7f2
TL
8418 for (set<PGRef>::iterator i = pgs.begin();
8419 i != pgs.end();
8420 ++i) {
8421 PG *pg = i->get();
7c673cae 8422
9f95a23c 8423 PeeringCtx rctx = create_context();
11fdf7f2
TL
8424 pg->lock();
8425 dout(10) << __func__ << " " << *pg << dendl;
8426 epoch_t e = pg->get_osdmap_epoch();
9f95a23c 8427 pg->handle_initialize(rctx);
11fdf7f2 8428 pg->queue_null(e, e);
9f95a23c 8429 dispatch_context(rctx, pg, service.get_osdmap());
11fdf7f2 8430 pg->unlock();
7c673cae 8431
11fdf7f2
TL
8432 unsigned shard_index = pg->pg_id.hash_to_shard(num_shards);
8433 shards[shard_index]->register_and_wake_split_child(pg);
7c673cae 8434 }
11fdf7f2
TL
8435};
8436
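// add_merge_waiter: record a merge-source PG that is ready to be folded into
// `target` as of `nextmap`.  Returns true once all `need` sources have been
// collected, at which point the caller kicks the merge target with a null
// peering event (see advance_pg below).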
8437bool OSD::add_merge_waiter(OSDMapRef nextmap, spg_t target, PGRef src,
8438 unsigned need)
8439{
8440 std::lock_guard l(merge_lock);
8441 auto& p = merge_waiters[nextmap->get_epoch()][target];
8442 p[src->pg_id] = src;
8443 dout(10) << __func__ << " added merge_waiter " << src->pg_id
8444 << " for " << target << ", have " << p.size() << "/" << need
8445 << dendl;
8446 return p.size() == need;
8447}
8448
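// advance_pg: walk a PG forward one OSDMap epoch at a time up to osd_epoch,
// feeding each map through handle_advance_map().  pg_num changes are handled
// along the way: a merge source is detached from its shard and parked as a
// merge waiter (returning false, since the PG is consumed), a merge target
// absorbs its sources via merge_from() once every waiter has arrived, and
// splits collect their children in new_pgs so C_FinishSplits can register
// them once the transaction is applied.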
8449bool OSD::advance_pg(
8450 epoch_t osd_epoch,
8451 PG *pg,
8452 ThreadPool::TPHandle &handle,
9f95a23c 8453 PeeringCtx &rctx)
11fdf7f2
TL
8454{
8455 if (osd_epoch <= pg->get_osdmap_epoch()) {
8456 return true;
8457 }
8458 ceph_assert(pg->is_locked());
8459 OSDMapRef lastmap = pg->get_osdmap();
8460 ceph_assert(lastmap->get_epoch() < osd_epoch);
8461 set<PGRef> new_pgs; // any split children
8462 bool ret = true;
8463
8464 unsigned old_pg_num = lastmap->have_pg_pool(pg->pg_id.pool()) ?
8465 lastmap->get_pg_num(pg->pg_id.pool()) : 0;
8466 for (epoch_t next_epoch = pg->get_osdmap_epoch() + 1;
8467 next_epoch <= osd_epoch;
7c673cae
FG
8468 ++next_epoch) {
8469 OSDMapRef nextmap = service.try_get_map(next_epoch);
8470 if (!nextmap) {
8471 dout(20) << __func__ << " missing map " << next_epoch << dendl;
7c673cae
FG
8472 continue;
8473 }
8474
11fdf7f2
TL
8475 unsigned new_pg_num =
8476 (old_pg_num && nextmap->have_pg_pool(pg->pg_id.pool())) ?
8477 nextmap->get_pg_num(pg->pg_id.pool()) : 0;
8478 if (old_pg_num && new_pg_num && old_pg_num != new_pg_num) {
8479 // check for merge
8480 if (nextmap->have_pg_pool(pg->pg_id.pool())) {
8481 spg_t parent;
8482 if (pg->pg_id.is_merge_source(
8483 old_pg_num,
8484 new_pg_num,
8485 &parent)) {
8486 // we are merge source
8487 PGRef spg = pg; // carry a ref
8488 dout(1) << __func__ << " " << pg->pg_id
8489 << " is merge source, target is " << parent
8490 << dendl;
8491 pg->write_if_dirty(rctx);
9f95a23c
TL
8492 if (!new_pgs.empty()) {
8493 rctx.transaction.register_on_applied(new C_FinishSplits(this,
8494 new_pgs));
8495 new_pgs.clear();
8496 }
8497 dispatch_context(rctx, pg, pg->get_osdmap(), &handle);
11fdf7f2 8498 pg->ch->flush();
eafe8130
TL
8499 // release backoffs explicitly, since the on_shutdown path
8500 // aggressively tears down backoff state.
8501 if (pg->is_primary()) {
8502 pg->release_pg_backoffs();
8503 }
11fdf7f2
TL
8504 pg->on_shutdown();
8505 OSDShard *sdata = pg->osd_shard;
8506 {
8507 std::lock_guard l(sdata->shard_lock);
8508 if (pg->pg_slot) {
8509 sdata->_detach_pg(pg->pg_slot);
8510 // update pg count now since we might not get an osdmap
8511 // any time soon.
8512 if (pg->is_primary())
8513 logger->dec(l_osd_pg_primary);
9f95a23c
TL
8514 else if (pg->is_nonprimary())
8515 logger->dec(l_osd_pg_replica); // misnomer
11fdf7f2
TL
8516 else
8517 logger->dec(l_osd_pg_stray);
8518 }
8519 }
8520 pg->unlock();
8521
8522 set<spg_t> children;
8523 parent.is_split(new_pg_num, old_pg_num, &children);
8524 if (add_merge_waiter(nextmap, parent, pg, children.size())) {
8525 enqueue_peering_evt(
8526 parent,
8527 PGPeeringEventRef(
8528 std::make_shared<PGPeeringEvent>(
8529 nextmap->get_epoch(),
8530 nextmap->get_epoch(),
8531 NullEvt())));
8532 }
8533 ret = false;
8534 goto out;
8535 } else if (pg->pg_id.is_merge_target(old_pg_num, new_pg_num)) {
8536 // we are merge target
8537 set<spg_t> children;
8538 pg->pg_id.is_split(new_pg_num, old_pg_num, &children);
8539 dout(20) << __func__ << " " << pg->pg_id
8540 << " is merge target, sources are " << children
8541 << dendl;
8542 map<spg_t,PGRef> sources;
8543 {
8544 std::lock_guard l(merge_lock);
8545 auto& s = merge_waiters[nextmap->get_epoch()][pg->pg_id];
8546 unsigned need = children.size();
8547 dout(20) << __func__ << " have " << s.size() << "/"
8548 << need << dendl;
8549 if (s.size() == need) {
8550 sources.swap(s);
8551 merge_waiters[nextmap->get_epoch()].erase(pg->pg_id);
8552 if (merge_waiters[nextmap->get_epoch()].empty()) {
8553 merge_waiters.erase(nextmap->get_epoch());
8554 }
8555 }
8556 }
8557 if (!sources.empty()) {
8558 unsigned new_pg_num = nextmap->get_pg_num(pg->pg_id.pool());
8559 unsigned split_bits = pg->pg_id.get_split_bits(new_pg_num);
8560 dout(1) << __func__ << " merging " << pg->pg_id << dendl;
8561 pg->merge_from(
8562 sources, rctx, split_bits,
8563 nextmap->get_pg_pool(
8564 pg->pg_id.pool())->last_pg_merge_meta);
8565 pg->pg_slot->waiting_for_merge_epoch = 0;
8566 } else {
8567 dout(20) << __func__ << " not ready to merge yet" << dendl;
8568 pg->write_if_dirty(rctx);
9f95a23c
TL
8569 if (!new_pgs.empty()) {
8570 rctx.transaction.register_on_applied(new C_FinishSplits(this,
8571 new_pgs));
8572 new_pgs.clear();
8573 }
8574 dispatch_context(rctx, pg, pg->get_osdmap(), &handle);
11fdf7f2
TL
8575 pg->unlock();
8576 // kick source(s) to get them ready
8577 for (auto& i : children) {
8578 dout(20) << __func__ << " kicking source " << i << dendl;
8579 enqueue_peering_evt(
8580 i,
8581 PGPeeringEventRef(
8582 std::make_shared<PGPeeringEvent>(
8583 nextmap->get_epoch(),
8584 nextmap->get_epoch(),
8585 NullEvt())));
8586 }
8587 ret = false;
8588 goto out;
8589 }
8590 }
8591 }
8592 }
8593
7c673cae
FG
8594 vector<int> newup, newacting;
8595 int up_primary, acting_primary;
8596 nextmap->pg_to_up_acting_osds(
11fdf7f2 8597 pg->pg_id.pgid,
7c673cae
FG
8598 &newup, &up_primary,
8599 &newacting, &acting_primary);
8600 pg->handle_advance_map(
8601 nextmap, lastmap, newup, up_primary,
8602 newacting, acting_primary, rctx);
8603
494da23a
TL
8604 auto oldpool = lastmap->get_pools().find(pg->pg_id.pool());
8605 auto newpool = nextmap->get_pools().find(pg->pg_id.pool());
8606 if (oldpool != lastmap->get_pools().end()
8607 && newpool != nextmap->get_pools().end()) {
8608 dout(20) << __func__
8609 << " new pool opts " << newpool->second.opts
8610 << " old pool opts " << oldpool->second.opts
8611 << dendl;
8612
8613 double old_min_interval = 0, new_min_interval = 0;
8614 oldpool->second.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &old_min_interval);
8615 newpool->second.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &new_min_interval);
8616
8617 double old_max_interval = 0, new_max_interval = 0;
8618 oldpool->second.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &old_max_interval);
8619 newpool->second.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &new_max_interval);
8620
8621 // Assume that if an interval is changed from set to unset or vice versa the actual config
8622 // is different. Keep it simple even if it is possible to call resched_all_scrubs()
8623 // unnecessarily.
8624 if (old_min_interval != new_min_interval || old_max_interval != new_max_interval) {
8625 pg->on_info_history_change();
8626 }
8627 }
8628
11fdf7f2
TL
8629 if (new_pg_num && old_pg_num != new_pg_num) {
8630 // check for split
8631 set<spg_t> children;
8632 if (pg->pg_id.is_split(
8633 old_pg_num,
8634 new_pg_num,
8635 &children)) {
8636 split_pgs(
8637 pg, children, &new_pgs, lastmap, nextmap,
8638 rctx);
8639 }
7c673cae
FG
8640 }
8641
8642 lastmap = nextmap;
11fdf7f2 8643 old_pg_num = new_pg_num;
7c673cae
FG
8644 handle.reset_tp_timeout();
8645 }
7c673cae 8646 pg->handle_activate_map(rctx);
11fdf7f2
TL
8647
8648 ret = true;
8649 out:
8650 if (!new_pgs.empty()) {
9f95a23c 8651 rctx.transaction.register_on_applied(new C_FinishSplits(this, new_pgs));
7c673cae 8652 }
11fdf7f2 8653 return ret;
7c673cae
FG
8654}
8655
8656void OSD::consume_map()
8657{
9f95a23c
TL
8658 ceph_assert(ceph_mutex_is_locked(osd_lock));
8659 auto osdmap = get_osdmap();
7c673cae
FG
8660 dout(7) << "consume_map version " << osdmap->get_epoch() << dendl;
8661
3efd9988
FG
8662 /** make sure the cluster is speaking in SORTBITWISE, because we don't
8663 * speak the older sorting version any more. Be careful not to force
8664 * a shutdown if we are merely processing old maps, though.
8665 */
8666 if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE) && is_active()) {
8667 derr << __func__ << " SORTBITWISE flag is not set" << dendl;
8668 ceph_abort();
8669 }
8670
11fdf7f2
TL
8671 service.pre_publish_map(osdmap);
8672 service.await_reserved_maps();
8673 service.publish_map(osdmap);
7c673cae 8674
11fdf7f2
TL
8675 // prime splits and merges
8676 set<pair<spg_t,epoch_t>> newly_split; // splits, and when
8677 set<pair<spg_t,epoch_t>> merge_pgs; // merge participants, and when
8678 for (auto& shard : shards) {
8679 shard->identify_splits_and_merges(osdmap, &newly_split, &merge_pgs);
8680 }
8681 if (!newly_split.empty()) {
8682 for (auto& shard : shards) {
8683 shard->prime_splits(osdmap, &newly_split);
8684 }
8685 ceph_assert(newly_split.empty());
8686 }
7c673cae 8687
11fdf7f2
TL
8688 // prune sent_ready_to_merge
8689 service.prune_sent_ready_to_merge(osdmap);
7c673cae 8690
11fdf7f2
TL
8691 // FIXME, maybe: We could race against an incoming peering message
8692 // that instantiates a merge PG after identify_merges() below and
8693 // never set up its peer to complete the merge. An OSD restart
8694 // would clear it up. This is a hard race to resolve,
8695 // extraordinarily rare (we only merge PGs that are stable and
8696 // clean, so it'd have to be an imported PG to an OSD with a
8697 // slightly stale OSDMap...), so I'm ignoring it for now. We plan to
8698 // replace all of this with seastar-based code soon anyway.
8699 if (!merge_pgs.empty()) {
8700 // mark the pgs we already have, or create new and empty merge
8701 // participants for those we are missing. do this all under the
8702 // shard lock so we don't have to worry about racing pg creates
8703 // via _process.
8704 for (auto& shard : shards) {
8705 shard->prime_merges(osdmap, &merge_pgs);
7c673cae 8706 }
11fdf7f2
TL
8707 ceph_assert(merge_pgs.empty());
8708 }
8709
8710 service.prune_pg_created();
8711
8712 unsigned pushes_to_free = 0;
8713 for (auto& shard : shards) {
8714 shard->consume_map(osdmap, &pushes_to_free);
8715 }
8716
8717 vector<spg_t> pgids;
8718 _get_pgids(&pgids);
8719
8720 // count (FIXME, probably during seastar rewrite)
8721 int num_pg_primary = 0, num_pg_replica = 0, num_pg_stray = 0;
8722 vector<PGRef> pgs;
8723 _get_pgs(&pgs);
8724 for (auto& pg : pgs) {
8725 // FIXME (probably during seastar rewrite): this is lockless and
8726 // racy, but we don't want to take pg lock here.
8727 if (pg->is_primary())
8728 num_pg_primary++;
9f95a23c
TL
8729 else if (pg->is_nonprimary())
8730 num_pg_replica++; // misnomer
11fdf7f2
TL
8731 else
8732 num_pg_stray++;
8733 }
3efd9988 8734
11fdf7f2
TL
8735 {
8736 // FIXME (as part of seastar rewrite): move to OSDShard
8737 std::lock_guard l(pending_creates_lock);
8738 for (auto pg = pending_creates_from_osd.begin();
8739 pg != pending_creates_from_osd.end();) {
9f95a23c 8740 if (osdmap->get_pg_acting_role(pg->first, whoami) < 0) {
11fdf7f2
TL
8741 dout(10) << __func__ << " pg " << pg->first << " doesn't map here, "
8742 << "discarding pending_create_from_osd" << dendl;
3efd9988
FG
8743 pg = pending_creates_from_osd.erase(pg);
8744 } else {
8745 ++pg;
8746 }
8747 }
7c673cae
FG
8748 }
8749
7c673cae
FG
8750 service.maybe_inject_dispatch_delay();
8751
8752 dispatch_sessions_waiting_on_map();
8753
8754 service.maybe_inject_dispatch_delay();
8755
11fdf7f2 8756 service.release_reserved_pushes(pushes_to_free);
7c673cae 8757
11fdf7f2
TL
8758 // queue null events to push maps down to individual PGs
8759 for (auto pgid : pgids) {
8760 enqueue_peering_evt(
8761 pgid,
8762 PGPeeringEventRef(
8763 std::make_shared<PGPeeringEvent>(
8764 osdmap->get_epoch(),
8765 osdmap->get_epoch(),
8766 NullEvt())));
7c673cae 8767 }
11fdf7f2 8768 logger->set(l_osd_pg, pgids.size());
7c673cae
FG
8769 logger->set(l_osd_pg_primary, num_pg_primary);
8770 logger->set(l_osd_pg_replica, num_pg_replica);
8771 logger->set(l_osd_pg_stray, num_pg_stray);
8772}
8773
8774void OSD::activate_map()
8775{
9f95a23c
TL
8776 ceph_assert(ceph_mutex_is_locked(osd_lock));
8777 auto osdmap = get_osdmap();
7c673cae
FG
8778
8779 dout(7) << "activate_map version " << osdmap->get_epoch() << dendl;
8780
7c673cae
FG
8781 // norecover?
8782 if (osdmap->test_flag(CEPH_OSDMAP_NORECOVER)) {
8783 if (!service.recovery_is_paused()) {
8784 dout(1) << "pausing recovery (NORECOVER flag set)" << dendl;
8785 service.pause_recovery();
8786 }
8787 } else {
8788 if (service.recovery_is_paused()) {
8789 dout(1) << "unpausing recovery (NORECOVER flag unset)" << dendl;
8790 service.unpause_recovery();
8791 }
8792 }
8793
8794 service.activate_map();
8795
8796 // process waiters
8797 take_waiters(waiting_for_osdmap);
8798}
8799
8800bool OSD::require_mon_peer(const Message *m)
8801{
8802 if (!m->get_connection()->peer_is_mon()) {
8803 dout(0) << "require_mon_peer received from non-mon "
8804 << m->get_connection()->get_peer_addr()
8805 << " " << *m << dendl;
8806 return false;
8807 }
8808 return true;
8809}
8810
8811bool OSD::require_mon_or_mgr_peer(const Message *m)
8812{
8813 if (!m->get_connection()->peer_is_mon() &&
8814 !m->get_connection()->peer_is_mgr()) {
8815 dout(0) << "require_mon_or_mgr_peer received from non-mon, non-mgr "
8816 << m->get_connection()->get_peer_addr()
8817 << " " << *m << dendl;
8818 return false;
8819 }
8820 return true;
8821}
8822
8823bool OSD::require_osd_peer(const Message *m)
8824{
8825 if (!m->get_connection()->peer_is_osd()) {
8826 dout(0) << "require_osd_peer received from non-osd "
8827 << m->get_connection()->get_peer_addr()
8828 << " " << *m << dendl;
8829 return false;
8830 }
8831 return true;
8832}
8833
8834bool OSD::require_self_aliveness(const Message *m, epoch_t epoch)
8835{
8836 epoch_t up_epoch = service.get_up_epoch();
8837 if (epoch < up_epoch) {
8838 dout(7) << "from pre-up epoch " << epoch << " < " << up_epoch << dendl;
8839 return false;
8840 }
8841
8842 if (!is_active()) {
8843 dout(7) << "still in boot state, dropping message " << *m << dendl;
8844 return false;
8845 }
8846
8847 return true;
8848}
8849
9f95a23c 8850bool OSD::require_same_peer_instance(const Message *m, const OSDMapRef& map,
7c673cae
FG
8851 bool is_fast_dispatch)
8852{
8853 int from = m->get_source().num();
8854
8855 if (map->is_down(from) ||
11fdf7f2 8856 (map->get_cluster_addrs(from) != m->get_source_addrs())) {
7c673cae
FG
8857 dout(5) << "from dead osd." << from << ", marking down, "
8858 << " msg was " << m->get_source_inst().addr
11fdf7f2
TL
8859 << " expected "
8860 << (map->is_up(from) ?
8861 map->get_cluster_addrs(from) : entity_addrvec_t())
7c673cae
FG
8862 << dendl;
8863 ConnectionRef con = m->get_connection();
8864 con->mark_down();
9f95a23c 8865 if (auto s = ceph::ref_cast<Session>(con->get_priv()); s) {
7c673cae 8866 if (!is_fast_dispatch)
9f95a23c 8867 s->session_dispatch_lock.lock();
7c673cae 8868 clear_session_waiting_on_map(s);
11fdf7f2
TL
8869 con->set_priv(nullptr); // break ref <-> session cycle, if any
8870 s->con.reset();
7c673cae 8871 if (!is_fast_dispatch)
9f95a23c 8872 s->session_dispatch_lock.unlock();
7c673cae
FG
8873 }
8874 return false;
8875 }
8876 return true;
8877}
8878
8879
8880/*
8881 * require that we have same (or newer) map, and that
8882 * the source is the pg primary.
8883 */
8884bool OSD::require_same_or_newer_map(OpRequestRef& op, epoch_t epoch,
8885 bool is_fast_dispatch)
8886{
8887 const Message *m = op->get_req();
9f95a23c 8888 const auto osdmap = get_osdmap();
7c673cae
FG
8889 dout(15) << "require_same_or_newer_map " << epoch
8890 << " (i am " << osdmap->get_epoch() << ") " << m << dendl;
8891
9f95a23c 8892 ceph_assert(ceph_mutex_is_locked(osd_lock));
7c673cae
FG
8893
8894 // do they have a newer map?
8895 if (epoch > osdmap->get_epoch()) {
8896 dout(7) << "waiting for newer map epoch " << epoch
8897 << " > my " << osdmap->get_epoch() << " with " << m << dendl;
8898 wait_for_new_map(op);
8899 return false;
8900 }
8901
8902 if (!require_self_aliveness(op->get_req(), epoch)) {
8903 return false;
8904 }
8905
8906 // ok, our map is same or newer.. do they still exist?
8907 if (m->get_connection()->get_messenger() == cluster_messenger &&
8908 !require_same_peer_instance(op->get_req(), osdmap, is_fast_dispatch)) {
8909 return false;
8910 }
8911
8912 return true;
8913}
8914
8915
8916
8917
8918
8919// ----------------------------------------
8920// pg creation
8921
8922void OSD::split_pgs(
8923 PG *parent,
31f18b77 8924 const set<spg_t> &childpgids, set<PGRef> *out_pgs,
7c673cae
FG
8925 OSDMapRef curmap,
8926 OSDMapRef nextmap,
9f95a23c 8927 PeeringCtx &rctx)
7c673cae 8928{
11fdf7f2
TL
8929 unsigned pg_num = nextmap->get_pg_num(parent->pg_id.pool());
8930 parent->update_snap_mapper_bits(parent->get_pgid().get_split_bits(pg_num));
7c673cae 8931
11fdf7f2
TL
8932 vector<object_stat_sum_t> updated_stats;
8933 parent->start_split_stats(childpgids, &updated_stats);
7c673cae
FG
8934
8935 vector<object_stat_sum_t>::iterator stat_iter = updated_stats.begin();
8936 for (set<spg_t>::const_iterator i = childpgids.begin();
8937 i != childpgids.end();
8938 ++i, ++stat_iter) {
11fdf7f2
TL
8939 ceph_assert(stat_iter != updated_stats.end());
8940 dout(10) << __func__ << " splitting " << *parent << " into " << *i << dendl;
7c673cae
FG
8941 PG* child = _make_pg(nextmap, *i);
8942 child->lock(true);
8943 out_pgs->insert(child);
11fdf7f2 8944 child->ch = store->create_new_collection(child->coll);
7c673cae 8945
11fdf7f2
TL
8946 {
8947 uint32_t shard_index = i->hash_to_shard(shards.size());
8948 assert(NULL != shards[shard_index]);
8949 store->set_collection_commit_queue(child->coll, &(shards[shard_index]->context_queue));
8950 }
7c673cae 8951
11fdf7f2
TL
8952 unsigned split_bits = i->get_split_bits(pg_num);
8953 dout(10) << " pg_num is " << pg_num
8954 << ", m_seed " << i->ps()
8955 << ", split_bits is " << split_bits << dendl;
7c673cae
FG
8956 parent->split_colls(
8957 *i,
8958 split_bits,
8959 i->ps(),
11fdf7f2 8960 &child->get_pool().info,
9f95a23c 8961 rctx.transaction);
7c673cae
FG
8962 parent->split_into(
8963 i->pgid,
8964 child,
8965 split_bits);
7c673cae 8966
92f5a8d4
TL
8967 child->init_collection_pool_opts();
8968
9f95a23c 8969 child->finish_split_stats(*stat_iter, rctx.transaction);
7c673cae
FG
8970 child->unlock();
8971 }
11fdf7f2 8972 ceph_assert(stat_iter != updated_stats.end());
9f95a23c 8973 parent->finish_split_stats(*stat_iter, rctx.transaction);
7c673cae
FG
8974}
8975
8976/*
8977 * holding osd_lock
8978 */
8979void OSD::handle_pg_create(OpRequestRef op)
8980{
9f95a23c
TL
8981 // NOTE: this can be removed in P release (mimic is the last version to
8982 // send MOSDPGCreate messages).
8983
8984 auto m = op->get_req<MOSDPGCreate>();
11fdf7f2 8985 ceph_assert(m->get_type() == MSG_OSD_PG_CREATE);
7c673cae
FG
8986
8987 dout(10) << "handle_pg_create " << *m << dendl;
8988
8989 if (!require_mon_peer(op->get_req())) {
8990 return;
8991 }
8992
8993 if (!require_same_or_newer_map(op, m->epoch, false))
8994 return;
8995
8996 op->mark_started();
8997
9f95a23c 8998 const auto osdmap = get_osdmap();
7c673cae
FG
8999 map<pg_t,utime_t>::const_iterator ci = m->ctimes.begin();
9000 for (map<pg_t,pg_create_t>::const_iterator p = m->mkpg.begin();
9001 p != m->mkpg.end();
9002 ++p, ++ci) {
11fdf7f2 9003 ceph_assert(ci != m->ctimes.end() && ci->first == p->first);
7c673cae
FG
9004 epoch_t created = p->second.created;
9005 if (p->second.split_bits) // Skip split pgs
9006 continue;
9007 pg_t on = p->first;
9008
7c673cae
FG
9009 if (!osdmap->have_pg_pool(on.pool())) {
9010 dout(20) << "ignoring pg on deleted pool " << on << dendl;
9011 continue;
9012 }
9013
9014 dout(20) << "mkpg " << on << " e" << created << "@" << ci->second << dendl;
9015
9f95a23c
TL
9016 spg_t pgid;
9017 bool mapped = osdmap->get_primary_shard(on, &pgid);
9018 ceph_assert(mapped);
9019
7c673cae
FG
9020 // is it still ours?
9021 vector<int> up, acting;
9022 int up_primary = -1;
9023 int acting_primary = -1;
9024 osdmap->pg_to_up_acting_osds(on, &up, &up_primary, &acting, &acting_primary);
9f95a23c 9025 int role = osdmap->calc_pg_role(pg_shard_t(whoami, pgid.shard), acting);
7c673cae
FG
9026
9027 if (acting_primary != whoami) {
9028 dout(10) << "mkpg " << on << " not acting_primary (" << acting_primary
9029 << "), my role=" << role << ", skipping" << dendl;
9030 continue;
9031 }
9032
7c673cae 9033
11fdf7f2 9034 PastIntervals pi;
7c673cae
FG
9035 pg_history_t history;
9036 build_initial_pg_history(pgid, created, ci->second, &history, &pi);
9037
11fdf7f2
TL
9038 // The mon won't resend unless the primary changed, so we ignore
9039 // same_interval_since. We'll pass this history with the current
9040 // epoch as the event.
7c673cae
FG
9041 if (history.same_primary_since > m->epoch) {
9042 dout(10) << __func__ << ": got obsolete pg create on pgid "
9043 << pgid << " from epoch " << m->epoch
9044 << ", primary changed in " << history.same_primary_since
9045 << dendl;
9046 continue;
9047 }
11fdf7f2
TL
9048 enqueue_peering_evt(
9049 pgid,
9050 PGPeeringEventRef(
9051 std::make_shared<PGPeeringEvent>(
9052 osdmap->get_epoch(),
9053 osdmap->get_epoch(),
9054 NullEvt(),
9055 true,
9056 new PGCreateInfo(
9057 pgid,
9058 osdmap->get_epoch(),
9059 history,
9060 pi,
9061 true)
9062 )));
7c673cae 9063 }
7c673cae 9064
3efd9988 9065 {
11fdf7f2 9066 std::lock_guard l(pending_creates_lock);
3efd9988
FG
9067 if (pending_creates_from_mon == 0) {
9068 last_pg_create_epoch = m->epoch;
9069 }
9070 }
11fdf7f2 9071
7c673cae
FG
9072 maybe_update_heartbeat_peers();
9073}
9074
9075
9076// ----------------------------------------
9077// peering and recovery
9078
9f95a23c 9079PeeringCtx OSD::create_context()
7c673cae 9080{
9f95a23c 9081 return PeeringCtx(get_osdmap()->require_osd_release);
7c673cae
FG
9082}
9083
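// dispatch_context: flush a PeeringCtx.  Accumulated peering messages are
// sent to every destination OSD that is still up in curmap (sharing the map
// with the peer first if needed), and the accumulated transaction is queued
// against the PG's collection handle.  Nothing is sent while this OSD is
// itself down in the map or not yet active.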
9f95a23c 9084void OSD::dispatch_context(PeeringCtx &ctx, PG *pg, OSDMapRef curmap,
7c673cae
FG
9085 ThreadPool::TPHandle *handle)
9086{
11fdf7f2
TL
9087 if (!service.get_osdmap()->is_up(whoami)) {
9088 dout(20) << __func__ << " not up in osdmap" << dendl;
9089 } else if (!is_active()) {
9090 dout(20) << __func__ << " not active" << dendl;
9091 } else {
9f95a23c
TL
9092 for (auto& [osd, ls] : ctx.message_map) {
9093 if (!curmap->is_up(osd)) {
9094 dout(20) << __func__ << " skipping down osd." << osd << dendl;
9095 continue;
9096 }
9097 ConnectionRef con = service.get_con_osd_cluster(
9098 osd, curmap->get_epoch());
9099 if (!con) {
9100 dout(20) << __func__ << " skipping osd." << osd << " (NULL con)"
9101 << dendl;
9102 continue;
9103 }
9104 service.maybe_share_map(con.get(), curmap);
9105 for (auto m : ls) {
9106 con->send_message2(m);
9107 }
9108 ls.clear();
9109 }
7c673cae 9110 }
9f95a23c 9111 if ((!ctx.transaction.empty() || ctx.transaction.has_contexts()) && pg) {
7c673cae 9112 int tr = store->queue_transaction(
11fdf7f2 9113 pg->ch,
9f95a23c 9114 std::move(ctx.transaction), TrackedOpRef(),
7c673cae 9115 handle);
11fdf7f2 9116 ceph_assert(tr == 0);
7c673cae 9117 }
7c673cae
FG
9118}
9119
11fdf7f2 9120void OSD::handle_fast_pg_create(MOSDPGCreate2 *m)
7c673cae 9121{
11fdf7f2
TL
9122 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9123 if (!require_mon_peer(m)) {
9124 m->put();
7c673cae 9125 return;
7c673cae 9126 }
11fdf7f2
TL
9127 for (auto& p : m->pgs) {
9128 spg_t pgid = p.first;
9129 epoch_t created = p.second.first;
9130 utime_t created_stamp = p.second.second;
9f95a23c
TL
9131 auto q = m->pg_extra.find(pgid);
9132 if (q == m->pg_extra.end()) {
9133 dout(20) << __func__ << " " << pgid << " e" << created
9134 << "@" << created_stamp
9135 << " (no history or past_intervals)" << dendl;
9136 // pre-octopus ... no pg history. this can be removed in Q release.
9137 enqueue_peering_evt(
9138 pgid,
9139 PGPeeringEventRef(
9140 std::make_shared<PGPeeringEvent>(
9141 m->epoch,
9142 m->epoch,
9143 NullEvt(),
9144 true,
9145 new PGCreateInfo(
9146 pgid,
9147 created,
9148 pg_history_t(created, created_stamp),
9149 PastIntervals(),
9150 true)
9151 )));
9152 } else {
9153 dout(20) << __func__ << " " << pgid << " e" << created
9154 << "@" << created_stamp
9155 << " history " << q->second.first
9156 << " pi " << q->second.second << dendl;
9157 if (!q->second.second.empty() &&
9158 m->epoch < q->second.second.get_bounds().second) {
9159 clog->error() << "got pg_create on " << pgid << " epoch " << m->epoch
9160 << " and unmatched past_intervals " << q->second.second
9161 << " (history " << q->second.first << ")";
9162 } else {
9163 enqueue_peering_evt(
9164 pgid,
9165 PGPeeringEventRef(
9166 std::make_shared<PGPeeringEvent>(
9167 m->epoch,
9168 m->epoch,
9169 NullEvt(),
9170 true,
9171 new PGCreateInfo(
9172 pgid,
9173 m->epoch,
9174 q->second.first,
9175 q->second.second,
9176 true)
9177 )));
9178 }
9179 }
11fdf7f2 9180 }
7c673cae 9181
11fdf7f2
TL
9182 {
9183 std::lock_guard l(pending_creates_lock);
9184 if (pending_creates_from_mon == 0) {
9185 last_pg_create_epoch = m->epoch;
9186 }
7c673cae
FG
9187 }
9188
11fdf7f2 9189 m->put();
7c673cae
FG
9190}
9191
11fdf7f2 9192void OSD::handle_fast_pg_query(MOSDPGQuery *m)
7c673cae 9193{
11fdf7f2
TL
9194 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9195 if (!require_osd_peer(m)) {
9196 m->put();
7c673cae 9197 return;
11fdf7f2 9198 }
7c673cae 9199 int from = m->get_source().num();
11fdf7f2
TL
9200 for (auto& p : m->pg_list) {
9201 enqueue_peering_evt(
9202 p.first,
9203 PGPeeringEventRef(
9204 std::make_shared<PGPeeringEvent>(
9205 p.second.epoch_sent, p.second.epoch_sent,
9206 MQuery(
9207 p.first,
9208 pg_shard_t(from, p.second.from),
9209 p.second,
9210 p.second.epoch_sent),
9211 false))
7c673cae
FG
9212 );
9213 }
11fdf7f2 9214 m->put();
7c673cae
FG
9215}
9216
11fdf7f2 9217void OSD::handle_fast_pg_notify(MOSDPGNotify* m)
7c673cae 9218{
11fdf7f2
TL
9219 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9220 if (!require_osd_peer(m)) {
9221 m->put();
7c673cae
FG
9222 return;
9223 }
11fdf7f2
TL
9224 int from = m->get_source().num();
9225 for (auto& p : m->get_pg_list()) {
9f95a23c 9226 spg_t pgid(p.info.pgid.pgid, p.to);
11fdf7f2
TL
9227 enqueue_peering_evt(
9228 pgid,
9229 PGPeeringEventRef(
9230 std::make_shared<PGPeeringEvent>(
9f95a23c
TL
9231 p.epoch_sent,
9232 p.query_epoch,
11fdf7f2 9233 MNotifyRec(
9f95a23c
TL
9234 pgid, pg_shard_t(from, p.from),
9235 p,
9236 m->get_connection()->get_features()),
11fdf7f2
TL
9237 true,
9238 new PGCreateInfo(
9239 pgid,
9f95a23c
TL
9240 p.query_epoch,
9241 p.info.history,
9242 p.past_intervals,
11fdf7f2
TL
9243 false)
9244 )));
7c673cae 9245 }
11fdf7f2 9246 m->put();
7c673cae
FG
9247}
9248
11fdf7f2 9249void OSD::handle_fast_pg_info(MOSDPGInfo* m)
7c673cae 9250{
11fdf7f2
TL
9251 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9252 if (!require_osd_peer(m)) {
9253 m->put();
7c673cae
FG
9254 return;
9255 }
11fdf7f2
TL
9256 int from = m->get_source().num();
9257 for (auto& p : m->pg_list) {
9258 enqueue_peering_evt(
9f95a23c 9259 spg_t(p.info.pgid.pgid, p.to),
11fdf7f2
TL
9260 PGPeeringEventRef(
9261 std::make_shared<PGPeeringEvent>(
9f95a23c 9262 p.epoch_sent, p.query_epoch,
11fdf7f2 9263 MInfoRec(
9f95a23c
TL
9264 pg_shard_t(from, p.from),
9265 p.info,
9266 p.epoch_sent)))
11fdf7f2 9267 );
7c673cae 9268 }
11fdf7f2 9269 m->put();
7c673cae
FG
9270}
9271
11fdf7f2 9272void OSD::handle_fast_pg_remove(MOSDPGRemove *m)
7c673cae 9273{
11fdf7f2
TL
9274 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9275 if (!require_osd_peer(m)) {
9276 m->put();
7c673cae
FG
9277 return;
9278 }
11fdf7f2
TL
9279 for (auto& pgid : m->pg_list) {
9280 enqueue_peering_evt(
9281 pgid,
9282 PGPeeringEventRef(
9283 std::make_shared<PGPeeringEvent>(
9284 m->get_epoch(), m->get_epoch(),
9f95a23c 9285 PeeringState::DeleteStart())));
7c673cae 9286 }
11fdf7f2 9287 m->put();
7c673cae
FG
9288}
9289
11fdf7f2 9290void OSD::handle_fast_force_recovery(MOSDForceRecovery *m)
c07f9fc5 9291{
11fdf7f2
TL
9292 dout(10) << __func__ << " " << *m << dendl;
9293 if (!require_mon_or_mgr_peer(m)) {
9294 m->put();
9295 return;
9296 }
9297 epoch_t epoch = get_osdmap_epoch();
9298 for (auto pgid : m->forced_pgs) {
9299 if (m->options & OFR_BACKFILL) {
9300 if (m->options & OFR_CANCEL) {
9301 enqueue_peering_evt(
9302 pgid,
9303 PGPeeringEventRef(
9304 std::make_shared<PGPeeringEvent>(
9305 epoch, epoch,
9f95a23c 9306 PeeringState::UnsetForceBackfill())));
11fdf7f2
TL
9307 } else {
9308 enqueue_peering_evt(
9309 pgid,
9310 PGPeeringEventRef(
9311 std::make_shared<PGPeeringEvent>(
9312 epoch, epoch,
9f95a23c 9313 PeeringState::SetForceBackfill())));
11fdf7f2
TL
9314 }
9315 } else if (m->options & OFR_RECOVERY) {
9316 if (m->options & OFR_CANCEL) {
9317 enqueue_peering_evt(
9318 pgid,
9319 PGPeeringEventRef(
9320 std::make_shared<PGPeeringEvent>(
9321 epoch, epoch,
9f95a23c 9322 PeeringState::UnsetForceRecovery())));
11fdf7f2
TL
9323 } else {
9324 enqueue_peering_evt(
9325 pgid,
9326 PGPeeringEventRef(
9327 std::make_shared<PGPeeringEvent>(
9328 epoch, epoch,
9f95a23c 9329 PeeringState::SetForceRecovery())));
c07f9fc5
FG
9330 }
9331 }
9332 }
11fdf7f2 9333 m->put();
c07f9fc5 9334}
7c673cae 9335
11fdf7f2 9336void OSD::handle_pg_query_nopg(const MQuery& q)
7c673cae 9337{
11fdf7f2
TL
9338 spg_t pgid = q.pgid;
9339 dout(10) << __func__ << " " << pgid << dendl;
7c673cae 9340
11fdf7f2
TL
9341 OSDMapRef osdmap = get_osdmap();
9342 if (!osdmap->have_pg_pool(pgid.pool()))
7c673cae
FG
9343 return;
9344
11fdf7f2
TL
9345 dout(10) << " pg " << pgid << " dne" << dendl;
9346 pg_info_t empty(spg_t(pgid.pgid, q.query.to));
9347 ConnectionRef con = service.get_con_osd_cluster(q.from.osd, osdmap->get_epoch());
9348 if (con) {
9349 Message *m;
9350 if (q.query.type == pg_query_t::LOG ||
9351 q.query.type == pg_query_t::FULLLOG) {
9352 m = new MOSDPGLog(
9353 q.query.from, q.query.to,
9354 osdmap->get_epoch(), empty,
9355 q.query.epoch_sent);
7c673cae 9356 } else {
9f95a23c 9357 vector<pg_notify_t> ls;
11fdf7f2 9358 ls.push_back(
9f95a23c
TL
9359 pg_notify_t(
9360 q.query.from, q.query.to,
9361 q.query.epoch_sent,
9362 osdmap->get_epoch(),
9363 empty,
11fdf7f2 9364 PastIntervals()));
9f95a23c 9365 m = new MOSDPGNotify(osdmap->get_epoch(), std::move(ls));
7c673cae 9366 }
9f95a23c 9367 service.maybe_share_map(con.get(), osdmap);
11fdf7f2 9368 con->send_message(m);
7c673cae
FG
9369 }
9370}
9371
9f95a23c
TL
9372void OSDService::queue_check_readable(spg_t spgid,
9373 epoch_t lpr,
9374 ceph::signedspan delay)
9375{
9376 if (delay == ceph::signedspan::zero()) {
9377 osd->enqueue_peering_evt(
9378 spgid,
9379 PGPeeringEventRef(
9380 std::make_shared<PGPeeringEvent>(
9381 lpr, lpr,
9382 PeeringState::CheckReadable())));
9383 } else {
9384 mono_timer.add_event(
9385 delay,
9386 [this, spgid, lpr]() {
9387 queue_check_readable(spgid, lpr);
9388 });
9389 }
9390}
9391
7c673cae 9392
7c673cae
FG
9393// =========================================================
9394// RECOVERY
9395
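// Drain awaiting_throttle while _recover_now() reports headroom, starting at
// most osd_recovery_max_single_start ops per PG and charging them against
// recovery_ops_reserved until the reserved pushes are released again.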
9396void OSDService::_maybe_queue_recovery() {
9f95a23c 9397 ceph_assert(ceph_mutex_is_locked_by_me(recovery_lock));
7c673cae
FG
9398 uint64_t available_pushes;
9399 while (!awaiting_throttle.empty() &&
9400 _recover_now(&available_pushes)) {
11fdf7f2 9401 uint64_t to_start = std::min(
7c673cae
FG
9402 available_pushes,
9403 cct->_conf->osd_recovery_max_single_start);
9404 _queue_for_recovery(awaiting_throttle.front(), to_start);
9405 awaiting_throttle.pop_front();
11fdf7f2
TL
9406 dout(10) << __func__ << " starting " << to_start
9407 << ", recovery_ops_reserved " << recovery_ops_reserved
9408 << " -> " << (recovery_ops_reserved + to_start) << dendl;
7c673cae
FG
9409 recovery_ops_reserved += to_start;
9410 }
9411}
9412
9413bool OSDService::_recover_now(uint64_t *available_pushes)
9414{
9415 if (available_pushes)
9416 *available_pushes = 0;
9417
9418 if (ceph_clock_now() < defer_recovery_until) {
9419 dout(15) << __func__ << " defer until " << defer_recovery_until << dendl;
9420 return false;
9421 }
9422
9423 if (recovery_paused) {
9424 dout(15) << __func__ << " paused" << dendl;
9425 return false;
9426 }
9427
9f95a23c 9428 uint64_t max = osd->get_recovery_max_active();
7c673cae
FG
9429 if (max <= recovery_ops_active + recovery_ops_reserved) {
9430 dout(15) << __func__ << " active " << recovery_ops_active
9431 << " + reserved " << recovery_ops_reserved
9432 << " >= max " << max << dendl;
9433 return false;
9434 }
9435
9436 if (available_pushes)
9437 *available_pushes = max - recovery_ops_active - recovery_ops_reserved;
9438
9439 return true;
9440}
9441
9f95a23c
TL
9442unsigned OSDService::get_target_pg_log_entries() const
9443{
9444 auto num_pgs = osd->get_num_pgs();
9445 auto target = cct->_conf->osd_target_pg_log_entries_per_osd;
9446 if (num_pgs > 0 && target > 0) {
9447 // target an even spread of our budgeted log entries across all
9448 // PGs. note that while we only get to control the entry count
9449 // for primary PGs, we'll normally be responsible for a mix of
9450 // primary and replica PGs (for the same pool(s) even), so this
9451 // will work out.
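    // Worked example (illustrative numbers only): with
    // osd_target_pg_log_entries_per_osd = 300000 and 100 PGs on this OSD,
    // each PG is budgeted 3000 entries, clamped into
    // [osd_min_pg_log_entries, osd_max_pg_log_entries].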
9452 return std::max<unsigned>(
9453 std::min<unsigned>(target / num_pgs,
9454 cct->_conf->osd_max_pg_log_entries),
9455 cct->_conf->osd_min_pg_log_entries);
9456 } else {
9457 // fall back to a per-pg value.
9458 return cct->_conf->osd_min_pg_log_entries;
9459 }
9460}
9461
7c673cae
FG
9462void OSD::do_recovery(
9463 PG *pg, epoch_t queued, uint64_t reserved_pushes,
9464 ThreadPool::TPHandle &handle)
9465{
9466 uint64_t started = 0;
31f18b77
FG
9467
9468 /*
9469 * When the value of osd_recovery_sleep is set greater than zero, recovery
9470 * ops are scheduled after osd_recovery_sleep amount of time from the previous
9471 * recovery event's schedule time. This is done by adding a
9472 * recovery_requeue_callback event, which re-queues the recovery op using
9473 * queue_recovery_after_sleep.
9474 */
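  /*
   * Illustrative timing (example value only): with osd_recovery_sleep = 0.1s,
   * a queued recovery op does no work itself; recovery_schedule_time is
   * advanced to max(now, previous schedule) + 0.1s and the op is re-queued
   * from the sleep_timer callback at that time.
   */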
c07f9fc5 9475 float recovery_sleep = get_osd_recovery_sleep();
b32b8144 9476 {
11fdf7f2 9477 std::lock_guard l(service.sleep_lock);
b32b8144
FG
9478 if (recovery_sleep > 0 && service.recovery_needs_sleep) {
9479 PGRef pgref(pg);
9f95a23c 9480 auto recovery_requeue_callback = new LambdaContext([this, pgref, queued, reserved_pushes](int r) {
b32b8144
FG
9481 dout(20) << "do_recovery wake up at "
9482 << ceph_clock_now()
9483 << ", re-queuing recovery" << dendl;
11fdf7f2 9484 std::lock_guard l(service.sleep_lock);
b32b8144
FG
9485 service.recovery_needs_sleep = false;
9486 service.queue_recovery_after_sleep(pgref.get(), queued, reserved_pushes);
9487 });
9488
9489 // This is true for the first recovery op and when the previous recovery op
9490 // has been scheduled in the past. The next recovery op is scheduled after
9491 // completing the sleep from now.
9f95a23c
TL
9492
9493 if (auto now = ceph::real_clock::now();
9494 service.recovery_schedule_time < now) {
9495 service.recovery_schedule_time = now;
b32b8144 9496 }
9f95a23c 9497 service.recovery_schedule_time += ceph::make_timespan(recovery_sleep);
11fdf7f2 9498 service.sleep_timer.add_event_at(service.recovery_schedule_time,
9f95a23c 9499 recovery_requeue_callback);
b32b8144
FG
9500 dout(20) << "Recovery event scheduled at "
9501 << service.recovery_schedule_time << dendl;
9502 return;
9503 }
7c673cae
FG
9504 }
9505
9506 {
b32b8144 9507 {
11fdf7f2 9508 std::lock_guard l(service.sleep_lock);
b32b8144
FG
9509 service.recovery_needs_sleep = true;
9510 }
9511
7c673cae
FG
9512 if (pg->pg_has_reset_since(queued)) {
9513 goto out;
9514 }
9515
7c673cae
FG
9516 dout(10) << "do_recovery starting " << reserved_pushes << " " << *pg << dendl;
9517#ifdef DEBUG_RECOVERY_OIDS
11fdf7f2 9518 dout(20) << " active was " << service.recovery_oids[pg->pg_id] << dendl;
7c673cae
FG
9519#endif
9520
11fdf7f2 9521 bool do_unfound = pg->start_recovery_ops(reserved_pushes, handle, &started);
7c673cae
FG
9522 dout(10) << "do_recovery started " << started << "/" << reserved_pushes
9523 << " on " << *pg << dendl;
9524
11fdf7f2 9525 if (do_unfound) {
9f95a23c 9526 PeeringCtx rctx = create_context();
11fdf7f2 9527 rctx.handle = &handle;
9f95a23c 9528 pg->find_unfound(queued, rctx);
11fdf7f2 9529 dispatch_context(rctx, pg, pg->get_osdmap());
7c673cae 9530 }
7c673cae
FG
9531 }
9532
9533 out:
11fdf7f2 9534 ceph_assert(started <= reserved_pushes);
7c673cae
FG
9535 service.release_reserved_pushes(reserved_pushes);
9536}
9537
9538void OSDService::start_recovery_op(PG *pg, const hobject_t& soid)
9539{
11fdf7f2 9540 std::lock_guard l(recovery_lock);
7c673cae
FG
9541 dout(10) << "start_recovery_op " << *pg << " " << soid
9542 << " (" << recovery_ops_active << "/"
9f95a23c 9543 << osd->get_recovery_max_active() << " rops)"
7c673cae
FG
9544 << dendl;
9545 recovery_ops_active++;
9546
9547#ifdef DEBUG_RECOVERY_OIDS
11fdf7f2
TL
9548 dout(20) << " active was " << recovery_oids[pg->pg_id] << dendl;
9549 ceph_assert(recovery_oids[pg->pg_id].count(soid) == 0);
9550 recovery_oids[pg->pg_id].insert(soid);
7c673cae
FG
9551#endif
9552}
9553
9554void OSDService::finish_recovery_op(PG *pg, const hobject_t& soid, bool dequeue)
9555{
11fdf7f2 9556 std::lock_guard l(recovery_lock);
7c673cae
FG
9557 dout(10) << "finish_recovery_op " << *pg << " " << soid
9558 << " dequeue=" << dequeue
9f95a23c
TL
9559 << " (" << recovery_ops_active << "/"
9560 << osd->get_recovery_max_active() << " rops)"
7c673cae
FG
9561 << dendl;
9562
9563 // adjust count
11fdf7f2 9564 ceph_assert(recovery_ops_active > 0);
7c673cae
FG
9565 recovery_ops_active--;
9566
9567#ifdef DEBUG_RECOVERY_OIDS
11fdf7f2
TL
9568 dout(20) << " active oids was " << recovery_oids[pg->pg_id] << dendl;
9569 ceph_assert(recovery_oids[pg->pg_id].count(soid));
9570 recovery_oids[pg->pg_id].erase(soid);
7c673cae
FG
9571#endif
9572
9573 _maybe_queue_recovery();
9574}
9575
9576bool OSDService::is_recovery_active()
9577{
eafe8130
TL
9578 if (cct->_conf->osd_debug_pretend_recovery_active) {
9579 return true;
9580 }
b5b8bbf5 9581 return local_reserver.has_reservation() || remote_reserver.has_reservation();
7c673cae
FG
9582}
9583
11fdf7f2
TL
9584void OSDService::release_reserved_pushes(uint64_t pushes)
9585{
9586 std::lock_guard l(recovery_lock);
9587 dout(10) << __func__ << "(" << pushes << "), recovery_ops_reserved "
9588 << recovery_ops_reserved << " -> " << (recovery_ops_reserved-pushes)
9589 << dendl;
9590 ceph_assert(recovery_ops_reserved >= pushes);
9591 recovery_ops_reserved -= pushes;
9592 _maybe_queue_recovery();
9593}
9594
7c673cae
FG
9595// =========================================================
9596// OPS
9597
9598bool OSD::op_is_discardable(const MOSDOp *op)
9599{
9600 // drop client request if they are not connected and can't get the
9601 // reply anyway.
9602 if (!op->get_connection()->is_connected()) {
9603 return true;
9604 }
9605 return false;
9606}
9607
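// enqueue_op: wrap a client/replica op in a PGOpItem and queue it on the
// sharded op queue, using the message's priority, cost, receive stamp and
// source entity as the scheduling inputs.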
11fdf7f2 9608void OSD::enqueue_op(spg_t pg, OpRequestRef&& op, epoch_t epoch)
7c673cae 9609{
11fdf7f2
TL
9610 const utime_t stamp = op->get_req()->get_recv_stamp();
9611 const utime_t latency = ceph_clock_now() - stamp;
9612 const unsigned priority = op->get_req()->get_priority();
9613 const int cost = op->get_req()->get_cost();
9614 const uint64_t owner = op->get_req()->get_source().num();
9615
9616 dout(15) << "enqueue_op " << op << " prio " << priority
9617 << " cost " << cost
7c673cae
FG
9618 << " latency " << latency
9619 << " epoch " << epoch
9620 << " " << *(op->get_req()) << dendl;
9621 op->osd_trace.event("enqueue op");
11fdf7f2
TL
9622 op->osd_trace.keyval("priority", priority);
9623 op->osd_trace.keyval("cost", cost);
7c673cae 9624 op->mark_queued_for_pg();
224ce89b 9625 logger->tinc(l_osd_op_before_queue_op_lat, latency);
11fdf7f2 9626 op_shardedwq.queue(
9f95a23c
TL
9627 OpSchedulerItem(
9628 unique_ptr<OpSchedulerItem::OpQueueable>(new PGOpItem(pg, std::move(op))),
11fdf7f2 9629 cost, priority, stamp, owner, epoch));
7c673cae
FG
9630}
9631
11fdf7f2
TL
9632void OSD::enqueue_peering_evt(spg_t pgid, PGPeeringEventRef evt)
9633{
9634 dout(15) << __func__ << " " << pgid << " " << evt->get_desc() << dendl;
9635 op_shardedwq.queue(
9f95a23c
TL
9636 OpSchedulerItem(
9637 unique_ptr<OpSchedulerItem::OpQueueable>(new PGPeeringItem(pgid, evt)),
11fdf7f2
TL
9638 10,
9639 cct->_conf->osd_peering_op_priority,
9640 utime_t(),
9641 0,
9642 evt->get_epoch_sent()));
9643}
7c673cae
FG
9644
9645/*
9646 * NOTE: dequeue called in worker thread, with pg lock
9647 */
9648void OSD::dequeue_op(
9649 PGRef pg, OpRequestRef op,
9650 ThreadPool::TPHandle &handle)
9651{
9f95a23c
TL
9652 const Message *m = op->get_req();
9653
11fdf7f2 9654 FUNCTRACE(cct);
9f95a23c 9655 OID_EVENT_TRACE_WITH_MSG(m, "DEQUEUE_OP_BEGIN", false);
7c673cae
FG
9656
9657 utime_t now = ceph_clock_now();
9658 op->set_dequeued_time(now);
9f95a23c
TL
9659
9660 utime_t latency = now - m->get_recv_stamp();
9661 dout(10) << "dequeue_op " << op << " prio " << m->get_priority()
9662 << " cost " << m->get_cost()
7c673cae 9663 << " latency " << latency
9f95a23c 9664 << " " << *m
7c673cae
FG
9665 << " pg " << *pg << dendl;
9666
224ce89b
WB
9667 logger->tinc(l_osd_op_before_dequeue_op_lat, latency);
9668
9f95a23c
TL
9669 service.maybe_share_map(m->get_connection().get(),
9670 pg->get_osdmap(),
9671 op->sent_epoch);
7c673cae 9672
11fdf7f2 9673 if (pg->is_deleting())
7c673cae
FG
9674 return;
9675
9676 op->mark_reached_pg();
9677 op->osd_trace.event("dequeue_op");
9678
9679 pg->do_request(op, handle);
9680
9681 // finish
9682 dout(10) << "dequeue_op " << op << " finish" << dendl;
9f95a23c 9683 OID_EVENT_TRACE_WITH_MSG(m, "DEQUEUE_OP_END", false);
7c673cae
FG
9684}
9685
9686
11fdf7f2
TL
9687void OSD::dequeue_peering_evt(
9688 OSDShard *sdata,
9689 PG *pg,
9690 PGPeeringEventRef evt,
9691 ThreadPool::TPHandle& handle)
7c673cae 9692{
9f95a23c 9693 PeeringCtx rctx = create_context();
11fdf7f2 9694 auto curmap = sdata->get_osdmap();
9f95a23c
TL
9695 bool need_up_thru = false;
9696 epoch_t same_interval_since = 0;
11fdf7f2
TL
9697 if (!pg) {
9698 if (const MQuery *q = dynamic_cast<const MQuery*>(evt->evt.get())) {
9699 handle_pg_query_nopg(*q);
7c673cae 9700 } else {
11fdf7f2
TL
9701 derr << __func__ << " unrecognized pg-less event " << evt->get_desc() << dendl;
9702 ceph_abort();
9703 }
9f95a23c
TL
9704 } else if (advance_pg(curmap->get_epoch(), pg, handle, rctx)) {
9705 pg->do_peering_event(evt, rctx);
11fdf7f2 9706 if (pg->is_deleted()) {
11fdf7f2
TL
9707 pg->unlock();
9708 return;
7c673cae 9709 }
9f95a23c 9710 dispatch_context(rctx, pg, curmap, &handle);
11fdf7f2
TL
9711 need_up_thru = pg->get_need_up_thru();
9712 same_interval_since = pg->get_same_interval_since();
7c673cae
FG
9713 pg->unlock();
9714 }
11fdf7f2
TL
9715
9716 if (need_up_thru) {
7c673cae 9717 queue_want_up_thru(same_interval_since);
11fdf7f2 9718 }
7c673cae
FG
9719
9720 service.send_pg_temp();
9721}
9722
11fdf7f2
TL
9723void OSD::dequeue_delete(
9724 OSDShard *sdata,
9725 PG *pg,
9726 epoch_t e,
9727 ThreadPool::TPHandle& handle)
9728{
9729 dequeue_peering_evt(
9730 sdata,
9731 pg,
9732 PGPeeringEventRef(
9733 std::make_shared<PGPeeringEvent>(
9734 e, e,
9f95a23c 9735 PeeringState::DeleteSome())),
11fdf7f2
TL
9736 handle);
9737}
9738
9739
9740
7c673cae
FG
9741// --------------------------------
9742
9743const char** OSD::get_tracked_conf_keys() const
9744{
9745 static const char* KEYS[] = {
9746 "osd_max_backfills",
9747 "osd_min_recovery_priority",
224ce89b
WB
9748 "osd_max_trimming_pgs",
9749 "osd_op_complaint_time",
9750 "osd_op_log_threshold",
9751 "osd_op_history_size",
9752 "osd_op_history_duration",
9753 "osd_op_history_slow_op_size",
9754 "osd_op_history_slow_op_threshold",
7c673cae
FG
9755 "osd_enable_op_tracker",
9756 "osd_map_cache_size",
11fdf7f2 9757 "osd_pg_epoch_max_lag_factor",
7c673cae 9758 "osd_pg_epoch_persisted_max_stale",
7c673cae
FG
9759 // clog & admin clog
9760 "clog_to_monitors",
9761 "clog_to_syslog",
9762 "clog_to_syslog_facility",
9763 "clog_to_syslog_level",
9764 "osd_objectstore_fuse",
9765 "clog_to_graylog",
9766 "clog_to_graylog_host",
9767 "clog_to_graylog_port",
9768 "host",
9769 "fsid",
9770 "osd_recovery_delay_start",
9771 "osd_client_message_size_cap",
9772 "osd_client_message_cap",
31f18b77
FG
9773 "osd_heartbeat_min_size",
9774 "osd_heartbeat_interval",
9f95a23c 9775 "osd_object_clean_region_max_num_intervals",
494da23a
TL
9776 "osd_scrub_min_interval",
9777 "osd_scrub_max_interval",
7c673cae
FG
9778 NULL
9779 };
9780 return KEYS;
9781}
9782
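// Invoked by the config observer machinery for any key listed in
// get_tracked_conf_keys(); each branch below pushes the new value into the
// corresponding subsystem (reservers, op tracker, map caches, log config, ...).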
11fdf7f2 9783void OSD::handle_conf_change(const ConfigProxy& conf,
7c673cae
FG
9784 const std::set <std::string> &changed)
9785{
9f95a23c 9786 std::lock_guard l{osd_lock};
7c673cae
FG
9787 if (changed.count("osd_max_backfills")) {
9788 service.local_reserver.set_max(cct->_conf->osd_max_backfills);
9789 service.remote_reserver.set_max(cct->_conf->osd_max_backfills);
9790 }
9791 if (changed.count("osd_min_recovery_priority")) {
9792 service.local_reserver.set_min_priority(cct->_conf->osd_min_recovery_priority);
9793 service.remote_reserver.set_min_priority(cct->_conf->osd_min_recovery_priority);
9794 }
9795 if (changed.count("osd_max_trimming_pgs")) {
9796 service.snap_reserver.set_max(cct->_conf->osd_max_trimming_pgs);
9797 }
9798 if (changed.count("osd_op_complaint_time") ||
9799 changed.count("osd_op_log_threshold")) {
9800 op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time,
9801 cct->_conf->osd_op_log_threshold);
9802 }
9803 if (changed.count("osd_op_history_size") ||
9804 changed.count("osd_op_history_duration")) {
9805 op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
9806 cct->_conf->osd_op_history_duration);
9807 }
9808 if (changed.count("osd_op_history_slow_op_size") ||
9809 changed.count("osd_op_history_slow_op_threshold")) {
9810 op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size,
9811 cct->_conf->osd_op_history_slow_op_threshold);
9812 }
9813 if (changed.count("osd_enable_op_tracker")) {
9814 op_tracker.set_tracking(cct->_conf->osd_enable_op_tracker);
9815 }
7c673cae
FG
9816 if (changed.count("osd_map_cache_size")) {
9817 service.map_cache.set_size(cct->_conf->osd_map_cache_size);
9818 service.map_bl_cache.set_size(cct->_conf->osd_map_cache_size);
9819 service.map_bl_inc_cache.set_size(cct->_conf->osd_map_cache_size);
9820 }
9821 if (changed.count("clog_to_monitors") ||
9822 changed.count("clog_to_syslog") ||
9823 changed.count("clog_to_syslog_level") ||
9824 changed.count("clog_to_syslog_facility") ||
9825 changed.count("clog_to_graylog") ||
9826 changed.count("clog_to_graylog_host") ||
9827 changed.count("clog_to_graylog_port") ||
9828 changed.count("host") ||
9829 changed.count("fsid")) {
9830 update_log_config();
9831 }
11fdf7f2
TL
9832 if (changed.count("osd_pg_epoch_max_lag_factor")) {
9833 m_osd_pg_epoch_max_lag_factor = conf.get_val<double>(
9834 "osd_pg_epoch_max_lag_factor");
9835 }
7c673cae
FG
9836
9837#ifdef HAVE_LIBFUSE
9838 if (changed.count("osd_objectstore_fuse")) {
9839 if (store) {
9840 enable_disable_fuse(false);
9841 }
9842 }
9843#endif
9844
9845 if (changed.count("osd_recovery_delay_start")) {
9846 service.defer_recovery(cct->_conf->osd_recovery_delay_start);
9847 service.kick_recovery_queue();
9848 }
9849
9850 if (changed.count("osd_client_message_cap")) {
9851 uint64_t newval = cct->_conf->osd_client_message_cap;
9852 Messenger::Policy pol = client_messenger->get_policy(entity_name_t::TYPE_CLIENT);
9853 if (pol.throttler_messages && newval > 0) {
9854 pol.throttler_messages->reset_max(newval);
9855 }
9856 }
9857 if (changed.count("osd_client_message_size_cap")) {
9858 uint64_t newval = cct->_conf->osd_client_message_size_cap;
9859 Messenger::Policy pol = client_messenger->get_policy(entity_name_t::TYPE_CLIENT);
9860 if (pol.throttler_bytes && newval > 0) {
9861 pol.throttler_bytes->reset_max(newval);
9862 }
9863 }
9f95a23c
TL
9864 if (changed.count("osd_object_clean_region_max_num_intervals")) {
9865 ObjectCleanRegions::set_max_num_intervals(cct->_conf->osd_object_clean_region_max_num_intervals);
9866 }
7c673cae 9867
494da23a
TL
9868 if (changed.count("osd_scrub_min_interval") ||
9869 changed.count("osd_scrub_max_interval")) {
9870 resched_all_scrubs();
9871 dout(0) << __func__ << ": scrub interval change" << dendl;
9872 }
7c673cae
FG
9873 check_config();
9874}
9875
9876void OSD::update_log_config()
9877{
9878 map<string,string> log_to_monitors;
9879 map<string,string> log_to_syslog;
9880 map<string,string> log_channel;
9881 map<string,string> log_prio;
9882 map<string,string> log_to_graylog;
9883 map<string,string> log_to_graylog_host;
9884 map<string,string> log_to_graylog_port;
9885 uuid_d fsid;
9886 string host;
9887
9888 if (parse_log_client_options(cct, log_to_monitors, log_to_syslog,
9889 log_channel, log_prio, log_to_graylog,
9890 log_to_graylog_host, log_to_graylog_port,
9891 fsid, host) == 0)
9892 clog->update_config(log_to_monitors, log_to_syslog,
9893 log_channel, log_prio, log_to_graylog,
9894 log_to_graylog_host, log_to_graylog_port,
9895 fsid, host);
9896 derr << "log_to_monitors " << log_to_monitors << dendl;
9897}
9898
9899void OSD::check_config()
9900{
9901 // some sanity checks
7c673cae
FG
9902 if (cct->_conf->osd_map_cache_size <= (int)cct->_conf->osd_pg_epoch_persisted_max_stale + 2) {
9903 clog->warn() << "osd_map_cache_size (" << cct->_conf->osd_map_cache_size << ")"
9904 << " is not > osd_pg_epoch_persisted_max_stale ("
9905 << cct->_conf->osd_pg_epoch_persisted_max_stale << ")";
9906 }
9f95a23c
TL
9907 if (cct->_conf->osd_object_clean_region_max_num_intervals < 0) {
9908 clog->warn() << "osd_object_clean_region_max_num_intervals ("
9909 << cct->_conf->osd_object_clean_region_max_num_intervals
9910 << ") is < 0";
9911 }
7c673cae
FG
9912}
9913
7c673cae
FG
9914// --------------------------------
9915
9916void OSD::get_latest_osdmap()
9917{
9918 dout(10) << __func__ << " -- start" << dendl;
9919
9920 C_SaferCond cond;
9921 service.objecter->wait_for_latest_osdmap(&cond);
9922 cond.wait();
9923
9924 dout(10) << __func__ << " -- finish" << dendl;
9925}
9926
9927// --------------------------------
9928
9f95a23c
TL
9929void OSD::set_perf_queries(const ConfigPayload &config_payload) {
9930 const OSDConfigPayload &osd_config_payload = boost::get<OSDConfigPayload>(config_payload);
9931 const std::map<OSDPerfMetricQuery, OSDPerfMetricLimits> &queries = osd_config_payload.config;
11fdf7f2
TL
9932 dout(10) << "setting " << queries.size() << " queries" << dendl;
9933
9934 std::list<OSDPerfMetricQuery> supported_queries;
9935 for (auto &it : queries) {
9936 auto &query = it.first;
9937 if (!query.key_descriptor.empty()) {
9938 supported_queries.push_back(query);
9939 }
9940 }
9941 if (supported_queries.size() < queries.size()) {
9942 dout(1) << queries.size() - supported_queries.size()
9943 << " unsupported queries" << dendl;
9944 }
11fdf7f2 9945 {
9f95a23c 9946 std::lock_guard locker{m_perf_queries_lock};
11fdf7f2
TL
9947 m_perf_queries = supported_queries;
9948 m_perf_limits = queries;
9949 }
11fdf7f2
TL
9950 std::vector<PGRef> pgs;
9951 _get_pgs(&pgs);
9952 for (auto& pg : pgs) {
9f95a23c 9953 std::scoped_lock l{*pg};
eafe8130 9954 pg->set_dynamic_perf_stats_queries(supported_queries);
7c673cae 9955 }
7c673cae
FG
9956}
9957
9f95a23c
TL
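// get_perf_reports(): gather DynamicPerfStats from every PG (taking each PG
// lock in turn), merge them, and convert the result into per-query reports
// for the mgr, honouring the limits recorded in m_perf_limits.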
9958MetricPayload OSD::get_perf_reports() {
9959 OSDMetricPayload payload;
9960 std::map<OSDPerfMetricQuery, OSDPerfMetricReport> &reports = payload.report;
9961
11fdf7f2
TL
9962 std::vector<PGRef> pgs;
9963 _get_pgs(&pgs);
9964 DynamicPerfStats dps;
9965 for (auto& pg : pgs) {
eafe8130
TL
 9966 // m_perf_queries can be modified only in set_perf_queries by an mgr
 9967 // client request, and it is protected by the mgr client's lock, which is
 9968 // held when set_perf_queries/get_perf_reports are called, so we do not
 9969 // need to hold m_perf_queries_lock here.
9970 DynamicPerfStats pg_dps(m_perf_queries);
9971 pg->lock();
9972 pg->get_dynamic_perf_stats(&pg_dps);
9973 pg->unlock();
9974 dps.merge(pg_dps);
11fdf7f2 9975 }
9f95a23c
TL
9976 dps.add_to_reports(m_perf_limits, &reports);
9977 dout(20) << "reports for " << reports.size() << " queries" << dendl;
9978
9979 return payload;
11fdf7f2 9980}
224ce89b 9981
7c673cae
FG
9982// =============================================================
9983
9984#undef dout_context
11fdf7f2 9985#define dout_context cct
7c673cae 9986#undef dout_prefix
11fdf7f2 9987#define dout_prefix *_dout << "osd." << osd->get_nodeid() << ":" << shard_id << "." << __func__ << " "
7c673cae 9988
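// _attach_pg()/_detach_pg() bind a PG to (or remove it from) its shard slot
// and keep pg_slots_by_epoch up to date; that epoch-ordered index is what
// get_min_pg_epoch()/wait_min_pg_epoch() below use to tell when every PG on
// this shard has caught up to a given OSDMap epoch.  A rough usage sketch
// (illustrative only, not a quote of the caller):
//
//   epoch_t e = osdmap->get_epoch();
//   for (auto& shard : osd->shards)
//     shard->wait_min_pg_epoch(e);   // returns once all PGs reach epoch e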
11fdf7f2 9989void OSDShard::_attach_pg(OSDShardPGSlot *slot, PG *pg)
7c673cae 9990{
11fdf7f2
TL
9991 dout(10) << pg->pg_id << " " << pg << dendl;
9992 slot->pg = pg;
9993 pg->osd_shard = this;
9994 pg->pg_slot = slot;
9995 osd->inc_num_pgs();
9996
9997 slot->epoch = pg->get_osdmap_epoch();
9998 pg_slots_by_epoch.insert(*slot);
9999}
10000
10001void OSDShard::_detach_pg(OSDShardPGSlot *slot)
10002{
10003 dout(10) << slot->pg->pg_id << " " << slot->pg << dendl;
10004 slot->pg->osd_shard = nullptr;
10005 slot->pg->pg_slot = nullptr;
10006 slot->pg = nullptr;
10007 osd->dec_num_pgs();
10008
10009 pg_slots_by_epoch.erase(pg_slots_by_epoch.iterator_to(*slot));
10010 slot->epoch = 0;
10011 if (waiting_for_min_pg_epoch) {
10012 min_pg_epoch_cond.notify_all();
10013 }
10014}
10015
10016void OSDShard::update_pg_epoch(OSDShardPGSlot *slot, epoch_t e)
10017{
10018 std::lock_guard l(shard_lock);
10019 dout(30) << "min was " << pg_slots_by_epoch.begin()->epoch
10020 << " on " << pg_slots_by_epoch.begin()->pg->pg_id << dendl;
10021 pg_slots_by_epoch.erase(pg_slots_by_epoch.iterator_to(*slot));
10022 dout(20) << slot->pg->pg_id << " " << slot->epoch << " -> " << e << dendl;
10023 slot->epoch = e;
10024 pg_slots_by_epoch.insert(*slot);
10025 dout(30) << "min is now " << pg_slots_by_epoch.begin()->epoch
10026 << " on " << pg_slots_by_epoch.begin()->pg->pg_id << dendl;
10027 if (waiting_for_min_pg_epoch) {
10028 min_pg_epoch_cond.notify_all();
10029 }
10030}
10031
10032epoch_t OSDShard::get_min_pg_epoch()
10033{
10034 std::lock_guard l(shard_lock);
10035 auto p = pg_slots_by_epoch.begin();
10036 if (p == pg_slots_by_epoch.end()) {
10037 return 0;
10038 }
10039 return p->epoch;
10040}
10041
10042void OSDShard::wait_min_pg_epoch(epoch_t need)
10043{
10044 std::unique_lock l{shard_lock};
10045 ++waiting_for_min_pg_epoch;
10046 min_pg_epoch_cond.wait(l, [need, this] {
10047 if (pg_slots_by_epoch.empty()) {
10048 return true;
10049 } else if (pg_slots_by_epoch.begin()->epoch >= need) {
10050 return true;
10051 } else {
10052 dout(10) << need << " waiting on "
10053 << pg_slots_by_epoch.begin()->epoch << dendl;
10054 return false;
10055 }
10056 });
10057 --waiting_for_min_pg_epoch;
10058}
10059
10060epoch_t OSDShard::get_max_waiting_epoch()
10061{
10062 std::lock_guard l(shard_lock);
10063 epoch_t r = 0;
10064 for (auto& i : pg_slots) {
10065 if (!i.second->waiting_peering.empty()) {
10066 r = std::max(r, i.second->waiting_peering.rbegin()->first);
10067 }
10068 }
10069 return r;
10070}
10071
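// consume_map(): swap in the shard's new osdmap, then walk every pg_slot.
// Work waiting for a split or a future merge epoch is left alone; peering
// waiters whose epoch is now covered are requeued; for slots whose PG no
// longer maps to this OSD, stale/misdirected waiting items are dropped and
// their reserved pushes are returned via *pushes_to_free; empty slots are
// pruned.  A worker is woken if anything was requeued.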
10072void OSDShard::consume_map(
9f95a23c 10073 const OSDMapRef& new_osdmap,
11fdf7f2
TL
10074 unsigned *pushes_to_free)
10075{
10076 std::lock_guard l(shard_lock);
10077 OSDMapRef old_osdmap;
7c673cae 10078 {
11fdf7f2
TL
10079 std::lock_guard l(osdmap_lock);
10080 old_osdmap = std::move(shard_osdmap);
10081 shard_osdmap = new_osdmap;
10082 }
10083 dout(10) << new_osdmap->get_epoch()
10084 << " (was " << (old_osdmap ? old_osdmap->get_epoch() : 0) << ")"
10085 << dendl;
10086 bool queued = false;
10087
10088 // check slots
10089 auto p = pg_slots.begin();
10090 while (p != pg_slots.end()) {
10091 OSDShardPGSlot *slot = p->second.get();
10092 const spg_t& pgid = p->first;
10093 dout(20) << __func__ << " " << pgid << dendl;
10094 if (!slot->waiting_for_split.empty()) {
10095 dout(20) << __func__ << " " << pgid
10096 << " waiting for split " << slot->waiting_for_split << dendl;
10097 ++p;
10098 continue;
10099 }
10100 if (slot->waiting_for_merge_epoch > new_osdmap->get_epoch()) {
10101 dout(20) << __func__ << " " << pgid
10102 << " waiting for merge by epoch " << slot->waiting_for_merge_epoch
10103 << dendl;
10104 ++p;
10105 continue;
10106 }
10107 if (!slot->waiting_peering.empty()) {
10108 epoch_t first = slot->waiting_peering.begin()->first;
10109 if (first <= new_osdmap->get_epoch()) {
10110 dout(20) << __func__ << " " << pgid
10111 << " pending_peering first epoch " << first
10112 << " <= " << new_osdmap->get_epoch() << ", requeueing" << dendl;
10113 _wake_pg_slot(pgid, slot);
10114 queued = true;
10115 }
10116 ++p;
10117 continue;
10118 }
10119 if (!slot->waiting.empty()) {
10120 if (new_osdmap->is_up_acting_osd_shard(pgid, osd->get_nodeid())) {
10121 dout(20) << __func__ << " " << pgid << " maps to us, keeping"
10122 << dendl;
10123 ++p;
10124 continue;
7c673cae 10125 }
11fdf7f2
TL
10126 while (!slot->waiting.empty() &&
10127 slot->waiting.front().get_map_epoch() <= new_osdmap->get_epoch()) {
10128 auto& qi = slot->waiting.front();
10129 dout(20) << __func__ << " " << pgid
10130 << " waiting item " << qi
10131 << " epoch " << qi.get_map_epoch()
10132 << " <= " << new_osdmap->get_epoch()
10133 << ", "
10134 << (qi.get_map_epoch() < new_osdmap->get_epoch() ? "stale" :
10135 "misdirected")
10136 << ", dropping" << dendl;
10137 *pushes_to_free += qi.get_reserved_pushes();
10138 slot->waiting.pop_front();
10139 }
10140 }
10141 if (slot->waiting.empty() &&
10142 slot->num_running == 0 &&
10143 slot->waiting_for_split.empty() &&
10144 !slot->pg) {
10145 dout(20) << __func__ << " " << pgid << " empty, pruning" << dendl;
10146 p = pg_slots.erase(p);
10147 continue;
7c673cae 10148 }
11fdf7f2
TL
10149
10150 ++p;
7c673cae 10151 }
7c673cae 10152 if (queued) {
11fdf7f2
TL
10153 std::lock_guard l{sdata_wait_lock};
10154 sdata_cond.notify_one();
7c673cae
FG
10155 }
10156}
10157
11fdf7f2
TL
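// _wake_pg_slot(): requeue everything parked on the slot (to_process,
// waiting and waiting_peering) at the front of the scheduler, iterating in
// reverse so the original ordering is preserved, and bump requeue_seq so a
// racing _process() can tell that its view of the slot is stale.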
10158void OSDShard::_wake_pg_slot(
10159 spg_t pgid,
10160 OSDShardPGSlot *slot)
10161{
10162 dout(20) << __func__ << " " << pgid
10163 << " to_process " << slot->to_process
10164 << " waiting " << slot->waiting
10165 << " waiting_peering " << slot->waiting_peering << dendl;
10166 for (auto i = slot->to_process.rbegin();
10167 i != slot->to_process.rend();
10168 ++i) {
9f95a23c 10169 scheduler->enqueue_front(std::move(*i));
11fdf7f2
TL
10170 }
10171 slot->to_process.clear();
10172 for (auto i = slot->waiting.rbegin();
10173 i != slot->waiting.rend();
10174 ++i) {
9f95a23c 10175 scheduler->enqueue_front(std::move(*i));
11fdf7f2
TL
10176 }
10177 slot->waiting.clear();
10178 for (auto i = slot->waiting_peering.rbegin();
10179 i != slot->waiting_peering.rend();
10180 ++i) {
10181 // this is overkill; we requeue everything, even if some of these
10182 // items are waiting for maps we don't have yet. FIXME, maybe,
10183 // someday, if we decide this inefficiency matters
10184 for (auto j = i->second.rbegin(); j != i->second.rend(); ++j) {
9f95a23c 10185 scheduler->enqueue_front(std::move(*j));
11fdf7f2
TL
10186 }
10187 }
10188 slot->waiting_peering.clear();
10189 ++slot->requeue_seq;
10190}
10191
10192void OSDShard::identify_splits_and_merges(
10193 const OSDMapRef& as_of_osdmap,
10194 set<pair<spg_t,epoch_t>> *split_pgs,
10195 set<pair<spg_t,epoch_t>> *merge_pgs)
10196{
10197 std::lock_guard l(shard_lock);
10198 if (shard_osdmap) {
10199 for (auto& i : pg_slots) {
10200 const spg_t& pgid = i.first;
10201 auto *slot = i.second.get();
10202 if (slot->pg) {
10203 osd->service.identify_splits_and_merges(
10204 shard_osdmap, as_of_osdmap, pgid,
10205 split_pgs, merge_pgs);
10206 } else if (!slot->waiting_for_split.empty()) {
10207 osd->service.identify_splits_and_merges(
10208 shard_osdmap, as_of_osdmap, pgid,
10209 split_pgs, nullptr);
10210 } else {
10211 dout(20) << __func__ << " slot " << pgid
9f95a23c 10212 << " has no pg and no waiting_for_split" << dendl;
7c673cae 10213 }
11fdf7f2
TL
10214 }
10215 }
10216}
10217
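// prime_splits(): pre-create ("prime") slots for child PGs expected from a
// split, recording the split epoch in waiting_for_split so incoming work for
// the child is parked until the child PG actually exists.  If this shard's
// map is already newer than as_of_osdmap, any further descendants implied by
// the newer map are primed as well.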
10218void OSDShard::prime_splits(const OSDMapRef& as_of_osdmap,
10219 set<pair<spg_t,epoch_t>> *pgids)
10220{
10221 std::lock_guard l(shard_lock);
10222 _prime_splits(pgids);
10223 if (shard_osdmap->get_epoch() > as_of_osdmap->get_epoch()) {
10224 set<pair<spg_t,epoch_t>> newer_children;
10225 for (auto i : *pgids) {
10226 osd->service.identify_splits_and_merges(
10227 as_of_osdmap, shard_osdmap, i.first,
10228 &newer_children, nullptr);
10229 }
10230 newer_children.insert(pgids->begin(), pgids->end());
10231 dout(10) << "as_of_osdmap " << as_of_osdmap->get_epoch() << " < shard "
10232 << shard_osdmap->get_epoch() << ", new children " << newer_children
10233 << dendl;
10234 _prime_splits(&newer_children);
10235 // note: we don't care what is left over here for other shards.
 10236 // if this shard is ahead of the caller and another shard isn't, e.g., one thread is
10237 // calling into prime_splits via _process (due to a newly created
10238 // pg) and this shard has a newer map due to a racing consume_map,
10239 // then any grandchildren left here will be identified (or were
10240 // identified) when the slower shard's osdmap is advanced.
10241 // _prime_splits() will tolerate the case where the pgid is
10242 // already primed.
10243 }
10244}
10245
10246void OSDShard::_prime_splits(set<pair<spg_t,epoch_t>> *pgids)
10247{
10248 dout(10) << *pgids << dendl;
10249 auto p = pgids->begin();
10250 while (p != pgids->end()) {
10251 unsigned shard_index = p->first.hash_to_shard(osd->num_shards);
10252 if (shard_index == shard_id) {
10253 auto r = pg_slots.emplace(p->first, nullptr);
10254 if (r.second) {
10255 dout(10) << "priming slot " << p->first << " e" << p->second << dendl;
10256 r.first->second = make_unique<OSDShardPGSlot>();
10257 r.first->second->waiting_for_split.insert(p->second);
7c673cae 10258 } else {
11fdf7f2
TL
10259 auto q = r.first;
10260 ceph_assert(q != pg_slots.end());
10261 dout(10) << "priming (existing) slot " << p->first << " e" << p->second
10262 << dendl;
10263 q->second->waiting_for_split.insert(p->second);
7c673cae 10264 }
11fdf7f2
TL
10265 p = pgids->erase(p);
10266 } else {
10267 ++p;
7c673cae
FG
10268 }
10269 }
11fdf7f2
TL
10270}
10271
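// prime_merges(): for every merge participant that hashes to this shard,
// make sure a slot exists.  If the slot has no PG and no pending split, an
// empty placeholder PG is instantiated (PG::merge_from() fills in its history
// later) so the merge has something to fold into; finally the slot is marked
// with waiting_for_merge_epoch so consume_map() holds its work back until the
// merge point.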
10272void OSDShard::prime_merges(const OSDMapRef& as_of_osdmap,
10273 set<pair<spg_t,epoch_t>> *merge_pgs)
10274{
10275 std::lock_guard l(shard_lock);
10276 dout(20) << __func__ << " checking shard " << shard_id
10277 << " for remaining merge pgs " << merge_pgs << dendl;
10278 auto p = merge_pgs->begin();
10279 while (p != merge_pgs->end()) {
10280 spg_t pgid = p->first;
10281 epoch_t epoch = p->second;
10282 unsigned shard_index = pgid.hash_to_shard(osd->num_shards);
10283 if (shard_index != shard_id) {
10284 ++p;
10285 continue;
10286 }
10287 OSDShardPGSlot *slot;
10288 auto r = pg_slots.emplace(pgid, nullptr);
10289 if (r.second) {
10290 r.first->second = make_unique<OSDShardPGSlot>();
10291 }
10292 slot = r.first->second.get();
10293 if (slot->pg) {
10294 // already have pg
10295 dout(20) << __func__ << " have merge participant pg " << pgid
10296 << " " << slot->pg << dendl;
10297 } else if (!slot->waiting_for_split.empty() &&
10298 *slot->waiting_for_split.begin() < epoch) {
10299 dout(20) << __func__ << " pending split on merge participant pg " << pgid
10300 << " " << slot->waiting_for_split << dendl;
10301 } else {
10302 dout(20) << __func__ << " creating empty merge participant " << pgid
10303 << " for merge in " << epoch << dendl;
10304 // leave history zeroed; PG::merge_from() will fill it in.
10305 pg_history_t history;
10306 PGCreateInfo cinfo(pgid, epoch - 1,
10307 history, PastIntervals(), false);
10308 PGRef pg = osd->handle_pg_create_info(shard_osdmap, &cinfo);
10309 _attach_pg(r.first->second.get(), pg.get());
10310 _wake_pg_slot(pgid, slot);
10311 pg->unlock();
10312 }
10313 // mark slot for merge
10314 dout(20) << __func__ << " marking merge participant " << pgid << dendl;
10315 slot->waiting_for_merge_epoch = epoch;
10316 p = merge_pgs->erase(p);
7c673cae
FG
10317 }
10318}
10319
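// register_and_wake_split_child(): called once a split child PG has been
// created.  Attach it to its primed slot, clear the corresponding
// waiting_for_split epoch, requeue any parked work if no further splits are
// pending, and enqueue a NullEvt peering event so the child advances to the
// latest osdmap.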
11fdf7f2 10320void OSDShard::register_and_wake_split_child(PG *pg)
7c673cae 10321{
11fdf7f2
TL
10322 epoch_t epoch;
10323 {
10324 std::lock_guard l(shard_lock);
10325 dout(10) << pg->pg_id << " " << pg << dendl;
10326 auto p = pg_slots.find(pg->pg_id);
10327 ceph_assert(p != pg_slots.end());
10328 auto *slot = p->second.get();
10329 dout(20) << pg->pg_id << " waiting_for_split " << slot->waiting_for_split
10330 << dendl;
10331 ceph_assert(!slot->pg);
10332 ceph_assert(!slot->waiting_for_split.empty());
10333 _attach_pg(slot, pg);
10334
10335 epoch = pg->get_osdmap_epoch();
10336 ceph_assert(slot->waiting_for_split.count(epoch));
10337 slot->waiting_for_split.erase(epoch);
10338 if (slot->waiting_for_split.empty()) {
10339 _wake_pg_slot(pg->pg_id, slot);
10340 } else {
10341 dout(10) << __func__ << " still waiting for split on "
10342 << slot->waiting_for_split << dendl;
10343 }
7c673cae 10344 }
11fdf7f2
TL
10345
10346 // kick child to ensure it pulls up to the latest osdmap
10347 osd->enqueue_peering_evt(
10348 pg->pg_id,
10349 PGPeeringEventRef(
10350 std::make_shared<PGPeeringEvent>(
10351 epoch,
10352 epoch,
10353 NullEvt())));
10354
10355 std::lock_guard l{sdata_wait_lock};
10356 sdata_cond.notify_one();
7c673cae
FG
10357}
10358
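// unprime_split_children(): drop any slots that were primed for children of
// the given parent (as of old_pg_num), requeueing whatever was parked on them
// first; used when a previously anticipated split no longer applies.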
11fdf7f2 10359void OSDShard::unprime_split_children(spg_t parent, unsigned old_pg_num)
7c673cae 10360{
11fdf7f2
TL
10361 std::lock_guard l(shard_lock);
10362 vector<spg_t> to_delete;
10363 for (auto& i : pg_slots) {
10364 if (i.first != parent &&
10365 i.first.get_ancestor(old_pg_num) == parent) {
10366 dout(10) << __func__ << " parent " << parent << " clearing " << i.first
10367 << dendl;
10368 _wake_pg_slot(i.first, i.second.get());
10369 to_delete.push_back(i.first);
10370 }
10371 }
10372 for (auto pgid : to_delete) {
10373 pg_slots.erase(pgid);
10374 }
10375}
10376
9f95a23c
TL
10377OSDShard::OSDShard(
10378 int id,
10379 CephContext *cct,
10380 OSD *osd)
10381 : shard_id(id),
10382 cct(cct),
10383 osd(osd),
10384 shard_name(string("OSDShard.") + stringify(id)),
10385 sdata_wait_lock_name(shard_name + "::sdata_wait_lock"),
10386 sdata_wait_lock{make_mutex(sdata_wait_lock_name)},
10387 osdmap_lock{make_mutex(shard_name + "::osdmap_lock")},
10388 shard_lock_name(shard_name + "::shard_lock"),
10389 shard_lock{make_mutex(shard_lock_name)},
10390 scheduler(ceph::osd::scheduler::make_scheduler(cct)),
10391 context_queue(sdata_wait_lock, sdata_cond)
10392{
10393 dout(0) << "using op scheduler " << *scheduler << dendl;
10394}
10395
11fdf7f2
TL
10396
10397// =============================================================
10398
10399#undef dout_context
10400#define dout_context osd->cct
10401#undef dout_prefix
10402#define dout_prefix *_dout << "osd." << osd->whoami << " op_wq "
10403
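// _add_slot_waiter(): park an item on its PG slot.  Peering items are keyed
// by the map epoch they need in waiting_peering; everything else goes onto
// the plain waiting list.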
10404void OSD::ShardedOpWQ::_add_slot_waiter(
10405 spg_t pgid,
10406 OSDShardPGSlot *slot,
9f95a23c 10407 OpSchedulerItem&& qi)
11fdf7f2
TL
10408{
10409 if (qi.is_peering()) {
10410 dout(20) << __func__ << " " << pgid
10411 << " peering, item epoch is "
10412 << qi.get_map_epoch()
10413 << ", will wait on " << qi << dendl;
10414 slot->waiting_peering[qi.get_map_epoch()].push_back(std::move(qi));
10415 } else {
10416 dout(20) << __func__ << " " << pgid
10417 << " item epoch is "
10418 << qi.get_map_epoch()
10419 << ", will wait on " << qi << dendl;
10420 slot->waiting.push_back(std::move(qi));
7c673cae
FG
10421 }
10422}
10423
10424#undef dout_prefix
10425#define dout_prefix *_dout << "osd." << osd->whoami << " op_wq(" << shard_index << ") "
10426
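// _process(): body of the sharded worker-thread loop.  Roughly: map the
// thread to its shard, sleep if both the scheduler and (for the
// lowest-indexed thread) the context_queue are empty, then dequeue one
// OpSchedulerItem and append it to the slot's to_process list.  If the slot
// has a PG, take the PG lock and re-check requeue_seq / slot->pg to detect
// races with _wake_pg_slot or PG removal.  If there is no PG yet, either
// create it (for create events that still map here), park the item, or drop
// it, depending on the item and the shard's osdmap.  Finally run the item
// and complete any collected oncommit contexts.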
10427void OSD::ShardedOpWQ::_process(uint32_t thread_index, heartbeat_handle_d *hb)
10428{
11fdf7f2
TL
10429 uint32_t shard_index = thread_index % osd->num_shards;
10430 auto& sdata = osd->shards[shard_index];
10431 ceph_assert(sdata);
10432
 10433 // If every thread of a shard handled oncommits, completions could be
 10434 // delivered out of order. So we pick the thread with the smallest
 10435 // thread_index (thread_index < num_shards) for each shard to run the
 10436 // oncommit callbacks.
10437 bool is_smallest_thread_index = thread_index < osd->num_shards;
7c673cae
FG
10438
10439 // peek at spg_t
11fdf7f2 10440 sdata->shard_lock.lock();
9f95a23c 10441 if (sdata->scheduler->empty() &&
11fdf7f2
TL
10442 (!is_smallest_thread_index || sdata->context_queue.empty())) {
10443 std::unique_lock wait_lock{sdata->sdata_wait_lock};
10444 if (is_smallest_thread_index && !sdata->context_queue.empty()) {
10445 // we raced with a context_queue addition, don't wait
10446 wait_lock.unlock();
10447 } else if (!sdata->stop_waiting) {
10448 dout(20) << __func__ << " empty q, waiting" << dendl;
10449 osd->cct->get_heartbeat_map()->clear_timeout(hb);
10450 sdata->shard_lock.unlock();
10451 sdata->sdata_cond.wait(wait_lock);
10452 wait_lock.unlock();
10453 sdata->shard_lock.lock();
9f95a23c 10454 if (sdata->scheduler->empty() &&
11fdf7f2
TL
10455 !(is_smallest_thread_index && !sdata->context_queue.empty())) {
10456 sdata->shard_lock.unlock();
10457 return;
10458 }
e306af50 10459 // found a work item; reapply default wq timeouts
11fdf7f2 10460 osd->cct->get_heartbeat_map()->reset_timeout(hb,
e306af50 10461 timeout_interval, suicide_interval);
11fdf7f2
TL
10462 } else {
 10463 dout(20) << __func__ << " need to return immediately" << dendl;
10464 wait_lock.unlock();
10465 sdata->shard_lock.unlock();
7c673cae
FG
10466 return;
10467 }
10468 }
11fdf7f2
TL
10469
10470 list<Context *> oncommits;
9f95a23c
TL
10471 if (is_smallest_thread_index) {
10472 sdata->context_queue.move_to(oncommits);
7c673cae 10473 }
11fdf7f2 10474
9f95a23c 10475 if (sdata->scheduler->empty()) {
11fdf7f2
TL
10476 if (osd->is_stopping()) {
10477 sdata->shard_lock.unlock();
10478 for (auto c : oncommits) {
10479 dout(10) << __func__ << " discarding in-flight oncommit " << c << dendl;
10480 delete c;
10481 }
10482 return; // OSD shutdown, discard.
7c673cae 10483 }
11fdf7f2
TL
10484 sdata->shard_lock.unlock();
10485 handle_oncommits(oncommits);
10486 return;
7c673cae 10487 }
7c673cae 10488
9f95a23c 10489 OpSchedulerItem item = sdata->scheduler->dequeue();
11fdf7f2
TL
10490 if (osd->is_stopping()) {
10491 sdata->shard_lock.unlock();
10492 for (auto c : oncommits) {
10493 dout(10) << __func__ << " discarding in-flight oncommit " << c << dendl;
10494 delete c;
10495 }
10496 return; // OSD shutdown, discard.
10497 }
7c673cae 10498
11fdf7f2
TL
10499 const auto token = item.get_ordering_token();
10500 auto r = sdata->pg_slots.emplace(token, nullptr);
10501 if (r.second) {
10502 r.first->second = make_unique<OSDShardPGSlot>();
7c673cae 10503 }
11fdf7f2
TL
10504 OSDShardPGSlot *slot = r.first->second.get();
10505 dout(20) << __func__ << " " << token
10506 << (r.second ? " (new)" : "")
10507 << " to_process " << slot->to_process
10508 << " waiting " << slot->waiting
10509 << " waiting_peering " << slot->waiting_peering
10510 << dendl;
10511 slot->to_process.push_back(std::move(item));
10512 dout(20) << __func__ << " " << slot->to_process.back()
10513 << " queued" << dendl;
7c673cae 10514
11fdf7f2
TL
10515 retry_pg:
10516 PGRef pg = slot->pg;
7c673cae 10517
11fdf7f2
TL
10518 // lock pg (if we have it)
10519 if (pg) {
10520 // note the requeue seq now...
10521 uint64_t requeue_seq = slot->requeue_seq;
10522 ++slot->num_running;
7c673cae 10523
11fdf7f2
TL
10524 sdata->shard_lock.unlock();
10525 osd->service.maybe_inject_dispatch_delay();
10526 pg->lock();
10527 osd->service.maybe_inject_dispatch_delay();
10528 sdata->shard_lock.lock();
7c673cae 10529
11fdf7f2
TL
10530 auto q = sdata->pg_slots.find(token);
10531 if (q == sdata->pg_slots.end()) {
10532 // this can happen if we race with pg removal.
10533 dout(20) << __func__ << " slot " << token << " no longer there" << dendl;
10534 pg->unlock();
10535 sdata->shard_lock.unlock();
10536 handle_oncommits(oncommits);
10537 return;
10538 }
10539 slot = q->second.get();
10540 --slot->num_running;
7c673cae 10541
11fdf7f2
TL
10542 if (slot->to_process.empty()) {
10543 // raced with _wake_pg_slot or consume_map
10544 dout(20) << __func__ << " " << token
10545 << " nothing queued" << dendl;
7c673cae 10546 pg->unlock();
11fdf7f2
TL
10547 sdata->shard_lock.unlock();
10548 handle_oncommits(oncommits);
10549 return;
7c673cae 10550 }
11fdf7f2
TL
10551 if (requeue_seq != slot->requeue_seq) {
10552 dout(20) << __func__ << " " << token
10553 << " requeue_seq " << slot->requeue_seq << " > our "
10554 << requeue_seq << ", we raced with _wake_pg_slot"
10555 << dendl;
7c673cae 10556 pg->unlock();
11fdf7f2
TL
10557 sdata->shard_lock.unlock();
10558 handle_oncommits(oncommits);
10559 return;
7c673cae 10560 }
11fdf7f2
TL
10561 if (slot->pg != pg) {
10562 // this can happen if we race with pg removal.
10563 dout(20) << __func__ << " slot " << token << " no longer attached to "
10564 << pg << dendl;
7c673cae 10565 pg->unlock();
11fdf7f2 10566 goto retry_pg;
7c673cae 10567 }
7c673cae
FG
10568 }
10569
11fdf7f2
TL
10570 dout(20) << __func__ << " " << token
10571 << " to_process " << slot->to_process
10572 << " waiting " << slot->waiting
10573 << " waiting_peering " << slot->waiting_peering << dendl;
10574
10575 ThreadPool::TPHandle tp_handle(osd->cct, hb, timeout_interval,
10576 suicide_interval);
10577
7c673cae 10578 // take next item
11fdf7f2
TL
10579 auto qi = std::move(slot->to_process.front());
10580 slot->to_process.pop_front();
10581 dout(20) << __func__ << " " << qi << " pg " << pg << dendl;
10582 set<pair<spg_t,epoch_t>> new_children;
10583 OSDMapRef osdmap;
7c673cae 10584
11fdf7f2 10585 while (!pg) {
7c673cae 10586 // should this pg shard exist on this osd in this (or a later) epoch?
11fdf7f2
TL
10587 osdmap = sdata->shard_osdmap;
10588 const PGCreateInfo *create_info = qi.creates_pg();
10589 if (!slot->waiting_for_split.empty()) {
10590 dout(20) << __func__ << " " << token
10591 << " splitting " << slot->waiting_for_split << dendl;
10592 _add_slot_waiter(token, slot, std::move(qi));
10593 } else if (qi.get_map_epoch() > osdmap->get_epoch()) {
10594 dout(20) << __func__ << " " << token
10595 << " map " << qi.get_map_epoch() << " > "
10596 << osdmap->get_epoch() << dendl;
10597 _add_slot_waiter(token, slot, std::move(qi));
10598 } else if (qi.is_peering()) {
10599 if (!qi.peering_requires_pg()) {
10600 // for pg-less events, we run them under the ordering lock, since
10601 // we don't have the pg lock to keep them ordered.
10602 qi.run(osd, sdata, pg, tp_handle);
10603 } else if (osdmap->is_up_acting_osd_shard(token, osd->whoami)) {
10604 if (create_info) {
10605 if (create_info->by_mon &&
10606 osdmap->get_pg_acting_primary(token.pgid) != osd->whoami) {
10607 dout(20) << __func__ << " " << token
10608 << " no pg, no longer primary, ignoring mon create on "
10609 << qi << dendl;
10610 } else {
10611 dout(20) << __func__ << " " << token
10612 << " no pg, should create on " << qi << dendl;
10613 pg = osd->handle_pg_create_info(osdmap, create_info);
10614 if (pg) {
10615 // we created the pg! drop out and continue "normally"!
10616 sdata->_attach_pg(slot, pg.get());
10617 sdata->_wake_pg_slot(token, slot);
10618
10619 // identify split children between create epoch and shard epoch.
10620 osd->service.identify_splits_and_merges(
10621 pg->get_osdmap(), osdmap, pg->pg_id, &new_children, nullptr);
10622 sdata->_prime_splits(&new_children);
10623 // distribute remaining split children to other shards below!
10624 break;
10625 }
10626 dout(20) << __func__ << " ignored create on " << qi << dendl;
10627 }
10628 } else {
10629 dout(20) << __func__ << " " << token
10630 << " no pg, peering, !create, discarding " << qi << dendl;
10631 }
10632 } else {
10633 dout(20) << __func__ << " " << token
10634 << " no pg, peering, doesn't map here e" << osdmap->get_epoch()
10635 << ", discarding " << qi
10636 << dendl;
10637 }
10638 } else if (osdmap->is_up_acting_osd_shard(token, osd->whoami)) {
10639 dout(20) << __func__ << " " << token
10640 << " no pg, should exist e" << osdmap->get_epoch()
10641 << ", will wait on " << qi << dendl;
10642 _add_slot_waiter(token, slot, std::move(qi));
7c673cae 10643 } else {
11fdf7f2
TL
10644 dout(20) << __func__ << " " << token
10645 << " no pg, shouldn't exist e" << osdmap->get_epoch()
10646 << ", dropping " << qi << dendl;
7c673cae 10647 // share map with client?
9f95a23c
TL
10648 if (std::optional<OpRequestRef> _op = qi.maybe_get_op()) {
10649 osd->service.maybe_share_map((*_op)->get_req()->get_connection().get(),
10650 sdata->shard_osdmap,
10651 (*_op)->sent_epoch);
7c673cae 10652 }
11fdf7f2 10653 unsigned pushes_to_free = qi.get_reserved_pushes();
7c673cae 10654 if (pushes_to_free > 0) {
11fdf7f2 10655 sdata->shard_lock.unlock();
7c673cae 10656 osd->service.release_reserved_pushes(pushes_to_free);
11fdf7f2 10657 handle_oncommits(oncommits);
7c673cae
FG
10658 return;
10659 }
10660 }
11fdf7f2
TL
10661 sdata->shard_lock.unlock();
10662 handle_oncommits(oncommits);
7c673cae
FG
10663 return;
10664 }
11fdf7f2
TL
10665 if (qi.is_peering()) {
10666 OSDMapRef osdmap = sdata->shard_osdmap;
10667 if (qi.get_map_epoch() > osdmap->get_epoch()) {
10668 _add_slot_waiter(token, slot, std::move(qi));
10669 sdata->shard_lock.unlock();
10670 pg->unlock();
10671 handle_oncommits(oncommits);
10672 return;
10673 }
10674 }
10675 sdata->shard_lock.unlock();
7c673cae 10676
11fdf7f2
TL
10677 if (!new_children.empty()) {
10678 for (auto shard : osd->shards) {
10679 shard->prime_splits(osdmap, &new_children);
10680 }
10681 ceph_assert(new_children.empty());
10682 }
7c673cae
FG
10683
10684 // osd_opwq_process marks the point at which an operation has been dequeued
10685 // and will begin to be handled by a worker thread.
10686 {
10687#ifdef WITH_LTTNG
10688 osd_reqid_t reqid;
9f95a23c 10689 if (std::optional<OpRequestRef> _op = qi.maybe_get_op()) {
7c673cae
FG
10690 reqid = (*_op)->get_reqid();
10691 }
10692#endif
10693 tracepoint(osd, opwq_process_start, reqid.name._type,
10694 reqid.name._num, reqid.tid, reqid.inc);
10695 }
10696
10697 lgeneric_subdout(osd->cct, osd, 30) << "dequeue status: ";
10698 Formatter *f = Formatter::create("json");
10699 f->open_object_section("q");
10700 dump(f);
10701 f->close_section();
10702 f->flush(*_dout);
10703 delete f;
10704 *_dout << dendl;
10705
11fdf7f2 10706 qi.run(osd, sdata, pg, tp_handle);
7c673cae
FG
10707
10708 {
10709#ifdef WITH_LTTNG
10710 osd_reqid_t reqid;
9f95a23c 10711 if (std::optional<OpRequestRef> _op = qi.maybe_get_op()) {
7c673cae
FG
10712 reqid = (*_op)->get_reqid();
10713 }
10714#endif
10715 tracepoint(osd, opwq_process_finish, reqid.name._type,
10716 reqid.name._num, reqid.tid, reqid.inc);
10717 }
10718
11fdf7f2 10719 handle_oncommits(oncommits);
7c673cae
FG
10720}
10721
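// _enqueue(): hash the item's ordering token to a shard, enqueue it on that
// shard's scheduler under shard_lock, and wake the shard's workers only when
// the scheduler was previously empty (otherwise a worker is already awake).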
9f95a23c 10722void OSD::ShardedOpWQ::_enqueue(OpSchedulerItem&& item) {
7c673cae 10723 uint32_t shard_index =
11fdf7f2 10724 item.get_ordering_token().hash_to_shard(osd->shards.size());
7c673cae 10725
9f95a23c
TL
10726 dout(20) << __func__ << " " << item << dendl;
10727
11fdf7f2 10728 OSDShard* sdata = osd->shards[shard_index];
7c673cae 10729 ceph_assert(sdata);
7c673cae 10730
9f95a23c
TL
10731 bool empty = true;
10732 {
10733 std::lock_guard l{sdata->shard_lock};
10734 empty = sdata->scheduler->empty();
10735 sdata->scheduler->enqueue(std::move(item));
10736 }
7c673cae 10737
9f95a23c
TL
10738 if (empty) {
10739 std::lock_guard l{sdata->sdata_wait_lock};
f6b5b4d7 10740 sdata->sdata_cond.notify_all();
9f95a23c 10741 }
7c673cae
FG
10742}
10743
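// _enqueue_front(): requeue an item at the front of its shard's scheduler.
// If the slot already has newer entries in to_process (a race with
// _process), the older item is placed at the head of to_process instead and
// the most recently dequeued item is pushed back into the scheduler,
// preserving per-PG ordering.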
9f95a23c 10744void OSD::ShardedOpWQ::_enqueue_front(OpSchedulerItem&& item)
7c673cae 10745{
11fdf7f2
TL
10746 auto shard_index = item.get_ordering_token().hash_to_shard(osd->shards.size());
10747 auto& sdata = osd->shards[shard_index];
10748 ceph_assert(sdata);
10749 sdata->shard_lock.lock();
10750 auto p = sdata->pg_slots.find(item.get_ordering_token());
10751 if (p != sdata->pg_slots.end() &&
10752 !p->second->to_process.empty()) {
7c673cae 10753 // we may be racing with _process, which has dequeued a new item
9f95a23c 10754 // from scheduler, put it on to_process, and is now busy taking the
7c673cae
FG
10755 // pg lock. ensure this old requeued item is ordered before any
10756 // such newer item in to_process.
11fdf7f2
TL
10757 p->second->to_process.push_front(std::move(item));
10758 item = std::move(p->second->to_process.back());
10759 p->second->to_process.pop_back();
10760 dout(20) << __func__
10761 << " " << p->second->to_process.front()
10762 << " shuffled w/ " << item << dendl;
7c673cae 10763 } else {
11fdf7f2 10764 dout(20) << __func__ << " " << item << dendl;
7c673cae 10765 }
9f95a23c 10766 sdata->scheduler->enqueue_front(std::move(item));
11fdf7f2
TL
10767 sdata->shard_lock.unlock();
10768 std::lock_guard l{sdata->sdata_wait_lock};
10769 sdata->sdata_cond.notify_one();
7c673cae
FG
10770}
10771
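// heap(): handler for the OSD "heap <heapcmd>" admin/tell command.  Forwards
// the sub-command (and optional value) to the tcmalloc heap profiler via
// ceph_heap_profiler_handle_command(); returns -EOPNOTSUPP when the OSD was
// not built against tcmalloc.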
10772namespace ceph {
10773namespace osd_cmds {
10774
11fdf7f2
TL
10775int heap(CephContext& cct, const cmdmap_t& cmdmap, Formatter& f,
10776 std::ostream& os)
7c673cae
FG
10777{
10778 if (!ceph_using_tcmalloc()) {
10779 os << "could not issue heap profiler command -- not using tcmalloc!";
10780 return -EOPNOTSUPP;
10781 }
10782
10783 string cmd;
9f95a23c 10784 if (!cmd_getval(cmdmap, "heapcmd", cmd)) {
7c673cae
FG
 10785 os << "unable to get value for command \"heapcmd\"";
10786 return -EINVAL;
11fdf7f2 10787 }
7c673cae
FG
10788
10789 std::vector<std::string> cmd_vec;
10790 get_str_vec(cmd, cmd_vec);
11fdf7f2
TL
10791
10792 string val;
9f95a23c 10793 if (cmd_getval(cmdmap, "value", val)) {
11fdf7f2
TL
10794 cmd_vec.push_back(val);
10795 }
7c673cae
FG
10796
10797 ceph_heap_profiler_handle_command(cmd_vec, os);
10798
10799 return 0;
10800}
10801
10802}} // namespace ceph::osd_cmds