]> git.proxmox.com Git - ceph.git/blame - ceph/src/osd/OSD.cc
import new upstream nautilus stable release 14.2.8
[ceph.git] / ceph / src / osd / OSD.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2017 OVH
8 *
9 * This is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License version 2.1, as published by the Free Software
12 * Foundation. See file COPYING.
13 *
14 */
11fdf7f2 15
7c673cae 16#include "acconfig.h"
11fdf7f2
TL
17
18#include <cctype>
7c673cae
FG
19#include <fstream>
20#include <iostream>
11fdf7f2
TL
21#include <iterator>
22
23#include <unistd.h>
7c673cae
FG
24#include <sys/stat.h>
25#include <signal.h>
eafe8130 26#include <time.h>
7c673cae 27#include <boost/scoped_ptr.hpp>
eafe8130 28#include <boost/range/adaptor/reversed.hpp>
7c673cae
FG
29
30#ifdef HAVE_SYS_PARAM_H
31#include <sys/param.h>
32#endif
33
34#ifdef HAVE_SYS_MOUNT_H
35#include <sys/mount.h>
36#endif
37
38#include "osd/PG.h"
39
40#include "include/types.h"
41#include "include/compat.h"
11fdf7f2 42#include "include/random.h"
7c673cae
FG
43
44#include "OSD.h"
45#include "OSDMap.h"
46#include "Watch.h"
47#include "osdc/Objecter.h"
48
49#include "common/errno.h"
50#include "common/ceph_argparse.h"
224ce89b 51#include "common/ceph_time.h"
7c673cae 52#include "common/version.h"
b5b8bbf5 53#include "common/pick_address.h"
11fdf7f2
TL
54#include "common/blkdev.h"
55#include "common/numa.h"
7c673cae
FG
56
57#include "os/ObjectStore.h"
58#ifdef HAVE_LIBFUSE
59#include "os/FuseStore.h"
60#endif
61
62#include "PrimaryLogPG.h"
63
7c673cae
FG
64#include "msg/Messenger.h"
65#include "msg/Message.h"
66
67#include "mon/MonClient.h"
68
69#include "messages/MLog.h"
70
71#include "messages/MGenericMessage.h"
7c673cae
FG
72#include "messages/MOSDPing.h"
73#include "messages/MOSDFailure.h"
74#include "messages/MOSDMarkMeDown.h"
75#include "messages/MOSDFull.h"
76#include "messages/MOSDOp.h"
77#include "messages/MOSDOpReply.h"
78#include "messages/MOSDBackoff.h"
79#include "messages/MOSDBeacon.h"
80#include "messages/MOSDRepOp.h"
81#include "messages/MOSDRepOpReply.h"
82#include "messages/MOSDBoot.h"
83#include "messages/MOSDPGTemp.h"
11fdf7f2 84#include "messages/MOSDPGReadyToMerge.h"
7c673cae
FG
85
86#include "messages/MOSDMap.h"
87#include "messages/MMonGetOSDMap.h"
88#include "messages/MOSDPGNotify.h"
89#include "messages/MOSDPGQuery.h"
90#include "messages/MOSDPGLog.h"
91#include "messages/MOSDPGRemove.h"
92#include "messages/MOSDPGInfo.h"
93#include "messages/MOSDPGCreate.h"
11fdf7f2 94#include "messages/MOSDPGCreate2.h"
7c673cae
FG
95#include "messages/MOSDPGTrim.h"
96#include "messages/MOSDPGScan.h"
7c673cae
FG
97#include "messages/MBackfillReserve.h"
98#include "messages/MRecoveryReserve.h"
c07f9fc5 99#include "messages/MOSDForceRecovery.h"
7c673cae
FG
100#include "messages/MOSDECSubOpWrite.h"
101#include "messages/MOSDECSubOpWriteReply.h"
102#include "messages/MOSDECSubOpRead.h"
103#include "messages/MOSDECSubOpReadReply.h"
104#include "messages/MOSDPGCreated.h"
105#include "messages/MOSDPGUpdateLogMissing.h"
106#include "messages/MOSDPGUpdateLogMissingReply.h"
107
11fdf7f2
TL
108#include "messages/MOSDPeeringOp.h"
109
7c673cae
FG
110#include "messages/MOSDAlive.h"
111
112#include "messages/MOSDScrub.h"
11fdf7f2 113#include "messages/MOSDScrub2.h"
7c673cae
FG
114#include "messages/MOSDRepScrub.h"
115
116#include "messages/MMonCommand.h"
117#include "messages/MCommand.h"
118#include "messages/MCommandReply.h"
119
120#include "messages/MPGStats.h"
121#include "messages/MPGStatsAck.h"
122
123#include "messages/MWatchNotify.h"
124#include "messages/MOSDPGPush.h"
125#include "messages/MOSDPGPushReply.h"
126#include "messages/MOSDPGPull.h"
127
128#include "common/perf_counters.h"
129#include "common/Timer.h"
130#include "common/LogClient.h"
131#include "common/AsyncReserver.h"
132#include "common/HeartbeatMap.h"
133#include "common/admin_socket.h"
134#include "common/ceph_context.h"
135
136#include "global/signal_handler.h"
137#include "global/pidfile.h"
138
139#include "include/color.h"
140#include "perfglue/cpu_profiler.h"
141#include "perfglue/heap_profiler.h"
142
143#include "osd/OpRequest.h"
144
145#include "auth/AuthAuthorizeHandler.h"
146#include "auth/RotatingKeyRing.h"
7c673cae
FG
147
148#include "objclass/objclass.h"
149
150#include "common/cmdparse.h"
151#include "include/str_list.h"
152#include "include/util.h"
153
11fdf7f2 154#include "include/ceph_assert.h"
7c673cae
FG
155#include "common/config.h"
156#include "common/EventTrace.h"
157
11fdf7f2
TL
158#include "json_spirit/json_spirit_reader.h"
159#include "json_spirit/json_spirit_writer.h"
160
7c673cae
FG
161#ifdef WITH_LTTNG
162#define TRACEPOINT_DEFINE
163#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
164#include "tracing/osd.h"
165#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
166#undef TRACEPOINT_DEFINE
167#else
168#define tracepoint(...)
169#endif
170
171#define dout_context cct
172#define dout_subsys ceph_subsys_osd
173#undef dout_prefix
174#define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
175
224ce89b 176
7c673cae
FG
177static ostream& _prefix(std::ostream* _dout, int whoami, epoch_t epoch) {
178 return *_dout << "osd." << whoami << " " << epoch << " ";
179}
180
7c673cae
FG
181//Initial features in new superblock.
182//Features here are also automatically upgraded
183CompatSet OSD::get_osd_initial_compat_set() {
184 CompatSet::FeatureSet ceph_osd_feature_compat;
185 CompatSet::FeatureSet ceph_osd_feature_ro_compat;
186 CompatSet::FeatureSet ceph_osd_feature_incompat;
187 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
188 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGINFO);
189 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_OLOC);
190 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEC);
191 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES);
192 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL);
193 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BIGINFO);
194 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO);
195 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG);
196 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER);
197 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HINTS);
198 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGMETA);
199 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_MISSING);
200 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_FASTINFO);
c07f9fc5 201 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_RECOVERY_DELETES);
7c673cae
FG
202 return CompatSet(ceph_osd_feature_compat, ceph_osd_feature_ro_compat,
203 ceph_osd_feature_incompat);
204}
205
206//Features are added here that this OSD supports.
207CompatSet OSD::get_osd_compat_set() {
208 CompatSet compat = get_osd_initial_compat_set();
209 //Any features here can be set in code, but not in initial superblock
210 compat.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
211 return compat;
212}
213
// OSDService: shared helper state for a single OSD daemon.  Everything is
// initialized from the owning OSD*; the member-init order below must match
// the declaration order in OSD.h, so do not reorder casually.
OSDService::OSDService(OSD *osd) :
  osd(osd),
  cct(osd->cct),
  whoami(osd->whoami), store(osd->store),
  log_client(osd->log_client), clog(osd->clog),
  pg_recovery_stats(osd->pg_recovery_stats),
  cluster_messenger(osd->cluster_messenger),
  client_messenger(osd->client_messenger),
  logger(osd->logger),
  recoverystate_perf(osd->recoverystate_perf),
  monc(osd->monc),
  class_handler(osd->class_handler),
  // Tracked config values; updated automatically on config change.
  osd_max_object_size(cct->_conf, "osd_max_object_size"),
  osd_skip_data_digest(cct->_conf, "osd_skip_data_digest"),
  publish_lock{ceph::make_mutex("OSDService::publish_lock")},
  pre_publish_lock{ceph::make_mutex("OSDService::pre_publish_lock")},
  max_oldest_map(0),
  peer_map_epoch_lock("OSDService::peer_map_epoch_lock"),
  sched_scrub_lock("OSDService::sched_scrub_lock"),
  scrubs_local(0),
  scrubs_remote(0),
  // Cache-tiering agent state.
  agent_lock("OSDService::agent_lock"),
  agent_valid_iterator(false),
  agent_ops(0),
  flush_mode_high_count(0),
  agent_active(true),
  agent_thread(this),
  agent_stop_flag(false),
  agent_timer_lock("OSDService::agent_timer_lock"),
  agent_timer(osd->client_messenger->cct, agent_timer_lock),
  last_recalibrate(ceph_clock_now()),
  promote_max_objects(0),
  promote_max_bytes(0),
  objecter(new Objecter(osd->client_messenger->cct, osd->objecter_messenger, osd->monc, NULL, 0, 0)),
  m_objecter_finishers(cct->_conf->osd_objecter_finishers),
  watch_lock("OSDService::watch_lock"),
  watch_timer(osd->client_messenger->cct, watch_lock),
  next_notif_id(0),
  recovery_request_lock("OSDService::recovery_request_lock"),
  recovery_request_timer(cct, recovery_request_lock, false),
  sleep_lock("OSDService::sleep_lock"),
  sleep_timer(cct, sleep_lock, false),
  reserver_finisher(cct),
  // Backfill reservers share one finisher; both are capped by
  // osd_max_backfills and prioritized by osd_min_recovery_priority.
  local_reserver(cct, &reserver_finisher, cct->_conf->osd_max_backfills,
		 cct->_conf->osd_min_recovery_priority),
  remote_reserver(cct, &reserver_finisher, cct->_conf->osd_max_backfills,
		  cct->_conf->osd_min_recovery_priority),
  pg_temp_lock("OSDService::pg_temp_lock"),
  snap_reserver(cct, &reserver_finisher,
		cct->_conf->osd_max_trimming_pgs),
  recovery_lock("OSDService::recovery_lock"),
  recovery_ops_active(0),
  recovery_ops_reserved(0),
  recovery_paused(false),
  map_cache_lock("OSDService::map_cache_lock"),
  map_cache(cct, cct->_conf->osd_map_cache_size),
  map_bl_cache(cct->_conf->osd_map_cache_size),
  map_bl_inc_cache(cct->_conf->osd_map_cache_size),
  stat_lock("OSDService::stat_lock"),
  full_status_lock("OSDService::full_status_lock"),
  cur_state(NONE),
  cur_ratio(0), physical_ratio(0),
  epoch_lock("OSDService::epoch_lock"),
  boot_epoch(0), up_epoch(0), bind_epoch(0),
  is_stopping_lock("OSDService::is_stopping_lock")
#ifdef PG_DEBUG_REFS
  , pgid_lock("OSDService::pgid_lock")
#endif
{
  objecter->init();

  // One Finisher per configured objecter finisher thread; owned here,
  // deleted in the destructor.
  for (int i = 0; i < m_objecter_finishers; i++) {
    ostringstream str;
    str << "objecter-finisher-" << i;
    Finisher *fin = new Finisher(osd->client_messenger->cct, str.str(), "finisher");
    objecter_finishers.push_back(fin);
  }
}
292
293OSDService::~OSDService()
294{
295 delete objecter;
11fdf7f2
TL
296
297 for (auto f : objecter_finishers) {
298 delete f;
299 f = NULL;
300 }
7c673cae
FG
301}
302
31f18b77
FG
303
304
#ifdef PG_DEBUG_REFS
// Debug-only PG reference tracking: count outstanding refs per pgid so
// leaked PG references can be dumped at shutdown.
void OSDService::add_pgid(spg_t pgid, PG *pg){
  std::lock_guard l(pgid_lock);
  if (!pgid_tracker.count(pgid)) {
    // remember the live PG so dump_live_pgids() can ask it for details
    live_pgs[pgid] = pg;
  }
  pgid_tracker[pgid]++;
}
// Drop one reference for pgid; forget the PG once the count hits zero.
void OSDService::remove_pgid(spg_t pgid, PG *pg)
{
  std::lock_guard l(pgid_lock);
  ceph_assert(pgid_tracker.count(pgid));
  ceph_assert(pgid_tracker[pgid] > 0);
  pgid_tracker[pgid]--;
  if (pgid_tracker[pgid] == 0) {
    pgid_tracker.erase(pgid);
    live_pgs.erase(pgid);
  }
}
// Log every pgid that still holds references, and let each live PG dump
// its own ref holders.
void OSDService::dump_live_pgids()
{
  std::lock_guard l(pgid_lock);
  derr << "live pgids:" << dendl;
  for (map<spg_t, int>::const_iterator i = pgid_tracker.cbegin();
       i != pgid_tracker.cend();
       ++i) {
    derr << "\t" << *i << dendl;
    live_pgs[i->first]->dump_live_ids();
  }
}
#endif
336
337
7c673cae 338
11fdf7f2
TL
// Walk the recorded pg_num history of pgid's pool between old_map and
// new_map and report every PG split child (and, optionally, every PG
// involved in a merge) that this pgid implies over that interval.
//
// split_children: receives (child pgid, epoch of split).
// merge_pgs:      if non-null, receives (pgid, epoch) for merge sources
//                 and targets.
// A BFS over a work queue is used because a discovered child/parent can
// itself split or merge at a later epoch in the same interval.
void OSDService::identify_splits_and_merges(
  OSDMapRef old_map,
  OSDMapRef new_map,
  spg_t pgid,
  set<pair<spg_t,epoch_t>> *split_children,
  set<pair<spg_t,epoch_t>> *merge_pgs)
{
  if (!old_map->have_pg_pool(pgid.pool())) {
    return;
  }
  int old_pgnum = old_map->get_pg_num(pgid.pool());
  // pg_num_history records, per pool, the epochs at which pg_num changed.
  auto p = osd->pg_num_history.pg_nums.find(pgid.pool());
  if (p == osd->pg_num_history.pg_nums.end()) {
    return;
  }
  dout(20) << __func__ << " " << pgid << " e" << old_map->get_epoch()
	   << " to e" << new_map->get_epoch()
	   << " pg_nums " << p->second << dendl;
  deque<spg_t> queue;
  queue.push_back(pgid);
  set<spg_t> did;  // guards against re-queueing PGs we already scanned
  while (!queue.empty()) {
    auto cur = queue.front();
    queue.pop_front();
    did.insert(cur);
    unsigned pgnum = old_pgnum;
    // Replay each pg_num change within (old epoch, new epoch].
    for (auto q = p->second.lower_bound(old_map->get_epoch());
	 q != p->second.end() &&
	   q->first <= new_map->get_epoch();
	 ++q) {
      if (pgnum < q->second) {
	// split?
	if (cur.ps() < pgnum) {
	  set<spg_t> children;
	  if (cur.is_split(pgnum, q->second, &children)) {
	    dout(20) << __func__ << " " << cur << " e" << q->first
		     << " pg_num " << pgnum << " -> " << q->second
		     << " children " << children << dendl;
	    for (auto i : children) {
	      split_children->insert(make_pair(i, q->first));
	      if (!did.count(i))
		queue.push_back(i);
	    }
	  }
	} else if (cur.ps() < q->second) {
	  dout(20) << __func__ << " " << cur << " e" << q->first
		   << " pg_num " << pgnum << " -> " << q->second
		   << " is a child" << dendl;
	  // normally we'd capture this from the parent, but it's
	  // possible the parent doesn't exist yet (it will be
	  // fabricated to allow an intervening merge). note this PG
	  // as a split child here to be sure we catch it.
	  split_children->insert(make_pair(cur, q->first));
	} else {
	  dout(20) << __func__ << " " << cur << " e" << q->first
		   << " pg_num " << pgnum << " -> " << q->second
		   << " is post-split, skipping" << dendl;
	}
      } else if (merge_pgs) {
	// merge?
	if (cur.ps() >= q->second) {
	  if (cur.ps() < pgnum) {
	    // cur disappears in this merge; find its target and siblings.
	    spg_t parent;
	    if (cur.is_merge_source(pgnum, q->second, &parent)) {
	      set<spg_t> children;
	      parent.is_split(q->second, pgnum, &children);
	      dout(20) << __func__ << " " << cur << " e" << q->first
		       << " pg_num " << pgnum << " -> " << q->second
		       << " is merge source, target " << parent
		       << ", source(s) " << children << dendl;
	      merge_pgs->insert(make_pair(parent, q->first));
	      if (!did.count(parent)) {
		// queue (and re-scan) parent in case it might not exist yet
		// and there are some future splits pending on it
		queue.push_back(parent);
	      }
	      for (auto c : children) {
		merge_pgs->insert(make_pair(c, q->first));
		if (!did.count(c))
		  queue.push_back(c);
	      }
	    }
	  } else {
	    dout(20) << __func__ << " " << cur << " e" << q->first
		     << " pg_num " << pgnum << " -> " << q->second
		     << " is beyond old pgnum, skipping" << dendl;
	  }
	} else {
	  // cur survives the merge: it is the merge target.
	  set<spg_t> children;
	  if (cur.is_split(q->second, pgnum, &children)) {
	    dout(20) << __func__ << " " << cur << " e" << q->first
		     << " pg_num " << pgnum << " -> " << q->second
		     << " is merge target, source " << children << dendl;
	    for (auto c : children) {
	      merge_pgs->insert(make_pair(c, q->first));
	      if (!did.count(c))
		queue.push_back(c);
	    }
	    merge_pgs->insert(make_pair(cur, q->first));
	  }
	}
      }
      pgnum = q->second;
    }
  }
}
445
7c673cae
FG
// Thin forwarder: ask the owning OSD to refresh its heartbeat peer set.
void OSDService::need_heartbeat_peer_update()
{
  osd->need_heartbeat_peer_update();
}
450
7c673cae
FG
451void OSDService::start_shutdown()
452{
453 {
11fdf7f2 454 std::lock_guard l(agent_timer_lock);
7c673cae
FG
455 agent_timer.shutdown();
456 }
31f18b77
FG
457
458 {
11fdf7f2
TL
459 std::lock_guard l(sleep_lock);
460 sleep_timer.shutdown();
31f18b77 461 }
81eedcae
TL
462
463 {
464 std::lock_guard l(recovery_request_lock);
465 recovery_request_timer.shutdown();
466 }
7c673cae
FG
467}
468
31f18b77 469void OSDService::shutdown_reserver()
7c673cae
FG
470{
471 reserver_finisher.wait_for_empty();
472 reserver_finisher.stop();
31f18b77
FG
473}
474
// Final service shutdown: stop the watch timer, the objecter and its
// finishers, then drop the published osdmap references.  Order matters:
// the objecter must stop before its finishers are drained.
void OSDService::shutdown()
{
  {
    std::lock_guard l(watch_lock);
    watch_timer.shutdown();
  }

  objecter->shutdown();
  for (auto f : objecter_finishers) {
    f->wait_for_empty();
    f->stop();
  }

  // Clear the published map so late readers see an empty ref.
  publish_map(OSDMapRef());
  next_osdmap = OSDMapRef();
}
491
// Start the service's worker machinery: finishers, objecter, timers and
// the tiering agent thread.  Called once during OSD startup.
void OSDService::init()
{
  reserver_finisher.start();
  for (auto f : objecter_finishers) {
    f->start();
  }
  objecter->set_client_incarnation(0);

  // deprioritize objecter in daemonperf output
  objecter->get_logger()->set_prio_adjust(-3);

  watch_timer.init();
  agent_timer.init();

  agent_thread.create("osd_srv_agent");

  // Optionally delay recovery at boot (osd_recovery_delay_start seconds).
  if (cct->_conf->osd_recovery_delay_start)
    defer_recovery(cct->_conf->osd_recovery_delay_start);
}
511
// Last init step: start the objecter once an initial osdmap is available.
void OSDService::final_init()
{
  objecter->start(osdmap.get());
}
516
517void OSDService::activate_map()
518{
519 // wake/unwake the tiering agent
520 agent_lock.Lock();
521 agent_active =
522 !osdmap->test_flag(CEPH_OSDMAP_NOTIERAGENT) &&
523 osd->is_active();
524 agent_cond.Signal();
525 agent_lock.Unlock();
526}
527
181888fb
FG
// Ask the OSD to subscribe for osdmaps starting at epoch e (one-shot).
void OSDService::request_osdmap_update(epoch_t e)
{
  osd->osdmap_subscribe(e, false);
}
532
7c673cae
FG
533class AgentTimeoutCB : public Context {
534 PGRef pg;
535public:
536 explicit AgentTimeoutCB(PGRef _pg) : pg(_pg) {}
537 void finish(int) override {
538 pg->agent_choose_mode_restart();
539 }
540};
541
// Cache-tiering agent thread main loop.  Repeatedly takes the
// highest-level tier from agent_queue and drives flush/evict work on one
// of its PGs.  agent_lock is held except across the (potentially slow)
// pg->agent_work() call; the iterator invalidation flag exists because
// the queue can change while the lock is dropped.
void OSDService::agent_entry()
{
  dout(10) << __func__ << " start" << dendl;
  agent_lock.Lock();

  while (!agent_stop_flag) {
    if (agent_queue.empty()) {
      dout(20) << __func__ << " empty queue" << dendl;
      agent_cond.Wait(agent_lock);
      continue;
    }
    // Highest evict effort level is at the back of the map.
    uint64_t level = agent_queue.rbegin()->first;
    set<PGRef>& top = agent_queue.rbegin()->second;
    dout(10) << __func__
	     << " tiers " << agent_queue.size()
	     << ", top is " << level
	     << " with pgs " << top.size()
	     << ", ops " << agent_ops << "/"
	     << cct->_conf->osd_agent_max_ops
	     << (agent_active ? " active" : " NOT ACTIVE")
	     << dendl;
    dout(20) << __func__ << " oids " << agent_oids << dendl;
    // Budget: in low-flush mode use the (smaller) low-ops limit.
    int max = cct->_conf->osd_agent_max_ops - agent_ops;
    int agent_flush_quota = max;
    if (!flush_mode_high_count)
      agent_flush_quota = cct->_conf->osd_agent_max_low_ops - agent_ops;
    if (agent_flush_quota <= 0 || top.empty() || !agent_active) {
      agent_cond.Wait(agent_lock);
      continue;
    }

    // Round-robin over the PGs of the top tier; reset the cursor if the
    // set changed while the lock was released.
    if (!agent_valid_iterator || agent_queue_pos == top.end()) {
      agent_queue_pos = top.begin();
      agent_valid_iterator = true;
    }
    PGRef pg = *agent_queue_pos;
    dout(10) << "high_count " << flush_mode_high_count
	     << " agent_ops " << agent_ops
	     << " flush_quota " << agent_flush_quota << dendl;
    // Drop the lock while the PG does real work.
    agent_lock.Unlock();
    if (!pg->agent_work(max, agent_flush_quota)) {
      dout(10) << __func__ << " " << pg->pg_id
	       << " no agent_work, delay for " << cct->_conf->osd_agent_delay_time
	       << " seconds" << dendl;

      osd->logger->inc(l_osd_tier_delay);
      // Queue a timer to call agent_choose_mode for this pg in 5 seconds
      agent_timer_lock.Lock();
      Context *cb = new AgentTimeoutCB(pg);
      agent_timer.add_event_after(cct->_conf->osd_agent_delay_time, cb);
      agent_timer_lock.Unlock();
    }
    agent_lock.Lock();
  }
  agent_lock.Unlock();
  dout(10) << __func__ << " finish" << dendl;
}
599
// Stop the tiering agent thread.  Asserts that all agent ops were already
// cancelled and the queue drained (PGs shut down first), then sets the
// stop flag, wakes the thread and joins it.
void OSDService::agent_stop()
{
  {
    std::lock_guard l(agent_lock);

    // By this time all ops should be cancelled
    ceph_assert(agent_ops == 0);
    // By this time all PGs are shutdown and dequeued
    if (!agent_queue.empty()) {
      set<PGRef>& top = agent_queue.rbegin()->second;
      derr << "agent queue not empty, for example " << (*top.begin())->get_pgid() << dendl;
      ceph_abort_msg("agent queue not empty");
    }

    agent_stop_flag = true;
    agent_cond.Signal();
  }
  agent_thread.join();
}
619
620// -------------------------------------
621
// Periodically recalibrate the cache-tier promotion probability
// (promote_probability_millis, in thousandths) so that the observed
// promotion rate tracks the configured object/sec and bytes/sec targets.
// Also sets hard per-interval caps to mitigate promotion stampedes.
void OSDService::promote_throttle_recalibrate()
{
  utime_t now = ceph_clock_now();
  double dur = now - last_recalibrate;
  last_recalibrate = now;
  unsigned prob = promote_probability_millis;

  uint64_t target_obj_sec = cct->_conf->osd_tier_promote_max_objects_sec;
  uint64_t target_bytes_sec = cct->_conf->osd_tier_promote_max_bytes_sec;

  unsigned min_prob = 1;

  // Sample (and decay) the promotion counters since the last interval.
  uint64_t attempts, obj, bytes;
  promote_counter.sample_and_attenuate(&attempts, &obj, &bytes);
  dout(10) << __func__ << " " << attempts << " attempts, promoted "
	   << obj << " objects and " << byte_u_t(bytes) << "; target "
	   << target_obj_sec << " obj/sec or "
	   << byte_u_t(target_bytes_sec) << "/sec"
	   << dendl;

  // calculate what the probability *should* be, given the targets
  unsigned new_prob;
  if (attempts && dur > 0) {
    uint64_t avg_size = 1;
    if (obj)
      avg_size = std::max<uint64_t>(bytes / obj, 1);
    // po/pb: probability (in millis) implied by the object and byte
    // targets respectively; take the more restrictive when both are set.
    unsigned po = (double)target_obj_sec * dur * 1000.0 / (double)attempts;
    unsigned pb = (double)target_bytes_sec / (double)avg_size * dur * 1000.0
      / (double)attempts;
    dout(20) << __func__ << " po " << po << " pb " << pb << " avg_size "
	     << avg_size << dendl;
    if (target_obj_sec && target_bytes_sec)
      new_prob = std::min(po, pb);
    else if (target_obj_sec)
      new_prob = po;
    else if (target_bytes_sec)
      new_prob = pb;
    else
      new_prob = 1000;  // no targets configured: promote everything
  } else {
    new_prob = 1000;
  }
  dout(20) << __func__ << " new_prob " << new_prob << dendl;

  // correct for persistent skew between target rate and actual rate, adjust
  double ratio = 1.0;
  unsigned actual = 0;
  if (attempts && obj) {
    actual = obj * 1000 / attempts;
    ratio = (double)actual / (double)prob;
    new_prob = (double)new_prob / ratio;
  }
  new_prob = std::max(new_prob, min_prob);
  new_prob = std::min(new_prob, 1000u);

  // adjust (move halfway toward the computed probability, clamped)
  prob = (prob + new_prob) / 2;
  prob = std::max(prob, min_prob);
  prob = std::min(prob, 1000u);
  dout(10) << __func__ << " actual " << actual
	   << ", actual/prob ratio " << ratio
	   << ", adjusted new_prob " << new_prob
	   << ", prob " << promote_probability_millis << " -> " << prob
	   << dendl;
  promote_probability_millis = prob;

  // set hard limits for this interval to mitigate stampedes
  promote_max_objects = target_obj_sec * osd->OSD_TICK_INTERVAL * 2;
  promote_max_bytes = target_bytes_sec * osd->OSD_TICK_INTERVAL * 2;
}
692
693// -------------------------------------
694
695float OSDService::get_failsafe_full_ratio()
696{
697 float full_ratio = cct->_conf->osd_failsafe_full_ratio;
698 if (full_ratio > 1.0) full_ratio /= 100.0;
699 return full_ratio;
700}
701
11fdf7f2 702OSDService::s_names OSDService::recalc_full_state(float ratio, float pratio, string &inject)
7c673cae 703{
7c673cae
FG
704 // The OSDMap ratios take precendence. So if the failsafe is .95 and
705 // the admin sets the cluster full to .96, the failsafe moves up to .96
706 // too. (Not that having failsafe == full is ideal, but it's better than
707 // dropping writes before the clusters appears full.)
708 OSDMapRef osdmap = get_osdmap();
709 if (!osdmap || osdmap->get_epoch() == 0) {
11fdf7f2 710 return NONE;
7c673cae
FG
711 }
712 float nearfull_ratio = osdmap->get_nearfull_ratio();
713 float backfillfull_ratio = std::max(osdmap->get_backfillfull_ratio(), nearfull_ratio);
714 float full_ratio = std::max(osdmap->get_full_ratio(), backfillfull_ratio);
715 float failsafe_ratio = std::max(get_failsafe_full_ratio(), full_ratio);
716
31f18b77 717 if (osdmap->require_osd_release < CEPH_RELEASE_LUMINOUS) {
7c673cae
FG
718 // use the failsafe for nearfull and full; the mon isn't using the
719 // flags anyway because we're mid-upgrade.
720 full_ratio = failsafe_ratio;
721 backfillfull_ratio = failsafe_ratio;
722 nearfull_ratio = failsafe_ratio;
723 } else if (full_ratio <= 0 ||
724 backfillfull_ratio <= 0 ||
725 nearfull_ratio <= 0) {
726 derr << __func__ << " full_ratio, backfillfull_ratio or nearfull_ratio is <= 0" << dendl;
727 // use failsafe flag. ick. the monitor did something wrong or the user
728 // did something stupid.
729 full_ratio = failsafe_ratio;
730 backfillfull_ratio = failsafe_ratio;
731 nearfull_ratio = failsafe_ratio;
732 }
733
7c673cae 734 if (injectfull_state > NONE && injectfull) {
7c673cae 735 inject = "(Injected)";
11fdf7f2
TL
736 return injectfull_state;
737 } else if (pratio > failsafe_ratio) {
738 return FAILSAFE;
7c673cae 739 } else if (ratio > full_ratio) {
11fdf7f2 740 return FULL;
7c673cae 741 } else if (ratio > backfillfull_ratio) {
11fdf7f2 742 return BACKFILLFULL;
92f5a8d4 743 } else if (pratio > nearfull_ratio) {
11fdf7f2 744 return NEARFULL;
7c673cae 745 }
11fdf7f2
TL
746 return NONE;
747}
748
// Update the cached fullness state from fresh usage ratios and log/clog
// on state transitions (loudly when the failsafe engages or disengages).
void OSDService::check_full_status(float ratio, float pratio)
{
  std::lock_guard l(full_status_lock);

  cur_ratio = ratio;
  physical_ratio = pratio;

  string inject;
  s_names new_state;
  new_state = recalc_full_state(ratio, pratio, inject);

  dout(20) << __func__ << " cur ratio " << ratio
	   << ", physical ratio " << pratio
	   << ", new state " << get_full_state_name(new_state)
	   << " " << inject
	   << dendl;

  // warn
  if (cur_state != new_state) {
    dout(10) << __func__ << " " << get_full_state_name(cur_state)
	     << " -> " << get_full_state_name(new_state) << dendl;
    if (new_state == FAILSAFE) {
      clog->error() << "full status failsafe engaged, dropping updates, now "
		    << (int)roundf(ratio * 100) << "% full";
    } else if (cur_state == FAILSAFE) {
      clog->error() << "full status failsafe disengaged, no longer dropping "
		    << "updates, now " << (int)roundf(ratio * 100) << "% full";
    }
    cur_state = new_state;
  }
}
780
781bool OSDService::need_fullness_update()
782{
783 OSDMapRef osdmap = get_osdmap();
784 s_names cur = NONE;
785 if (osdmap->exists(whoami)) {
786 if (osdmap->get_state(whoami) & CEPH_OSD_FULL) {
787 cur = FULL;
788 } else if (osdmap->get_state(whoami) & CEPH_OSD_BACKFILLFULL) {
789 cur = BACKFILLFULL;
790 } else if (osdmap->get_state(whoami) & CEPH_OSD_NEARFULL) {
791 cur = NEARFULL;
792 }
793 }
794 s_names want = NONE;
795 if (is_full())
796 want = FULL;
797 else if (is_backfillfull())
798 want = BACKFILLFULL;
799 else if (is_nearfull())
800 want = NEARFULL;
801 return want != cur;
802}
803
11fdf7f2 804bool OSDService::_check_inject_full(DoutPrefixProvider *dpp, s_names type) const
7c673cae 805{
7c673cae
FG
806 if (injectfull && injectfull_state >= type) {
807 // injectfull is either a count of the number of times to return failsafe full
808 // or if -1 then always return full
809 if (injectfull > 0)
810 --injectfull;
11fdf7f2
TL
811 ldpp_dout(dpp, 10) << __func__ << " Injected " << get_full_state_name(type) << " OSD ("
812 << (injectfull < 0 ? "set" : std::to_string(injectfull)) << ")"
813 << dendl;
7c673cae
FG
814 return true;
815 }
11fdf7f2
TL
816 return false;
817}
818
// Is the cached fullness state at least `type`?  Checks the injected
// state first (testing hook), and logs the current usage when triggered.
bool OSDService::_check_full(DoutPrefixProvider *dpp, s_names type) const
{
  std::lock_guard l(full_status_lock);

  if (_check_inject_full(dpp, type))
    return true;

  if (cur_state >= type)
    ldpp_dout(dpp, 10) << __func__ << " current usage is " << cur_ratio
		       << " physical " << physical_ratio << dendl;

  return cur_state >= type;
}
832
11fdf7f2
TL
// Would the OSD be at least `type` full if an additional adjust_used bytes
// were consumed on top of adjusted_stat?  Used to decide whether e.g. a
// backfill would push us over a threshold before starting it.
bool OSDService::_tentative_full(DoutPrefixProvider *dpp, s_names type, uint64_t adjust_used, osd_stat_t adjusted_stat)
{
  ldpp_dout(dpp, 20) << __func__ << " type " << get_full_state_name(type) << " adjust_used " << (adjust_used >> 10) << "KiB" << dendl;
  {
    // Injection check needs the lock; the recalculation below does not.
    std::lock_guard l(full_status_lock);
    if (_check_inject_full(dpp, type)) {
      return true;
    }
  }

  float pratio;
  float ratio = compute_adjusted_ratio(adjusted_stat, &pratio, adjust_used);

  string notused;
  s_names tentative_state = recalc_full_state(ratio, pratio, notused);

  if (tentative_state >= type)
    ldpp_dout(dpp, 10) << __func__ << " tentative usage is " << ratio << dendl;

  return tentative_state >= type;
}
854
// True if usage has reached the failsafe threshold (writes are dropped).
bool OSDService::check_failsafe_full(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, FAILSAFE);
}
859
// True if usage has reached the full threshold.
bool OSDService::check_full(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, FULL);
}
864
11fdf7f2 865bool OSDService::tentative_backfill_full(DoutPrefixProvider *dpp, uint64_t adjust_used, osd_stat_t stats)
7c673cae 866{
11fdf7f2 867 return _tentative_full(dpp, BACKFILLFULL, adjust_used, stats);
7c673cae
FG
868}
869
11fdf7f2 870bool OSDService::check_backfill_full(DoutPrefixProvider *dpp) const
7c673cae 871{
11fdf7f2 872 return _check_full(dpp, BACKFILLFULL);
7c673cae
FG
873}
874
11fdf7f2 875bool OSDService::check_nearfull(DoutPrefixProvider *dpp) const
7c673cae 876{
11fdf7f2 877 return _check_full(dpp, NEARFULL);
7c673cae
FG
878}
879
// Cached-state accessor: exactly at the FAILSAFE level (no logging).
bool OSDService::is_failsafe_full() const
{
  std::lock_guard l(full_status_lock);
  return cur_state == FAILSAFE;
}
885
// Cached-state accessor: at least FULL.
bool OSDService::is_full() const
{
  std::lock_guard l(full_status_lock);
  return cur_state >= FULL;
}
891
// Cached-state accessor: at least BACKFILLFULL.
bool OSDService::is_backfillfull() const
{
  std::lock_guard l(full_status_lock);
  return cur_state >= BACKFILLFULL;
}
897
// Cached-state accessor: at least NEARFULL.
bool OSDService::is_nearfull() const
{
  std::lock_guard l(full_status_lock);
  return cur_state >= NEARFULL;
}
903
// Testing hook: force the fullness state to `type` for `count` checks
// (count == -1 means indefinitely).  See _check_inject_full().
void OSDService::set_injectfull(s_names type, int64_t count)
{
  std::lock_guard l(full_status_lock);
  injectfull_state = type;
  injectfull = count;
}
910
11fdf7f2
TL
// Record fresh store statfs results (plus any objectstore alerts) into
// osd_stat and the perf counters.  When fake_statfs_for_testing is set,
// the totals are synthesized from the configured fake size minus the sum
// of per-PG byte counts, so many OSDs can share one partition in tests.
void OSDService::set_statfs(const struct store_statfs_t &stbuf,
			    osd_alert_list_t& alerts)
{
  uint64_t bytes = stbuf.total;
  uint64_t avail = stbuf.available;
  uint64_t used = stbuf.get_used_raw();

  // For testing fake statfs values so it doesn't matter if all
  // OSDs are using the same partition.
  if (cct->_conf->fake_statfs_for_testing) {
    uint64_t total_num_bytes = 0;
    vector<PGRef> pgs;
    osd->_get_pgs(&pgs);
    for (auto p : pgs) {
      total_num_bytes += p->get_stats_num_bytes();
    }
    bytes = cct->_conf->fake_statfs_for_testing;
    if (total_num_bytes < bytes)
      avail = bytes - total_num_bytes;
    else
      avail = 0;
    dout(0) << __func__ << " fake total " << cct->_conf->fake_statfs_for_testing
	    << " adjust available " << avail
	    << dendl;
    used = bytes - avail;
  }

  osd->logger->set(l_osd_stat_bytes, bytes);
  osd->logger->set(l_osd_stat_bytes_used, used);
  osd->logger->set(l_osd_stat_bytes_avail, avail);

  std::lock_guard l(stat_lock);
  osd_stat.statfs = stbuf;
  osd_stat.os_alerts.clear();
  osd_stat.os_alerts[whoami].swap(alerts);
  if (cct->_conf->fake_statfs_for_testing) {
    osd_stat.statfs.total = bytes;
    osd_stat.statfs.available = avail;
    // For testing don't want used to go negative, so clear reserved
    osd_stat.statfs.internally_reserved = 0;
  }
}
7c673cae 953
11fdf7f2
TL
// Refresh osd_stat with the current heartbeat peer list, op-age histogram
// and PG count, drop at most one stale heartbeat-ping entry, and return a
// snapshot of the stats.  Note: hb_peers is consumed (swapped out).
osd_stat_t OSDService::set_osd_stat(vector<int>& hb_peers,
				    int num_pgs)
{
  utime_t now = ceph_clock_now();
  auto stale_time = g_conf().get_val<int64_t>("osd_mon_heartbeat_stat_stale");
  std::lock_guard l(stat_lock);
  osd_stat.hb_peers.swap(hb_peers);
  osd->op_tracker.get_age_ms_histogram(&osd_stat.op_queue_age_hist);
  osd_stat.num_pgs = num_pgs;
  // Clean entries that aren't updated
  // This is called often enough that we can just remove 1 at a time
  // (the break after erase() keeps the range-for iterator valid).
  for (auto i: osd_stat.hb_pingtime) {
    if (i.second.last_update == 0)
      continue;
    if (stale_time && now.sec() - i.second.last_update > stale_time) {
      dout(20) << __func__ << " time out heartbeat for osd " << i.first
	       << " last_update " << i.second.last_update << dendl;
      osd_stat.hb_pingtime.erase(i.first);
      break;
    }
  }
  return osd_stat;
}
977
978void OSDService::inc_osd_stat_repaired()
979{
980 std::lock_guard l(stat_lock);
981 osd_stat.num_shards_repaired++;
982 return;
983}
984
985float OSDService::compute_adjusted_ratio(osd_stat_t new_stat, float *pratio,
986 uint64_t adjust_used)
987{
988 *pratio =
989 ((float)new_stat.statfs.get_used()) / ((float)new_stat.statfs.total);
990
991 if (adjust_used) {
992 dout(20) << __func__ << " Before kb_used() " << new_stat.statfs.kb_used() << dendl;
993 if (new_stat.statfs.available > adjust_used)
994 new_stat.statfs.available -= adjust_used;
995 else
996 new_stat.statfs.available = 0;
997 dout(20) << __func__ << " After kb_used() " << new_stat.statfs.kb_used() << dendl;
7c673cae
FG
998 }
999
11fdf7f2
TL
1000 // Check all pgs and adjust kb_used to include all pending backfill data
1001 int backfill_adjusted = 0;
1002 vector<PGRef> pgs;
1003 osd->_get_pgs(&pgs);
1004 for (auto p : pgs) {
1005 backfill_adjusted += p->pg_stat_adjust(&new_stat);
1006 }
1007 if (backfill_adjusted) {
1008 dout(20) << __func__ << " backfill adjusted " << new_stat << dendl;
1009 }
1010 return ((float)new_stat.statfs.get_used()) / ((float)new_stat.statfs.total);
7c673cae
FG
1011}
1012
1013bool OSDService::check_osdmap_full(const set<pg_shard_t> &missing_on)
1014{
1015 OSDMapRef osdmap = get_osdmap();
1016 for (auto shard : missing_on) {
1017 if (osdmap->get_state(shard.osd) & CEPH_OSD_FULL)
1018 return true;
1019 }
1020 return false;
1021}
1022
1023void OSDService::send_message_osd_cluster(int peer, Message *m, epoch_t from_epoch)
1024{
1025 OSDMapRef next_map = get_nextmap_reserved();
1026 // service map is always newer/newest
11fdf7f2 1027 ceph_assert(from_epoch <= next_map->get_epoch());
7c673cae
FG
1028
1029 if (next_map->is_down(peer) ||
1030 next_map->get_info(peer).up_from > from_epoch) {
1031 m->put();
1032 release_map(next_map);
1033 return;
1034 }
11fdf7f2
TL
1035 ConnectionRef peer_con = osd->cluster_messenger->connect_to_osd(
1036 next_map->get_cluster_addrs(peer));
7c673cae
FG
1037 share_map_peer(peer, peer_con.get(), next_map);
1038 peer_con->send_message(m);
1039 release_map(next_map);
1040}
1041
1042ConnectionRef OSDService::get_con_osd_cluster(int peer, epoch_t from_epoch)
1043{
1044 OSDMapRef next_map = get_nextmap_reserved();
1045 // service map is always newer/newest
11fdf7f2 1046 ceph_assert(from_epoch <= next_map->get_epoch());
7c673cae
FG
1047
1048 if (next_map->is_down(peer) ||
1049 next_map->get_info(peer).up_from > from_epoch) {
1050 release_map(next_map);
1051 return NULL;
1052 }
11fdf7f2
TL
1053 ConnectionRef con = osd->cluster_messenger->connect_to_osd(
1054 next_map->get_cluster_addrs(peer));
7c673cae
FG
1055 release_map(next_map);
1056 return con;
1057}
1058
1059pair<ConnectionRef,ConnectionRef> OSDService::get_con_osd_hb(int peer, epoch_t from_epoch)
1060{
1061 OSDMapRef next_map = get_nextmap_reserved();
1062 // service map is always newer/newest
11fdf7f2 1063 ceph_assert(from_epoch <= next_map->get_epoch());
7c673cae
FG
1064
1065 pair<ConnectionRef,ConnectionRef> ret;
1066 if (next_map->is_down(peer) ||
1067 next_map->get_info(peer).up_from > from_epoch) {
1068 release_map(next_map);
1069 return ret;
1070 }
11fdf7f2
TL
1071 ret.first = osd->hb_back_client_messenger->connect_to_osd(
1072 next_map->get_hb_back_addrs(peer));
1073 ret.second = osd->hb_front_client_messenger->connect_to_osd(
1074 next_map->get_hb_front_addrs(peer));
7c673cae
FG
1075 release_map(next_map);
1076 return ret;
1077}
1078
11fdf7f2
TL
1079entity_name_t OSDService::get_cluster_msgr_name() const
1080{
1081 return cluster_messenger->get_myname();
1082}
7c673cae 1083
94b18763
FG
1084void OSDService::queue_want_pg_temp(pg_t pgid,
1085 const vector<int>& want,
1086 bool forced)
7c673cae 1087{
11fdf7f2 1088 std::lock_guard l(pg_temp_lock);
94b18763 1089 auto p = pg_temp_pending.find(pgid);
7c673cae 1090 if (p == pg_temp_pending.end() ||
94b18763
FG
1091 p->second.acting != want ||
1092 forced) {
11fdf7f2 1093 pg_temp_wanted[pgid] = {want, forced};
7c673cae
FG
1094 }
1095}
1096
1097void OSDService::remove_want_pg_temp(pg_t pgid)
1098{
11fdf7f2 1099 std::lock_guard l(pg_temp_lock);
7c673cae
FG
1100 pg_temp_wanted.erase(pgid);
1101 pg_temp_pending.erase(pgid);
1102}
1103
1104void OSDService::_sent_pg_temp()
1105{
11fdf7f2
TL
1106#ifdef HAVE_STDLIB_MAP_SPLICING
1107 pg_temp_pending.merge(pg_temp_wanted);
1108#else
94b18763
FG
1109 pg_temp_pending.insert(make_move_iterator(begin(pg_temp_wanted)),
1110 make_move_iterator(end(pg_temp_wanted)));
11fdf7f2 1111#endif
7c673cae
FG
1112 pg_temp_wanted.clear();
1113}
1114
1115void OSDService::requeue_pg_temp()
1116{
11fdf7f2 1117 std::lock_guard l(pg_temp_lock);
7c673cae
FG
1118 // wanted overrides pending. note that remove_want_pg_temp
1119 // clears the item out of both.
1120 unsigned old_wanted = pg_temp_wanted.size();
1121 unsigned old_pending = pg_temp_pending.size();
1122 _sent_pg_temp();
1123 pg_temp_wanted.swap(pg_temp_pending);
1124 dout(10) << __func__ << " " << old_wanted << " + " << old_pending << " -> "
1125 << pg_temp_wanted.size() << dendl;
1126}
1127
94b18763
FG
1128std::ostream& operator<<(std::ostream& out,
1129 const OSDService::pg_temp_t& pg_temp)
1130{
1131 out << pg_temp.acting;
1132 if (pg_temp.forced) {
1133 out << " (forced)";
1134 }
1135 return out;
1136}
1137
7c673cae
FG
1138void OSDService::send_pg_temp()
1139{
11fdf7f2 1140 std::lock_guard l(pg_temp_lock);
7c673cae
FG
1141 if (pg_temp_wanted.empty())
1142 return;
1143 dout(10) << "send_pg_temp " << pg_temp_wanted << dendl;
94b18763 1144 MOSDPGTemp *ms[2] = {nullptr, nullptr};
11fdf7f2
TL
1145 for (auto& [pgid, pg_temp] : pg_temp_wanted) {
1146 auto& m = ms[pg_temp.forced];
94b18763
FG
1147 if (!m) {
1148 m = new MOSDPGTemp(osdmap->get_epoch());
11fdf7f2 1149 m->forced = pg_temp.forced;
94b18763 1150 }
11fdf7f2 1151 m->pg_temp.emplace(pgid, pg_temp.acting);
94b18763
FG
1152 }
1153 for (auto m : ms) {
1154 if (m) {
1155 monc->send_mon_message(m);
1156 }
1157 }
7c673cae
FG
1158 _sent_pg_temp();
1159}
1160
1161void OSDService::send_pg_created(pg_t pgid)
1162{
11fdf7f2 1163 std::lock_guard l(pg_created_lock);
7c673cae 1164 dout(20) << __func__ << dendl;
11fdf7f2
TL
1165 auto o = get_osdmap();
1166 if (o->require_osd_release >= CEPH_RELEASE_LUMINOUS) {
1167 pg_created.insert(pgid);
c07f9fc5
FG
1168 monc->send_mon_message(new MOSDPGCreated(pgid));
1169 }
7c673cae
FG
1170}
1171
11fdf7f2
TL
1172void OSDService::send_pg_created()
1173{
1174 std::lock_guard l(pg_created_lock);
1175 dout(20) << __func__ << dendl;
1176 auto o = get_osdmap();
1177 if (o->require_osd_release >= CEPH_RELEASE_LUMINOUS) {
1178 for (auto pgid : pg_created) {
1179 monc->send_mon_message(new MOSDPGCreated(pgid));
1180 }
1181 }
1182}
1183
1184void OSDService::prune_pg_created()
1185{
1186 std::lock_guard l(pg_created_lock);
1187 dout(20) << __func__ << dendl;
1188 auto o = get_osdmap();
1189 auto i = pg_created.begin();
1190 while (i != pg_created.end()) {
1191 auto p = o->get_pg_pool(i->pool());
1192 if (!p || !p->has_flag(pg_pool_t::FLAG_CREATING)) {
1193 dout(20) << __func__ << " pruning " << *i << dendl;
1194 i = pg_created.erase(i);
1195 } else {
1196 dout(20) << __func__ << " keeping " << *i << dendl;
1197 ++i;
1198 }
1199 }
1200}
1201
1202
7c673cae
FG
1203// --------------------------------------
1204// dispatch
1205
1206epoch_t OSDService::get_peer_epoch(int peer)
1207{
11fdf7f2 1208 std::lock_guard l(peer_map_epoch_lock);
7c673cae
FG
1209 map<int,epoch_t>::iterator p = peer_map_epoch.find(peer);
1210 if (p == peer_map_epoch.end())
1211 return 0;
1212 return p->second;
1213}
1214
1215epoch_t OSDService::note_peer_epoch(int peer, epoch_t e)
1216{
11fdf7f2 1217 std::lock_guard l(peer_map_epoch_lock);
7c673cae
FG
1218 map<int,epoch_t>::iterator p = peer_map_epoch.find(peer);
1219 if (p != peer_map_epoch.end()) {
1220 if (p->second < e) {
1221 dout(10) << "note_peer_epoch osd." << peer << " has " << e << dendl;
1222 p->second = e;
1223 } else {
1224 dout(30) << "note_peer_epoch osd." << peer << " has " << p->second << " >= " << e << dendl;
1225 }
1226 return p->second;
1227 } else {
1228 dout(10) << "note_peer_epoch osd." << peer << " now has " << e << dendl;
1229 peer_map_epoch[peer] = e;
1230 return e;
1231 }
1232}
1233
1234void OSDService::forget_peer_epoch(int peer, epoch_t as_of)
1235{
11fdf7f2 1236 std::lock_guard l(peer_map_epoch_lock);
7c673cae
FG
1237 map<int,epoch_t>::iterator p = peer_map_epoch.find(peer);
1238 if (p != peer_map_epoch.end()) {
1239 if (p->second <= as_of) {
1240 dout(10) << "forget_peer_epoch osd." << peer << " as_of " << as_of
1241 << " had " << p->second << dendl;
1242 peer_map_epoch.erase(p);
1243 } else {
1244 dout(10) << "forget_peer_epoch osd." << peer << " as_of " << as_of
1245 << " has " << p->second << " - not forgetting" << dendl;
1246 }
1247 }
1248}
1249
// Decide whether we should push osdmap updates to the sender of a message.
// Clients: share when their session's last-sent epoch lags our map.
// OSD peers (cluster messenger, addr matches the map): share when the best
// epoch we can attribute to them lags our map.
bool OSDService::should_share_map(entity_name_t name, Connection *con,
                                  epoch_t epoch, const OSDMapRef& osdmap,
                                  const epoch_t *sent_epoch_p)
{
  dout(20) << "should_share_map "
           << name << " " << con->get_peer_addr()
           << " " << epoch << dendl;

  // does client have old map?
  if (name.is_client()) {
    bool message_sendmap = epoch < osdmap->get_epoch();
    if (message_sendmap && sent_epoch_p) {
      dout(20) << "client session last_sent_epoch: "
               << *sent_epoch_p
               << " versus osdmap epoch " << osdmap->get_epoch() << dendl;
      if (*sent_epoch_p < osdmap->get_epoch()) {
        return true;
      } // else we don't need to send it out again
    }
  }

  // OSD peer check: must be a real (non-loopback) cluster connection whose
  // address matches what the map records for that osd id.
  if (con->get_messenger() == osd->cluster_messenger &&
      con != osd->cluster_messenger->get_loopback_connection() &&
      osdmap->is_up(name.num()) &&
      (osdmap->get_cluster_addrs(name.num()) == con->get_peer_addrs() ||
       osdmap->get_hb_back_addrs(name.num()) == con->get_peer_addrs())) {
    // remember: best of what we've noted for the peer and this message
    epoch_t has = std::max(get_peer_epoch(name.num()), epoch);

    // share?
    if (has < osdmap->get_epoch()) {
      dout(10) << name << " " << con->get_peer_addr()
               << " has old map " << epoch << " < "
               << osdmap->get_epoch() << dendl;
      return true;
    }
  }

  return false;
}
1290
// Send incremental osdmap updates to a lagging sender (client or OSD peer),
// as determined by should_share_map().  Updates the client session's
// last-sent epoch (via sent_epoch_p) or our per-peer epoch record.
void OSDService::share_map(
    entity_name_t name,
    Connection *con,
    epoch_t epoch,
    OSDMapRef& osdmap,
    epoch_t *sent_epoch_p)
{
  dout(20) << "share_map "
           << name << " " << con->get_peer_addr()
           << " " << epoch << dendl;

  if (!osd->is_active()) {
    /*It is safe not to proceed as OSD is not in healthy state*/
    return;
  }

  bool want_shared = should_share_map(name, con, epoch,
                                      osdmap, sent_epoch_p);

  if (want_shared){
    if (name.is_client()) {
      dout(10) << name << " has old map " << epoch
               << " < " << osdmap->get_epoch() << dendl;
      // we know the Session is valid or we wouldn't be sending
      if (sent_epoch_p) {
        *sent_epoch_p = osdmap->get_epoch();
      }
      send_incremental_map(epoch, con, osdmap);
    } else if (con->get_messenger() == osd->cluster_messenger &&
               osdmap->is_up(name.num()) &&
               (osdmap->get_cluster_addrs(name.num()) == con->get_peer_addrs() ||
                osdmap->get_hb_back_addrs(name.num()) == con->get_peer_addrs())) {
      // OSD peer on a verified cluster connection
      dout(10) << name << " " << con->get_peer_addrs()
               << " has old map " << epoch << " < "
               << osdmap->get_epoch() << dendl;
      note_peer_epoch(name.num(), osdmap->get_epoch());
      send_incremental_map(epoch, con, osdmap);
    }
  }
}
1331
1332void OSDService::share_map_peer(int peer, Connection *con, OSDMapRef map)
1333{
1334 if (!map)
1335 map = get_osdmap();
1336
1337 // send map?
1338 epoch_t pe = get_peer_epoch(peer);
1339 if (pe) {
1340 if (pe < map->get_epoch()) {
1341 send_incremental_map(pe, con, map);
1342 note_peer_epoch(peer, map->get_epoch());
1343 } else
1344 dout(20) << "share_map_peer " << con << " already has epoch " << pe << dendl;
1345 } else {
1346 dout(20) << "share_map_peer " << con << " don't know epoch, doing nothing" << dendl;
1347 // no idea about peer's epoch.
1348 // ??? send recent ???
1349 // do nothing.
1350 }
1351}
1352
eafe8130 1353bool OSDService::can_inc_scrubs()
7c673cae
FG
1354{
1355 bool can_inc = false;
11fdf7f2 1356 std::lock_guard l(sched_scrub_lock);
7c673cae 1357
eafe8130
TL
1358 if (scrubs_local + scrubs_remote < cct->_conf->osd_max_scrubs) {
1359 dout(20) << __func__ << " == true " << scrubs_local << " local + " << scrubs_remote
1360 << " remote < max " << cct->_conf->osd_max_scrubs << dendl;
7c673cae
FG
1361 can_inc = true;
1362 } else {
eafe8130
TL
1363 dout(20) << __func__ << " == false " << scrubs_local << " local + " << scrubs_remote
1364 << " remote >= max " << cct->_conf->osd_max_scrubs << dendl;
7c673cae
FG
1365 }
1366
1367 return can_inc;
1368}
1369
eafe8130 1370bool OSDService::inc_scrubs_local()
7c673cae
FG
1371{
1372 bool result = false;
eafe8130
TL
1373 std::lock_guard l{sched_scrub_lock};
1374 if (scrubs_local + scrubs_remote < cct->_conf->osd_max_scrubs) {
1375 dout(20) << __func__ << " " << scrubs_local << " -> " << (scrubs_local+1)
1376 << " (max " << cct->_conf->osd_max_scrubs << ", remote " << scrubs_remote << ")" << dendl;
7c673cae 1377 result = true;
eafe8130 1378 ++scrubs_local;
7c673cae 1379 } else {
eafe8130 1380 dout(20) << __func__ << " " << scrubs_local << " local + " << scrubs_remote << " remote >= max " << cct->_conf->osd_max_scrubs << dendl;
7c673cae 1381 }
7c673cae
FG
1382 return result;
1383}
1384
eafe8130 1385void OSDService::dec_scrubs_local()
7c673cae 1386{
eafe8130
TL
1387 std::lock_guard l{sched_scrub_lock};
1388 dout(20) << __func__ << " " << scrubs_local << " -> " << (scrubs_local-1)
1389 << " (max " << cct->_conf->osd_max_scrubs << ", remote " << scrubs_remote << ")" << dendl;
1390 --scrubs_local;
1391 ceph_assert(scrubs_local >= 0);
7c673cae
FG
1392}
1393
eafe8130 1394bool OSDService::inc_scrubs_remote()
7c673cae 1395{
eafe8130
TL
1396 bool result = false;
1397 std::lock_guard l{sched_scrub_lock};
1398 if (scrubs_local + scrubs_remote < cct->_conf->osd_max_scrubs) {
1399 dout(20) << __func__ << " " << scrubs_remote << " -> " << (scrubs_remote+1)
1400 << " (max " << cct->_conf->osd_max_scrubs << ", local " << scrubs_local << ")" << dendl;
1401 result = true;
1402 ++scrubs_remote;
7c673cae 1403 } else {
eafe8130 1404 dout(20) << __func__ << " " << scrubs_local << " local + " << scrubs_remote << " remote >= max " << cct->_conf->osd_max_scrubs << dendl;
7c673cae 1405 }
eafe8130
TL
1406 return result;
1407}
1408
1409void OSDService::dec_scrubs_remote()
1410{
1411 std::lock_guard l{sched_scrub_lock};
1412 dout(20) << __func__ << " " << scrubs_remote << " -> " << (scrubs_remote-1)
1413 << " (max " << cct->_conf->osd_max_scrubs << ", local " << scrubs_local << ")" << dendl;
1414 --scrubs_remote;
1415 ceph_assert(scrubs_remote >= 0);
7c673cae
FG
1416}
1417
eafe8130 1418void OSDService::dump_scrub_reservations(Formatter *f)
7c673cae 1419{
eafe8130
TL
1420 std::lock_guard l{sched_scrub_lock};
1421 f->dump_int("scrubs_local", scrubs_local);
1422 f->dump_int("scrubs_remote", scrubs_remote);
1423 f->dump_int("osd_max_scrubs", cct->_conf->osd_max_scrubs);
7c673cae
FG
1424}
1425
1426void OSDService::retrieve_epochs(epoch_t *_boot_epoch, epoch_t *_up_epoch,
1427 epoch_t *_bind_epoch) const
1428{
11fdf7f2 1429 std::lock_guard l(epoch_lock);
7c673cae
FG
1430 if (_boot_epoch)
1431 *_boot_epoch = boot_epoch;
1432 if (_up_epoch)
1433 *_up_epoch = up_epoch;
1434 if (_bind_epoch)
1435 *_bind_epoch = bind_epoch;
1436}
1437
1438void OSDService::set_epochs(const epoch_t *_boot_epoch, const epoch_t *_up_epoch,
1439 const epoch_t *_bind_epoch)
1440{
11fdf7f2 1441 std::lock_guard l(epoch_lock);
7c673cae 1442 if (_boot_epoch) {
11fdf7f2 1443 ceph_assert(*_boot_epoch == 0 || *_boot_epoch >= boot_epoch);
7c673cae
FG
1444 boot_epoch = *_boot_epoch;
1445 }
1446 if (_up_epoch) {
11fdf7f2 1447 ceph_assert(*_up_epoch == 0 || *_up_epoch >= up_epoch);
7c673cae
FG
1448 up_epoch = *_up_epoch;
1449 }
1450 if (_bind_epoch) {
11fdf7f2 1451 ceph_assert(*_bind_epoch == 0 || *_bind_epoch >= bind_epoch);
7c673cae
FG
1452 bind_epoch = *_bind_epoch;
1453 }
1454}
1455
// Begin an orderly shutdown.  If we are marked up in the map, ask the mon
// to mark us down and wait (bounded by osd_mon_shutdown_timeout) for the
// ack before entering STOPPING.  Returns false if a stop is already in
// progress.
bool OSDService::prepare_to_stop()
{
  std::lock_guard l(is_stopping_lock);
  if (get_state() != NOT_STOPPING)
    return false;

  OSDMapRef osdmap = get_osdmap();
  if (osdmap && osdmap->is_up(whoami)) {
    dout(0) << __func__ << " telling mon we are shutting down" << dendl;
    set_state(PREPARING_TO_STOP);
    monc->send_mon_message(
      new MOSDMarkMeDown(
	monc->get_fsid(),
	whoami,
	osdmap->get_addrs(whoami),
	osdmap->get_epoch(),
	true // request ack
	));
    utime_t now = ceph_clock_now();
    utime_t timeout;
    timeout.set_from_double(now + cct->_conf->osd_mon_shutdown_timeout);
    // wait for got_stop_ack() to flip us to STOPPING, or for the timeout
    while ((ceph_clock_now() < timeout) &&
	   (get_state() != STOPPING)) {
      is_stopping_cond.WaitUntil(is_stopping_lock, timeout);
    }
  }
  dout(0) << __func__ << " starting shutdown" << dendl;
  set_state(STOPPING);
  return true;
}
1486
1487void OSDService::got_stop_ack()
1488{
11fdf7f2 1489 std::lock_guard l(is_stopping_lock);
7c673cae
FG
1490 if (get_state() == PREPARING_TO_STOP) {
1491 dout(0) << __func__ << " starting shutdown" << dendl;
1492 set_state(STOPPING);
1493 is_stopping_cond.Signal();
1494 } else {
1495 dout(10) << __func__ << " ignoring msg" << dendl;
1496 }
1497}
1498
// Build an MOSDMap carrying maps in (since, to].  Prefers incremental maps
// and falls back to full maps when an incremental is missing.  Bounded by
// osd_map_message_max (count) and osd_map_message_max_bytes, so the message
// may cover only a prefix of the range; m->newest_map tells the peer how
// far it still has to catch up.  On load failure, sends whatever was
// collected so far, or at minimum the newest map.
MOSDMap *OSDService::build_incremental_map_msg(epoch_t since, epoch_t to,
                                               OSDSuperblock& sblock)
{
  MOSDMap *m = new MOSDMap(monc->get_fsid(),
			   osdmap->get_encoding_features());
  m->oldest_map = max_oldest_map;
  m->newest_map = sblock.newest_map;

  int max = cct->_conf->osd_map_message_max;
  ssize_t max_bytes = cct->_conf->osd_map_message_max_bytes;

  if (since < m->oldest_map) {
    // we don't have the next map the target wants, so start with a
    // full map.
    bufferlist bl;
    dout(10) << __func__ << " oldest map " << max_oldest_map << " > since "
	     << since << ", starting with full map" << dendl;
    since = m->oldest_map;
    if (!get_map_bl(since, bl)) {
      derr << __func__ << " missing full map " << since << dendl;
      goto panic;
    }
    max--;
    max_bytes -= bl.length();
    m->maps[since].claim(bl);
  }
  for (epoch_t e = since + 1; e <= to; ++e) {
    bufferlist bl;
    if (get_inc_map_bl(e, bl)) {
      m->incremental_maps[e].claim(bl);
    } else {
      // incremental missing; substitute the full map for this epoch
      derr << __func__ << " missing incremental map " << e << dendl;
      if (!get_map_bl(e, bl)) {
	derr << __func__ << " also missing full map " << e << dendl;
	goto panic;
      }
      m->maps[e].claim(bl);
    }
    // charge this epoch against the count/byte budgets
    max--;
    max_bytes -= bl.length();
    if (max <= 0 || max_bytes <= 0) {
      break;
    }
  }
  return m;

 panic:
  if (!m->maps.empty() ||
      !m->incremental_maps.empty()) {
    // send what we have so far
    return m;
  }
  // send something
  bufferlist bl;
  if (get_inc_map_bl(m->newest_map, bl)) {
    m->incremental_maps[m->newest_map].claim(bl);
  } else {
    derr << __func__ << " unable to load latest map " << m->newest_map << dendl;
    if (!get_map_bl(m->newest_map, bl)) {
      derr << __func__ << " unable to load latest full map " << m->newest_map
	   << dendl;
      ceph_abort();
    }
    m->maps[m->newest_map].claim(bl);
  }
  return m;
}
1566
1567void OSDService::send_map(MOSDMap *m, Connection *con)
1568{
1569 con->send_message(m);
1570}
1571
1572void OSDService::send_incremental_map(epoch_t since, Connection *con,
1573 OSDMapRef& osdmap)
1574{
1575 epoch_t to = osdmap->get_epoch();
1576 dout(10) << "send_incremental_map " << since << " -> " << to
1577 << " to " << con << " " << con->get_peer_addr() << dendl;
1578
1579 MOSDMap *m = NULL;
1580 while (!m) {
1581 OSDSuperblock sblock(get_superblock());
1582 if (since < sblock.oldest_map) {
1583 // just send latest full map
28e407b8
AA
1584 MOSDMap *m = new MOSDMap(monc->get_fsid(),
1585 osdmap->get_encoding_features());
7c673cae
FG
1586 m->oldest_map = max_oldest_map;
1587 m->newest_map = sblock.newest_map;
1588 get_map_bl(to, m->maps[to]);
1589 send_map(m, con);
1590 return;
1591 }
1592
1593 if (to > since && (int64_t)(to - since) > cct->_conf->osd_map_share_max_epochs) {
1594 dout(10) << " " << (to - since) << " > max " << cct->_conf->osd_map_share_max_epochs
1595 << ", only sending most recent" << dendl;
1596 since = to - cct->_conf->osd_map_share_max_epochs;
1597 }
1598
7c673cae
FG
1599 m = build_incremental_map_msg(since, to, sblock);
1600 }
1601 send_map(m, con);
1602}
1603
1604bool OSDService::_get_map_bl(epoch_t e, bufferlist& bl)
1605{
1606 bool found = map_bl_cache.lookup(e, &bl);
31f18b77
FG
1607 if (found) {
1608 if (logger)
1609 logger->inc(l_osd_map_bl_cache_hit);
7c673cae 1610 return true;
31f18b77
FG
1611 }
1612 if (logger)
1613 logger->inc(l_osd_map_bl_cache_miss);
11fdf7f2 1614 found = store->read(meta_ch,
31f18b77
FG
1615 OSD::get_osdmap_pobject_name(e), 0, 0, bl,
1616 CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) >= 0;
1617 if (found) {
7c673cae 1618 _add_map_bl(e, bl);
31f18b77 1619 }
7c673cae
FG
1620 return found;
1621}
1622
1623bool OSDService::get_inc_map_bl(epoch_t e, bufferlist& bl)
1624{
11fdf7f2 1625 std::lock_guard l(map_cache_lock);
7c673cae 1626 bool found = map_bl_inc_cache.lookup(e, &bl);
31f18b77
FG
1627 if (found) {
1628 if (logger)
1629 logger->inc(l_osd_map_bl_cache_hit);
7c673cae 1630 return true;
31f18b77
FG
1631 }
1632 if (logger)
1633 logger->inc(l_osd_map_bl_cache_miss);
11fdf7f2 1634 found = store->read(meta_ch,
31f18b77
FG
1635 OSD::get_inc_osdmap_pobject_name(e), 0, 0, bl,
1636 CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) >= 0;
1637 if (found) {
7c673cae 1638 _add_map_inc_bl(e, bl);
31f18b77 1639 }
7c673cae
FG
1640 return found;
1641}
1642
1643void OSDService::_add_map_bl(epoch_t e, bufferlist& bl)
1644{
1645 dout(10) << "add_map_bl " << e << " " << bl.length() << " bytes" << dendl;
31f18b77
FG
1646 // cache a contiguous buffer
1647 if (bl.get_num_buffers() > 1) {
1648 bl.rebuild();
1649 }
1650 bl.try_assign_to_mempool(mempool::mempool_osd_mapbl);
7c673cae
FG
1651 map_bl_cache.add(e, bl);
1652}
1653
1654void OSDService::_add_map_inc_bl(epoch_t e, bufferlist& bl)
1655{
1656 dout(10) << "add_map_inc_bl " << e << " " << bl.length() << " bytes" << dendl;
31f18b77
FG
1657 // cache a contiguous buffer
1658 if (bl.get_num_buffers() > 1) {
1659 bl.rebuild();
1660 }
1661 bl.try_assign_to_mempool(mempool::mempool_osd_mapbl);
7c673cae
FG
1662 map_bl_inc_cache.add(e, bl);
1663}
1664
11fdf7f2 1665int OSDService::get_deleted_pool_pg_num(int64_t pool)
7c673cae 1666{
11fdf7f2
TL
1667 std::lock_guard l(map_cache_lock);
1668 auto p = deleted_pool_pg_nums.find(pool);
1669 if (p != deleted_pool_pg_nums.end()) {
1670 return p->second;
31f18b77 1671 }
11fdf7f2
TL
1672 dout(20) << __func__ << " " << pool << " loading" << dendl;
1673 ghobject_t oid = OSD::make_final_pool_info_oid(pool);
1674 bufferlist bl;
1675 int r = store->read(meta_ch, oid, 0, 0, bl);
1676 ceph_assert(r >= 0);
1677 auto blp = bl.cbegin();
1678 pg_pool_t pi;
1679 ::decode(pi, blp);
1680 deleted_pool_pg_nums[pool] = pi.get_pg_num();
1681 dout(20) << __func__ << " " << pool << " got " << pi.get_pg_num() << dendl;
1682 return pi.get_pg_num();
7c673cae
FG
1683}
1684
1685OSDMapRef OSDService::_add_map(OSDMap *o)
1686{
1687 epoch_t e = o->get_epoch();
1688
1689 if (cct->_conf->osd_map_dedup) {
1690 // Dedup against an existing map at a nearby epoch
1691 OSDMapRef for_dedup = map_cache.lower_bound(e);
1692 if (for_dedup) {
1693 OSDMap::dedup(for_dedup.get(), o);
1694 }
1695 }
1696 bool existed;
1697 OSDMapRef l = map_cache.add(e, o, &existed);
1698 if (existed) {
1699 delete o;
1700 }
1701 return l;
1702}
1703
1704OSDMapRef OSDService::try_get_map(epoch_t epoch)
1705{
11fdf7f2 1706 std::lock_guard l(map_cache_lock);
7c673cae
FG
1707 OSDMapRef retval = map_cache.lookup(epoch);
1708 if (retval) {
1709 dout(30) << "get_map " << epoch << " -cached" << dendl;
1710 if (logger) {
1711 logger->inc(l_osd_map_cache_hit);
1712 }
1713 return retval;
1714 }
1715 if (logger) {
1716 logger->inc(l_osd_map_cache_miss);
1717 epoch_t lb = map_cache.cached_key_lower_bound();
1718 if (epoch < lb) {
1719 dout(30) << "get_map " << epoch << " - miss, below lower bound" << dendl;
1720 logger->inc(l_osd_map_cache_miss_low);
1721 logger->inc(l_osd_map_cache_miss_low_avg, lb - epoch);
1722 }
1723 }
1724
1725 OSDMap *map = new OSDMap;
1726 if (epoch > 0) {
1727 dout(20) << "get_map " << epoch << " - loading and decoding " << map << dendl;
1728 bufferlist bl;
1729 if (!_get_map_bl(epoch, bl) || bl.length() == 0) {
1730 derr << "failed to load OSD map for epoch " << epoch << ", got " << bl.length() << " bytes" << dendl;
1731 delete map;
1732 return OSDMapRef();
1733 }
1734 map->decode(bl);
1735 } else {
1736 dout(20) << "get_map " << epoch << " - return initial " << map << dendl;
1737 }
1738 return _add_map(map);
1739}
1740
1741// ops
1742
1743
1744void OSDService::reply_op_error(OpRequestRef op, int err)
1745{
1746 reply_op_error(op, err, eversion_t(), 0);
1747}
1748
1749void OSDService::reply_op_error(OpRequestRef op, int err, eversion_t v,
1750 version_t uv)
1751{
1752 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
11fdf7f2 1753 ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);
7c673cae
FG
1754 int flags;
1755 flags = m->get_flags() & (CEPH_OSD_FLAG_ACK|CEPH_OSD_FLAG_ONDISK);
1756
11fdf7f2 1757 MOSDOpReply *reply = new MOSDOpReply(m, err, osdmap->get_epoch(), flags, true);
7c673cae
FG
1758 reply->set_reply_versions(v, uv);
1759 m->get_connection()->send_message(reply);
1760}
1761
// Debug aid (only active with osd_debug_misdirected_ops): log a cluster
// warning for an op that arrived at a PG that is not its primary, after
// filtering out the benign EC split/remap race described below.
void OSDService::handle_misdirected_op(PG *pg, OpRequestRef op)
{
  if (!cct->_conf->osd_debug_misdirected_ops) {
    return;
  }

  const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
  ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);

  ceph_assert(m->get_map_epoch() >= pg->get_history().same_primary_since);

  if (pg->is_ec_pg()) {
    /**
     * OSD recomputes op target based on current OSDMap. With an EC pg, we
     * can get this result:
     * 1) client at map 512 sends an op to osd 3, pg_t 3.9 based on mapping
     *    [CRUSH_ITEM_NONE, 2, 3]/3
     * 2) OSD 3 at map 513 remaps op to osd 3, spg_t 3.9s0 based on mapping
     *    [3, 2, 3]/3
     * 3) PG 3.9s0 dequeues the op at epoch 512 and notices that it isn't primary
     *    -- misdirected op
     * 4) client resends and this time PG 3.9s0 having caught up to 513 gets
     *    it and fulfils it
     *
     * We can't compute the op target based on the sending map epoch due to
     * splitting.  The simplest thing is to detect such cases here and drop
     * them without an error (the client will resend anyway).
     */
    ceph_assert(m->get_map_epoch() <= superblock.newest_map);
    OSDMapRef opmap = try_get_map(m->get_map_epoch());
    if (!opmap) {
      dout(7) << __func__ << ": " << *pg << " no longer have map for "
	      << m->get_map_epoch() << ", dropping" << dendl;
      return;
    }
    pg_t _pgid = m->get_raw_pg();
    spg_t pgid;
    if ((m->get_flags() & CEPH_OSD_FLAG_PGOP) == 0)
      _pgid = opmap->raw_pg_to_pg(_pgid);
    if (opmap->get_primary_shard(_pgid, &pgid) &&
	pgid.shard != pg->pg_id.shard) {
      // shard mismatch at the client's epoch: the benign race above
      dout(7) << __func__ << ": " << *pg << " primary changed since "
	      << m->get_map_epoch() << ", dropping" << dendl;
      return;
    }
  }

  dout(7) << *pg << " misdirected op in " << m->get_map_epoch() << dendl;
  clog->warn() << m->get_source_inst() << " misdirected " << m->get_reqid()
	       << " pg " << m->get_raw_pg()
	       << " to osd." << whoami
	       << " not " << pg->get_acting()
	       << " in e" << m->get_map_epoch() << "/" << osdmap->get_epoch();
}
1816
11fdf7f2 1817void OSDService::enqueue_back(OpQueueItem&& qi)
7c673cae 1818{
11fdf7f2 1819 osd->op_shardedwq.queue(std::move(qi));
7c673cae
FG
1820}
1821
11fdf7f2 1822void OSDService::enqueue_front(OpQueueItem&& qi)
7c673cae 1823{
11fdf7f2 1824 osd->op_shardedwq.queue_front(std::move(qi));
7c673cae
FG
1825}
1826
11fdf7f2
TL
1827void OSDService::queue_recovery_context(
1828 PG *pg,
1829 GenContext<ThreadPool::TPHandle&> *c)
7c673cae 1830{
11fdf7f2
TL
1831 epoch_t e = get_osdmap_epoch();
1832 enqueue_back(
1833 OpQueueItem(
1834 unique_ptr<OpQueueItem::OpQueueable>(
1835 new PGRecoveryContext(pg->get_pgid(), c, e)),
1836 cct->_conf->osd_recovery_cost,
1837 cct->_conf->osd_recovery_priority,
1838 ceph_clock_now(),
1839 0,
1840 e));
7c673cae
FG
1841}
1842
1843void OSDService::queue_for_snap_trim(PG *pg)
1844{
1845 dout(10) << "queueing " << *pg << " for snaptrim" << dendl;
11fdf7f2
TL
1846 enqueue_back(
1847 OpQueueItem(
1848 unique_ptr<OpQueueItem::OpQueueable>(
1849 new PGSnapTrim(pg->get_pgid(), pg->get_osdmap_epoch())),
1850 cct->_conf->osd_snap_trim_cost,
1851 cct->_conf->osd_snap_trim_priority,
1852 ceph_clock_now(),
1853 0,
1854 pg->get_osdmap_epoch()));
1855}
1856
1857void OSDService::queue_for_scrub(PG *pg, bool with_high_priority)
1858{
1859 unsigned scrub_queue_priority = pg->scrubber.priority;
1860 if (with_high_priority && scrub_queue_priority < cct->_conf->osd_client_op_priority) {
1861 scrub_queue_priority = cct->_conf->osd_client_op_priority;
1862 }
1863 const auto epoch = pg->get_osdmap_epoch();
1864 enqueue_back(
1865 OpQueueItem(
1866 unique_ptr<OpQueueItem::OpQueueable>(new PGScrub(pg->get_pgid(), epoch)),
1867 cct->_conf->osd_scrub_cost,
1868 scrub_queue_priority,
1869 ceph_clock_now(),
1870 0,
1871 epoch));
1872}
1873
1874void OSDService::queue_for_pg_delete(spg_t pgid, epoch_t e)
1875{
1876 dout(10) << __func__ << " on " << pgid << " e " << e << dendl;
1877 enqueue_back(
1878 OpQueueItem(
1879 unique_ptr<OpQueueItem::OpQueueable>(
1880 new PGDelete(pgid, e)),
1881 cct->_conf->osd_pg_delete_cost,
1882 cct->_conf->osd_pg_delete_priority,
1883 ceph_clock_now(),
1884 0,
1885 e));
1886}
1887
1888bool OSDService::try_finish_pg_delete(PG *pg, unsigned old_pg_num)
1889{
1890 return osd->try_finish_pg_delete(pg, old_pg_num);
1891}
1892
1893// ---
1894
1895void OSDService::set_ready_to_merge_source(PG *pg, eversion_t version)
1896{
1897 std::lock_guard l(merge_lock);
1898 dout(10) << __func__ << " " << pg->pg_id << dendl;
1899 ready_to_merge_source[pg->pg_id.pgid] = version;
1900 assert(not_ready_to_merge_source.count(pg->pg_id.pgid) == 0);
1901 _send_ready_to_merge();
1902}
1903
1904void OSDService::set_ready_to_merge_target(PG *pg,
1905 eversion_t version,
1906 epoch_t last_epoch_started,
1907 epoch_t last_epoch_clean)
1908{
1909 std::lock_guard l(merge_lock);
1910 dout(10) << __func__ << " " << pg->pg_id << dendl;
1911 ready_to_merge_target.insert(make_pair(pg->pg_id.pgid,
1912 make_tuple(version,
1913 last_epoch_started,
1914 last_epoch_clean)));
1915 assert(not_ready_to_merge_target.count(pg->pg_id.pgid) == 0);
1916 _send_ready_to_merge();
1917}
1918
1919void OSDService::set_not_ready_to_merge_source(pg_t source)
1920{
1921 std::lock_guard l(merge_lock);
1922 dout(10) << __func__ << " " << source << dendl;
1923 not_ready_to_merge_source.insert(source);
1924 assert(ready_to_merge_source.count(source) == 0);
1925 _send_ready_to_merge();
1926}
1927
1928void OSDService::set_not_ready_to_merge_target(pg_t target, pg_t source)
1929{
1930 std::lock_guard l(merge_lock);
1931 dout(10) << __func__ << " " << target << " source " << source << dendl;
1932 not_ready_to_merge_target[target] = source;
1933 assert(ready_to_merge_target.count(target) == 0);
1934 _send_ready_to_merge();
1935}
1936
// Public entry point: take merge_lock and flush any pending ready /
// not-ready merge notifications to the monitor.
void OSDService::send_ready_to_merge()
{
  std::lock_guard l(merge_lock);
  _send_ready_to_merge();
}
1942
1943void OSDService::_send_ready_to_merge()
1944{
1945 dout(20) << __func__
1946 << " ready_to_merge_source " << ready_to_merge_source
1947 << " not_ready_to_merge_source " << not_ready_to_merge_source
1948 << " ready_to_merge_target " << ready_to_merge_target
1949 << " not_ready_to_merge_target " << not_ready_to_merge_target
1950 << " sent_ready_to_merge_source " << sent_ready_to_merge_source
1951 << dendl;
1952 for (auto src : not_ready_to_merge_source) {
1953 if (sent_ready_to_merge_source.count(src) == 0) {
1954 monc->send_mon_message(new MOSDPGReadyToMerge(
1955 src,
1956 {}, {}, 0, 0,
1957 false,
1958 osdmap->get_epoch()));
1959 sent_ready_to_merge_source.insert(src);
1960 }
1961 }
1962 for (auto p : not_ready_to_merge_target) {
1963 if (sent_ready_to_merge_source.count(p.second) == 0) {
1964 monc->send_mon_message(new MOSDPGReadyToMerge(
1965 p.second,
1966 {}, {}, 0, 0,
1967 false,
1968 osdmap->get_epoch()));
1969 sent_ready_to_merge_source.insert(p.second);
1970 }
1971 }
1972 for (auto src : ready_to_merge_source) {
1973 if (not_ready_to_merge_source.count(src.first) ||
1974 not_ready_to_merge_target.count(src.first.get_parent())) {
1975 continue;
1976 }
1977 auto p = ready_to_merge_target.find(src.first.get_parent());
1978 if (p != ready_to_merge_target.end() &&
1979 sent_ready_to_merge_source.count(src.first) == 0) {
1980 monc->send_mon_message(new MOSDPGReadyToMerge(
1981 src.first, // source pgid
1982 src.second, // src version
1983 std::get<0>(p->second), // target version
1984 std::get<1>(p->second), // PG's last_epoch_started
1985 std::get<2>(p->second), // PG's last_epoch_clean
1986 true,
1987 osdmap->get_epoch()));
1988 sent_ready_to_merge_source.insert(src.first);
1989 }
1990 }
1991}
1992
1993void OSDService::clear_ready_to_merge(PG *pg)
1994{
1995 std::lock_guard l(merge_lock);
1996 dout(10) << __func__ << " " << pg->pg_id << dendl;
1997 ready_to_merge_source.erase(pg->pg_id.pgid);
1998 ready_to_merge_target.erase(pg->pg_id.pgid);
1999 not_ready_to_merge_source.erase(pg->pg_id.pgid);
2000 not_ready_to_merge_target.erase(pg->pg_id.pgid);
2001 sent_ready_to_merge_source.erase(pg->pg_id.pgid);
2002}
2003
// Drop the dedupe set so the next _send_ready_to_merge() re-sends all
// current ready/not-ready merge state to the monitor.
void OSDService::clear_sent_ready_to_merge()
{
  std::lock_guard l(merge_lock);
  sent_ready_to_merge_source.clear();
}
2009
2010void OSDService::prune_sent_ready_to_merge(OSDMapRef& osdmap)
2011{
2012 std::lock_guard l(merge_lock);
2013 auto i = sent_ready_to_merge_source.begin();
2014 while (i != sent_ready_to_merge_source.end()) {
2015 if (!osdmap->pg_exists(*i)) {
2016 dout(10) << __func__ << " " << *i << dendl;
2017 i = sent_ready_to_merge_source.erase(i);
2018 } else {
2019 ++i;
2020 }
2021 }
7c673cae
FG
2022}
2023
11fdf7f2
TL
2024// ---
2025
2026void OSDService::_queue_for_recovery(
2027 std::pair<epoch_t, PGRef> p,
2028 uint64_t reserved_pushes)
2029{
2030 ceph_assert(recovery_lock.is_locked_by_me());
2031 enqueue_back(
2032 OpQueueItem(
2033 unique_ptr<OpQueueItem::OpQueueable>(
2034 new PGRecovery(
2035 p.second->get_pgid(), p.first, reserved_pushes)),
2036 cct->_conf->osd_recovery_cost,
2037 cct->_conf->osd_recovery_priority,
2038 ceph_clock_now(),
2039 0,
2040 p.first));
2041}
7c673cae
FG
2042
2043// ====================================================================
2044// OSD
2045
2046#undef dout_prefix
2047#define dout_prefix *_dout
2048
2049// Commands shared between OSD's console and admin console:
2050namespace ceph {
2051namespace osd_cmds {
2052
11fdf7f2 2053int heap(CephContext& cct, const cmdmap_t& cmdmap, Formatter& f, std::ostream& os);
7c673cae
FG
2054
2055}} // namespace ceph::osd_cmds
2056
11fdf7f2 2057int OSD::mkfs(CephContext *cct, ObjectStore *store, uuid_d fsid, int whoami)
7c673cae
FG
2058{
2059 int ret;
2060
7c673cae
FG
2061 OSDSuperblock sb;
2062 bufferlist sbbl;
11fdf7f2 2063 ObjectStore::CollectionHandle ch;
7c673cae
FG
2064
2065 // if we are fed a uuid for this osd, use it.
2066 store->set_fsid(cct->_conf->osd_uuid);
2067
2068 ret = store->mkfs();
2069 if (ret) {
224ce89b
WB
2070 derr << "OSD::mkfs: ObjectStore::mkfs failed with error "
2071 << cpp_strerror(ret) << dendl;
7c673cae
FG
2072 goto free_store;
2073 }
2074
31f18b77 2075 store->set_cache_shards(1); // doesn't matter for mkfs!
7c673cae
FG
2076
2077 ret = store->mount();
2078 if (ret) {
224ce89b
WB
2079 derr << "OSD::mkfs: couldn't mount ObjectStore: error "
2080 << cpp_strerror(ret) << dendl;
7c673cae
FG
2081 goto free_store;
2082 }
2083
11fdf7f2
TL
2084 ch = store->open_collection(coll_t::meta());
2085 if (ch) {
2086 ret = store->read(ch, OSD_SUPERBLOCK_GOBJECT, 0, 0, sbbl);
2087 if (ret < 0) {
2088 derr << "OSD::mkfs: have meta collection but no superblock" << dendl;
2089 goto free_store;
2090 }
7c673cae
FG
2091 /* if we already have superblock, check content of superblock */
2092 dout(0) << " have superblock" << dendl;
11fdf7f2
TL
2093 auto p = sbbl.cbegin();
2094 decode(sb, p);
7c673cae
FG
2095 if (whoami != sb.whoami) {
2096 derr << "provided osd id " << whoami << " != superblock's " << sb.whoami
2097 << dendl;
2098 ret = -EINVAL;
2099 goto umount_store;
2100 }
2101 if (fsid != sb.cluster_fsid) {
2102 derr << "provided cluster fsid " << fsid
2103 << " != superblock's " << sb.cluster_fsid << dendl;
2104 ret = -EINVAL;
2105 goto umount_store;
2106 }
2107 } else {
2108 // create superblock
2109 sb.cluster_fsid = fsid;
2110 sb.osd_fsid = store->get_fsid();
2111 sb.whoami = whoami;
2112 sb.compat_features = get_osd_initial_compat_set();
2113
2114 bufferlist bl;
11fdf7f2 2115 encode(sb, bl);
7c673cae 2116
11fdf7f2
TL
2117 ObjectStore::CollectionHandle ch = store->create_new_collection(
2118 coll_t::meta());
7c673cae
FG
2119 ObjectStore::Transaction t;
2120 t.create_collection(coll_t::meta(), 0);
2121 t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
11fdf7f2 2122 ret = store->queue_transaction(ch, std::move(t));
7c673cae
FG
2123 if (ret) {
2124 derr << "OSD::mkfs: error while writing OSD_SUPERBLOCK_GOBJECT: "
11fdf7f2 2125 << "queue_transaction returned " << cpp_strerror(ret) << dendl;
7c673cae
FG
2126 goto umount_store;
2127 }
2128 }
2129
3efd9988 2130 ret = write_meta(cct, store, sb.cluster_fsid, sb.osd_fsid, whoami);
7c673cae 2131 if (ret) {
224ce89b
WB
2132 derr << "OSD::mkfs: failed to write fsid file: error "
2133 << cpp_strerror(ret) << dendl;
7c673cae
FG
2134 goto umount_store;
2135 }
2136
2137umount_store:
11fdf7f2
TL
2138 if (ch) {
2139 ch.reset();
2140 }
7c673cae
FG
2141 store->umount();
2142free_store:
2143 delete store;
2144 return ret;
2145}
2146
3efd9988 2147int OSD::write_meta(CephContext *cct, ObjectStore *store, uuid_d& cluster_fsid, uuid_d& osd_fsid, int whoami)
7c673cae
FG
2148{
2149 char val[80];
2150 int r;
2151
2152 snprintf(val, sizeof(val), "%s", CEPH_OSD_ONDISK_MAGIC);
2153 r = store->write_meta("magic", val);
2154 if (r < 0)
2155 return r;
2156
2157 snprintf(val, sizeof(val), "%d", whoami);
2158 r = store->write_meta("whoami", val);
2159 if (r < 0)
2160 return r;
2161
2162 cluster_fsid.print(val);
2163 r = store->write_meta("ceph_fsid", val);
2164 if (r < 0)
2165 return r;
2166
11fdf7f2 2167 string key = cct->_conf.get_val<string>("key");
3efd9988
FG
2168 if (key.size()) {
2169 r = store->write_meta("osd_key", key);
2170 if (r < 0)
2171 return r;
b32b8144 2172 } else {
11fdf7f2 2173 string keyfile = cct->_conf.get_val<string>("keyfile");
b32b8144
FG
2174 if (!keyfile.empty()) {
2175 bufferlist keybl;
2176 string err;
11fdf7f2 2177 r = keybl.read_file(keyfile.c_str(), &err);
b32b8144
FG
2178 if (r < 0) {
2179 derr << __func__ << " failed to read keyfile " << keyfile << ": "
2180 << err << ": " << cpp_strerror(r) << dendl;
2181 return r;
2182 }
2183 r = store->write_meta("osd_key", keybl.to_str());
2184 if (r < 0)
2185 return r;
2186 }
3efd9988
FG
2187 }
2188
7c673cae
FG
2189 r = store->write_meta("ready", "ready");
2190 if (r < 0)
2191 return r;
2192
2193 return 0;
2194}
2195
11fdf7f2
TL
2196int OSD::peek_meta(ObjectStore *store,
2197 std::string *magic,
2198 uuid_d *cluster_fsid,
2199 uuid_d *osd_fsid,
2200 int *whoami,
2201 int *require_osd_release)
7c673cae
FG
2202{
2203 string val;
2204
2205 int r = store->read_meta("magic", &val);
2206 if (r < 0)
2207 return r;
11fdf7f2 2208 *magic = val;
7c673cae
FG
2209
2210 r = store->read_meta("whoami", &val);
2211 if (r < 0)
2212 return r;
11fdf7f2 2213 *whoami = atoi(val.c_str());
7c673cae
FG
2214
2215 r = store->read_meta("ceph_fsid", &val);
2216 if (r < 0)
2217 return r;
11fdf7f2 2218 r = cluster_fsid->parse(val.c_str());
7c673cae
FG
2219 if (!r)
2220 return -EINVAL;
2221
2222 r = store->read_meta("fsid", &val);
2223 if (r < 0) {
11fdf7f2 2224 *osd_fsid = uuid_d();
7c673cae 2225 } else {
11fdf7f2 2226 r = osd_fsid->parse(val.c_str());
7c673cae
FG
2227 if (!r)
2228 return -EINVAL;
2229 }
2230
11fdf7f2
TL
2231 r = store->read_meta("require_osd_release", &val);
2232 if (r >= 0) {
2233 *require_osd_release = atoi(val.c_str());
2234 }
2235
7c673cae
FG
2236 return 0;
2237}
2238
2239
2240#undef dout_prefix
2241#define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
2242
2243// cons/des
2244
// OSD constructor: wires up the messengers, monitor/mgr clients, op tracker,
// work queues and per-shard op queues.  Heavy initialization (mounting the
// store, loading maps, starting threads) happens later in init(), not here.
OSD::OSD(CephContext *cct_, ObjectStore *store_,
	 int id,
	 Messenger *internal_messenger,
	 Messenger *external_messenger,
	 Messenger *hb_client_front,
	 Messenger *hb_client_back,
	 Messenger *hb_front_serverm,
	 Messenger *hb_back_serverm,
	 Messenger *osdc_messenger,
	 MonClient *mc,
	 const std::string &dev, const std::string &jdev) :
  Dispatcher(cct_),
  osd_lock("OSD::osd_lock"),
  tick_timer(cct, osd_lock),
  tick_timer_lock("OSD::tick_timer_lock"),
  // Second timer deliberately runs under its own lock so ticks can fire
  // even while osd_lock is held elsewhere.
  tick_timer_without_osd_lock(cct, tick_timer_lock),
  gss_ktfile_client(cct->_conf.get_val<std::string>("gss_ktab_client_file")),
  cluster_messenger(internal_messenger),
  client_messenger(external_messenger),
  objecter_messenger(osdc_messenger),
  monc(mc),
  mgrc(cct_, client_messenger),
  logger(NULL),
  recoverystate_perf(NULL),
  store(store_),
  log_client(cct, client_messenger, &mc->monmap, LogClient::NO_FLAGS),
  clog(log_client.create_channel()),
  whoami(id),
  dev_path(dev), journal_path(jdev),
  store_is_rotational(store->is_rotational()),
  trace_endpoint("0.0.0.0", 0, "osd"),
  asok_hook(NULL),
  m_osd_pg_epoch_max_lag_factor(cct->_conf.get_val<double>(
				  "osd_pg_epoch_max_lag_factor")),
  osd_compat(get_osd_compat_set()),
  osd_op_tp(cct, "OSD::osd_op_tp", "tp_osd_tp",
	    get_num_op_threads()),
  command_tp(cct, "OSD::command_tp", "tp_osd_cmd", 1),
  session_waiting_lock("OSD::session_waiting_lock"),
  osdmap_subscribe_lock("OSD::osdmap_subscribe_lock"),
  heartbeat_lock("OSD::heartbeat_lock"),
  heartbeat_stop(false),
  heartbeat_need_update(true),
  hb_front_client_messenger(hb_client_front),
  hb_back_client_messenger(hb_client_back),
  hb_front_server_messenger(hb_front_serverm),
  hb_back_server_messenger(hb_back_serverm),
  daily_loadavg(0.0),
  heartbeat_thread(this),
  heartbeat_dispatcher(this),
  op_tracker(cct, cct->_conf->osd_enable_op_tracker,
	     cct->_conf->osd_num_op_tracker_shard),
  test_ops_hook(NULL),
  op_queue(get_io_queue()),
  op_prio_cutoff(get_io_prio_cut()),
  op_shardedwq(
    this,
    cct->_conf->osd_op_thread_timeout,
    cct->_conf->osd_op_thread_suicide_timeout,
    &osd_op_tp),
  map_lock("OSD::map_lock"),
  last_pg_create_epoch(0),
  mon_report_lock("OSD::mon_report_lock"),
  boot_finisher(cct),
  up_thru_wanted(0),
  requested_full_first(0),
  requested_full_last(0),
  command_wq(
    this,
    cct->_conf->osd_command_thread_timeout,
    cct->_conf->osd_command_thread_suicide_timeout,
    &command_tp),
  service(this)
{

  if (!gss_ktfile_client.empty()) {
    // Export the configured client keytab path for the Kerberos library.
    /*
     The default client keytab is used, if it is present and readable,
     to automatically obtain initial credentials for GSSAPI client
     applications. The principal name of the first entry in the client
     keytab is used by default when obtaining initial credentials.
     1. The KRB5_CLIENT_KTNAME environment variable.
     2. The default_client_keytab_name profile variable in [libdefaults].
     3. The hardcoded default, DEFCKTNAME.
    */
    const int32_t set_result(setenv("KRB5_CLIENT_KTNAME",
				    gss_ktfile_client.c_str(), 1));
    ceph_assert(set_result == 0);
  }

  monc->set_messenger(client_messenger);
  // Propagate the op-tracking thresholds from config.
  op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time,
					 cct->_conf->osd_op_log_threshold);
  op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
					   cct->_conf->osd_op_history_duration);
  op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size,
						    cct->_conf->osd_op_history_slow_op_threshold);
#ifdef WITH_BLKIN
  std::stringstream ss;
  ss << "osd." << whoami;
  trace_endpoint.copy_name(ss.str());
#endif

  // initialize shards
  num_shards = get_num_op_shards();
  for (uint32_t i = 0; i < num_shards; i++) {
    OSDShard *one_shard = new OSDShard(
      i,
      cct,
      this,
      cct->_conf->osd_op_pq_max_tokens_per_priority,
      cct->_conf->osd_op_pq_min_cost,
      op_queue);
    shards.push_back(one_shard);
  }
}
2362
2363OSD::~OSD()
2364{
11fdf7f2
TL
2365 while (!shards.empty()) {
2366 delete shards.back();
2367 shards.pop_back();
2368 }
7c673cae
FG
2369 delete class_handler;
2370 cct->get_perfcounters_collection()->remove(recoverystate_perf);
2371 cct->get_perfcounters_collection()->remove(logger);
2372 delete recoverystate_perf;
2373 delete logger;
2374 delete store;
2375}
2376
91327a77
AA
2377double OSD::get_tick_interval() const
2378{
2379 // vary +/- 5% to avoid scrub scheduling livelocks
2380 constexpr auto delta = 0.05;
91327a77 2381 return (OSD_TICK_INTERVAL *
11fdf7f2 2382 ceph::util::generate_random_number(1.0 - delta, 1.0 + delta));
91327a77
AA
2383}
2384
7c673cae
FG
2385void cls_initialize(ClassHandler *ch);
2386
2387void OSD::handle_signal(int signum)
2388{
11fdf7f2 2389 ceph_assert(signum == SIGINT || signum == SIGTERM);
7c673cae
FG
2390 derr << "*** Got signal " << sig_str(signum) << " ***" << dendl;
2391 shutdown();
2392}
2393
// Pre-initialization, run before init(): verify no other process has the
// object store mounted and register this OSD as a config observer.
// Returns 0 on success, -EBUSY if the store is already in use.
int OSD::pre_init()
{
  std::lock_guard lock(osd_lock);
  if (is_stopping())
    return 0;

  if (store->test_mount_in_use()) {
    derr << "OSD::pre_init: object store '" << dev_path << "' is "
         << "currently in use. (Is ceph-osd already running?)" << dendl;
    return -EBUSY;
  }

  // From here on we get notified about runtime config changes.
  cct->_conf.add_observer(this);
  return 0;
}
2409
// Best-effort NUMA pinning: if the object store and both network interfaces
// sit on the same NUMA node (and osd_numa_auto_affinity is enabled), or if
// osd_numa_node is configured explicitly, pin all OSD threads to that node's
// CPUs.  Always returns 0 — every failure merely logs and leaves affinity
// unset.
int OSD::set_numa_affinity()
{
  // storage numa node
  int store_node = -1;
  store->get_numa_node(&store_node, nullptr, nullptr);
  if (store_node >= 0) {
    dout(1) << __func__ << " storage numa node " << store_node << dendl;
  }

  // check network numa node(s)
  int front_node = -1, back_node = -1;
  string front_iface = pick_iface(
    cct,
    client_messenger->get_myaddrs().front().get_sockaddr_storage());
  string back_iface = pick_iface(
    cct,
    cluster_messenger->get_myaddrs().front().get_sockaddr_storage());
  int r = get_iface_numa_node(front_iface, &front_node);
  if (r >= 0 && front_node >= 0) {
    dout(1) << __func__ << " public network " << front_iface << " numa node "
            << front_node << dendl;
    r = get_iface_numa_node(back_iface, &back_node);
    if (r >= 0 && back_node >= 0) {
      dout(1) << __func__ << " cluster network " << back_iface << " numa node "
	      << back_node << dendl;
      if (front_node == back_node &&
	  front_node == store_node) {
	// Everything agrees: opt in automatically if allowed by config.
	dout(1) << " objectstore and network numa nodes all match" << dendl;
	if (g_conf().get_val<bool>("osd_numa_auto_affinity")) {
	  numa_node = front_node;
	}
      } else if (front_node != back_node) {
	dout(1) << __func__ << " public and cluster network numa nodes do not match"
		<< dendl;
      } else {
	dout(1) << __func__ << " objectstore and network numa nodes do not match"
		<< dendl;
      }
    } else if (back_node == -2) {
      // -2 appears to mean the interface's ports span multiple NUMA nodes
      // (per the log text) — confirm against get_iface_numa_node().
      dout(1) << __func__ << " cluster network " << back_iface
	      << " ports numa nodes do not match" << dendl;
    } else {
      derr << __func__ << " unable to identify cluster interface '" << back_iface
	   << "' numa node: " << cpp_strerror(r) << dendl;
    }
  } else if (front_node == -2) {
    dout(1) << __func__ << " public network " << front_iface
	    << " ports numa nodes do not match" << dendl;
  } else {
    derr << __func__ << " unable to identify public interface '" << front_iface
	 << "' numa node: " << cpp_strerror(r) << dendl;
  }
  if (int node = g_conf().get_val<int64_t>("osd_numa_node"); node >= 0) {
    // this takes precedence over the automagic logic above
    numa_node = node;
  }
  if (numa_node >= 0) {
    int r = get_numa_node_cpu_set(numa_node, &numa_cpu_set_size, &numa_cpu_set);
    if (r < 0) {
      dout(1) << __func__ << " unable to determine numa node " << numa_node
	      << " CPUs" << dendl;
      numa_node = -1;
    } else {
      dout(1) << __func__ << " setting numa affinity to node " << numa_node
	      << " cpus "
	      << cpu_set_to_str_list(numa_cpu_set_size, &numa_cpu_set)
	      << dendl;
      r = set_cpu_affinity_all_threads(numa_cpu_set_size, &numa_cpu_set);
      if (r < 0) {
	// NOTE(review): overwrites r with -errno; assumes the helper set
	// errno rather than returning the error code — confirm.
	r = -errno;
	derr << __func__ << " failed to set numa affinity: " << cpp_strerror(r)
	     << dendl;
	numa_node = -1;
      }
    }
  } else {
    dout(1) << __func__ << " not setting numa affinity" << dendl;
  }
  return 0;
}
2490
2491// asok
2492
// Admin-socket hook: forwards asok commands to OSD::asok_command.
class OSDSocketHook : public AdminSocketHook {
  OSD *osd;  // back-pointer to the owning OSD; not owned
public:
  explicit OSDSocketHook(OSD *o) : osd(o) {}
  // Dispatch one admin-socket command.  A bad_cmd_get (malformed or missing
  // argument) is converted into an error string in the reply rather than
  // being allowed to escape into the admin socket thread.
  bool call(std::string_view admin_command, const cmdmap_t& cmdmap,
	    std::string_view format, bufferlist& out) override {
    stringstream ss;
    bool r = true;
    try {
      r = osd->asok_command(admin_command, cmdmap, format, ss);
    } catch (const bad_cmd_get& e) {
      ss << e.what();
      r = true;
    }
    out.append(ss);
    return r;
  }
};
2511
11fdf7f2
TL
2512std::set<int64_t> OSD::get_mapped_pools()
2513{
2514 std::set<int64_t> pools;
2515 std::vector<spg_t> pgids;
2516 _get_pgids(&pgids);
2517 for (const auto &pgid : pgids) {
2518 pools.insert(pgid.pool());
2519 }
2520 return pools;
2521}
2522
2523bool OSD::asok_command(std::string_view admin_command, const cmdmap_t& cmdmap,
2524 std::string_view format, ostream& ss)
7c673cae
FG
2525{
2526 Formatter *f = Formatter::create(format, "json-pretty", "json-pretty");
2527 if (admin_command == "status") {
2528 f->open_object_section("status");
2529 f->dump_stream("cluster_fsid") << superblock.cluster_fsid;
2530 f->dump_stream("osd_fsid") << superblock.osd_fsid;
2531 f->dump_unsigned("whoami", superblock.whoami);
2532 f->dump_string("state", get_state_name(get_state()));
2533 f->dump_unsigned("oldest_map", superblock.oldest_map);
2534 f->dump_unsigned("newest_map", superblock.newest_map);
11fdf7f2 2535 f->dump_unsigned("num_pgs", num_pgs);
7c673cae
FG
2536 f->close_section();
2537 } else if (admin_command == "flush_journal") {
2538 store->flush_journal();
2539 } else if (admin_command == "dump_ops_in_flight" ||
c07f9fc5
FG
2540 admin_command == "ops" ||
2541 admin_command == "dump_blocked_ops" ||
2542 admin_command == "dump_historic_ops" ||
2543 admin_command == "dump_historic_ops_by_duration" ||
2544 admin_command == "dump_historic_slow_ops") {
2545
2546 const string error_str = "op_tracker tracking is not enabled now, so no ops are tracked currently, \
2547even those get stuck. Please enable \"osd_enable_op_tracker\", and the tracker \
2548will start to track new ops received afterwards.";
2549
2550 set<string> filters;
2551 vector<string> filter_str;
2552 if (cmd_getval(cct, cmdmap, "filterstr", filter_str)) {
2553 copy(filter_str.begin(), filter_str.end(),
2554 inserter(filters, filters.end()));
2555 }
2556
2557 if (admin_command == "dump_ops_in_flight" ||
2558 admin_command == "ops") {
2559 if (!op_tracker.dump_ops_in_flight(f, false, filters)) {
2560 ss << error_str;
2561 }
2562 }
2563 if (admin_command == "dump_blocked_ops") {
2564 if (!op_tracker.dump_ops_in_flight(f, true, filters)) {
2565 ss << error_str;
2566 }
2567 }
2568 if (admin_command == "dump_historic_ops") {
2569 if (!op_tracker.dump_historic_ops(f, false, filters)) {
2570 ss << error_str;
2571 }
2572 }
2573 if (admin_command == "dump_historic_ops_by_duration") {
2574 if (!op_tracker.dump_historic_ops(f, true, filters)) {
2575 ss << error_str;
2576 }
2577 }
2578 if (admin_command == "dump_historic_slow_ops") {
2579 if (!op_tracker.dump_historic_slow_ops(f, filters)) {
2580 ss << error_str;
2581 }
7c673cae
FG
2582 }
2583 } else if (admin_command == "dump_op_pq_state") {
2584 f->open_object_section("pq");
2585 op_shardedwq.dump(f);
2586 f->close_section();
2587 } else if (admin_command == "dump_blacklist") {
2588 list<pair<entity_addr_t,utime_t> > bl;
2589 OSDMapRef curmap = service.get_osdmap();
2590
2591 f->open_array_section("blacklist");
2592 curmap->get_blacklist(&bl);
2593 for (list<pair<entity_addr_t,utime_t> >::iterator it = bl.begin();
2594 it != bl.end(); ++it) {
224ce89b 2595 f->open_object_section("entry");
7c673cae
FG
2596 f->open_object_section("entity_addr_t");
2597 it->first.dump(f);
2598 f->close_section(); //entity_addr_t
2599 it->second.localtime(f->dump_stream("expire_time"));
2600 f->close_section(); //entry
2601 }
2602 f->close_section(); //blacklist
2603 } else if (admin_command == "dump_watchers") {
2604 list<obj_watch_item_t> watchers;
2605 // scan pg's
11fdf7f2
TL
2606 vector<PGRef> pgs;
2607 _get_pgs(&pgs);
2608 for (auto& pg : pgs) {
2609 list<obj_watch_item_t> pg_watchers;
2610 pg->get_watchers(&pg_watchers);
2611 watchers.splice(watchers.end(), pg_watchers);
7c673cae
FG
2612 }
2613
2614 f->open_array_section("watchers");
2615 for (list<obj_watch_item_t>::iterator it = watchers.begin();
2616 it != watchers.end(); ++it) {
2617
224ce89b 2618 f->open_object_section("watch");
7c673cae
FG
2619
2620 f->dump_string("namespace", it->obj.nspace);
2621 f->dump_string("object", it->obj.oid.name);
2622
2623 f->open_object_section("entity_name");
2624 it->wi.name.dump(f);
2625 f->close_section(); //entity_name_t
2626
224ce89b
WB
2627 f->dump_unsigned("cookie", it->wi.cookie);
2628 f->dump_unsigned("timeout", it->wi.timeout_seconds);
7c673cae
FG
2629
2630 f->open_object_section("entity_addr_t");
2631 it->wi.addr.dump(f);
2632 f->close_section(); //entity_addr_t
2633
2634 f->close_section(); //watch
2635 }
2636
2637 f->close_section(); //watchers
eafe8130 2638 } else if (admin_command == "dump_recovery_reservations") {
7c673cae
FG
2639 f->open_object_section("reservations");
2640 f->open_object_section("local_reservations");
2641 service.local_reserver.dump(f);
2642 f->close_section();
2643 f->open_object_section("remote_reservations");
2644 service.remote_reserver.dump(f);
2645 f->close_section();
2646 f->close_section();
eafe8130
TL
2647 } else if (admin_command == "dump_scrub_reservations") {
2648 f->open_object_section("scrub_reservations");
2649 service.dump_scrub_reservations(f);
2650 f->close_section();
7c673cae
FG
2651 } else if (admin_command == "get_latest_osdmap") {
2652 get_latest_osdmap();
2653 } else if (admin_command == "heap") {
2654 auto result = ceph::osd_cmds::heap(*cct, cmdmap, *f, ss);
2655
2656 // Note: Failed heap profile commands won't necessarily trigger an error:
2657 f->open_object_section("result");
2658 f->dump_string("error", cpp_strerror(result));
2659 f->dump_bool("success", result >= 0);
2660 f->close_section();
2661 } else if (admin_command == "set_heap_property") {
2662 string property;
2663 int64_t value = 0;
2664 string error;
2665 bool success = false;
2666 if (!cmd_getval(cct, cmdmap, "property", property)) {
2667 error = "unable to get property";
2668 success = false;
2669 } else if (!cmd_getval(cct, cmdmap, "value", value)) {
2670 error = "unable to get value";
2671 success = false;
2672 } else if (value < 0) {
2673 error = "negative value not allowed";
2674 success = false;
2675 } else if (!ceph_heap_set_numeric_property(property.c_str(), (size_t)value)) {
2676 error = "invalid property";
2677 success = false;
2678 } else {
2679 success = true;
2680 }
2681 f->open_object_section("result");
2682 f->dump_string("error", error);
2683 f->dump_bool("success", success);
2684 f->close_section();
2685 } else if (admin_command == "get_heap_property") {
2686 string property;
2687 size_t value = 0;
2688 string error;
2689 bool success = false;
2690 if (!cmd_getval(cct, cmdmap, "property", property)) {
2691 error = "unable to get property";
2692 success = false;
2693 } else if (!ceph_heap_get_numeric_property(property.c_str(), &value)) {
2694 error = "invalid property";
2695 success = false;
2696 } else {
2697 success = true;
2698 }
2699 f->open_object_section("result");
2700 f->dump_string("error", error);
2701 f->dump_bool("success", success);
2702 f->dump_int("value", value);
2703 f->close_section();
2704 } else if (admin_command == "dump_objectstore_kv_stats") {
2705 store->get_db_statistics(f);
2706 } else if (admin_command == "dump_scrubs") {
2707 service.dumps_scrub(f);
2708 } else if (admin_command == "calc_objectstore_db_histogram") {
2709 store->generate_db_histogram(f);
2710 } else if (admin_command == "flush_store_cache") {
11fdf7f2 2711 store->flush_cache(&ss);
7c673cae
FG
2712 } else if (admin_command == "dump_pgstate_history") {
2713 f->open_object_section("pgstate_history");
11fdf7f2
TL
2714 vector<PGRef> pgs;
2715 _get_pgs(&pgs);
2716 for (auto& pg : pgs) {
2717 f->dump_stream("pg") << pg->pg_id;
2718 pg->dump_pgstate_history(f);
7c673cae
FG
2719 }
2720 f->close_section();
224ce89b
WB
2721 } else if (admin_command == "compact") {
2722 dout(1) << "triggering manual compaction" << dendl;
2723 auto start = ceph::coarse_mono_clock::now();
2724 store->compact();
2725 auto end = ceph::coarse_mono_clock::now();
11fdf7f2 2726 double duration = std::chrono::duration<double>(end-start).count();
224ce89b 2727 dout(1) << "finished manual compaction in "
11fdf7f2 2728 << duration
224ce89b
WB
2729 << " seconds" << dendl;
2730 f->open_object_section("compact_result");
11fdf7f2
TL
2731 f->dump_float("elapsed_time", duration);
2732 f->close_section();
2733 } else if (admin_command == "get_mapped_pools") {
2734 f->open_array_section("mapped_pools");
2735 set<int64_t> poollist = get_mapped_pools();
2736 for (auto pool : poollist) {
2737 f->dump_int("pool_id", pool);
2738 }
2739 f->close_section();
2740 } else if (admin_command == "smart") {
2741 string devid;
2742 cmd_getval(cct, cmdmap, "devid", devid);
2743 probe_smart(devid, ss);
2744 } else if (admin_command == "list_devices") {
2745 set<string> devnames;
2746 store->get_devices(&devnames);
2747 f->open_object_section("list_devices");
2748 for (auto dev : devnames) {
2749 if (dev.find("dm-") == 0) {
2750 continue;
2751 }
2752 f->dump_string("device", "/dev/" + dev);
2753 }
224ce89b 2754 f->close_section();
11fdf7f2
TL
2755 } else if (admin_command == "send_beacon") {
2756 if (is_active()) {
2757 send_beacon(ceph::coarse_mono_clock::now());
2758 }
eafe8130
TL
2759 } else if (admin_command == "dump_osd_network") {
2760 int64_t value = 0;
2761 if (!(cmd_getval(cct, cmdmap, "value", value))) {
2762 // Convert milliseconds to microseconds
2763 value = static_cast<int64_t>(g_conf().get_val<double>("mon_warn_on_slow_ping_time")) * 1000;
2764 if (value == 0) {
2765 double ratio = g_conf().get_val<double>("mon_warn_on_slow_ping_ratio");
2766 value = g_conf().get_val<int64_t>("osd_heartbeat_grace");
2767 value *= 1000000 * ratio; // Seconds of grace to microseconds at ratio
2768 }
2769 } else {
2770 // Convert user input to microseconds
2771 value *= 1000;
2772 }
2773 if (value < 0) value = 0;
2774
2775 struct osd_ping_time_t {
2776 uint32_t pingtime;
2777 int to;
2778 bool back;
2779 std::array<uint32_t,3> times;
2780 std::array<uint32_t,3> min;
2781 std::array<uint32_t,3> max;
2782 uint32_t last;
2783 uint32_t last_update;
2784
2785 bool operator<(const osd_ping_time_t& rhs) const {
2786 if (pingtime < rhs.pingtime)
2787 return true;
2788 if (pingtime > rhs.pingtime)
2789 return false;
2790 if (to < rhs.to)
2791 return true;
2792 if (to > rhs.to)
2793 return false;
2794 return back;
2795 }
2796 };
2797
2798 set<osd_ping_time_t> sorted;
2799 // Get pingtimes under lock and not on the stack
2800 map<int, osd_stat_t::Interfaces> *pingtimes = new map<int, osd_stat_t::Interfaces>;
2801 service.get_hb_pingtime(pingtimes);
2802 for (auto j : *pingtimes) {
2803 if (j.second.last_update == 0)
2804 continue;
2805 osd_ping_time_t item;
2806 item.pingtime = std::max(j.second.back_pingtime[0], j.second.back_pingtime[1]);
2807 item.pingtime = std::max(item.pingtime, j.second.back_pingtime[2]);
2808 if (item.pingtime >= value) {
2809 item.to = j.first;
2810 item.times[0] = j.second.back_pingtime[0];
2811 item.times[1] = j.second.back_pingtime[1];
2812 item.times[2] = j.second.back_pingtime[2];
2813 item.min[0] = j.second.back_min[0];
2814 item.min[1] = j.second.back_min[1];
2815 item.min[2] = j.second.back_min[2];
2816 item.max[0] = j.second.back_max[0];
2817 item.max[1] = j.second.back_max[1];
2818 item.max[2] = j.second.back_max[2];
2819 item.last = j.second.back_last;
2820 item.back = true;
2821 item.last_update = j.second.last_update;
2822 sorted.emplace(item);
2823 }
2824 if (j.second.front_last == 0)
2825 continue;
2826 item.pingtime = std::max(j.second.front_pingtime[0], j.second.front_pingtime[1]);
2827 item.pingtime = std::max(item.pingtime, j.second.front_pingtime[2]);
2828 if (item.pingtime >= value) {
2829 item.to = j.first;
2830 item.times[0] = j.second.front_pingtime[0];
2831 item.times[1] = j.second.front_pingtime[1];
2832 item.times[2] = j.second.front_pingtime[2];
2833 item.min[0] = j.second.front_min[0];
2834 item.min[1] = j.second.front_min[1];
2835 item.min[2] = j.second.front_min[2];
2836 item.max[0] = j.second.front_max[0];
2837 item.max[1] = j.second.front_max[1];
2838 item.max[2] = j.second.front_max[2];
2839 item.last = j.second.front_last;
2840 item.last_update = j.second.last_update;
2841 item.back = false;
2842 sorted.emplace(item);
2843 }
2844 }
2845 delete pingtimes;
2846 //
2847 // Network ping times (1min 5min 15min)
2848 f->open_object_section("network_ping_times");
2849 f->dump_int("threshold", value / 1000);
2850 f->open_array_section("entries");
2851 for (auto &sitem : boost::adaptors::reverse(sorted)) {
2852 ceph_assert(sitem.pingtime >= value);
2853 f->open_object_section("entry");
2854
2855 const time_t lu(sitem.last_update);
2856 char buffer[26];
2857 string lustr(ctime_r(&lu, buffer));
2858 lustr.pop_back(); // Remove trailing \n
2859 auto stale = cct->_conf.get_val<int64_t>("osd_heartbeat_stale");
2860 f->dump_string("last update", lustr);
2861 f->dump_bool("stale", ceph_clock_now().sec() - sitem.last_update > stale);
2862 f->dump_int("from osd", whoami);
2863 f->dump_int("to osd", sitem.to);
2864 f->dump_string("interface", (sitem.back ? "back" : "front"));
2865 f->open_object_section("average");
2866 f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.times[0],3).c_str());
2867 f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.times[1],3).c_str());
2868 f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.times[2],3).c_str());
2869 f->close_section(); // average
2870 f->open_object_section("min");
2871 f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.max[0],3).c_str());
2872 f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.max[1],3).c_str());
2873 f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.max[2],3).c_str());
2874 f->close_section(); // min
2875 f->open_object_section("max");
2876 f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.max[0],3).c_str());
2877 f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.max[1],3).c_str());
2878 f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.max[2],3).c_str());
2879 f->close_section(); // max
2880 f->dump_format_unquoted("last", "%s", fixed_u_to_string(sitem.last,3).c_str());
2881 f->close_section(); // entry
2882 }
2883 f->close_section(); // entries
2884 f->close_section(); // network_ping_times
7c673cae 2885 } else {
11fdf7f2 2886 ceph_abort_msg("broken asok registration");
7c673cae
FG
2887 }
2888 f->flush(ss);
2889 delete f;
2890 return true;
2891}
2892
2893class TestOpsSocketHook : public AdminSocketHook {
2894 OSDService *service;
2895 ObjectStore *store;
2896public:
2897 TestOpsSocketHook(OSDService *s, ObjectStore *st) : service(s), store(st) {}
11fdf7f2
TL
2898 bool call(std::string_view command, const cmdmap_t& cmdmap,
2899 std::string_view format, bufferlist& out) override {
7c673cae 2900 stringstream ss;
11fdf7f2
TL
2901 try {
2902 test_ops(service, store, command, cmdmap, ss);
2903 } catch (const bad_cmd_get& e) {
2904 ss << e.what();
2905 }
7c673cae
FG
2906 out.append(ss);
2907 return true;
2908 }
2909 void test_ops(OSDService *service, ObjectStore *store,
11fdf7f2 2910 std::string_view command, const cmdmap_t& cmdmap, ostream &ss);
7c673cae
FG
2911
2912};
2913
2914class OSD::C_Tick : public Context {
2915 OSD *osd;
2916 public:
2917 explicit C_Tick(OSD *o) : osd(o) {}
2918 void finish(int r) override {
2919 osd->tick();
2920 }
2921};
2922
2923class OSD::C_Tick_WithoutOSDLock : public Context {
2924 OSD *osd;
2925 public:
2926 explicit C_Tick_WithoutOSDLock(OSD *o) : osd(o) {}
2927 void finish(int r) override {
2928 osd->tick_without_osd_lock();
2929 }
2930};
2931
2932int OSD::enable_disable_fuse(bool stop)
2933{
2934#ifdef HAVE_LIBFUSE
2935 int r;
2936 string mntpath = cct->_conf->osd_data + "/fuse";
2937 if (fuse_store && (stop || !cct->_conf->osd_objectstore_fuse)) {
2938 dout(1) << __func__ << " disabling" << dendl;
2939 fuse_store->stop();
2940 delete fuse_store;
2941 fuse_store = NULL;
2942 r = ::rmdir(mntpath.c_str());
7c673cae 2943 if (r < 0) {
c07f9fc5
FG
2944 r = -errno;
2945 derr << __func__ << " failed to rmdir " << mntpath << ": "
2946 << cpp_strerror(r) << dendl;
7c673cae
FG
2947 return r;
2948 }
2949 return 0;
2950 }
2951 if (!fuse_store && cct->_conf->osd_objectstore_fuse) {
2952 dout(1) << __func__ << " enabling" << dendl;
2953 r = ::mkdir(mntpath.c_str(), 0700);
2954 if (r < 0)
2955 r = -errno;
2956 if (r < 0 && r != -EEXIST) {
2957 derr << __func__ << " unable to create " << mntpath << ": "
2958 << cpp_strerror(r) << dendl;
2959 return r;
2960 }
2961 fuse_store = new FuseStore(store, mntpath);
2962 r = fuse_store->start();
2963 if (r < 0) {
2964 derr << __func__ << " unable to start fuse: " << cpp_strerror(r) << dendl;
2965 delete fuse_store;
2966 fuse_store = NULL;
2967 return r;
2968 }
2969 }
2970#endif // HAVE_LIBFUSE
2971 return 0;
2972}
2973
31f18b77
FG
2974int OSD::get_num_op_shards()
2975{
2976 if (cct->_conf->osd_op_num_shards)
2977 return cct->_conf->osd_op_num_shards;
2978 if (store_is_rotational)
2979 return cct->_conf->osd_op_num_shards_hdd;
2980 else
2981 return cct->_conf->osd_op_num_shards_ssd;
2982}
2983
2984int OSD::get_num_op_threads()
2985{
2986 if (cct->_conf->osd_op_num_threads_per_shard)
2987 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard;
2988 if (store_is_rotational)
2989 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard_hdd;
2990 else
2991 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard_ssd;
2992}
2993
c07f9fc5
FG
2994float OSD::get_osd_recovery_sleep()
2995{
2996 if (cct->_conf->osd_recovery_sleep)
2997 return cct->_conf->osd_recovery_sleep;
d2e6a577 2998 if (!store_is_rotational && !journal_is_rotational)
c07f9fc5 2999 return cct->_conf->osd_recovery_sleep_ssd;
d2e6a577 3000 else if (store_is_rotational && !journal_is_rotational)
11fdf7f2 3001 return cct->_conf.get_val<double>("osd_recovery_sleep_hybrid");
d2e6a577
FG
3002 else
3003 return cct->_conf->osd_recovery_sleep_hdd;
c07f9fc5
FG
3004}
3005
11fdf7f2
TL
3006float OSD::get_osd_delete_sleep()
3007{
3008 float osd_delete_sleep = cct->_conf.get_val<double>("osd_delete_sleep");
3009 if (osd_delete_sleep > 0)
3010 return osd_delete_sleep;
3011 if (!store_is_rotational && !journal_is_rotational)
3012 return cct->_conf.get_val<double>("osd_delete_sleep_ssd");
3013 if (store_is_rotational && !journal_is_rotational)
3014 return cct->_conf.get_val<double>("osd_delete_sleep_hybrid");
3015 return cct->_conf.get_val<double>("osd_delete_sleep_hdd");
3016}
3017
494da23a
TL
3018float OSD::get_osd_snap_trim_sleep()
3019{
3020 float osd_snap_trim_sleep = cct->_conf.get_val<double>("osd_snap_trim_sleep");
3021 if (osd_snap_trim_sleep > 0)
3022 return osd_snap_trim_sleep;
3023 if (!store_is_rotational && !journal_is_rotational)
3024 return cct->_conf.get_val<double>("osd_snap_trim_sleep_ssd");
3025 if (store_is_rotational && !journal_is_rotational)
3026 return cct->_conf.get_val<double>("osd_snap_trim_sleep_hybrid");
3027 return cct->_conf.get_val<double>("osd_snap_trim_sleep_hdd");
3028}
3029
7c673cae
FG
// Bring the OSD fully up: mount the objectstore, validate and load the
// superblock and current osdmap, load PGs, wire up messengers / monc /
// mgrc, authenticate with the monitors, and start the boot process.
//
// Takes osd_lock for the bulk of the sequence, but explicitly drops it
// around the (potentially long-blocking) monitor authentication and
// CRUSH-update calls below.
//
// Returns 0 on success (or if the OSD is already stopping); on failure
// the error path at "out:" unmounts and deletes the store and returns a
// negative error code.  Some failures (auth, CRUSH updates) exit(1)
// instead of returning.
int OSD::init()
{
  // Declared up front so the 'goto out' error paths below cannot jump
  // over a non-trivial initialization.
  CompatSet initial, diff;
  std::lock_guard lock(osd_lock);
  if (is_stopping())
    return 0;

  tick_timer.init();
  tick_timer_without_osd_lock.init();
  service.recovery_request_timer.init();
  service.sleep_timer.init();

  boot_finisher.start();

  {
    // Remember the last committed require_osd_release; atoi() yields 0
    // if the meta key is absent or unparseable.
    string val;
    store->read_meta("require_osd_release", &val);
    last_require_osd_release = atoi(val.c_str());
  }

  // mount.
  dout(2) << "init " << dev_path
          << " (looks like " << (store_is_rotational ? "hdd" : "ssd") << ")"
          << dendl;
  dout(2) << "journal " << journal_path << dendl;
  ceph_assert(store);  // call pre_init() first!

  store->set_cache_shards(get_num_op_shards());

  int r = store->mount();
  if (r < 0) {
    derr << "OSD:init: unable to mount object store" << dendl;
    return r;
  }
  // Journal media type is only known after mount; it feeds the
  // hybrid/ssd/hdd sleep defaults above.
  journal_is_rotational = store->is_journal_rotational();
  dout(2) << "journal looks like " << (journal_is_rotational ? "hdd" : "ssd")
          << dendl;

  // Mount the objectstore FUSE mirror if osd_objectstore_fuse is set.
  enable_disable_fuse(false);

  dout(2) << "boot" << dendl;

  service.meta_ch = store->open_collection(coll_t::meta());

  // initialize the daily loadavg with current 15min loadavg
  double loadavgs[3];
  if (getloadavg(loadavgs, 3) == 3) {
    daily_loadavg = loadavgs[2];
  } else {
    derr << "OSD::init() : couldn't read loadavgs\n" << dendl;
    daily_loadavg = 1.0;
  }

  // Monitor auth retry budget/timeout, used below after osd_lock is
  // dropped.
  int rotating_auth_attempts = 0;
  auto rotating_auth_timeout =
    g_conf().get_val<int64_t>("rotating_keys_bootstrap_timeout");

  // sanity check long object name handling
  {
    hobject_t l;
    l.oid.name = string(cct->_conf->osd_max_object_name_len, 'n');
    l.set_key(string(cct->_conf->osd_max_object_name_len, 'k'));
    l.nspace = string(cct->_conf->osd_max_object_namespace_len, 's');
    r = store->validate_hobject_key(l);
    if (r < 0) {
      derr << "backend (" << store->get_type() << ") is unable to support max "
           << "object name[space] len" << dendl;
      derr << " osd max object name len = "
           << cct->_conf->osd_max_object_name_len << dendl;
      derr << " osd max object namespace len = "
           << cct->_conf->osd_max_object_namespace_len << dendl;
      derr << cpp_strerror(r) << dendl;
      if (cct->_conf->osd_check_max_object_name_len_on_startup) {
        goto out;
      }
      derr << "osd_check_max_object_name_len_on_startup = false, starting anyway"
           << dendl;
    } else {
      dout(20) << "configured osd_max_object_name[space]_len looks ok" << dendl;
    }
  }

  // read superblock
  r = read_superblock();
  if (r < 0) {
    derr << "OSD::init() : unable to read osd superblock" << dendl;
    r = -EINVAL;
    goto out;
  }

  if (osd_compat.compare(superblock.compat_features) < 0) {
    derr << "The disk uses features unsupported by the executable." << dendl;
    derr << " ondisk features " << superblock.compat_features << dendl;
    derr << " daemon features " << osd_compat << dendl;

    if (osd_compat.writeable(superblock.compat_features)) {
      CompatSet diff = osd_compat.unsupported(superblock.compat_features);
      derr << "it is still writeable, though. Missing features: " << diff << dendl;
      r = -EOPNOTSUPP;
      goto out;
    }
    else {
      CompatSet diff = osd_compat.unsupported(superblock.compat_features);
      derr << "Cannot write to disk! Missing features: " << diff << dendl;
      r = -EOPNOTSUPP;
      goto out;
    }
  }

  assert_warn(whoami == superblock.whoami);
  if (whoami != superblock.whoami) {
    derr << "OSD::init: superblock says osd"
         << superblock.whoami << " but I am osd." << whoami << dendl;
    r = -EINVAL;
    goto out;
  }

  // load up "current" osdmap
  assert_warn(!osdmap);
  if (osdmap) {
    derr << "OSD::init: unable to read current osdmap" << dendl;
    r = -EINVAL;
    goto out;
  }
  osdmap = get_map(superblock.current_epoch);

  // make sure we don't have legacy pgs deleting
  {
    vector<coll_t> ls;
    int r = store->list_collections(ls);
    ceph_assert(r >= 0);
    for (auto c : ls) {
      spg_t pgid;
      // A PG collection whose pool no longer exists must have the
      // final pool info object persisted, or we cannot finish deleting
      // it on this release.
      if (c.is_pg(&pgid) &&
          !osdmap->have_pg_pool(pgid.pool())) {
        ghobject_t oid = make_final_pool_info_oid(pgid.pool());
        if (!store->exists(service.meta_ch, oid)) {
          derr << __func__ << " missing pg_pool_t for deleted pool "
               << pgid.pool() << " for pg " << pgid
               << "; please downgrade to luminous and allow "
               << "pg deletion to complete before upgrading" << dendl;
          ceph_abort();
        }
      }
    }
  }

  initial = get_osd_initial_compat_set();
  diff = superblock.compat_features.unsupported(initial);
  if (superblock.compat_features.merge(initial)) {
    // We need to persist the new compat_set before we
    // do anything else
    dout(5) << "Upgrading superblock adding: " << diff << dendl;
    ObjectStore::Transaction t;
    write_superblock(t);
    r = store->queue_transaction(service.meta_ch, std::move(t));
    if (r < 0)
      goto out;
  }

  // make sure snap mapper object exists
  if (!store->exists(service.meta_ch, OSD::make_snapmapper_oid())) {
    dout(10) << "init creating/touching snapmapper object" << dendl;
    ObjectStore::Transaction t;
    t.touch(coll_t::meta(), OSD::make_snapmapper_oid());
    r = store->queue_transaction(service.meta_ch, std::move(t));
    if (r < 0)
      goto out;
  }

  class_handler = new ClassHandler(cct);
  cls_initialize(class_handler);

  if (cct->_conf->osd_open_classes_on_start) {
    // Best effort: class load failures are logged, not fatal.
    int r = class_handler->open_all_classes();
    if (r)
      dout(1) << "warning: got an error loading one or more classes: " << cpp_strerror(r) << dendl;
  }

  check_osdmap_features();

  create_recoverystate_perf();

  {
    epoch_t bind_epoch = osdmap->get_epoch();
    service.set_epochs(NULL, NULL, &bind_epoch);
  }

  clear_temp_objects();

  // initialize osdmap references in sharded wq
  for (auto& shard : shards) {
    std::lock_guard l(shard->osdmap_lock);
    shard->shard_osdmap = osdmap;
  }

  // load up pgs (as they previously existed)
  load_pgs();

  dout(2) << "superblock: I am osd." << superblock.whoami << dendl;
  dout(0) << "using " << op_queue << " op queue with priority op cut off at " <<
    op_prio_cutoff << "." << dendl;

  create_logger();

  // prime osd stats
  {
    struct store_statfs_t stbuf;
    osd_alert_list_t alerts;
    int r = store->statfs(&stbuf, &alerts);
    ceph_assert(r == 0);
    service.set_statfs(stbuf, alerts);
  }

  // client_messenger auth_client is already set up by monc.
  for (auto m : { cluster_messenger,
        objecter_messenger,
        hb_front_client_messenger,
        hb_back_client_messenger,
        hb_front_server_messenger,
        hb_back_server_messenger } ) {
    m->set_auth_client(monc);
  }
  for (auto m : { client_messenger,
        cluster_messenger,
        hb_front_server_messenger,
        hb_back_server_messenger }) {
    m->set_auth_server(monc);
  }
  monc->set_handle_authentication_dispatcher(this);

  monc->set_want_keys(CEPH_ENTITY_TYPE_MON | CEPH_ENTITY_TYPE_OSD
                      | CEPH_ENTITY_TYPE_MGR);
  r = monc->init();
  if (r < 0)
    goto out;

  mgrc.set_pgstats_cb([this](){ return collect_pg_stats(); });
  mgrc.set_perf_metric_query_cb(
    [this](const std::map<OSDPerfMetricQuery, OSDPerfMetricLimits> &queries) {
      set_perf_queries(queries);
    },
    [this](std::map<OSDPerfMetricQuery, OSDPerfMetricReport> *reports) {
      get_perf_reports(reports);
    });
  mgrc.init();

  // tell monc about log_client so it will know about mon session resets
  monc->set_log_client(&log_client);
  update_log_config();

  // i'm ready!
  client_messenger->add_dispatcher_tail(&mgrc);
  client_messenger->add_dispatcher_tail(this);
  cluster_messenger->add_dispatcher_head(this);

  hb_front_client_messenger->add_dispatcher_head(&heartbeat_dispatcher);
  hb_back_client_messenger->add_dispatcher_head(&heartbeat_dispatcher);
  hb_front_server_messenger->add_dispatcher_head(&heartbeat_dispatcher);
  hb_back_server_messenger->add_dispatcher_head(&heartbeat_dispatcher);

  objecter_messenger->add_dispatcher_head(service.objecter);

  service.init();
  service.publish_map(osdmap);
  service.publish_superblock(superblock);
  service.max_oldest_map = superblock.oldest_map;

  // Prime any splits/merges implied by the maps we missed while down.
  for (auto& shard : shards) {
    // put PGs in a temporary set because we may modify pg_slots
    // unordered_map below.
    set<PGRef> pgs;
    for (auto& i : shard->pg_slots) {
      PGRef pg = i.second->pg;
      if (!pg) {
        continue;
      }
      pgs.insert(pg);
    }
    for (auto pg : pgs) {
      pg->lock();
      set<pair<spg_t,epoch_t>> new_children;
      set<pair<spg_t,epoch_t>> merge_pgs;
      service.identify_splits_and_merges(pg->get_osdmap(), osdmap, pg->pg_id,
                                         &new_children, &merge_pgs);
      // prime_splits()/prime_merges() drain the sets; every entry must
      // be claimed by some shard (hence the asserts).
      if (!new_children.empty()) {
        for (auto shard : shards) {
          shard->prime_splits(osdmap, &new_children);
        }
        assert(new_children.empty());
      }
      if (!merge_pgs.empty()) {
        for (auto shard : shards) {
          shard->prime_merges(osdmap, &merge_pgs);
        }
        assert(merge_pgs.empty());
      }
      pg->unlock();
    }
  }

  osd_op_tp.start();
  command_tp.start();

  // start the heartbeat
  heartbeat_thread.create("osd_srv_heartbt");

  // tick
  tick_timer.add_event_after(get_tick_interval(),
                             new C_Tick(this));
  {
    std::lock_guard l(tick_timer_lock);
    tick_timer_without_osd_lock.add_event_after(get_tick_interval(),
                                                new C_Tick_WithoutOSDLock(this));
  }

  // Drop osd_lock: the monitor handshakes below can block for a long
  // time and must not hold up other osd_lock users.
  osd_lock.Unlock();

  r = monc->authenticate();
  if (r < 0) {
    derr << __func__ << " authentication failed: " << cpp_strerror(r)
         << dendl;
    exit(1);
  }

  while (monc->wait_auth_rotating(rotating_auth_timeout) < 0) {
    derr << "unable to obtain rotating service keys; retrying" << dendl;
    ++rotating_auth_attempts;
    if (rotating_auth_attempts > g_conf()->max_rotating_auth_attempts) {
      derr << __func__ << " wait_auth_rotating timed out" << dendl;
      exit(1);
    }
  }

  r = update_crush_device_class();
  if (r < 0) {
    derr << __func__ << " unable to update_crush_device_class: "
         << cpp_strerror(r) << dendl;
    exit(1);
  }

  r = update_crush_location();
  if (r < 0) {
    derr << __func__ << " unable to update_crush_location: "
         << cpp_strerror(r) << dendl;
    exit(1);
  }

  // Re-acquire; the lock_guard at the top will release it on return.
  osd_lock.Lock();
  if (is_stopping())
    return 0;

  // start objecter *after* we have authenticated, so that we don't ignore
  // the OSDMaps it requests.
  service.final_init();

  check_config();

  dout(10) << "ensuring pgs have consumed prior maps" << dendl;
  consume_map();

  dout(0) << "done with init, starting boot process" << dendl;

  // subscribe to any pg creations
  monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0);

  // MgrClient needs this (it doesn't have MonClient reference itself)
  monc->sub_want("mgrmap", 0, 0);

  // we don't need to ask for an osdmap here; objecter will
  //monc->sub_want("osdmap", osdmap->get_epoch(), CEPH_SUBSCRIBE_ONETIME);

  monc->renew_subs();

  start_boot();

  return 0;

out:
  // Error path: undo what we brought up (FUSE mirror, mounted store).
  enable_disable_fuse(true);
  store->umount();
  delete store;
  store = NULL;
  return r;
}
3415
3416void OSD::final_init()
3417{
3418 AdminSocket *admin_socket = cct->get_admin_socket();
3419 asok_hook = new OSDSocketHook(this);
3420 int r = admin_socket->register_command("status", "status", asok_hook,
3421 "high-level status of OSD");
11fdf7f2 3422 ceph_assert(r == 0);
7c673cae
FG
3423 r = admin_socket->register_command("flush_journal", "flush_journal",
3424 asok_hook,
3425 "flush the journal to permanent store");
11fdf7f2 3426 ceph_assert(r == 0);
7c673cae 3427 r = admin_socket->register_command("dump_ops_in_flight",
c07f9fc5
FG
3428 "dump_ops_in_flight " \
3429 "name=filterstr,type=CephString,n=N,req=false",
3430 asok_hook,
7c673cae 3431 "show the ops currently in flight");
11fdf7f2 3432 ceph_assert(r == 0);
7c673cae 3433 r = admin_socket->register_command("ops",
c07f9fc5
FG
3434 "ops " \
3435 "name=filterstr,type=CephString,n=N,req=false",
3436 asok_hook,
7c673cae 3437 "show the ops currently in flight");
11fdf7f2 3438 ceph_assert(r == 0);
7c673cae 3439 r = admin_socket->register_command("dump_blocked_ops",
c07f9fc5
FG
3440 "dump_blocked_ops " \
3441 "name=filterstr,type=CephString,n=N,req=false",
3442 asok_hook,
7c673cae 3443 "show the blocked ops currently in flight");
11fdf7f2 3444 ceph_assert(r == 0);
c07f9fc5
FG
3445 r = admin_socket->register_command("dump_historic_ops",
3446 "dump_historic_ops " \
3447 "name=filterstr,type=CephString,n=N,req=false",
7c673cae
FG
3448 asok_hook,
3449 "show recent ops");
11fdf7f2 3450 ceph_assert(r == 0);
c07f9fc5
FG
3451 r = admin_socket->register_command("dump_historic_slow_ops",
3452 "dump_historic_slow_ops " \
3453 "name=filterstr,type=CephString,n=N,req=false",
7c673cae
FG
3454 asok_hook,
3455 "show slowest recent ops");
11fdf7f2 3456 ceph_assert(r == 0);
c07f9fc5
FG
3457 r = admin_socket->register_command("dump_historic_ops_by_duration",
3458 "dump_historic_ops_by_duration " \
3459 "name=filterstr,type=CephString,n=N,req=false",
7c673cae
FG
3460 asok_hook,
3461 "show slowest recent ops, sorted by duration");
11fdf7f2 3462 ceph_assert(r == 0);
7c673cae
FG
3463 r = admin_socket->register_command("dump_op_pq_state", "dump_op_pq_state",
3464 asok_hook,
3465 "dump op priority queue state");
11fdf7f2 3466 ceph_assert(r == 0);
7c673cae
FG
3467 r = admin_socket->register_command("dump_blacklist", "dump_blacklist",
3468 asok_hook,
3469 "dump blacklisted clients and times");
11fdf7f2 3470 ceph_assert(r == 0);
7c673cae
FG
3471 r = admin_socket->register_command("dump_watchers", "dump_watchers",
3472 asok_hook,
3473 "show clients which have active watches,"
3474 " and on which objects");
11fdf7f2 3475 ceph_assert(r == 0);
eafe8130 3476 r = admin_socket->register_command("dump_recovery_reservations", "dump_recovery_reservations",
7c673cae
FG
3477 asok_hook,
3478 "show recovery reservations");
11fdf7f2 3479 ceph_assert(r == 0);
eafe8130
TL
3480 r = admin_socket->register_command("dump_scrub_reservations", "dump_scrub_reservations",
3481 asok_hook,
3482 "show scrub reservations");
3483 ceph_assert(r == 0);
7c673cae
FG
3484 r = admin_socket->register_command("get_latest_osdmap", "get_latest_osdmap",
3485 asok_hook,
3486 "force osd to update the latest map from "
3487 "the mon");
11fdf7f2 3488 ceph_assert(r == 0);
7c673cae
FG
3489
3490 r = admin_socket->register_command( "heap",
3491 "heap " \
11fdf7f2
TL
3492 "name=heapcmd,type=CephString " \
3493 "name=value,type=CephString,req=false",
7c673cae
FG
3494 asok_hook,
3495 "show heap usage info (available only if "
3496 "compiled with tcmalloc)");
11fdf7f2 3497 ceph_assert(r == 0);
7c673cae
FG
3498
3499 r = admin_socket->register_command("set_heap_property",
3500 "set_heap_property " \
3501 "name=property,type=CephString " \
3502 "name=value,type=CephInt",
3503 asok_hook,
3504 "update malloc extension heap property");
11fdf7f2 3505 ceph_assert(r == 0);
7c673cae
FG
3506
3507 r = admin_socket->register_command("get_heap_property",
3508 "get_heap_property " \
3509 "name=property,type=CephString",
3510 asok_hook,
3511 "get malloc extension heap property");
11fdf7f2 3512 ceph_assert(r == 0);
7c673cae
FG
3513
3514 r = admin_socket->register_command("dump_objectstore_kv_stats",
3515 "dump_objectstore_kv_stats",
3516 asok_hook,
3517 "print statistics of kvdb which used by bluestore");
11fdf7f2 3518 ceph_assert(r == 0);
7c673cae
FG
3519
3520 r = admin_socket->register_command("dump_scrubs",
3521 "dump_scrubs",
3522 asok_hook,
3523 "print scheduled scrubs");
11fdf7f2 3524 ceph_assert(r == 0);
7c673cae
FG
3525
3526 r = admin_socket->register_command("calc_objectstore_db_histogram",
3527 "calc_objectstore_db_histogram",
3528 asok_hook,
3529 "Generate key value histogram of kvdb(rocksdb) which used by bluestore");
11fdf7f2 3530 ceph_assert(r == 0);
7c673cae
FG
3531
3532 r = admin_socket->register_command("flush_store_cache",
3533 "flush_store_cache",
3534 asok_hook,
3535 "Flush bluestore internal cache");
11fdf7f2 3536 ceph_assert(r == 0);
7c673cae
FG
3537 r = admin_socket->register_command("dump_pgstate_history", "dump_pgstate_history",
3538 asok_hook,
3539 "show recent state history");
11fdf7f2 3540 ceph_assert(r == 0);
7c673cae 3541
224ce89b
WB
3542 r = admin_socket->register_command("compact", "compact",
3543 asok_hook,
3544 "Commpact object store's omap."
3545 " WARNING: Compaction probably slows your requests");
11fdf7f2
TL
3546 ceph_assert(r == 0);
3547
3548 r = admin_socket->register_command("get_mapped_pools", "get_mapped_pools",
3549 asok_hook,
3550 "dump pools whose PG(s) are mapped to this OSD.");
3551
3552 ceph_assert(r == 0);
3553
3554 r = admin_socket->register_command("smart", "smart name=devid,type=CephString,req=False",
3555 asok_hook,
3556 "probe OSD devices for SMART data.");
3557
3558 ceph_assert(r == 0);
3559
3560 r = admin_socket->register_command("list_devices", "list_devices",
3561 asok_hook,
3562 "list OSD devices.");
3563 r = admin_socket->register_command("send_beacon", "send_beacon",
3564 asok_hook,
3565 "send OSD beacon to mon immediately");
224ce89b 3566
eafe8130
TL
3567 r = admin_socket->register_command("dump_osd_network", "dump_osd_network name=value,type=CephInt,req=false", asok_hook,
3568 "Dump osd heartbeat network ping times");
3569 ceph_assert(r == 0);
3570
7c673cae
FG
3571 test_ops_hook = new TestOpsSocketHook(&(this->service), this->store);
3572 // Note: pools are CephString instead of CephPoolname because
3573 // these commands traditionally support both pool names and numbers
3574 r = admin_socket->register_command(
3575 "setomapval",
3576 "setomapval " \
3577 "name=pool,type=CephString " \
3578 "name=objname,type=CephObjectname " \
3579 "name=key,type=CephString "\
3580 "name=val,type=CephString",
3581 test_ops_hook,
3582 "set omap key");
11fdf7f2 3583 ceph_assert(r == 0);
7c673cae
FG
3584 r = admin_socket->register_command(
3585 "rmomapkey",
3586 "rmomapkey " \
3587 "name=pool,type=CephString " \
3588 "name=objname,type=CephObjectname " \
3589 "name=key,type=CephString",
3590 test_ops_hook,
3591 "remove omap key");
11fdf7f2 3592 ceph_assert(r == 0);
7c673cae
FG
3593 r = admin_socket->register_command(
3594 "setomapheader",
3595 "setomapheader " \
3596 "name=pool,type=CephString " \
3597 "name=objname,type=CephObjectname " \
3598 "name=header,type=CephString",
3599 test_ops_hook,
3600 "set omap header");
11fdf7f2 3601 ceph_assert(r == 0);
7c673cae
FG
3602
3603 r = admin_socket->register_command(
3604 "getomap",
3605 "getomap " \
3606 "name=pool,type=CephString " \
3607 "name=objname,type=CephObjectname",
3608 test_ops_hook,
3609 "output entire object map");
11fdf7f2 3610 ceph_assert(r == 0);
7c673cae
FG
3611
3612 r = admin_socket->register_command(
3613 "truncobj",
3614 "truncobj " \
3615 "name=pool,type=CephString " \
3616 "name=objname,type=CephObjectname " \
3617 "name=len,type=CephInt",
3618 test_ops_hook,
3619 "truncate object to length");
11fdf7f2 3620 ceph_assert(r == 0);
7c673cae
FG
3621
3622 r = admin_socket->register_command(
3623 "injectdataerr",
3624 "injectdataerr " \
3625 "name=pool,type=CephString " \
3626 "name=objname,type=CephObjectname " \
3627 "name=shardid,type=CephInt,req=false,range=0|255",
3628 test_ops_hook,
3629 "inject data error to an object");
11fdf7f2 3630 ceph_assert(r == 0);
7c673cae
FG
3631
3632 r = admin_socket->register_command(
3633 "injectmdataerr",
3634 "injectmdataerr " \
3635 "name=pool,type=CephString " \
3636 "name=objname,type=CephObjectname " \
3637 "name=shardid,type=CephInt,req=false,range=0|255",
3638 test_ops_hook,
3639 "inject metadata error to an object");
11fdf7f2 3640 ceph_assert(r == 0);
7c673cae
FG
3641 r = admin_socket->register_command(
3642 "set_recovery_delay",
3643 "set_recovery_delay " \
3644 "name=utime,type=CephInt,req=false",
3645 test_ops_hook,
3646 "Delay osd recovery by specified seconds");
11fdf7f2 3647 ceph_assert(r == 0);
7c673cae
FG
3648 r = admin_socket->register_command(
3649 "trigger_scrub",
3650 "trigger_scrub " \
a8e16298
TL
3651 "name=pgid,type=CephString " \
3652 "name=time,type=CephInt,req=false",
7c673cae
FG
3653 test_ops_hook,
3654 "Trigger a scheduled scrub ");
11fdf7f2 3655 ceph_assert(r == 0);
a8e16298
TL
3656 r = admin_socket->register_command(
3657 "trigger_deep_scrub",
3658 "trigger_deep_scrub " \
3659 "name=pgid,type=CephString " \
3660 "name=time,type=CephInt,req=false",
3661 test_ops_hook,
3662 "Trigger a scheduled deep scrub ");
3663 ceph_assert(r == 0);
7c673cae
FG
3664 r = admin_socket->register_command(
3665 "injectfull",
3666 "injectfull " \
3667 "name=type,type=CephString,req=false " \
3668 "name=count,type=CephInt,req=false ",
3669 test_ops_hook,
3670 "Inject a full disk (optional count times)");
11fdf7f2 3671 ceph_assert(r == 0);
7c673cae
FG
3672}
3673
3674void OSD::create_logger()
3675{
3676 dout(10) << "create_logger" << dendl;
3677
3678 PerfCountersBuilder osd_plb(cct, "osd", l_osd_first, l_osd_last);
3679
3680 // Latency axis configuration for op histograms, values are in nanoseconds
3681 PerfHistogramCommon::axis_config_d op_hist_x_axis_config{
3682 "Latency (usec)",
3683 PerfHistogramCommon::SCALE_LOG2, ///< Latency in logarithmic scale
3684 0, ///< Start at 0
3685 100000, ///< Quantization unit is 100usec
3686 32, ///< Enough to cover much longer than slow requests
3687 };
3688
3689 // Op size axis configuration for op histograms, values are in bytes
3690 PerfHistogramCommon::axis_config_d op_hist_y_axis_config{
3691 "Request size (bytes)",
3692 PerfHistogramCommon::SCALE_LOG2, ///< Request size in logarithmic scale
3693 0, ///< Start at 0
3694 512, ///< Quantization unit is 512 bytes
3695 32, ///< Enough to cover requests larger than GB
3696 };
3697
3698
3efd9988
FG
3699 // All the basic OSD operation stats are to be considered useful
3700 osd_plb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL);
3701
7c673cae
FG
3702 osd_plb.add_u64(
3703 l_osd_op_wip, "op_wip",
3704 "Replication operations currently being processed (primary)");
3705 osd_plb.add_u64_counter(
3706 l_osd_op, "op",
3707 "Client operations",
3708 "ops", PerfCountersBuilder::PRIO_CRITICAL);
3709 osd_plb.add_u64_counter(
3710 l_osd_op_inb, "op_in_bytes",
3711 "Client operations total write size",
11fdf7f2 3712 "wr", PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));
7c673cae
FG
3713 osd_plb.add_u64_counter(
3714 l_osd_op_outb, "op_out_bytes",
3715 "Client operations total read size",
11fdf7f2 3716 "rd", PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));
7c673cae
FG
3717 osd_plb.add_time_avg(
3718 l_osd_op_lat, "op_latency",
3719 "Latency of client operations (including queue time)",
3720 "l", 9);
3721 osd_plb.add_time_avg(
3722 l_osd_op_process_lat, "op_process_latency",
3723 "Latency of client operations (excluding queue time)");
3724 osd_plb.add_time_avg(
3725 l_osd_op_prepare_lat, "op_prepare_latency",
3726 "Latency of client operations (excluding queue time and wait for finished)");
3727
3728 osd_plb.add_u64_counter(
3729 l_osd_op_r, "op_r", "Client read operations");
3730 osd_plb.add_u64_counter(
11fdf7f2 3731 l_osd_op_r_outb, "op_r_out_bytes", "Client data read", NULL, PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
7c673cae
FG
3732 osd_plb.add_time_avg(
3733 l_osd_op_r_lat, "op_r_latency",
3734 "Latency of read operation (including queue time)");
31f18b77 3735 osd_plb.add_u64_counter_histogram(
7c673cae
FG
3736 l_osd_op_r_lat_outb_hist, "op_r_latency_out_bytes_histogram",
3737 op_hist_x_axis_config, op_hist_y_axis_config,
3738 "Histogram of operation latency (including queue time) + data read");
3739 osd_plb.add_time_avg(
3740 l_osd_op_r_process_lat, "op_r_process_latency",
3741 "Latency of read operation (excluding queue time)");
3742 osd_plb.add_time_avg(
3743 l_osd_op_r_prepare_lat, "op_r_prepare_latency",
3744 "Latency of read operations (excluding queue time and wait for finished)");
3745 osd_plb.add_u64_counter(
3746 l_osd_op_w, "op_w", "Client write operations");
3747 osd_plb.add_u64_counter(
3748 l_osd_op_w_inb, "op_w_in_bytes", "Client data written");
3749 osd_plb.add_time_avg(
3750 l_osd_op_w_lat, "op_w_latency",
3751 "Latency of write operation (including queue time)");
31f18b77 3752 osd_plb.add_u64_counter_histogram(
7c673cae
FG
3753 l_osd_op_w_lat_inb_hist, "op_w_latency_in_bytes_histogram",
3754 op_hist_x_axis_config, op_hist_y_axis_config,
3755 "Histogram of operation latency (including queue time) + data written");
3756 osd_plb.add_time_avg(
3757 l_osd_op_w_process_lat, "op_w_process_latency",
3758 "Latency of write operation (excluding queue time)");
3759 osd_plb.add_time_avg(
3760 l_osd_op_w_prepare_lat, "op_w_prepare_latency",
3761 "Latency of write operations (excluding queue time and wait for finished)");
3762 osd_plb.add_u64_counter(
3763 l_osd_op_rw, "op_rw",
3764 "Client read-modify-write operations");
3765 osd_plb.add_u64_counter(
3766 l_osd_op_rw_inb, "op_rw_in_bytes",
11fdf7f2 3767 "Client read-modify-write operations write in", NULL, PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
7c673cae
FG
3768 osd_plb.add_u64_counter(
3769 l_osd_op_rw_outb,"op_rw_out_bytes",
11fdf7f2 3770 "Client read-modify-write operations read out ", NULL, PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
7c673cae
FG
3771 osd_plb.add_time_avg(
3772 l_osd_op_rw_lat, "op_rw_latency",
3773 "Latency of read-modify-write operation (including queue time)");
31f18b77 3774 osd_plb.add_u64_counter_histogram(
7c673cae
FG
3775 l_osd_op_rw_lat_inb_hist, "op_rw_latency_in_bytes_histogram",
3776 op_hist_x_axis_config, op_hist_y_axis_config,
3777 "Histogram of rw operation latency (including queue time) + data written");
31f18b77 3778 osd_plb.add_u64_counter_histogram(
7c673cae
FG
3779 l_osd_op_rw_lat_outb_hist, "op_rw_latency_out_bytes_histogram",
3780 op_hist_x_axis_config, op_hist_y_axis_config,
3781 "Histogram of rw operation latency (including queue time) + data read");
3782 osd_plb.add_time_avg(
3783 l_osd_op_rw_process_lat, "op_rw_process_latency",
3784 "Latency of read-modify-write operation (excluding queue time)");
3785 osd_plb.add_time_avg(
3786 l_osd_op_rw_prepare_lat, "op_rw_prepare_latency",
3787 "Latency of read-modify-write operations (excluding queue time and wait for finished)");
3788
3efd9988
FG
3789 // Now we move on to some more obscure stats, revert to assuming things
3790 // are low priority unless otherwise specified.
3791 osd_plb.set_prio_default(PerfCountersBuilder::PRIO_DEBUGONLY);
3792
224ce89b
WB
3793 osd_plb.add_time_avg(l_osd_op_before_queue_op_lat, "op_before_queue_op_lat",
3794 "Latency of IO before calling queue(before really queue into ShardedOpWq)"); // client io before queue op_wq latency
3795 osd_plb.add_time_avg(l_osd_op_before_dequeue_op_lat, "op_before_dequeue_op_lat",
3796 "Latency of IO before calling dequeue_op(already dequeued and get PG lock)"); // client io before dequeue_op latency
3797
7c673cae
FG
3798 osd_plb.add_u64_counter(
3799 l_osd_sop, "subop", "Suboperations");
3800 osd_plb.add_u64_counter(
11fdf7f2 3801 l_osd_sop_inb, "subop_in_bytes", "Suboperations total size", NULL, 0, unit_t(UNIT_BYTES));
7c673cae
FG
3802 osd_plb.add_time_avg(l_osd_sop_lat, "subop_latency", "Suboperations latency");
3803
3804 osd_plb.add_u64_counter(l_osd_sop_w, "subop_w", "Replicated writes");
3805 osd_plb.add_u64_counter(
11fdf7f2 3806 l_osd_sop_w_inb, "subop_w_in_bytes", "Replicated written data size", NULL, 0, unit_t(UNIT_BYTES));
7c673cae
FG
3807 osd_plb.add_time_avg(
3808 l_osd_sop_w_lat, "subop_w_latency", "Replicated writes latency");
3809 osd_plb.add_u64_counter(
3810 l_osd_sop_pull, "subop_pull", "Suboperations pull requests");
3811 osd_plb.add_time_avg(
3812 l_osd_sop_pull_lat, "subop_pull_latency", "Suboperations pull latency");
3813 osd_plb.add_u64_counter(
3814 l_osd_sop_push, "subop_push", "Suboperations push messages");
3815 osd_plb.add_u64_counter(
11fdf7f2 3816 l_osd_sop_push_inb, "subop_push_in_bytes", "Suboperations pushed size", NULL, 0, unit_t(UNIT_BYTES));
7c673cae
FG
3817 osd_plb.add_time_avg(
3818 l_osd_sop_push_lat, "subop_push_latency", "Suboperations push latency");
3819
3820 osd_plb.add_u64_counter(l_osd_pull, "pull", "Pull requests sent");
3821 osd_plb.add_u64_counter(l_osd_push, "push", "Push messages sent");
11fdf7f2 3822 osd_plb.add_u64_counter(l_osd_push_outb, "push_out_bytes", "Pushed size", NULL, 0, unit_t(UNIT_BYTES));
7c673cae
FG
3823
3824 osd_plb.add_u64_counter(
3825 l_osd_rop, "recovery_ops",
3826 "Started recovery operations",
3827 "rop", PerfCountersBuilder::PRIO_INTERESTING);
3828
11fdf7f2
TL
3829 osd_plb.add_u64_counter(
3830 l_osd_rbytes, "recovery_bytes",
3831 "recovery bytes",
3832 "rbt", PerfCountersBuilder::PRIO_INTERESTING);
3833
7c673cae 3834 osd_plb.add_u64(l_osd_loadavg, "loadavg", "CPU load");
7c673cae
FG
3835 osd_plb.add_u64(
3836 l_osd_cached_crc, "cached_crc", "Total number getting crc from crc_cache");
3837 osd_plb.add_u64(
3838 l_osd_cached_crc_adjusted, "cached_crc_adjusted",
3839 "Total number getting crc from crc_cache with adjusting");
3840 osd_plb.add_u64(l_osd_missed_crc, "missed_crc",
3841 "Total number of crc cache misses");
3842
3843 osd_plb.add_u64(l_osd_pg, "numpg", "Placement groups",
3844 "pgs", PerfCountersBuilder::PRIO_USEFUL);
3845 osd_plb.add_u64(
3846 l_osd_pg_primary, "numpg_primary",
3847 "Placement groups for which this osd is primary");
3848 osd_plb.add_u64(
3849 l_osd_pg_replica, "numpg_replica",
3850 "Placement groups for which this osd is replica");
3851 osd_plb.add_u64(
3852 l_osd_pg_stray, "numpg_stray",
3853 "Placement groups ready to be deleted from this osd");
94b18763
FG
3854 osd_plb.add_u64(
3855 l_osd_pg_removing, "numpg_removing",
3856 "Placement groups queued for local deletion", "pgsr",
3857 PerfCountersBuilder::PRIO_USEFUL);
7c673cae
FG
3858 osd_plb.add_u64(
3859 l_osd_hb_to, "heartbeat_to_peers", "Heartbeat (ping) peers we send to");
3860 osd_plb.add_u64_counter(l_osd_map, "map_messages", "OSD map messages");
3861 osd_plb.add_u64_counter(l_osd_mape, "map_message_epochs", "OSD map epochs");
3862 osd_plb.add_u64_counter(
3863 l_osd_mape_dup, "map_message_epoch_dups", "OSD map duplicates");
3864 osd_plb.add_u64_counter(
3865 l_osd_waiting_for_map, "messages_delayed_for_map",
3866 "Operations waiting for OSD map");
31f18b77 3867
7c673cae
FG
3868 osd_plb.add_u64_counter(
3869 l_osd_map_cache_hit, "osd_map_cache_hit", "osdmap cache hit");
3870 osd_plb.add_u64_counter(
3871 l_osd_map_cache_miss, "osd_map_cache_miss", "osdmap cache miss");
3872 osd_plb.add_u64_counter(
3873 l_osd_map_cache_miss_low, "osd_map_cache_miss_low",
3874 "osdmap cache miss below cache lower bound");
3875 osd_plb.add_u64_avg(
3876 l_osd_map_cache_miss_low_avg, "osd_map_cache_miss_low_avg",
3877 "osdmap cache miss, avg distance below cache lower bound");
31f18b77
FG
3878 osd_plb.add_u64_counter(
3879 l_osd_map_bl_cache_hit, "osd_map_bl_cache_hit",
3880 "OSDMap buffer cache hits");
3881 osd_plb.add_u64_counter(
3882 l_osd_map_bl_cache_miss, "osd_map_bl_cache_miss",
3883 "OSDMap buffer cache misses");
7c673cae 3884
3efd9988
FG
3885 osd_plb.add_u64(
3886 l_osd_stat_bytes, "stat_bytes", "OSD size", "size",
11fdf7f2 3887 PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
3efd9988
FG
3888 osd_plb.add_u64(
3889 l_osd_stat_bytes_used, "stat_bytes_used", "Used space", "used",
11fdf7f2
TL
3890 PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
3891 osd_plb.add_u64(l_osd_stat_bytes_avail, "stat_bytes_avail", "Available space", NULL, 0, unit_t(UNIT_BYTES));
7c673cae
FG
3892
3893 osd_plb.add_u64_counter(
3894 l_osd_copyfrom, "copyfrom", "Rados \"copy-from\" operations");
3895
3896 osd_plb.add_u64_counter(l_osd_tier_promote, "tier_promote", "Tier promotions");
3897 osd_plb.add_u64_counter(l_osd_tier_flush, "tier_flush", "Tier flushes");
3898 osd_plb.add_u64_counter(
3899 l_osd_tier_flush_fail, "tier_flush_fail", "Failed tier flushes");
3900 osd_plb.add_u64_counter(
3901 l_osd_tier_try_flush, "tier_try_flush", "Tier flush attempts");
3902 osd_plb.add_u64_counter(
3903 l_osd_tier_try_flush_fail, "tier_try_flush_fail",
3904 "Failed tier flush attempts");
3905 osd_plb.add_u64_counter(
3906 l_osd_tier_evict, "tier_evict", "Tier evictions");
3907 osd_plb.add_u64_counter(
3908 l_osd_tier_whiteout, "tier_whiteout", "Tier whiteouts");
3909 osd_plb.add_u64_counter(
3910 l_osd_tier_dirty, "tier_dirty", "Dirty tier flag set");
3911 osd_plb.add_u64_counter(
3912 l_osd_tier_clean, "tier_clean", "Dirty tier flag cleaned");
3913 osd_plb.add_u64_counter(
3914 l_osd_tier_delay, "tier_delay", "Tier delays (agent waiting)");
3915 osd_plb.add_u64_counter(
3916 l_osd_tier_proxy_read, "tier_proxy_read", "Tier proxy reads");
3917 osd_plb.add_u64_counter(
3918 l_osd_tier_proxy_write, "tier_proxy_write", "Tier proxy writes");
3919
3920 osd_plb.add_u64_counter(
3921 l_osd_agent_wake, "agent_wake", "Tiering agent wake up");
3922 osd_plb.add_u64_counter(
3923 l_osd_agent_skip, "agent_skip", "Objects skipped by agent");
3924 osd_plb.add_u64_counter(
3925 l_osd_agent_flush, "agent_flush", "Tiering agent flushes");
3926 osd_plb.add_u64_counter(
3927 l_osd_agent_evict, "agent_evict", "Tiering agent evictions");
3928
3929 osd_plb.add_u64_counter(
3930 l_osd_object_ctx_cache_hit, "object_ctx_cache_hit", "Object context cache hits");
3931 osd_plb.add_u64_counter(
3932 l_osd_object_ctx_cache_total, "object_ctx_cache_total", "Object context cache lookups");
3933
3934 osd_plb.add_u64_counter(l_osd_op_cache_hit, "op_cache_hit");
3935 osd_plb.add_time_avg(
3936 l_osd_tier_flush_lat, "osd_tier_flush_lat", "Object flush latency");
3937 osd_plb.add_time_avg(
3938 l_osd_tier_promote_lat, "osd_tier_promote_lat", "Object promote latency");
3939 osd_plb.add_time_avg(
3940 l_osd_tier_r_lat, "osd_tier_r_lat", "Object proxy read latency");
3941
3942 osd_plb.add_u64_counter(
3943 l_osd_pg_info, "osd_pg_info", "PG updated its info (using any method)");
3944 osd_plb.add_u64_counter(
3945 l_osd_pg_fastinfo, "osd_pg_fastinfo",
3946 "PG updated its info using fastinfo attr");
3947 osd_plb.add_u64_counter(
3948 l_osd_pg_biginfo, "osd_pg_biginfo", "PG updated its biginfo attr");
3949
3950 logger = osd_plb.create_perf_counters();
3951 cct->get_perfcounters_collection()->add(logger);
3952}
3953
3954void OSD::create_recoverystate_perf()
3955{
3956 dout(10) << "create_recoverystate_perf" << dendl;
3957
3958 PerfCountersBuilder rs_perf(cct, "recoverystate_perf", rs_first, rs_last);
3959
3960 rs_perf.add_time_avg(rs_initial_latency, "initial_latency", "Initial recovery state latency");
3961 rs_perf.add_time_avg(rs_started_latency, "started_latency", "Started recovery state latency");
3962 rs_perf.add_time_avg(rs_reset_latency, "reset_latency", "Reset recovery state latency");
3963 rs_perf.add_time_avg(rs_start_latency, "start_latency", "Start recovery state latency");
3964 rs_perf.add_time_avg(rs_primary_latency, "primary_latency", "Primary recovery state latency");
3965 rs_perf.add_time_avg(rs_peering_latency, "peering_latency", "Peering recovery state latency");
3966 rs_perf.add_time_avg(rs_backfilling_latency, "backfilling_latency", "Backfilling recovery state latency");
3967 rs_perf.add_time_avg(rs_waitremotebackfillreserved_latency, "waitremotebackfillreserved_latency", "Wait remote backfill reserved recovery state latency");
3968 rs_perf.add_time_avg(rs_waitlocalbackfillreserved_latency, "waitlocalbackfillreserved_latency", "Wait local backfill reserved recovery state latency");
3969 rs_perf.add_time_avg(rs_notbackfilling_latency, "notbackfilling_latency", "Notbackfilling recovery state latency");
3970 rs_perf.add_time_avg(rs_repnotrecovering_latency, "repnotrecovering_latency", "Repnotrecovering recovery state latency");
3971 rs_perf.add_time_avg(rs_repwaitrecoveryreserved_latency, "repwaitrecoveryreserved_latency", "Rep wait recovery reserved recovery state latency");
3972 rs_perf.add_time_avg(rs_repwaitbackfillreserved_latency, "repwaitbackfillreserved_latency", "Rep wait backfill reserved recovery state latency");
3973 rs_perf.add_time_avg(rs_reprecovering_latency, "reprecovering_latency", "RepRecovering recovery state latency");
3974 rs_perf.add_time_avg(rs_activating_latency, "activating_latency", "Activating recovery state latency");
3975 rs_perf.add_time_avg(rs_waitlocalrecoveryreserved_latency, "waitlocalrecoveryreserved_latency", "Wait local recovery reserved recovery state latency");
3976 rs_perf.add_time_avg(rs_waitremoterecoveryreserved_latency, "waitremoterecoveryreserved_latency", "Wait remote recovery reserved recovery state latency");
3977 rs_perf.add_time_avg(rs_recovering_latency, "recovering_latency", "Recovering recovery state latency");
3978 rs_perf.add_time_avg(rs_recovered_latency, "recovered_latency", "Recovered recovery state latency");
3979 rs_perf.add_time_avg(rs_clean_latency, "clean_latency", "Clean recovery state latency");
3980 rs_perf.add_time_avg(rs_active_latency, "active_latency", "Active recovery state latency");
3981 rs_perf.add_time_avg(rs_replicaactive_latency, "replicaactive_latency", "Replicaactive recovery state latency");
3982 rs_perf.add_time_avg(rs_stray_latency, "stray_latency", "Stray recovery state latency");
3983 rs_perf.add_time_avg(rs_getinfo_latency, "getinfo_latency", "Getinfo recovery state latency");
3984 rs_perf.add_time_avg(rs_getlog_latency, "getlog_latency", "Getlog recovery state latency");
3985 rs_perf.add_time_avg(rs_waitactingchange_latency, "waitactingchange_latency", "Waitactingchange recovery state latency");
3986 rs_perf.add_time_avg(rs_incomplete_latency, "incomplete_latency", "Incomplete recovery state latency");
3987 rs_perf.add_time_avg(rs_down_latency, "down_latency", "Down recovery state latency");
3988 rs_perf.add_time_avg(rs_getmissing_latency, "getmissing_latency", "Getmissing recovery state latency");
3989 rs_perf.add_time_avg(rs_waitupthru_latency, "waitupthru_latency", "Waitupthru recovery state latency");
3990 rs_perf.add_time_avg(rs_notrecovering_latency, "notrecovering_latency", "Notrecovering recovery state latency");
3991
3992 recoverystate_perf = rs_perf.create_perf_counters();
3993 cct->get_perfcounters_collection()->add(recoverystate_perf);
3994}
3995
// Orderly OSD shutdown.  The sequence below is strictly ordered: work
// sources are quiesced before the threads that consume them are joined,
// and the PG/shard state is torn down before the store is unmounted.
// Returns 0 on success, or the error from the final superblock write.
int OSD::shutdown()
{
  // Fast path: skip the orderly teardown entirely and let the kernel
  // reclaim everything.  _exit() deliberately bypasses destructors.
  if (cct->_conf->osd_fast_shutdown) {
    derr << "*** Immediate shutdown (osd_fast_shutdown=true) ***" << dendl;
    cct->_log->flush();
    _exit(0);
  }

  if (!service.prepare_to_stop())
    return 0; // already shutting down
  osd_lock.Lock();
  if (is_stopping()) {
    osd_lock.Unlock();
    return 0;
  }
  dout(0) << "shutdown" << dendl;

  set_state(STATE_STOPPING);

  // Debugging: optionally crank all relevant debug levels so the rest of
  // the shutdown is fully traced in the log.
  if (cct->_conf.get_val<bool>("osd_debug_shutdown")) {
    cct->_conf.set_val("debug_osd", "100");
    cct->_conf.set_val("debug_journal", "100");
    cct->_conf.set_val("debug_filestore", "100");
    cct->_conf.set_val("debug_bluestore", "100");
    cct->_conf.set_val("debug_ms", "100");
    cct->_conf.apply_changes(nullptr);
  }

  // stop MgrClient earlier as it's more like an internal consumer of OSD
  mgrc.shutdown();

  service.start_shutdown();

  // stop sending work to pgs. this just prevents any new work in _process
  // from racing with on_shutdown and potentially entering the pg after.
  op_shardedwq.drain();

  // Shutdown PGs
  {
    vector<PGRef> pgs;
    _get_pgs(&pgs);
    for (auto pg : pgs) {
      pg->shutdown();
    }
  }

  // drain op queue again (in case PGs requeued something)
  op_shardedwq.drain();
  {
    finished.clear(); // zap waiters (bleh, this is messy)
    waiting_for_osdmap.clear();
  }

  // unregister commands
  cct->get_admin_socket()->unregister_commands(asok_hook);
  delete asok_hook;
  asok_hook = NULL;

  cct->get_admin_socket()->unregister_commands(test_ops_hook);
  delete test_ops_hook;
  test_ops_hook = NULL;

  // Drop osd_lock while joining the heartbeat thread so it can make
  // progress if it is blocked on the lock.
  osd_lock.Unlock();

  heartbeat_lock.Lock();
  heartbeat_stop = true;
  heartbeat_cond.Signal();
  heartbeat_lock.Unlock();
  heartbeat_thread.join();

  // Thread pools must be drained (queues empty) before stop() joins them.
  osd_op_tp.drain();
  osd_op_tp.stop();
  dout(10) << "op sharded tp stopped" << dendl;

  command_tp.drain();
  command_tp.stop();
  dout(10) << "command tp stopped" << dendl;

  dout(10) << "stopping agent" << dendl;
  service.agent_stop();

  boot_finisher.wait_for_empty();

  osd_lock.Lock();

  boot_finisher.stop();
  reset_heartbeat_peers(true);

  tick_timer.shutdown();

  {
    std::lock_guard l(tick_timer_lock);
    tick_timer_without_osd_lock.shutdown();
  }

  // note unmount epoch
  dout(10) << "noting clean unmount in epoch " << osdmap->get_epoch() << dendl;
  superblock.mounted = service.get_boot_epoch();
  superblock.clean_thru = osdmap->get_epoch();
  ObjectStore::Transaction t;
  write_superblock(t);
  // NOTE: r is carried all the way to the final return; a failed
  // superblock write is reported to the caller but does not abort the
  // rest of the teardown.
  int r = store->queue_transaction(service.meta_ch, std::move(t));
  if (r) {
    derr << "OSD::shutdown: error writing superblock: "
	 << cpp_strerror(r) << dendl;
  }


  service.shutdown_reserver();

  // Remove PGs
#ifdef PG_DEBUG_REFS
  service.dump_live_pgids();
#endif
  // Detach PGs from their shards (clear_too=true) and loop until no live
  // PGs remain; each pass complains about PGs that still hold extra refs.
  while (true) {
    vector<PGRef> pgs;
    _get_pgs(&pgs, true);
    if (pgs.empty()) {
      break;
    }
    for (auto& pg : pgs) {
      if (pg->is_deleted()) {
	continue;
      }
      dout(20) << " kicking pg " << pg << dendl;
      pg->lock();
      if (pg->get_num_ref() != 1) {
	// Someone other than our local PGRef still holds a reference;
	// this is a ref leak unless it resolves on a later pass.
	derr << "pgid " << pg->get_pgid() << " has ref count of "
	     << pg->get_num_ref() << dendl;
#ifdef PG_DEBUG_REFS
	pg->dump_live_ids();
#endif
	if (cct->_conf->osd_shutdown_pgref_assert) {
	  ceph_abort();
	}
      }
      pg->ch.reset();
      pg->unlock();
    }
  }
#ifdef PG_DEBUG_REFS
  service.dump_live_pgids();
#endif

  // remove_observer may call back into config machinery; do it without
  // osd_lock held to avoid lock-order issues.
  osd_lock.Unlock();
  cct->_conf.remove_observer(this);
  osd_lock.Lock();

  service.meta_ch.reset();

  dout(10) << "syncing store" << dendl;
  enable_disable_fuse(true);

  if (cct->_conf->osd_journal_flush_on_shutdown) {
    dout(10) << "flushing journal" << dendl;
    store->flush_journal();
  }

  monc->shutdown();
  osd_lock.Unlock();

  // Clear the authoritative map and every shard's cached copy.
  map_lock.get_write();
  osdmap = OSDMapRef();
  map_lock.put_write();

  for (auto s : shards) {
    std::lock_guard l(s->osdmap_lock);
    s->shard_osdmap = OSDMapRef();
  }
  service.shutdown();

  // Reacquire osd_lock (RAII this time) for the final store teardown.
  std::lock_guard lock(osd_lock);
  store->umount();
  delete store;
  store = nullptr;
  dout(10) << "Store synced" << dendl;

  op_tracker.on_shutdown();

  class_handler->shutdown();
  client_messenger->shutdown();
  cluster_messenger->shutdown();
  hb_front_client_messenger->shutdown();
  hb_back_client_messenger->shutdown();
  objecter_messenger->shutdown();
  hb_front_server_messenger->shutdown();
  hb_back_server_messenger->shutdown();

  // r is the superblock-write result from above.
  return r;
}
4187
4188int OSD::mon_cmd_maybe_osd_create(string &cmd)
4189{
4190 bool created = false;
4191 while (true) {
4192 dout(10) << __func__ << " cmd: " << cmd << dendl;
4193 vector<string> vcmd{cmd};
4194 bufferlist inbl;
4195 C_SaferCond w;
4196 string outs;
4197 monc->start_mon_command(vcmd, inbl, NULL, &outs, &w);
4198 int r = w.wait();
4199 if (r < 0) {
4200 if (r == -ENOENT && !created) {
4201 string newcmd = "{\"prefix\": \"osd create\", \"id\": " + stringify(whoami)
4202 + ", \"uuid\": \"" + stringify(superblock.osd_fsid) + "\"}";
4203 vector<string> vnewcmd{newcmd};
4204 bufferlist inbl;
4205 C_SaferCond w;
4206 string outs;
4207 monc->start_mon_command(vnewcmd, inbl, NULL, &outs, &w);
4208 int r = w.wait();
4209 if (r < 0) {
4210 derr << __func__ << " fail: osd does not exist and created failed: "
4211 << cpp_strerror(r) << dendl;
4212 return r;
4213 }
4214 created = true;
4215 continue;
4216 }
4217 derr << __func__ << " fail: '" << outs << "': " << cpp_strerror(r) << dendl;
4218 return r;
4219 }
4220 break;
4221 }
4222
4223 return 0;
4224}
4225
4226int OSD::update_crush_location()
4227{
4228 if (!cct->_conf->osd_crush_update_on_start) {
4229 dout(10) << __func__ << " osd_crush_update_on_start = false" << dendl;
4230 return 0;
4231 }
4232
4233 char weight[32];
4234 if (cct->_conf->osd_crush_initial_weight >= 0) {
4235 snprintf(weight, sizeof(weight), "%.4lf", cct->_conf->osd_crush_initial_weight);
4236 } else {
4237 struct store_statfs_t st;
11fdf7f2
TL
4238 osd_alert_list_t alerts;
4239 int r = store->statfs(&st, &alerts);
7c673cae
FG
4240 if (r < 0) {
4241 derr << "statfs: " << cpp_strerror(r) << dendl;
4242 return r;
4243 }
4244 snprintf(weight, sizeof(weight), "%.4lf",
11fdf7f2
TL
4245 std::max(.00001,
4246 double(st.total) /
4247 double(1ull << 40 /* TB */)));
7c673cae
FG
4248 }
4249
4250 std::multimap<string,string> loc = cct->crush_location.get_location();
4251 dout(10) << __func__ << " crush location is " << loc << dendl;
4252
4253 string cmd =
4254 string("{\"prefix\": \"osd crush create-or-move\", ") +
4255 string("\"id\": ") + stringify(whoami) + string(", ") +
4256 string("\"weight\":") + weight + string(", ") +
4257 string("\"args\": [");
4258 for (multimap<string,string>::iterator p = loc.begin(); p != loc.end(); ++p) {
4259 if (p != loc.begin())
4260 cmd += ", ";
4261 cmd += "\"" + p->first + "=" + p->second + "\"";
4262 }
4263 cmd += "]}";
4264
4265 return mon_cmd_maybe_osd_create(cmd);
4266}
4267
4268int OSD::update_crush_device_class()
4269{
224ce89b
WB
4270 if (!cct->_conf->osd_class_update_on_start) {
4271 dout(10) << __func__ << " osd_class_update_on_start = false" << dendl;
4272 return 0;
4273 }
4274
7c673cae
FG
4275 string device_class;
4276 int r = store->read_meta("crush_device_class", &device_class);
224ce89b
WB
4277 if (r < 0 || device_class.empty()) {
4278 device_class = store->get_default_device_class();
4279 }
4280
4281 if (device_class.empty()) {
d2e6a577 4282 dout(20) << __func__ << " no device class stored locally" << dendl;
7c673cae 4283 return 0;
224ce89b 4284 }
7c673cae
FG
4285
4286 string cmd =
4287 string("{\"prefix\": \"osd crush set-device-class\", ") +
224ce89b
WB
4288 string("\"class\": \"") + device_class + string("\", ") +
4289 string("\"ids\": [\"") + stringify(whoami) + string("\"]}");
7c673cae 4290
224ce89b 4291 r = mon_cmd_maybe_osd_create(cmd);
11fdf7f2
TL
4292 if (r == -EBUSY) {
4293 // good, already bound to a device-class
4294 return 0;
4295 } else {
4296 return r;
4297 }
7c673cae
FG
4298}
4299
4300void OSD::write_superblock(ObjectStore::Transaction& t)
4301{
4302 dout(10) << "write_superblock " << superblock << dendl;
4303
4304 //hack: at minimum it's using the baseline feature set
4305 if (!superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_BASE))
4306 superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
4307
4308 bufferlist bl;
11fdf7f2 4309 encode(superblock, bl);
7c673cae
FG
4310 t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
4311}
4312
4313int OSD::read_superblock()
4314{
4315 bufferlist bl;
11fdf7f2 4316 int r = store->read(service.meta_ch, OSD_SUPERBLOCK_GOBJECT, 0, 0, bl);
7c673cae
FG
4317 if (r < 0)
4318 return r;
4319
11fdf7f2
TL
4320 auto p = bl.cbegin();
4321 decode(superblock, p);
7c673cae
FG
4322
4323 dout(10) << "read_superblock " << superblock << dendl;
4324
4325 return 0;
4326}
4327
4328void OSD::clear_temp_objects()
4329{
4330 dout(10) << __func__ << dendl;
4331 vector<coll_t> ls;
4332 store->list_collections(ls);
4333 for (vector<coll_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
4334 spg_t pgid;
4335 if (!p->is_pg(&pgid))
4336 continue;
4337
4338 // list temp objects
4339 dout(20) << " clearing temps in " << *p << " pgid " << pgid << dendl;
4340
4341 vector<ghobject_t> temps;
4342 ghobject_t next;
4343 while (1) {
4344 vector<ghobject_t> objects;
11fdf7f2
TL
4345 auto ch = store->open_collection(*p);
4346 ceph_assert(ch);
4347 store->collection_list(ch, next, ghobject_t::get_max(),
7c673cae
FG
4348 store->get_ideal_list_max(),
4349 &objects, &next);
4350 if (objects.empty())
4351 break;
4352 vector<ghobject_t>::iterator q;
4353 for (q = objects.begin(); q != objects.end(); ++q) {
4354 // Hammer set pool for temps to -1, so check for clean-up
4355 if (q->hobj.is_temp() || (q->hobj.pool == -1)) {
4356 temps.push_back(*q);
4357 } else {
4358 break;
4359 }
4360 }
4361 // If we saw a non-temp object and hit the break above we can
4362 // break out of the while loop too.
4363 if (q != objects.end())
4364 break;
4365 }
4366 if (!temps.empty()) {
4367 ObjectStore::Transaction t;
4368 int removed = 0;
4369 for (vector<ghobject_t>::iterator q = temps.begin(); q != temps.end(); ++q) {
4370 dout(20) << " removing " << *p << " object " << *q << dendl;
4371 t.remove(*p, *q);
4372 if (++removed > cct->_conf->osd_target_transaction_size) {
11fdf7f2 4373 store->queue_transaction(service.meta_ch, std::move(t));
7c673cae
FG
4374 t = ObjectStore::Transaction();
4375 removed = 0;
4376 }
4377 }
4378 if (removed) {
11fdf7f2 4379 store->queue_transaction(service.meta_ch, std::move(t));
7c673cae
FG
4380 }
4381 }
4382 }
4383}
4384
4385void OSD::recursive_remove_collection(CephContext* cct,
4386 ObjectStore *store, spg_t pgid,
4387 coll_t tmp)
4388{
4389 OSDriver driver(
4390 store,
4391 coll_t(),
4392 make_snapmapper_oid());
4393
11fdf7f2 4394 ObjectStore::CollectionHandle ch = store->open_collection(tmp);
7c673cae
FG
4395 ObjectStore::Transaction t;
4396 SnapMapper mapper(cct, &driver, 0, 0, 0, pgid.shard);
4397
11fdf7f2
TL
4398 ghobject_t next;
4399 int max = cct->_conf->osd_target_transaction_size;
7c673cae 4400 vector<ghobject_t> objects;
11fdf7f2
TL
4401 objects.reserve(max);
4402 while (true) {
4403 objects.clear();
4404 store->collection_list(ch, next, ghobject_t::get_max(),
4405 max, &objects, &next);
4406 generic_dout(10) << __func__ << " " << objects << dendl;
4407 if (objects.empty())
4408 break;
4409 for (auto& p: objects) {
4410 OSDriver::OSTransaction _t(driver.get_transaction(&t));
4411 int r = mapper.remove_oid(p.hobj, &_t);
4412 if (r != 0 && r != -ENOENT)
4413 ceph_abort();
4414 t.remove(tmp, p);
7c673cae 4415 }
11fdf7f2
TL
4416 int r = store->queue_transaction(ch, std::move(t));
4417 ceph_assert(r == 0);
4418 t = ObjectStore::Transaction();
7c673cae
FG
4419 }
4420 t.remove_collection(tmp);
11fdf7f2
TL
4421 int r = store->queue_transaction(ch, std::move(t));
4422 ceph_assert(r == 0);
7c673cae
FG
4423
4424 C_SaferCond waiter;
11fdf7f2 4425 if (!ch->flush_commit(&waiter)) {
7c673cae
FG
4426 waiter.wait();
4427 }
4428}
4429
4430
4431// ======================================================
4432// PG's
4433
7c673cae
FG
4434PG* OSD::_make_pg(
4435 OSDMapRef createmap,
4436 spg_t pgid)
4437{
11fdf7f2
TL
4438 dout(10) << __func__ << " " << pgid << dendl;
4439 pg_pool_t pi;
4440 map<string,string> ec_profile;
4441 string name;
4442 if (createmap->have_pg_pool(pgid.pool())) {
4443 pi = *createmap->get_pg_pool(pgid.pool());
4444 name = createmap->get_pool_name(pgid.pool());
4445 if (pi.is_erasure()) {
4446 ec_profile = createmap->get_erasure_code_profile(pi.erasure_code_profile);
4447 }
4448 } else {
4449 // pool was deleted; grab final pg_pool_t off disk.
4450 ghobject_t oid = make_final_pool_info_oid(pgid.pool());
4451 bufferlist bl;
4452 int r = store->read(service.meta_ch, oid, 0, 0, bl);
4453 if (r < 0) {
4454 derr << __func__ << " missing pool " << pgid.pool() << " tombstone"
4455 << dendl;
4456 return nullptr;
4457 }
4458 ceph_assert(r >= 0);
4459 auto p = bl.cbegin();
4460 decode(pi, p);
4461 decode(name, p);
4462 if (p.end()) { // dev release v13.0.2 did not include ec_profile
4463 derr << __func__ << " missing ec_profile from pool " << pgid.pool()
4464 << " tombstone" << dendl;
4465 return nullptr;
4466 }
4467 decode(ec_profile, p);
4468 }
4469 PGPool pool(cct, createmap, pgid.pool(), pi, name);
7c673cae 4470 PG *pg;
11fdf7f2
TL
4471 if (pi.type == pg_pool_t::TYPE_REPLICATED ||
4472 pi.type == pg_pool_t::TYPE_ERASURE)
4473 pg = new PrimaryLogPG(&service, createmap, pool, ec_profile, pgid);
7c673cae
FG
4474 else
4475 ceph_abort();
7c673cae
FG
4476 return pg;
4477}
4478
11fdf7f2 4479void OSD::_get_pgs(vector<PGRef> *v, bool clear_too)
7c673cae 4480{
11fdf7f2
TL
4481 v->clear();
4482 v->reserve(get_num_pgs());
4483 for (auto& s : shards) {
4484 std::lock_guard l(s->shard_lock);
4485 for (auto& j : s->pg_slots) {
4486 if (j.second->pg &&
4487 !j.second->pg->is_deleted()) {
4488 v->push_back(j.second->pg);
4489 if (clear_too) {
4490 s->_detach_pg(j.second.get());
4491 }
4492 }
7c673cae 4493 }
7c673cae 4494 }
7c673cae
FG
4495}
4496
11fdf7f2 4497void OSD::_get_pgids(vector<spg_t> *v)
7c673cae 4498{
11fdf7f2
TL
4499 v->clear();
4500 v->reserve(get_num_pgs());
4501 for (auto& s : shards) {
4502 std::lock_guard l(s->shard_lock);
4503 for (auto& j : s->pg_slots) {
4504 if (j.second->pg &&
4505 !j.second->pg->is_deleted()) {
4506 v->push_back(j.first);
4507 }
7c673cae
FG
4508 }
4509 }
7c673cae
FG
4510}
4511
11fdf7f2 4512void OSD::register_pg(PGRef pg)
7c673cae 4513{
11fdf7f2
TL
4514 spg_t pgid = pg->get_pgid();
4515 uint32_t shard_index = pgid.hash_to_shard(num_shards);
4516 auto sdata = shards[shard_index];
4517 std::lock_guard l(sdata->shard_lock);
4518 auto r = sdata->pg_slots.emplace(pgid, make_unique<OSDShardPGSlot>());
4519 ceph_assert(r.second);
4520 auto *slot = r.first->second.get();
4521 dout(20) << __func__ << " " << pgid << " " << pg << dendl;
4522 sdata->_attach_pg(slot, pg.get());
4523}
7c673cae 4524
11fdf7f2
TL
4525bool OSD::try_finish_pg_delete(PG *pg, unsigned old_pg_num)
4526{
4527 auto sdata = pg->osd_shard;
4528 ceph_assert(sdata);
4529 {
4530 std::lock_guard l(sdata->shard_lock);
4531 auto p = sdata->pg_slots.find(pg->pg_id);
4532 if (p == sdata->pg_slots.end() ||
4533 !p->second->pg) {
4534 dout(20) << __func__ << " " << pg->pg_id << " not found" << dendl;
4535 return false;
4536 }
4537 if (p->second->waiting_for_merge_epoch) {
4538 dout(20) << __func__ << " " << pg->pg_id << " waiting for merge" << dendl;
4539 return false;
4540 }
4541 dout(20) << __func__ << " " << pg->pg_id << " " << pg << dendl;
4542 sdata->_detach_pg(p->second.get());
4543 }
7c673cae 4544
11fdf7f2
TL
4545 for (auto shard : shards) {
4546 shard->unprime_split_children(pg->pg_id, old_pg_num);
4547 }
7c673cae 4548
11fdf7f2
TL
4549 // update pg count now since we might not get an osdmap any time soon.
4550 if (pg->is_primary())
4551 service.logger->dec(l_osd_pg_primary);
4552 else if (pg->is_replica())
4553 service.logger->dec(l_osd_pg_replica);
4554 else
4555 service.logger->dec(l_osd_pg_stray);
7c673cae 4556
11fdf7f2 4557 return true;
7c673cae
FG
4558}
4559
11fdf7f2 4560PGRef OSD::_lookup_pg(spg_t pgid)
7c673cae 4561{
11fdf7f2
TL
4562 uint32_t shard_index = pgid.hash_to_shard(num_shards);
4563 auto sdata = shards[shard_index];
4564 std::lock_guard l(sdata->shard_lock);
4565 auto p = sdata->pg_slots.find(pgid);
4566 if (p == sdata->pg_slots.end()) {
7c673cae 4567 return nullptr;
11fdf7f2
TL
4568 }
4569 return p->second->pg;
7c673cae
FG
4570}
4571
11fdf7f2 4572PGRef OSD::_lookup_lock_pg(spg_t pgid)
31f18b77 4573{
11fdf7f2
TL
4574 PGRef pg = _lookup_pg(pgid);
4575 if (!pg) {
4576 return nullptr;
4577 }
4578 pg->lock();
4579 if (!pg->is_deleted()) {
4580 return pg;
4581 }
4582 pg->unlock();
4583 return nullptr;
31f18b77
FG
4584}
4585
11fdf7f2 4586PGRef OSD::lookup_lock_pg(spg_t pgid)
7c673cae 4587{
11fdf7f2 4588 return _lookup_lock_pg(pgid);
7c673cae
FG
4589}
4590
// Scan the object store at startup and instantiate every surviving PG:
//  - load the persisted pg_num history (needed for split/merge decisions),
//  - delete temp collections and PGs flagged for removal,
//  - rebuild each remaining PG from its on-disk state and register it.
// Called with osd_lock held; logs a summary count when done.
void OSD::load_pgs()
{
  ceph_assert(osd_lock.is_locked());
  dout(0) << "load_pgs" << dendl;

  {
    // restore the persisted per-pool pg_num change history
    auto pghist = make_pg_num_history_oid();
    bufferlist bl;
    int r = store->read(service.meta_ch, pghist, 0, 0, bl, 0);
    if (r >= 0 && bl.length() > 0) {
      auto p = bl.cbegin();
      decode(pg_num_history, p);
    }
    dout(20) << __func__ << " pg_num_history " << pg_num_history << dendl;
  }

  vector<coll_t> ls;
  int r = store->list_collections(ls);
  if (r < 0) {
    derr << "failed to list pgs: " << cpp_strerror(-r) << dendl;
  }

  int num = 0;
  for (vector<coll_t>::iterator it = ls.begin();
       it != ls.end();
       ++it) {
    spg_t pgid;
    // temp collections and PGs marked for removal are garbage from a
    // previous incarnation; purge them
    if (it->is_temp(&pgid) ||
	(it->is_pg(&pgid) && PG::_has_removal_flag(store, pgid))) {
      dout(10) << "load_pgs " << *it
	       << " removing, legacy or flagged for removal pg" << dendl;
      recursive_remove_collection(cct, store, pgid, *it);
      continue;
    }

    if (!it->is_pg(&pgid)) {
      dout(10) << "load_pgs ignoring unrecognized " << *it << dendl;
      continue;
    }

    dout(10) << "pgid " << pgid << " coll " << coll_t(pgid) << dendl;
    epoch_t map_epoch = 0;
    int r = PG::peek_map_epoch(store, pgid, &map_epoch);
    if (r < 0) {
      derr << __func__ << " unable to peek at " << pgid << " metadata, skipping"
	   << dendl;
      continue;
    }

    PGRef pg;
    if (map_epoch > 0) {
      // construct the PG against the map it was last persisted under
      OSDMapRef pgosdmap = service.try_get_map(map_epoch);
      if (!pgosdmap) {
	if (!osdmap->have_pg_pool(pgid.pool())) {
	  derr << __func__ << ": could not find map for epoch " << map_epoch
	       << " on pg " << pgid << ", but the pool is not present in the "
	       << "current map, so this is probably a result of bug 10617. "
	       << "Skipping the pg for now, you can use ceph-objectstore-tool "
	       << "to clean it up later." << dendl;
	  continue;
	} else {
	  derr << __func__ << ": have pgid " << pgid << " at epoch "
	       << map_epoch << ", but missing map. Crashing."
	       << dendl;
	  ceph_abort_msg("Missing map in load_pgs");
	}
      }
      pg = _make_pg(pgosdmap, pgid);
    } else {
      pg = _make_pg(osdmap, pgid);
    }
    if (!pg) {
      // pool/type no longer instantiable; drop the on-disk remains
      recursive_remove_collection(cct, store, pgid, *it);
      continue;
    }

    // there can be no waiters here, so we don't call _wake_pg_slot

    pg->lock();
    pg->ch = store->open_collection(pg->coll);

    // read pg state, log
    pg->read_state(store);

    if (pg->dne()) {
      dout(10) << "load_pgs " << *it << " deleting dne" << dendl;
      pg->ch = nullptr;
      pg->unlock();
      recursive_remove_collection(cct, store, pgid, *it);
      continue;
    }
    {
      // route this collection's commit completions to the owning shard
      uint32_t shard_index = pgid.hash_to_shard(shards.size());
      assert(NULL != shards[shard_index]);
      store->set_collection_commit_queue(pg->coll, &(shards[shard_index]->context_queue));
    }

    pg->reg_next_scrub();

    dout(10) << __func__ << " loaded " << *pg << dendl;
    pg->unlock();

    register_pg(pg);
    ++num;
  }
  dout(0) << __func__ << " opened " << num << " pgs" << dendl;
}
4698
4699
11fdf7f2
TL
// Instantiate a brand-new PG described by *info (from the monitor or a
// peer), unless the per-OSD pg limit or stale-create filtering says to
// drop it.  Returns the new PG (initialized and activated against the
// current map), or nullptr if creation was withheld/dropped.
PGRef OSD::handle_pg_create_info(const OSDMapRef& osdmap,
				 const PGCreateInfo *info)
{
  spg_t pgid = info->pgid;

  if (maybe_wait_for_max_pg(osdmap, pgid, info->by_mon)) {
    dout(10) << __func__ << " hit max pg, dropping" << dendl;
    return nullptr;
  }

  PG::RecoveryCtx rctx = create_context();

  // the map the pg is being created against (may lag current osdmap)
  OSDMapRef startmap = get_map(info->epoch);

  if (info->by_mon) {
    int64_t pool_id = pgid.pgid.pool();
    const pg_pool_t *pool = osdmap->get_pg_pool(pool_id);
    if (!pool) {
      dout(10) << __func__ << " ignoring " << pgid << ", pool dne" << dendl;
      return nullptr;
    }
    if (osdmap->require_osd_release >= CEPH_RELEASE_NAUTILUS &&
	!pool->has_flag(pg_pool_t::FLAG_CREATING)) {
      // this ensures we do not process old creating messages after the
      // pool's initial pgs have been created (and pg are subsequently
      // allowed to split or merge).
      dout(20) << __func__ << " dropping " << pgid
	       << "create, pool does not have CREATING flag set" << dendl;
      return nullptr;
    }
  }

  int up_primary, acting_primary;
  vector<int> up, acting;
  startmap->pg_to_up_acting_osds(
    pgid.pgid, &up, &up_primary, &acting, &acting_primary);

  const pg_pool_t* pp = startmap->get_pg_pool(pgid.pool());
  if (pp->has_flag(pg_pool_t::FLAG_EC_OVERWRITES) &&
      store->get_type() != "bluestore") {
    clog->warn() << "pg " << pgid
		 << " is at risk of silent data corruption: "
		 << "the pool allows ec overwrites but is not stored in "
		 << "bluestore, so deep scrubbing will not detect bitrot";
  }
  PG::_create(*rctx.transaction, pgid, pgid.get_split_bits(pp->get_pg_num()));
  PG::_init(*rctx.transaction, pgid, pp);

  int role = startmap->calc_pg_role(whoami, acting, acting.size());
  if (!pp->is_replicated() && role != pgid.shard) {
    // for EC pools the role must match the shard id
    role = -1;
  }

  PGRef pg = _make_pg(startmap, pgid);
  pg->ch = store->create_new_collection(pg->coll);

  {
    // route this collection's commit completions to the owning shard
    uint32_t shard_index = pgid.hash_to_shard(shards.size());
    assert(NULL != shards[shard_index]);
    store->set_collection_commit_queue(pg->coll, &(shards[shard_index]->context_queue));
  }

  pg->lock(true);

  // we are holding the shard lock
  ceph_assert(!pg->is_deleted());

  pg->init(
    role,
    up,
    up_primary,
    acting,
    acting_primary,
    info->history,
    info->past_intervals,
    false,
    rctx.transaction);

  pg->init_collection_pool_opts();

  if (pg->is_primary()) {
    // new primaries pick up any registered dynamic perf-stat queries
    Mutex::Locker locker(m_perf_queries_lock);
    pg->set_dynamic_perf_stats_queries(m_perf_queries);
  }

  pg->handle_initialize(&rctx);
  pg->handle_activate_map(&rctx);

  dispatch_context(rctx, pg.get(), osdmap, nullptr);

  dout(10) << __func__ << " new pg " << *pg << dendl;
  return pg;
}
4793
11fdf7f2
TL
4794bool OSD::maybe_wait_for_max_pg(const OSDMapRef& osdmap,
4795 spg_t pgid,
4796 bool is_mon_create)
3efd9988
FG
4797{
4798 const auto max_pgs_per_osd =
11fdf7f2
TL
4799 (cct->_conf.get_val<uint64_t>("mon_max_pg_per_osd") *
4800 cct->_conf.get_val<double>("osd_max_pg_per_osd_hard_ratio"));
3efd9988 4801
11fdf7f2 4802 if (num_pgs < max_pgs_per_osd) {
3efd9988
FG
4803 return false;
4804 }
11fdf7f2
TL
4805
4806 std::lock_guard l(pending_creates_lock);
3efd9988
FG
4807 if (is_mon_create) {
4808 pending_creates_from_mon++;
4809 } else {
b32b8144
FG
4810 bool is_primary = osdmap->get_pg_acting_rank(pgid.pgid, whoami) == 0;
4811 pending_creates_from_osd.emplace(pgid.pgid, is_primary);
3efd9988 4812 }
1adf2230 4813 dout(1) << __func__ << " withhold creation of pg " << pgid
11fdf7f2 4814 << ": " << num_pgs << " >= "<< max_pgs_per_osd << dendl;
3efd9988
FG
4815 return true;
4816}
4817
4818// to re-trigger a peering, we have to twiddle the pg mapping a little bit,
4819// see PG::should_restart_peering(). OSDMap::pg_to_up_acting_osds() will turn
4820// to up set if pg_temp is empty. so an empty pg_temp won't work.
4821static vector<int32_t> twiddle(const vector<int>& acting) {
4822 if (acting.size() > 1) {
4823 return {acting[0]};
4824 } else {
4825 vector<int32_t> twiddled(acting.begin(), acting.end());
4826 twiddled.push_back(-1);
4827 return twiddled;
4828 }
4829}
4830
// Replay PG creations that were withheld by maybe_wait_for_max_pg() once
// we have dropped back under the hard PG limit: re-solicit mon creates,
// nudge peer creates via a pg_temp twiddle, and (re)adjust our osdmap /
// osd_pg_creates subscriptions accordingly.
void OSD::resume_creating_pg()
{
  bool do_sub_pg_creates = false;
  bool have_pending_creates = false;
  {
    const auto max_pgs_per_osd =
      (cct->_conf.get_val<uint64_t>("mon_max_pg_per_osd") *
       cct->_conf.get_val<double>("osd_max_pg_per_osd_hard_ratio"));
    if (max_pgs_per_osd <= num_pgs) {
      // this could happen if admin decreases this setting before a PG is removed
      return;
    }
    unsigned spare_pgs = max_pgs_per_osd - num_pgs;
    std::lock_guard l(pending_creates_lock);
    if (pending_creates_from_mon > 0) {
      dout(20) << __func__ << " pending_creates_from_mon "
	       << pending_creates_from_mon << dendl;
      do_sub_pg_creates = true;
      if (pending_creates_from_mon >= spare_pgs) {
	spare_pgs = pending_creates_from_mon = 0;
      } else {
	spare_pgs -= pending_creates_from_mon;
	pending_creates_from_mon = 0;
      }
    }
    // for creates requested by peers, request a pg_temp change to
    // re-trigger peering (see twiddle() above), one per spare slot
    auto pg = pending_creates_from_osd.cbegin();
    while (spare_pgs > 0 && pg != pending_creates_from_osd.cend()) {
      dout(20) << __func__ << " pg " << pg->first << dendl;
      vector<int> acting;
      osdmap->pg_to_up_acting_osds(pg->first, nullptr, nullptr, &acting, nullptr);
      service.queue_want_pg_temp(pg->first, twiddle(acting), true);
      pg = pending_creates_from_osd.erase(pg);
      do_sub_pg_creates = true;
      spare_pgs--;
    }
    have_pending_creates = (pending_creates_from_mon > 0 ||
			    !pending_creates_from_osd.empty());
  }

  bool do_renew_subs = false;
  if (do_sub_pg_creates) {
    if (monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0)) {
      dout(4) << __func__ << ": resolicit pg creates from mon since "
	      << last_pg_create_epoch << dendl;
      do_renew_subs = true;
    }
  }
  version_t start = osdmap->get_epoch() + 1;
  if (have_pending_creates) {
    // don't miss any new osdmap deleting PGs
    if (monc->sub_want("osdmap", start, 0)) {
      dout(4) << __func__ << ": resolicit osdmap from mon since "
	      << start << dendl;
      do_renew_subs = true;
    }
  } else if (do_sub_pg_creates) {
    // no need to subscribe the osdmap continuously anymore
    // once the pgtemp and/or mon_subscribe(pg_creates) is sent
    if (monc->sub_want_increment("osdmap", start, CEPH_SUBSCRIBE_ONETIME)) {
      dout(4) << __func__ << ": re-subscribe osdmap(onetime) since "
	      << start << dendl;
      do_renew_subs = true;
    }
  }

  if (do_renew_subs) {
    monc->renew_subs();
  }

  service.send_pg_temp();
}
7c673cae
FG
4902
// Reconstruct a new PG's history and past intervals by replaying every
// osdmap epoch from its creation epoch up to the current map, recording
// each interval boundary (up/acting/primary changes, splits).
// Outputs are written into *h and *pi.
void OSD::build_initial_pg_history(
  spg_t pgid,
  epoch_t created,
  utime_t created_stamp,
  pg_history_t *h,
  PastIntervals *pi)
{
  dout(10) << __func__ << " " << pgid << " created " << created << dendl;
  h->epoch_created = created;
  h->epoch_pool_created = created;
  h->same_interval_since = created;
  h->same_up_since = created;
  h->same_primary_since = created;
  h->last_scrub_stamp = created_stamp;
  h->last_deep_scrub_stamp = created_stamp;
  h->last_clean_scrub_stamp = created_stamp;

  OSDMapRef lastmap = service.get_map(created);
  int up_primary, acting_primary;
  vector<int> up, acting;
  lastmap->pg_to_up_acting_osds(
    pgid.pgid, &up, &up_primary, &acting, &acting_primary);

  ostringstream debug;
  // walk every epoch since creation, detecting interval boundaries
  for (epoch_t e = created + 1; e <= osdmap->get_epoch(); ++e) {
    OSDMapRef osdmap = service.get_map(e);
    int new_up_primary, new_acting_primary;
    vector<int> new_up, new_acting;
    osdmap->pg_to_up_acting_osds(
      pgid.pgid, &new_up, &new_up_primary, &new_acting, &new_acting_primary);

    // this is a bit imprecise, but sufficient?
    struct min_size_predicate_t : public IsPGRecoverablePredicate {
      const pg_pool_t *pi;
      bool operator()(const set<pg_shard_t> &have) const {
	return have.size() >= pi->min_size;
      }
      explicit min_size_predicate_t(const pg_pool_t *i) : pi(i) {}
    } min_size_predicate(osdmap->get_pg_pool(pgid.pgid.pool()));

    bool new_interval = PastIntervals::check_new_interval(
      acting_primary,
      new_acting_primary,
      acting, new_acting,
      up_primary,
      new_up_primary,
      up, new_up,
      h->same_interval_since,
      h->last_epoch_clean,
      osdmap,
      lastmap,
      pgid.pgid,
      &min_size_predicate,
      pi,
      &debug);
    if (new_interval) {
      h->same_interval_since = e;
      if (up != new_up) {
	h->same_up_since = e;
      }
      if (acting_primary != new_acting_primary) {
	h->same_primary_since = e;
      }
      if (pgid.pgid.is_split(lastmap->get_pg_num(pgid.pgid.pool()),
			     osdmap->get_pg_num(pgid.pgid.pool()),
			     nullptr)) {
	h->last_epoch_split = e;
      }
      // carry the new mapping forward as the baseline for the next epoch
      up = new_up;
      acting = new_acting;
      up_primary = new_up_primary;
      acting_primary = new_acting_primary;
    }
    lastmap = osdmap;
  }
  dout(20) << __func__ << " " << debug.str() << dendl;
  dout(10) << __func__ << " " << *h << " " << *pi
	   << " [" << (pi->empty() ? pair<epoch_t,epoch_t>(0,0) :
		       pi->get_bounds()) << ")"
	   << dendl;
}
4984
7c673cae
FG
// Register osd.p as a heartbeat peer, opening back (and, if available,
// front) heartbeat connections on first sight.  Always refreshes the
// peer's recorded epoch.  Note: con_front may legitimately remain NULL
// when no front network connection exists.
void OSD::_add_heartbeat_peer(int p)
{
  if (p == whoami)
    return;
  HeartbeatInfo *hi;

  map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(p);
  if (i == heartbeat_peers.end()) {
    pair<ConnectionRef,ConnectionRef> cons = service.get_con_osd_hb(p, osdmap->get_epoch());
    if (!cons.first)
      return;
    hi = &heartbeat_peers[p];
    hi->peer = p;
    // session is shared by both connections; refcount starts owned here
    RefCountedPtr s{new HeartbeatSession{p}, false};
    hi->hb_interval_start = ceph_clock_now();
    hi->con_back = cons.first.get();
    hi->con_back->set_priv(s);
    if (cons.second) {
      hi->con_front = cons.second.get();
      hi->con_front->set_priv(s);
      dout(10) << "_add_heartbeat_peer: new peer osd." << p
	       << " " << hi->con_back->get_peer_addr()
	       << " " << hi->con_front->get_peer_addr()
	       << dendl;
    } else {
      hi->con_front.reset(NULL);
      dout(10) << "_add_heartbeat_peer: new peer osd." << p
	       << " " << hi->con_back->get_peer_addr()
	       << dendl;
    }
  } else {
    hi = &i->second;
  }
  hi->epoch = osdmap->get_epoch();
}
5020
5021void OSD::_remove_heartbeat_peer(int n)
5022{
5023 map<int,HeartbeatInfo>::iterator q = heartbeat_peers.find(n);
11fdf7f2 5024 ceph_assert(q != heartbeat_peers.end());
7c673cae
FG
5025 dout(20) << " removing heartbeat peer osd." << n
5026 << " " << q->second.con_back->get_peer_addr()
5027 << " " << (q->second.con_front ? q->second.con_front->get_peer_addr() : entity_addr_t())
5028 << dendl;
5029 q->second.con_back->mark_down();
5030 if (q->second.con_front) {
5031 q->second.con_front->mark_down();
5032 }
5033 heartbeat_peers.erase(q);
5034}
5035
5036void OSD::need_heartbeat_peer_update()
5037{
5038 if (is_stopping())
5039 return;
5040 dout(20) << "need_heartbeat_peer_update" << dendl;
5041 heartbeat_set_peers_need_update();
5042}
5043
// Recompute the set of heartbeat peers when flagged (or periodically
// forced): peers of our PGs, our up-set neighbors, and enough OSDs from
// distinct failure-domain subtrees; then prune down peers and trim or pad
// toward osd_heartbeat_min_peers.  Called with osd_lock held.
void OSD::maybe_update_heartbeat_peers()
{
  ceph_assert(osd_lock.is_locked());

  if (is_waiting_for_healthy() || is_active()) {
    utime_t now = ceph_clock_now();
    if (last_heartbeat_resample == utime_t()) {
      last_heartbeat_resample = now;
      heartbeat_set_peers_need_update();
    } else if (!heartbeat_peers_need_update()) {
      // force a periodic resample even if nothing explicitly flagged one
      utime_t dur = now - last_heartbeat_resample;
      if (dur > cct->_conf->osd_heartbeat_grace) {
	dout(10) << "maybe_update_heartbeat_peers forcing update after " << dur << " seconds" << dendl;
	heartbeat_set_peers_need_update();
	last_heartbeat_resample = now;
	// automatically clean up any stale heartbeat peers
	// if we are unhealthy, then clean all
	reset_heartbeat_peers(is_waiting_for_healthy());
      }
    }
  }

  if (!heartbeat_peers_need_update())
    return;
  heartbeat_clear_peers_need_update();

  std::lock_guard l(heartbeat_lock);

  dout(10) << "maybe_update_heartbeat_peers updating" << dendl;


  // build heartbeat from set
  if (is_active()) {
    vector<PGRef> pgs;
    _get_pgs(&pgs);
    for (auto& pg : pgs) {
      pg->with_heartbeat_peers([&](int peer) {
	  if (osdmap->is_up(peer)) {
	    _add_heartbeat_peer(peer);
	  }
	});
    }
  }

  // include next and previous up osds to ensure we have a fully-connected set
  set<int> want, extras;
  const int next = osdmap->get_next_up_osd_after(whoami);
  if (next >= 0)
    want.insert(next);
  int prev = osdmap->get_previous_up_osd_before(whoami);
  if (prev >= 0 && prev != next)
    want.insert(prev);

  // make sure we have at least **min_down** osds coming from different
  // subtree level (e.g., hosts) for fast failure detection.
  auto min_down = cct->_conf.get_val<uint64_t>("mon_osd_min_down_reporters");
  auto subtree = cct->_conf.get_val<string>("mon_osd_reporter_subtree_level");
  osdmap->get_random_up_osds_by_subtree(
    whoami, subtree, min_down, want, &want);

  for (set<int>::iterator p = want.begin(); p != want.end(); ++p) {
    dout(10) << " adding neighbor peer osd." << *p << dendl;
    extras.insert(*p);
    _add_heartbeat_peer(*p);
  }

  // remove down peers; enumerate extras
  map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
  while (p != heartbeat_peers.end()) {
    if (!osdmap->is_up(p->first)) {
      int o = p->first;
      ++p;  // advance before erase invalidates this iterator
      _remove_heartbeat_peer(o);
      continue;
    }
    if (p->second.epoch < osdmap->get_epoch()) {
      extras.insert(p->first);
    }
    ++p;
  }

  // too few?
  for (int n = next; n >= 0; ) {
    if ((int)heartbeat_peers.size() >= cct->_conf->osd_heartbeat_min_peers)
      break;
    if (!extras.count(n) && !want.count(n) && n != whoami) {
      dout(10) << " adding random peer osd." << n << dendl;
      extras.insert(n);
      _add_heartbeat_peer(n);
    }
    n = osdmap->get_next_up_osd_after(n);
    if (n == next)
      break; // came full circle; stop
  }

  // too many?
  for (set<int>::iterator p = extras.begin();
       (int)heartbeat_peers.size() > cct->_conf->osd_heartbeat_min_peers && p != extras.end();
       ++p) {
    if (want.count(*p))
      continue;
    _remove_heartbeat_peer(*p);
  }

  dout(10) << "maybe_update_heartbeat_peers " << heartbeat_peers.size() << " peers, extras " << extras << dendl;
}
5150
// Drop heartbeat peers: all of them when 'all' is true, otherwise only
// those whose last activity is older than osd_heartbeat_stale.  Marks
// their connections down and cancels any queued failure report for them.
// Called with osd_lock held.
void OSD::reset_heartbeat_peers(bool all)
{
  ceph_assert(osd_lock.is_locked());
  dout(10) << "reset_heartbeat_peers" << dendl;
  utime_t stale = ceph_clock_now();
  stale -= cct->_conf.get_val<int64_t>("osd_heartbeat_stale");
  std::lock_guard l(heartbeat_lock);
  for (auto it = heartbeat_peers.begin(); it != heartbeat_peers.end();) {
    HeartbeatInfo& hi = it->second;
    if (all || hi.is_stale(stale)) {
      hi.con_back->mark_down();
      if (hi.con_front) {
	hi.con_front->mark_down();
      }
      // stop sending failure_report to mon too
      failure_queue.erase(it->first);
      heartbeat_peers.erase(it++);  // post-increment: erase safely mid-loop
    } else {
      it++;
    }
  }
}
5173
// Handle an incoming heartbeat message.  PING: reply (with optional
// debug-drop simulation) and share map state; PING_REPLY: ack our pending
// ping, update per-connection rx stamps and ping-time statistics, and
// cancel any now-refuted failure reports; YOU_DIED: resubscribe to the
// osdmap.  Holds heartbeat_lock for the whole body and consumes m.
void OSD::handle_osd_ping(MOSDPing *m)
{
  if (superblock.cluster_fsid != m->fsid) {
    dout(20) << "handle_osd_ping from " << m->get_source_inst()
	     << " bad fsid " << m->fsid << " != " << superblock.cluster_fsid << dendl;
    m->put();
    return;
  }

  int from = m->get_source().num();

  heartbeat_lock.Lock();
  if (is_stopping()) {
    heartbeat_lock.Unlock();
    m->put();
    return;
  }

  OSDMapRef curmap = service.get_osdmap();
  if (!curmap) {
    heartbeat_lock.Unlock();
    m->put();
    return;
  }

  switch (m->op) {

  case MOSDPing::PING:
    {
      // optional fault injection: randomly drop pings for a configured span
      if (cct->_conf->osd_debug_drop_ping_probability > 0) {
	auto heartbeat_drop = debug_heartbeat_drops_remaining.find(from);
	if (heartbeat_drop != debug_heartbeat_drops_remaining.end()) {
	  if (heartbeat_drop->second == 0) {
	    debug_heartbeat_drops_remaining.erase(heartbeat_drop);
	  } else {
	    --heartbeat_drop->second;
	    dout(5) << "Dropping heartbeat from " << from
		    << ", " << heartbeat_drop->second
		    << " remaining to drop" << dendl;
	    break;
	  }
	} else if (cct->_conf->osd_debug_drop_ping_probability >
		   ((((double)(rand()%100))/100.0))) {
	  heartbeat_drop =
	    debug_heartbeat_drops_remaining.insert(std::make_pair(from,
				 cct->_conf->osd_debug_drop_ping_duration)).first;
	  dout(5) << "Dropping heartbeat from " << from
		  << ", " << heartbeat_drop->second
		  << " remaining to drop" << dendl;
	  break;
	}
      }

      // don't answer if our own internal threads are unhealthy
      if (!cct->get_heartbeat_map()->is_healthy()) {
	dout(10) << "internal heartbeat not healthy, dropping ping request" << dendl;
	break;
      }

      Message *r = new MOSDPing(monc->get_fsid(),
				curmap->get_epoch(),
				MOSDPing::PING_REPLY, m->stamp,
				cct->_conf->osd_heartbeat_min_size);
      m->get_connection()->send_message(r);

      if (curmap->is_up(from)) {
	service.note_peer_epoch(from, m->map_epoch);
	if (is_active()) {
	  ConnectionRef con = service.get_con_osd_cluster(from, curmap->get_epoch());
	  if (con) {
	    service.share_map_peer(from, con.get());
	  }
	}
      } else if (!curmap->exists(from) ||
		 curmap->get_down_at(from) > m->map_epoch) {
	// tell them they have died
	Message *r = new MOSDPing(monc->get_fsid(),
				  curmap->get_epoch(),
				  MOSDPing::YOU_DIED,
				  m->stamp,
				  cct->_conf->osd_heartbeat_min_size);
	m->get_connection()->send_message(r);
      }
    }
    break;

  case MOSDPing::PING_REPLY:
    {
      map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(from);
      if (i != heartbeat_peers.end()) {
	// each pending ping carries (deadline, count of expected replies)
	auto acked = i->second.ping_history.find(m->stamp);
	if (acked != i->second.ping_history.end()) {
	  utime_t now = ceph_clock_now();
	  int &unacknowledged = acked->second.second;
	  if (m->get_connection() == i->second.con_back) {
	    dout(25) << "handle_osd_ping got reply from osd." << from
		     << " first_tx " << i->second.first_tx
		     << " last_tx " << i->second.last_tx
		     << " last_rx_back " << i->second.last_rx_back << " -> " << now
		     << " last_rx_front " << i->second.last_rx_front
		     << dendl;
	    i->second.last_rx_back = now;
	    ceph_assert(unacknowledged > 0);
	    --unacknowledged;
	    // if there is no front con, set both stamps.
	    if (i->second.con_front == NULL) {
	      i->second.last_rx_front = now;
	      ceph_assert(unacknowledged > 0);
	      --unacknowledged;
	    }
	  } else if (m->get_connection() == i->second.con_front) {
	    dout(25) << "handle_osd_ping got reply from osd." << from
		     << " first_tx " << i->second.first_tx
		     << " last_tx " << i->second.last_tx
		     << " last_rx_back " << i->second.last_rx_back
		     << " last_rx_front " << i->second.last_rx_front << " -> " << now
		     << dendl;
	    i->second.last_rx_front = now;
	    ceph_assert(unacknowledged > 0);
	    --unacknowledged;
	  }

	  if (unacknowledged == 0) {
	    // succeeded in getting all replies
	    dout(25) << "handle_osd_ping got all replies from osd." << from
		     << " , erase pending ping(sent at " << m->stamp << ")"
		     << " and older pending ping(s)"
		     << dendl;

#define ROUND_S_TO_USEC(sec) (uint32_t)((sec) * 1000 * 1000 + 0.5)
	    ++i->second.hb_average_count;
	    uint32_t back_pingtime = ROUND_S_TO_USEC(i->second.last_rx_back - m->stamp);
	    i->second.hb_total_back += back_pingtime;
	    if (back_pingtime < i->second.hb_min_back)
	      i->second.hb_min_back = back_pingtime;
	    if (back_pingtime > i->second.hb_max_back)
	      i->second.hb_max_back = back_pingtime;
	    uint32_t front_pingtime = ROUND_S_TO_USEC(i->second.last_rx_front - m->stamp);
	    i->second.hb_total_front += front_pingtime;
	    if (front_pingtime < i->second.hb_min_front)
	      i->second.hb_min_front = front_pingtime;
	    if (front_pingtime > i->second.hb_max_front)
	      i->second.hb_max_front = front_pingtime;

	    // NOTE(review): the assignment below is dead code given the
	    // preceding assert; kept as-is from upstream.
	    ceph_assert(i->second.hb_interval_start != utime_t());
	    if (i->second.hb_interval_start == utime_t())
	      i->second.hb_interval_start = now;
	    int64_t hb_avg_time_period = 60;
	    if (cct->_conf.get_val<int64_t>("debug_heartbeat_testing_span")) {
	      hb_avg_time_period = cct->_conf.get_val<int64_t>("debug_heartbeat_testing_span");
	    }
	    if (now - i->second.hb_interval_start >= utime_t(hb_avg_time_period, 0)) {
	      // close out this averaging interval
	      uint32_t back_avg = i->second.hb_total_back / i->second.hb_average_count;
	      uint32_t back_min = i->second.hb_min_back;
	      uint32_t back_max = i->second.hb_max_back;
	      uint32_t front_avg = i->second.hb_total_front / i->second.hb_average_count;
	      uint32_t front_min = i->second.hb_min_front;
	      uint32_t front_max = i->second.hb_max_front;

	      // Reset for new interval
	      i->second.hb_average_count = 0;
	      i->second.hb_interval_start = now;
	      i->second.hb_total_back = i->second.hb_max_back = 0;
	      i->second.hb_min_back = UINT_MAX;
	      i->second.hb_total_front = i->second.hb_max_front = 0;
	      i->second.hb_min_front = UINT_MAX;

	      // Record per osd interace ping times
	      // Based on osd_heartbeat_interval ignoring that it is randomly short than this interval
	      if (i->second.hb_back_pingtime.size() == 0) {
		// first interval: seed the whole ring buffer
		ceph_assert(i->second.hb_front_pingtime.size() == 0);
		for (unsigned k = 0 ; k < hb_vector_size; ++k) {
		  i->second.hb_back_pingtime.push_back(back_avg);
		  i->second.hb_back_min.push_back(back_min);
		  i->second.hb_back_max.push_back(back_max);
		  i->second.hb_front_pingtime.push_back(front_avg);
		  i->second.hb_front_min.push_back(front_min);
		  i->second.hb_front_max.push_back(front_max);
		  ++i->second.hb_index;
		}
	      } else {
		// overwrite the oldest slot (ring buffer, power-of-two size)
		int index = i->second.hb_index & (hb_vector_size - 1);
		i->second.hb_back_pingtime[index] = back_avg;
		i->second.hb_back_min[index] = back_min;
		i->second.hb_back_max[index] = back_max;
		i->second.hb_front_pingtime[index] = front_avg;
		i->second.hb_front_min[index] = front_min;
		i->second.hb_front_max[index] = front_max;
		++i->second.hb_index;
	      }

	      {
		// publish rolling 1/5/15-interval aggregates to osd_stat
		std::lock_guard l(service.stat_lock);
		service.osd_stat.hb_pingtime[from].last_update = now.sec();
		service.osd_stat.hb_pingtime[from].back_last = back_pingtime;

		uint32_t total = 0;
		uint32_t min = UINT_MAX;
		uint32_t max = 0;
		uint32_t count = 0;
		uint32_t which = 0;
		uint32_t size = (uint32_t)i->second.hb_back_pingtime.size();
		for (int32_t k = size - 1 ; k >= 0; --k) {
		  ++count;
		  int index = (i->second.hb_index + k) % size;
		  total += i->second.hb_back_pingtime[index];
		  if (i->second.hb_back_min[index] < min)
		    min = i->second.hb_back_min[index];
		  if (i->second.hb_back_max[index] > max)
		    max = i->second.hb_back_max[index];
		  if (count == 1 || count == 5 || count == 15) {
		    service.osd_stat.hb_pingtime[from].back_pingtime[which] = total / count;
		    service.osd_stat.hb_pingtime[from].back_min[which] = min;
		    service.osd_stat.hb_pingtime[from].back_max[which] = max;
		    which++;
		    if (count == 15)
		      break;
		  }
		}

		if (i->second.con_front != NULL) {
		  service.osd_stat.hb_pingtime[from].front_last = front_pingtime;

		  total = 0;
		  min = UINT_MAX;
		  max = 0;
		  count = 0;
		  which = 0;
		  for (int32_t k = size - 1 ; k >= 0; --k) {
		    ++count;
		    int index = (i->second.hb_index + k) % size;
		    total += i->second.hb_front_pingtime[index];
		    if (i->second.hb_front_min[index] < min)
		      min = i->second.hb_front_min[index];
		    if (i->second.hb_front_max[index] > max)
		      max = i->second.hb_front_max[index];
		    if (count == 1 || count == 5 || count == 15) {
		      service.osd_stat.hb_pingtime[from].front_pingtime[which] = total / count;
		      service.osd_stat.hb_pingtime[from].front_min[which] = min;
		      service.osd_stat.hb_pingtime[from].front_max[which] = max;
		      which++;
		      if (count == 15)
			break;
		    }
		  }
		}
	      }
	    } else {
	      // interval still open: only refresh the latest samples
	      std::lock_guard l(service.stat_lock);
	      service.osd_stat.hb_pingtime[from].back_last = back_pingtime;
	      if (i->second.con_front != NULL)
		service.osd_stat.hb_pingtime[from].front_last = front_pingtime;
	    }
	    // this ping and every older pending ping is now fully acked
	    i->second.ping_history.erase(i->second.ping_history.begin(), ++acked);
	  }

	  if (i->second.is_healthy(now)) {
	    // Cancel false reports
	    auto failure_queue_entry = failure_queue.find(from);
	    if (failure_queue_entry != failure_queue.end()) {
	      dout(10) << "handle_osd_ping canceling queued "
		       << "failure report for osd." << from << dendl;
	      failure_queue.erase(failure_queue_entry);
	    }

	    auto failure_pending_entry = failure_pending.find(from);
	    if (failure_pending_entry != failure_pending.end()) {
	      dout(10) << "handle_osd_ping canceling in-flight "
		       << "failure report for osd." << from << dendl;
	      send_still_alive(curmap->get_epoch(),
			       from,
			       failure_pending_entry->second.second);
	      failure_pending.erase(failure_pending_entry);
	    }
	  }
	} else {
	  // old replies, deprecated by newly sent pings.
	  dout(10) << "handle_osd_ping no pending ping(sent at " << m->stamp
		   << ") is found, treat as covered by newly sent pings "
		   << "and ignore"
		   << dendl;
	}
      }

      if (m->map_epoch &&
	  curmap->is_up(from)) {
	service.note_peer_epoch(from, m->map_epoch);
	if (is_active()) {
	  ConnectionRef con = service.get_con_osd_cluster(from, curmap->get_epoch());
	  if (con) {
	    service.share_map_peer(from, con.get());
	  }
	}
      }
    }
    break;

  case MOSDPing::YOU_DIED:
    dout(10) << "handle_osd_ping " << m->get_source_inst()
	     << " says i am down in " << m->map_epoch << dendl;
    osdmap_subscribe(curmap->get_epoch()+1, false);
    break;
  }

  heartbeat_lock.Unlock();
  m->put();
}
5480
// Body of the heartbeat thread: send heartbeats, then sleep a (by default
// randomized, to avoid cluster-wide synchronization) fraction of
// osd_heartbeat_interval, until asked to stop.  Runs under heartbeat_lock
// except while waiting on the condition variable.
void OSD::heartbeat_entry()
{
  std::lock_guard l(heartbeat_lock);
  if (is_stopping())
    return;
  while (!heartbeat_stop) {
    heartbeat();

    double wait;
    if (cct->_conf.get_val<bool>("debug_disable_randomized_ping")) {
      wait = (float)cct->_conf->osd_heartbeat_interval;
    } else {
      // jitter: 0.5s plus up to 90% of the configured interval
      wait = .5 + ((float)(rand() % 10)/10.0) * (float)cct->_conf->osd_heartbeat_interval;
    }
    utime_t w;
    w.set_from_double(wait);
    dout(30) << "heartbeat_entry sleeping for " << wait << dendl;
    heartbeat_cond.WaitInterval(heartbeat_lock, w);
    if (is_stopping())
      return;
    dout(30) << "heartbeat_entry woke up" << dendl;
  }
}
5504
// Scan heartbeat_peers for peers whose ping replies are overdue and
// queue them in failure_queue (later reported to the mon by
// send_failures()).  Caller must hold heartbeat_lock.
void OSD::heartbeat_check()
{
  ceph_assert(heartbeat_lock.is_locked());
  utime_t now = ceph_clock_now();

  // check for incoming heartbeats (move me elsewhere?)
  for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
       p != heartbeat_peers.end();
       ++p) {

    // never pinged this peer yet -- nothing can be overdue
    if (p->second.first_tx == utime_t()) {
      dout(25) << "heartbeat_check we haven't sent ping to osd." << p->first
	       << " yet, skipping" << dendl;
      continue;
    }

    dout(25) << "heartbeat_check osd." << p->first
	     << " first_tx " << p->second.first_tx
	     << " last_tx " << p->second.last_tx
	     << " last_rx_back " << p->second.last_rx_back
	     << " last_rx_front " << p->second.last_rx_front
	     << dendl;
    if (p->second.is_unhealthy(now)) {
      // ping_history is keyed by send time; the first entry carries the
      // oldest still-outstanding deadline
      utime_t oldest_deadline = p->second.ping_history.begin()->second.first;
      // NOTE(review): both log branches dereference con_front for the
      // peer address, while heartbeat() null-checks con_front before
      // sending -- confirm con_front cannot be null for an unhealthy
      // peer reaching this point.
      if (p->second.last_rx_back == utime_t() ||
	  p->second.last_rx_front == utime_t()) {
	// never heard back on at least one channel since the peer was added
	derr << "heartbeat_check: no reply from "
	     << p->second.con_front->get_peer_addr().get_sockaddr()
	     << " osd." << p->first
	     << " ever on either front or back, first ping sent "
	     << p->second.first_tx
	     << " (oldest deadline " << oldest_deadline << ")"
	     << dendl;
	// fail
	failure_queue[p->first] = p->second.first_tx;
      } else {
	// replies went silent after working earlier; report the older of
	// the two last-heard times as the failure timestamp
	derr << "heartbeat_check: no reply from "
	     << p->second.con_front->get_peer_addr().get_sockaddr()
	     << " osd." << p->first << " since back " << p->second.last_rx_back
	     << " front " << p->second.last_rx_front
	     << " (oldest deadline " << oldest_deadline << ")"
	     << dendl;
	// fail
	failure_queue[p->first] = std::min(p->second.last_rx_back, p->second.last_rx_front);
      }
    }
  }
}
5553
// One heartbeat round: refresh load/stat/fullness bookkeeping, then
// send a PING to every heartbeat peer on the back (and, when present,
// front) connection.  Caller must hold heartbeat_lock.
void OSD::heartbeat()
{
  ceph_assert(heartbeat_lock.is_locked_by_me());
  dout(30) << "heartbeat" << dendl;

  // get CPU load avg
  double loadavgs[1];
  // daily_loadavg is an exponential-style average over ~one day's worth
  // of heartbeat samples (86400s / interval)
  int hb_interval = cct->_conf->osd_heartbeat_interval;
  int n_samples = 86400;
  if (hb_interval > 1) {
    n_samples /= hb_interval;
    if (n_samples < 1)
      n_samples = 1;
  }

  if (getloadavg(loadavgs, 1) == 1) {
    logger->set(l_osd_loadavg, 100 * loadavgs[0]);
    daily_loadavg = (daily_loadavg * (n_samples - 1) + loadavgs[0]) / n_samples;
    dout(30) << "heartbeat: daily_loadavg " << daily_loadavg << dendl;
  }

  dout(30) << "heartbeat checking stats" << dendl;

  // refresh peer list and osd stats
  vector<int> hb_peers;
  for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
       p != heartbeat_peers.end();
       ++p)
    hb_peers.push_back(p->first);

  auto new_stat = service.set_osd_stat(hb_peers, get_num_pgs());
  dout(5) << __func__ << " " << new_stat << dendl;
  ceph_assert(new_stat.statfs.total);

  float pratio;
  float ratio = service.compute_adjusted_ratio(new_stat, &pratio);

  // may transition us into/out of nearfull/backfillfull/full states
  service.check_full_status(ratio, pratio);

  utime_t now = ceph_clock_now();
  // every ping sent this round shares one reply deadline
  utime_t deadline = now;
  deadline += cct->_conf->osd_heartbeat_grace;

  // send heartbeats
  for (map<int,HeartbeatInfo>::iterator i = heartbeat_peers.begin();
       i != heartbeat_peers.end();
       ++i) {
    int peer = i->first;
    i->second.last_tx = now;
    if (i->second.first_tx == utime_t())
      i->second.first_tx = now;
    // record the outstanding ping so heartbeat_check() can detect a
    // missed deadline; value counts how many connections must reply
    i->second.ping_history[now] = make_pair(deadline,
      HeartbeatInfo::HEARTBEAT_MAX_CONN);
    if (i->second.hb_interval_start == utime_t())
      i->second.hb_interval_start = now;
    dout(30) << "heartbeat sending ping to osd." << peer << dendl;
    i->second.con_back->send_message(new MOSDPing(monc->get_fsid(),
					  service.get_osdmap_epoch(),
					  MOSDPing::PING, now,
					  cct->_conf->osd_heartbeat_min_size));

    // front connection is optional (may share the back network)
    if (i->second.con_front)
      i->second.con_front->send_message(new MOSDPing(monc->get_fsid(),
					     service.get_osdmap_epoch(),
					     MOSDPing::PING, now,
					     cct->_conf->osd_heartbeat_min_size));
  }

  logger->set(l_osd_hb_to, heartbeat_peers.size());

  // hmm.. am i all alone?
  dout(30) << "heartbeat lonely?" << dendl;
  if (heartbeat_peers.empty()) {
    // with no peers we cannot detect staleness; periodically poll the
    // mon for a newer map instead
    if (now - last_mon_heartbeat > cct->_conf->osd_mon_heartbeat_interval && is_active()) {
      last_mon_heartbeat = now;
      dout(10) << "i have no heartbeat peers; checking mon for new map" << dendl;
      osdmap_subscribe(osdmap->get_epoch() + 1, false);
    }
  }

  dout(30) << "heartbeat done" << dendl;
}
5636
// Messenger callback: a heartbeat connection dropped.  If it belongs to
// a live heartbeat peer, tear down both of that peer's connections and
// try to reopen them; if the peer vanished from the osdmap meanwhile,
// forget it entirely.  Always returns true (connection handled).
bool OSD::heartbeat_reset(Connection *con)
{
  std::lock_guard l(heartbeat_lock);
  auto s = con->get_priv();
  // detach the session from the dead connection regardless of outcome
  con->set_priv(nullptr);
  if (s) {
    if (is_stopping()) {
      return true;
    }
    auto heartbeat_session = static_cast<HeartbeatSession*>(s.get());
    auto p = heartbeat_peers.find(heartbeat_session->peer);
    // only act if this con is still one of the peer's current cons;
    // otherwise it is a stale connection from an earlier epoch
    if (p != heartbeat_peers.end() &&
	(p->second.con_back == con ||
	 p->second.con_front == con)) {
      dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer
	       << ", reopening" << dendl;
      // mark down the sibling connection too so both are replaced as a pair
      if (con != p->second.con_back) {
	p->second.con_back->mark_down();
      }
      p->second.con_back.reset(NULL);
      if (p->second.con_front && con != p->second.con_front) {
	p->second.con_front->mark_down();
      }
      p->second.con_front.reset(NULL);
      pair<ConnectionRef,ConnectionRef> newcon = service.get_con_osd_hb(p->second.peer, p->second.epoch);
      if (newcon.first) {
	p->second.con_back = newcon.first.get();
	// reuse the same HeartbeatSession on the fresh connections
	p->second.con_back->set_priv(s);
	if (newcon.second) {
	  p->second.con_front = newcon.second.get();
	  p->second.con_front->set_priv(s);
	}
	// outstanding pings were sent on the dead connections; drop them
	p->second.ping_history.clear();
      } else {
	dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer
		 << ", raced with osdmap update, closing out peer" << dendl;
	heartbeat_peers.erase(p);
      }
    } else {
      dout(10) << "heartbeat_reset closing (old) failed hb con " << con << dendl;
    }
  }
  return true;
}
5681
5682
5683
5684// =========================================
5685
// Periodic timer callback that runs while holding osd_lock: refresh the
// heartbeat peer set, retry boot while waiting-for-healthy, poll the mon
// for new maps during startup, drain deferred waiters, and reschedule
// itself.
void OSD::tick()
{
  ceph_assert(osd_lock.is_locked());
  dout(10) << "tick" << dendl;

  if (is_active() || is_waiting_for_healthy()) {
    maybe_update_heartbeat_peers();
  }

  if (is_waiting_for_healthy()) {
    // re-evaluate health and attempt to boot again
    start_boot();
  }

  if (is_waiting_for_healthy() || is_booting()) {
    std::lock_guard l(heartbeat_lock);
    utime_t now = ceph_clock_now();
    // while not yet up, poll the mon periodically in case our peers'
    // status changed in a map we have not seen
    if (now - last_mon_heartbeat > cct->_conf->osd_mon_heartbeat_interval) {
      last_mon_heartbeat = now;
      dout(1) << __func__ << " checking mon for new map" << dendl;
      osdmap_subscribe(osdmap->get_epoch() + 1, false);
    }
  }

  do_waiters();

  // re-arm the timer for the next tick
  tick_timer.add_event_after(get_tick_interval(), new C_Tick(this));
}
5713
// Periodic timer callback that deliberately does NOT hold osd_lock, so
// it can do mon reporting, heartbeat checking, scrub scheduling and
// beacon sending without blocking the main OSD lock.  Runs under
// tick_timer_lock and reschedules itself at the end.
void OSD::tick_without_osd_lock()
{
  ceph_assert(tick_timer_lock.is_locked());
  dout(10) << "tick_without_osd_lock" << dendl;

  // export buffer-crc cache counters
  logger->set(l_osd_cached_crc, buffer::get_cached_crc());
  logger->set(l_osd_cached_crc_adjusted, buffer::get_cached_crc_adjusted());
  logger->set(l_osd_missed_crc, buffer::get_missed_crc());

  // refresh osd stats
  struct store_statfs_t stbuf;
  osd_alert_list_t alerts;
  int r = store->statfs(&stbuf, &alerts);
  ceph_assert(r == 0);
  service.set_statfs(stbuf, alerts);

  // osd_lock is not being held, which means the OSD state
  // might change when doing the monitor report
  if (is_active() || is_waiting_for_healthy()) {
    heartbeat_lock.Lock();
    heartbeat_check();
    heartbeat_lock.Unlock();

    // lock order: map_lock (read), then mon_report_lock
    map_lock.get_read();
    std::lock_guard l(mon_report_lock);

    // mon report?
    utime_t now = ceph_clock_now();
    if (service.need_fullness_update() ||
	now - last_mon_report > cct->_conf->osd_mon_report_interval) {
      last_mon_report = now;
      send_full_update();
      send_failures();
    }
    map_lock.put_read();

    // if any shard has work queued for a future epoch, ask for new maps
    epoch_t max_waiting_epoch = 0;
    for (auto s : shards) {
      max_waiting_epoch = std::max(max_waiting_epoch,
				   s->get_max_waiting_epoch());
    }
    if (max_waiting_epoch > get_osdmap()->get_epoch()) {
      dout(20) << __func__ << " max_waiting_epoch " << max_waiting_epoch
	       << ", requesting new map" << dendl;
      osdmap_subscribe(superblock.newest_map + 1, false);
    }
  }

  if (is_active()) {
    if (!scrub_random_backoff()) {
      sched_scrub();
    }
    service.promote_throttle_recalibrate();
    resume_creating_pg();

    // decide whether a beacon is due; only read last_sent_beacon under
    // the (borrowed) min_last_epoch_clean_lock, then send outside it
    bool need_send_beacon = false;
    const auto now = ceph::coarse_mono_clock::now();
    {
      // borrow lec lock to protect last_sent_beacon from changing
      std::lock_guard l{min_last_epoch_clean_lock};
      const auto elapsed = now - last_sent_beacon;
      if (chrono::duration_cast<chrono::seconds>(elapsed).count() >
	  cct->_conf->osd_beacon_report_interval) {
	need_send_beacon = true;
      }
    }
    if (need_send_beacon) {
      send_beacon(now);
    }
  }

  mgrc.update_daemon_health(get_health_metrics());
  service.kick_recovery_queue();
  // re-arm the timer for the next tick
  tick_timer_without_osd_lock.add_event_after(get_tick_interval(),
					      new C_Tick_WithoutOSDLock(this));
}
5789
7c673cae
FG
5790// Usage:
5791// setomapval <pool-id> [namespace/]<obj-name> <key> <val>
5792// rmomapkey <pool-id> [namespace/]<obj-name> <key>
5793// setomapheader <pool-id> [namespace/]<obj-name> <header>
5794// getomap <pool> [namespace/]<obj-name>
5795// truncobj <pool-id> [namespace/]<obj-name> <newlen>
5796// injectmdataerr [namespace/]<obj-name> [shardid]
5797// injectdataerr [namespace/]<obj-name> [shardid]
5798//
5799// set_recovery_delay [utime]
5800void TestOpsSocketHook::test_ops(OSDService *service, ObjectStore *store,
11fdf7f2
TL
5801 std::string_view command,
5802 const cmdmap_t& cmdmap, ostream &ss)
7c673cae
FG
5803{
5804 //Test support
5805 //Support changing the omap on a single osd by using the Admin Socket to
5806 //directly request the osd make a change.
5807 if (command == "setomapval" || command == "rmomapkey" ||
5808 command == "setomapheader" || command == "getomap" ||
5809 command == "truncobj" || command == "injectmdataerr" ||
5810 command == "injectdataerr"
5811 ) {
5812 pg_t rawpg;
5813 int64_t pool;
5814 OSDMapRef curmap = service->get_osdmap();
5815 int r = -1;
5816
5817 string poolstr;
5818
5819 cmd_getval(service->cct, cmdmap, "pool", poolstr);
5820 pool = curmap->lookup_pg_pool_name(poolstr);
5821 //If we can't find it by name then maybe id specified
5822 if (pool < 0 && isdigit(poolstr[0]))
5823 pool = atoll(poolstr.c_str());
5824 if (pool < 0) {
b5b8bbf5 5825 ss << "Invalid pool '" << poolstr << "''";
7c673cae
FG
5826 return;
5827 }
5828
5829 string objname, nspace;
5830 cmd_getval(service->cct, cmdmap, "objname", objname);
5831 std::size_t found = objname.find_first_of('/');
5832 if (found != string::npos) {
5833 nspace = objname.substr(0, found);
5834 objname = objname.substr(found+1);
5835 }
5836 object_locator_t oloc(pool, nspace);
5837 r = curmap->object_locator_to_pg(object_t(objname), oloc, rawpg);
5838
5839 if (r < 0) {
5840 ss << "Invalid namespace/objname";
5841 return;
5842 }
5843
5844 int64_t shardid;
5845 cmd_getval(service->cct, cmdmap, "shardid", shardid, int64_t(shard_id_t::NO_SHARD));
5846 hobject_t obj(object_t(objname), string(""), CEPH_NOSNAP, rawpg.ps(), pool, nspace);
5847 ghobject_t gobj(obj, ghobject_t::NO_GEN, shard_id_t(uint8_t(shardid)));
5848 spg_t pgid(curmap->raw_pg_to_pg(rawpg), shard_id_t(shardid));
5849 if (curmap->pg_is_ec(rawpg)) {
5850 if ((command != "injectdataerr") && (command != "injectmdataerr")) {
5851 ss << "Must not call on ec pool, except injectdataerr or injectmdataerr";
5852 return;
5853 }
5854 }
5855
5856 ObjectStore::Transaction t;
5857
5858 if (command == "setomapval") {
5859 map<string, bufferlist> newattrs;
5860 bufferlist val;
5861 string key, valstr;
5862 cmd_getval(service->cct, cmdmap, "key", key);
5863 cmd_getval(service->cct, cmdmap, "val", valstr);
5864
5865 val.append(valstr);
5866 newattrs[key] = val;
5867 t.omap_setkeys(coll_t(pgid), ghobject_t(obj), newattrs);
11fdf7f2 5868 r = store->queue_transaction(service->meta_ch, std::move(t));
7c673cae
FG
5869 if (r < 0)
5870 ss << "error=" << r;
5871 else
5872 ss << "ok";
5873 } else if (command == "rmomapkey") {
5874 string key;
5875 set<string> keys;
5876 cmd_getval(service->cct, cmdmap, "key", key);
5877
5878 keys.insert(key);
5879 t.omap_rmkeys(coll_t(pgid), ghobject_t(obj), keys);
11fdf7f2 5880 r = store->queue_transaction(service->meta_ch, std::move(t));
7c673cae
FG
5881 if (r < 0)
5882 ss << "error=" << r;
5883 else
5884 ss << "ok";
5885 } else if (command == "setomapheader") {
5886 bufferlist newheader;
5887 string headerstr;
5888
5889 cmd_getval(service->cct, cmdmap, "header", headerstr);
5890 newheader.append(headerstr);
5891 t.omap_setheader(coll_t(pgid), ghobject_t(obj), newheader);
11fdf7f2 5892 r = store->queue_transaction(service->meta_ch, std::move(t));
7c673cae
FG
5893 if (r < 0)
5894 ss << "error=" << r;
5895 else
5896 ss << "ok";
5897 } else if (command == "getomap") {
5898 //Debug: Output entire omap
5899 bufferlist hdrbl;
5900 map<string, bufferlist> keyvals;
11fdf7f2
TL
5901 auto ch = store->open_collection(coll_t(pgid));
5902 if (!ch) {
5903 ss << "unable to open collection for " << pgid;
5904 r = -ENOENT;
5905 } else {
5906 r = store->omap_get(ch, ghobject_t(obj), &hdrbl, &keyvals);
5907 if (r >= 0) {
7c673cae
FG
5908 ss << "header=" << string(hdrbl.c_str(), hdrbl.length());
5909 for (map<string, bufferlist>::iterator it = keyvals.begin();
11fdf7f2 5910 it != keyvals.end(); ++it)
7c673cae
FG
5911 ss << " key=" << (*it).first << " val="
5912 << string((*it).second.c_str(), (*it).second.length());
11fdf7f2 5913 } else {
7c673cae 5914 ss << "error=" << r;
11fdf7f2 5915 }
7c673cae
FG
5916 }
5917 } else if (command == "truncobj") {
5918 int64_t trunclen;
5919 cmd_getval(service->cct, cmdmap, "len", trunclen);
5920 t.truncate(coll_t(pgid), ghobject_t(obj), trunclen);
11fdf7f2 5921 r = store->queue_transaction(service->meta_ch, std::move(t));
7c673cae
FG
5922 if (r < 0)
5923 ss << "error=" << r;
5924 else
5925 ss << "ok";
5926 } else if (command == "injectdataerr") {
5927 store->inject_data_error(gobj);
5928 ss << "ok";
5929 } else if (command == "injectmdataerr") {
5930 store->inject_mdata_error(gobj);
5931 ss << "ok";
5932 }
5933 return;
5934 }
5935 if (command == "set_recovery_delay") {
5936 int64_t delay;
5937 cmd_getval(service->cct, cmdmap, "utime", delay, (int64_t)0);
5938 ostringstream oss;
5939 oss << delay;
11fdf7f2 5940 int r = service->cct->_conf.set_val("osd_recovery_delay_start",
7c673cae
FG
5941 oss.str().c_str());
5942 if (r != 0) {
5943 ss << "set_recovery_delay: error setting "
5944 << "osd_recovery_delay_start to '" << delay << "': error "
5945 << r;
5946 return;
5947 }
11fdf7f2 5948 service->cct->_conf.apply_changes(nullptr);
7c673cae
FG
5949 ss << "set_recovery_delay: set osd_recovery_delay_start "
5950 << "to " << service->cct->_conf->osd_recovery_delay_start;
5951 return;
5952 }
a8e16298 5953 if (command == "trigger_scrub" || command == "trigger_deep_scrub") {
7c673cae 5954 spg_t pgid;
a8e16298 5955 bool deep = (command == "trigger_deep_scrub");
7c673cae
FG
5956 OSDMapRef curmap = service->get_osdmap();
5957
5958 string pgidstr;
5959
5960 cmd_getval(service->cct, cmdmap, "pgid", pgidstr);
5961 if (!pgid.parse(pgidstr.c_str())) {
5962 ss << "Invalid pgid specified";
5963 return;
5964 }
5965
a8e16298
TL
5966 int64_t time;
5967 cmd_getval(service->cct, cmdmap, "time", time, (int64_t)0);
5968
11fdf7f2 5969 PGRef pg = service->osd->_lookup_lock_pg(pgid);
7c673cae
FG
5970 if (pg == nullptr) {
5971 ss << "Can't find pg " << pgid;
5972 return;
5973 }
5974
5975 if (pg->is_primary()) {
5976 pg->unreg_next_scrub();
5977 const pg_pool_t *p = curmap->get_pg_pool(pgid.pool());
5978 double pool_scrub_max_interval = 0;
a8e16298
TL
5979 double scrub_max_interval;
5980 if (deep) {
5981 p->opts.get(pool_opts_t::DEEP_SCRUB_INTERVAL, &pool_scrub_max_interval);
5982 scrub_max_interval = pool_scrub_max_interval > 0 ?
11fdf7f2 5983 pool_scrub_max_interval : g_conf()->osd_deep_scrub_interval;
a8e16298
TL
5984 } else {
5985 p->opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &pool_scrub_max_interval);
5986 scrub_max_interval = pool_scrub_max_interval > 0 ?
11fdf7f2 5987 pool_scrub_max_interval : g_conf()->osd_scrub_max_interval;
a8e16298 5988 }
7c673cae
FG
5989 // Instead of marking must_scrub force a schedule scrub
5990 utime_t stamp = ceph_clock_now();
a8e16298
TL
5991 if (time == 0)
5992 stamp -= scrub_max_interval;
5993 else
5994 stamp -= (float)time;
5995 stamp -= 100.0; // push back last scrub more for good measure
5996 if (deep) {
5997 pg->set_last_deep_scrub_stamp(stamp);
5998 } else {
5999 pg->set_last_scrub_stamp(stamp);
6000 }
7c673cae 6001 pg->reg_next_scrub();
a8e16298
TL
6002 pg->publish_stats_to_osd();
6003 ss << "ok - set" << (deep ? " deep" : "" ) << " stamp " << stamp;
7c673cae
FG
6004 } else {
6005 ss << "Not primary";
6006 }
6007 pg->unlock();
6008 return;
6009 }
6010 if (command == "injectfull") {
6011 int64_t count;
6012 string type;
6013 OSDService::s_names state;
6014 cmd_getval(service->cct, cmdmap, "type", type, string("full"));
6015 cmd_getval(service->cct, cmdmap, "count", count, (int64_t)-1);
6016 if (type == "none" || count == 0) {
6017 type = "none";
6018 count = 0;
6019 }
6020 state = service->get_full_state(type);
6021 if (state == OSDService::s_names::INVALID) {
6022 ss << "Invalid type use (none, nearfull, backfillfull, full, failsafe)";
6023 return;
6024 }
6025 service->set_injectfull(state, count);
6026 return;
6027 }
6028 ss << "Internal error - command=" << command;
6029}
6030
7c673cae
FG
6031// =========================================
6032
// Messenger callback fired when an outgoing connection is (re)opened.
// We only care about the mon session: depending on boot state, either
// continue booting or treat the fresh session as a new one and resend
// all pending mon-bound state.
void OSD::ms_handle_connect(Connection *con)
{
  dout(10) << __func__ << " con " << con << dendl;
  if (con->get_peer_type() == CEPH_ENTITY_TYPE_MON) {
    std::lock_guard l(osd_lock);
    if (is_stopping())
      return;
    dout(10) << __func__ << " on mon" << dendl;

    if (is_preboot()) {
      start_boot();
    } else if (is_booting()) {
      _send_boot(); // resend boot message
    } else {
      // lock order: map_lock (read), then mon_report_lock
      map_lock.get_read();
      std::lock_guard l2(mon_report_lock);

      utime_t now = ceph_clock_now();
      last_mon_report = now;

      // resend everything, it's a new session
      send_full_update();
      send_alive();
      service.requeue_pg_temp();
      service.clear_sent_ready_to_merge();
      service.send_pg_temp();
      service.send_ready_to_merge();
      service.send_pg_created();
      requeue_failures();
      send_failures();

      map_lock.put_read();
      if (is_active()) {
	send_beacon(ceph::coarse_mono_clock::now());
      }
    }

    // full map requests may happen while active or pre-boot
    if (requested_full_first) {
      rerequest_full_maps();
    }
  }
}
6076
6077void OSD::ms_handle_fast_connect(Connection *con)
6078{
6079 if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON &&
6080 con->get_peer_type() != CEPH_ENTITY_TYPE_MGR) {
11fdf7f2
TL
6081 auto priv = con->get_priv();
6082 auto s = static_cast<Session*>(priv.get());
7c673cae 6083 if (!s) {
11fdf7f2
TL
6084 s = new Session{cct, con};
6085 con->set_priv(RefCountedPtr{s, false});
7c673cae
FG
6086 dout(10) << " new session (outgoing) " << s << " con=" << s->con
6087 << " addr=" << s->con->get_peer_addr() << dendl;
6088 // we don't connect to clients
11fdf7f2 6089 ceph_assert(con->get_peer_type() == CEPH_ENTITY_TYPE_OSD);
7c673cae
FG
6090 s->entity_name.set_type(CEPH_ENTITY_TYPE_OSD);
6091 }
7c673cae
FG
6092 }
6093}
6094
6095void OSD::ms_handle_fast_accept(Connection *con)
6096{
6097 if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON &&
6098 con->get_peer_type() != CEPH_ENTITY_TYPE_MGR) {
11fdf7f2
TL
6099 auto priv = con->get_priv();
6100 auto s = static_cast<Session*>(priv.get());
7c673cae 6101 if (!s) {
11fdf7f2
TL
6102 s = new Session{cct, con};
6103 con->set_priv(RefCountedPtr{s, false});
7c673cae
FG
6104 dout(10) << "new session (incoming)" << s << " con=" << con
6105 << " addr=" << con->get_peer_addr()
6106 << " must have raced with connect" << dendl;
11fdf7f2 6107 ceph_assert(con->get_peer_type() == CEPH_ENTITY_TYPE_OSD);
7c673cae
FG
6108 s->entity_name.set_type(CEPH_ENTITY_TYPE_OSD);
6109 }
7c673cae
FG
6110 }
6111}
6112
// Messenger callback: a connection with an attached Session dropped.
// Reset the session's watch state and break the con<->session reference
// cycle, then run session-reset cleanup.  Returns true if we owned a
// session on this connection.
bool OSD::ms_handle_reset(Connection *con)
{
  auto s = con->get_priv();
  auto session = static_cast<Session*>(s.get());
  dout(2) << "ms_handle_reset con " << con << " session " << session << dendl;
  if (!session)
    return false;
  session->wstate.reset(con);
  session->con->set_priv(nullptr);
  session->con.reset();  // break con <-> session ref cycle
  // note that we break session->con *before* the session_handle_reset
  // cleanup below. this avoids a race between us and
  // PG::add_backoff, Session::check_backoff, etc.
  session_handle_reset(SessionRef{session});
  return true;
}
6129
// Messenger callback: a connection attempt was actively refused.  With
// osd_fast_fail_on_connection_refused enabled, a refused OSD peer that
// the map still says is up gets reported to the mon immediately instead
// of waiting out the heartbeat grace period.
bool OSD::ms_handle_refused(Connection *con)
{
  if (!cct->_conf->osd_fast_fail_on_connection_refused)
    return false;

  auto priv = con->get_priv();
  auto session = static_cast<Session*>(priv.get());
  dout(2) << "ms_handle_refused con " << con << " session " << session << dendl;
  if (!session)
    return false;
  int type = con->get_peer_type();
  // handle only OSD failures here
  if (monc && (type == CEPH_ENTITY_TYPE_OSD)) {
    OSDMapRef osdmap = get_osdmap();
    if (osdmap) {
      int id = osdmap->identify_osd_on_all_channels(con->get_peer_addr());
      if (id >= 0 && osdmap->is_up(id)) {
	// I'm cheating mon heartbeat grace logic, because we know it's not going
	// to respawn alone. +1 so we won't hit any boundary case.
	monc->send_mon_message(
	  new MOSDFailure(
	    monc->get_fsid(),
	    id,
	    osdmap->get_addrs(id),
	    cct->_conf->osd_heartbeat_grace + 1,
	    osdmap->get_epoch(),
	    MOSDFailure::FLAG_IMMEDIATE | MOSDFailure::FLAG_FAILED
	    ));
      }
    }
  }
  return true;
}
6163
6164struct C_OSD_GetVersion : public Context {
6165 OSD *osd;
6166 uint64_t oldest, newest;
6167 explicit C_OSD_GetVersion(OSD *o) : osd(o), oldest(0), newest(0) {}
6168 void finish(int r) override {
6169 if (r >= 0)
6170 osd->_got_mon_epochs(oldest, newest);
6171 }
6172};
6173
// Begin the boot sequence: if we are not healthy enough yet, park in
// the waiting-for-healthy state and ping peers aggressively; otherwise
// enter PREBOOT and ask the mon for its osdmap epoch range (the reply
// continues in _got_mon_epochs -> _preboot).
void OSD::start_boot()
{
  if (!_is_healthy()) {
    // if we are not healthy, do not mark ourselves up (yet)
    dout(1) << "not healthy; waiting to boot" << dendl;
    if (!is_waiting_for_healthy())
      start_waiting_for_healthy();
    // send pings sooner rather than later
    heartbeat_kick();
    return;
  }
  dout(1) << __func__ << dendl;
  set_state(STATE_PREBOOT);
  dout(10) << "start_boot - have maps " << superblock.oldest_map
	   << ".." << superblock.newest_map << dendl;
  // callback self-deletes; fills c->oldest / c->newest on reply
  C_OSD_GetVersion *c = new C_OSD_GetVersion(this);
  monc->get_version("osdmap", &c->newest, &c->oldest, c);
}
6192
6193void OSD::_got_mon_epochs(epoch_t oldest, epoch_t newest)
6194{
11fdf7f2 6195 std::lock_guard l(osd_lock);
7c673cae
FG
6196 if (is_preboot()) {
6197 _preboot(oldest, newest);
6198 }
6199}
6200
// Continue booting once we know the mon's osdmap epoch span
// [oldest, newest].  Either (a) something blocks us from marking
// ourselves up (no map yet, destroyed, NOUP, old flags/release,
// fullness out of date) and we log why, or (b) our map is recent enough
// and we queue the actual boot message, or (c) we subscribe for more
// maps to catch up.
void OSD::_preboot(epoch_t oldest, epoch_t newest)
{
  ceph_assert(is_preboot());
  dout(10) << __func__ << " _preboot mon has osdmaps "
	   << oldest << ".." << newest << dendl;

  // ensure our local fullness awareness is accurate
  {
    std::lock_guard l(heartbeat_lock);
    heartbeat();
  }

  // if our map within recent history, try to add ourselves to the osdmap.
  if (osdmap->get_epoch() == 0) {
    derr << "waiting for initial osdmap" << dendl;
  } else if (osdmap->is_destroyed(whoami)) {
    derr << "osdmap says I am destroyed" << dendl;
    // provide a small margin so we don't livelock seeing if we
    // un-destroyed ourselves.
    if (osdmap->get_epoch() > newest - 1) {
      exit(0);
    }
  } else if (osdmap->is_noup(whoami)) {
    derr << "osdmap NOUP flag is set, waiting for it to clear" << dendl;
  } else if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE)) {
    derr << "osdmap SORTBITWISE OSDMap flag is NOT set; please set it"
	 << dendl;
  } else if (osdmap->require_osd_release < CEPH_RELEASE_LUMINOUS) {
    derr << "osdmap require_osd_release < luminous; please upgrade to luminous"
	 << dendl;
  } else if (service.need_fullness_update()) {
    derr << "osdmap fullness state needs update" << dendl;
    send_full_update();
  } else if (osdmap->get_epoch() >= oldest - 1 &&
	     osdmap->get_epoch() + cct->_conf->osd_map_message_max > newest) {

    // wait for pgs to fully catch up in a different thread, since
    // this thread might be required for splitting and merging PGs to
    // make progress.
    boot_finisher.queue(
      new FunctionContext(
	[this](int r) {
	  std::lock_guard l(osd_lock);
	  if (is_preboot()) {
	    dout(10) << __func__ << " waiting for peering work to drain"
		     << dendl;
	    // drop osd_lock while waiting so peering threads can run
	    osd_lock.Unlock();
	    for (auto shard : shards) {
	      shard->wait_min_pg_epoch(osdmap->get_epoch());
	    }
	    osd_lock.Lock();
	  }
	  // re-check: state may have changed while the lock was dropped
	  if (is_preboot()) {
	    _send_boot();
	  }
	}));
    return;
  }

  // get all the latest maps
  if (osdmap->get_epoch() + 1 >= oldest)
    osdmap_subscribe(osdmap->get_epoch() + 1, false);
  else
    osdmap_subscribe(oldest - 1, true);
}
6266
6267void OSD::send_full_update()
6268{
6269 if (!service.need_fullness_update())
6270 return;
6271 unsigned state = 0;
6272 if (service.is_full()) {
6273 state = CEPH_OSD_FULL;
6274 } else if (service.is_backfillfull()) {
6275 state = CEPH_OSD_BACKFILLFULL;
6276 } else if (service.is_nearfull()) {
6277 state = CEPH_OSD_NEARFULL;
6278 }
6279 set<string> s;
6280 OSDMap::calc_state_set(state, s);
6281 dout(10) << __func__ << " want state " << s << dendl;
6282 monc->send_mon_message(new MOSDFull(osdmap->get_epoch(), state));
6283}
6284
// Enter the waiting-for-healthy state: reset the heartbeat resample
// timer and subscribe for newer osdmaps so we notice if our "down"
// peers really are dead (and we can boot anyway).
void OSD::start_waiting_for_healthy()
{
  dout(1) << "start_waiting_for_healthy" << dendl;
  set_state(STATE_WAITING_FOR_HEALTHY);
  last_heartbeat_resample = utime_t();

  // subscribe to osdmap updates, in case our peers really are known to be dead
  osdmap_subscribe(osdmap->get_epoch() + 1, false);
}
6294
6295bool OSD::_is_healthy()
6296{
6297 if (!cct->get_heartbeat_map()->is_healthy()) {
6298 dout(1) << "is_healthy false -- internal heartbeat failed" << dendl;
6299 return false;
6300 }
6301
6302 if (is_waiting_for_healthy()) {
11fdf7f2
TL
6303 utime_t now = ceph_clock_now();
6304 utime_t grace = utime_t(cct->_conf->osd_max_markdown_period, 0);
6305 while (!osd_markdown_log.empty() &&
6306 osd_markdown_log.front() + grace < now)
6307 osd_markdown_log.pop_front();
6308 if (osd_markdown_log.size() <= 1) {
6309 dout(5) << __func__ << " first time marked as down,"
6310 << " try reboot unconditionally" << dendl;
6311 return true;
6312 }
6313 std::lock_guard l(heartbeat_lock);
7c673cae
FG
6314 int num = 0, up = 0;
6315 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
6316 p != heartbeat_peers.end();
6317 ++p) {
11fdf7f2 6318 if (p->second.is_healthy(now))
7c673cae
FG
6319 ++up;
6320 ++num;
6321 }
6322 if ((float)up < (float)num * cct->_conf->osd_heartbeat_min_healthy_ratio) {
6323 dout(1) << "is_healthy false -- only " << up << "/" << num << " up peers (less than "
6324 << int(cct->_conf->osd_heartbeat_min_healthy_ratio * 100.0) << "%)" << dendl;
6325 return false;
6326 }
6327 }
6328
6329 return true;
6330}
6331
// Assemble and send the MOSDBoot message to the mon.  First resolve any
// still-unknown bind addresses: the cluster messenger borrows the
// client addrs, and each heartbeat messenger borrows the cluster/client
// addrs, in that order (each step may refine the addrs used by the
// next).  Loopback connections get sessions attached so self-pings work.
void OSD::_send_boot()
{
  dout(10) << "_send_boot" << dendl;
  Connection *local_connection =
    cluster_messenger->get_loopback_connection().get();
  entity_addrvec_t client_addrs = client_messenger->get_myaddrs();
  entity_addrvec_t cluster_addrs = cluster_messenger->get_myaddrs();
  entity_addrvec_t hb_back_addrs = hb_back_server_messenger->get_myaddrs();
  entity_addrvec_t hb_front_addrs = hb_front_server_messenger->get_myaddrs();

  dout(20) << " initial client_addrs " << client_addrs
	   << ", cluster_addrs " << cluster_addrs
	   << ", hb_back_addrs " << hb_back_addrs
	   << ", hb_front_addrs " << hb_front_addrs
	   << dendl;
  if (cluster_messenger->set_addr_unknowns(client_addrs)) {
    dout(10) << " assuming cluster_addrs match client_addrs "
	     << client_addrs << dendl;
    cluster_addrs = cluster_messenger->get_myaddrs();
  }
  // ensure the loopback connection has a session (fast_connect path)
  if (auto session = local_connection->get_priv(); !session) {
    cluster_messenger->ms_deliver_handle_fast_connect(local_connection);
  }

  local_connection = hb_back_server_messenger->get_loopback_connection().get();
  if (hb_back_server_messenger->set_addr_unknowns(cluster_addrs)) {
    dout(10) << " assuming hb_back_addrs match cluster_addrs "
	     << cluster_addrs << dendl;
    hb_back_addrs = hb_back_server_messenger->get_myaddrs();
  }
  if (auto session = local_connection->get_priv(); !session) {
    hb_back_server_messenger->ms_deliver_handle_fast_connect(local_connection);
  }

  local_connection = hb_front_server_messenger->get_loopback_connection().get();
  if (hb_front_server_messenger->set_addr_unknowns(client_addrs)) {
    dout(10) << " assuming hb_front_addrs match client_addrs "
	     << client_addrs << dendl;
    hb_front_addrs = hb_front_server_messenger->get_myaddrs();
  }
  if (auto session = local_connection->get_priv(); !session) {
    hb_front_server_messenger->ms_deliver_handle_fast_connect(local_connection);
  }

  // we now know what our front and back addrs will be, and we are
  // about to tell the mon what our metadata (including numa bindings)
  // are, so now is a good time!
  set_numa_affinity();

  MOSDBoot *mboot = new MOSDBoot(
    superblock, get_osdmap_epoch(), service.get_boot_epoch(),
    hb_back_addrs, hb_front_addrs, cluster_addrs,
    CEPH_FEATURES_ALL);
  dout(10) << " final client_addrs " << client_addrs
	   << ", cluster_addrs " << cluster_addrs
	   << ", hb_back_addrs " << hb_back_addrs
	   << ", hb_front_addrs " << hb_front_addrs
	   << dendl;
  _collect_metadata(&mboot->metadata);
  monc->send_mon_message(mboot);
  set_state(STATE_BOOTING);
}
6394
// Populate *pm with this OSD's metadata key/value pairs (paths, addresses,
// objectstore/backend properties, NUMA placement, and device ids).  The
// result is what gets reported to the mon in MOSDBoot::metadata.
void OSD::_collect_metadata(map<string,string> *pm)
{
  // config info
  (*pm)["osd_data"] = dev_path;
  if (store->get_type() == "filestore") {
    // not applicable for bluestore
    (*pm)["osd_journal"] = journal_path;
  }
  // the four messenger endpoints this OSD is reachable on
  (*pm)["front_addr"] = stringify(client_messenger->get_myaddrs());
  (*pm)["back_addr"] = stringify(cluster_messenger->get_myaddrs());
  (*pm)["hb_front_addr"] = stringify(hb_front_server_messenger->get_myaddrs());
  (*pm)["hb_back_addr"] = stringify(hb_back_server_messenger->get_myaddrs());

  // backend
  (*pm)["osd_objectstore"] = store->get_type();
  (*pm)["rotational"] = store_is_rotational ? "1" : "0";
  (*pm)["journal_rotational"] = journal_is_rotational ? "1" : "0";
  (*pm)["default_device_class"] = store->get_default_device_class();
  store->collect_metadata(pm);

  collect_sys_info(pm, cct);

  // network interfaces backing the front (client) and back (cluster) addrs
  (*pm)["front_iface"] = pick_iface(
    cct,
    client_messenger->get_myaddrs().front().get_sockaddr_storage());
  (*pm)["back_iface"] = pick_iface(
    cct,
    cluster_messenger->get_myaddrs().front().get_sockaddr_storage());

  // network numa
  {
    // "network_numa_node" is only reported when both ifaces resolve to the
    // same single NUMA node; otherwise we report what we know.
    int node = -1;
    set<int> nodes;
    set<string> unknown;
    for (auto nm : { "front_iface", "back_iface" }) {
      if (!(*pm)[nm].size()) {
        unknown.insert(nm);
        continue;
      }
      int n = -1;
      int r = get_iface_numa_node((*pm)[nm], &n);
      if (r < 0) {
        unknown.insert((*pm)[nm]);
        continue;
      }
      nodes.insert(n);
      if (node < 0) {
        node = n;
      }
    }
    if (unknown.size()) {
      (*pm)["network_numa_unknown_ifaces"] = stringify(unknown);
    }
    if (!nodes.empty()) {
      (*pm)["network_numa_nodes"] = stringify(nodes);
    }
    if (node >= 0 && nodes.size() == 1 && unknown.empty()) {
      (*pm)["network_numa_node"] = stringify(node);
    }
  }

  // CPU numa binding (set earlier by set_numa_affinity)
  if (numa_node >= 0) {
    (*pm)["numa_node"] = stringify(numa_node);
    (*pm)["numa_node_cpus"] = cpu_set_to_str_list(numa_cpu_set_size,
                                                  &numa_cpu_set);
  }

  // underlying block devices and, where available, their unique ids
  set<string> devnames;
  store->get_devices(&devnames);
  (*pm)["devices"] = stringify(devnames);
  string devids;
  for (auto& dev : devnames) {
    string err;
    string id = get_device_id(dev, &err);
    if (id.size()) {
      if (!devids.empty()) {
        devids += ",";
      }
      devids += dev + "=" + id;
    } else {
      dout(10) << __func__ << " no unique device id for " << dev << ": "
               << err << dendl;
    }
  }
  (*pm)["device_ids"] = devids;

  dout(10) << __func__ << " " << *pm << dendl;
}
6483
6484void OSD::queue_want_up_thru(epoch_t want)
6485{
6486 map_lock.get_read();
6487 epoch_t cur = osdmap->get_up_thru(whoami);
11fdf7f2 6488 std::lock_guard l(mon_report_lock);
7c673cae
FG
6489 if (want > up_thru_wanted) {
6490 dout(10) << "queue_want_up_thru now " << want << " (was " << up_thru_wanted << ")"
6491 << ", currently " << cur
6492 << dendl;
6493 up_thru_wanted = want;
6494 send_alive();
6495 } else {
6496 dout(10) << "queue_want_up_thru want " << want << " <= queued " << up_thru_wanted
6497 << ", currently " << cur
6498 << dendl;
6499 }
6500 map_lock.put_read();
6501}
6502
6503void OSD::send_alive()
6504{
11fdf7f2 6505 ceph_assert(mon_report_lock.is_locked());
7c673cae
FG
6506 if (!osdmap->exists(whoami))
6507 return;
6508 epoch_t up_thru = osdmap->get_up_thru(whoami);
6509 dout(10) << "send_alive up_thru currently " << up_thru << " want " << up_thru_wanted << dendl;
6510 if (up_thru_wanted > up_thru) {
6511 dout(10) << "send_alive want " << up_thru_wanted << dendl;
6512 monc->send_mon_message(new MOSDAlive(osdmap->get_epoch(), up_thru_wanted));
6513 }
6514}
6515
6516void OSD::request_full_map(epoch_t first, epoch_t last)
6517{
6518 dout(10) << __func__ << " " << first << ".." << last
6519 << ", previously requested "
6520 << requested_full_first << ".." << requested_full_last << dendl;
11fdf7f2
TL
6521 ceph_assert(osd_lock.is_locked());
6522 ceph_assert(first > 0 && last > 0);
6523 ceph_assert(first <= last);
6524 ceph_assert(first >= requested_full_first); // we shouldn't ever ask for older maps
7c673cae
FG
6525 if (requested_full_first == 0) {
6526 // first request
6527 requested_full_first = first;
6528 requested_full_last = last;
6529 } else if (last <= requested_full_last) {
6530 // dup
6531 return;
6532 } else {
6533 // additional request
6534 first = requested_full_last + 1;
6535 requested_full_last = last;
6536 }
6537 MMonGetOSDMap *req = new MMonGetOSDMap;
6538 req->request_full(first, last);
6539 monc->send_mon_message(req);
6540}
6541
6542void OSD::got_full_map(epoch_t e)
6543{
11fdf7f2
TL
6544 ceph_assert(requested_full_first <= requested_full_last);
6545 ceph_assert(osd_lock.is_locked());
7c673cae
FG
6546 if (requested_full_first == 0) {
6547 dout(20) << __func__ << " " << e << ", nothing requested" << dendl;
6548 return;
6549 }
6550 if (e < requested_full_first) {
6551 dout(10) << __func__ << " " << e << ", requested " << requested_full_first
6552 << ".." << requested_full_last
6553 << ", ignoring" << dendl;
6554 return;
6555 }
6556 if (e >= requested_full_last) {
6557 dout(10) << __func__ << " " << e << ", requested " << requested_full_first
6558 << ".." << requested_full_last << ", resetting" << dendl;
6559 requested_full_first = requested_full_last = 0;
6560 return;
6561 }
6562
6563 requested_full_first = e + 1;
6564
6565 dout(10) << __func__ << " " << e << ", requested " << requested_full_first
6566 << ".." << requested_full_last
6567 << ", still need more" << dendl;
6568}
6569
6570void OSD::requeue_failures()
6571{
11fdf7f2 6572 std::lock_guard l(heartbeat_lock);
7c673cae
FG
6573 unsigned old_queue = failure_queue.size();
6574 unsigned old_pending = failure_pending.size();
11fdf7f2 6575 for (auto p = failure_pending.begin(); p != failure_pending.end(); ) {
7c673cae
FG
6576 failure_queue[p->first] = p->second.first;
6577 failure_pending.erase(p++);
6578 }
6579 dout(10) << __func__ << " " << old_queue << " + " << old_pending << " -> "
6580 << failure_queue.size() << dendl;
6581}
6582
6583void OSD::send_failures()
6584{
11fdf7f2
TL
6585 ceph_assert(map_lock.is_locked());
6586 ceph_assert(mon_report_lock.is_locked());
6587 std::lock_guard l(heartbeat_lock);
7c673cae
FG
6588 utime_t now = ceph_clock_now();
6589 while (!failure_queue.empty()) {
6590 int osd = failure_queue.begin()->first;
7c673cae
FG
6591 if (!failure_pending.count(osd)) {
6592 int failed_for = (int)(double)(now - failure_queue.begin()->second);
11fdf7f2
TL
6593 monc->send_mon_message(
6594 new MOSDFailure(
6595 monc->get_fsid(),
6596 osd,
6597 osdmap->get_addrs(osd),
6598 failed_for,
6599 osdmap->get_epoch()));
6600 failure_pending[osd] = make_pair(failure_queue.begin()->second,
6601 osdmap->get_addrs(osd));
7c673cae
FG
6602 }
6603 failure_queue.erase(osd);
6604 }
6605}
6606
11fdf7f2 6607void OSD::send_still_alive(epoch_t epoch, int osd, const entity_addrvec_t &addrs)
7c673cae 6608{
11fdf7f2
TL
6609 MOSDFailure *m = new MOSDFailure(monc->get_fsid(), osd, addrs, 0, epoch,
6610 MOSDFailure::FLAG_ALIVE);
7c673cae
FG
6611 monc->send_mon_message(m);
6612}
6613
11fdf7f2 6614void OSD::cancel_pending_failures()
7c673cae 6615{
11fdf7f2
TL
6616 std::lock_guard l(heartbeat_lock);
6617 auto it = failure_pending.begin();
6618 while (it != failure_pending.end()) {
6619 dout(10) << __func__ << " canceling in-flight failure report for osd."
6620 << it->first << dendl;
6621 send_still_alive(osdmap->get_epoch(), it->first, it->second.second);
6622 failure_pending.erase(it++);
7c673cae 6623 }
7c673cae
FG
6624}
6625
6626void OSD::send_beacon(const ceph::coarse_mono_clock::time_point& now)
6627{
6628 const auto& monmap = monc->monmap;
6629 // send beacon to mon even if we are just connected, and the monmap is not
6630 // initialized yet by then.
6631 if (monmap.epoch > 0 &&
6632 monmap.get_required_features().contains_all(
6633 ceph::features::mon::FEATURE_LUMINOUS)) {
6634 dout(20) << __func__ << " sending" << dendl;
7c673cae
FG
6635 MOSDBeacon* beacon = nullptr;
6636 {
11fdf7f2 6637 std::lock_guard l{min_last_epoch_clean_lock};
7c673cae 6638 beacon = new MOSDBeacon(osdmap->get_epoch(), min_last_epoch_clean);
494da23a 6639 beacon->pgs = min_last_epoch_clean_pgs;
224ce89b 6640 last_sent_beacon = now;
7c673cae
FG
6641 }
6642 monc->send_mon_message(beacon);
6643 } else {
6644 dout(20) << __func__ << " not sending" << dendl;
6645 }
6646}
6647
6648void OSD::handle_command(MMonCommand *m)
6649{
6650 if (!require_mon_peer(m)) {
6651 m->put();
6652 return;
6653 }
6654
6655 Command *c = new Command(m->cmd, m->get_tid(), m->get_data(), NULL);
6656 command_wq.queue(c);
6657 m->put();
6658}
6659
6660void OSD::handle_command(MCommand *m)
6661{
6662 ConnectionRef con = m->get_connection();
11fdf7f2
TL
6663 auto priv = con->get_priv();
6664 auto session = static_cast<Session *>(priv.get());
7c673cae
FG
6665 if (!session) {
6666 con->send_message(new MCommandReply(m, -EPERM));
6667 m->put();
6668 return;
6669 }
6670
6671 OSDCap& caps = session->caps;
11fdf7f2 6672 priv.reset();
7c673cae
FG
6673
6674 if (!caps.allow_all() || m->get_source().is_mon()) {
6675 con->send_message(new MCommandReply(m, -EPERM));
6676 m->put();
6677 return;
6678 }
6679
6680 Command *c = new Command(m->cmd, m->get_tid(), m->get_data(), con.get());
6681 command_wq.queue(c);
6682
6683 m->put();
6684}
6685
// Descriptor for one admin command the OSD accepts via MCommand/MMonCommand.
// cmdstring uses the standard Ceph command-signature grammar; perm is the
// capability string ("r", "rw", ...) required to run the command.
struct OSDCommand {
  string cmdstring;
  string helpstring;
  string module;
  string perm;
} osd_commands[] = {

#define COMMAND(parsesig, helptext, module, perm) \
  {parsesig, helptext, module, perm},

// yes, these are really pg commands, but there's a limit to how
// much work it's worth.  The OSD returns all of them.  Make this
// form (pg <pgid> <cmd>) valid only for the cli.
// Rest uses "tell <pgid> <cmd>"

COMMAND("pg " \
	"name=pgid,type=CephPgid " \
	"name=cmd,type=CephChoices,strings=query", \
	"show details of a specific pg", "osd", "r")
COMMAND("pg " \
	"name=pgid,type=CephPgid " \
	"name=cmd,type=CephChoices,strings=mark_unfound_lost " \
	"name=mulcmd,type=CephChoices,strings=revert|delete", \
	"mark all unfound objects in this pg as lost, either removing or reverting to a prior version if one is available",
	"osd", "rw")
COMMAND("pg " \
	"name=pgid,type=CephPgid " \
	"name=cmd,type=CephChoices,strings=list_unfound " \
	"name=offset,type=CephString,req=false",
	"list unfound objects on this pg, perhaps starting at an offset given in JSON",
	"osd", "r")

// new form: tell <pgid> <cmd> for both cli and rest

COMMAND("query",
	"show details of a specific pg", "osd", "r")
COMMAND("mark_unfound_lost " \
	"name=mulcmd,type=CephChoices,strings=revert|delete", \
	"mark all unfound objects in this pg as lost, either removing or reverting to a prior version if one is available",
	"osd", "rw")
COMMAND("list_unfound " \
	"name=offset,type=CephString,req=false",
	"list unfound objects on this pg, perhaps starting at an offset given in JSON",
	"osd", "r")
COMMAND("perf histogram dump "
	"name=logger,type=CephString,req=false "
	"name=counter,type=CephString,req=false",
	"Get histogram data",
	"osd", "r")

// tell <osd.n> commands.  Validation of osd.n must be special-cased in client
COMMAND("version", "report version of OSD", "osd", "r")
COMMAND("get_command_descriptions", "list commands descriptions", "osd", "r")
COMMAND("injectargs " \
	"name=injected_args,type=CephString,n=N",
	"inject configuration arguments into running OSD",
	"osd", "rw")
COMMAND("config set " \
	"name=key,type=CephString name=value,type=CephString",
	"Set a configuration option at runtime (not persistent)",
	"osd", "rw")
COMMAND("config get " \
	"name=key,type=CephString",
	"Get a configuration option at runtime",
	"osd", "r")
COMMAND("config unset " \
	"name=key,type=CephString",
	"Unset a configuration option at runtime (not persistent)",
	"osd", "rw")
COMMAND("cluster_log " \
	"name=level,type=CephChoices,strings=error,warning,info,debug " \
	"name=message,type=CephString,n=N",
	"log a message to the cluster log",
	"osd", "rw")
COMMAND("bench " \
	"name=count,type=CephInt,req=false " \
	"name=size,type=CephInt,req=false " \
	"name=object_size,type=CephInt,req=false " \
	"name=object_num,type=CephInt,req=false ", \
	"OSD benchmark: write <count> <size>-byte objects(with <obj_size> <obj_num>), " \
	"(default count=1G default size=4MB). Results in log.",
	"osd", "rw")
COMMAND("flush_pg_stats", "flush pg stats", "osd", "rw")
COMMAND("heap " \
	"name=heapcmd,type=CephChoices,strings="\
	"dump|start_profiler|stop_profiler|release|get_release_rate|set_release_rate|stats " \
	"name=value,type=CephString,req=false",
	"show heap usage info (available only if compiled with tcmalloc)",
	"osd", "rw")
COMMAND("debug dump_missing " \
	"name=filename,type=CephFilepath",
	"dump missing objects to a named file", "osd", "r")
COMMAND("debug kick_recovery_wq " \
	"name=delay,type=CephInt,range=0",
	"set osd_recovery_delay_start to <val>", "osd", "rw")
COMMAND("cpu_profiler " \
	"name=arg,type=CephChoices,strings=status|flush",
	"run cpu profiling on daemon", "osd", "rw")
COMMAND("dump_pg_recovery_stats", "dump pg recovery statistics",
	"osd", "r")
COMMAND("reset_pg_recovery_stats", "reset pg recovery statistics",
	"osd", "rw")
COMMAND("compact",
	"compact object store's omap. "
	"WARNING: Compaction probably slows your requests",
	"osd", "rw")
COMMAND("smart name=devid,type=CephString,req=False",
	"runs smartctl on this osd devices. ",
	"osd", "rw")
COMMAND("cache drop",
	"Drop all OSD caches",
	"osd", "rwx")
COMMAND("cache status",
	"Get OSD caches statistics",
	"osd", "r")
COMMAND("send_beacon",
	"Send OSD beacon to mon immediately",
	"osd", "r")
};
6805
11fdf7f2
TL
// Parse a raw command vector into a cmdmap, dispatch it through _do_command,
// and send the MCommandReply (unless the command will reply asynchronously,
// signalled by -EAGAIN).
void OSD::do_command(
  Connection *con, ceph_tid_t tid, vector<string>& cmd, bufferlist& data)
{
  dout(20) << "do_command tid " << tid << " " << cmd << dendl;

  int r = 0;
  stringstream ss, ds;   // ss: status text, ds: command output
  bufferlist odata;
  cmdmap_t cmdmap;
  if (cmd.empty()) {
    ss << "no command given";
    goto out;
  }
  if (!cmdmap_from_json(cmd, &cmdmap, ss)) {
    r = -EINVAL;
    goto out;
  }

  try {
    r = _do_command(con, cmdmap, tid, data, odata, ss, ds);
  } catch (const bad_cmd_get& e) {
    r = -EINVAL;
    ss << e.what();
  }
  if (r == -EAGAIN) {
    // reply will be sent asynchronously (e.g. by the PG)
    return;
  }
 out:
  string rs = ss.str();
  odata.append(ds);
  dout(0) << "do_command r=" << r << " " << rs << dendl;
  clog->info() << rs;
  if (con) {
    MCommandReply *reply = new MCommandReply(r, rs);
    reply->set_tid(tid);
    reply->set_data(odata);
    con->send_message(reply);
  }
}
6845
f64942e4
AA
6846namespace {
6847 class unlock_guard {
6848 Mutex& m;
6849 public:
6850 explicit unlock_guard(Mutex& mutex)
6851 : m(mutex)
6852 {
11fdf7f2 6853 m.unlock();
f64942e4
AA
6854 }
6855 unlock_guard(unlock_guard&) = delete;
6856 ~unlock_guard() {
11fdf7f2 6857 m.lock();
f64942e4
AA
6858 }
6859 };
6860}
6861
11fdf7f2
TL
// Execute one parsed admin command.  Returns 0 on success, a negative errno
// on failure, or -EAGAIN when the reply will be sent asynchronously.
// Status text goes to ss; command output goes to ds/odata.
// Caller holds osd_lock; commands that may block drop it via unlock_guard.
int OSD::_do_command(
  Connection *con, cmdmap_t& cmdmap, ceph_tid_t tid, bufferlist& data,
  bufferlist& odata, stringstream& ss, stringstream& ds)
{
  int r = 0;
  string prefix;
  string format;
  string pgidstr;
  boost::scoped_ptr<Formatter> f;

  cmd_getval(cct, cmdmap, "prefix", prefix);

  if (prefix == "get_command_descriptions") {
    // dump the osd_commands table as JSON for the client
    int cmdnum = 0;
    JSONFormatter *f = new JSONFormatter();
    f->open_object_section("command_descriptions");
    for (OSDCommand *cp = osd_commands;
	 cp < &osd_commands[std::size(osd_commands)]; cp++) {

      ostringstream secname;
      secname << "cmd" << setfill('0') << std::setw(3) << cmdnum;
      dump_cmddesc_to_json(f, con->get_features(),
                           secname.str(), cp->cmdstring, cp->helpstring,
                           cp->module, cp->perm, 0);
      cmdnum++;
    }
    f->close_section();	// command_descriptions

    f->flush(ds);
    delete f;
    goto out;
  }

  cmd_getval(cct, cmdmap, "format", format);
  f.reset(Formatter::create(format));

  if (prefix == "version") {
    if (f) {
      f->open_object_section("version");
      f->dump_string("version", pretty_version_to_str());
      f->close_section();
      f->flush(ds);
    } else {
      ds << pretty_version_to_str();
    }
    goto out;
  }
  else if (prefix == "injectargs") {
    vector<string> argsvec;
    cmd_getval(cct, cmdmap, "injected_args", argsvec);

    if (argsvec.empty()) {
      r = -EINVAL;
      ss << "ignoring empty injectargs";
      goto out;
    }
    string args = argsvec.front();
    for (vector<string>::iterator a = ++argsvec.begin(); a != argsvec.end(); ++a)
      args += " " + *a;
    // drop osd_lock while applying config; may invoke observers
    unlock_guard unlock{osd_lock};
    r = cct->_conf.injectargs(args, &ss);
  }
  else if (prefix == "config set") {
    std::string key;
    std::string val;
    cmd_getval(cct, cmdmap, "key", key);
    cmd_getval(cct, cmdmap, "value", val);
    unlock_guard unlock{osd_lock};
    r = cct->_conf.set_val(key, val, &ss);
    if (r == 0) {
      cct->_conf.apply_changes(nullptr);
    }
  }
  else if (prefix == "config get") {
    std::string key;
    cmd_getval(cct, cmdmap, "key", key);
    unlock_guard unlock{osd_lock};
    std::string val;
    r = cct->_conf.get_val(key, &val);
    if (r == 0) {
      ds << val;
    }
  }
  else if (prefix == "config unset") {
    std::string key;
    cmd_getval(cct, cmdmap, "key", key);
    unlock_guard unlock{osd_lock};
    r = cct->_conf.rm_val(key);
    if (r == 0) {
      cct->_conf.apply_changes(nullptr);
    }
    if (r == -ENOENT) {
      r = 0;  // make command idempotent
    }
  }
  else if (prefix == "cluster_log") {
    vector<string> msg;
    cmd_getval(cct, cmdmap, "message", msg);
    if (msg.empty()) {
      r = -EINVAL;
      ss << "ignoring empty log message";
      goto out;
    }
    // rejoin the space-separated message words
    string message = msg.front();
    for (vector<string>::iterator a = ++msg.begin(); a != msg.end(); ++a)
      message += " " + *a;
    string lvl;
    cmd_getval(cct, cmdmap, "level", lvl);
    clog_type level = string_to_clog_type(lvl);
    if (level < 0) {
      r = -EINVAL;
      ss << "unknown level '" << lvl << "'";
      goto out;
    }
    clog->do_log(level, message);
  }

  // either 'pg <pgid> <command>' or
  // 'tell <pgid>' (which comes in without any of that prefix)?

  else if (prefix == "pg" ||
	   prefix == "query" ||
	   prefix == "mark_unfound_lost" ||
	   prefix == "list_unfound"
    ) {
    pg_t pgid;

    if (!cmd_getval(cct, cmdmap, "pgid", pgidstr)) {
      ss << "no pgid specified";
      r = -EINVAL;
    } else if (!pgid.parse(pgidstr.c_str())) {
      ss << "couldn't parse pgid '" << pgidstr << "'";
      r = -EINVAL;
    } else {
      spg_t pcand;
      PGRef pg;
      if (osdmap->get_primary_shard(pgid, &pcand) &&
	  (pg = _lookup_lock_pg(pcand))) {
	if (pg->is_primary()) {
	  // simulate pg <pgid> cmd= for pg->do-command
	  if (prefix != "pg")
	    cmd_putval(cct, cmdmap, "cmd", prefix);
	  try {
	    r = pg->do_command(cmdmap, ss, data, odata, con, tid);
	  } catch (const bad_cmd_get& e) {
	    pg->unlock();
	    ss << e.what();
	    return -EINVAL;
	  }
	  if (r == -EAGAIN) {
	    pg->unlock();
	    // don't reply, pg will do so async
	    return -EAGAIN;
	  }
	} else {
	  ss << "not primary for pgid " << pgid;

	  // send them the latest diff to ensure they realize the mapping
	  // has changed.
	  service.send_incremental_map(osdmap->get_epoch() - 1, con, osdmap);

	  // do not reply; they will get newer maps and realize they
	  // need to resend.
	  pg->unlock();
	  return -EAGAIN;
	}
	pg->unlock();
      } else {
	ss << "i don't have pgid " << pgid;
	r = -ENOENT;
      }
    }
  }

  else if (prefix == "bench") {
    int64_t count;
    int64_t bsize;
    int64_t osize, onum;
    // default count 1G, size 4MB
    cmd_getval(cct, cmdmap, "count", count, (int64_t)1 << 30);
    cmd_getval(cct, cmdmap, "size", bsize, (int64_t)4 << 20);
    cmd_getval(cct, cmdmap, "object_size", osize, (int64_t)0);
    cmd_getval(cct, cmdmap, "object_num", onum, (int64_t)0);

    uint32_t duration = cct->_conf->osd_bench_duration;

    if (bsize > (int64_t) cct->_conf->osd_bench_max_block_size) {
      // let us limit the block size because the next checks rely on it
      // having a sane value. If we allow any block size to be set things
      // can still go sideways.
      ss << "block 'size' values are capped at "
         << byte_u_t(cct->_conf->osd_bench_max_block_size) << ". If you wish to use"
         << " a higher value, please adjust 'osd_bench_max_block_size'";
      r = -EINVAL;
      goto out;
    } else if (bsize < (int64_t) (1 << 20)) {
      // entering the realm of small block sizes.
      // limit the count to a sane value, assuming a configurable amount of
      // IOPS and duration, so that the OSD doesn't get hung up on this,
      // preventing timeouts from going off
      int64_t max_count =
        bsize * duration * cct->_conf->osd_bench_small_size_max_iops;
      if (count > max_count) {
        ss << "'count' values greater than " << max_count
           << " for a block size of " << byte_u_t(bsize) << ", assuming "
           << cct->_conf->osd_bench_small_size_max_iops << " IOPS,"
           << " for " << duration << " seconds,"
           << " can cause ill effects on osd. "
           << " Please adjust 'osd_bench_small_size_max_iops' with a higher"
           << " value if you wish to use a higher 'count'.";
        r = -EINVAL;
        goto out;
      }
    } else {
      // 1MB block sizes are big enough so that we get more stuff done.
      // However, to avoid the osd from getting hung on this and having
      // timers being triggered, we are going to limit the count assuming
      // a configurable throughput and duration.
      // NOTE: max_count is the total amount of bytes that we believe we
      //       will be able to write during 'duration' for the given
      //       throughput.  The block size hardly impacts this unless it's
      //       way too big.  Given we already check how big the block size
      //       is, it's safe to assume everything will check out.
      int64_t max_count =
        cct->_conf->osd_bench_large_size_max_throughput * duration;
      if (count > max_count) {
        ss << "'count' values greater than " << max_count
           << " for a block size of " << byte_u_t(bsize) << ", assuming "
           << byte_u_t(cct->_conf->osd_bench_large_size_max_throughput) << "/s,"
           << " for " << duration << " seconds,"
           << " can cause ill effects on osd. "
           << " Please adjust 'osd_bench_large_size_max_throughput'"
           << " with a higher value if you wish to use a higher 'count'.";
        r = -EINVAL;
        goto out;
      }
    }

    if (osize && bsize > osize)
      bsize = osize;

    dout(1) << " bench count " << count
            << " bsize " << byte_u_t(bsize) << dendl;

    ObjectStore::Transaction cleanupt;

    if (osize && onum) {
      // pre-create the fixed object set the random writes will target
      bufferlist bl;
      bufferptr bp(osize);
      bp.zero();
      bl.push_back(std::move(bp));
      bl.rebuild_page_aligned();
      for (int i=0; i<onum; ++i) {
	char nm[30];
	snprintf(nm, sizeof(nm), "disk_bw_test_%d", i);
	object_t oid(nm);
	hobject_t soid(sobject_t(oid, 0));
	ObjectStore::Transaction t;
	t.write(coll_t(), ghobject_t(soid), 0, osize, bl);
	store->queue_transaction(service.meta_ch, std::move(t), NULL);
	cleanupt.remove(coll_t(), ghobject_t(soid));
      }
    }

    bufferlist bl;
    bufferptr bp(bsize);
    bp.zero();
    bl.push_back(std::move(bp));
    bl.rebuild_page_aligned();

    {
      // flush anything already queued before timing starts
      C_SaferCond waiter;
      if (!service.meta_ch->flush_commit(&waiter)) {
	waiter.wait();
      }
    }

    utime_t start = ceph_clock_now();
    for (int64_t pos = 0; pos < count; pos += bsize) {
      char nm[30];
      unsigned offset = 0;
      if (onum && osize) {
	// random offsets within the pre-created objects
	snprintf(nm, sizeof(nm), "disk_bw_test_%d", (int)(rand() % onum));
	offset = rand() % (osize / bsize) * bsize;
      } else {
	// one new object per write
	snprintf(nm, sizeof(nm), "disk_bw_test_%lld", (long long)pos);
      }
      object_t oid(nm);
      hobject_t soid(sobject_t(oid, 0));
      ObjectStore::Transaction t;
      t.write(coll_t::meta(), ghobject_t(soid), offset, bsize, bl);
      store->queue_transaction(service.meta_ch, std::move(t), NULL);
      if (!onum || !osize)
	cleanupt.remove(coll_t::meta(), ghobject_t(soid));
    }

    {
      // wait for all benchmark writes to commit before stopping the clock
      C_SaferCond waiter;
      if (!service.meta_ch->flush_commit(&waiter)) {
	waiter.wait();
      }
    }
    utime_t end = ceph_clock_now();

    // clean up
    store->queue_transaction(service.meta_ch, std::move(cleanupt), NULL);
    {
      C_SaferCond waiter;
      if (!service.meta_ch->flush_commit(&waiter)) {
	waiter.wait();
      }
    }

    double elapsed = end - start;
    double rate = count / elapsed;
    double iops = rate / bsize;
    if (f) {
      f->open_object_section("osd_bench_results");
      f->dump_int("bytes_written", count);
      f->dump_int("blocksize", bsize);
      f->dump_float("elapsed_sec", elapsed);
      f->dump_float("bytes_per_sec", rate);
      f->dump_float("iops", iops);
      f->close_section();
      f->flush(ds);
    } else {
      ds << "bench: wrote " << byte_u_t(count)
	 << " in blocks of " << byte_u_t(bsize) << " in "
	 << elapsed << " sec at " << byte_u_t(rate) << "/sec "
	 << si_u_t(iops) << " IOPS";
    }
  }

  else if (prefix == "flush_pg_stats") {
    mgrc.send_pgstats();
    ds << service.get_osd_stat_seq() << "\n";
  }

  else if (prefix == "heap") {
    r = ceph::osd_cmds::heap(*cct, cmdmap, *f, ds);
  }

  else if (prefix == "debug dump_missing") {
    if (!f) {
      f.reset(new JSONFormatter(true));
    }
    f->open_array_section("pgs");
    vector<PGRef> pgs;
    _get_pgs(&pgs);
    for (auto& pg : pgs) {
      string s = stringify(pg->pg_id);
      f->open_array_section(s.c_str());
      pg->lock();
      pg->dump_missing(f.get());
      pg->unlock();
      f->close_section();
    }
    f->close_section();
    f->flush(ds);
  }
  else if (prefix == "debug kick_recovery_wq") {
    int64_t delay;
    cmd_getval(cct, cmdmap, "delay", delay);
    ostringstream oss;
    oss << delay;
    unlock_guard unlock{osd_lock};
    r = cct->_conf.set_val("osd_recovery_delay_start", oss.str().c_str());
    if (r != 0) {
      ss << "kick_recovery_wq: error setting "
	 << "osd_recovery_delay_start to '" << delay << "': error "
	 << r;
      goto out;
    }
    cct->_conf.apply_changes(nullptr);
    ss << "kicking recovery queue. set osd_recovery_delay_start "
       << "to " << cct->_conf->osd_recovery_delay_start;
  }

  else if (prefix == "cpu_profiler") {
    string arg;
    cmd_getval(cct, cmdmap, "arg", arg);
    vector<string> argvec;
    get_str_vec(arg, argvec);
    cpu_profiler_handle_command(argvec, ds);
  }

  else if (prefix == "dump_pg_recovery_stats") {
    stringstream s;
    if (f) {
      pg_recovery_stats.dump_formatted(f.get());
      f->flush(ds);
    } else {
      pg_recovery_stats.dump(s);
      ds << "dump pg recovery stats: " << s.str();
    }
  }

  else if (prefix == "reset_pg_recovery_stats") {
    ss << "reset pg recovery stats";
    pg_recovery_stats.reset();
  }

  else if (prefix == "perf histogram dump") {
    std::string logger;
    std::string counter;
    cmd_getval(cct, cmdmap, "logger", logger);
    cmd_getval(cct, cmdmap, "counter", counter);
    if (f) {
      cct->get_perfcounters_collection()->dump_formatted_histograms(
          f.get(), false, logger, counter);
      f->flush(ds);
    }
  }

  else if (prefix == "compact") {
    dout(1) << "triggering manual compaction" << dendl;
    auto start = ceph::coarse_mono_clock::now();
    store->compact();
    auto end = ceph::coarse_mono_clock::now();
    double duration = std::chrono::duration<double>(end-start).count();
    dout(1) << "finished manual compaction in "
            << duration
            << " seconds" << dendl;
    ss << "compacted omap in " << duration << " seconds";
  }

  else if (prefix == "smart") {
    string devid;
    cmd_getval(cct, cmdmap, "devid", devid);
    probe_smart(devid, ds);
  }

  else if (prefix == "cache drop") {
    dout(20) << "clearing all caches" << dendl;
    // Clear the objectstore's cache - onode and buffer for Bluestore,
    // system's pagecache for Filestore
    r = store->flush_cache(&ss);
    if (r < 0) {
      ds << "Error flushing objectstore cache: " << cpp_strerror(r);
      goto out;
    }
    // Clear the objectcontext cache (per PG)
    vector<PGRef> pgs;
    _get_pgs(&pgs);
    for (auto& pg: pgs) {
      pg->clear_cache();
    }
  }

  else if (prefix == "cache status") {
    int obj_ctx_count = 0;
    vector<PGRef> pgs;
    _get_pgs(&pgs);
    for (auto& pg: pgs) {
      obj_ctx_count += pg->get_cache_obj_count();
    }
    if (f) {
      f->open_object_section("cache_status");
      f->dump_int("object_ctx", obj_ctx_count);
      store->dump_cache_stats(f.get());
      f->close_section();
      f->flush(ds);
    } else {
      ds << "object_ctx: " << obj_ctx_count;
      store->dump_cache_stats(ds);
    }
  }
  else if (prefix == "send_beacon") {
    if (is_active()) {
      send_beacon(ceph::coarse_mono_clock::now());
    }
  } else {
    ss << "unrecognized command '" << prefix << "'";
    r = -EINVAL;
  }

 out:
  return r;
}
7341
7342void OSD::probe_smart(const string& only_devid, ostream& ss)
7343{
7344 set<string> devnames;
7345 store->get_devices(&devnames);
7346 uint64_t smart_timeout = cct->_conf.get_val<uint64_t>(
7347 "osd_smart_report_timeout");
7348
7349 // == typedef std::map<std::string, mValue> mObject;
7350 json_spirit::mObject json_map;
7351
7352 for (auto dev : devnames) {
7353 // smartctl works only on physical devices; filter out any logical device
7354 if (dev.find("dm-") == 0) {
7355 continue;
7356 }
7357
7358 string err;
7359 string devid = get_device_id(dev, &err);
7360 if (devid.size() == 0) {
7361 dout(10) << __func__ << " no unique id for dev " << dev << " ("
7362 << err << "), skipping" << dendl;
7363 continue;
7364 }
7365 if (only_devid.size() && devid != only_devid) {
7366 continue;
7367 }
7368
7369 json_spirit::mValue smart_json;
7370 if (block_device_get_metrics(dev, smart_timeout,
7371 &smart_json)) {
7372 dout(10) << "block_device_get_metrics failed for /dev/" << dev << dendl;
7373 continue;
7374 }
7375 json_map[devid] = smart_json;
7c673cae 7376 }
11fdf7f2 7377 json_spirit::write(json_map, ss, json_spirit::pretty_print);
7c673cae
FG
7378}
7379
7380bool OSD::heartbeat_dispatch(Message *m)
7381{
7382 dout(30) << "heartbeat_dispatch " << m << dendl;
7383 switch (m->get_type()) {
7384
7385 case CEPH_MSG_PING:
7386 dout(10) << "ping from " << m->get_source_inst() << dendl;
7387 m->put();
7388 break;
7389
7390 case MSG_OSD_PING:
7391 handle_osd_ping(static_cast<MOSDPing*>(m));
7392 break;
7393
7394 default:
7395 dout(0) << "dropping unexpected message " << *m << " from " << m->get_source_inst() << dendl;
7396 m->put();
7397 }
7398
7399 return true;
7400}
7401
7402bool OSD::ms_dispatch(Message *m)
7403{
7404 dout(20) << "OSD::ms_dispatch: " << *m << dendl;
7405 if (m->get_type() == MSG_OSD_MARK_ME_DOWN) {
7406 service.got_stop_ack();
7407 m->put();
7408 return true;
7409 }
7410
7411 // lock!
7412
7413 osd_lock.Lock();
7414 if (is_stopping()) {
7415 osd_lock.Unlock();
7416 m->put();
7417 return true;
7418 }
7419
7420 do_waiters();
7421 _dispatch(m);
7422
7423 osd_lock.Unlock();
7424
7425 return true;
7426}
7427
7428void OSD::maybe_share_map(
7429 Session *session,
7430 OpRequestRef op,
7431 OSDMapRef osdmap)
7432{
7433 if (!op->check_send_map) {
7434 return;
7435 }
7436 epoch_t last_sent_epoch = 0;
7437
7438 session->sent_epoch_lock.lock();
7439 last_sent_epoch = session->last_sent_epoch;
7440 session->sent_epoch_lock.unlock();
7441
11fdf7f2
TL
7442 // assume the peer has the newer of the op's sent_epoch and what
7443 // we think we sent them.
7444 epoch_t from = std::max(last_sent_epoch, op->sent_epoch);
7445
7c673cae
FG
7446 const Message *m = op->get_req();
7447 service.share_map(
7448 m->get_source(),
7449 m->get_connection().get(),
11fdf7f2 7450 from,
7c673cae
FG
7451 osdmap,
7452 session ? &last_sent_epoch : NULL);
7453
7454 session->sent_epoch_lock.lock();
7455 if (session->last_sent_epoch < last_sent_epoch) {
7456 session->last_sent_epoch = last_sent_epoch;
7457 }
7458 session->sent_epoch_lock.unlock();
7459
7460 op->check_send_map = false;
7461}
7462
void OSD::dispatch_session_waiting(SessionRef session, OSDMapRef osdmap)
{
  // Drain the session's queue of ops that were waiting for a new osdmap,
  // enqueueing every op whose min_epoch is now satisfied by `osdmap`.
  // Caller must hold session->session_dispatch_lock.
  ceph_assert(session->session_dispatch_lock.is_locked());

  auto i = session->waiting_on_map.begin();
  while (i != session->waiting_on_map.end()) {
    OpRequestRef op = &(*i);
    ceph_assert(ms_can_fast_dispatch(op->get_req()));
    const MOSDFastDispatchOp *m = static_cast<const MOSDFastDispatchOp*>(
      op->get_req());
    if (m->get_min_epoch() > osdmap->get_epoch()) {
      // queue is processed in order; once one op needs a newer map, stop
      break;
    }
    // remove from the intrusive list and drop the ref the list held;
    // `op` (a local ref) keeps the request alive below
    session->waiting_on_map.erase(i++);
    op->put();

    spg_t pgid;
    if (m->get_type() == CEPH_MSG_OSD_OP) {
      // legacy MOSDOp carries only a pg_t; map it to the primary shard
      pg_t actual_pgid = osdmap->raw_pg_to_pg(
	static_cast<const MOSDOp*>(m)->get_pg());
      if (!osdmap->get_primary_shard(actual_pgid, &pgid)) {
	// no primary shard in this map; drop the op
	continue;
      }
    } else {
      pgid = m->get_spg();
    }
    enqueue_op(pgid, std::move(op), m->get_map_epoch());
  }

  // keep the session registered only while it still has waiting ops
  if (session->waiting_on_map.empty()) {
    clear_session_waiting_on_map(session);
  } else {
    register_session_waiting_on_map(session);
  }
}
7498
void OSD::ms_fast_dispatch(Message *m)
{
  // Fast-dispatch entry point: route messages without taking osd_lock.
  // Peering/control messages are handled (and consumed) inline; everything
  // else becomes an OpRequest and is queued to the op shards.
  FUNCTRACE(cct);
  if (service.is_stopping()) {
    m->put();
    return;
  }

  // peering event?
  switch (m->get_type()) {
  case CEPH_MSG_PING:
    dout(10) << "ping from " << m->get_source() << dendl;
    m->put();
    return;
  case MSG_MON_COMMAND:
    handle_command(static_cast<MMonCommand*>(m));
    return;
  case MSG_OSD_FORCE_RECOVERY:
    handle_fast_force_recovery(static_cast<MOSDForceRecovery*>(m));
    return;
  case MSG_OSD_SCRUB2:
    handle_fast_scrub(static_cast<MOSDScrub2*>(m));
    return;

  case MSG_OSD_PG_CREATE2:
    return handle_fast_pg_create(static_cast<MOSDPGCreate2*>(m));
  case MSG_OSD_PG_QUERY:
    return handle_fast_pg_query(static_cast<MOSDPGQuery*>(m));
  case MSG_OSD_PG_NOTIFY:
    return handle_fast_pg_notify(static_cast<MOSDPGNotify*>(m));
  case MSG_OSD_PG_INFO:
    return handle_fast_pg_info(static_cast<MOSDPGInfo*>(m));
  case MSG_OSD_PG_REMOVE:
    return handle_fast_pg_remove(static_cast<MOSDPGRemove*>(m));

    // these are single-pg messages that handle themselves
  case MSG_OSD_PG_LOG:
  case MSG_OSD_PG_TRIM:
  case MSG_OSD_BACKFILL_RESERVE:
  case MSG_OSD_RECOVERY_RESERVE:
    {
      MOSDPeeringOp *pm = static_cast<MOSDPeeringOp*>(m);
      if (require_osd_peer(pm)) {
	enqueue_peering_evt(
	  pm->get_spg(),
	  PGPeeringEventRef(pm->get_event()));
      }
      pm->put();
      return;
    }
  }

  // wrap the message in a tracked OpRequest (takes its own ref on m)
  OpRequestRef op = op_tracker.create_request<OpRequest, Message*>(m);
  {
#ifdef WITH_LTTNG
    osd_reqid_t reqid = op->get_reqid();
#endif
    tracepoint(osd, ms_fast_dispatch, reqid.name._type,
	       reqid.name._num, reqid.tid, reqid.inc);
  }

  if (m->trace)
    op->osd_trace.init("osd op", &trace_endpoint, &m->trace);

  // note sender epoch, min req's epoch
  op->sent_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch();
  op->min_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_min_epoch();
  ceph_assert(op->min_epoch <= op->sent_epoch); // sanity check!

  service.maybe_inject_dispatch_delay();

  if (m->get_connection()->has_features(CEPH_FEATUREMASK_RESEND_ON_SPLIT) ||
      m->get_type() != CEPH_MSG_OSD_OP) {
    // queue it directly
    enqueue_op(
      static_cast<MOSDFastDispatchOp*>(m)->get_spg(),
      std::move(op),
      static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch());
  } else {
    // legacy client, and this is an MOSDOp (the *only* fast dispatch
    // message that didn't have an explicit spg_t); we need to map
    // them to an spg_t while preserving delivery order.
    auto priv = m->get_connection()->get_priv();
    if (auto session = static_cast<Session*>(priv.get()); session) {
      std::lock_guard l{session->session_dispatch_lock};
      // the session's waiting list holds its own ref on the op
      op->get();
      session->waiting_on_map.push_back(*op);
      OSDMapRef nextmap = service.get_nextmap_reserved();
      dispatch_session_waiting(session, nextmap);
      service.release_map(nextmap);
    }
  }
  OID_EVENT_TRACE_WITH_MSG(m, "MS_FAST_DISPATCH_END", false);
}
7593
11fdf7f2 7594bool OSD::ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer)
7c673cae
FG
7595{
7596 dout(10) << "OSD::ms_get_authorizer type=" << ceph_entity_type_name(dest_type) << dendl;
7597
31f18b77
FG
7598 if (is_stopping()) {
7599 dout(10) << __func__ << " bailing, we are shutting down" << dendl;
7600 return false;
7601 }
7602
7c673cae
FG
7603 if (dest_type == CEPH_ENTITY_TYPE_MON)
7604 return true;
7605
7c673cae
FG
7606 *authorizer = monc->build_authorizer(dest_type);
7607 return *authorizer != NULL;
7608}
7609
11fdf7f2
TL
// Expose the monitor client's rotating service keys as the keystore used
// to verify auth v1 authorizers on incoming connections.
KeyStore *OSD::ms_get_auth1_authorizer_keystore()
{
  return monc->rotating_secrets.get();
}
7c673cae 7614
// Called by the messenger once a connection has authenticated.  Ensures the
// connection has a Session attached and populates its caps from the peer's
// AuthCapsInfo.  Returns 1 on parsed caps, 0 if no caps string was supplied,
// or -EPERM when the caps string fails to decode or parse.
int OSD::ms_handle_authentication(Connection *con)
{
  int ret = 0;
  auto priv = con->get_priv();
  Session *s = static_cast<Session*>(priv.get());
  if (!s) {
    // first time we see this connection: create and attach a session
    // (set_priv takes over the initial ref — hence `false`)
    s = new Session(cct, con);
    con->set_priv(RefCountedPtr{s, false});
    s->entity_name = con->get_peer_entity_name();
    dout(10) << __func__ << " new session " << s << " con " << s->con
	     << " entity " << s->entity_name
	     << " addr " << con->get_peer_addrs() << dendl;
  } else {
    dout(10) << __func__ << " existing session " << s << " con " << s->con
	     << " entity " << s->entity_name
	     << " addr " << con->get_peer_addrs() << dendl;
  }

  AuthCapsInfo &caps_info = con->get_peer_caps_info();
  if (caps_info.allow_all)
    s->caps.set_allow_all();

  if (caps_info.caps.length() > 0) {
    bufferlist::const_iterator p = caps_info.caps.cbegin();
    string str;
    try {
      decode(str, p);
    }
    catch (buffer::error& e) {
      // malformed caps payload from the peer
      dout(10) << __func__ << " session " << s << " " << s->entity_name
	       << " failed to decode caps string" << dendl;
      ret = -EPERM;
    }
    if (!ret) {
      bool success = s->caps.parse(str);
      if (success) {
	dout(10) << __func__ << " session " << s
		 << " " << s->entity_name
		 << " has caps " << s->caps << " '" << str << "'" << dendl;
	ret = 1;
      } else {
	dout(10) << __func__ << " session " << s << " " << s->entity_name
		 << " failed to parse caps '" << str << "'" << dendl;
	ret = -EPERM;
      }
    }
  }
  return ret;
}
7664
7665void OSD::do_waiters()
7666{
11fdf7f2 7667 ceph_assert(osd_lock.is_locked());
7c673cae
FG
7668
7669 dout(10) << "do_waiters -- start" << dendl;
7670 while (!finished.empty()) {
7671 OpRequestRef next = finished.front();
7672 finished.pop_front();
7673 dispatch_op(next);
7674 }
7675 dout(10) << "do_waiters -- finish" << dendl;
7676}
7677
7678void OSD::dispatch_op(OpRequestRef op)
7679{
7680 switch (op->get_req()->get_type()) {
7681
7682 case MSG_OSD_PG_CREATE:
7683 handle_pg_create(op);
7684 break;
7c673cae
FG
7685 }
7686}
7687
// Slow-path message handler; runs under osd_lock (see ms_dispatch).
// Handlers invoked here consume the message; ops wrapped in an OpRequest
// hand ownership to the op tracker.
void OSD::_dispatch(Message *m)
{
  ceph_assert(osd_lock.is_locked());
  dout(20) << "_dispatch " << m << " " << *m << dendl;

  switch (m->get_type()) {

    // -- don't need OSDMap --

    // map and replication
  case CEPH_MSG_OSD_MAP:
    handle_osd_map(static_cast<MOSDMap*>(m));
    break;

    // osd
  case MSG_OSD_SCRUB:
    handle_scrub(static_cast<MOSDScrub*>(m));
    break;

  case MSG_COMMAND:
    handle_command(static_cast<MCommand*>(m));
    return;

    // -- need OSDMap --

  case MSG_OSD_PG_CREATE:
    {
      // wrap in an OpRequest; the tracker now owns the message ref
      OpRequestRef op = op_tracker.create_request<OpRequest, Message*>(m);
      if (m->trace)
	op->osd_trace.init("osd op", &trace_endpoint, &m->trace);
      // no map? starting up?
      if (!osdmap) {
	dout(7) << "no OSDMap, not booted" << dendl;
	logger->inc(l_osd_waiting_for_map);
	waiting_for_osdmap.push_back(op);
	op->mark_delayed("no osdmap");
	break;
      }

      // need OSDMap
      dispatch_op(op);
    }
  }
}
7731
// remove me post-nautilus
//
// Legacy (pre-MOSDScrub2) scrub request from the mon/mgr: translate the
// requested pg_t list into locally-primary spg_t's (or all local pgs when
// the list is empty) and queue a RequestScrub peering event for each.
void OSD::handle_scrub(MOSDScrub *m)
{
  dout(10) << "handle_scrub " << *m << dendl;
  if (!require_mon_or_mgr_peer(m)) {
    m->put();
    return;
  }
  if (m->fsid != monc->get_fsid()) {
    // message from a different cluster; ignore
    dout(0) << "handle_scrub fsid " << m->fsid << " != " << monc->get_fsid()
	    << dendl;
    m->put();
    return;
  }

  vector<spg_t> spgs;
  _get_pgids(&spgs);

  if (!m->scrub_pgs.empty()) {
    // restrict to the requested pgs that we are primary shard for
    vector<spg_t> v;
    for (auto pgid : m->scrub_pgs) {
      spg_t pcand;
      if (osdmap->get_primary_shard(pgid, &pcand) &&
	  std::find(spgs.begin(), spgs.end(), pcand) != spgs.end()) {
	v.push_back(pcand);
      }
    }
    spgs.swap(v);
  }

  for (auto pgid : spgs) {
    enqueue_peering_evt(
      pgid,
      PGPeeringEventRef(
	std::make_shared<PGPeeringEvent>(
	  get_osdmap_epoch(),
	  get_osdmap_epoch(),
	  PG::RequestScrub(m->deep, m->repair))));
  }

  m->put();
}
7774
11fdf7f2
TL
7775void OSD::handle_fast_scrub(MOSDScrub2 *m)
7776{
7777 dout(10) << __func__ << " " << *m << dendl;
7778 if (!require_mon_or_mgr_peer(m)) {
7779 m->put();
7780 return;
7781 }
7782 if (m->fsid != monc->get_fsid()) {
7783 dout(0) << __func__ << " fsid " << m->fsid << " != " << monc->get_fsid()
7784 << dendl;
7785 m->put();
7786 return;
7787 }
7788 for (auto pgid : m->scrub_pgs) {
7789 enqueue_peering_evt(
7790 pgid,
7791 PGPeeringEventRef(
7792 std::make_shared<PGPeeringEvent>(
7793 m->epoch,
7794 m->epoch,
7795 PG::RequestScrub(m->deep, m->repair))));
7796 }
7797 m->put();
7798}
7799
7c673cae
FG
7800bool OSD::scrub_random_backoff()
7801{
7802 bool coin_flip = (rand() / (double)RAND_MAX >=
7803 cct->_conf->osd_scrub_backoff_ratio);
7804 if (!coin_flip) {
7805 dout(20) << "scrub_random_backoff lost coin flip, randomly backing off" << dendl;
7806 return true;
7807 }
7808 return false;
7809}
7810
// Build a scrub-schedule entry for `pg`.  `timestamp` is the base time
// (typically the last scrub stamp); pool-level intervals, when > 0,
// override the global osd_scrub_{min,max}_interval config.
OSDService::ScrubJob::ScrubJob(CephContext* cct,
			       const spg_t& pg, const utime_t& timestamp,
			       double pool_scrub_min_interval,
			       double pool_scrub_max_interval, bool must)
  : cct(cct),
    pgid(pg),
    sched_time(timestamp),
    deadline(timestamp)
{
  // if not explicitly requested, postpone the scrub with a random delay
  if (!must) {
    double scrub_min_interval = pool_scrub_min_interval > 0 ?
      pool_scrub_min_interval : cct->_conf->osd_scrub_min_interval;
    double scrub_max_interval = pool_scrub_max_interval > 0 ?
      pool_scrub_max_interval : cct->_conf->osd_scrub_max_interval;

    // earliest time: base + min interval + random jitter in
    // [0, min_interval * randomize_ratio)
    sched_time += scrub_min_interval;
    double r = rand() / (double)RAND_MAX;
    sched_time +=
      scrub_min_interval * cct->_conf->osd_scrub_interval_randomize_ratio * r;
    // a zero max interval means "no deadline" (deadline left as zero time)
    if (scrub_max_interval == 0) {
      deadline = utime_t();
    } else {
      deadline += scrub_max_interval;
    }

  }
}
7839
7840bool OSDService::ScrubJob::ScrubJob::operator<(const OSDService::ScrubJob& rhs) const {
7841 if (sched_time < rhs.sched_time)
7842 return true;
7843 if (sched_time > rhs.sched_time)
7844 return false;
7845 return pgid < rhs.pgid;
7846}
7847
7848bool OSD::scrub_time_permit(utime_t now)
7849{
7850 struct tm bdt;
7851 time_t tt = now.sec();
7852 localtime_r(&tt, &bdt);
28e407b8
AA
7853
7854 bool day_permit = false;
7855 if (cct->_conf->osd_scrub_begin_week_day < cct->_conf->osd_scrub_end_week_day) {
7856 if (bdt.tm_wday >= cct->_conf->osd_scrub_begin_week_day && bdt.tm_wday < cct->_conf->osd_scrub_end_week_day) {
7857 day_permit = true;
7858 }
7859 } else {
7860 if (bdt.tm_wday >= cct->_conf->osd_scrub_begin_week_day || bdt.tm_wday < cct->_conf->osd_scrub_end_week_day) {
7861 day_permit = true;
7862 }
7863 }
7864
7865 if (!day_permit) {
7866 dout(20) << __func__ << " should run between week day " << cct->_conf->osd_scrub_begin_week_day
7867 << " - " << cct->_conf->osd_scrub_end_week_day
7868 << " now " << bdt.tm_wday << " = no" << dendl;
7869 return false;
7870 }
7871
7c673cae
FG
7872 bool time_permit = false;
7873 if (cct->_conf->osd_scrub_begin_hour < cct->_conf->osd_scrub_end_hour) {
7874 if (bdt.tm_hour >= cct->_conf->osd_scrub_begin_hour && bdt.tm_hour < cct->_conf->osd_scrub_end_hour) {
7875 time_permit = true;
7876 }
7877 } else {
7878 if (bdt.tm_hour >= cct->_conf->osd_scrub_begin_hour || bdt.tm_hour < cct->_conf->osd_scrub_end_hour) {
7879 time_permit = true;
7880 }
7881 }
7882 if (!time_permit) {
7883 dout(20) << __func__ << " should run between " << cct->_conf->osd_scrub_begin_hour
7884 << " - " << cct->_conf->osd_scrub_end_hour
7885 << " now " << bdt.tm_hour << " = no" << dendl;
7886 } else {
7887 dout(20) << __func__ << " should run between " << cct->_conf->osd_scrub_begin_hour
7888 << " - " << cct->_conf->osd_scrub_end_hour
7889 << " now " << bdt.tm_hour << " = yes" << dendl;
7890 }
7891 return time_permit;
7892}
7893
// Decide whether system load is low enough to scrub: either the per-CPU
// 1-minute load average is under osd_scrub_load_threshold, or the 1-minute
// load is below the daily average and trending down (below the 15m average).
bool OSD::scrub_load_below_threshold()
{
  double loadavgs[3];
  if (getloadavg(loadavgs, 3) != 3) {
    dout(10) << __func__ << " couldn't read loadavgs\n" << dendl;
    return false;
  }

  // allow scrub if below configured threshold
  long cpus = sysconf(_SC_NPROCESSORS_ONLN);
  // guard against sysconf failure (returns -1) or zero
  double loadavg_per_cpu = cpus > 0 ? loadavgs[0] / cpus : loadavgs[0];
  if (loadavg_per_cpu < cct->_conf->osd_scrub_load_threshold) {
    dout(20) << __func__ << " loadavg per cpu " << loadavg_per_cpu
	     << " < max " << cct->_conf->osd_scrub_load_threshold
	     << " = yes" << dendl;
    return true;
  }

  // allow scrub if below daily avg and currently decreasing
  if (loadavgs[0] < daily_loadavg && loadavgs[0] < loadavgs[2]) {
    dout(20) << __func__ << " loadavg " << loadavgs[0]
	     << " < daily_loadavg " << daily_loadavg
	     << " and < 15m avg " << loadavgs[2]
	     << " = yes" << dendl;
    return true;
  }

  dout(20) << __func__ << " loadavg " << loadavgs[0]
	   << " >= max " << cct->_conf->osd_scrub_load_threshold
	   << " and ( >= daily_loadavg " << daily_loadavg
	   << " or >= 15m avg " << loadavgs[2]
	   << ") = no" << dendl;
  return false;
}
7928
// Walk the scrub-job queue (ordered by sched_time) and try to start scrubs,
// honoring the scrub-slot limit, recovery state, time window, and load.
// Called periodically; stops at the first job scheduled in the future.
void OSD::sched_scrub()
{
  // if not permitted, fail fast
  if (!service.can_inc_scrubs()) {
    return;
  }
  bool allow_requested_repair_only = false;
  if (service.is_recovery_active()) {
    if (!cct->_conf->osd_scrub_during_recovery && cct->_conf->osd_repair_during_recovery) {
      dout(10) << __func__
               << " will only schedule explicitly requested repair due to active recovery"
               << dendl;
      allow_requested_repair_only = true;
    } else if (!cct->_conf->osd_scrub_during_recovery && !cct->_conf->osd_repair_during_recovery) {
      dout(20) << __func__ << " not scheduling scrubs due to active recovery" << dendl;
      return;
    }
  }

  utime_t now = ceph_clock_now();
  bool time_permit = scrub_time_permit(now);
  bool load_is_low = scrub_load_below_threshold();
  dout(20) << "sched_scrub load_is_low=" << (int)load_is_low << dendl;

  OSDService::ScrubJob scrub;
  if (service.first_scrub_stamp(&scrub)) {
    do {
      dout(30) << "sched_scrub examine " << scrub.pgid << " at " << scrub.sched_time << dendl;

      if (scrub.sched_time > now) {
	// save ourselves some effort
	dout(10) << "sched_scrub " << scrub.pgid << " scheduled at " << scrub.sched_time
		 << " > " << now << dendl;
	break;
      }

      // before the deadline, the time window and load must both permit;
      // past the deadline the scrub goes ahead regardless
      if ((scrub.deadline.is_zero() || scrub.deadline >= now) && !(time_permit && load_is_low)) {
	dout(10) << __func__ << " not scheduling scrub for " << scrub.pgid << " due to "
		 << (!time_permit ? "time not permit" : "high load") << dendl;
	continue;
      }

      // NOTE: returns the pg locked; every path below must unlock it
      PGRef pg = _lookup_lock_pg(scrub.pgid);
      if (!pg)
	continue;
      // This has already started, so go on to the next scrub job
      if (pg->scrubber.active) {
	pg->unlock();
	dout(30) << __func__ << ": already in progress pgid " << scrub.pgid << dendl;
	continue;
      }
      // Skip other kinds of scrubing if only explicitly requested repairing is allowed
      if (allow_requested_repair_only && !pg->scrubber.must_repair) {
	pg->unlock();
	dout(10) << __func__ << " skip " << scrub.pgid
		 << " because repairing is not explicitly requested on it"
		 << dendl;
	continue;
      }
      // If it is reserving, let it resolve before going to the next scrub job
      if (pg->scrubber.local_reserved && !pg->scrubber.active) {
	pg->unlock();
	dout(30) << __func__ << ": reserve in progress pgid " << scrub.pgid << dendl;
	break;
      }
      dout(10) << "sched_scrub scrubbing " << scrub.pgid << " at " << scrub.sched_time
	       << (pg->get_must_scrub() ? ", explicitly requested" :
		   (load_is_low ? ", load_is_low" : " deadline < now"))
	       << dendl;
      // sched_scrub() returning true means the scrub was kicked off;
      // stop scanning — we only start one per pass
      if (pg->sched_scrub()) {
	pg->unlock();
	break;
      }
      pg->unlock();
    } while (service.next_scrub_stamp(scrub, &scrub));
  }
  dout(20) << "sched_scrub done" << dendl;
}
8007
494da23a
TL
// Recompute the scrub schedule for every pg that is not already pinned by
// an explicit/auto scrub request (e.g. after a scrub-interval config change).
void OSD::resched_all_scrubs()
{
  dout(10) << __func__ << ": start" << dendl;
  OSDService::ScrubJob scrub;
  if (service.first_scrub_stamp(&scrub)) {
    do {
      dout(20) << __func__ << ": examine " << scrub.pgid << dendl;

      // returns the pg locked; unlock on every path below
      PGRef pg = _lookup_lock_pg(scrub.pgid);
      if (!pg)
	continue;
      if (!pg->scrubber.must_scrub && !pg->scrubber.need_auto) {
	dout(20) << __func__ << ": reschedule " << scrub.pgid << dendl;
	// triggers re-registration of the pg's scrub job
	pg->on_info_history_change();
      }
      pg->unlock();
    } while (service.next_scrub_stamp(scrub, &scrub));
  }
  dout(10) << __func__ << ": done" << dendl;
}
8028
11fdf7f2
TL
8029MPGStats* OSD::collect_pg_stats()
8030{
8031 // This implementation unconditionally sends every is_primary PG's
8032 // stats every time we're called. This has equivalent cost to the
8033 // previous implementation's worst case where all PGs are busy and
8034 // their stats are always enqueued for sending.
8035 RWLock::RLocker l(map_lock);
8036
8037 utime_t had_for = ceph_clock_now() - had_map_since;
8038 osd_stat_t cur_stat = service.get_osd_stat();
8039 cur_stat.os_perf_stat = store->get_cur_stats();
8040
8041 auto m = new MPGStats(monc->get_fsid(), osdmap->get_epoch(), had_for);
8042 m->osd_stat = cur_stat;
8043
8044 std::lock_guard lec{min_last_epoch_clean_lock};
8045 min_last_epoch_clean = osdmap->get_epoch();
8046 min_last_epoch_clean_pgs.clear();
8047
8048 std::set<int64_t> pool_set;
8049 vector<PGRef> pgs;
8050 _get_pgs(&pgs);
8051 for (auto& pg : pgs) {
8052 auto pool = pg->pg_id.pgid.pool();
8053 pool_set.emplace((int64_t)pool);
8054 if (!pg->is_primary()) {
8055 continue;
8056 }
8057 pg->get_pg_stats([&](const pg_stat_t& s, epoch_t lec) {
8058 m->pg_stat[pg->pg_id.pgid] = s;
8059 min_last_epoch_clean = min(min_last_epoch_clean, lec);
8060 min_last_epoch_clean_pgs.push_back(pg->pg_id.pgid);
8061 });
8062 }
8063 store_statfs_t st;
81eedcae 8064 bool per_pool_stats = false;
11fdf7f2
TL
8065 for (auto p : pool_set) {
8066 int r = store->pool_statfs(p, &st);
8067 if (r == -ENOTSUP) {
8068 break;
8069 } else {
8070 assert(r >= 0);
8071 m->pool_stat[p] = st;
81eedcae 8072 per_pool_stats = true;
11fdf7f2
TL
8073 }
8074 }
7c673cae 8075
81eedcae
TL
8076 // indicate whether we are reporting per-pool stats
8077 m->osd_stat.num_osds = 1;
8078 m->osd_stat.num_per_pool_osds = per_pool_stats ? 1 : 0;
8079
11fdf7f2
TL
8080 return m;
8081}
7c673cae 8082
// Collect the daemon health metrics reported to the mgr: the count/age of
// slow ops (older than osd_op_complaint_time) and the number of pending
// primary pg creations.
vector<DaemonHealthMetric> OSD::get_health_metrics()
{
  vector<DaemonHealthMetric> metrics;
  {
    utime_t oldest_secs;
    const utime_t now = ceph_clock_now();
    auto too_old = now;
    too_old -= cct->_conf.get_val<double>("osd_op_complaint_time");
    int slow = 0;
    TrackedOpRef oldest_op;
    // visitor: count ops started before the complaint cutoff and remember
    // the oldest one; return value tells the tracker whether to keep going
    auto count_slow_ops = [&](TrackedOp& op) {
      if (op.get_initiated() < too_old) {
	lgeneric_subdout(cct,osd,20) << "slow op " << op.get_desc()
				     << " initiated "
				     << op.get_initiated() << dendl;
	slow++;
	if (!oldest_op || op.get_initiated() < oldest_op->get_initiated()) {
	  oldest_op = &op;
	}
	return true;
      } else {
	return false;
      }
    };
    if (op_tracker.visit_ops_in_flight(&oldest_secs, count_slow_ops)) {
      if (slow) {
	derr << __func__ << " reporting " << slow << " slow ops, oldest is "
	     << oldest_op->get_desc() << dendl;
      }
      metrics.emplace_back(daemon_metric::SLOW_OPS, slow, oldest_secs);
    } else {
      // no news is not good news.
      metrics.emplace_back(daemon_metric::SLOW_OPS, 0, 0);
    }
  }
  {
    std::lock_guard l(pending_creates_lock);
    auto n_primaries = pending_creates_from_mon;
    // pending_creates_from_osd values flag whether we'd be the primary
    for (const auto& create : pending_creates_from_osd) {
      if (create.second) {
	n_primaries++;
      }
    }
    metrics.emplace_back(daemon_metric::PENDING_CREATING_PGS, n_primaries);
  }
  return metrics;
}
8130
7c673cae
FG
8131// =====================================================
8132// MAP
8133
8134void OSD::wait_for_new_map(OpRequestRef op)
8135{
8136 // ask?
8137 if (waiting_for_osdmap.empty()) {
8138 osdmap_subscribe(osdmap->get_epoch() + 1, false);
8139 }
8140
8141 logger->inc(l_osd_waiting_for_map);
8142 waiting_for_osdmap.push_back(op);
8143 op->mark_delayed("wait for new map");
8144}
8145
8146
8147/** update_map
8148 * assimilate new OSDMap(s). scan pgs, etc.
8149 */
8150
// React to `peer` going down in the osdmap: drop its cluster connections
// and purge it from the heartbeat/failure-tracking state.
void OSD::note_down_osd(int peer)
{
  ceph_assert(osd_lock.is_locked());
  cluster_messenger->mark_down_addrs(osdmap->get_cluster_addrs(peer));

  // heartbeat state is protected by its own lock
  heartbeat_lock.Lock();
  failure_queue.erase(peer);
  failure_pending.erase(peer);
  map<int,HeartbeatInfo>::iterator p = heartbeat_peers.find(peer);
  if (p != heartbeat_peers.end()) {
    // tear down both heartbeat connections (front is optional)
    p->second.con_back->mark_down();
    if (p->second.con_front) {
      p->second.con_front->mark_down();
    }
    heartbeat_peers.erase(p);
  }
  heartbeat_lock.Unlock();
}
8169
// React to `peer` coming (back) up: forget our cached notion of the epoch
// we last shared with it and schedule a refresh of the heartbeat peer set.
void OSD::note_up_osd(int peer)
{
  service.forget_peer_epoch(peer, osdmap->get_epoch() - 1);
  heartbeat_set_peers_need_update();
}
8175
// Completion context fired once osdmap epochs [first, last] are committed
// to the objectstore; forwards to OSD::_committed_osd_maps and then drops
// the ref this context held on the MOSDMap message.
struct C_OnMapCommit : public Context {
  OSD *osd;
  epoch_t first, last;   // inclusive range of epochs just persisted
  MOSDMap *msg;          // message that delivered the maps (ref held)
  C_OnMapCommit(OSD *o, epoch_t f, epoch_t l, MOSDMap *m)
    : osd(o), first(f), last(l), msg(m) {}
  void finish(int r) override {
    osd->_committed_osd_maps(first, last, msg);
    msg->put();
  }
};
8187
7c673cae
FG
8188void OSD::osdmap_subscribe(version_t epoch, bool force_request)
8189{
11fdf7f2 8190 std::lock_guard l(osdmap_subscribe_lock);
181888fb 8191 if (latest_subscribed_epoch >= epoch && !force_request)
7c673cae
FG
8192 return;
8193
11fdf7f2 8194 latest_subscribed_epoch = std::max<uint64_t>(epoch, latest_subscribed_epoch);
181888fb 8195
7c673cae
FG
8196 if (monc->sub_want_increment("osdmap", epoch, CEPH_SUBSCRIBE_ONETIME) ||
8197 force_request) {
8198 monc->renew_subs();
8199 }
8200}
8201
// Remove stored osdmaps (full + incremental) older than both `oldest` and
// the map cache's lower bound, advancing superblock.oldest_map.  Deletions
// are batched into transactions of at most osd_target_transaction_size ops.
void OSD::trim_maps(epoch_t oldest, int nreceived, bool skip_maps)
{
  // never trim past what the map cache still pins
  epoch_t min = std::min(oldest, service.map_cache.cached_key_lower_bound());
  if (min <= superblock.oldest_map)
    return;

  int num = 0;
  ObjectStore::Transaction t;
  for (epoch_t e = superblock.oldest_map; e < min; ++e) {
    dout(20) << " removing old osdmap epoch " << e << dendl;
    t.remove(coll_t::meta(), get_osdmap_pobject_name(e));
    t.remove(coll_t::meta(), get_inc_osdmap_pobject_name(e));
    superblock.oldest_map = e + 1;
    num++;
    // flush a batch once it is large enough (relative to the incoming batch)
    if (num >= cct->_conf->osd_target_transaction_size && num >= nreceived) {
      service.publish_superblock(superblock);
      write_superblock(t);
      int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
      ceph_assert(tr == 0);
      num = 0;
      if (!skip_maps) {
	// skip_maps leaves us with a range of old maps if we fail to remove all
	// of them before moving superblock.oldest_map forward to the first map
	// in the incoming MOSDMap msg. so we should continue removing them in
	// this case, even we could do huge series of delete transactions all at
	// once.
	break;
      }
    }
  }
  // flush any remaining partial batch
  if (num > 0) {
    service.publish_superblock(superblock);
    write_superblock(t);
    int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
    ceph_assert(tr == 0);
  }
  // we should not remove the cached maps
  ceph_assert(min <= service.map_cache.cached_key_lower_bound());
}
8241
8242void OSD::handle_osd_map(MOSDMap *m)
8243{
11fdf7f2
TL
8244 // wait for pgs to catch up
8245 {
8246 // we extend the map cache pins to accomodate pgs slow to consume maps
8247 // for some period, until we hit the max_lag_factor bound, at which point
8248 // we block here to stop injesting more maps than they are able to keep
8249 // up with.
8250 epoch_t max_lag = cct->_conf->osd_map_cache_size *
8251 m_osd_pg_epoch_max_lag_factor;
8252 ceph_assert(max_lag > 0);
8253 epoch_t osd_min = 0;
8254 for (auto shard : shards) {
8255 epoch_t min = shard->get_min_pg_epoch();
8256 if (osd_min == 0 || min < osd_min) {
8257 osd_min = min;
8258 }
8259 }
8260 if (osd_min > 0 &&
8261 osdmap->get_epoch() > max_lag &&
8262 osdmap->get_epoch() - max_lag > osd_min) {
8263 epoch_t need = osdmap->get_epoch() - max_lag;
8264 dout(10) << __func__ << " waiting for pgs to catch up (need " << need
8265 << " max_lag " << max_lag << ")" << dendl;
8266 for (auto shard : shards) {
8267 epoch_t min = shard->get_min_pg_epoch();
8268 if (need > min) {
8269 dout(10) << __func__ << " waiting for pgs to consume " << need
8270 << " (shard " << shard->shard_id << " min " << min
8271 << ", map cache is " << cct->_conf->osd_map_cache_size
8272 << ", max_lag_factor " << m_osd_pg_epoch_max_lag_factor
8273 << ")" << dendl;
8274 unlock_guard unlock{osd_lock};
8275 shard->wait_min_pg_epoch(need);
8276 }
8277 }
8278 }
8279 }
8280
8281 ceph_assert(osd_lock.is_locked());
8282 map<epoch_t,OSDMapRef> added_maps;
8283 map<epoch_t,bufferlist> added_maps_bl;
7c673cae
FG
8284 if (m->fsid != monc->get_fsid()) {
8285 dout(0) << "handle_osd_map fsid " << m->fsid << " != "
8286 << monc->get_fsid() << dendl;
8287 m->put();
8288 return;
8289 }
8290 if (is_initializing()) {
8291 dout(0) << "ignoring osdmap until we have initialized" << dendl;
8292 m->put();
8293 return;
8294 }
8295
11fdf7f2
TL
8296 auto priv = m->get_connection()->get_priv();
8297 if (auto session = static_cast<Session *>(priv.get());
8298 session && !(session->entity_name.is_mon() ||
7c673cae
FG
8299 session->entity_name.is_osd())) {
8300 //not enough perms!
8301 dout(10) << "got osd map from Session " << session
8302 << " which we can't take maps from (not a mon or osd)" << dendl;
8303 m->put();
7c673cae
FG
8304 return;
8305 }
7c673cae
FG
8306
8307 // share with the objecter
8308 if (!is_preboot())
8309 service.objecter->handle_osd_map(m);
8310
8311 epoch_t first = m->get_first();
8312 epoch_t last = m->get_last();
8313 dout(3) << "handle_osd_map epochs [" << first << "," << last << "], i have "
8314 << superblock.newest_map
8315 << ", src has [" << m->oldest_map << "," << m->newest_map << "]"
8316 << dendl;
8317
8318 logger->inc(l_osd_map);
8319 logger->inc(l_osd_mape, last - first + 1);
8320 if (first <= superblock.newest_map)
8321 logger->inc(l_osd_mape_dup, superblock.newest_map - first + 1);
8322 if (service.max_oldest_map < m->oldest_map) {
8323 service.max_oldest_map = m->oldest_map;
11fdf7f2 8324 ceph_assert(service.max_oldest_map >= superblock.oldest_map);
7c673cae
FG
8325 }
8326
8327 // make sure there is something new, here, before we bother flushing
8328 // the queues and such
8329 if (last <= superblock.newest_map) {
8330 dout(10) << " no new maps here, dropping" << dendl;
8331 m->put();
8332 return;
8333 }
8334
8335 // missing some?
8336 bool skip_maps = false;
8337 if (first > superblock.newest_map + 1) {
8338 dout(10) << "handle_osd_map message skips epochs "
8339 << superblock.newest_map + 1 << ".." << (first-1) << dendl;
8340 if (m->oldest_map <= superblock.newest_map + 1) {
8341 osdmap_subscribe(superblock.newest_map + 1, false);
8342 m->put();
8343 return;
8344 }
8345 // always try to get the full range of maps--as many as we can. this
8346 // 1- is good to have
8347 // 2- is at present the only way to ensure that we get a *full* map as
8348 // the first map!
8349 if (m->oldest_map < first) {
8350 osdmap_subscribe(m->oldest_map - 1, true);
8351 m->put();
8352 return;
8353 }
8354 skip_maps = true;
8355 }
8356
8357 ObjectStore::Transaction t;
8358 uint64_t txn_size = 0;
8359
8360 // store new maps: queue for disk and put in the osdmap cache
11fdf7f2 8361 epoch_t start = std::max(superblock.newest_map + 1, first);
7c673cae
FG
8362 for (epoch_t e = start; e <= last; e++) {
8363 if (txn_size >= t.get_num_bytes()) {
8364 derr << __func__ << " transaction size overflowed" << dendl;
11fdf7f2 8365 ceph_assert(txn_size < t.get_num_bytes());
7c673cae
FG
8366 }
8367 txn_size = t.get_num_bytes();
8368 map<epoch_t,bufferlist>::iterator p;
8369 p = m->maps.find(e);
8370 if (p != m->maps.end()) {
8371 dout(10) << "handle_osd_map got full map for epoch " << e << dendl;
8372 OSDMap *o = new OSDMap;
8373 bufferlist& bl = p->second;
8374
8375 o->decode(bl);
8376
8377 ghobject_t fulloid = get_osdmap_pobject_name(e);
8378 t.write(coll_t::meta(), fulloid, 0, bl.length(), bl);
11fdf7f2
TL
8379 added_maps[e] = add_map(o);
8380 added_maps_bl[e] = bl;
7c673cae
FG
8381 got_full_map(e);
8382 continue;
8383 }
8384
8385 p = m->incremental_maps.find(e);
8386 if (p != m->incremental_maps.end()) {
8387 dout(10) << "handle_osd_map got inc map for epoch " << e << dendl;
8388 bufferlist& bl = p->second;
8389 ghobject_t oid = get_inc_osdmap_pobject_name(e);
8390 t.write(coll_t::meta(), oid, 0, bl.length(), bl);
7c673cae
FG
8391
8392 OSDMap *o = new OSDMap;
8393 if (e > 1) {
8394 bufferlist obl;
8395 bool got = get_map_bl(e - 1, obl);
11fdf7f2
TL
8396 if (!got) {
8397 auto p = added_maps_bl.find(e - 1);
8398 ceph_assert(p != added_maps_bl.end());
8399 obl = p->second;
8400 }
7c673cae
FG
8401 o->decode(obl);
8402 }
8403
8404 OSDMap::Incremental inc;
11fdf7f2 8405 auto p = bl.cbegin();
7c673cae 8406 inc.decode(p);
494da23a 8407
7c673cae
FG
8408 if (o->apply_incremental(inc) < 0) {
8409 derr << "ERROR: bad fsid? i have " << osdmap->get_fsid() << " and inc has " << inc.fsid << dendl;
11fdf7f2 8410 ceph_abort_msg("bad fsid");
7c673cae
FG
8411 }
8412
8413 bufferlist fbl;
8414 o->encode(fbl, inc.encode_features | CEPH_FEATURE_RESERVED);
8415
8416 bool injected_failure = false;
8417 if (cct->_conf->osd_inject_bad_map_crc_probability > 0 &&
8418 (rand() % 10000) < cct->_conf->osd_inject_bad_map_crc_probability*10000.0) {
8419 derr << __func__ << " injecting map crc failure" << dendl;
8420 injected_failure = true;
8421 }
8422
8423 if ((inc.have_crc && o->get_crc() != inc.full_crc) || injected_failure) {
8424 dout(2) << "got incremental " << e
8425 << " but failed to encode full with correct crc; requesting"
8426 << dendl;
8427 clog->warn() << "failed to encode map e" << e << " with expected crc";
8428 dout(20) << "my encoded map was:\n";
8429 fbl.hexdump(*_dout);
8430 *_dout << dendl;
8431 delete o;
8432 request_full_map(e, last);
8433 last = e - 1;
8434 break;
8435 }
8436 got_full_map(e);
8437
8438 ghobject_t fulloid = get_osdmap_pobject_name(e);
8439 t.write(coll_t::meta(), fulloid, 0, fbl.length(), fbl);
11fdf7f2
TL
8440 added_maps[e] = add_map(o);
8441 added_maps_bl[e] = fbl;
7c673cae
FG
8442 continue;
8443 }
8444
11fdf7f2 8445 ceph_abort_msg("MOSDMap lied about what maps it had?");
7c673cae
FG
8446 }
8447
8448 // even if this map isn't from a mon, we may have satisfied our subscription
8449 monc->sub_got("osdmap", last);
8450
8451 if (!m->maps.empty() && requested_full_first) {
8452 dout(10) << __func__ << " still missing full maps " << requested_full_first
8453 << ".." << requested_full_last << dendl;
8454 rerequest_full_maps();
8455 }
8456
7c673cae
FG
8457 if (superblock.oldest_map) {
8458 // make sure we at least keep pace with incoming maps
8459 trim_maps(m->oldest_map, last - first + 1, skip_maps);
11fdf7f2 8460 pg_num_history.prune(superblock.oldest_map);
7c673cae
FG
8461 }
8462
8463 if (!superblock.oldest_map || skip_maps)
8464 superblock.oldest_map = first;
8465 superblock.newest_map = last;
8466 superblock.current_epoch = last;
8467
8468 // note in the superblock that we were clean thru the prior epoch
8469 epoch_t boot_epoch = service.get_boot_epoch();
8470 if (boot_epoch && boot_epoch >= superblock.mounted) {
8471 superblock.mounted = boot_epoch;
8472 superblock.clean_thru = last;
8473 }
8474
11fdf7f2
TL
8475 // check for pg_num changes and deleted pools
8476 OSDMapRef lastmap;
8477 for (auto& i : added_maps) {
8478 if (!lastmap) {
8479 if (!(lastmap = service.try_get_map(i.first - 1))) {
8480 dout(10) << __func__ << " can't get previous map " << i.first - 1
8481 << " probably first start of this osd" << dendl;
8482 continue;
8483 }
8484 }
8485 ceph_assert(lastmap->get_epoch() + 1 == i.second->get_epoch());
8486 for (auto& j : lastmap->get_pools()) {
8487 if (!i.second->have_pg_pool(j.first)) {
8488 pg_num_history.log_pool_delete(i.first, j.first);
8489 dout(10) << __func__ << " recording final pg_pool_t for pool "
8490 << j.first << dendl;
8491 // this information is needed by _make_pg() if have to restart before
8492 // the pool is deleted and need to instantiate a new (zombie) PG[Pool].
8493 ghobject_t obj = make_final_pool_info_oid(j.first);
8494 bufferlist bl;
8495 encode(j.second, bl, CEPH_FEATURES_ALL);
8496 string name = lastmap->get_pool_name(j.first);
8497 encode(name, bl);
8498 map<string,string> profile;
8499 if (lastmap->get_pg_pool(j.first)->is_erasure()) {
8500 profile = lastmap->get_erasure_code_profile(
8501 lastmap->get_pg_pool(j.first)->erasure_code_profile);
8502 }
8503 encode(profile, bl);
8504 t.write(coll_t::meta(), obj, 0, bl.length(), bl);
8505 service.store_deleted_pool_pg_num(j.first, j.second.get_pg_num());
8506 } else if (unsigned new_pg_num = i.second->get_pg_num(j.first);
8507 new_pg_num != j.second.get_pg_num()) {
8508 dout(10) << __func__ << " recording pool " << j.first << " pg_num "
8509 << j.second.get_pg_num() << " -> " << new_pg_num << dendl;
8510 pg_num_history.log_pg_num_change(i.first, j.first, new_pg_num);
8511 }
8512 }
8513 for (auto& j : i.second->get_pools()) {
8514 if (!lastmap->have_pg_pool(j.first)) {
8515 dout(10) << __func__ << " recording new pool " << j.first << " pg_num "
8516 << j.second.get_pg_num() << dendl;
8517 pg_num_history.log_pg_num_change(i.first, j.first,
8518 j.second.get_pg_num());
8519 }
8520 }
8521 lastmap = i.second;
8522 }
8523 pg_num_history.epoch = last;
8524 {
8525 bufferlist bl;
8526 ::encode(pg_num_history, bl);
8527 t.write(coll_t::meta(), make_pg_num_history_oid(), 0, bl.length(), bl);
8528 dout(20) << __func__ << " pg_num_history " << pg_num_history << dendl;
8529 }
8530
7c673cae
FG
8531 // superblock and commit
8532 write_superblock(t);
11fdf7f2 8533 t.register_on_commit(new C_OnMapCommit(this, start, last, m));
7c673cae 8534 store->queue_transaction(
11fdf7f2
TL
8535 service.meta_ch,
8536 std::move(t));
7c673cae
FG
8537 service.publish_superblock(superblock);
8538}
8539
/**
 * Transaction-commit callback for osdmap epochs [first, last] that
 * handle_osd_map() queued to disk (via C_OnMapCommit).  Now that the
 * maps are durable, advance the in-memory published map one epoch at a
 * time, note peers going up/down, and react to what the map says about
 * this OSD itself (booted, marked down, wrong addresses, nonexistent).
 *
 * Runs under osd_lock; holds map_lock for write while the published
 * map is advanced.  Shutdown and restart are only flagged here and
 * acted on at the end (or via an async signal), since much of the
 * system is paused while this runs.
 */
void OSD::_committed_osd_maps(epoch_t first, epoch_t last, MOSDMap *m)
{
  dout(10) << __func__ << " " << first << ".." << last << dendl;
  if (is_stopping()) {
    dout(10) << __func__ << " bailing, we are shutting down" << dendl;
    return;
  }
  std::lock_guard l(osd_lock);
  // re-check under osd_lock: shutdown may have started while we waited
  if (is_stopping()) {
    dout(10) << __func__ << " bailing, we are shutting down" << dendl;
    return;
  }
  map_lock.get_write();

  bool do_shutdown = false;
  bool do_restart = false;
  bool network_error = false;

  // advance through the new maps
  for (epoch_t cur = first; cur <= last; cur++) {
    dout(10) << " advance to epoch " << cur
             << " (<= last " << last
             << " <= newest_map " << superblock.newest_map
             << ")" << dendl;

    OSDMapRef newmap = get_map(cur);
    ceph_assert(newmap);  // we just cached it above!

    // start blacklisting messages sent to peers that go down.
    service.pre_publish_map(newmap);

    // kill connections to newly down osds
    bool waited_for_reservations = false;
    set<int> old;
    osdmap->get_all_osds(old);
    for (set<int>::iterator p = old.begin(); p != old.end(); ++p) {
      if (*p != whoami &&
          osdmap->is_up(*p) &&   // in old map
          newmap->is_down(*p)) { // but not the new one
        if (!waited_for_reservations) {
          // only pay the wait cost once, on the first down transition
          service.await_reserved_maps();
          waited_for_reservations = true;
        }
        note_down_osd(*p);
      } else if (*p != whoami &&
                 osdmap->is_down(*p) &&
                 newmap->is_up(*p)) {
        note_up_osd(*p);
      }
    }

    if (osdmap->is_noup(whoami) != newmap->is_noup(whoami)) {
      dout(10) << __func__ << " NOUP flag changed in " << newmap->get_epoch()
               << dendl;
      if (is_booting()) {
        // this captures the case where we sent the boot message while
        // NOUP was being set on the mon and our boot request was
        // dropped, and then later it is cleared.  it imperfectly
        // handles the case where our original boot message was not
        // dropped and we restart even though we might have booted, but
        // that is harmless (boot will just take slightly longer).
        do_restart = true;
      }
    }

    osdmap = newmap;
    epoch_t up_epoch;
    epoch_t boot_epoch;
    service.retrieve_epochs(&boot_epoch, &up_epoch, NULL);
    // first epoch in which the map shows us up at our current address:
    // record it (and the boot epoch, if unset) in the service
    if (!up_epoch &&
        osdmap->is_up(whoami) &&
        osdmap->get_addrs(whoami) == client_messenger->get_myaddrs()) {
      up_epoch = osdmap->get_epoch();
      dout(10) << "up_epoch is " << up_epoch << dendl;
      if (!boot_epoch) {
        boot_epoch = osdmap->get_epoch();
        dout(10) << "boot_epoch is " << boot_epoch << dendl;
      }
      service.set_epochs(&boot_epoch, &up_epoch, NULL);
    }
  }

  had_map_since = ceph_clock_now();

  // booting -> active once the map shows us up at our address for an
  // interval newer than the one we last bound our sockets in
  epoch_t _bind_epoch = service.get_bind_epoch();
  if (osdmap->is_up(whoami) &&
      osdmap->get_addrs(whoami).legacy_equals(
        client_messenger->get_myaddrs()) &&
      _bind_epoch < osdmap->get_up_from(whoami)) {

    if (is_booting()) {
      dout(1) << "state: booting -> active" << dendl;
      set_state(STATE_ACTIVE);
      do_restart = false;

      // set incarnation so that osd_reqid_t's we generate for our
      // objecter requests are unique across restarts.
      service.objecter->set_client_incarnation(osdmap->get_epoch());
      cancel_pending_failures();
    }
  }

  if (osdmap->get_epoch() > 0 &&
      is_active()) {
    if (!osdmap->exists(whoami)) {
      dout(0) << "map says i do not exist.  shutting down." << dendl;
      do_shutdown = true;   // don't call shutdown() while we have
                            // everything paused
    } else if (!osdmap->is_up(whoami) ||
               !osdmap->get_addrs(whoami).legacy_equals(
                 client_messenger->get_myaddrs()) ||
               !osdmap->get_cluster_addrs(whoami).legacy_equals(
                 cluster_messenger->get_myaddrs()) ||
               !osdmap->get_hb_back_addrs(whoami).legacy_equals(
                 hb_back_server_messenger->get_myaddrs()) ||
               !osdmap->get_hb_front_addrs(whoami).legacy_equals(
                 hb_front_server_messenger->get_myaddrs())) {
      // we are marked down, or the map disagrees with one of our bound
      // addresses; log which mismatch it was, then rebind and restart
      if (!osdmap->is_up(whoami)) {
        if (service.is_preparing_to_stop() || service.is_stopping()) {
          service.got_stop_ack();
        } else {
          clog->warn() << "Monitor daemon marked osd." << whoami << " down, "
                          "but it is still running";
          clog->debug() << "map e" << osdmap->get_epoch()
                        << " wrongly marked me down at e"
                        << osdmap->get_down_at(whoami);
        }
      } else if (!osdmap->get_addrs(whoami).legacy_equals(
                   client_messenger->get_myaddrs())) {
        clog->error() << "map e" << osdmap->get_epoch()
                      << " had wrong client addr (" << osdmap->get_addrs(whoami)
                      << " != my " << client_messenger->get_myaddrs() << ")";
      } else if (!osdmap->get_cluster_addrs(whoami).legacy_equals(
                   cluster_messenger->get_myaddrs())) {
        clog->error() << "map e" << osdmap->get_epoch()
                      << " had wrong cluster addr ("
                      << osdmap->get_cluster_addrs(whoami)
                      << " != my " << cluster_messenger->get_myaddrs() << ")";
      } else if (!osdmap->get_hb_back_addrs(whoami).legacy_equals(
                   hb_back_server_messenger->get_myaddrs())) {
        clog->error() << "map e" << osdmap->get_epoch()
                      << " had wrong heartbeat back addr ("
                      << osdmap->get_hb_back_addrs(whoami)
                      << " != my " << hb_back_server_messenger->get_myaddrs()
                      << ")";
      } else if (!osdmap->get_hb_front_addrs(whoami).legacy_equals(
                   hb_front_server_messenger->get_myaddrs())) {
        clog->error() << "map e" << osdmap->get_epoch()
                      << " had wrong heartbeat front addr ("
                      << osdmap->get_hb_front_addrs(whoami)
                      << " != my " << hb_front_server_messenger->get_myaddrs()
                      << ")";
      }

      if (!service.is_stopping()) {
        epoch_t up_epoch = 0;
        epoch_t bind_epoch = osdmap->get_epoch();
        service.set_epochs(NULL, &up_epoch, &bind_epoch);
        do_restart = true;

        //add markdown log
        utime_t now = ceph_clock_now();
        utime_t grace = utime_t(cct->_conf->osd_max_markdown_period, 0);
        osd_markdown_log.push_back(now);
        //clear all out-of-date log
        while (!osd_markdown_log.empty() &&
               osd_markdown_log.front() + grace < now)
          osd_markdown_log.pop_front();
        // too many markdowns within the grace period: give up and shut
        // down instead of flapping
        if ((int)osd_markdown_log.size() > cct->_conf->osd_max_markdown_count) {
          dout(0) << __func__ << " marked down "
                  << osd_markdown_log.size()
                  << " > osd_max_markdown_count "
                  << cct->_conf->osd_max_markdown_count
                  << " in last " << grace << " seconds, shutting down"
                  << dendl;
          do_restart = false;
          do_shutdown = true;
        }

        start_waiting_for_healthy();

        set<int> avoid_ports;
#if defined(__FreeBSD__)
        // prevent FreeBSD from grabbing the client_messenger port during
        // rebinding. In which case a cluster_messenger will connect also
        // to the same port
        client_messenger->get_myaddrs().get_ports(&avoid_ports);
#endif
        cluster_messenger->get_myaddrs().get_ports(&avoid_ports);
        hb_back_server_messenger->get_myaddrs().get_ports(&avoid_ports);
        hb_front_server_messenger->get_myaddrs().get_ports(&avoid_ports);

        int r = cluster_messenger->rebind(avoid_ports);
        if (r != 0) {
          do_shutdown = true;  // FIXME: do_restart?
          network_error = true;
          dout(0) << __func__ << " marked down:"
                  << " rebind cluster_messenger failed" << dendl;
        }

        r = hb_back_server_messenger->rebind(avoid_ports);
        if (r != 0) {
          do_shutdown = true;  // FIXME: do_restart?
          network_error = true;
          dout(0) << __func__ << " marked down:"
                  << " rebind hb_back_server_messenger failed" << dendl;
        }

        r = hb_front_server_messenger->rebind(avoid_ports);
        if (r != 0) {
          do_shutdown = true;  // FIXME: do_restart?
          network_error = true;
          dout(0) << __func__ << " marked down:"
                  << " rebind hb_front_server_messenger failed" << dendl;
        }

        hb_front_client_messenger->mark_down_all();
        hb_back_client_messenger->mark_down_all();

        reset_heartbeat_peers(true);
      }
    }
  }

  map_lock.put_write();

  check_osdmap_features();

  // yay!
  consume_map();

  if (is_active() || is_waiting_for_healthy())
    maybe_update_heartbeat_peers();

  if (is_active()) {
    activate_map();
  }

  if (do_shutdown) {
    if (network_error) {
      cancel_pending_failures();
    }
    // trigger shutdown in a different thread
    dout(0) << __func__ << " shutdown OSD via async signal" << dendl;
    queue_async_signal(SIGINT);
  }
  else if (m->newest_map && m->newest_map > last) {
    dout(10) << " msg say newest map is " << m->newest_map
             << ", requesting more" << dendl;
    osdmap_subscribe(osdmap->get_epoch()+1, false);
  }
  else if (is_preboot()) {
    if (m->get_source().is_mon())
      _preboot(m->oldest_map, m->newest_map);
    else
      start_boot();
  }
  else if (do_restart)
    start_boot();

}
8801
/**
 * After an osdmap change, recompute the feature bits we require of
 * connecting clients, mons and osds and install them in the messenger
 * policies, and persist feature-related on-disk state: the SHARDS
 * incompat bit in the superblock and the require_osd_release marker in
 * the store's meta.
 */
void OSD::check_osdmap_features()
{
  // adjust required feature bits?

  // we have to be a bit careful here, because we are accessing the
  // Policy structures without taking any lock.  in particular, only
  // modify integer values that can safely be read by a racing CPU.
  // since we are only accessing existing Policy structures a their
  // current memory location, and setting or clearing bits in integer
  // fields, and we are the only writer, this is not a problem.

  {
    // default policy covers clients
    Messenger::Policy p = client_messenger->get_default_policy();
    uint64_t mask;
    uint64_t features = osdmap->get_features(entity_name_t::TYPE_CLIENT, &mask);
    if ((p.features_required & mask) != features) {
      dout(0) << "crush map has features " << features
              << ", adjusting msgr requires for clients" << dendl;
      p.features_required = (p.features_required & ~mask) | features;
      client_messenger->set_default_policy(p);
    }
  }
  {
    Messenger::Policy p = client_messenger->get_policy(entity_name_t::TYPE_MON);
    uint64_t mask;
    uint64_t features = osdmap->get_features(entity_name_t::TYPE_MON, &mask);
    if ((p.features_required & mask) != features) {
      dout(0) << "crush map has features " << features
              << " was " << p.features_required
              << ", adjusting msgr requires for mons" << dendl;
      p.features_required = (p.features_required & ~mask) | features;
      client_messenger->set_policy(entity_name_t::TYPE_MON, p);
    }
  }
  {
    Messenger::Policy p = cluster_messenger->get_policy(entity_name_t::TYPE_OSD);
    uint64_t mask;
    uint64_t features = osdmap->get_features(entity_name_t::TYPE_OSD, &mask);

    if ((p.features_required & mask) != features) {
      dout(0) << "crush map has features " << features
              << ", adjusting msgr requires for osds" << dendl;
      p.features_required = (p.features_required & ~mask) | features;
      cluster_messenger->set_policy(entity_name_t::TYPE_OSD, p);
    }

    // one-way upgrade: once set, the SHARDS incompat bit is never
    // removed; persist it synchronously via a superblock transaction
    if (!superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_SHARDS)) {
      dout(0) << __func__ << " enabling on-disk ERASURE CODES compat feature" << dendl;
      superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
      ObjectStore::Transaction t;
      write_superblock(t);
      int err = store->queue_transaction(service.meta_ch, std::move(t), NULL);
      ceph_assert(err == 0);
    }
  }

  // relax the heartbeat authorizer requirement while the cluster is
  // not yet required to be all-nautilus
  if (osdmap->require_osd_release < CEPH_RELEASE_NAUTILUS) {
    heartbeat_dispatcher.ms_set_require_authorizer(false);
  }

  // mirror require_osd_release into the store's meta so it survives
  // restarts; only rewrite it when it actually changed
  if (osdmap->require_osd_release != last_require_osd_release) {
    dout(1) << __func__ << " require_osd_release " << last_require_osd_release
            << " -> " << to_string(osdmap->require_osd_release) << dendl;
    store->write_meta("require_osd_release",
                      stringify((int)osdmap->require_osd_release));
    last_require_osd_release = osdmap->require_osd_release;
  }
}
8870
11fdf7f2
TL
8871struct C_FinishSplits : public Context {
8872 OSD *osd;
8873 set<PGRef> pgs;
8874 C_FinishSplits(OSD *osd, const set<PGRef> &in)
8875 : osd(osd), pgs(in) {}
8876 void finish(int r) override {
8877 osd->_finish_splits(pgs);
8878 }
8879};
8880
8881void OSD::_finish_splits(set<PGRef>& pgs)
7c673cae 8882{
11fdf7f2
TL
8883 dout(10) << __func__ << " " << pgs << dendl;
8884 if (is_stopping())
8885 return;
8886 PG::RecoveryCtx rctx = create_context();
8887 for (set<PGRef>::iterator i = pgs.begin();
8888 i != pgs.end();
8889 ++i) {
8890 PG *pg = i->get();
7c673cae 8891
11fdf7f2
TL
8892 pg->lock();
8893 dout(10) << __func__ << " " << *pg << dendl;
8894 epoch_t e = pg->get_osdmap_epoch();
8895 pg->handle_initialize(&rctx);
8896 pg->queue_null(e, e);
8897 dispatch_context_transaction(rctx, pg);
8898 pg->unlock();
7c673cae 8899
11fdf7f2
TL
8900 unsigned shard_index = pg->pg_id.hash_to_shard(num_shards);
8901 shards[shard_index]->register_and_wake_split_child(pg);
7c673cae
FG
8902 }
8903
11fdf7f2
TL
8904 dispatch_context(rctx, 0, service.get_osdmap());
8905};
8906
8907bool OSD::add_merge_waiter(OSDMapRef nextmap, spg_t target, PGRef src,
8908 unsigned need)
8909{
8910 std::lock_guard l(merge_lock);
8911 auto& p = merge_waiters[nextmap->get_epoch()][target];
8912 p[src->pg_id] = src;
8913 dout(10) << __func__ << " added merge_waiter " << src->pg_id
8914 << " for " << target << ", have " << p.size() << "/" << need
8915 << dendl;
8916 return p.size() == need;
8917}
8918
/**
 * Walk a PG forward from its current map epoch up to osd_epoch, feeding
 * it every intervening OSDMap we have cached and handling any pool
 * pg_num changes (splits and merges) encountered along the way.
 *
 * Called with the PG locked.  Returns true when the PG has consumed all
 * maps and been activated.  Returns false — with the PG lock already
 * dropped — when the PG was dismantled as a merge source, or when it is
 * a merge target that must wait for its sources.  Any split children
 * created are handed back to the OSD via a C_FinishSplits completion
 * registered on the recovery-context transaction.
 */
bool OSD::advance_pg(
  epoch_t osd_epoch,
  PG *pg,
  ThreadPool::TPHandle &handle,
  PG::RecoveryCtx *rctx)
{
  if (osd_epoch <= pg->get_osdmap_epoch()) {
    return true;
  }
  ceph_assert(pg->is_locked());
  OSDMapRef lastmap = pg->get_osdmap();
  ceph_assert(lastmap->get_epoch() < osd_epoch);
  set<PGRef> new_pgs;  // any split children
  bool ret = true;

  unsigned old_pg_num = lastmap->have_pg_pool(pg->pg_id.pool()) ?
    lastmap->get_pg_num(pg->pg_id.pool()) : 0;
  for (epoch_t next_epoch = pg->get_osdmap_epoch() + 1;
       next_epoch <= osd_epoch;
       ++next_epoch) {
    OSDMapRef nextmap = service.try_get_map(next_epoch);
    if (!nextmap) {
      // map not cached; skip ahead to the next one we do have
      dout(20) << __func__ << " missing map " << next_epoch << dendl;
      continue;
    }

    unsigned new_pg_num =
      (old_pg_num && nextmap->have_pg_pool(pg->pg_id.pool())) ?
      nextmap->get_pg_num(pg->pg_id.pool()) : 0;
    if (old_pg_num && new_pg_num && old_pg_num != new_pg_num) {
      // check for merge
      if (nextmap->have_pg_pool(pg->pg_id.pool())) {
        spg_t parent;
        if (pg->pg_id.is_merge_source(
              old_pg_num,
              new_pg_num,
              &parent)) {
          // we are merge source
          PGRef spg = pg;  // carry a ref
          dout(1) << __func__ << " " << pg->pg_id
                  << " is merge source, target is " << parent
                  << dendl;
          pg->write_if_dirty(rctx);
          dispatch_context_transaction(*rctx, pg, &handle);
          pg->ch->flush();
          // release backoffs explicitly, since the on_shutdown path
          // aggressively tears down backoff state.
          if (pg->is_primary()) {
            pg->release_pg_backoffs();
          }
          pg->on_shutdown();
          OSDShard *sdata = pg->osd_shard;
          {
            std::lock_guard l(sdata->shard_lock);
            if (pg->pg_slot) {
              sdata->_detach_pg(pg->pg_slot);
              // update pg count now since we might not get an osdmap
              // any time soon.
              if (pg->is_primary())
                logger->dec(l_osd_pg_primary);
              else if (pg->is_replica())
                logger->dec(l_osd_pg_replica);
              else
                logger->dec(l_osd_pg_stray);
            }
          }
          pg->unlock();

          set<spg_t> children;
          parent.is_split(new_pg_num, old_pg_num, &children);
          // if we were the last source to arrive, wake the target
          if (add_merge_waiter(nextmap, parent, pg, children.size())) {
            enqueue_peering_evt(
              parent,
              PGPeeringEventRef(
                std::make_shared<PGPeeringEvent>(
                  nextmap->get_epoch(),
                  nextmap->get_epoch(),
                  NullEvt())));
          }
          ret = false;
          goto out;
        } else if (pg->pg_id.is_merge_target(old_pg_num, new_pg_num)) {
          // we are merge target
          set<spg_t> children;
          pg->pg_id.is_split(new_pg_num, old_pg_num, &children);
          dout(20) << __func__ << " " << pg->pg_id
                   << " is merge target, sources are " << children
                   << dendl;
          map<spg_t,PGRef> sources;
          {
            // claim the waiting sources (if complete) under merge_lock
            std::lock_guard l(merge_lock);
            auto& s = merge_waiters[nextmap->get_epoch()][pg->pg_id];
            unsigned need = children.size();
            dout(20) << __func__ << " have " << s.size() << "/"
                     << need << dendl;
            if (s.size() == need) {
              sources.swap(s);
              merge_waiters[nextmap->get_epoch()].erase(pg->pg_id);
              if (merge_waiters[nextmap->get_epoch()].empty()) {
                merge_waiters.erase(nextmap->get_epoch());
              }
            }
          }
          if (!sources.empty()) {
            unsigned new_pg_num = nextmap->get_pg_num(pg->pg_id.pool());
            unsigned split_bits = pg->pg_id.get_split_bits(new_pg_num);
            dout(1) << __func__ << " merging " << pg->pg_id << dendl;
            pg->merge_from(
              sources, rctx, split_bits,
              nextmap->get_pg_pool(
                pg->pg_id.pool())->last_pg_merge_meta);
            pg->pg_slot->waiting_for_merge_epoch = 0;
          } else {
            dout(20) << __func__ << " not ready to merge yet" << dendl;
            pg->write_if_dirty(rctx);
            pg->unlock();
            // kick source(s) to get them ready
            for (auto& i : children) {
              dout(20) << __func__ << " kicking source " << i << dendl;
              enqueue_peering_evt(
                i,
                PGPeeringEventRef(
                  std::make_shared<PGPeeringEvent>(
                    nextmap->get_epoch(),
                    nextmap->get_epoch(),
                    NullEvt())));
            }
            ret = false;
            goto out;
          }
        }
      }
    }

    vector<int> newup, newacting;
    int up_primary, acting_primary;
    nextmap->pg_to_up_acting_osds(
      pg->pg_id.pgid,
      &newup, &up_primary,
      &newacting, &acting_primary);
    pg->handle_advance_map(
      nextmap, lastmap, newup, up_primary,
      newacting, acting_primary, rctx);

    // notify the PG when the pool's scrub interval options changed
    auto oldpool = lastmap->get_pools().find(pg->pg_id.pool());
    auto newpool = nextmap->get_pools().find(pg->pg_id.pool());
    if (oldpool != lastmap->get_pools().end()
        && newpool != nextmap->get_pools().end()) {
      dout(20) << __func__
               << " new pool opts " << newpool->second.opts
               << " old pool opts " << oldpool->second.opts
               << dendl;

      double old_min_interval = 0, new_min_interval = 0;
      oldpool->second.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &old_min_interval);
      newpool->second.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &new_min_interval);

      double old_max_interval = 0, new_max_interval = 0;
      oldpool->second.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &old_max_interval);
      newpool->second.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &new_max_interval);

      // Assume if an interval is change from set to unset or vice versa the actual config
      // is different.  Keep it simple even if it is possible to call resched_all_scrub()
      // unnecessarily.
      if (old_min_interval != new_min_interval || old_max_interval != new_max_interval) {
        pg->on_info_history_change();
      }
    }

    if (new_pg_num && old_pg_num != new_pg_num) {
      // check for split
      set<spg_t> children;
      if (pg->pg_id.is_split(
            old_pg_num,
            new_pg_num,
            &children)) {
        split_pgs(
          pg, children, &new_pgs, lastmap, nextmap,
          rctx);
      }
    }

    lastmap = nextmap;
    old_pg_num = new_pg_num;
    handle.reset_tp_timeout();
  }
  pg->handle_activate_map(rctx);

  ret = true;
 out:
  if (!new_pgs.empty()) {
    rctx->transaction->register_on_applied(new C_FinishSplits(this, new_pgs));
  }
  return ret;
}
9114
/**
 * Propagate the freshly committed osdmap to the rest of the OSD:
 * publish it through OSDService, prime upcoming PG splits/merges on
 * each shard, drop pending PG creates that no longer map to us, wake
 * sessions waiting on this map, and queue a null peering event to every
 * PG so it advances itself.  Also refreshes the PG-count perf counters.
 *
 * Caller must hold osd_lock (asserted).
 */
void OSD::consume_map()
{
  ceph_assert(osd_lock.is_locked());
  dout(7) << "consume_map version " << osdmap->get_epoch() << dendl;

  /** make sure the cluster is speaking in SORTBITWISE, because we don't
   *  speak the older sorting version any more. Be careful not to force
   *  a shutdown if we are merely processing old maps, though.
   */
  if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE) && is_active()) {
    derr << __func__ << " SORTBITWISE flag is not set" << dendl;
    ceph_abort();
  }

  service.pre_publish_map(osdmap);
  service.await_reserved_maps();
  service.publish_map(osdmap);

  // prime splits and merges
  set<pair<spg_t,epoch_t>> newly_split;  // splits, and when
  set<pair<spg_t,epoch_t>> merge_pgs;    // merge participants, and when
  for (auto& shard : shards) {
    shard->identify_splits_and_merges(osdmap, &newly_split, &merge_pgs);
  }
  if (!newly_split.empty()) {
    for (auto& shard : shards) {
      // each shard consumes the entries it is responsible for
      shard->prime_splits(osdmap, &newly_split);
    }
    ceph_assert(newly_split.empty());
  }

  // prune sent_ready_to_merge
  service.prune_sent_ready_to_merge(osdmap);

  // FIXME, maybe: We could race against an incoming peering message
  // that instantiates a merge PG after identify_merges() below and
  // never set up its peer to complete the merge.  An OSD restart
  // would clear it up.  This is a hard race to resolve,
  // extraordinarily rare (we only merge PGs that are stable and
  // clean, so it'd have to be an imported PG to an OSD with a
  // slightly stale OSDMap...), so I'm ignoring it for now.  We plan to
  // replace all of this with a seastar-based code soon anyway.
  if (!merge_pgs.empty()) {
    // mark the pgs we already have, or create new and empty merge
    // participants for those we are missing.  do this all under the
    // shard lock so we don't have to worry about racing pg creates
    // via _process.
    for (auto& shard : shards) {
      shard->prime_merges(osdmap, &merge_pgs);
    }
    ceph_assert(merge_pgs.empty());
  }

  service.prune_pg_created();

  unsigned pushes_to_free = 0;
  for (auto& shard : shards) {
    shard->consume_map(osdmap, &pushes_to_free);
  }

  vector<spg_t> pgids;
  _get_pgids(&pgids);

  // count (FIXME, probably during seastar rewrite)
  int num_pg_primary = 0, num_pg_replica = 0, num_pg_stray = 0;
  vector<PGRef> pgs;
  _get_pgs(&pgs);
  for (auto& pg : pgs) {
    // FIXME (probably during seastar rewrite): this is lockless and
    // racy, but we don't want to take pg lock here.
    if (pg->is_primary())
      num_pg_primary++;
    else if (pg->is_replica())
      num_pg_replica++;
    else
      num_pg_stray++;
  }

  {
    // FIXME (as part of seastar rewrite): move to OSDShard
    std::lock_guard l(pending_creates_lock);
    for (auto pg = pending_creates_from_osd.begin();
         pg != pending_creates_from_osd.end();) {
      if (osdmap->get_pg_acting_rank(pg->first, whoami) < 0) {
        dout(10) << __func__ << " pg " << pg->first << " doesn't map here, "
                 << "discarding pending_create_from_osd" << dendl;
        pg = pending_creates_from_osd.erase(pg);
      } else {
        ++pg;
      }
    }
  }

  service.maybe_inject_dispatch_delay();

  dispatch_sessions_waiting_on_map();

  service.maybe_inject_dispatch_delay();

  service.release_reserved_pushes(pushes_to_free);

  // queue null events to push maps down to individual PGs
  for (auto pgid : pgids) {
    enqueue_peering_evt(
      pgid,
      PGPeeringEventRef(
        std::make_shared<PGPeeringEvent>(
          osdmap->get_epoch(),
          osdmap->get_epoch(),
          NullEvt())));
  }
  logger->set(l_osd_pg, pgids.size());
  logger->set(l_osd_pg_primary, num_pg_primary);
  logger->set(l_osd_pg_replica, num_pg_replica);
  logger->set(l_osd_pg_stray, num_pg_stray);
}
9231
9232void OSD::activate_map()
9233{
11fdf7f2 9234 ceph_assert(osd_lock.is_locked());
7c673cae
FG
9235
9236 dout(7) << "activate_map version " << osdmap->get_epoch() << dendl;
9237
7c673cae
FG
9238 if (osdmap->test_flag(CEPH_OSDMAP_FULL)) {
9239 dout(10) << " osdmap flagged full, doing onetime osdmap subscribe" << dendl;
9240 osdmap_subscribe(osdmap->get_epoch() + 1, false);
9241 }
9242
9243 // norecover?
9244 if (osdmap->test_flag(CEPH_OSDMAP_NORECOVER)) {
9245 if (!service.recovery_is_paused()) {
9246 dout(1) << "pausing recovery (NORECOVER flag set)" << dendl;
9247 service.pause_recovery();
9248 }
9249 } else {
9250 if (service.recovery_is_paused()) {
9251 dout(1) << "unpausing recovery (NORECOVER flag unset)" << dendl;
9252 service.unpause_recovery();
9253 }
9254 }
9255
9256 service.activate_map();
9257
9258 // process waiters
9259 take_waiters(waiting_for_osdmap);
9260}
9261
9262bool OSD::require_mon_peer(const Message *m)
9263{
9264 if (!m->get_connection()->peer_is_mon()) {
9265 dout(0) << "require_mon_peer received from non-mon "
9266 << m->get_connection()->get_peer_addr()
9267 << " " << *m << dendl;
9268 return false;
9269 }
9270 return true;
9271}
9272
9273bool OSD::require_mon_or_mgr_peer(const Message *m)
9274{
9275 if (!m->get_connection()->peer_is_mon() &&
9276 !m->get_connection()->peer_is_mgr()) {
9277 dout(0) << "require_mon_or_mgr_peer received from non-mon, non-mgr "
9278 << m->get_connection()->get_peer_addr()
9279 << " " << *m << dendl;
9280 return false;
9281 }
9282 return true;
9283}
9284
9285bool OSD::require_osd_peer(const Message *m)
9286{
9287 if (!m->get_connection()->peer_is_osd()) {
9288 dout(0) << "require_osd_peer received from non-osd "
9289 << m->get_connection()->get_peer_addr()
9290 << " " << *m << dendl;
9291 return false;
9292 }
9293 return true;
9294}
9295
9296bool OSD::require_self_aliveness(const Message *m, epoch_t epoch)
9297{
9298 epoch_t up_epoch = service.get_up_epoch();
9299 if (epoch < up_epoch) {
9300 dout(7) << "from pre-up epoch " << epoch << " < " << up_epoch << dendl;
9301 return false;
9302 }
9303
9304 if (!is_active()) {
9305 dout(7) << "still in boot state, dropping message " << *m << dendl;
9306 return false;
9307 }
9308
9309 return true;
9310}
9311
9312bool OSD::require_same_peer_instance(const Message *m, OSDMapRef& map,
9313 bool is_fast_dispatch)
9314{
9315 int from = m->get_source().num();
9316
9317 if (map->is_down(from) ||
11fdf7f2 9318 (map->get_cluster_addrs(from) != m->get_source_addrs())) {
7c673cae
FG
9319 dout(5) << "from dead osd." << from << ", marking down, "
9320 << " msg was " << m->get_source_inst().addr
11fdf7f2
TL
9321 << " expected "
9322 << (map->is_up(from) ?
9323 map->get_cluster_addrs(from) : entity_addrvec_t())
7c673cae
FG
9324 << dendl;
9325 ConnectionRef con = m->get_connection();
9326 con->mark_down();
11fdf7f2
TL
9327 auto priv = con->get_priv();
9328 if (auto s = static_cast<Session*>(priv.get()); s) {
7c673cae
FG
9329 if (!is_fast_dispatch)
9330 s->session_dispatch_lock.Lock();
9331 clear_session_waiting_on_map(s);
11fdf7f2
TL
9332 con->set_priv(nullptr); // break ref <-> session cycle, if any
9333 s->con.reset();
7c673cae
FG
9334 if (!is_fast_dispatch)
9335 s->session_dispatch_lock.Unlock();
7c673cae
FG
9336 }
9337 return false;
9338 }
9339 return true;
9340}
9341
9342
9343/*
9344 * require that we have same (or newer) map, and that
9345 * the source is the pg primary.
9346 */
9347bool OSD::require_same_or_newer_map(OpRequestRef& op, epoch_t epoch,
9348 bool is_fast_dispatch)
9349{
9350 const Message *m = op->get_req();
9351 dout(15) << "require_same_or_newer_map " << epoch
9352 << " (i am " << osdmap->get_epoch() << ") " << m << dendl;
9353
11fdf7f2 9354 ceph_assert(osd_lock.is_locked());
7c673cae
FG
9355
9356 // do they have a newer map?
9357 if (epoch > osdmap->get_epoch()) {
9358 dout(7) << "waiting for newer map epoch " << epoch
9359 << " > my " << osdmap->get_epoch() << " with " << m << dendl;
9360 wait_for_new_map(op);
9361 return false;
9362 }
9363
9364 if (!require_self_aliveness(op->get_req(), epoch)) {
9365 return false;
9366 }
9367
9368 // ok, our map is same or newer.. do they still exist?
9369 if (m->get_connection()->get_messenger() == cluster_messenger &&
9370 !require_same_peer_instance(op->get_req(), osdmap, is_fast_dispatch)) {
9371 return false;
9372 }
9373
9374 return true;
9375}
9376
9377
9378
9379
9380
9381// ----------------------------------------
9382// pg creation
9383
9384void OSD::split_pgs(
9385 PG *parent,
31f18b77 9386 const set<spg_t> &childpgids, set<PGRef> *out_pgs,
7c673cae
FG
9387 OSDMapRef curmap,
9388 OSDMapRef nextmap,
9389 PG::RecoveryCtx *rctx)
9390{
11fdf7f2
TL
9391 unsigned pg_num = nextmap->get_pg_num(parent->pg_id.pool());
9392 parent->update_snap_mapper_bits(parent->get_pgid().get_split_bits(pg_num));
7c673cae 9393
11fdf7f2
TL
9394 vector<object_stat_sum_t> updated_stats;
9395 parent->start_split_stats(childpgids, &updated_stats);
7c673cae
FG
9396
9397 vector<object_stat_sum_t>::iterator stat_iter = updated_stats.begin();
9398 for (set<spg_t>::const_iterator i = childpgids.begin();
9399 i != childpgids.end();
9400 ++i, ++stat_iter) {
11fdf7f2
TL
9401 ceph_assert(stat_iter != updated_stats.end());
9402 dout(10) << __func__ << " splitting " << *parent << " into " << *i << dendl;
7c673cae
FG
9403 PG* child = _make_pg(nextmap, *i);
9404 child->lock(true);
9405 out_pgs->insert(child);
11fdf7f2 9406 child->ch = store->create_new_collection(child->coll);
7c673cae 9407
11fdf7f2
TL
9408 {
9409 uint32_t shard_index = i->hash_to_shard(shards.size());
9410 assert(NULL != shards[shard_index]);
9411 store->set_collection_commit_queue(child->coll, &(shards[shard_index]->context_queue));
9412 }
7c673cae 9413
11fdf7f2
TL
9414 unsigned split_bits = i->get_split_bits(pg_num);
9415 dout(10) << " pg_num is " << pg_num
9416 << ", m_seed " << i->ps()
9417 << ", split_bits is " << split_bits << dendl;
7c673cae
FG
9418 parent->split_colls(
9419 *i,
9420 split_bits,
9421 i->ps(),
11fdf7f2 9422 &child->get_pool().info,
7c673cae
FG
9423 rctx->transaction);
9424 parent->split_into(
9425 i->pgid,
9426 child,
9427 split_bits);
7c673cae 9428
92f5a8d4
TL
9429 child->init_collection_pool_opts();
9430
11fdf7f2 9431 child->finish_split_stats(*stat_iter, rctx->transaction);
7c673cae
FG
9432 child->unlock();
9433 }
11fdf7f2
TL
9434 ceph_assert(stat_iter != updated_stats.end());
9435 parent->finish_split_stats(*stat_iter, rctx->transaction);
7c673cae
FG
9436}
9437
9438/*
9439 * holding osd_lock
9440 */
9441void OSD::handle_pg_create(OpRequestRef op)
9442{
9443 const MOSDPGCreate *m = static_cast<const MOSDPGCreate*>(op->get_req());
11fdf7f2 9444 ceph_assert(m->get_type() == MSG_OSD_PG_CREATE);
7c673cae
FG
9445
9446 dout(10) << "handle_pg_create " << *m << dendl;
9447
9448 if (!require_mon_peer(op->get_req())) {
9449 return;
9450 }
9451
9452 if (!require_same_or_newer_map(op, m->epoch, false))
9453 return;
9454
9455 op->mark_started();
9456
9457 map<pg_t,utime_t>::const_iterator ci = m->ctimes.begin();
9458 for (map<pg_t,pg_create_t>::const_iterator p = m->mkpg.begin();
9459 p != m->mkpg.end();
9460 ++p, ++ci) {
11fdf7f2 9461 ceph_assert(ci != m->ctimes.end() && ci->first == p->first);
7c673cae
FG
9462 epoch_t created = p->second.created;
9463 if (p->second.split_bits) // Skip split pgs
9464 continue;
9465 pg_t on = p->first;
9466
7c673cae
FG
9467 if (!osdmap->have_pg_pool(on.pool())) {
9468 dout(20) << "ignoring pg on deleted pool " << on << dendl;
9469 continue;
9470 }
9471
9472 dout(20) << "mkpg " << on << " e" << created << "@" << ci->second << dendl;
9473
9474 // is it still ours?
9475 vector<int> up, acting;
9476 int up_primary = -1;
9477 int acting_primary = -1;
9478 osdmap->pg_to_up_acting_osds(on, &up, &up_primary, &acting, &acting_primary);
9479 int role = osdmap->calc_pg_role(whoami, acting, acting.size());
9480
9481 if (acting_primary != whoami) {
9482 dout(10) << "mkpg " << on << " not acting_primary (" << acting_primary
9483 << "), my role=" << role << ", skipping" << dendl;
9484 continue;
9485 }
9486
9487 spg_t pgid;
9488 bool mapped = osdmap->get_primary_shard(on, &pgid);
11fdf7f2 9489 ceph_assert(mapped);
7c673cae 9490
11fdf7f2 9491 PastIntervals pi;
7c673cae
FG
9492 pg_history_t history;
9493 build_initial_pg_history(pgid, created, ci->second, &history, &pi);
9494
11fdf7f2
TL
9495 // The mon won't resend unless the primary changed, so we ignore
9496 // same_interval_since. We'll pass this history with the current
9497 // epoch as the event.
7c673cae
FG
9498 if (history.same_primary_since > m->epoch) {
9499 dout(10) << __func__ << ": got obsolete pg create on pgid "
9500 << pgid << " from epoch " << m->epoch
9501 << ", primary changed in " << history.same_primary_since
9502 << dendl;
9503 continue;
9504 }
11fdf7f2
TL
9505 enqueue_peering_evt(
9506 pgid,
9507 PGPeeringEventRef(
9508 std::make_shared<PGPeeringEvent>(
9509 osdmap->get_epoch(),
9510 osdmap->get_epoch(),
9511 NullEvt(),
9512 true,
9513 new PGCreateInfo(
9514 pgid,
9515 osdmap->get_epoch(),
9516 history,
9517 pi,
9518 true)
9519 )));
7c673cae 9520 }
7c673cae 9521
3efd9988 9522 {
11fdf7f2 9523 std::lock_guard l(pending_creates_lock);
3efd9988
FG
9524 if (pending_creates_from_mon == 0) {
9525 last_pg_create_epoch = m->epoch;
9526 }
9527 }
11fdf7f2 9528
7c673cae
FG
9529 maybe_update_heartbeat_peers();
9530}
9531
9532
9533// ----------------------------------------
9534// peering and recovery
9535
9536PG::RecoveryCtx OSD::create_context()
9537{
9538 ObjectStore::Transaction *t = new ObjectStore::Transaction;
7c673cae
FG
9539 map<int, map<spg_t,pg_query_t> > *query_map =
9540 new map<int, map<spg_t, pg_query_t> >;
9541 map<int,vector<pair<pg_notify_t, PastIntervals> > > *notify_list =
9542 new map<int, vector<pair<pg_notify_t, PastIntervals> > >;
9543 map<int,vector<pair<pg_notify_t, PastIntervals> > > *info_map =
9544 new map<int,vector<pair<pg_notify_t, PastIntervals> > >;
11fdf7f2 9545 PG::RecoveryCtx rctx(query_map, info_map, notify_list, t);
7c673cae
FG
9546 return rctx;
9547}
9548
7c673cae
FG
9549void OSD::dispatch_context_transaction(PG::RecoveryCtx &ctx, PG *pg,
9550 ThreadPool::TPHandle *handle)
9551{
11fdf7f2 9552 if (!ctx.transaction->empty() || ctx.transaction->has_contexts()) {
7c673cae 9553 int tr = store->queue_transaction(
11fdf7f2
TL
9554 pg->ch,
9555 std::move(*ctx.transaction), TrackedOpRef(), handle);
9556 ceph_assert(tr == 0);
7c673cae 9557 delete (ctx.transaction);
7c673cae 9558 ctx.transaction = new ObjectStore::Transaction;
7c673cae
FG
9559 }
9560}
9561
9562void OSD::dispatch_context(PG::RecoveryCtx &ctx, PG *pg, OSDMapRef curmap,
9563 ThreadPool::TPHandle *handle)
9564{
11fdf7f2
TL
9565 if (!service.get_osdmap()->is_up(whoami)) {
9566 dout(20) << __func__ << " not up in osdmap" << dendl;
9567 } else if (!is_active()) {
9568 dout(20) << __func__ << " not active" << dendl;
9569 } else {
7c673cae
FG
9570 do_notifies(*ctx.notify_list, curmap);
9571 do_queries(*ctx.query_map, curmap);
9572 do_infos(*ctx.info_map, curmap);
9573 }
11fdf7f2 9574 if ((!ctx.transaction->empty() || ctx.transaction->has_contexts()) && pg) {
7c673cae 9575 int tr = store->queue_transaction(
11fdf7f2
TL
9576 pg->ch,
9577 std::move(*ctx.transaction), TrackedOpRef(),
7c673cae 9578 handle);
11fdf7f2 9579 ceph_assert(tr == 0);
7c673cae 9580 }
11fdf7f2
TL
9581 delete ctx.notify_list;
9582 delete ctx.query_map;
9583 delete ctx.info_map;
9584 delete ctx.transaction;
9585}
9586
9587void OSD::discard_context(PG::RecoveryCtx& ctx)
9588{
9589 delete ctx.notify_list;
9590 delete ctx.query_map;
9591 delete ctx.info_map;
9592 delete ctx.transaction;
7c673cae
FG
9593}
9594
11fdf7f2 9595
7c673cae
FG
9596/** do_notifies
9597 * Send an MOSDPGNotify to a primary, with a list of PGs that I have
9598 * content for, and they are primary for.
9599 */
9600
9601void OSD::do_notifies(
9602 map<int,vector<pair<pg_notify_t,PastIntervals> > >& notify_list,
9603 OSDMapRef curmap)
9604{
9605 for (map<int,
9606 vector<pair<pg_notify_t,PastIntervals> > >::iterator it =
9607 notify_list.begin();
9608 it != notify_list.end();
9609 ++it) {
9610 if (!curmap->is_up(it->first)) {
9611 dout(20) << __func__ << " skipping down osd." << it->first << dendl;
9612 continue;
9613 }
9614 ConnectionRef con = service.get_con_osd_cluster(
9615 it->first, curmap->get_epoch());
9616 if (!con) {
9617 dout(20) << __func__ << " skipping osd." << it->first
9618 << " (NULL con)" << dendl;
9619 continue;
9620 }
9621 service.share_map_peer(it->first, con.get(), curmap);
3efd9988 9622 dout(7) << __func__ << " osd." << it->first
7c673cae
FG
9623 << " on " << it->second.size() << " PGs" << dendl;
9624 MOSDPGNotify *m = new MOSDPGNotify(curmap->get_epoch(),
9625 it->second);
9626 con->send_message(m);
9627 }
9628}
9629
9630
9631/** do_queries
9632 * send out pending queries for info | summaries
9633 */
9634void OSD::do_queries(map<int, map<spg_t,pg_query_t> >& query_map,
9635 OSDMapRef curmap)
9636{
9637 for (map<int, map<spg_t,pg_query_t> >::iterator pit = query_map.begin();
9638 pit != query_map.end();
9639 ++pit) {
9640 if (!curmap->is_up(pit->first)) {
9641 dout(20) << __func__ << " skipping down osd." << pit->first << dendl;
9642 continue;
9643 }
9644 int who = pit->first;
9645 ConnectionRef con = service.get_con_osd_cluster(who, curmap->get_epoch());
9646 if (!con) {
9647 dout(20) << __func__ << " skipping osd." << who
9648 << " (NULL con)" << dendl;
9649 continue;
9650 }
9651 service.share_map_peer(who, con.get(), curmap);
9652 dout(7) << __func__ << " querying osd." << who
9653 << " on " << pit->second.size() << " PGs" << dendl;
9654 MOSDPGQuery *m = new MOSDPGQuery(curmap->get_epoch(), pit->second);
9655 con->send_message(m);
9656 }
9657}
9658
9659
9660void OSD::do_infos(map<int,
9661 vector<pair<pg_notify_t, PastIntervals> > >& info_map,
9662 OSDMapRef curmap)
9663{
9664 for (map<int,
9665 vector<pair<pg_notify_t, PastIntervals> > >::iterator p =
9666 info_map.begin();
9667 p != info_map.end();
9668 ++p) {
9669 if (!curmap->is_up(p->first)) {
9670 dout(20) << __func__ << " skipping down osd." << p->first << dendl;
9671 continue;
9672 }
9673 for (vector<pair<pg_notify_t,PastIntervals> >::iterator i = p->second.begin();
9674 i != p->second.end();
9675 ++i) {
9676 dout(20) << __func__ << " sending info " << i->first.info
9677 << " to shard " << p->first << dendl;
9678 }
9679 ConnectionRef con = service.get_con_osd_cluster(
9680 p->first, curmap->get_epoch());
9681 if (!con) {
9682 dout(20) << __func__ << " skipping osd." << p->first
9683 << " (NULL con)" << dendl;
9684 continue;
9685 }
9686 service.share_map_peer(p->first, con.get(), curmap);
9687 MOSDPGInfo *m = new MOSDPGInfo(curmap->get_epoch());
9688 m->pg_list = p->second;
9689 con->send_message(m);
9690 }
9691 info_map.clear();
9692}
9693
11fdf7f2 9694void OSD::handle_fast_pg_create(MOSDPGCreate2 *m)
7c673cae 9695{
11fdf7f2
TL
9696 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9697 if (!require_mon_peer(m)) {
9698 m->put();
7c673cae 9699 return;
7c673cae 9700 }
11fdf7f2
TL
9701 for (auto& p : m->pgs) {
9702 spg_t pgid = p.first;
9703 epoch_t created = p.second.first;
9704 utime_t created_stamp = p.second.second;
9705 dout(20) << __func__ << " " << pgid << " e" << created
9706 << "@" << created_stamp << dendl;
9707 pg_history_t h;
9708 h.epoch_created = created;
9709 h.epoch_pool_created = created;
9710 h.same_up_since = created;
9711 h.same_interval_since = created;
9712 h.same_primary_since = created;
9713 h.last_scrub_stamp = created_stamp;
9714 h.last_deep_scrub_stamp = created_stamp;
9715 h.last_clean_scrub_stamp = created_stamp;
9716
9717 enqueue_peering_evt(
9718 pgid,
9719 PGPeeringEventRef(
9720 std::make_shared<PGPeeringEvent>(
9721 m->epoch,
9722 m->epoch,
9723 NullEvt(),
9724 true,
9725 new PGCreateInfo(
9726 pgid,
9727 created,
9728 h,
9729 PastIntervals(),
9730 true)
9731 )));
9732 }
7c673cae 9733
11fdf7f2
TL
9734 {
9735 std::lock_guard l(pending_creates_lock);
9736 if (pending_creates_from_mon == 0) {
9737 last_pg_create_epoch = m->epoch;
9738 }
7c673cae
FG
9739 }
9740
11fdf7f2 9741 m->put();
7c673cae
FG
9742}
9743
11fdf7f2 9744void OSD::handle_fast_pg_query(MOSDPGQuery *m)
7c673cae 9745{
11fdf7f2
TL
9746 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9747 if (!require_osd_peer(m)) {
9748 m->put();
7c673cae 9749 return;
11fdf7f2 9750 }
7c673cae 9751 int from = m->get_source().num();
11fdf7f2
TL
9752 for (auto& p : m->pg_list) {
9753 enqueue_peering_evt(
9754 p.first,
9755 PGPeeringEventRef(
9756 std::make_shared<PGPeeringEvent>(
9757 p.second.epoch_sent, p.second.epoch_sent,
9758 MQuery(
9759 p.first,
9760 pg_shard_t(from, p.second.from),
9761 p.second,
9762 p.second.epoch_sent),
9763 false))
7c673cae
FG
9764 );
9765 }
11fdf7f2 9766 m->put();
7c673cae
FG
9767}
9768
11fdf7f2 9769void OSD::handle_fast_pg_notify(MOSDPGNotify* m)
7c673cae 9770{
11fdf7f2
TL
9771 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9772 if (!require_osd_peer(m)) {
9773 m->put();
7c673cae
FG
9774 return;
9775 }
11fdf7f2
TL
9776 int from = m->get_source().num();
9777 for (auto& p : m->get_pg_list()) {
9778 spg_t pgid(p.first.info.pgid.pgid, p.first.to);
9779 enqueue_peering_evt(
9780 pgid,
9781 PGPeeringEventRef(
9782 std::make_shared<PGPeeringEvent>(
9783 p.first.epoch_sent,
9784 p.first.query_epoch,
9785 MNotifyRec(
9786 pgid, pg_shard_t(from, p.first.from),
9787 p.first,
9788 m->get_connection()->get_features(),
9789 p.second),
9790 true,
9791 new PGCreateInfo(
9792 pgid,
9793 p.first.query_epoch,
9794 p.first.info.history,
9795 p.second,
9796 false)
9797 )));
7c673cae 9798 }
11fdf7f2 9799 m->put();
7c673cae
FG
9800}
9801
11fdf7f2 9802void OSD::handle_fast_pg_info(MOSDPGInfo* m)
7c673cae 9803{
11fdf7f2
TL
9804 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9805 if (!require_osd_peer(m)) {
9806 m->put();
7c673cae
FG
9807 return;
9808 }
11fdf7f2
TL
9809 int from = m->get_source().num();
9810 for (auto& p : m->pg_list) {
9811 enqueue_peering_evt(
9812 spg_t(p.first.info.pgid.pgid, p.first.to),
9813 PGPeeringEventRef(
9814 std::make_shared<PGPeeringEvent>(
9815 p.first.epoch_sent, p.first.query_epoch,
9816 MInfoRec(
9817 pg_shard_t(from, p.first.from),
9818 p.first.info,
9819 p.first.epoch_sent)))
9820 );
7c673cae 9821 }
11fdf7f2 9822 m->put();
7c673cae
FG
9823}
9824
11fdf7f2 9825void OSD::handle_fast_pg_remove(MOSDPGRemove *m)
7c673cae 9826{
11fdf7f2
TL
9827 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9828 if (!require_osd_peer(m)) {
9829 m->put();
7c673cae
FG
9830 return;
9831 }
11fdf7f2
TL
9832 for (auto& pgid : m->pg_list) {
9833 enqueue_peering_evt(
9834 pgid,
9835 PGPeeringEventRef(
9836 std::make_shared<PGPeeringEvent>(
9837 m->get_epoch(), m->get_epoch(),
9838 PG::DeleteStart())));
7c673cae 9839 }
11fdf7f2 9840 m->put();
7c673cae
FG
9841}
9842
11fdf7f2 9843void OSD::handle_fast_force_recovery(MOSDForceRecovery *m)
c07f9fc5 9844{
11fdf7f2
TL
9845 dout(10) << __func__ << " " << *m << dendl;
9846 if (!require_mon_or_mgr_peer(m)) {
9847 m->put();
9848 return;
9849 }
9850 epoch_t epoch = get_osdmap_epoch();
9851 for (auto pgid : m->forced_pgs) {
9852 if (m->options & OFR_BACKFILL) {
9853 if (m->options & OFR_CANCEL) {
9854 enqueue_peering_evt(
9855 pgid,
9856 PGPeeringEventRef(
9857 std::make_shared<PGPeeringEvent>(
9858 epoch, epoch,
9859 PG::UnsetForceBackfill())));
9860 } else {
9861 enqueue_peering_evt(
9862 pgid,
9863 PGPeeringEventRef(
9864 std::make_shared<PGPeeringEvent>(
9865 epoch, epoch,
9866 PG::SetForceBackfill())));
9867 }
9868 } else if (m->options & OFR_RECOVERY) {
9869 if (m->options & OFR_CANCEL) {
9870 enqueue_peering_evt(
9871 pgid,
9872 PGPeeringEventRef(
9873 std::make_shared<PGPeeringEvent>(
9874 epoch, epoch,
9875 PG::UnsetForceRecovery())));
9876 } else {
9877 enqueue_peering_evt(
9878 pgid,
9879 PGPeeringEventRef(
9880 std::make_shared<PGPeeringEvent>(
9881 epoch, epoch,
9882 PG::SetForceRecovery())));
c07f9fc5
FG
9883 }
9884 }
9885 }
11fdf7f2 9886 m->put();
c07f9fc5 9887}
7c673cae 9888
11fdf7f2 9889void OSD::handle_pg_query_nopg(const MQuery& q)
7c673cae 9890{
11fdf7f2
TL
9891 spg_t pgid = q.pgid;
9892 dout(10) << __func__ << " " << pgid << dendl;
7c673cae 9893
11fdf7f2
TL
9894 OSDMapRef osdmap = get_osdmap();
9895 if (!osdmap->have_pg_pool(pgid.pool()))
7c673cae
FG
9896 return;
9897
11fdf7f2
TL
9898 dout(10) << " pg " << pgid << " dne" << dendl;
9899 pg_info_t empty(spg_t(pgid.pgid, q.query.to));
9900 ConnectionRef con = service.get_con_osd_cluster(q.from.osd, osdmap->get_epoch());
9901 if (con) {
9902 Message *m;
9903 if (q.query.type == pg_query_t::LOG ||
9904 q.query.type == pg_query_t::FULLLOG) {
9905 m = new MOSDPGLog(
9906 q.query.from, q.query.to,
9907 osdmap->get_epoch(), empty,
9908 q.query.epoch_sent);
7c673cae 9909 } else {
11fdf7f2
TL
9910 vector<pair<pg_notify_t,PastIntervals>> ls;
9911 ls.push_back(
7c673cae
FG
9912 make_pair(
9913 pg_notify_t(
11fdf7f2
TL
9914 q.query.from, q.query.to,
9915 q.query.epoch_sent,
7c673cae
FG
9916 osdmap->get_epoch(),
9917 empty),
11fdf7f2
TL
9918 PastIntervals()));
9919 m = new MOSDPGNotify(osdmap->get_epoch(), ls);
7c673cae 9920 }
11fdf7f2
TL
9921 service.share_map_peer(q.from.osd, con.get(), osdmap);
9922 con->send_message(m);
7c673cae
FG
9923 }
9924}
9925
7c673cae 9926
7c673cae
FG
9927// =========================================================
9928// RECOVERY
9929
9930void OSDService::_maybe_queue_recovery() {
11fdf7f2 9931 ceph_assert(recovery_lock.is_locked_by_me());
7c673cae
FG
9932 uint64_t available_pushes;
9933 while (!awaiting_throttle.empty() &&
9934 _recover_now(&available_pushes)) {
11fdf7f2 9935 uint64_t to_start = std::min(
7c673cae
FG
9936 available_pushes,
9937 cct->_conf->osd_recovery_max_single_start);
9938 _queue_for_recovery(awaiting_throttle.front(), to_start);
9939 awaiting_throttle.pop_front();
11fdf7f2
TL
9940 dout(10) << __func__ << " starting " << to_start
9941 << ", recovery_ops_reserved " << recovery_ops_reserved
9942 << " -> " << (recovery_ops_reserved + to_start) << dendl;
7c673cae
FG
9943 recovery_ops_reserved += to_start;
9944 }
9945}
9946
9947bool OSDService::_recover_now(uint64_t *available_pushes)
9948{
9949 if (available_pushes)
9950 *available_pushes = 0;
9951
9952 if (ceph_clock_now() < defer_recovery_until) {
9953 dout(15) << __func__ << " defer until " << defer_recovery_until << dendl;
9954 return false;
9955 }
9956
9957 if (recovery_paused) {
9958 dout(15) << __func__ << " paused" << dendl;
9959 return false;
9960 }
9961
9962 uint64_t max = cct->_conf->osd_recovery_max_active;
9963 if (max <= recovery_ops_active + recovery_ops_reserved) {
9964 dout(15) << __func__ << " active " << recovery_ops_active
9965 << " + reserved " << recovery_ops_reserved
9966 << " >= max " << max << dendl;
9967 return false;
9968 }
9969
9970 if (available_pushes)
9971 *available_pushes = max - recovery_ops_active - recovery_ops_reserved;
9972
9973 return true;
9974}
9975
9976void OSD::do_recovery(
9977 PG *pg, epoch_t queued, uint64_t reserved_pushes,
9978 ThreadPool::TPHandle &handle)
9979{
9980 uint64_t started = 0;
31f18b77
FG
9981
9982 /*
9983 * When the value of osd_recovery_sleep is set greater than zero, recovery
9984 * ops are scheduled after osd_recovery_sleep amount of time from the previous
9985 * recovery event's schedule time. This is done by adding a
9986 * recovery_requeue_callback event, which re-queues the recovery op using
9987 * queue_recovery_after_sleep.
9988 */
c07f9fc5 9989 float recovery_sleep = get_osd_recovery_sleep();
b32b8144 9990 {
11fdf7f2 9991 std::lock_guard l(service.sleep_lock);
b32b8144
FG
9992 if (recovery_sleep > 0 && service.recovery_needs_sleep) {
9993 PGRef pgref(pg);
9994 auto recovery_requeue_callback = new FunctionContext([this, pgref, queued, reserved_pushes](int r) {
9995 dout(20) << "do_recovery wake up at "
9996 << ceph_clock_now()
9997 << ", re-queuing recovery" << dendl;
11fdf7f2 9998 std::lock_guard l(service.sleep_lock);
b32b8144
FG
9999 service.recovery_needs_sleep = false;
10000 service.queue_recovery_after_sleep(pgref.get(), queued, reserved_pushes);
10001 });
10002
10003 // This is true for the first recovery op and when the previous recovery op
10004 // has been scheduled in the past. The next recovery op is scheduled after
10005 // completing the sleep from now.
10006 if (service.recovery_schedule_time < ceph_clock_now()) {
10007 service.recovery_schedule_time = ceph_clock_now();
10008 }
10009 service.recovery_schedule_time += recovery_sleep;
11fdf7f2 10010 service.sleep_timer.add_event_at(service.recovery_schedule_time,
b32b8144
FG
10011 recovery_requeue_callback);
10012 dout(20) << "Recovery event scheduled at "
10013 << service.recovery_schedule_time << dendl;
10014 return;
10015 }
7c673cae
FG
10016 }
10017
10018 {
b32b8144 10019 {
11fdf7f2 10020 std::lock_guard l(service.sleep_lock);
b32b8144
FG
10021 service.recovery_needs_sleep = true;
10022 }
10023
7c673cae
FG
10024 if (pg->pg_has_reset_since(queued)) {
10025 goto out;
10026 }
10027
7c673cae
FG
10028 dout(10) << "do_recovery starting " << reserved_pushes << " " << *pg << dendl;
10029#ifdef DEBUG_RECOVERY_OIDS
11fdf7f2 10030 dout(20) << " active was " << service.recovery_oids[pg->pg_id] << dendl;
7c673cae
FG
10031#endif
10032
11fdf7f2 10033 bool do_unfound = pg->start_recovery_ops(reserved_pushes, handle, &started);
7c673cae
FG
10034 dout(10) << "do_recovery started " << started << "/" << reserved_pushes
10035 << " on " << *pg << dendl;
10036
11fdf7f2
TL
10037 if (do_unfound) {
10038 PG::RecoveryCtx rctx = create_context();
10039 rctx.handle = &handle;
10040 pg->find_unfound(queued, &rctx);
10041 dispatch_context(rctx, pg, pg->get_osdmap());
7c673cae 10042 }
7c673cae
FG
10043 }
10044
10045 out:
11fdf7f2 10046 ceph_assert(started <= reserved_pushes);
7c673cae
FG
10047 service.release_reserved_pushes(reserved_pushes);
10048}
10049
10050void OSDService::start_recovery_op(PG *pg, const hobject_t& soid)
10051{
11fdf7f2 10052 std::lock_guard l(recovery_lock);
7c673cae
FG
10053 dout(10) << "start_recovery_op " << *pg << " " << soid
10054 << " (" << recovery_ops_active << "/"
10055 << cct->_conf->osd_recovery_max_active << " rops)"
10056 << dendl;
10057 recovery_ops_active++;
10058
10059#ifdef DEBUG_RECOVERY_OIDS
11fdf7f2
TL
10060 dout(20) << " active was " << recovery_oids[pg->pg_id] << dendl;
10061 ceph_assert(recovery_oids[pg->pg_id].count(soid) == 0);
10062 recovery_oids[pg->pg_id].insert(soid);
7c673cae
FG
10063#endif
10064}
10065
10066void OSDService::finish_recovery_op(PG *pg, const hobject_t& soid, bool dequeue)
10067{
11fdf7f2 10068 std::lock_guard l(recovery_lock);
7c673cae
FG
10069 dout(10) << "finish_recovery_op " << *pg << " " << soid
10070 << " dequeue=" << dequeue
10071 << " (" << recovery_ops_active << "/" << cct->_conf->osd_recovery_max_active << " rops)"
10072 << dendl;
10073
10074 // adjust count
11fdf7f2 10075 ceph_assert(recovery_ops_active > 0);
7c673cae
FG
10076 recovery_ops_active--;
10077
10078#ifdef DEBUG_RECOVERY_OIDS
11fdf7f2
TL
10079 dout(20) << " active oids was " << recovery_oids[pg->pg_id] << dendl;
10080 ceph_assert(recovery_oids[pg->pg_id].count(soid));
10081 recovery_oids[pg->pg_id].erase(soid);
7c673cae
FG
10082#endif
10083
10084 _maybe_queue_recovery();
10085}
10086
10087bool OSDService::is_recovery_active()
10088{
eafe8130
TL
10089 if (cct->_conf->osd_debug_pretend_recovery_active) {
10090 return true;
10091 }
b5b8bbf5 10092 return local_reserver.has_reservation() || remote_reserver.has_reservation();
7c673cae
FG
10093}
10094
11fdf7f2
TL
10095void OSDService::release_reserved_pushes(uint64_t pushes)
10096{
10097 std::lock_guard l(recovery_lock);
10098 dout(10) << __func__ << "(" << pushes << "), recovery_ops_reserved "
10099 << recovery_ops_reserved << " -> " << (recovery_ops_reserved-pushes)
10100 << dendl;
10101 ceph_assert(recovery_ops_reserved >= pushes);
10102 recovery_ops_reserved -= pushes;
10103 _maybe_queue_recovery();
10104}
10105
7c673cae
FG
10106// =========================================================
10107// OPS
10108
10109bool OSD::op_is_discardable(const MOSDOp *op)
10110{
10111 // drop client request if they are not connected and can't get the
10112 // reply anyway.
10113 if (!op->get_connection()->is_connected()) {
10114 return true;
10115 }
10116 return false;
10117}
10118
11fdf7f2 10119void OSD::enqueue_op(spg_t pg, OpRequestRef&& op, epoch_t epoch)
7c673cae 10120{
11fdf7f2
TL
10121 const utime_t stamp = op->get_req()->get_recv_stamp();
10122 const utime_t latency = ceph_clock_now() - stamp;
10123 const unsigned priority = op->get_req()->get_priority();
10124 const int cost = op->get_req()->get_cost();
10125 const uint64_t owner = op->get_req()->get_source().num();
10126
10127 dout(15) << "enqueue_op " << op << " prio " << priority
10128 << " cost " << cost
7c673cae
FG
10129 << " latency " << latency
10130 << " epoch " << epoch
10131 << " " << *(op->get_req()) << dendl;
10132 op->osd_trace.event("enqueue op");
11fdf7f2
TL
10133 op->osd_trace.keyval("priority", priority);
10134 op->osd_trace.keyval("cost", cost);
7c673cae 10135 op->mark_queued_for_pg();
224ce89b 10136 logger->tinc(l_osd_op_before_queue_op_lat, latency);
11fdf7f2
TL
10137 op_shardedwq.queue(
10138 OpQueueItem(
10139 unique_ptr<OpQueueItem::OpQueueable>(new PGOpItem(pg, std::move(op))),
10140 cost, priority, stamp, owner, epoch));
7c673cae
FG
10141}
10142
11fdf7f2
TL
10143void OSD::enqueue_peering_evt(spg_t pgid, PGPeeringEventRef evt)
10144{
10145 dout(15) << __func__ << " " << pgid << " " << evt->get_desc() << dendl;
10146 op_shardedwq.queue(
10147 OpQueueItem(
10148 unique_ptr<OpQueueItem::OpQueueable>(new PGPeeringItem(pgid, evt)),
10149 10,
10150 cct->_conf->osd_peering_op_priority,
10151 utime_t(),
10152 0,
10153 evt->get_epoch_sent()));
10154}
7c673cae 10155
11fdf7f2
TL
10156void OSD::enqueue_peering_evt_front(spg_t pgid, PGPeeringEventRef evt)
10157{
10158 dout(15) << __func__ << " " << pgid << " " << evt->get_desc() << dendl;
10159 op_shardedwq.queue_front(
10160 OpQueueItem(
10161 unique_ptr<OpQueueItem::OpQueueable>(new PGPeeringItem(pgid, evt)),
10162 10,
10163 cct->_conf->osd_peering_op_priority,
10164 utime_t(),
10165 0,
10166 evt->get_epoch_sent()));
10167}
7c673cae
FG
10168
10169/*
10170 * NOTE: dequeue called in worker thread, with pg lock
10171 */
10172void OSD::dequeue_op(
10173 PGRef pg, OpRequestRef op,
10174 ThreadPool::TPHandle &handle)
10175{
11fdf7f2 10176 FUNCTRACE(cct);
7c673cae
FG
10177 OID_EVENT_TRACE_WITH_MSG(op->get_req(), "DEQUEUE_OP_BEGIN", false);
10178
10179 utime_t now = ceph_clock_now();
10180 op->set_dequeued_time(now);
10181 utime_t latency = now - op->get_req()->get_recv_stamp();
10182 dout(10) << "dequeue_op " << op << " prio " << op->get_req()->get_priority()
10183 << " cost " << op->get_req()->get_cost()
10184 << " latency " << latency
10185 << " " << *(op->get_req())
10186 << " pg " << *pg << dendl;
10187
224ce89b
WB
10188 logger->tinc(l_osd_op_before_dequeue_op_lat, latency);
10189
11fdf7f2
TL
10190 auto priv = op->get_req()->get_connection()->get_priv();
10191 if (auto session = static_cast<Session *>(priv.get()); session) {
7c673cae 10192 maybe_share_map(session, op, pg->get_osdmap());
7c673cae
FG
10193 }
10194
11fdf7f2 10195 if (pg->is_deleting())
7c673cae
FG
10196 return;
10197
10198 op->mark_reached_pg();
10199 op->osd_trace.event("dequeue_op");
10200
10201 pg->do_request(op, handle);
10202
10203 // finish
10204 dout(10) << "dequeue_op " << op << " finish" << dendl;
10205 OID_EVENT_TRACE_WITH_MSG(op->get_req(), "DEQUEUE_OP_END", false);
10206}
10207
10208
11fdf7f2
TL
// Process one queued peering event on a shard worker thread.  pg may be
// null for pg-less events (currently only peer queries); otherwise pg
// arrives locked and is unlocked on every path through this function.
// NOTE(review): when advance_pg() returns false it appears to release the
// PG lock itself (no unlock happens here on that path) — confirm.
void OSD::dequeue_peering_evt(
  OSDShard *sdata,
  PG *pg,
  PGPeeringEventRef evt,
  ThreadPool::TPHandle& handle)
{
  PG::RecoveryCtx rctx = create_context();
  auto curmap = sdata->get_osdmap();
  epoch_t need_up_thru = 0, same_interval_since = 0;
  if (!pg) {
    // No PG instance: the only pg-less event we know how to handle is a
    // peer's query, which can be answered without a PG.
    if (const MQuery *q = dynamic_cast<const MQuery*>(evt->evt.get())) {
      handle_pg_query_nopg(*q);
    } else {
      derr << __func__ << " unrecognized pg-less event " << evt->get_desc() << dendl;
      ceph_abort();
    }
  } else if (advance_pg(curmap->get_epoch(), pg, handle, &rctx)) {
    pg->do_peering_event(evt, &rctx);
    if (pg->is_deleted()) {
      // do not dispatch rctx; the final _delete_some already did it.
      discard_context(rctx);
      pg->unlock();
      return;
    }
    dispatch_context_transaction(rctx, pg, &handle);
    // sample these while still holding the PG lock; consumed below
    need_up_thru = pg->get_need_up_thru();
    same_interval_since = pg->get_same_interval_since();
    pg->unlock();
  }

  if (need_up_thru) {
    queue_want_up_thru(same_interval_since);
  }
  // dispatch remaining rctx side effects (messages etc.) outside the PG lock
  dispatch_context(rctx, pg, curmap, &handle);

  service.send_pg_temp();
}
10246
11fdf7f2
TL
10247void OSD::dequeue_delete(
10248 OSDShard *sdata,
10249 PG *pg,
10250 epoch_t e,
10251 ThreadPool::TPHandle& handle)
10252{
10253 dequeue_peering_evt(
10254 sdata,
10255 pg,
10256 PGPeeringEventRef(
10257 std::make_shared<PGPeeringEvent>(
10258 e, e,
10259 PG::DeleteSome())),
10260 handle);
10261}
10262
10263
10264
7c673cae
FG
10265// --------------------------------
10266
10267const char** OSD::get_tracked_conf_keys() const
10268{
10269 static const char* KEYS[] = {
10270 "osd_max_backfills",
10271 "osd_min_recovery_priority",
224ce89b
WB
10272 "osd_max_trimming_pgs",
10273 "osd_op_complaint_time",
10274 "osd_op_log_threshold",
10275 "osd_op_history_size",
10276 "osd_op_history_duration",
10277 "osd_op_history_slow_op_size",
10278 "osd_op_history_slow_op_threshold",
7c673cae
FG
10279 "osd_enable_op_tracker",
10280 "osd_map_cache_size",
11fdf7f2 10281 "osd_pg_epoch_max_lag_factor",
7c673cae 10282 "osd_pg_epoch_persisted_max_stale",
7c673cae
FG
10283 // clog & admin clog
10284 "clog_to_monitors",
10285 "clog_to_syslog",
10286 "clog_to_syslog_facility",
10287 "clog_to_syslog_level",
10288 "osd_objectstore_fuse",
10289 "clog_to_graylog",
10290 "clog_to_graylog_host",
10291 "clog_to_graylog_port",
10292 "host",
10293 "fsid",
10294 "osd_recovery_delay_start",
10295 "osd_client_message_size_cap",
10296 "osd_client_message_cap",
31f18b77
FG
10297 "osd_heartbeat_min_size",
10298 "osd_heartbeat_interval",
494da23a
TL
10299 "osd_scrub_min_interval",
10300 "osd_scrub_max_interval",
7c673cae
FG
10301 NULL
10302 };
10303 return KEYS;
10304}
10305
// Apply runtime config changes.  Called by the config-observer machinery
// with the set of option names that changed; each recognized option is
// pushed into the relevant subsystem.  osd_lock serializes this against
// other OSD state changes.
void OSD::handle_conf_change(const ConfigProxy& conf,
			     const std::set <std::string> &changed)
{
  Mutex::Locker l(osd_lock);
  if (changed.count("osd_max_backfills")) {
    // one knob feeds both the local and remote backfill reservers
    service.local_reserver.set_max(cct->_conf->osd_max_backfills);
    service.remote_reserver.set_max(cct->_conf->osd_max_backfills);
  }
  if (changed.count("osd_min_recovery_priority")) {
    service.local_reserver.set_min_priority(cct->_conf->osd_min_recovery_priority);
    service.remote_reserver.set_min_priority(cct->_conf->osd_min_recovery_priority);
  }
  if (changed.count("osd_max_trimming_pgs")) {
    service.snap_reserver.set_max(cct->_conf->osd_max_trimming_pgs);
  }
  if (changed.count("osd_op_complaint_time") ||
      changed.count("osd_op_log_threshold")) {
    op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time,
					   cct->_conf->osd_op_log_threshold);
  }
  if (changed.count("osd_op_history_size") ||
      changed.count("osd_op_history_duration")) {
    op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
					     cct->_conf->osd_op_history_duration);
  }
  if (changed.count("osd_op_history_slow_op_size") ||
      changed.count("osd_op_history_slow_op_threshold")) {
    op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size,
						      cct->_conf->osd_op_history_slow_op_threshold);
  }
  if (changed.count("osd_enable_op_tracker")) {
    op_tracker.set_tracking(cct->_conf->osd_enable_op_tracker);
  }
  if (changed.count("osd_map_cache_size")) {
    // all three osdmap caches (full maps, full-map bufferlists,
    // incremental bufferlists) share a single size knob
    service.map_cache.set_size(cct->_conf->osd_map_cache_size);
    service.map_bl_cache.set_size(cct->_conf->osd_map_cache_size);
    service.map_bl_inc_cache.set_size(cct->_conf->osd_map_cache_size);
  }
  if (changed.count("clog_to_monitors") ||
      changed.count("clog_to_syslog") ||
      changed.count("clog_to_syslog_level") ||
      changed.count("clog_to_syslog_facility") ||
      changed.count("clog_to_graylog") ||
      changed.count("clog_to_graylog_host") ||
      changed.count("clog_to_graylog_port") ||
      changed.count("host") ||
      changed.count("fsid")) {
    // any clog routing option: re-parse and push the whole log config
    update_log_config();
  }
  if (changed.count("osd_pg_epoch_max_lag_factor")) {
    m_osd_pg_epoch_max_lag_factor = conf.get_val<double>(
      "osd_pg_epoch_max_lag_factor");
  }

#ifdef HAVE_LIBFUSE
  if (changed.count("osd_objectstore_fuse")) {
    if (store) {
      // re-evaluate whether the objectstore fuse mount should be up
      enable_disable_fuse(false);
    }
  }
#endif

  if (changed.count("osd_recovery_delay_start")) {
    service.defer_recovery(cct->_conf->osd_recovery_delay_start);
    service.kick_recovery_queue();
  }

  if (changed.count("osd_client_message_cap")) {
    uint64_t newval = cct->_conf->osd_client_message_cap;
    Messenger::Policy pol = client_messenger->get_policy(entity_name_t::TYPE_CLIENT);
    // 0 disables the throttle; only push positive values into a live throttler
    if (pol.throttler_messages && newval > 0) {
      pol.throttler_messages->reset_max(newval);
    }
  }
  if (changed.count("osd_client_message_size_cap")) {
    uint64_t newval = cct->_conf->osd_client_message_size_cap;
    Messenger::Policy pol = client_messenger->get_policy(entity_name_t::TYPE_CLIENT);
    if (pol.throttler_bytes && newval > 0) {
      pol.throttler_bytes->reset_max(newval);
    }
  }

  if (changed.count("osd_scrub_min_interval") ||
      changed.count("osd_scrub_max_interval")) {
    resched_all_scrubs();
    dout(0) << __func__ << ": scrub interval change" << dendl;
  }
  // re-run sanity checks with the new values
  check_config();
}
10395
10396void OSD::update_log_config()
10397{
10398 map<string,string> log_to_monitors;
10399 map<string,string> log_to_syslog;
10400 map<string,string> log_channel;
10401 map<string,string> log_prio;
10402 map<string,string> log_to_graylog;
10403 map<string,string> log_to_graylog_host;
10404 map<string,string> log_to_graylog_port;
10405 uuid_d fsid;
10406 string host;
10407
10408 if (parse_log_client_options(cct, log_to_monitors, log_to_syslog,
10409 log_channel, log_prio, log_to_graylog,
10410 log_to_graylog_host, log_to_graylog_port,
10411 fsid, host) == 0)
10412 clog->update_config(log_to_monitors, log_to_syslog,
10413 log_channel, log_prio, log_to_graylog,
10414 log_to_graylog_host, log_to_graylog_port,
10415 fsid, host);
10416 derr << "log_to_monitors " << log_to_monitors << dendl;
10417}
10418
10419void OSD::check_config()
10420{
10421 // some sanity checks
7c673cae
FG
10422 if (cct->_conf->osd_map_cache_size <= (int)cct->_conf->osd_pg_epoch_persisted_max_stale + 2) {
10423 clog->warn() << "osd_map_cache_size (" << cct->_conf->osd_map_cache_size << ")"
10424 << " is not > osd_pg_epoch_persisted_max_stale ("
10425 << cct->_conf->osd_pg_epoch_persisted_max_stale << ")";
10426 }
10427}
10428
7c673cae
FG
10429// --------------------------------
10430
// Synchronously fetch the latest osdmap via the objecter, blocking the
// calling thread until it has arrived.
void OSD::get_latest_osdmap()
{
  dout(10) << __func__ << " -- start" << dendl;

  C_SaferCond cond;
  service.objecter->wait_for_latest_osdmap(&cond);
  cond.wait();

  dout(10) << __func__ << " -- finish" << dendl;
}
10441
10442// --------------------------------
10443
// Inspect every sub-op of an incoming MOSDOp and derive the request's
// rmw_flags: read/write/cache/pg-op bits plus cache-tier hints
// (promote / skip-promote / skip-handle-cache).  Returns 0 on success,
// a negative errno for cls lookup failures, or -EINVAL if no flags at
// all could be derived from the ops.
int OSD::init_op_flags(OpRequestRef& op)
{
  const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
  vector<OSDOp>::const_iterator iter;

  // client flags have no bearing on whether an op is a read, write, etc.
  op->rmw_flags = 0;

  if (m->has_flag(CEPH_OSD_FLAG_RWORDERED)) {
    op->set_force_rwordered();
  }

  // set bits based on op codes, called methods.
  for (iter = m->ops.begin(); iter != m->ops.end(); ++iter) {
    if ((iter->op.op == CEPH_OSD_OP_WATCH &&
	 iter->op.watch.op == CEPH_OSD_WATCH_OP_PING)) {
      /* This a bit odd.  PING isn't actually a write.  It can't
       * result in an update to the object_info.  PINGs also aren't
       * resent, so there's no reason to write out a log entry.
       *
       * However, we pipeline them behind writes, so let's force
       * the write_ordered flag.
       */
      op->set_force_rwordered();
    } else {
      if (ceph_osd_op_mode_modify(iter->op.op))
	op->set_write();
    }
    if (ceph_osd_op_mode_read(iter->op.op))
      op->set_read();

    // set READ flag if there are src_oids
    if (iter->soid.oid.name.length())
      op->set_read();

    // set PGOP flag if there are PG ops
    if (ceph_osd_op_type_pg(iter->op.op))
      op->set_pg_op();

    if (ceph_osd_op_mode_cache(iter->op.op))
      op->set_cache();

    // check for ec base pool
    int64_t poolid = m->get_pg().pool();
    const pg_pool_t *pool = osdmap->get_pg_pool(poolid);
    if (pool && pool->is_tier()) {
      const pg_pool_t *base_pool = osdmap->get_pg_pool(pool->tier_of);
      if (base_pool && base_pool->require_rollback()) {
	// Any op not on this whitelist needs the object promoted into the
	// cache tier before it can run against a rollback (EC) base pool.
	if ((iter->op.op != CEPH_OSD_OP_READ) &&
	    (iter->op.op != CEPH_OSD_OP_CHECKSUM) &&
	    (iter->op.op != CEPH_OSD_OP_CMPEXT) &&
	    (iter->op.op != CEPH_OSD_OP_STAT) &&
	    (iter->op.op != CEPH_OSD_OP_ISDIRTY) &&
	    (iter->op.op != CEPH_OSD_OP_UNDIRTY) &&
	    (iter->op.op != CEPH_OSD_OP_GETXATTR) &&
	    (iter->op.op != CEPH_OSD_OP_GETXATTRS) &&
	    (iter->op.op != CEPH_OSD_OP_CMPXATTR) &&
	    (iter->op.op != CEPH_OSD_OP_ASSERT_VER) &&
	    (iter->op.op != CEPH_OSD_OP_LIST_WATCHERS) &&
	    (iter->op.op != CEPH_OSD_OP_LIST_SNAPS) &&
	    (iter->op.op != CEPH_OSD_OP_SETALLOCHINT) &&
	    (iter->op.op != CEPH_OSD_OP_WRITEFULL) &&
	    (iter->op.op != CEPH_OSD_OP_ROLLBACK) &&
	    (iter->op.op != CEPH_OSD_OP_CREATE) &&
	    (iter->op.op != CEPH_OSD_OP_DELETE) &&
	    (iter->op.op != CEPH_OSD_OP_SETXATTR) &&
	    (iter->op.op != CEPH_OSD_OP_RMXATTR) &&
	    (iter->op.op != CEPH_OSD_OP_STARTSYNC) &&
	    (iter->op.op != CEPH_OSD_OP_COPY_GET) &&
	    (iter->op.op != CEPH_OSD_OP_COPY_FROM)) {
	  op->set_promote();
	}
      }
    }

    switch (iter->op.op) {
    case CEPH_OSD_OP_CALL:
      {
	// extract class/method names from the op payload and look up the
	// method's declared RD/WR/PROMOTE flags in the class handler
	bufferlist::iterator bp = const_cast<bufferlist&>(iter->indata).begin();
	int is_write, is_read;
	string cname, mname;
	bp.copy(iter->op.cls.class_len, cname);
	bp.copy(iter->op.cls.method_len, mname);

	ClassHandler::ClassData *cls;
	int r = class_handler->open_class(cname, &cls);
	if (r) {
	  derr << "class " << cname << " open got " << cpp_strerror(r) << dendl;
	  if (r == -ENOENT)
	    r = -EOPNOTSUPP;
	  else if (r != -EPERM) // propagate permission errors
	    r = -EIO;
	  return r;
	}
	int flags = cls->get_method_flags(mname.c_str());
	if (flags < 0) {
	  if (flags == -ENOENT)
	    r = -EOPNOTSUPP;
	  else
	    r = flags;
	  return r;
	}
	is_read = flags & CLS_METHOD_RD;
	is_write = flags & CLS_METHOD_WR;
	bool is_promote = flags & CLS_METHOD_PROMOTE;

	dout(10) << "class " << cname << " method " << mname << " "
		 << "flags=" << (is_read ? "r" : "")
		 << (is_write ? "w" : "")
		 << (is_promote ? "p" : "")
		 << dendl;
	if (is_read)
	  op->set_class_read();
	if (is_write)
	  op->set_class_write();
	if (is_promote)
	  op->set_promote();
	op->add_class(std::move(cname), std::move(mname), is_read, is_write,
		      cls->whitelisted);
	break;
      }

    case CEPH_OSD_OP_WATCH:
      // force the read bit for watch since it is depends on previous
      // watch state (and may return early if the watch exists) or, in
      // the case of ping, is simply a read op.
      op->set_read();
      // fall through
    case CEPH_OSD_OP_NOTIFY:
    case CEPH_OSD_OP_NOTIFY_ACK:
      {
	op->set_promote();
	break;
      }

    case CEPH_OSD_OP_DELETE:
      // if we get a delete with FAILOK we can skip handle cache. without
      // FAILOK we still need to promote (or do something smarter) to
      // determine whether to return ENOENT or 0.
      if (iter == m->ops.begin() &&
	  iter->op.flags == CEPH_OSD_OP_FLAG_FAILOK) {
	op->set_skip_handle_cache();
      }
      // skip promotion when proxying a delete op
      if (m->ops.size() == 1) {
	op->set_skip_promote();
      }
      break;

    case CEPH_OSD_OP_CACHE_TRY_FLUSH:
    case CEPH_OSD_OP_CACHE_FLUSH:
    case CEPH_OSD_OP_CACHE_EVICT:
      // If try_flush/flush/evict is the only op, can skip handle cache.
      if (m->ops.size() == 1) {
	op->set_skip_handle_cache();
      }
      break;

    case CEPH_OSD_OP_READ:
    case CEPH_OSD_OP_SYNC_READ:
    case CEPH_OSD_OP_SPARSE_READ:
    case CEPH_OSD_OP_CHECKSUM:
    case CEPH_OSD_OP_WRITEFULL:
      // client advised it won't reuse the data: don't bother promoting
      if (m->ops.size() == 1 &&
	  (iter->op.flags & CEPH_OSD_OP_FLAG_FADVISE_NOCACHE ||
	   iter->op.flags & CEPH_OSD_OP_FLAG_FADVISE_DONTNEED)) {
	op->set_skip_promote();
      }
      break;

    // force promotion when pin an object in cache tier
    case CEPH_OSD_OP_CACHE_PIN:
      op->set_promote();
      break;

    default:
      break;
    }
  }

  // an op that sets no flags at all is malformed
  if (op->rmw_flags == 0)
    return -EINVAL;

  return 0;
}
10629
11fdf7f2
TL
10630void OSD::set_perf_queries(
10631 const std::map<OSDPerfMetricQuery, OSDPerfMetricLimits> &queries) {
10632 dout(10) << "setting " << queries.size() << " queries" << dendl;
10633
10634 std::list<OSDPerfMetricQuery> supported_queries;
10635 for (auto &it : queries) {
10636 auto &query = it.first;
10637 if (!query.key_descriptor.empty()) {
10638 supported_queries.push_back(query);
10639 }
10640 }
10641 if (supported_queries.size() < queries.size()) {
10642 dout(1) << queries.size() - supported_queries.size()
10643 << " unsupported queries" << dendl;
10644 }
10645
10646 {
10647 Mutex::Locker locker(m_perf_queries_lock);
10648 m_perf_queries = supported_queries;
10649 m_perf_limits = queries;
10650 }
10651
10652 std::vector<PGRef> pgs;
10653 _get_pgs(&pgs);
10654 for (auto& pg : pgs) {
eafe8130
TL
10655 pg->lock();
10656 pg->set_dynamic_perf_stats_queries(supported_queries);
10657 pg->unlock();
7c673cae 10658 }
7c673cae
FG
10659}
10660
11fdf7f2
TL
10661void OSD::get_perf_reports(
10662 std::map<OSDPerfMetricQuery, OSDPerfMetricReport> *reports) {
10663 std::vector<PGRef> pgs;
10664 _get_pgs(&pgs);
10665 DynamicPerfStats dps;
10666 for (auto& pg : pgs) {
eafe8130
TL
10667 // m_perf_queries can be modified only in set_perf_queries by mgr client
10668 // request, and it is protected by by mgr client's lock, which is held
10669 // when set_perf_queries/get_perf_reports are called, so we may not hold
10670 // m_perf_queries_lock here.
10671 DynamicPerfStats pg_dps(m_perf_queries);
10672 pg->lock();
10673 pg->get_dynamic_perf_stats(&pg_dps);
10674 pg->unlock();
10675 dps.merge(pg_dps);
11fdf7f2
TL
10676 }
10677 dps.add_to_reports(m_perf_limits, reports);
10678 dout(20) << "reports for " << reports->size() << " queries" << dendl;
10679}
224ce89b 10680
7c673cae
FG
10681// =============================================================
10682
10683#undef dout_context
11fdf7f2 10684#define dout_context cct
7c673cae 10685#undef dout_prefix
11fdf7f2 10686#define dout_prefix *_dout << "osd." << osd->get_nodeid() << ":" << shard_id << "." << __func__ << " "
7c673cae 10687
11fdf7f2 10688void OSDShard::_attach_pg(OSDShardPGSlot *slot, PG *pg)
7c673cae 10689{
11fdf7f2
TL
10690 dout(10) << pg->pg_id << " " << pg << dendl;
10691 slot->pg = pg;
10692 pg->osd_shard = this;
10693 pg->pg_slot = slot;
10694 osd->inc_num_pgs();
10695
10696 slot->epoch = pg->get_osdmap_epoch();
10697 pg_slots_by_epoch.insert(*slot);
10698}
10699
10700void OSDShard::_detach_pg(OSDShardPGSlot *slot)
10701{
10702 dout(10) << slot->pg->pg_id << " " << slot->pg << dendl;
10703 slot->pg->osd_shard = nullptr;
10704 slot->pg->pg_slot = nullptr;
10705 slot->pg = nullptr;
10706 osd->dec_num_pgs();
10707
10708 pg_slots_by_epoch.erase(pg_slots_by_epoch.iterator_to(*slot));
10709 slot->epoch = 0;
10710 if (waiting_for_min_pg_epoch) {
10711 min_pg_epoch_cond.notify_all();
10712 }
10713}
10714
// Move a slot to its new position in the epoch-ordered intrusive set
// (erase + mutate key + re-insert) and wake wait_min_pg_epoch() waiters,
// since the shard-wide minimum pg epoch may have advanced.
void OSDShard::update_pg_epoch(OSDShardPGSlot *slot, epoch_t e)
{
  std::lock_guard l(shard_lock);
  dout(30) << "min was " << pg_slots_by_epoch.begin()->epoch
	   << " on " << pg_slots_by_epoch.begin()->pg->pg_id << dendl;
  pg_slots_by_epoch.erase(pg_slots_by_epoch.iterator_to(*slot));
  dout(20) << slot->pg->pg_id << " " << slot->epoch << " -> " << e << dendl;
  slot->epoch = e;
  pg_slots_by_epoch.insert(*slot);
  dout(30) << "min is now " << pg_slots_by_epoch.begin()->epoch
	   << " on " << pg_slots_by_epoch.begin()->pg->pg_id << dendl;
  if (waiting_for_min_pg_epoch) {
    min_pg_epoch_cond.notify_all();
  }
}
10730
10731epoch_t OSDShard::get_min_pg_epoch()
10732{
10733 std::lock_guard l(shard_lock);
10734 auto p = pg_slots_by_epoch.begin();
10735 if (p == pg_slots_by_epoch.end()) {
10736 return 0;
10737 }
10738 return p->epoch;
10739}
10740
10741void OSDShard::wait_min_pg_epoch(epoch_t need)
10742{
10743 std::unique_lock l{shard_lock};
10744 ++waiting_for_min_pg_epoch;
10745 min_pg_epoch_cond.wait(l, [need, this] {
10746 if (pg_slots_by_epoch.empty()) {
10747 return true;
10748 } else if (pg_slots_by_epoch.begin()->epoch >= need) {
10749 return true;
10750 } else {
10751 dout(10) << need << " waiting on "
10752 << pg_slots_by_epoch.begin()->epoch << dendl;
10753 return false;
10754 }
10755 });
10756 --waiting_for_min_pg_epoch;
10757}
10758
10759epoch_t OSDShard::get_max_waiting_epoch()
10760{
10761 std::lock_guard l(shard_lock);
10762 epoch_t r = 0;
10763 for (auto& i : pg_slots) {
10764 if (!i.second->waiting_peering.empty()) {
10765 r = std::max(r, i.second->waiting_peering.rbegin()->first);
10766 }
10767 }
10768 return r;
10769}
10770
// Install a new osdmap on this shard and re-evaluate every pg slot
// against it: requeue peering work that can now make progress, drop
// waiting client items that are stale or no longer map to us (returning
// their reserved pushes via *pushes_to_free), and prune slots that have
// become completely idle.
void OSDShard::consume_map(
  OSDMapRef& new_osdmap,
  unsigned *pushes_to_free)
{
  std::lock_guard l(shard_lock);
  OSDMapRef old_osdmap;
  {
    // osdmap_lock guards shard_osdmap for readers that don't take shard_lock
    std::lock_guard l(osdmap_lock);
    old_osdmap = std::move(shard_osdmap);
    shard_osdmap = new_osdmap;
  }
  dout(10) << new_osdmap->get_epoch()
           << " (was " << (old_osdmap ? old_osdmap->get_epoch() : 0) << ")"
           << dendl;
  bool queued = false;

  // check slots
  auto p = pg_slots.begin();
  while (p != pg_slots.end()) {
    OSDShardPGSlot *slot = p->second.get();
    const spg_t& pgid = p->first;
    dout(20) << __func__ << " " << pgid << dendl;
    if (!slot->waiting_for_split.empty()) {
      // a pending split will (re)process this slot when the child appears
      dout(20) << __func__ << " " << pgid
	       << " waiting for split " << slot->waiting_for_split << dendl;
      ++p;
      continue;
    }
    if (slot->waiting_for_merge_epoch > new_osdmap->get_epoch()) {
      // merge happens at a future epoch; leave the slot alone until then
      dout(20) << __func__ << " " << pgid
	       << " waiting for merge by epoch " << slot->waiting_for_merge_epoch
	       << dendl;
      ++p;
      continue;
    }
    if (!slot->waiting_peering.empty()) {
      epoch_t first = slot->waiting_peering.begin()->first;
      if (first <= new_osdmap->get_epoch()) {
	// the earliest waiting peering item can now run; requeue everything
	dout(20) << __func__ << " " << pgid
		 << " pending_peering first epoch " << first
		 << " <= " << new_osdmap->get_epoch() << ", requeueing" << dendl;
	_wake_pg_slot(pgid, slot);
	queued = true;
      }
      ++p;
      continue;
    }
    if (!slot->waiting.empty()) {
      if (new_osdmap->is_up_acting_osd_shard(pgid, osd->get_nodeid())) {
	dout(20) << __func__ << " " << pgid << " maps to us, keeping"
		 << dendl;
	++p;
	continue;
      }
      // pg no longer maps here: drop items the new map proves are stale or
      // misdirected, crediting back any recovery pushes they had reserved
      while (!slot->waiting.empty() &&
	     slot->waiting.front().get_map_epoch() <= new_osdmap->get_epoch()) {
	auto& qi = slot->waiting.front();
	dout(20) << __func__ << " " << pgid
		 << " waiting item " << qi
		 << " epoch " << qi.get_map_epoch()
		 << " <= " << new_osdmap->get_epoch()
		 << ", "
		 << (qi.get_map_epoch() < new_osdmap->get_epoch() ? "stale" :
		     "misdirected")
		 << ", dropping" << dendl;
	*pushes_to_free += qi.get_reserved_pushes();
	slot->waiting.pop_front();
      }
    }
    if (slot->waiting.empty() &&
	slot->num_running == 0 &&
	slot->waiting_for_split.empty() &&
	!slot->pg) {
      // nothing references this slot anymore
      dout(20) << __func__ << " " << pgid << " empty, pruning" << dendl;
      p = pg_slots.erase(p);
      continue;
    }

    ++p;
  }
  if (queued) {
    // wake a shard worker to pick up the requeued items
    std::lock_guard l{sdata_wait_lock};
    sdata_cond.notify_one();
  }
}
10856
11fdf7f2
TL
// Requeue everything this slot was holding back onto the front of the
// shard queue.  Each list is walked in reverse because _enqueue_front
// pushes to the front: that preserves the items' original relative
// order.  Caller holds shard_lock (all callers in this file do).
void OSDShard::_wake_pg_slot(
  spg_t pgid,
  OSDShardPGSlot *slot)
{
  dout(20) << __func__ << " " << pgid
	   << " to_process " << slot->to_process
	   << " waiting " << slot->waiting
	   << " waiting_peering " << slot->waiting_peering << dendl;
  for (auto i = slot->to_process.rbegin();
       i != slot->to_process.rend();
       ++i) {
    _enqueue_front(std::move(*i), osd->op_prio_cutoff);
  }
  slot->to_process.clear();
  for (auto i = slot->waiting.rbegin();
       i != slot->waiting.rend();
       ++i) {
    _enqueue_front(std::move(*i), osd->op_prio_cutoff);
  }
  slot->waiting.clear();
  for (auto i = slot->waiting_peering.rbegin();
       i != slot->waiting_peering.rend();
       ++i) {
    // this is overkill; we requeue everything, even if some of these
    // items are waiting for maps we don't have yet.  FIXME, maybe,
    // someday, if we decide this inefficiency matters
    for (auto j = i->second.rbegin(); j != i->second.rend(); ++j) {
      _enqueue_front(std::move(*j), osd->op_prio_cutoff);
    }
  }
  slot->waiting_peering.clear();
  // invalidate any requeue_seq snapshot a racing worker may hold
  ++slot->requeue_seq;
}
10890
10891void OSDShard::identify_splits_and_merges(
10892 const OSDMapRef& as_of_osdmap,
10893 set<pair<spg_t,epoch_t>> *split_pgs,
10894 set<pair<spg_t,epoch_t>> *merge_pgs)
10895{
10896 std::lock_guard l(shard_lock);
10897 if (shard_osdmap) {
10898 for (auto& i : pg_slots) {
10899 const spg_t& pgid = i.first;
10900 auto *slot = i.second.get();
10901 if (slot->pg) {
10902 osd->service.identify_splits_and_merges(
10903 shard_osdmap, as_of_osdmap, pgid,
10904 split_pgs, merge_pgs);
10905 } else if (!slot->waiting_for_split.empty()) {
10906 osd->service.identify_splits_and_merges(
10907 shard_osdmap, as_of_osdmap, pgid,
10908 split_pgs, nullptr);
10909 } else {
10910 dout(20) << __func__ << " slot " << pgid
10911 << " has no pg and waiting_for_split "
10912 << slot->waiting_for_split << dendl;
7c673cae 10913 }
11fdf7f2
TL
10914 }
10915 }
10916}
10917
// Pre-create ("prime") slots for the split children in *pgids that hash
// to this shard; consumed entries are erased from *pgids, the rest are
// left for other shards' passes.
void OSDShard::prime_splits(const OSDMapRef& as_of_osdmap,
			    set<pair<spg_t,epoch_t>> *pgids)
{
  std::lock_guard l(shard_lock);
  _prime_splits(pgids);
  if (shard_osdmap->get_epoch() > as_of_osdmap->get_epoch()) {
    // This shard already consumed a newer map than the one the split list
    // was computed against; the children may have split again since, so
    // identify and prime the grandchildren too.
    set<pair<spg_t,epoch_t>> newer_children;
    for (auto i : *pgids) {
      osd->service.identify_splits_and_merges(
	as_of_osdmap, shard_osdmap, i.first,
	&newer_children, nullptr);
    }
    newer_children.insert(pgids->begin(), pgids->end());
    dout(10) << "as_of_osdmap " << as_of_osdmap->get_epoch() << " < shard "
	     << shard_osdmap->get_epoch() << ", new children " << newer_children
	     << dendl;
    _prime_splits(&newer_children);
    // note: we don't care what is left over here for other shards.
    // if this shard is ahead of us and one isn't, e.g., one thread is
    // calling into prime_splits via _process (due to a newly created
    // pg) and this shard has a newer map due to a racing consume_map,
    // then any grandchildren left here will be identified (or were
    // identified) when the slower shard's osdmap is advanced.
    // _prime_splits() will tolerate the case where the pgid is
    // already primed.
  }
}
10945
10946void OSDShard::_prime_splits(set<pair<spg_t,epoch_t>> *pgids)
10947{
10948 dout(10) << *pgids << dendl;
10949 auto p = pgids->begin();
10950 while (p != pgids->end()) {
10951 unsigned shard_index = p->first.hash_to_shard(osd->num_shards);
10952 if (shard_index == shard_id) {
10953 auto r = pg_slots.emplace(p->first, nullptr);
10954 if (r.second) {
10955 dout(10) << "priming slot " << p->first << " e" << p->second << dendl;
10956 r.first->second = make_unique<OSDShardPGSlot>();
10957 r.first->second->waiting_for_split.insert(p->second);
7c673cae 10958 } else {
11fdf7f2
TL
10959 auto q = r.first;
10960 ceph_assert(q != pg_slots.end());
10961 dout(10) << "priming (existing) slot " << p->first << " e" << p->second
10962 << dendl;
10963 q->second->waiting_for_split.insert(p->second);
7c673cae 10964 }
11fdf7f2
TL
10965 p = pgids->erase(p);
10966 } else {
10967 ++p;
7c673cae
FG
10968 }
10969 }
11fdf7f2
TL
10970}
10971
// Ensure every merge participant in *merge_pgs that hashes to this shard
// has a slot (creating an empty PG instance if one doesn't exist and no
// earlier split will produce it), and mark the slot with the epoch the
// merge takes effect.  Consumed entries are erased from *merge_pgs.
void OSDShard::prime_merges(const OSDMapRef& as_of_osdmap,
			    set<pair<spg_t,epoch_t>> *merge_pgs)
{
  std::lock_guard l(shard_lock);
  dout(20) << __func__ << " checking shard " << shard_id
	   << " for remaining merge pgs " << merge_pgs << dendl;
  auto p = merge_pgs->begin();
  while (p != merge_pgs->end()) {
    spg_t pgid = p->first;
    epoch_t epoch = p->second;
    unsigned shard_index = pgid.hash_to_shard(osd->num_shards);
    if (shard_index != shard_id) {
      // another shard's responsibility
      ++p;
      continue;
    }
    OSDShardPGSlot *slot;
    auto r = pg_slots.emplace(pgid, nullptr);
    if (r.second) {
      r.first->second = make_unique<OSDShardPGSlot>();
    }
    slot = r.first->second.get();
    if (slot->pg) {
      // already have pg
      dout(20) << __func__ << " have merge participant pg " << pgid
	       << " " << slot->pg << dendl;
    } else if (!slot->waiting_for_split.empty() &&
	       *slot->waiting_for_split.begin() < epoch) {
      // an earlier split will instantiate this pg before the merge epoch
      dout(20) << __func__ << " pending split on merge participant pg " << pgid
	       << " " << slot->waiting_for_split << dendl;
    } else {
      dout(20) << __func__ << " creating empty merge participant " << pgid
	       << " for merge in " << epoch << dendl;
      // leave history zeroed; PG::merge_from() will fill it in.
      pg_history_t history;
      PGCreateInfo cinfo(pgid, epoch - 1,
			 history, PastIntervals(), false);
      PGRef pg = osd->handle_pg_create_info(shard_osdmap, &cinfo);
      _attach_pg(r.first->second.get(), pg.get());
      _wake_pg_slot(pgid, slot);
      // handle_pg_create_info returns the pg locked; release it now
      pg->unlock();
    }
    // mark slot for merge
    dout(20) << __func__ << " marking merge participant " << pgid << dendl;
    slot->waiting_for_merge_epoch = epoch;
    p = merge_pgs->erase(p);
  }
}
11019
// A split child PG has been constructed: attach it to its (previously
// primed) slot, and once no further splits remain pending for that slot,
// requeue the work that was held back waiting for the child to appear.
void OSDShard::register_and_wake_split_child(PG *pg)
{
  epoch_t epoch;
  {
    std::lock_guard l(shard_lock);
    dout(10) << pg->pg_id << " " << pg << dendl;
    auto p = pg_slots.find(pg->pg_id);
    ceph_assert(p != pg_slots.end());
    auto *slot = p->second.get();
    dout(20) << pg->pg_id << " waiting_for_split " << slot->waiting_for_split
	     << dendl;
    // the slot must have been primed (split pending, no pg yet)
    ceph_assert(!slot->pg);
    ceph_assert(!slot->waiting_for_split.empty());
    _attach_pg(slot, pg);

    epoch = pg->get_osdmap_epoch();
    ceph_assert(slot->waiting_for_split.count(epoch));
    slot->waiting_for_split.erase(epoch);
    if (slot->waiting_for_split.empty()) {
      _wake_pg_slot(pg->pg_id, slot);
    } else {
      // another split at a later epoch is still outstanding for this slot
      dout(10) << __func__ << " still waiting for split on "
	       << slot->waiting_for_split << dendl;
    }
  }

  // kick child to ensure it pulls up to the latest osdmap
  osd->enqueue_peering_evt(
    pg->pg_id,
    PGPeeringEventRef(
      std::make_shared<PGPeeringEvent>(
	epoch,
	epoch,
	NullEvt())));

  // wake a shard worker to pick up the requeued/enqueued items
  std::lock_guard l{sdata_wait_lock};
  sdata_cond.notify_one();
}
11058
11fdf7f2 11059void OSDShard::unprime_split_children(spg_t parent, unsigned old_pg_num)
7c673cae 11060{
11fdf7f2
TL
11061 std::lock_guard l(shard_lock);
11062 vector<spg_t> to_delete;
11063 for (auto& i : pg_slots) {
11064 if (i.first != parent &&
11065 i.first.get_ancestor(old_pg_num) == parent) {
11066 dout(10) << __func__ << " parent " << parent << " clearing " << i.first
11067 << dendl;
11068 _wake_pg_slot(i.first, i.second.get());
11069 to_delete.push_back(i.first);
11070 }
11071 }
11072 for (auto pgid : to_delete) {
11073 pg_slots.erase(pgid);
11074 }
11075}
11076
11077
11078// =============================================================
11079
11080#undef dout_context
11081#define dout_context osd->cct
11082#undef dout_prefix
11083#define dout_prefix *_dout << "osd." << osd->whoami << " op_wq "
11084
11085void OSD::ShardedOpWQ::_add_slot_waiter(
11086 spg_t pgid,
11087 OSDShardPGSlot *slot,
11088 OpQueueItem&& qi)
11089{
11090 if (qi.is_peering()) {
11091 dout(20) << __func__ << " " << pgid
11092 << " peering, item epoch is "
11093 << qi.get_map_epoch()
11094 << ", will wait on " << qi << dendl;
11095 slot->waiting_peering[qi.get_map_epoch()].push_back(std::move(qi));
11096 } else {
11097 dout(20) << __func__ << " " << pgid
11098 << " item epoch is "
11099 << qi.get_map_epoch()
11100 << ", will wait on " << qi << dendl;
11101 slot->waiting.push_back(std::move(qi));
7c673cae
FG
11102 }
11103}
11104
11105#undef dout_prefix
11106#define dout_prefix *_dout << "osd." << osd->whoami << " op_wq(" << shard_index << ") "
11107
11108void OSD::ShardedOpWQ::_process(uint32_t thread_index, heartbeat_handle_d *hb)
11109{
11fdf7f2
TL
11110 uint32_t shard_index = thread_index % osd->num_shards;
11111 auto& sdata = osd->shards[shard_index];
11112 ceph_assert(sdata);
11113
11114 // If all threads of shards do oncommits, there is a out-of-order
11115 // problem. So we choose the thread which has the smallest
11116 // thread_index(thread_index < num_shards) of shard to do oncommit
11117 // callback.
11118 bool is_smallest_thread_index = thread_index < osd->num_shards;
7c673cae
FG
11119
11120 // peek at spg_t
11fdf7f2
TL
11121 sdata->shard_lock.lock();
11122 if (sdata->pqueue->empty() &&
11123 (!is_smallest_thread_index || sdata->context_queue.empty())) {
11124 std::unique_lock wait_lock{sdata->sdata_wait_lock};
11125 if (is_smallest_thread_index && !sdata->context_queue.empty()) {
11126 // we raced with a context_queue addition, don't wait
11127 wait_lock.unlock();
11128 } else if (!sdata->stop_waiting) {
11129 dout(20) << __func__ << " empty q, waiting" << dendl;
11130 osd->cct->get_heartbeat_map()->clear_timeout(hb);
11131 sdata->shard_lock.unlock();
11132 sdata->sdata_cond.wait(wait_lock);
11133 wait_lock.unlock();
11134 sdata->shard_lock.lock();
11135 if (sdata->pqueue->empty() &&
11136 !(is_smallest_thread_index && !sdata->context_queue.empty())) {
11137 sdata->shard_lock.unlock();
11138 return;
11139 }
11140 osd->cct->get_heartbeat_map()->reset_timeout(hb,
11141 osd->cct->_conf->threadpool_default_timeout, 0);
11142 } else {
11143 dout(20) << __func__ << " need return immediately" << dendl;
11144 wait_lock.unlock();
11145 sdata->shard_lock.unlock();
7c673cae
FG
11146 return;
11147 }
11148 }
11fdf7f2
TL
11149
11150 list<Context *> oncommits;
11151 if (is_smallest_thread_index && !sdata->context_queue.empty()) {
11152 sdata->context_queue.swap(oncommits);
7c673cae 11153 }
11fdf7f2
TL
11154
11155 if (sdata->pqueue->empty()) {
11156 if (osd->is_stopping()) {
11157 sdata->shard_lock.unlock();
11158 for (auto c : oncommits) {
11159 dout(10) << __func__ << " discarding in-flight oncommit " << c << dendl;
11160 delete c;
11161 }
11162 return; // OSD shutdown, discard.
7c673cae 11163 }
11fdf7f2
TL
11164 sdata->shard_lock.unlock();
11165 handle_oncommits(oncommits);
11166 return;
7c673cae 11167 }
7c673cae 11168
11fdf7f2
TL
11169 OpQueueItem item = sdata->pqueue->dequeue();
11170 if (osd->is_stopping()) {
11171 sdata->shard_lock.unlock();
11172 for (auto c : oncommits) {
11173 dout(10) << __func__ << " discarding in-flight oncommit " << c << dendl;
11174 delete c;
11175 }
11176 return; // OSD shutdown, discard.
11177 }
7c673cae 11178
11fdf7f2
TL
11179 const auto token = item.get_ordering_token();
11180 auto r = sdata->pg_slots.emplace(token, nullptr);
11181 if (r.second) {
11182 r.first->second = make_unique<OSDShardPGSlot>();
7c673cae 11183 }
11fdf7f2
TL
11184 OSDShardPGSlot *slot = r.first->second.get();
11185 dout(20) << __func__ << " " << token
11186 << (r.second ? " (new)" : "")
11187 << " to_process " << slot->to_process
11188 << " waiting " << slot->waiting
11189 << " waiting_peering " << slot->waiting_peering
11190 << dendl;
11191 slot->to_process.push_back(std::move(item));
11192 dout(20) << __func__ << " " << slot->to_process.back()
11193 << " queued" << dendl;
7c673cae 11194
11fdf7f2
TL
11195 retry_pg:
11196 PGRef pg = slot->pg;
7c673cae 11197
11fdf7f2
TL
11198 // lock pg (if we have it)
11199 if (pg) {
11200 // note the requeue seq now...
11201 uint64_t requeue_seq = slot->requeue_seq;
11202 ++slot->num_running;
7c673cae 11203
11fdf7f2
TL
11204 sdata->shard_lock.unlock();
11205 osd->service.maybe_inject_dispatch_delay();
11206 pg->lock();
11207 osd->service.maybe_inject_dispatch_delay();
11208 sdata->shard_lock.lock();
7c673cae 11209
11fdf7f2
TL
11210 auto q = sdata->pg_slots.find(token);
11211 if (q == sdata->pg_slots.end()) {
11212 // this can happen if we race with pg removal.
11213 dout(20) << __func__ << " slot " << token << " no longer there" << dendl;
11214 pg->unlock();
11215 sdata->shard_lock.unlock();
11216 handle_oncommits(oncommits);
11217 return;
11218 }
11219 slot = q->second.get();
11220 --slot->num_running;
7c673cae 11221
11fdf7f2
TL
11222 if (slot->to_process.empty()) {
11223 // raced with _wake_pg_slot or consume_map
11224 dout(20) << __func__ << " " << token
11225 << " nothing queued" << dendl;
7c673cae 11226 pg->unlock();
11fdf7f2
TL
11227 sdata->shard_lock.unlock();
11228 handle_oncommits(oncommits);
11229 return;
7c673cae 11230 }
11fdf7f2
TL
11231 if (requeue_seq != slot->requeue_seq) {
11232 dout(20) << __func__ << " " << token
11233 << " requeue_seq " << slot->requeue_seq << " > our "
11234 << requeue_seq << ", we raced with _wake_pg_slot"
11235 << dendl;
7c673cae 11236 pg->unlock();
11fdf7f2
TL
11237 sdata->shard_lock.unlock();
11238 handle_oncommits(oncommits);
11239 return;
7c673cae 11240 }
11fdf7f2
TL
11241 if (slot->pg != pg) {
11242 // this can happen if we race with pg removal.
11243 dout(20) << __func__ << " slot " << token << " no longer attached to "
11244 << pg << dendl;
7c673cae 11245 pg->unlock();
11fdf7f2 11246 goto retry_pg;
7c673cae 11247 }
7c673cae
FG
11248 }
11249
11fdf7f2
TL
11250 dout(20) << __func__ << " " << token
11251 << " to_process " << slot->to_process
11252 << " waiting " << slot->waiting
11253 << " waiting_peering " << slot->waiting_peering << dendl;
11254
11255 ThreadPool::TPHandle tp_handle(osd->cct, hb, timeout_interval,
11256 suicide_interval);
11257
7c673cae 11258 // take next item
11fdf7f2
TL
11259 auto qi = std::move(slot->to_process.front());
11260 slot->to_process.pop_front();
11261 dout(20) << __func__ << " " << qi << " pg " << pg << dendl;
11262 set<pair<spg_t,epoch_t>> new_children;
11263 OSDMapRef osdmap;
7c673cae 11264
11fdf7f2 11265 while (!pg) {
7c673cae 11266 // should this pg shard exist on this osd in this (or a later) epoch?
11fdf7f2
TL
11267 osdmap = sdata->shard_osdmap;
11268 const PGCreateInfo *create_info = qi.creates_pg();
11269 if (!slot->waiting_for_split.empty()) {
11270 dout(20) << __func__ << " " << token
11271 << " splitting " << slot->waiting_for_split << dendl;
11272 _add_slot_waiter(token, slot, std::move(qi));
11273 } else if (qi.get_map_epoch() > osdmap->get_epoch()) {
11274 dout(20) << __func__ << " " << token
11275 << " map " << qi.get_map_epoch() << " > "
11276 << osdmap->get_epoch() << dendl;
11277 _add_slot_waiter(token, slot, std::move(qi));
11278 } else if (qi.is_peering()) {
11279 if (!qi.peering_requires_pg()) {
11280 // for pg-less events, we run them under the ordering lock, since
11281 // we don't have the pg lock to keep them ordered.
11282 qi.run(osd, sdata, pg, tp_handle);
11283 } else if (osdmap->is_up_acting_osd_shard(token, osd->whoami)) {
11284 if (create_info) {
11285 if (create_info->by_mon &&
11286 osdmap->get_pg_acting_primary(token.pgid) != osd->whoami) {
11287 dout(20) << __func__ << " " << token
11288 << " no pg, no longer primary, ignoring mon create on "
11289 << qi << dendl;
11290 } else {
11291 dout(20) << __func__ << " " << token
11292 << " no pg, should create on " << qi << dendl;
11293 pg = osd->handle_pg_create_info(osdmap, create_info);
11294 if (pg) {
11295 // we created the pg! drop out and continue "normally"!
11296 sdata->_attach_pg(slot, pg.get());
11297 sdata->_wake_pg_slot(token, slot);
11298
11299 // identify split children between create epoch and shard epoch.
11300 osd->service.identify_splits_and_merges(
11301 pg->get_osdmap(), osdmap, pg->pg_id, &new_children, nullptr);
11302 sdata->_prime_splits(&new_children);
11303 // distribute remaining split children to other shards below!
11304 break;
11305 }
11306 dout(20) << __func__ << " ignored create on " << qi << dendl;
11307 }
11308 } else {
11309 dout(20) << __func__ << " " << token
11310 << " no pg, peering, !create, discarding " << qi << dendl;
11311 }
11312 } else {
11313 dout(20) << __func__ << " " << token
11314 << " no pg, peering, doesn't map here e" << osdmap->get_epoch()
11315 << ", discarding " << qi
11316 << dendl;
11317 }
11318 } else if (osdmap->is_up_acting_osd_shard(token, osd->whoami)) {
11319 dout(20) << __func__ << " " << token
11320 << " no pg, should exist e" << osdmap->get_epoch()
11321 << ", will wait on " << qi << dendl;
11322 _add_slot_waiter(token, slot, std::move(qi));
7c673cae 11323 } else {
11fdf7f2
TL
11324 dout(20) << __func__ << " " << token
11325 << " no pg, shouldn't exist e" << osdmap->get_epoch()
11326 << ", dropping " << qi << dendl;
7c673cae 11327 // share map with client?
11fdf7f2
TL
11328 if (boost::optional<OpRequestRef> _op = qi.maybe_get_op()) {
11329 auto priv = (*_op)->get_req()->get_connection()->get_priv();
11330 if (auto session = static_cast<Session *>(priv.get()); session) {
11331 osd->maybe_share_map(session, *_op, sdata->shard_osdmap);
7c673cae
FG
11332 }
11333 }
11fdf7f2 11334 unsigned pushes_to_free = qi.get_reserved_pushes();
7c673cae 11335 if (pushes_to_free > 0) {
11fdf7f2 11336 sdata->shard_lock.unlock();
7c673cae 11337 osd->service.release_reserved_pushes(pushes_to_free);
11fdf7f2 11338 handle_oncommits(oncommits);
7c673cae
FG
11339 return;
11340 }
11341 }
11fdf7f2
TL
11342 sdata->shard_lock.unlock();
11343 handle_oncommits(oncommits);
7c673cae
FG
11344 return;
11345 }
11fdf7f2
TL
11346 if (qi.is_peering()) {
11347 OSDMapRef osdmap = sdata->shard_osdmap;
11348 if (qi.get_map_epoch() > osdmap->get_epoch()) {
11349 _add_slot_waiter(token, slot, std::move(qi));
11350 sdata->shard_lock.unlock();
11351 pg->unlock();
11352 handle_oncommits(oncommits);
11353 return;
11354 }
11355 }
11356 sdata->shard_lock.unlock();
7c673cae 11357
11fdf7f2
TL
11358 if (!new_children.empty()) {
11359 for (auto shard : osd->shards) {
11360 shard->prime_splits(osdmap, &new_children);
11361 }
11362 ceph_assert(new_children.empty());
11363 }
7c673cae
FG
11364
11365 // osd_opwq_process marks the point at which an operation has been dequeued
11366 // and will begin to be handled by a worker thread.
11367 {
11368#ifdef WITH_LTTNG
11369 osd_reqid_t reqid;
11fdf7f2 11370 if (boost::optional<OpRequestRef> _op = qi.maybe_get_op()) {
7c673cae
FG
11371 reqid = (*_op)->get_reqid();
11372 }
11373#endif
11374 tracepoint(osd, opwq_process_start, reqid.name._type,
11375 reqid.name._num, reqid.tid, reqid.inc);
11376 }
11377
11378 lgeneric_subdout(osd->cct, osd, 30) << "dequeue status: ";
11379 Formatter *f = Formatter::create("json");
11380 f->open_object_section("q");
11381 dump(f);
11382 f->close_section();
11383 f->flush(*_dout);
11384 delete f;
11385 *_dout << dendl;
11386
11fdf7f2 11387 qi.run(osd, sdata, pg, tp_handle);
7c673cae
FG
11388
11389 {
11390#ifdef WITH_LTTNG
11391 osd_reqid_t reqid;
11fdf7f2 11392 if (boost::optional<OpRequestRef> _op = qi.maybe_get_op()) {
7c673cae
FG
11393 reqid = (*_op)->get_reqid();
11394 }
11395#endif
11396 tracepoint(osd, opwq_process_finish, reqid.name._type,
11397 reqid.name._num, reqid.tid, reqid.inc);
11398 }
11399
11fdf7f2 11400 handle_oncommits(oncommits);
7c673cae
FG
11401}
11402
11fdf7f2 11403void OSD::ShardedOpWQ::_enqueue(OpQueueItem&& item) {
7c673cae 11404 uint32_t shard_index =
11fdf7f2 11405 item.get_ordering_token().hash_to_shard(osd->shards.size());
7c673cae 11406
11fdf7f2 11407 OSDShard* sdata = osd->shards[shard_index];
7c673cae 11408 assert (NULL != sdata);
11fdf7f2
TL
11409 unsigned priority = item.get_priority();
11410 unsigned cost = item.get_cost();
11411 sdata->shard_lock.lock();
7c673cae 11412
11fdf7f2 11413 dout(20) << __func__ << " " << item << dendl;
7c673cae
FG
11414 if (priority >= osd->op_prio_cutoff)
11415 sdata->pqueue->enqueue_strict(
11fdf7f2 11416 item.get_owner(), priority, std::move(item));
7c673cae
FG
11417 else
11418 sdata->pqueue->enqueue(
11fdf7f2
TL
11419 item.get_owner(), priority, cost, std::move(item));
11420 sdata->shard_lock.unlock();
7c673cae 11421
11fdf7f2
TL
11422 std::lock_guard l{sdata->sdata_wait_lock};
11423 sdata->sdata_cond.notify_one();
7c673cae
FG
11424}
11425
// Requeue an item at the *front* of its shard's queue (used when an op
// must be retried ahead of newer work), preserving per-PG ordering with
// respect to anything _process() has already staged on the pg slot.
void OSD::ShardedOpWQ::_enqueue_front(OpQueueItem&& item)
{
  auto shard_index = item.get_ordering_token().hash_to_shard(osd->shards.size());
  auto& sdata = osd->shards[shard_index];
  ceph_assert(sdata);
  sdata->shard_lock.lock();
  auto p = sdata->pg_slots.find(item.get_ordering_token());
  if (p != sdata->pg_slots.end() &&
      !p->second->to_process.empty()) {
    // we may be racing with _process, which has dequeued a new item
    // from pqueue, put it on to_process, and is now busy taking the
    // pg lock. ensure this old requeued item is ordered before any
    // such newer item in to_process.
    //
    // swap trick: push the old item to the front of to_process, then
    // pull the newest staged item off the back into 'item' so it is
    // the one pushed to the pqueue front below. net effect: the two
    // items trade places, and the older (requeued) one runs first.
    p->second->to_process.push_front(std::move(item));
    item = std::move(p->second->to_process.back());
    p->second->to_process.pop_back();
    dout(20) << __func__
	     << " " << p->second->to_process.front()
	     << " shuffled w/ " << item << dendl;
  } else {
    // no staged work for this PG; just push straight to the queue front
    dout(20) << __func__ << " " << item << dendl;
  }
  sdata->_enqueue_front(std::move(item), osd->op_prio_cutoff);
  sdata->shard_lock.unlock();
  // wake a worker after releasing shard_lock; wait lock scope is disjoint
  std::lock_guard l{sdata->sdata_wait_lock};
  sdata->sdata_cond.notify_one();
}
11453
11454namespace ceph {
11455namespace osd_cmds {
11456
11fdf7f2
TL
11457int heap(CephContext& cct, const cmdmap_t& cmdmap, Formatter& f,
11458 std::ostream& os)
7c673cae
FG
11459{
11460 if (!ceph_using_tcmalloc()) {
11461 os << "could not issue heap profiler command -- not using tcmalloc!";
11462 return -EOPNOTSUPP;
11463 }
11464
11465 string cmd;
11466 if (!cmd_getval(&cct, cmdmap, "heapcmd", cmd)) {
11467 os << "unable to get value for command \"" << cmd << "\"";
11468 return -EINVAL;
11fdf7f2 11469 }
7c673cae
FG
11470
11471 std::vector<std::string> cmd_vec;
11472 get_str_vec(cmd, cmd_vec);
11fdf7f2
TL
11473
11474 string val;
11475 if (cmd_getval(&cct, cmdmap, "value", val)) {
11476 cmd_vec.push_back(val);
11477 }
7c673cae
FG
11478
11479 ceph_heap_profiler_handle_command(cmd_vec, os);
11480
11481 return 0;
11482}
11483
11484}} // namespace ceph::osd_cmds
11485
224ce89b 11486
11fdf7f2 11487std::ostream& operator<<(std::ostream& out, const io_queue& q) {
224ce89b 11488 switch(q) {
11fdf7f2 11489 case io_queue::prioritized:
224ce89b
WB
11490 out << "prioritized";
11491 break;
11fdf7f2 11492 case io_queue::weightedpriority:
224ce89b
WB
11493 out << "weightedpriority";
11494 break;
11fdf7f2 11495 case io_queue::mclock_opclass:
224ce89b
WB
11496 out << "mclock_opclass";
11497 break;
11fdf7f2 11498 case io_queue::mclock_client:
224ce89b
WB
11499 out << "mclock_client";
11500 break;
11501 }
11502 return out;
11503}