]> git.proxmox.com Git - ceph.git/blame - ceph/src/osd/OSD.cc
import 14.2.4 nautilus point release
[ceph.git] / ceph / src / osd / OSD.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2017 OVH
8 *
9 * This is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License version 2.1, as published by the Free Software
12 * Foundation. See file COPYING.
13 *
14 */
11fdf7f2 15
7c673cae 16#include "acconfig.h"
11fdf7f2
TL
17
18#include <cctype>
7c673cae
FG
19#include <fstream>
20#include <iostream>
11fdf7f2
TL
21#include <iterator>
22
23#include <unistd.h>
7c673cae
FG
24#include <sys/stat.h>
25#include <signal.h>
7c673cae
FG
26#include <boost/scoped_ptr.hpp>
27
28#ifdef HAVE_SYS_PARAM_H
29#include <sys/param.h>
30#endif
31
32#ifdef HAVE_SYS_MOUNT_H
33#include <sys/mount.h>
34#endif
35
36#include "osd/PG.h"
37
38#include "include/types.h"
39#include "include/compat.h"
11fdf7f2 40#include "include/random.h"
7c673cae
FG
41
42#include "OSD.h"
43#include "OSDMap.h"
44#include "Watch.h"
45#include "osdc/Objecter.h"
46
47#include "common/errno.h"
48#include "common/ceph_argparse.h"
224ce89b 49#include "common/ceph_time.h"
7c673cae 50#include "common/version.h"
b5b8bbf5 51#include "common/pick_address.h"
11fdf7f2
TL
52#include "common/blkdev.h"
53#include "common/numa.h"
7c673cae
FG
54
55#include "os/ObjectStore.h"
56#ifdef HAVE_LIBFUSE
57#include "os/FuseStore.h"
58#endif
59
60#include "PrimaryLogPG.h"
61
7c673cae
FG
62#include "msg/Messenger.h"
63#include "msg/Message.h"
64
65#include "mon/MonClient.h"
66
67#include "messages/MLog.h"
68
69#include "messages/MGenericMessage.h"
7c673cae
FG
70#include "messages/MOSDPing.h"
71#include "messages/MOSDFailure.h"
72#include "messages/MOSDMarkMeDown.h"
73#include "messages/MOSDFull.h"
74#include "messages/MOSDOp.h"
75#include "messages/MOSDOpReply.h"
76#include "messages/MOSDBackoff.h"
77#include "messages/MOSDBeacon.h"
78#include "messages/MOSDRepOp.h"
79#include "messages/MOSDRepOpReply.h"
80#include "messages/MOSDBoot.h"
81#include "messages/MOSDPGTemp.h"
11fdf7f2 82#include "messages/MOSDPGReadyToMerge.h"
7c673cae
FG
83
84#include "messages/MOSDMap.h"
85#include "messages/MMonGetOSDMap.h"
86#include "messages/MOSDPGNotify.h"
87#include "messages/MOSDPGQuery.h"
88#include "messages/MOSDPGLog.h"
89#include "messages/MOSDPGRemove.h"
90#include "messages/MOSDPGInfo.h"
91#include "messages/MOSDPGCreate.h"
11fdf7f2 92#include "messages/MOSDPGCreate2.h"
7c673cae
FG
93#include "messages/MOSDPGTrim.h"
94#include "messages/MOSDPGScan.h"
7c673cae
FG
95#include "messages/MBackfillReserve.h"
96#include "messages/MRecoveryReserve.h"
c07f9fc5 97#include "messages/MOSDForceRecovery.h"
7c673cae
FG
98#include "messages/MOSDECSubOpWrite.h"
99#include "messages/MOSDECSubOpWriteReply.h"
100#include "messages/MOSDECSubOpRead.h"
101#include "messages/MOSDECSubOpReadReply.h"
102#include "messages/MOSDPGCreated.h"
103#include "messages/MOSDPGUpdateLogMissing.h"
104#include "messages/MOSDPGUpdateLogMissingReply.h"
105
11fdf7f2
TL
106#include "messages/MOSDPeeringOp.h"
107
7c673cae
FG
108#include "messages/MOSDAlive.h"
109
110#include "messages/MOSDScrub.h"
11fdf7f2 111#include "messages/MOSDScrub2.h"
7c673cae
FG
112#include "messages/MOSDRepScrub.h"
113
114#include "messages/MMonCommand.h"
115#include "messages/MCommand.h"
116#include "messages/MCommandReply.h"
117
118#include "messages/MPGStats.h"
119#include "messages/MPGStatsAck.h"
120
121#include "messages/MWatchNotify.h"
122#include "messages/MOSDPGPush.h"
123#include "messages/MOSDPGPushReply.h"
124#include "messages/MOSDPGPull.h"
125
126#include "common/perf_counters.h"
127#include "common/Timer.h"
128#include "common/LogClient.h"
129#include "common/AsyncReserver.h"
130#include "common/HeartbeatMap.h"
131#include "common/admin_socket.h"
132#include "common/ceph_context.h"
133
134#include "global/signal_handler.h"
135#include "global/pidfile.h"
136
137#include "include/color.h"
138#include "perfglue/cpu_profiler.h"
139#include "perfglue/heap_profiler.h"
140
141#include "osd/OpRequest.h"
142
143#include "auth/AuthAuthorizeHandler.h"
144#include "auth/RotatingKeyRing.h"
7c673cae
FG
145
146#include "objclass/objclass.h"
147
148#include "common/cmdparse.h"
149#include "include/str_list.h"
150#include "include/util.h"
151
11fdf7f2 152#include "include/ceph_assert.h"
7c673cae
FG
153#include "common/config.h"
154#include "common/EventTrace.h"
155
11fdf7f2
TL
156#include "json_spirit/json_spirit_reader.h"
157#include "json_spirit/json_spirit_writer.h"
158
7c673cae
FG
159#ifdef WITH_LTTNG
160#define TRACEPOINT_DEFINE
161#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
162#include "tracing/osd.h"
163#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
164#undef TRACEPOINT_DEFINE
165#else
166#define tracepoint(...)
167#endif
168
169#define dout_context cct
170#define dout_subsys ceph_subsys_osd
171#undef dout_prefix
172#define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
173
224ce89b 174
7c673cae
FG
175static ostream& _prefix(std::ostream* _dout, int whoami, epoch_t epoch) {
176 return *_dout << "osd." << whoami << " " << epoch << " ";
177}
178
7c673cae
FG
179//Initial features in new superblock.
180//Features here are also automatically upgraded
181CompatSet OSD::get_osd_initial_compat_set() {
182 CompatSet::FeatureSet ceph_osd_feature_compat;
183 CompatSet::FeatureSet ceph_osd_feature_ro_compat;
184 CompatSet::FeatureSet ceph_osd_feature_incompat;
185 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
186 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGINFO);
187 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_OLOC);
188 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEC);
189 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES);
190 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL);
191 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BIGINFO);
192 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO);
193 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG);
194 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER);
195 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HINTS);
196 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGMETA);
197 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_MISSING);
198 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_FASTINFO);
c07f9fc5 199 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_RECOVERY_DELETES);
7c673cae
FG
200 return CompatSet(ceph_osd_feature_compat, ceph_osd_feature_ro_compat,
201 ceph_osd_feature_incompat);
202}
203
204//Features are added here that this OSD supports.
205CompatSet OSD::get_osd_compat_set() {
206 CompatSet compat = get_osd_initial_compat_set();
207 //Any features here can be set in code, but not in initial superblock
208 compat.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
209 return compat;
210}
211
// OSDService constructor: wires the service object to its owning OSD,
// aliases the OSD's messengers/loggers/monc, and initializes all of the
// per-subsystem locks, timers, reservers and caches.
// NOTE: the member-initializer list must stay in member declaration
// order; do not reorder entries.
OSDService::OSDService(OSD *osd) :
  osd(osd),
  cct(osd->cct),
  whoami(osd->whoami), store(osd->store),
  log_client(osd->log_client), clog(osd->clog),
  pg_recovery_stats(osd->pg_recovery_stats),
  cluster_messenger(osd->cluster_messenger),
  client_messenger(osd->client_messenger),
  logger(osd->logger),
  recoverystate_perf(osd->recoverystate_perf),
  monc(osd->monc),
  class_handler(osd->class_handler),
  // Tracked config values; follow run-time config changes.
  osd_max_object_size(cct->_conf, "osd_max_object_size"),
  osd_skip_data_digest(cct->_conf, "osd_skip_data_digest"),
  publish_lock{ceph::make_mutex("OSDService::publish_lock")},
  pre_publish_lock{ceph::make_mutex("OSDService::pre_publish_lock")},
  max_oldest_map(0),
  peer_map_epoch_lock("OSDService::peer_map_epoch_lock"),
  sched_scrub_lock("OSDService::sched_scrub_lock"), scrubs_pending(0),
  scrubs_active(0),
  // Tiering-agent state; agent_timer shares the agent_timer_lock.
  agent_lock("OSDService::agent_lock"),
  agent_valid_iterator(false),
  agent_ops(0),
  flush_mode_high_count(0),
  agent_active(true),
  agent_thread(this),
  agent_stop_flag(false),
  agent_timer_lock("OSDService::agent_timer_lock"),
  agent_timer(osd->client_messenger->cct, agent_timer_lock),
  last_recalibrate(ceph_clock_now()),
  promote_max_objects(0),
  promote_max_bytes(0),
  // Objecter rides the dedicated objecter_messenger, not the client one.
  objecter(new Objecter(osd->client_messenger->cct, osd->objecter_messenger, osd->monc, NULL, 0, 0)),
  m_objecter_finishers(cct->_conf->osd_objecter_finishers),
  watch_lock("OSDService::watch_lock"),
  watch_timer(osd->client_messenger->cct, watch_lock),
  next_notif_id(0),
  recovery_request_lock("OSDService::recovery_request_lock"),
  recovery_request_timer(cct, recovery_request_lock, false),
  sleep_lock("OSDService::sleep_lock"),
  sleep_timer(cct, sleep_lock, false),
  reserver_finisher(cct),
  // Local and remote backfill reservers share the same limits.
  local_reserver(cct, &reserver_finisher, cct->_conf->osd_max_backfills,
		 cct->_conf->osd_min_recovery_priority),
  remote_reserver(cct, &reserver_finisher, cct->_conf->osd_max_backfills,
		  cct->_conf->osd_min_recovery_priority),
  pg_temp_lock("OSDService::pg_temp_lock"),
  snap_reserver(cct, &reserver_finisher,
		cct->_conf->osd_max_trimming_pgs),
  recovery_lock("OSDService::recovery_lock"),
  recovery_ops_active(0),
  recovery_ops_reserved(0),
  recovery_paused(false),
  // OSDMap caches: full maps plus encoded full/incremental buffers.
  map_cache_lock("OSDService::map_cache_lock"),
  map_cache(cct, cct->_conf->osd_map_cache_size),
  map_bl_cache(cct->_conf->osd_map_cache_size),
  map_bl_inc_cache(cct->_conf->osd_map_cache_size),
  stat_lock("OSDService::stat_lock"),
  full_status_lock("OSDService::full_status_lock"),
  cur_state(NONE),
  cur_ratio(0), physical_ratio(0),
  epoch_lock("OSDService::epoch_lock"),
  boot_epoch(0), up_epoch(0), bind_epoch(0),
  is_stopping_lock("OSDService::is_stopping_lock")
#ifdef PG_DEBUG_REFS
  , pgid_lock("OSDService::pgid_lock")
#endif
{
  objecter->init();

  // Spin up the configured number of objecter finisher threads;
  // they are started later in init() and torn down in shutdown().
  for (int i = 0; i < m_objecter_finishers; i++) {
    ostringstream str;
    str << "objecter-finisher-" << i;
    Finisher *fin = new Finisher(osd->client_messenger->cct, str.str(), "finisher");
    objecter_finishers.push_back(fin);
  }
}
289
290OSDService::~OSDService()
291{
292 delete objecter;
11fdf7f2
TL
293
294 for (auto f : objecter_finishers) {
295 delete f;
296 f = NULL;
297 }
7c673cae
FG
298}
299
31f18b77
FG
300
301
#ifdef PG_DEBUG_REFS
// Debug-only PG reference tracking: count outstanding refs per pgid and
// remember a live PG pointer so leaked refs can be dumped.
void OSDService::add_pgid(spg_t pgid, PG *pg){
  std::lock_guard l(pgid_lock);
  if (pgid_tracker.find(pgid) == pgid_tracker.end()) {
    // First reference for this pgid; remember the PG for dumping.
    live_pgs[pgid] = pg;
  }
  ++pgid_tracker[pgid];
}
void OSDService::remove_pgid(spg_t pgid, PG *pg)
{
  std::lock_guard l(pgid_lock);
  auto it = pgid_tracker.find(pgid);
  ceph_assert(it != pgid_tracker.end());
  ceph_assert(it->second > 0);
  if (--it->second == 0) {
    pgid_tracker.erase(it);
    live_pgs.erase(pgid);
  }
}
void OSDService::dump_live_pgids()
{
  std::lock_guard l(pgid_lock);
  derr << "live pgids:" << dendl;
  for (const auto& tracked : pgid_tracker) {
    derr << "\t" << tracked << dendl;
    live_pgs[tracked.first]->dump_live_ids();
  }
}
#endif
333
334
7c673cae 335
11fdf7f2
TL
// Walk the recorded pg_num history for pgid's pool between old_map and
// new_map and report every PG that is created by a split
// (*split_children) or consumed/produced by a merge (*merge_pgs),
// tagged with the epoch at which the pg_num change takes effect.
// Newly discovered split children are themselves re-examined (BFS via
// 'queue') so that chained splits across several epochs are all found.
void OSDService::identify_splits_and_merges(
  OSDMapRef old_map,
  OSDMapRef new_map,
  spg_t pgid,
  set<pair<spg_t,epoch_t>> *split_children,
  set<pair<spg_t,epoch_t>> *merge_pgs)
{
  // Pool already gone in the old map: nothing to compare against.
  if (!old_map->have_pg_pool(pgid.pool())) {
    return;
  }
  int old_pgnum = old_map->get_pg_num(pgid.pool());
  // pg_num_history records, per pool, the epochs at which pg_num changed.
  auto p = osd->pg_num_history.pg_nums.find(pgid.pool());
  if (p == osd->pg_num_history.pg_nums.end()) {
    return;
  }
  dout(20) << __func__ << " " << pgid << " e" << old_map->get_epoch()
	   << " to e" << new_map->get_epoch()
	   << " pg_nums " << p->second << dendl;
  deque<spg_t> queue;
  queue.push_back(pgid);
  while (!queue.empty()) {
    auto cur = queue.front();
    queue.pop_front();
    // Replay the pg_num changes in (old_epoch, new_epoch] for this PG.
    unsigned pgnum = old_pgnum;
    for (auto q = p->second.lower_bound(old_map->get_epoch());
	 q != p->second.end() &&
	   q->first <= new_map->get_epoch();
	 ++q) {
      if (pgnum < q->second) {
	// split?
	if (cur.ps() < pgnum) {
	  set<spg_t> children;
	  if (cur.is_split(pgnum, q->second, &children)) {
	    dout(20) << __func__ << " " << cur << " e" << q->first
		     << " pg_num " << pgnum << " -> " << q->second
		     << " children " << children << dendl;
	    for (auto i : children) {
	      split_children->insert(make_pair(i, q->first));
	      // Re-examine each child for further splits/merges.
	      queue.push_back(i);
	    }
	  }
	} else if (cur.ps() < q->second) {
	  dout(20) << __func__ << " " << cur << " e" << q->first
		   << " pg_num " << pgnum << " -> " << q->second
		   << " is a child" << dendl;
	  // normally we'd capture this from the parent, but it's
	  // possible the parent doesn't exist yet (it will be
	  // fabricated to allow an intervening merge). note this PG
	  // as a split child here to be sure we catch it.
	  split_children->insert(make_pair(cur, q->first));
	} else {
	  dout(20) << __func__ << " " << cur << " e" << q->first
		   << " pg_num " << pgnum << " -> " << q->second
		   << " is post-split, skipping" << dendl;
	}
      } else if (merge_pgs) {
	// merge?  (pg_num shrank; only tracked when the caller asked)
	if (cur.ps() >= q->second) {
	  if (cur.ps() < pgnum) {
	    // cur disappears: it folds into 'parent'; record the whole
	    // merge group (target + all sources).
	    spg_t parent;
	    if (cur.is_merge_source(pgnum, q->second, &parent)) {
	      set<spg_t> children;
	      parent.is_split(q->second, pgnum, &children);
	      dout(20) << __func__ << " " << cur << " e" << q->first
		       << " pg_num " << pgnum << " -> " << q->second
		       << " is merge source, target " << parent
		       << ", source(s) " << children << dendl;
	      merge_pgs->insert(make_pair(parent, q->first));
	      for (auto c : children) {
		merge_pgs->insert(make_pair(c, q->first));
	      }
	    }
	  } else {
	    dout(20) << __func__ << " " << cur << " e" << q->first
		     << " pg_num " << pgnum << " -> " << q->second
		     << " is beyond old pgnum, skipping" << dendl;
	  }
	} else {
	  // cur survives the merge and absorbs one or more sources.
	  set<spg_t> children;
	  if (cur.is_split(q->second, pgnum, &children)) {
	    dout(20) << __func__ << " " << cur << " e" << q->first
		     << " pg_num " << pgnum << " -> " << q->second
		     << " is merge target, source " << children << dendl;
	    for (auto c : children) {
	      merge_pgs->insert(make_pair(c, q->first));
	    }
	    merge_pgs->insert(make_pair(cur, q->first));
	  }
	}
      }
      pgnum = q->second;
    }
  }
}
430
7c673cae
FG
// Forward a heartbeat-peer refresh request to the owning OSD.
void OSDService::need_heartbeat_peer_update()
{
  osd->need_heartbeat_peer_update();
}
435
7c673cae
FG
436void OSDService::start_shutdown()
437{
438 {
11fdf7f2 439 std::lock_guard l(agent_timer_lock);
7c673cae
FG
440 agent_timer.shutdown();
441 }
31f18b77
FG
442
443 {
11fdf7f2
TL
444 std::lock_guard l(sleep_lock);
445 sleep_timer.shutdown();
31f18b77 446 }
81eedcae
TL
447
448 {
449 std::lock_guard l(recovery_request_lock);
450 recovery_request_timer.shutdown();
451 }
7c673cae
FG
452}
453
// Drain any queued reserver callbacks, then stop the finisher thread.
void OSDService::shutdown_reserver()
{
  reserver_finisher.wait_for_empty();
  reserver_finisher.stop();
}
459
460void OSDService::shutdown()
461{
7c673cae 462 {
11fdf7f2 463 std::lock_guard l(watch_lock);
7c673cae
FG
464 watch_timer.shutdown();
465 }
466
467 objecter->shutdown();
11fdf7f2
TL
468 for (auto f : objecter_finishers) {
469 f->wait_for_empty();
470 f->stop();
7c673cae
FG
471 }
472
11fdf7f2 473 publish_map(OSDMapRef());
7c673cae
FG
474 next_osdmap = OSDMapRef();
475}
476
477void OSDService::init()
478{
479 reserver_finisher.start();
11fdf7f2
TL
480 for (auto f : objecter_finishers) {
481 f->start();
482 }
7c673cae
FG
483 objecter->set_client_incarnation(0);
484
485 // deprioritize objecter in daemonperf output
486 objecter->get_logger()->set_prio_adjust(-3);
487
488 watch_timer.init();
489 agent_timer.init();
7c673cae
FG
490
491 agent_thread.create("osd_srv_agent");
492
493 if (cct->_conf->osd_recovery_delay_start)
494 defer_recovery(cct->_conf->osd_recovery_delay_start);
495}
496
// Last init stage: start the objecter with the current osdmap.
void OSDService::final_init()
{
  objecter->start(osdmap.get());
}
501
502void OSDService::activate_map()
503{
504 // wake/unwake the tiering agent
505 agent_lock.Lock();
506 agent_active =
507 !osdmap->test_flag(CEPH_OSDMAP_NOTIERAGENT) &&
508 osd->is_active();
509 agent_cond.Signal();
510 agent_lock.Unlock();
511}
512
181888fb
FG
// Ask the OSD to subscribe for osdmap epoch e (second argument false —
// see OSD::osdmap_subscribe for its exact meaning).
void OSDService::request_osdmap_update(epoch_t e)
{
  osd->osdmap_subscribe(e, false);
}
517
7c673cae
FG
518class AgentTimeoutCB : public Context {
519 PGRef pg;
520public:
521 explicit AgentTimeoutCB(PGRef _pg) : pg(_pg) {}
522 void finish(int) override {
523 pg->agent_choose_mode_restart();
524 }
525};
526
// Tiering-agent worker thread.  Repeatedly picks the highest-priority
// tier in agent_queue and runs agent_work() on one of its PGs, bounded
// by the configured op quotas.  agent_lock is held except around the
// agent_work() call itself; waits on agent_cond when idle or throttled.
void OSDService::agent_entry()
{
  dout(10) << __func__ << " start" << dendl;
  agent_lock.Lock();

  while (!agent_stop_flag) {
    if (agent_queue.empty()) {
      dout(20) << __func__ << " empty queue" << dendl;
      agent_cond.Wait(agent_lock);
      continue;
    }
    // Highest key = highest-priority tier.
    uint64_t level = agent_queue.rbegin()->first;
    set<PGRef>& top = agent_queue.rbegin()->second;
    dout(10) << __func__
	     << " tiers " << agent_queue.size()
	     << ", top is " << level
	     << " with pgs " << top.size()
	     << ", ops " << agent_ops << "/"
	     << cct->_conf->osd_agent_max_ops
	     << (agent_active ? " active" : " NOT ACTIVE")
	     << dendl;
    dout(20) << __func__ << " oids " << agent_oids << dendl;
    // Remaining op budget; the flush quota is lower unless some PG is
    // in high-speed flush mode.
    int max = cct->_conf->osd_agent_max_ops - agent_ops;
    int agent_flush_quota = max;
    if (!flush_mode_high_count)
      agent_flush_quota = cct->_conf->osd_agent_max_low_ops - agent_ops;
    if (agent_flush_quota <= 0 || top.empty() || !agent_active) {
      agent_cond.Wait(agent_lock);
      continue;
    }

    // Round-robin over the PGs of the top tier; the iterator is
    // invalidated whenever the queue is modified elsewhere.
    if (!agent_valid_iterator || agent_queue_pos == top.end()) {
      agent_queue_pos = top.begin();
      agent_valid_iterator = true;
    }
    PGRef pg = *agent_queue_pos;
    dout(10) << "high_count " << flush_mode_high_count
	     << " agent_ops " << agent_ops
	     << " flush_quota " << agent_flush_quota << dendl;
    // Drop the lock while doing the actual (potentially slow) work.
    agent_lock.Unlock();
    if (!pg->agent_work(max, agent_flush_quota)) {
      dout(10) << __func__ << " " << pg->pg_id
	       << " no agent_work, delay for " << cct->_conf->osd_agent_delay_time
	       << " seconds" << dendl;

      osd->logger->inc(l_osd_tier_delay);
      // Queue a timer to call agent_choose_mode for this pg in 5 seconds
      agent_timer_lock.Lock();
      Context *cb = new AgentTimeoutCB(pg);
      agent_timer.add_event_after(cct->_conf->osd_agent_delay_time, cb);
      agent_timer_lock.Unlock();
    }
    agent_lock.Lock();
  }
  agent_lock.Unlock();
  dout(10) << __func__ << " finish" << dendl;
}
584
// Stop the tiering-agent thread: set the stop flag and wake it under
// agent_lock, then join outside the lock (agent_entry must be able to
// re-acquire the lock to observe the flag and exit).
void OSDService::agent_stop()
{
  {
    std::lock_guard l(agent_lock);

    // By this time all ops should be cancelled
    ceph_assert(agent_ops == 0);
    // By this time all PGs are shutdown and dequeued
    if (!agent_queue.empty()) {
      set<PGRef>& top = agent_queue.rbegin()->second;
      derr << "agent queue not empty, for example " << (*top.begin())->get_pgid() << dendl;
      ceph_abort_msg("agent queue not empty");
    }

    agent_stop_flag = true;
    agent_cond.Signal();
  }
  agent_thread.join();
}
604
605// -------------------------------------
606
// Recalibrate the cache-tier promotion probability
// (promote_probability_millis, in parts-per-thousand) so the observed
// promotion rate tracks the configured object/sec and bytes/sec
// targets, and refresh the per-tick hard caps.
void OSDService::promote_throttle_recalibrate()
{
  utime_t now = ceph_clock_now();
  double dur = now - last_recalibrate;
  last_recalibrate = now;
  unsigned prob = promote_probability_millis;

  uint64_t target_obj_sec = cct->_conf->osd_tier_promote_max_objects_sec;
  uint64_t target_bytes_sec = cct->_conf->osd_tier_promote_max_bytes_sec;

  unsigned min_prob = 1;

  // Promotion attempts/objects/bytes since the last sample (decayed).
  uint64_t attempts, obj, bytes;
  promote_counter.sample_and_attenuate(&attempts, &obj, &bytes);
  dout(10) << __func__ << " " << attempts << " attempts, promoted "
	   << obj << " objects and " << byte_u_t(bytes) << "; target "
	   << target_obj_sec << " obj/sec or "
	   << byte_u_t(target_bytes_sec) << "/sec"
	   << dendl;

  // calculate what the probability *should* be, given the targets
  unsigned new_prob;
  if (attempts && dur > 0) {
    uint64_t avg_size = 1;
    if (obj)
      avg_size = std::max<uint64_t>(bytes / obj, 1);
    // po/pb: probability (in millis) that would hit the object-rate /
    // byte-rate target given the observed attempt rate.
    unsigned po = (double)target_obj_sec * dur * 1000.0 / (double)attempts;
    unsigned pb = (double)target_bytes_sec / (double)avg_size * dur * 1000.0
      / (double)attempts;
    dout(20) << __func__ << " po " << po << " pb " << pb << " avg_size "
	     << avg_size << dendl;
    if (target_obj_sec && target_bytes_sec)
      new_prob = std::min(po, pb);
    else if (target_obj_sec)
      new_prob = po;
    else if (target_bytes_sec)
      new_prob = pb;
    else
      new_prob = 1000;
  } else {
    new_prob = 1000;
  }
  dout(20) << __func__ << " new_prob " << new_prob << dendl;

  // correct for persistent skew between target rate and actual rate, adjust
  double ratio = 1.0;
  unsigned actual = 0;
  if (attempts && obj) {
    actual = obj * 1000 / attempts;
    ratio = (double)actual / (double)prob;
    new_prob = (double)new_prob / ratio;
  }
  new_prob = std::max(new_prob, min_prob);
  new_prob = std::min(new_prob, 1000u);

  // adjust: move halfway toward the computed probability, clamped to
  // [min_prob, 1000].
  prob = (prob + new_prob) / 2;
  prob = std::max(prob, min_prob);
  prob = std::min(prob, 1000u);
  dout(10) << __func__ << " actual " << actual
	   << ", actual/prob ratio " << ratio
	   << ", adjusted new_prob " << new_prob
	   << ", prob " << promote_probability_millis << " -> " << prob
	   << dendl;
  promote_probability_millis = prob;

  // set hard limits for this interval to mitigate stampedes
  promote_max_objects = target_obj_sec * osd->OSD_TICK_INTERVAL * 2;
  promote_max_bytes = target_bytes_sec * osd->OSD_TICK_INTERVAL * 2;
}
677
678// -------------------------------------
679
680float OSDService::get_failsafe_full_ratio()
681{
682 float full_ratio = cct->_conf->osd_failsafe_full_ratio;
683 if (full_ratio > 1.0) full_ratio /= 100.0;
684 return full_ratio;
685}
686
// Map a usage ratio (and physical-device ratio) onto the fullness state
// ladder NONE < NEARFULL < BACKFILLFULL < FULL < FAILSAFE, using the
// thresholds published in the osdmap.  'inject' is set when an injected
// test state overrides the computed one.
OSDService::s_names OSDService::recalc_full_state(float ratio, float pratio, string &inject)
{
  // The OSDMap ratios take precendence. So if the failsafe is .95 and
  // the admin sets the cluster full to .96, the failsafe moves up to .96
  // too. (Not that having failsafe == full is ideal, but it's better than
  // dropping writes before the clusters appears full.)
  OSDMapRef osdmap = get_osdmap();
  if (!osdmap || osdmap->get_epoch() == 0) {
    // No usable map yet; cannot evaluate fullness.
    return NONE;
  }
  // Force monotonic thresholds: nearfull <= backfillfull <= full <= failsafe.
  float nearfull_ratio = osdmap->get_nearfull_ratio();
  float backfillfull_ratio = std::max(osdmap->get_backfillfull_ratio(), nearfull_ratio);
  float full_ratio = std::max(osdmap->get_full_ratio(), backfillfull_ratio);
  float failsafe_ratio = std::max(get_failsafe_full_ratio(), full_ratio);

  if (osdmap->require_osd_release < CEPH_RELEASE_LUMINOUS) {
    // use the failsafe for nearfull and full; the mon isn't using the
    // flags anyway because we're mid-upgrade.
    full_ratio = failsafe_ratio;
    backfillfull_ratio = failsafe_ratio;
    nearfull_ratio = failsafe_ratio;
  } else if (full_ratio <= 0 ||
	     backfillfull_ratio <= 0 ||
	     nearfull_ratio <= 0) {
    derr << __func__ << " full_ratio, backfillfull_ratio or nearfull_ratio is <= 0" << dendl;
    // use failsafe flag. ick. the monitor did something wrong or the user
    // did something stupid.
    full_ratio = failsafe_ratio;
    backfillfull_ratio = failsafe_ratio;
    nearfull_ratio = failsafe_ratio;
  }

  // Injected state (for testing) wins; otherwise compare the ratios,
  // checking the most severe states first.  Note FAILSAFE is judged on
  // the physical ratio, the rest on the (possibly adjusted) ratio.
  if (injectfull_state > NONE && injectfull) {
    inject = "(Injected)";
    return injectfull_state;
  } else if (pratio > failsafe_ratio) {
    return FAILSAFE;
  } else if (ratio > full_ratio) {
    return FULL;
  } else if (ratio > backfillfull_ratio) {
    return BACKFILLFULL;
  } else if (ratio > nearfull_ratio) {
    return NEARFULL;
  }
  return NONE;
}
733
734void OSDService::check_full_status(float ratio, float pratio)
735{
736 std::lock_guard l(full_status_lock);
737
738 cur_ratio = ratio;
739 physical_ratio = pratio;
740
741 string inject;
742 s_names new_state;
743 new_state = recalc_full_state(ratio, pratio, inject);
744
7c673cae 745 dout(20) << __func__ << " cur ratio " << ratio
11fdf7f2 746 << ", physical ratio " << pratio
7c673cae
FG
747 << ", new state " << get_full_state_name(new_state)
748 << " " << inject
749 << dendl;
750
751 // warn
752 if (cur_state != new_state) {
753 dout(10) << __func__ << " " << get_full_state_name(cur_state)
754 << " -> " << get_full_state_name(new_state) << dendl;
755 if (new_state == FAILSAFE) {
c07f9fc5 756 clog->error() << "full status failsafe engaged, dropping updates, now "
7c673cae
FG
757 << (int)roundf(ratio * 100) << "% full";
758 } else if (cur_state == FAILSAFE) {
c07f9fc5
FG
759 clog->error() << "full status failsafe disengaged, no longer dropping "
760 << "updates, now " << (int)roundf(ratio * 100) << "% full";
7c673cae
FG
761 }
762 cur_state = new_state;
763 }
764}
765
766bool OSDService::need_fullness_update()
767{
768 OSDMapRef osdmap = get_osdmap();
769 s_names cur = NONE;
770 if (osdmap->exists(whoami)) {
771 if (osdmap->get_state(whoami) & CEPH_OSD_FULL) {
772 cur = FULL;
773 } else if (osdmap->get_state(whoami) & CEPH_OSD_BACKFILLFULL) {
774 cur = BACKFILLFULL;
775 } else if (osdmap->get_state(whoami) & CEPH_OSD_NEARFULL) {
776 cur = NEARFULL;
777 }
778 }
779 s_names want = NONE;
780 if (is_full())
781 want = FULL;
782 else if (is_backfillfull())
783 want = BACKFILLFULL;
784 else if (is_nearfull())
785 want = NEARFULL;
786 return want != cur;
787}
788
// Test hook: report an injected fullness of at least 'type'.
// injectfull is a countdown of how many times to report full (or -1 for
// always); it is decremented here even though the method is const, so
// the member is presumably declared mutable — caller must hold
// full_status_lock.
bool OSDService::_check_inject_full(DoutPrefixProvider *dpp, s_names type) const
{
  if (injectfull && injectfull_state >= type) {
    // injectfull is either a count of the number of times to return failsafe full
    // or if -1 then always return full
    if (injectfull > 0)
      --injectfull;
    ldpp_dout(dpp, 10) << __func__ << " Injected " << get_full_state_name(type) << " OSD ("
		       << (injectfull < 0 ? "set" : std::to_string(injectfull)) << ")"
		       << dendl;
    return true;
  }
  return false;
}
803
804bool OSDService::_check_full(DoutPrefixProvider *dpp, s_names type) const
805{
806 std::lock_guard l(full_status_lock);
807
808 if (_check_inject_full(dpp, type))
809 return true;
810
811 if (cur_state >= type)
812 ldpp_dout(dpp, 10) << __func__ << " current usage is " << cur_ratio
813 << " physical " << physical_ratio << dendl;
7c673cae 814
7c673cae
FG
815 return cur_state >= type;
816}
817
11fdf7f2
TL
// Would the OSD be at least 'type' full if an additional adjust_used
// bytes were consumed?  Used to pre-check backfill reservations.
// Honors injected test states; otherwise recomputes the state from the
// adjusted statistics without touching cur_state.
bool OSDService::_tentative_full(DoutPrefixProvider *dpp, s_names type, uint64_t adjust_used, osd_stat_t adjusted_stat)
{
  ldpp_dout(dpp, 20) << __func__ << " type " << get_full_state_name(type) << " adjust_used " << (adjust_used >> 10) << "KiB" << dendl;
  {
    // Only the injection check needs the lock.
    std::lock_guard l(full_status_lock);
    if (_check_inject_full(dpp, type)) {
      return true;
    }
  }

  float pratio;
  float ratio = compute_adjusted_ratio(adjusted_stat, &pratio, adjust_used);

  string notused;
  s_names tentative_state = recalc_full_state(ratio, pratio, notused);

  if (tentative_state >= type)
    ldpp_dout(dpp, 10) << __func__ << " tentative usage is " << ratio << dendl;

  return tentative_state >= type;
}
839
// True when usage is at or beyond the failsafe threshold.
bool OSDService::check_failsafe_full(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, FAILSAFE);
}
844
// True when usage is at or beyond the full threshold.
bool OSDService::check_full(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, FULL);
}
849
// Would adding adjust_used bytes push us past backfillfull?
bool OSDService::tentative_backfill_full(DoutPrefixProvider *dpp, uint64_t adjust_used, osd_stat_t stats)
{
  return _tentative_full(dpp, BACKFILLFULL, adjust_used, stats);
}
854
// True when usage is at or beyond the backfillfull threshold.
bool OSDService::check_backfill_full(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, BACKFILLFULL);
}
859
// True when usage is at or beyond the nearfull threshold.
bool OSDService::check_nearfull(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, NEARFULL);
}
864
// Exactly in FAILSAFE state (no logging, no injection check).
bool OSDService::is_failsafe_full() const
{
  std::lock_guard l(full_status_lock);
  return cur_state == FAILSAFE;
}
870
// FULL or worse (FAILSAFE).
bool OSDService::is_full() const
{
  std::lock_guard l(full_status_lock);
  return cur_state >= FULL;
}
876
// BACKFILLFULL or worse.
bool OSDService::is_backfillfull() const
{
  std::lock_guard l(full_status_lock);
  return cur_state >= BACKFILLFULL;
}
882
// NEARFULL or worse.
bool OSDService::is_nearfull() const
{
  std::lock_guard l(full_status_lock);
  return cur_state >= NEARFULL;
}
888
// Test hook: pretend to be in fullness state 'type' for the next
// 'count' checks (-1 = until reset).  See _check_inject_full.
void OSDService::set_injectfull(s_names type, int64_t count)
{
  std::lock_guard l(full_status_lock);
  injectfull_state = type;
  injectfull = count;
}
895
11fdf7f2
TL
// Record the latest store statfs results (and any objectstore health
// alerts) into osd_stat, updating the perf counters.  When
// fake_statfs_for_testing is set, synthesize total/available from the
// sum of per-PG byte counts so many OSDs can share one partition.
void OSDService::set_statfs(const struct store_statfs_t &stbuf,
			    osd_alert_list_t& alerts)
{
  uint64_t bytes = stbuf.total;
  uint64_t avail = stbuf.available;
  uint64_t used = stbuf.get_used_raw();

  // For testing fake statfs values so it doesn't matter if all
  // OSDs are using the same partition.
  if (cct->_conf->fake_statfs_for_testing) {
    uint64_t total_num_bytes = 0;
    vector<PGRef> pgs;
    osd->_get_pgs(&pgs);
    for (auto p : pgs) {
      total_num_bytes += p->get_stats_num_bytes();
    }
    bytes = cct->_conf->fake_statfs_for_testing;
    if (total_num_bytes < bytes)
      avail = bytes - total_num_bytes;
    else
      avail = 0;
    dout(0) << __func__ << " fake total " << cct->_conf->fake_statfs_for_testing
	    << " adjust available " << avail
	    << dendl;
    used = bytes - avail;
  }

  osd->logger->set(l_osd_stat_bytes, bytes);
  osd->logger->set(l_osd_stat_bytes_used, used);
  osd->logger->set(l_osd_stat_bytes_avail, avail);

  std::lock_guard l(stat_lock);
  osd_stat.statfs = stbuf;
  osd_stat.os_alerts.clear();
  // Takes ownership of the caller's alert list (swap leaves it empty).
  osd_stat.os_alerts[whoami].swap(alerts);
  if (cct->_conf->fake_statfs_for_testing) {
    osd_stat.statfs.total = bytes;
    osd_stat.statfs.available = avail;
    // For testing don't want used to go negative, so clear reserved
    osd_stat.statfs.internally_reserved = 0;
  }
}
7c673cae 938
11fdf7f2
TL
// Refresh the volatile parts of osd_stat (heartbeat peers, op-age
// histogram, pg count) and return a snapshot of the whole structure.
// NOTE: hb_peers is swapped in, leaving the caller's vector with the
// previous peer list.
osd_stat_t OSDService::set_osd_stat(vector<int>& hb_peers,
				    int num_pgs)
{
  std::lock_guard l(stat_lock);
  osd_stat.hb_peers.swap(hb_peers);
  osd->op_tracker.get_age_ms_histogram(&osd_stat.op_queue_age_hist);
  osd_stat.num_pgs = num_pgs;
  return osd_stat;
}
948
949void OSDService::inc_osd_stat_repaired()
950{
951 std::lock_guard l(stat_lock);
952 osd_stat.num_shards_repaired++;
953 return;
954}
955
// Compute the usage ratio after (a) pretending adjust_used extra bytes
// are consumed and (b) adding each PG's pending backfill data.
// *pratio receives the unadjusted physical ratio.  new_stat is taken by
// value on purpose — the adjustments are made on the copy.
float OSDService::compute_adjusted_ratio(osd_stat_t new_stat, float *pratio,
					 uint64_t adjust_used)
{
  *pratio =
    ((float)new_stat.statfs.get_used()) / ((float)new_stat.statfs.total);

  if (adjust_used) {
    dout(20) << __func__ << " Before kb_used() " << new_stat.statfs.kb_used() << dendl;
    // Shrink 'available' rather than grow 'used' so it cannot underflow.
    if (new_stat.statfs.available > adjust_used)
      new_stat.statfs.available -= adjust_used;
    else
      new_stat.statfs.available = 0;
    dout(20) << __func__ << " After kb_used() " << new_stat.statfs.kb_used() << dendl;
  }

  // Check all pgs and adjust kb_used to include all pending backfill data
  int backfill_adjusted = 0;
  vector<PGRef> pgs;
  osd->_get_pgs(&pgs);
  for (auto p : pgs) {
    backfill_adjusted += p->pg_stat_adjust(&new_stat);
  }
  if (backfill_adjusted) {
    dout(20) << __func__ << " backfill adjusted " << new_stat << dendl;
  }
  return ((float)new_stat.statfs.get_used()) / ((float)new_stat.statfs.total);
}
983
984bool OSDService::check_osdmap_full(const set<pg_shard_t> &missing_on)
985{
986 OSDMapRef osdmap = get_osdmap();
987 for (auto shard : missing_on) {
988 if (osdmap->get_state(shard.osd) & CEPH_OSD_FULL)
989 return true;
990 }
991 return false;
992}
993
// Send message m to another OSD over the cluster messenger, but only if the
// peer is still up and has been up since from_epoch; otherwise the message is
// dropped (m->put()).  Also opportunistically shares our map with the peer.
// Takes ownership of m in all paths.
void OSDService::send_message_osd_cluster(int peer, Message *m, epoch_t from_epoch)
{
  OSDMapRef next_map = get_nextmap_reserved();
  // service map is always newer/newest
  ceph_assert(from_epoch <= next_map->get_epoch());

  if (next_map->is_down(peer) ||
      next_map->get_info(peer).up_from > from_epoch) {
    // peer is gone or restarted since the sender's epoch: drop the message
    m->put();
    release_map(next_map);
    return;
  }
  ConnectionRef peer_con = osd->cluster_messenger->connect_to_osd(
    next_map->get_cluster_addrs(peer));
  share_map_peer(peer, peer_con.get(), next_map);
  peer_con->send_message(m);
  release_map(next_map);
}
1012
// Get a cluster-messenger connection to a peer OSD, or NULL if the peer is
// down or was (re)started after from_epoch.
ConnectionRef OSDService::get_con_osd_cluster(int peer, epoch_t from_epoch)
{
  OSDMapRef next_map = get_nextmap_reserved();
  // service map is always newer/newest
  ceph_assert(from_epoch <= next_map->get_epoch());

  if (next_map->is_down(peer) ||
      next_map->get_info(peer).up_from > from_epoch) {
    release_map(next_map);
    return NULL;
  }
  ConnectionRef con = osd->cluster_messenger->connect_to_osd(
    next_map->get_cluster_addrs(peer));
  release_map(next_map);
  return con;
}
1029
// Get the (back, front) heartbeat connections to a peer OSD.  Returns a pair
// of null ConnectionRefs if the peer is down or restarted after from_epoch.
pair<ConnectionRef,ConnectionRef> OSDService::get_con_osd_hb(int peer, epoch_t from_epoch)
{
  OSDMapRef next_map = get_nextmap_reserved();
  // service map is always newer/newest
  ceph_assert(from_epoch <= next_map->get_epoch());

  pair<ConnectionRef,ConnectionRef> ret;
  if (next_map->is_down(peer) ||
      next_map->get_info(peer).up_from > from_epoch) {
    release_map(next_map);
    return ret;
  }
  // first = back-network heartbeat, second = front-network heartbeat
  ret.first = osd->hb_back_client_messenger->connect_to_osd(
    next_map->get_hb_back_addrs(peer));
  ret.second = osd->hb_front_client_messenger->connect_to_osd(
    next_map->get_hb_front_addrs(peer));
  release_map(next_map);
  return ret;
}
1049
11fdf7f2
TL
// Return the entity name this OSD uses on the cluster messenger.
entity_name_t OSDService::get_cluster_msgr_name() const
{
  return cluster_messenger->get_myname();
}
7c673cae 1054
94b18763
FG
// Record a desired pg_temp mapping to be sent to the mon.  Skips the request
// if an identical non-forced request for this pgid is already pending, to
// avoid spamming the monitor with duplicates.
void OSDService::queue_want_pg_temp(pg_t pgid,
				    const vector<int>& want,
				    bool forced)
{
  std::lock_guard l(pg_temp_lock);
  auto p = pg_temp_pending.find(pgid);
  if (p == pg_temp_pending.end() ||
      p->second.acting != want ||
      forced) {
    pg_temp_wanted[pgid] = {want, forced};
  }
}
1067
// Forget any wanted or in-flight pg_temp request for pgid (both queues).
void OSDService::remove_want_pg_temp(pg_t pgid)
{
  std::lock_guard l(pg_temp_lock);
  pg_temp_wanted.erase(pgid);
  pg_temp_pending.erase(pgid);
}
1074
// Move everything from pg_temp_wanted into pg_temp_pending (requests that
// have been sent to the mon but not yet reflected in a map).
// Caller must hold pg_temp_lock.
void OSDService::_sent_pg_temp()
{
#ifdef HAVE_STDLIB_MAP_SPLICING
  // C++17 node splicing: moves entries without copying
  pg_temp_pending.merge(pg_temp_wanted);
#else
  // fallback: move-insert each entry
  pg_temp_pending.insert(make_move_iterator(begin(pg_temp_wanted)),
			 make_move_iterator(end(pg_temp_wanted)));
#endif
  pg_temp_wanted.clear();
}
1085
// Re-queue all previously-sent pg_temp requests so they get resent
// (e.g. after a mon session reset).
void OSDService::requeue_pg_temp()
{
  std::lock_guard l(pg_temp_lock);
  // wanted overrides pending. note that remove_want_pg_temp
  // clears the item out of both.
  unsigned old_wanted = pg_temp_wanted.size();
  unsigned old_pending = pg_temp_pending.size();
  // merge wanted into pending, then swap: the union becomes wanted again
  _sent_pg_temp();
  pg_temp_wanted.swap(pg_temp_pending);
  dout(10) << __func__ << " " << old_wanted << " + " << old_pending << " -> "
	   << pg_temp_wanted.size() << dendl;
}
1098
94b18763
FG
1099std::ostream& operator<<(std::ostream& out,
1100 const OSDService::pg_temp_t& pg_temp)
1101{
1102 out << pg_temp.acting;
1103 if (pg_temp.forced) {
1104 out << " (forced)";
1105 }
1106 return out;
1107}
1108
7c673cae
FG
// Flush all queued pg_temp requests to the monitor.  Forced and non-forced
// requests go in separate MOSDPGTemp messages (the message carries a single
// `forced` flag), then everything is moved to the pending queue.
void OSDService::send_pg_temp()
{
  std::lock_guard l(pg_temp_lock);
  if (pg_temp_wanted.empty())
    return;
  dout(10) << "send_pg_temp " << pg_temp_wanted << dendl;
  // ms[0] collects non-forced requests, ms[1] forced ones
  MOSDPGTemp *ms[2] = {nullptr, nullptr};
  for (auto& [pgid, pg_temp] : pg_temp_wanted) {
    auto& m = ms[pg_temp.forced];
    if (!m) {
      m = new MOSDPGTemp(osdmap->get_epoch());
      m->forced = pg_temp.forced;
    }
    m->pg_temp.emplace(pgid, pg_temp.acting);
  }
  for (auto m : ms) {
    if (m) {
      monc->send_mon_message(m);
    }
  }
  _sent_pg_temp();
}
1131
// Tell the monitor that pgid has been created, and remember it so the
// notification can be resent (see send_pg_created()/prune_pg_created()).
// Only relevant on clusters requiring luminous or later.
void OSDService::send_pg_created(pg_t pgid)
{
  std::lock_guard l(pg_created_lock);
  dout(20) << __func__ << dendl;
  auto o = get_osdmap();
  if (o->require_osd_release >= CEPH_RELEASE_LUMINOUS) {
    pg_created.insert(pgid);
    monc->send_mon_message(new MOSDPGCreated(pgid));
  }
}
1142
11fdf7f2
TL
// Resend MOSDPGCreated for every PG we still believe is being created
// (e.g. after a mon session reset).
void OSDService::send_pg_created()
{
  std::lock_guard l(pg_created_lock);
  dout(20) << __func__ << dendl;
  auto o = get_osdmap();
  if (o->require_osd_release >= CEPH_RELEASE_LUMINOUS) {
    for (auto pgid : pg_created) {
      monc->send_mon_message(new MOSDPGCreated(pgid));
    }
  }
}
1154
// Drop tracked pg-created notifications whose pool is gone or no longer in
// the CREATING state, so we stop resending them.
void OSDService::prune_pg_created()
{
  std::lock_guard l(pg_created_lock);
  dout(20) << __func__ << dendl;
  auto o = get_osdmap();
  auto i = pg_created.begin();
  while (i != pg_created.end()) {
    auto p = o->get_pg_pool(i->pool());
    if (!p || !p->has_flag(pg_pool_t::FLAG_CREATING)) {
      dout(20) << __func__ << " pruning " << *i << dendl;
      i = pg_created.erase(i);
    } else {
      dout(20) << __func__ << " keeping " << *i << dendl;
      ++i;
    }
  }
}
1172
1173
7c673cae
FG
1174// --------------------------------------
1175// dispatch
1176
1177epoch_t OSDService::get_peer_epoch(int peer)
1178{
11fdf7f2 1179 std::lock_guard l(peer_map_epoch_lock);
7c673cae
FG
1180 map<int,epoch_t>::iterator p = peer_map_epoch.find(peer);
1181 if (p == peer_map_epoch.end())
1182 return 0;
1183 return p->second;
1184}
1185
// Record that a peer has (at least) epoch e; epochs only move forward.
// Returns the epoch we now believe the peer has (max of old and e).
epoch_t OSDService::note_peer_epoch(int peer, epoch_t e)
{
  std::lock_guard l(peer_map_epoch_lock);
  map<int,epoch_t>::iterator p = peer_map_epoch.find(peer);
  if (p != peer_map_epoch.end()) {
    if (p->second < e) {
      dout(10) << "note_peer_epoch osd." << peer << " has " << e << dendl;
      p->second = e;
    } else {
      dout(30) << "note_peer_epoch osd." << peer << " has " << p->second << " >= " << e << dendl;
    }
    return p->second;
  } else {
    dout(10) << "note_peer_epoch osd." << peer << " now has " << e << dendl;
    peer_map_epoch[peer] = e;
    return e;
  }
}
1204
// Forget what we know about a peer's epoch, but only if our record is not
// newer than as_of (a newer record means we learned more since the caller's
// information was gathered).
void OSDService::forget_peer_epoch(int peer, epoch_t as_of)
{
  std::lock_guard l(peer_map_epoch_lock);
  map<int,epoch_t>::iterator p = peer_map_epoch.find(peer);
  if (p != peer_map_epoch.end()) {
    if (p->second <= as_of) {
      dout(10) << "forget_peer_epoch osd." << peer << " as_of " << as_of
	       << " had " << p->second << dendl;
      peer_map_epoch.erase(p);
    } else {
      dout(10) << "forget_peer_epoch osd." << peer << " as_of " << as_of
	       << " has " << p->second << " - not forgetting" << dendl;
    }
  }
}
1220
// Decide whether we should push our (newer) OSDMap to the peer on `con`.
// Clients: share when their epoch and last-sent epoch lag ours.
// OSDs: share only for genuine cluster-network peer connections whose
// addresses match the map, based on the epoch we last noted for them.
bool OSDService::should_share_map(entity_name_t name, Connection *con,
                                  epoch_t epoch, const OSDMapRef& osdmap,
                                  const epoch_t *sent_epoch_p)
{
  dout(20) << "should_share_map "
           << name << " " << con->get_peer_addr()
           << " " << epoch << dendl;

  // does client have old map?
  if (name.is_client()) {
    bool message_sendmap = epoch < osdmap->get_epoch();
    if (message_sendmap && sent_epoch_p) {
      dout(20) << "client session last_sent_epoch: "
               << *sent_epoch_p
               << " versus osdmap epoch " << osdmap->get_epoch() << dendl;
      if (*sent_epoch_p < osdmap->get_epoch()) {
        return true;
      } // else we don't need to send it out again
    }
  }

  // only consider OSD peers reached over the cluster network (not loopback)
  // whose advertised addrs match what the map says for them
  if (con->get_messenger() == osd->cluster_messenger &&
      con != osd->cluster_messenger->get_loopback_connection() &&
      osdmap->is_up(name.num()) &&
      (osdmap->get_cluster_addrs(name.num()) == con->get_peer_addrs() ||
       osdmap->get_hb_back_addrs(name.num()) == con->get_peer_addrs())) {
    // remember
    epoch_t has = std::max(get_peer_epoch(name.num()), epoch);

    // share?
    if (has < osdmap->get_epoch()) {
      dout(10) << name << " " << con->get_peer_addr()
               << " has old map " << epoch << " < "
               << osdmap->get_epoch() << dendl;
      return true;
    }
  }

  return false;
}
1261
// Push incremental OSDMap updates to a peer (client or OSD) if
// should_share_map() says they are behind.  Updates the client's
// last-sent-epoch (via sent_epoch_p) or the OSD peer-epoch table.
void OSDService::share_map(
    entity_name_t name,
    Connection *con,
    epoch_t epoch,
    OSDMapRef& osdmap,
    epoch_t *sent_epoch_p)
{
  dout(20) << "share_map "
	   << name << " " << con->get_peer_addr()
	   << " " << epoch << dendl;

  if (!osd->is_active()) {
    /*It is safe not to proceed as OSD is not in healthy state*/
    return;
  }

  bool want_shared = should_share_map(name, con, epoch,
                                      osdmap, sent_epoch_p);

  if (want_shared){
    if (name.is_client()) {
      dout(10) << name << " has old map " << epoch
               << " < " << osdmap->get_epoch() << dendl;
      // we know the Session is valid or we wouldn't be sending
      if (sent_epoch_p) {
	*sent_epoch_p = osdmap->get_epoch();
      }
      send_incremental_map(epoch, con, osdmap);
    } else if (con->get_messenger() == osd->cluster_messenger &&
        osdmap->is_up(name.num()) &&
	(osdmap->get_cluster_addrs(name.num()) == con->get_peer_addrs() ||
	 osdmap->get_hb_back_addrs(name.num()) == con->get_peer_addrs())) {
      dout(10) << name << " " << con->get_peer_addrs()
	       << " has old map " << epoch << " < "
               << osdmap->get_epoch() << dendl;
      // remember what we sent so we don't resend the same range later
      note_peer_epoch(name.num(), osdmap->get_epoch());
      send_incremental_map(epoch, con, osdmap);
    }
  }
}
1302
// Push map updates to an OSD peer if we know (from the peer-epoch table)
// that it is behind.  If we have no epoch record for the peer we do nothing.
void OSDService::share_map_peer(int peer, Connection *con, OSDMapRef map)
{
  if (!map)
    map = get_osdmap();

  // send map?
  epoch_t pe = get_peer_epoch(peer);
  if (pe) {
    if (pe < map->get_epoch()) {
      send_incremental_map(pe, con, map);
      note_peer_epoch(peer, map->get_epoch());
    } else
      dout(20) << "share_map_peer " << con << " already has epoch " << pe << dendl;
  } else {
    dout(20) << "share_map_peer " << con << " don't know epoch, doing nothing" << dendl;
    // no idea about peer's epoch.
    // ??? send recent ???
    // do nothing.
  }
}
1323
// Non-mutating check: would a new pending scrub reservation fit under the
// osd_max_scrubs limit (counting both pending and active scrubs)?
bool OSDService::can_inc_scrubs_pending()
{
  bool can_inc = false;
  std::lock_guard l(sched_scrub_lock);

  if (scrubs_pending + scrubs_active < cct->_conf->osd_max_scrubs) {
    dout(20) << __func__ << " " << scrubs_pending << " -> " << (scrubs_pending+1)
	     << " (max " << cct->_conf->osd_max_scrubs << ", active " << scrubs_active
	     << ")" << dendl;
    can_inc = true;
  } else {
    dout(20) << __func__ << " " << scrubs_pending << " + " << scrubs_active
	     << " active >= max " << cct->_conf->osd_max_scrubs << dendl;
  }

  return can_inc;
}
1341
1342bool OSDService::inc_scrubs_pending()
1343{
1344 bool result = false;
1345
1346 sched_scrub_lock.Lock();
1347 if (scrubs_pending + scrubs_active < cct->_conf->osd_max_scrubs) {
1348 dout(20) << "inc_scrubs_pending " << scrubs_pending << " -> " << (scrubs_pending+1)
1349 << " (max " << cct->_conf->osd_max_scrubs << ", active " << scrubs_active << ")" << dendl;
1350 result = true;
1351 ++scrubs_pending;
1352 } else {
1353 dout(20) << "inc_scrubs_pending " << scrubs_pending << " + " << scrubs_active << " active >= max " << cct->_conf->osd_max_scrubs << dendl;
1354 }
1355 sched_scrub_lock.Unlock();
1356
1357 return result;
1358}
1359
1360void OSDService::dec_scrubs_pending()
1361{
1362 sched_scrub_lock.Lock();
1363 dout(20) << "dec_scrubs_pending " << scrubs_pending << " -> " << (scrubs_pending-1)
1364 << " (max " << cct->_conf->osd_max_scrubs << ", active " << scrubs_active << ")" << dendl;
1365 --scrubs_pending;
11fdf7f2 1366 ceph_assert(scrubs_pending >= 0);
7c673cae
FG
1367 sched_scrub_lock.Unlock();
1368}
1369
1370void OSDService::inc_scrubs_active(bool reserved)
1371{
1372 sched_scrub_lock.Lock();
1373 ++(scrubs_active);
1374 if (reserved) {
1375 --(scrubs_pending);
1376 dout(20) << "inc_scrubs_active " << (scrubs_active-1) << " -> " << scrubs_active
1377 << " (max " << cct->_conf->osd_max_scrubs
1378 << ", pending " << (scrubs_pending+1) << " -> " << scrubs_pending << ")" << dendl;
11fdf7f2 1379 ceph_assert(scrubs_pending >= 0);
7c673cae
FG
1380 } else {
1381 dout(20) << "inc_scrubs_active " << (scrubs_active-1) << " -> " << scrubs_active
1382 << " (max " << cct->_conf->osd_max_scrubs
1383 << ", pending " << scrubs_pending << ")" << dendl;
1384 }
1385 sched_scrub_lock.Unlock();
1386}
1387
1388void OSDService::dec_scrubs_active()
1389{
1390 sched_scrub_lock.Lock();
1391 dout(20) << "dec_scrubs_active " << scrubs_active << " -> " << (scrubs_active-1)
1392 << " (max " << cct->_conf->osd_max_scrubs << ", pending " << scrubs_pending << ")" << dendl;
1393 --scrubs_active;
11fdf7f2 1394 ceph_assert(scrubs_active >= 0);
7c673cae
FG
1395 sched_scrub_lock.Unlock();
1396}
1397
// Atomically read any subset of {boot, up, bind} epochs; a NULL out-pointer
// means "not interested in this one".
void OSDService::retrieve_epochs(epoch_t *_boot_epoch, epoch_t *_up_epoch,
                                 epoch_t *_bind_epoch) const
{
  std::lock_guard l(epoch_lock);
  if (_boot_epoch)
    *_boot_epoch = boot_epoch;
  if (_up_epoch)
    *_up_epoch = up_epoch;
  if (_bind_epoch)
    *_bind_epoch = bind_epoch;
}
1409
// Atomically update any subset of {boot, up, bind} epochs.  Each epoch may
// only be reset to 0 or move forward, which the asserts enforce.
void OSDService::set_epochs(const epoch_t *_boot_epoch, const epoch_t *_up_epoch,
                            const epoch_t *_bind_epoch)
{
  std::lock_guard l(epoch_lock);
  if (_boot_epoch) {
    ceph_assert(*_boot_epoch == 0 || *_boot_epoch >= boot_epoch);
    boot_epoch = *_boot_epoch;
  }
  if (_up_epoch) {
    ceph_assert(*_up_epoch == 0 || *_up_epoch >= up_epoch);
    up_epoch = *_up_epoch;
  }
  if (_bind_epoch) {
    ceph_assert(*_bind_epoch == 0 || *_bind_epoch >= bind_epoch);
    bind_epoch = *_bind_epoch;
  }
}
1427
// Begin a clean shutdown.  If we are marked up, tell the mon we are going
// down (requesting an ack) and wait up to osd_mon_shutdown_timeout seconds
// for the STOPPING transition (triggered by got_stop_ack()).  Returns false
// if a stop is already in progress, true once the state is set to STOPPING.
bool OSDService::prepare_to_stop()
{
  std::lock_guard l(is_stopping_lock);
  if (get_state() != NOT_STOPPING)
    return false;

  OSDMapRef osdmap = get_osdmap();
  if (osdmap && osdmap->is_up(whoami)) {
    dout(0) << __func__ << " telling mon we are shutting down" << dendl;
    set_state(PREPARING_TO_STOP);
    monc->send_mon_message(
      new MOSDMarkMeDown(
	monc->get_fsid(),
	whoami,
	osdmap->get_addrs(whoami),
	osdmap->get_epoch(),
	true  // request ack
	));
    // wait (bounded) for the mon's ack to flip our state to STOPPING
    utime_t now = ceph_clock_now();
    utime_t timeout;
    timeout.set_from_double(now + cct->_conf->osd_mon_shutdown_timeout);
    while ((ceph_clock_now() < timeout) &&
	   (get_state() != STOPPING)) {
      is_stopping_cond.WaitUntil(is_stopping_lock, timeout);
    }
  }
  // proceed with shutdown whether or not the ack arrived in time
  dout(0) << __func__ << " starting shutdown" << dendl;
  set_state(STOPPING);
  return true;
}
1458
// Handle the mon's mark-me-down ack: move PREPARING_TO_STOP -> STOPPING and
// wake prepare_to_stop()'s wait.  Ignored in any other state.
void OSDService::got_stop_ack()
{
  std::lock_guard l(is_stopping_lock);
  if (get_state() == PREPARING_TO_STOP) {
    dout(0) << __func__ << " starting shutdown" << dendl;
    set_state(STOPPING);
    is_stopping_cond.Signal();
  } else {
    dout(10) << __func__ << " ignoring msg" << dendl;
  }
}
1470
// Build an MOSDMap carrying maps (since, to], honoring the
// osd_map_message_max / osd_map_message_max_bytes budgets.  Starts with a
// full map when `since` predates our oldest stored map, and falls back to
// full maps for epochs whose incrementals are missing.  On failure to load
// anything (panic path) it sends whatever it can, or aborts if even the
// newest map cannot be read.  Never returns NULL.
MOSDMap *OSDService::build_incremental_map_msg(epoch_t since, epoch_t to,
                                               OSDSuperblock& sblock)
{
  MOSDMap *m = new MOSDMap(monc->get_fsid(),
			   osdmap->get_encoding_features());
  m->oldest_map = max_oldest_map;
  m->newest_map = sblock.newest_map;

  // message budget: at most `max` maps and `max_bytes` payload bytes
  int max = cct->_conf->osd_map_message_max;
  ssize_t max_bytes = cct->_conf->osd_map_message_max_bytes;

  if (since < m->oldest_map) {
    // we don't have the next map the target wants, so start with a
    // full map.
    bufferlist bl;
    dout(10) << __func__ << " oldest map " << max_oldest_map << " > since "
	     << since << ", starting with full map" << dendl;
    since = m->oldest_map;
    if (!get_map_bl(since, bl)) {
      derr << __func__ << " missing full map " << since << dendl;
      goto panic;
    }
    max--;
    max_bytes -= bl.length();
    m->maps[since].claim(bl);
  }
  for (epoch_t e = since + 1; e <= to; ++e) {
    bufferlist bl;
    if (get_inc_map_bl(e, bl)) {
      m->incremental_maps[e].claim(bl);
    } else {
      // incremental missing: substitute the full map for this epoch
      derr << __func__ << " missing incremental map " << e << dendl;
      if (!get_map_bl(e, bl)) {
	derr << __func__ << " also missing full map " << e << dendl;
	goto panic;
      }
      m->maps[e].claim(bl);
    }
    max--;
    max_bytes -= bl.length();
    if (max <= 0 || max_bytes <= 0) {
      break;
    }
  }
  return m;

 panic:
  if (!m->maps.empty() ||
      !m->incremental_maps.empty()) {
    // send what we have so far
    return m;
  }
  // send something
  bufferlist bl;
  if (get_inc_map_bl(m->newest_map, bl)) {
    m->incremental_maps[m->newest_map].claim(bl);
  } else {
    derr << __func__ << " unable to load latest map " << m->newest_map << dendl;
    if (!get_map_bl(m->newest_map, bl)) {
      derr << __func__ << " unable to load latest full map " << m->newest_map
	   << dendl;
      ceph_abort();
    }
    m->maps[m->newest_map].claim(bl);
  }
  return m;
}
1538
// Send an already-built map message over a connection (takes ownership of m).
void OSDService::send_map(MOSDMap *m, Connection *con)
{
  con->send_message(m);
}
1543
1544void OSDService::send_incremental_map(epoch_t since, Connection *con,
1545 OSDMapRef& osdmap)
1546{
1547 epoch_t to = osdmap->get_epoch();
1548 dout(10) << "send_incremental_map " << since << " -> " << to
1549 << " to " << con << " " << con->get_peer_addr() << dendl;
1550
1551 MOSDMap *m = NULL;
1552 while (!m) {
1553 OSDSuperblock sblock(get_superblock());
1554 if (since < sblock.oldest_map) {
1555 // just send latest full map
28e407b8
AA
1556 MOSDMap *m = new MOSDMap(monc->get_fsid(),
1557 osdmap->get_encoding_features());
7c673cae
FG
1558 m->oldest_map = max_oldest_map;
1559 m->newest_map = sblock.newest_map;
1560 get_map_bl(to, m->maps[to]);
1561 send_map(m, con);
1562 return;
1563 }
1564
1565 if (to > since && (int64_t)(to - since) > cct->_conf->osd_map_share_max_epochs) {
1566 dout(10) << " " << (to - since) << " > max " << cct->_conf->osd_map_share_max_epochs
1567 << ", only sending most recent" << dendl;
1568 since = to - cct->_conf->osd_map_share_max_epochs;
1569 }
1570
7c673cae
FG
1571 m = build_incremental_map_msg(since, to, sblock);
1572 }
1573 send_map(m, con);
1574}
1575
// Load the encoded full OSDMap for epoch e, first from the in-memory cache,
// then from the store (populating the cache on a hit).  Returns false if the
// map is not available.  Leading underscore: caller holds map_cache_lock.
bool OSDService::_get_map_bl(epoch_t e, bufferlist& bl)
{
  bool found = map_bl_cache.lookup(e, &bl);
  if (found) {
    if (logger)
      logger->inc(l_osd_map_bl_cache_hit);
    return true;
  }
  if (logger)
    logger->inc(l_osd_map_bl_cache_miss);
  found = store->read(meta_ch,
		      OSD::get_osdmap_pobject_name(e), 0, 0, bl,
		      CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) >= 0;
  if (found) {
    _add_map_bl(e, bl);
  }
  return found;
}
1594
// Load the encoded incremental OSDMap for epoch e (cache first, then store).
// Takes map_cache_lock itself, unlike _get_map_bl().
bool OSDService::get_inc_map_bl(epoch_t e, bufferlist& bl)
{
  std::lock_guard l(map_cache_lock);
  bool found = map_bl_inc_cache.lookup(e, &bl);
  if (found) {
    if (logger)
      logger->inc(l_osd_map_bl_cache_hit);
    return true;
  }
  if (logger)
    logger->inc(l_osd_map_bl_cache_miss);
  found = store->read(meta_ch,
		      OSD::get_inc_osdmap_pobject_name(e), 0, 0, bl,
		      CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) >= 0;
  if (found) {
    _add_map_inc_bl(e, bl);
  }
  return found;
}
1614
// Insert a full-map buffer into the cache.  The buffer is first made
// contiguous and moved to the osd_mapbl mempool so cached maps are accounted
// and cheap to reuse.  Caller holds map_cache_lock.
void OSDService::_add_map_bl(epoch_t e, bufferlist& bl)
{
  dout(10) << "add_map_bl " << e << " " << bl.length() << " bytes" << dendl;
  // cache a contiguous buffer
  if (bl.get_num_buffers() > 1) {
    bl.rebuild();
  }
  bl.try_assign_to_mempool(mempool::mempool_osd_mapbl);
  map_bl_cache.add(e, bl);
}
1625
// Insert an incremental-map buffer into the cache; same contiguity and
// mempool treatment as _add_map_bl().  Caller holds map_cache_lock.
void OSDService::_add_map_inc_bl(epoch_t e, bufferlist& bl)
{
  dout(10) << "add_map_inc_bl " << e << " " << bl.length() << " bytes" << dendl;
  // cache a contiguous buffer
  if (bl.get_num_buffers() > 1) {
    bl.rebuild();
  }
  bl.try_assign_to_mempool(mempool::mempool_osd_mapbl);
  map_bl_inc_cache.add(e, bl);
}
1636
11fdf7f2 1637int OSDService::get_deleted_pool_pg_num(int64_t pool)
7c673cae 1638{
11fdf7f2
TL
1639 std::lock_guard l(map_cache_lock);
1640 auto p = deleted_pool_pg_nums.find(pool);
1641 if (p != deleted_pool_pg_nums.end()) {
1642 return p->second;
31f18b77 1643 }
11fdf7f2
TL
1644 dout(20) << __func__ << " " << pool << " loading" << dendl;
1645 ghobject_t oid = OSD::make_final_pool_info_oid(pool);
1646 bufferlist bl;
1647 int r = store->read(meta_ch, oid, 0, 0, bl);
1648 ceph_assert(r >= 0);
1649 auto blp = bl.cbegin();
1650 pg_pool_t pi;
1651 ::decode(pi, blp);
1652 deleted_pool_pg_nums[pool] = pi.get_pg_num();
1653 dout(20) << __func__ << " " << pool << " got " << pi.get_pg_num() << dendl;
1654 return pi.get_pg_num();
7c673cae
FG
1655}
1656
// Add a decoded OSDMap to the map cache, optionally deduplicating shared
// structures against a nearby cached epoch.  If the epoch was already cached
// the new object is deleted and the existing ref returned.  Takes ownership
// of o.  Caller holds map_cache_lock.
OSDMapRef OSDService::_add_map(OSDMap *o)
{
  epoch_t e = o->get_epoch();

  if (cct->_conf->osd_map_dedup) {
    // Dedup against an existing map at a nearby epoch
    OSDMapRef for_dedup = map_cache.lower_bound(e);
    if (for_dedup) {
      OSDMap::dedup(for_dedup.get(), o);
    }
  }
  bool existed;
  OSDMapRef l = map_cache.add(e, o, &existed);
  if (existed) {
    delete o;
  }
  return l;
}
1675
// Return the OSDMap for `epoch`, from the decoded-map cache or by loading
// and decoding its buffer from the store.  Returns a null ref when the map
// buffer is unavailable.  epoch == 0 yields a fresh, empty map.
OSDMapRef OSDService::try_get_map(epoch_t epoch)
{
  std::lock_guard l(map_cache_lock);
  OSDMapRef retval = map_cache.lookup(epoch);
  if (retval) {
    dout(30) << "get_map " << epoch << " -cached" << dendl;
    if (logger) {
      logger->inc(l_osd_map_cache_hit);
    }
    return retval;
  }
  if (logger) {
    logger->inc(l_osd_map_cache_miss);
    // track misses that fall below the cache window (expensive reloads)
    epoch_t lb = map_cache.cached_key_lower_bound();
    if (epoch < lb) {
      dout(30) << "get_map " << epoch << " - miss, below lower bound" << dendl;
      logger->inc(l_osd_map_cache_miss_low);
      logger->inc(l_osd_map_cache_miss_low_avg, lb - epoch);
    }
  }

  OSDMap *map = new OSDMap;
  if (epoch > 0) {
    dout(20) << "get_map " << epoch << " - loading and decoding " << map << dendl;
    bufferlist bl;
    if (!_get_map_bl(epoch, bl) || bl.length() == 0) {
      derr << "failed to load OSD map for epoch " << epoch << ", got " << bl.length() << " bytes" << dendl;
      delete map;
      return OSDMapRef();
    }
    map->decode(bl);
  } else {
    dout(20) << "get_map " << epoch << " - return initial " << map << dendl;
  }
  return _add_map(map);
}
1712
1713// ops
1714
1715
// Convenience overload: reply with an error and zero version information.
void OSDService::reply_op_error(OpRequestRef op, int err)
{
  reply_op_error(op, err, eversion_t(), 0);
}
1720
// Send an MOSDOpReply carrying error code `err` and the given replay/user
// versions back on the op's originating connection, preserving the request's
// ACK/ONDISK flags.
void OSDService::reply_op_error(OpRequestRef op, int err, eversion_t v,
                                version_t uv)
{
  const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
  ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);
  int flags;
  flags = m->get_flags() & (CEPH_OSD_FLAG_ACK|CEPH_OSD_FLAG_ONDISK);

  MOSDOpReply *reply = new MOSDOpReply(m, err, osdmap->get_epoch(), flags, true);
  reply->set_reply_versions(v, uv);
  m->get_connection()->send_message(reply);
}
1733
// Debug aid (gated by osd_debug_misdirected_ops): log a cluster warning when
// a client op arrives at a PG that is not its primary.  For EC pools, ops
// that only look misdirected because of an epoch-skew/split race are
// detected and silently dropped (the client will resend).
void OSDService::handle_misdirected_op(PG *pg, OpRequestRef op)
{
  if (!cct->_conf->osd_debug_misdirected_ops) {
    return;
  }

  const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
  ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);

  ceph_assert(m->get_map_epoch() >= pg->get_history().same_primary_since);

  if (pg->is_ec_pg()) {
    /**
     * OSD recomputes op target based on current OSDMap. With an EC pg, we
     * can get this result:
     * 1) client at map 512 sends an op to osd 3, pg_t 3.9 based on mapping
     *    [CRUSH_ITEM_NONE, 2, 3]/3
     * 2) OSD 3 at map 513 remaps op to osd 3, spg_t 3.9s0 based on mapping
     *    [3, 2, 3]/3
     * 3) PG 3.9s0 dequeues the op at epoch 512 and notices that it isn't primary
     *    -- misdirected op
     * 4) client resends and this time PG 3.9s0 having caught up to 513 gets
     *    it and fulfils it
     *
     * We can't compute the op target based on the sending map epoch due to
     * splitting.  The simplest thing is to detect such cases here and drop
     * them without an error (the client will resend anyway).
     */
    ceph_assert(m->get_map_epoch() <= superblock.newest_map);
    OSDMapRef opmap = try_get_map(m->get_map_epoch());
    if (!opmap) {
      dout(7) << __func__ << ": " << *pg << " no longer have map for "
	      << m->get_map_epoch() << ", dropping" << dendl;
      return;
    }
    pg_t _pgid = m->get_raw_pg();
    spg_t pgid;
    if ((m->get_flags() & CEPH_OSD_FLAG_PGOP) == 0)
      _pgid = opmap->raw_pg_to_pg(_pgid);
    if (opmap->get_primary_shard(_pgid, &pgid) &&
	pgid.shard != pg->pg_id.shard) {
      // the shard moved between the client's epoch and ours: benign race
      dout(7) << __func__ << ": " << *pg << " primary changed since "
	      << m->get_map_epoch() << ", dropping" << dendl;
      return;
    }
  }

  dout(7) << *pg << " misdirected op in " << m->get_map_epoch() << dendl;
  clog->warn() << m->get_source_inst() << " misdirected " << m->get_reqid()
	       << " pg " << m->get_raw_pg()
	       << " to osd." << whoami
	       << " not " << pg->get_acting()
	       << " in e" << m->get_map_epoch() << "/" << osdmap->get_epoch();
}
1788
11fdf7f2 1789void OSDService::enqueue_back(OpQueueItem&& qi)
7c673cae 1790{
11fdf7f2 1791 osd->op_shardedwq.queue(std::move(qi));
7c673cae
FG
1792}
1793
11fdf7f2 1794void OSDService::enqueue_front(OpQueueItem&& qi)
7c673cae 1795{
11fdf7f2 1796 osd->op_shardedwq.queue_front(std::move(qi));
7c673cae
FG
1797}
1798
11fdf7f2
TL
1799void OSDService::queue_recovery_context(
1800 PG *pg,
1801 GenContext<ThreadPool::TPHandle&> *c)
7c673cae 1802{
11fdf7f2
TL
1803 epoch_t e = get_osdmap_epoch();
1804 enqueue_back(
1805 OpQueueItem(
1806 unique_ptr<OpQueueItem::OpQueueable>(
1807 new PGRecoveryContext(pg->get_pgid(), c, e)),
1808 cct->_conf->osd_recovery_cost,
1809 cct->_conf->osd_recovery_priority,
1810 ceph_clock_now(),
1811 0,
1812 e));
7c673cae
FG
1813}
1814
// Queue a snap-trim work item for this PG at configured snap-trim
// cost/priority.
void OSDService::queue_for_snap_trim(PG *pg)
{
  dout(10) << "queueing " << *pg << " for snaptrim" << dendl;
  enqueue_back(
    OpQueueItem(
      unique_ptr<OpQueueItem::OpQueueable>(
	new PGSnapTrim(pg->get_pgid(), pg->get_osdmap_epoch())),
      cct->_conf->osd_snap_trim_cost,
      cct->_conf->osd_snap_trim_priority,
      ceph_clock_now(),
      0,
      pg->get_osdmap_epoch()));
}
1828
// Queue a scrub work item for this PG.  When with_high_priority is set the
// scrub is boosted to at least client-op priority (operator-requested scrub).
void OSDService::queue_for_scrub(PG *pg, bool with_high_priority)
{
  unsigned scrub_queue_priority = pg->scrubber.priority;
  if (with_high_priority && scrub_queue_priority < cct->_conf->osd_client_op_priority) {
    scrub_queue_priority = cct->_conf->osd_client_op_priority;
  }
  const auto epoch = pg->get_osdmap_epoch();
  enqueue_back(
    OpQueueItem(
      unique_ptr<OpQueueItem::OpQueueable>(new PGScrub(pg->get_pgid(), epoch)),
      cct->_conf->osd_scrub_cost,
      scrub_queue_priority,
      ceph_clock_now(),
      0,
      epoch));
}
1845
// Queue a PG-deletion work item at configured delete cost/priority.
void OSDService::queue_for_pg_delete(spg_t pgid, epoch_t e)
{
  dout(10) << __func__ << " on " << pgid << " e " << e  << dendl;
  enqueue_back(
    OpQueueItem(
      unique_ptr<OpQueueItem::OpQueueable>(
	new PGDelete(pgid, e)),
      cct->_conf->osd_pg_delete_cost,
      cct->_conf->osd_pg_delete_priority,
      ceph_clock_now(),
      0,
      e));
}
1859
// Forward to the OSD: attempt to finalize deletion of a PG.
bool OSDService::try_finish_pg_delete(PG *pg, unsigned old_pg_num)
{
  return osd->try_finish_pg_delete(pg, old_pg_num);
}
1864
1865// ---
1866
// Record that a merge-source PG is ready (with its last_update version) and
// try to notify the mon.  A PG must not be in both the ready and not-ready
// source sets at once.
void OSDService::set_ready_to_merge_source(PG *pg, eversion_t version)
{
  std::lock_guard l(merge_lock);
  dout(10) << __func__ << " " << pg->pg_id << dendl;
  ready_to_merge_source[pg->pg_id.pgid] = version;
  assert(not_ready_to_merge_source.count(pg->pg_id.pgid) == 0);
  _send_ready_to_merge();
}
1875
// Record that a merge-target PG is ready, along with the (version,
// last_epoch_started, last_epoch_clean) tuple the mon needs, and try to
// notify the mon.
void OSDService::set_ready_to_merge_target(PG *pg,
					   eversion_t version,
					   epoch_t last_epoch_started,
					   epoch_t last_epoch_clean)
{
  std::lock_guard l(merge_lock);
  dout(10) << __func__ << " " << pg->pg_id << dendl;
  ready_to_merge_target.insert(make_pair(pg->pg_id.pgid,
					 make_tuple(version,
						    last_epoch_started,
						    last_epoch_clean)));
  assert(not_ready_to_merge_target.count(pg->pg_id.pgid) == 0);
  _send_ready_to_merge();
}
1890
// Record that a merge-source PG is NOT ready, so the mon can be asked to
// cancel the merge.
void OSDService::set_not_ready_to_merge_source(pg_t source)
{
  std::lock_guard l(merge_lock);
  dout(10) << __func__ << " " << source << dendl;
  not_ready_to_merge_source.insert(source);
  assert(ready_to_merge_source.count(source) == 0);
  _send_ready_to_merge();
}
1899
// Record that a merge-target PG is NOT ready (remembering which source it
// belongs to), so the mon can be asked to cancel the merge.
void OSDService::set_not_ready_to_merge_target(pg_t target, pg_t source)
{
  std::lock_guard l(merge_lock);
  dout(10) << __func__ << " " << target << " source " << source << dendl;
  not_ready_to_merge_target[target] = source;
  assert(ready_to_merge_target.count(target) == 0);
  _send_ready_to_merge();
}
1908
// Public entry point: flush ready/not-ready merge notifications to the mon.
void OSDService::send_ready_to_merge()
{
  std::lock_guard l(merge_lock);
  _send_ready_to_merge();
}
1914
// Send MOSDPGReadyToMerge messages for every pending merge decision we have
// not yet reported.  Not-ready sources/targets produce a "cancel" (false)
// message; a source is reported ready (true) only once its matching target
// is also ready.  sent_ready_to_merge_source dedups, keyed by source pgid.
// Caller must hold merge_lock.
void OSDService::_send_ready_to_merge()
{
  dout(20) << __func__
	   << " ready_to_merge_source " << ready_to_merge_source
	   << " not_ready_to_merge_source " << not_ready_to_merge_source
	   << " ready_to_merge_target " << ready_to_merge_target
	   << " not_ready_to_merge_target " << not_ready_to_merge_target
	   << " sent_ready_to_merge_source " << sent_ready_to_merge_source
	   << dendl;
  // cancellations for sources that reported not-ready
  for (auto src : not_ready_to_merge_source) {
    if (sent_ready_to_merge_source.count(src) == 0) {
      monc->send_mon_message(new MOSDPGReadyToMerge(
			       src,
			       {}, {}, 0, 0,
			       false,
			       osdmap->get_epoch()));
      sent_ready_to_merge_source.insert(src);
    }
  }
  // cancellations for targets that reported not-ready (keyed by their source)
  for (auto p : not_ready_to_merge_target) {
    if (sent_ready_to_merge_source.count(p.second) == 0) {
      monc->send_mon_message(new MOSDPGReadyToMerge(
			       p.second,
			       {}, {}, 0, 0,
			       false,
			       osdmap->get_epoch()));
      sent_ready_to_merge_source.insert(p.second);
    }
  }
  // ready notifications: need both source and its parent target ready,
  // and neither side flagged not-ready
  for (auto src : ready_to_merge_source) {
    if (not_ready_to_merge_source.count(src.first) ||
	not_ready_to_merge_target.count(src.first.get_parent())) {
      continue;
    }
    auto p = ready_to_merge_target.find(src.first.get_parent());
    if (p != ready_to_merge_target.end() &&
	sent_ready_to_merge_source.count(src.first) == 0) {
      monc->send_mon_message(new MOSDPGReadyToMerge(
			       src.first,           // source pgid
			       src.second,          // src version
			       std::get<0>(p->second), // target version
			       std::get<1>(p->second), // PG's last_epoch_started
			       std::get<2>(p->second), // PG's last_epoch_clean
			       true,
			       osdmap->get_epoch()));
      sent_ready_to_merge_source.insert(src.first);
    }
  }
}
1964
1965void OSDService::clear_ready_to_merge(PG *pg)
1966{
1967 std::lock_guard l(merge_lock);
1968 dout(10) << __func__ << " " << pg->pg_id << dendl;
1969 ready_to_merge_source.erase(pg->pg_id.pgid);
1970 ready_to_merge_target.erase(pg->pg_id.pgid);
1971 not_ready_to_merge_source.erase(pg->pg_id.pgid);
1972 not_ready_to_merge_target.erase(pg->pg_id.pgid);
1973 sent_ready_to_merge_source.erase(pg->pg_id.pgid);
1974}
1975
1976void OSDService::clear_sent_ready_to_merge()
1977{
1978 std::lock_guard l(merge_lock);
1979 sent_ready_to_merge_source.clear();
1980}
1981
1982void OSDService::prune_sent_ready_to_merge(OSDMapRef& osdmap)
1983{
1984 std::lock_guard l(merge_lock);
1985 auto i = sent_ready_to_merge_source.begin();
1986 while (i != sent_ready_to_merge_source.end()) {
1987 if (!osdmap->pg_exists(*i)) {
1988 dout(10) << __func__ << " " << *i << dendl;
1989 i = sent_ready_to_merge_source.erase(i);
1990 } else {
1991 ++i;
1992 }
1993 }
7c673cae
FG
1994}
1995
11fdf7f2
TL
1996// ---
1997
// Enqueue a PGRecovery work item for pg p.second at epoch p.first with
// the given number of reserved pushes.  Cost/priority come from the
// osd_recovery_cost / osd_recovery_priority config options.
// Caller must hold recovery_lock (checked below).
void OSDService::_queue_for_recovery(
  std::pair<epoch_t, PGRef> p,
  uint64_t reserved_pushes)
{
  ceph_assert(recovery_lock.is_locked_by_me());
  enqueue_back(
    OpQueueItem(
      unique_ptr<OpQueueItem::OpQueueable>(
	new PGRecovery(
	  p.second->get_pgid(), p.first, reserved_pushes)),
      cct->_conf->osd_recovery_cost,
      cct->_conf->osd_recovery_priority,
      ceph_clock_now(),
      0,          // no owner
      p.first));  // epoch the item was queued at
}
7c673cae
FG
2014
2015// ====================================================================
2016// OSD
2017
2018#undef dout_prefix
2019#define dout_prefix *_dout
2020
2021// Commands shared between OSD's console and admin console:
2022namespace ceph {
2023namespace osd_cmds {
2024
11fdf7f2 2025int heap(CephContext& cct, const cmdmap_t& cmdmap, Formatter& f, std::ostream& os);
7c673cae
FG
2026
2027}} // namespace ceph::osd_cmds
2028
// Format a new object store for OSD `whoami` (or validate an existing
// one): mkfs+mount the store, create or cross-check the OSD superblock,
// then write the on-disk meta files via write_meta().  Always consumes
// (deletes) `store` — success and failure paths both fall through the
// goto cleanup labels below.  Returns 0 or a negative errno.
int OSD::mkfs(CephContext *cct, ObjectStore *store, uuid_d fsid, int whoami)
{
  int ret;

  OSDSuperblock sb;
  bufferlist sbbl;
  ObjectStore::CollectionHandle ch;

  // if we are fed a uuid for this osd, use it.
  store->set_fsid(cct->_conf->osd_uuid);

  ret = store->mkfs();
  if (ret) {
    derr << "OSD::mkfs: ObjectStore::mkfs failed with error "
         << cpp_strerror(ret) << dendl;
    goto free_store;
  }

  store->set_cache_shards(1);  // doesn't matter for mkfs!

  ret = store->mount();
  if (ret) {
    derr << "OSD::mkfs: couldn't mount ObjectStore: error "
         << cpp_strerror(ret) << dendl;
    goto free_store;
  }

  // A meta collection means the store was formatted before: verify the
  // existing superblock agrees with the id/fsid we were handed.
  ch = store->open_collection(coll_t::meta());
  if (ch) {
    ret = store->read(ch, OSD_SUPERBLOCK_GOBJECT, 0, 0, sbbl);
    if (ret < 0) {
      derr << "OSD::mkfs: have meta collection but no superblock" << dendl;
      goto free_store;
    }
    /* if we already have superblock, check content of superblock */
    dout(0) << " have superblock" << dendl;
    auto p = sbbl.cbegin();
    decode(sb, p);
    if (whoami != sb.whoami) {
      derr << "provided osd id " << whoami << " != superblock's " << sb.whoami
	   << dendl;
      ret = -EINVAL;
      goto umount_store;
    }
    if (fsid != sb.cluster_fsid) {
      derr << "provided cluster fsid " << fsid
	   << " != superblock's " << sb.cluster_fsid << dendl;
      ret = -EINVAL;
      goto umount_store;
    }
  } else {
    // create superblock
    sb.cluster_fsid = fsid;
    sb.osd_fsid = store->get_fsid();
    sb.whoami = whoami;
    sb.compat_features = get_osd_initial_compat_set();

    bufferlist bl;
    encode(sb, bl);

    // NOTE: this `ch` shadows the outer one, which stays null on this
    // branch (open_collection above returned nothing).
    ObjectStore::CollectionHandle ch = store->create_new_collection(
      coll_t::meta());
    ObjectStore::Transaction t;
    t.create_collection(coll_t::meta(), 0);
    t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
    ret = store->queue_transaction(ch, std::move(t));
    if (ret) {
      derr << "OSD::mkfs: error while writing OSD_SUPERBLOCK_GOBJECT: "
	   << "queue_transaction returned " << cpp_strerror(ret) << dendl;
      goto umount_store;
    }
  }

  ret = write_meta(cct, store, sb.cluster_fsid, sb.osd_fsid, whoami);
  if (ret) {
    derr << "OSD::mkfs: failed to write fsid file: error "
	 << cpp_strerror(ret) << dendl;
    goto umount_store;
  }

  // The success path intentionally falls through the cleanup labels too.
umount_store:
  if (ch) {
    ch.reset();
  }
  store->umount();
free_store:
  delete store;
  return ret;
}
2118
// Write the plain-file metadata for this OSD into the store's meta area:
// magic, whoami, ceph_fsid, optionally osd_key (from the "key" config
// value, else the contents of "keyfile"), and finally the "ready" marker.
// Returns 0 on success or the first negative errno encountered.
int OSD::write_meta(CephContext *cct, ObjectStore *store, uuid_d& cluster_fsid, uuid_d& osd_fsid, int whoami)
{
  char val[80];
  int r;

  snprintf(val, sizeof(val), "%s", CEPH_OSD_ONDISK_MAGIC);
  r = store->write_meta("magic", val);
  if (r < 0)
    return r;

  snprintf(val, sizeof(val), "%d", whoami);
  r = store->write_meta("whoami", val);
  if (r < 0)
    return r;

  cluster_fsid.print(val);
  r = store->write_meta("ceph_fsid", val);
  if (r < 0)
    return r;

  // An explicit "key" config value wins; otherwise fall back to reading
  // the configured keyfile, if any.  If neither is set, no osd_key is
  // written at all.
  string key = cct->_conf.get_val<string>("key");
  if (key.size()) {
    r = store->write_meta("osd_key", key);
    if (r < 0)
      return r;
  } else {
    string keyfile = cct->_conf.get_val<string>("keyfile");
    if (!keyfile.empty()) {
      bufferlist keybl;
      string err;
      r = keybl.read_file(keyfile.c_str(), &err);
      if (r < 0) {
	derr << __func__ << " failed to read keyfile " << keyfile << ": "
	     << err << ": " << cpp_strerror(r) << dendl;
	return r;
      }
      r = store->write_meta("osd_key", keybl.to_str());
      if (r < 0)
	return r;
    }
  }

  // "ready" is written last so a partially-initialized store cannot be
  // mistaken for a usable one.
  r = store->write_meta("ready", "ready");
  if (r < 0)
    return r;

  return 0;
}
2167
11fdf7f2
TL
// Read back the metadata written by write_meta() without mounting the
// store.  A missing "fsid" file yields a zeroed *osd_fsid; a missing
// "require_osd_release" leaves *require_osd_release untouched.  Returns
// 0, a negative errno from read_meta(), or -EINVAL for malformed uuids.
int OSD::peek_meta(ObjectStore *store,
		   std::string *magic,
		   uuid_d *cluster_fsid,
		   uuid_d *osd_fsid,
		   int *whoami,
		   int *require_osd_release)
{
  string val;

  int r = store->read_meta("magic", &val);
  if (r < 0)
    return r;
  *magic = val;

  r = store->read_meta("whoami", &val);
  if (r < 0)
    return r;
  *whoami = atoi(val.c_str());

  r = store->read_meta("ceph_fsid", &val);
  if (r < 0)
    return r;
  // uuid_d::parse() returns a bool here: false (0) means a malformed fsid.
  r = cluster_fsid->parse(val.c_str());
  if (!r)
    return -EINVAL;

  r = store->read_meta("fsid", &val);
  if (r < 0) {
    *osd_fsid = uuid_d();
  } else {
    r = osd_fsid->parse(val.c_str());
    if (!r)
      return -EINVAL;
  }

  r = store->read_meta("require_osd_release", &val);
  if (r >= 0) {
    *require_osd_release = atoi(val.c_str());
  }

  return 0;
}
2210
2211
2212#undef dout_prefix
2213#define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
2214
2215// cons/des
2216
// OSD constructor: wires together the messengers, mon/mgr clients, locks,
// thread pools, work queues, op tracker and per-shard state.  No disk or
// network I/O happens here; that is deferred to pre_init()/init().
// NOTE: member-initializer order must follow declaration order in OSD.h.
OSD::OSD(CephContext *cct_, ObjectStore *store_,
	 int id,
	 Messenger *internal_messenger,
	 Messenger *external_messenger,
	 Messenger *hb_client_front,
	 Messenger *hb_client_back,
	 Messenger *hb_front_serverm,
	 Messenger *hb_back_serverm,
	 Messenger *osdc_messenger,
	 MonClient *mc,
	 const std::string &dev, const std::string &jdev) :
  Dispatcher(cct_),
  osd_lock("OSD::osd_lock"),
  tick_timer(cct, osd_lock),
  tick_timer_lock("OSD::tick_timer_lock"),
  tick_timer_without_osd_lock(cct, tick_timer_lock),
  gss_ktfile_client(cct->_conf.get_val<std::string>("gss_ktab_client_file")),
  cluster_messenger(internal_messenger),
  client_messenger(external_messenger),
  objecter_messenger(osdc_messenger),
  monc(mc),
  mgrc(cct_, client_messenger),
  logger(NULL),
  recoverystate_perf(NULL),
  store(store_),
  log_client(cct, client_messenger, &mc->monmap, LogClient::NO_FLAGS),
  clog(log_client.create_channel()),
  whoami(id),
  dev_path(dev), journal_path(jdev),
  store_is_rotational(store->is_rotational()),
  trace_endpoint("0.0.0.0", 0, "osd"),
  asok_hook(NULL),
  m_osd_pg_epoch_max_lag_factor(cct->_conf.get_val<double>(
				  "osd_pg_epoch_max_lag_factor")),
  osd_compat(get_osd_compat_set()),
  osd_op_tp(cct, "OSD::osd_op_tp", "tp_osd_tp",
	    get_num_op_threads()),
  command_tp(cct, "OSD::command_tp", "tp_osd_cmd", 1),
  session_waiting_lock("OSD::session_waiting_lock"),
  osdmap_subscribe_lock("OSD::osdmap_subscribe_lock"),
  heartbeat_lock("OSD::heartbeat_lock"),
  heartbeat_stop(false),
  heartbeat_need_update(true),
  hb_front_client_messenger(hb_client_front),
  hb_back_client_messenger(hb_client_back),
  hb_front_server_messenger(hb_front_serverm),
  hb_back_server_messenger(hb_back_serverm),
  daily_loadavg(0.0),
  heartbeat_thread(this),
  heartbeat_dispatcher(this),
  op_tracker(cct, cct->_conf->osd_enable_op_tracker,
	     cct->_conf->osd_num_op_tracker_shard),
  test_ops_hook(NULL),
  op_queue(get_io_queue()),
  op_prio_cutoff(get_io_prio_cut()),
  op_shardedwq(
    this,
    cct->_conf->osd_op_thread_timeout,
    cct->_conf->osd_op_thread_suicide_timeout,
    &osd_op_tp),
  map_lock("OSD::map_lock"),
  last_pg_create_epoch(0),
  mon_report_lock("OSD::mon_report_lock"),
  boot_finisher(cct),
  up_thru_wanted(0),
  requested_full_first(0),
  requested_full_last(0),
  command_wq(
    this,
    cct->_conf->osd_command_thread_timeout,
    cct->_conf->osd_command_thread_suicide_timeout,
    &command_tp),
  service(this)
{

  // Export the GSSAPI client keytab path for Kerberos auth, if configured.
  if (!gss_ktfile_client.empty()) {
    // Assert we can export environment variable
    /*
     The default client keytab is used, if it is present and readable,
     to automatically obtain initial credentials for GSSAPI client
     applications. The principal name of the first entry in the client
     keytab is used by default when obtaining initial credentials.
     1. The KRB5_CLIENT_KTNAME environment variable.
     2. The default_client_keytab_name profile variable in [libdefaults].
     3. The hardcoded default, DEFCKTNAME.
    */
    const int32_t set_result(setenv("KRB5_CLIENT_KTNAME",
				    gss_ktfile_client.c_str(), 1));
    ceph_assert(set_result == 0);
  }

  monc->set_messenger(client_messenger);
  // Op-tracker thresholds come straight from config at construction time.
  op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time,
					 cct->_conf->osd_op_log_threshold);
  op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
					   cct->_conf->osd_op_history_duration);
  op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size,
						    cct->_conf->osd_op_history_slow_op_threshold);
#ifdef WITH_BLKIN
  std::stringstream ss;
  ss << "osd." << whoami;
  trace_endpoint.copy_name(ss.str());
#endif

  // initialize shards
  num_shards = get_num_op_shards();
  for (uint32_t i = 0; i < num_shards; i++) {
    OSDShard *one_shard = new OSDShard(
      i,
      cct,
      this,
      cct->_conf->osd_op_pq_max_tokens_per_priority,
      cct->_conf->osd_op_pq_min_cost,
      op_queue);
    shards.push_back(one_shard);
  }
}
2334
2335OSD::~OSD()
2336{
11fdf7f2
TL
2337 while (!shards.empty()) {
2338 delete shards.back();
2339 shards.pop_back();
2340 }
7c673cae
FG
2341 delete class_handler;
2342 cct->get_perfcounters_collection()->remove(recoverystate_perf);
2343 cct->get_perfcounters_collection()->remove(logger);
2344 delete recoverystate_perf;
2345 delete logger;
2346 delete store;
2347}
2348
91327a77
AA
2349double OSD::get_tick_interval() const
2350{
2351 // vary +/- 5% to avoid scrub scheduling livelocks
2352 constexpr auto delta = 0.05;
91327a77 2353 return (OSD_TICK_INTERVAL *
11fdf7f2 2354 ceph::util::generate_random_number(1.0 - delta, 1.0 + delta));
91327a77
AA
2355}
2356
7c673cae
FG
2357void cls_initialize(ClassHandler *ch);
2358
2359void OSD::handle_signal(int signum)
2360{
11fdf7f2 2361 ceph_assert(signum == SIGINT || signum == SIGTERM);
7c673cae
FG
2362 derr << "*** Got signal " << sig_str(signum) << " ***" << dendl;
2363 shutdown();
2364}
2365
2366int OSD::pre_init()
2367{
11fdf7f2 2368 std::lock_guard lock(osd_lock);
7c673cae
FG
2369 if (is_stopping())
2370 return 0;
2371
2372 if (store->test_mount_in_use()) {
2373 derr << "OSD::pre_init: object store '" << dev_path << "' is "
2374 << "currently in use. (Is ceph-osd already running?)" << dendl;
2375 return -EBUSY;
2376 }
2377
11fdf7f2
TL
2378 cct->_conf.add_observer(this);
2379 return 0;
2380}
2381
// Best-effort CPU pinning: if the objectstore and both network interfaces
// sit on the same NUMA node (and osd_numa_auto_affinity is on), or an
// explicit osd_numa_node is configured, bind the process to that node's
// CPUs via sched_setaffinity.  All failures only log and reset numa_node;
// the function always returns 0.
int OSD::set_numa_affinity()
{
  // storage numa node
  int store_node = -1;
  store->get_numa_node(&store_node, nullptr, nullptr);
  if (store_node >= 0) {
    dout(1) << __func__ << " storage numa node " << store_node << dendl;
  }

  // check network numa node(s)
  int front_node = -1, back_node = -1;
  string front_iface = pick_iface(
    cct,
    client_messenger->get_myaddrs().front().get_sockaddr_storage());
  string back_iface = pick_iface(
    cct,
    cluster_messenger->get_myaddrs().front().get_sockaddr_storage());
  int r = get_iface_numa_node(front_iface, &front_node);
  if (r >= 0) {
    dout(1) << __func__ << " public network " << front_iface << " numa node "
	    << front_node << dendl;
    // cluster iface is only probed when the public iface resolved
    r = get_iface_numa_node(back_iface, &back_node);
    if (r >= 0) {
      dout(1) << __func__ << " cluster network " << back_iface << " numa node "
	      << back_node << dendl;
      // auto-pin only when storage and both networks agree on a node
      if (front_node == back_node &&
	  front_node == store_node) {
	dout(1) << " objectstore and network numa nodes all match" << dendl;
	if (g_conf().get_val<bool>("osd_numa_auto_affinity")) {
	  numa_node = front_node;
	}
      } else {
	dout(1) << __func__ << " objectstore and network numa nodes do not match"
		<< dendl;
      }
    }
  } else {
    derr << __func__ << " unable to identify public interface '" << front_iface
	 << "' numa node: " << cpp_strerror(r) << dendl;
  }
  if (int node = g_conf().get_val<int64_t>("osd_numa_node"); node >= 0) {
    // this takes precedence over the automagic logic above
    numa_node = node;
  }
  if (numa_node >= 0) {
    int r = get_numa_node_cpu_set(numa_node, &numa_cpu_set_size, &numa_cpu_set);
    if (r < 0) {
      dout(1) << __func__ << " unable to determine numa node " << numa_node
	      << " CPUs" << dendl;
      numa_node = -1;
    } else {
      dout(1) << __func__ << " setting numa affinity to node " << numa_node
	      << " cpus "
	      << cpu_set_to_str_list(numa_cpu_set_size, &numa_cpu_set)
	      << dendl;
      r = sched_setaffinity(getpid(), numa_cpu_set_size, &numa_cpu_set);
      if (r < 0) {
	r = -errno;
	derr << __func__ << " failed to set numa affinity: " << cpp_strerror(r)
	     << dendl;
	numa_node = -1;
      }
    }
  } else {
    dout(1) << __func__ << " not setting numa affinity" << dendl;
  }
  return 0;
}
2450
2451// asok
2452
2453class OSDSocketHook : public AdminSocketHook {
2454 OSD *osd;
2455public:
2456 explicit OSDSocketHook(OSD *o) : osd(o) {}
11fdf7f2
TL
2457 bool call(std::string_view admin_command, const cmdmap_t& cmdmap,
2458 std::string_view format, bufferlist& out) override {
7c673cae 2459 stringstream ss;
11fdf7f2
TL
2460 bool r = true;
2461 try {
2462 r = osd->asok_command(admin_command, cmdmap, format, ss);
2463 } catch (const bad_cmd_get& e) {
2464 ss << e.what();
2465 r = true;
2466 }
7c673cae
FG
2467 out.append(ss);
2468 return r;
2469 }
2470};
2471
11fdf7f2
TL
2472std::set<int64_t> OSD::get_mapped_pools()
2473{
2474 std::set<int64_t> pools;
2475 std::vector<spg_t> pgids;
2476 _get_pgids(&pgids);
2477 for (const auto &pgid : pgids) {
2478 pools.insert(pgid.pool());
2479 }
2480 return pools;
2481}
2482
// Dispatch an admin-socket command against the running OSD.  Output is
// produced through a Formatter (format defaults to json-pretty) and
// flushed into `ss` at the end; plain-text error messages also go to
// `ss`.  Always returns true (unknown commands abort — registration and
// dispatch must stay in sync).
bool OSD::asok_command(std::string_view admin_command, const cmdmap_t& cmdmap,
		       std::string_view format, ostream& ss)
{
  Formatter *f = Formatter::create(format, "json-pretty", "json-pretty");
  if (admin_command == "status") {
    // basic identity/state summary from the superblock
    f->open_object_section("status");
    f->dump_stream("cluster_fsid") << superblock.cluster_fsid;
    f->dump_stream("osd_fsid") << superblock.osd_fsid;
    f->dump_unsigned("whoami", superblock.whoami);
    f->dump_string("state", get_state_name(get_state()));
    f->dump_unsigned("oldest_map", superblock.oldest_map);
    f->dump_unsigned("newest_map", superblock.newest_map);
    f->dump_unsigned("num_pgs", num_pgs);
    f->close_section();
  } else if (admin_command == "flush_journal") {
    store->flush_journal();
  } else if (admin_command == "dump_ops_in_flight" ||
	     admin_command == "ops" ||
	     admin_command == "dump_blocked_ops" ||
	     admin_command == "dump_historic_ops" ||
	     admin_command == "dump_historic_ops_by_duration" ||
	     admin_command == "dump_historic_slow_ops") {

    const string error_str = "op_tracker tracking is not enabled now, so no ops are tracked currently, \
even those get stuck. Please enable \"osd_enable_op_tracker\", and the tracker \
will start to track new ops received afterwards.";

    // optional "filterstr" arguments narrow which ops are dumped
    set<string> filters;
    vector<string> filter_str;
    if (cmd_getval(cct, cmdmap, "filterstr", filter_str)) {
      copy(filter_str.begin(), filter_str.end(),
	   inserter(filters, filters.end()));
    }

    if (admin_command == "dump_ops_in_flight" ||
	admin_command == "ops") {
      if (!op_tracker.dump_ops_in_flight(f, false, filters)) {
	ss << error_str;
      }
    }
    if (admin_command == "dump_blocked_ops") {
      if (!op_tracker.dump_ops_in_flight(f, true, filters)) {
	ss << error_str;
      }
    }
    if (admin_command == "dump_historic_ops") {
      if (!op_tracker.dump_historic_ops(f, false, filters)) {
	ss << error_str;
      }
    }
    if (admin_command == "dump_historic_ops_by_duration") {
      if (!op_tracker.dump_historic_ops(f, true, filters)) {
	ss << error_str;
      }
    }
    if (admin_command == "dump_historic_slow_ops") {
      if (!op_tracker.dump_historic_slow_ops(f, filters)) {
	ss << error_str;
      }
    }
  } else if (admin_command == "dump_op_pq_state") {
    f->open_object_section("pq");
    op_shardedwq.dump(f);
    f->close_section();
  } else if (admin_command == "dump_blacklist") {
    list<pair<entity_addr_t,utime_t> > bl;
    OSDMapRef curmap = service.get_osdmap();

    f->open_array_section("blacklist");
    curmap->get_blacklist(&bl);
    for (list<pair<entity_addr_t,utime_t> >::iterator it = bl.begin();
	 it != bl.end(); ++it) {
      f->open_object_section("entry");
      f->open_object_section("entity_addr_t");
      it->first.dump(f);
      f->close_section(); //entity_addr_t
      it->second.localtime(f->dump_stream("expire_time"));
      f->close_section(); //entry
    }
    f->close_section(); //blacklist
  } else if (admin_command == "dump_watchers") {
    list<obj_watch_item_t> watchers;
    // scan pg's
    vector<PGRef> pgs;
    _get_pgs(&pgs);
    for (auto& pg : pgs) {
      list<obj_watch_item_t> pg_watchers;
      pg->get_watchers(&pg_watchers);
      watchers.splice(watchers.end(), pg_watchers);
    }

    f->open_array_section("watchers");
    for (list<obj_watch_item_t>::iterator it = watchers.begin();
	 it != watchers.end(); ++it) {

      f->open_object_section("watch");

      f->dump_string("namespace", it->obj.nspace);
      f->dump_string("object", it->obj.oid.name);

      f->open_object_section("entity_name");
      it->wi.name.dump(f);
      f->close_section(); //entity_name_t

      f->dump_unsigned("cookie", it->wi.cookie);
      f->dump_unsigned("timeout", it->wi.timeout_seconds);

      f->open_object_section("entity_addr_t");
      it->wi.addr.dump(f);
      f->close_section(); //entity_addr_t

      f->close_section(); //watch
    }

    f->close_section(); //watchers
  } else if (admin_command == "dump_reservations") {
    f->open_object_section("reservations");
    f->open_object_section("local_reservations");
    service.local_reserver.dump(f);
    f->close_section();
    f->open_object_section("remote_reservations");
    service.remote_reserver.dump(f);
    f->close_section();
    f->close_section();
  } else if (admin_command == "get_latest_osdmap") {
    get_latest_osdmap();
  } else if (admin_command == "heap") {
    auto result = ceph::osd_cmds::heap(*cct, cmdmap, *f, ss);

    // Note: Failed heap profile commands won't necessarily trigger an error:
    f->open_object_section("result");
    f->dump_string("error", cpp_strerror(result));
    f->dump_bool("success", result >= 0);
    f->close_section();
  } else if (admin_command == "set_heap_property") {
    // validate args, then forward to the tcmalloc property interface
    string property;
    int64_t value = 0;
    string error;
    bool success = false;
    if (!cmd_getval(cct, cmdmap, "property", property)) {
      error = "unable to get property";
      success = false;
    } else if (!cmd_getval(cct, cmdmap, "value", value)) {
      error = "unable to get value";
      success = false;
    } else if (value < 0) {
      error = "negative value not allowed";
      success = false;
    } else if (!ceph_heap_set_numeric_property(property.c_str(), (size_t)value)) {
      error = "invalid property";
      success = false;
    } else {
      success = true;
    }
    f->open_object_section("result");
    f->dump_string("error", error);
    f->dump_bool("success", success);
    f->close_section();
  } else if (admin_command == "get_heap_property") {
    string property;
    size_t value = 0;
    string error;
    bool success = false;
    if (!cmd_getval(cct, cmdmap, "property", property)) {
      error = "unable to get property";
      success = false;
    } else if (!ceph_heap_get_numeric_property(property.c_str(), &value)) {
      error = "invalid property";
      success = false;
    } else {
      success = true;
    }
    f->open_object_section("result");
    f->dump_string("error", error);
    f->dump_bool("success", success);
    f->dump_int("value", value);
    f->close_section();
  } else if (admin_command == "dump_objectstore_kv_stats") {
    store->get_db_statistics(f);
  } else if (admin_command == "dump_scrubs") {
    service.dumps_scrub(f);
  } else if (admin_command == "calc_objectstore_db_histogram") {
    store->generate_db_histogram(f);
  } else if (admin_command == "flush_store_cache") {
    store->flush_cache(&ss);
  } else if (admin_command == "dump_pgstate_history") {
    f->open_object_section("pgstate_history");
    vector<PGRef> pgs;
    _get_pgs(&pgs);
    for (auto& pg : pgs) {
      f->dump_stream("pg") << pg->pg_id;
      pg->dump_pgstate_history(f);
    }
    f->close_section();
  } else if (admin_command == "compact") {
    // synchronous manual compaction; report elapsed wall-clock time
    dout(1) << "triggering manual compaction" << dendl;
    auto start = ceph::coarse_mono_clock::now();
    store->compact();
    auto end = ceph::coarse_mono_clock::now();
    double duration = std::chrono::duration<double>(end-start).count();
    dout(1) << "finished manual compaction in "
	    << duration
	    << " seconds" << dendl;
    f->open_object_section("compact_result");
    f->dump_float("elapsed_time", duration);
    f->close_section();
  } else if (admin_command == "get_mapped_pools") {
    f->open_array_section("mapped_pools");
    set<int64_t> poollist = get_mapped_pools();
    for (auto pool : poollist) {
      f->dump_int("pool_id", pool);
    }
    f->close_section();
  } else if (admin_command == "smart") {
    string devid;
    cmd_getval(cct, cmdmap, "devid", devid);
    probe_smart(devid, ss);
  } else if (admin_command == "list_devices") {
    set<string> devnames;
    store->get_devices(&devnames);
    f->open_object_section("list_devices");
    for (auto dev : devnames) {
      // skip device-mapper internals; report only the underlying devices
      if (dev.find("dm-") == 0) {
	continue;
      }
      f->dump_string("device", "/dev/" + dev);
    }
    f->close_section();
  } else if (admin_command == "send_beacon") {
    if (is_active()) {
      send_beacon(ceph::coarse_mono_clock::now());
    }
  } else {
    // every registered command must have a branch above
    ceph_abort_msg("broken asok registration");
  }
  f->flush(ss);
  delete f;
  return true;
}
2722
// Admin-socket hook for test/debug operations against a running OSD; the
// actual command handling lives in test_ops(), defined later in this
// file.  Malformed arguments (bad_cmd_get) are reported as command output
// rather than propagated.
class TestOpsSocketHook : public AdminSocketHook {
  OSDService *service;
  ObjectStore *store;
public:
  TestOpsSocketHook(OSDService *s, ObjectStore *st) : service(s), store(st) {}
  bool call(std::string_view command, const cmdmap_t& cmdmap,
	    std::string_view format, bufferlist& out) override {
    stringstream ss;
    try {
      test_ops(service, store, command, cmdmap, ss);
    } catch (const bad_cmd_get& e) {
      ss << e.what();
    }
    out.append(ss);
    return true;
  }
  void test_ops(OSDService *service, ObjectStore *store,
		std::string_view command, const cmdmap_t& cmdmap, ostream &ss);

};
2743
2744class OSD::C_Tick : public Context {
2745 OSD *osd;
2746 public:
2747 explicit C_Tick(OSD *o) : osd(o) {}
2748 void finish(int r) override {
2749 osd->tick();
2750 }
2751};
2752
2753class OSD::C_Tick_WithoutOSDLock : public Context {
2754 OSD *osd;
2755 public:
2756 explicit C_Tick_WithoutOSDLock(OSD *o) : osd(o) {}
2757 void finish(int r) override {
2758 osd->tick_without_osd_lock();
2759 }
2760};
2761
// Start or stop a FUSE view of the object store mounted under
// $osd_data/fuse, driven by the osd_objectstore_fuse config flag (`stop`
// forces teardown regardless).  No-op when built without libfuse.
// Returns 0 on success or a negative errno.
int OSD::enable_disable_fuse(bool stop)
{
#ifdef HAVE_LIBFUSE
  int r;
  string mntpath = cct->_conf->osd_data + "/fuse";
  // tear down an active mount if stopping or the feature was disabled
  if (fuse_store && (stop || !cct->_conf->osd_objectstore_fuse)) {
    dout(1) << __func__ << " disabling" << dendl;
    fuse_store->stop();
    delete fuse_store;
    fuse_store = NULL;
    r = ::rmdir(mntpath.c_str());
    if (r < 0) {
      r = -errno;
      derr << __func__ << " failed to rmdir " << mntpath << ": "
	   << cpp_strerror(r) << dendl;
      return r;
    }
    return 0;
  }
  // bring up a mount if the feature is enabled and none is active
  if (!fuse_store && cct->_conf->osd_objectstore_fuse) {
    dout(1) << __func__ << " enabling" << dendl;
    r = ::mkdir(mntpath.c_str(), 0700);
    if (r < 0)
      r = -errno;
    if (r < 0 && r != -EEXIST) {  // a pre-existing mount dir is fine
      derr << __func__ << " unable to create " << mntpath << ": "
	   << cpp_strerror(r) << dendl;
      return r;
    }
    fuse_store = new FuseStore(store, mntpath);
    r = fuse_store->start();
    if (r < 0) {
      derr << __func__ << " unable to start fuse: " << cpp_strerror(r) << dendl;
      delete fuse_store;
      fuse_store = NULL;
      return r;
    }
  }
#endif // HAVE_LIBFUSE
  return 0;
}
2803
31f18b77
FG
2804int OSD::get_num_op_shards()
2805{
2806 if (cct->_conf->osd_op_num_shards)
2807 return cct->_conf->osd_op_num_shards;
2808 if (store_is_rotational)
2809 return cct->_conf->osd_op_num_shards_hdd;
2810 else
2811 return cct->_conf->osd_op_num_shards_ssd;
2812}
2813
2814int OSD::get_num_op_threads()
2815{
2816 if (cct->_conf->osd_op_num_threads_per_shard)
2817 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard;
2818 if (store_is_rotational)
2819 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard_hdd;
2820 else
2821 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard_ssd;
2822}
2823
c07f9fc5
FG
2824float OSD::get_osd_recovery_sleep()
2825{
2826 if (cct->_conf->osd_recovery_sleep)
2827 return cct->_conf->osd_recovery_sleep;
d2e6a577 2828 if (!store_is_rotational && !journal_is_rotational)
c07f9fc5 2829 return cct->_conf->osd_recovery_sleep_ssd;
d2e6a577 2830 else if (store_is_rotational && !journal_is_rotational)
11fdf7f2 2831 return cct->_conf.get_val<double>("osd_recovery_sleep_hybrid");
d2e6a577
FG
2832 else
2833 return cct->_conf->osd_recovery_sleep_hdd;
c07f9fc5
FG
2834}
2835
11fdf7f2
TL
2836float OSD::get_osd_delete_sleep()
2837{
2838 float osd_delete_sleep = cct->_conf.get_val<double>("osd_delete_sleep");
2839 if (osd_delete_sleep > 0)
2840 return osd_delete_sleep;
2841 if (!store_is_rotational && !journal_is_rotational)
2842 return cct->_conf.get_val<double>("osd_delete_sleep_ssd");
2843 if (store_is_rotational && !journal_is_rotational)
2844 return cct->_conf.get_val<double>("osd_delete_sleep_hybrid");
2845 return cct->_conf.get_val<double>("osd_delete_sleep_hdd");
2846}
2847
494da23a
TL
2848float OSD::get_osd_snap_trim_sleep()
2849{
2850 float osd_snap_trim_sleep = cct->_conf.get_val<double>("osd_snap_trim_sleep");
2851 if (osd_snap_trim_sleep > 0)
2852 return osd_snap_trim_sleep;
2853 if (!store_is_rotational && !journal_is_rotational)
2854 return cct->_conf.get_val<double>("osd_snap_trim_sleep_ssd");
2855 if (store_is_rotational && !journal_is_rotational)
2856 return cct->_conf.get_val<double>("osd_snap_trim_sleep_hybrid");
2857 return cct->_conf.get_val<double>("osd_snap_trim_sleep_hdd");
2858}
2859
7c673cae
FG
// OSD::init(): bring the daemon from constructed to booted state.
// Mounts the object store, validates the superblock and compat features,
// loads PGs, wires up messenger auth and dispatchers, authenticates with
// the monitors, and finally calls start_boot().  Returns 0 on success or
// when shutdown raced with init; on failure jumps to "out:", which
// unmounts and frees the store and returns a negative errno.
2860 int OSD::init()
2861 {
2862 CompatSet initial, diff;
11fdf7f2 2863 std::lock_guard lock(osd_lock);
7c673cae
FG
2864 if (is_stopping())
2865 return 0;
2866 
2867 tick_timer.init();
2868 tick_timer_without_osd_lock.init();
2869 service.recovery_request_timer.init();
11fdf7f2
TL
2870 service.sleep_timer.init();
2871 
2872 boot_finisher.start();
2873 
2874 {
2875 string val;
2876 store->read_meta("require_osd_release", &val);
2877 last_require_osd_release = atoi(val.c_str());
2878 }
7c673cae
FG
2879 
2880 // mount.
31f18b77
FG
2881 dout(2) << "init " << dev_path
2882 << " (looks like " << (store_is_rotational ? "hdd" : "ssd") << ")"
2883 << dendl;
d2e6a577 2884 dout(2) << "journal " << journal_path << dendl;
11fdf7f2 2885 ceph_assert(store); // call pre_init() first!
7c673cae 2886
31f18b77 2887 store->set_cache_shards(get_num_op_shards());
7c673cae
FG
2888 
2889 int r = store->mount();
2890 if (r < 0) {
2891 derr << "OSD:init: unable to mount object store" << dendl;
2892 return r;
2893 }
d2e6a577
FG
2894 journal_is_rotational = store->is_journal_rotational();
2895 dout(2) << "journal looks like " << (journal_is_rotational ? "hdd" : "ssd")
2896 << dendl;
7c673cae
FG
2897 
2898 enable_disable_fuse(false);
2899 
2900 dout(2) << "boot" << dendl;
2901 
11fdf7f2
TL
2902 service.meta_ch = store->open_collection(coll_t::meta());
2903 
7c673cae
FG
2904 // initialize the daily loadavg with current 15min loadavg
2905 double loadavgs[3];
2906 if (getloadavg(loadavgs, 3) == 3) {
2907 daily_loadavg = loadavgs[2];
2908 } else {
2909 derr << "OSD::init() : couldn't read loadavgs\n" << dendl;
2910 daily_loadavg = 1.0;
2911 }
2912 
2913 int rotating_auth_attempts = 0;
11fdf7f2
TL
2914 auto rotating_auth_timeout =
2915 g_conf().get_val<int64_t>("rotating_keys_bootstrap_timeout");
7c673cae
FG
2916 
2917 // sanity check long object name handling
2918 {
2919 hobject_t l;
2920 l.oid.name = string(cct->_conf->osd_max_object_name_len, 'n');
2921 l.set_key(string(cct->_conf->osd_max_object_name_len, 'k'));
2922 l.nspace = string(cct->_conf->osd_max_object_namespace_len, 's');
2923 r = store->validate_hobject_key(l);
2924 if (r < 0) {
2925 derr << "backend (" << store->get_type() << ") is unable to support max "
2926 << "object name[space] len" << dendl;
2927 derr << " osd max object name len = "
2928 << cct->_conf->osd_max_object_name_len << dendl;
2929 derr << " osd max object namespace len = "
2930 << cct->_conf->osd_max_object_namespace_len << dendl;
2931 derr << cpp_strerror(r) << dendl;
2932 if (cct->_conf->osd_check_max_object_name_len_on_startup) {
2933 goto out;
2934 }
2935 derr << "osd_check_max_object_name_len_on_startup = false, starting anyway"
2936 << dendl;
2937 } else {
2938 dout(20) << "configured osd_max_object_name[space]_len looks ok" << dendl;
2939 }
2940 }
2941 
2942 // read superblock
2943 r = read_superblock();
2944 if (r < 0) {
2945 derr << "OSD::init() : unable to read osd superblock" << dendl;
2946 r = -EINVAL;
2947 goto out;
2948 }
2949 
2950 if (osd_compat.compare(superblock.compat_features) < 0) {
2951 derr << "The disk uses features unsupported by the executable." << dendl;
2952 derr << " ondisk features " << superblock.compat_features << dendl;
2953 derr << " daemon features " << osd_compat << dendl;
2954 
2955 if (osd_compat.writeable(superblock.compat_features)) {
2956 CompatSet diff = osd_compat.unsupported(superblock.compat_features);
2957 derr << "it is still writeable, though. Missing features: " << diff << dendl;
2958 r = -EOPNOTSUPP;
2959 goto out;
2960 }
2961 else {
2962 CompatSet diff = osd_compat.unsupported(superblock.compat_features);
2963 derr << "Cannot write to disk! Missing features: " << diff << dendl;
2964 r = -EOPNOTSUPP;
2965 goto out;
2966 }
2967 }
2968 
2969 assert_warn(whoami == superblock.whoami);
2970 if (whoami != superblock.whoami) {
2971 derr << "OSD::init: superblock says osd"
2972 << superblock.whoami << " but I am osd." << whoami << dendl;
2973 r = -EINVAL;
2974 goto out;
2975 }
2976 
11fdf7f2
TL
2977 // load up "current" osdmap
2978 assert_warn(!osdmap);
2979 if (osdmap) {
2980 derr << "OSD::init: unable to read current osdmap" << dendl;
2981 r = -EINVAL;
2982 goto out;
2983 }
2984 osdmap = get_map(superblock.current_epoch);
2985 
2986 // make sure we don't have legacy pgs deleting
2987 {
2988 vector<coll_t> ls;
2989 int r = store->list_collections(ls);
2990 ceph_assert(r >= 0);
2991 for (auto c : ls) {
2992 spg_t pgid;
2993 if (c.is_pg(&pgid) &&
2994 !osdmap->have_pg_pool(pgid.pool())) {
2995 ghobject_t oid = make_final_pool_info_oid(pgid.pool());
2996 if (!store->exists(service.meta_ch, oid)) {
2997 derr << __func__ << " missing pg_pool_t for deleted pool "
2998 << pgid.pool() << " for pg " << pgid
2999 << "; please downgrade to luminous and allow "
3000 << "pg deletion to complete before upgrading" << dendl;
3001 ceph_abort();
3002 }
3003 }
3004 }
3005 }
3006 
7c673cae
FG
3007 initial = get_osd_initial_compat_set();
3008 diff = superblock.compat_features.unsupported(initial);
3009 if (superblock.compat_features.merge(initial)) {
3010 // We need to persist the new compat_set before we
3011 // do anything else
3012 dout(5) << "Upgrading superblock adding: " << diff << dendl;
3013 ObjectStore::Transaction t;
3014 write_superblock(t);
11fdf7f2 3015 r = store->queue_transaction(service.meta_ch, std::move(t));
7c673cae
FG
3016 if (r < 0)
3017 goto out;
3018 }
3019 
3020 // make sure snap mapper object exists
11fdf7f2 3021 if (!store->exists(service.meta_ch, OSD::make_snapmapper_oid())) {
7c673cae
FG
3022 dout(10) << "init creating/touching snapmapper object" << dendl;
3023 ObjectStore::Transaction t;
3024 t.touch(coll_t::meta(), OSD::make_snapmapper_oid());
11fdf7f2 3025 r = store->queue_transaction(service.meta_ch, std::move(t));
7c673cae
FG
3026 if (r < 0)
3027 goto out;
3028 }
3029 
3030 class_handler = new ClassHandler(cct);
3031 cls_initialize(class_handler);
3032 
3033 if (cct->_conf->osd_open_classes_on_start) {
3034 int r = class_handler->open_all_classes();
3035 if (r)
3036 dout(1) << "warning: got an error loading one or more classes: " << cpp_strerror(r) << dendl;
3037 }
3038 
11fdf7f2 3039 check_osdmap_features();
7c673cae
FG
3040 
3041 create_recoverystate_perf();
3042 
3043 {
3044 epoch_t bind_epoch = osdmap->get_epoch();
3045 service.set_epochs(NULL, NULL, &bind_epoch);
3046 }
3047 
3048 clear_temp_objects();
3049 
d2e6a577 3050 // initialize osdmap references in sharded wq
11fdf7f2
TL
3051 for (auto& shard : shards) {
3052 std::lock_guard l(shard->osdmap_lock);
3053 shard->shard_osdmap = osdmap;
3054 }
d2e6a577 3055
7c673cae
FG
3056 // load up pgs (as they previously existed)
3057 load_pgs();
3058 
3059 dout(2) << "superblock: I am osd." << superblock.whoami << dendl;
3060 dout(0) << "using " << op_queue << " op queue with priority op cut off at " <<
3061 op_prio_cutoff << "." << dendl;
3062 
3063 create_logger();
3064 
11fdf7f2
TL
3065 // prime osd stats
3066 {
3067 struct store_statfs_t stbuf;
3068 osd_alert_list_t alerts;
3069 int r = store->statfs(&stbuf, &alerts);
3070 ceph_assert(r == 0);
3071 service.set_statfs(stbuf, alerts);
3072 }
3073 
3074 // client_messenger auth_client is already set up by monc.
3075 for (auto m : { cluster_messenger,
3076 objecter_messenger,
3077 hb_front_client_messenger,
3078 hb_back_client_messenger,
3079 hb_front_server_messenger,
3080 hb_back_server_messenger } ) {
3081 m->set_auth_client(monc);
3082 }
3083 for (auto m : { client_messenger,
3084 cluster_messenger,
3085 hb_front_server_messenger,
3086 hb_back_server_messenger }) {
3087 m->set_auth_server(monc);
3088 }
3089 monc->set_handle_authentication_dispatcher(this);
7c673cae
FG
3090 
3091 monc->set_want_keys(CEPH_ENTITY_TYPE_MON | CEPH_ENTITY_TYPE_OSD
3092 | CEPH_ENTITY_TYPE_MGR);
3093 r = monc->init();
3094 if (r < 0)
3095 goto out;
3096 
11fdf7f2
TL
3097 mgrc.set_pgstats_cb([this](){ return collect_pg_stats(); });
3098 mgrc.set_perf_metric_query_cb(
3099 [this](const std::map<OSDPerfMetricQuery, OSDPerfMetricLimits> &queries) {
3100 set_perf_queries(queries);
3101 },
3102 [this](std::map<OSDPerfMetricQuery, OSDPerfMetricReport> *reports) {
3103 get_perf_reports(reports);
3104 });
7c673cae 3105 mgrc.init();
7c673cae
FG
3106 
3107 // tell monc about log_client so it will know about mon session resets
3108 monc->set_log_client(&log_client);
3109 update_log_config();
3110 
11fdf7f2
TL
3111 // i'm ready!
3112 client_messenger->add_dispatcher_tail(&mgrc);
3113 client_messenger->add_dispatcher_tail(this);
3114 cluster_messenger->add_dispatcher_head(this);
3115 
3116 hb_front_client_messenger->add_dispatcher_head(&heartbeat_dispatcher);
3117 hb_back_client_messenger->add_dispatcher_head(&heartbeat_dispatcher);
3118 hb_front_server_messenger->add_dispatcher_head(&heartbeat_dispatcher);
3119 hb_back_server_messenger->add_dispatcher_head(&heartbeat_dispatcher);
3120 
3121 objecter_messenger->add_dispatcher_head(service.objecter);
3122 
28e407b8
AA
3123 service.init();
3124 service.publish_map(osdmap);
3125 service.publish_superblock(superblock);
3126 service.max_oldest_map = superblock.oldest_map;
3127 
11fdf7f2
TL
// Pre-stage any PG splits/merges implied by osdmaps newer than each
// loaded PG's own map, so the shards know about children/merge sources
// before ops start flowing.
3128 for (auto& shard : shards) {
3129 // put PGs in a temporary set because we may modify pg_slots
3130 // unordered_map below.
3131 set<PGRef> pgs;
3132 for (auto& i : shard->pg_slots) {
3133 PGRef pg = i.second->pg;
3134 if (!pg) {
3135 continue;
3136 }
3137 pgs.insert(pg);
3138 }
3139 for (auto pg : pgs) {
3140 pg->lock();
3141 set<pair<spg_t,epoch_t>> new_children;
3142 set<pair<spg_t,epoch_t>> merge_pgs;
3143 service.identify_splits_and_merges(pg->get_osdmap(), osdmap, pg->pg_id,
3144 &new_children, &merge_pgs);
3145 if (!new_children.empty()) {
3146 for (auto shard : shards) {
3147 shard->prime_splits(osdmap, &new_children);
3148 }
3149 assert(new_children.empty());
3150 }
3151 if (!merge_pgs.empty()) {
3152 for (auto shard : shards) {
3153 shard->prime_merges(osdmap, &merge_pgs);
3154 }
3155 assert(merge_pgs.empty());
3156 }
3157 pg->unlock();
3158 }
3159 }
3160 
// Start worker threadpools, the heartbeat thread, and both tick timers.
7c673cae 3161 osd_op_tp.start();
7c673cae
FG
3162 command_tp.start();
3163 
7c673cae
FG
3164 // start the heartbeat
3165 heartbeat_thread.create("osd_srv_heartbt");
3166 
3167 // tick
91327a77
AA
3168 tick_timer.add_event_after(get_tick_interval(),
3169 new C_Tick(this));
7c673cae 3170 {
11fdf7f2 3171 std::lock_guard l(tick_timer_lock);
91327a77
AA
3172 tick_timer_without_osd_lock.add_event_after(get_tick_interval(),
3173 new C_Tick_WithoutOSDLock(this));
7c673cae
FG
3174 }
3175 
7c673cae
FG
// osd_lock is dropped while we block on monitor authentication, rotating
// service keys, and crush updates below; it is re-acquired afterwards.
3176 osd_lock.Unlock();
3177 
3178 r = monc->authenticate();
3179 if (r < 0) {
c07f9fc5
FG
3180 derr << __func__ << " authentication failed: " << cpp_strerror(r)
3181 << dendl;
11fdf7f2 3182 exit(1);
7c673cae
FG
3183 }
3184 
11fdf7f2 3185 while (monc->wait_auth_rotating(rotating_auth_timeout) < 0) {
7c673cae
FG
3186 derr << "unable to obtain rotating service keys; retrying" << dendl;
3187 ++rotating_auth_attempts;
11fdf7f2 3188 if (rotating_auth_attempts > g_conf()->max_rotating_auth_attempts) {
c07f9fc5 3189 derr << __func__ << " wait_auth_rotating timed out" << dendl;
11fdf7f2 3190 exit(1);
7c673cae
FG
3191 }
3192 }
3193 
3194 r = update_crush_device_class();
3195 if (r < 0) {
d2e6a577
FG
3196 derr << __func__ << " unable to update_crush_device_class: "
3197 << cpp_strerror(r) << dendl;
11fdf7f2 3198 exit(1);
7c673cae
FG
3199 }
3200 
3201 r = update_crush_location();
3202 if (r < 0) {
d2e6a577 3203 derr << __func__ << " unable to update_crush_location: "
c07f9fc5 3204 << cpp_strerror(r) << dendl;
11fdf7f2 3205 exit(1);
7c673cae
FG
3206 }
3207 
3208 osd_lock.Lock();
3209 if (is_stopping())
3210 return 0;
3211 
3212 // start objecter *after* we have authenticated, so that we don't ignore
3213 // the OSDMaps it requests.
3214 service.final_init();
3215 
3216 check_config();
3217 
3218 dout(10) << "ensuring pgs have consumed prior maps" << dendl;
3219 consume_map();
7c673cae
FG
3220 
3221 dout(0) << "done with init, starting boot process" << dendl;
3222 
3223 // subscribe to any pg creations
3224 monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0);
3225 
3226 // MgrClient needs this (it doesn't have MonClient reference itself)
3227 monc->sub_want("mgrmap", 0, 0);
3228 
3229 // we don't need to ask for an osdmap here; objecter will
3230 //monc->sub_want("osdmap", osdmap->get_epoch(), CEPH_SUBSCRIBE_ONETIME);
3231 
3232 monc->renew_subs();
3233 
3234 start_boot();
3235 
3236 return 0;
7c673cae
FG
3237 
// Error path: tear down the mounted store before returning the error.
3238 out:
3239 enable_disable_fuse(true);
3240 store->umount();
3241 delete store;
3242 store = NULL;
3243 return r;
3244}
3245
3246void OSD::final_init()
3247{
3248 AdminSocket *admin_socket = cct->get_admin_socket();
3249 asok_hook = new OSDSocketHook(this);
3250 int r = admin_socket->register_command("status", "status", asok_hook,
3251 "high-level status of OSD");
11fdf7f2 3252 ceph_assert(r == 0);
7c673cae
FG
3253 r = admin_socket->register_command("flush_journal", "flush_journal",
3254 asok_hook,
3255 "flush the journal to permanent store");
11fdf7f2 3256 ceph_assert(r == 0);
7c673cae 3257 r = admin_socket->register_command("dump_ops_in_flight",
c07f9fc5
FG
3258 "dump_ops_in_flight " \
3259 "name=filterstr,type=CephString,n=N,req=false",
3260 asok_hook,
7c673cae 3261 "show the ops currently in flight");
11fdf7f2 3262 ceph_assert(r == 0);
7c673cae 3263 r = admin_socket->register_command("ops",
c07f9fc5
FG
3264 "ops " \
3265 "name=filterstr,type=CephString,n=N,req=false",
3266 asok_hook,
7c673cae 3267 "show the ops currently in flight");
11fdf7f2 3268 ceph_assert(r == 0);
7c673cae 3269 r = admin_socket->register_command("dump_blocked_ops",
c07f9fc5
FG
3270 "dump_blocked_ops " \
3271 "name=filterstr,type=CephString,n=N,req=false",
3272 asok_hook,
7c673cae 3273 "show the blocked ops currently in flight");
11fdf7f2 3274 ceph_assert(r == 0);
c07f9fc5
FG
3275 r = admin_socket->register_command("dump_historic_ops",
3276 "dump_historic_ops " \
3277 "name=filterstr,type=CephString,n=N,req=false",
7c673cae
FG
3278 asok_hook,
3279 "show recent ops");
11fdf7f2 3280 ceph_assert(r == 0);
c07f9fc5
FG
3281 r = admin_socket->register_command("dump_historic_slow_ops",
3282 "dump_historic_slow_ops " \
3283 "name=filterstr,type=CephString,n=N,req=false",
7c673cae
FG
3284 asok_hook,
3285 "show slowest recent ops");
11fdf7f2 3286 ceph_assert(r == 0);
c07f9fc5
FG
3287 r = admin_socket->register_command("dump_historic_ops_by_duration",
3288 "dump_historic_ops_by_duration " \
3289 "name=filterstr,type=CephString,n=N,req=false",
7c673cae
FG
3290 asok_hook,
3291 "show slowest recent ops, sorted by duration");
11fdf7f2 3292 ceph_assert(r == 0);
7c673cae
FG
3293 r = admin_socket->register_command("dump_op_pq_state", "dump_op_pq_state",
3294 asok_hook,
3295 "dump op priority queue state");
11fdf7f2 3296 ceph_assert(r == 0);
7c673cae
FG
3297 r = admin_socket->register_command("dump_blacklist", "dump_blacklist",
3298 asok_hook,
3299 "dump blacklisted clients and times");
11fdf7f2 3300 ceph_assert(r == 0);
7c673cae
FG
3301 r = admin_socket->register_command("dump_watchers", "dump_watchers",
3302 asok_hook,
3303 "show clients which have active watches,"
3304 " and on which objects");
11fdf7f2 3305 ceph_assert(r == 0);
7c673cae
FG
3306 r = admin_socket->register_command("dump_reservations", "dump_reservations",
3307 asok_hook,
3308 "show recovery reservations");
11fdf7f2 3309 ceph_assert(r == 0);
7c673cae
FG
3310 r = admin_socket->register_command("get_latest_osdmap", "get_latest_osdmap",
3311 asok_hook,
3312 "force osd to update the latest map from "
3313 "the mon");
11fdf7f2 3314 ceph_assert(r == 0);
7c673cae
FG
3315
3316 r = admin_socket->register_command( "heap",
3317 "heap " \
11fdf7f2
TL
3318 "name=heapcmd,type=CephString " \
3319 "name=value,type=CephString,req=false",
7c673cae
FG
3320 asok_hook,
3321 "show heap usage info (available only if "
3322 "compiled with tcmalloc)");
11fdf7f2 3323 ceph_assert(r == 0);
7c673cae
FG
3324
3325 r = admin_socket->register_command("set_heap_property",
3326 "set_heap_property " \
3327 "name=property,type=CephString " \
3328 "name=value,type=CephInt",
3329 asok_hook,
3330 "update malloc extension heap property");
11fdf7f2 3331 ceph_assert(r == 0);
7c673cae
FG
3332
3333 r = admin_socket->register_command("get_heap_property",
3334 "get_heap_property " \
3335 "name=property,type=CephString",
3336 asok_hook,
3337 "get malloc extension heap property");
11fdf7f2 3338 ceph_assert(r == 0);
7c673cae
FG
3339
3340 r = admin_socket->register_command("dump_objectstore_kv_stats",
3341 "dump_objectstore_kv_stats",
3342 asok_hook,
3343 "print statistics of kvdb which used by bluestore");
11fdf7f2 3344 ceph_assert(r == 0);
7c673cae
FG
3345
3346 r = admin_socket->register_command("dump_scrubs",
3347 "dump_scrubs",
3348 asok_hook,
3349 "print scheduled scrubs");
11fdf7f2 3350 ceph_assert(r == 0);
7c673cae
FG
3351
3352 r = admin_socket->register_command("calc_objectstore_db_histogram",
3353 "calc_objectstore_db_histogram",
3354 asok_hook,
3355 "Generate key value histogram of kvdb(rocksdb) which used by bluestore");
11fdf7f2 3356 ceph_assert(r == 0);
7c673cae
FG
3357
3358 r = admin_socket->register_command("flush_store_cache",
3359 "flush_store_cache",
3360 asok_hook,
3361 "Flush bluestore internal cache");
11fdf7f2 3362 ceph_assert(r == 0);
7c673cae
FG
3363 r = admin_socket->register_command("dump_pgstate_history", "dump_pgstate_history",
3364 asok_hook,
3365 "show recent state history");
11fdf7f2 3366 ceph_assert(r == 0);
7c673cae 3367
224ce89b
WB
3368 r = admin_socket->register_command("compact", "compact",
3369 asok_hook,
3370 "Commpact object store's omap."
3371 " WARNING: Compaction probably slows your requests");
11fdf7f2
TL
3372 ceph_assert(r == 0);
3373
3374 r = admin_socket->register_command("get_mapped_pools", "get_mapped_pools",
3375 asok_hook,
3376 "dump pools whose PG(s) are mapped to this OSD.");
3377
3378 ceph_assert(r == 0);
3379
3380 r = admin_socket->register_command("smart", "smart name=devid,type=CephString,req=False",
3381 asok_hook,
3382 "probe OSD devices for SMART data.");
3383
3384 ceph_assert(r == 0);
3385
3386 r = admin_socket->register_command("list_devices", "list_devices",
3387 asok_hook,
3388 "list OSD devices.");
3389 r = admin_socket->register_command("send_beacon", "send_beacon",
3390 asok_hook,
3391 "send OSD beacon to mon immediately");
224ce89b 3392
7c673cae
FG
3393 test_ops_hook = new TestOpsSocketHook(&(this->service), this->store);
3394 // Note: pools are CephString instead of CephPoolname because
3395 // these commands traditionally support both pool names and numbers
3396 r = admin_socket->register_command(
3397 "setomapval",
3398 "setomapval " \
3399 "name=pool,type=CephString " \
3400 "name=objname,type=CephObjectname " \
3401 "name=key,type=CephString "\
3402 "name=val,type=CephString",
3403 test_ops_hook,
3404 "set omap key");
11fdf7f2 3405 ceph_assert(r == 0);
7c673cae
FG
3406 r = admin_socket->register_command(
3407 "rmomapkey",
3408 "rmomapkey " \
3409 "name=pool,type=CephString " \
3410 "name=objname,type=CephObjectname " \
3411 "name=key,type=CephString",
3412 test_ops_hook,
3413 "remove omap key");
11fdf7f2 3414 ceph_assert(r == 0);
7c673cae
FG
3415 r = admin_socket->register_command(
3416 "setomapheader",
3417 "setomapheader " \
3418 "name=pool,type=CephString " \
3419 "name=objname,type=CephObjectname " \
3420 "name=header,type=CephString",
3421 test_ops_hook,
3422 "set omap header");
11fdf7f2 3423 ceph_assert(r == 0);
7c673cae
FG
3424
3425 r = admin_socket->register_command(
3426 "getomap",
3427 "getomap " \
3428 "name=pool,type=CephString " \
3429 "name=objname,type=CephObjectname",
3430 test_ops_hook,
3431 "output entire object map");
11fdf7f2 3432 ceph_assert(r == 0);
7c673cae
FG
3433
3434 r = admin_socket->register_command(
3435 "truncobj",
3436 "truncobj " \
3437 "name=pool,type=CephString " \
3438 "name=objname,type=CephObjectname " \
3439 "name=len,type=CephInt",
3440 test_ops_hook,
3441 "truncate object to length");
11fdf7f2 3442 ceph_assert(r == 0);
7c673cae
FG
3443
3444 r = admin_socket->register_command(
3445 "injectdataerr",
3446 "injectdataerr " \
3447 "name=pool,type=CephString " \
3448 "name=objname,type=CephObjectname " \
3449 "name=shardid,type=CephInt,req=false,range=0|255",
3450 test_ops_hook,
3451 "inject data error to an object");
11fdf7f2 3452 ceph_assert(r == 0);
7c673cae
FG
3453
3454 r = admin_socket->register_command(
3455 "injectmdataerr",
3456 "injectmdataerr " \
3457 "name=pool,type=CephString " \
3458 "name=objname,type=CephObjectname " \
3459 "name=shardid,type=CephInt,req=false,range=0|255",
3460 test_ops_hook,
3461 "inject metadata error to an object");
11fdf7f2 3462 ceph_assert(r == 0);
7c673cae
FG
3463 r = admin_socket->register_command(
3464 "set_recovery_delay",
3465 "set_recovery_delay " \
3466 "name=utime,type=CephInt,req=false",
3467 test_ops_hook,
3468 "Delay osd recovery by specified seconds");
11fdf7f2 3469 ceph_assert(r == 0);
7c673cae
FG
3470 r = admin_socket->register_command(
3471 "trigger_scrub",
3472 "trigger_scrub " \
a8e16298
TL
3473 "name=pgid,type=CephString " \
3474 "name=time,type=CephInt,req=false",
7c673cae
FG
3475 test_ops_hook,
3476 "Trigger a scheduled scrub ");
11fdf7f2 3477 ceph_assert(r == 0);
a8e16298
TL
3478 r = admin_socket->register_command(
3479 "trigger_deep_scrub",
3480 "trigger_deep_scrub " \
3481 "name=pgid,type=CephString " \
3482 "name=time,type=CephInt,req=false",
3483 test_ops_hook,
3484 "Trigger a scheduled deep scrub ");
3485 ceph_assert(r == 0);
7c673cae
FG
3486 r = admin_socket->register_command(
3487 "injectfull",
3488 "injectfull " \
3489 "name=type,type=CephString,req=false " \
3490 "name=count,type=CephInt,req=false ",
3491 test_ops_hook,
3492 "Inject a full disk (optional count times)");
11fdf7f2 3493 ceph_assert(r == 0);
7c673cae
FG
3494}
3495
3496void OSD::create_logger()
3497{
3498 dout(10) << "create_logger" << dendl;
3499
3500 PerfCountersBuilder osd_plb(cct, "osd", l_osd_first, l_osd_last);
3501
3502 // Latency axis configuration for op histograms, values are in nanoseconds
3503 PerfHistogramCommon::axis_config_d op_hist_x_axis_config{
3504 "Latency (usec)",
3505 PerfHistogramCommon::SCALE_LOG2, ///< Latency in logarithmic scale
3506 0, ///< Start at 0
3507 100000, ///< Quantization unit is 100usec
3508 32, ///< Enough to cover much longer than slow requests
3509 };
3510
3511 // Op size axis configuration for op histograms, values are in bytes
3512 PerfHistogramCommon::axis_config_d op_hist_y_axis_config{
3513 "Request size (bytes)",
3514 PerfHistogramCommon::SCALE_LOG2, ///< Request size in logarithmic scale
3515 0, ///< Start at 0
3516 512, ///< Quantization unit is 512 bytes
3517 32, ///< Enough to cover requests larger than GB
3518 };
3519
3520
3efd9988
FG
3521 // All the basic OSD operation stats are to be considered useful
3522 osd_plb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL);
3523
7c673cae
FG
3524 osd_plb.add_u64(
3525 l_osd_op_wip, "op_wip",
3526 "Replication operations currently being processed (primary)");
3527 osd_plb.add_u64_counter(
3528 l_osd_op, "op",
3529 "Client operations",
3530 "ops", PerfCountersBuilder::PRIO_CRITICAL);
3531 osd_plb.add_u64_counter(
3532 l_osd_op_inb, "op_in_bytes",
3533 "Client operations total write size",
11fdf7f2 3534 "wr", PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));
7c673cae
FG
3535 osd_plb.add_u64_counter(
3536 l_osd_op_outb, "op_out_bytes",
3537 "Client operations total read size",
11fdf7f2 3538 "rd", PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));
7c673cae
FG
3539 osd_plb.add_time_avg(
3540 l_osd_op_lat, "op_latency",
3541 "Latency of client operations (including queue time)",
3542 "l", 9);
3543 osd_plb.add_time_avg(
3544 l_osd_op_process_lat, "op_process_latency",
3545 "Latency of client operations (excluding queue time)");
3546 osd_plb.add_time_avg(
3547 l_osd_op_prepare_lat, "op_prepare_latency",
3548 "Latency of client operations (excluding queue time and wait for finished)");
3549
3550 osd_plb.add_u64_counter(
3551 l_osd_op_r, "op_r", "Client read operations");
3552 osd_plb.add_u64_counter(
11fdf7f2 3553 l_osd_op_r_outb, "op_r_out_bytes", "Client data read", NULL, PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
7c673cae
FG
3554 osd_plb.add_time_avg(
3555 l_osd_op_r_lat, "op_r_latency",
3556 "Latency of read operation (including queue time)");
31f18b77 3557 osd_plb.add_u64_counter_histogram(
7c673cae
FG
3558 l_osd_op_r_lat_outb_hist, "op_r_latency_out_bytes_histogram",
3559 op_hist_x_axis_config, op_hist_y_axis_config,
3560 "Histogram of operation latency (including queue time) + data read");
3561 osd_plb.add_time_avg(
3562 l_osd_op_r_process_lat, "op_r_process_latency",
3563 "Latency of read operation (excluding queue time)");
3564 osd_plb.add_time_avg(
3565 l_osd_op_r_prepare_lat, "op_r_prepare_latency",
3566 "Latency of read operations (excluding queue time and wait for finished)");
3567 osd_plb.add_u64_counter(
3568 l_osd_op_w, "op_w", "Client write operations");
3569 osd_plb.add_u64_counter(
3570 l_osd_op_w_inb, "op_w_in_bytes", "Client data written");
3571 osd_plb.add_time_avg(
3572 l_osd_op_w_lat, "op_w_latency",
3573 "Latency of write operation (including queue time)");
31f18b77 3574 osd_plb.add_u64_counter_histogram(
7c673cae
FG
3575 l_osd_op_w_lat_inb_hist, "op_w_latency_in_bytes_histogram",
3576 op_hist_x_axis_config, op_hist_y_axis_config,
3577 "Histogram of operation latency (including queue time) + data written");
3578 osd_plb.add_time_avg(
3579 l_osd_op_w_process_lat, "op_w_process_latency",
3580 "Latency of write operation (excluding queue time)");
3581 osd_plb.add_time_avg(
3582 l_osd_op_w_prepare_lat, "op_w_prepare_latency",
3583 "Latency of write operations (excluding queue time and wait for finished)");
3584 osd_plb.add_u64_counter(
3585 l_osd_op_rw, "op_rw",
3586 "Client read-modify-write operations");
3587 osd_plb.add_u64_counter(
3588 l_osd_op_rw_inb, "op_rw_in_bytes",
11fdf7f2 3589 "Client read-modify-write operations write in", NULL, PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
7c673cae
FG
3590 osd_plb.add_u64_counter(
3591 l_osd_op_rw_outb,"op_rw_out_bytes",
11fdf7f2 3592 "Client read-modify-write operations read out ", NULL, PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
7c673cae
FG
3593 osd_plb.add_time_avg(
3594 l_osd_op_rw_lat, "op_rw_latency",
3595 "Latency of read-modify-write operation (including queue time)");
31f18b77 3596 osd_plb.add_u64_counter_histogram(
7c673cae
FG
3597 l_osd_op_rw_lat_inb_hist, "op_rw_latency_in_bytes_histogram",
3598 op_hist_x_axis_config, op_hist_y_axis_config,
3599 "Histogram of rw operation latency (including queue time) + data written");
31f18b77 3600 osd_plb.add_u64_counter_histogram(
7c673cae
FG
3601 l_osd_op_rw_lat_outb_hist, "op_rw_latency_out_bytes_histogram",
3602 op_hist_x_axis_config, op_hist_y_axis_config,
3603 "Histogram of rw operation latency (including queue time) + data read");
3604 osd_plb.add_time_avg(
3605 l_osd_op_rw_process_lat, "op_rw_process_latency",
3606 "Latency of read-modify-write operation (excluding queue time)");
3607 osd_plb.add_time_avg(
3608 l_osd_op_rw_prepare_lat, "op_rw_prepare_latency",
3609 "Latency of read-modify-write operations (excluding queue time and wait for finished)");
3610
3efd9988
FG
3611 // Now we move on to some more obscure stats, revert to assuming things
3612 // are low priority unless otherwise specified.
3613 osd_plb.set_prio_default(PerfCountersBuilder::PRIO_DEBUGONLY);
3614
224ce89b
WB
3615 osd_plb.add_time_avg(l_osd_op_before_queue_op_lat, "op_before_queue_op_lat",
3616 "Latency of IO before calling queue(before really queue into ShardedOpWq)"); // client io before queue op_wq latency
3617 osd_plb.add_time_avg(l_osd_op_before_dequeue_op_lat, "op_before_dequeue_op_lat",
3618 "Latency of IO before calling dequeue_op(already dequeued and get PG lock)"); // client io before dequeue_op latency
3619
7c673cae
FG
3620 osd_plb.add_u64_counter(
3621 l_osd_sop, "subop", "Suboperations");
3622 osd_plb.add_u64_counter(
11fdf7f2 3623 l_osd_sop_inb, "subop_in_bytes", "Suboperations total size", NULL, 0, unit_t(UNIT_BYTES));
7c673cae
FG
3624 osd_plb.add_time_avg(l_osd_sop_lat, "subop_latency", "Suboperations latency");
3625
3626 osd_plb.add_u64_counter(l_osd_sop_w, "subop_w", "Replicated writes");
3627 osd_plb.add_u64_counter(
11fdf7f2 3628 l_osd_sop_w_inb, "subop_w_in_bytes", "Replicated written data size", NULL, 0, unit_t(UNIT_BYTES));
7c673cae
FG
3629 osd_plb.add_time_avg(
3630 l_osd_sop_w_lat, "subop_w_latency", "Replicated writes latency");
3631 osd_plb.add_u64_counter(
3632 l_osd_sop_pull, "subop_pull", "Suboperations pull requests");
3633 osd_plb.add_time_avg(
3634 l_osd_sop_pull_lat, "subop_pull_latency", "Suboperations pull latency");
3635 osd_plb.add_u64_counter(
3636 l_osd_sop_push, "subop_push", "Suboperations push messages");
3637 osd_plb.add_u64_counter(
11fdf7f2 3638 l_osd_sop_push_inb, "subop_push_in_bytes", "Suboperations pushed size", NULL, 0, unit_t(UNIT_BYTES));
7c673cae
FG
3639 osd_plb.add_time_avg(
3640 l_osd_sop_push_lat, "subop_push_latency", "Suboperations push latency");
3641
3642 osd_plb.add_u64_counter(l_osd_pull, "pull", "Pull requests sent");
3643 osd_plb.add_u64_counter(l_osd_push, "push", "Push messages sent");
11fdf7f2 3644 osd_plb.add_u64_counter(l_osd_push_outb, "push_out_bytes", "Pushed size", NULL, 0, unit_t(UNIT_BYTES));
7c673cae
FG
3645
3646 osd_plb.add_u64_counter(
3647 l_osd_rop, "recovery_ops",
3648 "Started recovery operations",
3649 "rop", PerfCountersBuilder::PRIO_INTERESTING);
3650
11fdf7f2
TL
3651 osd_plb.add_u64_counter(
3652 l_osd_rbytes, "recovery_bytes",
3653 "recovery bytes",
3654 "rbt", PerfCountersBuilder::PRIO_INTERESTING);
3655
7c673cae 3656 osd_plb.add_u64(l_osd_loadavg, "loadavg", "CPU load");
7c673cae
FG
3657 osd_plb.add_u64(
3658 l_osd_cached_crc, "cached_crc", "Total number getting crc from crc_cache");
3659 osd_plb.add_u64(
3660 l_osd_cached_crc_adjusted, "cached_crc_adjusted",
3661 "Total number getting crc from crc_cache with adjusting");
3662 osd_plb.add_u64(l_osd_missed_crc, "missed_crc",
3663 "Total number of crc cache misses");
3664
3665 osd_plb.add_u64(l_osd_pg, "numpg", "Placement groups",
3666 "pgs", PerfCountersBuilder::PRIO_USEFUL);
3667 osd_plb.add_u64(
3668 l_osd_pg_primary, "numpg_primary",
3669 "Placement groups for which this osd is primary");
3670 osd_plb.add_u64(
3671 l_osd_pg_replica, "numpg_replica",
3672 "Placement groups for which this osd is replica");
3673 osd_plb.add_u64(
3674 l_osd_pg_stray, "numpg_stray",
3675 "Placement groups ready to be deleted from this osd");
94b18763
FG
3676 osd_plb.add_u64(
3677 l_osd_pg_removing, "numpg_removing",
3678 "Placement groups queued for local deletion", "pgsr",
3679 PerfCountersBuilder::PRIO_USEFUL);
7c673cae
FG
3680 osd_plb.add_u64(
3681 l_osd_hb_to, "heartbeat_to_peers", "Heartbeat (ping) peers we send to");
3682 osd_plb.add_u64_counter(l_osd_map, "map_messages", "OSD map messages");
3683 osd_plb.add_u64_counter(l_osd_mape, "map_message_epochs", "OSD map epochs");
3684 osd_plb.add_u64_counter(
3685 l_osd_mape_dup, "map_message_epoch_dups", "OSD map duplicates");
3686 osd_plb.add_u64_counter(
3687 l_osd_waiting_for_map, "messages_delayed_for_map",
3688 "Operations waiting for OSD map");
31f18b77 3689
7c673cae
FG
3690 osd_plb.add_u64_counter(
3691 l_osd_map_cache_hit, "osd_map_cache_hit", "osdmap cache hit");
3692 osd_plb.add_u64_counter(
3693 l_osd_map_cache_miss, "osd_map_cache_miss", "osdmap cache miss");
3694 osd_plb.add_u64_counter(
3695 l_osd_map_cache_miss_low, "osd_map_cache_miss_low",
3696 "osdmap cache miss below cache lower bound");
3697 osd_plb.add_u64_avg(
3698 l_osd_map_cache_miss_low_avg, "osd_map_cache_miss_low_avg",
3699 "osdmap cache miss, avg distance below cache lower bound");
31f18b77
FG
3700 osd_plb.add_u64_counter(
3701 l_osd_map_bl_cache_hit, "osd_map_bl_cache_hit",
3702 "OSDMap buffer cache hits");
3703 osd_plb.add_u64_counter(
3704 l_osd_map_bl_cache_miss, "osd_map_bl_cache_miss",
3705 "OSDMap buffer cache misses");
7c673cae 3706
3efd9988
FG
3707 osd_plb.add_u64(
3708 l_osd_stat_bytes, "stat_bytes", "OSD size", "size",
11fdf7f2 3709 PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
3efd9988
FG
3710 osd_plb.add_u64(
3711 l_osd_stat_bytes_used, "stat_bytes_used", "Used space", "used",
11fdf7f2
TL
3712 PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
3713 osd_plb.add_u64(l_osd_stat_bytes_avail, "stat_bytes_avail", "Available space", NULL, 0, unit_t(UNIT_BYTES));
7c673cae
FG
3714
3715 osd_plb.add_u64_counter(
3716 l_osd_copyfrom, "copyfrom", "Rados \"copy-from\" operations");
3717
3718 osd_plb.add_u64_counter(l_osd_tier_promote, "tier_promote", "Tier promotions");
3719 osd_plb.add_u64_counter(l_osd_tier_flush, "tier_flush", "Tier flushes");
3720 osd_plb.add_u64_counter(
3721 l_osd_tier_flush_fail, "tier_flush_fail", "Failed tier flushes");
3722 osd_plb.add_u64_counter(
3723 l_osd_tier_try_flush, "tier_try_flush", "Tier flush attempts");
3724 osd_plb.add_u64_counter(
3725 l_osd_tier_try_flush_fail, "tier_try_flush_fail",
3726 "Failed tier flush attempts");
3727 osd_plb.add_u64_counter(
3728 l_osd_tier_evict, "tier_evict", "Tier evictions");
3729 osd_plb.add_u64_counter(
3730 l_osd_tier_whiteout, "tier_whiteout", "Tier whiteouts");
3731 osd_plb.add_u64_counter(
3732 l_osd_tier_dirty, "tier_dirty", "Dirty tier flag set");
3733 osd_plb.add_u64_counter(
3734 l_osd_tier_clean, "tier_clean", "Dirty tier flag cleaned");
3735 osd_plb.add_u64_counter(
3736 l_osd_tier_delay, "tier_delay", "Tier delays (agent waiting)");
3737 osd_plb.add_u64_counter(
3738 l_osd_tier_proxy_read, "tier_proxy_read", "Tier proxy reads");
3739 osd_plb.add_u64_counter(
3740 l_osd_tier_proxy_write, "tier_proxy_write", "Tier proxy writes");
3741
3742 osd_plb.add_u64_counter(
3743 l_osd_agent_wake, "agent_wake", "Tiering agent wake up");
3744 osd_plb.add_u64_counter(
3745 l_osd_agent_skip, "agent_skip", "Objects skipped by agent");
3746 osd_plb.add_u64_counter(
3747 l_osd_agent_flush, "agent_flush", "Tiering agent flushes");
3748 osd_plb.add_u64_counter(
3749 l_osd_agent_evict, "agent_evict", "Tiering agent evictions");
3750
3751 osd_plb.add_u64_counter(
3752 l_osd_object_ctx_cache_hit, "object_ctx_cache_hit", "Object context cache hits");
3753 osd_plb.add_u64_counter(
3754 l_osd_object_ctx_cache_total, "object_ctx_cache_total", "Object context cache lookups");
3755
3756 osd_plb.add_u64_counter(l_osd_op_cache_hit, "op_cache_hit");
3757 osd_plb.add_time_avg(
3758 l_osd_tier_flush_lat, "osd_tier_flush_lat", "Object flush latency");
3759 osd_plb.add_time_avg(
3760 l_osd_tier_promote_lat, "osd_tier_promote_lat", "Object promote latency");
3761 osd_plb.add_time_avg(
3762 l_osd_tier_r_lat, "osd_tier_r_lat", "Object proxy read latency");
3763
3764 osd_plb.add_u64_counter(
3765 l_osd_pg_info, "osd_pg_info", "PG updated its info (using any method)");
3766 osd_plb.add_u64_counter(
3767 l_osd_pg_fastinfo, "osd_pg_fastinfo",
3768 "PG updated its info using fastinfo attr");
3769 osd_plb.add_u64_counter(
3770 l_osd_pg_biginfo, "osd_pg_biginfo", "PG updated its biginfo attr");
3771
3772 logger = osd_plb.create_perf_counters();
3773 cct->get_perfcounters_collection()->add(logger);
3774}
3775
3776void OSD::create_recoverystate_perf()
3777{
3778 dout(10) << "create_recoverystate_perf" << dendl;
3779
3780 PerfCountersBuilder rs_perf(cct, "recoverystate_perf", rs_first, rs_last);
3781
3782 rs_perf.add_time_avg(rs_initial_latency, "initial_latency", "Initial recovery state latency");
3783 rs_perf.add_time_avg(rs_started_latency, "started_latency", "Started recovery state latency");
3784 rs_perf.add_time_avg(rs_reset_latency, "reset_latency", "Reset recovery state latency");
3785 rs_perf.add_time_avg(rs_start_latency, "start_latency", "Start recovery state latency");
3786 rs_perf.add_time_avg(rs_primary_latency, "primary_latency", "Primary recovery state latency");
3787 rs_perf.add_time_avg(rs_peering_latency, "peering_latency", "Peering recovery state latency");
3788 rs_perf.add_time_avg(rs_backfilling_latency, "backfilling_latency", "Backfilling recovery state latency");
3789 rs_perf.add_time_avg(rs_waitremotebackfillreserved_latency, "waitremotebackfillreserved_latency", "Wait remote backfill reserved recovery state latency");
3790 rs_perf.add_time_avg(rs_waitlocalbackfillreserved_latency, "waitlocalbackfillreserved_latency", "Wait local backfill reserved recovery state latency");
3791 rs_perf.add_time_avg(rs_notbackfilling_latency, "notbackfilling_latency", "Notbackfilling recovery state latency");
3792 rs_perf.add_time_avg(rs_repnotrecovering_latency, "repnotrecovering_latency", "Repnotrecovering recovery state latency");
3793 rs_perf.add_time_avg(rs_repwaitrecoveryreserved_latency, "repwaitrecoveryreserved_latency", "Rep wait recovery reserved recovery state latency");
3794 rs_perf.add_time_avg(rs_repwaitbackfillreserved_latency, "repwaitbackfillreserved_latency", "Rep wait backfill reserved recovery state latency");
3795 rs_perf.add_time_avg(rs_reprecovering_latency, "reprecovering_latency", "RepRecovering recovery state latency");
3796 rs_perf.add_time_avg(rs_activating_latency, "activating_latency", "Activating recovery state latency");
3797 rs_perf.add_time_avg(rs_waitlocalrecoveryreserved_latency, "waitlocalrecoveryreserved_latency", "Wait local recovery reserved recovery state latency");
3798 rs_perf.add_time_avg(rs_waitremoterecoveryreserved_latency, "waitremoterecoveryreserved_latency", "Wait remote recovery reserved recovery state latency");
3799 rs_perf.add_time_avg(rs_recovering_latency, "recovering_latency", "Recovering recovery state latency");
3800 rs_perf.add_time_avg(rs_recovered_latency, "recovered_latency", "Recovered recovery state latency");
3801 rs_perf.add_time_avg(rs_clean_latency, "clean_latency", "Clean recovery state latency");
3802 rs_perf.add_time_avg(rs_active_latency, "active_latency", "Active recovery state latency");
3803 rs_perf.add_time_avg(rs_replicaactive_latency, "replicaactive_latency", "Replicaactive recovery state latency");
3804 rs_perf.add_time_avg(rs_stray_latency, "stray_latency", "Stray recovery state latency");
3805 rs_perf.add_time_avg(rs_getinfo_latency, "getinfo_latency", "Getinfo recovery state latency");
3806 rs_perf.add_time_avg(rs_getlog_latency, "getlog_latency", "Getlog recovery state latency");
3807 rs_perf.add_time_avg(rs_waitactingchange_latency, "waitactingchange_latency", "Waitactingchange recovery state latency");
3808 rs_perf.add_time_avg(rs_incomplete_latency, "incomplete_latency", "Incomplete recovery state latency");
3809 rs_perf.add_time_avg(rs_down_latency, "down_latency", "Down recovery state latency");
3810 rs_perf.add_time_avg(rs_getmissing_latency, "getmissing_latency", "Getmissing recovery state latency");
3811 rs_perf.add_time_avg(rs_waitupthru_latency, "waitupthru_latency", "Waitupthru recovery state latency");
3812 rs_perf.add_time_avg(rs_notrecovering_latency, "notrecovering_latency", "Notrecovering recovery state latency");
3813
3814 recoverystate_perf = rs_perf.create_perf_counters();
3815 cct->get_perfcounters_collection()->add(recoverystate_perf);
3816}
3817
// Orderly shutdown of the whole OSD.  The sequence below is deliberately
// ordered: work sources are drained before the components they feed are
// torn down (work queues before PGs, PGs before threadpools, threadpools
// before the store, store before the messengers).  Do not reorder stages
// without understanding the dependencies between them.
// Returns 0 on success (or if a shutdown is already in progress); otherwise
// the error from writing the superblock.
int OSD::shutdown()
{
  if (!service.prepare_to_stop())
    return 0; // already shutting down
  osd_lock.Lock();
  if (is_stopping()) {
    // another thread won the race to shut down
    osd_lock.Unlock();
    return 0;
  }
  dout(0) << "shutdown" << dendl;

  set_state(STATE_STOPPING);

  // Debugging
  if (cct->_conf.get_val<bool>("osd_debug_shutdown")) {
    // crank up logging for the remainder of shutdown to aid debugging
    cct->_conf.set_val("debug_osd", "100");
    cct->_conf.set_val("debug_journal", "100");
    cct->_conf.set_val("debug_filestore", "100");
    cct->_conf.set_val("debug_bluestore", "100");
    cct->_conf.set_val("debug_ms", "100");
    cct->_conf.apply_changes(nullptr);
  }

  // stop MgrClient earlier as it's more like an internal consumer of OSD
  mgrc.shutdown();

  service.start_shutdown();

  // stop sending work to pgs. this just prevents any new work in _process
  // from racing with on_shutdown and potentially entering the pg after.
  op_shardedwq.drain();

  // Shutdown PGs
  {
    vector<PGRef> pgs;
    _get_pgs(&pgs);
    for (auto pg : pgs) {
      pg->shutdown();
    }
  }

  // drain op queue again (in case PGs requeued something)
  op_shardedwq.drain();
  {
    finished.clear(); // zap waiters (bleh, this is messy)
    waiting_for_osdmap.clear();
  }

  // unregister commands
  cct->get_admin_socket()->unregister_commands(asok_hook);
  delete asok_hook;
  asok_hook = NULL;

  cct->get_admin_socket()->unregister_commands(test_ops_hook);
  delete test_ops_hook;
  test_ops_hook = NULL;

  // drop osd_lock while joining the heartbeat thread to avoid deadlock
  osd_lock.Unlock();

  heartbeat_lock.Lock();
  heartbeat_stop = true;
  heartbeat_cond.Signal();
  heartbeat_lock.Unlock();
  heartbeat_thread.join();

  // stop the worker threadpools now that no more work can be queued
  osd_op_tp.drain();
  osd_op_tp.stop();
  dout(10) << "op sharded tp stopped" << dendl;

  command_tp.drain();
  command_tp.stop();
  dout(10) << "command tp stopped" << dendl;

  dout(10) << "stopping agent" << dendl;
  service.agent_stop();

  boot_finisher.wait_for_empty();

  osd_lock.Lock();

  boot_finisher.stop();
  reset_heartbeat_peers(true);

  tick_timer.shutdown();

  {
    std::lock_guard l(tick_timer_lock);
    tick_timer_without_osd_lock.shutdown();
  }

  // note unmount epoch
  dout(10) << "noting clean unmount in epoch " << osdmap->get_epoch() << dendl;
  superblock.mounted = service.get_boot_epoch();
  superblock.clean_thru = osdmap->get_epoch();
  ObjectStore::Transaction t;
  write_superblock(t);
  int r = store->queue_transaction(service.meta_ch, std::move(t));
  if (r) {
    derr << "OSD::shutdown: error writing superblock: "
	 << cpp_strerror(r) << dendl;
  }


  service.shutdown_reserver();

  // Remove PGs
#ifdef PG_DEBUG_REFS
  service.dump_live_pgids();
#endif
  // detach all PGs from their shards; loop because shard maps may gain
  // entries while we iterate (clear_too=true detaches as we go)
  while (true) {
    vector<PGRef> pgs;
    _get_pgs(&pgs, true);
    if (pgs.empty()) {
      break;
    }
    for (auto& pg : pgs) {
      if (pg->is_deleted()) {
	continue;
      }
      dout(20) << " kicking pg " << pg << dendl;
      pg->lock();
      if (pg->get_num_ref() != 1) {
	// someone still holds a reference; at this point that is a leak
	derr << "pgid " << pg->get_pgid() << " has ref count of "
	     << pg->get_num_ref() << dendl;
#ifdef PG_DEBUG_REFS
	pg->dump_live_ids();
#endif
	if (cct->_conf->osd_shutdown_pgref_assert) {
	  ceph_abort();
	}
      }
      pg->ch.reset();
      pg->unlock();
    }
  }
#ifdef PG_DEBUG_REFS
  service.dump_live_pgids();
#endif

  // remove_observer may call back into us; drop osd_lock around it
  osd_lock.Unlock();
  cct->_conf.remove_observer(this);
  osd_lock.Lock();

  service.meta_ch.reset();

  dout(10) << "syncing store" << dendl;
  enable_disable_fuse(true);

  if (cct->_conf->osd_journal_flush_on_shutdown) {
    dout(10) << "flushing journal" << dendl;
    store->flush_journal();
  }

  monc->shutdown();
  osd_lock.Unlock();

  // drop our osdmap references (global and per-shard)
  map_lock.get_write();
  osdmap = OSDMapRef();
  map_lock.put_write();

  for (auto s : shards) {
    std::lock_guard l(s->osdmap_lock);
    s->shard_osdmap = OSDMapRef();
  }
  service.shutdown();

  std::lock_guard lock(osd_lock);
  store->umount();
  delete store;
  store = nullptr;
  dout(10) << "Store synced" << dendl;

  op_tracker.on_shutdown();

  // finally tear down all messengers
  class_handler->shutdown();
  client_messenger->shutdown();
  cluster_messenger->shutdown();
  hb_front_client_messenger->shutdown();
  hb_back_client_messenger->shutdown();
  objecter_messenger->shutdown();
  hb_front_server_messenger->shutdown();
  hb_back_server_messenger->shutdown();

  return r;
}
4003
4004int OSD::mon_cmd_maybe_osd_create(string &cmd)
4005{
4006 bool created = false;
4007 while (true) {
4008 dout(10) << __func__ << " cmd: " << cmd << dendl;
4009 vector<string> vcmd{cmd};
4010 bufferlist inbl;
4011 C_SaferCond w;
4012 string outs;
4013 monc->start_mon_command(vcmd, inbl, NULL, &outs, &w);
4014 int r = w.wait();
4015 if (r < 0) {
4016 if (r == -ENOENT && !created) {
4017 string newcmd = "{\"prefix\": \"osd create\", \"id\": " + stringify(whoami)
4018 + ", \"uuid\": \"" + stringify(superblock.osd_fsid) + "\"}";
4019 vector<string> vnewcmd{newcmd};
4020 bufferlist inbl;
4021 C_SaferCond w;
4022 string outs;
4023 monc->start_mon_command(vnewcmd, inbl, NULL, &outs, &w);
4024 int r = w.wait();
4025 if (r < 0) {
4026 derr << __func__ << " fail: osd does not exist and created failed: "
4027 << cpp_strerror(r) << dendl;
4028 return r;
4029 }
4030 created = true;
4031 continue;
4032 }
4033 derr << __func__ << " fail: '" << outs << "': " << cpp_strerror(r) << dendl;
4034 return r;
4035 }
4036 break;
4037 }
4038
4039 return 0;
4040}
4041
4042int OSD::update_crush_location()
4043{
4044 if (!cct->_conf->osd_crush_update_on_start) {
4045 dout(10) << __func__ << " osd_crush_update_on_start = false" << dendl;
4046 return 0;
4047 }
4048
4049 char weight[32];
4050 if (cct->_conf->osd_crush_initial_weight >= 0) {
4051 snprintf(weight, sizeof(weight), "%.4lf", cct->_conf->osd_crush_initial_weight);
4052 } else {
4053 struct store_statfs_t st;
11fdf7f2
TL
4054 osd_alert_list_t alerts;
4055 int r = store->statfs(&st, &alerts);
7c673cae
FG
4056 if (r < 0) {
4057 derr << "statfs: " << cpp_strerror(r) << dendl;
4058 return r;
4059 }
4060 snprintf(weight, sizeof(weight), "%.4lf",
11fdf7f2
TL
4061 std::max(.00001,
4062 double(st.total) /
4063 double(1ull << 40 /* TB */)));
7c673cae
FG
4064 }
4065
4066 std::multimap<string,string> loc = cct->crush_location.get_location();
4067 dout(10) << __func__ << " crush location is " << loc << dendl;
4068
4069 string cmd =
4070 string("{\"prefix\": \"osd crush create-or-move\", ") +
4071 string("\"id\": ") + stringify(whoami) + string(", ") +
4072 string("\"weight\":") + weight + string(", ") +
4073 string("\"args\": [");
4074 for (multimap<string,string>::iterator p = loc.begin(); p != loc.end(); ++p) {
4075 if (p != loc.begin())
4076 cmd += ", ";
4077 cmd += "\"" + p->first + "=" + p->second + "\"";
4078 }
4079 cmd += "]}";
4080
4081 return mon_cmd_maybe_osd_create(cmd);
4082}
4083
4084int OSD::update_crush_device_class()
4085{
224ce89b
WB
4086 if (!cct->_conf->osd_class_update_on_start) {
4087 dout(10) << __func__ << " osd_class_update_on_start = false" << dendl;
4088 return 0;
4089 }
4090
7c673cae
FG
4091 string device_class;
4092 int r = store->read_meta("crush_device_class", &device_class);
224ce89b
WB
4093 if (r < 0 || device_class.empty()) {
4094 device_class = store->get_default_device_class();
4095 }
4096
4097 if (device_class.empty()) {
d2e6a577 4098 dout(20) << __func__ << " no device class stored locally" << dendl;
7c673cae 4099 return 0;
224ce89b 4100 }
7c673cae
FG
4101
4102 string cmd =
4103 string("{\"prefix\": \"osd crush set-device-class\", ") +
224ce89b
WB
4104 string("\"class\": \"") + device_class + string("\", ") +
4105 string("\"ids\": [\"") + stringify(whoami) + string("\"]}");
7c673cae 4106
224ce89b 4107 r = mon_cmd_maybe_osd_create(cmd);
11fdf7f2
TL
4108 if (r == -EBUSY) {
4109 // good, already bound to a device-class
4110 return 0;
4111 } else {
4112 return r;
4113 }
7c673cae
FG
4114}
4115
4116void OSD::write_superblock(ObjectStore::Transaction& t)
4117{
4118 dout(10) << "write_superblock " << superblock << dendl;
4119
4120 //hack: at minimum it's using the baseline feature set
4121 if (!superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_BASE))
4122 superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
4123
4124 bufferlist bl;
11fdf7f2 4125 encode(superblock, bl);
7c673cae
FG
4126 t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
4127}
4128
4129int OSD::read_superblock()
4130{
4131 bufferlist bl;
11fdf7f2 4132 int r = store->read(service.meta_ch, OSD_SUPERBLOCK_GOBJECT, 0, 0, bl);
7c673cae
FG
4133 if (r < 0)
4134 return r;
4135
11fdf7f2
TL
4136 auto p = bl.cbegin();
4137 decode(superblock, p);
7c673cae
FG
4138
4139 dout(10) << "read_superblock " << superblock << dendl;
4140
4141 return 0;
4142}
4143
4144void OSD::clear_temp_objects()
4145{
4146 dout(10) << __func__ << dendl;
4147 vector<coll_t> ls;
4148 store->list_collections(ls);
4149 for (vector<coll_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
4150 spg_t pgid;
4151 if (!p->is_pg(&pgid))
4152 continue;
4153
4154 // list temp objects
4155 dout(20) << " clearing temps in " << *p << " pgid " << pgid << dendl;
4156
4157 vector<ghobject_t> temps;
4158 ghobject_t next;
4159 while (1) {
4160 vector<ghobject_t> objects;
11fdf7f2
TL
4161 auto ch = store->open_collection(*p);
4162 ceph_assert(ch);
4163 store->collection_list(ch, next, ghobject_t::get_max(),
7c673cae
FG
4164 store->get_ideal_list_max(),
4165 &objects, &next);
4166 if (objects.empty())
4167 break;
4168 vector<ghobject_t>::iterator q;
4169 for (q = objects.begin(); q != objects.end(); ++q) {
4170 // Hammer set pool for temps to -1, so check for clean-up
4171 if (q->hobj.is_temp() || (q->hobj.pool == -1)) {
4172 temps.push_back(*q);
4173 } else {
4174 break;
4175 }
4176 }
4177 // If we saw a non-temp object and hit the break above we can
4178 // break out of the while loop too.
4179 if (q != objects.end())
4180 break;
4181 }
4182 if (!temps.empty()) {
4183 ObjectStore::Transaction t;
4184 int removed = 0;
4185 for (vector<ghobject_t>::iterator q = temps.begin(); q != temps.end(); ++q) {
4186 dout(20) << " removing " << *p << " object " << *q << dendl;
4187 t.remove(*p, *q);
4188 if (++removed > cct->_conf->osd_target_transaction_size) {
11fdf7f2 4189 store->queue_transaction(service.meta_ch, std::move(t));
7c673cae
FG
4190 t = ObjectStore::Transaction();
4191 removed = 0;
4192 }
4193 }
4194 if (removed) {
11fdf7f2 4195 store->queue_transaction(service.meta_ch, std::move(t));
7c673cae
FG
4196 }
4197 }
4198 }
4199}
4200
// Delete every object in collection `tmp` (unmapping each from the snap
// mapper as we go) and finally remove the collection itself.  Deletions are
// batched into transactions of at most osd_target_transaction_size objects.
// Blocks until the final transaction has committed.
void OSD::recursive_remove_collection(CephContext* cct,
				      ObjectStore *store, spg_t pgid,
				      coll_t tmp)
{
  OSDriver driver(
    store,
    coll_t(),
    make_snapmapper_oid());

  ObjectStore::CollectionHandle ch = store->open_collection(tmp);
  ObjectStore::Transaction t;
  SnapMapper mapper(cct, &driver, 0, 0, 0, pgid.shard);

  ghobject_t next;
  int max = cct->_conf->osd_target_transaction_size;
  vector<ghobject_t> objects;
  objects.reserve(max);
  while (true) {
    objects.clear();
    // list the next batch, resuming from `next`
    store->collection_list(ch, next, ghobject_t::get_max(),
			   max, &objects, &next);
    generic_dout(10) << __func__ << " " << objects << dendl;
    if (objects.empty())
      break;
    for (auto& p: objects) {
      // remove the snap-mapper entry first, in the same transaction as the
      // object removal so both land (or neither does)
      OSDriver::OSTransaction _t(driver.get_transaction(&t));
      int r = mapper.remove_oid(p.hobj, &_t);
      if (r != 0 && r != -ENOENT)
	ceph_abort();
      t.remove(tmp, p);
    }
    // queue this batch and start a fresh transaction
    int r = store->queue_transaction(ch, std::move(t));
    ceph_assert(r == 0);
    t = ObjectStore::Transaction();
  }
  // collection is now empty; remove it
  t.remove_collection(tmp);
  int r = store->queue_transaction(ch, std::move(t));
  ceph_assert(r == 0);

  // wait for everything queued above to commit before returning
  C_SaferCond waiter;
  if (!ch->flush_commit(&waiter)) {
    waiter.wait();
  }
}
4245
4246
4247// ======================================================
4248// PG's
4249
7c673cae
FG
// Construct (but do not register) a PG object for `pgid` using pool metadata
// from `createmap`.  If the pool has already been deleted from the map, fall
// back to the final pg_pool_t "tombstone" persisted in the meta collection.
// Returns nullptr if the pool metadata cannot be recovered.
PG* OSD::_make_pg(
  OSDMapRef createmap,
  spg_t pgid)
{
  dout(10) << __func__ << " " << pgid << dendl;
  pg_pool_t pi;
  map<string,string> ec_profile;
  string name;
  if (createmap->have_pg_pool(pgid.pool())) {
    pi = *createmap->get_pg_pool(pgid.pool());
    name = createmap->get_pool_name(pgid.pool());
    if (pi.is_erasure()) {
      ec_profile = createmap->get_erasure_code_profile(pi.erasure_code_profile);
    }
  } else {
    // pool was deleted; grab final pg_pool_t off disk.
    ghobject_t oid = make_final_pool_info_oid(pgid.pool());
    bufferlist bl;
    int r = store->read(service.meta_ch, oid, 0, 0, bl);
    if (r < 0) {
      derr << __func__ << " missing pool " << pgid.pool() << " tombstone"
	   << dendl;
      return nullptr;
    }
    ceph_assert(r >= 0);
    // decode order matters: pool info, then name, then (optionally) the
    // ec profile — older tombstones may lack the profile
    auto p = bl.cbegin();
    decode(pi, p);
    decode(name, p);
    if (p.end()) { // dev release v13.0.2 did not include ec_profile
      derr << __func__ << " missing ec_profile from pool " << pgid.pool()
	   << " tombstone" << dendl;
      return nullptr;
    }
    decode(ec_profile, p);
  }
  PGPool pool(cct, createmap, pgid.pool(), pi, name);
  PG *pg;
  if (pi.type == pg_pool_t::TYPE_REPLICATED ||
      pi.type == pg_pool_t::TYPE_ERASURE)
    pg = new PrimaryLogPG(&service, createmap, pool, ec_profile, pgid);
  else
    ceph_abort();
  return pg;
}
4294
11fdf7f2 4295void OSD::_get_pgs(vector<PGRef> *v, bool clear_too)
7c673cae 4296{
11fdf7f2
TL
4297 v->clear();
4298 v->reserve(get_num_pgs());
4299 for (auto& s : shards) {
4300 std::lock_guard l(s->shard_lock);
4301 for (auto& j : s->pg_slots) {
4302 if (j.second->pg &&
4303 !j.second->pg->is_deleted()) {
4304 v->push_back(j.second->pg);
4305 if (clear_too) {
4306 s->_detach_pg(j.second.get());
4307 }
4308 }
7c673cae 4309 }
7c673cae 4310 }
7c673cae
FG
4311}
4312
11fdf7f2 4313void OSD::_get_pgids(vector<spg_t> *v)
7c673cae 4314{
11fdf7f2
TL
4315 v->clear();
4316 v->reserve(get_num_pgs());
4317 for (auto& s : shards) {
4318 std::lock_guard l(s->shard_lock);
4319 for (auto& j : s->pg_slots) {
4320 if (j.second->pg &&
4321 !j.second->pg->is_deleted()) {
4322 v->push_back(j.first);
4323 }
7c673cae
FG
4324 }
4325 }
7c673cae
FG
4326}
4327
11fdf7f2 4328void OSD::register_pg(PGRef pg)
7c673cae 4329{
11fdf7f2
TL
4330 spg_t pgid = pg->get_pgid();
4331 uint32_t shard_index = pgid.hash_to_shard(num_shards);
4332 auto sdata = shards[shard_index];
4333 std::lock_guard l(sdata->shard_lock);
4334 auto r = sdata->pg_slots.emplace(pgid, make_unique<OSDShardPGSlot>());
4335 ceph_assert(r.second);
4336 auto *slot = r.first->second.get();
4337 dout(20) << __func__ << " " << pgid << " " << pg << dendl;
4338 sdata->_attach_pg(slot, pg.get());
4339}
7c673cae 4340
// Attempt to finalize deletion of `pg`: detach it from its shard slot,
// clear any primed split children derived from it, and adjust the
// primary/replica/stray perf counters.  Returns false (without detaching)
// if the slot is already gone or the PG is waiting on a merge epoch.
bool OSD::try_finish_pg_delete(PG *pg, unsigned old_pg_num)
{
  auto sdata = pg->osd_shard;
  ceph_assert(sdata);
  {
    // shard_lock scope: only the slot lookup/detach happens under the lock
    std::lock_guard l(sdata->shard_lock);
    auto p = sdata->pg_slots.find(pg->pg_id);
    if (p == sdata->pg_slots.end() ||
	!p->second->pg) {
      dout(20) << __func__ << " " << pg->pg_id << " not found" << dendl;
      return false;
    }
    if (p->second->waiting_for_merge_epoch) {
      // a pending merge still needs this slot; don't finish the delete yet
      dout(20) << __func__ << " " << pg->pg_id << " waiting for merge" << dendl;
      return false;
    }
    dout(20) << __func__ << " " << pg->pg_id << " " << pg << dendl;
    sdata->_detach_pg(p->second.get());
  }

  // drop any split children that were primed from this (now deleted) parent
  for (auto shard : shards) {
    shard->unprime_split_children(pg->pg_id, old_pg_num);
  }

  // update pg count now since we might not get an osdmap any time soon.
  if (pg->is_primary())
    service.logger->dec(l_osd_pg_primary);
  else if (pg->is_replica())
    service.logger->dec(l_osd_pg_replica);
  else
    service.logger->dec(l_osd_pg_stray);

  return true;
}
4375
11fdf7f2 4376PGRef OSD::_lookup_pg(spg_t pgid)
7c673cae 4377{
11fdf7f2
TL
4378 uint32_t shard_index = pgid.hash_to_shard(num_shards);
4379 auto sdata = shards[shard_index];
4380 std::lock_guard l(sdata->shard_lock);
4381 auto p = sdata->pg_slots.find(pgid);
4382 if (p == sdata->pg_slots.end()) {
7c673cae 4383 return nullptr;
11fdf7f2
TL
4384 }
4385 return p->second->pg;
7c673cae
FG
4386}
4387
11fdf7f2 4388PGRef OSD::_lookup_lock_pg(spg_t pgid)
31f18b77 4389{
11fdf7f2
TL
4390 PGRef pg = _lookup_pg(pgid);
4391 if (!pg) {
4392 return nullptr;
4393 }
4394 pg->lock();
4395 if (!pg->is_deleted()) {
4396 return pg;
4397 }
4398 pg->unlock();
4399 return nullptr;
31f18b77
FG
4400}
4401
// Public wrapper around _lookup_lock_pg(): return the PG locked, or
// nullptr if it does not exist or is deleted.  Caller must unlock.
PGRef OSD::lookup_lock_pg(spg_t pgid)
{
  return _lookup_lock_pg(pgid);
}
4406
// Called at startup (with osd_lock held) to reconstruct all PGs from the
// object store: load the pg_num history, walk every collection, discard
// leftover/removal-flagged collections, instantiate a PG per surviving pg
// collection, read its persisted state, and register it with its shard.
void OSD::load_pgs()
{
  ceph_assert(osd_lock.is_locked());
  dout(0) << "load_pgs" << dendl;

  {
    // load the persisted pg_num history (used for split/merge decisions)
    auto pghist = make_pg_num_history_oid();
    bufferlist bl;
    int r = store->read(service.meta_ch, pghist, 0, 0, bl, 0);
    if (r >= 0 && bl.length() > 0) {
      auto p = bl.cbegin();
      decode(pg_num_history, p);
    }
    dout(20) << __func__ << " pg_num_history " << pg_num_history << dendl;
  }

  vector<coll_t> ls;
  int r = store->list_collections(ls);
  if (r < 0) {
    derr << "failed to list pgs: " << cpp_strerror(-r) << dendl;
  }

  int num = 0;
  for (vector<coll_t>::iterator it = ls.begin();
       it != ls.end();
       ++it) {
    spg_t pgid;
    // temp collections and pg collections flagged for removal are purged
    if (it->is_temp(&pgid) ||
	(it->is_pg(&pgid) && PG::_has_removal_flag(store, pgid))) {
      dout(10) << "load_pgs " << *it
	       << " removing, legacy or flagged for removal pg" << dendl;
      recursive_remove_collection(cct, store, pgid, *it);
      continue;
    }

    if (!it->is_pg(&pgid)) {
      dout(10) << "load_pgs ignoring unrecognized " << *it << dendl;
      continue;
    }

    dout(10) << "pgid " << pgid << " coll " << coll_t(pgid) << dendl;
    epoch_t map_epoch = 0;
    int r = PG::peek_map_epoch(store, pgid, &map_epoch);
    if (r < 0) {
      derr << __func__ << " unable to peek at " << pgid << " metadata, skipping"
	   << dendl;
      continue;
    }

    PGRef pg;
    if (map_epoch > 0) {
      // instantiate the PG against the map it was last written at
      OSDMapRef pgosdmap = service.try_get_map(map_epoch);
      if (!pgosdmap) {
	if (!osdmap->have_pg_pool(pgid.pool())) {
	  derr << __func__ << ": could not find map for epoch " << map_epoch
	       << " on pg " << pgid << ", but the pool is not present in the "
	       << "current map, so this is probably a result of bug 10617. "
	       << "Skipping the pg for now, you can use ceph-objectstore-tool "
	       << "to clean it up later." << dendl;
	  continue;
	} else {
	  derr << __func__ << ": have pgid " << pgid << " at epoch "
	       << map_epoch << ", but missing map. Crashing."
	       << dendl;
	  ceph_abort_msg("Missing map in load_pgs");
	}
      }
      pg = _make_pg(pgosdmap, pgid);
    } else {
      pg = _make_pg(osdmap, pgid);
    }
    if (!pg) {
      // pool metadata unrecoverable (no tombstone); drop the collection
      recursive_remove_collection(cct, store, pgid, *it);
      continue;
    }

    // there can be no waiters here, so we don't call _wake_pg_slot

    pg->lock();
    pg->ch = store->open_collection(pg->coll);

    // read pg state, log
    pg->read_state(store);

    if (pg->dne())  {
      dout(10) << "load_pgs " << *it << " deleting dne" << dendl;
      pg->ch = nullptr;
      pg->unlock();
      recursive_remove_collection(cct, store, pgid, *it);
      continue;
    }
    {
      // route this collection's commit completions to the owning shard
      uint32_t shard_index = pgid.hash_to_shard(shards.size());
      assert(NULL != shards[shard_index]);
      store->set_collection_commit_queue(pg->coll, &(shards[shard_index]->context_queue));
    }

    pg->reg_next_scrub();

    dout(10) << __func__ << " loaded " << *pg << dendl;
    pg->unlock();

    register_pg(pg);
    ++num;
  }
  dout(0) << __func__ << " opened " << num << " pgs" << dendl;
}
4514
4515
11fdf7f2
TL
// Instantiate a brand-new PG described by `info`: validate the create
// request (max-pg limit, pool existence, stale mon creates), build the PG
// and its collection at info->epoch's map, initialize it, and drive it
// through its initial peering events.  Returns the new PG, or nullptr if
// the create was dropped.
PGRef OSD::handle_pg_create_info(const OSDMapRef& osdmap,
				 const PGCreateInfo *info)
{
  spg_t pgid = info->pgid;

  if (maybe_wait_for_max_pg(osdmap, pgid, info->by_mon)) {
    // over the per-osd pg limit; creation is deferred, not performed now
    dout(10) << __func__ << " hit max pg, dropping" << dendl;
    return nullptr;
  }

  PG::RecoveryCtx rctx = create_context();

  // use the map as of the creation epoch, not necessarily the current map
  OSDMapRef startmap = get_map(info->epoch);

  if (info->by_mon) {
    int64_t pool_id = pgid.pgid.pool();
    const pg_pool_t *pool = osdmap->get_pg_pool(pool_id);
    if (!pool) {
      dout(10) << __func__ << " ignoring " << pgid << ", pool dne" << dendl;
      return nullptr;
    }
    if (osdmap->require_osd_release >= CEPH_RELEASE_NAUTILUS &&
	!pool->has_flag(pg_pool_t::FLAG_CREATING)) {
      // this ensures we do not process old creating messages after the
      // pool's initial pgs have been created (and pg are subsequently
      // allowed to split or merge).
      dout(20) << __func__ << " dropping " << pgid
	       << "create, pool does not have CREATING flag set" << dendl;
      return nullptr;
    }
  }

  int up_primary, acting_primary;
  vector<int> up, acting;
  startmap->pg_to_up_acting_osds(
    pgid.pgid, &up, &up_primary, &acting, &acting_primary);

  const pg_pool_t* pp = startmap->get_pg_pool(pgid.pool());
  if (pp->has_flag(pg_pool_t::FLAG_EC_OVERWRITES) &&
      store->get_type() != "bluestore") {
    clog->warn() << "pg " << pgid
		 << " is at risk of silent data corruption: "
		 << "the pool allows ec overwrites but is not stored in "
		 << "bluestore, so deep scrubbing will not detect bitrot";
  }
  PG::_create(*rctx.transaction, pgid, pgid.get_split_bits(pp->get_pg_num()));
  PG::_init(*rctx.transaction, pgid, pp);

  int role = startmap->calc_pg_role(whoami, acting, acting.size());
  if (!pp->is_replicated() && role != pgid.shard) {
    // for EC pools our role must match our shard id
    role = -1;
  }

  PGRef pg = _make_pg(startmap, pgid);
  pg->ch = store->create_new_collection(pg->coll);

  {
    // route this collection's commit completions to the owning shard
    uint32_t shard_index = pgid.hash_to_shard(shards.size());
    assert(NULL != shards[shard_index]);
    store->set_collection_commit_queue(pg->coll, &(shards[shard_index]->context_queue));
  }

  pg->lock(true);

  // we are holding the shard lock
  ceph_assert(!pg->is_deleted());

  pg->init(
    role,
    up,
    up_primary,
    acting,
    acting_primary,
    info->history,
    info->past_intervals,
    false,
    rctx.transaction);

  if (pg->is_primary()) {
    // newly created primaries inherit the currently registered perf queries
    Mutex::Locker locker(m_perf_queries_lock);
    pg->set_dynamic_perf_stats_queries(m_perf_queries);
  }

  // kick the new PG through its initial state machine events
  pg->handle_initialize(&rctx);
  pg->handle_activate_map(&rctx);

  dispatch_context(rctx, pg.get(), osdmap, nullptr);

  dout(10) << __func__ << " new pg " << *pg << dendl;
  return pg;
}
4607
11fdf7f2
TL
4608bool OSD::maybe_wait_for_max_pg(const OSDMapRef& osdmap,
4609 spg_t pgid,
4610 bool is_mon_create)
3efd9988
FG
4611{
4612 const auto max_pgs_per_osd =
11fdf7f2
TL
4613 (cct->_conf.get_val<uint64_t>("mon_max_pg_per_osd") *
4614 cct->_conf.get_val<double>("osd_max_pg_per_osd_hard_ratio"));
3efd9988 4615
11fdf7f2 4616 if (num_pgs < max_pgs_per_osd) {
3efd9988
FG
4617 return false;
4618 }
11fdf7f2
TL
4619
4620 std::lock_guard l(pending_creates_lock);
3efd9988
FG
4621 if (is_mon_create) {
4622 pending_creates_from_mon++;
4623 } else {
b32b8144
FG
4624 bool is_primary = osdmap->get_pg_acting_rank(pgid.pgid, whoami) == 0;
4625 pending_creates_from_osd.emplace(pgid.pgid, is_primary);
3efd9988 4626 }
1adf2230 4627 dout(1) << __func__ << " withhold creation of pg " << pgid
11fdf7f2 4628 << ": " << num_pgs << " >= "<< max_pgs_per_osd << dendl;
3efd9988
FG
4629 return true;
4630}
4631
// to re-trigger a peering, we have to twiddle the pg mapping a little bit,
// see PG::should_restart_peering(). OSDMap::pg_to_up_acting_osds() will turn
// to up set if pg_temp is empty. so an empty pg_temp won't work.
static std::vector<int32_t> twiddle(const std::vector<int>& acting) {
  std::vector<int32_t> twiddled;
  if (acting.size() > 1) {
    // shrink the mapping to just the first osd
    twiddled.push_back(acting[0]);
  } else {
    // 0 or 1 osds: append a -1 placeholder so the mapping is non-empty and
    // differs from the real acting set
    twiddled.assign(acting.begin(), acting.end());
    twiddled.push_back(-1);
  }
  return twiddled;
}
4644
4645void OSD::resume_creating_pg()
4646{
4647 bool do_sub_pg_creates = false;
b32b8144 4648 bool have_pending_creates = false;
3efd9988
FG
4649 {
4650 const auto max_pgs_per_osd =
11fdf7f2
TL
4651 (cct->_conf.get_val<uint64_t>("mon_max_pg_per_osd") *
4652 cct->_conf.get_val<double>("osd_max_pg_per_osd_hard_ratio"));
4653 if (max_pgs_per_osd <= num_pgs) {
3efd9988
FG
4654 // this could happen if admin decreases this setting before a PG is removed
4655 return;
4656 }
11fdf7f2
TL
4657 unsigned spare_pgs = max_pgs_per_osd - num_pgs;
4658 std::lock_guard l(pending_creates_lock);
3efd9988 4659 if (pending_creates_from_mon > 0) {
11fdf7f2
TL
4660 dout(20) << __func__ << " pending_creates_from_mon "
4661 << pending_creates_from_mon << dendl;
3efd9988
FG
4662 do_sub_pg_creates = true;
4663 if (pending_creates_from_mon >= spare_pgs) {
4664 spare_pgs = pending_creates_from_mon = 0;
4665 } else {
4666 spare_pgs -= pending_creates_from_mon;
4667 pending_creates_from_mon = 0;
4668 }
4669 }
4670 auto pg = pending_creates_from_osd.cbegin();
4671 while (spare_pgs > 0 && pg != pending_creates_from_osd.cend()) {
94b18763 4672 dout(20) << __func__ << " pg " << pg->first << dendl;
3efd9988 4673 vector<int> acting;
b32b8144 4674 osdmap->pg_to_up_acting_osds(pg->first, nullptr, nullptr, &acting, nullptr);
94b18763 4675 service.queue_want_pg_temp(pg->first, twiddle(acting), true);
3efd9988 4676 pg = pending_creates_from_osd.erase(pg);
94b18763 4677 do_sub_pg_creates = true;
3efd9988
FG
4678 spare_pgs--;
4679 }
b32b8144
FG
4680 have_pending_creates = (pending_creates_from_mon > 0 ||
4681 !pending_creates_from_osd.empty());
3efd9988 4682 }
b32b8144
FG
4683
4684 bool do_renew_subs = false;
3efd9988
FG
4685 if (do_sub_pg_creates) {
4686 if (monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0)) {
4687 dout(4) << __func__ << ": resolicit pg creates from mon since "
4688 << last_pg_create_epoch << dendl;
b32b8144 4689 do_renew_subs = true;
3efd9988
FG
4690 }
4691 }
b32b8144
FG
4692 version_t start = osdmap->get_epoch() + 1;
4693 if (have_pending_creates) {
4694 // don't miss any new osdmap deleting PGs
4695 if (monc->sub_want("osdmap", start, 0)) {
4696 dout(4) << __func__ << ": resolicit osdmap from mon since "
4697 << start << dendl;
4698 do_renew_subs = true;
4699 }
94b18763 4700 } else if (do_sub_pg_creates) {
b32b8144
FG
4701 // no need to subscribe the osdmap continuously anymore
4702 // once the pgtemp and/or mon_subscribe(pg_creates) is sent
4703 if (monc->sub_want_increment("osdmap", start, CEPH_SUBSCRIBE_ONETIME)) {
11fdf7f2 4704 dout(4) << __func__ << ": re-subscribe osdmap(onetime) since "
b32b8144
FG
4705 << start << dendl;
4706 do_renew_subs = true;
4707 }
4708 }
4709
4710 if (do_renew_subs) {
4711 monc->renew_subs();
4712 }
4713
94b18763 4714 service.send_pg_temp();
3efd9988 4715}
7c673cae
FG
4716
4717void OSD::build_initial_pg_history(
4718 spg_t pgid,
4719 epoch_t created,
4720 utime_t created_stamp,
4721 pg_history_t *h,
4722 PastIntervals *pi)
4723{
4724 dout(10) << __func__ << " " << pgid << " created " << created << dendl;
4725 h->epoch_created = created;
31f18b77 4726 h->epoch_pool_created = created;
7c673cae
FG
4727 h->same_interval_since = created;
4728 h->same_up_since = created;
4729 h->same_primary_since = created;
4730 h->last_scrub_stamp = created_stamp;
4731 h->last_deep_scrub_stamp = created_stamp;
4732 h->last_clean_scrub_stamp = created_stamp;
4733
4734 OSDMapRef lastmap = service.get_map(created);
4735 int up_primary, acting_primary;
4736 vector<int> up, acting;
4737 lastmap->pg_to_up_acting_osds(
4738 pgid.pgid, &up, &up_primary, &acting, &acting_primary);
4739
4740 ostringstream debug;
4741 for (epoch_t e = created + 1; e <= osdmap->get_epoch(); ++e) {
4742 OSDMapRef osdmap = service.get_map(e);
4743 int new_up_primary, new_acting_primary;
4744 vector<int> new_up, new_acting;
4745 osdmap->pg_to_up_acting_osds(
4746 pgid.pgid, &new_up, &new_up_primary, &new_acting, &new_acting_primary);
4747
4748 // this is a bit imprecise, but sufficient?
4749 struct min_size_predicate_t : public IsPGRecoverablePredicate {
4750 const pg_pool_t *pi;
4751 bool operator()(const set<pg_shard_t> &have) const {
4752 return have.size() >= pi->min_size;
4753 }
11fdf7f2 4754 explicit min_size_predicate_t(const pg_pool_t *i) : pi(i) {}
7c673cae
FG
4755 } min_size_predicate(osdmap->get_pg_pool(pgid.pgid.pool()));
4756
4757 bool new_interval = PastIntervals::check_new_interval(
4758 acting_primary,
4759 new_acting_primary,
4760 acting, new_acting,
4761 up_primary,
4762 new_up_primary,
4763 up, new_up,
4764 h->same_interval_since,
4765 h->last_epoch_clean,
4766 osdmap,
4767 lastmap,
4768 pgid.pgid,
4769 &min_size_predicate,
4770 pi,
4771 &debug);
4772 if (new_interval) {
4773 h->same_interval_since = e;
181888fb
FG
4774 if (up != new_up) {
4775 h->same_up_since = e;
4776 }
4777 if (acting_primary != new_acting_primary) {
4778 h->same_primary_since = e;
4779 }
4780 if (pgid.pgid.is_split(lastmap->get_pg_num(pgid.pgid.pool()),
4781 osdmap->get_pg_num(pgid.pgid.pool()),
4782 nullptr)) {
4783 h->last_epoch_split = e;
4784 }
4785 up = new_up;
4786 acting = new_acting;
4787 up_primary = new_up_primary;
4788 acting_primary = new_acting_primary;
c07f9fc5 4789 }
7c673cae
FG
4790 lastmap = osdmap;
4791 }
4792 dout(20) << __func__ << " " << debug.str() << dendl;
4793 dout(10) << __func__ << " " << *h << " " << *pi
4794 << " [" << (pi->empty() ? pair<epoch_t,epoch_t>(0,0) :
4795 pi->get_bounds()) << ")"
4796 << dendl;
4797}
4798
7c673cae
FG
4799void OSD::_add_heartbeat_peer(int p)
4800{
4801 if (p == whoami)
4802 return;
4803 HeartbeatInfo *hi;
4804
4805 map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(p);
4806 if (i == heartbeat_peers.end()) {
4807 pair<ConnectionRef,ConnectionRef> cons = service.get_con_osd_hb(p, osdmap->get_epoch());
4808 if (!cons.first)
4809 return;
4810 hi = &heartbeat_peers[p];
4811 hi->peer = p;
11fdf7f2 4812 RefCountedPtr s{new HeartbeatSession{p}, false};
7c673cae 4813 hi->con_back = cons.first.get();
11fdf7f2 4814 hi->con_back->set_priv(s);
7c673cae
FG
4815 if (cons.second) {
4816 hi->con_front = cons.second.get();
11fdf7f2 4817 hi->con_front->set_priv(s);
7c673cae
FG
4818 dout(10) << "_add_heartbeat_peer: new peer osd." << p
4819 << " " << hi->con_back->get_peer_addr()
4820 << " " << hi->con_front->get_peer_addr()
4821 << dendl;
4822 } else {
4823 hi->con_front.reset(NULL);
4824 dout(10) << "_add_heartbeat_peer: new peer osd." << p
4825 << " " << hi->con_back->get_peer_addr()
4826 << dendl;
4827 }
7c673cae
FG
4828 } else {
4829 hi = &i->second;
4830 }
4831 hi->epoch = osdmap->get_epoch();
4832}
4833
4834void OSD::_remove_heartbeat_peer(int n)
4835{
4836 map<int,HeartbeatInfo>::iterator q = heartbeat_peers.find(n);
11fdf7f2 4837 ceph_assert(q != heartbeat_peers.end());
7c673cae
FG
4838 dout(20) << " removing heartbeat peer osd." << n
4839 << " " << q->second.con_back->get_peer_addr()
4840 << " " << (q->second.con_front ? q->second.con_front->get_peer_addr() : entity_addr_t())
4841 << dendl;
4842 q->second.con_back->mark_down();
4843 if (q->second.con_front) {
4844 q->second.con_front->mark_down();
4845 }
4846 heartbeat_peers.erase(q);
4847}
4848
4849void OSD::need_heartbeat_peer_update()
4850{
4851 if (is_stopping())
4852 return;
4853 dout(20) << "need_heartbeat_peer_update" << dendl;
4854 heartbeat_set_peers_need_update();
4855}
4856
4857void OSD::maybe_update_heartbeat_peers()
4858{
11fdf7f2 4859 ceph_assert(osd_lock.is_locked());
7c673cae 4860
11fdf7f2 4861 if (is_waiting_for_healthy() || is_active()) {
7c673cae
FG
4862 utime_t now = ceph_clock_now();
4863 if (last_heartbeat_resample == utime_t()) {
4864 last_heartbeat_resample = now;
4865 heartbeat_set_peers_need_update();
4866 } else if (!heartbeat_peers_need_update()) {
4867 utime_t dur = now - last_heartbeat_resample;
4868 if (dur > cct->_conf->osd_heartbeat_grace) {
4869 dout(10) << "maybe_update_heartbeat_peers forcing update after " << dur << " seconds" << dendl;
4870 heartbeat_set_peers_need_update();
4871 last_heartbeat_resample = now;
494da23a
TL
4872 // automatically clean up any stale heartbeat peers
4873 // if we are unhealthy, then clean all
4874 reset_heartbeat_peers(is_waiting_for_healthy());
7c673cae
FG
4875 }
4876 }
4877 }
4878
4879 if (!heartbeat_peers_need_update())
4880 return;
4881 heartbeat_clear_peers_need_update();
4882
11fdf7f2 4883 std::lock_guard l(heartbeat_lock);
7c673cae
FG
4884
4885 dout(10) << "maybe_update_heartbeat_peers updating" << dendl;
4886
4887
4888 // build heartbeat from set
4889 if (is_active()) {
11fdf7f2
TL
4890 vector<PGRef> pgs;
4891 _get_pgs(&pgs);
4892 for (auto& pg : pgs) {
4893 pg->with_heartbeat_peers([&](int peer) {
4894 if (osdmap->is_up(peer)) {
4895 _add_heartbeat_peer(peer);
4896 }
4897 });
7c673cae
FG
4898 }
4899 }
4900
4901 // include next and previous up osds to ensure we have a fully-connected set
4902 set<int> want, extras;
11fdf7f2 4903 const int next = osdmap->get_next_up_osd_after(whoami);
7c673cae
FG
4904 if (next >= 0)
4905 want.insert(next);
4906 int prev = osdmap->get_previous_up_osd_before(whoami);
4907 if (prev >= 0 && prev != next)
4908 want.insert(prev);
4909
11fdf7f2
TL
4910 // make sure we have at least **min_down** osds coming from different
4911 // subtree level (e.g., hosts) for fast failure detection.
4912 auto min_down = cct->_conf.get_val<uint64_t>("mon_osd_min_down_reporters");
4913 auto subtree = cct->_conf.get_val<string>("mon_osd_reporter_subtree_level");
4914 osdmap->get_random_up_osds_by_subtree(
4915 whoami, subtree, min_down, want, &want);
4916
7c673cae
FG
4917 for (set<int>::iterator p = want.begin(); p != want.end(); ++p) {
4918 dout(10) << " adding neighbor peer osd." << *p << dendl;
4919 extras.insert(*p);
4920 _add_heartbeat_peer(*p);
4921 }
4922
4923 // remove down peers; enumerate extras
4924 map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
4925 while (p != heartbeat_peers.end()) {
4926 if (!osdmap->is_up(p->first)) {
4927 int o = p->first;
4928 ++p;
4929 _remove_heartbeat_peer(o);
4930 continue;
4931 }
4932 if (p->second.epoch < osdmap->get_epoch()) {
4933 extras.insert(p->first);
4934 }
4935 ++p;
4936 }
4937
4938 // too few?
11fdf7f2 4939 for (int n = next; n >= 0; ) {
7c673cae
FG
4940 if ((int)heartbeat_peers.size() >= cct->_conf->osd_heartbeat_min_peers)
4941 break;
4942 if (!extras.count(n) && !want.count(n) && n != whoami) {
4943 dout(10) << " adding random peer osd." << n << dendl;
4944 extras.insert(n);
4945 _add_heartbeat_peer(n);
4946 }
4947 n = osdmap->get_next_up_osd_after(n);
11fdf7f2 4948 if (n == next)
7c673cae
FG
4949 break; // came full circle; stop
4950 }
4951
4952 // too many?
4953 for (set<int>::iterator p = extras.begin();
4954 (int)heartbeat_peers.size() > cct->_conf->osd_heartbeat_min_peers && p != extras.end();
4955 ++p) {
4956 if (want.count(*p))
4957 continue;
4958 _remove_heartbeat_peer(*p);
4959 }
4960
4961 dout(10) << "maybe_update_heartbeat_peers " << heartbeat_peers.size() << " peers, extras " << extras << dendl;
4962}
4963
494da23a 4964void OSD::reset_heartbeat_peers(bool all)
7c673cae 4965{
11fdf7f2 4966 ceph_assert(osd_lock.is_locked());
7c673cae 4967 dout(10) << "reset_heartbeat_peers" << dendl;
494da23a
TL
4968 utime_t stale = ceph_clock_now();
4969 stale -= cct->_conf.get_val<int64_t>("osd_heartbeat_stale");
11fdf7f2 4970 std::lock_guard l(heartbeat_lock);
494da23a
TL
4971 for (auto it = heartbeat_peers.begin(); it != heartbeat_peers.end();) {
4972 HeartbeatInfo& hi = it->second;
4973 if (all || hi.is_stale(stale)) {
4974 hi.con_back->mark_down();
4975 if (hi.con_front) {
4976 hi.con_front->mark_down();
4977 }
4978 // stop sending failure_report to mon too
4979 failure_queue.erase(it->first);
4980 heartbeat_peers.erase(it++);
4981 } else {
4982 it++;
7c673cae 4983 }
7c673cae 4984 }
7c673cae
FG
4985}
4986
4987void OSD::handle_osd_ping(MOSDPing *m)
4988{
4989 if (superblock.cluster_fsid != m->fsid) {
4990 dout(20) << "handle_osd_ping from " << m->get_source_inst()
4991 << " bad fsid " << m->fsid << " != " << superblock.cluster_fsid << dendl;
4992 m->put();
4993 return;
4994 }
4995
4996 int from = m->get_source().num();
4997
4998 heartbeat_lock.Lock();
4999 if (is_stopping()) {
5000 heartbeat_lock.Unlock();
5001 m->put();
5002 return;
5003 }
5004
5005 OSDMapRef curmap = service.get_osdmap();
c07f9fc5
FG
5006 if (!curmap) {
5007 heartbeat_lock.Unlock();
5008 m->put();
5009 return;
5010 }
7c673cae
FG
5011
5012 switch (m->op) {
5013
5014 case MOSDPing::PING:
5015 {
5016 if (cct->_conf->osd_debug_drop_ping_probability > 0) {
5017 auto heartbeat_drop = debug_heartbeat_drops_remaining.find(from);
5018 if (heartbeat_drop != debug_heartbeat_drops_remaining.end()) {
5019 if (heartbeat_drop->second == 0) {
5020 debug_heartbeat_drops_remaining.erase(heartbeat_drop);
5021 } else {
5022 --heartbeat_drop->second;
5023 dout(5) << "Dropping heartbeat from " << from
5024 << ", " << heartbeat_drop->second
5025 << " remaining to drop" << dendl;
5026 break;
5027 }
5028 } else if (cct->_conf->osd_debug_drop_ping_probability >
5029 ((((double)(rand()%100))/100.0))) {
5030 heartbeat_drop =
5031 debug_heartbeat_drops_remaining.insert(std::make_pair(from,
5032 cct->_conf->osd_debug_drop_ping_duration)).first;
5033 dout(5) << "Dropping heartbeat from " << from
5034 << ", " << heartbeat_drop->second
5035 << " remaining to drop" << dendl;
5036 break;
5037 }
5038 }
5039
5040 if (!cct->get_heartbeat_map()->is_healthy()) {
5041 dout(10) << "internal heartbeat not healthy, dropping ping request" << dendl;
5042 break;
5043 }
5044
5045 Message *r = new MOSDPing(monc->get_fsid(),
5046 curmap->get_epoch(),
31f18b77
FG
5047 MOSDPing::PING_REPLY, m->stamp,
5048 cct->_conf->osd_heartbeat_min_size);
7c673cae
FG
5049 m->get_connection()->send_message(r);
5050
5051 if (curmap->is_up(from)) {
5052 service.note_peer_epoch(from, m->map_epoch);
5053 if (is_active()) {
5054 ConnectionRef con = service.get_con_osd_cluster(from, curmap->get_epoch());
5055 if (con) {
5056 service.share_map_peer(from, con.get());
5057 }
5058 }
5059 } else if (!curmap->exists(from) ||
5060 curmap->get_down_at(from) > m->map_epoch) {
5061 // tell them they have died
5062 Message *r = new MOSDPing(monc->get_fsid(),
5063 curmap->get_epoch(),
5064 MOSDPing::YOU_DIED,
31f18b77
FG
5065 m->stamp,
5066 cct->_conf->osd_heartbeat_min_size);
7c673cae
FG
5067 m->get_connection()->send_message(r);
5068 }
5069 }
5070 break;
5071
5072 case MOSDPing::PING_REPLY:
5073 {
5074 map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(from);
5075 if (i != heartbeat_peers.end()) {
11fdf7f2
TL
5076 auto acked = i->second.ping_history.find(m->stamp);
5077 if (acked != i->second.ping_history.end()) {
5078 utime_t now = ceph_clock_now();
5079 int &unacknowledged = acked->second.second;
5080 if (m->get_connection() == i->second.con_back) {
5081 dout(25) << "handle_osd_ping got reply from osd." << from
5082 << " first_tx " << i->second.first_tx
5083 << " last_tx " << i->second.last_tx
5084 << " last_rx_back " << i->second.last_rx_back << " -> " << now
5085 << " last_rx_front " << i->second.last_rx_front
5086 << dendl;
5087 i->second.last_rx_back = now;
5088 ceph_assert(unacknowledged > 0);
5089 --unacknowledged;
5090 // if there is no front con, set both stamps.
5091 if (i->second.con_front == NULL) {
5092 i->second.last_rx_front = now;
5093 ceph_assert(unacknowledged > 0);
5094 --unacknowledged;
5095 }
5096 } else if (m->get_connection() == i->second.con_front) {
5097 dout(25) << "handle_osd_ping got reply from osd." << from
5098 << " first_tx " << i->second.first_tx
5099 << " last_tx " << i->second.last_tx
5100 << " last_rx_back " << i->second.last_rx_back
5101 << " last_rx_front " << i->second.last_rx_front << " -> " << now
5102 << dendl;
5103 i->second.last_rx_front = now;
5104 ceph_assert(unacknowledged > 0);
5105 --unacknowledged;
5106 }
7c673cae 5107
11fdf7f2
TL
5108 if (unacknowledged == 0) {
5109 // succeeded in getting all replies
5110 dout(25) << "handle_osd_ping got all replies from osd." << from
5111 << " , erase pending ping(sent at " << m->stamp << ")"
5112 << " and older pending ping(s)"
5113 << dendl;
5114 i->second.ping_history.erase(i->second.ping_history.begin(), ++acked);
7c673cae
FG
5115 }
5116
11fdf7f2
TL
5117 if (i->second.is_healthy(now)) {
5118 // Cancel false reports
5119 auto failure_queue_entry = failure_queue.find(from);
5120 if (failure_queue_entry != failure_queue.end()) {
5121 dout(10) << "handle_osd_ping canceling queued "
5122 << "failure report for osd." << from << dendl;
5123 failure_queue.erase(failure_queue_entry);
5124 }
5125
5126 auto failure_pending_entry = failure_pending.find(from);
5127 if (failure_pending_entry != failure_pending.end()) {
5128 dout(10) << "handle_osd_ping canceling in-flight "
5129 << "failure report for osd." << from << dendl;
5130 send_still_alive(curmap->get_epoch(),
5131 from,
5132 failure_pending_entry->second.second);
5133 failure_pending.erase(failure_pending_entry);
5134 }
7c673cae 5135 }
11fdf7f2
TL
5136 } else {
5137 // old replies, deprecated by newly sent pings.
5138 dout(10) << "handle_osd_ping no pending ping(sent at " << m->stamp
5139 << ") is found, treat as covered by newly sent pings "
5140 << "and ignore"
5141 << dendl;
7c673cae
FG
5142 }
5143 }
5144
5145 if (m->map_epoch &&
5146 curmap->is_up(from)) {
5147 service.note_peer_epoch(from, m->map_epoch);
5148 if (is_active()) {
5149 ConnectionRef con = service.get_con_osd_cluster(from, curmap->get_epoch());
5150 if (con) {
5151 service.share_map_peer(from, con.get());
5152 }
5153 }
5154 }
5155 }
5156 break;
5157
5158 case MOSDPing::YOU_DIED:
5159 dout(10) << "handle_osd_ping " << m->get_source_inst()
5160 << " says i am down in " << m->map_epoch << dendl;
5161 osdmap_subscribe(curmap->get_epoch()+1, false);
5162 break;
5163 }
5164
5165 heartbeat_lock.Unlock();
5166 m->put();
5167}
5168
5169void OSD::heartbeat_entry()
5170{
11fdf7f2 5171 std::lock_guard l(heartbeat_lock);
7c673cae
FG
5172 if (is_stopping())
5173 return;
5174 while (!heartbeat_stop) {
5175 heartbeat();
5176
5177 double wait = .5 + ((float)(rand() % 10)/10.0) * (float)cct->_conf->osd_heartbeat_interval;
5178 utime_t w;
5179 w.set_from_double(wait);
5180 dout(30) << "heartbeat_entry sleeping for " << wait << dendl;
5181 heartbeat_cond.WaitInterval(heartbeat_lock, w);
5182 if (is_stopping())
5183 return;
5184 dout(30) << "heartbeat_entry woke up" << dendl;
5185 }
5186}
5187
5188void OSD::heartbeat_check()
5189{
11fdf7f2 5190 ceph_assert(heartbeat_lock.is_locked());
7c673cae
FG
5191 utime_t now = ceph_clock_now();
5192
11fdf7f2 5193 // check for incoming heartbeats (move me elsewhere?)
7c673cae
FG
5194 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
5195 p != heartbeat_peers.end();
5196 ++p) {
5197
5198 if (p->second.first_tx == utime_t()) {
5199 dout(25) << "heartbeat_check we haven't sent ping to osd." << p->first
11fdf7f2 5200 << " yet, skipping" << dendl;
7c673cae
FG
5201 continue;
5202 }
5203
5204 dout(25) << "heartbeat_check osd." << p->first
5205 << " first_tx " << p->second.first_tx
5206 << " last_tx " << p->second.last_tx
5207 << " last_rx_back " << p->second.last_rx_back
5208 << " last_rx_front " << p->second.last_rx_front
5209 << dendl;
11fdf7f2
TL
5210 if (p->second.is_unhealthy(now)) {
5211 utime_t oldest_deadline = p->second.ping_history.begin()->second.first;
7c673cae
FG
5212 if (p->second.last_rx_back == utime_t() ||
5213 p->second.last_rx_front == utime_t()) {
11fdf7f2
TL
5214 derr << "heartbeat_check: no reply from "
5215 << p->second.con_front->get_peer_addr().get_sockaddr()
5216 << " osd." << p->first
5217 << " ever on either front or back, first ping sent "
5218 << p->second.first_tx
5219 << " (oldest deadline " << oldest_deadline << ")"
5220 << dendl;
7c673cae 5221 // fail
11fdf7f2 5222 failure_queue[p->first] = p->second.first_tx;
7c673cae 5223 } else {
11fdf7f2
TL
5224 derr << "heartbeat_check: no reply from "
5225 << p->second.con_front->get_peer_addr().get_sockaddr()
7c673cae
FG
5226 << " osd." << p->first << " since back " << p->second.last_rx_back
5227 << " front " << p->second.last_rx_front
11fdf7f2
TL
5228 << " (oldest deadline " << oldest_deadline << ")"
5229 << dendl;
7c673cae 5230 // fail
11fdf7f2 5231 failure_queue[p->first] = std::min(p->second.last_rx_back, p->second.last_rx_front);
7c673cae
FG
5232 }
5233 }
5234 }
5235}
5236
5237void OSD::heartbeat()
5238{
81eedcae 5239 ceph_assert(heartbeat_lock.is_locked_by_me());
7c673cae
FG
5240 dout(30) << "heartbeat" << dendl;
5241
5242 // get CPU load avg
5243 double loadavgs[1];
11fdf7f2
TL
5244 int hb_interval = cct->_conf->osd_heartbeat_interval;
5245 int n_samples = 86400;
5246 if (hb_interval > 1) {
5247 n_samples /= hb_interval;
5248 if (n_samples < 1)
5249 n_samples = 1;
5250 }
5251
7c673cae
FG
5252 if (getloadavg(loadavgs, 1) == 1) {
5253 logger->set(l_osd_loadavg, 100 * loadavgs[0]);
5254 daily_loadavg = (daily_loadavg * (n_samples - 1) + loadavgs[0]) / n_samples;
5255 dout(30) << "heartbeat: daily_loadavg " << daily_loadavg << dendl;
5256 }
5257
5258 dout(30) << "heartbeat checking stats" << dendl;
5259
11fdf7f2 5260 // refresh peer list and osd stats
7c673cae
FG
5261 vector<int> hb_peers;
5262 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
5263 p != heartbeat_peers.end();
5264 ++p)
5265 hb_peers.push_back(p->first);
7c673cae 5266
11fdf7f2
TL
5267 auto new_stat = service.set_osd_stat(hb_peers, get_num_pgs());
5268 dout(5) << __func__ << " " << new_stat << dendl;
5269 ceph_assert(new_stat.statfs.total);
5270
5271 float pratio;
5272 float ratio = service.compute_adjusted_ratio(new_stat, &pratio);
5273
5274 service.check_full_status(ratio, pratio);
7c673cae
FG
5275
5276 utime_t now = ceph_clock_now();
11fdf7f2
TL
5277 utime_t deadline = now;
5278 deadline += cct->_conf->osd_heartbeat_grace;
7c673cae
FG
5279
5280 // send heartbeats
5281 for (map<int,HeartbeatInfo>::iterator i = heartbeat_peers.begin();
5282 i != heartbeat_peers.end();
5283 ++i) {
5284 int peer = i->first;
5285 i->second.last_tx = now;
5286 if (i->second.first_tx == utime_t())
5287 i->second.first_tx = now;
11fdf7f2
TL
5288 i->second.ping_history[now] = make_pair(deadline,
5289 HeartbeatInfo::HEARTBEAT_MAX_CONN);
7c673cae
FG
5290 dout(30) << "heartbeat sending ping to osd." << peer << dendl;
5291 i->second.con_back->send_message(new MOSDPing(monc->get_fsid(),
11fdf7f2 5292 service.get_osdmap_epoch(),
31f18b77
FG
5293 MOSDPing::PING, now,
5294 cct->_conf->osd_heartbeat_min_size));
7c673cae
FG
5295
5296 if (i->second.con_front)
5297 i->second.con_front->send_message(new MOSDPing(monc->get_fsid(),
11fdf7f2 5298 service.get_osdmap_epoch(),
31f18b77
FG
5299 MOSDPing::PING, now,
5300 cct->_conf->osd_heartbeat_min_size));
7c673cae
FG
5301 }
5302
5303 logger->set(l_osd_hb_to, heartbeat_peers.size());
5304
5305 // hmm.. am i all alone?
5306 dout(30) << "heartbeat lonely?" << dendl;
5307 if (heartbeat_peers.empty()) {
5308 if (now - last_mon_heartbeat > cct->_conf->osd_mon_heartbeat_interval && is_active()) {
5309 last_mon_heartbeat = now;
5310 dout(10) << "i have no heartbeat peers; checking mon for new map" << dendl;
5311 osdmap_subscribe(osdmap->get_epoch() + 1, false);
5312 }
5313 }
5314
5315 dout(30) << "heartbeat done" << dendl;
5316}
5317
5318bool OSD::heartbeat_reset(Connection *con)
5319{
11fdf7f2
TL
5320 std::lock_guard l(heartbeat_lock);
5321 auto s = con->get_priv();
5322 con->set_priv(nullptr);
7c673cae 5323 if (s) {
7c673cae 5324 if (is_stopping()) {
7c673cae
FG
5325 return true;
5326 }
11fdf7f2
TL
5327 auto heartbeat_session = static_cast<HeartbeatSession*>(s.get());
5328 auto p = heartbeat_peers.find(heartbeat_session->peer);
7c673cae
FG
5329 if (p != heartbeat_peers.end() &&
5330 (p->second.con_back == con ||
5331 p->second.con_front == con)) {
5332 dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer
5333 << ", reopening" << dendl;
5334 if (con != p->second.con_back) {
5335 p->second.con_back->mark_down();
5336 }
5337 p->second.con_back.reset(NULL);
5338 if (p->second.con_front && con != p->second.con_front) {
5339 p->second.con_front->mark_down();
5340 }
5341 p->second.con_front.reset(NULL);
5342 pair<ConnectionRef,ConnectionRef> newcon = service.get_con_osd_hb(p->second.peer, p->second.epoch);
5343 if (newcon.first) {
5344 p->second.con_back = newcon.first.get();
11fdf7f2 5345 p->second.con_back->set_priv(s);
7c673cae
FG
5346 if (newcon.second) {
5347 p->second.con_front = newcon.second.get();
11fdf7f2 5348 p->second.con_front->set_priv(s);
7c673cae 5349 }
11fdf7f2 5350 p->second.ping_history.clear();
7c673cae
FG
5351 } else {
5352 dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer
5353 << ", raced with osdmap update, closing out peer" << dendl;
5354 heartbeat_peers.erase(p);
5355 }
5356 } else {
5357 dout(10) << "heartbeat_reset closing (old) failed hb con " << con << dendl;
5358 }
7c673cae
FG
5359 }
5360 return true;
5361}
5362
5363
5364
5365// =========================================
5366
5367void OSD::tick()
5368{
11fdf7f2 5369 ceph_assert(osd_lock.is_locked());
7c673cae
FG
5370 dout(10) << "tick" << dendl;
5371
5372 if (is_active() || is_waiting_for_healthy()) {
5373 maybe_update_heartbeat_peers();
5374 }
5375
5376 if (is_waiting_for_healthy()) {
5377 start_boot();
494da23a
TL
5378 }
5379
5380 if (is_waiting_for_healthy() || is_booting()) {
5381 std::lock_guard l(heartbeat_lock);
5382 utime_t now = ceph_clock_now();
5383 if (now - last_mon_heartbeat > cct->_conf->osd_mon_heartbeat_interval) {
5384 last_mon_heartbeat = now;
5385 dout(1) << __func__ << " checking mon for new map" << dendl;
5386 osdmap_subscribe(osdmap->get_epoch() + 1, false);
11fdf7f2 5387 }
7c673cae
FG
5388 }
5389
5390 do_waiters();
5391
91327a77 5392 tick_timer.add_event_after(get_tick_interval(), new C_Tick(this));
7c673cae
FG
5393}
5394
5395void OSD::tick_without_osd_lock()
5396{
11fdf7f2 5397 ceph_assert(tick_timer_lock.is_locked());
7c673cae
FG
5398 dout(10) << "tick_without_osd_lock" << dendl;
5399
7c673cae
FG
5400 logger->set(l_osd_cached_crc, buffer::get_cached_crc());
5401 logger->set(l_osd_cached_crc_adjusted, buffer::get_cached_crc_adjusted());
5402 logger->set(l_osd_missed_crc, buffer::get_missed_crc());
11fdf7f2
TL
5403
5404 // refresh osd stats
5405 struct store_statfs_t stbuf;
5406 osd_alert_list_t alerts;
5407 int r = store->statfs(&stbuf, &alerts);
5408 ceph_assert(r == 0);
5409 service.set_statfs(stbuf, alerts);
7c673cae
FG
5410
5411 // osd_lock is not being held, which means the OSD state
5412 // might change when doing the monitor report
5413 if (is_active() || is_waiting_for_healthy()) {
5414 heartbeat_lock.Lock();
5415 heartbeat_check();
5416 heartbeat_lock.Unlock();
5417
5418 map_lock.get_read();
11fdf7f2 5419 std::lock_guard l(mon_report_lock);
7c673cae
FG
5420
5421 // mon report?
7c673cae 5422 utime_t now = ceph_clock_now();
11fdf7f2
TL
5423 if (service.need_fullness_update() ||
5424 now - last_mon_report > cct->_conf->osd_mon_report_interval) {
7c673cae 5425 last_mon_report = now;
7c673cae
FG
5426 send_full_update();
5427 send_failures();
7c673cae
FG
5428 }
5429 map_lock.put_read();
11fdf7f2
TL
5430
5431 epoch_t max_waiting_epoch = 0;
5432 for (auto s : shards) {
5433 max_waiting_epoch = std::max(max_waiting_epoch,
5434 s->get_max_waiting_epoch());
5435 }
5436 if (max_waiting_epoch > get_osdmap()->get_epoch()) {
5437 dout(20) << __func__ << " max_waiting_epoch " << max_waiting_epoch
5438 << ", requesting new map" << dendl;
5439 osdmap_subscribe(superblock.newest_map + 1, false);
5440 }
7c673cae
FG
5441 }
5442
5443 if (is_active()) {
5444 if (!scrub_random_backoff()) {
5445 sched_scrub();
5446 }
5447 service.promote_throttle_recalibrate();
3efd9988 5448 resume_creating_pg();
224ce89b
WB
5449 bool need_send_beacon = false;
5450 const auto now = ceph::coarse_mono_clock::now();
5451 {
5452 // borrow lec lock to pretect last_sent_beacon from changing
11fdf7f2 5453 std::lock_guard l{min_last_epoch_clean_lock};
224ce89b
WB
5454 const auto elapsed = now - last_sent_beacon;
5455 if (chrono::duration_cast<chrono::seconds>(elapsed).count() >
5456 cct->_conf->osd_beacon_report_interval) {
5457 need_send_beacon = true;
5458 }
5459 }
5460 if (need_send_beacon) {
5461 send_beacon(now);
5462 }
7c673cae
FG
5463 }
5464
11fdf7f2 5465 mgrc.update_daemon_health(get_health_metrics());
7c673cae 5466 service.kick_recovery_queue();
91327a77
AA
5467 tick_timer_without_osd_lock.add_event_after(get_tick_interval(),
5468 new C_Tick_WithoutOSDLock(this));
7c673cae
FG
5469}
5470
7c673cae
FG
5471// Usage:
5472// setomapval <pool-id> [namespace/]<obj-name> <key> <val>
5473// rmomapkey <pool-id> [namespace/]<obj-name> <key>
5474// setomapheader <pool-id> [namespace/]<obj-name> <header>
5475// getomap <pool> [namespace/]<obj-name>
5476// truncobj <pool-id> [namespace/]<obj-name> <newlen>
5477// injectmdataerr [namespace/]<obj-name> [shardid]
5478// injectdataerr [namespace/]<obj-name> [shardid]
5479//
5480// set_recovery_delay [utime]
5481void TestOpsSocketHook::test_ops(OSDService *service, ObjectStore *store,
11fdf7f2
TL
5482 std::string_view command,
5483 const cmdmap_t& cmdmap, ostream &ss)
7c673cae
FG
5484{
5485 //Test support
5486 //Support changing the omap on a single osd by using the Admin Socket to
5487 //directly request the osd make a change.
5488 if (command == "setomapval" || command == "rmomapkey" ||
5489 command == "setomapheader" || command == "getomap" ||
5490 command == "truncobj" || command == "injectmdataerr" ||
5491 command == "injectdataerr"
5492 ) {
5493 pg_t rawpg;
5494 int64_t pool;
5495 OSDMapRef curmap = service->get_osdmap();
5496 int r = -1;
5497
5498 string poolstr;
5499
5500 cmd_getval(service->cct, cmdmap, "pool", poolstr);
5501 pool = curmap->lookup_pg_pool_name(poolstr);
5502 //If we can't find it by name then maybe id specified
5503 if (pool < 0 && isdigit(poolstr[0]))
5504 pool = atoll(poolstr.c_str());
5505 if (pool < 0) {
b5b8bbf5 5506 ss << "Invalid pool '" << poolstr << "''";
7c673cae
FG
5507 return;
5508 }
5509
5510 string objname, nspace;
5511 cmd_getval(service->cct, cmdmap, "objname", objname);
5512 std::size_t found = objname.find_first_of('/');
5513 if (found != string::npos) {
5514 nspace = objname.substr(0, found);
5515 objname = objname.substr(found+1);
5516 }
5517 object_locator_t oloc(pool, nspace);
5518 r = curmap->object_locator_to_pg(object_t(objname), oloc, rawpg);
5519
5520 if (r < 0) {
5521 ss << "Invalid namespace/objname";
5522 return;
5523 }
5524
5525 int64_t shardid;
5526 cmd_getval(service->cct, cmdmap, "shardid", shardid, int64_t(shard_id_t::NO_SHARD));
5527 hobject_t obj(object_t(objname), string(""), CEPH_NOSNAP, rawpg.ps(), pool, nspace);
5528 ghobject_t gobj(obj, ghobject_t::NO_GEN, shard_id_t(uint8_t(shardid)));
5529 spg_t pgid(curmap->raw_pg_to_pg(rawpg), shard_id_t(shardid));
5530 if (curmap->pg_is_ec(rawpg)) {
5531 if ((command != "injectdataerr") && (command != "injectmdataerr")) {
5532 ss << "Must not call on ec pool, except injectdataerr or injectmdataerr";
5533 return;
5534 }
5535 }
5536
5537 ObjectStore::Transaction t;
5538
5539 if (command == "setomapval") {
5540 map<string, bufferlist> newattrs;
5541 bufferlist val;
5542 string key, valstr;
5543 cmd_getval(service->cct, cmdmap, "key", key);
5544 cmd_getval(service->cct, cmdmap, "val", valstr);
5545
5546 val.append(valstr);
5547 newattrs[key] = val;
5548 t.omap_setkeys(coll_t(pgid), ghobject_t(obj), newattrs);
11fdf7f2 5549 r = store->queue_transaction(service->meta_ch, std::move(t));
7c673cae
FG
5550 if (r < 0)
5551 ss << "error=" << r;
5552 else
5553 ss << "ok";
5554 } else if (command == "rmomapkey") {
5555 string key;
5556 set<string> keys;
5557 cmd_getval(service->cct, cmdmap, "key", key);
5558
5559 keys.insert(key);
5560 t.omap_rmkeys(coll_t(pgid), ghobject_t(obj), keys);
11fdf7f2 5561 r = store->queue_transaction(service->meta_ch, std::move(t));
7c673cae
FG
5562 if (r < 0)
5563 ss << "error=" << r;
5564 else
5565 ss << "ok";
5566 } else if (command == "setomapheader") {
5567 bufferlist newheader;
5568 string headerstr;
5569
5570 cmd_getval(service->cct, cmdmap, "header", headerstr);
5571 newheader.append(headerstr);
5572 t.omap_setheader(coll_t(pgid), ghobject_t(obj), newheader);
11fdf7f2 5573 r = store->queue_transaction(service->meta_ch, std::move(t));
7c673cae
FG
5574 if (r < 0)
5575 ss << "error=" << r;
5576 else
5577 ss << "ok";
5578 } else if (command == "getomap") {
5579 //Debug: Output entire omap
5580 bufferlist hdrbl;
5581 map<string, bufferlist> keyvals;
11fdf7f2
TL
5582 auto ch = store->open_collection(coll_t(pgid));
5583 if (!ch) {
5584 ss << "unable to open collection for " << pgid;
5585 r = -ENOENT;
5586 } else {
5587 r = store->omap_get(ch, ghobject_t(obj), &hdrbl, &keyvals);
5588 if (r >= 0) {
7c673cae
FG
5589 ss << "header=" << string(hdrbl.c_str(), hdrbl.length());
5590 for (map<string, bufferlist>::iterator it = keyvals.begin();
11fdf7f2 5591 it != keyvals.end(); ++it)
7c673cae
FG
5592 ss << " key=" << (*it).first << " val="
5593 << string((*it).second.c_str(), (*it).second.length());
11fdf7f2 5594 } else {
7c673cae 5595 ss << "error=" << r;
11fdf7f2 5596 }
7c673cae
FG
5597 }
5598 } else if (command == "truncobj") {
5599 int64_t trunclen;
5600 cmd_getval(service->cct, cmdmap, "len", trunclen);
5601 t.truncate(coll_t(pgid), ghobject_t(obj), trunclen);
11fdf7f2 5602 r = store->queue_transaction(service->meta_ch, std::move(t));
7c673cae
FG
5603 if (r < 0)
5604 ss << "error=" << r;
5605 else
5606 ss << "ok";
5607 } else if (command == "injectdataerr") {
5608 store->inject_data_error(gobj);
5609 ss << "ok";
5610 } else if (command == "injectmdataerr") {
5611 store->inject_mdata_error(gobj);
5612 ss << "ok";
5613 }
5614 return;
5615 }
5616 if (command == "set_recovery_delay") {
5617 int64_t delay;
5618 cmd_getval(service->cct, cmdmap, "utime", delay, (int64_t)0);
5619 ostringstream oss;
5620 oss << delay;
11fdf7f2 5621 int r = service->cct->_conf.set_val("osd_recovery_delay_start",
7c673cae
FG
5622 oss.str().c_str());
5623 if (r != 0) {
5624 ss << "set_recovery_delay: error setting "
5625 << "osd_recovery_delay_start to '" << delay << "': error "
5626 << r;
5627 return;
5628 }
11fdf7f2 5629 service->cct->_conf.apply_changes(nullptr);
7c673cae
FG
5630 ss << "set_recovery_delay: set osd_recovery_delay_start "
5631 << "to " << service->cct->_conf->osd_recovery_delay_start;
5632 return;
5633 }
a8e16298 5634 if (command == "trigger_scrub" || command == "trigger_deep_scrub") {
7c673cae 5635 spg_t pgid;
a8e16298 5636 bool deep = (command == "trigger_deep_scrub");
7c673cae
FG
5637 OSDMapRef curmap = service->get_osdmap();
5638
5639 string pgidstr;
5640
5641 cmd_getval(service->cct, cmdmap, "pgid", pgidstr);
5642 if (!pgid.parse(pgidstr.c_str())) {
5643 ss << "Invalid pgid specified";
5644 return;
5645 }
5646
a8e16298
TL
5647 int64_t time;
5648 cmd_getval(service->cct, cmdmap, "time", time, (int64_t)0);
5649
11fdf7f2 5650 PGRef pg = service->osd->_lookup_lock_pg(pgid);
7c673cae
FG
5651 if (pg == nullptr) {
5652 ss << "Can't find pg " << pgid;
5653 return;
5654 }
5655
5656 if (pg->is_primary()) {
5657 pg->unreg_next_scrub();
5658 const pg_pool_t *p = curmap->get_pg_pool(pgid.pool());
5659 double pool_scrub_max_interval = 0;
a8e16298
TL
5660 double scrub_max_interval;
5661 if (deep) {
5662 p->opts.get(pool_opts_t::DEEP_SCRUB_INTERVAL, &pool_scrub_max_interval);
5663 scrub_max_interval = pool_scrub_max_interval > 0 ?
11fdf7f2 5664 pool_scrub_max_interval : g_conf()->osd_deep_scrub_interval;
a8e16298
TL
5665 } else {
5666 p->opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &pool_scrub_max_interval);
5667 scrub_max_interval = pool_scrub_max_interval > 0 ?
11fdf7f2 5668 pool_scrub_max_interval : g_conf()->osd_scrub_max_interval;
a8e16298 5669 }
7c673cae
FG
5670 // Instead of marking must_scrub force a schedule scrub
5671 utime_t stamp = ceph_clock_now();
a8e16298
TL
5672 if (time == 0)
5673 stamp -= scrub_max_interval;
5674 else
5675 stamp -= (float)time;
5676 stamp -= 100.0; // push back last scrub more for good measure
5677 if (deep) {
5678 pg->set_last_deep_scrub_stamp(stamp);
5679 } else {
5680 pg->set_last_scrub_stamp(stamp);
5681 }
7c673cae 5682 pg->reg_next_scrub();
a8e16298
TL
5683 pg->publish_stats_to_osd();
5684 ss << "ok - set" << (deep ? " deep" : "" ) << " stamp " << stamp;
7c673cae
FG
5685 } else {
5686 ss << "Not primary";
5687 }
5688 pg->unlock();
5689 return;
5690 }
5691 if (command == "injectfull") {
5692 int64_t count;
5693 string type;
5694 OSDService::s_names state;
5695 cmd_getval(service->cct, cmdmap, "type", type, string("full"));
5696 cmd_getval(service->cct, cmdmap, "count", count, (int64_t)-1);
5697 if (type == "none" || count == 0) {
5698 type = "none";
5699 count = 0;
5700 }
5701 state = service->get_full_state(type);
5702 if (state == OSDService::s_names::INVALID) {
5703 ss << "Invalid type use (none, nearfull, backfillfull, full, failsafe)";
5704 return;
5705 }
5706 service->set_injectfull(state, count);
5707 return;
5708 }
5709 ss << "Internal error - command=" << command;
5710}
5711
7c673cae
FG
5712// =========================================
5713
// Messenger callback: a connection we initiated is (re)established.
// Only the mon connection matters here: a fresh mon session means every
// piece of un-acked state (fullness, up_thru, pg_temp, failures, ...)
// must be re-sent, since the mon may have dropped what we sent before.
void OSD::ms_handle_connect(Connection *con)
{
  dout(10) << __func__ << " con " << con << dendl;
  if (con->get_peer_type() == CEPH_ENTITY_TYPE_MON) {
    std::lock_guard l(osd_lock);
    if (is_stopping())
      return;
    dout(10) << __func__ << " on mon" << dendl;

    if (is_preboot()) {
      // restart the version probe / boot handshake
      start_boot();
    } else if (is_booting()) {
      _send_boot(); // resend boot message
    } else {
      // active (or later): replay all mon-bound state under map_lock so the
      // osdmap cannot change underneath us; mon_report_lock serializes with
      // the tick-driven reporters.
      map_lock.get_read();
      std::lock_guard l2(mon_report_lock);

      utime_t now = ceph_clock_now();
      last_mon_report = now;

      // resend everything, it's a new session
      send_full_update();
      send_alive();
      service.requeue_pg_temp();
      service.clear_sent_ready_to_merge();
      service.send_pg_temp();
      service.send_ready_to_merge();
      service.send_pg_created();
      requeue_failures();
      send_failures();

      map_lock.put_read();
      if (is_active()) {
	send_beacon(ceph::coarse_mono_clock::now());
      }
    }

    // full map requests may happen while active or pre-boot
    if (requested_full_first) {
      rerequest_full_maps();
    }
  }
}
5757
5758void OSD::ms_handle_fast_connect(Connection *con)
5759{
5760 if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON &&
5761 con->get_peer_type() != CEPH_ENTITY_TYPE_MGR) {
11fdf7f2
TL
5762 auto priv = con->get_priv();
5763 auto s = static_cast<Session*>(priv.get());
7c673cae 5764 if (!s) {
11fdf7f2
TL
5765 s = new Session{cct, con};
5766 con->set_priv(RefCountedPtr{s, false});
7c673cae
FG
5767 dout(10) << " new session (outgoing) " << s << " con=" << s->con
5768 << " addr=" << s->con->get_peer_addr() << dendl;
5769 // we don't connect to clients
11fdf7f2 5770 ceph_assert(con->get_peer_type() == CEPH_ENTITY_TYPE_OSD);
7c673cae
FG
5771 s->entity_name.set_type(CEPH_ENTITY_TYPE_OSD);
5772 }
7c673cae
FG
5773 }
5774}
5775
5776void OSD::ms_handle_fast_accept(Connection *con)
5777{
5778 if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON &&
5779 con->get_peer_type() != CEPH_ENTITY_TYPE_MGR) {
11fdf7f2
TL
5780 auto priv = con->get_priv();
5781 auto s = static_cast<Session*>(priv.get());
7c673cae 5782 if (!s) {
11fdf7f2
TL
5783 s = new Session{cct, con};
5784 con->set_priv(RefCountedPtr{s, false});
7c673cae
FG
5785 dout(10) << "new session (incoming)" << s << " con=" << con
5786 << " addr=" << con->get_peer_addr()
5787 << " must have raced with connect" << dendl;
11fdf7f2 5788 ceph_assert(con->get_peer_type() == CEPH_ENTITY_TYPE_OSD);
7c673cae
FG
5789 s->entity_name.set_type(CEPH_ENTITY_TYPE_OSD);
5790 }
7c673cae
FG
5791 }
5792}
5793
// Messenger callback: the given connection was reset by the peer.
// Detach and tear down its Session (watch state, backoffs); returns true
// if we had a session to clean up, false otherwise.
bool OSD::ms_handle_reset(Connection *con)
{
  auto s = con->get_priv();
  auto session = static_cast<Session*>(s.get());
  dout(2) << "ms_handle_reset con " << con << " session " << session << dendl;
  if (!session)
    return false;
  // drop watch/notify state tied to this connection
  session->wstate.reset(con);
  session->con->set_priv(nullptr);
  session->con.reset();  // break con <-> session ref cycle
  // note that we break session->con *before* the session_handle_reset
  // cleanup below.  this avoids a race between us and
  // PG::add_backoff, Session::check_backoff, etc.
  session_handle_reset(SessionRef{session});
  return true;
}
5810
// Messenger callback: connection attempt was actively refused (ECONNREFUSED).
// If osd_fast_fail_on_connection_refused is enabled and the peer is an OSD
// the map still considers up, report it failed to the mon immediately
// instead of waiting for heartbeat timeouts.
bool OSD::ms_handle_refused(Connection *con)
{
  if (!cct->_conf->osd_fast_fail_on_connection_refused)
    return false;

  auto priv = con->get_priv();
  auto session = static_cast<Session*>(priv.get());
  dout(2) << "ms_handle_refused con " << con << " session " << session << dendl;
  if (!session)
    return false;
  int type = con->get_peer_type();
  // handle only OSD failures here
  if (monc && (type == CEPH_ENTITY_TYPE_OSD)) {
    OSDMapRef osdmap = get_osdmap();
    if (osdmap) {
      // the refused address may belong to any of the peer's channels
      int id = osdmap->identify_osd_on_all_channels(con->get_peer_addr());
      if (id >= 0 && osdmap->is_up(id)) {
	// I'm cheating mon heartbeat grace logic, because we know it's not going
	// to respawn alone. +1 so we won't hit any boundary case.
	monc->send_mon_message(
	  new MOSDFailure(
	    monc->get_fsid(),
	    id,
	    osdmap->get_addrs(id),
	    cct->_conf->osd_heartbeat_grace + 1,
	    osdmap->get_epoch(),
	    MOSDFailure::FLAG_IMMEDIATE | MOSDFailure::FLAG_FAILED
	    ));
      }
    }
  }
  return true;
}
5844
5845struct C_OSD_GetVersion : public Context {
5846 OSD *osd;
5847 uint64_t oldest, newest;
5848 explicit C_OSD_GetVersion(OSD *o) : osd(o), oldest(0), newest(0) {}
5849 void finish(int r) override {
5850 if (r >= 0)
5851 osd->_got_mon_epochs(oldest, newest);
5852 }
5853};
5854
// Begin the boot sequence: if our heartbeats look unhealthy, defer booting
// (WAITING_FOR_HEALTHY); otherwise enter PREBOOT and ask the mon which
// osdmap epochs it holds (the reply continues via C_OSD_GetVersion).
void OSD::start_boot()
{
  if (!_is_healthy()) {
    // if we are not healthy, do not mark ourselves up (yet)
    dout(1) << "not healthy; waiting to boot" << dendl;
    if (!is_waiting_for_healthy())
      start_waiting_for_healthy();
    // send pings sooner rather than later
    heartbeat_kick();
    return;
  }
  dout(1) << __func__ << dendl;
  set_state(STATE_PREBOOT);
  dout(10) << "start_boot - have maps " << superblock.oldest_map
	   << ".." << superblock.newest_map << dendl;
  // C_OSD_GetVersion deletes itself after finish()
  C_OSD_GetVersion *c = new C_OSD_GetVersion(this);
  monc->get_version("osdmap", &c->newest, &c->oldest, c);
}
5873
5874void OSD::_got_mon_epochs(epoch_t oldest, epoch_t newest)
5875{
11fdf7f2 5876 std::lock_guard l(osd_lock);
7c673cae
FG
5877 if (is_preboot()) {
5878 _preboot(oldest, newest);
5879 }
5880}
5881
// Decide, given the mon's osdmap epoch range [oldest, newest], whether we
// can mark ourselves up now or must first catch up / wait.  Called with
// osd_lock held while in PREBOOT.
void OSD::_preboot(epoch_t oldest, epoch_t newest)
{
  ceph_assert(is_preboot());
  dout(10) << __func__ << " _preboot mon has osdmaps "
	   << oldest << ".." << newest << dendl;

  // ensure our local fullness awareness is accurate
  {
    std::lock_guard l(heartbeat_lock);
    heartbeat();
  }

  // if our map within recent history, try to add ourselves to the osdmap.
  if (osdmap->get_epoch() == 0) {
    derr << "waiting for initial osdmap" << dendl;
  } else if (osdmap->is_destroyed(whoami)) {
    derr << "osdmap says I am destroyed" << dendl;
    // provide a small margin so we don't livelock seeing if we
    // un-destroyed ourselves.
    if (osdmap->get_epoch() > newest - 1) {
      exit(0);
    }
  } else if (osdmap->is_noup(whoami)) {
    derr << "osdmap NOUP flag is set, waiting for it to clear" << dendl;
  } else if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE)) {
    derr << "osdmap SORTBITWISE OSDMap flag is NOT set; please set it"
	 << dendl;
  } else if (osdmap->require_osd_release < CEPH_RELEASE_LUMINOUS) {
    derr << "osdmap require_osd_release < luminous; please upgrade to luminous"
	 << dendl;
  } else if (service.need_fullness_update()) {
    derr << "osdmap fullness state needs update" << dendl;
    send_full_update();
  } else if (osdmap->get_epoch() >= oldest - 1 &&
	     osdmap->get_epoch() + cct->_conf->osd_map_message_max > newest) {
    // our map is close enough to the mon's newest: we can boot once PG work
    // has drained.
    //
    // wait for pgs to fully catch up in a different thread, since
    // this thread might be required for splitting and merging PGs to
    // make progress.
    boot_finisher.queue(
      new FunctionContext(
	[this](int r) {
	  std::lock_guard l(osd_lock);
	  if (is_preboot()) {
	    dout(10) << __func__ << " waiting for peering work to drain"
		     << dendl;
	    // drop osd_lock while blocking on the shards so PG work can
	    // actually make progress, then re-take it
	    osd_lock.Unlock();
	    for (auto shard : shards) {
	      shard->wait_min_pg_epoch(osdmap->get_epoch());
	    }
	    osd_lock.Lock();
	  }
	  // re-check state: it may have changed while unlocked
	  if (is_preboot()) {
	    _send_boot();
	  }
	}));
    return;
  }

  // get all the latest maps
  if (osdmap->get_epoch() + 1 >= oldest)
    osdmap_subscribe(osdmap->get_epoch() + 1, false);
  else
    osdmap_subscribe(oldest - 1, true);
}
5947
5948void OSD::send_full_update()
5949{
5950 if (!service.need_fullness_update())
5951 return;
5952 unsigned state = 0;
5953 if (service.is_full()) {
5954 state = CEPH_OSD_FULL;
5955 } else if (service.is_backfillfull()) {
5956 state = CEPH_OSD_BACKFILLFULL;
5957 } else if (service.is_nearfull()) {
5958 state = CEPH_OSD_NEARFULL;
5959 }
5960 set<string> s;
5961 OSDMap::calc_state_set(state, s);
5962 dout(10) << __func__ << " want state " << s << dendl;
5963 monc->send_mon_message(new MOSDFull(osdmap->get_epoch(), state));
5964}
5965
// Enter WAITING_FOR_HEALTHY: reset the heartbeat resample timestamp so peer
// selection restarts, and keep following osdmap updates while we wait.
void OSD::start_waiting_for_healthy()
{
  dout(1) << "start_waiting_for_healthy" << dendl;
  set_state(STATE_WAITING_FOR_HEALTHY);
  last_heartbeat_resample = utime_t();

  // subscribe to osdmap updates, in case our peers really are known to be dead
  osdmap_subscribe(osdmap->get_epoch() + 1, false);
}
5975
5976bool OSD::_is_healthy()
5977{
5978 if (!cct->get_heartbeat_map()->is_healthy()) {
5979 dout(1) << "is_healthy false -- internal heartbeat failed" << dendl;
5980 return false;
5981 }
5982
5983 if (is_waiting_for_healthy()) {
11fdf7f2
TL
5984 utime_t now = ceph_clock_now();
5985 utime_t grace = utime_t(cct->_conf->osd_max_markdown_period, 0);
5986 while (!osd_markdown_log.empty() &&
5987 osd_markdown_log.front() + grace < now)
5988 osd_markdown_log.pop_front();
5989 if (osd_markdown_log.size() <= 1) {
5990 dout(5) << __func__ << " first time marked as down,"
5991 << " try reboot unconditionally" << dendl;
5992 return true;
5993 }
5994 std::lock_guard l(heartbeat_lock);
7c673cae
FG
5995 int num = 0, up = 0;
5996 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
5997 p != heartbeat_peers.end();
5998 ++p) {
11fdf7f2 5999 if (p->second.is_healthy(now))
7c673cae
FG
6000 ++up;
6001 ++num;
6002 }
6003 if ((float)up < (float)num * cct->_conf->osd_heartbeat_min_healthy_ratio) {
6004 dout(1) << "is_healthy false -- only " << up << "/" << num << " up peers (less than "
6005 << int(cct->_conf->osd_heartbeat_min_healthy_ratio * 100.0) << "%)" << dendl;
6006 return false;
6007 }
6008 }
6009
6010 return true;
6011}
6012
// Send MOSDBoot to the mon: finalize our four address vectors (client,
// cluster, hb_back, hb_front), make sure each loopback connection has a
// session, collect metadata, and move to BOOTING.  The order matters: each
// messenger may inherit unknown address bits from the previous one.
void OSD::_send_boot()
{
  dout(10) << "_send_boot" << dendl;
  Connection *local_connection =
    cluster_messenger->get_loopback_connection().get();
  entity_addrvec_t client_addrs = client_messenger->get_myaddrs();
  entity_addrvec_t cluster_addrs = cluster_messenger->get_myaddrs();
  entity_addrvec_t hb_back_addrs = hb_back_server_messenger->get_myaddrs();
  entity_addrvec_t hb_front_addrs = hb_front_server_messenger->get_myaddrs();

  dout(20) << " initial client_addrs " << client_addrs
	   << ", cluster_addrs " << cluster_addrs
	   << ", hb_back_addrs " << hb_back_addrs
	   << ", hb_front_addrs " << hb_front_addrs
	   << dendl;
  // fill in any unknown bits of the cluster address from the client address
  if (cluster_messenger->set_addr_unknowns(client_addrs)) {
    dout(10) << " assuming cluster_addrs match client_addrs "
	     << client_addrs << dendl;
    cluster_addrs = cluster_messenger->get_myaddrs();
  }
  // make sure the loopback connection has a session (fast_connect path)
  if (auto session = local_connection->get_priv(); !session) {
    cluster_messenger->ms_deliver_handle_fast_connect(local_connection);
  }

  local_connection = hb_back_server_messenger->get_loopback_connection().get();
  if (hb_back_server_messenger->set_addr_unknowns(cluster_addrs)) {
    dout(10) << " assuming hb_back_addrs match cluster_addrs "
	     << cluster_addrs << dendl;
    hb_back_addrs = hb_back_server_messenger->get_myaddrs();
  }
  if (auto session = local_connection->get_priv(); !session) {
    hb_back_server_messenger->ms_deliver_handle_fast_connect(local_connection);
  }

  local_connection = hb_front_server_messenger->get_loopback_connection().get();
  if (hb_front_server_messenger->set_addr_unknowns(client_addrs)) {
    dout(10) << " assuming hb_front_addrs match client_addrs "
	     << client_addrs << dendl;
    hb_front_addrs = hb_front_server_messenger->get_myaddrs();
  }
  if (auto session = local_connection->get_priv(); !session) {
    hb_front_server_messenger->ms_deliver_handle_fast_connect(local_connection);
  }

  // we now know what our front and back addrs will be, and we are
  // about to tell the mon what our metadata (including numa bindings)
  // are, so now is a good time!
  set_numa_affinity();

  MOSDBoot *mboot = new MOSDBoot(
    superblock, get_osdmap_epoch(), service.get_boot_epoch(),
    hb_back_addrs, hb_front_addrs, cluster_addrs,
    CEPH_FEATURES_ALL);
  dout(10) << " final client_addrs " << client_addrs
	   << ", cluster_addrs " << cluster_addrs
	   << ", hb_back_addrs " << hb_back_addrs
	   << ", hb_front_addrs " << hb_front_addrs
	   << dendl;
  _collect_metadata(&mboot->metadata);
  monc->send_mon_message(mboot);
  set_state(STATE_BOOTING);
}
6075
// Fill *pm with this OSD's metadata key/value pairs as reported to the mon
// in MOSDBoot: paths, addresses, objectstore/backend info, system info,
// network interfaces, NUMA placement, and device ids.
void OSD::_collect_metadata(map<string,string> *pm)
{
  // config info
  (*pm)["osd_data"] = dev_path;
  if (store->get_type() == "filestore") {
    // not applicable for bluestore
    (*pm)["osd_journal"] = journal_path;
  }
  (*pm)["front_addr"] = stringify(client_messenger->get_myaddrs());
  (*pm)["back_addr"] = stringify(cluster_messenger->get_myaddrs());
  (*pm)["hb_front_addr"] = stringify(hb_front_server_messenger->get_myaddrs());
  (*pm)["hb_back_addr"] = stringify(hb_back_server_messenger->get_myaddrs());

  // backend
  (*pm)["osd_objectstore"] = store->get_type();
  (*pm)["rotational"] = store_is_rotational ? "1" : "0";
  (*pm)["journal_rotational"] = journal_is_rotational ? "1" : "0";
  (*pm)["default_device_class"] = store->get_default_device_class();
  store->collect_metadata(pm);

  collect_sys_info(pm, cct);

  // which local interfaces serve the public/cluster addresses
  (*pm)["front_iface"] = pick_iface(
    cct,
    client_messenger->get_myaddrs().front().get_sockaddr_storage());
  (*pm)["back_iface"] = pick_iface(
    cct,
    cluster_messenger->get_myaddrs().front().get_sockaddr_storage());

  // network numa
  {
    int node = -1;
    set<int> nodes;
    set<string> unknown;
    for (auto nm : { "front_iface", "back_iface" }) {
      if (!(*pm)[nm].size()) {
	unknown.insert(nm);
	continue;
      }
      int n = -1;
      int r = get_iface_numa_node((*pm)[nm], &n);
      if (r < 0) {
	unknown.insert((*pm)[nm]);
	continue;
      }
      nodes.insert(n);
      if (node < 0) {
	node = n;
      }
    }
    if (unknown.size()) {
      (*pm)["network_numa_unknown_ifaces"] = stringify(unknown);
    }
    if (!nodes.empty()) {
      (*pm)["network_numa_nodes"] = stringify(nodes);
    }
    // only report a single NUMA node when both ifaces resolve to the same one
    if (node >= 0 && nodes.size() == 1 && unknown.empty()) {
      (*pm)["network_numa_node"] = stringify(node);
    }
  }

  if (numa_node >= 0) {
    (*pm)["numa_node"] = stringify(numa_node);
    (*pm)["numa_node_cpus"] = cpu_set_to_str_list(numa_cpu_set_size,
						  &numa_cpu_set);
  }

  // devices backing the objectstore, and their unique ids when available
  set<string> devnames;
  store->get_devices(&devnames);
  (*pm)["devices"] = stringify(devnames);
  string devids;
  for (auto& dev : devnames) {
    string err;
    string id = get_device_id(dev, &err);
    if (id.size()) {
      if (!devids.empty()) {
	devids += ",";
      }
      devids += dev + "=" + id;
    } else {
      dout(10) << __func__ << " no unique device id for " << dev << ": "
	       << err << dendl;
    }
  }
  (*pm)["device_ids"] = devids;

  dout(10) << __func__ << " " << *pm << dendl;
}
6164
// Record that peering wants our up_thru advanced to at least `want`, and
// nudge the mon (send_alive) if this raises the wanted value.  Takes
// map_lock (read) around the osdmap access and mon_report_lock for the
// up_thru_wanted state; send_alive() requires mon_report_lock held.
void OSD::queue_want_up_thru(epoch_t want)
{
  map_lock.get_read();
  epoch_t cur = osdmap->get_up_thru(whoami);
  std::lock_guard l(mon_report_lock);
  if (want > up_thru_wanted) {
    dout(10) << "queue_want_up_thru now " << want << " (was " << up_thru_wanted << ")"
	     << ", currently " << cur
	     << dendl;
    up_thru_wanted = want;
    send_alive();
  } else {
    // already asking for an equal or newer epoch; nothing to do
    dout(10) << "queue_want_up_thru want " << want << " <= queued " << up_thru_wanted
	     << ", currently " << cur
	     << dendl;
  }
  map_lock.put_read();
}
6183
6184void OSD::send_alive()
6185{
11fdf7f2 6186 ceph_assert(mon_report_lock.is_locked());
7c673cae
FG
6187 if (!osdmap->exists(whoami))
6188 return;
6189 epoch_t up_thru = osdmap->get_up_thru(whoami);
6190 dout(10) << "send_alive up_thru currently " << up_thru << " want " << up_thru_wanted << dendl;
6191 if (up_thru_wanted > up_thru) {
6192 dout(10) << "send_alive want " << up_thru_wanted << dendl;
6193 monc->send_mon_message(new MOSDAlive(osdmap->get_epoch(), up_thru_wanted));
6194 }
6195}
6196
6197void OSD::request_full_map(epoch_t first, epoch_t last)
6198{
6199 dout(10) << __func__ << " " << first << ".." << last
6200 << ", previously requested "
6201 << requested_full_first << ".." << requested_full_last << dendl;
11fdf7f2
TL
6202 ceph_assert(osd_lock.is_locked());
6203 ceph_assert(first > 0 && last > 0);
6204 ceph_assert(first <= last);
6205 ceph_assert(first >= requested_full_first); // we shouldn't ever ask for older maps
7c673cae
FG
6206 if (requested_full_first == 0) {
6207 // first request
6208 requested_full_first = first;
6209 requested_full_last = last;
6210 } else if (last <= requested_full_last) {
6211 // dup
6212 return;
6213 } else {
6214 // additional request
6215 first = requested_full_last + 1;
6216 requested_full_last = last;
6217 }
6218 MMonGetOSDMap *req = new MMonGetOSDMap;
6219 req->request_full(first, last);
6220 monc->send_mon_message(req);
6221}
6222
// Note that full map epoch e has been received, and advance/reset the
// outstanding requested_full_[first,last] window accordingly.  Caller must
// hold osd_lock.
void OSD::got_full_map(epoch_t e)
{
  ceph_assert(requested_full_first <= requested_full_last);
  ceph_assert(osd_lock.is_locked());
  if (requested_full_first == 0) {
    // no request outstanding at all
    dout(20) << __func__ << " " << e << ", nothing requested" << dendl;
    return;
  }
  if (e < requested_full_first) {
    // stale map from before our window; ignore
    dout(10) << __func__ << " " << e << ", requested " << requested_full_first
	     << ".." << requested_full_last
	     << ", ignoring" << dendl;
    return;
  }
  if (e >= requested_full_last) {
    // window fully satisfied; clear it
    dout(10) << __func__ << " " << e << ", requested " << requested_full_first
	     << ".." << requested_full_last << ", resetting" << dendl;
    requested_full_first = requested_full_last = 0;
    return;
  }

  // partial progress: shrink the window past e
  requested_full_first = e + 1;

  dout(10) << __func__ << " " << e << ", requested " << requested_full_first
	   << ".." << requested_full_last
	   << ", still need more" << dendl;
}
6250
6251void OSD::requeue_failures()
6252{
11fdf7f2 6253 std::lock_guard l(heartbeat_lock);
7c673cae
FG
6254 unsigned old_queue = failure_queue.size();
6255 unsigned old_pending = failure_pending.size();
11fdf7f2 6256 for (auto p = failure_pending.begin(); p != failure_pending.end(); ) {
7c673cae
FG
6257 failure_queue[p->first] = p->second.first;
6258 failure_pending.erase(p++);
6259 }
6260 dout(10) << __func__ << " " << old_queue << " + " << old_pending << " -> "
6261 << failure_queue.size() << dendl;
6262}
6263
// Drain failure_queue, sending an MOSDFailure to the mon for each osd not
// already in failure_pending, and remember each sent report (with the
// addrs we reported) so it can be canceled or requeued later.  Caller must
// hold map_lock and mon_report_lock; takes heartbeat_lock itself.
void OSD::send_failures()
{
  ceph_assert(map_lock.is_locked());
  ceph_assert(mon_report_lock.is_locked());
  std::lock_guard l(heartbeat_lock);
  utime_t now = ceph_clock_now();
  while (!failure_queue.empty()) {
    int osd = failure_queue.begin()->first;
    if (!failure_pending.count(osd)) {
      // elapsed seconds since we first noticed the failure
      int failed_for = (int)(double)(now - failure_queue.begin()->second);
      monc->send_mon_message(
	new MOSDFailure(
	  monc->get_fsid(),
	  osd,
	  osdmap->get_addrs(osd),
	  failed_for,
	  osdmap->get_epoch()));
      // remember timestamp + addrs so cancel_pending_failures() can retract
      failure_pending[osd] = make_pair(failure_queue.begin()->second,
				       osdmap->get_addrs(osd));
    }
    failure_queue.erase(osd);
  }
}
6287
11fdf7f2 6288void OSD::send_still_alive(epoch_t epoch, int osd, const entity_addrvec_t &addrs)
7c673cae 6289{
11fdf7f2
TL
6290 MOSDFailure *m = new MOSDFailure(monc->get_fsid(), osd, addrs, 0, epoch,
6291 MOSDFailure::FLAG_ALIVE);
7c673cae
FG
6292 monc->send_mon_message(m);
6293}
6294
11fdf7f2 6295void OSD::cancel_pending_failures()
7c673cae 6296{
11fdf7f2
TL
6297 std::lock_guard l(heartbeat_lock);
6298 auto it = failure_pending.begin();
6299 while (it != failure_pending.end()) {
6300 dout(10) << __func__ << " canceling in-flight failure report for osd."
6301 << it->first << dendl;
6302 send_still_alive(osdmap->get_epoch(), it->first, it->second.second);
6303 failure_pending.erase(it++);
7c673cae 6304 }
7c673cae
FG
6305}
6306
// Send an MOSDBeacon (osdmap epoch + min last-epoch-clean and its pgs) to
// the mon, provided the monmap is initialized and requires LUMINOUS
// features; otherwise skip.  `now` is recorded as last_sent_beacon under
// min_last_epoch_clean_lock.
void OSD::send_beacon(const ceph::coarse_mono_clock::time_point& now)
{
  const auto& monmap = monc->monmap;
  // send beacon to mon even if we are just connected, and the monmap is not
  // initialized yet by then.
  if (monmap.epoch > 0 &&
      monmap.get_required_features().contains_all(
        ceph::features::mon::FEATURE_LUMINOUS)) {
    dout(20) << __func__ << " sending" << dendl;
    MOSDBeacon* beacon = nullptr;
    {
      // snapshot min_last_epoch_clean state atomically with the send decision
      std::lock_guard l{min_last_epoch_clean_lock};
      beacon = new MOSDBeacon(osdmap->get_epoch(), min_last_epoch_clean);
      beacon->pgs = min_last_epoch_clean_pgs;
      last_sent_beacon = now;
    }
    monc->send_mon_message(beacon);
  } else {
    dout(20) << __func__ << " not sending" << dendl;
  }
}
6328
6329void OSD::handle_command(MMonCommand *m)
6330{
6331 if (!require_mon_peer(m)) {
6332 m->put();
6333 return;
6334 }
6335
6336 Command *c = new Command(m->cmd, m->get_tid(), m->get_data(), NULL);
6337 command_wq.queue(c);
6338 m->put();
6339}
6340
6341void OSD::handle_command(MCommand *m)
6342{
6343 ConnectionRef con = m->get_connection();
11fdf7f2
TL
6344 auto priv = con->get_priv();
6345 auto session = static_cast<Session *>(priv.get());
7c673cae
FG
6346 if (!session) {
6347 con->send_message(new MCommandReply(m, -EPERM));
6348 m->put();
6349 return;
6350 }
6351
6352 OSDCap& caps = session->caps;
11fdf7f2 6353 priv.reset();
7c673cae
FG
6354
6355 if (!caps.allow_all() || m->get_source().is_mon()) {
6356 con->send_message(new MCommandReply(m, -EPERM));
6357 m->put();
6358 return;
6359 }
6360
6361 Command *c = new Command(m->cmd, m->get_tid(), m->get_data(), con.get());
6362 command_wq.queue(c);
6363
6364 m->put();
6365}
6366
// Table of commands the OSD advertises via get_command_descriptions and
// accepts over MCommand/MMonCommand.  Each entry pairs a parse signature
// (the cmdstring DSL: name=...,type=... tokens) with help text, the module
// it belongs to, and the required permission ("r"/"rw"/"rwx").
struct OSDCommand {
  string cmdstring;   // command signature in the ceph command-description DSL
  string helpstring;  // human-readable help shown to the user
  string module;      // owning module (always "osd" here)
  string perm;        // capability required to run the command
} osd_commands[] = {

#define COMMAND(parsesig, helptext, module, perm) \
  {parsesig, helptext, module, perm},

// yes, these are really pg commands, but there's a limit to how
// much work it's worth. The OSD returns all of them. Make this
// form (pg <pgid> <cmd>) valid only for the cli.
// Rest uses "tell <pgid> <cmd>"

COMMAND("pg " \
	"name=pgid,type=CephPgid " \
	"name=cmd,type=CephChoices,strings=query", \
	"show details of a specific pg", "osd", "r")
COMMAND("pg " \
	"name=pgid,type=CephPgid " \
	"name=cmd,type=CephChoices,strings=mark_unfound_lost " \
	"name=mulcmd,type=CephChoices,strings=revert|delete", \
	"mark all unfound objects in this pg as lost, either removing or reverting to a prior version if one is available",
	"osd", "rw")
COMMAND("pg " \
	"name=pgid,type=CephPgid " \
	"name=cmd,type=CephChoices,strings=list_unfound " \
	"name=offset,type=CephString,req=false",
	"list unfound objects on this pg, perhaps starting at an offset given in JSON",
	"osd", "r")

// new form: tell <pgid> <cmd> for both cli and rest

COMMAND("query",
	"show details of a specific pg", "osd", "r")
COMMAND("mark_unfound_lost " \
	"name=mulcmd,type=CephChoices,strings=revert|delete", \
	"mark all unfound objects in this pg as lost, either removing or reverting to a prior version if one is available",
	"osd", "rw")
COMMAND("list_unfound " \
	"name=offset,type=CephString,req=false",
	"list unfound objects on this pg, perhaps starting at an offset given in JSON",
	"osd", "r")
COMMAND("perf histogram dump "
	"name=logger,type=CephString,req=false "
	"name=counter,type=CephString,req=false",
	"Get histogram data",
	"osd", "r")

// tell <osd.n> commands. Validation of osd.n must be special-cased in client
COMMAND("version", "report version of OSD", "osd", "r")
COMMAND("get_command_descriptions", "list commands descriptions", "osd", "r")
COMMAND("injectargs " \
	"name=injected_args,type=CephString,n=N",
	"inject configuration arguments into running OSD",
	"osd", "rw")
COMMAND("config set " \
	"name=key,type=CephString name=value,type=CephString",
	"Set a configuration option at runtime (not persistent)",
	"osd", "rw")
COMMAND("config get " \
	"name=key,type=CephString",
	"Get a configuration option at runtime",
	"osd", "r")
COMMAND("config unset " \
	"name=key,type=CephString",
	"Unset a configuration option at runtime (not persistent)",
	"osd", "rw")
COMMAND("cluster_log " \
	"name=level,type=CephChoices,strings=error,warning,info,debug " \
	"name=message,type=CephString,n=N",
	"log a message to the cluster log",
	"osd", "rw")
COMMAND("bench " \
	"name=count,type=CephInt,req=false " \
	"name=size,type=CephInt,req=false " \
	"name=object_size,type=CephInt,req=false " \
	"name=object_num,type=CephInt,req=false ", \
	"OSD benchmark: write <count> <size>-byte objects(with <obj_size> <obj_num>), " \
	"(default count=1G default size=4MB). Results in log.",
	"osd", "rw")
COMMAND("flush_pg_stats", "flush pg stats", "osd", "rw")
COMMAND("heap " \
	"name=heapcmd,type=CephChoices,strings="\
	"dump|start_profiler|stop_profiler|release|get_release_rate|set_release_rate|stats " \
	"name=value,type=CephString,req=false",
	"show heap usage info (available only if compiled with tcmalloc)",
	"osd", "rw")
COMMAND("debug dump_missing " \
	"name=filename,type=CephFilepath",
	"dump missing objects to a named file", "osd", "r")
COMMAND("debug kick_recovery_wq " \
	"name=delay,type=CephInt,range=0",
	"set osd_recovery_delay_start to <val>", "osd", "rw")
COMMAND("cpu_profiler " \
	"name=arg,type=CephChoices,strings=status|flush",
	"run cpu profiling on daemon", "osd", "rw")
COMMAND("dump_pg_recovery_stats", "dump pg recovery statistics",
	"osd", "r")
COMMAND("reset_pg_recovery_stats", "reset pg recovery statistics",
	"osd", "rw")
COMMAND("compact",
	"compact object store's omap. "
	"WARNING: Compaction probably slows your requests",
	"osd", "rw")
COMMAND("smart name=devid,type=CephString,req=False",
	"runs smartctl on this osd devices. ",
	"osd", "rw")
COMMAND("cache drop",
	"Drop all OSD caches",
	"osd", "rwx")
COMMAND("cache status",
	"Get OSD caches statistics",
	"osd", "r")
COMMAND("send_beacon",
	"Send OSD beacon to mon immediately",
	"osd", "r")
};
6486
11fdf7f2
TL
// Entry point for a raw "tell osd.N ..." / MCommand command vector.
// Parses the JSON command vector into a cmdmap, runs the command via
// _do_command(), and (unless the command completes asynchronously with
// -EAGAIN) sends an MCommandReply back on 'con' carrying the return
// code, status string and output data.
void OSD::do_command(
  Connection *con, ceph_tid_t tid, vector<string>& cmd, bufferlist& data)
{
  dout(20) << "do_command tid " << tid << " " << cmd << dendl;

  int r = 0;
  stringstream ss, ds;   // ss: status text, ds: command output payload
  bufferlist odata;
  cmdmap_t cmdmap;
  if (cmd.empty()) {
    // NOTE(review): an empty command vector replies with r=0 (success)
    // and only a status string — confirm callers rely on this before
    // tightening it to -EINVAL.
    ss << "no command given";
    goto out;
  }
  if (!cmdmap_from_json(cmd, &cmdmap, ss)) {
    r = -EINVAL;
    goto out;
  }

  try {
    r = _do_command(con, cmdmap, tid, data, odata, ss, ds);
  } catch (const bad_cmd_get& e) {
    // malformed/mistyped command argument
    r = -EINVAL;
    ss << e.what();
  }
  if (r == -EAGAIN) {
    // command will reply asynchronously (e.g. a PG replies on its own)
    return;
  }
 out:
  string rs = ss.str();
  odata.append(ds);
  dout(0) << "do_command r=" << r << " " << rs << dendl;
  clog->info() << rs;
  if (con) {
    MCommandReply *reply = new MCommandReply(r, rs);
    reply->set_tid(tid);
    reply->set_data(odata);
    con->send_message(reply);
  }
}
6526
f64942e4
AA
6527namespace {
6528 class unlock_guard {
6529 Mutex& m;
6530 public:
6531 explicit unlock_guard(Mutex& mutex)
6532 : m(mutex)
6533 {
11fdf7f2 6534 m.unlock();
f64942e4
AA
6535 }
6536 unlock_guard(unlock_guard&) = delete;
6537 ~unlock_guard() {
11fdf7f2 6538 m.lock();
f64942e4
AA
6539 }
6540 };
6541}
6542
11fdf7f2
TL
// Execute one parsed admin command against this OSD.
//
// Called from do_command() with osd_lock held.  Writes human-readable
// status into 'ss', command output into 'ds'/'odata', and returns a
// negative errno on failure.  A return of -EAGAIN means "do not reply
// now"; the reply will be produced asynchronously (PG commands) or after
// the peer refreshes its maps.
int OSD::_do_command(
  Connection *con, cmdmap_t& cmdmap, ceph_tid_t tid, bufferlist& data,
  bufferlist& odata, stringstream& ss, stringstream& ds)
{
  int r = 0;
  string prefix;
  string format;
  string pgidstr;
  boost::scoped_ptr<Formatter> f;

  cmd_getval(cct, cmdmap, "prefix", prefix);

  if (prefix == "get_command_descriptions") {
    // Dump the static osd_commands table as JSON so clients can
    // introspect/validate commands.
    int cmdnum = 0;
    JSONFormatter *f = new JSONFormatter();   // shadows outer 'f' deliberately
    f->open_object_section("command_descriptions");
    for (OSDCommand *cp = osd_commands;
	 cp < &osd_commands[std::size(osd_commands)]; cp++) {

      ostringstream secname;
      secname << "cmd" << setfill('0') << std::setw(3) << cmdnum;
      dump_cmddesc_to_json(f, con->get_features(),
			   secname.str(), cp->cmdstring, cp->helpstring,
			   cp->module, cp->perm, 0);
      cmdnum++;
    }
    f->close_section();	// command_descriptions

    f->flush(ds);
    delete f;
    goto out;
  }

  cmd_getval(cct, cmdmap, "format", format);
  f.reset(Formatter::create(format));

  if (prefix == "version") {
    if (f) {
      f->open_object_section("version");
      f->dump_string("version", pretty_version_to_str());
      f->close_section();
      f->flush(ds);
    } else {
      ds << pretty_version_to_str();
    }
    goto out;
  }
  else if (prefix == "injectargs") {
    vector<string> argsvec;
    cmd_getval(cct, cmdmap, "injected_args", argsvec);

    if (argsvec.empty()) {
      r = -EINVAL;
      ss << "ignoring empty injectargs";
      goto out;
    }
    // join the arg vector back into a single space-separated string
    string args = argsvec.front();
    for (vector<string>::iterator a = ++argsvec.begin(); a != argsvec.end(); ++a)
      args += " " + *a;
    // drop osd_lock while applying config: observers may take locks of
    // their own (see unlock_guard above)
    unlock_guard unlock{osd_lock};
    r = cct->_conf.injectargs(args, &ss);
  }
  else if (prefix == "config set") {
    std::string key;
    std::string val;
    cmd_getval(cct, cmdmap, "key", key);
    cmd_getval(cct, cmdmap, "value", val);
    unlock_guard unlock{osd_lock};
    r = cct->_conf.set_val(key, val, &ss);
    if (r == 0) {
      cct->_conf.apply_changes(nullptr);
    }
  }
  else if (prefix == "config get") {
    std::string key;
    cmd_getval(cct, cmdmap, "key", key);
    unlock_guard unlock{osd_lock};
    std::string val;
    r = cct->_conf.get_val(key, &val);
    if (r == 0) {
      ds << val;
    }
  }
  else if (prefix == "config unset") {
    std::string key;
    cmd_getval(cct, cmdmap, "key", key);
    unlock_guard unlock{osd_lock};
    r = cct->_conf.rm_val(key);
    if (r == 0) {
      cct->_conf.apply_changes(nullptr);
    }
    if (r == -ENOENT) {
      r = 0;  // make command idempotent
    }
  }
  else if (prefix == "cluster_log") {
    vector<string> msg;
    cmd_getval(cct, cmdmap, "message", msg);
    if (msg.empty()) {
      r = -EINVAL;
      ss << "ignoring empty log message";
      goto out;
    }
    // re-join the message words
    string message = msg.front();
    for (vector<string>::iterator a = ++msg.begin(); a != msg.end(); ++a)
      message += " " + *a;
    string lvl;
    cmd_getval(cct, cmdmap, "level", lvl);
    clog_type level = string_to_clog_type(lvl);
    if (level < 0) {
      r = -EINVAL;
      ss << "unknown level '" << lvl << "'";
      goto out;
    }
    clog->do_log(level, message);
  }

  // either 'pg <pgid> <command>' or
  // 'tell <pgid>' (which comes in without any of that prefix)?

  else if (prefix == "pg" ||
	   prefix == "query" ||
	   prefix == "mark_unfound_lost" ||
	   prefix == "list_unfound"
    ) {
    pg_t pgid;

    if (!cmd_getval(cct, cmdmap, "pgid", pgidstr)) {
      ss << "no pgid specified";
      r = -EINVAL;
    } else if (!pgid.parse(pgidstr.c_str())) {
      ss << "couldn't parse pgid '" << pgidstr << "'";
      r = -EINVAL;
    } else {
      spg_t pcand;
      PGRef pg;
      if (osdmap->get_primary_shard(pgid, &pcand) &&
	  (pg = _lookup_lock_pg(pcand))) {
	// pg is returned *locked*; every exit path below must unlock it
	if (pg->is_primary()) {
	  // simulate pg <pgid> cmd= for pg->do-command
	  if (prefix != "pg")
	    cmd_putval(cct, cmdmap, "cmd", prefix);
	  try {
	    r = pg->do_command(cmdmap, ss, data, odata, con, tid);
	  } catch (const bad_cmd_get& e) {
	    pg->unlock();
	    ss << e.what();
	    return -EINVAL;
	  }
	  if (r == -EAGAIN) {
	    pg->unlock();
	    // don't reply, pg will do so async
	    return -EAGAIN;
	  }
	} else {
	  ss << "not primary for pgid " << pgid;

	  // send them the latest diff to ensure they realize the mapping
	  // has changed.
	  service.send_incremental_map(osdmap->get_epoch() - 1, con, osdmap);

	  // do not reply; they will get newer maps and realize they
	  // need to resend.
	  pg->unlock();
	  return -EAGAIN;
	}
	pg->unlock();
      } else {
	ss << "i don't have pgid " << pgid;
	r = -ENOENT;
      }
    }
  }

  else if (prefix == "bench") {
    int64_t count;
    int64_t bsize;
    int64_t osize, onum;
    // default count 1G, size 4MB
    cmd_getval(cct, cmdmap, "count", count, (int64_t)1 << 30);
    cmd_getval(cct, cmdmap, "size", bsize, (int64_t)4 << 20);
    cmd_getval(cct, cmdmap, "object_size", osize, (int64_t)0);
    cmd_getval(cct, cmdmap, "object_num", onum, (int64_t)0);

    uint32_t duration = cct->_conf->osd_bench_duration;

    if (bsize > (int64_t) cct->_conf->osd_bench_max_block_size) {
      // let us limit the block size because the next checks rely on it
      // having a sane value. If we allow any block size to be set things
      // can still go sideways.
      ss << "block 'size' values are capped at "
	 << byte_u_t(cct->_conf->osd_bench_max_block_size) << ". If you wish to use"
	 << " a higher value, please adjust 'osd_bench_max_block_size'";
      r = -EINVAL;
      goto out;
    } else if (bsize < (int64_t) (1 << 20)) {
      // entering the realm of small block sizes.
      // limit the count to a sane value, assuming a configurable amount of
      // IOPS and duration, so that the OSD doesn't get hung up on this,
      // preventing timeouts from going off
      int64_t max_count =
	bsize * duration * cct->_conf->osd_bench_small_size_max_iops;
      if (count > max_count) {
	ss << "'count' values greater than " << max_count
	   << " for a block size of " << byte_u_t(bsize) << ", assuming "
	   << cct->_conf->osd_bench_small_size_max_iops << " IOPS,"
	   << " for " << duration << " seconds,"
	   << " can cause ill effects on osd. "
	   << " Please adjust 'osd_bench_small_size_max_iops' with a higher"
	   << " value if you wish to use a higher 'count'.";
	r = -EINVAL;
	goto out;
      }
    } else {
      // 1MB block sizes are big enough so that we get more stuff done.
      // However, to avoid the osd from getting hung on this and having
      // timers being triggered, we are going to limit the count assuming
      // a configurable throughput and duration.
      // NOTE: max_count is the total amount of bytes that we believe we
      //       will be able to write during 'duration' for the given
      //       throughput.  The block size hardly impacts this unless it's
      //       way too big.  Given we already check how big the block size
      //       is, it's safe to assume everything will check out.
      int64_t max_count =
	cct->_conf->osd_bench_large_size_max_throughput * duration;
      if (count > max_count) {
	ss << "'count' values greater than " << max_count
	   << " for a block size of " << byte_u_t(bsize) << ", assuming "
	   << byte_u_t(cct->_conf->osd_bench_large_size_max_throughput) << "/s,"
	   << " for " << duration << " seconds,"
	   << " can cause ill effects on osd. "
	   << " Please adjust 'osd_bench_large_size_max_throughput'"
	   << " with a higher value if you wish to use a higher 'count'.";
	r = -EINVAL;
	goto out;
      }
    }

    if (osize && bsize > osize)
      bsize = osize;

    dout(1) << " bench count " << count
	    << " bsize " << byte_u_t(bsize) << dendl;

    ObjectStore::Transaction cleanupt;

    if (osize && onum) {
      // pre-create the reusable bench objects with zeroed data
      bufferlist bl;
      bufferptr bp(osize);
      bp.zero();
      bl.push_back(std::move(bp));
      bl.rebuild_page_aligned();
      for (int i=0; i<onum; ++i) {
	char nm[30];
	snprintf(nm, sizeof(nm), "disk_bw_test_%d", i);
	object_t oid(nm);
	hobject_t soid(sobject_t(oid, 0));
	ObjectStore::Transaction t;
	t.write(coll_t(), ghobject_t(soid), 0, osize, bl);
	store->queue_transaction(service.meta_ch, std::move(t), NULL);
	cleanupt.remove(coll_t(), ghobject_t(soid));
      }
    }

    bufferlist bl;
    bufferptr bp(bsize);
    bp.zero();
    bl.push_back(std::move(bp));
    bl.rebuild_page_aligned();

    {
      // wait for any previously queued transactions before timing
      C_SaferCond waiter;
      if (!service.meta_ch->flush_commit(&waiter)) {
	waiter.wait();
      }
    }

    utime_t start = ceph_clock_now();
    for (int64_t pos = 0; pos < count; pos += bsize) {
      char nm[30];
      unsigned offset = 0;
      if (onum && osize) {
	// random writes over the pre-created object set
	snprintf(nm, sizeof(nm), "disk_bw_test_%d", (int)(rand() % onum));
	offset = rand() % (osize / bsize) * bsize;
      } else {
	// one fresh object per block
	snprintf(nm, sizeof(nm), "disk_bw_test_%lld", (long long)pos);
      }
      object_t oid(nm);
      hobject_t soid(sobject_t(oid, 0));
      ObjectStore::Transaction t;
      t.write(coll_t::meta(), ghobject_t(soid), offset, bsize, bl);
      store->queue_transaction(service.meta_ch, std::move(t), NULL);
      if (!onum || !osize)
	cleanupt.remove(coll_t::meta(), ghobject_t(soid));
    }

    {
      // wait for all bench writes to commit before stopping the clock
      C_SaferCond waiter;
      if (!service.meta_ch->flush_commit(&waiter)) {
	waiter.wait();
      }
    }
    utime_t end = ceph_clock_now();

    // clean up
    store->queue_transaction(service.meta_ch, std::move(cleanupt), NULL);
    {
      C_SaferCond waiter;
      if (!service.meta_ch->flush_commit(&waiter)) {
	waiter.wait();
      }
    }

    double elapsed = end - start;
    double rate = count / elapsed;
    double iops = rate / bsize;
    if (f) {
      f->open_object_section("osd_bench_results");
      f->dump_int("bytes_written", count);
      f->dump_int("blocksize", bsize);
      f->dump_float("elapsed_sec", elapsed);
      f->dump_float("bytes_per_sec", rate);
      f->dump_float("iops", iops);
      f->close_section();
      f->flush(ds);
    } else {
      ds << "bench: wrote " << byte_u_t(count)
	 << " in blocks of " << byte_u_t(bsize) << " in "
	 << elapsed << " sec at " << byte_u_t(rate) << "/sec "
	 << si_u_t(iops) << " IOPS";
    }
  }

  else if (prefix == "flush_pg_stats") {
    mgrc.send_pgstats();
    ds << service.get_osd_stat_seq() << "\n";
  }

  else if (prefix == "heap") {
    r = ceph::osd_cmds::heap(*cct, cmdmap, *f, ds);
  }

  else if (prefix == "debug dump_missing") {
    if (!f) {
      f.reset(new JSONFormatter(true));
    }
    f->open_array_section("pgs");
    vector<PGRef> pgs;
    _get_pgs(&pgs);
    for (auto& pg : pgs) {
      string s = stringify(pg->pg_id);
      f->open_array_section(s.c_str());
      pg->lock();
      pg->dump_missing(f.get());
      pg->unlock();
      f->close_section();
    }
    f->close_section();
    f->flush(ds);
  }
  else if (prefix == "debug kick_recovery_wq") {
    int64_t delay;
    cmd_getval(cct, cmdmap, "delay", delay);
    ostringstream oss;
    oss << delay;
    unlock_guard unlock{osd_lock};
    r = cct->_conf.set_val("osd_recovery_delay_start", oss.str().c_str());
    if (r != 0) {
      ss << "kick_recovery_wq: error setting "
	 << "osd_recovery_delay_start to '" << delay << "': error "
	 << r;
      goto out;
    }
    cct->_conf.apply_changes(nullptr);
    ss << "kicking recovery queue. set osd_recovery_delay_start "
       << "to " << cct->_conf->osd_recovery_delay_start;
  }

  else if (prefix == "cpu_profiler") {
    string arg;
    cmd_getval(cct, cmdmap, "arg", arg);
    vector<string> argvec;
    get_str_vec(arg, argvec);
    cpu_profiler_handle_command(argvec, ds);
  }

  else if (prefix == "dump_pg_recovery_stats") {
    stringstream s;
    if (f) {
      pg_recovery_stats.dump_formatted(f.get());
      f->flush(ds);
    } else {
      pg_recovery_stats.dump(s);
      ds << "dump pg recovery stats: " << s.str();
    }
  }

  else if (prefix == "reset_pg_recovery_stats") {
    ss << "reset pg recovery stats";
    pg_recovery_stats.reset();
  }

  else if (prefix == "perf histogram dump") {
    std::string logger;
    std::string counter;
    cmd_getval(cct, cmdmap, "logger", logger);
    cmd_getval(cct, cmdmap, "counter", counter);
    // NOTE(review): without a formatter this silently produces no output —
    // presumably intentional since histograms are structured data.
    if (f) {
      cct->get_perfcounters_collection()->dump_formatted_histograms(
	  f.get(), false, logger, counter);
      f->flush(ds);
    }
  }

  else if (prefix == "compact") {
    dout(1) << "triggering manual compaction" << dendl;
    auto start = ceph::coarse_mono_clock::now();
    store->compact();
    auto end = ceph::coarse_mono_clock::now();
    double duration = std::chrono::duration<double>(end-start).count();
    dout(1) << "finished manual compaction in "
	    << duration
	    << " seconds" << dendl;
    ss << "compacted omap in " << duration << " seconds";
  }

  else if (prefix == "smart") {
    string devid;
    cmd_getval(cct, cmdmap, "devid", devid);
    probe_smart(devid, ds);
  }

  else if (prefix == "cache drop") {
    dout(20) << "clearing all caches" << dendl;
    // Clear the objectstore's cache - onode and buffer for Bluestore,
    // system's pagecache for Filestore
    r = store->flush_cache(&ss);
    if (r < 0) {
      ds << "Error flushing objectstore cache: " << cpp_strerror(r);
      goto out;
    }
    // Clear the objectcontext cache (per PG)
    vector<PGRef> pgs;
    _get_pgs(&pgs);
    for (auto& pg: pgs) {
      pg->clear_cache();
    }
  }

  else if (prefix == "cache status") {
    int obj_ctx_count = 0;
    vector<PGRef> pgs;
    _get_pgs(&pgs);
    for (auto& pg: pgs) {
      obj_ctx_count += pg->get_cache_obj_count();
    }
    if (f) {
      f->open_object_section("cache_status");
      f->dump_int("object_ctx", obj_ctx_count);
      store->dump_cache_stats(f.get());
      f->close_section();
      f->flush(ds);
    } else {
      ds << "object_ctx: " << obj_ctx_count;
      store->dump_cache_stats(ds);
    }
  }
  else if (prefix == "send_beacon") {
    if (is_active()) {
      send_beacon(ceph::coarse_mono_clock::now());
    }
  } else {
    ss << "unrecognized command '" << prefix << "'";
    r = -EINVAL;
  }

 out:
  return r;
}
7022
7023void OSD::probe_smart(const string& only_devid, ostream& ss)
7024{
7025 set<string> devnames;
7026 store->get_devices(&devnames);
7027 uint64_t smart_timeout = cct->_conf.get_val<uint64_t>(
7028 "osd_smart_report_timeout");
7029
7030 // == typedef std::map<std::string, mValue> mObject;
7031 json_spirit::mObject json_map;
7032
7033 for (auto dev : devnames) {
7034 // smartctl works only on physical devices; filter out any logical device
7035 if (dev.find("dm-") == 0) {
7036 continue;
7037 }
7038
7039 string err;
7040 string devid = get_device_id(dev, &err);
7041 if (devid.size() == 0) {
7042 dout(10) << __func__ << " no unique id for dev " << dev << " ("
7043 << err << "), skipping" << dendl;
7044 continue;
7045 }
7046 if (only_devid.size() && devid != only_devid) {
7047 continue;
7048 }
7049
7050 json_spirit::mValue smart_json;
7051 if (block_device_get_metrics(dev, smart_timeout,
7052 &smart_json)) {
7053 dout(10) << "block_device_get_metrics failed for /dev/" << dev << dendl;
7054 continue;
7055 }
7056 json_map[devid] = smart_json;
7c673cae 7057 }
11fdf7f2 7058 json_spirit::write(json_map, ss, json_spirit::pretty_print);
7c673cae
FG
7059}
7060
7061bool OSD::heartbeat_dispatch(Message *m)
7062{
7063 dout(30) << "heartbeat_dispatch " << m << dendl;
7064 switch (m->get_type()) {
7065
7066 case CEPH_MSG_PING:
7067 dout(10) << "ping from " << m->get_source_inst() << dendl;
7068 m->put();
7069 break;
7070
7071 case MSG_OSD_PING:
7072 handle_osd_ping(static_cast<MOSDPing*>(m));
7073 break;
7074
7075 default:
7076 dout(0) << "dropping unexpected message " << *m << " from " << m->get_source_inst() << dendl;
7077 m->put();
7078 }
7079
7080 return true;
7081}
7082
7083bool OSD::ms_dispatch(Message *m)
7084{
7085 dout(20) << "OSD::ms_dispatch: " << *m << dendl;
7086 if (m->get_type() == MSG_OSD_MARK_ME_DOWN) {
7087 service.got_stop_ack();
7088 m->put();
7089 return true;
7090 }
7091
7092 // lock!
7093
7094 osd_lock.Lock();
7095 if (is_stopping()) {
7096 osd_lock.Unlock();
7097 m->put();
7098 return true;
7099 }
7100
7101 do_waiters();
7102 _dispatch(m);
7103
7104 osd_lock.Unlock();
7105
7106 return true;
7107}
7108
7109void OSD::maybe_share_map(
7110 Session *session,
7111 OpRequestRef op,
7112 OSDMapRef osdmap)
7113{
7114 if (!op->check_send_map) {
7115 return;
7116 }
7117 epoch_t last_sent_epoch = 0;
7118
7119 session->sent_epoch_lock.lock();
7120 last_sent_epoch = session->last_sent_epoch;
7121 session->sent_epoch_lock.unlock();
7122
11fdf7f2
TL
7123 // assume the peer has the newer of the op's sent_epoch and what
7124 // we think we sent them.
7125 epoch_t from = std::max(last_sent_epoch, op->sent_epoch);
7126
7c673cae
FG
7127 const Message *m = op->get_req();
7128 service.share_map(
7129 m->get_source(),
7130 m->get_connection().get(),
11fdf7f2 7131 from,
7c673cae
FG
7132 osdmap,
7133 session ? &last_sent_epoch : NULL);
7134
7135 session->sent_epoch_lock.lock();
7136 if (session->last_sent_epoch < last_sent_epoch) {
7137 session->last_sent_epoch = last_sent_epoch;
7138 }
7139 session->sent_epoch_lock.unlock();
7140
7141 op->check_send_map = false;
7142}
7143
// Drain the session's queue of ops that were waiting for a newer OSDMap,
// enqueueing each op whose min_epoch is now satisfied by 'osdmap'.
// Caller must hold session->session_dispatch_lock.  Ops are held on an
// intrusive list, hence the manual get()/put() refcounting around
// erase/enqueue.
void OSD::dispatch_session_waiting(SessionRef session, OSDMapRef osdmap)
{
  ceph_assert(session->session_dispatch_lock.is_locked());

  auto i = session->waiting_on_map.begin();
  while (i != session->waiting_on_map.end()) {
    OpRequestRef op = &(*i);
    ceph_assert(ms_can_fast_dispatch(op->get_req()));
    const MOSDFastDispatchOp *m = static_cast<const MOSDFastDispatchOp*>(
      op->get_req());
    if (m->get_min_epoch() > osdmap->get_epoch()) {
      // list is processed in arrival order; stop at the first op that
      // still needs a newer map to preserve delivery order
      break;
    }
    session->waiting_on_map.erase(i++);
    // drop the reference that was taken when the op was queued on the
    // intrusive list; 'op' still holds its own ref
    op->put();

    spg_t pgid;
    if (m->get_type() == CEPH_MSG_OSD_OP) {
      // legacy MOSDOp carries only a raw pg; resolve it to a primary
      // shard under the current map
      pg_t actual_pgid = osdmap->raw_pg_to_pg(
	static_cast<const MOSDOp*>(m)->get_pg());
      if (!osdmap->get_primary_shard(actual_pgid, &pgid)) {
	// no primary shard for this pg in this map — drop the op; the
	// client will resend when it sees a newer map
	continue;
      }
    } else {
      pgid = m->get_spg();
    }
    enqueue_op(pgid, std::move(op), m->get_map_epoch());
  }

  // (de)register this session with the map-waiters set depending on
  // whether anything is still blocked
  if (session->waiting_on_map.empty()) {
    clear_session_waiting_on_map(session);
  } else {
    register_session_waiting_on_map(session);
  }
}
7179
// Fast dispatch entry point: handles messages that can be processed
// without taking osd_lock.  Control/peering messages are routed to their
// handlers directly; client/replica ops are wrapped in an OpRequest and
// queued to the op shards.
void OSD::ms_fast_dispatch(Message *m)
{
  FUNCTRACE(cct);
  if (service.is_stopping()) {
    m->put();
    return;
  }

  // peering event?
  switch (m->get_type()) {
  case CEPH_MSG_PING:
    dout(10) << "ping from " << m->get_source() << dendl;
    m->put();
    return;
  case MSG_MON_COMMAND:
    handle_command(static_cast<MMonCommand*>(m));
    return;
  case MSG_OSD_FORCE_RECOVERY:
    handle_fast_force_recovery(static_cast<MOSDForceRecovery*>(m));
    return;
  case MSG_OSD_SCRUB2:
    handle_fast_scrub(static_cast<MOSDScrub2*>(m));
    return;

  case MSG_OSD_PG_CREATE2:
    return handle_fast_pg_create(static_cast<MOSDPGCreate2*>(m));
  case MSG_OSD_PG_QUERY:
    return handle_fast_pg_query(static_cast<MOSDPGQuery*>(m));
  case MSG_OSD_PG_NOTIFY:
    return handle_fast_pg_notify(static_cast<MOSDPGNotify*>(m));
  case MSG_OSD_PG_INFO:
    return handle_fast_pg_info(static_cast<MOSDPGInfo*>(m));
  case MSG_OSD_PG_REMOVE:
    return handle_fast_pg_remove(static_cast<MOSDPGRemove*>(m));

  // these are single-pg messages that handle themselves
  case MSG_OSD_PG_LOG:
  case MSG_OSD_PG_TRIM:
  case MSG_OSD_BACKFILL_RESERVE:
  case MSG_OSD_RECOVERY_RESERVE:
    {
      MOSDPeeringOp *pm = static_cast<MOSDPeeringOp*>(m);
      if (require_osd_peer(pm)) {
	enqueue_peering_evt(
	  pm->get_spg(),
	  PGPeeringEventRef(pm->get_event()));
      }
      pm->put();
      return;
    }
  }

  // everything else is a client/replica op: track it
  OpRequestRef op = op_tracker.create_request<OpRequest, Message*>(m);
  {
#ifdef WITH_LTTNG
    osd_reqid_t reqid = op->get_reqid();
#endif
    tracepoint(osd, ms_fast_dispatch, reqid.name._type,
	       reqid.name._num, reqid.tid, reqid.inc);
  }

  if (m->trace)
    op->osd_trace.init("osd op", &trace_endpoint, &m->trace);

  // note sender epoch, min req's epoch
  op->sent_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch();
  op->min_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_min_epoch();
  ceph_assert(op->min_epoch <= op->sent_epoch); // sanity check!

  service.maybe_inject_dispatch_delay();

  if (m->get_connection()->has_features(CEPH_FEATUREMASK_RESEND_ON_SPLIT) ||
      m->get_type() != CEPH_MSG_OSD_OP) {
    // queue it directly
    enqueue_op(
      static_cast<MOSDFastDispatchOp*>(m)->get_spg(),
      std::move(op),
      static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch());
  } else {
    // legacy client, and this is an MOSDOp (the *only* fast dispatch
    // message that didn't have an explicit spg_t); we need to map
    // them to an spg_t while preserving delivery order.
    auto priv = m->get_connection()->get_priv();
    if (auto session = static_cast<Session*>(priv.get()); session) {
      std::lock_guard l{session->session_dispatch_lock};
      // extra ref for the intrusive waiting_on_map list
      op->get();
      session->waiting_on_map.push_back(*op);
      OSDMapRef nextmap = service.get_nextmap_reserved();
      dispatch_session_waiting(session, nextmap);
      service.release_map(nextmap);
    }
  }
  OID_EVENT_TRACE_WITH_MSG(m, "MS_FAST_DISPATCH_END", false);
}
7274
11fdf7f2 7275bool OSD::ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer)
7c673cae
FG
7276{
7277 dout(10) << "OSD::ms_get_authorizer type=" << ceph_entity_type_name(dest_type) << dendl;
7278
31f18b77
FG
7279 if (is_stopping()) {
7280 dout(10) << __func__ << " bailing, we are shutting down" << dendl;
7281 return false;
7282 }
7283
7c673cae
FG
7284 if (dest_type == CEPH_ENTITY_TYPE_MON)
7285 return true;
7286
7c673cae
FG
7287 *authorizer = monc->build_authorizer(dest_type);
7288 return *authorizer != NULL;
7289}
7290
11fdf7f2
TL
7291KeyStore *OSD::ms_get_auth1_authorizer_keystore()
7292{
7293 return monc->rotating_secrets.get();
7294}
7c673cae 7295
// Messenger callback after a peer authenticates: attach (or reuse) a
// Session on the connection and parse the peer's capability string into
// session caps.
//
// Returns 1 when caps were parsed successfully, 0 when there were no
// caps to parse, and -EPERM when the caps string could not be decoded
// or parsed.
int OSD::ms_handle_authentication(Connection *con)
{
  int ret = 0;
  auto priv = con->get_priv();
  Session *s = static_cast<Session*>(priv.get());
  if (!s) {
    // first time we see this connection: create its session
    s = new Session(cct, con);
    // 'false' — hand our new ref to the connection rather than taking another
    con->set_priv(RefCountedPtr{s, false});
    s->entity_name = con->get_peer_entity_name();
    dout(10) << __func__ << " new session " << s << " con " << s->con
	     << " entity " << s->entity_name
	     << " addr " << con->get_peer_addrs() << dendl;
  } else {
    dout(10) << __func__ << " existing session " << s << " con " << s->con
	     << " entity " << s->entity_name
	     << " addr " << con->get_peer_addrs() << dendl;
  }

  AuthCapsInfo &caps_info = con->get_peer_caps_info();
  if (caps_info.allow_all)
    s->caps.set_allow_all();

  if (caps_info.caps.length() > 0) {
    bufferlist::const_iterator p = caps_info.caps.cbegin();
    string str;
    try {
      decode(str, p);
    }
    catch (buffer::error& e) {
      // corrupt/truncated caps blob from the peer
      dout(10) << __func__ << " session " << s << " " << s->entity_name
	       << " failed to decode caps string" << dendl;
      ret = -EPERM;
    }
    if (!ret) {
      bool success = s->caps.parse(str);
      if (success) {
	dout(10) << __func__ << " session " << s
		 << " " << s->entity_name
		 << " has caps " << s->caps << " '" << str << "'" << dendl;
	ret = 1;
      } else {
	dout(10) << __func__ << " session " << s << " " << s->entity_name
		 << " failed to parse caps '" << str << "'" << dendl;
	ret = -EPERM;
      }
    }
  }
  return ret;
}
7345
7346void OSD::do_waiters()
7347{
11fdf7f2 7348 ceph_assert(osd_lock.is_locked());
7c673cae
FG
7349
7350 dout(10) << "do_waiters -- start" << dendl;
7351 while (!finished.empty()) {
7352 OpRequestRef next = finished.front();
7353 finished.pop_front();
7354 dispatch_op(next);
7355 }
7356 dout(10) << "do_waiters -- finish" << dendl;
7357}
7358
7359void OSD::dispatch_op(OpRequestRef op)
7360{
7361 switch (op->get_req()->get_type()) {
7362
7363 case MSG_OSD_PG_CREATE:
7364 handle_pg_create(op);
7365 break;
7c673cae
FG
7366 }
7367}
7368
// Slow-path message handler, called with osd_lock held.  Note the
// asymmetry in the switch: most cases 'break' (fall out of the function),
// but MSG_COMMAND 'return's because handle_command takes ownership and
// there is nothing further to do.
void OSD::_dispatch(Message *m)
{
  ceph_assert(osd_lock.is_locked());
  dout(20) << "_dispatch " << m << " " << *m << dendl;

  switch (m->get_type()) {
  // -- don't need OSDMap --

  // map and replication
  case CEPH_MSG_OSD_MAP:
    handle_osd_map(static_cast<MOSDMap*>(m));
    break;

  // osd
  case MSG_OSD_SCRUB:
    handle_scrub(static_cast<MOSDScrub*>(m));
    break;

  case MSG_COMMAND:
    handle_command(static_cast<MCommand*>(m));
    return;

  // -- need OSDMap --

  case MSG_OSD_PG_CREATE:
    {
      OpRequestRef op = op_tracker.create_request<OpRequest, Message*>(m);
      if (m->trace)
	op->osd_trace.init("osd op", &trace_endpoint, &m->trace);
      // no map? starting up?
      if (!osdmap) {
	// defer until we get our first OSDMap
	dout(7) << "no OSDMap, not booted" << dendl;
	logger->inc(l_osd_waiting_for_map);
	waiting_for_osdmap.push_back(op);
	op->mark_delayed("no osdmap");
	break;
      }

      // need OSDMap
      dispatch_op(op);
    }
  }
}
7412
// remove me post-nautilus
//
// Legacy (pre-MOSDScrub2) scrub request from mon/mgr: queue a
// RequestScrub peering event for every pg named in the message — or for
// all of our pgs when the message names none.  Only pgs for which we are
// the primary shard are scheduled.
void OSD::handle_scrub(MOSDScrub *m)
{
  dout(10) << "handle_scrub " << *m << dendl;
  if (!require_mon_or_mgr_peer(m)) {
    m->put();
    return;
  }
  if (m->fsid != monc->get_fsid()) {
    // message from a different cluster; ignore
    dout(0) << "handle_scrub fsid " << m->fsid << " != " << monc->get_fsid()
	    << dendl;
    m->put();
    return;
  }

  vector<spg_t> spgs;
  _get_pgids(&spgs);

  if (!m->scrub_pgs.empty()) {
    // restrict to the requested pgs that we actually host as primary
    vector<spg_t> v;
    for (auto pgid : m->scrub_pgs) {
      spg_t pcand;
      if (osdmap->get_primary_shard(pgid, &pcand) &&
	  std::find(spgs.begin(), spgs.end(), pcand) != spgs.end()) {
	v.push_back(pcand);
      }
    }
    spgs.swap(v);
  }

  for (auto pgid : spgs) {
    enqueue_peering_evt(
      pgid,
      PGPeeringEventRef(
	std::make_shared<PGPeeringEvent>(
	  get_osdmap_epoch(),
	  get_osdmap_epoch(),
	  PG::RequestScrub(m->deep, m->repair))));
  }

  m->put();
}
7455
11fdf7f2
TL
7456void OSD::handle_fast_scrub(MOSDScrub2 *m)
7457{
7458 dout(10) << __func__ << " " << *m << dendl;
7459 if (!require_mon_or_mgr_peer(m)) {
7460 m->put();
7461 return;
7462 }
7463 if (m->fsid != monc->get_fsid()) {
7464 dout(0) << __func__ << " fsid " << m->fsid << " != " << monc->get_fsid()
7465 << dendl;
7466 m->put();
7467 return;
7468 }
7469 for (auto pgid : m->scrub_pgs) {
7470 enqueue_peering_evt(
7471 pgid,
7472 PGPeeringEventRef(
7473 std::make_shared<PGPeeringEvent>(
7474 m->epoch,
7475 m->epoch,
7476 PG::RequestScrub(m->deep, m->repair))));
7477 }
7478 m->put();
7479}
7480
7c673cae
FG
7481bool OSD::scrub_random_backoff()
7482{
7483 bool coin_flip = (rand() / (double)RAND_MAX >=
7484 cct->_conf->osd_scrub_backoff_ratio);
7485 if (!coin_flip) {
7486 dout(20) << "scrub_random_backoff lost coin flip, randomly backing off" << dendl;
7487 return true;
7488 }
7489 return false;
7490}
7491
// Construct a scrub scheduling entry for pg 'pg'.
//
// 'timestamp' is the base time (last scrub stamp).  Pool-level intervals
// override the global osd_scrub_{min,max}_interval config when > 0.
// For a non-mandatory scrub ('must' == false), sched_time is pushed out
// by the min interval plus a random fraction of it (to spread scrub load),
// and 'deadline' is set to timestamp + max interval (or cleared to "no
// deadline" when the max interval is 0).  A mandatory scrub keeps
// sched_time == deadline == timestamp.
OSDService::ScrubJob::ScrubJob(CephContext* cct,
			       const spg_t& pg, const utime_t& timestamp,
			       double pool_scrub_min_interval,
			       double pool_scrub_max_interval, bool must)
  : cct(cct),
    pgid(pg),
    sched_time(timestamp),
    deadline(timestamp)
{
  // if not explicitly requested, postpone the scrub with a random delay
  if (!must) {
    // pool-specific intervals take precedence over the osd-wide config
    double scrub_min_interval = pool_scrub_min_interval > 0 ?
      pool_scrub_min_interval : cct->_conf->osd_scrub_min_interval;
    double scrub_max_interval = pool_scrub_max_interval > 0 ?
      pool_scrub_max_interval : cct->_conf->osd_scrub_max_interval;

    sched_time += scrub_min_interval;
    // add jitter: up to randomize_ratio * min_interval extra delay
    double r = rand() / (double)RAND_MAX;
    sched_time +=
      scrub_min_interval * cct->_conf->osd_scrub_interval_randomize_ratio * r;
    if (scrub_max_interval == 0) {
      // max interval of 0 means "no deadline"
      deadline = utime_t();
    } else {
      deadline += scrub_max_interval;
    }
  }
}
7520
7521bool OSDService::ScrubJob::ScrubJob::operator<(const OSDService::ScrubJob& rhs) const {
7522 if (sched_time < rhs.sched_time)
7523 return true;
7524 if (sched_time > rhs.sched_time)
7525 return false;
7526 return pgid < rhs.pgid;
7527}
7528
7529bool OSD::scrub_time_permit(utime_t now)
7530{
7531 struct tm bdt;
7532 time_t tt = now.sec();
7533 localtime_r(&tt, &bdt);
28e407b8
AA
7534
7535 bool day_permit = false;
7536 if (cct->_conf->osd_scrub_begin_week_day < cct->_conf->osd_scrub_end_week_day) {
7537 if (bdt.tm_wday >= cct->_conf->osd_scrub_begin_week_day && bdt.tm_wday < cct->_conf->osd_scrub_end_week_day) {
7538 day_permit = true;
7539 }
7540 } else {
7541 if (bdt.tm_wday >= cct->_conf->osd_scrub_begin_week_day || bdt.tm_wday < cct->_conf->osd_scrub_end_week_day) {
7542 day_permit = true;
7543 }
7544 }
7545
7546 if (!day_permit) {
7547 dout(20) << __func__ << " should run between week day " << cct->_conf->osd_scrub_begin_week_day
7548 << " - " << cct->_conf->osd_scrub_end_week_day
7549 << " now " << bdt.tm_wday << " = no" << dendl;
7550 return false;
7551 }
7552
7c673cae
FG
7553 bool time_permit = false;
7554 if (cct->_conf->osd_scrub_begin_hour < cct->_conf->osd_scrub_end_hour) {
7555 if (bdt.tm_hour >= cct->_conf->osd_scrub_begin_hour && bdt.tm_hour < cct->_conf->osd_scrub_end_hour) {
7556 time_permit = true;
7557 }
7558 } else {
7559 if (bdt.tm_hour >= cct->_conf->osd_scrub_begin_hour || bdt.tm_hour < cct->_conf->osd_scrub_end_hour) {
7560 time_permit = true;
7561 }
7562 }
7563 if (!time_permit) {
7564 dout(20) << __func__ << " should run between " << cct->_conf->osd_scrub_begin_hour
7565 << " - " << cct->_conf->osd_scrub_end_hour
7566 << " now " << bdt.tm_hour << " = no" << dendl;
7567 } else {
7568 dout(20) << __func__ << " should run between " << cct->_conf->osd_scrub_begin_hour
7569 << " - " << cct->_conf->osd_scrub_end_hour
7570 << " now " << bdt.tm_hour << " = yes" << dendl;
7571 }
7572 return time_permit;
7573}
7574
7575bool OSD::scrub_load_below_threshold()
7576{
7577 double loadavgs[3];
7578 if (getloadavg(loadavgs, 3) != 3) {
7579 dout(10) << __func__ << " couldn't read loadavgs\n" << dendl;
7580 return false;
7581 }
7582
7583 // allow scrub if below configured threshold
91327a77
AA
7584 long cpus = sysconf(_SC_NPROCESSORS_ONLN);
7585 double loadavg_per_cpu = cpus > 0 ? loadavgs[0] / cpus : loadavgs[0];
7586 if (loadavg_per_cpu < cct->_conf->osd_scrub_load_threshold) {
7587 dout(20) << __func__ << " loadavg per cpu " << loadavg_per_cpu
7c673cae
FG
7588 << " < max " << cct->_conf->osd_scrub_load_threshold
7589 << " = yes" << dendl;
7590 return true;
7591 }
7592
7593 // allow scrub if below daily avg and currently decreasing
7594 if (loadavgs[0] < daily_loadavg && loadavgs[0] < loadavgs[2]) {
7595 dout(20) << __func__ << " loadavg " << loadavgs[0]
7596 << " < daily_loadavg " << daily_loadavg
7597 << " and < 15m avg " << loadavgs[2]
7598 << " = yes" << dendl;
7599 return true;
7600 }
7601
7602 dout(20) << __func__ << " loadavg " << loadavgs[0]
7603 << " >= max " << cct->_conf->osd_scrub_load_threshold
7604 << " and ( >= daily_loadavg " << daily_loadavg
7605 << " or >= 15m avg " << loadavgs[2]
7606 << ") = no" << dendl;
7607 return false;
7608}
7609
// Walk the scrub-job queue (ordered by sched_time) and try to start scrubs.
// Bails out early when no scrub slot is available or recovery is active and
// osd_scrub_during_recovery is off.  Within the loop:
//   - break: no later job can start either (jobs not yet due, a pg holding a
//     reservation, or a scrub successfully kicked off this tick);
//   - continue: only this job is blocked, keep examining later jobs.
void OSD::sched_scrub()
{
  // if not permitted, fail fast
  if (!service.can_inc_scrubs_pending()) {
    return;
  }
  if (!cct->_conf->osd_scrub_during_recovery && service.is_recovery_active()) {
    dout(20) << __func__ << " not scheduling scrubs due to active recovery" << dendl;
    return;
  }

  utime_t now = ceph_clock_now();
  bool time_permit = scrub_time_permit(now);
  bool load_is_low = scrub_load_below_threshold();
  dout(20) << "sched_scrub load_is_low=" << (int)load_is_low << dendl;

  OSDService::ScrubJob scrub;
  if (service.first_scrub_stamp(&scrub)) {
    do {
      dout(30) << "sched_scrub examine " << scrub.pgid << " at " << scrub.sched_time << dendl;

      if (scrub.sched_time > now) {
	// save ourselves some effort
	// (queue is sorted by sched_time, so every later job is also in
	// the future)
	dout(10) << "sched_scrub " << scrub.pgid << " scheduled at " << scrub.sched_time
		 << " > " << now << dendl;
	break;
      }

      // a job past its (non-zero) deadline runs regardless of time/load;
      // otherwise both the time window and low load are required
      if ((scrub.deadline.is_zero() || scrub.deadline >= now) && !(time_permit && load_is_low)) {
	dout(10) << __func__ << " not scheduling scrub for " << scrub.pgid << " due to "
		 << (!time_permit ? "time not permit" : "high load") << dendl;
	continue;
      }

      // _lookup_lock_pg returns the pg locked; every path below must unlock
      PGRef pg = _lookup_lock_pg(scrub.pgid);
      if (!pg)
	continue;
      // This has already started, so go on to the next scrub job
      if (pg->scrubber.active) {
	pg->unlock();
	dout(30) << __func__ << ": already in progress pgid " << scrub.pgid << dendl;
	continue;
      }
      // If it is reserving, let it resolve before going to the next scrub job
      if (pg->scrubber.reserved) {
	pg->unlock();
	dout(30) << __func__ << ": reserve in progress pgid " << scrub.pgid << dendl;
	break;
      }
      dout(10) << "sched_scrub scrubbing " << scrub.pgid << " at " << scrub.sched_time
	       << (pg->get_must_scrub() ? ", explicitly requested" :
		   (load_is_low ? ", load_is_low" : " deadline < now"))
	       << dendl;
      // sched_scrub() returning true means the scrub was kicked off; stop
      // here — only one scrub is started per call
      if (pg->sched_scrub()) {
	pg->unlock();
	break;
      }
      pg->unlock();
    } while (service.next_scrub_stamp(scrub, &scrub));
  }
  dout(20) << "sched_scrub done" << dendl;
}
7673
494da23a
TL
7674void OSD::resched_all_scrubs()
7675{
7676 dout(10) << __func__ << ": start" << dendl;
7677 OSDService::ScrubJob scrub;
7678 if (service.first_scrub_stamp(&scrub)) {
7679 do {
7680 dout(20) << __func__ << ": examine " << scrub.pgid << dendl;
7681
7682 PGRef pg = _lookup_lock_pg(scrub.pgid);
7683 if (!pg)
7684 continue;
7685 if (!pg->scrubber.must_scrub && !pg->scrubber.need_auto) {
7686 dout(20) << __func__ << ": reschedule " << scrub.pgid << dendl;
7687 pg->on_info_history_change();
7688 }
7689 pg->unlock();
7690 } while (service.next_scrub_stamp(scrub, &scrub));
7691 }
7692 dout(10) << __func__ << ": done" << dendl;
7693}
7694
11fdf7f2
TL
7695MPGStats* OSD::collect_pg_stats()
7696{
7697 // This implementation unconditionally sends every is_primary PG's
7698 // stats every time we're called. This has equivalent cost to the
7699 // previous implementation's worst case where all PGs are busy and
7700 // their stats are always enqueued for sending.
7701 RWLock::RLocker l(map_lock);
7702
7703 utime_t had_for = ceph_clock_now() - had_map_since;
7704 osd_stat_t cur_stat = service.get_osd_stat();
7705 cur_stat.os_perf_stat = store->get_cur_stats();
7706
7707 auto m = new MPGStats(monc->get_fsid(), osdmap->get_epoch(), had_for);
7708 m->osd_stat = cur_stat;
7709
7710 std::lock_guard lec{min_last_epoch_clean_lock};
7711 min_last_epoch_clean = osdmap->get_epoch();
7712 min_last_epoch_clean_pgs.clear();
7713
7714 std::set<int64_t> pool_set;
7715 vector<PGRef> pgs;
7716 _get_pgs(&pgs);
7717 for (auto& pg : pgs) {
7718 auto pool = pg->pg_id.pgid.pool();
7719 pool_set.emplace((int64_t)pool);
7720 if (!pg->is_primary()) {
7721 continue;
7722 }
7723 pg->get_pg_stats([&](const pg_stat_t& s, epoch_t lec) {
7724 m->pg_stat[pg->pg_id.pgid] = s;
7725 min_last_epoch_clean = min(min_last_epoch_clean, lec);
7726 min_last_epoch_clean_pgs.push_back(pg->pg_id.pgid);
7727 });
7728 }
7729 store_statfs_t st;
81eedcae 7730 bool per_pool_stats = false;
11fdf7f2
TL
7731 for (auto p : pool_set) {
7732 int r = store->pool_statfs(p, &st);
7733 if (r == -ENOTSUP) {
7734 break;
7735 } else {
7736 assert(r >= 0);
7737 m->pool_stat[p] = st;
81eedcae 7738 per_pool_stats = true;
11fdf7f2
TL
7739 }
7740 }
7c673cae 7741
81eedcae
TL
7742 // indicate whether we are reporting per-pool stats
7743 m->osd_stat.num_osds = 1;
7744 m->osd_stat.num_per_pool_osds = per_pool_stats ? 1 : 0;
7745
11fdf7f2
TL
7746 return m;
7747}
7c673cae 7748
// Collect daemon health metrics reported to the mgr: the count/age of slow
// ops currently in flight, and the number of pending pg creations for which
// we are (or will be) primary.
vector<DaemonHealthMetric> OSD::get_health_metrics()
{
  vector<DaemonHealthMetric> metrics;
  {
    utime_t oldest_secs;
    const utime_t now = ceph_clock_now();
    auto too_old = now;
    too_old -= cct->_conf.get_val<double>("osd_op_complaint_time");
    int slow = 0;
    TrackedOpRef oldest_op;
    // counts ops initiated before the complaint cutoff and remembers the
    // oldest one; returning true tells the tracker the op was "visited"
    auto count_slow_ops = [&](TrackedOp& op) {
      if (op.get_initiated() < too_old) {
	lgeneric_subdout(cct,osd,20) << "slow op " << op.get_desc()
				     << " initiated "
				     << op.get_initiated() << dendl;
	slow++;
	if (!oldest_op || op.get_initiated() < oldest_op->get_initiated()) {
	  // taking a ref keeps the op alive for the derr below
	  oldest_op = &op;
	}
	return true;
      } else {
	return false;
      }
    };
    if (op_tracker.visit_ops_in_flight(&oldest_secs, count_slow_ops)) {
      if (slow) {
	derr << __func__ << " reporting " << slow << " slow ops, oldest is "
	     << oldest_op->get_desc() << dendl;
      }
      metrics.emplace_back(daemon_metric::SLOW_OPS, slow, oldest_secs);
    } else {
      // no news is not good news.
      metrics.emplace_back(daemon_metric::SLOW_OPS, 0, 0);
    }
  }
  {
    std::lock_guard l(pending_creates_lock);
    // mon-requested creates are all primary; osd-requested ones only count
    // when flagged as primary (the bool in the pair)
    auto n_primaries = pending_creates_from_mon;
    for (const auto& create : pending_creates_from_osd) {
      if (create.second) {
	n_primaries++;
      }
    }
    metrics.emplace_back(daemon_metric::PENDING_CREATING_PGS, n_primaries);
  }
  return metrics;
}
7796
7c673cae
FG
7797// =====================================================
7798// MAP
7799
7800void OSD::wait_for_new_map(OpRequestRef op)
7801{
7802 // ask?
7803 if (waiting_for_osdmap.empty()) {
7804 osdmap_subscribe(osdmap->get_epoch() + 1, false);
7805 }
7806
7807 logger->inc(l_osd_waiting_for_map);
7808 waiting_for_osdmap.push_back(op);
7809 op->mark_delayed("wait for new map");
7810}
7811
7812
7813/** update_map
7814 * assimilate new OSDMap(s). scan pgs, etc.
7815 */
7816
7817void OSD::note_down_osd(int peer)
7818{
11fdf7f2
TL
7819 ceph_assert(osd_lock.is_locked());
7820 cluster_messenger->mark_down_addrs(osdmap->get_cluster_addrs(peer));
7c673cae
FG
7821
7822 heartbeat_lock.Lock();
7823 failure_queue.erase(peer);
7824 failure_pending.erase(peer);
7825 map<int,HeartbeatInfo>::iterator p = heartbeat_peers.find(peer);
7826 if (p != heartbeat_peers.end()) {
7827 p->second.con_back->mark_down();
7828 if (p->second.con_front) {
7829 p->second.con_front->mark_down();
7830 }
7831 heartbeat_peers.erase(p);
7832 }
7833 heartbeat_lock.Unlock();
7834}
7835
// A peer OSD came (back) up in the new map: forget the epoch state we had
// cached for it (valid only through the previous epoch) and flag the
// heartbeat peer set for a refresh.
void OSD::note_up_osd(int peer)
{
  service.forget_peer_epoch(peer, osdmap->get_epoch() - 1);
  heartbeat_set_peers_need_update();
}
7841
// Transaction-commit callback for handle_osd_map(): once the maps for
// epochs [first, last] are durable, hand control to _committed_osd_maps()
// and then drop the message reference held across the commit.
struct C_OnMapCommit : public Context {
  OSD *osd;
  epoch_t first, last;  // inclusive range of epochs just persisted
  MOSDMap *msg;         // ref owned by this context; released in finish()
  C_OnMapCommit(OSD *o, epoch_t f, epoch_t l, MOSDMap *m)
    : osd(o), first(f), last(l), msg(m) {}
  void finish(int r) override {
    osd->_committed_osd_maps(first, last, msg);
    msg->put();
  }
};
7853
7c673cae
FG
7854void OSD::osdmap_subscribe(version_t epoch, bool force_request)
7855{
11fdf7f2 7856 std::lock_guard l(osdmap_subscribe_lock);
181888fb 7857 if (latest_subscribed_epoch >= epoch && !force_request)
7c673cae
FG
7858 return;
7859
11fdf7f2 7860 latest_subscribed_epoch = std::max<uint64_t>(epoch, latest_subscribed_epoch);
181888fb 7861
7c673cae
FG
7862 if (monc->sub_want_increment("osdmap", epoch, CEPH_SUBSCRIBE_ONETIME) ||
7863 force_request) {
7864 monc->renew_subs();
7865 }
7866}
7867
// Delete stored osdmaps older than min(oldest, lowest epoch still pinned in
// the map cache), advancing superblock.oldest_map as we go.  Deletions are
// batched into transactions of at most osd_target_transaction_size removals
// (and at least nreceived, to keep pace with incoming maps).
void OSD::trim_maps(epoch_t oldest, int nreceived, bool skip_maps)
{
  // never trim past what the map cache still has pinned
  epoch_t min = std::min(oldest, service.map_cache.cached_key_lower_bound());
  if (min <= superblock.oldest_map)
    return;

  int num = 0;
  ObjectStore::Transaction t;
  for (epoch_t e = superblock.oldest_map; e < min; ++e) {
    dout(20) << " removing old osdmap epoch " << e << dendl;
    t.remove(coll_t::meta(), get_osdmap_pobject_name(e));
    t.remove(coll_t::meta(), get_inc_osdmap_pobject_name(e));
    superblock.oldest_map = e + 1;
    num++;
    if (num >= cct->_conf->osd_target_transaction_size && num >= nreceived) {
      // flush this batch: superblock must go in the same transaction so
      // oldest_map stays consistent with what is actually on disk
      service.publish_superblock(superblock);
      write_superblock(t);
      int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
      ceph_assert(tr == 0);
      num = 0;
      if (!skip_maps) {
	// skip_maps leaves us with a range of old maps if we fail to remove all
	// of them before moving superblock.oldest_map forward to the first map
	// in the incoming MOSDMap msg. so we should continue removing them in
	// this case, even we could do huge series of delete transactions all at
	// once.
	break;
      }
    }
  }
  if (num > 0) {
    // flush the final partial batch
    service.publish_superblock(superblock);
    write_superblock(t);
    int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
    ceph_assert(tr == 0);
  }
  // we should not remove the cached maps
  ceph_assert(min <= service.map_cache.cached_key_lower_bound());
}
7907
// Ingest an MOSDMap: validate the sender, decode and persist the new full /
// incremental maps, record pg_num changes and pool deletions, trim old maps,
// and queue the whole batch as one transaction whose commit callback
// (C_OnMapCommit) advances the OSD to the new epochs.  Consumes the message
// on every early-return path; otherwise ownership passes to the callback.
// Caller holds osd_lock.
void OSD::handle_osd_map(MOSDMap *m)
{
  // wait for pgs to catch up
  {
    // we extend the map cache pins to accomodate pgs slow to consume maps
    // for some period, until we hit the max_lag_factor bound, at which point
    // we block here to stop injesting more maps than they are able to keep
    // up with.
    epoch_t max_lag = cct->_conf->osd_map_cache_size *
      m_osd_pg_epoch_max_lag_factor;
    ceph_assert(max_lag > 0);
    epoch_t osd_min = 0;
    for (auto shard : shards) {
      epoch_t min = shard->get_min_pg_epoch();
      if (osd_min == 0 || min < osd_min) {
	osd_min = min;
      }
    }
    if (osd_min > 0 &&
	osdmap->get_epoch() > max_lag &&
	osdmap->get_epoch() - max_lag > osd_min) {
      epoch_t need = osdmap->get_epoch() - max_lag;
      dout(10) << __func__ << " waiting for pgs to catch up (need " << need
	       << " max_lag " << max_lag << ")" << dendl;
      for (auto shard : shards) {
	epoch_t min = shard->get_min_pg_epoch();
	if (need > min) {
	  dout(10) << __func__ << " waiting for pgs to consume " << need
		   << " (shard " << shard->shard_id << " min " << min
		   << ", map cache is " << cct->_conf->osd_map_cache_size
		   << ", max_lag_factor " << m_osd_pg_epoch_max_lag_factor
		   << ")" << dendl;
	  // drop osd_lock while blocked so pg work can make progress
	  unlock_guard unlock{osd_lock};
	  shard->wait_min_pg_epoch(need);
	}
      }
    }
  }

  ceph_assert(osd_lock.is_locked());
  map<epoch_t,OSDMapRef> added_maps;
  map<epoch_t,bufferlist> added_maps_bl;
  if (m->fsid != monc->get_fsid()) {
    dout(0) << "handle_osd_map fsid " << m->fsid << " != "
	    << monc->get_fsid() << dendl;
    m->put();
    return;
  }
  if (is_initializing()) {
    dout(0) << "ignoring osdmap until we have initialized" << dendl;
    m->put();
    return;
  }

  // only accept maps from mon or osd peers
  auto priv = m->get_connection()->get_priv();
  if (auto session = static_cast<Session *>(priv.get());
      session && !(session->entity_name.is_mon() ||
		   session->entity_name.is_osd())) {
    //not enough perms!
    dout(10) << "got osd map from Session " << session
	     << " which we can't take maps from (not a mon or osd)" << dendl;
    m->put();
    return;
  }

  // share with the objecter
  if (!is_preboot())
    service.objecter->handle_osd_map(m);

  epoch_t first = m->get_first();
  epoch_t last = m->get_last();
  dout(3) << "handle_osd_map epochs [" << first << "," << last << "], i have "
	  << superblock.newest_map
	  << ", src has [" << m->oldest_map << "," << m->newest_map << "]"
	  << dendl;

  logger->inc(l_osd_map);
  logger->inc(l_osd_mape, last - first + 1);
  if (first <= superblock.newest_map)
    logger->inc(l_osd_mape_dup, superblock.newest_map - first + 1);
  if (service.max_oldest_map < m->oldest_map) {
    service.max_oldest_map = m->oldest_map;
    ceph_assert(service.max_oldest_map >= superblock.oldest_map);
  }

  // make sure there is something new, here, before we bother flushing
  // the queues and such
  if (last <= superblock.newest_map) {
    dout(10) << " no new maps here, dropping" << dendl;
    m->put();
    return;
  }

  // missing some?
  bool skip_maps = false;
  if (first > superblock.newest_map + 1) {
    dout(10) << "handle_osd_map message skips epochs "
	     << superblock.newest_map + 1 << ".." << (first-1) << dendl;
    if (m->oldest_map <= superblock.newest_map + 1) {
      // the mon still has the gap epochs; re-request from where we left off
      osdmap_subscribe(superblock.newest_map + 1, false);
      m->put();
      return;
    }
    // always try to get the full range of maps--as many as we can. this
    // 1- is good to have
    // 2- is at present the only way to ensure that we get a *full* map as
    // the first map!
    if (m->oldest_map < first) {
      osdmap_subscribe(m->oldest_map - 1, true);
      m->put();
      return;
    }
    skip_maps = true;
  }

  ObjectStore::Transaction t;
  uint64_t txn_size = 0;

  // store new maps: queue for disk and put in the osdmap cache
  epoch_t start = std::max(superblock.newest_map + 1, first);
  for (epoch_t e = start; e <= last; e++) {
    // sanity: the transaction must strictly grow each iteration; a
    // non-increase indicates byte-count overflow/wrap
    if (txn_size >= t.get_num_bytes()) {
      derr << __func__ << " transaction size overflowed" << dendl;
      ceph_assert(txn_size < t.get_num_bytes());
    }
    txn_size = t.get_num_bytes();
    map<epoch_t,bufferlist>::iterator p;
    p = m->maps.find(e);
    if (p != m->maps.end()) {
      // full map supplied: persist and cache it directly
      dout(10) << "handle_osd_map got full map for epoch " << e << dendl;
      OSDMap *o = new OSDMap;
      bufferlist& bl = p->second;

      o->decode(bl);

      ghobject_t fulloid = get_osdmap_pobject_name(e);
      t.write(coll_t::meta(), fulloid, 0, bl.length(), bl);
      added_maps[e] = add_map(o);
      added_maps_bl[e] = bl;
      got_full_map(e);
      continue;
    }

    p = m->incremental_maps.find(e);
    if (p != m->incremental_maps.end()) {
      // incremental: apply on top of e-1 (from disk or from this batch)
      // and persist both the incremental and the regenerated full map
      dout(10) << "handle_osd_map got inc map for epoch " << e << dendl;
      bufferlist& bl = p->second;
      ghobject_t oid = get_inc_osdmap_pobject_name(e);
      t.write(coll_t::meta(), oid, 0, bl.length(), bl);

      OSDMap *o = new OSDMap;
      if (e > 1) {
	bufferlist obl;
	bool got = get_map_bl(e - 1, obl);
	if (!got) {
	  auto p = added_maps_bl.find(e - 1);
	  ceph_assert(p != added_maps_bl.end());
	  obl = p->second;
	}
	o->decode(obl);
      }

      OSDMap::Incremental inc;
      auto p = bl.cbegin();
      inc.decode(p);

      if (o->apply_incremental(inc) < 0) {
	derr << "ERROR: bad fsid? i have " << osdmap->get_fsid() << " and inc has " << inc.fsid << dendl;
	ceph_abort_msg("bad fsid");
      }

      bufferlist fbl;
      o->encode(fbl, inc.encode_features | CEPH_FEATURE_RESERVED);

      bool injected_failure = false;
      if (cct->_conf->osd_inject_bad_map_crc_probability > 0 &&
	  (rand() % 10000) < cct->_conf->osd_inject_bad_map_crc_probability*10000.0) {
	derr << __func__ << " injecting map crc failure" << dendl;
	injected_failure = true;
      }

      if ((inc.have_crc && o->get_crc() != inc.full_crc) || injected_failure) {
	// our regenerated full map doesn't match the mon's crc; fall back
	// to fetching full maps for [e, last] and stop decoding here
	dout(2) << "got incremental " << e
		<< " but failed to encode full with correct crc; requesting"
		<< dendl;
	clog->warn() << "failed to encode map e" << e << " with expected crc";
	dout(20) << "my encoded map was:\n";
	fbl.hexdump(*_dout);
	*_dout << dendl;
	delete o;
	request_full_map(e, last);
	last = e - 1;
	break;
      }
      got_full_map(e);

      ghobject_t fulloid = get_osdmap_pobject_name(e);
      t.write(coll_t::meta(), fulloid, 0, fbl.length(), fbl);
      added_maps[e] = add_map(o);
      added_maps_bl[e] = fbl;
      continue;
    }

    ceph_abort_msg("MOSDMap lied about what maps it had?");
  }

  // even if this map isn't from a mon, we may have satisfied our subscription
  monc->sub_got("osdmap", last);

  if (!m->maps.empty() && requested_full_first) {
    dout(10) << __func__ << " still missing full maps " << requested_full_first
	     << ".." << requested_full_last << dendl;
    rerequest_full_maps();
  }

  if (superblock.oldest_map) {
    // make sure we at least keep pace with incoming maps
    trim_maps(m->oldest_map, last - first + 1, skip_maps);
    pg_num_history.prune(superblock.oldest_map);
  }

  if (!superblock.oldest_map || skip_maps)
    superblock.oldest_map = first;
  superblock.newest_map = last;
  superblock.current_epoch = last;

  // note in the superblock that we were clean thru the prior epoch
  epoch_t boot_epoch = service.get_boot_epoch();
  if (boot_epoch && boot_epoch >= superblock.mounted) {
    superblock.mounted = boot_epoch;
    superblock.clean_thru = last;
  }

  // check for pg_num changes and deleted pools
  OSDMapRef lastmap;
  for (auto& i : added_maps) {
    if (!lastmap) {
      if (!(lastmap = service.try_get_map(i.first - 1))) {
	dout(10) << __func__ << " can't get previous map " << i.first - 1
		 << " probably first start of this osd" << dendl;
	continue;
      }
    }
    ceph_assert(lastmap->get_epoch() + 1 == i.second->get_epoch());
    for (auto& j : lastmap->get_pools()) {
      if (!i.second->have_pg_pool(j.first)) {
	pg_num_history.log_pool_delete(i.first, j.first);
	dout(10) << __func__ << " recording final pg_pool_t for pool "
		 << j.first << dendl;
	// this information is needed by _make_pg() if have to restart before
	// the pool is deleted and need to instantiate a new (zombie) PG[Pool].
	ghobject_t obj = make_final_pool_info_oid(j.first);
	bufferlist bl;
	encode(j.second, bl, CEPH_FEATURES_ALL);
	string name = lastmap->get_pool_name(j.first);
	encode(name, bl);
	map<string,string> profile;
	if (lastmap->get_pg_pool(j.first)->is_erasure()) {
	  profile = lastmap->get_erasure_code_profile(
	    lastmap->get_pg_pool(j.first)->erasure_code_profile);
	}
	encode(profile, bl);
	t.write(coll_t::meta(), obj, 0, bl.length(), bl);
	service.store_deleted_pool_pg_num(j.first, j.second.get_pg_num());
      } else if (unsigned new_pg_num = i.second->get_pg_num(j.first);
		 new_pg_num != j.second.get_pg_num()) {
	dout(10) << __func__ << " recording pool " << j.first << " pg_num "
		 << j.second.get_pg_num() << " -> " << new_pg_num << dendl;
	pg_num_history.log_pg_num_change(i.first, j.first, new_pg_num);
      }
    }
    for (auto& j : i.second->get_pools()) {
      if (!lastmap->have_pg_pool(j.first)) {
	dout(10) << __func__ << " recording new pool " << j.first << " pg_num "
		 << j.second.get_pg_num() << dendl;
	pg_num_history.log_pg_num_change(i.first, j.first,
					 j.second.get_pg_num());
      }
    }
    lastmap = i.second;
  }
  pg_num_history.epoch = last;
  {
    bufferlist bl;
    ::encode(pg_num_history, bl);
    t.write(coll_t::meta(), make_pg_num_history_oid(), 0, bl.length(), bl);
    dout(20) << __func__ << " pg_num_history " << pg_num_history << dendl;
  }

  // superblock and commit
  write_superblock(t);
  // message ref is now owned by the commit callback
  t.register_on_commit(new C_OnMapCommit(this, start, last, m));
  store->queue_transaction(
    service.meta_ch,
    std::move(t));
  service.publish_superblock(superblock);
}
8205
// Runs after the maps for [first, last] are durable on disk: advance the
// in-memory osdmap epoch by epoch (noting peers going up/down), then react
// to what the final map says about us — transition booting->active, or, if
// the map marked us down / changed our addrs, rebind the messengers and
// restart (or shut down after too many markdowns).  Takes osd_lock and
// map_lock (write); m is still owned by the caller (C_OnMapCommit).
void OSD::_committed_osd_maps(epoch_t first, epoch_t last, MOSDMap *m)
{
  dout(10) << __func__ << " " << first << ".." << last << dendl;
  if (is_stopping()) {
    dout(10) << __func__ << " bailing, we are shutting down" << dendl;
    return;
  }
  std::lock_guard l(osd_lock);
  // re-check under the lock: shutdown may have raced with us
  if (is_stopping()) {
    dout(10) << __func__ << " bailing, we are shutting down" << dendl;
    return;
  }
  map_lock.get_write();

  bool do_shutdown = false;
  bool do_restart = false;
  bool network_error = false;

  // advance through the new maps
  for (epoch_t cur = first; cur <= last; cur++) {
    dout(10) << " advance to epoch " << cur
	     << " (<= last " << last
	     << " <= newest_map " << superblock.newest_map
	     << ")" << dendl;

    OSDMapRef newmap = get_map(cur);
    ceph_assert(newmap);  // we just cached it above!

    // start blacklisting messages sent to peers that go down.
    service.pre_publish_map(newmap);

    // kill connections to newly down osds
    bool waited_for_reservations = false;
    set<int> old;
    osdmap->get_all_osds(old);
    for (set<int>::iterator p = old.begin(); p != old.end(); ++p) {
      if (*p != whoami &&
	  osdmap->is_up(*p) && // in old map
	  newmap->is_down(*p)) { // but not the new one
	if (!waited_for_reservations) {
	  // drain in-flight map reservations once before the first teardown
	  service.await_reserved_maps();
	  waited_for_reservations = true;
	}
	note_down_osd(*p);
      } else if (*p != whoami &&
		 osdmap->is_down(*p) &&
		 newmap->is_up(*p)) {
	note_up_osd(*p);
      }
    }

    if (osdmap->is_noup(whoami) != newmap->is_noup(whoami)) {
      dout(10) << __func__ << " NOUP flag changed in " << newmap->get_epoch()
	       << dendl;
      if (is_booting()) {
	// this captures the case where we sent the boot message while
	// NOUP was being set on the mon and our boot request was
	// dropped, and then later it is cleared. it imperfectly
	// handles the case where our original boot message was not
	// dropped and we restart even though we might have booted, but
	// that is harmless (boot will just take slightly longer).
	do_restart = true;
      }
    }

    osdmap = newmap;
    epoch_t up_epoch;
    epoch_t boot_epoch;
    service.retrieve_epochs(&boot_epoch, &up_epoch, NULL);
    // first epoch in which this map shows us up at our current address
    if (!up_epoch &&
	osdmap->is_up(whoami) &&
	osdmap->get_addrs(whoami) == client_messenger->get_myaddrs()) {
      up_epoch = osdmap->get_epoch();
      dout(10) << "up_epoch is " << up_epoch << dendl;
      if (!boot_epoch) {
	boot_epoch = osdmap->get_epoch();
	dout(10) << "boot_epoch is " << boot_epoch << dendl;
      }
      service.set_epochs(&boot_epoch, &up_epoch, NULL);
    }
  }

  had_map_since = ceph_clock_now();

  // booting -> active once the map shows us up at our bound address
  epoch_t _bind_epoch = service.get_bind_epoch();
  if (osdmap->is_up(whoami) &&
      osdmap->get_addrs(whoami).legacy_equals(
	client_messenger->get_myaddrs()) &&
      _bind_epoch < osdmap->get_up_from(whoami)) {

    if (is_booting()) {
      dout(1) << "state: booting -> active" << dendl;
      set_state(STATE_ACTIVE);
      do_restart = false;

      // set incarnation so that osd_reqid_t's we generate for our
      // objecter requests are unique across restarts.
      service.objecter->set_client_incarnation(osdmap->get_epoch());
      cancel_pending_failures();
    }
  }

  if (osdmap->get_epoch() > 0 &&
      is_active()) {
    if (!osdmap->exists(whoami)) {
      dout(0) << "map says i do not exist. shutting down." << dendl;
      do_shutdown = true;   // don't call shutdown() while we have
			    // everything paused
    } else if (!osdmap->is_up(whoami) ||
	       !osdmap->get_addrs(whoami).legacy_equals(
		 client_messenger->get_myaddrs()) ||
	       !osdmap->get_cluster_addrs(whoami).legacy_equals(
		 cluster_messenger->get_myaddrs()) ||
	       !osdmap->get_hb_back_addrs(whoami).legacy_equals(
		 hb_back_server_messenger->get_myaddrs()) ||
	       !osdmap->get_hb_front_addrs(whoami).legacy_equals(
		 hb_front_server_messenger->get_myaddrs())) {
      // the map marked us down or lists stale addresses for us
      if (!osdmap->is_up(whoami)) {
	if (service.is_preparing_to_stop() || service.is_stopping()) {
	  // expected: the mon acknowledged our stop request
	  service.got_stop_ack();
	} else {
	  clog->warn() << "Monitor daemon marked osd." << whoami << " down, "
			  "but it is still running";
	  clog->debug() << "map e" << osdmap->get_epoch()
			<< " wrongly marked me down at e"
			<< osdmap->get_down_at(whoami);
	}
      } else if (!osdmap->get_addrs(whoami).legacy_equals(
		   client_messenger->get_myaddrs())) {
	clog->error() << "map e" << osdmap->get_epoch()
		      << " had wrong client addr (" << osdmap->get_addrs(whoami)
		      << " != my " << client_messenger->get_myaddrs() << ")";
      } else if (!osdmap->get_cluster_addrs(whoami).legacy_equals(
		   cluster_messenger->get_myaddrs())) {
	clog->error() << "map e" << osdmap->get_epoch()
		      << " had wrong cluster addr ("
		      << osdmap->get_cluster_addrs(whoami)
		      << " != my " << cluster_messenger->get_myaddrs() << ")";
      } else if (!osdmap->get_hb_back_addrs(whoami).legacy_equals(
		   hb_back_server_messenger->get_myaddrs())) {
	clog->error() << "map e" << osdmap->get_epoch()
		      << " had wrong heartbeat back addr ("
		      << osdmap->get_hb_back_addrs(whoami)
		      << " != my " << hb_back_server_messenger->get_myaddrs()
		      << ")";
      } else if (!osdmap->get_hb_front_addrs(whoami).legacy_equals(
		   hb_front_server_messenger->get_myaddrs())) {
	clog->error() << "map e" << osdmap->get_epoch()
		      << " had wrong heartbeat front addr ("
		      << osdmap->get_hb_front_addrs(whoami)
		      << " != my " << hb_front_server_messenger->get_myaddrs()
		      << ")";
      }

      if (!service.is_stopping()) {
	epoch_t up_epoch = 0;
	epoch_t bind_epoch = osdmap->get_epoch();
	service.set_epochs(NULL,&up_epoch, &bind_epoch);
	do_restart = true;

	//add markdown log
	utime_t now = ceph_clock_now();
	utime_t grace = utime_t(cct->_conf->osd_max_markdown_period, 0);
	osd_markdown_log.push_back(now);
	//clear all out-of-date log
	while (!osd_markdown_log.empty() &&
	       osd_markdown_log.front() + grace < now)
	  osd_markdown_log.pop_front();
	// too many markdowns in the grace window: give up and shut down
	// instead of flapping
	if ((int)osd_markdown_log.size() > cct->_conf->osd_max_markdown_count) {
	  dout(0) << __func__ << " marked down "
		  << osd_markdown_log.size()
		  << " > osd_max_markdown_count "
		  << cct->_conf->osd_max_markdown_count
		  << " in last " << grace << " seconds, shutting down"
		  << dendl;
	  do_restart = false;
	  do_shutdown = true;
	}

	start_waiting_for_healthy();

	set<int> avoid_ports;
#if defined(__FreeBSD__)
	// prevent FreeBSD from grabbing the client_messenger port during
	// rebinding. In which case a cluster_meesneger will connect also
	// to the same port
	client_messenger->get_myaddrs().get_ports(&avoid_ports);
#endif
	cluster_messenger->get_myaddrs().get_ports(&avoid_ports);
	hb_back_server_messenger->get_myaddrs().get_ports(&avoid_ports);
	hb_front_server_messenger->get_myaddrs().get_ports(&avoid_ports);

	int r = cluster_messenger->rebind(avoid_ports);
	if (r != 0) {
	  do_shutdown = true;  // FIXME: do_restart?
	  network_error = true;
	  dout(0) << __func__ << " marked down:"
		  << " rebind cluster_messenger failed" << dendl;
	}

	r = hb_back_server_messenger->rebind(avoid_ports);
	if (r != 0) {
	  do_shutdown = true;  // FIXME: do_restart?
	  network_error = true;
	  dout(0) << __func__ << " marked down:"
		  << " rebind hb_back_server_messenger failed" << dendl;
	}

	r = hb_front_server_messenger->rebind(avoid_ports);
	if (r != 0) {
	  do_shutdown = true;  // FIXME: do_restart?
	  network_error = true;
	  dout(0) << __func__ << " marked down:"
		  << " rebind hb_front_server_messenger failed" << dendl;
	}

	hb_front_client_messenger->mark_down_all();
	hb_back_client_messenger->mark_down_all();

	reset_heartbeat_peers(true);
      }
    }
  }

  map_lock.put_write();

  check_osdmap_features();

  // yay!
  consume_map();

  if (is_active() || is_waiting_for_healthy())
    maybe_update_heartbeat_peers();

  if (is_active()) {
    activate_map();
  }

  if (do_shutdown) {
    if (network_error) {
      cancel_pending_failures();
    }
    // trigger shutdown in a different thread
    dout(0) << __func__ << " shutdown OSD via async signal" << dendl;
    queue_async_signal(SIGINT);
  }
  else if (m->newest_map && m->newest_map > last) {
    dout(10) << " msg say newest map is " << m->newest_map
	     << ", requesting more" << dendl;
    osdmap_subscribe(osdmap->get_epoch()+1, false);
  }
  else if (is_preboot()) {
    if (m->get_source().is_mon())
      _preboot(m->oldest_map, m->newest_map);
    else
      start_boot();
  }
  else if (do_restart)
    start_boot();

}
8467
// Reconcile locally enforced requirements with the current osdmap:
// raise msgr feature-bit requirements for client/mon/osd peers to match
// the map, persist the SHARDS on-disk compat feature if missing, relax
// heartbeat auth pre-nautilus, and record require_osd_release changes.
void OSD::check_osdmap_features()
{
  // adjust required feature bits?

  // we have to be a bit careful here, because we are accessing the
  // Policy structures without taking any lock.  in particular, only
  // modify integer values that can safely be read by a racing CPU.
  // since we are only accessing existing Policy structures a their
  // current memory location, and setting or clearing bits in integer
  // fields, and we are the only writer, this is not a problem.

  {
    // default policy applies to client connections
    Messenger::Policy p = client_messenger->get_default_policy();
    uint64_t mask;
    uint64_t features = osdmap->get_features(entity_name_t::TYPE_CLIENT, &mask);
    if ((p.features_required & mask) != features) {
      dout(0) << "crush map has features " << features
	      << ", adjusting msgr requires for clients" << dendl;
      // replace only the masked bits; leave other requirements intact
      p.features_required = (p.features_required & ~mask) | features;
      client_messenger->set_default_policy(p);
    }
  }
  {
    // mon connections arrive on the client messenger
    Messenger::Policy p = client_messenger->get_policy(entity_name_t::TYPE_MON);
    uint64_t mask;
    uint64_t features = osdmap->get_features(entity_name_t::TYPE_MON, &mask);
    if ((p.features_required & mask) != features) {
      dout(0) << "crush map has features " << features
	      << " was " << p.features_required
	      << ", adjusting msgr requires for mons" << dendl;
      p.features_required = (p.features_required & ~mask) | features;
      client_messenger->set_policy(entity_name_t::TYPE_MON, p);
    }
  }
  {
    // peer OSDs speak over the cluster messenger
    Messenger::Policy p = cluster_messenger->get_policy(entity_name_t::TYPE_OSD);
    uint64_t mask;
    uint64_t features = osdmap->get_features(entity_name_t::TYPE_OSD, &mask);

    if ((p.features_required & mask) != features) {
      dout(0) << "crush map has features " << features
	      << ", adjusting msgr requires for osds" << dendl;
      p.features_required = (p.features_required & ~mask) | features;
      cluster_messenger->set_policy(entity_name_t::TYPE_OSD, p);
    }

    // one-time: record the SHARDS incompat feature in the superblock
    // so older code refuses to start on this store
    if (!superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_SHARDS)) {
      dout(0) << __func__ << " enabling on-disk ERASURE CODES compat feature" << dendl;
      superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
      ObjectStore::Transaction t;
      write_superblock(t);
      int err = store->queue_transaction(service.meta_ch, std::move(t), NULL);
      ceph_assert(err == 0);
    }
  }

  // heartbeat authorizers only required from nautilus on
  if (osdmap->require_osd_release < CEPH_RELEASE_NAUTILUS) {
    heartbeat_dispatcher.ms_set_require_authorizer(false);
  }

  // persist the release floor so it survives restarts
  if (osdmap->require_osd_release != last_require_osd_release) {
    dout(1) << __func__ << " require_osd_release " << last_require_osd_release
	    << " -> " << to_string(osdmap->require_osd_release) << dendl;
    store->write_meta("require_osd_release",
		      stringify((int)osdmap->require_osd_release));
    last_require_osd_release = osdmap->require_osd_release;
  }
}
8536
11fdf7f2
TL
// Completion callback: once the split transaction is applied, hand the
// new child PGs back to the OSD to be registered (see _finish_splits).
struct C_FinishSplits : public Context {
  OSD *osd;
  set<PGRef> pgs;  // the newly created split children
  C_FinishSplits(OSD *osd, const set<PGRef> &in)
    : osd(osd), pgs(in) {}
  void finish(int r) override {
    osd->_finish_splits(pgs);
  }
};
8546
8547void OSD::_finish_splits(set<PGRef>& pgs)
7c673cae 8548{
11fdf7f2
TL
8549 dout(10) << __func__ << " " << pgs << dendl;
8550 if (is_stopping())
8551 return;
8552 PG::RecoveryCtx rctx = create_context();
8553 for (set<PGRef>::iterator i = pgs.begin();
8554 i != pgs.end();
8555 ++i) {
8556 PG *pg = i->get();
7c673cae 8557
11fdf7f2
TL
8558 pg->lock();
8559 dout(10) << __func__ << " " << *pg << dendl;
8560 epoch_t e = pg->get_osdmap_epoch();
8561 pg->handle_initialize(&rctx);
8562 pg->queue_null(e, e);
8563 dispatch_context_transaction(rctx, pg);
8564 pg->unlock();
7c673cae 8565
11fdf7f2
TL
8566 unsigned shard_index = pg->pg_id.hash_to_shard(num_shards);
8567 shards[shard_index]->register_and_wake_split_child(pg);
7c673cae
FG
8568 }
8569
11fdf7f2
TL
8570 dispatch_context(rctx, 0, service.get_osdmap());
8571};
8572
8573bool OSD::add_merge_waiter(OSDMapRef nextmap, spg_t target, PGRef src,
8574 unsigned need)
8575{
8576 std::lock_guard l(merge_lock);
8577 auto& p = merge_waiters[nextmap->get_epoch()][target];
8578 p[src->pg_id] = src;
8579 dout(10) << __func__ << " added merge_waiter " << src->pg_id
8580 << " for " << target << ", have " << p.size() << "/" << need
8581 << dendl;
8582 return p.size() == need;
8583}
8584
// Walk a PG forward through OSDMap epochs, one at a time, up to osd_epoch.
// Along the way it reacts to pool pg_num changes:
//  - merge source: the PG is shut down, detached from its shard, and parked
//    on the merge_waiters list; returns false (PG unlocked).
//  - merge target: absorbs its sources once they have all arrived via
//    add_merge_waiter; otherwise kicks them and returns false (PG unlocked).
//  - split: children are created via split_pgs and registered after the
//    transaction applies (C_FinishSplits).
// Returns true when the PG has fully caught up; PG remains locked.
// Precondition: pg->lock() held (asserted below).
bool OSD::advance_pg(
  epoch_t osd_epoch,
  PG *pg,
  ThreadPool::TPHandle &handle,
  PG::RecoveryCtx *rctx)
{
  if (osd_epoch <= pg->get_osdmap_epoch()) {
    return true;  // already caught up
  }
  ceph_assert(pg->is_locked());
  OSDMapRef lastmap = pg->get_osdmap();
  ceph_assert(lastmap->get_epoch() < osd_epoch);
  set<PGRef> new_pgs;  // any split children
  bool ret = true;

  unsigned old_pg_num = lastmap->have_pg_pool(pg->pg_id.pool()) ?
    lastmap->get_pg_num(pg->pg_id.pool()) : 0;
  for (epoch_t next_epoch = pg->get_osdmap_epoch() + 1;
       next_epoch <= osd_epoch;
       ++next_epoch) {
    OSDMapRef nextmap = service.try_get_map(next_epoch);
    if (!nextmap) {
      // map not cached locally; skip ahead to one we have
      dout(20) << __func__ << " missing map " << next_epoch << dendl;
      continue;
    }

    unsigned new_pg_num =
      (old_pg_num && nextmap->have_pg_pool(pg->pg_id.pool())) ?
      nextmap->get_pg_num(pg->pg_id.pool()) : 0;
    if (old_pg_num && new_pg_num && old_pg_num != new_pg_num) {
      // check for merge
      if (nextmap->have_pg_pool(pg->pg_id.pool())) {
	spg_t parent;
	if (pg->pg_id.is_merge_source(
	      old_pg_num,
	      new_pg_num,
	      &parent)) {
	  // we are merge source
	  PGRef spg = pg; // carry a ref
	  dout(1) << __func__ << " " << pg->pg_id
		  << " is merge source, target is " << parent
		  << dendl;
	  pg->write_if_dirty(rctx);
	  dispatch_context_transaction(*rctx, pg, &handle);
	  pg->ch->flush();
	  pg->on_shutdown();
	  OSDShard *sdata = pg->osd_shard;
	  {
	    std::lock_guard l(sdata->shard_lock);
	    if (pg->pg_slot) {
	      sdata->_detach_pg(pg->pg_slot);
	      // update pg count now since we might not get an osdmap
	      // any time soon.
	      if (pg->is_primary())
		logger->dec(l_osd_pg_primary);
	      else if (pg->is_replica())
		logger->dec(l_osd_pg_replica);
	      else
		logger->dec(l_osd_pg_stray);
	    }
	  }
	  pg->unlock();

	  // hand ourselves to the target; if we are the last source to
	  // arrive, wake the target with a null peering event
	  set<spg_t> children;
	  parent.is_split(new_pg_num, old_pg_num, &children);
	  if (add_merge_waiter(nextmap, parent, pg, children.size())) {
	    enqueue_peering_evt(
	      parent,
	      PGPeeringEventRef(
		std::make_shared<PGPeeringEvent>(
		  nextmap->get_epoch(),
		  nextmap->get_epoch(),
		  NullEvt())));
	  }
	  ret = false;
	  goto out;
	} else if (pg->pg_id.is_merge_target(old_pg_num, new_pg_num)) {
	  // we are merge target
	  set<spg_t> children;
	  pg->pg_id.is_split(new_pg_num, old_pg_num, &children);
	  dout(20) << __func__ << " " << pg->pg_id
		   << " is merge target, sources are " << children
		   << dendl;
	  map<spg_t,PGRef> sources;
	  {
	    // claim the sources only if all of them are already waiting
	    std::lock_guard l(merge_lock);
	    auto& s = merge_waiters[nextmap->get_epoch()][pg->pg_id];
	    unsigned need = children.size();
	    dout(20) << __func__ << " have " << s.size() << "/"
		     << need << dendl;
	    if (s.size() == need) {
	      sources.swap(s);
	      merge_waiters[nextmap->get_epoch()].erase(pg->pg_id);
	      if (merge_waiters[nextmap->get_epoch()].empty()) {
		merge_waiters.erase(nextmap->get_epoch());
	      }
	    }
	  }
	  if (!sources.empty()) {
	    unsigned new_pg_num = nextmap->get_pg_num(pg->pg_id.pool());
	    unsigned split_bits = pg->pg_id.get_split_bits(new_pg_num);
	    dout(1) << __func__ << " merging " << pg->pg_id << dendl;
	    pg->merge_from(
	      sources, rctx, split_bits,
	      nextmap->get_pg_pool(
		pg->pg_id.pool())->last_pg_merge_meta);
	    pg->pg_slot->waiting_for_merge_epoch = 0;
	  } else {
	    dout(20) << __func__ << " not ready to merge yet" << dendl;
	    pg->write_if_dirty(rctx);
	    pg->unlock();
	    // kick source(s) to get them ready
	    for (auto& i : children) {
	      dout(20) << __func__ << " kicking source " << i << dendl;
	      enqueue_peering_evt(
		i,
		PGPeeringEventRef(
		  std::make_shared<PGPeeringEvent>(
		    nextmap->get_epoch(),
		    nextmap->get_epoch(),
		    NullEvt())));
	    }
	    ret = false;
	    goto out;
	  }
	}
      }
    }

    // recompute mapping for this epoch and feed it to the PG
    vector<int> newup, newacting;
    int up_primary, acting_primary;
    nextmap->pg_to_up_acting_osds(
      pg->pg_id.pgid,
      &newup, &up_primary,
      &newacting, &acting_primary);
    pg->handle_advance_map(
      nextmap, lastmap, newup, up_primary,
      newacting, acting_primary, rctx);

    // if the pool's scrub interval options changed, poke scrub scheduling
    auto oldpool = lastmap->get_pools().find(pg->pg_id.pool());
    auto newpool = nextmap->get_pools().find(pg->pg_id.pool());
    if (oldpool != lastmap->get_pools().end()
	&& newpool != nextmap->get_pools().end()) {
      dout(20) << __func__
	       << " new pool opts " << newpool->second.opts
	       << " old pool opts " << oldpool->second.opts
	       << dendl;

      double old_min_interval = 0, new_min_interval = 0;
      oldpool->second.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &old_min_interval);
      newpool->second.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &new_min_interval);

      double old_max_interval = 0, new_max_interval = 0;
      oldpool->second.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &old_max_interval);
      newpool->second.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &new_max_interval);

      // Assume if an interval is change from set to unset or vice versa the actual config
      // is different.  Keep it simple even if it is possible to call resched_all_scrub()
      // unnecessarily.
      if (old_min_interval != new_min_interval || old_max_interval != new_max_interval) {
	pg->on_info_history_change();
      }
    }

    if (new_pg_num && old_pg_num != new_pg_num) {
      // check for split
      set<spg_t> children;
      if (pg->pg_id.is_split(
	    old_pg_num,
	    new_pg_num,
	    &children)) {
	split_pgs(
	  pg, children, &new_pgs, lastmap, nextmap,
	  rctx);
      }
    }

    lastmap = nextmap;
    old_pg_num = new_pg_num;
    handle.reset_tp_timeout();  // keep the thread-pool heartbeat alive
  }
  pg->handle_activate_map(rctx);

  ret = true;
 out:
  // register split children once the split transaction applies
  if (!new_pgs.empty()) {
    rctx->transaction->register_on_applied(new C_FinishSplits(this, new_pgs));
  }
  return ret;
}
8775
// Install the freshly received osdmap: publish it via the service, prime
// pending splits/merges on each shard, prune stale bookkeeping, recount
// PG role counters, and queue a null peering event to every PG so each
// one advances itself to the new epoch.
// Caller must hold osd_lock (asserted below).
void OSD::consume_map()
{
  ceph_assert(osd_lock.is_locked());
  dout(7) << "consume_map version " << osdmap->get_epoch() << dendl;

  /** make sure the cluster is speaking in SORTBITWISE, because we don't
   *  speak the older sorting version any more. Be careful not to force
   *  a shutdown if we are merely processing old maps, though.
   */
  if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE) && is_active()) {
    derr << __func__ << " SORTBITWISE flag is not set" << dendl;
    ceph_abort();
  }

  service.pre_publish_map(osdmap);
  service.await_reserved_maps();
  service.publish_map(osdmap);

  // prime splits and merges
  set<pair<spg_t,epoch_t>> newly_split;  // splits, and when
  set<pair<spg_t,epoch_t>> merge_pgs;    // merge participants, and when
  for (auto& shard : shards) {
    shard->identify_splits_and_merges(osdmap, &newly_split, &merge_pgs);
  }
  if (!newly_split.empty()) {
    for (auto& shard : shards) {
      shard->prime_splits(osdmap, &newly_split);
    }
    // every split must have been claimed by some shard
    ceph_assert(newly_split.empty());
  }

  // prune sent_ready_to_merge
  service.prune_sent_ready_to_merge(osdmap);

  // FIXME, maybe: We could race against an incoming peering message
  // that instantiates a merge PG after identify_merges() below and
  // never set up its peer to complete the merge.  An OSD restart
  // would clear it up.  This is a hard race to resolve,
  // extraordinarily rare (we only merge PGs that are stable and
  // clean, so it'd have to be an imported PG to an OSD with a
  // slightly stale OSDMap...), so I'm ignoring it for now.  We plan to
  // replace all of this with a seastar-based code soon anyway.
  if (!merge_pgs.empty()) {
    // mark the pgs we already have, or create new and empty merge
    // participants for those we are missing.  do this all under the
    // shard lock so we don't have to worry about racing pg creates
    // via _process.
    for (auto& shard : shards) {
      shard->prime_merges(osdmap, &merge_pgs);
    }
    ceph_assert(merge_pgs.empty());
  }

  service.prune_pg_created();

  // let each shard adopt the map; it reports recovery pushes to release
  unsigned pushes_to_free = 0;
  for (auto& shard : shards) {
    shard->consume_map(osdmap, &pushes_to_free);
  }

  vector<spg_t> pgids;
  _get_pgids(&pgids);

  // count (FIXME, probably during seastar rewrite)
  int num_pg_primary = 0, num_pg_replica = 0, num_pg_stray = 0;
  vector<PGRef> pgs;
  _get_pgs(&pgs);
  for (auto& pg : pgs) {
    // FIXME (probably during seastar rewrite): this is lockless and
    // racy, but we don't want to take pg lock here.
    if (pg->is_primary())
      num_pg_primary++;
    else if (pg->is_replica())
      num_pg_replica++;
    else
      num_pg_stray++;
  }

  {
    // FIXME (as part of seastar rewrite): move to OSDShard
    // drop pending creates for PGs that no longer map to this OSD
    std::lock_guard l(pending_creates_lock);
    for (auto pg = pending_creates_from_osd.begin();
	 pg != pending_creates_from_osd.end();) {
      if (osdmap->get_pg_acting_rank(pg->first, whoami) < 0) {
	dout(10) << __func__ << " pg " << pg->first << " doesn't map here, "
		 << "discarding pending_create_from_osd" << dendl;
	pg = pending_creates_from_osd.erase(pg);
      } else {
	++pg;
      }
    }
  }

  service.maybe_inject_dispatch_delay();

  dispatch_sessions_waiting_on_map();

  service.maybe_inject_dispatch_delay();

  service.release_reserved_pushes(pushes_to_free);

  // queue null events to push maps down to individual PGs
  for (auto pgid : pgids) {
    enqueue_peering_evt(
      pgid,
      PGPeeringEventRef(
	std::make_shared<PGPeeringEvent>(
	  osdmap->get_epoch(),
	  osdmap->get_epoch(),
	  NullEvt())));
  }
  logger->set(l_osd_pg, pgids.size());
  logger->set(l_osd_pg_primary, num_pg_primary);
  logger->set(l_osd_pg_replica, num_pg_replica);
  logger->set(l_osd_pg_stray, num_pg_stray);
}
8892
8893void OSD::activate_map()
8894{
11fdf7f2 8895 ceph_assert(osd_lock.is_locked());
7c673cae
FG
8896
8897 dout(7) << "activate_map version " << osdmap->get_epoch() << dendl;
8898
7c673cae
FG
8899 if (osdmap->test_flag(CEPH_OSDMAP_FULL)) {
8900 dout(10) << " osdmap flagged full, doing onetime osdmap subscribe" << dendl;
8901 osdmap_subscribe(osdmap->get_epoch() + 1, false);
8902 }
8903
8904 // norecover?
8905 if (osdmap->test_flag(CEPH_OSDMAP_NORECOVER)) {
8906 if (!service.recovery_is_paused()) {
8907 dout(1) << "pausing recovery (NORECOVER flag set)" << dendl;
8908 service.pause_recovery();
8909 }
8910 } else {
8911 if (service.recovery_is_paused()) {
8912 dout(1) << "unpausing recovery (NORECOVER flag unset)" << dendl;
8913 service.unpause_recovery();
8914 }
8915 }
8916
8917 service.activate_map();
8918
8919 // process waiters
8920 take_waiters(waiting_for_osdmap);
8921}
8922
8923bool OSD::require_mon_peer(const Message *m)
8924{
8925 if (!m->get_connection()->peer_is_mon()) {
8926 dout(0) << "require_mon_peer received from non-mon "
8927 << m->get_connection()->get_peer_addr()
8928 << " " << *m << dendl;
8929 return false;
8930 }
8931 return true;
8932}
8933
8934bool OSD::require_mon_or_mgr_peer(const Message *m)
8935{
8936 if (!m->get_connection()->peer_is_mon() &&
8937 !m->get_connection()->peer_is_mgr()) {
8938 dout(0) << "require_mon_or_mgr_peer received from non-mon, non-mgr "
8939 << m->get_connection()->get_peer_addr()
8940 << " " << *m << dendl;
8941 return false;
8942 }
8943 return true;
8944}
8945
8946bool OSD::require_osd_peer(const Message *m)
8947{
8948 if (!m->get_connection()->peer_is_osd()) {
8949 dout(0) << "require_osd_peer received from non-osd "
8950 << m->get_connection()->get_peer_addr()
8951 << " " << *m << dendl;
8952 return false;
8953 }
8954 return true;
8955}
8956
8957bool OSD::require_self_aliveness(const Message *m, epoch_t epoch)
8958{
8959 epoch_t up_epoch = service.get_up_epoch();
8960 if (epoch < up_epoch) {
8961 dout(7) << "from pre-up epoch " << epoch << " < " << up_epoch << dendl;
8962 return false;
8963 }
8964
8965 if (!is_active()) {
8966 dout(7) << "still in boot state, dropping message " << *m << dendl;
8967 return false;
8968 }
8969
8970 return true;
8971}
8972
// Verify that the sending OSD is still up in 'map' and still reachable
// at the same cluster addresses.  If not, tear down the connection and
// its Session (so stale instances can't keep talking to us) and return
// false.  When called from fast dispatch the session_dispatch_lock is
// already unnecessary/unsafe to take, hence the is_fast_dispatch guard.
bool OSD::require_same_peer_instance(const Message *m, OSDMapRef& map,
				     bool is_fast_dispatch)
{
  int from = m->get_source().num();

  if (map->is_down(from) ||
      (map->get_cluster_addrs(from) != m->get_source_addrs())) {
    dout(5) << "from dead osd." << from << ", marking down, "
	    << " msg was " << m->get_source_inst().addr
	    << " expected "
	    << (map->is_up(from) ?
		map->get_cluster_addrs(from) : entity_addrvec_t())
	    << dendl;
    ConnectionRef con = m->get_connection();
    con->mark_down();
    auto priv = con->get_priv();
    if (auto s = static_cast<Session*>(priv.get()); s) {
      if (!is_fast_dispatch)
	s->session_dispatch_lock.Lock();
      clear_session_waiting_on_map(s);
      con->set_priv(nullptr);   // break ref <-> session cycle, if any
      s->con.reset();
      if (!is_fast_dispatch)
	s->session_dispatch_lock.Unlock();
    }
    return false;
  }
  return true;
}
9002
9003
/*
 * require that we have same (or newer) map, and that
 * the source is the pg primary.
 */
// Returns false (after queueing the op to wait, where appropriate) when:
//  - the sender's epoch is newer than ours (op waits for the new map),
//  - we are not alive/active for that epoch, or
//  - a cluster-messenger peer is no longer the same OSD instance.
// Caller must hold osd_lock (asserted below).
bool OSD::require_same_or_newer_map(OpRequestRef& op, epoch_t epoch,
				    bool is_fast_dispatch)
{
  const Message *m = op->get_req();
  dout(15) << "require_same_or_newer_map " << epoch
	   << " (i am " << osdmap->get_epoch() << ") " << m << dendl;

  ceph_assert(osd_lock.is_locked());

  // do they have a newer map?
  if (epoch > osdmap->get_epoch()) {
    dout(7) << "waiting for newer map epoch " << epoch
	    << " > my " << osdmap->get_epoch() << " with " << m << dendl;
    wait_for_new_map(op);
    return false;
  }

  if (!require_self_aliveness(op->get_req(), epoch)) {
    return false;
  }

  // ok, our map is same or newer.. do they still exist?
  if (m->get_connection()->get_messenger() == cluster_messenger &&
      !require_same_peer_instance(op->get_req(), osdmap, is_fast_dispatch)) {
    return false;
  }

  return true;
}
9037
9038
9039
9040
9041
9042// ----------------------------------------
9043// pg creation
9044
// Create the child PGs of 'parent' named in childpgids (the pool's
// pg_num grew in nextmap).  Each child gets a fresh collection wired to
// its shard's commit queue, inherits its portion of the parent's objects
// and metadata via split_colls/split_into, and receives its share of the
// parent's stats.  New children are returned through out_pgs; the caller
// registers them once the transaction applies.
void OSD::split_pgs(
  PG *parent,
  const set<spg_t> &childpgids, set<PGRef> *out_pgs,
  OSDMapRef curmap,
  OSDMapRef nextmap,
  PG::RecoveryCtx *rctx)
{
  unsigned pg_num = nextmap->get_pg_num(parent->pg_id.pool());
  parent->update_snap_mapper_bits(parent->get_pgid().get_split_bits(pg_num));

  // one stats bucket per child, plus the parent's remainder at the end
  vector<object_stat_sum_t> updated_stats;
  parent->start_split_stats(childpgids, &updated_stats);

  vector<object_stat_sum_t>::iterator stat_iter = updated_stats.begin();
  for (set<spg_t>::const_iterator i = childpgids.begin();
       i != childpgids.end();
       ++i, ++stat_iter) {
    ceph_assert(stat_iter != updated_stats.end());
    dout(10) << __func__ << " splitting " << *parent << " into " << *i << dendl;
    PG* child = _make_pg(nextmap, *i);
    child->lock(true);
    out_pgs->insert(child);
    child->ch = store->create_new_collection(child->coll);

    {
      // route the child's commit completions to the owning shard
      uint32_t shard_index = i->hash_to_shard(shards.size());
      assert(NULL != shards[shard_index]);
      store->set_collection_commit_queue(child->coll, &(shards[shard_index]->context_queue));
    }

    unsigned split_bits = i->get_split_bits(pg_num);
    dout(10) << " pg_num is " << pg_num
	     << ", m_seed " << i->ps()
	     << ", split_bits is " << split_bits << dendl;
    parent->split_colls(
      *i,
      split_bits,
      i->ps(),
      &child->get_pool().info,
      rctx->transaction);
    parent->split_into(
      i->pgid,
      child,
      split_bits);

    child->finish_split_stats(*stat_iter, rctx->transaction);
    child->unlock();
  }
  // last stats bucket belongs to the (shrunk) parent
  ceph_assert(stat_iter != updated_stats.end());
  parent->finish_split_stats(*stat_iter, rctx->transaction);
}
9096
9097/*
9098 * holding osd_lock
9099 */
// Handle an MOSDPGCreate from the monitor (pre-MOSDPGCreate2 path):
// for each requested pg, verify the pool still exists and that we are
// still the acting primary in our current map, build its initial
// history, and queue a peering event carrying a PGCreateInfo so the
// PG gets instantiated.  Obsolete requests (primary changed after the
// message's epoch) are skipped.  Called while holding osd_lock.
void OSD::handle_pg_create(OpRequestRef op)
{
  const MOSDPGCreate *m = static_cast<const MOSDPGCreate*>(op->get_req());
  ceph_assert(m->get_type() == MSG_OSD_PG_CREATE);

  dout(10) << "handle_pg_create " << *m << dendl;

  if (!require_mon_peer(op->get_req())) {
    return;
  }

  if (!require_same_or_newer_map(op, m->epoch, false))
    return;

  op->mark_started();

  // mkpg and ctimes are parallel maps keyed by pg_t (asserted below)
  map<pg_t,utime_t>::const_iterator ci = m->ctimes.begin();
  for (map<pg_t,pg_create_t>::const_iterator p = m->mkpg.begin();
       p != m->mkpg.end();
       ++p, ++ci) {
    ceph_assert(ci != m->ctimes.end() && ci->first == p->first);
    epoch_t created = p->second.created;
    if (p->second.split_bits) // Skip split pgs
      continue;
    pg_t on = p->first;

    if (!osdmap->have_pg_pool(on.pool())) {
      dout(20) << "ignoring pg on deleted pool " << on << dendl;
      continue;
    }

    dout(20) << "mkpg " << on << " e" << created << "@" << ci->second << dendl;

    // is it still ours?
    vector<int> up, acting;
    int up_primary = -1;
    int acting_primary = -1;
    osdmap->pg_to_up_acting_osds(on, &up, &up_primary, &acting, &acting_primary);
    int role = osdmap->calc_pg_role(whoami, acting, acting.size());

    if (acting_primary != whoami) {
      dout(10) << "mkpg " << on << " not acting_primary (" << acting_primary
	       << "), my role=" << role << ", skipping" << dendl;
      continue;
    }

    spg_t pgid;
    bool mapped = osdmap->get_primary_shard(on, &pgid);
    ceph_assert(mapped);

    PastIntervals pi;
    pg_history_t history;
    build_initial_pg_history(pgid, created, ci->second, &history, &pi);

    // The mon won't resend unless the primary changed, so we ignore
    // same_interval_since.  We'll pass this history with the current
    // epoch as the event.
    if (history.same_primary_since > m->epoch) {
      dout(10) << __func__ << ": got obsolete pg create on pgid "
	       << pgid << " from epoch " << m->epoch
	       << ", primary changed in " << history.same_primary_since
	       << dendl;
      continue;
    }
    enqueue_peering_evt(
      pgid,
      PGPeeringEventRef(
	std::make_shared<PGPeeringEvent>(
	  osdmap->get_epoch(),
	  osdmap->get_epoch(),
	  NullEvt(),
	  true,
	  new PGCreateInfo(
	    pgid,
	    osdmap->get_epoch(),
	    history,
	    pi,
	    true)
	  )));
  }

  {
    std::lock_guard l(pending_creates_lock);
    // only advance the watermark once no OSD-driven creates are pending
    if (pending_creates_from_mon == 0) {
      last_pg_create_epoch = m->epoch;
    }
  }

  maybe_update_heartbeat_peers();
}
9190
9191
9192// ----------------------------------------
9193// peering and recovery
9194
// Build a fresh RecoveryCtx for a peering pass.  The ctx takes ownership
// of the heap-allocated transaction and notify/query/info buckets;
// dispatch_context() or discard_context() must eventually free them.
PG::RecoveryCtx OSD::create_context()
{
  ObjectStore::Transaction *t = new ObjectStore::Transaction;
  map<int, map<spg_t,pg_query_t> > *query_map =
    new map<int, map<spg_t, pg_query_t> >;
  map<int,vector<pair<pg_notify_t, PastIntervals> > > *notify_list =
    new map<int, vector<pair<pg_notify_t, PastIntervals> > >;
  map<int,vector<pair<pg_notify_t, PastIntervals> > > *info_map =
    new map<int,vector<pair<pg_notify_t, PastIntervals> > >;
  PG::RecoveryCtx rctx(query_map, info_map, notify_list, t);
  return rctx;
}
9207
7c673cae
FG
// Flush the ctx's accumulated transaction (if non-empty) to the store on
// the PG's collection handle, then give the ctx a fresh transaction so it
// can keep accumulating.  The old transaction object is moved-from and
// deleted here.
void OSD::dispatch_context_transaction(PG::RecoveryCtx &ctx, PG *pg,
                                       ThreadPool::TPHandle *handle)
{
  if (!ctx.transaction->empty() || ctx.transaction->has_contexts()) {
    int tr = store->queue_transaction(
      pg->ch,
      std::move(*ctx.transaction), TrackedOpRef(), handle);
    ceph_assert(tr == 0);
    delete (ctx.transaction);
    ctx.transaction = new ObjectStore::Transaction;
  }
}
9220
// Final dispatch of a RecoveryCtx: send the buffered notifies/queries/
// infos to peers (suppressed if we are down in the map or not active),
// queue any remaining transaction against the PG, and free the ctx's
// heap members.  The ctx must not be used after this call.
void OSD::dispatch_context(PG::RecoveryCtx &ctx, PG *pg, OSDMapRef curmap,
                           ThreadPool::TPHandle *handle)
{
  if (!service.get_osdmap()->is_up(whoami)) {
    dout(20) << __func__ << " not up in osdmap" << dendl;
  } else if (!is_active()) {
    dout(20) << __func__ << " not active" << dendl;
  } else {
    do_notifies(*ctx.notify_list, curmap);
    do_queries(*ctx.query_map, curmap);
    do_infos(*ctx.info_map, curmap);
  }
  // the transaction only makes sense when we have a PG to attach it to
  if ((!ctx.transaction->empty() || ctx.transaction->has_contexts()) && pg) {
    int tr = store->queue_transaction(
      pg->ch,
      std::move(*ctx.transaction), TrackedOpRef(),
      handle);
    ceph_assert(tr == 0);
  }
  delete ctx.notify_list;
  delete ctx.query_map;
  delete ctx.info_map;
  delete ctx.transaction;
}
9245
// Throw away a RecoveryCtx without dispatching anything: free the heap
// members allocated by create_context().
void OSD::discard_context(PG::RecoveryCtx& ctx)
{
  delete ctx.notify_list;
  delete ctx.query_map;
  delete ctx.info_map;
  delete ctx.transaction;
}
9253
11fdf7f2 9254
7c673cae
FG
9255/** do_notifies
9256 * Send an MOSDPGNotify to a primary, with a list of PGs that I have
9257 * content for, and they are primary for.
9258 */
9259
9260void OSD::do_notifies(
9261 map<int,vector<pair<pg_notify_t,PastIntervals> > >& notify_list,
9262 OSDMapRef curmap)
9263{
9264 for (map<int,
9265 vector<pair<pg_notify_t,PastIntervals> > >::iterator it =
9266 notify_list.begin();
9267 it != notify_list.end();
9268 ++it) {
9269 if (!curmap->is_up(it->first)) {
9270 dout(20) << __func__ << " skipping down osd." << it->first << dendl;
9271 continue;
9272 }
9273 ConnectionRef con = service.get_con_osd_cluster(
9274 it->first, curmap->get_epoch());
9275 if (!con) {
9276 dout(20) << __func__ << " skipping osd." << it->first
9277 << " (NULL con)" << dendl;
9278 continue;
9279 }
9280 service.share_map_peer(it->first, con.get(), curmap);
3efd9988 9281 dout(7) << __func__ << " osd." << it->first
7c673cae
FG
9282 << " on " << it->second.size() << " PGs" << dendl;
9283 MOSDPGNotify *m = new MOSDPGNotify(curmap->get_epoch(),
9284 it->second);
9285 con->send_message(m);
9286 }
9287}
9288
9289
9290/** do_queries
9291 * send out pending queries for info | summaries
9292 */
9293void OSD::do_queries(map<int, map<spg_t,pg_query_t> >& query_map,
9294 OSDMapRef curmap)
9295{
9296 for (map<int, map<spg_t,pg_query_t> >::iterator pit = query_map.begin();
9297 pit != query_map.end();
9298 ++pit) {
9299 if (!curmap->is_up(pit->first)) {
9300 dout(20) << __func__ << " skipping down osd." << pit->first << dendl;
9301 continue;
9302 }
9303 int who = pit->first;
9304 ConnectionRef con = service.get_con_osd_cluster(who, curmap->get_epoch());
9305 if (!con) {
9306 dout(20) << __func__ << " skipping osd." << who
9307 << " (NULL con)" << dendl;
9308 continue;
9309 }
9310 service.share_map_peer(who, con.get(), curmap);
9311 dout(7) << __func__ << " querying osd." << who
9312 << " on " << pit->second.size() << " PGs" << dendl;
9313 MOSDPGQuery *m = new MOSDPGQuery(curmap->get_epoch(), pit->second);
9314 con->send_message(m);
9315 }
9316}
9317
9318
9319void OSD::do_infos(map<int,
9320 vector<pair<pg_notify_t, PastIntervals> > >& info_map,
9321 OSDMapRef curmap)
9322{
9323 for (map<int,
9324 vector<pair<pg_notify_t, PastIntervals> > >::iterator p =
9325 info_map.begin();
9326 p != info_map.end();
9327 ++p) {
9328 if (!curmap->is_up(p->first)) {
9329 dout(20) << __func__ << " skipping down osd." << p->first << dendl;
9330 continue;
9331 }
9332 for (vector<pair<pg_notify_t,PastIntervals> >::iterator i = p->second.begin();
9333 i != p->second.end();
9334 ++i) {
9335 dout(20) << __func__ << " sending info " << i->first.info
9336 << " to shard " << p->first << dendl;
9337 }
9338 ConnectionRef con = service.get_con_osd_cluster(
9339 p->first, curmap->get_epoch());
9340 if (!con) {
9341 dout(20) << __func__ << " skipping osd." << p->first
9342 << " (NULL con)" << dendl;
9343 continue;
9344 }
9345 service.share_map_peer(p->first, con.get(), curmap);
9346 MOSDPGInfo *m = new MOSDPGInfo(curmap->get_epoch());
9347 m->pg_list = p->second;
9348 con->send_message(m);
9349 }
9350 info_map.clear();
9351}
9352
// Fast-dispatch handler for MOSDPGCreate2 (nautilus+ create path): for
// each pg in the message, synthesize an initial pg_history_t anchored at
// the creation epoch/stamp and queue a peering event with a PGCreateInfo
// so the PG gets instantiated.  Consumes (puts) the message.
void OSD::handle_fast_pg_create(MOSDPGCreate2 *m)
{
  dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
  if (!require_mon_peer(m)) {
    m->put();
    return;
  }
  for (auto& p : m->pgs) {
    spg_t pgid = p.first;
    epoch_t created = p.second.first;
    utime_t created_stamp = p.second.second;
    dout(20) << __func__ << " " << pgid << " e" << created
	     << "@" << created_stamp << dendl;
    // brand-new PG: all history epochs start at the creation epoch
    pg_history_t h;
    h.epoch_created = created;
    h.epoch_pool_created = created;
    h.same_up_since = created;
    h.same_interval_since = created;
    h.same_primary_since = created;
    h.last_scrub_stamp = created_stamp;
    h.last_deep_scrub_stamp = created_stamp;
    h.last_clean_scrub_stamp = created_stamp;

    enqueue_peering_evt(
      pgid,
      PGPeeringEventRef(
	std::make_shared<PGPeeringEvent>(
	  m->epoch,
	  m->epoch,
	  NullEvt(),
	  true,
	  new PGCreateInfo(
	    pgid,
	    created,
	    h,
	    PastIntervals(),
	    true)
	  )));
  }

  {
    std::lock_guard l(pending_creates_lock);
    // only advance the watermark once no mon-driven creates are pending
    if (pending_creates_from_mon == 0) {
      last_pg_create_epoch = m->epoch;
    }
  }

  m->put();
}
9402
11fdf7f2 9403void OSD::handle_fast_pg_query(MOSDPGQuery *m)
7c673cae 9404{
11fdf7f2
TL
9405 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9406 if (!require_osd_peer(m)) {
9407 m->put();
7c673cae 9408 return;
11fdf7f2 9409 }
7c673cae 9410 int from = m->get_source().num();
11fdf7f2
TL
9411 for (auto& p : m->pg_list) {
9412 enqueue_peering_evt(
9413 p.first,
9414 PGPeeringEventRef(
9415 std::make_shared<PGPeeringEvent>(
9416 p.second.epoch_sent, p.second.epoch_sent,
9417 MQuery(
9418 p.first,
9419 pg_shard_t(from, p.second.from),
9420 p.second,
9421 p.second.epoch_sent),
9422 false))
7c673cae
FG
9423 );
9424 }
11fdf7f2 9425 m->put();
7c673cae
FG
9426}
9427
11fdf7f2 9428void OSD::handle_fast_pg_notify(MOSDPGNotify* m)
7c673cae 9429{
11fdf7f2
TL
9430 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9431 if (!require_osd_peer(m)) {
9432 m->put();
7c673cae
FG
9433 return;
9434 }
11fdf7f2
TL
9435 int from = m->get_source().num();
9436 for (auto& p : m->get_pg_list()) {
9437 spg_t pgid(p.first.info.pgid.pgid, p.first.to);
9438 enqueue_peering_evt(
9439 pgid,
9440 PGPeeringEventRef(
9441 std::make_shared<PGPeeringEvent>(
9442 p.first.epoch_sent,
9443 p.first.query_epoch,
9444 MNotifyRec(
9445 pgid, pg_shard_t(from, p.first.from),
9446 p.first,
9447 m->get_connection()->get_features(),
9448 p.second),
9449 true,
9450 new PGCreateInfo(
9451 pgid,
9452 p.first.query_epoch,
9453 p.first.info.history,
9454 p.second,
9455 false)
9456 )));
7c673cae 9457 }
11fdf7f2 9458 m->put();
7c673cae
FG
9459}
9460
11fdf7f2 9461void OSD::handle_fast_pg_info(MOSDPGInfo* m)
7c673cae 9462{
11fdf7f2
TL
9463 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9464 if (!require_osd_peer(m)) {
9465 m->put();
7c673cae
FG
9466 return;
9467 }
11fdf7f2
TL
9468 int from = m->get_source().num();
9469 for (auto& p : m->pg_list) {
9470 enqueue_peering_evt(
9471 spg_t(p.first.info.pgid.pgid, p.first.to),
9472 PGPeeringEventRef(
9473 std::make_shared<PGPeeringEvent>(
9474 p.first.epoch_sent, p.first.query_epoch,
9475 MInfoRec(
9476 pg_shard_t(from, p.first.from),
9477 p.first.info,
9478 p.first.epoch_sent)))
9479 );
7c673cae 9480 }
11fdf7f2 9481 m->put();
7c673cae
FG
9482}
9483
11fdf7f2 9484void OSD::handle_fast_pg_remove(MOSDPGRemove *m)
7c673cae 9485{
11fdf7f2
TL
9486 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9487 if (!require_osd_peer(m)) {
9488 m->put();
7c673cae
FG
9489 return;
9490 }
11fdf7f2
TL
9491 for (auto& pgid : m->pg_list) {
9492 enqueue_peering_evt(
9493 pgid,
9494 PGPeeringEventRef(
9495 std::make_shared<PGPeeringEvent>(
9496 m->get_epoch(), m->get_epoch(),
9497 PG::DeleteStart())));
7c673cae 9498 }
11fdf7f2 9499 m->put();
7c673cae
FG
9500}
9501
11fdf7f2 9502void OSD::handle_fast_force_recovery(MOSDForceRecovery *m)
c07f9fc5 9503{
11fdf7f2
TL
9504 dout(10) << __func__ << " " << *m << dendl;
9505 if (!require_mon_or_mgr_peer(m)) {
9506 m->put();
9507 return;
9508 }
9509 epoch_t epoch = get_osdmap_epoch();
9510 for (auto pgid : m->forced_pgs) {
9511 if (m->options & OFR_BACKFILL) {
9512 if (m->options & OFR_CANCEL) {
9513 enqueue_peering_evt(
9514 pgid,
9515 PGPeeringEventRef(
9516 std::make_shared<PGPeeringEvent>(
9517 epoch, epoch,
9518 PG::UnsetForceBackfill())));
9519 } else {
9520 enqueue_peering_evt(
9521 pgid,
9522 PGPeeringEventRef(
9523 std::make_shared<PGPeeringEvent>(
9524 epoch, epoch,
9525 PG::SetForceBackfill())));
9526 }
9527 } else if (m->options & OFR_RECOVERY) {
9528 if (m->options & OFR_CANCEL) {
9529 enqueue_peering_evt(
9530 pgid,
9531 PGPeeringEventRef(
9532 std::make_shared<PGPeeringEvent>(
9533 epoch, epoch,
9534 PG::UnsetForceRecovery())));
9535 } else {
9536 enqueue_peering_evt(
9537 pgid,
9538 PGPeeringEventRef(
9539 std::make_shared<PGPeeringEvent>(
9540 epoch, epoch,
9541 PG::SetForceRecovery())));
c07f9fc5
FG
9542 }
9543 }
9544 }
11fdf7f2 9545 m->put();
c07f9fc5 9546}
7c673cae 9547
// Answer a peer's pg_query_t for a PG that does not exist on this OSD.
// If the pool itself is gone we stay silent (the peer will learn from the
// map); otherwise reply with an empty pg_info_t so the querying peer can
// make progress.
void OSD::handle_pg_query_nopg(const MQuery& q)
{
  spg_t pgid = q.pgid;
  dout(10) << __func__ << " " << pgid << dendl;

  OSDMapRef osdmap = get_osdmap();
  if (!osdmap->have_pg_pool(pgid.pool()))
    return;

  dout(10) << " pg " << pgid << " dne" << dendl;
  pg_info_t empty(spg_t(pgid.pgid, q.query.to));
  ConnectionRef con = service.get_con_osd_cluster(q.from.osd, osdmap->get_epoch());
  if (con) {
    Message *m;
    if (q.query.type == pg_query_t::LOG ||
	q.query.type == pg_query_t::FULLLOG) {
      // Log queries get an MOSDPGLog built from the empty info.
      m = new MOSDPGLog(
	q.query.from, q.query.to,
	osdmap->get_epoch(), empty,
	q.query.epoch_sent);
    } else {
      // All other query types are answered with a notify carrying the
      // empty info and no past intervals.
      vector<pair<pg_notify_t,PastIntervals>> ls;
      ls.push_back(
	make_pair(
	  pg_notify_t(
	    q.query.from, q.query.to,
	    q.query.epoch_sent,
	    osdmap->get_epoch(),
	    empty),
	  PastIntervals()));
      m = new MOSDPGNotify(osdmap->get_epoch(), ls);
    }
    // Bring the peer up to date with our map before the reply.
    service.share_map_peer(q.from.osd, con.get(), osdmap);
    con->send_message(m);
  }
}
9584
7c673cae 9585
7c673cae
FG
9586// =========================================================
9587// RECOVERY
9588
9589void OSDService::_maybe_queue_recovery() {
11fdf7f2 9590 ceph_assert(recovery_lock.is_locked_by_me());
7c673cae
FG
9591 uint64_t available_pushes;
9592 while (!awaiting_throttle.empty() &&
9593 _recover_now(&available_pushes)) {
11fdf7f2 9594 uint64_t to_start = std::min(
7c673cae
FG
9595 available_pushes,
9596 cct->_conf->osd_recovery_max_single_start);
9597 _queue_for_recovery(awaiting_throttle.front(), to_start);
9598 awaiting_throttle.pop_front();
11fdf7f2
TL
9599 dout(10) << __func__ << " starting " << to_start
9600 << ", recovery_ops_reserved " << recovery_ops_reserved
9601 << " -> " << (recovery_ops_reserved + to_start) << dendl;
7c673cae
FG
9602 recovery_ops_reserved += to_start;
9603 }
9604}
9605
9606bool OSDService::_recover_now(uint64_t *available_pushes)
9607{
9608 if (available_pushes)
9609 *available_pushes = 0;
9610
9611 if (ceph_clock_now() < defer_recovery_until) {
9612 dout(15) << __func__ << " defer until " << defer_recovery_until << dendl;
9613 return false;
9614 }
9615
9616 if (recovery_paused) {
9617 dout(15) << __func__ << " paused" << dendl;
9618 return false;
9619 }
9620
9621 uint64_t max = cct->_conf->osd_recovery_max_active;
9622 if (max <= recovery_ops_active + recovery_ops_reserved) {
9623 dout(15) << __func__ << " active " << recovery_ops_active
9624 << " + reserved " << recovery_ops_reserved
9625 << " >= max " << max << dendl;
9626 return false;
9627 }
9628
9629 if (available_pushes)
9630 *available_pushes = max - recovery_ops_active - recovery_ops_reserved;
9631
9632 return true;
9633}
9634
// Run up to reserved_pushes recovery operations on pg, honoring the
// osd_recovery_sleep throttle.  If a sleep is due, the work is re-queued
// via a timer callback instead of running now.  Any pushes we do not
// start are released back to the throttle on exit.
void OSD::do_recovery(
  PG *pg, epoch_t queued, uint64_t reserved_pushes,
  ThreadPool::TPHandle &handle)
{
  uint64_t started = 0;

  /*
   * When the value of osd_recovery_sleep is set greater than zero, recovery
   * ops are scheduled after osd_recovery_sleep amount of time from the previous
   * recovery event's schedule time. This is done by adding a
   * recovery_requeue_callback event, which re-queues the recovery op using
   * queue_recovery_after_sleep.
   */
  float recovery_sleep = get_osd_recovery_sleep();
  {
    std::lock_guard l(service.sleep_lock);
    if (recovery_sleep > 0 && service.recovery_needs_sleep) {
      PGRef pgref(pg);
      // Timer callback: clear the sleep flag and re-queue this same
      // (pg, queued, reserved_pushes) unit of work.
      auto recovery_requeue_callback = new FunctionContext([this, pgref, queued, reserved_pushes](int r) {
        dout(20) << "do_recovery wake up at "
                 << ceph_clock_now()
	         << ", re-queuing recovery" << dendl;
	std::lock_guard l(service.sleep_lock);
	service.recovery_needs_sleep = false;
	service.queue_recovery_after_sleep(pgref.get(), queued, reserved_pushes);
      });

      // This is true for the first recovery op and when the previous recovery op
      // has been scheduled in the past. The next recovery op is scheduled after
      // completing the sleep from now.
      if (service.recovery_schedule_time < ceph_clock_now()) {
        service.recovery_schedule_time = ceph_clock_now();
      }
      service.recovery_schedule_time += recovery_sleep;
      service.sleep_timer.add_event_at(service.recovery_schedule_time,
				       recovery_requeue_callback);
      dout(20) << "Recovery event scheduled at "
               << service.recovery_schedule_time << dendl;
      return;
    }
  }

  {
    {
      // Arm the sleep for the next recovery event.
      std::lock_guard l(service.sleep_lock);
      service.recovery_needs_sleep = true;
    }

    // The PG was reset (new interval) after this work was queued; the
    // reserved pushes must still be released, hence goto out.
    if (pg->pg_has_reset_since(queued)) {
      goto out;
    }

    dout(10) << "do_recovery starting " << reserved_pushes << " " << *pg << dendl;
#ifdef DEBUG_RECOVERY_OIDS
    dout(20) << " active was " << service.recovery_oids[pg->pg_id] << dendl;
#endif

    bool do_unfound = pg->start_recovery_ops(reserved_pushes, handle, &started);
    dout(10) << "do_recovery started " << started << "/" << reserved_pushes
	     << " on " << *pg << dendl;

    if (do_unfound) {
      // Search peers for unfound objects and dispatch the resulting work.
      PG::RecoveryCtx rctx = create_context();
      rctx.handle = &handle;
      pg->find_unfound(queued, &rctx);
      dispatch_context(rctx, pg, pg->get_osdmap());
    }
  }

 out:
  ceph_assert(started <= reserved_pushes);
  // Return unstarted pushes to the throttle (may kick more recovery).
  service.release_reserved_pushes(reserved_pushes);
}
9708
9709void OSDService::start_recovery_op(PG *pg, const hobject_t& soid)
9710{
11fdf7f2 9711 std::lock_guard l(recovery_lock);
7c673cae
FG
9712 dout(10) << "start_recovery_op " << *pg << " " << soid
9713 << " (" << recovery_ops_active << "/"
9714 << cct->_conf->osd_recovery_max_active << " rops)"
9715 << dendl;
9716 recovery_ops_active++;
9717
9718#ifdef DEBUG_RECOVERY_OIDS
11fdf7f2
TL
9719 dout(20) << " active was " << recovery_oids[pg->pg_id] << dendl;
9720 ceph_assert(recovery_oids[pg->pg_id].count(soid) == 0);
9721 recovery_oids[pg->pg_id].insert(soid);
7c673cae
FG
9722#endif
9723}
9724
9725void OSDService::finish_recovery_op(PG *pg, const hobject_t& soid, bool dequeue)
9726{
11fdf7f2 9727 std::lock_guard l(recovery_lock);
7c673cae
FG
9728 dout(10) << "finish_recovery_op " << *pg << " " << soid
9729 << " dequeue=" << dequeue
9730 << " (" << recovery_ops_active << "/" << cct->_conf->osd_recovery_max_active << " rops)"
9731 << dendl;
9732
9733 // adjust count
11fdf7f2 9734 ceph_assert(recovery_ops_active > 0);
7c673cae
FG
9735 recovery_ops_active--;
9736
9737#ifdef DEBUG_RECOVERY_OIDS
11fdf7f2
TL
9738 dout(20) << " active oids was " << recovery_oids[pg->pg_id] << dendl;
9739 ceph_assert(recovery_oids[pg->pg_id].count(soid));
9740 recovery_oids[pg->pg_id].erase(soid);
7c673cae
FG
9741#endif
9742
9743 _maybe_queue_recovery();
9744}
9745
9746bool OSDService::is_recovery_active()
9747{
b5b8bbf5 9748 return local_reserver.has_reservation() || remote_reserver.has_reservation();
7c673cae
FG
9749}
9750
11fdf7f2
TL
// Return `pushes` previously-reserved recovery pushes to the throttle and
// let any throttled recovery work proceed.
void OSDService::release_reserved_pushes(uint64_t pushes)
{
  std::lock_guard l(recovery_lock);
  dout(10) << __func__ << "(" << pushes << "), recovery_ops_reserved "
	   << recovery_ops_reserved << " -> " << (recovery_ops_reserved-pushes)
	   << dendl;
  // Releasing more than was reserved would underflow the counter.
  ceph_assert(recovery_ops_reserved >= pushes);
  recovery_ops_reserved -= pushes;
  _maybe_queue_recovery();
}
9761
7c673cae
FG
9762// =========================================================
9763// OPS
9764
9765bool OSD::op_is_discardable(const MOSDOp *op)
9766{
9767 // drop client request if they are not connected and can't get the
9768 // reply anyway.
9769 if (!op->get_connection()->is_connected()) {
9770 return true;
9771 }
9772 return false;
9773}
9774
11fdf7f2 9775void OSD::enqueue_op(spg_t pg, OpRequestRef&& op, epoch_t epoch)
7c673cae 9776{
11fdf7f2
TL
9777 const utime_t stamp = op->get_req()->get_recv_stamp();
9778 const utime_t latency = ceph_clock_now() - stamp;
9779 const unsigned priority = op->get_req()->get_priority();
9780 const int cost = op->get_req()->get_cost();
9781 const uint64_t owner = op->get_req()->get_source().num();
9782
9783 dout(15) << "enqueue_op " << op << " prio " << priority
9784 << " cost " << cost
7c673cae
FG
9785 << " latency " << latency
9786 << " epoch " << epoch
9787 << " " << *(op->get_req()) << dendl;
9788 op->osd_trace.event("enqueue op");
11fdf7f2
TL
9789 op->osd_trace.keyval("priority", priority);
9790 op->osd_trace.keyval("cost", cost);
7c673cae 9791 op->mark_queued_for_pg();
224ce89b 9792 logger->tinc(l_osd_op_before_queue_op_lat, latency);
11fdf7f2
TL
9793 op_shardedwq.queue(
9794 OpQueueItem(
9795 unique_ptr<OpQueueItem::OpQueueable>(new PGOpItem(pg, std::move(op))),
9796 cost, priority, stamp, owner, epoch));
7c673cae
FG
9797}
9798
11fdf7f2
TL
// Queue a peering event for pgid on the sharded op queue at the
// configured peering priority (cost 10, no owner/stamp).
void OSD::enqueue_peering_evt(spg_t pgid, PGPeeringEventRef evt)
{
  dout(15) << __func__ << " " << pgid << " " << evt->get_desc() << dendl;
  op_shardedwq.queue(
    OpQueueItem(
      unique_ptr<OpQueueItem::OpQueueable>(new PGPeeringItem(pgid, evt)),
      10,
      cct->_conf->osd_peering_op_priority,
      utime_t(),
      0,
      evt->get_epoch_sent()));
}
7c673cae 9811
11fdf7f2
TL
// Same as enqueue_peering_evt, but pushes the event at the FRONT of the
// sharded op queue so it is processed ahead of already-queued work.
void OSD::enqueue_peering_evt_front(spg_t pgid, PGPeeringEventRef evt)
{
  dout(15) << __func__ << " " << pgid << " " << evt->get_desc() << dendl;
  op_shardedwq.queue_front(
    OpQueueItem(
      unique_ptr<OpQueueItem::OpQueueable>(new PGPeeringItem(pgid, evt)),
      10,
      cct->_conf->osd_peering_op_priority,
      utime_t(),
      0,
      evt->get_epoch_sent()));
}
7c673cae
FG
9824
9825/*
9826 * NOTE: dequeue called in worker thread, with pg lock
9827 */
// Process one op pulled off the sharded queue for pg (worker thread,
// pg lock held by caller).  Records dequeue latency, opportunistically
// shares the osdmap with the client session, and hands the op to the PG.
void OSD::dequeue_op(
  PGRef pg, OpRequestRef op,
  ThreadPool::TPHandle &handle)
{
  FUNCTRACE(cct);
  OID_EVENT_TRACE_WITH_MSG(op->get_req(), "DEQUEUE_OP_BEGIN", false);

  utime_t now = ceph_clock_now();
  op->set_dequeued_time(now);
  utime_t latency = now - op->get_req()->get_recv_stamp();
  dout(10) << "dequeue_op " << op << " prio " << op->get_req()->get_priority()
	   << " cost " << op->get_req()->get_cost()
	   << " latency " << latency
	   << " " << *(op->get_req())
	   << " pg " << *pg << dendl;

  logger->tinc(l_osd_op_before_dequeue_op_lat, latency);

  // If the connection still has a Session, make sure the client has a
  // map at least as new as the PG's.
  auto priv = op->get_req()->get_connection()->get_priv();
  if (auto session = static_cast<Session *>(priv.get()); session) {
    maybe_share_map(session, op, pg->get_osdmap());
  }

  // PGs being deleted do not accept new work.
  if (pg->is_deleting())
    return;

  op->mark_reached_pg();
  op->osd_trace.event("dequeue_op");

  pg->do_request(op, handle);

  // finish
  dout(10) << "dequeue_op " << op << " finish" << dendl;
  OID_EVENT_TRACE_WITH_MSG(op->get_req(), "DEQUEUE_OP_END", false);
}
9863
9864
11fdf7f2
TL
// Process one peering event for pg (may be null if the PG does not exist
// on this OSD).  Advances the PG to the shard's map epoch, delivers the
// event, then dispatches the accumulated RecoveryCtx work.
void OSD::dequeue_peering_evt(
  OSDShard *sdata,
  PG *pg,
  PGPeeringEventRef evt,
  ThreadPool::TPHandle& handle)
{
  PG::RecoveryCtx rctx = create_context();
  auto curmap = sdata->get_osdmap();
  epoch_t need_up_thru = 0, same_interval_since = 0;
  if (!pg) {
    // PG-less events: only MQuery is expected (answered with an empty
    // info); anything else indicates a bug.
    if (const MQuery *q = dynamic_cast<const MQuery*>(evt->evt.get())) {
      handle_pg_query_nopg(*q);
    } else {
      derr << __func__ << " unrecognized pg-less event " << evt->get_desc() << dendl;
      ceph_abort();
    }
  } else if (advance_pg(curmap->get_epoch(), pg, handle, &rctx)) {
    pg->do_peering_event(evt, &rctx);
    if (pg->is_deleted()) {
      // do not dispatch rctx; the final _delete_some already did it.
      discard_context(rctx);
      pg->unlock();
      return;
    }
    // Flush the transaction while still holding the pg lock; capture the
    // up_thru state to act on after unlocking.
    dispatch_context_transaction(rctx, pg, &handle);
    need_up_thru = pg->get_need_up_thru();
    same_interval_since = pg->get_same_interval_since();
    pg->unlock();
  }

  if (need_up_thru) {
    queue_want_up_thru(same_interval_since);
  }
  dispatch_context(rctx, pg, curmap, &handle);

  service.send_pg_temp();
}
9902
11fdf7f2
TL
// Drive one step of PG deletion by feeding a DeleteSome peering event
// through the normal peering-event path.
void OSD::dequeue_delete(
  OSDShard *sdata,
  PG *pg,
  epoch_t e,
  ThreadPool::TPHandle& handle)
{
  dequeue_peering_evt(
    sdata,
    pg,
    PGPeeringEventRef(
      std::make_shared<PGPeeringEvent>(
	e, e,
	PG::DeleteSome())),
    handle);
}
9918
9919
9920
7c673cae
FG
9921// --------------------------------
9922
// List of config options the OSD wants change notifications for; any key
// here triggers handle_conf_change() when its value is updated at runtime.
const char** OSD::get_tracked_conf_keys() const
{
  static const char* KEYS[] = {
    "osd_max_backfills",
    "osd_min_recovery_priority",
    "osd_max_trimming_pgs",
    "osd_op_complaint_time",
    "osd_op_log_threshold",
    "osd_op_history_size",
    "osd_op_history_duration",
    "osd_op_history_slow_op_size",
    "osd_op_history_slow_op_threshold",
    "osd_enable_op_tracker",
    "osd_map_cache_size",
    "osd_pg_epoch_max_lag_factor",
    "osd_pg_epoch_persisted_max_stale",
    // clog & admin clog
    "clog_to_monitors",
    "clog_to_syslog",
    "clog_to_syslog_facility",
    "clog_to_syslog_level",
    "osd_objectstore_fuse",
    "clog_to_graylog",
    "clog_to_graylog_host",
    "clog_to_graylog_port",
    "host",
    "fsid",
    "osd_recovery_delay_start",
    "osd_client_message_size_cap",
    "osd_client_message_cap",
    "osd_heartbeat_min_size",
    "osd_heartbeat_interval",
    "osd_scrub_min_interval",
    "osd_scrub_max_interval",
    NULL  // sentinel terminates the list
  };
  return KEYS;
}
9961
// React to runtime config changes for every key advertised by
// get_tracked_conf_keys(), applying each new value to the relevant
// subsystem.  Runs under osd_lock.
void OSD::handle_conf_change(const ConfigProxy& conf,
			     const std::set <std::string> &changed)
{
  Mutex::Locker l(osd_lock);
  if (changed.count("osd_max_backfills")) {
    service.local_reserver.set_max(cct->_conf->osd_max_backfills);
    service.remote_reserver.set_max(cct->_conf->osd_max_backfills);
  }
  if (changed.count("osd_min_recovery_priority")) {
    service.local_reserver.set_min_priority(cct->_conf->osd_min_recovery_priority);
    service.remote_reserver.set_min_priority(cct->_conf->osd_min_recovery_priority);
  }
  if (changed.count("osd_max_trimming_pgs")) {
    service.snap_reserver.set_max(cct->_conf->osd_max_trimming_pgs);
  }
  if (changed.count("osd_op_complaint_time") ||
      changed.count("osd_op_log_threshold")) {
    op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time,
                                           cct->_conf->osd_op_log_threshold);
  }
  if (changed.count("osd_op_history_size") ||
      changed.count("osd_op_history_duration")) {
    op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
                                             cct->_conf->osd_op_history_duration);
  }
  if (changed.count("osd_op_history_slow_op_size") ||
      changed.count("osd_op_history_slow_op_threshold")) {
    op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size,
                                                      cct->_conf->osd_op_history_slow_op_threshold);
  }
  if (changed.count("osd_enable_op_tracker")) {
    op_tracker.set_tracking(cct->_conf->osd_enable_op_tracker);
  }
  // All three map caches are sized by the same option.
  if (changed.count("osd_map_cache_size")) {
    service.map_cache.set_size(cct->_conf->osd_map_cache_size);
    service.map_bl_cache.set_size(cct->_conf->osd_map_cache_size);
    service.map_bl_inc_cache.set_size(cct->_conf->osd_map_cache_size);
  }
  if (changed.count("clog_to_monitors") ||
      changed.count("clog_to_syslog") ||
      changed.count("clog_to_syslog_level") ||
      changed.count("clog_to_syslog_facility") ||
      changed.count("clog_to_graylog") ||
      changed.count("clog_to_graylog_host") ||
      changed.count("clog_to_graylog_port") ||
      changed.count("host") ||
      changed.count("fsid")) {
    update_log_config();
  }
  if (changed.count("osd_pg_epoch_max_lag_factor")) {
    m_osd_pg_epoch_max_lag_factor = conf.get_val<double>(
      "osd_pg_epoch_max_lag_factor");
  }

#ifdef HAVE_LIBFUSE
  if (changed.count("osd_objectstore_fuse")) {
    if (store) {
      enable_disable_fuse(false);
    }
  }
#endif

  if (changed.count("osd_recovery_delay_start")) {
    service.defer_recovery(cct->_conf->osd_recovery_delay_start);
    service.kick_recovery_queue();
  }

  // Message throttles only shrink/grow for positive caps; a throttler is
  // only present if one was configured at messenger setup.
  if (changed.count("osd_client_message_cap")) {
    uint64_t newval = cct->_conf->osd_client_message_cap;
    Messenger::Policy pol = client_messenger->get_policy(entity_name_t::TYPE_CLIENT);
    if (pol.throttler_messages && newval > 0) {
      pol.throttler_messages->reset_max(newval);
    }
  }
  if (changed.count("osd_client_message_size_cap")) {
    uint64_t newval = cct->_conf->osd_client_message_size_cap;
    Messenger::Policy pol = client_messenger->get_policy(entity_name_t::TYPE_CLIENT);
    if (pol.throttler_bytes && newval > 0) {
      pol.throttler_bytes->reset_max(newval);
    }
  }

  if (changed.count("osd_scrub_min_interval") ||
      changed.count("osd_scrub_max_interval")) {
    resched_all_scrubs();
    dout(0) << __func__ << ": scrub interval change" << dendl;
  }
  check_config();
}
10051
// Re-read the clog-related options from config and push them into the
// log client; config is only applied if parsing succeeds.
void OSD::update_log_config()
{
  map<string,string> log_to_monitors;
  map<string,string> log_to_syslog;
  map<string,string> log_channel;
  map<string,string> log_prio;
  map<string,string> log_to_graylog;
  map<string,string> log_to_graylog_host;
  map<string,string> log_to_graylog_port;
  uuid_d fsid;
  string host;

  if (parse_log_client_options(cct, log_to_monitors, log_to_syslog,
			       log_channel, log_prio, log_to_graylog,
			       log_to_graylog_host, log_to_graylog_port,
			       fsid, host) == 0)
    clog->update_config(log_to_monitors, log_to_syslog,
			log_channel, log_prio, log_to_graylog,
			log_to_graylog_host, log_to_graylog_port,
			fsid, host);
  derr << "log_to_monitors " << log_to_monitors << dendl;
}
10074
// Sanity-check related config options and emit cluster-log warnings for
// combinations that are known to behave badly.
void OSD::check_config()
{
  // some sanity checks
  // The map cache must be larger than the persisted-stale window or PGs
  // may need maps that have already been evicted.
  if (cct->_conf->osd_map_cache_size <= (int)cct->_conf->osd_pg_epoch_persisted_max_stale + 2) {
    clog->warn() << "osd_map_cache_size (" << cct->_conf->osd_map_cache_size << ")"
		 << " is not > osd_pg_epoch_persisted_max_stale ("
		 << cct->_conf->osd_pg_epoch_persisted_max_stale << ")";
  }
}
10084
7c673cae
FG
10085// --------------------------------
10086
// Block until the objecter has fetched the latest osdmap from the
// monitors (synchronous wait on a C_SaferCond).
void OSD::get_latest_osdmap()
{
  dout(10) << __func__ << " -- start" << dendl;

  C_SaferCond cond;
  service.objecter->wait_for_latest_osdmap(&cond);
  cond.wait();

  dout(10) << __func__ << " -- finish" << dendl;
}
10097
10098// --------------------------------
10099
// Derive the op's rmw_flags (read/write/cache/pg-op/promote/...) from the
// individual OSD ops it contains.  Returns 0 on success, -EINVAL if no
// flags could be derived, or a class-resolution error for CEPH_OSD_OP_CALL.
int OSD::init_op_flags(OpRequestRef& op)
{
  const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
  vector<OSDOp>::const_iterator iter;

  // client flags have no bearing on whether an op is a read, write, etc.
  op->rmw_flags = 0;

  if (m->has_flag(CEPH_OSD_FLAG_RWORDERED)) {
    op->set_force_rwordered();
  }

  // set bits based on op codes, called methods.
  for (iter = m->ops.begin(); iter != m->ops.end(); ++iter) {
    if ((iter->op.op == CEPH_OSD_OP_WATCH &&
	 iter->op.watch.op == CEPH_OSD_WATCH_OP_PING)) {
      /* This a bit odd.  PING isn't actually a write.  It can't
       * result in an update to the object_info.  PINGs also aren't
       * resent, so there's no reason to write out a log entry.
       *
       * However, we pipeline them behind writes, so let's force
       * the write_ordered flag.
       */
      op->set_force_rwordered();
    } else {
      if (ceph_osd_op_mode_modify(iter->op.op))
	op->set_write();
    }
    if (ceph_osd_op_mode_read(iter->op.op))
      op->set_read();

    // set READ flag if there are src_oids
    if (iter->soid.oid.name.length())
      op->set_read();

    // set PGOP flag if there are PG ops
    if (ceph_osd_op_type_pg(iter->op.op))
      op->set_pg_op();

    if (ceph_osd_op_mode_cache(iter->op.op))
      op->set_cache();

    // check for ec base pool
    // On a cache tier over an EC base pool, most mutating ops must be
    // promoted; the whitelist below enumerates the ops that need not be.
    int64_t poolid = m->get_pg().pool();
    const pg_pool_t *pool = osdmap->get_pg_pool(poolid);
    if (pool && pool->is_tier()) {
      const pg_pool_t *base_pool = osdmap->get_pg_pool(pool->tier_of);
      if (base_pool && base_pool->require_rollback()) {
        if ((iter->op.op != CEPH_OSD_OP_READ) &&
	    (iter->op.op != CEPH_OSD_OP_CHECKSUM) &&
	    (iter->op.op != CEPH_OSD_OP_CMPEXT) &&
            (iter->op.op != CEPH_OSD_OP_STAT) &&
            (iter->op.op != CEPH_OSD_OP_ISDIRTY) &&
            (iter->op.op != CEPH_OSD_OP_UNDIRTY) &&
            (iter->op.op != CEPH_OSD_OP_GETXATTR) &&
            (iter->op.op != CEPH_OSD_OP_GETXATTRS) &&
            (iter->op.op != CEPH_OSD_OP_CMPXATTR) &&
            (iter->op.op != CEPH_OSD_OP_ASSERT_VER) &&
            (iter->op.op != CEPH_OSD_OP_LIST_WATCHERS) &&
            (iter->op.op != CEPH_OSD_OP_LIST_SNAPS) &&
            (iter->op.op != CEPH_OSD_OP_SETALLOCHINT) &&
            (iter->op.op != CEPH_OSD_OP_WRITEFULL) &&
            (iter->op.op != CEPH_OSD_OP_ROLLBACK) &&
            (iter->op.op != CEPH_OSD_OP_CREATE) &&
            (iter->op.op != CEPH_OSD_OP_DELETE) &&
            (iter->op.op != CEPH_OSD_OP_SETXATTR) &&
            (iter->op.op != CEPH_OSD_OP_RMXATTR) &&
            (iter->op.op != CEPH_OSD_OP_STARTSYNC) &&
            (iter->op.op != CEPH_OSD_OP_COPY_GET) &&
            (iter->op.op != CEPH_OSD_OP_COPY_FROM)) {
          op->set_promote();
        }
      }
    }

    switch (iter->op.op) {
    case CEPH_OSD_OP_CALL:
      {
        // Resolve the rados class/method and inherit its declared
        // read/write/promote flags; class-open errors abort the op.
	bufferlist::iterator bp = const_cast<bufferlist&>(iter->indata).begin();
	int is_write, is_read;
	string cname, mname;
	bp.copy(iter->op.cls.class_len, cname);
	bp.copy(iter->op.cls.method_len, mname);

	ClassHandler::ClassData *cls;
	int r = class_handler->open_class(cname, &cls);
	if (r) {
	  derr << "class " << cname << " open got " << cpp_strerror(r) << dendl;
	  if (r == -ENOENT)
	    r = -EOPNOTSUPP;
	  else if (r != -EPERM) // propagate permission errors
	    r = -EIO;
	  return r;
	}
	int flags = cls->get_method_flags(mname.c_str());
	if (flags < 0) {
	  if (flags == -ENOENT)
	    r = -EOPNOTSUPP;
	  else
	    r = flags;
	  return r;
	}
	is_read = flags & CLS_METHOD_RD;
	is_write = flags & CLS_METHOD_WR;
        bool is_promote = flags & CLS_METHOD_PROMOTE;

	dout(10) << "class " << cname << " method " << mname << " "
		 << "flags=" << (is_read ? "r" : "")
                 << (is_write ? "w" : "")
                 << (is_promote ? "p" : "")
                 << dendl;
	if (is_read)
	  op->set_class_read();
	if (is_write)
	  op->set_class_write();
        if (is_promote)
          op->set_promote();
        op->add_class(std::move(cname), std::move(mname), is_read, is_write,
                      cls->whitelisted);
	break;
      }

    case CEPH_OSD_OP_WATCH:
      // force the read bit for watch since it is depends on previous
      // watch state (and may return early if the watch exists) or, in
      // the case of ping, is simply a read op.
      op->set_read();
      // fall through
    case CEPH_OSD_OP_NOTIFY:
    case CEPH_OSD_OP_NOTIFY_ACK:
      {
        op->set_promote();
        break;
      }

    case CEPH_OSD_OP_DELETE:
      // if we get a delete with FAILOK we can skip handle cache. without
      // FAILOK we still need to promote (or do something smarter) to
      // determine whether to return ENOENT or 0.
      if (iter == m->ops.begin() &&
	  iter->op.flags == CEPH_OSD_OP_FLAG_FAILOK) {
	op->set_skip_handle_cache();
      }
      // skip promotion when proxying a delete op
      if (m->ops.size() == 1) {
	op->set_skip_promote();
      }
      break;

    case CEPH_OSD_OP_CACHE_TRY_FLUSH:
    case CEPH_OSD_OP_CACHE_FLUSH:
    case CEPH_OSD_OP_CACHE_EVICT:
      // If try_flush/flush/evict is the only op, can skip handle cache.
      if (m->ops.size() == 1) {
	op->set_skip_handle_cache();
      }
      break;

    case CEPH_OSD_OP_READ:
    case CEPH_OSD_OP_SYNC_READ:
    case CEPH_OSD_OP_SPARSE_READ:
    case CEPH_OSD_OP_CHECKSUM:
    case CEPH_OSD_OP_WRITEFULL:
      // fadvise NOCACHE/DONTNEED on a lone op: client does not want the
      // object cached, so do not promote it either.
      if (m->ops.size() == 1 &&
          (iter->op.flags & CEPH_OSD_OP_FLAG_FADVISE_NOCACHE ||
	   iter->op.flags & CEPH_OSD_OP_FLAG_FADVISE_DONTNEED)) {
        op->set_skip_promote();
      }
      break;

    // force promotion when pin an object in cache tier
    case CEPH_OSD_OP_CACHE_PIN:
      op->set_promote();
      break;

    default:
      break;
    }
  }

  // An op that derived no flags at all is malformed.
  if (op->rmw_flags == 0)
    return -EINVAL;

  return 0;
}
10285
11fdf7f2
TL
10286void OSD::set_perf_queries(
10287 const std::map<OSDPerfMetricQuery, OSDPerfMetricLimits> &queries) {
10288 dout(10) << "setting " << queries.size() << " queries" << dendl;
10289
10290 std::list<OSDPerfMetricQuery> supported_queries;
10291 for (auto &it : queries) {
10292 auto &query = it.first;
10293 if (!query.key_descriptor.empty()) {
10294 supported_queries.push_back(query);
10295 }
10296 }
10297 if (supported_queries.size() < queries.size()) {
10298 dout(1) << queries.size() - supported_queries.size()
10299 << " unsupported queries" << dendl;
10300 }
10301
10302 {
10303 Mutex::Locker locker(m_perf_queries_lock);
10304 m_perf_queries = supported_queries;
10305 m_perf_limits = queries;
10306 }
10307
10308 std::vector<PGRef> pgs;
10309 _get_pgs(&pgs);
10310 for (auto& pg : pgs) {
10311 if (pg->is_primary()) {
10312 pg->lock();
10313 pg->set_dynamic_perf_stats_queries(supported_queries);
10314 pg->unlock();
10315 }
7c673cae 10316 }
7c673cae
FG
10317}
10318
11fdf7f2
TL
10319void OSD::get_perf_reports(
10320 std::map<OSDPerfMetricQuery, OSDPerfMetricReport> *reports) {
10321 std::vector<PGRef> pgs;
10322 _get_pgs(&pgs);
10323 DynamicPerfStats dps;
10324 for (auto& pg : pgs) {
10325 if (pg->is_primary()) {
10326 // m_perf_queries can be modified only in set_perf_queries by mgr client
10327 // request, and it is protected by by mgr client's lock, which is held
10328 // when set_perf_queries/get_perf_reports are called, so we may not hold
10329 // m_perf_queries_lock here.
10330 DynamicPerfStats pg_dps(m_perf_queries);
10331 pg->lock();
10332 pg->get_dynamic_perf_stats(&pg_dps);
10333 pg->unlock();
10334 dps.merge(pg_dps);
10335 }
10336 }
10337 dps.add_to_reports(m_perf_limits, reports);
10338 dout(20) << "reports for " << reports->size() << " queries" << dendl;
10339}
224ce89b 10340
7c673cae
FG
10341// =============================================================
10342
10343#undef dout_context
11fdf7f2 10344#define dout_context cct
7c673cae 10345#undef dout_prefix
11fdf7f2 10346#define dout_prefix *_dout << "osd." << osd->get_nodeid() << ":" << shard_id << "." << __func__ << " "
7c673cae 10347
11fdf7f2 10348void OSDShard::_attach_pg(OSDShardPGSlot *slot, PG *pg)
7c673cae 10349{
11fdf7f2
TL
10350 dout(10) << pg->pg_id << " " << pg << dendl;
10351 slot->pg = pg;
10352 pg->osd_shard = this;
10353 pg->pg_slot = slot;
10354 osd->inc_num_pgs();
10355
10356 slot->epoch = pg->get_osdmap_epoch();
10357 pg_slots_by_epoch.insert(*slot);
10358}
10359
10360void OSDShard::_detach_pg(OSDShardPGSlot *slot)
10361{
10362 dout(10) << slot->pg->pg_id << " " << slot->pg << dendl;
10363 slot->pg->osd_shard = nullptr;
10364 slot->pg->pg_slot = nullptr;
10365 slot->pg = nullptr;
10366 osd->dec_num_pgs();
10367
10368 pg_slots_by_epoch.erase(pg_slots_by_epoch.iterator_to(*slot));
10369 slot->epoch = 0;
10370 if (waiting_for_min_pg_epoch) {
10371 min_pg_epoch_cond.notify_all();
10372 }
10373}
10374
10375void OSDShard::update_pg_epoch(OSDShardPGSlot *slot, epoch_t e)
10376{
10377 std::lock_guard l(shard_lock);
10378 dout(30) << "min was " << pg_slots_by_epoch.begin()->epoch
10379 << " on " << pg_slots_by_epoch.begin()->pg->pg_id << dendl;
10380 pg_slots_by_epoch.erase(pg_slots_by_epoch.iterator_to(*slot));
10381 dout(20) << slot->pg->pg_id << " " << slot->epoch << " -> " << e << dendl;
10382 slot->epoch = e;
10383 pg_slots_by_epoch.insert(*slot);
10384 dout(30) << "min is now " << pg_slots_by_epoch.begin()->epoch
10385 << " on " << pg_slots_by_epoch.begin()->pg->pg_id << dendl;
10386 if (waiting_for_min_pg_epoch) {
10387 min_pg_epoch_cond.notify_all();
10388 }
10389}
10390
10391epoch_t OSDShard::get_min_pg_epoch()
10392{
10393 std::lock_guard l(shard_lock);
10394 auto p = pg_slots_by_epoch.begin();
10395 if (p == pg_slots_by_epoch.end()) {
10396 return 0;
10397 }
10398 return p->epoch;
10399}
10400
10401void OSDShard::wait_min_pg_epoch(epoch_t need)
10402{
10403 std::unique_lock l{shard_lock};
10404 ++waiting_for_min_pg_epoch;
10405 min_pg_epoch_cond.wait(l, [need, this] {
10406 if (pg_slots_by_epoch.empty()) {
10407 return true;
10408 } else if (pg_slots_by_epoch.begin()->epoch >= need) {
10409 return true;
10410 } else {
10411 dout(10) << need << " waiting on "
10412 << pg_slots_by_epoch.begin()->epoch << dendl;
10413 return false;
10414 }
10415 });
10416 --waiting_for_min_pg_epoch;
10417}
10418
10419epoch_t OSDShard::get_max_waiting_epoch()
10420{
10421 std::lock_guard l(shard_lock);
10422 epoch_t r = 0;
10423 for (auto& i : pg_slots) {
10424 if (!i.second->waiting_peering.empty()) {
10425 r = std::max(r, i.second->waiting_peering.rbegin()->first);
10426 }
10427 }
10428 return r;
10429}
10430
10431void OSDShard::consume_map(
10432 OSDMapRef& new_osdmap,
10433 unsigned *pushes_to_free)
10434{
10435 std::lock_guard l(shard_lock);
10436 OSDMapRef old_osdmap;
7c673cae 10437 {
11fdf7f2
TL
10438 std::lock_guard l(osdmap_lock);
10439 old_osdmap = std::move(shard_osdmap);
10440 shard_osdmap = new_osdmap;
10441 }
10442 dout(10) << new_osdmap->get_epoch()
10443 << " (was " << (old_osdmap ? old_osdmap->get_epoch() : 0) << ")"
10444 << dendl;
10445 bool queued = false;
10446
10447 // check slots
10448 auto p = pg_slots.begin();
10449 while (p != pg_slots.end()) {
10450 OSDShardPGSlot *slot = p->second.get();
10451 const spg_t& pgid = p->first;
10452 dout(20) << __func__ << " " << pgid << dendl;
10453 if (!slot->waiting_for_split.empty()) {
10454 dout(20) << __func__ << " " << pgid
10455 << " waiting for split " << slot->waiting_for_split << dendl;
10456 ++p;
10457 continue;
10458 }
10459 if (slot->waiting_for_merge_epoch > new_osdmap->get_epoch()) {
10460 dout(20) << __func__ << " " << pgid
10461 << " waiting for merge by epoch " << slot->waiting_for_merge_epoch
10462 << dendl;
10463 ++p;
10464 continue;
10465 }
10466 if (!slot->waiting_peering.empty()) {
10467 epoch_t first = slot->waiting_peering.begin()->first;
10468 if (first <= new_osdmap->get_epoch()) {
10469 dout(20) << __func__ << " " << pgid
10470 << " pending_peering first epoch " << first
10471 << " <= " << new_osdmap->get_epoch() << ", requeueing" << dendl;
10472 _wake_pg_slot(pgid, slot);
10473 queued = true;
10474 }
10475 ++p;
10476 continue;
10477 }
10478 if (!slot->waiting.empty()) {
10479 if (new_osdmap->is_up_acting_osd_shard(pgid, osd->get_nodeid())) {
10480 dout(20) << __func__ << " " << pgid << " maps to us, keeping"
10481 << dendl;
10482 ++p;
10483 continue;
7c673cae 10484 }
11fdf7f2
TL
10485 while (!slot->waiting.empty() &&
10486 slot->waiting.front().get_map_epoch() <= new_osdmap->get_epoch()) {
10487 auto& qi = slot->waiting.front();
10488 dout(20) << __func__ << " " << pgid
10489 << " waiting item " << qi
10490 << " epoch " << qi.get_map_epoch()
10491 << " <= " << new_osdmap->get_epoch()
10492 << ", "
10493 << (qi.get_map_epoch() < new_osdmap->get_epoch() ? "stale" :
10494 "misdirected")
10495 << ", dropping" << dendl;
10496 *pushes_to_free += qi.get_reserved_pushes();
10497 slot->waiting.pop_front();
10498 }
10499 }
10500 if (slot->waiting.empty() &&
10501 slot->num_running == 0 &&
10502 slot->waiting_for_split.empty() &&
10503 !slot->pg) {
10504 dout(20) << __func__ << " " << pgid << " empty, pruning" << dendl;
10505 p = pg_slots.erase(p);
10506 continue;
7c673cae 10507 }
11fdf7f2
TL
10508
10509 ++p;
7c673cae 10510 }
7c673cae 10511 if (queued) {
11fdf7f2
TL
10512 std::lock_guard l{sdata_wait_lock};
10513 sdata_cond.notify_one();
7c673cae
FG
10514 }
10515}
10516
11fdf7f2
TL
10517void OSDShard::_wake_pg_slot(
10518 spg_t pgid,
10519 OSDShardPGSlot *slot)
10520{
10521 dout(20) << __func__ << " " << pgid
10522 << " to_process " << slot->to_process
10523 << " waiting " << slot->waiting
10524 << " waiting_peering " << slot->waiting_peering << dendl;
10525 for (auto i = slot->to_process.rbegin();
10526 i != slot->to_process.rend();
10527 ++i) {
10528 _enqueue_front(std::move(*i), osd->op_prio_cutoff);
10529 }
10530 slot->to_process.clear();
10531 for (auto i = slot->waiting.rbegin();
10532 i != slot->waiting.rend();
10533 ++i) {
10534 _enqueue_front(std::move(*i), osd->op_prio_cutoff);
10535 }
10536 slot->waiting.clear();
10537 for (auto i = slot->waiting_peering.rbegin();
10538 i != slot->waiting_peering.rend();
10539 ++i) {
10540 // this is overkill; we requeue everything, even if some of these
10541 // items are waiting for maps we don't have yet. FIXME, maybe,
10542 // someday, if we decide this inefficiency matters
10543 for (auto j = i->second.rbegin(); j != i->second.rend(); ++j) {
10544 _enqueue_front(std::move(*j), osd->op_prio_cutoff);
10545 }
10546 }
10547 slot->waiting_peering.clear();
10548 ++slot->requeue_seq;
10549}
10550
10551void OSDShard::identify_splits_and_merges(
10552 const OSDMapRef& as_of_osdmap,
10553 set<pair<spg_t,epoch_t>> *split_pgs,
10554 set<pair<spg_t,epoch_t>> *merge_pgs)
10555{
10556 std::lock_guard l(shard_lock);
10557 if (shard_osdmap) {
10558 for (auto& i : pg_slots) {
10559 const spg_t& pgid = i.first;
10560 auto *slot = i.second.get();
10561 if (slot->pg) {
10562 osd->service.identify_splits_and_merges(
10563 shard_osdmap, as_of_osdmap, pgid,
10564 split_pgs, merge_pgs);
10565 } else if (!slot->waiting_for_split.empty()) {
10566 osd->service.identify_splits_and_merges(
10567 shard_osdmap, as_of_osdmap, pgid,
10568 split_pgs, nullptr);
10569 } else {
10570 dout(20) << __func__ << " slot " << pgid
10571 << " has no pg and waiting_for_split "
10572 << slot->waiting_for_split << dendl;
7c673cae 10573 }
11fdf7f2
TL
10574 }
10575 }
10576}
10577
10578void OSDShard::prime_splits(const OSDMapRef& as_of_osdmap,
10579 set<pair<spg_t,epoch_t>> *pgids)
10580{
10581 std::lock_guard l(shard_lock);
10582 _prime_splits(pgids);
10583 if (shard_osdmap->get_epoch() > as_of_osdmap->get_epoch()) {
10584 set<pair<spg_t,epoch_t>> newer_children;
10585 for (auto i : *pgids) {
10586 osd->service.identify_splits_and_merges(
10587 as_of_osdmap, shard_osdmap, i.first,
10588 &newer_children, nullptr);
10589 }
10590 newer_children.insert(pgids->begin(), pgids->end());
10591 dout(10) << "as_of_osdmap " << as_of_osdmap->get_epoch() << " < shard "
10592 << shard_osdmap->get_epoch() << ", new children " << newer_children
10593 << dendl;
10594 _prime_splits(&newer_children);
10595 // note: we don't care what is left over here for other shards.
10596 // if this shard is ahead of us and one isn't, e.g., one thread is
10597 // calling into prime_splits via _process (due to a newly created
10598 // pg) and this shard has a newer map due to a racing consume_map,
10599 // then any grandchildren left here will be identified (or were
10600 // identified) when the slower shard's osdmap is advanced.
10601 // _prime_splits() will tolerate the case where the pgid is
10602 // already primed.
10603 }
10604}
10605
10606void OSDShard::_prime_splits(set<pair<spg_t,epoch_t>> *pgids)
10607{
10608 dout(10) << *pgids << dendl;
10609 auto p = pgids->begin();
10610 while (p != pgids->end()) {
10611 unsigned shard_index = p->first.hash_to_shard(osd->num_shards);
10612 if (shard_index == shard_id) {
10613 auto r = pg_slots.emplace(p->first, nullptr);
10614 if (r.second) {
10615 dout(10) << "priming slot " << p->first << " e" << p->second << dendl;
10616 r.first->second = make_unique<OSDShardPGSlot>();
10617 r.first->second->waiting_for_split.insert(p->second);
7c673cae 10618 } else {
11fdf7f2
TL
10619 auto q = r.first;
10620 ceph_assert(q != pg_slots.end());
10621 dout(10) << "priming (existing) slot " << p->first << " e" << p->second
10622 << dendl;
10623 q->second->waiting_for_split.insert(p->second);
7c673cae 10624 }
11fdf7f2
TL
10625 p = pgids->erase(p);
10626 } else {
10627 ++p;
7c673cae
FG
10628 }
10629 }
11fdf7f2
TL
10630}
10631
10632void OSDShard::prime_merges(const OSDMapRef& as_of_osdmap,
10633 set<pair<spg_t,epoch_t>> *merge_pgs)
10634{
10635 std::lock_guard l(shard_lock);
10636 dout(20) << __func__ << " checking shard " << shard_id
10637 << " for remaining merge pgs " << merge_pgs << dendl;
10638 auto p = merge_pgs->begin();
10639 while (p != merge_pgs->end()) {
10640 spg_t pgid = p->first;
10641 epoch_t epoch = p->second;
10642 unsigned shard_index = pgid.hash_to_shard(osd->num_shards);
10643 if (shard_index != shard_id) {
10644 ++p;
10645 continue;
10646 }
10647 OSDShardPGSlot *slot;
10648 auto r = pg_slots.emplace(pgid, nullptr);
10649 if (r.second) {
10650 r.first->second = make_unique<OSDShardPGSlot>();
10651 }
10652 slot = r.first->second.get();
10653 if (slot->pg) {
10654 // already have pg
10655 dout(20) << __func__ << " have merge participant pg " << pgid
10656 << " " << slot->pg << dendl;
10657 } else if (!slot->waiting_for_split.empty() &&
10658 *slot->waiting_for_split.begin() < epoch) {
10659 dout(20) << __func__ << " pending split on merge participant pg " << pgid
10660 << " " << slot->waiting_for_split << dendl;
10661 } else {
10662 dout(20) << __func__ << " creating empty merge participant " << pgid
10663 << " for merge in " << epoch << dendl;
10664 // leave history zeroed; PG::merge_from() will fill it in.
10665 pg_history_t history;
10666 PGCreateInfo cinfo(pgid, epoch - 1,
10667 history, PastIntervals(), false);
10668 PGRef pg = osd->handle_pg_create_info(shard_osdmap, &cinfo);
10669 _attach_pg(r.first->second.get(), pg.get());
10670 _wake_pg_slot(pgid, slot);
10671 pg->unlock();
10672 }
10673 // mark slot for merge
10674 dout(20) << __func__ << " marking merge participant " << pgid << dendl;
10675 slot->waiting_for_merge_epoch = epoch;
10676 p = merge_pgs->erase(p);
7c673cae
FG
10677 }
10678}
10679
11fdf7f2 10680void OSDShard::register_and_wake_split_child(PG *pg)
7c673cae 10681{
11fdf7f2
TL
10682 epoch_t epoch;
10683 {
10684 std::lock_guard l(shard_lock);
10685 dout(10) << pg->pg_id << " " << pg << dendl;
10686 auto p = pg_slots.find(pg->pg_id);
10687 ceph_assert(p != pg_slots.end());
10688 auto *slot = p->second.get();
10689 dout(20) << pg->pg_id << " waiting_for_split " << slot->waiting_for_split
10690 << dendl;
10691 ceph_assert(!slot->pg);
10692 ceph_assert(!slot->waiting_for_split.empty());
10693 _attach_pg(slot, pg);
10694
10695 epoch = pg->get_osdmap_epoch();
10696 ceph_assert(slot->waiting_for_split.count(epoch));
10697 slot->waiting_for_split.erase(epoch);
10698 if (slot->waiting_for_split.empty()) {
10699 _wake_pg_slot(pg->pg_id, slot);
10700 } else {
10701 dout(10) << __func__ << " still waiting for split on "
10702 << slot->waiting_for_split << dendl;
10703 }
7c673cae 10704 }
11fdf7f2
TL
10705
10706 // kick child to ensure it pulls up to the latest osdmap
10707 osd->enqueue_peering_evt(
10708 pg->pg_id,
10709 PGPeeringEventRef(
10710 std::make_shared<PGPeeringEvent>(
10711 epoch,
10712 epoch,
10713 NullEvt())));
10714
10715 std::lock_guard l{sdata_wait_lock};
10716 sdata_cond.notify_one();
7c673cae
FG
10717}
10718
11fdf7f2 10719void OSDShard::unprime_split_children(spg_t parent, unsigned old_pg_num)
7c673cae 10720{
11fdf7f2
TL
10721 std::lock_guard l(shard_lock);
10722 vector<spg_t> to_delete;
10723 for (auto& i : pg_slots) {
10724 if (i.first != parent &&
10725 i.first.get_ancestor(old_pg_num) == parent) {
10726 dout(10) << __func__ << " parent " << parent << " clearing " << i.first
10727 << dendl;
10728 _wake_pg_slot(i.first, i.second.get());
10729 to_delete.push_back(i.first);
10730 }
10731 }
10732 for (auto pgid : to_delete) {
10733 pg_slots.erase(pgid);
10734 }
10735}
10736
10737
10738// =============================================================
10739
10740#undef dout_context
10741#define dout_context osd->cct
10742#undef dout_prefix
10743#define dout_prefix *_dout << "osd." << osd->whoami << " op_wq "
10744
10745void OSD::ShardedOpWQ::_add_slot_waiter(
10746 spg_t pgid,
10747 OSDShardPGSlot *slot,
10748 OpQueueItem&& qi)
10749{
10750 if (qi.is_peering()) {
10751 dout(20) << __func__ << " " << pgid
10752 << " peering, item epoch is "
10753 << qi.get_map_epoch()
10754 << ", will wait on " << qi << dendl;
10755 slot->waiting_peering[qi.get_map_epoch()].push_back(std::move(qi));
10756 } else {
10757 dout(20) << __func__ << " " << pgid
10758 << " item epoch is "
10759 << qi.get_map_epoch()
10760 << ", will wait on " << qi << dendl;
10761 slot->waiting.push_back(std::move(qi));
7c673cae
FG
10762 }
10763}
10764
10765#undef dout_prefix
10766#define dout_prefix *_dout << "osd." << osd->whoami << " op_wq(" << shard_index << ") "
10767
10768void OSD::ShardedOpWQ::_process(uint32_t thread_index, heartbeat_handle_d *hb)
10769{
11fdf7f2
TL
10770 uint32_t shard_index = thread_index % osd->num_shards;
10771 auto& sdata = osd->shards[shard_index];
10772 ceph_assert(sdata);
10773
10774 // If all threads of shards do oncommits, there is a out-of-order
10775 // problem. So we choose the thread which has the smallest
10776 // thread_index(thread_index < num_shards) of shard to do oncommit
10777 // callback.
10778 bool is_smallest_thread_index = thread_index < osd->num_shards;
7c673cae
FG
10779
10780 // peek at spg_t
11fdf7f2
TL
10781 sdata->shard_lock.lock();
10782 if (sdata->pqueue->empty() &&
10783 (!is_smallest_thread_index || sdata->context_queue.empty())) {
10784 std::unique_lock wait_lock{sdata->sdata_wait_lock};
10785 if (is_smallest_thread_index && !sdata->context_queue.empty()) {
10786 // we raced with a context_queue addition, don't wait
10787 wait_lock.unlock();
10788 } else if (!sdata->stop_waiting) {
10789 dout(20) << __func__ << " empty q, waiting" << dendl;
10790 osd->cct->get_heartbeat_map()->clear_timeout(hb);
10791 sdata->shard_lock.unlock();
10792 sdata->sdata_cond.wait(wait_lock);
10793 wait_lock.unlock();
10794 sdata->shard_lock.lock();
10795 if (sdata->pqueue->empty() &&
10796 !(is_smallest_thread_index && !sdata->context_queue.empty())) {
10797 sdata->shard_lock.unlock();
10798 return;
10799 }
10800 osd->cct->get_heartbeat_map()->reset_timeout(hb,
10801 osd->cct->_conf->threadpool_default_timeout, 0);
10802 } else {
10803 dout(20) << __func__ << " need return immediately" << dendl;
10804 wait_lock.unlock();
10805 sdata->shard_lock.unlock();
7c673cae
FG
10806 return;
10807 }
10808 }
11fdf7f2
TL
10809
10810 list<Context *> oncommits;
10811 if (is_smallest_thread_index && !sdata->context_queue.empty()) {
10812 sdata->context_queue.swap(oncommits);
7c673cae 10813 }
11fdf7f2
TL
10814
10815 if (sdata->pqueue->empty()) {
10816 if (osd->is_stopping()) {
10817 sdata->shard_lock.unlock();
10818 for (auto c : oncommits) {
10819 dout(10) << __func__ << " discarding in-flight oncommit " << c << dendl;
10820 delete c;
10821 }
10822 return; // OSD shutdown, discard.
7c673cae 10823 }
11fdf7f2
TL
10824 sdata->shard_lock.unlock();
10825 handle_oncommits(oncommits);
10826 return;
7c673cae 10827 }
7c673cae 10828
11fdf7f2
TL
10829 OpQueueItem item = sdata->pqueue->dequeue();
10830 if (osd->is_stopping()) {
10831 sdata->shard_lock.unlock();
10832 for (auto c : oncommits) {
10833 dout(10) << __func__ << " discarding in-flight oncommit " << c << dendl;
10834 delete c;
10835 }
10836 return; // OSD shutdown, discard.
10837 }
7c673cae 10838
11fdf7f2
TL
10839 const auto token = item.get_ordering_token();
10840 auto r = sdata->pg_slots.emplace(token, nullptr);
10841 if (r.second) {
10842 r.first->second = make_unique<OSDShardPGSlot>();
7c673cae 10843 }
11fdf7f2
TL
10844 OSDShardPGSlot *slot = r.first->second.get();
10845 dout(20) << __func__ << " " << token
10846 << (r.second ? " (new)" : "")
10847 << " to_process " << slot->to_process
10848 << " waiting " << slot->waiting
10849 << " waiting_peering " << slot->waiting_peering
10850 << dendl;
10851 slot->to_process.push_back(std::move(item));
10852 dout(20) << __func__ << " " << slot->to_process.back()
10853 << " queued" << dendl;
7c673cae 10854
11fdf7f2
TL
10855 retry_pg:
10856 PGRef pg = slot->pg;
7c673cae 10857
11fdf7f2
TL
10858 // lock pg (if we have it)
10859 if (pg) {
10860 // note the requeue seq now...
10861 uint64_t requeue_seq = slot->requeue_seq;
10862 ++slot->num_running;
7c673cae 10863
11fdf7f2
TL
10864 sdata->shard_lock.unlock();
10865 osd->service.maybe_inject_dispatch_delay();
10866 pg->lock();
10867 osd->service.maybe_inject_dispatch_delay();
10868 sdata->shard_lock.lock();
7c673cae 10869
11fdf7f2
TL
10870 auto q = sdata->pg_slots.find(token);
10871 if (q == sdata->pg_slots.end()) {
10872 // this can happen if we race with pg removal.
10873 dout(20) << __func__ << " slot " << token << " no longer there" << dendl;
10874 pg->unlock();
10875 sdata->shard_lock.unlock();
10876 handle_oncommits(oncommits);
10877 return;
10878 }
10879 slot = q->second.get();
10880 --slot->num_running;
7c673cae 10881
11fdf7f2
TL
10882 if (slot->to_process.empty()) {
10883 // raced with _wake_pg_slot or consume_map
10884 dout(20) << __func__ << " " << token
10885 << " nothing queued" << dendl;
7c673cae 10886 pg->unlock();
11fdf7f2
TL
10887 sdata->shard_lock.unlock();
10888 handle_oncommits(oncommits);
10889 return;
7c673cae 10890 }
11fdf7f2
TL
10891 if (requeue_seq != slot->requeue_seq) {
10892 dout(20) << __func__ << " " << token
10893 << " requeue_seq " << slot->requeue_seq << " > our "
10894 << requeue_seq << ", we raced with _wake_pg_slot"
10895 << dendl;
7c673cae 10896 pg->unlock();
11fdf7f2
TL
10897 sdata->shard_lock.unlock();
10898 handle_oncommits(oncommits);
10899 return;
7c673cae 10900 }
11fdf7f2
TL
10901 if (slot->pg != pg) {
10902 // this can happen if we race with pg removal.
10903 dout(20) << __func__ << " slot " << token << " no longer attached to "
10904 << pg << dendl;
7c673cae 10905 pg->unlock();
11fdf7f2 10906 goto retry_pg;
7c673cae 10907 }
7c673cae
FG
10908 }
10909
11fdf7f2
TL
10910 dout(20) << __func__ << " " << token
10911 << " to_process " << slot->to_process
10912 << " waiting " << slot->waiting
10913 << " waiting_peering " << slot->waiting_peering << dendl;
10914
10915 ThreadPool::TPHandle tp_handle(osd->cct, hb, timeout_interval,
10916 suicide_interval);
10917
7c673cae 10918 // take next item
11fdf7f2
TL
10919 auto qi = std::move(slot->to_process.front());
10920 slot->to_process.pop_front();
10921 dout(20) << __func__ << " " << qi << " pg " << pg << dendl;
10922 set<pair<spg_t,epoch_t>> new_children;
10923 OSDMapRef osdmap;
7c673cae 10924
11fdf7f2 10925 while (!pg) {
7c673cae 10926 // should this pg shard exist on this osd in this (or a later) epoch?
11fdf7f2
TL
10927 osdmap = sdata->shard_osdmap;
10928 const PGCreateInfo *create_info = qi.creates_pg();
10929 if (!slot->waiting_for_split.empty()) {
10930 dout(20) << __func__ << " " << token
10931 << " splitting " << slot->waiting_for_split << dendl;
10932 _add_slot_waiter(token, slot, std::move(qi));
10933 } else if (qi.get_map_epoch() > osdmap->get_epoch()) {
10934 dout(20) << __func__ << " " << token
10935 << " map " << qi.get_map_epoch() << " > "
10936 << osdmap->get_epoch() << dendl;
10937 _add_slot_waiter(token, slot, std::move(qi));
10938 } else if (qi.is_peering()) {
10939 if (!qi.peering_requires_pg()) {
10940 // for pg-less events, we run them under the ordering lock, since
10941 // we don't have the pg lock to keep them ordered.
10942 qi.run(osd, sdata, pg, tp_handle);
10943 } else if (osdmap->is_up_acting_osd_shard(token, osd->whoami)) {
10944 if (create_info) {
10945 if (create_info->by_mon &&
10946 osdmap->get_pg_acting_primary(token.pgid) != osd->whoami) {
10947 dout(20) << __func__ << " " << token
10948 << " no pg, no longer primary, ignoring mon create on "
10949 << qi << dendl;
10950 } else {
10951 dout(20) << __func__ << " " << token
10952 << " no pg, should create on " << qi << dendl;
10953 pg = osd->handle_pg_create_info(osdmap, create_info);
10954 if (pg) {
10955 // we created the pg! drop out and continue "normally"!
10956 sdata->_attach_pg(slot, pg.get());
10957 sdata->_wake_pg_slot(token, slot);
10958
10959 // identify split children between create epoch and shard epoch.
10960 osd->service.identify_splits_and_merges(
10961 pg->get_osdmap(), osdmap, pg->pg_id, &new_children, nullptr);
10962 sdata->_prime_splits(&new_children);
10963 // distribute remaining split children to other shards below!
10964 break;
10965 }
10966 dout(20) << __func__ << " ignored create on " << qi << dendl;
10967 }
10968 } else {
10969 dout(20) << __func__ << " " << token
10970 << " no pg, peering, !create, discarding " << qi << dendl;
10971 }
10972 } else {
10973 dout(20) << __func__ << " " << token
10974 << " no pg, peering, doesn't map here e" << osdmap->get_epoch()
10975 << ", discarding " << qi
10976 << dendl;
10977 }
10978 } else if (osdmap->is_up_acting_osd_shard(token, osd->whoami)) {
10979 dout(20) << __func__ << " " << token
10980 << " no pg, should exist e" << osdmap->get_epoch()
10981 << ", will wait on " << qi << dendl;
10982 _add_slot_waiter(token, slot, std::move(qi));
7c673cae 10983 } else {
11fdf7f2
TL
10984 dout(20) << __func__ << " " << token
10985 << " no pg, shouldn't exist e" << osdmap->get_epoch()
10986 << ", dropping " << qi << dendl;
7c673cae 10987 // share map with client?
11fdf7f2
TL
10988 if (boost::optional<OpRequestRef> _op = qi.maybe_get_op()) {
10989 auto priv = (*_op)->get_req()->get_connection()->get_priv();
10990 if (auto session = static_cast<Session *>(priv.get()); session) {
10991 osd->maybe_share_map(session, *_op, sdata->shard_osdmap);
7c673cae
FG
10992 }
10993 }
11fdf7f2 10994 unsigned pushes_to_free = qi.get_reserved_pushes();
7c673cae 10995 if (pushes_to_free > 0) {
11fdf7f2 10996 sdata->shard_lock.unlock();
7c673cae 10997 osd->service.release_reserved_pushes(pushes_to_free);
11fdf7f2 10998 handle_oncommits(oncommits);
7c673cae
FG
10999 return;
11000 }
11001 }
11fdf7f2
TL
11002 sdata->shard_lock.unlock();
11003 handle_oncommits(oncommits);
7c673cae
FG
11004 return;
11005 }
11fdf7f2
TL
11006 if (qi.is_peering()) {
11007 OSDMapRef osdmap = sdata->shard_osdmap;
11008 if (qi.get_map_epoch() > osdmap->get_epoch()) {
11009 _add_slot_waiter(token, slot, std::move(qi));
11010 sdata->shard_lock.unlock();
11011 pg->unlock();
11012 handle_oncommits(oncommits);
11013 return;
11014 }
11015 }
11016 sdata->shard_lock.unlock();
7c673cae 11017
11fdf7f2
TL
11018 if (!new_children.empty()) {
11019 for (auto shard : osd->shards) {
11020 shard->prime_splits(osdmap, &new_children);
11021 }
11022 ceph_assert(new_children.empty());
11023 }
7c673cae
FG
11024
11025 // osd_opwq_process marks the point at which an operation has been dequeued
11026 // and will begin to be handled by a worker thread.
11027 {
11028#ifdef WITH_LTTNG
11029 osd_reqid_t reqid;
11fdf7f2 11030 if (boost::optional<OpRequestRef> _op = qi.maybe_get_op()) {
7c673cae
FG
11031 reqid = (*_op)->get_reqid();
11032 }
11033#endif
11034 tracepoint(osd, opwq_process_start, reqid.name._type,
11035 reqid.name._num, reqid.tid, reqid.inc);
11036 }
11037
11038 lgeneric_subdout(osd->cct, osd, 30) << "dequeue status: ";
11039 Formatter *f = Formatter::create("json");
11040 f->open_object_section("q");
11041 dump(f);
11042 f->close_section();
11043 f->flush(*_dout);
11044 delete f;
11045 *_dout << dendl;
11046
11fdf7f2 11047 qi.run(osd, sdata, pg, tp_handle);
7c673cae
FG
11048
11049 {
11050#ifdef WITH_LTTNG
11051 osd_reqid_t reqid;
11fdf7f2 11052 if (boost::optional<OpRequestRef> _op = qi.maybe_get_op()) {
7c673cae
FG
11053 reqid = (*_op)->get_reqid();
11054 }
11055#endif
11056 tracepoint(osd, opwq_process_finish, reqid.name._type,
11057 reqid.name._num, reqid.tid, reqid.inc);
11058 }
11059
11fdf7f2 11060 handle_oncommits(oncommits);
7c673cae
FG
11061}
11062
11fdf7f2 11063void OSD::ShardedOpWQ::_enqueue(OpQueueItem&& item) {
7c673cae 11064 uint32_t shard_index =
11fdf7f2 11065 item.get_ordering_token().hash_to_shard(osd->shards.size());
7c673cae 11066
11fdf7f2 11067 OSDShard* sdata = osd->shards[shard_index];
7c673cae 11068 assert (NULL != sdata);
11fdf7f2
TL
11069 unsigned priority = item.get_priority();
11070 unsigned cost = item.get_cost();
11071 sdata->shard_lock.lock();
7c673cae 11072
11fdf7f2 11073 dout(20) << __func__ << " " << item << dendl;
7c673cae
FG
11074 if (priority >= osd->op_prio_cutoff)
11075 sdata->pqueue->enqueue_strict(
11fdf7f2 11076 item.get_owner(), priority, std::move(item));
7c673cae
FG
11077 else
11078 sdata->pqueue->enqueue(
11fdf7f2
TL
11079 item.get_owner(), priority, cost, std::move(item));
11080 sdata->shard_lock.unlock();
7c673cae 11081
11fdf7f2
TL
11082 std::lock_guard l{sdata->sdata_wait_lock};
11083 sdata->sdata_cond.notify_one();
7c673cae
FG
11084}
11085
11fdf7f2 11086void OSD::ShardedOpWQ::_enqueue_front(OpQueueItem&& item)
7c673cae 11087{
11fdf7f2
TL
11088 auto shard_index = item.get_ordering_token().hash_to_shard(osd->shards.size());
11089 auto& sdata = osd->shards[shard_index];
11090 ceph_assert(sdata);
11091 sdata->shard_lock.lock();
11092 auto p = sdata->pg_slots.find(item.get_ordering_token());
11093 if (p != sdata->pg_slots.end() &&
11094 !p->second->to_process.empty()) {
7c673cae
FG
11095 // we may be racing with _process, which has dequeued a new item
11096 // from pqueue, put it on to_process, and is now busy taking the
11097 // pg lock. ensure this old requeued item is ordered before any
11098 // such newer item in to_process.
11fdf7f2
TL
11099 p->second->to_process.push_front(std::move(item));
11100 item = std::move(p->second->to_process.back());
11101 p->second->to_process.pop_back();
11102 dout(20) << __func__
11103 << " " << p->second->to_process.front()
11104 << " shuffled w/ " << item << dendl;
7c673cae 11105 } else {
11fdf7f2 11106 dout(20) << __func__ << " " << item << dendl;
7c673cae 11107 }
11fdf7f2
TL
11108 sdata->_enqueue_front(std::move(item), osd->op_prio_cutoff);
11109 sdata->shard_lock.unlock();
11110 std::lock_guard l{sdata->sdata_wait_lock};
11111 sdata->sdata_cond.notify_one();
7c673cae
FG
11112}
11113
11114namespace ceph {
11115namespace osd_cmds {
11116
11fdf7f2
TL
11117int heap(CephContext& cct, const cmdmap_t& cmdmap, Formatter& f,
11118 std::ostream& os)
7c673cae
FG
11119{
11120 if (!ceph_using_tcmalloc()) {
11121 os << "could not issue heap profiler command -- not using tcmalloc!";
11122 return -EOPNOTSUPP;
11123 }
11124
11125 string cmd;
11126 if (!cmd_getval(&cct, cmdmap, "heapcmd", cmd)) {
11127 os << "unable to get value for command \"" << cmd << "\"";
11128 return -EINVAL;
11fdf7f2 11129 }
7c673cae
FG
11130
11131 std::vector<std::string> cmd_vec;
11132 get_str_vec(cmd, cmd_vec);
11fdf7f2
TL
11133
11134 string val;
11135 if (cmd_getval(&cct, cmdmap, "value", val)) {
11136 cmd_vec.push_back(val);
11137 }
7c673cae
FG
11138
11139 ceph_heap_profiler_handle_command(cmd_vec, os);
11140
11141 return 0;
11142}
11143
11144}} // namespace ceph::osd_cmds
11145
224ce89b 11146
11fdf7f2 11147std::ostream& operator<<(std::ostream& out, const io_queue& q) {
224ce89b 11148 switch(q) {
11fdf7f2 11149 case io_queue::prioritized:
224ce89b
WB
11150 out << "prioritized";
11151 break;
11fdf7f2 11152 case io_queue::weightedpriority:
224ce89b
WB
11153 out << "weightedpriority";
11154 break;
11fdf7f2 11155 case io_queue::mclock_opclass:
224ce89b
WB
11156 out << "mclock_opclass";
11157 break;
11fdf7f2 11158 case io_queue::mclock_client:
224ce89b
WB
11159 out << "mclock_client";
11160 break;
11161 }
11162 return out;
11163}